/*
 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/*
 * We compute "carryless multiplications" through normal integer
 * multiplications, masking out enough bits to create "holes" in which
 * carries may expand without altering our bits; we really use 8 data
 * bits per 32-bit word, spaced every fourth bit. Accumulated carries
 * may not exceed 8 in total, which fits in 4 bits.
 *
 * It would be possible to use a 3-bit spacing, allowing two operands,
 * one with 7 non-zero data bits, the other one with 10 or 11 non-zero
 * data bits; this asymmetric splitting makes the overall code more
 * complex with thresholds and exceptions, and does not appear to be
 * worth the effort.
 */
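
/*
 * A minimal sketch of the trick described above (not part of the
 * original file; the function name is made up for illustration). Both
 * operands keep data bits only at positions that are multiples of 4;
 * in the plain integer product, each 4-bit group accumulates a column
 * sum of at most 8, so carries never reach the next data bit, and
 * masking recovers the carryless (XOR) product bits.
 */
static uint32_t
clmul_spaced_low_sketch(uint32_t x, uint32_t y)
{
        uint64_t z;

        x &= (uint32_t)0x11111111;   /* keep bits 0, 4, 8, ..., 28 */
        y &= (uint32_t)0x11111111;
        z = (uint64_t)x * (uint64_t)y;
        /* low 32 bits of the product, with the carry residue masked out */
        return (uint32_t)z & (uint32_t)0x11111111;
}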

/*
 * We cannot really autodetect whether multiplications are "slow" or
 * not. A typical example is the ARM Cortex M0+, which exists in two
 * versions: one with a 1-cycle multiplication opcode, the other with
 * a 32-cycle multiplication opcode. They both use exactly the same
 * architecture and ABI, and cannot be distinguished from each other
 * at compile-time.
 *
 * Since most modern CPUs (even embedded CPUs) still have fast
 * multiplications, we use the "fast mul" code by default.
 */

#if BR_SLOW_MUL

/*
 * This implementation uses Karatsuba-like reduction to make fewer
 * integer multiplications (9 instead of 16), at the expense of extra
 * logical operations (XOR, shifts...). On modern x86 CPUs that offer
 * fast, pipelined multiplications, this code is about twice as slow as
 * the simpler code with 16 multiplications. This tendency may be
 * reversed on low-end platforms with expensive multiplications.
 */

#define MUL32(h, l, x, y)   do { \
                uint64_t mul32tmp = MUL(x, y); \
                (h) = (uint32_t)(mul32tmp >> 32); \
                (l) = (uint32_t)mul32tmp; \
        } while (0)

static void
bmul(uint32_t *hi, uint32_t *lo, uint32_t x, uint32_t y)
{
        uint32_t x0, x1, x2, x3;
        uint32_t y0, y1, y2, y3;
        uint32_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
        uint32_t b0, b1, b2, b3, b4, b5, b6, b7, b8;

        x0 = x & (uint32_t)0x11111111;
        x1 = x & (uint32_t)0x22222222;
        x2 = x & (uint32_t)0x44444444;
        x3 = x & (uint32_t)0x88888888;
        y0 = y & (uint32_t)0x11111111;
        y1 = y & (uint32_t)0x22222222;
        y2 = y & (uint32_t)0x44444444;
        y3 = y & (uint32_t)0x88888888;

        /*
         * (x0+W*x1)*(y0+W*y1) -> a0:b0
         * (x2+W*x3)*(y2+W*y3) -> a3:b3
         * ((x0+x2)+W*(x1+x3))*((y0+y2)+W*(y1+y3)) -> a6:b6
         */
        a0 = x0;
        b0 = y0;
        a1 = x1 >> 1;
        b1 = y1 >> 1;
        a2 = a0 ^ a1;
        b2 = b0 ^ b1;
        a3 = x2 >> 2;
        b3 = y2 >> 2;
        a4 = x3 >> 3;
        b4 = y3 >> 3;
        a5 = a3 ^ a4;
        b5 = b3 ^ b4;
        a6 = a0 ^ a3;
        b6 = b0 ^ b3;
        a7 = a1 ^ a4;
        b7 = b1 ^ b4;
        a8 = a6 ^ a7;
        b8 = b6 ^ b7;

        MUL32(b0, a0, b0, a0);
        MUL32(b1, a1, b1, a1);
        MUL32(b2, a2, b2, a2);
        MUL32(b3, a3, b3, a3);
        MUL32(b4, a4, b4, a4);
        MUL32(b5, a5, b5, a5);
        MUL32(b6, a6, b6, a6);
        MUL32(b7, a7, b7, a7);
        MUL32(b8, a8, b8, a8);

        a0 &= (uint32_t)0x11111111;
        a1 &= (uint32_t)0x11111111;
        a2 &= (uint32_t)0x11111111;
        a3 &= (uint32_t)0x11111111;
        a4 &= (uint32_t)0x11111111;
        a5 &= (uint32_t)0x11111111;
        a6 &= (uint32_t)0x11111111;
        a7 &= (uint32_t)0x11111111;
        a8 &= (uint32_t)0x11111111;
        b0 &= (uint32_t)0x11111111;
        b1 &= (uint32_t)0x11111111;
        b2 &= (uint32_t)0x11111111;
        b3 &= (uint32_t)0x11111111;
        b4 &= (uint32_t)0x11111111;
        b5 &= (uint32_t)0x11111111;
        b6 &= (uint32_t)0x11111111;
        b7 &= (uint32_t)0x11111111;
        b8 &= (uint32_t)0x11111111;

        a2 ^= a0 ^ a1;
        b2 ^= b0 ^ b1;
        a0 ^= (a2 << 1) ^ (a1 << 2);
        b0 ^= (b2 << 1) ^ (b1 << 2);
        a5 ^= a3 ^ a4;
        b5 ^= b3 ^ b4;
        a3 ^= (a5 << 1) ^ (a4 << 2);
        b3 ^= (b5 << 1) ^ (b4 << 2);
        a8 ^= a6 ^ a7;
        b8 ^= b6 ^ b7;
        a6 ^= (a8 << 1) ^ (a7 << 2);
        b6 ^= (b8 << 1) ^ (b7 << 2);
        a6 ^= a0 ^ a3;
        b6 ^= b0 ^ b3;
        *lo = a0 ^ (a6 << 2) ^ (a3 << 4);
        *hi = b0 ^ (b6 << 2) ^ (b3 << 4) ^ (a6 >> 30) ^ (a3 >> 28);
}

#else

/*
 * Simple multiplication in GF(2)[X], using 16 integer multiplications.
 */

static void
bmul(uint32_t *hi, uint32_t *lo, uint32_t x, uint32_t y)
{
        uint32_t x0, x1, x2, x3;
        uint32_t y0, y1, y2, y3;
        uint64_t z0, z1, z2, z3;
        uint64_t z;

        x0 = x & (uint32_t)0x11111111;
        x1 = x & (uint32_t)0x22222222;
        x2 = x & (uint32_t)0x44444444;
        x3 = x & (uint32_t)0x88888888;
        y0 = y & (uint32_t)0x11111111;
        y1 = y & (uint32_t)0x22222222;
        y2 = y & (uint32_t)0x44444444;
        y3 = y & (uint32_t)0x88888888;
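
        /*
         * x_i and y_j keep data bits only at positions congruent to i
         * and j modulo 4, so each partial product MUL(x_i, y_j)
         * contributes useful bits at positions congruent to i+j modulo
         * 4. Each z_k below therefore collects exactly the partial
         * products whose useful bits land at positions k mod 4, and the
         * masks that follow discard the carry residue accumulated in
         * the other three positions.
         */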
        z0 = MUL(x0, y0) ^ MUL(x1, y3) ^ MUL(x2, y2) ^ MUL(x3, y1);
        z1 = MUL(x0, y1) ^ MUL(x1, y0) ^ MUL(x2, y3) ^ MUL(x3, y2);
        z2 = MUL(x0, y2) ^ MUL(x1, y1) ^ MUL(x2, y0) ^ MUL(x3, y3);
        z3 = MUL(x0, y3) ^ MUL(x1, y2) ^ MUL(x2, y1) ^ MUL(x3, y0);
        z0 &= (uint64_t)0x1111111111111111;
        z1 &= (uint64_t)0x2222222222222222;
        z2 &= (uint64_t)0x4444444444444444;
        z3 &= (uint64_t)0x8888888888888888;
        z = z0 | z1 | z2 | z3;
        *lo = (uint32_t)z;
        *hi = (uint32_t)(z >> 32);
}

#endif
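
/*
 * Illustrative sketch, not part of the original file: one Karatsuba
 * step in GF(2)[X], building a 64x64 carryless multiply out of three
 * calls to the 32x32 bmul() above. This is the same layering that
 * br_ghash_ctmul() uses below (three 64-bit products for the 128-bit
 * multiply, nine 32-bit products in total). The function name and the
 * zw[] output layout are made up for the example.
 */
static void
clmul64_sketch(uint32_t *zw, const uint32_t *xw, const uint32_t *yw)
{
        uint32_t lo_h, lo_l, hi_h, hi_l, mid_h, mid_l;

        bmul(&lo_h, &lo_l, xw[0], yw[0]);   /* low halves */
        bmul(&hi_h, &hi_l, xw[1], yw[1]);   /* high halves */
        bmul(&mid_h, &mid_l, xw[0] ^ xw[1], yw[0] ^ yw[1]);
        mid_l ^= lo_l ^ hi_l;   /* Karatsuba fix-up of the middle term */
        mid_h ^= lo_h ^ hi_h;
        zw[0] = lo_l;
        zw[1] = lo_h ^ mid_l;
        zw[2] = hi_l ^ mid_h;
        zw[3] = hi_h;
}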

/* see bearssl_hash.h */
void
br_ghash_ctmul(void *y, const void *h, const void *data, size_t len)
{
        const unsigned char *buf, *hb;
        unsigned char *yb;
        uint32_t yw[4];
        uint32_t hw[4];

        /*
         * Throughout the loop we handle the y and h values as arrays
         * of 32-bit words.
         */
        buf = data;
        yb = y;
        hb = h;
        yw[3] = br_dec32be(yb);
        yw[2] = br_dec32be(yb + 4);
        yw[1] = br_dec32be(yb + 8);
        yw[0] = br_dec32be(yb + 12);
        hw[3] = br_dec32be(hb);
        hw[2] = br_dec32be(hb + 4);
        hw[1] = br_dec32be(hb + 8);
        hw[0] = br_dec32be(hb + 12);
        while (len > 0) {
                const unsigned char *src;
                unsigned char tmp[16];
                int i;
                uint32_t a[9], b[9], zw[8];
                uint32_t c0, c1, c2, c3, d0, d1, d2, d3, e0, e1, e2, e3;

                /*
                 * Get the next 16-byte block (using zero-padding if
                 * necessary).
                 */
                if (len >= 16) {
                        src = buf;
                        buf += 16;
                        len -= 16;
                } else {
                        memcpy(tmp, buf, len);
                        memset(tmp + len, 0, (sizeof tmp) - len);
                        src = tmp;
                        len = 0;
                }

                /*
                 * Decode the block. The GHASH standard mandates
                 * big-endian encoding.
                 */
                yw[3] ^= br_dec32be(src);
                yw[2] ^= br_dec32be(src + 4);
                yw[1] ^= br_dec32be(src + 8);
                yw[0] ^= br_dec32be(src + 12);

                /*
                 * We multiply two 128-bit field elements. We use
                 * Karatsuba to turn that into three 64-bit
                 * multiplications, which are themselves done with a
                 * total of nine 32-bit multiplications.
                 */

                /*
                 * y[0,1]*h[0,1] -> 0..2
                 * y[2,3]*h[2,3] -> 3..5
                 * (y[0,1]+y[2,3])*(h[0,1]+h[2,3]) -> 6..8
                 */
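                /*
                 * Slots 0 and 1 hold the word pairs of the low halves
                 * of y and h, slot 2 their XOR; slots 3..5 do the same
                 * for the high halves, and slots 6..8 for the cross
                 * terms, mirroring the layout used in bmul(). A single
                 * loop of nine bmul() calls then produces all three
                 * Karatsuba products at once.
                 */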
                for (i = 0; i < 9; i ++) {
                        bmul(&b[i], &a[i], b[i], a[i]);
                }

                c0 = a[0];
                c1 = b[0] ^ a[2] ^ a[0] ^ a[1];
                c2 = a[1] ^ b[2] ^ b[0] ^ b[1];
                c3 = b[1];
                d0 = a[3];
                d1 = b[3] ^ a[5] ^ a[3] ^ a[4];
                d2 = a[4] ^ b[5] ^ b[3] ^ b[4];
                d3 = b[4];
                e0 = a[6];
                e1 = b[6] ^ a[8] ^ a[6] ^ a[7];
                e2 = a[7] ^ b[8] ^ b[6] ^ b[7];
                e3 = b[7];

                e0 ^= c0 ^ d0;
                e1 ^= c1 ^ d1;
                e2 ^= c2 ^ d2;
                e3 ^= c3 ^ d3;
                c2 ^= e0;
                c3 ^= e1;
                d0 ^= e2;
                d1 ^= e3;

                /*
                 * GHASH specification has the bits "reversed" (most
                 * significant is in fact least significant), which does
                 * not matter for a carryless multiplication, except that
                 * the 255-bit result must be shifted by 1 bit.
                 */
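                /*
                 * Reversing the bit order of both operands reverses the
                 * bits of their carryless product: with A'(X) =
                 * X^127*A(1/X) and B'(X) = X^127*B(1/X), we get A'*B' =
                 * X^254*(A*B)(1/X). The product of two 128-bit
                 * polynomials has degree at most 254 (255 bits), so
                 * re-expressing that reversed value on 256 bits amounts
                 * to a left shift by one bit, done below.
                 */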
                zw[0] = c0 << 1;
                zw[1] = (c1 << 1) | (c0 >> 31);
                zw[2] = (c2 << 1) | (c1 >> 31);
                zw[3] = (c3 << 1) | (c2 >> 31);
                zw[4] = (d0 << 1) | (c3 >> 31);
                zw[5] = (d1 << 1) | (d0 >> 31);
                zw[6] = (d2 << 1) | (d1 >> 31);
                zw[7] = (d3 << 1) | (d2 >> 31);

                /*
                 * We now do the reduction modulo the field polynomial
                 * to get back to 128 bits.
                 */
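                /*
                 * The GHASH modulus is X^128 + X^7 + X^2 + X + 1;
                 * reducing by it replaces X^128 with X^7 + X^2 + X + 1,
                 * which, in the bit-reversed, word-sliced layout used
                 * here, appears as the shift counts 1, 2 and 7 below
                 * (with 31, 30 and 25 carrying the shifted-out bits
                 * into the adjacent word).
                 */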
                for (i = 0; i < 4; i ++) {
                        uint32_t lw;

                        lw = zw[i];
                        zw[i + 4] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7);
                        zw[i + 3] ^= (lw << 31) ^ (lw << 30) ^ (lw << 25);
                }
                memcpy(yw, zw + 4, sizeof yw);
        }

        /*
         * Encode back the result.
         */
        br_enc32be(yb, yw[3]);
        br_enc32be(yb + 4, yw[2]);
        br_enc32be(yb + 8, yw[1]);
        br_enc32be(yb + 12, yw[0]);
}