2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 * This is the GHASH implementation that leverages the pclmulqdq opcode
29 * (from the AES-NI instructions).
35 #if BR_AES_X86NI_GCC_OLD
36 #pragma GCC push_options
37 #pragma GCC target("sse2,ssse3,pclmul")
39 #include <tmmintrin.h>
40 #include <wmmintrin.h>
42 #if BR_AES_X86NI_GCC_OLD
43 #pragma GCC pop_options
52 * GHASH is defined over elements of GF(2^128) with "full little-endian"
53 * representation: leftmost byte is least significant, and, within each
54 * byte, leftmost _bit_ is least significant. The natural ordering in
55 * x86 is "mixed little-endian": bytes are ordered from least to most
56 * significant, but bits within a byte are in most-to-least significant
57 * order. Going to full little-endian representation would require
58 * reversing bits within each byte, which is doable but expensive.
60 * Instead, we go to full big-endian representation, by swapping bytes
61 * around, which is done with a single _mm_shuffle_epi8() opcode (it
62 * comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We
63 * can use a full big-endian representation because in a carryless
64 * multiplication, we have a nice bit reversal property:
66 * rev_128(x) * rev_128(y) = rev_255(x * y)
68 * So by using full big-endian, we still get the right result, except
69 * that it is right-shifted by 1 bit. The left-shift is relatively
70 * inexpensive, and it can be mutualised.
73 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
74 * values with bit precision, we have to break down values into 64-bit
75 * chunks. We number chunks from 0 to 3 in left to right order.
/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 * Used to precompute the operand for the Karatsuba middle product.
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)
/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx). k0 becomes the high half of kw,
 * k1 the low half (both inputs carry their payload in the low lane).
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)
/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 * Words are numbered 0 to 3 in left-to-right (most to least
 * significant) order; each word receives the top bit of the
 * next-lower word as its new bottom bit.
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)
/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 *
 * The modulus is x^128 + x^7 + x^2 + x + 1; since values are kept
 * in bit-reversed (big-endian) convention, folding a high word into
 * the low half uses right shifts by 1, 2 and 7 for the overlapping
 * part and left shifts by 63, 62 and 57 for the carried-over bits.
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)
/*
 * Square value kw into (dw,dx). Squaring in GF(2^128) needs only
 * two carryless products (no cross terms survive), followed by the
 * shift-and-reduce sequence common to all products here.
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = _mm_clmulepi64_si128(kw, kw, 0x11); \
		z3 = _mm_clmulepi64_si128(kw, kw, 0x00); \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)
164 /* see bearssl_hash.h */
165 BR_TARGET("ssse3,pclmul")
167 br_ghash_pclmul(void *y
, const void *h
, const void *data
, size_t len
)
169 const unsigned char *buf1
, *buf2
;
170 unsigned char tmp
[64];
172 __m128i yw
, h1w
, h1x
;
173 __m128i byteswap_index
;
176 * We split data into two chunks. First chunk starts at buf1
177 * and contains num4 blocks of 64-byte values. Second chunk
178 * starts at buf2 and contains num1 blocks of 16-byte values.
179 * We want the first chunk to be as large as possible.
184 buf2
= buf1
+ (num4
<< 6);
185 num1
= (len
+ 15) >> 4;
186 if ((len
& 15) != 0) {
187 memcpy(tmp
, buf2
, len
);
188 memset(tmp
+ len
, 0, (num1
<< 4) - len
);
193 * Constant value to perform endian conversion.
195 byteswap_index
= _mm_set_epi8(
196 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
201 yw
= _mm_loadu_si128(y
);
202 h1w
= _mm_loadu_si128(h
);
203 yw
= _mm_shuffle_epi8(yw
, byteswap_index
);
204 h1w
= _mm_shuffle_epi8(h1w
, byteswap_index
);
208 __m128i h2w
, h2x
, h3w
, h3x
, h4w
, h4x
;
209 __m128i t0
, t1
, t2
, t3
;
214 SQUARE_F128(h1w
, h2w
, h2x
);
217 * Compute h3 = h^3 = h*(h^2).
219 t1
= _mm_clmulepi64_si128(h1w
, h2w
, 0x11);
220 t3
= _mm_clmulepi64_si128(h1w
, h2w
, 0x00);
221 t2
= _mm_xor_si128(_mm_clmulepi64_si128(h1x
, h2x
, 0x00),
222 _mm_xor_si128(t1
, t3
));
223 t0
= _mm_shuffle_epi32(t1
, 0x0E);
224 t1
= _mm_xor_si128(t1
, _mm_shuffle_epi32(t2
, 0x0E));
225 t2
= _mm_xor_si128(t2
, _mm_shuffle_epi32(t3
, 0x0E));
226 SL_256(t0
, t1
, t2
, t3
);
227 REDUCE_F128(t0
, t1
, t2
, t3
);
228 PBK(t0
, t1
, h3w
, h3x
);
231 * Compute h4 = h^4 = (h^2)^2.
233 SQUARE_F128(h2w
, h4w
, h4x
);
235 while (num4
-- > 0) {
236 __m128i aw0
, aw1
, aw2
, aw3
;
237 __m128i ax0
, ax1
, ax2
, ax3
;
239 aw0
= _mm_loadu_si128((void *)(buf1
+ 0));
240 aw1
= _mm_loadu_si128((void *)(buf1
+ 16));
241 aw2
= _mm_loadu_si128((void *)(buf1
+ 32));
242 aw3
= _mm_loadu_si128((void *)(buf1
+ 48));
243 aw0
= _mm_shuffle_epi8(aw0
, byteswap_index
);
244 aw1
= _mm_shuffle_epi8(aw1
, byteswap_index
);
245 aw2
= _mm_shuffle_epi8(aw2
, byteswap_index
);
246 aw3
= _mm_shuffle_epi8(aw3
, byteswap_index
);
249 aw0
= _mm_xor_si128(aw0
, yw
);
257 _mm_clmulepi64_si128(aw0
, h4w
, 0x11),
258 _mm_clmulepi64_si128(aw1
, h3w
, 0x11)),
260 _mm_clmulepi64_si128(aw2
, h2w
, 0x11),
261 _mm_clmulepi64_si128(aw3
, h1w
, 0x11)));
264 _mm_clmulepi64_si128(aw0
, h4w
, 0x00),
265 _mm_clmulepi64_si128(aw1
, h3w
, 0x00)),
267 _mm_clmulepi64_si128(aw2
, h2w
, 0x00),
268 _mm_clmulepi64_si128(aw3
, h1w
, 0x00)));
271 _mm_clmulepi64_si128(ax0
, h4x
, 0x00),
272 _mm_clmulepi64_si128(ax1
, h3x
, 0x00)),
274 _mm_clmulepi64_si128(ax2
, h2x
, 0x00),
275 _mm_clmulepi64_si128(ax3
, h1x
, 0x00)));
276 t2
= _mm_xor_si128(t2
, _mm_xor_si128(t1
, t3
));
277 t0
= _mm_shuffle_epi32(t1
, 0x0E);
278 t1
= _mm_xor_si128(t1
, _mm_shuffle_epi32(t2
, 0x0E));
279 t2
= _mm_xor_si128(t2
, _mm_shuffle_epi32(t3
, 0x0E));
280 SL_256(t0
, t1
, t2
, t3
);
281 REDUCE_F128(t0
, t1
, t2
, t3
);
282 yw
= _mm_unpacklo_epi64(t1
, t0
);
286 while (num1
-- > 0) {
288 __m128i t0
, t1
, t2
, t3
;
290 aw
= _mm_loadu_si128((void *)buf2
);
291 aw
= _mm_shuffle_epi8(aw
, byteswap_index
);
294 aw
= _mm_xor_si128(aw
, yw
);
297 t1
= _mm_clmulepi64_si128(aw
, h1w
, 0x11);
298 t3
= _mm_clmulepi64_si128(aw
, h1w
, 0x00);
299 t2
= _mm_clmulepi64_si128(ax
, h1x
, 0x00);
300 t2
= _mm_xor_si128(t2
, _mm_xor_si128(t1
, t3
));
301 t0
= _mm_shuffle_epi32(t1
, 0x0E);
302 t1
= _mm_xor_si128(t1
, _mm_shuffle_epi32(t2
, 0x0E));
303 t2
= _mm_xor_si128(t2
, _mm_shuffle_epi32(t3
, 0x0E));
304 SL_256(t0
, t1
, t2
, t3
);
305 REDUCE_F128(t0
, t1
, t2
, t3
);
306 yw
= _mm_unpacklo_epi64(t1
, t0
);
309 yw
= _mm_shuffle_epi8(yw
, byteswap_index
);
310 _mm_storeu_si128(y
, yw
);
/*
 * Test CPU support for PCLMULQDQ. Returns 1 if supported, 0
 * otherwise (including when no detection method is available
 * for the current compiler).
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   PCLMULQDQ support
	 */
#define MASK   0x00000002

#if BR_AES_X86NI_GCC
	unsigned eax, ebx, ecx, edx;

	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
		return (ecx & MASK) == MASK;
	} else {
		return 0;
	}
#elif BR_AES_X86NI_MSC
	int info[4];

	__cpuid(info, 1);
	return ((uint32_t)info[2] & MASK) == MASK;
#else
	return 0;
#endif

#undef MASK
}
345 /* see bearssl_hash.h */
347 br_ghash_pclmul_get(void)
349 return pclmul_supported() ? &br_ghash_pclmul
: 0;
354 /* see bearssl_hash.h */
356 br_ghash_pclmul_get(void)