/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define BR_ENABLE_INTRINSICS   1

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

/*
 * Test CPU support for PCLMULQDQ.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   pclmulqdq support
	 */
	return br_cpuid(0, 0, 0x00000002, 0);
}
48 /* see bearssl_hash.h */
50 br_ghash_pclmul_get(void)
52 return pclmul_supported() ? &br_ghash_pclmul
: 0;
/*
 * GHASH is defined over elements of GF(2^128) with "full little-endian"
 * representation: leftmost byte is least significant, and, within each
 * byte, leftmost _bit_ is least significant. The natural ordering in
 * x86 is "mixed little-endian": bytes are ordered from least to most
 * significant, but bits within a byte are in most-to-least significant
 * order. Going to full little-endian representation would require
 * reversing bits within each byte, which is doable but expensive.
 *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() opcode (it
 * comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
 * can use a full big-endian representation because in a carryless
 * multiplication, we have a nice bit reversal property:
 *
 *   rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The left-shift is relatively
 * inexpensive, and it can be mutualised.
 *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
 */
/*
 * Byte-swap a complete 128-bit value. This normally uses
 * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
 * However, this crashes old Clang versions, so, for Clang before 3.8,
 * we use an alternate (and less efficient) version.
 */
#if BR_CLANG && !BR_CLANG_3_8
#define BYTESWAP_DECL
#define BYTESWAP_PREP   (void)0
#define BYTESWAP(x)   do { \
		__m128i byteswap1, byteswap2; \
		byteswap1 = (x); \
		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
	} while (0)
#else
#define BYTESWAP_DECL   __m128i byteswap_index;
#define BYTESWAP_PREP   do { \
		byteswap_index = _mm_set_epi8( \
			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
	} while (0)
#define BYTESWAP(x)   do { \
		(x) = _mm_shuffle_epi8((x), byteswap_index); \
	} while (0)
#endif
/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 */
#if BR_CLANG
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	/* Carryless multiply of the low 64-bit halves of x and y. */
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	/* Carryless multiply of the high 64-bit halves of x and y. */
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif
/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)
/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx).
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)
/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)
/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 *
 * The modulus for GHASH is 1 + x + x^2 + x^7 + x^128; folding the
 * upper 128 bits (x2:x3) into the lower half thus uses shifts by
 * 1, 2 and 7 bits (and their 64-bit-word complements 63, 62, 57).
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)
/*
 * Square value kw into (dw,dx).
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = pclmulqdq11(kw, kw); \
		z3 = pclmulqdq00(kw, kw); \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)
230 /* see bearssl_hash.h */
231 BR_TARGET("ssse3,pclmul")
233 br_ghash_pclmul(void *y
, const void *h
, const void *data
, size_t len
)
235 const unsigned char *buf1
, *buf2
;
236 unsigned char tmp
[64];
238 __m128i yw
, h1w
, h1x
;
242 * We split data into two chunks. First chunk starts at buf1
243 * and contains num4 blocks of 64-byte values. Second chunk
244 * starts at buf2 and contains num1 blocks of 16-byte values.
245 * We want the first chunk to be as large as possible.
250 buf2
= buf1
+ (num4
<< 6);
251 num1
= (len
+ 15) >> 4;
252 if ((len
& 15) != 0) {
253 memcpy(tmp
, buf2
, len
);
254 memset(tmp
+ len
, 0, (num1
<< 4) - len
);
259 * Preparatory step for endian conversions.
266 yw
= _mm_loadu_si128(y
);
267 h1w
= _mm_loadu_si128(h
);
273 __m128i h2w
, h2x
, h3w
, h3x
, h4w
, h4x
;
274 __m128i t0
, t1
, t2
, t3
;
279 SQUARE_F128(h1w
, h2w
, h2x
);
282 * Compute h3 = h^3 = h*(h^2).
284 t1
= pclmulqdq11(h1w
, h2w
);
285 t3
= pclmulqdq00(h1w
, h2w
);
286 t2
= _mm_xor_si128(pclmulqdq00(h1x
, h2x
),
287 _mm_xor_si128(t1
, t3
));
288 t0
= _mm_shuffle_epi32(t1
, 0x0E);
289 t1
= _mm_xor_si128(t1
, _mm_shuffle_epi32(t2
, 0x0E));
290 t2
= _mm_xor_si128(t2
, _mm_shuffle_epi32(t3
, 0x0E));
291 SL_256(t0
, t1
, t2
, t3
);
292 REDUCE_F128(t0
, t1
, t2
, t3
);
293 PBK(t0
, t1
, h3w
, h3x
);
296 * Compute h4 = h^4 = (h^2)^2.
298 SQUARE_F128(h2w
, h4w
, h4x
);
300 while (num4
-- > 0) {
301 __m128i aw0
, aw1
, aw2
, aw3
;
302 __m128i ax0
, ax1
, ax2
, ax3
;
304 aw0
= _mm_loadu_si128((void *)(buf1
+ 0));
305 aw1
= _mm_loadu_si128((void *)(buf1
+ 16));
306 aw2
= _mm_loadu_si128((void *)(buf1
+ 32));
307 aw3
= _mm_loadu_si128((void *)(buf1
+ 48));
314 aw0
= _mm_xor_si128(aw0
, yw
);
322 pclmulqdq11(aw0
, h4w
),
323 pclmulqdq11(aw1
, h3w
)),
325 pclmulqdq11(aw2
, h2w
),
326 pclmulqdq11(aw3
, h1w
)));
329 pclmulqdq00(aw0
, h4w
),
330 pclmulqdq00(aw1
, h3w
)),
332 pclmulqdq00(aw2
, h2w
),
333 pclmulqdq00(aw3
, h1w
)));
336 pclmulqdq00(ax0
, h4x
),
337 pclmulqdq00(ax1
, h3x
)),
339 pclmulqdq00(ax2
, h2x
),
340 pclmulqdq00(ax3
, h1x
)));
341 t2
= _mm_xor_si128(t2
, _mm_xor_si128(t1
, t3
));
342 t0
= _mm_shuffle_epi32(t1
, 0x0E);
343 t1
= _mm_xor_si128(t1
, _mm_shuffle_epi32(t2
, 0x0E));
344 t2
= _mm_xor_si128(t2
, _mm_shuffle_epi32(t3
, 0x0E));
345 SL_256(t0
, t1
, t2
, t3
);
346 REDUCE_F128(t0
, t1
, t2
, t3
);
347 yw
= _mm_unpacklo_epi64(t1
, t0
);
351 while (num1
-- > 0) {
353 __m128i t0
, t1
, t2
, t3
;
355 aw
= _mm_loadu_si128((void *)buf2
);
359 aw
= _mm_xor_si128(aw
, yw
);
362 t1
= pclmulqdq11(aw
, h1w
);
363 t3
= pclmulqdq00(aw
, h1w
);
364 t2
= pclmulqdq00(ax
, h1x
);
365 t2
= _mm_xor_si128(t2
, _mm_xor_si128(t1
, t3
));
366 t0
= _mm_shuffle_epi32(t1
, 0x0E);
367 t1
= _mm_xor_si128(t1
, _mm_shuffle_epi32(t2
, 0x0E));
368 t2
= _mm_xor_si128(t2
, _mm_shuffle_epi32(t3
, 0x0E));
369 SL_256(t0
, t1
, t2
, t3
);
370 REDUCE_F128(t0
, t1
, t2
, t3
);
371 yw
= _mm_unpacklo_epi64(t1
, t0
);
375 _mm_storeu_si128(y
, yw
);
382 /* see bearssl_hash.h */
384 br_ghash_pclmul_get(void)