/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */
#if BR_AES_X86NI

#if BR_AES_X86NI_GCC
/* #pragma GCC target "sse2,ssse3,pclmul" */
#include <tmmintrin.h>
#include <wmmintrin.h>
#include <cpuid.h>
#endif

#if BR_AES_X86NI_MSC
#include <intrin.h>
#endif
/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	/*
	 * TODO: loop below processes one 16-byte word at a time. We
	 * could parallelize, using:
	 *   ((y+x0)*h+x1)*h = (y+x0)*(h^2) + x1*h
	 * i.e. precompute h^2, then handle two words at a time, mostly
	 * in parallel (this may extend to more words as well...).
	 */
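	/*
	 * (Added note.) The identity above is Horner evaluation
	 * regrouped: processing x0 then x1 sequentially computes
	 * ((y+x0)*h+x1)*h, and expanding the product gives
	 * (y+x0)*(h^2) + x1*h, so the two carry-less multiplications
	 * (by h^2 and by h) no longer depend on each other and can be
	 * issued back to back.
	 */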
	const unsigned char *buf;
	__m128i yx, hx;
	__m128i byteswap_index;
	__m128i h0, h1, h2;
	byteswap_index = _mm_set_epi8(
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
	yx = _mm_loadu_si128(y);
	hx = _mm_loadu_si128(h);
	yx = _mm_shuffle_epi8(yx, byteswap_index);
	hx = _mm_shuffle_epi8(hx, byteswap_index);
	/*
	 * We byte-swap y and h for full big-endian interpretation.
	 */
	h0 = hx;
	h1 = _mm_shuffle_epi32(hx, 0x0E);
	h2 = _mm_xor_si128(h0, h1);
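	/*
	 * (Added note.) This is the setup for a Karatsuba-style
	 * carry-less multiplication: with h = h1*2^64 + h0, the
	 * combined word h2 = h0 XOR h1 lets the loop get both cross
	 * products h0*y1 + h1*y0 out of a single pclmulqdq, because
	 * (h0+h1)*(y0+y1) = h0*y0 + h0*y1 + h1*y0 + h1*y1 in GF(2)[X],
	 * where addition is XOR.
	 */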
	buf = data;
	while (len > 0) {
		__m128i x, y0, y1, y2;
		__m128i t0, t1, t2, v0, v1, v2, v3;
		/*
		 * Load next 128-bit word. If there are not enough bytes
		 * for the next word, we pad it with zeros (as per the
		 * API for this function; it's also what is useful for
		 * implementation of GCM).
		 */
		if (len >= 16) {
			x = _mm_loadu_si128((const void *)buf);
			buf += 16;
			len -= 16;
		} else {
			unsigned char tmp[16];

			memcpy(tmp, buf, len);
			memset(tmp + len, 0, (sizeof tmp) - len);
			x = _mm_loadu_si128((void *)tmp);
			len = 0;
		}
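		/*
		 * (Added note.) Copying the tail into a zero-padded
		 * stack buffer means the final 16-byte load never reads
		 * past the end of the caller's buffer, and setting len
		 * to zero makes this the last loop iteration.
		 */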
		/*
		 * Specification of GCM is basically "full little-endian",
		 * i.e. leftmost bit is most significant; but decoding
		 * performed by _mm_loadu_si128 is "mixed endian" (leftmost
		 * _byte_ is least significant, but within each byte, the
		 * leftmost _bit_ is most significant). We could reverse
		 * bits in each byte; however, it is more efficient to
		 * swap the bytes and thus emulate full big-endian
		 * decoding.
		 *
		 * Big-endian works here because multiplication in
		 * GF[2](X) is "carry-less", thereby allowing reversal:
		 * if rev_n(x) consists in reversing the order of bits
		 * of x over n bits, then:
		 *   rev_128(A)*rev_128(B) = rev_255(A*B)
		 * so we can compute A*B by using rev_128(A) and rev_128(B),
		 * and an extra shift at the end (because 255 != 256). Bit
		 * reversal is exactly what happens when converting from
		 * full little-endian to full big-endian.
		 */
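		/*
		 * (Added illustration.) A tiny instance of the identity
		 * with 2-bit operands: A = 10 and B = 11 (i.e. X and X+1
		 * in GF(2)[X]) give A*B = 110 (X^2+X) over 3 bits. With
		 * bit-reversed operands, rev_2(A) = 01 and rev_2(B) = 11,
		 * the carry-less product is 011, which is rev_3(110).
		 * The same holds at 128 bits, with the product spread
		 * over 255 bits, hence the corrective shift further down.
		 */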
		x = _mm_shuffle_epi8(x, byteswap_index);
		yx = _mm_xor_si128(yx, x);
		/*
		 * We want the product to be broken down into four
		 * 64-bit values, because there is no SSE* opcode that
		 * can do a shift on a 128-bit value.
		 */
		y0 = yx;
		y1 = _mm_shuffle_epi32(yx, 0x0E);
		y2 = _mm_xor_si128(y0, y1);
		t0 = _mm_clmulepi64_si128(y0, h0, 0x00);
		t1 = _mm_clmulepi64_si128(yx, hx, 0x11);
		t2 = _mm_clmulepi64_si128(y2, h2, 0x00);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
		v0 = t0;
		v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
		v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		v3 = _mm_shuffle_epi32(t1, 0x0E);
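		/*
		 * (Added note.) Karatsuba recombination: t0 = y0*h0,
		 * t1 = y1*h1, and t2 (after the XOR above) is the middle
		 * term y0*h1 + y1*h0. After the lane moves, the low
		 * 64-bit half of each of v0..v3 holds one word of the
		 * 256-bit product t1*2^128 + t2*2^64 + t0, with v3 the
		 * highest word.
		 */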
		/*
		 * Do the corrective 1-bit shift (255->256).
		 */
		v3 = _mm_or_si128(
			_mm_slli_epi64(v3, 1),
			_mm_srli_epi64(v2, 63));
		v2 = _mm_or_si128(
			_mm_slli_epi64(v2, 1),
			_mm_srli_epi64(v1, 63));
		v1 = _mm_or_si128(
			_mm_slli_epi64(v1, 1),
			_mm_srli_epi64(v0, 63));
		v0 = _mm_slli_epi64(v0, 1);
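		/*
		 * (Added note.) Each (slli by 1, srli by 63) pair shifts
		 * one 64-bit word left by a bit while pulling in the top
		 * bit of the next lower word, so these four statements
		 * shift the whole 256-bit value v3:v2:v1:v0 left by one
		 * bit, as required by the rev_255 identity above.
		 */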
		/*
		 * Perform polynomial reduction into GF(2^128).
		 */
		v2 = _mm_xor_si128(
			v2,
			_mm_xor_si128(
				_mm_xor_si128(
					v0,
					_mm_srli_epi64(v0, 1)),
				_mm_xor_si128(
					_mm_srli_epi64(v0, 2),
					_mm_srli_epi64(v0, 7))));
		v1 = _mm_xor_si128(
			_mm_xor_si128(
				v1,
				_mm_slli_epi64(v0, 63)),
			_mm_xor_si128(
				_mm_slli_epi64(v0, 62),
				_mm_slli_epi64(v0, 57)));
		v3 = _mm_xor_si128(
			v3,
			_mm_xor_si128(
				_mm_xor_si128(
					v1,
					_mm_srli_epi64(v1, 1)),
				_mm_xor_si128(
					_mm_srli_epi64(v1, 2),
					_mm_srli_epi64(v1, 7))));
		v2 = _mm_xor_si128(
			_mm_xor_si128(
				v2,
				_mm_slli_epi64(v1, 63)),
			_mm_xor_si128(
				_mm_slli_epi64(v1, 62),
				_mm_slli_epi64(v1, 57)));
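		/*
		 * (Added note.) The shift counts come from the GCM
		 * modulus X^128 + X^7 + X^2 + X + 1: in this bit-reversed
		 * representation, each bit of a low word is congruent to
		 * the XOR of the bits 121, 126, 127 and 128 positions
		 * above it, so v0 is folded onto v1/v2 and then v1 onto
		 * v2/v3, each fold being split into a right-shift part
		 * (1, 2, 7) and a left-shift part (63, 62, 57) to cross
		 * the 64-bit lane boundary.
		 */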
		/*
		 * We reduced toward the high words (v2 and v3), which
		 * are the new value for y.
		 */
		yx = _mm_unpacklo_epi64(v2, v3);
	}
	yx = _mm_shuffle_epi8(yx, byteswap_index);
	_mm_storeu_si128(y, yx);
}
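/*
 * (Added for illustration; not part of the original file, so it is
 * fenced out of the build.) A minimal portable GHASH sketch using the
 * textbook bit-at-a-time multiplication in GF(2^128) from the GCM
 * specification (most significant coefficient in the leftmost bit of
 * the first byte, reduction constant 0xE1). The names gf128_mul_ref
 * and ghash_ref are hypothetical; such a function is handy for
 * cross-checking the pclmulqdq path above on test vectors.
 */
#if 0
static void
gf128_mul_ref(unsigned char *r,
	const unsigned char *a, const unsigned char *b)
{
	unsigned char z[16], v[16];
	int i, j;

	memset(z, 0, sizeof z);
	memcpy(v, b, sizeof v);
	for (i = 0; i < 128; i ++) {
		unsigned carry;

		/* If bit i of a (GCM bit order) is set, add v to z. */
		if ((a[i >> 3] >> (7 - (i & 7))) & 1) {
			for (j = 0; j < 16; j ++) {
				z[j] ^= v[j];
			}
		}

		/*
		 * Multiply v by X: shift right by one bit in GCM bit
		 * order; a dropped bit folds back as 0xE1 since
		 * X^128 = X^7 + X^2 + X + 1.
		 */
		carry = v[15] & 1;
		for (j = 15; j > 0; j --) {
			v[j] = (v[j] >> 1) | (v[j - 1] << 7);
		}
		v[0] >>= 1;
		if (carry) {
			v[0] ^= 0xE1;
		}
	}
	memcpy(r, z, sizeof z);
}

static void
ghash_ref(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf;
	unsigned char *yb;

	buf = data;
	yb = y;
	while (len > 0) {
		unsigned char tmp[16];
		size_t u, clen;

		/* Zero-pad the last block, as br_ghash_pclmul does. */
		clen = len < 16 ? len : 16;
		memcpy(tmp, buf, clen);
		memset(tmp + clen, 0, (sizeof tmp) - clen);
		for (u = 0; u < 16; u ++) {
			tmp[u] ^= yb[u];
		}
		/* y <- (y XOR block) * h */
		gf128_mul_ref(yb, tmp, h);
		buf += clen;
		len -= clen;
	}
}
#endif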
/*
 * Test CPU support for PCLMULQDQ.
 */
static int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   PCLMULQDQ support
	 */
#define MASK   0x00000002
#if BR_AES_X86NI_GCC
	unsigned eax, ebx, ecx, edx;

	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
		return (ecx & MASK) == MASK;
	} else {
		return 0;
	}
#elif BR_AES_X86NI_MSC
	int info[4];

	__cpuid(info, 1);
	return ((uint32_t)info[2] & MASK) == MASK;
#else
	return 0;
#endif

#undef MASK
}
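/*
 * (Added note.) CPUID leaf 1 returns the feature flags in ECX; bit 1
 * (mask 0x00000002) is PCLMULQDQ. When CPUID itself is unavailable,
 * the function conservatively reports no support.
 */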
/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}
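/*
 * (Added usage sketch.) A caller would typically try this backend
 * first and fall back to a portable GHASH when it returns 0, e.g.
 * with br_ghash_ctmul, which is defined elsewhere in BearSSL:
 *
 *    br_ghash gh;
 *
 *    gh = br_ghash_pclmul_get();
 *    if (gh == 0) {
 *        gh = &br_ghash_ctmul;
 *    }
 *    gh(y, h, data, len);
 */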
#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return 0;
}

#endif