/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */
#if BR_AES_X86NI

#if BR_AES_X86NI_GCC
/* #pragma GCC target "sse2,ssse3,pclmul" */
#include <tmmintrin.h>
#include <wmmintrin.h>
#include <cpuid.h>
#endif

#if BR_AES_X86NI_MSC
#include <intrin.h>
#endif
/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	/*
	 * TODO: loop below processes one 16-byte word at a time. We
	 * could parallelize, using:
	 *   ((y+x0)*h+x1)*h = (y+x0)*(h^2) + x1*h
	 * i.e. precompute h^2, then handle two words at a time, mostly
	 * in parallel (this may extend to more words as well...).
	 */
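	/*
	 * (Added note.) The identity above is Horner evaluation
	 * regrouped: processing x0 then x1 sequentially computes
	 * ((y+x0)*h+x1)*h, and expanding the product gives
	 * (y+x0)*(h^2) + x1*h, so the two carry-less multiplications
	 * (by h^2 and by h) no longer depend on each other and can be
	 * issued back to back.
	 */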
	const unsigned char *buf;
	__m128i yx, hx;
	__m128i byteswap_index;
	__m128i h0, h1, h2;
	byteswap_index = _mm_set_epi8(
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
	yx = _mm_loadu_si128(y);
	hx = _mm_loadu_si128(h);
	yx = _mm_shuffle_epi8(yx, byteswap_index);
	hx = _mm_shuffle_epi8(hx, byteswap_index);
	/*
	 * We byte-swap y and h for full big-endian interpretation.
	 */
	h0 = hx;
	h1 = _mm_shuffle_epi32(hx, 0x0E);
	h2 = _mm_xor_si128(h0, h1);
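	/*
	 * (Added note.) This is the setup for a Karatsuba-style
	 * carry-less multiplication: with h = h1*2^64 + h0, the
	 * combined word h2 = h0 XOR h1 lets the loop get both cross
	 * products h0*y1 + h1*y0 out of a single pclmulqdq, because
	 * (h0+h1)*(y0+y1) = h0*y0 + h0*y1 + h1*y0 + h1*y1 in GF(2)[X],
	 * where addition is XOR.
	 */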
	buf = data;
	while (len > 0) {
		__m128i x, y0, y1, y2;
		__m128i t0, t1, t2, v0, v1, v2, v3;
		/*
		 * Load next 128-bit word. If there are not enough bytes
		 * for the next word, we pad it with zeros (as per the
		 * API for this function; it's also what is useful for
		 * implementation of GCM).
		 */
		if (len >= 16) {
			x = _mm_loadu_si128((const void *)buf);
			buf += 16;
			len -= 16;
		} else {
			unsigned char tmp[16];

			memcpy(tmp, buf, len);
			memset(tmp + len, 0, (sizeof tmp) - len);
			x = _mm_loadu_si128((void *)tmp);
			len = 0;
		}
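		/*
		 * (Added note.) Copying the tail into a zero-padded
		 * stack buffer means the final 16-byte load never reads
		 * past the end of the caller's buffer, and setting len
		 * to zero makes this the last loop iteration.
		 */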
		/*
		 * Specification of GCM is basically "full little-endian",
		 * i.e. leftmost bit is most significant; but decoding
		 * performed by _mm_loadu_si128 is "mixed endian" (leftmost
		 * _byte_ is least significant, but within each byte, the
		 * leftmost _bit_ is most significant). We could reverse
		 * bits in each byte; however, it is more efficient to
		 * swap the bytes and thus emulate full big-endian
		 * decoding.
		 *
		 * Big-endian works here because multiplication in
		 * GF[2](X) is "carry-less", thereby allowing reversal:
		 * if rev_n(x) consists in reversing the order of bits
		 * of x over n bits, then:
		 *   rev_128(A)*rev_128(B) = rev_255(A*B)
		 * so we can compute A*B by using rev_128(A) and rev_128(B),
		 * and an extra shift at the end (because 255 != 256). Bit
		 * reversal is exactly what happens when converting from
		 * full little-endian to full big-endian.
		 */
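		/*
		 * (Added illustration.) A tiny instance of the identity
		 * with 2-bit operands: A = 10 and B = 11 (i.e. X and X+1
		 * in GF(2)[X]) give A*B = 110 (X^2+X) over 3 bits. With
		 * bit-reversed operands, rev_2(A) = 01 and rev_2(B) = 11,
		 * the carry-less product is 011, which is rev_3(110).
		 * The same holds at 128 bits, with the product spread
		 * over 255 bits, hence the corrective shift further down.
		 */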
		x = _mm_shuffle_epi8(x, byteswap_index);
		yx = _mm_xor_si128(yx, x);
		/*
		 * We want the product to be broken down into four
		 * 64-bit values, because there is no SSE* opcode that
		 * can do a shift on a 128-bit value.
		 */
		y0 = yx;
		y1 = _mm_shuffle_epi32(yx, 0x0E);
		y2 = _mm_xor_si128(y0, y1);
		t0 = _mm_clmulepi64_si128(y0, h0, 0x00);
		t1 = _mm_clmulepi64_si128(yx, hx, 0x11);
		t2 = _mm_clmulepi64_si128(y2, h2, 0x00);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
		v0 = t0;
		v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
		v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		v3 = _mm_shuffle_epi32(t1, 0x0E);
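		/*
		 * (Added note.) Karatsuba recombination: t0 = y0*h0,
		 * t1 = y1*h1, and t2 (after the XOR above) is the middle
		 * term y0*h1 + y1*h0. After the lane moves, the low
		 * 64-bit half of each of v0..v3 holds one word of the
		 * 256-bit product t1*2^128 + t2*2^64 + t0, with v3 the
		 * highest word.
		 */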
		/*
		 * Do the corrective 1-bit shift (255->256).
		 */
		v3 = _mm_or_si128(
			_mm_slli_epi64(v3, 1),
			_mm_srli_epi64(v2, 63));
		v2 = _mm_or_si128(
			_mm_slli_epi64(v2, 1),
			_mm_srli_epi64(v1, 63));
		v1 = _mm_or_si128(
			_mm_slli_epi64(v1, 1),
			_mm_srli_epi64(v0, 63));
		v0 = _mm_slli_epi64(v0, 1);
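		/*
		 * (Added note.) Each (slli by 1, srli by 63) pair shifts
		 * one 64-bit word left by a bit while pulling in the top
		 * bit of the next lower word, so these four statements
		 * shift the whole 256-bit value v3:v2:v1:v0 left by one
		 * bit, as required by the rev_255 identity above.
		 */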
		/*
		 * Perform polynomial reduction into GF(2^128).
		 */
		v2 = _mm_xor_si128(
			v2,
			_mm_xor_si128(
				_mm_xor_si128(
					v0,
					_mm_srli_epi64(v0, 1)),
				_mm_xor_si128(
					_mm_srli_epi64(v0, 2),
					_mm_srli_epi64(v0, 7))));
		v1 = _mm_xor_si128(
			_mm_xor_si128(
				v1,
				_mm_slli_epi64(v0, 63)),
			_mm_xor_si128(
				_mm_slli_epi64(v0, 62),
				_mm_slli_epi64(v0, 57)));
		v3 = _mm_xor_si128(
			v3,
			_mm_xor_si128(
				_mm_xor_si128(
					v1,
					_mm_srli_epi64(v1, 1)),
				_mm_xor_si128(
					_mm_srli_epi64(v1, 2),
					_mm_srli_epi64(v1, 7))));
		v2 = _mm_xor_si128(
			_mm_xor_si128(
				v2,
				_mm_slli_epi64(v1, 63)),
			_mm_xor_si128(
				_mm_slli_epi64(v1, 62),
				_mm_slli_epi64(v1, 57)));
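		/*
		 * (Added note.) The shift counts come from the GCM
		 * modulus X^128 + X^7 + X^2 + X + 1: in this bit-reversed
		 * representation, each bit of a low word is congruent to
		 * the XOR of the bits 121, 126, 127 and 128 positions
		 * above it, so v0 is folded onto v1/v2 and then v1 onto
		 * v2/v3, each fold being split into a right-shift part
		 * (1, 2, 7) and a left-shift part (63, 62, 57) to cross
		 * the 64-bit lane boundary.
		 */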
		/*
		 * We reduced toward the high words (v2 and v3), which
		 * are the new value for y.
		 */
		yx = _mm_unpacklo_epi64(v2, v3);
	}
	yx = _mm_shuffle_epi8(yx, byteswap_index);
	_mm_storeu_si128(y, yx);
}
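/*
 * (Added for illustration; not part of the original file, so it is
 * fenced out of the build.) A minimal portable GHASH sketch using the
 * textbook bit-at-a-time multiplication in GF(2^128) from the GCM
 * specification (most significant coefficient in the leftmost bit of
 * the first byte, reduction constant 0xE1). The names gf128_mul_ref
 * and ghash_ref are hypothetical; such a function is handy for
 * cross-checking the pclmulqdq path above on test vectors.
 */
#if 0
static void
gf128_mul_ref(unsigned char *r,
	const unsigned char *a, const unsigned char *b)
{
	unsigned char z[16], v[16];
	int i, j;

	memset(z, 0, sizeof z);
	memcpy(v, b, sizeof v);
	for (i = 0; i < 128; i ++) {
		unsigned carry;

		/* If bit i of a (GCM bit order) is set, add v to z. */
		if ((a[i >> 3] >> (7 - (i & 7))) & 1) {
			for (j = 0; j < 16; j ++) {
				z[j] ^= v[j];
			}
		}

		/*
		 * Multiply v by X: shift right by one bit in GCM bit
		 * order; a dropped bit folds back as 0xE1 since
		 * X^128 = X^7 + X^2 + X + 1.
		 */
		carry = v[15] & 1;
		for (j = 15; j > 0; j --) {
			v[j] = (v[j] >> 1) | (v[j - 1] << 7);
		}
		v[0] >>= 1;
		if (carry) {
			v[0] ^= 0xE1;
		}
	}
	memcpy(r, z, sizeof z);
}

static void
ghash_ref(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf;
	unsigned char *yb;

	buf = data;
	yb = y;
	while (len > 0) {
		unsigned char tmp[16];
		size_t u, clen;

		/* Zero-pad the last block, as br_ghash_pclmul does. */
		clen = len < 16 ? len : 16;
		memcpy(tmp, buf, clen);
		memset(tmp + clen, 0, (sizeof tmp) - clen);
		for (u = 0; u < 16; u ++) {
			tmp[u] ^= yb[u];
		}
		/* y <- (y XOR block) * h */
		gf128_mul_ref(yb, tmp, h);
		buf += clen;
		len -= clen;
	}
}
#endif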
/*
 * Test CPU support for PCLMULQDQ.
 */
static int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   PCLMULQDQ support
	 */
#define MASK   0x00000002
#if BR_AES_X86NI_GCC
	unsigned eax, ebx, ecx, edx;

	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
		return (ecx & MASK) == MASK;
	} else {
		return 0;
	}
#elif BR_AES_X86NI_MSC
	int info[4];

	__cpuid(info, 1);
	return ((uint32_t)info[2] & MASK) == MASK;
#else
	return 0;
#endif

#undef MASK
}
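/*
 * (Added note.) CPUID leaf 1 returns the feature flags in ECX; bit 1
 * (mask 0x00000002) is PCLMULQDQ. When CPUID itself is unavailable,
 * the function conservatively reports no support.
 */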
/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}
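/*
 * (Added usage sketch.) A caller would typically try this backend
 * first and fall back to a portable GHASH when it returns 0, e.g.
 * with br_ghash_ctmul, which is defined elsewhere in BearSSL:
 *
 *    br_ghash gh;
 *
 *    gh = br_ghash_pclmul_get();
 *    if (gh == 0) {
 *        gh = &br_ghash_ctmul;
 *    }
 *    gh(y, h, data, len);
 */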
#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return 0;
}

#endif