Added ChaCha20 implementation with SSE2 opcodes.
diff --git a/src/symcipher/chacha20_sse2.c b/src/symcipher/chacha20_sse2.c
new file mode 100644
index 0000000..0b32d51
--- /dev/null
+++ b/src/symcipher/chacha20_sse2.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * This file contains a ChaCha20 implementation that leverages SSE2
+ * opcodes for better performance.
+ */
+
+#if BR_SSE2
+
+#if BR_SSE2_GCC
+#include <emmintrin.h>
+#include <cpuid.h>
+#endif
+#if BR_SSE2_MSC
+#include <intrin.h>
+#endif
+
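+/*
+ * Implementation strategy: the sixteen 32-bit state words are kept in
+ * four 128-bit registers, one row of the ChaCha state per register.
+ * Each double-round is then a sequence of vector additions, XORs and
+ * rotations that apply the quarter-round to all four columns (even
+ * round) or all four diagonals (odd round) in parallel.
+ */
+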
+/* see bearssl_block.h */
+BR_TARGET("sse2")
+uint32_t
+br_chacha20_sse2_run(const void *key,
+       const void *iv, uint32_t cc, void *data, size_t len)
+{
+       unsigned char *buf;
+       uint32_t ivtmp[4];
+       __m128i kw0, kw1;
+       __m128i iw, cw;
+       __m128i one;
+
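+       /*
+        * CW is the ChaCha20 constant "expand 32-byte k", read as four
+        * little-endian 32-bit words.
+        */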
+       static const uint32_t CW[] = {
+               0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+       };
+
+       buf = data;
+       kw0 = _mm_loadu_si128(key);
+       kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
+       ivtmp[0] = cc;
+       memcpy(ivtmp + 1, iv, 12);
+       iw = _mm_loadu_si128((const void *)ivtmp);
+       cw = _mm_loadu_si128((const void *)CW);
+       one = _mm_set_epi32(0, 0, 0, 1);
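+       /*
+        * The low 32-bit lane of iw is the block counter; "one" is 1 in
+        * that lane only, so adding it to iw after each block increments
+        * the counter without touching the nonce words.
+        */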
+
+       while (len > 0) {
+               /*
+                * sj contains state words 4*j to 4*j+3.
+                */
+               __m128i s0, s1, s2, s3;
+               int i;
+
+               s0 = cw;
+               s1 = kw0;
+               s2 = kw1;
+               s3 = iw;
+               for (i = 0; i < 10; i ++) {
+                       /*
+                        * Even round is straightforward application on
+                        * the state words.
+                        */
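+                       /*
+                        * In scalar notation, each of the four add/XOR/
+                        * rotate groups below is one line of the ChaCha
+                        * quarter-round, applied to all columns at once:
+                        *
+                        *   a += b; d ^= a; d <<<= 16;
+                        *   c += d; b ^= c; b <<<= 12;
+                        *   a += b; d ^= a; d <<<=  8;
+                        *   c += d; b ^= c; b <<<=  7;
+                        *
+                        * with a, b, c, d taken from s0, s1, s2, s3, and
+                        * "<<<" denoting left rotation.
+                        */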
+                       s0 = _mm_add_epi32(s0, s1);
+                       s3 = _mm_xor_si128(s3, s0);
+                       s3 = _mm_or_si128(
+                               _mm_slli_epi32(s3, 16),
+                               _mm_srli_epi32(s3, 16));
+
+                       s2 = _mm_add_epi32(s2, s3);
+                       s1 = _mm_xor_si128(s1, s2);
+                       s1 = _mm_or_si128(
+                               _mm_slli_epi32(s1, 12),
+                               _mm_srli_epi32(s1, 20));
+
+                       s0 = _mm_add_epi32(s0, s1);
+                       s3 = _mm_xor_si128(s3, s0);
+                       s3 = _mm_or_si128(
+                               _mm_slli_epi32(s3, 8),
+                               _mm_srli_epi32(s3, 24));
+
+                       s2 = _mm_add_epi32(s2, s3);
+                       s1 = _mm_xor_si128(s1, s2);
+                       s1 = _mm_or_si128(
+                               _mm_slli_epi32(s1, 7),
+                               _mm_srli_epi32(s1, 25));
+
+                       /*
+                        * For the odd round, we must rotate some state
+                        * words so that the computations apply to the
+                        * right combinations of words.
+                        */
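+                       /*
+                        * _mm_shuffle_epi32() immediates: 0x39 selects
+                        * source lanes (1,2,3,0), 0x4E selects (2,3,0,1)
+                        * and 0x93 selects (3,0,1,2). Afterwards, lane 0
+                        * of (s0,s1,s2,s3) holds state words 0, 5, 10, 15
+                        * (a diagonal), and similarly for lanes 1 to 3.
+                        */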
+                       s1 = _mm_shuffle_epi32(s1, 0x39);
+                       s2 = _mm_shuffle_epi32(s2, 0x4E);
+                       s3 = _mm_shuffle_epi32(s3, 0x93);
+
+                       s0 = _mm_add_epi32(s0, s1);
+                       s3 = _mm_xor_si128(s3, s0);
+                       s3 = _mm_or_si128(
+                               _mm_slli_epi32(s3, 16),
+                               _mm_srli_epi32(s3, 16));
+
+                       s2 = _mm_add_epi32(s2, s3);
+                       s1 = _mm_xor_si128(s1, s2);
+                       s1 = _mm_or_si128(
+                               _mm_slli_epi32(s1, 12),
+                               _mm_srli_epi32(s1, 20));
+
+                       s0 = _mm_add_epi32(s0, s1);
+                       s3 = _mm_xor_si128(s3, s0);
+                       s3 = _mm_or_si128(
+                               _mm_slli_epi32(s3, 8),
+                               _mm_srli_epi32(s3, 24));
+
+                       s2 = _mm_add_epi32(s2, s3);
+                       s1 = _mm_xor_si128(s1, s2);
+                       s1 = _mm_or_si128(
+                               _mm_slli_epi32(s1, 7),
+                               _mm_srli_epi32(s1, 25));
+
+                       /*
+                        * After the odd round, we rotate back the values
+                        * to undo the rotate at the start of the odd round.
+                        */
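+                       /*
+                        * 0x39 and 0x93 are mutually inverse permutations,
+                        * and 0x4E is its own inverse.
+                        */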
+                       s1 = _mm_shuffle_epi32(s1, 0x93);
+                       s2 = _mm_shuffle_epi32(s2, 0x4E);
+                       s3 = _mm_shuffle_epi32(s3, 0x39);
+               }
+
+               /*
+                * Addition with the initial state.
+                */
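+               /*
+                * The keystream block is the permuted state plus the
+                * input state (the usual ChaCha feed-forward).
+                */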
+               s0 = _mm_add_epi32(s0, cw);
+               s1 = _mm_add_epi32(s1, kw0);
+               s2 = _mm_add_epi32(s2, kw1);
+               s3 = _mm_add_epi32(s3, iw);
+
+               /*
+                * Increment block counter.
+                */
+               iw = _mm_add_epi32(iw, one);
+
+               /*
+                * XOR final state with the data.
+                */
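+               /*
+                * A final partial block goes through a stack buffer and a
+                * byte-by-byte XOR; full blocks use unaligned 128-bit
+                * loads and stores.
+                */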
+               if (len < 64) {
+                       unsigned char tmp[64];
+                       size_t u;
+
+                       _mm_storeu_si128((void *)(tmp +  0), s0);
+                       _mm_storeu_si128((void *)(tmp + 16), s1);
+                       _mm_storeu_si128((void *)(tmp + 32), s2);
+                       _mm_storeu_si128((void *)(tmp + 48), s3);
+                       for (u = 0; u < len; u ++) {
+                               buf[u] ^= tmp[u];
+                       }
+                       break;
+               } else {
+                       __m128i b0, b1, b2, b3;
+
+                       b0 = _mm_loadu_si128((const void *)(buf +  0));
+                       b1 = _mm_loadu_si128((const void *)(buf + 16));
+                       b2 = _mm_loadu_si128((const void *)(buf + 32));
+                       b3 = _mm_loadu_si128((const void *)(buf + 48));
+                       b0 = _mm_xor_si128(b0, s0);
+                       b1 = _mm_xor_si128(b1, s1);
+                       b2 = _mm_xor_si128(b2, s2);
+                       b3 = _mm_xor_si128(b3, s3);
+                       _mm_storeu_si128((void *)(buf +  0), b0);
+                       _mm_storeu_si128((void *)(buf + 16), b1);
+                       _mm_storeu_si128((void *)(buf + 32), b2);
+                       _mm_storeu_si128((void *)(buf + 48), b3);
+                       buf += 64;
+                       len -= 64;
+               }
+       }
+
+       /*
+        * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
+        * raw SSE2, thus we use _mm_extract_epi16().
+        */
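+       /*
+        * Lane 0 of iw is the counter word (already incremented past the
+        * last processed block); its two 16-bit halves are extracted and
+        * recombined into the returned 32-bit value.
+        */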
+       return (uint32_t)_mm_extract_epi16(iw, 0)
+               | ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
+}
+
+/* see bearssl_block.h */
+br_chacha20_run
+br_chacha20_sse2_get(void)
+{
+       /*
+        * If using 64-bit mode, then SSE2 opcodes should be automatically
+        * available, since they are part of the ABI.
+        *
+        * In 32-bit mode, we use CPUID to detect the SSE2 feature.
+        */
+
+#if __x86_64__ || _M_X64
+
+       return &br_chacha20_sse2_run;
+
+#else
+
+       /*
+        * SSE2 support is indicated by bit 26 in EDX.
+        */
+#define MASK   0x04000000
+
+#if BR_SSE2_GCC
+       unsigned eax, ebx, ecx, edx;
+
+       if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+               if ((edx & MASK) == MASK) {
+                       return &br_chacha20_sse2_run;
+               }
+       }
+#elif BR_SSE2_MSC
+       int info[4];
+
+       __cpuid(info, 1);
+       if (((uint32_t)info[3] & MASK) == MASK) {
+               return &br_chacha20_sse2_run;
+       }
+#endif
+       return 0;
+
+#endif
+}
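+
+/*
+ * Usage sketch (illustrative): callers obtain the function pointer and
+ * fall back to the portable constant-time implementation when SSE2 is
+ * not available:
+ *
+ *   br_chacha20_run bc;
+ *
+ *   bc = br_chacha20_sse2_get();
+ *   if (bc == 0) {
+ *           bc = &br_chacha20_ct_run;
+ *   }
+ *   cc = bc(key, iv, cc, data, len);
+ */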
+
+#else
+
+/* see bearssl_block.h */
+br_chacha20_run
+br_chacha20_sse2_get(void)
+{
+       return 0;
+}
+
+#endif