From 24c6f09bf83015e04e16666e8a5fb66e75967e0d Mon Sep 17 00:00:00 2001 From: Thomas Pornin Date: Wed, 26 Jul 2017 15:58:01 +0200 Subject: [PATCH] Added ChaCha20 implementation with SSE2 opcodes. --- inc/bearssl_block.h | 50 ++++++ mk/Rules.mk | 5 +- mk/mkrules.sh | 1 + src/config.h | 10 ++ src/inner.h | 42 ++++- src/ssl/ssl_engine_default_chapol.c | 24 ++- src/symcipher/chacha20_sse2.c | 257 ++++++++++++++++++++++++++++ test/test_crypto.c | 40 ++++- test/test_speed.c | 17 +- 9 files changed, 429 insertions(+), 17 deletions(-) create mode 100644 src/symcipher/chacha20_sse2.c diff --git a/inc/bearssl_block.h b/inc/bearssl_block.h index 415bc28..24f09ac 100644 --- a/inc/bearssl_block.h +++ b/inc/bearssl_block.h @@ -259,9 +259,26 @@ extern "C" { * `chacha20_ct` is a straightforward implementation of ChaCha20 in * plain C; it is constant-time, small, and reasonably fast. * + * `chacha20_sse2` leverages SSE2 opcodes (on x86 architectures that + * support these opcodes). It is faster than `chacha20_ct`. + * * `poly1305_ctmul` is an implementation of the ChaCha20+Poly1305 AEAD * construction, where the Poly1305 part is performed with mixed 32-bit * multiplications (operands are 32-bit, result is 64-bit). + * + * `poly1305_ctmul32` implements ChaCha20+Poly1305 using pure 32-bit + * multiplications (32-bit operands, 32-bit result). It is slower than + * `poly1305_ctmul`, except on some specific architectures such as + * the ARM Cortex M0+. + * + * `poly1305_ctmulq` implements ChaCha20+Poly1305 with mixed 64-bit + * multiplications (operands are 64-bit, result is 128-bit) on 64-bit + * platforms that support such operations. + * + * `poly1305_i15` implements ChaCha20+Poly1305 with the generic "i15" + * big integer implementation. It is meant mostly for testing purposes, + * although it can help with saving a few hundred bytes of code footprint + * on systems where code size is scarce. */ /** @@ -1684,6 +1701,39 @@ typedef uint32_t (*br_chacha20_run)(const void *key, uint32_t br_chacha20_ct_run(const void *key, const void *iv, uint32_t cc, void *data, size_t len); +/** + * \brief ChaCha20 implementation (SSE2 code, constant-time). + * + * This implementation is available only on x86 platforms, depending on + * compiler support. Moreover, in 32-bit mode, it might not actually run, + * if the underlying hardware does not implement the SSE2 opcode (in + * 64-bit mode, SSE2 is part of the ABI, so if the code could be compiled + * at all, then it can run). Use `br_chacha20_sse2_get()` to safely obtain + * a pointer to that function. + * + * \see br_chacha20_run + * + * \param key secret key (32 bytes). + * \param iv IV (12 bytes). + * \param cc initial counter value. + * \param data data to encrypt or decrypt. + * \param len data length (in bytes). + */ +uint32_t br_chacha20_sse2_run(const void *key, + const void *iv, uint32_t cc, void *data, size_t len); + +/** + * \brief Obtain the `sse2` ChaCha20 implementation, if available. + * + * This function returns a pointer to `br_chacha20_sse2_run`, if + * that implementation was compiled in the library _and_ the SSE2 + * opcodes are available on the currently running CPU. If either of + * these conditions is not met, then this function returns `0`. + * + * \return the `sse2` ChaCha20 implementation, or `0`. + */ +br_chacha20_run br_chacha20_sse2_get(void); + /** * \brief Type for a ChaCha20+Poly1305 AEAD implementation. * diff --git a/mk/Rules.mk b/mk/Rules.mk index fdedc6e..694d04e 100644 --- a/mk/Rules.mk +++ b/mk/Rules.mk @@ -1,6 +1,6 @@ # Automatically generated rules. Use 'mkrules.sh' to modify/regenerate. -OBJ = $(OBJDIR)$Pgcm$O $(OBJDIR)$Pccopy$O $(OBJDIR)$Pdec16be$O $(OBJDIR)$Pdec16le$O $(OBJDIR)$Pdec32be$O $(OBJDIR)$Pdec32le$O $(OBJDIR)$Pdec64be$O $(OBJDIR)$Pdec64le$O $(OBJDIR)$Penc16be$O $(OBJDIR)$Penc16le$O $(OBJDIR)$Penc32be$O $(OBJDIR)$Penc32le$O $(OBJDIR)$Penc64be$O $(OBJDIR)$Penc64le$O $(OBJDIR)$Ppemdec$O $(OBJDIR)$Pec_all_m15$O $(OBJDIR)$Pec_all_m31$O $(OBJDIR)$Pec_c25519_i15$O $(OBJDIR)$Pec_c25519_i31$O $(OBJDIR)$Pec_c25519_m15$O $(OBJDIR)$Pec_c25519_m31$O $(OBJDIR)$Pec_curve25519$O $(OBJDIR)$Pec_default$O $(OBJDIR)$Pec_p256_m15$O $(OBJDIR)$Pec_p256_m31$O $(OBJDIR)$Pec_prime_i15$O $(OBJDIR)$Pec_prime_i31$O $(OBJDIR)$Pec_secp256r1$O $(OBJDIR)$Pec_secp384r1$O $(OBJDIR)$Pec_secp521r1$O $(OBJDIR)$Pecdsa_atr$O $(OBJDIR)$Pecdsa_default_sign_asn1$O $(OBJDIR)$Pecdsa_default_sign_raw$O $(OBJDIR)$Pecdsa_default_vrfy_asn1$O $(OBJDIR)$Pecdsa_default_vrfy_raw$O $(OBJDIR)$Pecdsa_i15_bits$O $(OBJDIR)$Pecdsa_i15_sign_asn1$O $(OBJDIR)$Pecdsa_i15_sign_raw$O $(OBJDIR)$Pecdsa_i15_vrfy_asn1$O $(OBJDIR)$Pecdsa_i15_vrfy_raw$O $(OBJDIR)$Pecdsa_i31_bits$O $(OBJDIR)$Pecdsa_i31_sign_asn1$O $(OBJDIR)$Pecdsa_i31_sign_raw$O $(OBJDIR)$Pecdsa_i31_vrfy_asn1$O $(OBJDIR)$Pecdsa_i31_vrfy_raw$O $(OBJDIR)$Pecdsa_rta$O $(OBJDIR)$Pdig_oid$O $(OBJDIR)$Pdig_size$O $(OBJDIR)$Pghash_ctmul$O $(OBJDIR)$Pghash_ctmul32$O $(OBJDIR)$Pghash_ctmul64$O $(OBJDIR)$Pghash_pclmul$O $(OBJDIR)$Pghash_pwr8$O $(OBJDIR)$Pmd5$O $(OBJDIR)$Pmd5sha1$O $(OBJDIR)$Pmultihash$O $(OBJDIR)$Psha1$O $(OBJDIR)$Psha2big$O $(OBJDIR)$Psha2small$O $(OBJDIR)$Pi15_add$O $(OBJDIR)$Pi15_bitlen$O $(OBJDIR)$Pi15_decmod$O $(OBJDIR)$Pi15_decode$O $(OBJDIR)$Pi15_decred$O $(OBJDIR)$Pi15_encode$O $(OBJDIR)$Pi15_fmont$O $(OBJDIR)$Pi15_iszero$O $(OBJDIR)$Pi15_modpow$O $(OBJDIR)$Pi15_modpow2$O $(OBJDIR)$Pi15_montmul$O $(OBJDIR)$Pi15_mulacc$O $(OBJDIR)$Pi15_muladd$O $(OBJDIR)$Pi15_ninv15$O $(OBJDIR)$Pi15_reduce$O $(OBJDIR)$Pi15_rshift$O $(OBJDIR)$Pi15_sub$O $(OBJDIR)$Pi15_tmont$O $(OBJDIR)$Pi31_add$O $(OBJDIR)$Pi31_bitlen$O $(OBJDIR)$Pi31_decmod$O $(OBJDIR)$Pi31_decode$O $(OBJDIR)$Pi31_decred$O $(OBJDIR)$Pi31_encode$O $(OBJDIR)$Pi31_fmont$O $(OBJDIR)$Pi31_iszero$O $(OBJDIR)$Pi31_modpow$O $(OBJDIR)$Pi31_modpow2$O $(OBJDIR)$Pi31_montmul$O $(OBJDIR)$Pi31_mulacc$O $(OBJDIR)$Pi31_muladd$O $(OBJDIR)$Pi31_ninv31$O $(OBJDIR)$Pi31_reduce$O $(OBJDIR)$Pi31_rshift$O $(OBJDIR)$Pi31_sub$O $(OBJDIR)$Pi31_tmont$O $(OBJDIR)$Pi32_add$O $(OBJDIR)$Pi32_bitlen$O $(OBJDIR)$Pi32_decmod$O $(OBJDIR)$Pi32_decode$O $(OBJDIR)$Pi32_decred$O $(OBJDIR)$Pi32_div32$O $(OBJDIR)$Pi32_encode$O $(OBJDIR)$Pi32_fmont$O $(OBJDIR)$Pi32_iszero$O $(OBJDIR)$Pi32_modpow$O $(OBJDIR)$Pi32_montmul$O $(OBJDIR)$Pi32_mulacc$O $(OBJDIR)$Pi32_muladd$O $(OBJDIR)$Pi32_ninv32$O $(OBJDIR)$Pi32_reduce$O $(OBJDIR)$Pi32_sub$O $(OBJDIR)$Pi32_tmont$O $(OBJDIR)$Pi62_modpow2$O $(OBJDIR)$Phmac$O $(OBJDIR)$Phmac_ct$O $(OBJDIR)$Phmac_drbg$O $(OBJDIR)$Prsa_default_pkcs1_sign$O $(OBJDIR)$Prsa_default_pkcs1_vrfy$O $(OBJDIR)$Prsa_default_priv$O $(OBJDIR)$Prsa_default_pub$O $(OBJDIR)$Prsa_i15_pkcs1_sign$O $(OBJDIR)$Prsa_i15_pkcs1_vrfy$O $(OBJDIR)$Prsa_i15_priv$O $(OBJDIR)$Prsa_i15_pub$O $(OBJDIR)$Prsa_i31_pkcs1_sign$O $(OBJDIR)$Prsa_i31_pkcs1_vrfy$O $(OBJDIR)$Prsa_i31_priv$O $(OBJDIR)$Prsa_i31_pub$O $(OBJDIR)$Prsa_i32_pkcs1_sign$O $(OBJDIR)$Prsa_i32_pkcs1_vrfy$O $(OBJDIR)$Prsa_i32_priv$O $(OBJDIR)$Prsa_i32_pub$O $(OBJDIR)$Prsa_i62_pkcs1_sign$O $(OBJDIR)$Prsa_i62_pkcs1_vrfy$O $(OBJDIR)$Prsa_i62_priv$O $(OBJDIR)$Prsa_i62_pub$O $(OBJDIR)$Prsa_pkcs1_sig_pad$O $(OBJDIR)$Prsa_pkcs1_sig_unpad$O $(OBJDIR)$Prsa_ssl_decrypt$O $(OBJDIR)$Pprf$O $(OBJDIR)$Pprf_md5sha1$O $(OBJDIR)$Pprf_sha256$O $(OBJDIR)$Pprf_sha384$O $(OBJDIR)$Pssl_ccert_single_ec$O $(OBJDIR)$Pssl_ccert_single_rsa$O $(OBJDIR)$Pssl_client$O $(OBJDIR)$Pssl_client_default_rsapub$O $(OBJDIR)$Pssl_client_full$O $(OBJDIR)$Pssl_engine$O $(OBJDIR)$Pssl_engine_default_aescbc$O $(OBJDIR)$Pssl_engine_default_aesgcm$O $(OBJDIR)$Pssl_engine_default_chapol$O $(OBJDIR)$Pssl_engine_default_descbc$O $(OBJDIR)$Pssl_engine_default_ec$O $(OBJDIR)$Pssl_engine_default_ecdsa$O $(OBJDIR)$Pssl_engine_default_rsavrfy$O $(OBJDIR)$Pssl_hashes$O $(OBJDIR)$Pssl_hs_client$O $(OBJDIR)$Pssl_hs_server$O $(OBJDIR)$Pssl_io$O $(OBJDIR)$Pssl_keyexport$O $(OBJDIR)$Pssl_lru$O $(OBJDIR)$Pssl_rec_cbc$O $(OBJDIR)$Pssl_rec_chapol$O $(OBJDIR)$Pssl_rec_gcm$O $(OBJDIR)$Pssl_scert_single_ec$O $(OBJDIR)$Pssl_scert_single_rsa$O $(OBJDIR)$Pssl_server$O $(OBJDIR)$Pssl_server_full_ec$O $(OBJDIR)$Pssl_server_full_rsa$O $(OBJDIR)$Pssl_server_mine2c$O $(OBJDIR)$Pssl_server_mine2g$O $(OBJDIR)$Pssl_server_minf2c$O $(OBJDIR)$Pssl_server_minf2g$O $(OBJDIR)$Pssl_server_minr2g$O $(OBJDIR)$Pssl_server_minu2g$O $(OBJDIR)$Pssl_server_minv2g$O $(OBJDIR)$Paes_big_cbcdec$O $(OBJDIR)$Paes_big_cbcenc$O $(OBJDIR)$Paes_big_ctr$O $(OBJDIR)$Paes_big_dec$O $(OBJDIR)$Paes_big_enc$O $(OBJDIR)$Paes_common$O $(OBJDIR)$Paes_ct$O $(OBJDIR)$Paes_ct64$O $(OBJDIR)$Paes_ct64_cbcdec$O $(OBJDIR)$Paes_ct64_cbcenc$O $(OBJDIR)$Paes_ct64_ctr$O $(OBJDIR)$Paes_ct64_dec$O $(OBJDIR)$Paes_ct64_enc$O $(OBJDIR)$Paes_ct_cbcdec$O $(OBJDIR)$Paes_ct_cbcenc$O $(OBJDIR)$Paes_ct_ctr$O $(OBJDIR)$Paes_ct_dec$O $(OBJDIR)$Paes_ct_enc$O $(OBJDIR)$Paes_pwr8$O $(OBJDIR)$Paes_pwr8_cbcdec$O $(OBJDIR)$Paes_pwr8_cbcenc$O $(OBJDIR)$Paes_pwr8_ctr$O $(OBJDIR)$Paes_small_cbcdec$O $(OBJDIR)$Paes_small_cbcenc$O $(OBJDIR)$Paes_small_ctr$O $(OBJDIR)$Paes_small_dec$O $(OBJDIR)$Paes_small_enc$O $(OBJDIR)$Paes_x86ni$O $(OBJDIR)$Paes_x86ni_cbcdec$O $(OBJDIR)$Paes_x86ni_cbcenc$O $(OBJDIR)$Paes_x86ni_ctr$O $(OBJDIR)$Pchacha20_ct$O $(OBJDIR)$Pdes_ct$O $(OBJDIR)$Pdes_ct_cbcdec$O $(OBJDIR)$Pdes_ct_cbcenc$O $(OBJDIR)$Pdes_support$O $(OBJDIR)$Pdes_tab$O $(OBJDIR)$Pdes_tab_cbcdec$O $(OBJDIR)$Pdes_tab_cbcenc$O $(OBJDIR)$Ppoly1305_ctmul$O $(OBJDIR)$Ppoly1305_ctmul32$O $(OBJDIR)$Ppoly1305_ctmulq$O $(OBJDIR)$Ppoly1305_i15$O $(OBJDIR)$Pskey_decoder$O $(OBJDIR)$Px509_decoder$O $(OBJDIR)$Px509_knownkey$O $(OBJDIR)$Px509_minimal$O $(OBJDIR)$Px509_minimal_full$O +OBJ = $(OBJDIR)$Pgcm$O $(OBJDIR)$Pccopy$O $(OBJDIR)$Pdec16be$O $(OBJDIR)$Pdec16le$O $(OBJDIR)$Pdec32be$O $(OBJDIR)$Pdec32le$O $(OBJDIR)$Pdec64be$O $(OBJDIR)$Pdec64le$O $(OBJDIR)$Penc16be$O $(OBJDIR)$Penc16le$O $(OBJDIR)$Penc32be$O $(OBJDIR)$Penc32le$O $(OBJDIR)$Penc64be$O $(OBJDIR)$Penc64le$O $(OBJDIR)$Ppemdec$O $(OBJDIR)$Pec_all_m15$O $(OBJDIR)$Pec_all_m31$O $(OBJDIR)$Pec_c25519_i15$O $(OBJDIR)$Pec_c25519_i31$O $(OBJDIR)$Pec_c25519_m15$O $(OBJDIR)$Pec_c25519_m31$O $(OBJDIR)$Pec_curve25519$O $(OBJDIR)$Pec_default$O $(OBJDIR)$Pec_p256_m15$O $(OBJDIR)$Pec_p256_m31$O $(OBJDIR)$Pec_prime_i15$O $(OBJDIR)$Pec_prime_i31$O $(OBJDIR)$Pec_secp256r1$O $(OBJDIR)$Pec_secp384r1$O $(OBJDIR)$Pec_secp521r1$O $(OBJDIR)$Pecdsa_atr$O $(OBJDIR)$Pecdsa_default_sign_asn1$O $(OBJDIR)$Pecdsa_default_sign_raw$O $(OBJDIR)$Pecdsa_default_vrfy_asn1$O $(OBJDIR)$Pecdsa_default_vrfy_raw$O $(OBJDIR)$Pecdsa_i15_bits$O $(OBJDIR)$Pecdsa_i15_sign_asn1$O $(OBJDIR)$Pecdsa_i15_sign_raw$O $(OBJDIR)$Pecdsa_i15_vrfy_asn1$O $(OBJDIR)$Pecdsa_i15_vrfy_raw$O $(OBJDIR)$Pecdsa_i31_bits$O $(OBJDIR)$Pecdsa_i31_sign_asn1$O $(OBJDIR)$Pecdsa_i31_sign_raw$O $(OBJDIR)$Pecdsa_i31_vrfy_asn1$O $(OBJDIR)$Pecdsa_i31_vrfy_raw$O $(OBJDIR)$Pecdsa_rta$O $(OBJDIR)$Pdig_oid$O $(OBJDIR)$Pdig_size$O $(OBJDIR)$Pghash_ctmul$O $(OBJDIR)$Pghash_ctmul32$O $(OBJDIR)$Pghash_ctmul64$O $(OBJDIR)$Pghash_pclmul$O $(OBJDIR)$Pghash_pwr8$O $(OBJDIR)$Pmd5$O $(OBJDIR)$Pmd5sha1$O $(OBJDIR)$Pmultihash$O $(OBJDIR)$Psha1$O $(OBJDIR)$Psha2big$O $(OBJDIR)$Psha2small$O $(OBJDIR)$Pi15_add$O $(OBJDIR)$Pi15_bitlen$O $(OBJDIR)$Pi15_decmod$O $(OBJDIR)$Pi15_decode$O $(OBJDIR)$Pi15_decred$O $(OBJDIR)$Pi15_encode$O $(OBJDIR)$Pi15_fmont$O $(OBJDIR)$Pi15_iszero$O $(OBJDIR)$Pi15_modpow$O $(OBJDIR)$Pi15_modpow2$O $(OBJDIR)$Pi15_montmul$O $(OBJDIR)$Pi15_mulacc$O $(OBJDIR)$Pi15_muladd$O $(OBJDIR)$Pi15_ninv15$O $(OBJDIR)$Pi15_reduce$O $(OBJDIR)$Pi15_rshift$O $(OBJDIR)$Pi15_sub$O $(OBJDIR)$Pi15_tmont$O $(OBJDIR)$Pi31_add$O $(OBJDIR)$Pi31_bitlen$O $(OBJDIR)$Pi31_decmod$O $(OBJDIR)$Pi31_decode$O $(OBJDIR)$Pi31_decred$O $(OBJDIR)$Pi31_encode$O $(OBJDIR)$Pi31_fmont$O $(OBJDIR)$Pi31_iszero$O $(OBJDIR)$Pi31_modpow$O $(OBJDIR)$Pi31_modpow2$O $(OBJDIR)$Pi31_montmul$O $(OBJDIR)$Pi31_mulacc$O $(OBJDIR)$Pi31_muladd$O $(OBJDIR)$Pi31_ninv31$O $(OBJDIR)$Pi31_reduce$O $(OBJDIR)$Pi31_rshift$O $(OBJDIR)$Pi31_sub$O $(OBJDIR)$Pi31_tmont$O $(OBJDIR)$Pi32_add$O $(OBJDIR)$Pi32_bitlen$O $(OBJDIR)$Pi32_decmod$O $(OBJDIR)$Pi32_decode$O $(OBJDIR)$Pi32_decred$O $(OBJDIR)$Pi32_div32$O $(OBJDIR)$Pi32_encode$O $(OBJDIR)$Pi32_fmont$O $(OBJDIR)$Pi32_iszero$O $(OBJDIR)$Pi32_modpow$O $(OBJDIR)$Pi32_montmul$O $(OBJDIR)$Pi32_mulacc$O $(OBJDIR)$Pi32_muladd$O $(OBJDIR)$Pi32_ninv32$O $(OBJDIR)$Pi32_reduce$O $(OBJDIR)$Pi32_sub$O $(OBJDIR)$Pi32_tmont$O $(OBJDIR)$Pi62_modpow2$O $(OBJDIR)$Phmac$O $(OBJDIR)$Phmac_ct$O $(OBJDIR)$Phmac_drbg$O $(OBJDIR)$Prsa_default_pkcs1_sign$O $(OBJDIR)$Prsa_default_pkcs1_vrfy$O $(OBJDIR)$Prsa_default_priv$O $(OBJDIR)$Prsa_default_pub$O $(OBJDIR)$Prsa_i15_pkcs1_sign$O $(OBJDIR)$Prsa_i15_pkcs1_vrfy$O $(OBJDIR)$Prsa_i15_priv$O $(OBJDIR)$Prsa_i15_pub$O $(OBJDIR)$Prsa_i31_pkcs1_sign$O $(OBJDIR)$Prsa_i31_pkcs1_vrfy$O $(OBJDIR)$Prsa_i31_priv$O $(OBJDIR)$Prsa_i31_pub$O $(OBJDIR)$Prsa_i32_pkcs1_sign$O $(OBJDIR)$Prsa_i32_pkcs1_vrfy$O $(OBJDIR)$Prsa_i32_priv$O $(OBJDIR)$Prsa_i32_pub$O $(OBJDIR)$Prsa_i62_pkcs1_sign$O $(OBJDIR)$Prsa_i62_pkcs1_vrfy$O $(OBJDIR)$Prsa_i62_priv$O $(OBJDIR)$Prsa_i62_pub$O $(OBJDIR)$Prsa_pkcs1_sig_pad$O $(OBJDIR)$Prsa_pkcs1_sig_unpad$O $(OBJDIR)$Prsa_ssl_decrypt$O $(OBJDIR)$Pprf$O $(OBJDIR)$Pprf_md5sha1$O $(OBJDIR)$Pprf_sha256$O $(OBJDIR)$Pprf_sha384$O $(OBJDIR)$Pssl_ccert_single_ec$O $(OBJDIR)$Pssl_ccert_single_rsa$O $(OBJDIR)$Pssl_client$O $(OBJDIR)$Pssl_client_default_rsapub$O $(OBJDIR)$Pssl_client_full$O $(OBJDIR)$Pssl_engine$O $(OBJDIR)$Pssl_engine_default_aescbc$O $(OBJDIR)$Pssl_engine_default_aesgcm$O $(OBJDIR)$Pssl_engine_default_chapol$O $(OBJDIR)$Pssl_engine_default_descbc$O $(OBJDIR)$Pssl_engine_default_ec$O $(OBJDIR)$Pssl_engine_default_ecdsa$O $(OBJDIR)$Pssl_engine_default_rsavrfy$O $(OBJDIR)$Pssl_hashes$O $(OBJDIR)$Pssl_hs_client$O $(OBJDIR)$Pssl_hs_server$O $(OBJDIR)$Pssl_io$O $(OBJDIR)$Pssl_keyexport$O $(OBJDIR)$Pssl_lru$O $(OBJDIR)$Pssl_rec_cbc$O $(OBJDIR)$Pssl_rec_chapol$O $(OBJDIR)$Pssl_rec_gcm$O $(OBJDIR)$Pssl_scert_single_ec$O $(OBJDIR)$Pssl_scert_single_rsa$O $(OBJDIR)$Pssl_server$O $(OBJDIR)$Pssl_server_full_ec$O $(OBJDIR)$Pssl_server_full_rsa$O $(OBJDIR)$Pssl_server_mine2c$O $(OBJDIR)$Pssl_server_mine2g$O $(OBJDIR)$Pssl_server_minf2c$O $(OBJDIR)$Pssl_server_minf2g$O $(OBJDIR)$Pssl_server_minr2g$O $(OBJDIR)$Pssl_server_minu2g$O $(OBJDIR)$Pssl_server_minv2g$O $(OBJDIR)$Paes_big_cbcdec$O $(OBJDIR)$Paes_big_cbcenc$O $(OBJDIR)$Paes_big_ctr$O $(OBJDIR)$Paes_big_dec$O $(OBJDIR)$Paes_big_enc$O $(OBJDIR)$Paes_common$O $(OBJDIR)$Paes_ct$O $(OBJDIR)$Paes_ct64$O $(OBJDIR)$Paes_ct64_cbcdec$O $(OBJDIR)$Paes_ct64_cbcenc$O $(OBJDIR)$Paes_ct64_ctr$O $(OBJDIR)$Paes_ct64_dec$O $(OBJDIR)$Paes_ct64_enc$O $(OBJDIR)$Paes_ct_cbcdec$O $(OBJDIR)$Paes_ct_cbcenc$O $(OBJDIR)$Paes_ct_ctr$O $(OBJDIR)$Paes_ct_dec$O $(OBJDIR)$Paes_ct_enc$O $(OBJDIR)$Paes_pwr8$O $(OBJDIR)$Paes_pwr8_cbcdec$O $(OBJDIR)$Paes_pwr8_cbcenc$O $(OBJDIR)$Paes_pwr8_ctr$O $(OBJDIR)$Paes_small_cbcdec$O $(OBJDIR)$Paes_small_cbcenc$O $(OBJDIR)$Paes_small_ctr$O $(OBJDIR)$Paes_small_dec$O $(OBJDIR)$Paes_small_enc$O $(OBJDIR)$Paes_x86ni$O $(OBJDIR)$Paes_x86ni_cbcdec$O $(OBJDIR)$Paes_x86ni_cbcenc$O $(OBJDIR)$Paes_x86ni_ctr$O $(OBJDIR)$Pchacha20_ct$O $(OBJDIR)$Pchacha20_sse2$O $(OBJDIR)$Pdes_ct$O $(OBJDIR)$Pdes_ct_cbcdec$O $(OBJDIR)$Pdes_ct_cbcenc$O $(OBJDIR)$Pdes_support$O $(OBJDIR)$Pdes_tab$O $(OBJDIR)$Pdes_tab_cbcdec$O $(OBJDIR)$Pdes_tab_cbcenc$O $(OBJDIR)$Ppoly1305_ctmul$O $(OBJDIR)$Ppoly1305_ctmul32$O $(OBJDIR)$Ppoly1305_ctmulq$O $(OBJDIR)$Ppoly1305_i15$O $(OBJDIR)$Pskey_decoder$O $(OBJDIR)$Px509_decoder$O $(OBJDIR)$Px509_knownkey$O $(OBJDIR)$Px509_minimal$O $(OBJDIR)$Px509_minimal_full$O OBJBRSSL = $(OBJDIR)$Pbrssl$O $(OBJDIR)$Pcerts$O $(OBJDIR)$Pchain$O $(OBJDIR)$Pclient$O $(OBJDIR)$Perrors$O $(OBJDIR)$Pfiles$O $(OBJDIR)$Pkeys$O $(OBJDIR)$Pnames$O $(OBJDIR)$Pserver$O $(OBJDIR)$Pskey$O $(OBJDIR)$Psslio$O $(OBJDIR)$Pta$O $(OBJDIR)$Pvector$O $(OBJDIR)$Pverify$O $(OBJDIR)$Pxmem$O OBJTESTCRYPTO = $(OBJDIR)$Ptest_crypto$O OBJTESTSPEED = $(OBJDIR)$Ptest_speed$O @@ -688,6 +688,9 @@ $(OBJDIR)$Paes_x86ni_ctr$O: src$Psymcipher$Paes_x86ni_ctr.c $(HEADERSPRIV) $(OBJDIR)$Pchacha20_ct$O: src$Psymcipher$Pchacha20_ct.c $(HEADERSPRIV) $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pchacha20_ct$O src$Psymcipher$Pchacha20_ct.c +$(OBJDIR)$Pchacha20_sse2$O: src$Psymcipher$Pchacha20_sse2.c $(HEADERSPRIV) + $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pchacha20_sse2$O src$Psymcipher$Pchacha20_sse2.c + $(OBJDIR)$Pdes_ct$O: src$Psymcipher$Pdes_ct.c $(HEADERSPRIV) $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pdes_ct$O src$Psymcipher$Pdes_ct.c diff --git a/mk/mkrules.sh b/mk/mkrules.sh index 4929294..eab5287 100755 --- a/mk/mkrules.sh +++ b/mk/mkrules.sh @@ -258,6 +258,7 @@ coresrc=" \ src/symcipher/aes_x86ni_cbcenc.c \ src/symcipher/aes_x86ni_ctr.c \ src/symcipher/chacha20_ct.c \ + src/symcipher/chacha20_sse2.c \ src/symcipher/des_ct.c \ src/symcipher/des_ct_cbcdec.c \ src/symcipher/des_ct_cbcenc.c \ diff --git a/src/config.h b/src/config.h index c315a53..b06807b 100644 --- a/src/config.h +++ b/src/config.h @@ -162,6 +162,16 @@ #define BR_AES_X86NI 1 */ +/* + * When BR_SSE2 is enabled, SSE2 intrinsics will be used for some + * algorithm implementations that use them (e.g. chacha20_sse2). If this + * is not enabled explicitly, then support for SSE2 intrinsics will be + * automatically detected. If set explicitly to 0, then SSE2 code will + * not be compiled at all. + * +#define BR_SSE2 1 + */ + /* * When BR_POWER8 is enabled, the AES implementation using the POWER ISA * 2.07 opcodes (available on POWER8 processors and later) is compiled. diff --git a/src/inner.h b/src/inner.h index fb7f4a4..2829f23 100644 --- a/src/inner.h +++ b/src/inner.h @@ -144,10 +144,46 @@ #endif #endif +/* + * Determine whether SSE2 intrinsics are understood by the compiler. + * Right now, we restrict ourselves to compiler versions where things + * are documented to work well: + * -- GCC 4.4+ and Clang 3.7+ understand the function attribute "target" + * -- MS Visual Studio 2005 documents the existence of + * SSE2-powered code _might_ work with older versions, but there is no + * pressing need to do so right now. + */ +#ifndef BR_SSE2 +#if (__i386__ || __x86_64__) \ + && ((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) \ + || (__clang_major__ > 3 \ + || (__clang_major__ == 3 && __clang_minor__ >= 7))) +#define BR_SSE2 1 +#elif (_M_IX86 || _M_X64) && (_MSC_VER >= 1400) +#define BR_SSE2 1 +#endif +#endif + +/* + * If we use SSE2 intrinsics, determine the compiler brand. + */ +#if BR_SSE2 +#ifndef BR_SSE2_GCC +#if __GNUC__ +#define BR_SSE2_GCC 1 +#endif +#endif +#ifndef BR_SSE2_MSC +#if _MSC_VER >= 1400 +#define BR_SSE2_MSC 1 +#endif +#endif +#endif + /* * A macro to tag a function with a "target" attribute (for GCC and Clang). */ -#if BR_AES_X86NI_GCC +#if BR_AES_X86NI_GCC || BR_SSE2_GCC #define BR_TARGET(x) __attribute__((target(x))) #else #define BR_TARGET(x) @@ -156,8 +192,8 @@ /* * GCC versions from 4.4 to 4.8 (inclusive) must use a special #pragma * to activate extra opcodes before including the relevant intrinsic - * headers. But these don't work with Clang (which does not need them - * either). We also need that #pragma for GCC 4.9 in order to work + * AES-NI headers. But these don't work with Clang (which does not need + * them either). We also need that #pragma for GCC 4.9 in order to work * around a compiler bug (it tends to blow up on ghash_pclmul code * otherwise). */ diff --git a/src/ssl/ssl_engine_default_chapol.c b/src/ssl/ssl_engine_default_chapol.c index 630286a..47a0c98 100644 --- a/src/ssl/ssl_engine_default_chapol.c +++ b/src/ssl/ssl_engine_default_chapol.c @@ -31,21 +31,35 @@ br_ssl_engine_set_default_chapol(br_ssl_engine_context *cc) #if BR_INT128 || BR_UMUL128 br_poly1305_run bp; #endif +#if BR_SSE2 + br_chacha20_run bc; +#endif br_ssl_engine_set_chapol(cc, &br_sslrec_in_chapol_vtable, &br_sslrec_out_chapol_vtable); - br_ssl_engine_set_chacha20(cc, &br_chacha20_ct_run); +#if BR_SSE2 + bc = br_chacha20_sse2_get(); + if (bc) { + br_ssl_engine_set_chacha20(cc, bc); + } else { +#endif + br_ssl_engine_set_chacha20(cc, &br_chacha20_ct_run); +#if BR_SSE2 + } +#endif #if BR_INT128 || BR_UMUL128 bp = br_poly1305_ctmulq_get(); if (bp) { br_ssl_engine_set_poly1305(cc, bp); - return; - } + } else { #endif #if BR_LOMUL - br_ssl_engine_set_poly1305(cc, &br_poly1305_ctmul32_run); + br_ssl_engine_set_poly1305(cc, &br_poly1305_ctmul32_run); #else - br_ssl_engine_set_poly1305(cc, &br_poly1305_ctmul_run); + br_ssl_engine_set_poly1305(cc, &br_poly1305_ctmul_run); +#endif +#if BR_INT128 || BR_UMUL128 + } #endif } diff --git a/src/symcipher/chacha20_sse2.c b/src/symcipher/chacha20_sse2.c new file mode 100644 index 0000000..0b32d51 --- /dev/null +++ b/src/symcipher/chacha20_sse2.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * This file contains a ChaCha20 implementation that leverages SSE2 + * opcodes for better performance. + */ + +#if BR_SSE2 + +#if BR_SSE2_GCC +#include +#include +#endif +#if BR_SSE2_MSC +#include +#endif + +/* see bearssl_block.h */ +BR_TARGET("sse2") +uint32_t +br_chacha20_sse2_run(const void *key, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + uint32_t ivtmp[4]; + __m128i kw0, kw1; + __m128i iw, cw; + __m128i one; + + static const uint32_t CW[] = { + 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 + }; + + buf = data; + kw0 = _mm_loadu_si128(key); + kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16)); + ivtmp[0] = cc; + memcpy(ivtmp + 1, iv, 12); + iw = _mm_loadu_si128((const void *)ivtmp); + cw = _mm_loadu_si128((const void *)CW); + one = _mm_set_epi32(0, 0, 0, 1); + + while (len > 0) { + /* + * sj contains state words 4*j to 4*j+3. + */ + __m128i s0, s1, s2, s3; + int i; + + s0 = cw; + s1 = kw0; + s2 = kw1; + s3 = iw; + for (i = 0; i < 10; i ++) { + /* + * Even round is straightforward application on + * the state words. + */ + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 16), + _mm_srli_epi32(s3, 16)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 12), + _mm_srli_epi32(s1, 20)); + + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 8), + _mm_srli_epi32(s3, 24)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 7), + _mm_srli_epi32(s1, 25)); + + /* + * For the odd round, we must rotate some state + * words so that the computations apply on the + * right combinations of words. + */ + s1 = _mm_shuffle_epi32(s1, 0x39); + s2 = _mm_shuffle_epi32(s2, 0x4E); + s3 = _mm_shuffle_epi32(s3, 0x93); + + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 16), + _mm_srli_epi32(s3, 16)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 12), + _mm_srli_epi32(s1, 20)); + + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 8), + _mm_srli_epi32(s3, 24)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 7), + _mm_srli_epi32(s1, 25)); + + /* + * After the odd round, we rotate back the values + * to undo the rotate at the start of the odd round. + */ + s1 = _mm_shuffle_epi32(s1, 0x93); + s2 = _mm_shuffle_epi32(s2, 0x4E); + s3 = _mm_shuffle_epi32(s3, 0x39); + } + + /* + * Addition with the initial state. + */ + s0 = _mm_add_epi32(s0, cw); + s1 = _mm_add_epi32(s1, kw0); + s2 = _mm_add_epi32(s2, kw1); + s3 = _mm_add_epi32(s3, iw); + + /* + * Increment block counter. + */ + iw = _mm_add_epi32(iw, one); + + /* + * XOR final state with the data. + */ + if (len < 64) { + unsigned char tmp[64]; + size_t u; + + _mm_storeu_si128((void *)(tmp + 0), s0); + _mm_storeu_si128((void *)(tmp + 16), s1); + _mm_storeu_si128((void *)(tmp + 32), s2); + _mm_storeu_si128((void *)(tmp + 48), s3); + for (u = 0; u < len; u ++) { + buf[u] ^= tmp[u]; + } + break; + } else { + __m128i b0, b1, b2, b3; + + b0 = _mm_loadu_si128((const void *)(buf + 0)); + b1 = _mm_loadu_si128((const void *)(buf + 16)); + b2 = _mm_loadu_si128((const void *)(buf + 32)); + b3 = _mm_loadu_si128((const void *)(buf + 48)); + b0 = _mm_xor_si128(b0, s0); + b1 = _mm_xor_si128(b1, s1); + b2 = _mm_xor_si128(b2, s2); + b3 = _mm_xor_si128(b3, s3); + _mm_storeu_si128((void *)(buf + 0), b0); + _mm_storeu_si128((void *)(buf + 16), b1); + _mm_storeu_si128((void *)(buf + 32), b2); + _mm_storeu_si128((void *)(buf + 48), b3); + buf += 64; + len -= 64; + } + } + + /* + * _mm_extract_epi32() requires SSE4.1. We prefer to stick to + * raw SSE2, thus we use _mm_extract_epi16(). + */ + return (uint32_t)_mm_extract_epi16(iw, 0) + | ((uint32_t)_mm_extract_epi16(iw, 1) << 16); +} + +/* see bearssl_block.h */ +br_chacha20_run +br_chacha20_sse2_get(void) +{ + /* + * If using 64-bit mode, then SSE2 opcodes should be automatically + * available, since they are part of the ABI. + * + * In 32-bit mode, we use CPUID to detect the SSE2 feature. + */ + +#if __x86_64__ || _M_X64 + + return &br_chacha20_sse2_run; + +#else + + /* + * SSE2 support is indicated by bit 26 in EDX. + */ +#define MASK 0x04000000 + +#if BR_SSE2_GCC + unsigned eax, ebx, ecx, edx; + + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { + if ((edx & MASK) == MASK) { + return &br_chacha20_sse2_run; + } + } +#elif BR_SSE2_MSC + int info[4]; + + __cpuid(info, 1); + if (((uint32_t)info[3] & MASK) == MASK) { + return &br_chacha20_sse2_run; + } +#endif + return 0; + +#endif +} + +#else + +/* see bearssl_block.h */ +br_chacha20_run +br_chacha20_sse2_get(void) +{ + return 0; +} + +#endif diff --git a/test/test_crypto.c b/test/test_crypto.c index 9e68cd9..bea236b 100644 --- a/test/test_crypto.c +++ b/test/test_crypto.c @@ -4178,12 +4178,16 @@ static const struct { }; static void -test_ChaCha20_ct(void) +test_ChaCha20_generic(const char *name, br_chacha20_run cr) { size_t u; - printf("Test ChaCha20_ct: "); + printf("Test %s: ", name); fflush(stdout); + if (cr == 0) { + printf("UNAVAILABLE\n"); + return; + } for (u = 0; KAT_CHACHA20[u].skey; u ++) { unsigned char key[32], nonce[12], plain[400], cipher[400]; @@ -4199,10 +4203,11 @@ test_ChaCha20_ct(void) for (v = 0; v < len; v ++) { unsigned char tmp[400]; size_t w; + uint32_t cc2; memset(tmp, 0, sizeof tmp); memcpy(tmp, plain, v); - if (br_chacha20_ct_run(key, nonce, cc, tmp, v) + if (cr(key, nonce, cc, tmp, v) != cc + (uint32_t)((v + 63) >> 6)) { fprintf(stderr, "ChaCha20: wrong counter\n"); @@ -4218,7 +4223,21 @@ test_ChaCha20_ct(void) exit(EXIT_FAILURE); } } - br_chacha20_ct_run(key, nonce, cc, tmp, v); + for (w = 0, cc2 = cc; w < v; w += 64, cc2 ++) { + size_t x; + + x = v - w; + if (x > 64) { + x = 64; + } + if (cr(key, nonce, cc2, tmp + w, x) + != (cc2 + 1)) + { + fprintf(stderr, "ChaCha20:" + " wrong counter (2)\n"); + exit(EXIT_FAILURE); + } + } if (memcmp(tmp, plain, v) != 0) { fprintf(stderr, "ChaCha20 KAT fail (2)\n"); exit(EXIT_FAILURE); @@ -4233,6 +4252,18 @@ test_ChaCha20_ct(void) fflush(stdout); } +static void +test_ChaCha20_ct(void) +{ + test_ChaCha20_generic("ChaCha20_ct", &br_chacha20_ct_run); +} + +static void +test_ChaCha20_sse2(void) +{ + test_ChaCha20_generic("ChaCha20_sse2", br_chacha20_sse2_get()); +} + static const struct { const char *splain; const char *saad; @@ -6173,6 +6204,7 @@ static const struct { STU(DES_tab), STU(DES_ct), STU(ChaCha20_ct), + STU(ChaCha20_sse2), STU(Poly1305_ctmul), STU(Poly1305_ctmul32), STU(Poly1305_ctmulq), diff --git a/test/test_speed.c b/test/test_speed.c index a09aa04..296e914 100644 --- a/test/test_speed.c +++ b/test/test_speed.c @@ -175,17 +175,24 @@ test_speed_ ## fname(void) \ static void \ test_speed_ ## fname(void) \ { \ + br_chacha20_run bc; \ unsigned char key[32]; \ unsigned char buf[8192]; \ unsigned char iv[12]; \ int i; \ long num; \ \ + bc = br_ ## fname ## _get(); \ + if (bc == 0) { \ + printf("%-30s UNAVAILABLE\n", #Name); \ + fflush(stdout); \ + return; \ + } \ memset(key, 'T', sizeof key); \ memset(buf, 'P', sizeof buf); \ memset(iv, 'X', sizeof iv); \ for (i = 0; i < 10; i ++) { \ - br_ ## fname ## _run(key, iv, i, buf, sizeof buf); \ + bc(key, iv, i, buf, sizeof buf); \ } \ num = 10; \ for (;;) { \ @@ -195,8 +202,7 @@ test_speed_ ## fname(void) \ \ begin = clock(); \ for (k = num; k > 0; k --) { \ - br_ ## fname ## _run(key, iv, \ - (uint32_t)k, buf, sizeof buf); \ + bc(key, iv, (uint32_t)k, buf, sizeof buf); \ } \ end = clock(); \ tt = (double)(end - begin) / CLOCKS_PER_SEC; \ @@ -232,6 +238,7 @@ SPEED_HASH(SHA-512, sha512) #define br_aes_ct64_cbcenc_get_vtable() (&br_aes_ct64_cbcenc_vtable) #define br_aes_ct64_cbcdec_get_vtable() (&br_aes_ct64_cbcdec_vtable) #define br_aes_ct64_ctr_get_vtable() (&br_aes_ct64_ctr_vtable) +#define br_chacha20_ct_get() (&br_chacha20_ct_run) #define SPEED_AES(iname) \ SPEED_BLOCKCIPHER_CBC(AES-128 CBC encrypt (iname), aes128_ ## iname ## _cbcenc, aes_ ## iname, 16, enc) \ @@ -265,7 +272,8 @@ SPEED_BLOCKCIPHER_CBC(3DES CBC decrypt (iname), 3des_ ## iname ## _cbcdec, des_ SPEED_DES(tab) SPEED_DES(ct) -SPEED_CHACHA20(ChaCha20, chacha20_ct) +SPEED_CHACHA20(ChaCha20 (ct), chacha20_ct) +SPEED_CHACHA20(ChaCha20 (sse2), chacha20_sse2) static void test_speed_ghash_inner(char *name, br_ghash gh) @@ -1279,6 +1287,7 @@ static const struct { STU(3des_ct_cbcdec), STU(chacha20_ct), + STU(chacha20_sse2), STU(ghash_ctmul), STU(ghash_ctmul32), -- 2.17.1