* `chacha20_ct` is a straightforward implementation of ChaCha20 in
* plain C; it is constant-time, small, and reasonably fast.
*
+ * `chacha20_sse2` leverages SSE2 opcodes (on x86 architectures that
+ * support these opcodes). It is faster than `chacha20_ct`.
+ *
* `poly1305_ctmul` is an implementation of the ChaCha20+Poly1305 AEAD
* construction, where the Poly1305 part is performed with mixed 32-bit
* multiplications (operands are 32-bit, result is 64-bit).
+ *
+ * `poly1305_ctmul32` implements ChaCha20+Poly1305 using pure 32-bit
+ * multiplications (32-bit operands, 32-bit result). It is slower than
+ * `poly1305_ctmul`, except on some specific architectures such as
+ * the ARM Cortex M0+.
+ *
+ * `poly1305_ctmulq` implements ChaCha20+Poly1305 with mixed 64-bit
+ * multiplications (operands are 64-bit, result is 128-bit) on 64-bit
+ * platforms that support such operations.
+ *
+ * `poly1305_i15` implements ChaCha20+Poly1305 with the generic "i15"
+ * big integer implementation. It is meant mostly for testing purposes,
+ * although it can help with saving a few hundred bytes of code footprint
+ * on systems where code size is scarce.
*/
/**
uint32_t br_chacha20_ct_run(const void *key,
const void *iv, uint32_t cc, void *data, size_t len);
+/**
+ * \brief ChaCha20 implementation (SSE2 code, constant-time).
+ *
+ * This implementation is available only on x86 platforms, depending on
+ * compiler support. Moreover, in 32-bit mode, it might not actually run,
+ * if the underlying hardware does not implement the SSE2 opcode (in
+ * 64-bit mode, SSE2 is part of the ABI, so if the code could be compiled
+ * at all, then it can run). Use `br_chacha20_sse2_get()` to safely obtain
+ * a pointer to that function.
+ *
+ * \see br_chacha20_run
+ *
+ * \param key secret key (32 bytes).
+ * \param iv IV (12 bytes).
+ * \param cc initial counter value.
+ * \param data data to encrypt or decrypt.
+ * \param len data length (in bytes).
+ */
+uint32_t br_chacha20_sse2_run(const void *key,
+ const void *iv, uint32_t cc, void *data, size_t len);
+
+/**
+ * \brief Obtain the `sse2` ChaCha20 implementation, if available.
+ *
+ * This function returns a pointer to `br_chacha20_sse2_run`, if
+ * that implementation was compiled in the library _and_ the SSE2
+ * opcodes are available on the currently running CPU. If either of
+ * these conditions is not met, then this function returns `0`.
+ *
+ * \return the `sse2` ChaCha20 implementation, or `0`.
+ */
+br_chacha20_run br_chacha20_sse2_get(void);
+
/**
* \brief Type for a ChaCha20+Poly1305 AEAD implementation.
*
# Automatically generated rules. Use 'mkrules.sh' to modify/regenerate.
-OBJ = $(OBJDIR)$Pgcm$O $(OBJDIR)$Pccopy$O $(OBJDIR)$Pdec16be$O $(OBJDIR)$Pdec16le$O $(OBJDIR)$Pdec32be$O $(OBJDIR)$Pdec32le$O $(OBJDIR)$Pdec64be$O $(OBJDIR)$Pdec64le$O $(OBJDIR)$Penc16be$O $(OBJDIR)$Penc16le$O $(OBJDIR)$Penc32be$O $(OBJDIR)$Penc32le$O $(OBJDIR)$Penc64be$O $(OBJDIR)$Penc64le$O $(OBJDIR)$Ppemdec$O $(OBJDIR)$Pec_all_m15$O $(OBJDIR)$Pec_all_m31$O $(OBJDIR)$Pec_c25519_i15$O $(OBJDIR)$Pec_c25519_i31$O $(OBJDIR)$Pec_c25519_m15$O $(OBJDIR)$Pec_c25519_m31$O $(OBJDIR)$Pec_curve25519$O $(OBJDIR)$Pec_default$O $(OBJDIR)$Pec_p256_m15$O $(OBJDIR)$Pec_p256_m31$O $(OBJDIR)$Pec_prime_i15$O $(OBJDIR)$Pec_prime_i31$O $(OBJDIR)$Pec_secp256r1$O $(OBJDIR)$Pec_secp384r1$O $(OBJDIR)$Pec_secp521r1$O $(OBJDIR)$Pecdsa_atr$O $(OBJDIR)$Pecdsa_default_sign_asn1$O $(OBJDIR)$Pecdsa_default_sign_raw$O $(OBJDIR)$Pecdsa_default_vrfy_asn1$O $(OBJDIR)$Pecdsa_default_vrfy_raw$O $(OBJDIR)$Pecdsa_i15_bits$O $(OBJDIR)$Pecdsa_i15_sign_asn1$O $(OBJDIR)$Pecdsa_i15_sign_raw$O $(OBJDIR)$Pecdsa_i15_vrfy_asn1$O $(OBJDIR)$Pecdsa_i15_vrfy_raw$O $(OBJDIR)$Pecdsa_i31_bits$O $(OBJDIR)$Pecdsa_i31_sign_asn1$O $(OBJDIR)$Pecdsa_i31_sign_raw$O $(OBJDIR)$Pecdsa_i31_vrfy_asn1$O $(OBJDIR)$Pecdsa_i31_vrfy_raw$O $(OBJDIR)$Pecdsa_rta$O $(OBJDIR)$Pdig_oid$O $(OBJDIR)$Pdig_size$O $(OBJDIR)$Pghash_ctmul$O $(OBJDIR)$Pghash_ctmul32$O $(OBJDIR)$Pghash_ctmul64$O $(OBJDIR)$Pghash_pclmul$O $(OBJDIR)$Pghash_pwr8$O $(OBJDIR)$Pmd5$O $(OBJDIR)$Pmd5sha1$O $(OBJDIR)$Pmultihash$O $(OBJDIR)$Psha1$O $(OBJDIR)$Psha2big$O $(OBJDIR)$Psha2small$O $(OBJDIR)$Pi15_add$O $(OBJDIR)$Pi15_bitlen$O $(OBJDIR)$Pi15_decmod$O $(OBJDIR)$Pi15_decode$O $(OBJDIR)$Pi15_decred$O $(OBJDIR)$Pi15_encode$O $(OBJDIR)$Pi15_fmont$O $(OBJDIR)$Pi15_iszero$O $(OBJDIR)$Pi15_modpow$O $(OBJDIR)$Pi15_modpow2$O $(OBJDIR)$Pi15_montmul$O $(OBJDIR)$Pi15_mulacc$O $(OBJDIR)$Pi15_muladd$O $(OBJDIR)$Pi15_ninv15$O $(OBJDIR)$Pi15_reduce$O $(OBJDIR)$Pi15_rshift$O $(OBJDIR)$Pi15_sub$O $(OBJDIR)$Pi15_tmont$O $(OBJDIR)$Pi31_add$O $(OBJDIR)$Pi31_bitlen$O $(OBJDIR)$Pi31_decmod$O $(OBJDIR)$Pi31_decode$O $(OBJDIR)$Pi31_decred$O $(OBJDIR)$Pi31_encode$O $(OBJDIR)$Pi31_fmont$O $(OBJDIR)$Pi31_iszero$O $(OBJDIR)$Pi31_modpow$O $(OBJDIR)$Pi31_modpow2$O $(OBJDIR)$Pi31_montmul$O $(OBJDIR)$Pi31_mulacc$O $(OBJDIR)$Pi31_muladd$O $(OBJDIR)$Pi31_ninv31$O $(OBJDIR)$Pi31_reduce$O $(OBJDIR)$Pi31_rshift$O $(OBJDIR)$Pi31_sub$O $(OBJDIR)$Pi31_tmont$O $(OBJDIR)$Pi32_add$O $(OBJDIR)$Pi32_bitlen$O $(OBJDIR)$Pi32_decmod$O $(OBJDIR)$Pi32_decode$O $(OBJDIR)$Pi32_decred$O $(OBJDIR)$Pi32_div32$O $(OBJDIR)$Pi32_encode$O $(OBJDIR)$Pi32_fmont$O $(OBJDIR)$Pi32_iszero$O $(OBJDIR)$Pi32_modpow$O $(OBJDIR)$Pi32_montmul$O $(OBJDIR)$Pi32_mulacc$O $(OBJDIR)$Pi32_muladd$O $(OBJDIR)$Pi32_ninv32$O $(OBJDIR)$Pi32_reduce$O $(OBJDIR)$Pi32_sub$O $(OBJDIR)$Pi32_tmont$O $(OBJDIR)$Pi62_modpow2$O $(OBJDIR)$Phmac$O $(OBJDIR)$Phmac_ct$O $(OBJDIR)$Phmac_drbg$O $(OBJDIR)$Prsa_default_pkcs1_sign$O $(OBJDIR)$Prsa_default_pkcs1_vrfy$O $(OBJDIR)$Prsa_default_priv$O $(OBJDIR)$Prsa_default_pub$O $(OBJDIR)$Prsa_i15_pkcs1_sign$O $(OBJDIR)$Prsa_i15_pkcs1_vrfy$O $(OBJDIR)$Prsa_i15_priv$O $(OBJDIR)$Prsa_i15_pub$O $(OBJDIR)$Prsa_i31_pkcs1_sign$O $(OBJDIR)$Prsa_i31_pkcs1_vrfy$O $(OBJDIR)$Prsa_i31_priv$O $(OBJDIR)$Prsa_i31_pub$O $(OBJDIR)$Prsa_i32_pkcs1_sign$O $(OBJDIR)$Prsa_i32_pkcs1_vrfy$O $(OBJDIR)$Prsa_i32_priv$O $(OBJDIR)$Prsa_i32_pub$O $(OBJDIR)$Prsa_i62_pkcs1_sign$O $(OBJDIR)$Prsa_i62_pkcs1_vrfy$O $(OBJDIR)$Prsa_i62_priv$O $(OBJDIR)$Prsa_i62_pub$O $(OBJDIR)$Prsa_pkcs1_sig_pad$O $(OBJDIR)$Prsa_pkcs1_sig_unpad$O $(OBJDIR)$Prsa_ssl_decrypt$O $(OBJDIR)$Pprf$O $(OBJDIR)$Pprf_md5sha1$O $(OBJDIR)$Pprf_sha256$O $(OBJDIR)$Pprf_sha384$O $(OBJDIR)$Pssl_ccert_single_ec$O $(OBJDIR)$Pssl_ccert_single_rsa$O $(OBJDIR)$Pssl_client$O $(OBJDIR)$Pssl_client_default_rsapub$O $(OBJDIR)$Pssl_client_full$O $(OBJDIR)$Pssl_engine$O $(OBJDIR)$Pssl_engine_default_aescbc$O $(OBJDIR)$Pssl_engine_default_aesgcm$O $(OBJDIR)$Pssl_engine_default_chapol$O $(OBJDIR)$Pssl_engine_default_descbc$O $(OBJDIR)$Pssl_engine_default_ec$O $(OBJDIR)$Pssl_engine_default_ecdsa$O $(OBJDIR)$Pssl_engine_default_rsavrfy$O $(OBJDIR)$Pssl_hashes$O $(OBJDIR)$Pssl_hs_client$O $(OBJDIR)$Pssl_hs_server$O $(OBJDIR)$Pssl_io$O $(OBJDIR)$Pssl_keyexport$O $(OBJDIR)$Pssl_lru$O $(OBJDIR)$Pssl_rec_cbc$O $(OBJDIR)$Pssl_rec_chapol$O $(OBJDIR)$Pssl_rec_gcm$O $(OBJDIR)$Pssl_scert_single_ec$O $(OBJDIR)$Pssl_scert_single_rsa$O $(OBJDIR)$Pssl_server$O $(OBJDIR)$Pssl_server_full_ec$O $(OBJDIR)$Pssl_server_full_rsa$O $(OBJDIR)$Pssl_server_mine2c$O $(OBJDIR)$Pssl_server_mine2g$O $(OBJDIR)$Pssl_server_minf2c$O $(OBJDIR)$Pssl_server_minf2g$O $(OBJDIR)$Pssl_server_minr2g$O $(OBJDIR)$Pssl_server_minu2g$O $(OBJDIR)$Pssl_server_minv2g$O $(OBJDIR)$Paes_big_cbcdec$O $(OBJDIR)$Paes_big_cbcenc$O $(OBJDIR)$Paes_big_ctr$O $(OBJDIR)$Paes_big_dec$O $(OBJDIR)$Paes_big_enc$O $(OBJDIR)$Paes_common$O $(OBJDIR)$Paes_ct$O $(OBJDIR)$Paes_ct64$O $(OBJDIR)$Paes_ct64_cbcdec$O $(OBJDIR)$Paes_ct64_cbcenc$O $(OBJDIR)$Paes_ct64_ctr$O $(OBJDIR)$Paes_ct64_dec$O $(OBJDIR)$Paes_ct64_enc$O $(OBJDIR)$Paes_ct_cbcdec$O $(OBJDIR)$Paes_ct_cbcenc$O $(OBJDIR)$Paes_ct_ctr$O $(OBJDIR)$Paes_ct_dec$O $(OBJDIR)$Paes_ct_enc$O $(OBJDIR)$Paes_pwr8$O $(OBJDIR)$Paes_pwr8_cbcdec$O $(OBJDIR)$Paes_pwr8_cbcenc$O $(OBJDIR)$Paes_pwr8_ctr$O $(OBJDIR)$Paes_small_cbcdec$O $(OBJDIR)$Paes_small_cbcenc$O $(OBJDIR)$Paes_small_ctr$O $(OBJDIR)$Paes_small_dec$O $(OBJDIR)$Paes_small_enc$O $(OBJDIR)$Paes_x86ni$O $(OBJDIR)$Paes_x86ni_cbcdec$O $(OBJDIR)$Paes_x86ni_cbcenc$O $(OBJDIR)$Paes_x86ni_ctr$O $(OBJDIR)$Pchacha20_ct$O $(OBJDIR)$Pdes_ct$O $(OBJDIR)$Pdes_ct_cbcdec$O $(OBJDIR)$Pdes_ct_cbcenc$O $(OBJDIR)$Pdes_support$O $(OBJDIR)$Pdes_tab$O $(OBJDIR)$Pdes_tab_cbcdec$O $(OBJDIR)$Pdes_tab_cbcenc$O $(OBJDIR)$Ppoly1305_ctmul$O $(OBJDIR)$Ppoly1305_ctmul32$O $(OBJDIR)$Ppoly1305_ctmulq$O $(OBJDIR)$Ppoly1305_i15$O $(OBJDIR)$Pskey_decoder$O $(OBJDIR)$Px509_decoder$O $(OBJDIR)$Px509_knownkey$O $(OBJDIR)$Px509_minimal$O $(OBJDIR)$Px509_minimal_full$O
+OBJ = $(OBJDIR)$Pgcm$O $(OBJDIR)$Pccopy$O $(OBJDIR)$Pdec16be$O $(OBJDIR)$Pdec16le$O $(OBJDIR)$Pdec32be$O $(OBJDIR)$Pdec32le$O $(OBJDIR)$Pdec64be$O $(OBJDIR)$Pdec64le$O $(OBJDIR)$Penc16be$O $(OBJDIR)$Penc16le$O $(OBJDIR)$Penc32be$O $(OBJDIR)$Penc32le$O $(OBJDIR)$Penc64be$O $(OBJDIR)$Penc64le$O $(OBJDIR)$Ppemdec$O $(OBJDIR)$Pec_all_m15$O $(OBJDIR)$Pec_all_m31$O $(OBJDIR)$Pec_c25519_i15$O $(OBJDIR)$Pec_c25519_i31$O $(OBJDIR)$Pec_c25519_m15$O $(OBJDIR)$Pec_c25519_m31$O $(OBJDIR)$Pec_curve25519$O $(OBJDIR)$Pec_default$O $(OBJDIR)$Pec_p256_m15$O $(OBJDIR)$Pec_p256_m31$O $(OBJDIR)$Pec_prime_i15$O $(OBJDIR)$Pec_prime_i31$O $(OBJDIR)$Pec_secp256r1$O $(OBJDIR)$Pec_secp384r1$O $(OBJDIR)$Pec_secp521r1$O $(OBJDIR)$Pecdsa_atr$O $(OBJDIR)$Pecdsa_default_sign_asn1$O $(OBJDIR)$Pecdsa_default_sign_raw$O $(OBJDIR)$Pecdsa_default_vrfy_asn1$O $(OBJDIR)$Pecdsa_default_vrfy_raw$O $(OBJDIR)$Pecdsa_i15_bits$O $(OBJDIR)$Pecdsa_i15_sign_asn1$O $(OBJDIR)$Pecdsa_i15_sign_raw$O $(OBJDIR)$Pecdsa_i15_vrfy_asn1$O $(OBJDIR)$Pecdsa_i15_vrfy_raw$O $(OBJDIR)$Pecdsa_i31_bits$O $(OBJDIR)$Pecdsa_i31_sign_asn1$O $(OBJDIR)$Pecdsa_i31_sign_raw$O $(OBJDIR)$Pecdsa_i31_vrfy_asn1$O $(OBJDIR)$Pecdsa_i31_vrfy_raw$O $(OBJDIR)$Pecdsa_rta$O $(OBJDIR)$Pdig_oid$O $(OBJDIR)$Pdig_size$O $(OBJDIR)$Pghash_ctmul$O $(OBJDIR)$Pghash_ctmul32$O $(OBJDIR)$Pghash_ctmul64$O $(OBJDIR)$Pghash_pclmul$O $(OBJDIR)$Pghash_pwr8$O $(OBJDIR)$Pmd5$O $(OBJDIR)$Pmd5sha1$O $(OBJDIR)$Pmultihash$O $(OBJDIR)$Psha1$O $(OBJDIR)$Psha2big$O $(OBJDIR)$Psha2small$O $(OBJDIR)$Pi15_add$O $(OBJDIR)$Pi15_bitlen$O $(OBJDIR)$Pi15_decmod$O $(OBJDIR)$Pi15_decode$O $(OBJDIR)$Pi15_decred$O $(OBJDIR)$Pi15_encode$O $(OBJDIR)$Pi15_fmont$O $(OBJDIR)$Pi15_iszero$O $(OBJDIR)$Pi15_modpow$O $(OBJDIR)$Pi15_modpow2$O $(OBJDIR)$Pi15_montmul$O $(OBJDIR)$Pi15_mulacc$O $(OBJDIR)$Pi15_muladd$O $(OBJDIR)$Pi15_ninv15$O $(OBJDIR)$Pi15_reduce$O $(OBJDIR)$Pi15_rshift$O $(OBJDIR)$Pi15_sub$O $(OBJDIR)$Pi15_tmont$O $(OBJDIR)$Pi31_add$O $(OBJDIR)$Pi31_bitlen$O $(OBJDIR)$Pi31_decmod$O $(OBJDIR)$Pi31_decode$O $(OBJDIR)$Pi31_decred$O $(OBJDIR)$Pi31_encode$O $(OBJDIR)$Pi31_fmont$O $(OBJDIR)$Pi31_iszero$O $(OBJDIR)$Pi31_modpow$O $(OBJDIR)$Pi31_modpow2$O $(OBJDIR)$Pi31_montmul$O $(OBJDIR)$Pi31_mulacc$O $(OBJDIR)$Pi31_muladd$O $(OBJDIR)$Pi31_ninv31$O $(OBJDIR)$Pi31_reduce$O $(OBJDIR)$Pi31_rshift$O $(OBJDIR)$Pi31_sub$O $(OBJDIR)$Pi31_tmont$O $(OBJDIR)$Pi32_add$O $(OBJDIR)$Pi32_bitlen$O $(OBJDIR)$Pi32_decmod$O $(OBJDIR)$Pi32_decode$O $(OBJDIR)$Pi32_decred$O $(OBJDIR)$Pi32_div32$O $(OBJDIR)$Pi32_encode$O $(OBJDIR)$Pi32_fmont$O $(OBJDIR)$Pi32_iszero$O $(OBJDIR)$Pi32_modpow$O $(OBJDIR)$Pi32_montmul$O $(OBJDIR)$Pi32_mulacc$O $(OBJDIR)$Pi32_muladd$O $(OBJDIR)$Pi32_ninv32$O $(OBJDIR)$Pi32_reduce$O $(OBJDIR)$Pi32_sub$O $(OBJDIR)$Pi32_tmont$O $(OBJDIR)$Pi62_modpow2$O $(OBJDIR)$Phmac$O $(OBJDIR)$Phmac_ct$O $(OBJDIR)$Phmac_drbg$O $(OBJDIR)$Prsa_default_pkcs1_sign$O $(OBJDIR)$Prsa_default_pkcs1_vrfy$O $(OBJDIR)$Prsa_default_priv$O $(OBJDIR)$Prsa_default_pub$O $(OBJDIR)$Prsa_i15_pkcs1_sign$O $(OBJDIR)$Prsa_i15_pkcs1_vrfy$O $(OBJDIR)$Prsa_i15_priv$O $(OBJDIR)$Prsa_i15_pub$O $(OBJDIR)$Prsa_i31_pkcs1_sign$O $(OBJDIR)$Prsa_i31_pkcs1_vrfy$O $(OBJDIR)$Prsa_i31_priv$O $(OBJDIR)$Prsa_i31_pub$O $(OBJDIR)$Prsa_i32_pkcs1_sign$O $(OBJDIR)$Prsa_i32_pkcs1_vrfy$O $(OBJDIR)$Prsa_i32_priv$O $(OBJDIR)$Prsa_i32_pub$O $(OBJDIR)$Prsa_i62_pkcs1_sign$O $(OBJDIR)$Prsa_i62_pkcs1_vrfy$O $(OBJDIR)$Prsa_i62_priv$O $(OBJDIR)$Prsa_i62_pub$O $(OBJDIR)$Prsa_pkcs1_sig_pad$O $(OBJDIR)$Prsa_pkcs1_sig_unpad$O $(OBJDIR)$Prsa_ssl_decrypt$O $(OBJDIR)$Pprf$O $(OBJDIR)$Pprf_md5sha1$O $(OBJDIR)$Pprf_sha256$O $(OBJDIR)$Pprf_sha384$O $(OBJDIR)$Pssl_ccert_single_ec$O $(OBJDIR)$Pssl_ccert_single_rsa$O $(OBJDIR)$Pssl_client$O $(OBJDIR)$Pssl_client_default_rsapub$O $(OBJDIR)$Pssl_client_full$O $(OBJDIR)$Pssl_engine$O $(OBJDIR)$Pssl_engine_default_aescbc$O $(OBJDIR)$Pssl_engine_default_aesgcm$O $(OBJDIR)$Pssl_engine_default_chapol$O $(OBJDIR)$Pssl_engine_default_descbc$O $(OBJDIR)$Pssl_engine_default_ec$O $(OBJDIR)$Pssl_engine_default_ecdsa$O $(OBJDIR)$Pssl_engine_default_rsavrfy$O $(OBJDIR)$Pssl_hashes$O $(OBJDIR)$Pssl_hs_client$O $(OBJDIR)$Pssl_hs_server$O $(OBJDIR)$Pssl_io$O $(OBJDIR)$Pssl_keyexport$O $(OBJDIR)$Pssl_lru$O $(OBJDIR)$Pssl_rec_cbc$O $(OBJDIR)$Pssl_rec_chapol$O $(OBJDIR)$Pssl_rec_gcm$O $(OBJDIR)$Pssl_scert_single_ec$O $(OBJDIR)$Pssl_scert_single_rsa$O $(OBJDIR)$Pssl_server$O $(OBJDIR)$Pssl_server_full_ec$O $(OBJDIR)$Pssl_server_full_rsa$O $(OBJDIR)$Pssl_server_mine2c$O $(OBJDIR)$Pssl_server_mine2g$O $(OBJDIR)$Pssl_server_minf2c$O $(OBJDIR)$Pssl_server_minf2g$O $(OBJDIR)$Pssl_server_minr2g$O $(OBJDIR)$Pssl_server_minu2g$O $(OBJDIR)$Pssl_server_minv2g$O $(OBJDIR)$Paes_big_cbcdec$O $(OBJDIR)$Paes_big_cbcenc$O $(OBJDIR)$Paes_big_ctr$O $(OBJDIR)$Paes_big_dec$O $(OBJDIR)$Paes_big_enc$O $(OBJDIR)$Paes_common$O $(OBJDIR)$Paes_ct$O $(OBJDIR)$Paes_ct64$O $(OBJDIR)$Paes_ct64_cbcdec$O $(OBJDIR)$Paes_ct64_cbcenc$O $(OBJDIR)$Paes_ct64_ctr$O $(OBJDIR)$Paes_ct64_dec$O $(OBJDIR)$Paes_ct64_enc$O $(OBJDIR)$Paes_ct_cbcdec$O $(OBJDIR)$Paes_ct_cbcenc$O $(OBJDIR)$Paes_ct_ctr$O $(OBJDIR)$Paes_ct_dec$O $(OBJDIR)$Paes_ct_enc$O $(OBJDIR)$Paes_pwr8$O $(OBJDIR)$Paes_pwr8_cbcdec$O $(OBJDIR)$Paes_pwr8_cbcenc$O $(OBJDIR)$Paes_pwr8_ctr$O $(OBJDIR)$Paes_small_cbcdec$O $(OBJDIR)$Paes_small_cbcenc$O $(OBJDIR)$Paes_small_ctr$O $(OBJDIR)$Paes_small_dec$O $(OBJDIR)$Paes_small_enc$O $(OBJDIR)$Paes_x86ni$O $(OBJDIR)$Paes_x86ni_cbcdec$O $(OBJDIR)$Paes_x86ni_cbcenc$O $(OBJDIR)$Paes_x86ni_ctr$O $(OBJDIR)$Pchacha20_ct$O $(OBJDIR)$Pchacha20_sse2$O $(OBJDIR)$Pdes_ct$O $(OBJDIR)$Pdes_ct_cbcdec$O $(OBJDIR)$Pdes_ct_cbcenc$O $(OBJDIR)$Pdes_support$O $(OBJDIR)$Pdes_tab$O $(OBJDIR)$Pdes_tab_cbcdec$O $(OBJDIR)$Pdes_tab_cbcenc$O $(OBJDIR)$Ppoly1305_ctmul$O $(OBJDIR)$Ppoly1305_ctmul32$O $(OBJDIR)$Ppoly1305_ctmulq$O $(OBJDIR)$Ppoly1305_i15$O $(OBJDIR)$Pskey_decoder$O $(OBJDIR)$Px509_decoder$O $(OBJDIR)$Px509_knownkey$O $(OBJDIR)$Px509_minimal$O $(OBJDIR)$Px509_minimal_full$O
OBJBRSSL = $(OBJDIR)$Pbrssl$O $(OBJDIR)$Pcerts$O $(OBJDIR)$Pchain$O $(OBJDIR)$Pclient$O $(OBJDIR)$Perrors$O $(OBJDIR)$Pfiles$O $(OBJDIR)$Pkeys$O $(OBJDIR)$Pnames$O $(OBJDIR)$Pserver$O $(OBJDIR)$Pskey$O $(OBJDIR)$Psslio$O $(OBJDIR)$Pta$O $(OBJDIR)$Pvector$O $(OBJDIR)$Pverify$O $(OBJDIR)$Pxmem$O
OBJTESTCRYPTO = $(OBJDIR)$Ptest_crypto$O
OBJTESTSPEED = $(OBJDIR)$Ptest_speed$O
$(OBJDIR)$Pchacha20_ct$O: src$Psymcipher$Pchacha20_ct.c $(HEADERSPRIV)
$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pchacha20_ct$O src$Psymcipher$Pchacha20_ct.c
+$(OBJDIR)$Pchacha20_sse2$O: src$Psymcipher$Pchacha20_sse2.c $(HEADERSPRIV)
+ $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pchacha20_sse2$O src$Psymcipher$Pchacha20_sse2.c
+
$(OBJDIR)$Pdes_ct$O: src$Psymcipher$Pdes_ct.c $(HEADERSPRIV)
$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pdes_ct$O src$Psymcipher$Pdes_ct.c
src/symcipher/aes_x86ni_cbcenc.c \
src/symcipher/aes_x86ni_ctr.c \
src/symcipher/chacha20_ct.c \
+ src/symcipher/chacha20_sse2.c \
src/symcipher/des_ct.c \
src/symcipher/des_ct_cbcdec.c \
src/symcipher/des_ct_cbcenc.c \
#define BR_AES_X86NI 1
*/
+/*
+ * When BR_SSE2 is enabled, SSE2 intrinsics will be used for some
+ * algorithm implementations that use them (e.g. chacha20_sse2). If this
+ * is not enabled explicitly, then support for SSE2 intrinsics will be
+ * automatically detected. If set explicitly to 0, then SSE2 code will
+ * not be compiled at all.
+ *
+#define BR_SSE2 1
+ */
+
/*
* When BR_POWER8 is enabled, the AES implementation using the POWER ISA
* 2.07 opcodes (available on POWER8 processors and later) is compiled.
#endif
#endif
+/*
+ * Determine whether SSE2 intrinsics are understood by the compiler.
+ * Right now, we restrict ourselves to compiler versions where things
+ * are documented to work well:
+ * -- GCC 4.4+ and Clang 3.7+ understand the function attribute "target"
+ * -- MS Visual Studio 2005 documents the existence of <emmintrin.h>
+ * SSE2-powered code _might_ work with older versions, but there is no
+ * pressing need to do so right now.
+ */
+#ifndef BR_SSE2
+#if (__i386__ || __x86_64__) \
+ && ((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) \
+ || (__clang_major__ > 3 \
+ || (__clang_major__ == 3 && __clang_minor__ >= 7)))
+#define BR_SSE2 1
+#elif (_M_IX86 || _M_X64) && (_MSC_VER >= 1400)
+#define BR_SSE2 1
+#endif
+#endif
+
+/*
+ * If we use SSE2 intrinsics, determine the compiler brand.
+ */
+#if BR_SSE2
+#ifndef BR_SSE2_GCC
+#if __GNUC__
+#define BR_SSE2_GCC 1
+#endif
+#endif
+#ifndef BR_SSE2_MSC
+#if _MSC_VER >= 1400
+#define BR_SSE2_MSC 1
+#endif
+#endif
+#endif
+
/*
* A macro to tag a function with a "target" attribute (for GCC and Clang).
*/
-#if BR_AES_X86NI_GCC
+#if BR_AES_X86NI_GCC || BR_SSE2_GCC
#define BR_TARGET(x) __attribute__((target(x)))
#else
#define BR_TARGET(x)
/*
* GCC versions from 4.4 to 4.8 (inclusive) must use a special #pragma
* to activate extra opcodes before including the relevant intrinsic
- * headers. But these don't work with Clang (which does not need them
- * either). We also need that #pragma for GCC 4.9 in order to work
+ * AES-NI headers. But these don't work with Clang (which does not need
+ * them either). We also need that #pragma for GCC 4.9 in order to work
* around a compiler bug (it tends to blow up on ghash_pclmul code
* otherwise).
*/
#if BR_INT128 || BR_UMUL128
br_poly1305_run bp;
#endif
+#if BR_SSE2
+ br_chacha20_run bc;
+#endif
br_ssl_engine_set_chapol(cc,
&br_sslrec_in_chapol_vtable,
&br_sslrec_out_chapol_vtable);
- br_ssl_engine_set_chacha20(cc, &br_chacha20_ct_run);
+#if BR_SSE2
+ bc = br_chacha20_sse2_get();
+ if (bc) {
+ br_ssl_engine_set_chacha20(cc, bc);
+ } else {
+#endif
+ br_ssl_engine_set_chacha20(cc, &br_chacha20_ct_run);
+#if BR_SSE2
+ }
+#endif
#if BR_INT128 || BR_UMUL128
bp = br_poly1305_ctmulq_get();
if (bp) {
br_ssl_engine_set_poly1305(cc, bp);
- return;
- }
+ } else {
#endif
#if BR_LOMUL
- br_ssl_engine_set_poly1305(cc, &br_poly1305_ctmul32_run);
+ br_ssl_engine_set_poly1305(cc, &br_poly1305_ctmul32_run);
#else
- br_ssl_engine_set_poly1305(cc, &br_poly1305_ctmul_run);
+ br_ssl_engine_set_poly1305(cc, &br_poly1305_ctmul_run);
+#endif
+#if BR_INT128 || BR_UMUL128
+ }
#endif
}
--- /dev/null
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * This file contains a ChaCha20 implementation that leverages SSE2
+ * opcodes for better performance.
+ */
+
+#if BR_SSE2
+
+#if BR_SSE2_GCC
+#include <emmintrin.h>
+#include <cpuid.h>
+#endif
+#if BR_SSE2_MSC
+#include <intrin.h>
+#endif
+
+/* see bearssl_block.h */
+BR_TARGET("sse2")
+uint32_t
+br_chacha20_sse2_run(const void *key,
+ const void *iv, uint32_t cc, void *data, size_t len)
+{
+ unsigned char *buf;
+ uint32_t ivtmp[4];
+ __m128i kw0, kw1;
+ __m128i iw, cw;
+ __m128i one;
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ buf = data;
+ kw0 = _mm_loadu_si128(key);
+ kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
+ ivtmp[0] = cc;
+ memcpy(ivtmp + 1, iv, 12);
+ iw = _mm_loadu_si128((const void *)ivtmp);
+ cw = _mm_loadu_si128((const void *)CW);
+ one = _mm_set_epi32(0, 0, 0, 1);
+
+ while (len > 0) {
+ /*
+ * sj contains state words 4*j to 4*j+3.
+ */
+ __m128i s0, s1, s2, s3;
+ int i;
+
+ s0 = cw;
+ s1 = kw0;
+ s2 = kw1;
+ s3 = iw;
+ for (i = 0; i < 10; i ++) {
+ /*
+ * Even round is straightforward application on
+ * the state words.
+ */
+ s0 = _mm_add_epi32(s0, s1);
+ s3 = _mm_xor_si128(s3, s0);
+ s3 = _mm_or_si128(
+ _mm_slli_epi32(s3, 16),
+ _mm_srli_epi32(s3, 16));
+
+ s2 = _mm_add_epi32(s2, s3);
+ s1 = _mm_xor_si128(s1, s2);
+ s1 = _mm_or_si128(
+ _mm_slli_epi32(s1, 12),
+ _mm_srli_epi32(s1, 20));
+
+ s0 = _mm_add_epi32(s0, s1);
+ s3 = _mm_xor_si128(s3, s0);
+ s3 = _mm_or_si128(
+ _mm_slli_epi32(s3, 8),
+ _mm_srli_epi32(s3, 24));
+
+ s2 = _mm_add_epi32(s2, s3);
+ s1 = _mm_xor_si128(s1, s2);
+ s1 = _mm_or_si128(
+ _mm_slli_epi32(s1, 7),
+ _mm_srli_epi32(s1, 25));
+
+ /*
+ * For the odd round, we must rotate some state
+ * words so that the computations apply on the
+ * right combinations of words.
+ */
+ s1 = _mm_shuffle_epi32(s1, 0x39);
+ s2 = _mm_shuffle_epi32(s2, 0x4E);
+ s3 = _mm_shuffle_epi32(s3, 0x93);
+
+ s0 = _mm_add_epi32(s0, s1);
+ s3 = _mm_xor_si128(s3, s0);
+ s3 = _mm_or_si128(
+ _mm_slli_epi32(s3, 16),
+ _mm_srli_epi32(s3, 16));
+
+ s2 = _mm_add_epi32(s2, s3);
+ s1 = _mm_xor_si128(s1, s2);
+ s1 = _mm_or_si128(
+ _mm_slli_epi32(s1, 12),
+ _mm_srli_epi32(s1, 20));
+
+ s0 = _mm_add_epi32(s0, s1);
+ s3 = _mm_xor_si128(s3, s0);
+ s3 = _mm_or_si128(
+ _mm_slli_epi32(s3, 8),
+ _mm_srli_epi32(s3, 24));
+
+ s2 = _mm_add_epi32(s2, s3);
+ s1 = _mm_xor_si128(s1, s2);
+ s1 = _mm_or_si128(
+ _mm_slli_epi32(s1, 7),
+ _mm_srli_epi32(s1, 25));
+
+ /*
+ * After the odd round, we rotate back the values
+ * to undo the rotate at the start of the odd round.
+ */
+ s1 = _mm_shuffle_epi32(s1, 0x93);
+ s2 = _mm_shuffle_epi32(s2, 0x4E);
+ s3 = _mm_shuffle_epi32(s3, 0x39);
+ }
+
+ /*
+ * Addition with the initial state.
+ */
+ s0 = _mm_add_epi32(s0, cw);
+ s1 = _mm_add_epi32(s1, kw0);
+ s2 = _mm_add_epi32(s2, kw1);
+ s3 = _mm_add_epi32(s3, iw);
+
+ /*
+ * Increment block counter.
+ */
+ iw = _mm_add_epi32(iw, one);
+
+ /*
+ * XOR final state with the data.
+ */
+ if (len < 64) {
+ unsigned char tmp[64];
+ size_t u;
+
+ _mm_storeu_si128((void *)(tmp + 0), s0);
+ _mm_storeu_si128((void *)(tmp + 16), s1);
+ _mm_storeu_si128((void *)(tmp + 32), s2);
+ _mm_storeu_si128((void *)(tmp + 48), s3);
+ for (u = 0; u < len; u ++) {
+ buf[u] ^= tmp[u];
+ }
+ break;
+ } else {
+ __m128i b0, b1, b2, b3;
+
+ b0 = _mm_loadu_si128((const void *)(buf + 0));
+ b1 = _mm_loadu_si128((const void *)(buf + 16));
+ b2 = _mm_loadu_si128((const void *)(buf + 32));
+ b3 = _mm_loadu_si128((const void *)(buf + 48));
+ b0 = _mm_xor_si128(b0, s0);
+ b1 = _mm_xor_si128(b1, s1);
+ b2 = _mm_xor_si128(b2, s2);
+ b3 = _mm_xor_si128(b3, s3);
+ _mm_storeu_si128((void *)(buf + 0), b0);
+ _mm_storeu_si128((void *)(buf + 16), b1);
+ _mm_storeu_si128((void *)(buf + 32), b2);
+ _mm_storeu_si128((void *)(buf + 48), b3);
+ buf += 64;
+ len -= 64;
+ }
+ }
+
+ /*
+ * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
+ * raw SSE2, thus we use _mm_extract_epi16().
+ */
+ return (uint32_t)_mm_extract_epi16(iw, 0)
+ | ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
+}
+
+/* see bearssl_block.h */
+br_chacha20_run
+br_chacha20_sse2_get(void)
+{
+ /*
+ * If using 64-bit mode, then SSE2 opcodes should be automatically
+ * available, since they are part of the ABI.
+ *
+ * In 32-bit mode, we use CPUID to detect the SSE2 feature.
+ */
+
+#if __x86_64__ || _M_X64
+
+ return &br_chacha20_sse2_run;
+
+#else
+
+ /*
+ * SSE2 support is indicated by bit 26 in EDX.
+ */
+#define MASK 0x04000000
+
+#if BR_SSE2_GCC
+ unsigned eax, ebx, ecx, edx;
+
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+ if ((edx & MASK) == MASK) {
+ return &br_chacha20_sse2_run;
+ }
+ }
+#elif BR_SSE2_MSC
+ int info[4];
+
+ __cpuid(info, 1);
+ if (((uint32_t)info[3] & MASK) == MASK) {
+ return &br_chacha20_sse2_run;
+ }
+#endif
+ return 0;
+
+#endif
+}
+
+#else
+
+/* see bearssl_block.h */
+br_chacha20_run
+br_chacha20_sse2_get(void)
+{
+ return 0;
+}
+
+#endif
};
static void
-test_ChaCha20_ct(void)
+test_ChaCha20_generic(const char *name, br_chacha20_run cr)
{
size_t u;
- printf("Test ChaCha20_ct: ");
+ printf("Test %s: ", name);
fflush(stdout);
+ if (cr == 0) {
+ printf("UNAVAILABLE\n");
+ return;
+ }
for (u = 0; KAT_CHACHA20[u].skey; u ++) {
unsigned char key[32], nonce[12], plain[400], cipher[400];
for (v = 0; v < len; v ++) {
unsigned char tmp[400];
size_t w;
+ uint32_t cc2;
memset(tmp, 0, sizeof tmp);
memcpy(tmp, plain, v);
- if (br_chacha20_ct_run(key, nonce, cc, tmp, v)
+ if (cr(key, nonce, cc, tmp, v)
!= cc + (uint32_t)((v + 63) >> 6))
{
fprintf(stderr, "ChaCha20: wrong counter\n");
exit(EXIT_FAILURE);
}
}
- br_chacha20_ct_run(key, nonce, cc, tmp, v);
+ for (w = 0, cc2 = cc; w < v; w += 64, cc2 ++) {
+ size_t x;
+
+ x = v - w;
+ if (x > 64) {
+ x = 64;
+ }
+ if (cr(key, nonce, cc2, tmp + w, x)
+ != (cc2 + 1))
+ {
+ fprintf(stderr, "ChaCha20:"
+ " wrong counter (2)\n");
+ exit(EXIT_FAILURE);
+ }
+ }
if (memcmp(tmp, plain, v) != 0) {
fprintf(stderr, "ChaCha20 KAT fail (2)\n");
exit(EXIT_FAILURE);
fflush(stdout);
}
+static void
+test_ChaCha20_ct(void)
+{
+ test_ChaCha20_generic("ChaCha20_ct", &br_chacha20_ct_run);
+}
+
+static void
+test_ChaCha20_sse2(void)
+{
+ test_ChaCha20_generic("ChaCha20_sse2", br_chacha20_sse2_get());
+}
+
static const struct {
const char *splain;
const char *saad;
STU(DES_tab),
STU(DES_ct),
STU(ChaCha20_ct),
+ STU(ChaCha20_sse2),
STU(Poly1305_ctmul),
STU(Poly1305_ctmul32),
STU(Poly1305_ctmulq),
static void \
test_speed_ ## fname(void) \
{ \
+ br_chacha20_run bc; \
unsigned char key[32]; \
unsigned char buf[8192]; \
unsigned char iv[12]; \
int i; \
long num; \
\
+ bc = br_ ## fname ## _get(); \
+ if (bc == 0) { \
+ printf("%-30s UNAVAILABLE\n", #Name); \
+ fflush(stdout); \
+ return; \
+ } \
memset(key, 'T', sizeof key); \
memset(buf, 'P', sizeof buf); \
memset(iv, 'X', sizeof iv); \
for (i = 0; i < 10; i ++) { \
- br_ ## fname ## _run(key, iv, i, buf, sizeof buf); \
+ bc(key, iv, i, buf, sizeof buf); \
} \
num = 10; \
for (;;) { \
\
begin = clock(); \
for (k = num; k > 0; k --) { \
- br_ ## fname ## _run(key, iv, \
- (uint32_t)k, buf, sizeof buf); \
+ bc(key, iv, (uint32_t)k, buf, sizeof buf); \
} \
end = clock(); \
tt = (double)(end - begin) / CLOCKS_PER_SEC; \
#define br_aes_ct64_cbcenc_get_vtable() (&br_aes_ct64_cbcenc_vtable)
#define br_aes_ct64_cbcdec_get_vtable() (&br_aes_ct64_cbcdec_vtable)
#define br_aes_ct64_ctr_get_vtable() (&br_aes_ct64_ctr_vtable)
+#define br_chacha20_ct_get() (&br_chacha20_ct_run)
#define SPEED_AES(iname) \
SPEED_BLOCKCIPHER_CBC(AES-128 CBC encrypt (iname), aes128_ ## iname ## _cbcenc, aes_ ## iname, 16, enc) \
SPEED_DES(tab)
SPEED_DES(ct)
-SPEED_CHACHA20(ChaCha20, chacha20_ct)
+SPEED_CHACHA20(ChaCha20 (ct), chacha20_ct)
+SPEED_CHACHA20(ChaCha20 (sse2), chacha20_sse2)
static void
test_speed_ghash_inner(char *name, br_ghash gh)
STU(3des_ct_cbcdec),
STU(chacha20_ct),
+ STU(chacha20_sse2),
STU(ghash_ctmul),
STU(ghash_ctmul32),