From: Thomas Pornin Date: Wed, 15 Feb 2017 14:08:37 +0000 (+0000) Subject: New AES and GHASH implementations using POWER8 crypto opcodes. X-Git-Tag: v0.4~9 X-Git-Url: https://bearssl.org/gitweb//home/git/?a=commitdiff_plain;h=db8f1b664524e3fbeea8a0730b2bbe2f0bdcea86;p=BearSSL New AES and GHASH implementations using POWER8 crypto opcodes. --- diff --git a/inc/bearssl_block.h b/inc/bearssl_block.h index c6f20f5..88f51b2 100644 --- a/inc/bearssl_block.h +++ b/inc/bearssl_block.h @@ -192,6 +192,7 @@ * | aes_ct | AES | 16 | 16, 24 and 32 | * | aes_ct64 | AES | 16 | 16, 24 and 32 | * | aes_x86ni | AES | 16 | 16, 24 and 32 | + * | aes_pwr8 | AES | 16 | 16, 24 and 32 | * | des_ct | DES/3DES | 8 | 8, 16 and 24 | * | des_tab | DES/3DES | 8 | 8, 16 and 24 | * @@ -225,9 +226,11 @@ * operations (i.e. CTR, and CBC decryption, but not CBC encryption). * * `aes_x86ni` exists only on x86 architectures (32-bit and 64-bit). It - * uses the AES-NI opcodes when available; if the opcodes are not present, - * then it automatically fall backs on an appropriate constant-time - * implementation (`aes_ct` for 32-bit, `aes_ct64` for 64-bit). + * uses the AES-NI opcodes when available. + * + * `aes_pwr8` exists only on PowerPC / POWER architectures (32-bit and + * 64-bit, both little-endian and big-endian). It uses the AES opcodes + * present in POWER8 and later. * * `des_tab` is a classic, table-based implementation of DES/3DES. It * is not constant-time. @@ -1001,9 +1004,7 @@ uint32_t br_aes_ct64_ctr_run(const br_aes_ct64_ctr_keys *ctx, const void *iv, uint32_t cc, void *data, size_t len); /* - * AES implementation using AES-NI opcode (x86 platform). When the - * opcodes are not present, this falls back to "ct" or "ct64" (depending - * on architecture). + * AES implementation using AES-NI opcodes (x86 platform). */ /** \brief AES block size (16 bytes). */ @@ -1021,12 +1022,6 @@ typedef struct { #ifndef BR_DOXYGEN_IGNORE union { unsigned char skni[16 * 15]; - struct { - uint32_t skey[60]; - } fallback_ct; - struct { - uint64_t skey[30]; - } fallback_ct64; } skey; unsigned num_rounds; #endif @@ -1044,12 +1039,6 @@ typedef struct { #ifndef BR_DOXYGEN_IGNORE union { unsigned char skni[16 * 15]; - struct { - uint32_t skey[60]; - } fallback_ct; - struct { - uint64_t skey[30]; - } fallback_ct64; } skey; unsigned num_rounds; #endif @@ -1068,12 +1057,6 @@ typedef struct { #ifndef BR_DOXYGEN_IGNORE union { unsigned char skni[16 * 15]; - struct { - uint32_t skey[60]; - } fallback_ct; - struct { - uint64_t skey[30]; - } fallback_ct64; } skey; unsigned num_rounds; #endif @@ -1213,6 +1196,199 @@ const br_block_cbcdec_class *br_aes_x86ni_cbcdec_get_vtable(void); */ const br_block_ctr_class *br_aes_x86ni_ctr_get_vtable(void); +/* + * AES implementation using POWER8 opcodes. + */ + +/** \brief AES block size (16 bytes). */ +#define br_aes_pwr8_BLOCK_SIZE 16 + +/** + * \brief Context for AES subkeys (`aes_pwr8` implementation, CBC encryption). + * + * First field is a pointer to the vtable; it is set by the initialisation + * function. Other fields are not supposed to be accessed by user code. + */ +typedef struct { + /** \brief Pointer to vtable for this context. */ + const br_block_cbcenc_class *vtable; +#ifndef BR_DOXYGEN_IGNORE + union { + unsigned char skni[16 * 15]; + } skey; + unsigned num_rounds; +#endif +} br_aes_pwr8_cbcenc_keys; + +/** + * \brief Context for AES subkeys (`aes_pwr8` implementation, CBC decryption). + * + * First field is a pointer to the vtable; it is set by the initialisation + * function. Other fields are not supposed to be accessed by user code. + */ +typedef struct { + /** \brief Pointer to vtable for this context. */ + const br_block_cbcdec_class *vtable; +#ifndef BR_DOXYGEN_IGNORE + union { + unsigned char skni[16 * 15]; + } skey; + unsigned num_rounds; +#endif +} br_aes_pwr8_cbcdec_keys; + +/** + * \brief Context for AES subkeys (`aes_pwr8` implementation, CTR encryption + * and decryption). + * + * First field is a pointer to the vtable; it is set by the initialisation + * function. Other fields are not supposed to be accessed by user code. + */ +typedef struct { + /** \brief Pointer to vtable for this context. */ + const br_block_ctr_class *vtable; +#ifndef BR_DOXYGEN_IGNORE + union { + unsigned char skni[16 * 15]; + } skey; + unsigned num_rounds; +#endif +} br_aes_pwr8_ctr_keys; + +/** + * \brief Class instance for AES CBC encryption (`aes_pwr8` implementation). + * + * Since this implementation might be omitted from the library, or the + * AES opcode unavailable on the current CPU, a pointer to this class + * instance should be obtained through `br_aes_pwr8_cbcenc_get_vtable()`. + */ +extern const br_block_cbcenc_class br_aes_pwr8_cbcenc_vtable; + +/** + * \brief Class instance for AES CBC decryption (`aes_pwr8` implementation). + * + * Since this implementation might be omitted from the library, or the + * AES opcode unavailable on the current CPU, a pointer to this class + * instance should be obtained through `br_aes_pwr8_cbcdec_get_vtable()`. + */ +extern const br_block_cbcdec_class br_aes_pwr8_cbcdec_vtable; + +/** + * \brief Class instance for AES CTR encryption and decryption + * (`aes_pwr8` implementation). + * + * Since this implementation might be omitted from the library, or the + * AES opcode unavailable on the current CPU, a pointer to this class + * instance should be obtained through `br_aes_pwr8_ctr_get_vtable()`. + */ +extern const br_block_ctr_class br_aes_pwr8_ctr_vtable; + +/** + * \brief Context initialisation (key schedule) for AES CBC encryption + * (`aes_pwr8` implementation). + * + * \param ctx context to initialise. + * \param key secret key. + * \param len secret key length (in bytes). + */ +void br_aes_pwr8_cbcenc_init(br_aes_pwr8_cbcenc_keys *ctx, + const void *key, size_t len); + +/** + * \brief Context initialisation (key schedule) for AES CBC decryption + * (`aes_pwr8` implementation). + * + * \param ctx context to initialise. + * \param key secret key. + * \param len secret key length (in bytes). + */ +void br_aes_pwr8_cbcdec_init(br_aes_pwr8_cbcdec_keys *ctx, + const void *key, size_t len); + +/** + * \brief Context initialisation (key schedule) for AES CTR encryption + * and decryption (`aes_pwr8` implementation). + * + * \param ctx context to initialise. + * \param key secret key. + * \param len secret key length (in bytes). + */ +void br_aes_pwr8_ctr_init(br_aes_pwr8_ctr_keys *ctx, + const void *key, size_t len); + +/** + * \brief CBC encryption with AES (`aes_pwr8` implementation). + * + * \param ctx context (already initialised). + * \param iv IV (updated). + * \param data data to encrypt (updated). + * \param len data length (in bytes, MUST be multiple of 16). + */ +void br_aes_pwr8_cbcenc_run(const br_aes_pwr8_cbcenc_keys *ctx, void *iv, + void *data, size_t len); + +/** + * \brief CBC decryption with AES (`aes_pwr8` implementation). + * + * \param ctx context (already initialised). + * \param iv IV (updated). + * \param data data to decrypt (updated). + * \param len data length (in bytes, MUST be multiple of 16). + */ +void br_aes_pwr8_cbcdec_run(const br_aes_pwr8_cbcdec_keys *ctx, void *iv, + void *data, size_t len); + +/** + * \brief CTR encryption and decryption with AES (`aes_pwr8` implementation). + * + * \param ctx context (already initialised). + * \param iv IV (constant, 12 bytes). + * \param cc initial block counter value. + * \param data data to decrypt (updated). + * \param len data length (in bytes). + * \return new block counter value. + */ +uint32_t br_aes_pwr8_ctr_run(const br_aes_pwr8_ctr_keys *ctx, + const void *iv, uint32_t cc, void *data, size_t len); + +/** + * \brief Obtain the `aes_pwr8` AES-CBC (encryption) implementation, if + * available. + * + * This function returns a pointer to `br_aes_pwr8_cbcenc_vtable`, if + * that implementation was compiled in the library _and_ the x86 AES + * opcodes are available on the currently running CPU. If either of + * these conditions is not met, then this function returns `NULL`. + * + * \return the `aes_x868ni` AES-CBC (encryption) implementation, or `NULL`. + */ +const br_block_cbcenc_class *br_aes_pwr8_cbcenc_get_vtable(void); + +/** + * \brief Obtain the `aes_pwr8` AES-CBC (decryption) implementation, if + * available. + * + * This function returns a pointer to `br_aes_pwr8_cbcdec_vtable`, if + * that implementation was compiled in the library _and_ the x86 AES + * opcodes are available on the currently running CPU. If either of + * these conditions is not met, then this function returns `NULL`. + * + * \return the `aes_x868ni` AES-CBC (decryption) implementation, or `NULL`. + */ +const br_block_cbcdec_class *br_aes_pwr8_cbcdec_get_vtable(void); + +/** + * \brief Obtain the `aes_pwr8` AES-CTR implementation, if available. + * + * This function returns a pointer to `br_aes_pwr8_ctr_vtable`, if + * that implementation was compiled in the library _and_ the x86 AES + * opcodes are available on the currently running CPU. If either of + * these conditions is not met, then this function returns `NULL`. + * + * \return the `aes_x868ni` AES-CTR implementation, or `NULL`. + */ +const br_block_ctr_class *br_aes_pwr8_ctr_get_vtable(void); + /** * \brief Aggregate structure large enough to be used as context for * subkeys (CBC encryption) for all AES implementations. @@ -1224,6 +1400,7 @@ typedef union { br_aes_ct_cbcenc_keys c_ct; br_aes_ct64_cbcenc_keys c_ct64; br_aes_x86ni_cbcenc_keys c_x86ni; + br_aes_pwr8_cbcenc_keys c_pwr8; } br_aes_gen_cbcenc_keys; /** @@ -1237,6 +1414,7 @@ typedef union { br_aes_ct_cbcdec_keys c_ct; br_aes_ct64_cbcdec_keys c_ct64; br_aes_x86ni_cbcdec_keys c_x86ni; + br_aes_pwr8_cbcdec_keys c_pwr8; } br_aes_gen_cbcdec_keys; /** @@ -1250,6 +1428,7 @@ typedef union { br_aes_ct_ctr_keys c_ct; br_aes_ct64_ctr_keys c_ct64; br_aes_x86ni_ctr_keys c_x86ni; + br_aes_pwr8_ctr_keys c_pwr8; } br_aes_gen_ctr_keys; /* diff --git a/inc/bearssl_hash.h b/inc/bearssl_hash.h index 524ac01..d06bae4 100644 --- a/inc/bearssl_hash.h +++ b/inc/bearssl_hash.h @@ -1309,4 +1309,30 @@ void br_ghash_pclmul(void *y, const void *h, const void *data, size_t len); */ br_ghash br_ghash_pclmul_get(void); +/** + * \brief GHASH implementation using the POWER8 opcodes. + * + * This implementation is available only on POWER8 platforms (and later). + * To safely obtain a pointer to this function when supported (or 0 + * otherwise), use `br_ghash_pwr8_get()`. + * + * \param y the array to update. + * \param h the GHASH key. + * \param data the input data (may be `NULL` if `len` is zero). + * \param len the input data length (in bytes). + */ +void br_ghash_pwr8(void *y, const void *h, const void *data, size_t len); + +/** + * \brief Obtain the `pwr8` GHASH implementation, if available. + * + * If the `pwr8` implementation was compiled in the library (depending + * on the compiler abilities) _and_ the local CPU appears to support the + * opcode, then this function will return a pointer to the + * `br_ghash_pwr8()` function. Otherwise, it will return `0`. + * + * \return the `pwr8` GHASH implementation, or `0`. + */ +br_ghash br_ghash_pwr8_get(void); + #endif diff --git a/mk/Rules.mk b/mk/Rules.mk index 9e2ba07..742f35c 100644 --- a/mk/Rules.mk +++ b/mk/Rules.mk @@ -1,6 +1,6 @@ # Automatically generated rules. Use 'mkrules.sh' to modify/regenerate. -OBJ = $(OBJDIR)$Pccopy$O $(OBJDIR)$Pdec16be$O $(OBJDIR)$Pdec16le$O $(OBJDIR)$Pdec32be$O $(OBJDIR)$Pdec32le$O $(OBJDIR)$Pdec64be$O $(OBJDIR)$Pdec64le$O $(OBJDIR)$Penc16be$O $(OBJDIR)$Penc16le$O $(OBJDIR)$Penc32be$O $(OBJDIR)$Penc32le$O $(OBJDIR)$Penc64be$O $(OBJDIR)$Penc64le$O $(OBJDIR)$Ppemdec$O $(OBJDIR)$Pec_all_m15$O $(OBJDIR)$Pec_all_m31$O $(OBJDIR)$Pec_c25519_i15$O $(OBJDIR)$Pec_c25519_i31$O $(OBJDIR)$Pec_c25519_m15$O $(OBJDIR)$Pec_c25519_m31$O $(OBJDIR)$Pec_curve25519$O $(OBJDIR)$Pec_default$O $(OBJDIR)$Pec_p256_m15$O $(OBJDIR)$Pec_p256_m31$O $(OBJDIR)$Pec_prime_i15$O $(OBJDIR)$Pec_prime_i31$O $(OBJDIR)$Pec_secp256r1$O $(OBJDIR)$Pec_secp384r1$O $(OBJDIR)$Pec_secp521r1$O $(OBJDIR)$Pecdsa_atr$O $(OBJDIR)$Pecdsa_default_sign_asn1$O $(OBJDIR)$Pecdsa_default_sign_raw$O $(OBJDIR)$Pecdsa_default_vrfy_asn1$O $(OBJDIR)$Pecdsa_default_vrfy_raw$O $(OBJDIR)$Pecdsa_i15_bits$O $(OBJDIR)$Pecdsa_i15_sign_asn1$O $(OBJDIR)$Pecdsa_i15_sign_raw$O $(OBJDIR)$Pecdsa_i15_vrfy_asn1$O $(OBJDIR)$Pecdsa_i15_vrfy_raw$O $(OBJDIR)$Pecdsa_i31_bits$O $(OBJDIR)$Pecdsa_i31_sign_asn1$O $(OBJDIR)$Pecdsa_i31_sign_raw$O $(OBJDIR)$Pecdsa_i31_vrfy_asn1$O $(OBJDIR)$Pecdsa_i31_vrfy_raw$O $(OBJDIR)$Pecdsa_rta$O $(OBJDIR)$Pdig_oid$O $(OBJDIR)$Pdig_size$O $(OBJDIR)$Pghash_ctmul$O $(OBJDIR)$Pghash_ctmul32$O $(OBJDIR)$Pghash_ctmul64$O $(OBJDIR)$Pghash_pclmul$O $(OBJDIR)$Pmd5$O $(OBJDIR)$Pmd5sha1$O $(OBJDIR)$Pmultihash$O $(OBJDIR)$Psha1$O $(OBJDIR)$Psha2big$O $(OBJDIR)$Psha2small$O $(OBJDIR)$Pi15_add$O $(OBJDIR)$Pi15_bitlen$O $(OBJDIR)$Pi15_decmod$O $(OBJDIR)$Pi15_decode$O $(OBJDIR)$Pi15_decred$O $(OBJDIR)$Pi15_encode$O $(OBJDIR)$Pi15_fmont$O $(OBJDIR)$Pi15_iszero$O $(OBJDIR)$Pi15_modpow$O $(OBJDIR)$Pi15_modpow2$O $(OBJDIR)$Pi15_montmul$O $(OBJDIR)$Pi15_mulacc$O $(OBJDIR)$Pi15_muladd$O $(OBJDIR)$Pi15_ninv15$O $(OBJDIR)$Pi15_reduce$O $(OBJDIR)$Pi15_rshift$O $(OBJDIR)$Pi15_sub$O $(OBJDIR)$Pi15_tmont$O $(OBJDIR)$Pi31_add$O $(OBJDIR)$Pi31_bitlen$O $(OBJDIR)$Pi31_decmod$O $(OBJDIR)$Pi31_decode$O $(OBJDIR)$Pi31_decred$O $(OBJDIR)$Pi31_encode$O $(OBJDIR)$Pi31_fmont$O $(OBJDIR)$Pi31_iszero$O $(OBJDIR)$Pi31_modpow$O $(OBJDIR)$Pi31_montmul$O $(OBJDIR)$Pi31_mulacc$O $(OBJDIR)$Pi31_muladd$O $(OBJDIR)$Pi31_ninv31$O $(OBJDIR)$Pi31_reduce$O $(OBJDIR)$Pi31_rshift$O $(OBJDIR)$Pi31_sub$O $(OBJDIR)$Pi31_tmont$O $(OBJDIR)$Pi32_add$O $(OBJDIR)$Pi32_bitlen$O $(OBJDIR)$Pi32_decmod$O $(OBJDIR)$Pi32_decode$O $(OBJDIR)$Pi32_decred$O $(OBJDIR)$Pi32_div32$O $(OBJDIR)$Pi32_encode$O $(OBJDIR)$Pi32_fmont$O $(OBJDIR)$Pi32_iszero$O $(OBJDIR)$Pi32_modpow$O $(OBJDIR)$Pi32_montmul$O $(OBJDIR)$Pi32_mulacc$O $(OBJDIR)$Pi32_muladd$O $(OBJDIR)$Pi32_ninv32$O $(OBJDIR)$Pi32_reduce$O $(OBJDIR)$Pi32_sub$O $(OBJDIR)$Pi32_tmont$O $(OBJDIR)$Phmac$O $(OBJDIR)$Phmac_ct$O $(OBJDIR)$Phmac_drbg$O $(OBJDIR)$Prsa_default_pkcs1_sign$O $(OBJDIR)$Prsa_default_pkcs1_vrfy$O $(OBJDIR)$Prsa_default_priv$O $(OBJDIR)$Prsa_default_pub$O $(OBJDIR)$Prsa_i15_pkcs1_sign$O $(OBJDIR)$Prsa_i15_pkcs1_vrfy$O $(OBJDIR)$Prsa_i15_priv$O $(OBJDIR)$Prsa_i15_pub$O $(OBJDIR)$Prsa_i31_pkcs1_sign$O $(OBJDIR)$Prsa_i31_pkcs1_vrfy$O $(OBJDIR)$Prsa_i31_priv$O $(OBJDIR)$Prsa_i31_pub$O $(OBJDIR)$Prsa_i32_pkcs1_sign$O $(OBJDIR)$Prsa_i32_pkcs1_vrfy$O $(OBJDIR)$Prsa_i32_priv$O $(OBJDIR)$Prsa_i32_pub$O $(OBJDIR)$Prsa_pkcs1_sig_pad$O $(OBJDIR)$Prsa_pkcs1_sig_unpad$O $(OBJDIR)$Prsa_ssl_decrypt$O $(OBJDIR)$Pprf$O $(OBJDIR)$Pprf_md5sha1$O $(OBJDIR)$Pprf_sha256$O $(OBJDIR)$Pprf_sha384$O $(OBJDIR)$Pssl_ccert_single_ec$O $(OBJDIR)$Pssl_ccert_single_rsa$O $(OBJDIR)$Pssl_client$O $(OBJDIR)$Pssl_client_default_rsapub$O $(OBJDIR)$Pssl_client_full$O $(OBJDIR)$Pssl_engine$O $(OBJDIR)$Pssl_engine_default_aescbc$O $(OBJDIR)$Pssl_engine_default_aesgcm$O $(OBJDIR)$Pssl_engine_default_chapol$O $(OBJDIR)$Pssl_engine_default_descbc$O $(OBJDIR)$Pssl_engine_default_ec$O $(OBJDIR)$Pssl_engine_default_ecdsa$O $(OBJDIR)$Pssl_engine_default_rsavrfy$O $(OBJDIR)$Pssl_hashes$O $(OBJDIR)$Pssl_hs_client$O $(OBJDIR)$Pssl_hs_server$O $(OBJDIR)$Pssl_io$O $(OBJDIR)$Pssl_lru$O $(OBJDIR)$Pssl_rec_cbc$O $(OBJDIR)$Pssl_rec_chapol$O $(OBJDIR)$Pssl_rec_gcm$O $(OBJDIR)$Pssl_scert_single_ec$O $(OBJDIR)$Pssl_scert_single_rsa$O $(OBJDIR)$Pssl_server$O $(OBJDIR)$Pssl_server_full_ec$O $(OBJDIR)$Pssl_server_full_rsa$O $(OBJDIR)$Pssl_server_mine2c$O $(OBJDIR)$Pssl_server_mine2g$O $(OBJDIR)$Pssl_server_minf2c$O $(OBJDIR)$Pssl_server_minf2g$O $(OBJDIR)$Pssl_server_minr2g$O $(OBJDIR)$Pssl_server_minu2g$O $(OBJDIR)$Pssl_server_minv2g$O $(OBJDIR)$Paes_big_cbcdec$O $(OBJDIR)$Paes_big_cbcenc$O $(OBJDIR)$Paes_big_ctr$O $(OBJDIR)$Paes_big_dec$O $(OBJDIR)$Paes_big_enc$O $(OBJDIR)$Paes_common$O $(OBJDIR)$Paes_ct$O $(OBJDIR)$Paes_ct64$O $(OBJDIR)$Paes_ct64_cbcdec$O $(OBJDIR)$Paes_ct64_cbcenc$O $(OBJDIR)$Paes_ct64_ctr$O $(OBJDIR)$Paes_ct64_dec$O $(OBJDIR)$Paes_ct64_enc$O $(OBJDIR)$Paes_ct_cbcdec$O $(OBJDIR)$Paes_ct_cbcenc$O $(OBJDIR)$Paes_ct_ctr$O $(OBJDIR)$Paes_ct_dec$O $(OBJDIR)$Paes_ct_enc$O $(OBJDIR)$Paes_small_cbcdec$O $(OBJDIR)$Paes_small_cbcenc$O $(OBJDIR)$Paes_small_ctr$O $(OBJDIR)$Paes_small_dec$O $(OBJDIR)$Paes_small_enc$O $(OBJDIR)$Paes_x86ni$O $(OBJDIR)$Paes_x86ni_cbcdec$O $(OBJDIR)$Paes_x86ni_cbcenc$O $(OBJDIR)$Paes_x86ni_ctr$O $(OBJDIR)$Pchacha20_ct$O $(OBJDIR)$Pdes_ct$O $(OBJDIR)$Pdes_ct_cbcdec$O $(OBJDIR)$Pdes_ct_cbcenc$O $(OBJDIR)$Pdes_support$O $(OBJDIR)$Pdes_tab$O $(OBJDIR)$Pdes_tab_cbcdec$O $(OBJDIR)$Pdes_tab_cbcenc$O $(OBJDIR)$Ppoly1305_ctmul$O $(OBJDIR)$Ppoly1305_ctmul32$O $(OBJDIR)$Ppoly1305_i15$O $(OBJDIR)$Pskey_decoder$O $(OBJDIR)$Px509_decoder$O $(OBJDIR)$Px509_knownkey$O $(OBJDIR)$Px509_minimal$O $(OBJDIR)$Px509_minimal_full$O +OBJ = $(OBJDIR)$Pccopy$O $(OBJDIR)$Pdec16be$O $(OBJDIR)$Pdec16le$O $(OBJDIR)$Pdec32be$O $(OBJDIR)$Pdec32le$O $(OBJDIR)$Pdec64be$O $(OBJDIR)$Pdec64le$O $(OBJDIR)$Penc16be$O $(OBJDIR)$Penc16le$O $(OBJDIR)$Penc32be$O $(OBJDIR)$Penc32le$O $(OBJDIR)$Penc64be$O $(OBJDIR)$Penc64le$O $(OBJDIR)$Ppemdec$O $(OBJDIR)$Pec_all_m15$O $(OBJDIR)$Pec_all_m31$O $(OBJDIR)$Pec_c25519_i15$O $(OBJDIR)$Pec_c25519_i31$O $(OBJDIR)$Pec_c25519_m15$O $(OBJDIR)$Pec_c25519_m31$O $(OBJDIR)$Pec_curve25519$O $(OBJDIR)$Pec_default$O $(OBJDIR)$Pec_p256_m15$O $(OBJDIR)$Pec_p256_m31$O $(OBJDIR)$Pec_prime_i15$O $(OBJDIR)$Pec_prime_i31$O $(OBJDIR)$Pec_secp256r1$O $(OBJDIR)$Pec_secp384r1$O $(OBJDIR)$Pec_secp521r1$O $(OBJDIR)$Pecdsa_atr$O $(OBJDIR)$Pecdsa_default_sign_asn1$O $(OBJDIR)$Pecdsa_default_sign_raw$O $(OBJDIR)$Pecdsa_default_vrfy_asn1$O $(OBJDIR)$Pecdsa_default_vrfy_raw$O $(OBJDIR)$Pecdsa_i15_bits$O $(OBJDIR)$Pecdsa_i15_sign_asn1$O $(OBJDIR)$Pecdsa_i15_sign_raw$O $(OBJDIR)$Pecdsa_i15_vrfy_asn1$O $(OBJDIR)$Pecdsa_i15_vrfy_raw$O $(OBJDIR)$Pecdsa_i31_bits$O $(OBJDIR)$Pecdsa_i31_sign_asn1$O $(OBJDIR)$Pecdsa_i31_sign_raw$O $(OBJDIR)$Pecdsa_i31_vrfy_asn1$O $(OBJDIR)$Pecdsa_i31_vrfy_raw$O $(OBJDIR)$Pecdsa_rta$O $(OBJDIR)$Pdig_oid$O $(OBJDIR)$Pdig_size$O $(OBJDIR)$Pghash_ctmul$O $(OBJDIR)$Pghash_ctmul32$O $(OBJDIR)$Pghash_ctmul64$O $(OBJDIR)$Pghash_pclmul$O $(OBJDIR)$Pghash_pwr8$O $(OBJDIR)$Pmd5$O $(OBJDIR)$Pmd5sha1$O $(OBJDIR)$Pmultihash$O $(OBJDIR)$Psha1$O $(OBJDIR)$Psha2big$O $(OBJDIR)$Psha2small$O $(OBJDIR)$Pi15_add$O $(OBJDIR)$Pi15_bitlen$O $(OBJDIR)$Pi15_decmod$O $(OBJDIR)$Pi15_decode$O $(OBJDIR)$Pi15_decred$O $(OBJDIR)$Pi15_encode$O $(OBJDIR)$Pi15_fmont$O $(OBJDIR)$Pi15_iszero$O $(OBJDIR)$Pi15_modpow$O $(OBJDIR)$Pi15_modpow2$O $(OBJDIR)$Pi15_montmul$O $(OBJDIR)$Pi15_mulacc$O $(OBJDIR)$Pi15_muladd$O $(OBJDIR)$Pi15_ninv15$O $(OBJDIR)$Pi15_reduce$O $(OBJDIR)$Pi15_rshift$O $(OBJDIR)$Pi15_sub$O $(OBJDIR)$Pi15_tmont$O $(OBJDIR)$Pi31_add$O $(OBJDIR)$Pi31_bitlen$O $(OBJDIR)$Pi31_decmod$O $(OBJDIR)$Pi31_decode$O $(OBJDIR)$Pi31_decred$O $(OBJDIR)$Pi31_encode$O $(OBJDIR)$Pi31_fmont$O $(OBJDIR)$Pi31_iszero$O $(OBJDIR)$Pi31_modpow$O $(OBJDIR)$Pi31_montmul$O $(OBJDIR)$Pi31_mulacc$O $(OBJDIR)$Pi31_muladd$O $(OBJDIR)$Pi31_ninv31$O $(OBJDIR)$Pi31_reduce$O $(OBJDIR)$Pi31_rshift$O $(OBJDIR)$Pi31_sub$O $(OBJDIR)$Pi31_tmont$O $(OBJDIR)$Pi32_add$O $(OBJDIR)$Pi32_bitlen$O $(OBJDIR)$Pi32_decmod$O $(OBJDIR)$Pi32_decode$O $(OBJDIR)$Pi32_decred$O $(OBJDIR)$Pi32_div32$O $(OBJDIR)$Pi32_encode$O $(OBJDIR)$Pi32_fmont$O $(OBJDIR)$Pi32_iszero$O $(OBJDIR)$Pi32_modpow$O $(OBJDIR)$Pi32_montmul$O $(OBJDIR)$Pi32_mulacc$O $(OBJDIR)$Pi32_muladd$O $(OBJDIR)$Pi32_ninv32$O $(OBJDIR)$Pi32_reduce$O $(OBJDIR)$Pi32_sub$O $(OBJDIR)$Pi32_tmont$O $(OBJDIR)$Phmac$O $(OBJDIR)$Phmac_ct$O $(OBJDIR)$Phmac_drbg$O $(OBJDIR)$Prsa_default_pkcs1_sign$O $(OBJDIR)$Prsa_default_pkcs1_vrfy$O $(OBJDIR)$Prsa_default_priv$O $(OBJDIR)$Prsa_default_pub$O $(OBJDIR)$Prsa_i15_pkcs1_sign$O $(OBJDIR)$Prsa_i15_pkcs1_vrfy$O $(OBJDIR)$Prsa_i15_priv$O $(OBJDIR)$Prsa_i15_pub$O $(OBJDIR)$Prsa_i31_pkcs1_sign$O $(OBJDIR)$Prsa_i31_pkcs1_vrfy$O $(OBJDIR)$Prsa_i31_priv$O $(OBJDIR)$Prsa_i31_pub$O $(OBJDIR)$Prsa_i32_pkcs1_sign$O $(OBJDIR)$Prsa_i32_pkcs1_vrfy$O $(OBJDIR)$Prsa_i32_priv$O $(OBJDIR)$Prsa_i32_pub$O $(OBJDIR)$Prsa_pkcs1_sig_pad$O $(OBJDIR)$Prsa_pkcs1_sig_unpad$O $(OBJDIR)$Prsa_ssl_decrypt$O $(OBJDIR)$Pprf$O $(OBJDIR)$Pprf_md5sha1$O $(OBJDIR)$Pprf_sha256$O $(OBJDIR)$Pprf_sha384$O $(OBJDIR)$Pssl_ccert_single_ec$O $(OBJDIR)$Pssl_ccert_single_rsa$O $(OBJDIR)$Pssl_client$O $(OBJDIR)$Pssl_client_default_rsapub$O $(OBJDIR)$Pssl_client_full$O $(OBJDIR)$Pssl_engine$O $(OBJDIR)$Pssl_engine_default_aescbc$O $(OBJDIR)$Pssl_engine_default_aesgcm$O $(OBJDIR)$Pssl_engine_default_chapol$O $(OBJDIR)$Pssl_engine_default_descbc$O $(OBJDIR)$Pssl_engine_default_ec$O $(OBJDIR)$Pssl_engine_default_ecdsa$O $(OBJDIR)$Pssl_engine_default_rsavrfy$O $(OBJDIR)$Pssl_hashes$O $(OBJDIR)$Pssl_hs_client$O $(OBJDIR)$Pssl_hs_server$O $(OBJDIR)$Pssl_io$O $(OBJDIR)$Pssl_lru$O $(OBJDIR)$Pssl_rec_cbc$O $(OBJDIR)$Pssl_rec_chapol$O $(OBJDIR)$Pssl_rec_gcm$O $(OBJDIR)$Pssl_scert_single_ec$O $(OBJDIR)$Pssl_scert_single_rsa$O $(OBJDIR)$Pssl_server$O $(OBJDIR)$Pssl_server_full_ec$O $(OBJDIR)$Pssl_server_full_rsa$O $(OBJDIR)$Pssl_server_mine2c$O $(OBJDIR)$Pssl_server_mine2g$O $(OBJDIR)$Pssl_server_minf2c$O $(OBJDIR)$Pssl_server_minf2g$O $(OBJDIR)$Pssl_server_minr2g$O $(OBJDIR)$Pssl_server_minu2g$O $(OBJDIR)$Pssl_server_minv2g$O $(OBJDIR)$Paes_big_cbcdec$O $(OBJDIR)$Paes_big_cbcenc$O $(OBJDIR)$Paes_big_ctr$O $(OBJDIR)$Paes_big_dec$O $(OBJDIR)$Paes_big_enc$O $(OBJDIR)$Paes_common$O $(OBJDIR)$Paes_ct$O $(OBJDIR)$Paes_ct64$O $(OBJDIR)$Paes_ct64_cbcdec$O $(OBJDIR)$Paes_ct64_cbcenc$O $(OBJDIR)$Paes_ct64_ctr$O $(OBJDIR)$Paes_ct64_dec$O $(OBJDIR)$Paes_ct64_enc$O $(OBJDIR)$Paes_ct_cbcdec$O $(OBJDIR)$Paes_ct_cbcenc$O $(OBJDIR)$Paes_ct_ctr$O $(OBJDIR)$Paes_ct_dec$O $(OBJDIR)$Paes_ct_enc$O $(OBJDIR)$Paes_pwr8$O $(OBJDIR)$Paes_pwr8_cbcdec$O $(OBJDIR)$Paes_pwr8_cbcenc$O $(OBJDIR)$Paes_pwr8_ctr$O $(OBJDIR)$Paes_small_cbcdec$O $(OBJDIR)$Paes_small_cbcenc$O $(OBJDIR)$Paes_small_ctr$O $(OBJDIR)$Paes_small_dec$O $(OBJDIR)$Paes_small_enc$O $(OBJDIR)$Paes_x86ni$O $(OBJDIR)$Paes_x86ni_cbcdec$O $(OBJDIR)$Paes_x86ni_cbcenc$O $(OBJDIR)$Paes_x86ni_ctr$O $(OBJDIR)$Pchacha20_ct$O $(OBJDIR)$Pdes_ct$O $(OBJDIR)$Pdes_ct_cbcdec$O $(OBJDIR)$Pdes_ct_cbcenc$O $(OBJDIR)$Pdes_support$O $(OBJDIR)$Pdes_tab$O $(OBJDIR)$Pdes_tab_cbcdec$O $(OBJDIR)$Pdes_tab_cbcenc$O $(OBJDIR)$Ppoly1305_ctmul$O $(OBJDIR)$Ppoly1305_ctmul32$O $(OBJDIR)$Ppoly1305_i15$O $(OBJDIR)$Pskey_decoder$O $(OBJDIR)$Px509_decoder$O $(OBJDIR)$Px509_knownkey$O $(OBJDIR)$Px509_minimal$O $(OBJDIR)$Px509_minimal_full$O OBJBRSSL = $(OBJDIR)$Pbrssl$O $(OBJDIR)$Pcerts$O $(OBJDIR)$Pchain$O $(OBJDIR)$Pclient$O $(OBJDIR)$Perrors$O $(OBJDIR)$Pfiles$O $(OBJDIR)$Pkeys$O $(OBJDIR)$Pnames$O $(OBJDIR)$Pserver$O $(OBJDIR)$Pskey$O $(OBJDIR)$Psslio$O $(OBJDIR)$Pta$O $(OBJDIR)$Pvector$O $(OBJDIR)$Pverify$O $(OBJDIR)$Pxmem$O OBJTESTCRYPTO = $(OBJDIR)$Ptest_crypto$O OBJTESTSPEED = $(OBJDIR)$Ptest_speed$O @@ -212,6 +212,9 @@ $(OBJDIR)$Pghash_ctmul64$O: src$Phash$Pghash_ctmul64.c $(HEADERSPRIV) $(OBJDIR)$Pghash_pclmul$O: src$Phash$Pghash_pclmul.c $(HEADERSPRIV) $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pghash_pclmul$O src$Phash$Pghash_pclmul.c +$(OBJDIR)$Pghash_pwr8$O: src$Phash$Pghash_pwr8.c $(HEADERSPRIV) + $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pghash_pwr8$O src$Phash$Pghash_pwr8.c + $(OBJDIR)$Pmd5$O: src$Phash$Pmd5.c $(HEADERSPRIV) $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pmd5$O src$Phash$Pmd5.c @@ -617,6 +620,18 @@ $(OBJDIR)$Paes_ct_dec$O: src$Psymcipher$Paes_ct_dec.c $(HEADERSPRIV) $(OBJDIR)$Paes_ct_enc$O: src$Psymcipher$Paes_ct_enc.c $(HEADERSPRIV) $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_ct_enc$O src$Psymcipher$Paes_ct_enc.c +$(OBJDIR)$Paes_pwr8$O: src$Psymcipher$Paes_pwr8.c $(HEADERSPRIV) + $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_pwr8$O src$Psymcipher$Paes_pwr8.c + +$(OBJDIR)$Paes_pwr8_cbcdec$O: src$Psymcipher$Paes_pwr8_cbcdec.c $(HEADERSPRIV) + $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_pwr8_cbcdec$O src$Psymcipher$Paes_pwr8_cbcdec.c + +$(OBJDIR)$Paes_pwr8_cbcenc$O: src$Psymcipher$Paes_pwr8_cbcenc.c $(HEADERSPRIV) + $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_pwr8_cbcenc$O src$Psymcipher$Paes_pwr8_cbcenc.c + +$(OBJDIR)$Paes_pwr8_ctr$O: src$Psymcipher$Paes_pwr8_ctr.c $(HEADERSPRIV) + $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_pwr8_ctr$O src$Psymcipher$Paes_pwr8_ctr.c + $(OBJDIR)$Paes_small_cbcdec$O: src$Psymcipher$Paes_small_cbcdec.c $(HEADERSPRIV) $(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_small_cbcdec$O src$Psymcipher$Paes_small_cbcdec.c diff --git a/mk/mkrules.sh b/mk/mkrules.sh index 4c9d2cd..929d305 100755 --- a/mk/mkrules.sh +++ b/mk/mkrules.sh @@ -100,6 +100,7 @@ coresrc=" \ src/hash/ghash_ctmul32.c \ src/hash/ghash_ctmul64.c \ src/hash/ghash_pclmul.c \ + src/hash/ghash_pwr8.c \ src/hash/md5.c \ src/hash/md5sha1.c \ src/hash/multihash.c \ @@ -235,6 +236,10 @@ coresrc=" \ src/symcipher/aes_ct_ctr.c \ src/symcipher/aes_ct_dec.c \ src/symcipher/aes_ct_enc.c \ + src/symcipher/aes_pwr8.c \ + src/symcipher/aes_pwr8_cbcdec.c \ + src/symcipher/aes_pwr8_cbcenc.c \ + src/symcipher/aes_pwr8_ctr.c \ src/symcipher/aes_small_cbcdec.c \ src/symcipher/aes_small_cbcenc.c \ src/symcipher/aes_small_ctr.c \ diff --git a/src/config.h b/src/config.h index 9eadaf4..d2e7a7d 100644 --- a/src/config.h +++ b/src/config.h @@ -162,4 +162,14 @@ #define BR_AES_X86NI 1 */ +/* + * When BR_POWER8 is enabled, the AES implementation using the POWER ISA + * 2.07 opcodes (available on POWER8 processors and later) is compiled. + * If this is not enabled explicitly, then that implementation will be + * compiled only if a compatible compiler is detected, _and_ the target + * architecture is POWER8 or later. + * +#define BR_POWER8 1 + */ + #endif diff --git a/src/hash/ghash_pwr8.c b/src/hash/ghash_pwr8.c new file mode 100644 index 0000000..2e7b0f4 --- /dev/null +++ b/src/hash/ghash_pwr8.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_POWER_ASM_MACROS 1 +#include "inner.h" + +/* + * This is the GHASH implementation that leverages the POWER8 opcodes. + */ + +#if BR_POWER8 + +/* + * Some symbolic names for registers. + * HB0 = 16 bytes of value 0 + * HB1 = 16 bytes of value 1 + * HB2 = 16 bytes of value 2 + * HB6 = 16 bytes of value 6 + * HB7 = 16 bytes of value 7 + * TT0, TT1 and TT2 are temporaries + * + * BSW holds the pattern for byteswapping 32-bit words; this is set only + * on little-endian systems. XBSW is the same register with the +32 offset + * for access with the VSX opcodes. + */ +#define HB0 0 +#define HB1 1 +#define HB2 2 +#define HB6 3 +#define HB7 4 +#define TT0 5 +#define TT1 6 +#define TT2 7 + +#define BSW 8 +#define XBSW 40 + +/* + * Macro to initialise the constants. + */ +#define INIT \ + vxor(HB0, HB0, HB0) \ + vspltisb(HB1, 1) \ + vspltisb(HB2, 2) \ + vspltisb(HB6, 6) \ + vspltisb(HB7, 7) \ + INIT_BSW + +/* + * Fix endianness of a value after reading it or before writing it, if + * necessary. + */ +#if BR_POWER8_LE +#define INIT_BSW lxvw4x(XBSW, 0, %[idx2be]) +#define FIX_ENDIAN(xx) vperm(xx, xx, xx, BSW) +#else +#define INIT_BSW +#define FIX_ENDIAN(xx) +#endif + +/* + * Left-shift x0:x1 by one bit to the left. This is a corrective action + * needed because GHASH is defined in full little-endian specification, + * while the opcodes use full big-endian convention, so the 255-bit product + * ends up one bit to the right. + */ +#define SL_256(x0, x1) \ + vsldoi(TT0, HB0, x1, 1) \ + vsl(x0, x0, HB1) \ + vsr(TT0, TT0, HB7) \ + vsl(x1, x1, HB1) \ + vxor(x0, x0, TT0) + +/* + * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as + * x0 or x1, or a different register). x0 and x1 are modified. + */ +#define REDUCE_F128(xd, x0, x1) \ + vxor(x0, x0, x1) \ + vsr(TT0, x1, HB1) \ + vsr(TT1, x1, HB2) \ + vsr(TT2, x1, HB7) \ + vxor(x0, x0, TT0) \ + vxor(TT1, TT1, TT2) \ + vxor(x0, x0, TT1) \ + vsldoi(x1, x1, HB0, 15) \ + vsl(TT1, x1, HB6) \ + vsl(TT2, x1, HB1) \ + vxor(x1, TT1, TT2) \ + vsr(TT0, x1, HB1) \ + vsr(TT1, x1, HB2) \ + vsr(TT2, x1, HB7) \ + vxor(x0, x0, x1) \ + vxor(x0, x0, TT0) \ + vxor(TT1, TT1, TT2) \ + vxor(xd, x0, TT1) + +/* see bearssl_hash.h */ +void +br_ghash_pwr8(void *y, const void *h, const void *data, size_t len) +{ + const unsigned char *buf1, *buf2; + size_t num4, num1; + unsigned char tmp[64]; + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + buf1 = data; + + /* + * Assembly code requires data into two chunks; first chunk + * must contain a number of blocks which is a multiple of 4. + * Since the processing for the first chunk is faster, we want + * to make it as big as possible. + * + * For the remainder, there are two possibilities: + * -- if the remainder size is a multiple of 16, then use it + * in place; + * -- otherwise, copy it to the tmp[] array and pad it with + * zeros. + */ + num4 = len >> 6; + buf2 = buf1 + (num4 << 6); + len &= 63; + num1 = (len + 15) >> 4; + if ((len & 15) != 0) { + memcpy(tmp, buf2, len); + memset(tmp + len, 0, (num1 << 4) - len); + buf2 = tmp; + } + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + INIT + + /* + * Load current h (denoted hereafter h1) in v9. + */ + lxvw4x(41, 0, %[h]) + FIX_ENDIAN(9) + + /* + * Load current y into v28. + */ + lxvw4x(60, 0, %[y]) + FIX_ENDIAN(28) + + /* + * Split h1 into three registers: + * v17 = h1_1:h1_0 + * v18 = 0:h1_0 + * v19 = h1_1:0 + */ + xxpermdi(49, 41, 41, 2) + vsldoi(18, HB0, 9, 8) + vsldoi(19, 9, HB0, 8) + + /* + * If num4 is 0, skip directly to the second chunk. + */ + cmpldi(%[num4], 0) + beq(chunk1) + + /* + * Compute h2 = h*h in v10. + */ + vpmsumd(10, 18, 18) + vpmsumd(11, 19, 19) + SL_256(10, 11) + REDUCE_F128(10, 10, 11) + + /* + * Compute h3 = h*h*h in v11. + * We first split h2 into: + * v10 = h2_0:h2_1 + * v11 = 0:h2_0 + * v12 = h2_1:0 + * Then we do the product with h1, and reduce into v11. + */ + vsldoi(11, HB0, 10, 8) + vsldoi(12, 10, HB0, 8) + vpmsumd(13, 10, 17) + vpmsumd(11, 11, 18) + vpmsumd(12, 12, 19) + vsldoi(14, HB0, 13, 8) + vsldoi(15, 13, HB0, 8) + vxor(11, 11, 14) + vxor(12, 12, 15) + SL_256(11, 12) + REDUCE_F128(11, 11, 12) + + /* + * Compute h4 = h*h*h*h in v12. This is done by squaring h2. + */ + vsldoi(12, HB0, 10, 8) + vsldoi(13, 10, HB0, 8) + vpmsumd(12, 12, 12) + vpmsumd(13, 13, 13) + SL_256(12, 13) + REDUCE_F128(12, 12, 13) + + /* + * Repack h1, h2, h3 and h4: + * v13 = h4_0:h3_0 + * v14 = h4_1:h3_1 + * v15 = h2_0:h1_0 + * v16 = h2_1:h1_1 + */ + xxpermdi(45, 44, 43, 0) + xxpermdi(46, 44, 43, 3) + xxpermdi(47, 42, 41, 0) + xxpermdi(48, 42, 41, 3) + + /* + * Loop for each group of four blocks. + */ + mtctr(%[num4]) + label(loop4) + /* + * Read the four next blocks. + * v20 = y + a0 = b0 + * v21 = a1 = b1 + * v22 = a2 = b2 + * v23 = a3 = b3 + */ + lxvw4x(52, %[cc0], %[buf1]) + lxvw4x(53, %[cc1], %[buf1]) + lxvw4x(54, %[cc2], %[buf1]) + lxvw4x(55, %[cc3], %[buf1]) + FIX_ENDIAN(20) + FIX_ENDIAN(21) + FIX_ENDIAN(22) + FIX_ENDIAN(23) + addi(%[buf1], %[buf1], 64) + vxor(20, 20, 28) + + /* + * Repack the blocks into v9, v10, v11 and v12. + * v9 = b0_0:b1_0 + * v10 = b0_1:b1_1 + * v11 = b2_0:b3_0 + * v12 = b2_1:b3_1 + */ + xxpermdi(41, 52, 53, 0) + xxpermdi(42, 52, 53, 3) + xxpermdi(43, 54, 55, 0) + xxpermdi(44, 54, 55, 3) + + /* + * Compute the products. + * v20 = b0_0*h4_0 + b1_0*h3_0 + * v21 = b0_1*h4_0 + b1_1*h3_0 + * v22 = b0_0*h4_1 + b1_0*h3_1 + * v23 = b0_1*h4_1 + b1_1*h3_1 + * v24 = b2_0*h2_0 + b3_0*h1_0 + * v25 = b2_1*h2_0 + b3_1*h1_0 + * v26 = b2_0*h2_1 + b3_0*h1_1 + * v27 = b2_1*h2_1 + b3_1*h1_1 + */ + vpmsumd(20, 13, 9) + vpmsumd(21, 13, 10) + vpmsumd(22, 14, 9) + vpmsumd(23, 14, 10) + vpmsumd(24, 15, 11) + vpmsumd(25, 15, 12) + vpmsumd(26, 16, 11) + vpmsumd(27, 16, 12) + + /* + * Sum products into a single 256-bit result in v11:v12. + */ + vxor(11, 20, 24) + vxor(12, 23, 27) + vxor( 9, 21, 22) + vxor(10, 25, 26) + vxor(20, 9, 10) + vsldoi( 9, HB0, 20, 8) + vsldoi(10, 20, HB0, 8) + vxor(11, 11, 9) + vxor(12, 12, 10) + + /* + * Fix and reduce in GF(2^128); this is the new y (in v28). + */ + SL_256(11, 12) + REDUCE_F128(28, 11, 12) + + /* + * Loop for next group of four blocks. + */ + bdnz(loop4) + + /* + * Process second chunk, one block at a time. + */ + label(chunk1) + cmpldi(%[num1], 0) + beq(done) + + mtctr(%[num1]) + label(loop1) + /* + * Load next data block and XOR it into y. + */ + lxvw4x(41, 0, %[buf2]) +#if BR_POWER8_LE + FIX_ENDIAN(9) +#endif + addi(%[buf2], %[buf2], 16) + vxor(9, 28, 9) + + /* + * Split y into doublewords: + * v9 = y_0:y_1 + * v10 = 0:y_0 + * v11 = y_1:0 + */ + vsldoi(10, HB0, 9, 8) + vsldoi(11, 9, HB0, 8) + + /* + * Compute products with h: + * v12 = y_0 * h_0 + * v13 = y_1 * h_1 + * v14 = y_1 * h_0 + y_0 * h_1 + */ + vpmsumd(14, 9, 17) + vpmsumd(12, 10, 18) + vpmsumd(13, 11, 19) + + /* + * Propagate v14 into v12:v13 to finalise product. + */ + vsldoi(10, HB0, 14, 8) + vsldoi(11, 14, HB0, 8) + vxor(12, 12, 10) + vxor(13, 13, 11) + + /* + * Fix result and reduce into v28 (next value for y). + */ + SL_256(12, 13) + REDUCE_F128(28, 12, 13) + bdnz(loop1) + + label(done) + /* + * Write back the new y. + */ + FIX_ENDIAN(28) + stxvw4x(60, 0, %[y]) + +: [buf1] "+b" (buf1), [buf2] "+b" (buf2) +: [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1), + [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +/* see bearssl_hash.h */ +br_ghash +br_ghash_pwr8_get(void) +{ + return &br_ghash_pwr8; +} + +#else + +/* see bearssl_hash.h */ +br_ghash +br_ghash_pwr8_get(void) +{ + return 0; +} + +#endif diff --git a/src/inner.h b/src/inner.h index 472dc2e..b03f043 100644 --- a/src/inner.h +++ b/src/inner.h @@ -154,6 +154,49 @@ #define BR_TARGET(x) #endif +/* + * POWER8 crypto support. We rely on compiler macros for the + * architecture, since we do not have a reliable, simple way to detect + * the required support at runtime (we could try running an opcode, and + * trapping the exception or signal on illegal instruction, but this + * induces some non-trivial OS dependencies that we would prefer to + * avoid if possible). + */ +#ifndef BR_POWER8 +#if __GNUC__ && ((_ARCH_PWR8 || _ARCH_PPC) && __CRYPTO__) +#define BR_POWER8 1 +#endif +#endif + +/* + * Detect endinanness on POWER8. + */ +#if BR_POWER8 +#if defined BR_POWER8_LE +#undef BR_POWER8_BE +#if BR_POWER8_LE +#define BR_POWER8_BE 0 +#else +#define BR_POWER8_BE 1 +#endif +#elif defined BR_POWER8_BE +#undef BR_POWER8_LE +#if BR_POWER8_BE +#define BR_POWER8_LE 0 +#else +#define BR_POWER8_LE 1 +#endif +#else +#if __LITTLE_ENDIAN__ +#define BR_POWER8_LE 1 +#define BR_POWER8_BE 0 +#else +#define BR_POWER8_LE 0 +#define BR_POWER8_BE 1 +#endif +#endif +#endif + /* ==================================================================== */ /* * Encoding/decoding functions. @@ -1498,6 +1541,19 @@ unsigned br_aes_x86ni_keysched_enc(unsigned char *skni, unsigned br_aes_x86ni_keysched_dec(unsigned char *skni, const void *key, size_t len); +/* + * Test support for AES POWER8 opcodes. + */ +int br_aes_pwr8_supported(void); + +/* + * AES key schedule, using POWER8 instructions. This yields the + * subkeys in the encryption direction. Number of rounds is returned. + * Key size MUST be 16, 24 or 32 bytes; otherwise, 0 is returned. + */ +unsigned br_aes_pwr8_keysched(unsigned char *skni, + const void *key, size_t len); + /* ==================================================================== */ /* * RSA. @@ -1775,4 +1831,85 @@ int br_ssl_choose_hash(unsigned bf); /* ==================================================================== */ +/* + * PowerPC / POWER assembly stuff. The special BR_POWER_ASM_MACROS macro + * must be defined before including this file; this is done by source + * files that use some inline assembly for PowerPC / POWER machines. + */ + +#if BR_POWER_ASM_MACROS + +#define lxvw4x(xt, ra, rb) lxvw4x_(xt, ra, rb) +#define stxvw4x(xt, ra, rb) stxvw4x_(xt, ra, rb) + +#define bdnz(foo) bdnz_(foo) +#define beq(foo) beq_(foo) + +#define li(rx, value) li_(rx, value) +#define addi(rx, ra, imm) addi_(rx, ra, imm) +#define cmpldi(rx, imm) cmpldi_(rx, imm) +#define mtctr(rx) mtctr_(rx) +#define vspltb(vrt, vrb, uim) vspltb_(vrt, vrb, uim) +#define vspltw(vrt, vrb, uim) vspltw_(vrt, vrb, uim) +#define vspltisb(vrt, imm) vspltisb_(vrt, imm) +#define vspltisw(vrt, imm) vspltisw_(vrt, imm) +#define vrlw(vrt, vra, vrb) vrlw_(vrt, vra, vrb) +#define vsbox(vrt, vra) vsbox_(vrt, vra) +#define vxor(vrt, vra, vrb) vxor_(vrt, vra, vrb) +#define vand(vrt, vra, vrb) vand_(vrt, vra, vrb) +#define vsro(vrt, vra, vrb) vsro_(vrt, vra, vrb) +#define vsl(vrt, vra, vrb) vsl_(vrt, vra, vrb) +#define vsldoi(vt, va, vb, sh) vsldoi_(vt, va, vb, sh) +#define vsr(vrt, vra, vrb) vsr_(vrt, vra, vrb) +#define vadduwm(vrt, vra, vrb) vadduwm_(vrt, vra, vrb) +#define vsububm(vrt, vra, vrb) vsububm_(vrt, vra, vrb) +#define vsubuwm(vrt, vra, vrb) vsubuwm_(vrt, vra, vrb) +#define vsrw(vrt, vra, vrb) vsrw_(vrt, vra, vrb) +#define vcipher(vt, va, vb) vcipher_(vt, va, vb) +#define vcipherlast(vt, va, vb) vcipherlast_(vt, va, vb) +#define vncipher(vt, va, vb) vncipher_(vt, va, vb) +#define vncipherlast(vt, va, vb) vncipherlast_(vt, va, vb) +#define vperm(vt, va, vb, vc) vperm_(vt, va, vb, vc) +#define vpmsumd(vt, va, vb) vpmsumd_(vt, va, vb) +#define xxpermdi(vt, va, vb, d) xxpermdi_(vt, va, vb, d) + +#define lxvw4x_(xt, ra, rb) "\tlxvw4x\t" #xt "," #ra "," #rb "\n" +#define stxvw4x_(xt, ra, rb) "\tstxvw4x\t" #xt "," #ra "," #rb "\n" + +#define label(foo) #foo "%=:\n" +#define bdnz_(foo) "\tbdnz\t" #foo "%=\n" +#define beq_(foo) "\tbeq\t" #foo "%=\n" + +#define li_(rx, value) "\tli\t" #rx "," #value "\n" +#define addi_(rx, ra, imm) "\taddi\t" #rx "," #ra "," #imm "\n" +#define cmpldi_(rx, imm) "\tcmpldi\t" #rx "," #imm "\n" +#define mtctr_(rx) "\tmtctr\t" #rx "\n" +#define vspltb_(vrt, vrb, uim) "\tvspltb\t" #vrt "," #vrb "," #uim "\n" +#define vspltw_(vrt, vrb, uim) "\tvspltw\t" #vrt "," #vrb "," #uim "\n" +#define vspltisb_(vrt, imm) "\tvspltisb\t" #vrt "," #imm "\n" +#define vspltisw_(vrt, imm) "\tvspltisw\t" #vrt "," #imm "\n" +#define vrlw_(vrt, vra, vrb) "\tvrlw\t" #vrt "," #vra "," #vrb "\n" +#define vsbox_(vrt, vra) "\tvsbox\t" #vrt "," #vra "\n" +#define vxor_(vrt, vra, vrb) "\tvxor\t" #vrt "," #vra "," #vrb "\n" +#define vand_(vrt, vra, vrb) "\tvand\t" #vrt "," #vra "," #vrb "\n" +#define vsro_(vrt, vra, vrb) "\tvsro\t" #vrt "," #vra "," #vrb "\n" +#define vsl_(vrt, vra, vrb) "\tvsl\t" #vrt "," #vra "," #vrb "\n" +#define vsldoi_(vt, va, vb, sh) "\tvsldoi\t" #vt "," #va "," #vb "," #sh "\n" +#define vsr_(vrt, vra, vrb) "\tvsr\t" #vrt "," #vra "," #vrb "\n" +#define vadduwm_(vrt, vra, vrb) "\tvadduwm\t" #vrt "," #vra "," #vrb "\n" +#define vsububm_(vrt, vra, vrb) "\tvsububm\t" #vrt "," #vra "," #vrb "\n" +#define vsubuwm_(vrt, vra, vrb) "\tvsubuwm\t" #vrt "," #vra "," #vrb "\n" +#define vsrw_(vrt, vra, vrb) "\tvsrw\t" #vrt "," #vra "," #vrb "\n" +#define vcipher_(vt, va, vb) "\tvcipher\t" #vt "," #va "," #vb "\n" +#define vcipherlast_(vt, va, vb) "\tvcipherlast\t" #vt "," #va "," #vb "\n" +#define vncipher_(vt, va, vb) "\tvncipher\t" #vt "," #va "," #vb "\n" +#define vncipherlast_(vt, va, vb) "\tvncipherlast\t" #vt "," #va "," #vb "\n" +#define vperm_(vt, va, vb, vc) "\tvperm\t" #vt "," #va "," #vb "," #vc "\n" +#define vpmsumd_(vt, va, vb) "\tvpmsumd\t" #vt "," #va "," #vb "\n" +#define xxpermdi_(vt, va, vb, d) "\txxpermdi\t" #vt "," #va "," #vb "," #d "\n" + +#endif + +/* ==================================================================== */ + #endif diff --git a/src/ssl/ssl_engine_default_aescbc.c b/src/ssl/ssl_engine_default_aescbc.c index 556d6eb..8c5cdb5 100644 --- a/src/ssl/ssl_engine_default_aescbc.c +++ b/src/ssl/ssl_engine_default_aescbc.c @@ -28,7 +28,7 @@ void br_ssl_engine_set_default_aes_cbc(br_ssl_engine_context *cc) { -#if BR_AES_X86NI +#if BR_AES_X86NI || BR_POWER8 const br_block_cbcenc_class *ienc; const br_block_cbcdec_class *idec; #endif @@ -44,6 +44,14 @@ br_ssl_engine_set_default_aes_cbc(br_ssl_engine_context *cc) return; } #endif +#if BR_POWER8 + ienc = br_aes_pwr8_cbcenc_get_vtable(); + idec = br_aes_pwr8_cbcdec_get_vtable(); + if (ienc != NULL && idec != NULL) { + br_ssl_engine_set_aes_cbc(cc, ienc, idec); + return; + } +#endif #if BR_64 br_ssl_engine_set_aes_cbc(cc, &br_aes_ct64_cbcenc_vtable, diff --git a/src/ssl/ssl_engine_default_aesgcm.c b/src/ssl/ssl_engine_default_aesgcm.c index 9968342..c44a707 100644 --- a/src/ssl/ssl_engine_default_aesgcm.c +++ b/src/ssl/ssl_engine_default_aesgcm.c @@ -28,7 +28,7 @@ void br_ssl_engine_set_default_aes_gcm(br_ssl_engine_context *cc) { -#if BR_AES_X86NI +#if BR_AES_X86NI || BR_POWER8 const br_block_ctr_class *ictr; br_ghash ighash; #endif @@ -47,6 +47,17 @@ br_ssl_engine_set_default_aes_gcm(br_ssl_engine_context *cc) br_ssl_engine_set_aes_ctr(cc, &br_aes_ct_ctr_vtable); #endif } +#elif BR_POWER8 + ictr = br_aes_pwr8_ctr_get_vtable(); + if (ictr != NULL) { + br_ssl_engine_set_aes_ctr(cc, ictr); + } else { +#if BR_64 + br_ssl_engine_set_aes_ctr(cc, &br_aes_ct64_ctr_vtable); +#else + br_ssl_engine_set_aes_ctr(cc, &br_aes_ct_ctr_vtable); +#endif + } #else #if BR_64 br_ssl_engine_set_aes_ctr(cc, &br_aes_ct64_ctr_vtable); @@ -61,6 +72,13 @@ br_ssl_engine_set_default_aes_gcm(br_ssl_engine_context *cc) return; } #endif +#if BR_POWER8 + ighash = br_ghash_pwr8_get(); + if (ighash != 0) { + br_ssl_engine_set_ghash(cc, ighash); + return; + } +#endif #if BR_LOMUL br_ssl_engine_set_ghash(cc, &br_ghash_ctmul32); #elif BR_64 diff --git a/src/symcipher/aes_pwr8.c b/src/symcipher/aes_pwr8.c new file mode 100644 index 0000000..b2c63c3 --- /dev/null +++ b/src/symcipher/aes_pwr8.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_POWER_ASM_MACROS 1 +#include "inner.h" + +/* + * This code contains the AES key schedule implementation using the + * POWER8 opcodes. + */ + +#if BR_POWER8 + +static void +key_schedule_128(unsigned char *sk, const unsigned char *key) +{ + long cc; + + static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B }; +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + + /* + * We use the VSX instructions for loading and storing the + * key/subkeys, since they support unaligned accesses. The rest + * of the computation is VMX only. VMX register 0 is VSX + * register 32. + */ + asm volatile ( + + /* + * v0 = all-zero word + * v1 = constant -8 / +8, copied into four words + * v2 = current subkey + * v3 = Rcon (x4 words) + * v6 = constant 8, copied into four words + * v7 = constant 0x11B, copied into four words + * v8 = constant for byteswapping words + */ + vspltisw(0, 0) +#if BR_POWER8_LE + vspltisw(1, -8) +#else + vspltisw(1, 8) +#endif + lxvw4x(34, 0, %[key]) + vspltisw(3, 1) + vspltisw(6, 8) + lxvw4x(39, 0, %[fmod]) +#if BR_POWER8_LE + lxvw4x(40, 0, %[idx2be]) +#endif + + /* + * First subkey is a copy of the key itself. + */ +#if BR_POWER8_LE + vperm(4, 2, 2, 8) + stxvw4x(36, 0, %[sk]) +#else + stxvw4x(34, 0, %[sk]) +#endif + + /* + * Loop must run 10 times. + */ + li(%[cc], 10) + mtctr(%[cc]) + label(loop) + /* Increment subkey address */ + addi(%[sk], %[sk], 16) + + /* Compute SubWord(RotWord(temp)) xor Rcon (into v4, splat) */ + vrlw(4, 2, 1) + vsbox(4, 4) +#if BR_POWER8_LE + vxor(4, 4, 3) +#else + vsldoi(5, 3, 0, 3) + vxor(4, 4, 5) +#endif + vspltw(4, 4, 3) + + /* XOR words for next subkey */ + vsldoi(5, 0, 2, 12) + vxor(2, 2, 5) + vsldoi(5, 0, 2, 12) + vxor(2, 2, 5) + vsldoi(5, 0, 2, 12) + vxor(2, 2, 5) + vxor(2, 2, 4) + + /* Store next subkey */ +#if BR_POWER8_LE + vperm(4, 2, 2, 8) + stxvw4x(36, 0, %[sk]) +#else + stxvw4x(34, 0, %[sk]) +#endif + + /* Update Rcon */ + vadduwm(3, 3, 3) + vsrw(4, 3, 6) + vsubuwm(4, 0, 4) + vand(4, 4, 7) + vxor(3, 3, 4) + + bdnz(loop) + +: [sk] "+b" (sk), [cc] "+b" (cc) +: [key] "b" (key), [fmod] "b" (fmod) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory" + ); +} + +static void +key_schedule_192(unsigned char *sk, const unsigned char *key) +{ + long cc; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + + /* + * We use the VSX instructions for loading and storing the + * key/subkeys, since they support unaligned accesses. The rest + * of the computation is VMX only. VMX register 0 is VSX + * register 32. + */ + asm volatile ( + + /* + * v0 = all-zero word + * v1 = constant -8 / +8, copied into four words + * v2, v3 = current subkey + * v5 = Rcon (x4 words) (already shifted on big-endian) + * v6 = constant 8, copied into four words + * v8 = constant for byteswapping words + * + * The left two words of v3 are ignored. + */ + vspltisw(0, 0) +#if BR_POWER8_LE + vspltisw(1, -8) +#else + vspltisw(1, 8) +#endif + li(%[cc], 8) + lxvw4x(34, 0, %[key]) + lxvw4x(35, %[cc], %[key]) + vsldoi(3, 3, 0, 8) + vspltisw(5, 1) +#if !BR_POWER8_LE + vsldoi(5, 5, 0, 3) +#endif + vspltisw(6, 8) +#if BR_POWER8_LE + lxvw4x(40, 0, %[idx2be]) +#endif + + /* + * Loop must run 8 times. Each iteration produces 256 + * bits of subkeys, with a 64-bit overlap. + */ + li(%[cc], 8) + mtctr(%[cc]) + li(%[cc], 16) + label(loop) + + /* + * Last 6 words in v2:v3l. Compute next 6 words into + * v3r:v4. + */ + vrlw(10, 3, 1) + vsbox(10, 10) + vxor(10, 10, 5) + vspltw(10, 10, 1) + vsldoi(11, 0, 10, 8) + + vsldoi(12, 0, 2, 12) + vxor(12, 2, 12) + vsldoi(13, 0, 12, 12) + vxor(12, 12, 13) + vsldoi(13, 0, 12, 12) + vxor(12, 12, 13) + + vspltw(13, 12, 3) + vxor(13, 13, 3) + vsldoi(14, 0, 3, 12) + vxor(13, 13, 14) + + vsldoi(4, 12, 13, 8) + vsldoi(14, 0, 3, 8) + vsldoi(3, 14, 12, 8) + + vxor(3, 3, 11) + vxor(4, 4, 10) + + /* + * Update Rcon. Since for a 192-bit key, we use only 8 + * such constants, we will not hit the field modulus, + * so a simple shift (addition) works well. + */ + vadduwm(5, 5, 5) + + /* + * Write out the two left 128-bit words + */ +#if BR_POWER8_LE + vperm(10, 2, 2, 8) + vperm(11, 3, 3, 8) + stxvw4x(42, 0, %[sk]) + stxvw4x(43, %[cc], %[sk]) +#else + stxvw4x(34, 0, %[sk]) + stxvw4x(35, %[cc], %[sk]) +#endif + addi(%[sk], %[sk], 24) + + /* + * Shift words for next iteration. + */ + vsldoi(2, 3, 4, 8) + vsldoi(3, 4, 0, 8) + + bdnz(loop) + + /* + * The loop wrote the first 50 subkey words, but we need + * to produce 52, so we must do one last write. + */ +#if BR_POWER8_LE + vperm(10, 2, 2, 8) + stxvw4x(42, 0, %[sk]) +#else + stxvw4x(34, 0, %[sk]) +#endif + +: [sk] "+b" (sk), [cc] "+b" (cc) +: [key] "b" (key) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory" + ); +} + +static void +key_schedule_256(unsigned char *sk, const unsigned char *key) +{ + long cc; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + + /* + * We use the VSX instructions for loading and storing the + * key/subkeys, since they support unaligned accesses. The rest + * of the computation is VMX only. VMX register 0 is VSX + * register 32. + */ + asm volatile ( + + /* + * v0 = all-zero word + * v1 = constant -8 / +8, copied into four words + * v2, v3 = current subkey + * v6 = Rcon (x4 words) (already shifted on big-endian) + * v7 = constant 8, copied into four words + * v8 = constant for byteswapping words + * + * The left two words of v3 are ignored. + */ + vspltisw(0, 0) +#if BR_POWER8_LE + vspltisw(1, -8) +#else + vspltisw(1, 8) +#endif + li(%[cc], 16) + lxvw4x(34, 0, %[key]) + lxvw4x(35, %[cc], %[key]) + vspltisw(6, 1) +#if !BR_POWER8_LE + vsldoi(6, 6, 0, 3) +#endif + vspltisw(7, 8) +#if BR_POWER8_LE + lxvw4x(40, 0, %[idx2be]) +#endif + + /* + * Loop must run 7 times. Each iteration produces two + * subkeys. + */ + li(%[cc], 7) + mtctr(%[cc]) + li(%[cc], 16) + label(loop) + + /* + * Current words are in v2:v3. Compute next word in v4. + */ + vrlw(10, 3, 1) + vsbox(10, 10) + vxor(10, 10, 6) + vspltw(10, 10, 3) + + vsldoi(4, 0, 2, 12) + vxor(4, 2, 4) + vsldoi(5, 0, 4, 12) + vxor(4, 4, 5) + vsldoi(5, 0, 4, 12) + vxor(4, 4, 5) + vxor(4, 4, 10) + + /* + * Then other word in v5. + */ + vsbox(10, 4) + vspltw(10, 10, 3) + + vsldoi(5, 0, 3, 12) + vxor(5, 3, 5) + vsldoi(11, 0, 5, 12) + vxor(5, 5, 11) + vsldoi(11, 0, 5, 12) + vxor(5, 5, 11) + vxor(5, 5, 10) + + /* + * Update Rcon. Since for a 256-bit key, we use only 7 + * such constants, we will not hit the field modulus, + * so a simple shift (addition) works well. + */ + vadduwm(6, 6, 6) + + /* + * Write out the two left 128-bit words + */ +#if BR_POWER8_LE + vperm(10, 2, 2, 8) + vperm(11, 3, 3, 8) + stxvw4x(42, 0, %[sk]) + stxvw4x(43, %[cc], %[sk]) +#else + stxvw4x(34, 0, %[sk]) + stxvw4x(35, %[cc], %[sk]) +#endif + addi(%[sk], %[sk], 32) + + /* + * Replace v2:v3 with v4:v5. + */ + vxor(2, 0, 4) + vxor(3, 0, 5) + + bdnz(loop) + + /* + * The loop wrote the first 14 subkeys, but we need 15, + * so we must do an extra write. + */ +#if BR_POWER8_LE + vperm(10, 2, 2, 8) + stxvw4x(42, 0, %[sk]) +#else + stxvw4x(34, 0, %[sk]) +#endif + +: [sk] "+b" (sk), [cc] "+b" (cc) +: [key] "b" (key) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory" + ); +} + +/* see inner.h */ +int +br_aes_pwr8_supported(void) +{ + return 1; +} + +/* see inner.h */ +unsigned +br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len) +{ + switch (len) { + case 16: + key_schedule_128(sk, key); + return 10; + case 24: + key_schedule_192(sk, key); + return 12; + default: + key_schedule_256(sk, key); + return 14; + } +} + +#endif diff --git a/src/symcipher/aes_pwr8_cbcdec.c b/src/symcipher/aes_pwr8_cbcdec.c new file mode 100644 index 0000000..e535ba6 --- /dev/null +++ b/src/symcipher/aes_pwr8_cbcdec.c @@ -0,0 +1,670 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_POWER_ASM_MACROS 1 +#include "inner.h" + +#if BR_POWER8 + +/* see bearssl_block.h */ +void +br_aes_pwr8_cbcdec_init(br_aes_pwr8_cbcdec_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_pwr8_cbcdec_vtable; + ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len); +} + +static void +cbcdec_128(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v10 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v24. + */ + lxvw4x(56, 0, %[iv]) +#if BR_POWER8_LE + vperm(24, 24, 24, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next ciphertext words in v16..v19. Also save them + * in v20..v23. + */ + lxvw4x(48, %[cc0], %[buf]) + lxvw4x(49, %[cc1], %[buf]) + lxvw4x(50, %[cc2], %[buf]) + lxvw4x(51, %[cc3], %[buf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + vand(20, 16, 16) + vand(21, 17, 17) + vand(22, 18, 18) + vand(23, 19, 19) + + /* + * Decrypt the blocks. + */ + vxor(16, 16, 10) + vxor(17, 17, 10) + vxor(18, 18, 10) + vxor(19, 19, 10) + vncipher(16, 16, 9) + vncipher(17, 17, 9) + vncipher(18, 18, 9) + vncipher(19, 19, 9) + vncipher(16, 16, 8) + vncipher(17, 17, 8) + vncipher(18, 18, 8) + vncipher(19, 19, 8) + vncipher(16, 16, 7) + vncipher(17, 17, 7) + vncipher(18, 18, 7) + vncipher(19, 19, 7) + vncipher(16, 16, 6) + vncipher(17, 17, 6) + vncipher(18, 18, 6) + vncipher(19, 19, 6) + vncipher(16, 16, 5) + vncipher(17, 17, 5) + vncipher(18, 18, 5) + vncipher(19, 19, 5) + vncipher(16, 16, 4) + vncipher(17, 17, 4) + vncipher(18, 18, 4) + vncipher(19, 19, 4) + vncipher(16, 16, 3) + vncipher(17, 17, 3) + vncipher(18, 18, 3) + vncipher(19, 19, 3) + vncipher(16, 16, 2) + vncipher(17, 17, 2) + vncipher(18, 18, 2) + vncipher(19, 19, 2) + vncipher(16, 16, 1) + vncipher(17, 17, 1) + vncipher(18, 18, 1) + vncipher(19, 19, 1) + vncipherlast(16, 16, 0) + vncipherlast(17, 17, 0) + vncipherlast(18, 18, 0) + vncipherlast(19, 19, 0) + + /* + * XOR decrypted blocks with IV / previous block. + */ + vxor(16, 16, 24) + vxor(17, 17, 20) + vxor(18, 18, 21) + vxor(19, 19, 22) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + /* + * Fourth encrypted block is IV for next run. + */ + vand(24, 23, 23) + + addi(%[buf], %[buf], 64) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +static void +cbcdec_192(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v12 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(43, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(44, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v24. + */ + lxvw4x(56, 0, %[iv]) +#if BR_POWER8_LE + vperm(24, 24, 24, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next ciphertext words in v16..v19. Also save them + * in v20..v23. + */ + lxvw4x(48, %[cc0], %[buf]) + lxvw4x(49, %[cc1], %[buf]) + lxvw4x(50, %[cc2], %[buf]) + lxvw4x(51, %[cc3], %[buf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + vand(20, 16, 16) + vand(21, 17, 17) + vand(22, 18, 18) + vand(23, 19, 19) + + /* + * Decrypt the blocks. + */ + vxor(16, 16, 12) + vxor(17, 17, 12) + vxor(18, 18, 12) + vxor(19, 19, 12) + vncipher(16, 16, 11) + vncipher(17, 17, 11) + vncipher(18, 18, 11) + vncipher(19, 19, 11) + vncipher(16, 16, 10) + vncipher(17, 17, 10) + vncipher(18, 18, 10) + vncipher(19, 19, 10) + vncipher(16, 16, 9) + vncipher(17, 17, 9) + vncipher(18, 18, 9) + vncipher(19, 19, 9) + vncipher(16, 16, 8) + vncipher(17, 17, 8) + vncipher(18, 18, 8) + vncipher(19, 19, 8) + vncipher(16, 16, 7) + vncipher(17, 17, 7) + vncipher(18, 18, 7) + vncipher(19, 19, 7) + vncipher(16, 16, 6) + vncipher(17, 17, 6) + vncipher(18, 18, 6) + vncipher(19, 19, 6) + vncipher(16, 16, 5) + vncipher(17, 17, 5) + vncipher(18, 18, 5) + vncipher(19, 19, 5) + vncipher(16, 16, 4) + vncipher(17, 17, 4) + vncipher(18, 18, 4) + vncipher(19, 19, 4) + vncipher(16, 16, 3) + vncipher(17, 17, 3) + vncipher(18, 18, 3) + vncipher(19, 19, 3) + vncipher(16, 16, 2) + vncipher(17, 17, 2) + vncipher(18, 18, 2) + vncipher(19, 19, 2) + vncipher(16, 16, 1) + vncipher(17, 17, 1) + vncipher(18, 18, 1) + vncipher(19, 19, 1) + vncipherlast(16, 16, 0) + vncipherlast(17, 17, 0) + vncipherlast(18, 18, 0) + vncipherlast(19, 19, 0) + + /* + * XOR decrypted blocks with IV / previous block. + */ + vxor(16, 16, 24) + vxor(17, 17, 20) + vxor(18, 18, 21) + vxor(19, 19, 22) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + /* + * Fourth encrypted block is IV for next run. + */ + vand(24, 23, 23) + + addi(%[buf], %[buf], 64) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +static void +cbcdec_256(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v14 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(43, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(44, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(45, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(46, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v24. + */ + lxvw4x(56, 0, %[iv]) +#if BR_POWER8_LE + vperm(24, 24, 24, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next ciphertext words in v16..v19. Also save them + * in v20..v23. + */ + lxvw4x(48, %[cc0], %[buf]) + lxvw4x(49, %[cc1], %[buf]) + lxvw4x(50, %[cc2], %[buf]) + lxvw4x(51, %[cc3], %[buf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + vand(20, 16, 16) + vand(21, 17, 17) + vand(22, 18, 18) + vand(23, 19, 19) + + /* + * Decrypt the blocks. + */ + vxor(16, 16, 14) + vxor(17, 17, 14) + vxor(18, 18, 14) + vxor(19, 19, 14) + vncipher(16, 16, 13) + vncipher(17, 17, 13) + vncipher(18, 18, 13) + vncipher(19, 19, 13) + vncipher(16, 16, 12) + vncipher(17, 17, 12) + vncipher(18, 18, 12) + vncipher(19, 19, 12) + vncipher(16, 16, 11) + vncipher(17, 17, 11) + vncipher(18, 18, 11) + vncipher(19, 19, 11) + vncipher(16, 16, 10) + vncipher(17, 17, 10) + vncipher(18, 18, 10) + vncipher(19, 19, 10) + vncipher(16, 16, 9) + vncipher(17, 17, 9) + vncipher(18, 18, 9) + vncipher(19, 19, 9) + vncipher(16, 16, 8) + vncipher(17, 17, 8) + vncipher(18, 18, 8) + vncipher(19, 19, 8) + vncipher(16, 16, 7) + vncipher(17, 17, 7) + vncipher(18, 18, 7) + vncipher(19, 19, 7) + vncipher(16, 16, 6) + vncipher(17, 17, 6) + vncipher(18, 18, 6) + vncipher(19, 19, 6) + vncipher(16, 16, 5) + vncipher(17, 17, 5) + vncipher(18, 18, 5) + vncipher(19, 19, 5) + vncipher(16, 16, 4) + vncipher(17, 17, 4) + vncipher(18, 18, 4) + vncipher(19, 19, 4) + vncipher(16, 16, 3) + vncipher(17, 17, 3) + vncipher(18, 18, 3) + vncipher(19, 19, 3) + vncipher(16, 16, 2) + vncipher(17, 17, 2) + vncipher(18, 18, 2) + vncipher(19, 19, 2) + vncipher(16, 16, 1) + vncipher(17, 17, 1) + vncipher(18, 18, 1) + vncipher(19, 19, 1) + vncipherlast(16, 16, 0) + vncipherlast(17, 17, 0) + vncipherlast(18, 18, 0) + vncipherlast(19, 19, 0) + + /* + * XOR decrypted blocks with IV / previous block. + */ + vxor(16, 16, 24) + vxor(17, 17, 20) + vxor(18, 18, 21) + vxor(19, 19, 22) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + /* + * Fourth encrypted block is IV for next run. + */ + vand(24, 23, 23) + + addi(%[buf], %[buf], 64) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +/* see bearssl_block.h */ +void +br_aes_pwr8_cbcdec_run(const br_aes_pwr8_cbcdec_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char nextiv[16]; + unsigned char *buf; + + if (len == 0) { + return; + } + buf = data; + memcpy(nextiv, buf + len - 16, 16); + if (len >= 64) { + size_t num_blocks; + unsigned char tmp[16]; + + num_blocks = (len >> 4) & ~(size_t)3; + memcpy(tmp, buf + (num_blocks << 4) - 16, 16); + switch (ctx->num_rounds) { + case 10: + cbcdec_128(ctx->skey.skni, iv, buf, num_blocks); + break; + case 12: + cbcdec_192(ctx->skey.skni, iv, buf, num_blocks); + break; + default: + cbcdec_256(ctx->skey.skni, iv, buf, num_blocks); + break; + } + buf += num_blocks << 4; + len &= 63; + memcpy(iv, tmp, 16); + } + if (len > 0) { + unsigned char tmp[64]; + + memcpy(tmp, buf, len); + memset(tmp + len, 0, (sizeof tmp) - len); + switch (ctx->num_rounds) { + case 10: + cbcdec_128(ctx->skey.skni, iv, tmp, 4); + break; + case 12: + cbcdec_192(ctx->skey.skni, iv, tmp, 4); + break; + default: + cbcdec_256(ctx->skey.skni, iv, tmp, 4); + break; + } + memcpy(buf, tmp, len); + } + memcpy(iv, nextiv, 16); +} + +/* see bearssl_block.h */ +const br_block_cbcdec_class br_aes_pwr8_cbcdec_vtable = { + sizeof(br_aes_pwr8_cbcdec_keys), + 16, + 4, + (void (*)(const br_block_cbcdec_class **, const void *, size_t)) + &br_aes_pwr8_cbcdec_init, + (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t)) + &br_aes_pwr8_cbcdec_run +}; + +/* see bearssl_block.h */ +const br_block_cbcdec_class * +br_aes_pwr8_cbcdec_get_vtable(void) +{ + return br_aes_pwr8_supported() ? &br_aes_pwr8_cbcdec_vtable : NULL; +} + +#else + +/* see bearssl_block.h */ +const br_block_cbcdec_class * +br_aes_pwr8_cbcdec_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/src/symcipher/aes_pwr8_cbcenc.c b/src/symcipher/aes_pwr8_cbcenc.c new file mode 100644 index 0000000..00f8eca --- /dev/null +++ b/src/symcipher/aes_pwr8_cbcenc.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_POWER_ASM_MACROS 1 +#include "inner.h" + +#if BR_POWER8 + +/* see bearssl_block.h */ +void +br_aes_pwr8_cbcenc_init(br_aes_pwr8_cbcenc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_pwr8_cbcenc_vtable; + ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len); +} + +static void +cbcenc_128(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t len) +{ + long cc; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + asm volatile ( + + /* + * Load subkeys into v0..v10 + */ + lxvw4x(32, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(33, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(34, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(35, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(36, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(37, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(38, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(39, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(40, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(41, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(42, %[cc], %[sk]) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v16. + */ + lxvw4x(48, 0, %[iv]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next plaintext word and XOR with current IV. + */ + lxvw4x(49, 0, %[buf]) +#if BR_POWER8_LE + vperm(17, 17, 17, 15) +#endif + vxor(16, 16, 17) + + /* + * Encrypt the block. + */ + vxor(16, 16, 0) + vcipher(16, 16, 1) + vcipher(16, 16, 2) + vcipher(16, 16, 3) + vcipher(16, 16, 4) + vcipher(16, 16, 5) + vcipher(16, 16, 6) + vcipher(16, 16, 7) + vcipher(16, 16, 8) + vcipher(16, 16, 9) + vcipherlast(16, 16, 10) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(17, 16, 16, 15) + stxvw4x(49, 0, %[buf]) +#else + stxvw4x(48, 0, %[buf]) +#endif + addi(%[buf], %[buf], 16) + + bdnz(loop) + +: [cc] "+b" (cc), [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "ctr", "memory" + ); +} + +static void +cbcenc_192(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t len) +{ + long cc; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + asm volatile ( + + /* + * Load subkeys into v0..v12 + */ + lxvw4x(32, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(33, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(34, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(35, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(36, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(37, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(38, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(39, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(40, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(41, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(42, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(43, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(44, %[cc], %[sk]) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v16. + */ + lxvw4x(48, 0, %[iv]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next plaintext word and XOR with current IV. + */ + lxvw4x(49, 0, %[buf]) +#if BR_POWER8_LE + vperm(17, 17, 17, 15) +#endif + vxor(16, 16, 17) + + /* + * Encrypt the block. + */ + vxor(16, 16, 0) + vcipher(16, 16, 1) + vcipher(16, 16, 2) + vcipher(16, 16, 3) + vcipher(16, 16, 4) + vcipher(16, 16, 5) + vcipher(16, 16, 6) + vcipher(16, 16, 7) + vcipher(16, 16, 8) + vcipher(16, 16, 9) + vcipher(16, 16, 10) + vcipher(16, 16, 11) + vcipherlast(16, 16, 12) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(17, 16, 16, 15) + stxvw4x(49, 0, %[buf]) +#else + stxvw4x(48, 0, %[buf]) +#endif + addi(%[buf], %[buf], 16) + + bdnz(loop) + +: [cc] "+b" (cc), [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "ctr", "memory" + ); +} + +static void +cbcenc_256(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t len) +{ + long cc; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + asm volatile ( + + /* + * Load subkeys into v0..v14 + */ + lxvw4x(32, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(33, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(34, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(35, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(36, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(37, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(38, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(39, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(40, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(41, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(42, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(43, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(44, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(45, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(46, %[cc], %[sk]) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v16. + */ + lxvw4x(48, 0, %[iv]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next plaintext word and XOR with current IV. + */ + lxvw4x(49, 0, %[buf]) +#if BR_POWER8_LE + vperm(17, 17, 17, 15) +#endif + vxor(16, 16, 17) + + /* + * Encrypt the block. + */ + vxor(16, 16, 0) + vcipher(16, 16, 1) + vcipher(16, 16, 2) + vcipher(16, 16, 3) + vcipher(16, 16, 4) + vcipher(16, 16, 5) + vcipher(16, 16, 6) + vcipher(16, 16, 7) + vcipher(16, 16, 8) + vcipher(16, 16, 9) + vcipher(16, 16, 10) + vcipher(16, 16, 11) + vcipher(16, 16, 12) + vcipher(16, 16, 13) + vcipherlast(16, 16, 14) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(17, 16, 16, 15) + stxvw4x(49, 0, %[buf]) +#else + stxvw4x(48, 0, %[buf]) +#endif + addi(%[buf], %[buf], 16) + + bdnz(loop) + +: [cc] "+b" (cc), [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "ctr", "memory" + ); +} + +/* see bearssl_block.h */ +void +br_aes_pwr8_cbcenc_run(const br_aes_pwr8_cbcenc_keys *ctx, + void *iv, void *data, size_t len) +{ + if (len > 0) { + switch (ctx->num_rounds) { + case 10: + cbcenc_128(ctx->skey.skni, iv, data, len); + break; + case 12: + cbcenc_192(ctx->skey.skni, iv, data, len); + break; + default: + cbcenc_256(ctx->skey.skni, iv, data, len); + break; + } + memcpy(iv, (unsigned char *)data + (len - 16), 16); + } +} + +/* see bearssl_block.h */ +const br_block_cbcenc_class br_aes_pwr8_cbcenc_vtable = { + sizeof(br_aes_pwr8_cbcenc_keys), + 16, + 4, + (void (*)(const br_block_cbcenc_class **, const void *, size_t)) + &br_aes_pwr8_cbcenc_init, + (void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t)) + &br_aes_pwr8_cbcenc_run +}; + +/* see bearssl_block.h */ +const br_block_cbcenc_class * +br_aes_pwr8_cbcenc_get_vtable(void) +{ + return br_aes_pwr8_supported() ? &br_aes_pwr8_cbcenc_vtable : NULL; +} + +#else + +/* see bearssl_block.h */ +const br_block_cbcenc_class * +br_aes_pwr8_cbcenc_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/src/symcipher/aes_pwr8_ctr.c b/src/symcipher/aes_pwr8_ctr.c new file mode 100644 index 0000000..f5d20c0 --- /dev/null +++ b/src/symcipher/aes_pwr8_ctr.c @@ -0,0 +1,717 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_POWER_ASM_MACROS 1 +#include "inner.h" + +#if BR_POWER8 + +/* see bearssl_block.h */ +void +br_aes_pwr8_ctr_init(br_aes_pwr8_ctr_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_pwr8_ctr_vtable; + ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len); +} + +static void +ctr_128(const unsigned char *sk, const unsigned char *ivbuf, + unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + static const uint32_t ctrinc[] = { + 0, 0, 0, 4 + }; + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v10 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * v28 = increment for IV counter. + */ + lxvw4x(60, 0, %[ctrinc]) + + /* + * Load IV into v16..v19 + */ + lxvw4x(48, %[cc0], %[ivbuf]) + lxvw4x(49, %[cc1], %[ivbuf]) + lxvw4x(50, %[cc2], %[ivbuf]) + lxvw4x(51, %[cc3], %[ivbuf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Compute next IV into v24..v27 + */ + vadduwm(24, 16, 28) + vadduwm(25, 17, 28) + vadduwm(26, 18, 28) + vadduwm(27, 19, 28) + + /* + * Load next data blocks. We do this early on but we + * won't need them until IV encryption is done. + */ + lxvw4x(52, %[cc0], %[buf]) + lxvw4x(53, %[cc1], %[buf]) + lxvw4x(54, %[cc2], %[buf]) + lxvw4x(55, %[cc3], %[buf]) + + /* + * Encrypt the current IV. + */ + vxor(16, 16, 0) + vxor(17, 17, 0) + vxor(18, 18, 0) + vxor(19, 19, 0) + vcipher(16, 16, 1) + vcipher(17, 17, 1) + vcipher(18, 18, 1) + vcipher(19, 19, 1) + vcipher(16, 16, 2) + vcipher(17, 17, 2) + vcipher(18, 18, 2) + vcipher(19, 19, 2) + vcipher(16, 16, 3) + vcipher(17, 17, 3) + vcipher(18, 18, 3) + vcipher(19, 19, 3) + vcipher(16, 16, 4) + vcipher(17, 17, 4) + vcipher(18, 18, 4) + vcipher(19, 19, 4) + vcipher(16, 16, 5) + vcipher(17, 17, 5) + vcipher(18, 18, 5) + vcipher(19, 19, 5) + vcipher(16, 16, 6) + vcipher(17, 17, 6) + vcipher(18, 18, 6) + vcipher(19, 19, 6) + vcipher(16, 16, 7) + vcipher(17, 17, 7) + vcipher(18, 18, 7) + vcipher(19, 19, 7) + vcipher(16, 16, 8) + vcipher(17, 17, 8) + vcipher(18, 18, 8) + vcipher(19, 19, 8) + vcipher(16, 16, 9) + vcipher(17, 17, 9) + vcipher(18, 18, 9) + vcipher(19, 19, 9) + vcipherlast(16, 16, 10) + vcipherlast(17, 17, 10) + vcipherlast(18, 18, 10) + vcipherlast(19, 19, 10) + +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + /* + * Load next plaintext word and XOR with encrypted IV. + */ + vxor(16, 20, 16) + vxor(17, 21, 17) + vxor(18, 22, 18) + vxor(19, 23, 19) + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + addi(%[buf], %[buf], 64) + + /* + * Update IV. + */ + vand(16, 24, 24) + vand(17, 25, 25) + vand(18, 26, 26) + vand(19, 27, 27) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2), + [ctrinc] "b" (ctrinc) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +static void +ctr_192(const unsigned char *sk, const unsigned char *ivbuf, + unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + static const uint32_t ctrinc[] = { + 0, 0, 0, 4 + }; + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v12 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(43, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(44, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * v28 = increment for IV counter. + */ + lxvw4x(60, 0, %[ctrinc]) + + /* + * Load IV into v16..v19 + */ + lxvw4x(48, %[cc0], %[ivbuf]) + lxvw4x(49, %[cc1], %[ivbuf]) + lxvw4x(50, %[cc2], %[ivbuf]) + lxvw4x(51, %[cc3], %[ivbuf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Compute next IV into v24..v27 + */ + vadduwm(24, 16, 28) + vadduwm(25, 17, 28) + vadduwm(26, 18, 28) + vadduwm(27, 19, 28) + + /* + * Load next data blocks. We do this early on but we + * won't need them until IV encryption is done. + */ + lxvw4x(52, %[cc0], %[buf]) + lxvw4x(53, %[cc1], %[buf]) + lxvw4x(54, %[cc2], %[buf]) + lxvw4x(55, %[cc3], %[buf]) + + /* + * Encrypt the current IV. + */ + vxor(16, 16, 0) + vxor(17, 17, 0) + vxor(18, 18, 0) + vxor(19, 19, 0) + vcipher(16, 16, 1) + vcipher(17, 17, 1) + vcipher(18, 18, 1) + vcipher(19, 19, 1) + vcipher(16, 16, 2) + vcipher(17, 17, 2) + vcipher(18, 18, 2) + vcipher(19, 19, 2) + vcipher(16, 16, 3) + vcipher(17, 17, 3) + vcipher(18, 18, 3) + vcipher(19, 19, 3) + vcipher(16, 16, 4) + vcipher(17, 17, 4) + vcipher(18, 18, 4) + vcipher(19, 19, 4) + vcipher(16, 16, 5) + vcipher(17, 17, 5) + vcipher(18, 18, 5) + vcipher(19, 19, 5) + vcipher(16, 16, 6) + vcipher(17, 17, 6) + vcipher(18, 18, 6) + vcipher(19, 19, 6) + vcipher(16, 16, 7) + vcipher(17, 17, 7) + vcipher(18, 18, 7) + vcipher(19, 19, 7) + vcipher(16, 16, 8) + vcipher(17, 17, 8) + vcipher(18, 18, 8) + vcipher(19, 19, 8) + vcipher(16, 16, 9) + vcipher(17, 17, 9) + vcipher(18, 18, 9) + vcipher(19, 19, 9) + vcipher(16, 16, 10) + vcipher(17, 17, 10) + vcipher(18, 18, 10) + vcipher(19, 19, 10) + vcipher(16, 16, 11) + vcipher(17, 17, 11) + vcipher(18, 18, 11) + vcipher(19, 19, 11) + vcipherlast(16, 16, 12) + vcipherlast(17, 17, 12) + vcipherlast(18, 18, 12) + vcipherlast(19, 19, 12) + +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + /* + * Load next plaintext word and XOR with encrypted IV. + */ + vxor(16, 20, 16) + vxor(17, 21, 17) + vxor(18, 22, 18) + vxor(19, 23, 19) + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + addi(%[buf], %[buf], 64) + + /* + * Update IV. + */ + vand(16, 24, 24) + vand(17, 25, 25) + vand(18, 26, 26) + vand(19, 27, 27) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2), + [ctrinc] "b" (ctrinc) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +static void +ctr_256(const unsigned char *sk, const unsigned char *ivbuf, + unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + static const uint32_t ctrinc[] = { + 0, 0, 0, 4 + }; + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v14 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(43, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(44, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(45, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(46, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * v28 = increment for IV counter. + */ + lxvw4x(60, 0, %[ctrinc]) + + /* + * Load IV into v16..v19 + */ + lxvw4x(48, %[cc0], %[ivbuf]) + lxvw4x(49, %[cc1], %[ivbuf]) + lxvw4x(50, %[cc2], %[ivbuf]) + lxvw4x(51, %[cc3], %[ivbuf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Compute next IV into v24..v27 + */ + vadduwm(24, 16, 28) + vadduwm(25, 17, 28) + vadduwm(26, 18, 28) + vadduwm(27, 19, 28) + + /* + * Load next data blocks. We do this early on but we + * won't need them until IV encryption is done. + */ + lxvw4x(52, %[cc0], %[buf]) + lxvw4x(53, %[cc1], %[buf]) + lxvw4x(54, %[cc2], %[buf]) + lxvw4x(55, %[cc3], %[buf]) + + /* + * Encrypt the current IV. + */ + vxor(16, 16, 0) + vxor(17, 17, 0) + vxor(18, 18, 0) + vxor(19, 19, 0) + vcipher(16, 16, 1) + vcipher(17, 17, 1) + vcipher(18, 18, 1) + vcipher(19, 19, 1) + vcipher(16, 16, 2) + vcipher(17, 17, 2) + vcipher(18, 18, 2) + vcipher(19, 19, 2) + vcipher(16, 16, 3) + vcipher(17, 17, 3) + vcipher(18, 18, 3) + vcipher(19, 19, 3) + vcipher(16, 16, 4) + vcipher(17, 17, 4) + vcipher(18, 18, 4) + vcipher(19, 19, 4) + vcipher(16, 16, 5) + vcipher(17, 17, 5) + vcipher(18, 18, 5) + vcipher(19, 19, 5) + vcipher(16, 16, 6) + vcipher(17, 17, 6) + vcipher(18, 18, 6) + vcipher(19, 19, 6) + vcipher(16, 16, 7) + vcipher(17, 17, 7) + vcipher(18, 18, 7) + vcipher(19, 19, 7) + vcipher(16, 16, 8) + vcipher(17, 17, 8) + vcipher(18, 18, 8) + vcipher(19, 19, 8) + vcipher(16, 16, 9) + vcipher(17, 17, 9) + vcipher(18, 18, 9) + vcipher(19, 19, 9) + vcipher(16, 16, 10) + vcipher(17, 17, 10) + vcipher(18, 18, 10) + vcipher(19, 19, 10) + vcipher(16, 16, 11) + vcipher(17, 17, 11) + vcipher(18, 18, 11) + vcipher(19, 19, 11) + vcipher(16, 16, 12) + vcipher(17, 17, 12) + vcipher(18, 18, 12) + vcipher(19, 19, 12) + vcipher(16, 16, 13) + vcipher(17, 17, 13) + vcipher(18, 18, 13) + vcipher(19, 19, 13) + vcipherlast(16, 16, 14) + vcipherlast(17, 17, 14) + vcipherlast(18, 18, 14) + vcipherlast(19, 19, 14) + +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + /* + * Load next plaintext word and XOR with encrypted IV. + */ + vxor(16, 20, 16) + vxor(17, 21, 17) + vxor(18, 22, 18) + vxor(19, 23, 19) + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + addi(%[buf], %[buf], 64) + + /* + * Update IV. + */ + vand(16, 24, 24) + vand(17, 25, 25) + vand(18, 26, 26) + vand(19, 27, 27) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2), + [ctrinc] "b" (ctrinc) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +/* see bearssl_block.h */ +uint32_t +br_aes_pwr8_ctr_run(const br_aes_pwr8_ctr_keys *ctx, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + unsigned char ivbuf[64]; + + buf = data; + memcpy(ivbuf + 0, iv, 12); + memcpy(ivbuf + 16, iv, 12); + memcpy(ivbuf + 32, iv, 12); + memcpy(ivbuf + 48, iv, 12); + if (len >= 64) { + br_enc32be(ivbuf + 12, cc + 0); + br_enc32be(ivbuf + 28, cc + 1); + br_enc32be(ivbuf + 44, cc + 2); + br_enc32be(ivbuf + 60, cc + 3); + switch (ctx->num_rounds) { + case 10: + ctr_128(ctx->skey.skni, ivbuf, buf, + (len >> 4) & ~(size_t)3); + break; + case 12: + ctr_192(ctx->skey.skni, ivbuf, buf, + (len >> 4) & ~(size_t)3); + break; + default: + ctr_256(ctx->skey.skni, ivbuf, buf, + (len >> 4) & ~(size_t)3); + break; + } + cc += (len >> 4) & ~(size_t)3; + buf += len & ~(size_t)63; + len &= 63; + } + if (len > 0) { + unsigned char tmp[64]; + + memcpy(tmp, buf, len); + memset(tmp + len, 0, (sizeof tmp) - len); + br_enc32be(ivbuf + 12, cc + 0); + br_enc32be(ivbuf + 28, cc + 1); + br_enc32be(ivbuf + 44, cc + 2); + br_enc32be(ivbuf + 60, cc + 3); + switch (ctx->num_rounds) { + case 10: + ctr_128(ctx->skey.skni, ivbuf, tmp, 4); + break; + case 12: + ctr_192(ctx->skey.skni, ivbuf, tmp, 4); + break; + default: + ctr_256(ctx->skey.skni, ivbuf, tmp, 4); + break; + } + memcpy(buf, tmp, len); + cc += (len + 15) >> 4; + } + return cc; +} + +/* see bearssl_block.h */ +const br_block_ctr_class br_aes_pwr8_ctr_vtable = { + sizeof(br_aes_pwr8_ctr_keys), + 16, + 4, + (void (*)(const br_block_ctr_class **, const void *, size_t)) + &br_aes_pwr8_ctr_init, + (uint32_t (*)(const br_block_ctr_class *const *, + const void *, uint32_t, void *, size_t)) + &br_aes_pwr8_ctr_run +}; + +/* see bearssl_block.h */ +const br_block_ctr_class * +br_aes_pwr8_ctr_get_vtable(void) +{ + return br_aes_pwr8_supported() ? &br_aes_pwr8_ctr_vtable : NULL; +} + +#else + +/* see bearssl_block.h */ +const br_block_ctr_class * +br_aes_pwr8_ctr_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/test/test_crypto.c b/test/test_crypto.c index ca7234d..c05fca5 100644 --- a/test/test_crypto.c +++ b/test/test_crypto.c @@ -3297,6 +3297,33 @@ test_AES_x86ni(void) } } +static void +test_AES_pwr8(void) +{ + const br_block_cbcenc_class *x_cbcenc; + const br_block_cbcdec_class *x_cbcdec; + const br_block_ctr_class *x_ctr; + int hcbcenc, hcbcdec, hctr; + + x_cbcenc = br_aes_pwr8_cbcenc_get_vtable(); + x_cbcdec = br_aes_pwr8_cbcdec_get_vtable(); + x_ctr = br_aes_pwr8_ctr_get_vtable(); + hcbcenc = (x_cbcenc != NULL); + hcbcdec = (x_cbcdec != NULL); + hctr = (x_ctr != NULL); + if (hcbcenc != hctr || hcbcdec != hctr) { + fprintf(stderr, "AES_pwr8 availability mismatch (%d/%d/%d)\n", + hcbcenc, hcbcdec, hctr); + exit(EXIT_FAILURE); + } + if (hctr) { + test_AES_generic("AES_pwr8", + x_cbcenc, x_cbcdec, x_ctr, 1, 1); + } else { + printf("Test AES_pwr8: UNAVAILABLE\n"); + } +} + /* * DES known-answer tests. Order: plaintext, key, ciphertext. * (mostly from NIST SP 800-20). @@ -4675,6 +4702,31 @@ test_GHASH(const char *name, br_ghash gh) check_equals("KAT GHASH", y, ref, sizeof ref); } + for (u = 0; u <= 1024; u ++) { + unsigned char key[32], iv[12]; + unsigned char buf[1024 + 32]; + unsigned char y0[16], y1[16]; + char tmp[100]; + + memset(key, 0, sizeof key); + memset(iv, 0, sizeof iv); + br_enc32be(key, u); + memset(buf, 0, sizeof buf); + br_chacha20_ct_run(key, iv, 1, buf, sizeof buf); + + memcpy(y0, buf, 16); + br_ghash_ctmul32(y0, buf + 16, buf + 32, u); + memcpy(y1, buf, 16); + gh(y1, buf + 16, buf + 32, u); + sprintf(tmp, "XREF %s (len = %u)", name, (unsigned)u); + check_equals(tmp, y0, y1, 16); + + if ((u & 31) == 0) { + printf("."); + fflush(stdout); + } + } + printf("done.\n"); fflush(stdout); } @@ -4710,6 +4762,19 @@ test_GHASH_pclmul(void) } } +static void +test_GHASH_pwr8(void) +{ + br_ghash gh; + + gh = br_ghash_pwr8_get(); + if (gh == 0) { + printf("Test GHASH_pwr8: UNAVAILABLE\n"); + } else { + test_GHASH("GHASH_pwr8", gh); + } +} + static void test_EC_inner(const char *sk, const char *sU, const br_ec_impl *impl, int curve) @@ -5598,6 +5663,7 @@ static const struct { STU(AES_small), STU(AES_ct), STU(AES_ct64), + STU(AES_pwr8), STU(AES_x86ni), STU(DES_tab), STU(DES_ct), @@ -5612,6 +5678,7 @@ static const struct { STU(GHASH_ctmul32), STU(GHASH_ctmul64), STU(GHASH_pclmul), + STU(GHASH_pwr8), STU(EC_prime_i15), STU(EC_prime_i31), STU(EC_p256_m15), diff --git a/test/test_speed.c b/test/test_speed.c index d7dfaad..6981299 100644 --- a/test/test_speed.c +++ b/test/test_speed.c @@ -249,6 +249,7 @@ SPEED_AES(small) SPEED_AES(ct) SPEED_AES(ct64) SPEED_AES(x86ni) +SPEED_AES(pwr8) #define br_des_tab_cbcenc_get_vtable() (&br_des_tab_cbcenc_vtable) #define br_des_tab_cbcdec_get_vtable() (&br_des_tab_cbcdec_vtable) @@ -334,6 +335,20 @@ test_speed_ghash_pclmul(void) } } +static void +test_speed_ghash_pwr8(void) +{ + br_ghash gh; + + gh = br_ghash_pwr8_get(); + if (gh == 0) { + printf("%-30s UNAVAILABLE\n", "GHASH (pwr8)"); + fflush(stdout); + } else { + test_speed_ghash_inner("GHASH (pwr8)", gh); + } +} + static uint32_t fake_chacha20(const void *key, const void *iv, uint32_t cc, void *data, size_t len) @@ -1215,6 +1230,16 @@ static const struct { STU(aes192_x86ni_ctr), STU(aes256_x86ni_ctr), + STU(aes128_pwr8_cbcenc), + STU(aes128_pwr8_cbcdec), + STU(aes192_pwr8_cbcenc), + STU(aes192_pwr8_cbcdec), + STU(aes256_pwr8_cbcenc), + STU(aes256_pwr8_cbcdec), + STU(aes128_pwr8_ctr), + STU(aes192_pwr8_ctr), + STU(aes256_pwr8_ctr), + STU(des_tab_cbcenc), STU(des_tab_cbcdec), STU(3des_tab_cbcenc), @@ -1231,6 +1256,7 @@ static const struct { STU(ghash_ctmul32), STU(ghash_ctmul64), STU(ghash_pclmul), + STU(ghash_pwr8), STU(poly1305_ctmul), STU(poly1305_ctmul32), diff --git a/tools/names.c b/tools/names.c index 753a736..a8bb645 100644 --- a/tools/names.c +++ b/tools/names.c @@ -406,6 +406,12 @@ static const struct { const char *short_name; const void *(*get)(void); } algo_names_dyn[] = { + { "aes_pwr8_cbcenc", "pwr8", + (const void *(*)(void))&br_aes_pwr8_cbcenc_get_vtable }, + { "aes_pwr8_cbcdec", "pwr8", + (const void *(*)(void))&br_aes_pwr8_cbcdec_get_vtable }, + { "aes_pwr8_ctr", "pwr8", + (const void *(*)(void))&br_aes_pwr8_ctr_get_vtable }, { "aes_x86ni_cbcenc", "x86ni", (const void *(*)(void))&br_aes_x86ni_cbcenc_get_vtable }, { "aes_x86ni_cbcdec", "x86ni", @@ -414,6 +420,8 @@ static const struct { (const void *(*)(void))&br_aes_x86ni_ctr_get_vtable }, { "ghash_pclmul", "pclmul", (const void *(*)(void))&br_ghash_pclmul_get }, + { "ghash_pwr8", "pwr8", + (const void *(*)(void))&br_ghash_pwr8_get }, { 0, 0, 0, } };