Fixed computing of intermediate buffer size for maximum-size RSA keys.
[BearSSL] / test / test_speed.c
index 458d765..eb1b964 100644 (file)
@@ -88,7 +88,12 @@ test_speed_ ## fname(void) \
        memset(key, 'T', sizeof key); \
        memset(buf, 'P', sizeof buf); \
        memset(iv, 'X', sizeof iv); \
-       vt = &br_ ## cname ## _cbc ## dir ## _vtable; \
+       vt = br_ ## cname ## _cbc ## dir ## _get_vtable(); \
+       if (vt == NULL) { \
+               printf("%-30s UNAVAILABLE\n", #Name); \
+               fflush(stdout); \
+               return; \
+       } \
        for (i = 0; i < 10; i ++) { \
                vt->init(&ec.vtable, key, sizeof key); \
                vt->run(&ec.vtable, iv, buf, sizeof buf); \
@@ -132,7 +137,12 @@ test_speed_ ## fname(void) \
        memset(key, 'T', sizeof key); \
        memset(buf, 'P', sizeof buf); \
        memset(iv, 'X', sizeof iv); \
-       vt = &br_ ## cname ## _ctr_vtable; \
+       vt = br_ ## cname ## _ctr_get_vtable(); \
+       if (vt == NULL) { \
+               printf("%-30s UNAVAILABLE\n", #Name); \
+               fflush(stdout); \
+               return; \
+       } \
        for (i = 0; i < 10; i ++) { \
                vt->init(&ec.vtable, key, sizeof key); \
                vt->run(&ec.vtable, iv, 1, buf, sizeof buf); \
@@ -165,17 +175,24 @@ test_speed_ ## fname(void) \
 static void \
 test_speed_ ## fname(void) \
 { \
+       br_chacha20_run bc; \
        unsigned char key[32]; \
        unsigned char buf[8192]; \
        unsigned char iv[12]; \
        int i; \
        long num; \
  \
+       bc = br_ ## fname ## _get(); \
+       if (bc == 0) { \
+               printf("%-30s UNAVAILABLE\n", #Name); \
+               fflush(stdout); \
+               return; \
+       } \
        memset(key, 'T', sizeof key); \
        memset(buf, 'P', sizeof buf); \
        memset(iv, 'X', sizeof iv); \
        for (i = 0; i < 10; i ++) { \
-               br_ ## fname ## _run(key, iv, i, buf, sizeof buf); \
+               bc(key, iv, i, buf, sizeof buf); \
        } \
        num = 10; \
        for (;;) { \
@@ -185,8 +202,7 @@ test_speed_ ## fname(void) \
  \
                begin = clock(); \
                for (k = num; k > 0; k --) { \
-                       br_ ## fname ## _run(key, iv, \
-                               (uint32_t)k, buf, sizeof buf); \
+                       bc(key, iv, (uint32_t)k, buf, sizeof buf); \
                } \
                end = clock(); \
                tt = (double)(end - begin) / CLOCKS_PER_SEC; \
@@ -206,6 +222,28 @@ SPEED_HASH(SHA-1, sha1)
 SPEED_HASH(SHA-256, sha256)
 SPEED_HASH(SHA-512, sha512)
 
+/*
+ * There are no vtable selection functions for the portable implementations,
+ * so we define some custom macros.
+ */
+#define br_aes_big_cbcenc_get_vtable()     (&br_aes_big_cbcenc_vtable)
+#define br_aes_big_cbcdec_get_vtable()     (&br_aes_big_cbcdec_vtable)
+#define br_aes_big_ctr_get_vtable()        (&br_aes_big_ctr_vtable)
+#define br_aes_big_ctrcbc_get_vtable()     (&br_aes_big_ctrcbc_vtable)
+#define br_aes_small_cbcenc_get_vtable()   (&br_aes_small_cbcenc_vtable)
+#define br_aes_small_cbcdec_get_vtable()   (&br_aes_small_cbcdec_vtable)
+#define br_aes_small_ctr_get_vtable()      (&br_aes_small_ctr_vtable)
+#define br_aes_small_ctrcbc_get_vtable()   (&br_aes_small_ctrcbc_vtable)
+#define br_aes_ct_cbcenc_get_vtable()      (&br_aes_ct_cbcenc_vtable)
+#define br_aes_ct_cbcdec_get_vtable()      (&br_aes_ct_cbcdec_vtable)
+#define br_aes_ct_ctr_get_vtable()         (&br_aes_ct_ctr_vtable)
+#define br_aes_ct_ctrcbc_get_vtable()      (&br_aes_ct_ctrcbc_vtable)
+#define br_aes_ct64_cbcenc_get_vtable()    (&br_aes_ct64_cbcenc_vtable)
+#define br_aes_ct64_cbcdec_get_vtable()    (&br_aes_ct64_cbcdec_vtable)
+#define br_aes_ct64_ctr_get_vtable()       (&br_aes_ct64_ctr_vtable)
+#define br_aes_ct64_ctrcbc_get_vtable()    (&br_aes_ct64_ctrcbc_vtable)
+#define br_chacha20_ct_get()               (&br_chacha20_ct_run)
+
 #define SPEED_AES(iname) \
 SPEED_BLOCKCIPHER_CBC(AES-128 CBC encrypt (iname), aes128_ ## iname ## _cbcenc, aes_ ## iname, 16, enc) \
 SPEED_BLOCKCIPHER_CBC(AES-128 CBC decrypt (iname), aes128_ ## iname ## _cbcdec, aes_ ## iname, 16, dec) \
@@ -221,6 +259,13 @@ SPEED_AES(big)
 SPEED_AES(small)
 SPEED_AES(ct)
 SPEED_AES(ct64)
+SPEED_AES(x86ni)
+SPEED_AES(pwr8)
+
+#define br_des_tab_cbcenc_get_vtable()     (&br_des_tab_cbcenc_vtable)
+#define br_des_tab_cbcdec_get_vtable()     (&br_des_tab_cbcdec_vtable)
+#define br_des_ct_cbcenc_get_vtable()      (&br_des_ct_cbcenc_vtable)
+#define br_des_ct_cbcdec_get_vtable()      (&br_des_ct_cbcdec_vtable)
 
 #define SPEED_DES(iname) \
 SPEED_BLOCKCIPHER_CBC(DES CBC encrypt (iname), des_ ## iname ## _cbcenc, des_ ## iname, 8, enc) \
@@ -231,7 +276,8 @@ SPEED_BLOCKCIPHER_CBC(3DES CBC decrypt (iname), 3des_ ## iname ## _cbcdec, des_
 SPEED_DES(tab)
 SPEED_DES(ct)
 
-SPEED_CHACHA20(ChaCha20, chacha20_ct)
+SPEED_CHACHA20(ChaCha20 (ct), chacha20_ct)
+SPEED_CHACHA20(ChaCha20 (sse2), chacha20_sse2)
 
 static void
 test_speed_ghash_inner(char *name, br_ghash gh)
@@ -287,6 +333,34 @@ test_speed_ghash_ctmul64(void)
        test_speed_ghash_inner("GHASH (ctmul64)", &br_ghash_ctmul64);
 }
 
+static void
+test_speed_ghash_pclmul(void)
+{       /* Speed test for the GHASH function returned by br_ghash_pclmul_get(). */
+       br_ghash gh;
+
+       gh = br_ghash_pclmul_get();
+       if (gh == 0) {  /* getter returned no function: report it, run nothing */
+               printf("%-30s UNAVAILABLE\n", "GHASH (pclmul)");
+               fflush(stdout);
+       } else {
+               test_speed_ghash_inner("GHASH (pclmul)", gh);
+       }
+}
+
+static void
+test_speed_ghash_pwr8(void)
+{       /* Speed test for the GHASH function returned by br_ghash_pwr8_get(). */
+       br_ghash gh;
+
+       gh = br_ghash_pwr8_get();
+       if (gh == 0) {  /* getter returned no function: report it, run nothing */
+               printf("%-30s UNAVAILABLE\n", "GHASH (pwr8)");
+               fflush(stdout);
+       } else {
+               test_speed_ghash_inner("GHASH (pwr8)", gh);
+       }
+}
+
 static uint32_t
 fake_chacha20(const void *key, const void *iv,
        uint32_t cc, void *data, size_t len)
@@ -354,12 +428,187 @@ test_speed_poly1305_ctmul32(void)
                &br_poly1305_ctmul32_run);
 }
 
+static void
+test_speed_poly1305_ctmulq(void)
+{
+       br_poly1305_run bp = br_poly1305_ctmulq_get();
+
+       if (bp == 0) {  /* getter returned no function: report it, run nothing */
+               printf("%-30s UNAVAILABLE\n", "Poly1305 (ctmulq)");
+               fflush(stdout); /* keep line order when stdout is buffered */
+       } else {
+               test_speed_poly1305_inner("Poly1305 (ctmulq)", bp);
+       }
+}
+
 static void
 test_speed_poly1305_i15(void)
 {
        test_speed_poly1305_inner("Poly1305 (i15)", &br_poly1305_i15_run);
 }
 
+static void
+test_speed_eax_inner(char *name,
+       const br_block_ctrcbc_class *vt, size_t key_len)
+{       /* Times EAX over an 8192-byte buffer with cipher vt; prints MB/s as name. */
+       unsigned char buf[8192], key[32], nonce[16], aad[16], tag[16];
+       int i;
+       long num;
+       br_aes_gen_ctrcbc_keys ac;
+       br_eax_context ec;
+
+       if (vt == NULL) {       /* no implementation supplied by the caller */
+               printf("%-30s UNAVAILABLE\n", name);
+               fflush(stdout);
+               return;
+       }
+       memset(key, 'K', key_len);      /* key_len must not exceed sizeof key (32) */
+       memset(nonce, 'N', sizeof nonce);
+       memset(aad, 'A', sizeof aad);
+       memset(buf, 'T', sizeof buf);
+       for (i = 0; i < 10; i ++) {     /* ten untimed passes before measuring */
+               vt->init(&ac.vtable, key, key_len);
+               br_eax_init(&ec, &ac.vtable);
+               br_eax_reset(&ec, nonce, sizeof nonce);
+               br_eax_aad_inject(&ec, aad, sizeof aad);
+               br_eax_flip(&ec);
+               br_eax_run(&ec, 1, buf, sizeof buf);    /* 1: presumably "encrypt" -- confirm */
+               br_eax_get_tag(&ec, tag);
+       }
+       num = 10;
+       for (;;) {      /* timed rounds: num doubles until a round lasts >= 2 s */
+               clock_t begin, end;
+               double tt;
+               long k;
+
+               begin = clock();
+               for (k = num; k > 0; k --) {
+                       vt->init(&ac.vtable, key, key_len);     /* key setup is timed too */
+                       br_eax_init(&ec, &ac.vtable);
+                       br_eax_reset(&ec, nonce, sizeof nonce);
+                       br_eax_aad_inject(&ec, aad, sizeof aad);
+                       br_eax_flip(&ec);
+                       br_eax_run(&ec, 1, buf, sizeof buf);
+                       br_eax_get_tag(&ec, tag);
+               }
+               end = clock();
+               tt = (double)(end - begin) / CLOCKS_PER_SEC;    /* elapsed seconds */
+               if (tt >= 2.0) {
+                       printf("%-30s %8.2f MB/s\n", name,
+                               ((double)sizeof buf) * (double)num
+                               / (tt * 1000000.0));
+                       fflush(stdout);
+                       return;
+               }
+               num <<= 1;
+       }
+}
+
+#define SPEED_EAX(Algo, algo, keysize, impl) \
+static void \
+test_speed_eax_ ## algo ## keysize ## _ ## impl(void) \
+{ /* keysize is given in bits; the inner test takes a byte count (>> 3) */ \
+       test_speed_eax_inner("EAX " #Algo "-" #keysize " (" #impl ")", \
+               br_ ## algo ## _ ## impl ##  _ctrcbc_get_vtable() \
+               , (keysize) >> 3); \
+}
+
+SPEED_EAX(AES, aes, 128, big)
+SPEED_EAX(AES, aes, 128, small)
+SPEED_EAX(AES, aes, 128, ct)
+SPEED_EAX(AES, aes, 128, ct64)
+SPEED_EAX(AES, aes, 128, x86ni)
+SPEED_EAX(AES, aes, 128, pwr8)
+SPEED_EAX(AES, aes, 192, big)
+SPEED_EAX(AES, aes, 192, small)
+SPEED_EAX(AES, aes, 192, ct)
+SPEED_EAX(AES, aes, 192, ct64)
+SPEED_EAX(AES, aes, 192, x86ni)
+SPEED_EAX(AES, aes, 192, pwr8)
+SPEED_EAX(AES, aes, 256, big)
+SPEED_EAX(AES, aes, 256, small)
+SPEED_EAX(AES, aes, 256, ct)
+SPEED_EAX(AES, aes, 256, ct64)
+SPEED_EAX(AES, aes, 256, x86ni)
+SPEED_EAX(AES, aes, 256, pwr8)
+
+static void
+test_speed_shake_inner(int security_level)
+{       /* Times br_shake_inject, then br_shake_produce, on one context; prints MB/s. */
+       unsigned char buf[8192];
+       br_shake_context sc;
+       int i;
+       long num;
+
+       memset(buf, 'D', sizeof buf);
+       br_shake_init(&sc, security_level);
+       for (i = 0; i < 10; i ++) {     /* ten untimed injections before measuring */
+               br_shake_inject(&sc, buf, sizeof buf);
+       }
+       num = 10;
+       for (;;) {      /* timed rounds: num doubles until a round lasts >= 2 s */
+               clock_t begin, end;
+               double tt;
+               long k;
+
+               begin = clock();
+               for (k = num; k > 0; k --) {
+                       br_shake_inject(&sc, buf, sizeof buf);
+               }
+               end = clock();
+               tt = (double)(end - begin) / CLOCKS_PER_SEC;    /* elapsed seconds */
+               if (tt >= 2.0) {
+                       printf("SHAKE%-3d (inject)              %8.2f MB/s\n",
+                               security_level,
+                               ((double)sizeof buf) * (double)num
+                               / (tt * 1000000.0));
+                       fflush(stdout);
+                       break;  /* continue with the produce measurement below */
+               }
+               num <<= 1;
+       }
+
+       br_shake_flip(&sc);     /* called once, between the inject and produce phases */
+       for (i = 0; i < 10; i ++) {     /* ten untimed output blocks before measuring */
+               br_shake_produce(&sc, buf, sizeof buf);
+       }
+
+       num = 10;
+       for (;;) {      /* same doubling scheme, now timing output generation */
+               clock_t begin, end;
+               double tt;
+               long k;
+
+               begin = clock();
+               for (k = num; k > 0; k --) {
+                       br_shake_produce(&sc, buf, sizeof buf);
+               }
+               end = clock();
+               tt = (double)(end - begin) / CLOCKS_PER_SEC;
+               if (tt >= 2.0) {
+                       printf("SHAKE%-3d (produce)             %8.2f MB/s\n",
+                               security_level,
+                               ((double)sizeof buf) * (double)num
+                               / (tt * 1000000.0));
+                       fflush(stdout);
+                       break;
+               }
+               num <<= 1;
+       }
+}
+
+static void
+test_speed_shake128(void)
+{       /* Runs the shared SHAKE speed test with security_level = 128. */
+       test_speed_shake_inner(128);
+}
+
+static void
+test_speed_shake256(void)
+{       /* Runs the shared SHAKE speed test with security_level = 256. */
+       test_speed_shake_inner(256);
+}
+
 static const unsigned char RSA_N[] = {
        0xE9, 0xF2, 0x4A, 0x2F, 0x96, 0xDF, 0x0A, 0x23,
        0x01, 0x85, 0xF1, 0x2C, 0xB2, 0xA8, 0xEF, 0x23,
@@ -510,11 +759,16 @@ static const br_rsa_private_key RSA_SK = {
 
 static void
 test_speed_rsa_inner(char *name,
-       br_rsa_public fpub, br_rsa_private fpriv)
+       br_rsa_public fpub, br_rsa_private fpriv, br_rsa_keygen kgen)
 {
        unsigned char tmp[sizeof RSA_N];
        int i;
        long num;
+       /*
+       br_hmac_drbg_context rng;
+       */
+       br_aesctr_drbg_context rng;
+       const br_block_ctr_class *ictr;
 
        memset(tmp, 'R', sizeof tmp);
        tmp[0] = 0;
@@ -568,27 +822,113 @@ test_speed_rsa_inner(char *name,
                }
                num <<= 1;
        }
+
+       if (kgen == 0) {
+               printf("%-30s KEYGEN UNAVAILABLE\n", name);
+               fflush(stdout);
+               return;
+       }
+       /*
+       br_hmac_drbg_init(&rng, &br_sha256_vtable, "RSA keygen seed", 15);
+       */
+       ictr = br_aes_x86ni_ctr_get_vtable();
+       if (ictr == NULL) {
+               ictr = br_aes_pwr8_ctr_get_vtable();
+               if (ictr == NULL) {
+#if BR_64
+                       ictr = &br_aes_ct64_ctr_vtable;
+#else
+                       ictr = &br_aes_ct_ctr_vtable;
+#endif
+               }
+       }
+       br_aesctr_drbg_init(&rng, ictr, "RSA keygen seed", 15);
+
+       num = 10;
+       for (;;) {
+               clock_t begin, end;
+               double tt;
+               long k;
+
+               begin = clock();
+               for (k = num; k > 0; k --) {
+                       br_rsa_private_key sk;
+                       unsigned char kbuf[BR_RSA_KBUF_PRIV_SIZE(1024)];
+
+                       kgen(&rng.vtable, &sk, kbuf, NULL, NULL, 1024, 0);
+               }
+               end = clock();
+               tt = (double)(end - begin) / CLOCKS_PER_SEC;
+               if (tt >= 10.0) {
+                       printf("%-30s %8.2f kgen[1024]/s\n", name,
+                               (double)num / tt);
+                       fflush(stdout);
+                       break;
+               }
+               num <<= 1;
+       }
+
+       num = 10;
+       for (;;) {
+               clock_t begin, end;
+               double tt;
+               long k;
+
+               begin = clock();
+               for (k = num; k > 0; k --) {
+                       br_rsa_private_key sk;
+                       unsigned char kbuf[BR_RSA_KBUF_PRIV_SIZE(2048)];
+
+                       kgen(&rng.vtable, &sk, kbuf, NULL, NULL, 2048, 0);
+               }
+               end = clock();
+               tt = (double)(end - begin) / CLOCKS_PER_SEC;
+               if (tt >= 10.0) {
+                       printf("%-30s %8.2f kgen[2048]/s\n", name,
+                               (double)num / tt);
+                       fflush(stdout);
+                       break;
+               }
+               num <<= 1;
+       }
 }
 
 static void
 test_speed_rsa_i15(void)
 {
        test_speed_rsa_inner("RSA i15",
-               &br_rsa_i15_public, &br_rsa_i15_private);
+               &br_rsa_i15_public, &br_rsa_i15_private, &br_rsa_i15_keygen);
 }
 
 static void
 test_speed_rsa_i31(void)
 {
        test_speed_rsa_inner("RSA i31",
-               &br_rsa_i31_public, &br_rsa_i31_private);
+               &br_rsa_i31_public, &br_rsa_i31_private, &br_rsa_i31_keygen);
 }
 
 static void
 test_speed_rsa_i32(void)
 {
        test_speed_rsa_inner("RSA i32",
-               &br_rsa_i32_public, &br_rsa_i32_private);
+               &br_rsa_i32_public, &br_rsa_i32_private, 0);
+}
+
+static void
+test_speed_rsa_i62(void)
+{
+       /* Only the public-key getter decides availability; priv and
+        * kgen are handed to the shared RSA test exactly as returned. */
+       br_rsa_public pub = br_rsa_i62_public_get();
+       br_rsa_private priv = br_rsa_i62_private_get();
+       br_rsa_keygen kgen = br_rsa_i62_keygen_get();
+
+       if (pub) {
+               test_speed_rsa_inner("RSA i62", pub, priv, kgen);
+       } else {
+               printf("%-30s UNAVAILABLE\n", "RSA i62");
+               fflush(stdout); /* keep line order when stdout is buffered */
+       }
 }
 
 static void
@@ -691,6 +1031,39 @@ test_speed_ec_p256_m15(void)
                &br_ec_p256_m15, &br_secp256r1);
 }
 
+static void
+test_speed_ec_p256_m31(void)
+{       /* Runs the shared EC speed test with br_ec_p256_m31 on br_secp256r1. */
+       test_speed_ec_inner("EC p256_m31",
+               &br_ec_p256_m31, &br_secp256r1);
+}
+
+static void
+test_speed_ec_p256_m62(void)
+{
+       const br_ec_impl *ec = br_ec_p256_m62_get();
+
+       if (ec != NULL) {       /* NULL: the getter offered nothing to time */
+               test_speed_ec_inner("EC p256_m62", ec, &br_secp256r1);
+       } else {
+               printf("%-30s UNAVAILABLE\n", "EC p256_m62");
+               fflush(stdout); /* keep line order when stdout is buffered */
+       }
+}
+
+static void
+test_speed_ec_p256_m64(void)
+{
+       const br_ec_impl *ec = br_ec_p256_m64_get();
+
+       if (ec != NULL) {       /* NULL: the getter offered nothing to time */
+               test_speed_ec_inner("EC p256_m64", ec, &br_secp256r1);
+       } else {
+               printf("%-30s UNAVAILABLE\n", "EC p256_m64");
+               fflush(stdout); /* keep line order when stdout is buffered */
+       }
+}
+
 static void
 test_speed_ec_prime_i15(void)
 {
@@ -741,6 +1114,32 @@ test_speed_ec_c25519_m31(void)
                &br_ec_c25519_m31, &br_curve25519);
 }
 
+static void
+test_speed_ec_c25519_m62(void)
+{
+       const br_ec_impl *ec = br_ec_c25519_m62_get();
+
+       if (ec != NULL) {       /* NULL: the getter offered nothing to time */
+               test_speed_ec_inner("EC c25519_m62", ec, &br_curve25519);
+       } else {
+               printf("%-30s UNAVAILABLE\n", "EC c25519_m62");
+               fflush(stdout); /* keep line order when stdout is buffered */
+       }
+}
+
+static void
+test_speed_ec_c25519_m64(void)
+{
+       const br_ec_impl *ec = br_ec_c25519_m64_get();
+
+       if (ec != NULL) {       /* NULL: the getter offered nothing to time */
+               test_speed_ec_inner("EC c25519_m64", ec, &br_curve25519);
+       } else {
+               printf("%-30s UNAVAILABLE\n", "EC c25519_m64");
+               fflush(stdout); /* keep line order when stdout is buffered */
+       }
+}
+
 static void
 test_speed_ecdsa_inner(const char *name,
        const br_ec_impl *impl, const br_ec_curve_def *cd,
@@ -835,6 +1234,47 @@ test_speed_ecdsa_p256_m15(void)
                &br_ecdsa_i15_vrfy_asn1);
 }
 
+static void
+test_speed_ecdsa_p256_m31(void)
+{       /* Shared ECDSA speed test: br_ec_p256_m31 with the i31 sign/verify. */
+       test_speed_ecdsa_inner("ECDSA m31 P-256",
+               &br_ec_p256_m31, &br_secp256r1,
+               &br_ecdsa_i31_sign_asn1,
+               &br_ecdsa_i31_vrfy_asn1);
+}
+
+static void
+test_speed_ecdsa_p256_m62(void)
+{
+       const br_ec_impl *ec = br_ec_p256_m62_get();
+
+       if (ec != NULL) {       /* NULL: the getter offered nothing to time */
+               test_speed_ecdsa_inner("ECDSA m62 P-256",
+                       ec, &br_secp256r1,
+                       &br_ecdsa_i31_sign_asn1,
+                       &br_ecdsa_i31_vrfy_asn1);
+       } else {
+               printf("%-30s UNAVAILABLE\n", "ECDSA m62 P-256");
+               fflush(stdout); /* keep line order when stdout is buffered */
+       }
+}
+
+static void
+test_speed_ecdsa_p256_m64(void)
+{
+       const br_ec_impl *ec = br_ec_p256_m64_get();
+
+       if (ec != NULL) {       /* NULL: the getter offered nothing to time */
+               test_speed_ecdsa_inner("ECDSA m64 P-256",
+                       ec, &br_secp256r1,
+                       &br_ecdsa_i31_sign_asn1,
+                       &br_ecdsa_i31_vrfy_asn1);
+       } else {
+               printf("%-30s UNAVAILABLE\n", "ECDSA m64 P-256");
+               fflush(stdout); /* keep line order when stdout is buffered */
+       }
+}
+
 static void
 test_speed_ecdsa_i15(void)
 {
@@ -869,132 +1309,24 @@ test_speed_ecdsa_i31(void)
                &br_ecdsa_i31_vrfy_asn1);
 }
 
-#if 0
-/* obsolete */
-static void
-test_speed_ec_prime_i31_inner(const char *name,
-       const unsigned char *bg, const br_ec_prime_i31_curve *cc)
-{
-       unsigned char bx[80], point[160];
-       uint32_t x[BR_EC_I31_LEN];
-       br_ec_prime_i31_jacobian P;
-       uint32_t xbl;
-       size_t plen;
-       int i;
-       long num;
-
-       xbl = cc->p[0];
-       xbl -= (xbl >> 5);
-       plen = (xbl + 7) >> 3;
-       memset(bx, 'T', sizeof bx);
-       br_i31_decode_reduce(x, bx, sizeof bx, cc->p);
-       br_i31_encode(bx, plen, x);
-       br_ec_prime_i31_decode(&P, bg, 1 + (plen << 1), cc);
-       for (i = 0; i < 10; i ++) {
-               br_ec_prime_i31_mul(&P, bx, plen, cc);
-               br_ec_prime_i31_encode(point, &P, cc);
-       }
-       num = 10;
-       for (;;) {
-               clock_t begin, end;
-               double tt;
-               long k;
-
-               begin = clock();
-               for (k = num; k > 0; k --) {
-                       br_ec_prime_i31_mul(&P, bx, plen, cc);
-                       br_ec_prime_i31_encode(point, &P, cc);
-               }
-               end = clock();
-               tt = (double)(end - begin) / CLOCKS_PER_SEC;
-               if (tt >= 2.0) {
-                       printf("%-30s %8.2f mul/s\n", name,
-                               (double)num / tt);
-                       fflush(stdout);
-                       break;
-               }
-               num <<= 1;
-       }
-}
-
-static void
-test_speed_ec_prime_i31(void)
-{
-       test_speed_ec_prime_i31_inner("EC i31 P-256",
-               br_g_secp256r1, &br_ec_prime_i31_secp256r1);
-       test_speed_ec_prime_i31_inner("EC i31 P-384",
-               br_g_secp384r1, &br_ec_prime_i31_secp384r1);
-       test_speed_ec_prime_i31_inner("EC i31 P-521",
-               br_g_secp521r1, &br_ec_prime_i31_secp521r1);
-}
-
-static void
-test_speed_ec_prime_i32_inner(const char *name,
-       const unsigned char *bg, const br_ec_prime_i32_curve *cc)
-{
-       unsigned char bx[80], point[160];
-       uint32_t x[BR_EC_I32_LEN];
-       br_ec_prime_i32_jacobian P;
-       size_t plen;
-       int i;
-       long num;
-
-       plen = (cc->p[0] + 7) >> 3;
-       memset(bx, 'T', sizeof bx);
-       br_i32_decode_reduce(x, bx, sizeof bx, cc->p);
-       br_i32_encode(bx, plen, x);
-       br_ec_prime_i32_decode(&P, bg, 1 + (plen << 1), cc);
-       for (i = 0; i < 10; i ++) {
-               br_ec_prime_i32_mul(&P, bx, plen, cc);
-               br_ec_prime_i32_encode(point, &P, cc);
-       }
-       num = 10;
-       for (;;) {
-               clock_t begin, end;
-               double tt;
-               long k;
-
-               begin = clock();
-               for (k = num; k > 0; k --) {
-                       br_ec_prime_i32_mul(&P, bx, plen, cc);
-                       br_ec_prime_i32_encode(point, &P, cc);
-               }
-               end = clock();
-               tt = (double)(end - begin) / CLOCKS_PER_SEC;
-               if (tt >= 2.0) {
-                       printf("%-30s %8.2f mul/s\n", name,
-                               (double)num / tt);
-                       fflush(stdout);
-                       break;
-               }
-               num <<= 1;
-       }
-}
-
-static void
-test_speed_ec_prime_i32(void)
-{
-       test_speed_ec_prime_i32_inner("EC i32 P-256",
-               br_g_secp256r1, &br_ec_prime_i32_secp256r1);
-       test_speed_ec_prime_i32_inner("EC i32 P-384",
-               br_g_secp384r1, &br_ec_prime_i32_secp384r1);
-       test_speed_ec_prime_i32_inner("EC i32 P-521",
-               br_g_secp521r1, &br_ec_prime_i32_secp521r1);
-}
-#endif
-
 static void
 test_speed_i31(void)
 {
        static const unsigned char bp[] = {
-               0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
+               /* A 521-bit prime integer (order of the P-521 curve). */
+               0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+               0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
-               0xBC, 0xE6, 0xFA, 0xAD, 0xA7, 0x17, 0x9E, 0x84,
-               0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63, 0x25, 0x51
+               0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+               0xFF, 0xFA, 0x51, 0x86, 0x87, 0x83, 0xBF, 0x2F,
+               0x96, 0x6B, 0x7F, 0xCC, 0x01, 0x48, 0xF7, 0x09,
+               0xA5, 0xD0, 0x3B, 0xB5, 0xC9, 0xB8, 0x89, 0x9C,
+               0x47, 0xAE, 0xBB, 0x6F, 0xB7, 0x1E, 0x91, 0x38,
+               0x64, 0x09
        };
 
        unsigned char tmp[60 + sizeof bp];
-       uint32_t p[10], x[10], y[10], z[10], p0i;
+       uint32_t p[20], x[20], y[20], z[20], uu[60], p0i;
        int i;
        long num;
 
@@ -1076,6 +1408,30 @@ test_speed_i31(void)
                }
                num <<= 1;
        }
+
+       for (i = 0; i < 10; i ++) {
+               br_i31_moddiv(x, y, p, p0i, uu);
+       }
+       num = 10;
+       for (;;) {
+               clock_t begin, end;
+               double tt;
+               long k;
+
+               begin = clock();
+               for (k = num; k > 0; k --) {
+                       br_i31_moddiv(x, y, p, p0i, uu);
+               }
+               end = clock();
+               tt = (double)(end - begin) / CLOCKS_PER_SEC;
+               if (tt >= 2.0) {
+                       printf("%-30s %8.2f ops/s\n", "i31 moddiv",
+                               (double)num / tt);
+                       fflush(stdout);
+                       break;
+               }
+               num <<= 1;
+       }
 }
 
 #if 0
@@ -1256,6 +1612,26 @@ static const struct {
        STU(aes192_ct64_ctr),
        STU(aes256_ct64_ctr),
 
+       STU(aes128_x86ni_cbcenc),
+       STU(aes128_x86ni_cbcdec),
+       STU(aes192_x86ni_cbcenc),
+       STU(aes192_x86ni_cbcdec),
+       STU(aes256_x86ni_cbcenc),
+       STU(aes256_x86ni_cbcdec),
+       STU(aes128_x86ni_ctr),
+       STU(aes192_x86ni_ctr),
+       STU(aes256_x86ni_ctr),
+
+       STU(aes128_pwr8_cbcenc),
+       STU(aes128_pwr8_cbcdec),
+       STU(aes192_pwr8_cbcenc),
+       STU(aes192_pwr8_cbcdec),
+       STU(aes256_pwr8_cbcenc),
+       STU(aes256_pwr8_cbcdec),
+       STU(aes128_pwr8_ctr),
+       STU(aes192_pwr8_ctr),
+       STU(aes256_pwr8_ctr),
+
        STU(des_tab_cbcenc),
        STU(des_tab_cbcdec),
        STU(3des_tab_cbcenc),
@@ -1267,26 +1643,61 @@ static const struct {
        STU(3des_ct_cbcdec),
 
        STU(chacha20_ct),
+       STU(chacha20_sse2),
 
        STU(ghash_ctmul),
        STU(ghash_ctmul32),
        STU(ghash_ctmul64),
+       STU(ghash_pclmul),
+       STU(ghash_pwr8),
 
        STU(poly1305_ctmul),
        STU(poly1305_ctmul32),
+       STU(poly1305_ctmulq),
        STU(poly1305_i15),
 
+       STU(eax_aes128_big),
+       STU(eax_aes192_big),
+       STU(eax_aes256_big),
+       STU(eax_aes128_small),
+       STU(eax_aes192_small),
+       STU(eax_aes256_small),
+       STU(eax_aes128_ct),
+       STU(eax_aes192_ct),
+       STU(eax_aes256_ct),
+       STU(eax_aes128_ct64),
+       STU(eax_aes192_ct64),
+       STU(eax_aes256_ct64),
+       STU(eax_aes128_x86ni),
+       STU(eax_aes192_x86ni),
+       STU(eax_aes256_x86ni),
+       STU(eax_aes128_pwr8),
+       STU(eax_aes192_pwr8),
+       STU(eax_aes256_pwr8),
+
+       STU(shake128),
+       STU(shake256),
+
        STU(rsa_i15),
        STU(rsa_i31),
        STU(rsa_i32),
+       STU(rsa_i62),
        STU(ec_prime_i15),
        STU(ec_prime_i31),
        STU(ec_p256_m15),
+       STU(ec_p256_m31),
+       STU(ec_p256_m62),
+       STU(ec_p256_m64),
        STU(ec_c25519_i15),
        STU(ec_c25519_i31),
        STU(ec_c25519_m15),
        STU(ec_c25519_m31),
+       STU(ec_c25519_m62),
+       STU(ec_c25519_m64),
        STU(ecdsa_p256_m15),
+       STU(ecdsa_p256_m31),
+       STU(ecdsa_p256_m62),
+       STU(ecdsa_p256_m64),
        STU(ecdsa_i15),
        STU(ecdsa_i31),