From: Thomas Pornin Date: Thu, 3 Jan 2019 18:50:54 +0000 (+0100) Subject: Some small performance improvements on 32-bit architectures. X-Git-Url: https://bearssl.org/gitweb//home/git/?a=commitdiff_plain;h=001d094d140488def90cb3876d5c03f4d79b3e27;p=BearSSL Some small performance improvements on 32-bit architectures. --- diff --git a/src/ec/ec_c25519_m31.c b/src/ec/ec_c25519_m31.c index 95f1275..1dd6d51 100644 --- a/src/ec/ec_c25519_m31.c +++ b/src/ec/ec_c25519_m31.c @@ -372,8 +372,7 @@ reduce_final_f255(uint32_t *d) static void f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b) { - uint32_t t[18]; - uint64_t cc, w; + uint32_t t[18], cc; int i; /* @@ -389,21 +388,42 @@ f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b) * offset 9*30 = 270, word 9+k must be added to word k with * a factor of 19*2^15 = 622592. The extra bits in word 8 are also * added that way. + * + * Keeping the carry on 32 bits helps with 32-bit architectures, + * and does not noticeably impact performance on 64-bit systems. */ - cc = MUL31(t[8] >> 15, 19); + cc = MUL15(t[8] >> 15, 19); /* at most 19*(2^15-1) = 622573 */ t[8] &= 0x7FFF; for (i = 0; i < 9; i ++) { - w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592); + uint64_t w; + + w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592); t[i] = (uint32_t)w & 0x3FFFFFFF; - cc = w >> 30; + cc = (uint32_t)(w >> 30); /* at most 622592 */ } - cc = MUL31(w >> 15, 19); + + /* + * Original product was up to (2^256-1)^2, i.e. a 512-bit integer. + * This was split into two parts (upper of 257 bits, lower of 255 + * bits), and the upper was added to the lower with a factor 19, + * which means that the intermediate value is less than 77*2^255 + * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are + * less than 77, and the initial carry cc is at most 76*19 = 1444. + */ + cc = MUL15(t[8] >> 15, 19); t[8] &= 0x7FFF; for (i = 0; i < 9; i ++) { - w = t[i] + cc; - d[i] = (uint32_t)w & 0x3FFFFFFF; - cc = w >> 30; + uint32_t z; + + z = t[i] + cc; + d[i] = z & 0x3FFFFFFF; + cc = z >> 30; } + + /* + * Final result is at most 2^255 + 1443. In particular, the last + * carry is necessarily 0, since t[8] was truncated to 15 bits. + */ } /* @@ -415,8 +435,7 @@ f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b) static void f255_square(uint32_t *d, const uint32_t *a) { - uint32_t t[18]; - uint64_t cc, w; + uint32_t t[18], cc; int i; /* @@ -428,24 +447,25 @@ f255_square(uint32_t *d, const uint32_t *a) /* * Modular reduction: each high word is added where necessary. - * Since the modulus is 2^255-19 and word 9 corresponds to - * offset 9*30 = 270, word 9+k must be added to word k with - * a factor of 19*2^15 = 622592. The extra bits in word 8 are also - * added that way. + * See f255_mul() for details on the reduction and carry limits. */ - cc = MUL31(t[8] >> 15, 19); + cc = MUL15(t[8] >> 15, 19); t[8] &= 0x7FFF; for (i = 0; i < 9; i ++) { - w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592); + uint64_t w; + + w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592); t[i] = (uint32_t)w & 0x3FFFFFFF; - cc = w >> 30; + cc = (uint32_t)(w >> 30); } - cc = MUL31(w >> 15, 19); + cc = MUL15(t[8] >> 15, 19); t[8] &= 0x7FFF; for (i = 0; i < 9; i ++) { - w = t[i] + cc; - d[i] = (uint32_t)w & 0x3FFFFFFF; - cc = w >> 30; + uint32_t z; + + z = t[i] + cc; + d[i] = z & 0x3FFFFFFF; + cc = z >> 30; } } @@ -515,20 +535,31 @@ static void f255_mul_a24(uint32_t *d, const uint32_t *a) { int i; - uint64_t cc, w; + uint64_t w; + uint32_t cc; + /* + * a[] is over 256 bits, thus a[8] has length at most 16 bits. + * We single out the processing of the last word: intermediate + * value w is up to 121665*2^16, yielding a carry for the next + * loop of at most 19*(121665*2^16/2^15) = 4623289. + */ cc = 0; - for (i = 0; i < 9; i ++) { - w = MUL31(a[i], 121665) + cc; + for (i = 0; i < 8; i ++) { + w = MUL31(a[i], 121665) + (uint64_t)cc; d[i] = (uint32_t)w & 0x3FFFFFFF; - cc = w >> 30; + cc = (uint32_t)(w >> 30); } - cc = MUL31((uint32_t)(w >> 15), 19); - d[8] &= 0x7FFF; + w = MUL31(a[8], 121665) + (uint64_t)cc; + d[8] = (uint32_t)w & 0x7FFF; + cc = MUL15((uint32_t)(w >> 15), 19); + for (i = 0; i < 9; i ++) { - w = (uint64_t)d[i] + cc; - d[i] = w & 0x3FFFFFFF; - cc = w >> 30; + uint32_t z; + + z = d[i] + cc; + d[i] = z & 0x3FFFFFFF; + cc = z >> 30; } } diff --git a/src/ec/ec_p256_m15.c b/src/ec/ec_p256_m15.c index 6ce57e0..8d68d1d 100644 --- a/src/ec/ec_p256_m15.c +++ b/src/ec/ec_p256_m15.c @@ -1739,7 +1739,7 @@ p256_decode(p256_jacobian *P, const void *src, size_t len) memcpy(P->y, ty, sizeof ty); memset(P->z, 0, sizeof P->z); P->z[0] = 1; - return NEQ(bad, 0) ^ 1; + return EQ(bad, 0); } /* diff --git a/src/ec/ec_p256_m31.c b/src/ec/ec_p256_m31.c index ec22c3e..d57ef7b 100644 --- a/src/ec/ec_p256_m31.c +++ b/src/ec/ec_p256_m31.c @@ -1089,7 +1089,7 @@ p256_decode(p256_jacobian *P, const void *src, size_t len) memcpy(P->y, ty, sizeof ty); memset(P->z, 0, sizeof P->z); P->z[0] = 1; - return NEQ(bad, 0) ^ 1; + return EQ(bad, 0); } /* diff --git a/src/inner.h b/src/inner.h index 1b52673..21d996e 100644 --- a/src/inner.h +++ b/src/inner.h @@ -114,6 +114,8 @@ #define BR_64 1 #elif defined(__x86_64__) || defined(_M_X64) #define BR_64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define BR_64 1 #endif #endif diff --git a/src/int/i31_montmul.c b/src/int/i31_montmul.c index 8066808..758f8f4 100644 --- a/src/int/i31_montmul.c +++ b/src/int/i31_montmul.c @@ -29,16 +29,45 @@ void br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y, const uint32_t *m, uint32_t m0i) { + /* + * Each outer loop iteration computes: + * d <- (d + xu*y + f*m) / 2^31 + * We have xu <= 2^31-1 and f <= 2^31-1. + * Thus, if d <= 2*m-1 on input, then: + * 2*m-1 + 2*(2^31-1)*m <= (2^32)*m-1 + * and the new d value is less than 2*m. + * + * We represent d over 31-bit words, with an extra word 'dh' + * which can thus be only 0 or 1. + */ size_t len, len4, u, v; - uint64_t dh; + uint32_t dh; len = (m[0] + 31) >> 5; len4 = len & ~(size_t)3; br_i31_zero(d, m[0]); dh = 0; for (u = 0; u < len; u ++) { + /* + * The carry for each operation fits on 32 bits: + * d[v+1] <= 2^31-1 + * xu*y[v+1] <= (2^31-1)*(2^31-1) + * f*m[v+1] <= (2^31-1)*(2^31-1) + * r <= 2^32-1 + * (2^31-1) + 2*(2^31-1)*(2^31-1) + (2^32-1) = 2^63 - 2^31 + * After division by 2^31, the new r is then at most 2^32-1 + * + * Using a 32-bit carry has performance benefits on 32-bit + * systems; however, on 64-bit architectures, we prefer to + * keep the carry (r) in a 64-bit register, thus avoiding some + * "clear high bits" operations. + */ uint32_t f, xu; - uint64_t r, zh; +#if BR_64 + uint64_t r; +#else + uint32_t r; +#endif xu = x[u + 1]; f = MUL31_lo((d[1] + MUL31_lo(x[u + 1], y[1])), m0i); @@ -73,9 +102,14 @@ br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y, d[v] = (uint32_t)z & 0x7FFFFFFF; } - zh = dh + r; - d[len] = (uint32_t)zh & 0x7FFFFFFF; - dh = zh >> 31; + /* + * Since the new dh can only be 0 or 1, the addition of + * the old dh with the carry MUST fit on 32 bits, and + * thus can be done into dh itself. + */ + dh += r; + d[len] = dh & 0x7FFFFFFF; + dh >>= 31; } /* diff --git a/src/int/i31_mulacc.c b/src/int/i31_mulacc.c index 024d095..7410e54 100644 --- a/src/int/i31_mulacc.c +++ b/src/int/i31_mulacc.c @@ -45,7 +45,20 @@ br_i31_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b) for (u = 0; u < blen; u ++) { uint32_t f; size_t v; + + /* + * Carry always fits on 31 bits; we want to keep it in a + * 32-bit register on 32-bit architectures (on a 64-bit + * architecture, cast down from 64 to 32 bits means + * clearing the high bits, which is not free; on a 32-bit + * architecture, the same operation really means ignoring + * the top register, which has negative or zero cost). + */ +#if BR_64 uint64_t cc; +#else + uint32_t cc; +#endif f = b[1 + u]; cc = 0; diff --git a/src/int/i32_mulacc.c b/src/int/i32_mulacc.c index f62c782..55da385 100644 --- a/src/int/i32_mulacc.c +++ b/src/int/i32_mulacc.c @@ -36,7 +36,11 @@ br_i32_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b) for (u = 0; u < blen; u ++) { uint32_t f; size_t v; +#if BR_64 uint64_t cc; +#else + uint32_t cc; +#endif f = b[1 + u]; cc = 0;