/*
 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
/*
 * Perform the inner processing of blocks for Poly1305. The accumulator
 * and the r key are provided as arrays of 26-bit words (these words
 * are allowed to have an extra bit, i.e. use 27 bits).
 *
 * Parameters:
 *   acc    accumulator, five words, least significant first; read on
 *          entry and overwritten on exit
 *   r      multiplier key, five words, least significant first
 *   data   input bytes
 *   len    input length in bytes; when it is not a multiple of 16, the
 *          last block is right-padded with zeros to 16 bytes
 *
 * Every block, including a padded one, gets the extra 2^128 "high bit"
 * added before it is accumulated.
 *
 * On output, all accumulator words fit on 26 bits, except acc[1], which
 * may be slightly larger (but by a very small amount only).
 *
 * NOTE(review): the load/store of r[] and acc[], the block loop and the
 * carry chain were restored around the surviving statements of a
 * damaged copy of this file; confirm against the reference sources.
 */
static void
poly1305_inner(uint32_t *acc, const uint32_t *r, const void *data, size_t len)
{
	/*
	 * Implementation notes: we split the 130-bit values into five
	 * 26-bit words. This gives us some space for carries.
	 *
	 * This code is inspired from the public-domain code available
	 * on:
	 *      https://github.com/floodyberry/poly1305-donna
	 *
	 * Since we compute modulo 2^130-5, the "upper words" become
	 * low words with a factor of 5; that is, x*2^130 = x*5 mod p.
	 */
	const unsigned char *buf;
	uint32_t a0, a1, a2, a3, a4;
	uint32_t r0, r1, r2, r3, r4;
	uint32_t u1, u2, u3, u4;

	r0 = r[0];
	r1 = r[1];
	r2 = r[2];
	r3 = r[3];
	r4 = r[4];

	/*
	 * Pre-multiplied key words, used for the partial products that
	 * land at or above 2^130 (they wrap around with a factor of 5).
	 */
	u1 = r1 * 5;
	u2 = r2 * 5;
	u3 = r3 * 5;
	u4 = r4 * 5;

	a0 = acc[0];
	a1 = acc[1];
	a2 = acc[2];
	a3 = acc[3];
	a4 = acc[4];

	buf = data;
	while (len > 0) {
		uint64_t w0, w1, w2, w3, w4;
		uint64_t c;
		unsigned char tmp[16];

		/*
		 * If there is a partial block, right-pad it with zeros.
		 * After this, 'buf' points to a full 16-byte block, and
		 * 'len' is adjusted so that the loop ends after it.
		 */
		if (len < 16) {
			memset(tmp, 0, sizeof tmp);
			memcpy(tmp, buf, len);
			buf = tmp;
			len = 16;
		}

		/*
		 * Decode next block and apply the "high bit"; that value
		 * is added to the accumulator. The five reads overlap:
		 * each one extracts a 26-bit slice starting at bit
		 * 0, 26, 52, 78 and 104, respectively. The high bit
		 * (2^128) is bit 24 of the top word.
		 */
		a0 += br_dec32le(buf) & 0x03FFFFFF;
		a1 += (br_dec32le(buf +  3) >> 2) & 0x03FFFFFF;
		a2 += (br_dec32le(buf +  6) >> 4) & 0x03FFFFFF;
		a3 += (br_dec32le(buf +  9) >> 6) & 0x03FFFFFF;
		a4 += (br_dec32le(buf + 12) >> 8) | 0x01000000;

		/*
		 * Compute multiplication. Operands are below 2^30, so
		 * the sum of five 64-bit products cannot overflow.
		 */
#define M(x, y)   ((uint64_t)(x) * (uint64_t)(y))

		w0 = M(a0, r0) + M(a1, u4) + M(a2, u3) + M(a3, u2) + M(a4, u1);
		w1 = M(a0, r1) + M(a1, r0) + M(a2, u4) + M(a3, u3) + M(a4, u2);
		w2 = M(a0, r2) + M(a1, r1) + M(a2, r0) + M(a3, u4) + M(a4, u3);
		w3 = M(a0, r3) + M(a1, r2) + M(a2, r1) + M(a3, r0) + M(a4, u4);
		w4 = M(a0, r4) + M(a1, r3) + M(a2, r2) + M(a3, r1) + M(a4, r0);

#undef M

		/*
		 * Perform some (partial) modular reduction. This step is
		 * enough to keep values in ranges such that there won't
		 * be carry overflows. Most of the reduction was done in
		 * the multiplication step (by using the 'u*' values, and
		 * using the fact that 2^130 = 5 mod p); here we perform
		 * some carry propagation.
		 */
		c = w0 >> 26;
		a0 = (uint32_t)w0 & 0x3FFFFFF;
		w1 += c;
		c = w1 >> 26;
		a1 = (uint32_t)w1 & 0x3FFFFFF;
		w2 += c;
		c = w2 >> 26;
		a2 = (uint32_t)w2 & 0x3FFFFFF;
		w3 += c;
		c = w3 >> 26;
		a3 = (uint32_t)w3 & 0x3FFFFFF;
		w4 += c;
		c = w4 >> 26;
		a4 = (uint32_t)w4 & 0x3FFFFFF;

		/*
		 * The carry out of the top word has weight 2^130, hence
		 * it comes back into the low word with a factor of 5.
		 * One more carry step is applied, into a1, which is why
		 * a1 may end up slightly above 26 bits.
		 */
		a0 += (uint32_t)c * 5;
		a1 += a0 >> 26;
		a0 &= 0x3FFFFFF;

		buf += 16;
		len -= 16;
	}

	acc[0] = a0;
	acc[1] = a1;
	acc[2] = a2;
	acc[3] = a3;
	acc[4] = a4;
}
146 /* see bearssl_block.h */
148 br_poly1305_ctmul_run(const void *key
, const void *iv
,
149 void *data
, size_t len
, const void *aad
, size_t aad_len
,
150 void *tag
, br_chacha20_run ichacha
, int encrypt
)
152 unsigned char pkey
[32], foot
[16];
153 uint32_t r
[5], acc
[5], cc
, ctl
, hi
;
158 * Compute the MAC key. The 'r' value is the first 16 bytes of
161 memset(pkey
, 0, sizeof pkey
);
162 ichacha(key
, iv
, 0, pkey
, sizeof pkey
);
165 * If encrypting, ChaCha20 must run first, followed by Poly1305.
166 * When decrypting, the operations are reversed.
169 ichacha(key
, iv
, 1, data
, len
);
173 * Run Poly1305. We must process the AAD, then ciphertext, then
174 * the footer (with the lengths). Note that the AAD and ciphertext
175 * are meant to be padded with zeros up to the next multiple of 16,
176 * and the length of the footer is 16 bytes as well.
180 * Decode the 'r' value into 26-bit words, with the "clamping"
183 r
[0] = br_dec32le(pkey
) & 0x03FFFFFF;
184 r
[1] = (br_dec32le(pkey
+ 3) >> 2) & 0x03FFFF03;
185 r
[2] = (br_dec32le(pkey
+ 6) >> 4) & 0x03FFC0FF;
186 r
[3] = (br_dec32le(pkey
+ 9) >> 6) & 0x03F03FFF;
187 r
[4] = (br_dec32le(pkey
+ 12) >> 8) & 0x000FFFFF;
192 memset(acc
, 0, sizeof acc
);
195 * Process the additional authenticated data, ciphertext, and
196 * footer in due order.
198 br_enc64le(foot
, (uint64_t)aad_len
);
199 br_enc64le(foot
+ 8, (uint64_t)len
);
200 poly1305_inner(acc
, r
, aad
, aad_len
);
201 poly1305_inner(acc
, r
, data
, len
);
202 poly1305_inner(acc
, r
, foot
, sizeof foot
);
205 * Finalise modular reduction. This is done with carry propagation
206 * and applying the '2^130 = -5 mod p' rule. Note that the output
207 * of poly1035_inner() is already mostly reduced, since only
208 * acc[1] may be (very slightly) above 2^26. A single loop back
209 * to acc[1] will be enough to make the value fit in 130 bits.
212 for (i
= 1; i
<= 6; i
++) {
215 j
= (i
>= 5) ? i
- 5 : i
;
218 acc
[j
] &= 0x03FFFFFF;
222 * We may still have a value in the 2^130-5..2^130-1 range, in
223 * which case we must reduce it again. The code below selects,
224 * in constant-time, between 'acc' and 'acc-p',
226 ctl
= GT(acc
[0], 0x03FFFFFA);
227 for (i
= 1; i
< 5; i
++) {
228 ctl
&= EQ(acc
[i
], 0x03FFFFFF);
231 for (i
= 0; i
< 5; i
++) {
237 acc
[i
] = MUX(ctl
, t
, acc
[i
]);
241 * Convert back the accumulator to 32-bit words, and add the
242 * 's' value (second half of pkey[]). That addition is done
245 w
= (uint64_t)acc
[0] + ((uint64_t)acc
[1] << 26) + br_dec32le(pkey
+ 16);
246 br_enc32le((unsigned char *)tag
, (uint32_t)w
);
247 w
= (w
>> 32) + ((uint64_t)acc
[2] << 20) + br_dec32le(pkey
+ 20);
248 br_enc32le((unsigned char *)tag
+ 4, (uint32_t)w
);
249 w
= (w
>> 32) + ((uint64_t)acc
[3] << 14) + br_dec32le(pkey
+ 24);
250 br_enc32le((unsigned char *)tag
+ 8, (uint32_t)w
);
251 hi
= (uint32_t)(w
>> 32) + (acc
[4] << 8) + br_dec32le(pkey
+ 28);
252 br_enc32le((unsigned char *)tag
+ 12, hi
);
255 * If decrypting, then ChaCha20 runs _after_ Poly1305.
258 ichacha(key
, iv
, 1, data
, len
);