2 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 * During key schedule, we need to apply bit extraction PC-2 then permute
29 * things into our bitslice representation. PC-2 extracts 48 bits out
30 * of two 28-bit words (kl and kr), and we store these bits into two
31 * 32-bit words sk0 and sk1.
33 * -- bit 16+x of sk0 comes from bit QL0[x] of kl
34 * -- bit x of sk0 comes from bit QR0[x] of kr
35 * -- bit 16+x of sk1 comes from bit QL1[x] of kl
36 * -- bit x of sk1 comes from bit QR1[x] of kr
39 static const unsigned char QL0
[] = {
40 17, 4, 27, 23, 13, 22, 7, 18,
41 16, 24, 2, 20, 1, 8, 15, 26
44 static const unsigned char QR0
[] = {
45 25, 19, 9, 1, 5, 11, 23, 8,
46 17, 0, 22, 3, 6, 20, 27, 24
49 static const unsigned char QL1
[] = {
50 28, 28, 14, 11, 28, 28, 25, 0,
51 28, 28, 5, 9, 28, 28, 12, 21
54 static const unsigned char QR1
[] = {
55 28, 28, 15, 4, 28, 28, 26, 16,
56 28, 28, 12, 7, 28, 28, 10, 14
60 * 32-bit rotation. The C compiler is supposed to recognize it as a
61 * rotation and use the local architecture rotation opcode (if available).
63 static inline uint32_t
64 rotl(uint32_t x
, int n
)
66 return (x
<< n
) | (x
>> (32 - n
));
70 * Compute key schedule for 8 key bytes (produces 32 subkey words).
73 keysched_unit(uint32_t *skey
, const void *key
)
77 br_des_keysched_unit(skey
, key
);
80 * Apply PC-2 + bitslicing.
82 for (i
= 0; i
< 16; i
++) {
83 uint32_t kl
, kr
, sk0
, sk1
;
86 kl
= skey
[(i
<< 1) + 0];
87 kr
= skey
[(i
<< 1) + 1];
90 for (j
= 0; j
< 16; j
++) {
93 sk0
|= ((kl
>> QL0
[j
]) & (uint32_t)1) << 16;
94 sk0
|= (kr
>> QR0
[j
]) & (uint32_t)1;
95 sk1
|= ((kl
>> QL1
[j
]) & (uint32_t)1) << 16;
96 sk1
|= (kr
>> QR1
[j
]) & (uint32_t)1;
99 skey
[(i
<< 1) + 0] = sk0
;
100 skey
[(i
<< 1) + 1] = sk1
;
105 * Speed-optimized version for PC-2 + bitslicing.
106 * (Unused. Kept for reference only.)
108 sk0
= kl
& (uint32_t)0x00100000;
109 sk0
|= (kl
& (uint32_t)0x08008000) << 2;
110 sk0
|= (kl
& (uint32_t)0x00400000) << 4;
111 sk0
|= (kl
& (uint32_t)0x00800000) << 5;
112 sk0
|= (kl
& (uint32_t)0x00040000) << 6;
113 sk0
|= (kl
& (uint32_t)0x00010000) << 7;
114 sk0
|= (kl
& (uint32_t)0x00000100) << 10;
115 sk0
|= (kl
& (uint32_t)0x00022000) << 14;
116 sk0
|= (kl
& (uint32_t)0x00000082) << 18;
117 sk0
|= (kl
& (uint32_t)0x00000004) << 19;
118 sk0
|= (kl
& (uint32_t)0x04000000) >> 10;
119 sk0
|= (kl
& (uint32_t)0x00000010) << 26;
120 sk0
|= (kl
& (uint32_t)0x01000000) >> 2;
122 sk0
|= kr
& (uint32_t)0x00000100;
123 sk0
|= (kr
& (uint32_t)0x00000008) << 1;
124 sk0
|= (kr
& (uint32_t)0x00000200) << 4;
125 sk0
|= rotl(kr
& (uint32_t)0x08000021, 6);
126 sk0
|= (kr
& (uint32_t)0x01000000) >> 24;
127 sk0
|= (kr
& (uint32_t)0x00000002) << 11;
128 sk0
|= (kr
& (uint32_t)0x00100000) >> 18;
129 sk0
|= (kr
& (uint32_t)0x00400000) >> 17;
130 sk0
|= (kr
& (uint32_t)0x00800000) >> 14;
131 sk0
|= (kr
& (uint32_t)0x02020000) >> 10;
132 sk0
|= (kr
& (uint32_t)0x00080000) >> 5;
133 sk0
|= (kr
& (uint32_t)0x00000040) >> 3;
134 sk0
|= (kr
& (uint32_t)0x00000800) >> 1;
136 sk1
= kl
& (uint32_t)0x02000000;
137 sk1
|= (kl
& (uint32_t)0x00001000) << 5;
138 sk1
|= (kl
& (uint32_t)0x00000200) << 11;
139 sk1
|= (kl
& (uint32_t)0x00004000) << 15;
140 sk1
|= (kl
& (uint32_t)0x00000020) << 16;
141 sk1
|= (kl
& (uint32_t)0x00000800) << 17;
142 sk1
|= (kl
& (uint32_t)0x00000001) << 24;
143 sk1
|= (kl
& (uint32_t)0x00200000) >> 5;
145 sk1
|= (kr
& (uint32_t)0x00000010) << 8;
146 sk1
|= (kr
& (uint32_t)0x04000000) >> 17;
147 sk1
|= (kr
& (uint32_t)0x00004000) >> 14;
148 sk1
|= (kr
& (uint32_t)0x00000400) >> 9;
149 sk1
|= (kr
& (uint32_t)0x00010000) >> 8;
150 sk1
|= (kr
& (uint32_t)0x00001000) >> 7;
151 sk1
|= (kr
& (uint32_t)0x00000080) >> 3;
152 sk1
|= (kr
& (uint32_t)0x00008000) >> 2;
158 br_des_ct_keysched(uint32_t *skey
, const void *key
, size_t key_len
)
162 keysched_unit(skey
, key
);
165 keysched_unit(skey
, key
);
166 keysched_unit(skey
+ 32, (const unsigned char *)key
+ 8);
167 br_des_rev_skey(skey
+ 32);
168 memcpy(skey
+ 64, skey
, 32 * sizeof *skey
);
171 keysched_unit(skey
, key
);
172 keysched_unit(skey
+ 32, (const unsigned char *)key
+ 8);
173 br_des_rev_skey(skey
+ 32);
174 keysched_unit(skey
+ 64, (const unsigned char *)key
+ 16);
180 * DES confusion function. This function performs expansion E (32 to
181 * 48 bits), XOR with subkey, S-boxes, and permutation P.
183 static inline uint32_t
184 Fconf(uint32_t r0
, const uint32_t *sk
)
187 * Each 6->4 S-box is virtually turned into four 6->1 boxes; we
188 * thus end up with 32 boxes that we call "T-boxes" here. We will
189 * evaluate them with bitslice code.
191 * Each T-box is a circuit of multiplexers (sort of) and thus
192 * takes 70 inputs: the 6 actual T-box inputs, and 64 constants
193 * that describe the T-box output for all combinations of the
194 * 6 inputs. With this model, all T-boxes are identical (with
195 * distinct inputs) and thus can be executed in parallel with
198 * T-boxes are numbered from 0 to 31, in least-to-most
199 * significant order. Thus, S-box S1 corresponds to T-boxes 31,
200 * 30, 29 and 28, in that order. T-box 'n' is computed with the
201 * bits at rank 'n' in the 32-bit words.
203 * Words x0 to x5 contain the T-box inputs 0 to 5.
205 uint32_t x0
, x1
, x2
, x3
, x4
, x5
, z0
;
206 uint32_t y0
, y1
, y2
, y3
, y4
, y5
, y6
, y7
, y8
, y9
;
207 uint32_t y10
, y11
, y12
, y13
, y14
, y15
, y16
, y17
, y18
, y19
;
208 uint32_t y20
, y21
, y22
, y23
, y24
, y25
, y26
, y27
, y28
, y29
;
212 * Spread input bits over the 6 input words x*.
214 x1
= r0
& (uint32_t)0x11111111;
215 x2
= (r0
>> 1) & (uint32_t)0x11111111;
216 x3
= (r0
>> 2) & (uint32_t)0x11111111;
217 x4
= (r0
>> 3) & (uint32_t)0x11111111;
222 x0
= (x4
<< 4) | (x4
>> 28);
223 x5
= (x1
>> 4) | (x1
<< 28);
226 * XOR with the subkey for this round.
236 * The T-boxes are done in parallel, since they all use a
237 * "tree of multiplexer". We use "fake multiplexers":
241 * computes y as either 'a' (if x == 0) or 'a ^ b' (if x == 1).
243 y0
= (uint32_t)0xEFA72C4D ^ (x0
& (uint32_t)0xEC7AC69C);
244 y1
= (uint32_t)0xAEAAEDFF ^ (x0
& (uint32_t)0x500FB821);
245 y2
= (uint32_t)0x37396665 ^ (x0
& (uint32_t)0x40EFA809);
246 y3
= (uint32_t)0x68D7B833 ^ (x0
& (uint32_t)0xA5EC0B28);
247 y4
= (uint32_t)0xC9C755BB ^ (x0
& (uint32_t)0x252CF820);
248 y5
= (uint32_t)0x73FC3606 ^ (x0
& (uint32_t)0x40205801);
249 y6
= (uint32_t)0xA2A0A918 ^ (x0
& (uint32_t)0xE220F929);
250 y7
= (uint32_t)0x8222BD90 ^ (x0
& (uint32_t)0x44A3F9E1);
251 y8
= (uint32_t)0xD6B6AC77 ^ (x0
& (uint32_t)0x794F104A);
252 y9
= (uint32_t)0x3069300C ^ (x0
& (uint32_t)0x026F320B);
253 y10
= (uint32_t)0x6CE0D5CC ^ (x0
& (uint32_t)0x7640B01A);
254 y11
= (uint32_t)0x59A9A22D ^ (x0
& (uint32_t)0x238F1572);
255 y12
= (uint32_t)0xAC6D0BD4 ^ (x0
& (uint32_t)0x7A63C083);
256 y13
= (uint32_t)0x21C83200 ^ (x0
& (uint32_t)0x11CCA000);
257 y14
= (uint32_t)0xA0E62188 ^ (x0
& (uint32_t)0x202F69AA);
258 /* y15 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */
259 y16
= (uint32_t)0xAF7D655A ^ (x0
& (uint32_t)0x51B33BE9);
260 y17
= (uint32_t)0xF0168AA3 ^ (x0
& (uint32_t)0x3B0FE8AE);
261 y18
= (uint32_t)0x90AA30C6 ^ (x0
& (uint32_t)0x90BF8816);
262 y19
= (uint32_t)0x5AB2750A ^ (x0
& (uint32_t)0x09E34F9B);
263 y20
= (uint32_t)0x5391BE65 ^ (x0
& (uint32_t)0x0103BE88);
264 y21
= (uint32_t)0x93372BAF ^ (x0
& (uint32_t)0x49AC8E25);
265 y22
= (uint32_t)0xF288210C ^ (x0
& (uint32_t)0x922C313D);
266 y23
= (uint32_t)0x920AF5C0 ^ (x0
& (uint32_t)0x70EF31B0);
267 y24
= (uint32_t)0x63D312C0 ^ (x0
& (uint32_t)0x6A707100);
268 y25
= (uint32_t)0x537B3006 ^ (x0
& (uint32_t)0xB97C9011);
269 y26
= (uint32_t)0xA2EFB0A5 ^ (x0
& (uint32_t)0xA320C959);
270 y27
= (uint32_t)0xBC8F96A5 ^ (x0
& (uint32_t)0x6EA0AB4A);
271 y28
= (uint32_t)0xFAD176A5 ^ (x0
& (uint32_t)0x6953DDF8);
272 y29
= (uint32_t)0x665A14A3 ^ (x0
& (uint32_t)0xF74F3E2B);
273 y30
= (uint32_t)0xF2EFF0CC ^ (x0
& (uint32_t)0xF0306CAD);
274 /* y31 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */
281 y5
= y10
^ (x1
& y11
);
282 y6
= y12
^ (x1
& y13
);
283 y7
= y14
; /* was: y14 ^ (x1 & y15) */
284 y8
= y16
^ (x1
& y17
);
285 y9
= y18
^ (x1
& y19
);
286 y10
= y20
^ (x1
& y21
);
287 y11
= y22
^ (x1
& y23
);
288 y12
= y24
^ (x1
& y25
);
289 y13
= y26
^ (x1
& y27
);
290 y14
= y28
^ (x1
& y29
);
291 y15
= y30
; /* was: y30 ^ (x1 & y31) */
298 y5
= y10
^ (x2
& y11
);
299 y6
= y12
^ (x2
& y13
);
300 y7
= y14
^ (x2
& y15
);
314 * -- Each bit move is converted into a mask + left rotation.
315 * -- Rotations that use the same movement are coalesced together.
316 * -- Left and right shifts are used as alternatives to a rotation
317 * where appropriate (this will help architectures that do not have
318 * a rotation opcode).
320 z0
= (y0
& (uint32_t)0x00000004) << 3;
321 z0
|= (y0
& (uint32_t)0x00004000) << 4;
322 z0
|= rotl(y0
& 0x12020120, 5);
323 z0
|= (y0
& (uint32_t)0x00100000) << 6;
324 z0
|= (y0
& (uint32_t)0x00008000) << 9;
325 z0
|= (y0
& (uint32_t)0x04000000) >> 22;
326 z0
|= (y0
& (uint32_t)0x00000001) << 11;
327 z0
|= rotl(y0
& 0x20000200, 12);
328 z0
|= (y0
& (uint32_t)0x00200000) >> 19;
329 z0
|= (y0
& (uint32_t)0x00000040) << 14;
330 z0
|= (y0
& (uint32_t)0x00010000) << 15;
331 z0
|= (y0
& (uint32_t)0x00000002) << 16;
332 z0
|= rotl(y0
& 0x40801800, 17);
333 z0
|= (y0
& (uint32_t)0x00080000) >> 13;
334 z0
|= (y0
& (uint32_t)0x00000010) << 21;
335 z0
|= (y0
& (uint32_t)0x01000000) >> 10;
336 z0
|= rotl(y0
& 0x88000008, 24);
337 z0
|= (y0
& (uint32_t)0x00000480) >> 7;
338 z0
|= (y0
& (uint32_t)0x00442000) >> 6;
343 * Process one block through 16 successive rounds, omitting the swap
344 * in the final round.
347 process_block_unit(uint32_t *pl
, uint32_t *pr
, const uint32_t *sk_exp
)
354 for (i
= 0; i
< 16; i
++) {
357 t
= l
^ Fconf(r
, sk_exp
);
368 br_des_ct_process_block(unsigned num_rounds
,
369 const uint32_t *sk_exp
, void *block
)
376 r
= br_dec32be(buf
+ 4);
377 br_des_do_IP(&l
, &r
);
378 while (num_rounds
-- > 0) {
379 process_block_unit(&l
, &r
, sk_exp
);
382 br_des_do_invIP(&l
, &r
);
384 br_enc32be(buf
+ 4, r
);
389 br_des_ct_skey_expand(uint32_t *sk_exp
,
390 unsigned num_rounds
, const uint32_t *skey
)
393 while (num_rounds
-- > 0) {
394 uint32_t v
, w0
, w1
, w2
, w3
;
398 w1
= (v
>> 1) & 0x11111111;
399 w2
= (v
>> 2) & 0x11111111;
400 w3
= (v
>> 3) & 0x11111111;
401 *sk_exp
++ = (w0
<< 4) - w0
;
402 *sk_exp
++ = (w1
<< 4) - w1
;
403 *sk_exp
++ = (w2
<< 4) - w2
;
404 *sk_exp
++ = (w3
<< 4) - w3
;
407 w1
= (v
>> 1) & 0x11111111;
408 *sk_exp
++ = (w0
<< 4) - w0
;
409 *sk_exp
++ = (w1
<< 4) - w1
;