diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2017-01-19 16:45:41 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2017-01-19 16:45:41 +0100 |
commit | f7806f9d8fc889f1d6cd365b69d9d99a4a5a6e26 (patch) | |
tree | f7dca43d7506da675080d3a3b26dcde3835ecdeb /networking/tls_symmetric.h | |
parent | 432f1ae2ff184e07fa78bd3797073094069e521d (diff) | |
download | busybox-w32-f7806f9d8fc889f1d6cd365b69d9d99a4a5a6e26.tar.gz busybox-w32-f7806f9d8fc889f1d6cd365b69d9d99a4a5a6e26.tar.bz2 busybox-w32-f7806f9d8fc889f1d6cd365b69d9d99a4a5a6e26.zip |
tls: fix ROL/ROR x86 optimization
ALWAYS_INLINE:
function old new delta
psAesInitKey 825 824 -1
ROR 5 - -5
setup_mix2 148 134 -14
psAesDecryptBlock 1184 1139 -45
psAesEncryptBlock 1193 1102 -91
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 0/4 up/down: 0/-156) Total: -156 bytes
ALWAYS_INLINE + __builtin_constant_p(shift_cnt):
function old new delta
ROR 5 - -5
psAesInitKey 825 818 -7
setup_mix2 148 123 -25
psAesDecryptBlock 1184 1078 -106
psAesEncryptBlock 1193 1017 -176
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 0/4 up/down: 0/-319) Total: -319 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'networking/tls_symmetric.h')
-rw-r--r-- | networking/tls_symmetric.h | 19 |
1 file changed, 14 insertions, 5 deletions
diff --git a/networking/tls_symmetric.h b/networking/tls_symmetric.h index b6b55c78c..8488b437e 100644 --- a/networking/tls_symmetric.h +++ b/networking/tls_symmetric.h | |||
@@ -7,9 +7,6 @@ | |||
7 | 7 | ||
8 | /* The part below is a section of matrixssl-3-7-2b-open/crypto/cryptolib.h | 8 | /* The part below is a section of matrixssl-3-7-2b-open/crypto/cryptolib.h |
9 | * Changes are flagged with //bbox | 9 | * Changes are flagged with //bbox |
10 | * TODO: | ||
11 | * Take a look at "roll %%cl" part... rotates by constant use fewer registers, | ||
12 | * and on many Intel CPUs rotates by %cl are slower: they take 2 cycles, not 1. | ||
13 | */ | 10 | */ |
14 | 11 | ||
15 | /******************************************************************************/ | 12 | /******************************************************************************/ |
@@ -28,16 +25,28 @@ | |||
28 | #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && \ | 25 | #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && \ |
29 | !defined(INTEL_CC) && !defined(PS_NO_ASM) | 26 | !defined(INTEL_CC) && !defined(PS_NO_ASM) |
30 | 27 | ||
31 | static inline unsigned ROL(unsigned word, int i) | 28 | static ALWAYS_INLINE unsigned ROL(unsigned word, int i) |
32 | { | 29 | { |
30 | if (__builtin_constant_p(i)) //bbox | ||
31 | // Rotates by constant use fewer registers, | ||
32 | // and on many Intel CPUs rotates by %cl take 2 cycles, not 1. | ||
33 | asm ("roll %2,%0" //bbox | ||
34 | :"=r" (word) | ||
35 | :"0" (word),"i" (i)); | ||
36 | else //bbox | ||
33 | asm ("roll %%cl,%0" | 37 | asm ("roll %%cl,%0" |
34 | :"=r" (word) | 38 | :"=r" (word) |
35 | :"0" (word),"c" (i)); | 39 | :"0" (word),"c" (i)); |
36 | return word; | 40 | return word; |
37 | } | 41 | } |
38 | 42 | ||
39 | static inline unsigned ROR(unsigned word, int i) | 43 | static ALWAYS_INLINE unsigned ROR(unsigned word, int i) |
40 | { | 44 | { |
45 | if (__builtin_constant_p(i)) //bbox | ||
46 | asm ("rorl %2,%0" //bbox | ||
47 | :"=r" (word) | ||
48 | :"0" (word),"i" (i)); | ||
49 | else //bbox | ||
41 | asm ("rorl %%cl,%0" | 50 | asm ("rorl %%cl,%0" |
42 | :"=r" (word) | 51 | :"=r" (word) |
43 | :"0" (word),"c" (i)); | 52 | :"0" (word),"c" (i)); |