diff options
author | Ron Yorston <rmy@pobox.com> | 2022-01-13 08:02:41 +0000 |
---|---|---|
committer | Ron Yorston <rmy@pobox.com> | 2022-01-13 08:02:41 +0000 |
commit | 4734416a21312488a5099a297907783bee4ccc22 (patch) | |
tree | e42b034f9685a0a07ad080076b757bfba654cf7d /libbb | |
parent | b8751bbc9ac24e71fbe1e79c69074b4c87a134d8 (diff) | |
parent | b3eec1651fb02d70716caa355f49320719f74c75 (diff) | |
download | busybox-w32-4734416a21312488a5099a297907783bee4ccc22.tar.gz busybox-w32-4734416a21312488a5099a297907783bee4ccc22.tar.bz2 busybox-w32-4734416a21312488a5099a297907783bee4ccc22.zip |
Merge busybox into merge
Fix merge conflicts in coreutils/ls.c and shell/ash.c.
Update config files to turn off SHA1_HWACCEL. It uses non-portable
assembler.
Diffstat (limited to 'libbb')
-rw-r--r-- | libbb/Config.src | 7 | ||||
-rw-r--r-- | libbb/Kbuild.src | 2 | ||||
-rw-r--r-- | libbb/hash_md5_sha.c | 35 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-32_shaNI.S | 231 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-64.S | 14 | ||||
-rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 36 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-64_shaNI.S | 225 | ||||
-rw-r--r-- | libbb/setup_environment.c | 8 |
8 files changed, 539 insertions, 19 deletions
diff --git a/libbb/Config.src b/libbb/Config.src index c80bee286..708d3b0c8 100644 --- a/libbb/Config.src +++ b/libbb/Config.src | |||
@@ -63,6 +63,13 @@ config SHA1_SMALL | |||
63 | 1 224 229 654 732 | 63 | 1 224 229 654 732 |
64 | 2,3 200 195 358 380 | 64 | 2,3 200 195 358 380 |
65 | 65 | ||
66 | config SHA1_HWACCEL | ||
67 | bool "SHA1: Use hardware accelerated instructions if possible" | ||
68 | default y | ||
69 | help | ||
70 | On x86, this adds ~590 bytes of code. Throughput | ||
71 | is about twice as fast as fully-unrolled generic code. | ||
72 | |||
66 | config SHA3_SMALL | 73 | config SHA3_SMALL |
67 | int "SHA3: Trade bytes for speed (0:fast, 1:slow)" | 74 | int "SHA3: Trade bytes for speed (0:fast, 1:slow)" |
68 | default 1 # all "fast or small" options default to small | 75 | default 1 # all "fast or small" options default to small |
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index 41bf54e75..67d3c7cf7 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src | |||
@@ -46,6 +46,8 @@ lib-y += llist.o | |||
46 | lib-y += make_directory.o | 46 | lib-y += make_directory.o |
47 | lib-y += hash_md5_sha.o | 47 | lib-y += hash_md5_sha.o |
48 | lib-y += hash_md5_sha_x86-64.o | 48 | lib-y += hash_md5_sha_x86-64.o |
49 | lib-y += hash_md5_sha_x86-64_shaNI.o | ||
50 | lib-y += hash_md5_sha_x86-32_shaNI.o | ||
49 | # Alternative (disabled) MD5 implementation | 51 | # Alternative (disabled) MD5 implementation |
50 | #lib-y += hash_md5prime.o | 52 | #lib-y += hash_md5prime.o |
51 | lib-y += messages.o | 53 | lib-y += messages.o |
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index ee19c1cb7..a23db5152 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c | |||
@@ -699,7 +699,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
699 | 699 | ||
700 | /* in hash_md5_sha_x86-64.S */ | 700 | /* in hash_md5_sha_x86-64.S */ |
701 | struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; | 701 | struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; |
702 | void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM); | 702 | void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx); |
703 | 703 | ||
704 | # else | 704 | # else |
705 | /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. | 705 | /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. |
@@ -1142,6 +1142,25 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx) | |||
1142 | } | 1142 | } |
1143 | #endif /* NEED_SHA512 */ | 1143 | #endif /* NEED_SHA512 */ |
1144 | 1144 | ||
1145 | #if ENABLE_SHA1_HWACCEL | ||
1146 | # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) | ||
1147 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) | ||
1148 | { | ||
1149 | asm ("cpuid" | ||
1150 | : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) | ||
1151 | : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx) | ||
1152 | ); | ||
1153 | } | ||
1154 | void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); | ||
1155 | # if defined(__i386__) | ||
1156 | struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; }; | ||
1157 | # endif | ||
1158 | # if defined(__x86_64__) | ||
1159 | struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; | ||
1160 | # endif | ||
1161 | # endif | ||
1162 | #endif | ||
1163 | |||
1145 | void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) | 1164 | void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) |
1146 | { | 1165 | { |
1147 | ctx->hash[0] = 0x67452301; | 1166 | ctx->hash[0] = 0x67452301; |
@@ -1151,6 +1170,20 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) | |||
1151 | ctx->hash[4] = 0xc3d2e1f0; | 1170 | ctx->hash[4] = 0xc3d2e1f0; |
1152 | ctx->total64 = 0; | 1171 | ctx->total64 = 0; |
1153 | ctx->process_block = sha1_process_block64; | 1172 | ctx->process_block = sha1_process_block64; |
1173 | #if ENABLE_SHA1_HWACCEL | ||
1174 | # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) | ||
1175 | { | ||
1176 | static smallint shaNI; | ||
1177 | if (!shaNI) { | ||
1178 | unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; | ||
1179 | cpuid(&eax, &ebx, &ecx, &edx); | ||
1180 | shaNI = ((ebx >> 29) << 1) - 1; | ||
1181 | } | ||
1182 | if (shaNI > 0) | ||
1183 | ctx->process_block = sha1_process_block64_shaNI; | ||
1184 | } | ||
1185 | # endif | ||
1186 | #endif | ||
1154 | } | 1187 | } |
1155 | 1188 | ||
1156 | static const uint32_t init256[] ALIGN4 = { | 1189 | static const uint32_t init256[] ALIGN4 = { |
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S new file mode 100644 index 000000000..166cfd38a --- /dev/null +++ b/libbb/hash_md5_sha_x86-32_shaNI.S | |||
@@ -0,0 +1,231 @@ | |||
1 | #if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) | ||
2 | /* The code is adapted from Linux kernel's source */ | ||
3 | |||
4 | // We use shorter insns, even though they are for "wrong" | ||
5 | // data type (fp, not int). | ||
6 | // For Intel, there is no penalty for doing it at all | ||
7 | // (CPUs which do have such penalty do not support SHA1 insns). | ||
8 | // For AMD, the penalty is one extra cycle | ||
9 | // (allegedly: I failed to find measurable difference). | ||
10 | |||
11 | //#define mova128 movdqa | ||
12 | #define mova128 movaps | ||
13 | //#define movu128 movdqu | ||
14 | #define movu128 movups | ||
15 | //#define xor128 pxor | ||
16 | #define xor128 xorps | ||
17 | //#define shuf128_32 pshufd | ||
18 | #define shuf128_32 shufps | ||
19 | |||
20 | #define extr128_32 pextrd | ||
21 | //#define extr128_32 extractps # not shorter | ||
22 | |||
23 | .section .text.sha1_process_block64_shaNI,"ax",@progbits | ||
24 | .globl sha1_process_block64_shaNI | ||
25 | .hidden sha1_process_block64_shaNI | ||
26 | .type sha1_process_block64_shaNI, @function | ||
27 | |||
28 | #define ABCD %xmm0 | ||
29 | #define E0 %xmm1 /* Need two E's b/c they ping pong */ | ||
30 | #define E1 %xmm2 | ||
31 | #define MSG0 %xmm3 | ||
32 | #define MSG1 %xmm4 | ||
33 | #define MSG2 %xmm5 | ||
34 | #define MSG3 %xmm6 | ||
35 | #define SHUF_MASK %xmm7 | ||
36 | |||
37 | .balign 8 # allow decoders to fetch at least 3 first insns | ||
38 | sha1_process_block64_shaNI: | ||
39 | pushl %ebp | ||
40 | movl %esp, %ebp | ||
41 | subl $32, %esp | ||
42 | andl $~0xF, %esp # paddd needs aligned memory operand | ||
43 | |||
44 | /* load initial hash values */ | ||
45 | xor128 E0, E0 | ||
46 | movu128 76(%eax), ABCD | ||
47 | pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word | ||
48 | shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD | ||
49 | |||
50 | mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK | ||
51 | |||
52 | /* Save hash values for addition after rounds */ | ||
53 | movu128 E0, 16(%esp) | ||
54 | movu128 ABCD, (%esp) | ||
55 | |||
56 | /* Rounds 0-3 */ | ||
57 | movu128 0*16(%eax), MSG0 | ||
58 | pshufb SHUF_MASK, MSG0 | ||
59 | paddd MSG0, E0 | ||
60 | mova128 ABCD, E1 | ||
61 | sha1rnds4 $0, E0, ABCD | ||
62 | |||
63 | /* Rounds 4-7 */ | ||
64 | movu128 1*16(%eax), MSG1 | ||
65 | pshufb SHUF_MASK, MSG1 | ||
66 | sha1nexte MSG1, E1 | ||
67 | mova128 ABCD, E0 | ||
68 | sha1rnds4 $0, E1, ABCD | ||
69 | sha1msg1 MSG1, MSG0 | ||
70 | |||
71 | /* Rounds 8-11 */ | ||
72 | movu128 2*16(%eax), MSG2 | ||
73 | pshufb SHUF_MASK, MSG2 | ||
74 | sha1nexte MSG2, E0 | ||
75 | mova128 ABCD, E1 | ||
76 | sha1rnds4 $0, E0, ABCD | ||
77 | sha1msg1 MSG2, MSG1 | ||
78 | xor128 MSG2, MSG0 | ||
79 | |||
80 | /* Rounds 12-15 */ | ||
81 | movu128 3*16(%eax), MSG3 | ||
82 | pshufb SHUF_MASK, MSG3 | ||
83 | sha1nexte MSG3, E1 | ||
84 | mova128 ABCD, E0 | ||
85 | sha1msg2 MSG3, MSG0 | ||
86 | sha1rnds4 $0, E1, ABCD | ||
87 | sha1msg1 MSG3, MSG2 | ||
88 | xor128 MSG3, MSG1 | ||
89 | |||
90 | /* Rounds 16-19 */ | ||
91 | sha1nexte MSG0, E0 | ||
92 | mova128 ABCD, E1 | ||
93 | sha1msg2 MSG0, MSG1 | ||
94 | sha1rnds4 $0, E0, ABCD | ||
95 | sha1msg1 MSG0, MSG3 | ||
96 | xor128 MSG0, MSG2 | ||
97 | |||
98 | /* Rounds 20-23 */ | ||
99 | sha1nexte MSG1, E1 | ||
100 | mova128 ABCD, E0 | ||
101 | sha1msg2 MSG1, MSG2 | ||
102 | sha1rnds4 $1, E1, ABCD | ||
103 | sha1msg1 MSG1, MSG0 | ||
104 | xor128 MSG1, MSG3 | ||
105 | |||
106 | /* Rounds 24-27 */ | ||
107 | sha1nexte MSG2, E0 | ||
108 | mova128 ABCD, E1 | ||
109 | sha1msg2 MSG2, MSG3 | ||
110 | sha1rnds4 $1, E0, ABCD | ||
111 | sha1msg1 MSG2, MSG1 | ||
112 | xor128 MSG2, MSG0 | ||
113 | |||
114 | /* Rounds 28-31 */ | ||
115 | sha1nexte MSG3, E1 | ||
116 | mova128 ABCD, E0 | ||
117 | sha1msg2 MSG3, MSG0 | ||
118 | sha1rnds4 $1, E1, ABCD | ||
119 | sha1msg1 MSG3, MSG2 | ||
120 | xor128 MSG3, MSG1 | ||
121 | |||
122 | /* Rounds 32-35 */ | ||
123 | sha1nexte MSG0, E0 | ||
124 | mova128 ABCD, E1 | ||
125 | sha1msg2 MSG0, MSG1 | ||
126 | sha1rnds4 $1, E0, ABCD | ||
127 | sha1msg1 MSG0, MSG3 | ||
128 | xor128 MSG0, MSG2 | ||
129 | |||
130 | /* Rounds 36-39 */ | ||
131 | sha1nexte MSG1, E1 | ||
132 | mova128 ABCD, E0 | ||
133 | sha1msg2 MSG1, MSG2 | ||
134 | sha1rnds4 $1, E1, ABCD | ||
135 | sha1msg1 MSG1, MSG0 | ||
136 | xor128 MSG1, MSG3 | ||
137 | |||
138 | /* Rounds 40-43 */ | ||
139 | sha1nexte MSG2, E0 | ||
140 | mova128 ABCD, E1 | ||
141 | sha1msg2 MSG2, MSG3 | ||
142 | sha1rnds4 $2, E0, ABCD | ||
143 | sha1msg1 MSG2, MSG1 | ||
144 | xor128 MSG2, MSG0 | ||
145 | |||
146 | /* Rounds 44-47 */ | ||
147 | sha1nexte MSG3, E1 | ||
148 | mova128 ABCD, E0 | ||
149 | sha1msg2 MSG3, MSG0 | ||
150 | sha1rnds4 $2, E1, ABCD | ||
151 | sha1msg1 MSG3, MSG2 | ||
152 | xor128 MSG3, MSG1 | ||
153 | |||
154 | /* Rounds 48-51 */ | ||
155 | sha1nexte MSG0, E0 | ||
156 | mova128 ABCD, E1 | ||
157 | sha1msg2 MSG0, MSG1 | ||
158 | sha1rnds4 $2, E0, ABCD | ||
159 | sha1msg1 MSG0, MSG3 | ||
160 | xor128 MSG0, MSG2 | ||
161 | |||
162 | /* Rounds 52-55 */ | ||
163 | sha1nexte MSG1, E1 | ||
164 | mova128 ABCD, E0 | ||
165 | sha1msg2 MSG1, MSG2 | ||
166 | sha1rnds4 $2, E1, ABCD | ||
167 | sha1msg1 MSG1, MSG0 | ||
168 | xor128 MSG1, MSG3 | ||
169 | |||
170 | /* Rounds 56-59 */ | ||
171 | sha1nexte MSG2, E0 | ||
172 | mova128 ABCD, E1 | ||
173 | sha1msg2 MSG2, MSG3 | ||
174 | sha1rnds4 $2, E0, ABCD | ||
175 | sha1msg1 MSG2, MSG1 | ||
176 | xor128 MSG2, MSG0 | ||
177 | |||
178 | /* Rounds 60-63 */ | ||
179 | sha1nexte MSG3, E1 | ||
180 | mova128 ABCD, E0 | ||
181 | sha1msg2 MSG3, MSG0 | ||
182 | sha1rnds4 $3, E1, ABCD | ||
183 | sha1msg1 MSG3, MSG2 | ||
184 | xor128 MSG3, MSG1 | ||
185 | |||
186 | /* Rounds 64-67 */ | ||
187 | sha1nexte MSG0, E0 | ||
188 | mova128 ABCD, E1 | ||
189 | sha1msg2 MSG0, MSG1 | ||
190 | sha1rnds4 $3, E0, ABCD | ||
191 | sha1msg1 MSG0, MSG3 | ||
192 | xor128 MSG0, MSG2 | ||
193 | |||
194 | /* Rounds 68-71 */ | ||
195 | sha1nexte MSG1, E1 | ||
196 | mova128 ABCD, E0 | ||
197 | sha1msg2 MSG1, MSG2 | ||
198 | sha1rnds4 $3, E1, ABCD | ||
199 | xor128 MSG1, MSG3 | ||
200 | |||
201 | /* Rounds 72-75 */ | ||
202 | sha1nexte MSG2, E0 | ||
203 | mova128 ABCD, E1 | ||
204 | sha1msg2 MSG2, MSG3 | ||
205 | sha1rnds4 $3, E0, ABCD | ||
206 | |||
207 | /* Rounds 76-79 */ | ||
208 | sha1nexte MSG3, E1 | ||
209 | mova128 ABCD, E0 | ||
210 | sha1rnds4 $3, E1, ABCD | ||
211 | |||
212 | /* Add current hash values with previously saved */ | ||
213 | sha1nexte 16(%esp), E0 | ||
214 | paddd (%esp), ABCD | ||
215 | |||
216 | /* Write hash values back in the correct order */ | ||
217 | shuf128_32 $0x1B, ABCD, ABCD | ||
218 | movu128 ABCD, 76(%eax) | ||
219 | extr128_32 $3, E0, 76+4*4(%eax) | ||
220 | |||
221 | movl %ebp, %esp | ||
222 | popl %ebp | ||
223 | ret | ||
224 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | ||
225 | |||
226 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | ||
227 | .align 16 | ||
228 | PSHUFFLE_BYTE_FLIP_MASK: | ||
229 | .octa 0x000102030405060708090a0b0c0d0e0f | ||
230 | |||
231 | #endif | ||
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index ff78fc049..87fb616a1 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S | |||
@@ -2,8 +2,8 @@ | |||
2 | 2 | ||
3 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) | 3 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) |
4 | .section .text.sha1_process_block64,"ax",@progbits | 4 | .section .text.sha1_process_block64,"ax",@progbits |
5 | .globl sha1_process_block64 | 5 | .globl sha1_process_block64 |
6 | .hidden sha1_process_block64 | 6 | .hidden sha1_process_block64 |
7 | .type sha1_process_block64, @function | 7 | .type sha1_process_block64, @function |
8 | 8 | ||
9 | .balign 8 # allow decoders to fetch at least 5 first insns | 9 | .balign 8 # allow decoders to fetch at least 5 first insns |
@@ -1273,15 +1273,15 @@ sha1_process_block64: | |||
1273 | 1273 | ||
1274 | popq %rdi # | 1274 | popq %rdi # |
1275 | popq %r12 # | 1275 | popq %r12 # |
1276 | addl %eax, 80(%rdi) # ctx->hash[0] += a | 1276 | addl %eax, 80(%rdi) # ctx->hash[0] += a |
1277 | popq %r13 # | 1277 | popq %r13 # |
1278 | addl %ebx, 84(%rdi) # ctx->hash[1] += b | 1278 | addl %ebx, 84(%rdi) # ctx->hash[1] += b |
1279 | popq %r14 # | 1279 | popq %r14 # |
1280 | addl %ecx, 88(%rdi) # ctx->hash[2] += c | 1280 | addl %ecx, 88(%rdi) # ctx->hash[2] += c |
1281 | popq %r15 # | 1281 | popq %r15 # |
1282 | addl %edx, 92(%rdi) # ctx->hash[3] += d | 1282 | addl %edx, 92(%rdi) # ctx->hash[3] += d |
1283 | popq %rbx # | 1283 | popq %rbx # |
1284 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | 1284 | addl %ebp, 96(%rdi) # ctx->hash[4] += e |
1285 | popq %rbp # | 1285 | popq %rbp # |
1286 | 1286 | ||
1287 | ret | 1287 | ret |
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 7e50b64fb..901896e6e 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh | |||
@@ -6,13 +6,35 @@ | |||
6 | # also contains the diff of the generated file. | 6 | # also contains the diff of the generated file. |
7 | exec >hash_md5_sha_x86-64.S | 7 | exec >hash_md5_sha_x86-64.S |
8 | 8 | ||
9 | # There is a way to use XMM registers (which always exist for x86-64!) for W[] | ||
10 | # For example, if we load W as follows: | ||
11 | # %xmm0: w[0x0] w[0x1] w[0x2] w[0x3] | ||
12 | # %xmm4: w[0x4] w[0x5] w[0x6] w[0x7] | ||
13 | # %xmm8: w[0x8] w[0x9] w[0xa] w[0xb] | ||
14 | # %xmm12: w[0xc] w[0xd] w[0xe] w[0xf] | ||
15 | # then the xor'ing operation to generate next W[0..3] is: | ||
16 | # movaps %xmm0, %xmmT2 | ||
17 | # palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5]) | ||
18 | # # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn. | ||
19 | # movaps %xmm0, %xmmT13 | ||
20 | # palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0]) | ||
21 | # xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13 | ||
22 | # xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or | ||
23 | # and then results can be extracted for use: | ||
24 | # movd %xmm0, %esi # new W[0] | ||
25 | # pextrd $1, %xmm0, %esi # new W[1] | ||
26 | # # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1) | ||
27 | # pextrd $2, %xmm0, %esi # new W[2] | ||
28 | # pextrd $3, %xmm0, %esi # new W[3] | ||
29 | # ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64. | ||
30 | |||
9 | echo \ | 31 | echo \ |
10 | '### Generated by hash_md5_sha_x86-64.S.sh ### | 32 | '### Generated by hash_md5_sha_x86-64.S.sh ### |
11 | 33 | ||
12 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) | 34 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) |
13 | .section .text.sha1_process_block64,"ax",@progbits | 35 | .section .text.sha1_process_block64,"ax",@progbits |
14 | .globl sha1_process_block64 | 36 | .globl sha1_process_block64 |
15 | .hidden sha1_process_block64 | 37 | .hidden sha1_process_block64 |
16 | .type sha1_process_block64, @function | 38 | .type sha1_process_block64, @function |
17 | 39 | ||
18 | .balign 8 # allow decoders to fetch at least 5 first insns | 40 | .balign 8 # allow decoders to fetch at least 5 first insns |
@@ -265,15 +287,15 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b | |||
265 | echo " | 287 | echo " |
266 | popq %rdi # | 288 | popq %rdi # |
267 | popq %r12 # | 289 | popq %r12 # |
268 | addl %eax, 80(%rdi) # ctx->hash[0] += a | 290 | addl %eax, 80(%rdi) # ctx->hash[0] += a |
269 | popq %r13 # | 291 | popq %r13 # |
270 | addl %ebx, 84(%rdi) # ctx->hash[1] += b | 292 | addl %ebx, 84(%rdi) # ctx->hash[1] += b |
271 | popq %r14 # | 293 | popq %r14 # |
272 | addl %ecx, 88(%rdi) # ctx->hash[2] += c | 294 | addl %ecx, 88(%rdi) # ctx->hash[2] += c |
273 | popq %r15 # | 295 | popq %r15 # |
274 | addl %edx, 92(%rdi) # ctx->hash[3] += d | 296 | addl %edx, 92(%rdi) # ctx->hash[3] += d |
275 | popq %rbx # | 297 | popq %rbx # |
276 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | 298 | addl %ebp, 96(%rdi) # ctx->hash[4] += e |
277 | popq %rbp # | 299 | popq %rbp # |
278 | 300 | ||
279 | ret | 301 | ret |
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S new file mode 100644 index 000000000..33cc3bf7f --- /dev/null +++ b/libbb/hash_md5_sha_x86-64_shaNI.S | |||
@@ -0,0 +1,225 @@ | |||
1 | #if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) | ||
2 | /* The code is adapted from Linux kernel's source */ | ||
3 | |||
4 | // We use shorter insns, even though they are for "wrong" | ||
5 | // data type (fp, not int). | ||
6 | // For Intel, there is no penalty for doing it at all | ||
7 | // (CPUs which do have such penalty do not support SHA1 insns). | ||
8 | // For AMD, the penalty is one extra cycle | ||
9 | // (allegedly: I failed to find measurable difference). | ||
10 | |||
11 | //#define mova128 movdqa | ||
12 | #define mova128 movaps | ||
13 | //#define movu128 movdqu | ||
14 | #define movu128 movups | ||
15 | //#define xor128 pxor | ||
16 | #define xor128 xorps | ||
17 | //#define shuf128_32 pshufd | ||
18 | #define shuf128_32 shufps | ||
19 | |||
20 | #define extr128_32 pextrd | ||
21 | //#define extr128_32 extractps # not shorter | ||
22 | |||
23 | .section .text.sha1_process_block64_shaNI,"ax",@progbits | ||
24 | .globl sha1_process_block64_shaNI | ||
25 | .hidden sha1_process_block64_shaNI | ||
26 | .type sha1_process_block64_shaNI, @function | ||
27 | |||
28 | #define ABCD %xmm0 | ||
29 | #define E0 %xmm1 /* Need two E's b/c they ping pong */ | ||
30 | #define E1 %xmm2 | ||
31 | #define MSG0 %xmm3 | ||
32 | #define MSG1 %xmm4 | ||
33 | #define MSG2 %xmm5 | ||
34 | #define MSG3 %xmm6 | ||
35 | #define SHUF_MASK %xmm7 | ||
36 | |||
37 | .balign 8 # allow decoders to fetch at least 2 first insns | ||
38 | sha1_process_block64_shaNI: | ||
39 | /* load initial hash values */ | ||
40 | |||
41 | xor128 E0, E0 | ||
42 | movu128 80(%rdi), ABCD | ||
43 | pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word | ||
44 | shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD | ||
45 | |||
46 | mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK | ||
47 | |||
48 | /* Save hash values for addition after rounds */ | ||
49 | mova128 E0, %xmm9 | ||
50 | mova128 ABCD, %xmm8 | ||
51 | |||
52 | /* Rounds 0-3 */ | ||
53 | movu128 0*16(%rdi), MSG0 | ||
54 | pshufb SHUF_MASK, MSG0 | ||
55 | paddd MSG0, E0 | ||
56 | mova128 ABCD, E1 | ||
57 | sha1rnds4 $0, E0, ABCD | ||
58 | |||
59 | /* Rounds 4-7 */ | ||
60 | movu128 1*16(%rdi), MSG1 | ||
61 | pshufb SHUF_MASK, MSG1 | ||
62 | sha1nexte MSG1, E1 | ||
63 | mova128 ABCD, E0 | ||
64 | sha1rnds4 $0, E1, ABCD | ||
65 | sha1msg1 MSG1, MSG0 | ||
66 | |||
67 | /* Rounds 8-11 */ | ||
68 | movu128 2*16(%rdi), MSG2 | ||
69 | pshufb SHUF_MASK, MSG2 | ||
70 | sha1nexte MSG2, E0 | ||
71 | mova128 ABCD, E1 | ||
72 | sha1rnds4 $0, E0, ABCD | ||
73 | sha1msg1 MSG2, MSG1 | ||
74 | xor128 MSG2, MSG0 | ||
75 | |||
76 | /* Rounds 12-15 */ | ||
77 | movu128 3*16(%rdi), MSG3 | ||
78 | pshufb SHUF_MASK, MSG3 | ||
79 | sha1nexte MSG3, E1 | ||
80 | mova128 ABCD, E0 | ||
81 | sha1msg2 MSG3, MSG0 | ||
82 | sha1rnds4 $0, E1, ABCD | ||
83 | sha1msg1 MSG3, MSG2 | ||
84 | xor128 MSG3, MSG1 | ||
85 | |||
86 | /* Rounds 16-19 */ | ||
87 | sha1nexte MSG0, E0 | ||
88 | mova128 ABCD, E1 | ||
89 | sha1msg2 MSG0, MSG1 | ||
90 | sha1rnds4 $0, E0, ABCD | ||
91 | sha1msg1 MSG0, MSG3 | ||
92 | xor128 MSG0, MSG2 | ||
93 | |||
94 | /* Rounds 20-23 */ | ||
95 | sha1nexte MSG1, E1 | ||
96 | mova128 ABCD, E0 | ||
97 | sha1msg2 MSG1, MSG2 | ||
98 | sha1rnds4 $1, E1, ABCD | ||
99 | sha1msg1 MSG1, MSG0 | ||
100 | xor128 MSG1, MSG3 | ||
101 | |||
102 | /* Rounds 24-27 */ | ||
103 | sha1nexte MSG2, E0 | ||
104 | mova128 ABCD, E1 | ||
105 | sha1msg2 MSG2, MSG3 | ||
106 | sha1rnds4 $1, E0, ABCD | ||
107 | sha1msg1 MSG2, MSG1 | ||
108 | xor128 MSG2, MSG0 | ||
109 | |||
110 | /* Rounds 28-31 */ | ||
111 | sha1nexte MSG3, E1 | ||
112 | mova128 ABCD, E0 | ||
113 | sha1msg2 MSG3, MSG0 | ||
114 | sha1rnds4 $1, E1, ABCD | ||
115 | sha1msg1 MSG3, MSG2 | ||
116 | xor128 MSG3, MSG1 | ||
117 | |||
118 | /* Rounds 32-35 */ | ||
119 | sha1nexte MSG0, E0 | ||
120 | mova128 ABCD, E1 | ||
121 | sha1msg2 MSG0, MSG1 | ||
122 | sha1rnds4 $1, E0, ABCD | ||
123 | sha1msg1 MSG0, MSG3 | ||
124 | xor128 MSG0, MSG2 | ||
125 | |||
126 | /* Rounds 36-39 */ | ||
127 | sha1nexte MSG1, E1 | ||
128 | mova128 ABCD, E0 | ||
129 | sha1msg2 MSG1, MSG2 | ||
130 | sha1rnds4 $1, E1, ABCD | ||
131 | sha1msg1 MSG1, MSG0 | ||
132 | xor128 MSG1, MSG3 | ||
133 | |||
134 | /* Rounds 40-43 */ | ||
135 | sha1nexte MSG2, E0 | ||
136 | mova128 ABCD, E1 | ||
137 | sha1msg2 MSG2, MSG3 | ||
138 | sha1rnds4 $2, E0, ABCD | ||
139 | sha1msg1 MSG2, MSG1 | ||
140 | xor128 MSG2, MSG0 | ||
141 | |||
142 | /* Rounds 44-47 */ | ||
143 | sha1nexte MSG3, E1 | ||
144 | mova128 ABCD, E0 | ||
145 | sha1msg2 MSG3, MSG0 | ||
146 | sha1rnds4 $2, E1, ABCD | ||
147 | sha1msg1 MSG3, MSG2 | ||
148 | xor128 MSG3, MSG1 | ||
149 | |||
150 | /* Rounds 48-51 */ | ||
151 | sha1nexte MSG0, E0 | ||
152 | mova128 ABCD, E1 | ||
153 | sha1msg2 MSG0, MSG1 | ||
154 | sha1rnds4 $2, E0, ABCD | ||
155 | sha1msg1 MSG0, MSG3 | ||
156 | xor128 MSG0, MSG2 | ||
157 | |||
158 | /* Rounds 52-55 */ | ||
159 | sha1nexte MSG1, E1 | ||
160 | mova128 ABCD, E0 | ||
161 | sha1msg2 MSG1, MSG2 | ||
162 | sha1rnds4 $2, E1, ABCD | ||
163 | sha1msg1 MSG1, MSG0 | ||
164 | xor128 MSG1, MSG3 | ||
165 | |||
166 | /* Rounds 56-59 */ | ||
167 | sha1nexte MSG2, E0 | ||
168 | mova128 ABCD, E1 | ||
169 | sha1msg2 MSG2, MSG3 | ||
170 | sha1rnds4 $2, E0, ABCD | ||
171 | sha1msg1 MSG2, MSG1 | ||
172 | xor128 MSG2, MSG0 | ||
173 | |||
174 | /* Rounds 60-63 */ | ||
175 | sha1nexte MSG3, E1 | ||
176 | mova128 ABCD, E0 | ||
177 | sha1msg2 MSG3, MSG0 | ||
178 | sha1rnds4 $3, E1, ABCD | ||
179 | sha1msg1 MSG3, MSG2 | ||
180 | xor128 MSG3, MSG1 | ||
181 | |||
182 | /* Rounds 64-67 */ | ||
183 | sha1nexte MSG0, E0 | ||
184 | mova128 ABCD, E1 | ||
185 | sha1msg2 MSG0, MSG1 | ||
186 | sha1rnds4 $3, E0, ABCD | ||
187 | sha1msg1 MSG0, MSG3 | ||
188 | xor128 MSG0, MSG2 | ||
189 | |||
190 | /* Rounds 68-71 */ | ||
191 | sha1nexte MSG1, E1 | ||
192 | mova128 ABCD, E0 | ||
193 | sha1msg2 MSG1, MSG2 | ||
194 | sha1rnds4 $3, E1, ABCD | ||
195 | xor128 MSG1, MSG3 | ||
196 | |||
197 | /* Rounds 72-75 */ | ||
198 | sha1nexte MSG2, E0 | ||
199 | mova128 ABCD, E1 | ||
200 | sha1msg2 MSG2, MSG3 | ||
201 | sha1rnds4 $3, E0, ABCD | ||
202 | |||
203 | /* Rounds 76-79 */ | ||
204 | sha1nexte MSG3, E1 | ||
205 | mova128 ABCD, E0 | ||
206 | sha1rnds4 $3, E1, ABCD | ||
207 | |||
208 | /* Add current hash values with previously saved */ | ||
209 | sha1nexte %xmm9, E0 | ||
210 | paddd %xmm8, ABCD | ||
211 | |||
212 | /* Write hash values back in the correct order */ | ||
213 | shuf128_32 $0x1B, ABCD, ABCD | ||
214 | movu128 ABCD, 80(%rdi) | ||
215 | extr128_32 $3, E0, 80+4*4(%rdi) | ||
216 | |||
217 | ret | ||
218 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | ||
219 | |||
220 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | ||
221 | .align 16 | ||
222 | PSHUFFLE_BYTE_FLIP_MASK: | ||
223 | .octa 0x000102030405060708090a0b0c0d0e0f | ||
224 | |||
225 | #endif | ||
diff --git a/libbb/setup_environment.c b/libbb/setup_environment.c index f8de44967..df2983958 100644 --- a/libbb/setup_environment.c +++ b/libbb/setup_environment.c | |||
@@ -54,15 +54,15 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass | |||
54 | xsetenv("TERM", term); | 54 | xsetenv("TERM", term); |
55 | xsetenv("PATH", (pw->pw_uid ? bb_default_path : bb_default_root_path)); | 55 | xsetenv("PATH", (pw->pw_uid ? bb_default_path : bb_default_root_path)); |
56 | goto shortcut; | 56 | goto shortcut; |
57 | // No, gcc (4.2.1) is not clever enougn to do it itself. | 57 | // No, gcc (4.2.1) is not clever enough to do it itself. |
58 | //xsetenv("USER", pw->pw_name); | 58 | //xsetenv("USER", pw->pw_name); |
59 | //xsetenv("LOGNAME", pw->pw_name); | 59 | //xsetenv("LOGNAME", pw->pw_name); |
60 | //xsetenv("HOME", pw->pw_dir); | 60 | //xsetenv("HOME", pw->pw_dir); |
61 | //xsetenv("SHELL", shell); | 61 | //xsetenv("SHELL", shell); |
62 | } else if (flags & SETUP_ENV_CHANGEENV) { | 62 | } else if (flags & SETUP_ENV_CHANGEENV) { |
63 | /* Set HOME, SHELL, and if not becoming a super-user, | 63 | /* Set HOME, SHELL, and if not becoming a super-user |
64 | * USER and LOGNAME. */ | 64 | * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME. */ |
65 | if (pw->pw_uid) { | 65 | if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) { |
66 | shortcut: | 66 | shortcut: |
67 | xsetenv("USER", pw->pw_name); | 67 | xsetenv("USER", pw->pw_name); |
68 | xsetenv("LOGNAME", pw->pw_name); | 68 | xsetenv("LOGNAME", pw->pw_name); |