aboutsummaryrefslogtreecommitdiff
path: root/libbb
diff options
context:
space:
mode:
author Ron Yorston <rmy@pobox.com> 2022-01-13 08:02:41 +0000
committer Ron Yorston <rmy@pobox.com> 2022-01-13 08:02:41 +0000
commit 4734416a21312488a5099a297907783bee4ccc22 (patch)
tree e42b034f9685a0a07ad080076b757bfba654cf7d /libbb
parent b8751bbc9ac24e71fbe1e79c69074b4c87a134d8 (diff)
parent b3eec1651fb02d70716caa355f49320719f74c75 (diff)
download busybox-w32-4734416a21312488a5099a297907783bee4ccc22.tar.gz
busybox-w32-4734416a21312488a5099a297907783bee4ccc22.tar.bz2
busybox-w32-4734416a21312488a5099a297907783bee4ccc22.zip
Merge busybox into merge
Fix merge conflicts in coreutils/ls.c and shell/ash.c. Update config files to turn off SHA1_HWACCEL. It uses non-portable assembler.
Diffstat (limited to 'libbb')
-rw-r--r-- libbb/Config.src 7
-rw-r--r-- libbb/Kbuild.src 2
-rw-r--r-- libbb/hash_md5_sha.c 35
-rw-r--r-- libbb/hash_md5_sha_x86-32_shaNI.S 231
-rw-r--r-- libbb/hash_md5_sha_x86-64.S 14
-rwxr-xr-x libbb/hash_md5_sha_x86-64.S.sh 36
-rw-r--r-- libbb/hash_md5_sha_x86-64_shaNI.S 225
-rw-r--r-- libbb/setup_environment.c 8
8 files changed, 539 insertions, 19 deletions
diff --git a/libbb/Config.src b/libbb/Config.src
index c80bee286..708d3b0c8 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -63,6 +63,13 @@ config SHA1_SMALL
63 1 224 229 654 732 63 1 224 229 654 732
64 2,3 200 195 358 380 64 2,3 200 195 358 380
65 65
66config SHA1_HWACCEL
67 bool "SHA1: Use hardware accelerated instructions if possible"
68 default y
69 help
70 On x86, this adds ~590 bytes of code. Throughput
71 is about twice as fast as fully-unrolled generic code.
72
66config SHA3_SMALL 73config SHA3_SMALL
67 int "SHA3: Trade bytes for speed (0:fast, 1:slow)" 74 int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
68 default 1 # all "fast or small" options default to small 75 default 1 # all "fast or small" options default to small
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src
index 41bf54e75..67d3c7cf7 100644
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@@ -46,6 +46,8 @@ lib-y += llist.o
46lib-y += make_directory.o 46lib-y += make_directory.o
47lib-y += hash_md5_sha.o 47lib-y += hash_md5_sha.o
48lib-y += hash_md5_sha_x86-64.o 48lib-y += hash_md5_sha_x86-64.o
49lib-y += hash_md5_sha_x86-64_shaNI.o
50lib-y += hash_md5_sha_x86-32_shaNI.o
49# Alternative (disabled) MD5 implementation 51# Alternative (disabled) MD5 implementation
50#lib-y += hash_md5prime.o 52#lib-y += hash_md5prime.o
51lib-y += messages.o 53lib-y += messages.o
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index ee19c1cb7..a23db5152 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -699,7 +699,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
699 699
700/* in hash_md5_sha_x86-64.S */ 700/* in hash_md5_sha_x86-64.S */
701struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; 701struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
702void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM); 702void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx);
703 703
704# else 704# else
705/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. 705/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
@@ -1142,6 +1142,25 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
1142} 1142}
1143#endif /* NEED_SHA512 */ 1143#endif /* NEED_SHA512 */
1144 1144
1145#if ENABLE_SHA1_HWACCEL
1146# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
/*
 * Execute the x86 CPUID instruction.
 *
 * On entry, *eax selects the leaf and *ecx the sub-leaf.
 * On return, *eax, *ebx, *ecx and *edx hold the register values
 * produced by the instruction.
 *
 * *ebx and *edx are output-only: CPUID takes no input in those
 * registers, so they are not read here and callers do not have to
 * initialize them.
 */
static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
{
	/* __asm__ (not asm) so this also builds in strict ISO modes */
	__asm__ ("cpuid"
		: "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
		/* matching constraints: only EAX (#0) and ECX (#2) are inputs */
		: "0"(*eax), "2"(*ecx)
	);
}
1154void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
1155# if defined(__i386__)
1156struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; };
1157# endif
1158# if defined(__x86_64__)
1159struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
1160# endif
1161# endif
1162#endif
1163
1145void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) 1164void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
1146{ 1165{
1147 ctx->hash[0] = 0x67452301; 1166 ctx->hash[0] = 0x67452301;
@@ -1151,6 +1170,20 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
1151 ctx->hash[4] = 0xc3d2e1f0; 1170 ctx->hash[4] = 0xc3d2e1f0;
1152 ctx->total64 = 0; 1171 ctx->total64 = 0;
1153 ctx->process_block = sha1_process_block64; 1172 ctx->process_block = sha1_process_block64;
#if ENABLE_SHA1_HWACCEL
# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	{
		/* Cached probe result: 0 = not probed yet,
		 * > 0 = use the SHA-NI routine, < 0 = keep the generic one.
		 */
		static smallint shaNI;
		if (!shaNI) {
			unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;

			/* Leaf 0 returns the highest supported basic leaf in EAX.
			 * Leaf 7 must not be queried blindly: for an out-of-range
			 * leaf some CPUs return data of their highest basic leaf.
			 */
			cpuid(&eax, &ebx, &ecx, &edx);
			if (eax >= 7) {
				eax = 7;
				ecx = 0; /* sub-leaf 0 */
				cpuid(&eax, &ebx, &ecx, &edx);
				/* CPUID.(EAX=7,ECX=0):EBX bit 29 = SHA extensions.
				 * Isolate exactly that bit: bits 30 and 31 are
				 * unrelated features (AVX512BW, AVX512VL) and must
				 * not turn the result positive.
				 */
				shaNI = ((ebx >> 28) & 2) - 1; /* bit 29 -> 1 or -1 */
			} else {
				shaNI = -1;
			}
		}
		if (shaNI > 0)
			ctx->process_block = sha1_process_block64_shaNI;
	}
# endif
#endif
1154} 1187}
1155 1188
1156static const uint32_t init256[] ALIGN4 = { 1189static const uint32_t init256[] ALIGN4 = {
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
new file mode 100644
index 000000000..166cfd38a
--- /dev/null
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -0,0 +1,231 @@
1#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__)
2/* The code is adapted from Linux kernel's source */
3
4// We use shorter insns, even though they are for "wrong"
5// data type (fp, not int).
6// For Intel, there is no penalty for doing it at all
7// (CPUs which do have such penalty do not support SHA1 insns).
8// For AMD, the penalty is one extra cycle
9// (allegedly: I failed to find measurable difference).
10
11//#define mova128 movdqa
12#define mova128 movaps
13//#define movu128 movdqu
14#define movu128 movups
15//#define xor128 pxor
16#define xor128 xorps
17//#define shuf128_32 pshufd
18#define shuf128_32 shufps
19
20#define extr128_32 pextrd
21//#define extr128_32 extractps # not shorter
22
/* Mark this object as not needing an executable stack. Hand-written
 * assembly gets no such marker automatically, and without it GNU ld
 * may make the stack of the whole program executable.
 */
#ifdef __linux__
	.section	.note.GNU-stack, "", @progbits
#endif
	/* Own text section, so the linker can discard the function if unused */
	.section	.text.sha1_process_block64_shaNI,"ax",@progbits
	.globl	sha1_process_block64_shaNI
	.hidden	sha1_process_block64_shaNI
	.type	sha1_process_block64_shaNI, @function
27
28#define ABCD %xmm0
29#define E0 %xmm1 /* Need two E's b/c they ping pong */
30#define E1 %xmm2
31#define MSG0 %xmm3
32#define MSG1 %xmm4
33#define MSG2 %xmm5
34#define MSG3 %xmm6
35#define SHUF_MASK %xmm7
36
37 .balign 8 # allow decoders to fetch at least 3 first insns
38sha1_process_block64_shaNI:
39 pushl %ebp
40 movl %esp, %ebp
41 subl $32, %esp
42 andl $~0xF, %esp # paddd needs aligned memory operand
43
44 /* load initial hash values */
45 xor128 E0, E0
46 movu128 76(%eax), ABCD
47 pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word
48 shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
49
50 mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK
51
52 /* Save hash values for addition after rounds */
53 movu128 E0, 16(%esp)
54 movu128 ABCD, (%esp)
55
56 /* Rounds 0-3 */
57 movu128 0*16(%eax), MSG0
58 pshufb SHUF_MASK, MSG0
59 paddd MSG0, E0
60 mova128 ABCD, E1
61 sha1rnds4 $0, E0, ABCD
62
63 /* Rounds 4-7 */
64 movu128 1*16(%eax), MSG1
65 pshufb SHUF_MASK, MSG1
66 sha1nexte MSG1, E1
67 mova128 ABCD, E0
68 sha1rnds4 $0, E1, ABCD
69 sha1msg1 MSG1, MSG0
70
71 /* Rounds 8-11 */
72 movu128 2*16(%eax), MSG2
73 pshufb SHUF_MASK, MSG2
74 sha1nexte MSG2, E0
75 mova128 ABCD, E1
76 sha1rnds4 $0, E0, ABCD
77 sha1msg1 MSG2, MSG1
78 xor128 MSG2, MSG0
79
80 /* Rounds 12-15 */
81 movu128 3*16(%eax), MSG3
82 pshufb SHUF_MASK, MSG3
83 sha1nexte MSG3, E1
84 mova128 ABCD, E0
85 sha1msg2 MSG3, MSG0
86 sha1rnds4 $0, E1, ABCD
87 sha1msg1 MSG3, MSG2
88 xor128 MSG3, MSG1
89
90 /* Rounds 16-19 */
91 sha1nexte MSG0, E0
92 mova128 ABCD, E1
93 sha1msg2 MSG0, MSG1
94 sha1rnds4 $0, E0, ABCD
95 sha1msg1 MSG0, MSG3
96 xor128 MSG0, MSG2
97
98 /* Rounds 20-23 */
99 sha1nexte MSG1, E1
100 mova128 ABCD, E0
101 sha1msg2 MSG1, MSG2
102 sha1rnds4 $1, E1, ABCD
103 sha1msg1 MSG1, MSG0
104 xor128 MSG1, MSG3
105
106 /* Rounds 24-27 */
107 sha1nexte MSG2, E0
108 mova128 ABCD, E1
109 sha1msg2 MSG2, MSG3
110 sha1rnds4 $1, E0, ABCD
111 sha1msg1 MSG2, MSG1
112 xor128 MSG2, MSG0
113
114 /* Rounds 28-31 */
115 sha1nexte MSG3, E1
116 mova128 ABCD, E0
117 sha1msg2 MSG3, MSG0
118 sha1rnds4 $1, E1, ABCD
119 sha1msg1 MSG3, MSG2
120 xor128 MSG3, MSG1
121
122 /* Rounds 32-35 */
123 sha1nexte MSG0, E0
124 mova128 ABCD, E1
125 sha1msg2 MSG0, MSG1
126 sha1rnds4 $1, E0, ABCD
127 sha1msg1 MSG0, MSG3
128 xor128 MSG0, MSG2
129
130 /* Rounds 36-39 */
131 sha1nexte MSG1, E1
132 mova128 ABCD, E0
133 sha1msg2 MSG1, MSG2
134 sha1rnds4 $1, E1, ABCD
135 sha1msg1 MSG1, MSG0
136 xor128 MSG1, MSG3
137
138 /* Rounds 40-43 */
139 sha1nexte MSG2, E0
140 mova128 ABCD, E1
141 sha1msg2 MSG2, MSG3
142 sha1rnds4 $2, E0, ABCD
143 sha1msg1 MSG2, MSG1
144 xor128 MSG2, MSG0
145
146 /* Rounds 44-47 */
147 sha1nexte MSG3, E1
148 mova128 ABCD, E0
149 sha1msg2 MSG3, MSG0
150 sha1rnds4 $2, E1, ABCD
151 sha1msg1 MSG3, MSG2
152 xor128 MSG3, MSG1
153
154 /* Rounds 48-51 */
155 sha1nexte MSG0, E0
156 mova128 ABCD, E1
157 sha1msg2 MSG0, MSG1
158 sha1rnds4 $2, E0, ABCD
159 sha1msg1 MSG0, MSG3
160 xor128 MSG0, MSG2
161
162 /* Rounds 52-55 */
163 sha1nexte MSG1, E1
164 mova128 ABCD, E0
165 sha1msg2 MSG1, MSG2
166 sha1rnds4 $2, E1, ABCD
167 sha1msg1 MSG1, MSG0
168 xor128 MSG1, MSG3
169
170 /* Rounds 56-59 */
171 sha1nexte MSG2, E0
172 mova128 ABCD, E1
173 sha1msg2 MSG2, MSG3
174 sha1rnds4 $2, E0, ABCD
175 sha1msg1 MSG2, MSG1
176 xor128 MSG2, MSG0
177
178 /* Rounds 60-63 */
179 sha1nexte MSG3, E1
180 mova128 ABCD, E0
181 sha1msg2 MSG3, MSG0
182 sha1rnds4 $3, E1, ABCD
183 sha1msg1 MSG3, MSG2
184 xor128 MSG3, MSG1
185
186 /* Rounds 64-67 */
187 sha1nexte MSG0, E0
188 mova128 ABCD, E1
189 sha1msg2 MSG0, MSG1
190 sha1rnds4 $3, E0, ABCD
191 sha1msg1 MSG0, MSG3
192 xor128 MSG0, MSG2
193
194 /* Rounds 68-71 */
195 sha1nexte MSG1, E1
196 mova128 ABCD, E0
197 sha1msg2 MSG1, MSG2
198 sha1rnds4 $3, E1, ABCD
199 xor128 MSG1, MSG3
200
201 /* Rounds 72-75 */
202 sha1nexte MSG2, E0
203 mova128 ABCD, E1
204 sha1msg2 MSG2, MSG3
205 sha1rnds4 $3, E0, ABCD
206
207 /* Rounds 76-79 */
208 sha1nexte MSG3, E1
209 mova128 ABCD, E0
210 sha1rnds4 $3, E1, ABCD
211
212 /* Add current hash values with previously saved */
213 sha1nexte 16(%esp), E0
214 paddd (%esp), ABCD
215
216 /* Write hash values back in the correct order */
217 shuf128_32 $0x1B, ABCD, ABCD
218 movu128 ABCD, 76(%eax)
219 extr128_32 $3, E0, 76+4*4(%eax)
220
221 movl %ebp, %esp
222 popl %ebp
223 ret
224 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
225
226.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
227.align 16
228PSHUFFLE_BYTE_FLIP_MASK:
229 .octa 0x000102030405060708090a0b0c0d0e0f
230
231#endif
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index ff78fc049..87fb616a1 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -2,8 +2,8 @@
2 2
3#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) 3#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
4 .section .text.sha1_process_block64,"ax",@progbits 4 .section .text.sha1_process_block64,"ax",@progbits
5 .globl sha1_process_block64 5 .globl sha1_process_block64
6 .hidden sha1_process_block64 6 .hidden sha1_process_block64
7 .type sha1_process_block64, @function 7 .type sha1_process_block64, @function
8 8
9 .balign 8 # allow decoders to fetch at least 5 first insns 9 .balign 8 # allow decoders to fetch at least 5 first insns
@@ -1273,15 +1273,15 @@ sha1_process_block64:
1273 1273
1274 popq %rdi # 1274 popq %rdi #
1275 popq %r12 # 1275 popq %r12 #
1276 addl %eax, 80(%rdi) # ctx->hash[0] += a 1276 addl %eax, 80(%rdi) # ctx->hash[0] += a
1277 popq %r13 # 1277 popq %r13 #
1278 addl %ebx, 84(%rdi) # ctx->hash[1] += b 1278 addl %ebx, 84(%rdi) # ctx->hash[1] += b
1279 popq %r14 # 1279 popq %r14 #
1280 addl %ecx, 88(%rdi) # ctx->hash[2] += c 1280 addl %ecx, 88(%rdi) # ctx->hash[2] += c
1281 popq %r15 # 1281 popq %r15 #
1282 addl %edx, 92(%rdi) # ctx->hash[3] += d 1282 addl %edx, 92(%rdi) # ctx->hash[3] += d
1283 popq %rbx # 1283 popq %rbx #
1284 addl %ebp, 96(%rdi) # ctx->hash[4] += e 1284 addl %ebp, 96(%rdi) # ctx->hash[4] += e
1285 popq %rbp # 1285 popq %rbp #
1286 1286
1287 ret 1287 ret
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 7e50b64fb..901896e6e 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -6,13 +6,35 @@
6# also contains the diff of the generated file. 6# also contains the diff of the generated file.
7exec >hash_md5_sha_x86-64.S 7exec >hash_md5_sha_x86-64.S
8 8
9# There is a way to use XMM registers (which always exist for x86-64!) for W[]
10# For example, if we load W as follows:
11# %xmm0: w[0x0] w[0x1] w[0x2] w[0x3]
12# %xmm4: w[0x4] w[0x5] w[0x6] w[0x7]
13# %xmm8: w[0x8] w[0x9] w[0xa] w[0xb]
14# %xmm12: w[0xc] w[0xd] w[0xe] w[0xf]
15# then the xor'ing operation to generate next W[0..3] is:
16# movaps %xmm0, %xmmT2
17# palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5])
18# # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn.
19# movaps %xmm0, %xmmT13
20# palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0])
21# xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13
22# xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or
23# and then results can be extracted for use:
24# movd %xmm0, %esi # new W[0]
25# pextrd $1, %xmm0, %esi # new W[1]
26# # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1)
27# pextrd $2, %xmm0, %esi # new W[2]
28# pextrd $3, %xmm0, %esi # new W[3]
29# ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64.
30
9echo \ 31echo \
10'### Generated by hash_md5_sha_x86-64.S.sh ### 32'### Generated by hash_md5_sha_x86-64.S.sh ###
11 33
12#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) 34#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
13 .section .text.sha1_process_block64,"ax",@progbits 35 .section .text.sha1_process_block64,"ax",@progbits
14 .globl sha1_process_block64 36 .globl sha1_process_block64
15 .hidden sha1_process_block64 37 .hidden sha1_process_block64
16 .type sha1_process_block64, @function 38 .type sha1_process_block64, @function
17 39
18 .balign 8 # allow decoders to fetch at least 5 first insns 40 .balign 8 # allow decoders to fetch at least 5 first insns
@@ -265,15 +287,15 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b
265echo " 287echo "
266 popq %rdi # 288 popq %rdi #
267 popq %r12 # 289 popq %r12 #
268 addl %eax, 80(%rdi) # ctx->hash[0] += a 290 addl %eax, 80(%rdi) # ctx->hash[0] += a
269 popq %r13 # 291 popq %r13 #
270 addl %ebx, 84(%rdi) # ctx->hash[1] += b 292 addl %ebx, 84(%rdi) # ctx->hash[1] += b
271 popq %r14 # 293 popq %r14 #
272 addl %ecx, 88(%rdi) # ctx->hash[2] += c 294 addl %ecx, 88(%rdi) # ctx->hash[2] += c
273 popq %r15 # 295 popq %r15 #
274 addl %edx, 92(%rdi) # ctx->hash[3] += d 296 addl %edx, 92(%rdi) # ctx->hash[3] += d
275 popq %rbx # 297 popq %rbx #
276 addl %ebp, 96(%rdi) # ctx->hash[4] += e 298 addl %ebp, 96(%rdi) # ctx->hash[4] += e
277 popq %rbp # 299 popq %rbp #
278 300
279 ret 301 ret
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
new file mode 100644
index 000000000..33cc3bf7f
--- /dev/null
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -0,0 +1,225 @@
1#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
2/* The code is adapted from Linux kernel's source */
3
4// We use shorter insns, even though they are for "wrong"
5// data type (fp, not int).
6// For Intel, there is no penalty for doing it at all
7// (CPUs which do have such penalty do not support SHA1 insns).
8// For AMD, the penalty is one extra cycle
9// (allegedly: I failed to find measurable difference).
10
11//#define mova128 movdqa
12#define mova128 movaps
13//#define movu128 movdqu
14#define movu128 movups
15//#define xor128 pxor
16#define xor128 xorps
17//#define shuf128_32 pshufd
18#define shuf128_32 shufps
19
20#define extr128_32 pextrd
21//#define extr128_32 extractps # not shorter
22
/* Mark this object as not needing an executable stack. Hand-written
 * assembly gets no such marker automatically, and without it GNU ld
 * may make the stack of the whole program executable.
 */
#ifdef __linux__
	.section	.note.GNU-stack, "", @progbits
#endif
	/* Own text section, so the linker can discard the function if unused */
	.section	.text.sha1_process_block64_shaNI,"ax",@progbits
	.globl	sha1_process_block64_shaNI
	.hidden	sha1_process_block64_shaNI
	.type	sha1_process_block64_shaNI, @function
27
28#define ABCD %xmm0
29#define E0 %xmm1 /* Need two E's b/c they ping pong */
30#define E1 %xmm2
31#define MSG0 %xmm3
32#define MSG1 %xmm4
33#define MSG2 %xmm5
34#define MSG3 %xmm6
35#define SHUF_MASK %xmm7
36
37 .balign 8 # allow decoders to fetch at least 2 first insns
38sha1_process_block64_shaNI:
39 /* load initial hash values */
40
41 xor128 E0, E0
42 movu128 80(%rdi), ABCD
43 pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word
44 shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
45
46 mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
47
48 /* Save hash values for addition after rounds */
49 mova128 E0, %xmm9
50 mova128 ABCD, %xmm8
51
52 /* Rounds 0-3 */
53 movu128 0*16(%rdi), MSG0
54 pshufb SHUF_MASK, MSG0
55 paddd MSG0, E0
56 mova128 ABCD, E1
57 sha1rnds4 $0, E0, ABCD
58
59 /* Rounds 4-7 */
60 movu128 1*16(%rdi), MSG1
61 pshufb SHUF_MASK, MSG1
62 sha1nexte MSG1, E1
63 mova128 ABCD, E0
64 sha1rnds4 $0, E1, ABCD
65 sha1msg1 MSG1, MSG0
66
67 /* Rounds 8-11 */
68 movu128 2*16(%rdi), MSG2
69 pshufb SHUF_MASK, MSG2
70 sha1nexte MSG2, E0
71 mova128 ABCD, E1
72 sha1rnds4 $0, E0, ABCD
73 sha1msg1 MSG2, MSG1
74 xor128 MSG2, MSG0
75
76 /* Rounds 12-15 */
77 movu128 3*16(%rdi), MSG3
78 pshufb SHUF_MASK, MSG3
79 sha1nexte MSG3, E1
80 mova128 ABCD, E0
81 sha1msg2 MSG3, MSG0
82 sha1rnds4 $0, E1, ABCD
83 sha1msg1 MSG3, MSG2
84 xor128 MSG3, MSG1
85
86 /* Rounds 16-19 */
87 sha1nexte MSG0, E0
88 mova128 ABCD, E1
89 sha1msg2 MSG0, MSG1
90 sha1rnds4 $0, E0, ABCD
91 sha1msg1 MSG0, MSG3
92 xor128 MSG0, MSG2
93
94 /* Rounds 20-23 */
95 sha1nexte MSG1, E1
96 mova128 ABCD, E0
97 sha1msg2 MSG1, MSG2
98 sha1rnds4 $1, E1, ABCD
99 sha1msg1 MSG1, MSG0
100 xor128 MSG1, MSG3
101
102 /* Rounds 24-27 */
103 sha1nexte MSG2, E0
104 mova128 ABCD, E1
105 sha1msg2 MSG2, MSG3
106 sha1rnds4 $1, E0, ABCD
107 sha1msg1 MSG2, MSG1
108 xor128 MSG2, MSG0
109
110 /* Rounds 28-31 */
111 sha1nexte MSG3, E1
112 mova128 ABCD, E0
113 sha1msg2 MSG3, MSG0
114 sha1rnds4 $1, E1, ABCD
115 sha1msg1 MSG3, MSG2
116 xor128 MSG3, MSG1
117
118 /* Rounds 32-35 */
119 sha1nexte MSG0, E0
120 mova128 ABCD, E1
121 sha1msg2 MSG0, MSG1
122 sha1rnds4 $1, E0, ABCD
123 sha1msg1 MSG0, MSG3
124 xor128 MSG0, MSG2
125
126 /* Rounds 36-39 */
127 sha1nexte MSG1, E1
128 mova128 ABCD, E0
129 sha1msg2 MSG1, MSG2
130 sha1rnds4 $1, E1, ABCD
131 sha1msg1 MSG1, MSG0
132 xor128 MSG1, MSG3
133
134 /* Rounds 40-43 */
135 sha1nexte MSG2, E0
136 mova128 ABCD, E1
137 sha1msg2 MSG2, MSG3
138 sha1rnds4 $2, E0, ABCD
139 sha1msg1 MSG2, MSG1
140 xor128 MSG2, MSG0
141
142 /* Rounds 44-47 */
143 sha1nexte MSG3, E1
144 mova128 ABCD, E0
145 sha1msg2 MSG3, MSG0
146 sha1rnds4 $2, E1, ABCD
147 sha1msg1 MSG3, MSG2
148 xor128 MSG3, MSG1
149
150 /* Rounds 48-51 */
151 sha1nexte MSG0, E0
152 mova128 ABCD, E1
153 sha1msg2 MSG0, MSG1
154 sha1rnds4 $2, E0, ABCD
155 sha1msg1 MSG0, MSG3
156 xor128 MSG0, MSG2
157
158 /* Rounds 52-55 */
159 sha1nexte MSG1, E1
160 mova128 ABCD, E0
161 sha1msg2 MSG1, MSG2
162 sha1rnds4 $2, E1, ABCD
163 sha1msg1 MSG1, MSG0
164 xor128 MSG1, MSG3
165
166 /* Rounds 56-59 */
167 sha1nexte MSG2, E0
168 mova128 ABCD, E1
169 sha1msg2 MSG2, MSG3
170 sha1rnds4 $2, E0, ABCD
171 sha1msg1 MSG2, MSG1
172 xor128 MSG2, MSG0
173
174 /* Rounds 60-63 */
175 sha1nexte MSG3, E1
176 mova128 ABCD, E0
177 sha1msg2 MSG3, MSG0
178 sha1rnds4 $3, E1, ABCD
179 sha1msg1 MSG3, MSG2
180 xor128 MSG3, MSG1
181
182 /* Rounds 64-67 */
183 sha1nexte MSG0, E0
184 mova128 ABCD, E1
185 sha1msg2 MSG0, MSG1
186 sha1rnds4 $3, E0, ABCD
187 sha1msg1 MSG0, MSG3
188 xor128 MSG0, MSG2
189
190 /* Rounds 68-71 */
191 sha1nexte MSG1, E1
192 mova128 ABCD, E0
193 sha1msg2 MSG1, MSG2
194 sha1rnds4 $3, E1, ABCD
195 xor128 MSG1, MSG3
196
197 /* Rounds 72-75 */
198 sha1nexte MSG2, E0
199 mova128 ABCD, E1
200 sha1msg2 MSG2, MSG3
201 sha1rnds4 $3, E0, ABCD
202
203 /* Rounds 76-79 */
204 sha1nexte MSG3, E1
205 mova128 ABCD, E0
206 sha1rnds4 $3, E1, ABCD
207
208 /* Add current hash values with previously saved */
209 sha1nexte %xmm9, E0
210 paddd %xmm8, ABCD
211
212 /* Write hash values back in the correct order */
213 shuf128_32 $0x1B, ABCD, ABCD
214 movu128 ABCD, 80(%rdi)
215 extr128_32 $3, E0, 80+4*4(%rdi)
216
217 ret
218 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
219
220.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
221.align 16
222PSHUFFLE_BYTE_FLIP_MASK:
223 .octa 0x000102030405060708090a0b0c0d0e0f
224
225#endif
diff --git a/libbb/setup_environment.c b/libbb/setup_environment.c
index f8de44967..df2983958 100644
--- a/libbb/setup_environment.c
+++ b/libbb/setup_environment.c
@@ -54,15 +54,15 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass
54 xsetenv("TERM", term); 54 xsetenv("TERM", term);
55 xsetenv("PATH", (pw->pw_uid ? bb_default_path : bb_default_root_path)); 55 xsetenv("PATH", (pw->pw_uid ? bb_default_path : bb_default_root_path));
56 goto shortcut; 56 goto shortcut;
57 // No, gcc (4.2.1) is not clever enougn to do it itself. 57 // No, gcc (4.2.1) is not clever enough to do it itself.
58 //xsetenv("USER", pw->pw_name); 58 //xsetenv("USER", pw->pw_name);
59 //xsetenv("LOGNAME", pw->pw_name); 59 //xsetenv("LOGNAME", pw->pw_name);
60 //xsetenv("HOME", pw->pw_dir); 60 //xsetenv("HOME", pw->pw_dir);
61 //xsetenv("SHELL", shell); 61 //xsetenv("SHELL", shell);
62 } else if (flags & SETUP_ENV_CHANGEENV) { 62 } else if (flags & SETUP_ENV_CHANGEENV) {
63 /* Set HOME, SHELL, and if not becoming a super-user, 63 /* Set HOME, SHELL, and if not becoming a super-user
64 * USER and LOGNAME. */ 64 * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME. */
65 if (pw->pw_uid) { 65 if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) {
66 shortcut: 66 shortcut:
67 xsetenv("USER", pw->pw_name); 67 xsetenv("USER", pw->pw_name);
68 xsetenv("LOGNAME", pw->pw_name); 68 xsetenv("LOGNAME", pw->pw_name);