diff options
author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2023-06-21 00:00:00 +0000 |
---|---|---|
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2023-12-17 14:59:19 +0500 |
commit | 5b39dc76f1bc82f941d5c800ab9f34407a06b53a (patch) | |
tree | fe5e17420300b715021a76328444088d32047963 /C/SwapBytes.c | |
parent | 93be7d4abfd4233228f58ee1fbbcd76d91be66a4 (diff) | |
download | 7zip-23.01.tar.gz 7zip-23.01.tar.bz2 7zip-23.01.zip |
23.0123.01
Diffstat (limited to 'C/SwapBytes.c')
-rw-r--r-- | C/SwapBytes.c | 800 |
1 files changed, 800 insertions, 0 deletions
diff --git a/C/SwapBytes.c b/C/SwapBytes.c new file mode 100644 index 0000000..7901bba --- /dev/null +++ b/C/SwapBytes.c | |||
@@ -0,0 +1,800 @@ | |||
1 | /* SwapBytes.c -- Byte Swap conversion filter | ||
2 | 2023-04-07 : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #include "Precomp.h" | ||
5 | |||
6 | #include "Compiler.h" | ||
7 | #include "CpuArch.h" | ||
8 | #include "RotateDefs.h" | ||
9 | #include "SwapBytes.h" | ||
10 | |||
11 | typedef UInt16 CSwapUInt16; | ||
12 | typedef UInt32 CSwapUInt32; | ||
13 | |||
14 | // #define k_SwapBytes_Mode_BASE 0 | ||
15 | |||
16 | #ifdef MY_CPU_X86_OR_AMD64 | ||
17 | |||
18 | #define k_SwapBytes_Mode_SSE2 1 | ||
19 | #define k_SwapBytes_Mode_SSSE3 2 | ||
20 | #define k_SwapBytes_Mode_AVX2 3 | ||
21 | |||
22 | // #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) | ||
23 | #if defined(__clang__) && (__clang_major__ >= 4) \ | ||
24 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701) | ||
25 | #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_AVX2 | ||
26 | #define SWAP_ATTRIB_SSE2 __attribute__((__target__("sse2"))) | ||
27 | #define SWAP_ATTRIB_SSSE3 __attribute__((__target__("ssse3"))) | ||
28 | #define SWAP_ATTRIB_AVX2 __attribute__((__target__("avx2"))) | ||
29 | #elif defined(_MSC_VER) | ||
30 | #if (_MSC_VER == 1900) | ||
31 | #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX | ||
32 | #endif | ||
33 | #if (_MSC_VER >= 1900) | ||
34 | #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_AVX2 | ||
35 | #elif (_MSC_VER >= 1500) // (VS2008) | ||
36 | #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_SSSE3 | ||
37 | #elif (_MSC_VER >= 1310) // (VS2003) | ||
38 | #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_SSE2 | ||
39 | #endif | ||
40 | #endif // _MSC_VER | ||
41 | |||
42 | /* | ||
43 | // for debug | ||
44 | #ifdef k_SwapBytes_Mode_MAX | ||
45 | #undef k_SwapBytes_Mode_MAX | ||
46 | #endif | ||
47 | */ | ||
48 | |||
49 | #ifndef k_SwapBytes_Mode_MAX | ||
50 | #define k_SwapBytes_Mode_MAX 0 | ||
51 | #endif | ||
52 | |||
53 | #if (k_SwapBytes_Mode_MAX != 0) && defined(MY_CPU_AMD64) | ||
54 | #define k_SwapBytes_Mode_MIN k_SwapBytes_Mode_SSE2 | ||
55 | #else | ||
56 | #define k_SwapBytes_Mode_MIN 0 | ||
57 | #endif | ||
58 | |||
59 | #if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_AVX2) | ||
60 | #define USE_SWAP_AVX2 | ||
61 | #endif | ||
62 | #if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSSE3) | ||
63 | #define USE_SWAP_SSSE3 | ||
64 | #endif | ||
65 | #if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSE2) | ||
66 | #define USE_SWAP_128 | ||
67 | #endif | ||
68 | |||
69 | #if k_SwapBytes_Mode_MAX <= k_SwapBytes_Mode_MIN || !defined(USE_SWAP_128) | ||
70 | #define FORCE_SWAP_MODE | ||
71 | #endif | ||
72 | |||
73 | |||
74 | #ifdef USE_SWAP_128 | ||
75 | /* | ||
76 | <mmintrin.h> MMX | ||
77 | <xmmintrin.h> SSE | ||
78 | <emmintrin.h> SSE2 | ||
79 | <pmmintrin.h> SSE3 | ||
80 | <tmmintrin.h> SSSE3 | ||
81 | <smmintrin.h> SSE4.1 | ||
82 | <nmmintrin.h> SSE4.2 | ||
83 | <ammintrin.h> SSE4A | ||
84 | <wmmintrin.h> AES | ||
85 | <immintrin.h> AVX, AVX2, FMA | ||
86 | */ | ||
87 | |||
88 | #include <emmintrin.h> // sse2 | ||
89 | // typedef __m128i v128; | ||
90 | |||
91 | #define SWAP2_128(i) { \ | ||
92 | const __m128i v = *(const __m128i *)(const void *)(items + (i) * 8); \ | ||
93 | *( __m128i *)( void *)(items + (i) * 8) = \ | ||
94 | _mm_or_si128( \ | ||
95 | _mm_slli_epi16(v, 8), \ | ||
96 | _mm_srli_epi16(v, 8)); } | ||
97 | // _mm_or_si128() has more ports to execute than _mm_add_epi16(). | ||
98 | |||
99 | static | ||
100 | #ifdef SWAP_ATTRIB_SSE2 | ||
101 | SWAP_ATTRIB_SSE2 | ||
102 | #endif | ||
103 | void | ||
104 | Z7_FASTCALL | ||
105 | SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim) | ||
106 | { | ||
107 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
108 | do | ||
109 | { | ||
110 | SWAP2_128(0) SWAP2_128(1) items += 2 * 8; | ||
111 | SWAP2_128(0) SWAP2_128(1) items += 2 * 8; | ||
112 | } | ||
113 | while (items != lim); | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | // sse2 | ||
118 | #define SWAP4_128_pack(i) { \ | ||
119 | __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \ | ||
120 | __m128i v0 = _mm_unpacklo_epi8(v, mask); \ | ||
121 | __m128i v1 = _mm_unpackhi_epi8(v, mask); \ | ||
122 | v0 = _mm_shufflelo_epi16(v0, 0x1b); \ | ||
123 | v1 = _mm_shufflelo_epi16(v1, 0x1b); \ | ||
124 | v0 = _mm_shufflehi_epi16(v0, 0x1b); \ | ||
125 | v1 = _mm_shufflehi_epi16(v1, 0x1b); \ | ||
126 | *(__m128i *)(void *)(items + (i) * 4) = _mm_packus_epi16(v0, v1); } | ||
127 | |||
128 | static | ||
129 | #ifdef SWAP_ATTRIB_SSE2 | ||
130 | SWAP_ATTRIB_SSE2 | ||
131 | #endif | ||
132 | void | ||
133 | Z7_FASTCALL | ||
134 | SwapBytes4_128_pack(CSwapUInt32 *items, const CSwapUInt32 *lim) | ||
135 | { | ||
136 | const __m128i mask = _mm_setzero_si128(); | ||
137 | // const __m128i mask = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); | ||
138 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
139 | do | ||
140 | { | ||
141 | SWAP4_128_pack(0); items += 1 * 4; | ||
142 | // SWAP4_128_pack(0); SWAP4_128_pack(1); items += 2 * 4; | ||
143 | } | ||
144 | while (items != lim); | ||
145 | } | ||
146 | |||
147 | // sse2 | ||
148 | #define SWAP4_128_shift(i) { \ | ||
149 | __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \ | ||
150 | __m128i v2; \ | ||
151 | v2 = _mm_or_si128( \ | ||
152 | _mm_slli_si128(_mm_and_si128(v, mask), 1), \ | ||
153 | _mm_and_si128(_mm_srli_si128(v, 1), mask)); \ | ||
154 | v = _mm_or_si128( \ | ||
155 | _mm_slli_epi32(v, 24), \ | ||
156 | _mm_srli_epi32(v, 24)); \ | ||
157 | *(__m128i *)(void *)(items + (i) * 4) = _mm_or_si128(v2, v); } | ||
158 | |||
159 | static | ||
160 | #ifdef SWAP_ATTRIB_SSE2 | ||
161 | SWAP_ATTRIB_SSE2 | ||
162 | #endif | ||
163 | void | ||
164 | Z7_FASTCALL | ||
165 | SwapBytes4_128_shift(CSwapUInt32 *items, const CSwapUInt32 *lim) | ||
166 | { | ||
167 | #define M1 0xff00 | ||
168 | const __m128i mask = _mm_set_epi32(M1, M1, M1, M1); | ||
169 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
170 | do | ||
171 | { | ||
172 | // SWAP4_128_shift(0) SWAP4_128_shift(1) items += 2 * 4; | ||
173 | // SWAP4_128_shift(0) SWAP4_128_shift(1) items += 2 * 4; | ||
174 | SWAP4_128_shift(0); items += 1 * 4; | ||
175 | } | ||
176 | while (items != lim); | ||
177 | } | ||
178 | */ | ||
179 | |||
180 | |||
181 | #if defined(USE_SWAP_SSSE3) || defined(USE_SWAP_AVX2) | ||
182 | |||
183 | #define SWAP_SHUF_REV_SEQ_2_VALS(v) (v)+1, (v) | ||
184 | #define SWAP_SHUF_REV_SEQ_4_VALS(v) (v)+3, (v)+2, (v)+1, (v) | ||
185 | |||
186 | #define SWAP2_SHUF_MASK_16_BYTES \ | ||
187 | SWAP_SHUF_REV_SEQ_2_VALS (0 * 2), \ | ||
188 | SWAP_SHUF_REV_SEQ_2_VALS (1 * 2), \ | ||
189 | SWAP_SHUF_REV_SEQ_2_VALS (2 * 2), \ | ||
190 | SWAP_SHUF_REV_SEQ_2_VALS (3 * 2), \ | ||
191 | SWAP_SHUF_REV_SEQ_2_VALS (4 * 2), \ | ||
192 | SWAP_SHUF_REV_SEQ_2_VALS (5 * 2), \ | ||
193 | SWAP_SHUF_REV_SEQ_2_VALS (6 * 2), \ | ||
194 | SWAP_SHUF_REV_SEQ_2_VALS (7 * 2) | ||
195 | |||
196 | #define SWAP4_SHUF_MASK_16_BYTES \ | ||
197 | SWAP_SHUF_REV_SEQ_4_VALS (0 * 4), \ | ||
198 | SWAP_SHUF_REV_SEQ_4_VALS (1 * 4), \ | ||
199 | SWAP_SHUF_REV_SEQ_4_VALS (2 * 4), \ | ||
200 | SWAP_SHUF_REV_SEQ_4_VALS (3 * 4) | ||
201 | |||
202 | #if defined(USE_SWAP_AVX2) | ||
203 | /* if we use 256_BIT_INIT_MASK, each static array mask will be larger for 16 bytes */ | ||
204 | // #define SWAP_USE_256_BIT_INIT_MASK | ||
205 | #endif | ||
206 | |||
207 | #if defined(SWAP_USE_256_BIT_INIT_MASK) && defined(USE_SWAP_AVX2) | ||
208 | #define SWAP_MASK_INIT_SIZE 32 | ||
209 | #else | ||
210 | #define SWAP_MASK_INIT_SIZE 16 | ||
211 | #endif | ||
212 | |||
213 | MY_ALIGN(SWAP_MASK_INIT_SIZE) | ||
214 | static const Byte k_ShufMask_Swap2[] = | ||
215 | { | ||
216 | SWAP2_SHUF_MASK_16_BYTES | ||
217 | #if SWAP_MASK_INIT_SIZE > 16 | ||
218 | , SWAP2_SHUF_MASK_16_BYTES | ||
219 | #endif | ||
220 | }; | ||
221 | |||
222 | MY_ALIGN(SWAP_MASK_INIT_SIZE) | ||
223 | static const Byte k_ShufMask_Swap4[] = | ||
224 | { | ||
225 | SWAP4_SHUF_MASK_16_BYTES | ||
226 | #if SWAP_MASK_INIT_SIZE > 16 | ||
227 | , SWAP4_SHUF_MASK_16_BYTES | ||
228 | #endif | ||
229 | }; | ||
230 | |||
231 | |||
232 | #ifdef USE_SWAP_SSSE3 | ||
233 | |||
234 | #include <tmmintrin.h> // ssse3 | ||
235 | |||
236 | #define SHUF_128(i) *(items + (i)) = \ | ||
237 | _mm_shuffle_epi8(*(items + (i)), mask); // SSSE3 | ||
238 | |||
239 | // Z7_NO_INLINE | ||
240 | static | ||
241 | #ifdef SWAP_ATTRIB_SSSE3 | ||
242 | SWAP_ATTRIB_SSSE3 | ||
243 | #endif | ||
244 | Z7_ATTRIB_NO_VECTORIZE | ||
245 | void | ||
246 | Z7_FASTCALL | ||
247 | ShufBytes_128(void *items8, const void *lim8, const void *mask128_ptr) | ||
248 | { | ||
249 | __m128i *items = (__m128i *)items8; | ||
250 | const __m128i *lim = (const __m128i *)lim8; | ||
251 | // const __m128i mask = _mm_set_epi8(SHUF_SWAP2_MASK_16_VALS); | ||
252 | // const __m128i mask = _mm_set_epi8(SHUF_SWAP4_MASK_16_VALS); | ||
253 | // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0])); | ||
254 | // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0])); | ||
255 | // const __m128i mask = *(const __m128i *)(const void *)&(k_ShufMask_Swap4[0]); | ||
256 | const __m128i mask = *(const __m128i *)mask128_ptr; | ||
257 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
258 | do | ||
259 | { | ||
260 | SHUF_128(0) SHUF_128(1) items += 2; | ||
261 | SHUF_128(0) SHUF_128(1) items += 2; | ||
262 | } | ||
263 | while (items != lim); | ||
264 | } | ||
265 | |||
266 | #endif // USE_SWAP_SSSE3 | ||
267 | |||
268 | |||
269 | |||
270 | #ifdef USE_SWAP_AVX2 | ||
271 | |||
272 | #include <immintrin.h> // avx, avx2 | ||
273 | #if defined(__clang__) | ||
274 | #include <avxintrin.h> | ||
275 | #include <avx2intrin.h> | ||
276 | #endif | ||
277 | |||
278 | #define SHUF_256(i) *(items + (i)) = \ | ||
279 | _mm256_shuffle_epi8(*(items + (i)), mask); // AVX2 | ||
280 | |||
281 | // Z7_NO_INLINE | ||
282 | static | ||
283 | #ifdef SWAP_ATTRIB_AVX2 | ||
284 | SWAP_ATTRIB_AVX2 | ||
285 | #endif | ||
286 | Z7_ATTRIB_NO_VECTORIZE | ||
287 | void | ||
288 | Z7_FASTCALL | ||
289 | ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr) | ||
290 | { | ||
291 | __m256i *items = (__m256i *)items8; | ||
292 | const __m256i *lim = (const __m256i *)lim8; | ||
293 | /* | ||
294 | UNUSED_VAR(mask128_ptr) | ||
295 | __m256i mask = | ||
296 | for Swap4: _mm256_setr_epi8(SWAP4_SHUF_MASK_16_BYTES, SWAP4_SHUF_MASK_16_BYTES); | ||
297 | for Swap2: _mm256_setr_epi8(SWAP2_SHUF_MASK_16_BYTES, SWAP2_SHUF_MASK_16_BYTES); | ||
298 | */ | ||
299 | const __m256i mask = | ||
300 | #if SWAP_MASK_INIT_SIZE > 16 | ||
301 | *(const __m256i *)(const void *)mask128_ptr; | ||
302 | #else | ||
303 | /* msvc: broadcastsi128() version reserves the stack for no reason | ||
304 | msvc 19.29-: _mm256_insertf128_si256() / _mm256_set_m128i)) versions use non-avx movdqu xmm0,XMMWORD PTR [r8] | ||
305 | msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want | ||
306 | */ | ||
307 | // _mm256_broadcastsi128_si256(*mask128_ptr); | ||
308 | /* | ||
309 | #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) | ||
310 | MY_mm256_set_m128i | ||
311 | */ | ||
312 | _mm256_set_m128i( | ||
313 | *(const __m128i *)mask128_ptr, | ||
314 | *(const __m128i *)mask128_ptr); | ||
315 | #endif | ||
316 | |||
317 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
318 | do | ||
319 | { | ||
320 | SHUF_256(0) SHUF_256(1) items += 2; | ||
321 | SHUF_256(0) SHUF_256(1) items += 2; | ||
322 | } | ||
323 | while (items != lim); | ||
324 | } | ||
325 | |||
326 | #endif // USE_SWAP_AVX2 | ||
327 | #endif // USE_SWAP_SSSE3 || USE_SWAP_AVX2 | ||
328 | #endif // USE_SWAP_128 | ||
329 | |||
330 | |||
331 | |||
332 | // compile message "NEON intrinsics not available with the soft-float ABI" | ||
333 | #elif defined(MY_CPU_ARM_OR_ARM64) || \ | ||
334 | (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) | ||
335 | // #elif defined(MY_CPU_ARM64) | ||
336 | |||
337 | #if defined(__clang__) && (__clang_major__ >= 8) \ | ||
338 | || defined(__GNUC__) && (__GNUC__ >= 8) | ||
339 | #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) \ | ||
340 | || defined(MY_CPU_ARM64) | ||
341 | #define USE_SWAP_128 | ||
342 | #endif | ||
343 | #ifdef MY_CPU_ARM64 | ||
344 | // #define SWAP_ATTRIB_NEON __attribute__((__target__(""))) | ||
345 | #else | ||
346 | // #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | ||
347 | #endif | ||
348 | #elif defined(_MSC_VER) | ||
349 | #if (_MSC_VER >= 1910) | ||
350 | #define USE_SWAP_128 | ||
351 | #endif | ||
352 | #endif | ||
353 | |||
354 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | ||
355 | #include <arm64_neon.h> | ||
356 | #else | ||
357 | #include <arm_neon.h> | ||
358 | #endif | ||
359 | |||
360 | #ifndef USE_SWAP_128 | ||
361 | #define FORCE_SWAP_MODE | ||
362 | #else | ||
363 | |||
364 | #ifdef MY_CPU_ARM64 | ||
365 | // for debug : comment it | ||
366 | #define FORCE_SWAP_MODE | ||
367 | #else | ||
368 | #define k_SwapBytes_Mode_NEON 1 | ||
369 | #endif | ||
370 | // typedef uint8x16_t v128; | ||
371 | #define SWAP2_128(i) *(uint8x16_t *) (void *)(items + (i) * 8) = \ | ||
372 | vrev16q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 8)); | ||
373 | #define SWAP4_128(i) *(uint8x16_t *) (void *)(items + (i) * 4) = \ | ||
374 | vrev32q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 4)); | ||
375 | |||
376 | // Z7_NO_INLINE | ||
377 | static | ||
378 | #ifdef SWAP_ATTRIB_NEON | ||
379 | SWAP_ATTRIB_NEON | ||
380 | #endif | ||
381 | Z7_ATTRIB_NO_VECTORIZE | ||
382 | void | ||
383 | Z7_FASTCALL | ||
384 | SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim) | ||
385 | { | ||
386 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
387 | do | ||
388 | { | ||
389 | SWAP2_128(0) SWAP2_128(1) items += 2 * 8; | ||
390 | SWAP2_128(0) SWAP2_128(1) items += 2 * 8; | ||
391 | } | ||
392 | while (items != lim); | ||
393 | } | ||
394 | |||
395 | // Z7_NO_INLINE | ||
396 | static | ||
397 | #ifdef SWAP_ATTRIB_NEON | ||
398 | SWAP_ATTRIB_NEON | ||
399 | #endif | ||
400 | Z7_ATTRIB_NO_VECTORIZE | ||
401 | void | ||
402 | Z7_FASTCALL | ||
403 | SwapBytes4_128(CSwapUInt32 *items, const CSwapUInt32 *lim) | ||
404 | { | ||
405 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
406 | do | ||
407 | { | ||
408 | SWAP4_128(0) SWAP4_128(1) items += 2 * 4; | ||
409 | SWAP4_128(0) SWAP4_128(1) items += 2 * 4; | ||
410 | } | ||
411 | while (items != lim); | ||
412 | } | ||
413 | |||
414 | #endif // USE_SWAP_128 | ||
415 | |||
416 | #else // MY_CPU_ARM_OR_ARM64 | ||
417 | #define FORCE_SWAP_MODE | ||
418 | #endif // MY_CPU_ARM_OR_ARM64 | ||
419 | |||
420 | |||
421 | |||
422 | |||
423 | |||
424 | |||
425 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_X86) | ||
426 | /* _byteswap_ushort() in MSVC x86 32-bit works via slow { mov dh, al; mov dl, ah } | ||
427 | So we use own versions of byteswap function */ | ||
428 | #if (_MSC_VER < 1400 ) // old MSVC-X86 without _rotr16() support | ||
429 | #define SWAP2_16(i) { UInt32 v = items[i]; v += (v << 16); v >>= 8; items[i] = (CSwapUInt16)v; } | ||
430 | #else // is new MSVC-X86 with fast _rotr16() | ||
431 | #include <intrin.h> | ||
432 | #define SWAP2_16(i) { items[i] = _rotr16(items[i], 8); } | ||
433 | #endif | ||
434 | #else // is not MSVC-X86 | ||
435 | #define SWAP2_16(i) { CSwapUInt16 v = items[i]; items[i] = Z7_BSWAP16(v); } | ||
436 | #endif // MSVC-X86 | ||
437 | |||
438 | #if defined(Z7_CPU_FAST_BSWAP_SUPPORTED) | ||
439 | #define SWAP4_32(i) { CSwapUInt32 v = items[i]; items[i] = Z7_BSWAP32(v); } | ||
440 | #else | ||
441 | #define SWAP4_32(i) \ | ||
442 | { UInt32 v = items[i]; \ | ||
443 | v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); \ | ||
444 | v = rotlFixed(v, 16); \ | ||
445 | items[i] = v; } | ||
446 | #endif | ||
447 | |||
448 | |||
449 | |||
450 | |||
451 | #if defined(FORCE_SWAP_MODE) && defined(USE_SWAP_128) | ||
452 | #define DEFAULT_Swap2 SwapBytes2_128 | ||
453 | #if !defined(MY_CPU_X86_OR_AMD64) | ||
454 | #define DEFAULT_Swap4 SwapBytes4_128 | ||
455 | #endif | ||
456 | #endif | ||
457 | |||
458 | #if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4) | ||
459 | |||
460 | #define SWAP_BASE_FUNCS_PREFIXES \ | ||
461 | Z7_FORCE_INLINE \ | ||
462 | static \ | ||
463 | Z7_ATTRIB_NO_VECTOR \ | ||
464 | void Z7_FASTCALL | ||
465 | |||
466 | |||
467 | #ifdef MY_CPU_64BIT | ||
468 | |||
469 | #if defined(MY_CPU_ARM64) \ | ||
470 | && defined(__ARM_ARCH) && (__ARM_ARCH >= 8) \ | ||
471 | && ( (defined(__GNUC__) && (__GNUC__ >= 4)) \ | ||
472 | || (defined(__clang__) && (__clang_major__ >= 4))) | ||
473 | |||
474 | #define SWAP2_64_VAR(v) asm ("rev16 %x0,%x0" : "+r" (v)); | ||
475 | #define SWAP4_64_VAR(v) asm ("rev32 %x0,%x0" : "+r" (v)); | ||
476 | |||
477 | #else // is not ARM64-GNU | ||
478 | |||
479 | #if !defined(MY_CPU_X86_OR_AMD64) || (k_SwapBytes_Mode_MIN == 0) || !defined(USE_SWAP_128) | ||
480 | #define SWAP2_64_VAR(v) \ | ||
481 | v = ( 0x00ff00ff00ff00ff & (v >> 8)) \ | ||
482 | + ((0x00ff00ff00ff00ff & v) << 8); | ||
483 | /* plus gives faster code in MSVC */ | ||
484 | #endif | ||
485 | |||
486 | #ifdef Z7_CPU_FAST_BSWAP_SUPPORTED | ||
487 | #define SWAP4_64_VAR(v) \ | ||
488 | v = Z7_BSWAP64(v); \ | ||
489 | v = Z7_ROTL64(v, 32); | ||
490 | #else | ||
491 | #define SWAP4_64_VAR(v) \ | ||
492 | v = ( 0x000000ff000000ff & (v >> 24)) \ | ||
493 | + ((0x000000ff000000ff & v) << 24 ) \ | ||
494 | + ( 0x0000ff000000ff00 & (v >> 8)) \ | ||
495 | + ((0x0000ff000000ff00 & v) << 8 ) \ | ||
496 | ; | ||
497 | #endif | ||
498 | |||
499 | #endif // ARM64-GNU | ||
500 | |||
501 | |||
502 | #ifdef SWAP2_64_VAR | ||
503 | |||
504 | #define SWAP2_64(i) { \ | ||
505 | UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 4); \ | ||
506 | SWAP2_64_VAR(v) \ | ||
507 | *(UInt64 *)(void *)(items + (i) * 4) = v; } | ||
508 | |||
509 | SWAP_BASE_FUNCS_PREFIXES | ||
510 | SwapBytes2_64(CSwapUInt16 *items, const CSwapUInt16 *lim) | ||
511 | { | ||
512 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
513 | do | ||
514 | { | ||
515 | SWAP2_64(0) SWAP2_64(1) items += 2 * 4; | ||
516 | SWAP2_64(0) SWAP2_64(1) items += 2 * 4; | ||
517 | } | ||
518 | while (items != lim); | ||
519 | } | ||
520 | |||
521 | #define DEFAULT_Swap2 SwapBytes2_64 | ||
522 | #if !defined(FORCE_SWAP_MODE) | ||
523 | #define SWAP2_DEFAULT_MODE 0 | ||
524 | #endif | ||
525 | #else // !defined(SWAP2_64_VAR) | ||
526 | #define DEFAULT_Swap2 SwapBytes2_128 | ||
527 | #if !defined(FORCE_SWAP_MODE) | ||
528 | #define SWAP2_DEFAULT_MODE 1 | ||
529 | #endif | ||
530 | #endif // SWAP2_64_VAR | ||
531 | |||
532 | |||
533 | #define SWAP4_64(i) { \ | ||
534 | UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 2); \ | ||
535 | SWAP4_64_VAR(v) \ | ||
536 | *(UInt64 *)(void *)(items + (i) * 2) = v; } | ||
537 | |||
538 | SWAP_BASE_FUNCS_PREFIXES | ||
539 | SwapBytes4_64(CSwapUInt32 *items, const CSwapUInt32 *lim) | ||
540 | { | ||
541 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
542 | do | ||
543 | { | ||
544 | SWAP4_64(0) SWAP4_64(1) items += 2 * 2; | ||
545 | SWAP4_64(0) SWAP4_64(1) items += 2 * 2; | ||
546 | } | ||
547 | while (items != lim); | ||
548 | } | ||
549 | |||
550 | #define DEFAULT_Swap4 SwapBytes4_64 | ||
551 | |||
552 | #else // is not 64BIT | ||
553 | |||
554 | |||
555 | #if defined(MY_CPU_ARM_OR_ARM64) \ | ||
556 | && defined(__ARM_ARCH) && (__ARM_ARCH >= 6) \ | ||
557 | && ( (defined(__GNUC__) && (__GNUC__ >= 4)) \ | ||
558 | || (defined(__clang__) && (__clang_major__ >= 4))) | ||
559 | |||
560 | #ifdef MY_CPU_64BIT | ||
561 | #define SWAP2_32_VAR(v) asm ("rev16 %w0,%w0" : "+r" (v)); | ||
562 | #else | ||
563 | #define SWAP2_32_VAR(v) asm ("rev16 %0,%0" : "+r" (v)); // for clang/gcc | ||
564 | // asm ("rev16 %r0,%r0" : "+r" (a)); // for gcc | ||
565 | #endif | ||
566 | |||
567 | #elif defined(_MSC_VER) && (_MSC_VER < 1300) && defined(MY_CPU_X86) \ | ||
568 | || !defined(Z7_CPU_FAST_BSWAP_SUPPORTED) \ | ||
569 | || !defined(Z7_CPU_FAST_ROTATE_SUPPORTED) | ||
570 | // old msvc doesn't support _byteswap_ulong() | ||
571 | #define SWAP2_32_VAR(v) \ | ||
572 | v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); | ||
573 | |||
574 | #else // is not ARM and is not old-MSVC-X86 and fast BSWAP/ROTATE are supported | ||
575 | #define SWAP2_32_VAR(v) \ | ||
576 | v = Z7_BSWAP32(v); \ | ||
577 | v = rotlFixed(v, 16); | ||
578 | |||
579 | #endif // GNU-ARM* | ||
580 | |||
581 | #define SWAP2_32(i) { \ | ||
582 | UInt32 v = *(const UInt32 *)(const void *)(items + (i) * 2); \ | ||
583 | SWAP2_32_VAR(v); \ | ||
584 | *(UInt32 *)(void *)(items + (i) * 2) = v; } | ||
585 | |||
586 | |||
587 | SWAP_BASE_FUNCS_PREFIXES | ||
588 | SwapBytes2_32(CSwapUInt16 *items, const CSwapUInt16 *lim) | ||
589 | { | ||
590 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
591 | do | ||
592 | { | ||
593 | SWAP2_32(0) SWAP2_32(1) items += 2 * 2; | ||
594 | SWAP2_32(0) SWAP2_32(1) items += 2 * 2; | ||
595 | } | ||
596 | while (items != lim); | ||
597 | } | ||
598 | |||
599 | |||
600 | SWAP_BASE_FUNCS_PREFIXES | ||
601 | SwapBytes4_32(CSwapUInt32 *items, const CSwapUInt32 *lim) | ||
602 | { | ||
603 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
604 | do | ||
605 | { | ||
606 | SWAP4_32(0) SWAP4_32(1) items += 2; | ||
607 | SWAP4_32(0) SWAP4_32(1) items += 2; | ||
608 | } | ||
609 | while (items != lim); | ||
610 | } | ||
611 | |||
612 | #define DEFAULT_Swap2 SwapBytes2_32 | ||
613 | #define DEFAULT_Swap4 SwapBytes4_32 | ||
614 | #if !defined(FORCE_SWAP_MODE) | ||
615 | #define SWAP2_DEFAULT_MODE 0 | ||
616 | #endif | ||
617 | |||
618 | #endif // MY_CPU_64BIT | ||
619 | #endif // if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4) | ||
620 | |||
621 | |||
622 | |||
623 | #if !defined(FORCE_SWAP_MODE) | ||
624 | static unsigned g_SwapBytes_Mode; | ||
625 | #endif | ||
626 | |||
627 | /* size of largest unrolled loop iteration: 128 bytes = 4 * 32 bytes (AVX). */ | ||
628 | #define SWAP_ITERATION_BLOCK_SIZE_MAX (1 << 7) | ||
629 | |||
630 | // 32 bytes for (AVX) or 2 * 16-bytes for NEON. | ||
631 | #define SWAP_VECTOR_ALIGN_SIZE (1 << 5) | ||
632 | |||
633 | Z7_NO_INLINE | ||
634 | void z7_SwapBytes2(CSwapUInt16 *items, size_t numItems) | ||
635 | { | ||
636 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
637 | for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--) | ||
638 | { | ||
639 | SWAP2_16(0) | ||
640 | items++; | ||
641 | } | ||
642 | { | ||
643 | const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt16) - 1; | ||
644 | size_t numItems2 = numItems; | ||
645 | CSwapUInt16 *lim; | ||
646 | numItems &= k_Align_Mask; | ||
647 | numItems2 &= ~(size_t)k_Align_Mask; | ||
648 | lim = items + numItems2; | ||
649 | if (numItems2 != 0) | ||
650 | { | ||
651 | #if !defined(FORCE_SWAP_MODE) | ||
652 | #ifdef MY_CPU_X86_OR_AMD64 | ||
653 | #ifdef USE_SWAP_AVX2 | ||
654 | if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3) | ||
655 | ShufBytes_256((__m256i *)(void *)items, | ||
656 | (const __m256i *)(const void *)lim, | ||
657 | (const __m128i *)(const void *)&(k_ShufMask_Swap2[0])); | ||
658 | else | ||
659 | #endif | ||
660 | #ifdef USE_SWAP_SSSE3 | ||
661 | if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3) | ||
662 | ShufBytes_128((__m128i *)(void *)items, | ||
663 | (const __m128i *)(const void *)lim, | ||
664 | (const __m128i *)(const void *)&(k_ShufMask_Swap2[0])); | ||
665 | else | ||
666 | #endif | ||
667 | #endif // MY_CPU_X86_OR_AMD64 | ||
668 | #if SWAP2_DEFAULT_MODE == 0 | ||
669 | if (g_SwapBytes_Mode != 0) | ||
670 | SwapBytes2_128(items, lim); | ||
671 | else | ||
672 | #endif | ||
673 | #endif // FORCE_SWAP_MODE | ||
674 | DEFAULT_Swap2(items, lim); | ||
675 | } | ||
676 | items = lim; | ||
677 | } | ||
678 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
679 | for (; numItems != 0; numItems--) | ||
680 | { | ||
681 | SWAP2_16(0) | ||
682 | items++; | ||
683 | } | ||
684 | } | ||
685 | |||
686 | |||
687 | Z7_NO_INLINE | ||
688 | void z7_SwapBytes4(CSwapUInt32 *items, size_t numItems) | ||
689 | { | ||
690 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
691 | for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--) | ||
692 | { | ||
693 | SWAP4_32(0) | ||
694 | items++; | ||
695 | } | ||
696 | { | ||
697 | const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt32) - 1; | ||
698 | size_t numItems2 = numItems; | ||
699 | CSwapUInt32 *lim; | ||
700 | numItems &= k_Align_Mask; | ||
701 | numItems2 &= ~(size_t)k_Align_Mask; | ||
702 | lim = items + numItems2; | ||
703 | if (numItems2 != 0) | ||
704 | { | ||
705 | #if !defined(FORCE_SWAP_MODE) | ||
706 | #ifdef MY_CPU_X86_OR_AMD64 | ||
707 | #ifdef USE_SWAP_AVX2 | ||
708 | if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3) | ||
709 | ShufBytes_256((__m256i *)(void *)items, | ||
710 | (const __m256i *)(const void *)lim, | ||
711 | (const __m128i *)(const void *)&(k_ShufMask_Swap4[0])); | ||
712 | else | ||
713 | #endif | ||
714 | #ifdef USE_SWAP_SSSE3 | ||
715 | if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3) | ||
716 | ShufBytes_128((__m128i *)(void *)items, | ||
717 | (const __m128i *)(const void *)lim, | ||
718 | (const __m128i *)(const void *)&(k_ShufMask_Swap4[0])); | ||
719 | else | ||
720 | #endif | ||
721 | #else // MY_CPU_X86_OR_AMD64 | ||
722 | |||
723 | if (g_SwapBytes_Mode != 0) | ||
724 | SwapBytes4_128(items, lim); | ||
725 | else | ||
726 | #endif // MY_CPU_X86_OR_AMD64 | ||
727 | #endif // FORCE_SWAP_MODE | ||
728 | DEFAULT_Swap4(items, lim); | ||
729 | } | ||
730 | items = lim; | ||
731 | } | ||
732 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
733 | for (; numItems != 0; numItems--) | ||
734 | { | ||
735 | SWAP4_32(0) | ||
736 | items++; | ||
737 | } | ||
738 | } | ||
739 | |||
740 | |||
741 | // #define SHOW_HW_STATUS | ||
742 | |||
743 | #ifdef SHOW_HW_STATUS | ||
744 | #include <stdio.h> | ||
745 | #define PRF(x) x | ||
746 | #else | ||
747 | #define PRF(x) | ||
748 | #endif | ||
749 | |||
750 | void z7_SwapBytesPrepare(void) | ||
751 | { | ||
752 | #ifndef FORCE_SWAP_MODE | ||
753 | unsigned mode = 0; // k_SwapBytes_Mode_BASE; | ||
754 | |||
755 | #ifdef MY_CPU_ARM_OR_ARM64 | ||
756 | { | ||
757 | if (CPU_IsSupported_NEON()) | ||
758 | { | ||
759 | // #pragma message ("=== SwapBytes NEON") | ||
760 | PRF(printf("\n=== SwapBytes NEON\n");) | ||
761 | mode = k_SwapBytes_Mode_NEON; | ||
762 | } | ||
763 | } | ||
764 | #else // MY_CPU_ARM_OR_ARM64 | ||
765 | { | ||
766 | #ifdef USE_SWAP_AVX2 | ||
767 | if (CPU_IsSupported_AVX2()) | ||
768 | { | ||
769 | // #pragma message ("=== SwapBytes AVX2") | ||
770 | PRF(printf("\n=== SwapBytes AVX2\n");) | ||
771 | mode = k_SwapBytes_Mode_AVX2; | ||
772 | } | ||
773 | else | ||
774 | #endif | ||
775 | #ifdef USE_SWAP_SSSE3 | ||
776 | if (CPU_IsSupported_SSSE3()) | ||
777 | { | ||
778 | // #pragma message ("=== SwapBytes SSSE3") | ||
779 | PRF(printf("\n=== SwapBytes SSSE3\n");) | ||
780 | mode = k_SwapBytes_Mode_SSSE3; | ||
781 | } | ||
782 | else | ||
783 | #endif | ||
784 | #if !defined(MY_CPU_AMD64) | ||
785 | if (CPU_IsSupported_SSE2()) | ||
786 | #endif | ||
787 | { | ||
788 | // #pragma message ("=== SwapBytes SSE2") | ||
789 | PRF(printf("\n=== SwapBytes SSE2\n");) | ||
790 | mode = k_SwapBytes_Mode_SSE2; | ||
791 | } | ||
792 | } | ||
793 | #endif // MY_CPU_ARM_OR_ARM64 | ||
794 | g_SwapBytes_Mode = mode; | ||
795 | // g_SwapBytes_Mode = 0; // for debug | ||
796 | #endif // FORCE_SWAP_MODE | ||
797 | PRF(printf("\n=== SwapBytesPrepare\n");) | ||
798 | } | ||
799 | |||
800 | #undef PRF | ||