aboutsummaryrefslogtreecommitdiff
path: root/C/SwapBytes.c
diff options
context:
space:
mode:
authorIgor Pavlov <87184205+ip7z@users.noreply.github.com>2023-06-21 00:00:00 +0000
committerIgor Pavlov <87184205+ip7z@users.noreply.github.com>2023-12-17 14:59:19 +0500
commit5b39dc76f1bc82f941d5c800ab9f34407a06b53a (patch)
treefe5e17420300b715021a76328444088d32047963 /C/SwapBytes.c
parent93be7d4abfd4233228f58ee1fbbcd76d91be66a4 (diff)
download7zip-23.01.tar.gz
7zip-23.01.tar.bz2
7zip-23.01.zip
23.0123.01
Diffstat (limited to 'C/SwapBytes.c')
-rw-r--r--C/SwapBytes.c800
1 files changed, 800 insertions, 0 deletions
diff --git a/C/SwapBytes.c b/C/SwapBytes.c
new file mode 100644
index 0000000..7901bba
--- /dev/null
+++ b/C/SwapBytes.c
@@ -0,0 +1,800 @@
1/* SwapBytes.c -- Byte Swap conversion filter
22023-04-07 : Igor Pavlov : Public domain */
3
4#include "Precomp.h"
5
6#include "Compiler.h"
7#include "CpuArch.h"
8#include "RotateDefs.h"
9#include "SwapBytes.h"
10
11typedef UInt16 CSwapUInt16;
12typedef UInt32 CSwapUInt32;
13
14// #define k_SwapBytes_Mode_BASE 0
15
16#ifdef MY_CPU_X86_OR_AMD64
17
18#define k_SwapBytes_Mode_SSE2 1
19#define k_SwapBytes_Mode_SSSE3 2
20#define k_SwapBytes_Mode_AVX2 3
21
22 // #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
23 #if defined(__clang__) && (__clang_major__ >= 4) \
24 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
25 #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_AVX2
26 #define SWAP_ATTRIB_SSE2 __attribute__((__target__("sse2")))
27 #define SWAP_ATTRIB_SSSE3 __attribute__((__target__("ssse3")))
28 #define SWAP_ATTRIB_AVX2 __attribute__((__target__("avx2")))
29 #elif defined(_MSC_VER)
30 #if (_MSC_VER == 1900)
31 #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
32 #endif
33 #if (_MSC_VER >= 1900)
34 #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_AVX2
35 #elif (_MSC_VER >= 1500) // (VS2008)
36 #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_SSSE3
37 #elif (_MSC_VER >= 1310) // (VS2003)
38 #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_SSE2
39 #endif
40 #endif // _MSC_VER
41
42/*
43// for debug
44#ifdef k_SwapBytes_Mode_MAX
45#undef k_SwapBytes_Mode_MAX
46#endif
47*/
48
49#ifndef k_SwapBytes_Mode_MAX
50#define k_SwapBytes_Mode_MAX 0
51#endif
52
53#if (k_SwapBytes_Mode_MAX != 0) && defined(MY_CPU_AMD64)
54 #define k_SwapBytes_Mode_MIN k_SwapBytes_Mode_SSE2
55#else
56 #define k_SwapBytes_Mode_MIN 0
57#endif
58
59#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_AVX2)
60 #define USE_SWAP_AVX2
61#endif
62#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSSE3)
63 #define USE_SWAP_SSSE3
64#endif
65#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSE2)
66 #define USE_SWAP_128
67#endif
68
69#if k_SwapBytes_Mode_MAX <= k_SwapBytes_Mode_MIN || !defined(USE_SWAP_128)
70#define FORCE_SWAP_MODE
71#endif
72
73
74#ifdef USE_SWAP_128
75/*
76 <mmintrin.h> MMX
77<xmmintrin.h> SSE
78<emmintrin.h> SSE2
79<pmmintrin.h> SSE3
80<tmmintrin.h> SSSE3
81<smmintrin.h> SSE4.1
82<nmmintrin.h> SSE4.2
83<ammintrin.h> SSE4A
84<wmmintrin.h> AES
85<immintrin.h> AVX, AVX2, FMA
86*/
87
88#include <emmintrin.h> // sse2
89// typedef __m128i v128;
90
91#define SWAP2_128(i) { \
92 const __m128i v = *(const __m128i *)(const void *)(items + (i) * 8); \
93 *( __m128i *)( void *)(items + (i) * 8) = \
94 _mm_or_si128( \
95 _mm_slli_epi16(v, 8), \
96 _mm_srli_epi16(v, 8)); }
97// _mm_or_si128() has more ports to execute than _mm_add_epi16().
98
99static
100#ifdef SWAP_ATTRIB_SSE2
101SWAP_ATTRIB_SSE2
102#endif
103void
104Z7_FASTCALL
105SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim)
106{
107 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
108 do
109 {
110 SWAP2_128(0) SWAP2_128(1) items += 2 * 8;
111 SWAP2_128(0) SWAP2_128(1) items += 2 * 8;
112 }
113 while (items != lim);
114}
115
116/*
117// sse2
118#define SWAP4_128_pack(i) { \
119 __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \
120 __m128i v0 = _mm_unpacklo_epi8(v, mask); \
121 __m128i v1 = _mm_unpackhi_epi8(v, mask); \
122 v0 = _mm_shufflelo_epi16(v0, 0x1b); \
123 v1 = _mm_shufflelo_epi16(v1, 0x1b); \
124 v0 = _mm_shufflehi_epi16(v0, 0x1b); \
125 v1 = _mm_shufflehi_epi16(v1, 0x1b); \
126 *(__m128i *)(void *)(items + (i) * 4) = _mm_packus_epi16(v0, v1); }
127
128static
129#ifdef SWAP_ATTRIB_SSE2
130SWAP_ATTRIB_SSE2
131#endif
132void
133Z7_FASTCALL
134SwapBytes4_128_pack(CSwapUInt32 *items, const CSwapUInt32 *lim)
135{
136 const __m128i mask = _mm_setzero_si128();
137 // const __m128i mask = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
138 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
139 do
140 {
141 SWAP4_128_pack(0); items += 1 * 4;
142 // SWAP4_128_pack(0); SWAP4_128_pack(1); items += 2 * 4;
143 }
144 while (items != lim);
145}
146
147// sse2
148#define SWAP4_128_shift(i) { \
149 __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \
150 __m128i v2; \
151 v2 = _mm_or_si128( \
152 _mm_slli_si128(_mm_and_si128(v, mask), 1), \
153 _mm_and_si128(_mm_srli_si128(v, 1), mask)); \
154 v = _mm_or_si128( \
155 _mm_slli_epi32(v, 24), \
156 _mm_srli_epi32(v, 24)); \
157 *(__m128i *)(void *)(items + (i) * 4) = _mm_or_si128(v2, v); }
158
159static
160#ifdef SWAP_ATTRIB_SSE2
161SWAP_ATTRIB_SSE2
162#endif
163void
164Z7_FASTCALL
165SwapBytes4_128_shift(CSwapUInt32 *items, const CSwapUInt32 *lim)
166{
167 #define M1 0xff00
168 const __m128i mask = _mm_set_epi32(M1, M1, M1, M1);
169 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
170 do
171 {
172 // SWAP4_128_shift(0) SWAP4_128_shift(1) items += 2 * 4;
173 // SWAP4_128_shift(0) SWAP4_128_shift(1) items += 2 * 4;
174 SWAP4_128_shift(0); items += 1 * 4;
175 }
176 while (items != lim);
177}
178*/
179
180
181#if defined(USE_SWAP_SSSE3) || defined(USE_SWAP_AVX2)
182
183#define SWAP_SHUF_REV_SEQ_2_VALS(v) (v)+1, (v)
184#define SWAP_SHUF_REV_SEQ_4_VALS(v) (v)+3, (v)+2, (v)+1, (v)
185
186#define SWAP2_SHUF_MASK_16_BYTES \
187 SWAP_SHUF_REV_SEQ_2_VALS (0 * 2), \
188 SWAP_SHUF_REV_SEQ_2_VALS (1 * 2), \
189 SWAP_SHUF_REV_SEQ_2_VALS (2 * 2), \
190 SWAP_SHUF_REV_SEQ_2_VALS (3 * 2), \
191 SWAP_SHUF_REV_SEQ_2_VALS (4 * 2), \
192 SWAP_SHUF_REV_SEQ_2_VALS (5 * 2), \
193 SWAP_SHUF_REV_SEQ_2_VALS (6 * 2), \
194 SWAP_SHUF_REV_SEQ_2_VALS (7 * 2)
195
196#define SWAP4_SHUF_MASK_16_BYTES \
197 SWAP_SHUF_REV_SEQ_4_VALS (0 * 4), \
198 SWAP_SHUF_REV_SEQ_4_VALS (1 * 4), \
199 SWAP_SHUF_REV_SEQ_4_VALS (2 * 4), \
200 SWAP_SHUF_REV_SEQ_4_VALS (3 * 4)
201
202#if defined(USE_SWAP_AVX2)
203/* if we use 256_BIT_INIT_MASK, each static array mask will be larger for 16 bytes */
204// #define SWAP_USE_256_BIT_INIT_MASK
205#endif
206
207#if defined(SWAP_USE_256_BIT_INIT_MASK) && defined(USE_SWAP_AVX2)
208#define SWAP_MASK_INIT_SIZE 32
209#else
210#define SWAP_MASK_INIT_SIZE 16
211#endif
212
213MY_ALIGN(SWAP_MASK_INIT_SIZE)
214static const Byte k_ShufMask_Swap2[] =
215{
216 SWAP2_SHUF_MASK_16_BYTES
217 #if SWAP_MASK_INIT_SIZE > 16
218 , SWAP2_SHUF_MASK_16_BYTES
219 #endif
220};
221
222MY_ALIGN(SWAP_MASK_INIT_SIZE)
223static const Byte k_ShufMask_Swap4[] =
224{
225 SWAP4_SHUF_MASK_16_BYTES
226 #if SWAP_MASK_INIT_SIZE > 16
227 , SWAP4_SHUF_MASK_16_BYTES
228 #endif
229};
230
231
232#ifdef USE_SWAP_SSSE3
233
234#include <tmmintrin.h> // ssse3
235
236#define SHUF_128(i) *(items + (i)) = \
237 _mm_shuffle_epi8(*(items + (i)), mask); // SSSE3
238
239// Z7_NO_INLINE
240static
241#ifdef SWAP_ATTRIB_SSSE3
242SWAP_ATTRIB_SSSE3
243#endif
244Z7_ATTRIB_NO_VECTORIZE
245void
246Z7_FASTCALL
247ShufBytes_128(void *items8, const void *lim8, const void *mask128_ptr)
248{
249 __m128i *items = (__m128i *)items8;
250 const __m128i *lim = (const __m128i *)lim8;
251 // const __m128i mask = _mm_set_epi8(SHUF_SWAP2_MASK_16_VALS);
252 // const __m128i mask = _mm_set_epi8(SHUF_SWAP4_MASK_16_VALS);
253 // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
254 // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
255 // const __m128i mask = *(const __m128i *)(const void *)&(k_ShufMask_Swap4[0]);
256 const __m128i mask = *(const __m128i *)mask128_ptr;
257 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
258 do
259 {
260 SHUF_128(0) SHUF_128(1) items += 2;
261 SHUF_128(0) SHUF_128(1) items += 2;
262 }
263 while (items != lim);
264}
265
266#endif // USE_SWAP_SSSE3
267
268
269
270#ifdef USE_SWAP_AVX2
271
272#include <immintrin.h> // avx, avx2
273#if defined(__clang__)
274#include <avxintrin.h>
275#include <avx2intrin.h>
276#endif
277
278#define SHUF_256(i) *(items + (i)) = \
279 _mm256_shuffle_epi8(*(items + (i)), mask); // AVX2
280
281// Z7_NO_INLINE
282static
283#ifdef SWAP_ATTRIB_AVX2
284SWAP_ATTRIB_AVX2
285#endif
286Z7_ATTRIB_NO_VECTORIZE
287void
288Z7_FASTCALL
289ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr)
290{
291 __m256i *items = (__m256i *)items8;
292 const __m256i *lim = (const __m256i *)lim8;
293 /*
294 UNUSED_VAR(mask128_ptr)
295 __m256i mask =
296 for Swap4: _mm256_setr_epi8(SWAP4_SHUF_MASK_16_BYTES, SWAP4_SHUF_MASK_16_BYTES);
297 for Swap2: _mm256_setr_epi8(SWAP2_SHUF_MASK_16_BYTES, SWAP2_SHUF_MASK_16_BYTES);
298 */
299 const __m256i mask =
300 #if SWAP_MASK_INIT_SIZE > 16
301 *(const __m256i *)(const void *)mask128_ptr;
302 #else
303 /* msvc: broadcastsi128() version reserves the stack for no reason
304 msvc 19.29-: _mm256_insertf128_si256() / _mm256_set_m128i)) versions use non-avx movdqu xmm0,XMMWORD PTR [r8]
305 msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want
306 */
307 // _mm256_broadcastsi128_si256(*mask128_ptr);
308 /*
309 #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
310 MY_mm256_set_m128i
311 */
312 _mm256_set_m128i(
313 *(const __m128i *)mask128_ptr,
314 *(const __m128i *)mask128_ptr);
315 #endif
316
317 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
318 do
319 {
320 SHUF_256(0) SHUF_256(1) items += 2;
321 SHUF_256(0) SHUF_256(1) items += 2;
322 }
323 while (items != lim);
324}
325
326#endif // USE_SWAP_AVX2
327#endif // USE_SWAP_SSSE3 || USE_SWAP_AVX2
328#endif // USE_SWAP_128
329
330
331
332// compile message "NEON intrinsics not available with the soft-float ABI"
333#elif defined(MY_CPU_ARM_OR_ARM64) || \
334 (defined(__ARM_ARCH) && (__ARM_ARCH >= 7))
335// #elif defined(MY_CPU_ARM64)
336
337 #if defined(__clang__) && (__clang_major__ >= 8) \
338 || defined(__GNUC__) && (__GNUC__ >= 8)
339 #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) \
340 || defined(MY_CPU_ARM64)
341 #define USE_SWAP_128
342 #endif
343 #ifdef MY_CPU_ARM64
344 // #define SWAP_ATTRIB_NEON __attribute__((__target__("")))
345 #else
346 // #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
347 #endif
348 #elif defined(_MSC_VER)
349 #if (_MSC_VER >= 1910)
350 #define USE_SWAP_128
351 #endif
352 #endif
353
354 #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
355 #include <arm64_neon.h>
356 #else
357 #include <arm_neon.h>
358 #endif
359
360#ifndef USE_SWAP_128
361 #define FORCE_SWAP_MODE
362#else
363
364#ifdef MY_CPU_ARM64
365 // for debug : comment it
366 #define FORCE_SWAP_MODE
367#else
368 #define k_SwapBytes_Mode_NEON 1
369#endif
370// typedef uint8x16_t v128;
371#define SWAP2_128(i) *(uint8x16_t *) (void *)(items + (i) * 8) = \
372 vrev16q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 8));
373#define SWAP4_128(i) *(uint8x16_t *) (void *)(items + (i) * 4) = \
374 vrev32q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 4));
375
376// Z7_NO_INLINE
377static
378#ifdef SWAP_ATTRIB_NEON
379SWAP_ATTRIB_NEON
380#endif
381Z7_ATTRIB_NO_VECTORIZE
382void
383Z7_FASTCALL
384SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim)
385{
386 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
387 do
388 {
389 SWAP2_128(0) SWAP2_128(1) items += 2 * 8;
390 SWAP2_128(0) SWAP2_128(1) items += 2 * 8;
391 }
392 while (items != lim);
393}
394
395// Z7_NO_INLINE
396static
397#ifdef SWAP_ATTRIB_NEON
398SWAP_ATTRIB_NEON
399#endif
400Z7_ATTRIB_NO_VECTORIZE
401void
402Z7_FASTCALL
403SwapBytes4_128(CSwapUInt32 *items, const CSwapUInt32 *lim)
404{
405 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
406 do
407 {
408 SWAP4_128(0) SWAP4_128(1) items += 2 * 4;
409 SWAP4_128(0) SWAP4_128(1) items += 2 * 4;
410 }
411 while (items != lim);
412}
413
414#endif // USE_SWAP_128
415
416#else // MY_CPU_ARM_OR_ARM64
417#define FORCE_SWAP_MODE
418#endif // MY_CPU_ARM_OR_ARM64
419
420
421
422
423
424
425#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_X86)
426 /* _byteswap_ushort() in MSVC x86 32-bit works via slow { mov dh, al; mov dl, ah }
427 So we use own versions of byteswap function */
428 #if (_MSC_VER < 1400 ) // old MSVC-X86 without _rotr16() support
429 #define SWAP2_16(i) { UInt32 v = items[i]; v += (v << 16); v >>= 8; items[i] = (CSwapUInt16)v; }
430 #else // is new MSVC-X86 with fast _rotr16()
431 #include <intrin.h>
432 #define SWAP2_16(i) { items[i] = _rotr16(items[i], 8); }
433 #endif
434#else // is not MSVC-X86
435 #define SWAP2_16(i) { CSwapUInt16 v = items[i]; items[i] = Z7_BSWAP16(v); }
436#endif // MSVC-X86
437
438#if defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
439 #define SWAP4_32(i) { CSwapUInt32 v = items[i]; items[i] = Z7_BSWAP32(v); }
440#else
441 #define SWAP4_32(i) \
442 { UInt32 v = items[i]; \
443 v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); \
444 v = rotlFixed(v, 16); \
445 items[i] = v; }
446#endif
447
448
449
450
451#if defined(FORCE_SWAP_MODE) && defined(USE_SWAP_128)
452 #define DEFAULT_Swap2 SwapBytes2_128
453 #if !defined(MY_CPU_X86_OR_AMD64)
454 #define DEFAULT_Swap4 SwapBytes4_128
455 #endif
456#endif
457
458#if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4)
459
460#define SWAP_BASE_FUNCS_PREFIXES \
461Z7_FORCE_INLINE \
462static \
463Z7_ATTRIB_NO_VECTOR \
464void Z7_FASTCALL
465
466
467#ifdef MY_CPU_64BIT
468
469#if defined(MY_CPU_ARM64) \
470 && defined(__ARM_ARCH) && (__ARM_ARCH >= 8) \
471 && ( (defined(__GNUC__) && (__GNUC__ >= 4)) \
472 || (defined(__clang__) && (__clang_major__ >= 4)))
473
474 #define SWAP2_64_VAR(v) asm ("rev16 %x0,%x0" : "+r" (v));
475 #define SWAP4_64_VAR(v) asm ("rev32 %x0,%x0" : "+r" (v));
476
477#else // is not ARM64-GNU
478
479#if !defined(MY_CPU_X86_OR_AMD64) || (k_SwapBytes_Mode_MIN == 0) || !defined(USE_SWAP_128)
480 #define SWAP2_64_VAR(v) \
481 v = ( 0x00ff00ff00ff00ff & (v >> 8)) \
482 + ((0x00ff00ff00ff00ff & v) << 8);
483 /* plus gives faster code in MSVC */
484#endif
485
486#ifdef Z7_CPU_FAST_BSWAP_SUPPORTED
487 #define SWAP4_64_VAR(v) \
488 v = Z7_BSWAP64(v); \
489 v = Z7_ROTL64(v, 32);
490#else
491 #define SWAP4_64_VAR(v) \
492 v = ( 0x000000ff000000ff & (v >> 24)) \
493 + ((0x000000ff000000ff & v) << 24 ) \
494 + ( 0x0000ff000000ff00 & (v >> 8)) \
495 + ((0x0000ff000000ff00 & v) << 8 ) \
496 ;
497#endif
498
499#endif // ARM64-GNU
500
501
502#ifdef SWAP2_64_VAR
503
504#define SWAP2_64(i) { \
505 UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 4); \
506 SWAP2_64_VAR(v) \
507 *(UInt64 *)(void *)(items + (i) * 4) = v; }
508
509SWAP_BASE_FUNCS_PREFIXES
510SwapBytes2_64(CSwapUInt16 *items, const CSwapUInt16 *lim)
511{
512 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
513 do
514 {
515 SWAP2_64(0) SWAP2_64(1) items += 2 * 4;
516 SWAP2_64(0) SWAP2_64(1) items += 2 * 4;
517 }
518 while (items != lim);
519}
520
521 #define DEFAULT_Swap2 SwapBytes2_64
522 #if !defined(FORCE_SWAP_MODE)
523 #define SWAP2_DEFAULT_MODE 0
524 #endif
525#else // !defined(SWAP2_64_VAR)
526 #define DEFAULT_Swap2 SwapBytes2_128
527 #if !defined(FORCE_SWAP_MODE)
528 #define SWAP2_DEFAULT_MODE 1
529 #endif
530#endif // SWAP2_64_VAR
531
532
533#define SWAP4_64(i) { \
534 UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 2); \
535 SWAP4_64_VAR(v) \
536 *(UInt64 *)(void *)(items + (i) * 2) = v; }
537
538SWAP_BASE_FUNCS_PREFIXES
539SwapBytes4_64(CSwapUInt32 *items, const CSwapUInt32 *lim)
540{
541 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
542 do
543 {
544 SWAP4_64(0) SWAP4_64(1) items += 2 * 2;
545 SWAP4_64(0) SWAP4_64(1) items += 2 * 2;
546 }
547 while (items != lim);
548}
549
550#define DEFAULT_Swap4 SwapBytes4_64
551
552#else // is not 64BIT
553
554
555#if defined(MY_CPU_ARM_OR_ARM64) \
556 && defined(__ARM_ARCH) && (__ARM_ARCH >= 6) \
557 && ( (defined(__GNUC__) && (__GNUC__ >= 4)) \
558 || (defined(__clang__) && (__clang_major__ >= 4)))
559
560#ifdef MY_CPU_64BIT
561 #define SWAP2_32_VAR(v) asm ("rev16 %w0,%w0" : "+r" (v));
562#else
563 #define SWAP2_32_VAR(v) asm ("rev16 %0,%0" : "+r" (v)); // for clang/gcc
564 // asm ("rev16 %r0,%r0" : "+r" (a)); // for gcc
565#endif
566
567#elif defined(_MSC_VER) && (_MSC_VER < 1300) && defined(MY_CPU_X86) \
568 || !defined(Z7_CPU_FAST_BSWAP_SUPPORTED) \
569 || !defined(Z7_CPU_FAST_ROTATE_SUPPORTED)
570 // old msvc doesn't support _byteswap_ulong()
571 #define SWAP2_32_VAR(v) \
572 v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff);
573
574#else // is not ARM and is not old-MSVC-X86 and fast BSWAP/ROTATE are supported
575 #define SWAP2_32_VAR(v) \
576 v = Z7_BSWAP32(v); \
577 v = rotlFixed(v, 16);
578
579#endif // GNU-ARM*
580
581#define SWAP2_32(i) { \
582 UInt32 v = *(const UInt32 *)(const void *)(items + (i) * 2); \
583 SWAP2_32_VAR(v); \
584 *(UInt32 *)(void *)(items + (i) * 2) = v; }
585
586
587SWAP_BASE_FUNCS_PREFIXES
588SwapBytes2_32(CSwapUInt16 *items, const CSwapUInt16 *lim)
589{
590 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
591 do
592 {
593 SWAP2_32(0) SWAP2_32(1) items += 2 * 2;
594 SWAP2_32(0) SWAP2_32(1) items += 2 * 2;
595 }
596 while (items != lim);
597}
598
599
600SWAP_BASE_FUNCS_PREFIXES
601SwapBytes4_32(CSwapUInt32 *items, const CSwapUInt32 *lim)
602{
603 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
604 do
605 {
606 SWAP4_32(0) SWAP4_32(1) items += 2;
607 SWAP4_32(0) SWAP4_32(1) items += 2;
608 }
609 while (items != lim);
610}
611
612#define DEFAULT_Swap2 SwapBytes2_32
613#define DEFAULT_Swap4 SwapBytes4_32
614#if !defined(FORCE_SWAP_MODE)
615 #define SWAP2_DEFAULT_MODE 0
616#endif
617
618#endif // MY_CPU_64BIT
619#endif // if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4)
620
621
622
623#if !defined(FORCE_SWAP_MODE)
624static unsigned g_SwapBytes_Mode;
625#endif
626
627/* size of largest unrolled loop iteration: 128 bytes = 4 * 32 bytes (AVX). */
628#define SWAP_ITERATION_BLOCK_SIZE_MAX (1 << 7)
629
630// 32 bytes for (AVX) or 2 * 16-bytes for NEON.
631#define SWAP_VECTOR_ALIGN_SIZE (1 << 5)
632
633Z7_NO_INLINE
634void z7_SwapBytes2(CSwapUInt16 *items, size_t numItems)
635{
636 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
637 for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--)
638 {
639 SWAP2_16(0)
640 items++;
641 }
642 {
643 const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt16) - 1;
644 size_t numItems2 = numItems;
645 CSwapUInt16 *lim;
646 numItems &= k_Align_Mask;
647 numItems2 &= ~(size_t)k_Align_Mask;
648 lim = items + numItems2;
649 if (numItems2 != 0)
650 {
651 #if !defined(FORCE_SWAP_MODE)
652 #ifdef MY_CPU_X86_OR_AMD64
653 #ifdef USE_SWAP_AVX2
654 if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3)
655 ShufBytes_256((__m256i *)(void *)items,
656 (const __m256i *)(const void *)lim,
657 (const __m128i *)(const void *)&(k_ShufMask_Swap2[0]));
658 else
659 #endif
660 #ifdef USE_SWAP_SSSE3
661 if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3)
662 ShufBytes_128((__m128i *)(void *)items,
663 (const __m128i *)(const void *)lim,
664 (const __m128i *)(const void *)&(k_ShufMask_Swap2[0]));
665 else
666 #endif
667 #endif // MY_CPU_X86_OR_AMD64
668 #if SWAP2_DEFAULT_MODE == 0
669 if (g_SwapBytes_Mode != 0)
670 SwapBytes2_128(items, lim);
671 else
672 #endif
673 #endif // FORCE_SWAP_MODE
674 DEFAULT_Swap2(items, lim);
675 }
676 items = lim;
677 }
678 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
679 for (; numItems != 0; numItems--)
680 {
681 SWAP2_16(0)
682 items++;
683 }
684}
685
686
687Z7_NO_INLINE
688void z7_SwapBytes4(CSwapUInt32 *items, size_t numItems)
689{
690 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
691 for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--)
692 {
693 SWAP4_32(0)
694 items++;
695 }
696 {
697 const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt32) - 1;
698 size_t numItems2 = numItems;
699 CSwapUInt32 *lim;
700 numItems &= k_Align_Mask;
701 numItems2 &= ~(size_t)k_Align_Mask;
702 lim = items + numItems2;
703 if (numItems2 != 0)
704 {
705 #if !defined(FORCE_SWAP_MODE)
706 #ifdef MY_CPU_X86_OR_AMD64
707 #ifdef USE_SWAP_AVX2
708 if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3)
709 ShufBytes_256((__m256i *)(void *)items,
710 (const __m256i *)(const void *)lim,
711 (const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
712 else
713 #endif
714 #ifdef USE_SWAP_SSSE3
715 if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3)
716 ShufBytes_128((__m128i *)(void *)items,
717 (const __m128i *)(const void *)lim,
718 (const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
719 else
720 #endif
721 #else // MY_CPU_X86_OR_AMD64
722
723 if (g_SwapBytes_Mode != 0)
724 SwapBytes4_128(items, lim);
725 else
726 #endif // MY_CPU_X86_OR_AMD64
727 #endif // FORCE_SWAP_MODE
728 DEFAULT_Swap4(items, lim);
729 }
730 items = lim;
731 }
732 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
733 for (; numItems != 0; numItems--)
734 {
735 SWAP4_32(0)
736 items++;
737 }
738}
739
740
741// #define SHOW_HW_STATUS
742
743#ifdef SHOW_HW_STATUS
744#include <stdio.h>
745#define PRF(x) x
746#else
747#define PRF(x)
748#endif
749
750void z7_SwapBytesPrepare(void)
751{
752#ifndef FORCE_SWAP_MODE
753 unsigned mode = 0; // k_SwapBytes_Mode_BASE;
754
755#ifdef MY_CPU_ARM_OR_ARM64
756 {
757 if (CPU_IsSupported_NEON())
758 {
759 // #pragma message ("=== SwapBytes NEON")
760 PRF(printf("\n=== SwapBytes NEON\n");)
761 mode = k_SwapBytes_Mode_NEON;
762 }
763 }
764#else // MY_CPU_ARM_OR_ARM64
765 {
766 #ifdef USE_SWAP_AVX2
767 if (CPU_IsSupported_AVX2())
768 {
769 // #pragma message ("=== SwapBytes AVX2")
770 PRF(printf("\n=== SwapBytes AVX2\n");)
771 mode = k_SwapBytes_Mode_AVX2;
772 }
773 else
774 #endif
775 #ifdef USE_SWAP_SSSE3
776 if (CPU_IsSupported_SSSE3())
777 {
778 // #pragma message ("=== SwapBytes SSSE3")
779 PRF(printf("\n=== SwapBytes SSSE3\n");)
780 mode = k_SwapBytes_Mode_SSSE3;
781 }
782 else
783 #endif
784 #if !defined(MY_CPU_AMD64)
785 if (CPU_IsSupported_SSE2())
786 #endif
787 {
788 // #pragma message ("=== SwapBytes SSE2")
789 PRF(printf("\n=== SwapBytes SSE2\n");)
790 mode = k_SwapBytes_Mode_SSE2;
791 }
792 }
793#endif // MY_CPU_ARM_OR_ARM64
794 g_SwapBytes_Mode = mode;
795 // g_SwapBytes_Mode = 0; // for debug
796#endif // FORCE_SWAP_MODE
797 PRF(printf("\n=== SwapBytesPrepare\n");)
798}
799
800#undef PRF