1 files changed, 133 insertions, 11 deletions
diff --git a/C/CpuArch.h b/C/CpuArch.h
index 8e5d8a5..dfc68f1 100644
--- a/C/CpuArch.h
+++ b/C/CpuArch.h
@@ -1,5 +1,5 @@
 /* CpuArch.h -- CPU specific code
-2023-04-02 : Igor Pavlov : Public domain */
+2024-05-13 : Igor Pavlov : Public domain */
 #ifndef ZIP7_INC_CPU_ARCH_H
 #define ZIP7_INC_CPU_ARCH_H
@@ -20,6 +20,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
  MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8)
 */
+#if !defined(_M_ARM64EC)
 #if  defined(_M_X64) \
  || defined(_M_AMD64) \
  || defined(__x86_64__) \
@@ -35,6 +36,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
  #endif
  #define MY_CPU_64BIT
 #endif
+#endif
 #if  defined(_M_IX86) \
@@ -47,17 +49,26 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
 #if  defined(_M_ARM64) \
+  || defined(_M_ARM64EC) \
  || defined(__AARCH64EL__) \
  || defined(__AARCH64EB__) \
  || defined(__aarch64__)
  #define MY_CPU_ARM64
-  #ifdef __ILP32__
+#if   defined(__ILP32__) \
+   || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
    #define MY_CPU_NAME "arm64-32"
    #define MY_CPU_SIZEOF_POINTER 4
-  #else
+#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
+    #define MY_CPU_NAME "arm64-128"
+    #define MY_CPU_SIZEOF_POINTER 16
+#else
+#if defined(_M_ARM64EC)
+    #define MY_CPU_NAME "arm64ec"
+#else
    #define MY_CPU_NAME "arm64"
+#endif
    #define MY_CPU_SIZEOF_POINTER 8
-  #endif
+#endif
  #define MY_CPU_64BIT
 #endif
@@ -133,8 +144,36 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
 #endif
+#if   defined(__sparc__) \
+   || defined(__sparc)
+  #define MY_CPU_SPARC
+  #if  defined(__LP64__) \
+    || defined(_LP64) \
+    || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
+    #define MY_CPU_NAME "sparcv9"
+    #define MY_CPU_SIZEOF_POINTER 8
+    #define MY_CPU_64BIT
+  #elif defined(__sparc_v9__) \
+     || defined(__sparcv9)
+    #define MY_CPU_64BIT
+    #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
+      #define MY_CPU_NAME "sparcv9-32"
+    #else
+      #define MY_CPU_NAME "sparcv9m"
+    #endif
+  #elif defined(__sparc_v8__) \
+     || defined(__sparcv8)
+    #define MY_CPU_NAME "sparcv8"
+    #define MY_CPU_SIZEOF_POINTER 4
+  #else
+    #define MY_CPU_NAME "sparc"
+  #endif
+#endif
 #if  defined(__riscv) \
  || defined(__riscv__)
+    #define MY_CPU_RISCV
  #if __riscv_xlen == 32
    #define MY_CPU_NAME "riscv32"
  #elif __riscv_xlen == 64
@@ -145,6 +184,39 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
 #endif
+#if defined(__loongarch__)
+  #define MY_CPU_LOONGARCH
+  #if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64)
+  #define MY_CPU_64BIT
+  #endif
+  #if defined(__loongarch64)
+  #define MY_CPU_NAME "loongarch64"
+  #define MY_CPU_LOONGARCH64
+  #else
+  #define MY_CPU_NAME "loongarch"
+  #endif
+#endif
+// #undef MY_CPU_NAME
+// #undef MY_CPU_SIZEOF_POINTER
+// #define __e2k__
+// #define __SIZEOF_POINTER__ 4
+#if  defined(__e2k__)
+  #define MY_CPU_E2K
+  #if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
+    #define MY_CPU_NAME "e2k-32"
+    #define MY_CPU_SIZEOF_POINTER 4
+  #else
+    #define MY_CPU_NAME "e2k"
+    #if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
+      #define MY_CPU_SIZEOF_POINTER 8
+    #endif
+  #endif
+  #define MY_CPU_64BIT
+#endif
 #if defined(MY_CPU_X86) || defined(MY_CPU_AMD64)
 #define MY_CPU_X86_OR_AMD64
 #endif
@@ -175,6 +247,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
    || defined(MY_CPU_ARM_LE) \
    || defined(MY_CPU_ARM64_LE) \
    || defined(MY_CPU_IA64_LE) \
+    || defined(_LITTLE_ENDIAN) \
    || defined(__LITTLE_ENDIAN__) \
    || defined(__ARMEL__) \
    || defined(__THUMBEL__) \
@@ -251,6 +324,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
 #ifndef MY_CPU_NAME
+  // #define MY_CPU_IS_UNKNOWN
  #ifdef MY_CPU_LE
    #define MY_CPU_NAME "LE"
  #elif defined(MY_CPU_BE)
@@ -295,9 +369,19 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
 #define Z7_BSWAP64(v)  _byteswap_uint64(v)
 #define Z7_CPU_FAST_BSWAP_SUPPORTED
-#elif  (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \
+/* GCC can generate slow code that calls function for __builtin_bswap32() for:
-    || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16))
+     - GCC for RISCV, if Zbb extension is not used.
- 
+     - GCC for SPARC.
+   The code from CLANG for SPARC also is not fastest.
+   So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases.
+*/
+#elif (!defined(MY_CPU_RISCV) || defined (__riscv_zbb)) \
+    && !defined(MY_CPU_SPARC) \
+    && ( \
+       (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \
+    || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \
+    )
 #define Z7_BSWAP16(v)  __builtin_bswap16(v)
 #define Z7_BSWAP32(v)  __builtin_bswap32(v)
 #define Z7_BSWAP64(v)  __builtin_bswap64(v)
@@ -329,13 +413,48 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
 #ifdef MY_CPU_LE
  #if defined(MY_CPU_X86_OR_AMD64) \
-      || defined(MY_CPU_ARM64)
+      || defined(MY_CPU_ARM64) \
+      || defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \
+      || defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6)
    #define MY_CPU_LE_UNALIGN
    #define MY_CPU_LE_UNALIGN_64
  #elif defined(__ARM_FEATURE_UNALIGNED)
-    /* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment.
+/* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions.
-       So we can't use unaligned 64-bit operations. */
+  Description of problems:
-    #define MY_CPU_LE_UNALIGN
+problem-1 : 32-bit ARM architecture:
+  multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM)
+  require 32-bit (WORD) alignment (by 32-bit ARM architecture).
+  So there is "Alignment fault exception", if data is not aligned for 32-bit.
+problem-2 : 32-bit kernels and arm64 kernels:
+  32-bit linux kernels provide fixup for these "paired" instruction "Alignment fault exception".
+  So unaligned paired-access instructions work via exception handler in kernel in 32-bit linux.
+ 
+  But some arm64 kernels do not handle these faults in 32-bit programs.
+  So we have unhandled exception for such instructions.
+  Probably some new arm64 kernels have fixed it, and unaligned
+  paired-access instructions work in new kernels?
+problem-3 : compiler for 32-bit arm:
+  Compilers use LDRD/STRD/LDM/STM for UInt64 accesses
+  and for other cases where two 32-bit accesses are fused
+  to one multi-access instruction.
+  So UInt64 variables must be aligned for 32-bit, and each
+  32-bit access must be aligned for 32-bit, if we want to
+  avoid "Alignment fault" exception (handled or unhandled).
+problem-4 : performance:
+  Even if unaligned access is handled by kernel, it will be slow.
+  So if we allow unaligned access, we can get fast unaligned
+  single-access, and slow unaligned paired-access.
+  We don't allow unaligned access on 32-bit arm, because compiler
+  generates paired-access instructions that require 32-bit alignment,
+  and some arm64 kernels have no handler for these instructions.
+  Also unaligned paired-access instructions will be slow, if kernel handles them.
+*/
+    // it must be disabled:
+    // #define MY_CPU_LE_UNALIGN
  #endif
 #endif
@@ -439,6 +558,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
 #if defined(MY_CPU_BE)
+#define GetBe64a(p)      (*(const UInt64 *)(const void *)(p))
 #define GetBe32a(p)      (*(const UInt32 *)(const void *)(p))
 #define GetBe16a(p)      (*(const UInt16 *)(const void *)(p))
 #define SetBe32a(p, v)   { *(UInt32 *)(void *)(p) = (v); }
@@ -456,6 +576,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
 #define SetUi32a(p, v)   { *(UInt32 *)(void *)(p) = (v); }
 #define SetUi16a(p, v)   { *(UInt16 *)(void *)(p) = (v); }
+#define GetBe64a(p)      GetBe64(p)
 #define GetBe32a(p)      GetBe32(p)
 #define GetBe16a(p)      GetBe16(p)
 #define SetBe32a(p, v)   SetBe32(p, v)
@@ -486,6 +607,7 @@ UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void);
 BoolInt CPU_IsSupported_AES(void);
 BoolInt CPU_IsSupported_AVX(void);
 BoolInt CPU_IsSupported_AVX2(void);
+// BoolInt CPU_IsSupported_AVX512F_AVX512VL(void);
 BoolInt CPU_IsSupported_VAES_AVX2(void);
 BoolInt CPU_IsSupported_CMOV(void);
 BoolInt CPU_IsSupported_SSE(void);

diff --git a/C/CpuArch.h b/C/CpuArch.h index 8e5d8a5..dfc68f1 100644 --- a/C/CpuArch.h +++ b/C/CpuArch.h
@@ -1,5 +1,5 @@
1	/* CpuArch.h -- CPU specific code	1	/* CpuArch.h -- CPU specific code
2	2023-04-02 : Igor Pavlov : Public domain */	2	2024-05-13 : Igor Pavlov : Public domain */
3		3
4	#ifndef ZIP7_INC_CPU_ARCH_H	4	#ifndef ZIP7_INC_CPU_ARCH_H
5	#define ZIP7_INC_CPU_ARCH_H	5	#define ZIP7_INC_CPU_ARCH_H
@@ -20,6 +20,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
20	MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8)	20	MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8)
21	*/	21	*/
22		22
		23	#if !defined(_M_ARM64EC)
23	#if defined(_M_X64) \	24	#if defined(_M_X64) \
24	\|\| defined(_M_AMD64) \	25	\|\| defined(_M_AMD64) \
25	\|\| defined(__x86_64__) \	26	\|\| defined(__x86_64__) \
@@ -35,6 +36,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
35	#endif	36	#endif
36	#define MY_CPU_64BIT	37	#define MY_CPU_64BIT
37	#endif	38	#endif
		39	#endif
38		40
39		41
40	#if defined(_M_IX86) \	42	#if defined(_M_IX86) \
@@ -47,17 +49,26 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
47		49
48		50
49	#if defined(_M_ARM64) \	51	#if defined(_M_ARM64) \
		52	\|\| defined(_M_ARM64EC) \
50	\|\| defined(__AARCH64EL__) \	53	\|\| defined(__AARCH64EL__) \
51	\|\| defined(__AARCH64EB__) \	54	\|\| defined(__AARCH64EB__) \
52	\|\| defined(__aarch64__)	55	\|\| defined(__aarch64__)
53	#define MY_CPU_ARM64	56	#define MY_CPU_ARM64
54	#ifdef __ILP32__	57	#if defined(__ILP32__) \
		58	\|\| defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
55	#define MY_CPU_NAME "arm64-32"	59	#define MY_CPU_NAME "arm64-32"
56	#define MY_CPU_SIZEOF_POINTER 4	60	#define MY_CPU_SIZEOF_POINTER 4
57	#else	61	#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
		62	#define MY_CPU_NAME "arm64-128"
		63	#define MY_CPU_SIZEOF_POINTER 16
		64	#else
		65	#if defined(_M_ARM64EC)
		66	#define MY_CPU_NAME "arm64ec"
		67	#else
58	#define MY_CPU_NAME "arm64"	68	#define MY_CPU_NAME "arm64"
		69	#endif
59	#define MY_CPU_SIZEOF_POINTER 8	70	#define MY_CPU_SIZEOF_POINTER 8
60	#endif	71	#endif
61	#define MY_CPU_64BIT	72	#define MY_CPU_64BIT
62	#endif	73	#endif
63		74
@@ -133,8 +144,36 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
133	#endif	144	#endif
134		145
135		146
		147	#if defined(__sparc__) \
		148	\|\| defined(__sparc)
		149	#define MY_CPU_SPARC
		150	#if defined(__LP64__) \
		151	\|\| defined(_LP64) \
		152	\|\| defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
		153	#define MY_CPU_NAME "sparcv9"
		154	#define MY_CPU_SIZEOF_POINTER 8
		155	#define MY_CPU_64BIT
		156	#elif defined(__sparc_v9__) \
		157	\|\| defined(__sparcv9)
		158	#define MY_CPU_64BIT
		159	#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
		160	#define MY_CPU_NAME "sparcv9-32"
		161	#else
		162	#define MY_CPU_NAME "sparcv9m"
		163	#endif
		164	#elif defined(__sparc_v8__) \
		165	\|\| defined(__sparcv8)
		166	#define MY_CPU_NAME "sparcv8"
		167	#define MY_CPU_SIZEOF_POINTER 4
		168	#else
		169	#define MY_CPU_NAME "sparc"
		170	#endif
		171	#endif
		172
		173
136	#if defined(__riscv) \	174	#if defined(__riscv) \
137	\|\| defined(__riscv__)	175	\|\| defined(__riscv__)
		176	#define MY_CPU_RISCV
138	#if __riscv_xlen == 32	177	#if __riscv_xlen == 32
139	#define MY_CPU_NAME "riscv32"	178	#define MY_CPU_NAME "riscv32"
140	#elif __riscv_xlen == 64	179	#elif __riscv_xlen == 64
@@ -145,6 +184,39 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
145	#endif	184	#endif
146		185
147		186
		187	#if defined(__loongarch__)
		188	#define MY_CPU_LOONGARCH
		189	#if defined(__loongarch64) \|\| defined(__loongarch_grlen) && (__loongarch_grlen == 64)
		190	#define MY_CPU_64BIT
		191	#endif
		192	#if defined(__loongarch64)
		193	#define MY_CPU_NAME "loongarch64"
		194	#define MY_CPU_LOONGARCH64
		195	#else
		196	#define MY_CPU_NAME "loongarch"
		197	#endif
		198	#endif
		199
		200
		201	// #undef MY_CPU_NAME
		202	// #undef MY_CPU_SIZEOF_POINTER
		203	// #define __e2k__
		204	// #define __SIZEOF_POINTER__ 4
		205	#if defined(__e2k__)
		206	#define MY_CPU_E2K
		207	#if defined(__ILP32__) \|\| defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
		208	#define MY_CPU_NAME "e2k-32"
		209	#define MY_CPU_SIZEOF_POINTER 4
		210	#else
		211	#define MY_CPU_NAME "e2k"
		212	#if defined(__LP64__) \|\| defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
		213	#define MY_CPU_SIZEOF_POINTER 8
		214	#endif
		215	#endif
		216	#define MY_CPU_64BIT
		217	#endif
		218
		219
148	#if defined(MY_CPU_X86) \|\| defined(MY_CPU_AMD64)	220	#if defined(MY_CPU_X86) \|\| defined(MY_CPU_AMD64)
149	#define MY_CPU_X86_OR_AMD64	221	#define MY_CPU_X86_OR_AMD64
150	#endif	222	#endif
@@ -175,6 +247,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
175	\|\| defined(MY_CPU_ARM_LE) \	247	\|\| defined(MY_CPU_ARM_LE) \
176	\|\| defined(MY_CPU_ARM64_LE) \	248	\|\| defined(MY_CPU_ARM64_LE) \
177	\|\| defined(MY_CPU_IA64_LE) \	249	\|\| defined(MY_CPU_IA64_LE) \
		250	\|\| defined(_LITTLE_ENDIAN) \
178	\|\| defined(__LITTLE_ENDIAN__) \	251	\|\| defined(__LITTLE_ENDIAN__) \
179	\|\| defined(__ARMEL__) \	252	\|\| defined(__ARMEL__) \
180	\|\| defined(__THUMBEL__) \	253	\|\| defined(__THUMBEL__) \
@@ -251,6 +324,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
251		324
252		325
253	#ifndef MY_CPU_NAME	326	#ifndef MY_CPU_NAME
		327	// #define MY_CPU_IS_UNKNOWN
254	#ifdef MY_CPU_LE	328	#ifdef MY_CPU_LE
255	#define MY_CPU_NAME "LE"	329	#define MY_CPU_NAME "LE"
256	#elif defined(MY_CPU_BE)	330	#elif defined(MY_CPU_BE)
@@ -295,9 +369,19 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
295	#define Z7_BSWAP64(v) _byteswap_uint64(v)	369	#define Z7_BSWAP64(v) _byteswap_uint64(v)
296	#define Z7_CPU_FAST_BSWAP_SUPPORTED	370	#define Z7_CPU_FAST_BSWAP_SUPPORTED
297		371
298	#elif (defined(__GNUC__) && (__GNUC__ > 4 \|\| (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \	372	/* GCC can generate slow code that calls function for __builtin_bswap32() for:
299	\|\| (defined(__clang__) && Z7_has_builtin(__builtin_bswap16))	373	- GCC for RISCV, if Zbb extension is not used.
300		374	- GCC for SPARC.
		375	The code from CLANG for SPARC also is not fastest.
		376	So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases.
		377	*/
		378	#elif (!defined(MY_CPU_RISCV) \|\| defined (__riscv_zbb)) \
		379	&& !defined(MY_CPU_SPARC) \
		380	&& ( \
		381	(defined(__GNUC__) && (__GNUC__ > 4 \|\| (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \
		382	\|\| (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \
		383	)
		384
301	#define Z7_BSWAP16(v) __builtin_bswap16(v)	385	#define Z7_BSWAP16(v) __builtin_bswap16(v)
302	#define Z7_BSWAP32(v) __builtin_bswap32(v)	386	#define Z7_BSWAP32(v) __builtin_bswap32(v)
303	#define Z7_BSWAP64(v) __builtin_bswap64(v)	387	#define Z7_BSWAP64(v) __builtin_bswap64(v)
@@ -329,13 +413,48 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
329		413
330	#ifdef MY_CPU_LE	414	#ifdef MY_CPU_LE
331	#if defined(MY_CPU_X86_OR_AMD64) \	415	#if defined(MY_CPU_X86_OR_AMD64) \
332	\|\| defined(MY_CPU_ARM64)	416	\|\| defined(MY_CPU_ARM64) \
		417	\|\| defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \
		418	\|\| defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6)
333	#define MY_CPU_LE_UNALIGN	419	#define MY_CPU_LE_UNALIGN
334	#define MY_CPU_LE_UNALIGN_64	420	#define MY_CPU_LE_UNALIGN_64
335	#elif defined(__ARM_FEATURE_UNALIGNED)	421	#elif defined(__ARM_FEATURE_UNALIGNED)
336	/* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment.	422	/* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions.
337	So we can't use unaligned 64-bit operations. */	423	Description of problems:
338	#define MY_CPU_LE_UNALIGN	424	problem-1 : 32-bit ARM architecture:
		425	multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM)
		426	require 32-bit (WORD) alignment (by 32-bit ARM architecture).
		427	So there is "Alignment fault exception", if data is not aligned for 32-bit.
		428
		429	problem-2 : 32-bit kernels and arm64 kernels:
		430	32-bit linux kernels provide fixup for these "paired" instruction "Alignment fault exception".
		431	So unaligned paired-access instructions work via exception handler in kernel in 32-bit linux.
		432
		433	But some arm64 kernels do not handle these faults in 32-bit programs.
		434	So we have unhandled exception for such instructions.
		435	Probably some new arm64 kernels have fixed it, and unaligned
		436	paired-access instructions work in new kernels?
		437
		438	problem-3 : compiler for 32-bit arm:
		439	Compilers use LDRD/STRD/LDM/STM for UInt64 accesses
		440	and for other cases where two 32-bit accesses are fused
		441	to one multi-access instruction.
		442	So UInt64 variables must be aligned for 32-bit, and each
		443	32-bit access must be aligned for 32-bit, if we want to
		444	avoid "Alignment fault" exception (handled or unhandled).
		445
		446	problem-4 : performance:
		447	Even if unaligned access is handled by kernel, it will be slow.
		448	So if we allow unaligned access, we can get fast unaligned
		449	single-access, and slow unaligned paired-access.
		450
		451	We don't allow unaligned access on 32-bit arm, because compiler
		452	generates paired-access instructions that require 32-bit alignment,
		453	and some arm64 kernels have no handler for these instructions.
		454	Also unaligned paired-access instructions will be slow, if kernel handles them.
		455	*/
		456	// it must be disabled:
		457	// #define MY_CPU_LE_UNALIGN
339	#endif	458	#endif
340	#endif	459	#endif
341		460
@@ -439,6 +558,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
439		558
440	#if defined(MY_CPU_BE)	559	#if defined(MY_CPU_BE)
441		560
		561	#define GetBe64a(p) (*(const UInt64 *)(const void *)(p))
442	#define GetBe32a(p) (*(const UInt32 *)(const void *)(p))	562	#define GetBe32a(p) (*(const UInt32 *)(const void *)(p))
443	#define GetBe16a(p) (*(const UInt16 *)(const void *)(p))	563	#define GetBe16a(p) (*(const UInt16 *)(const void *)(p))
444	#define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); }	564	#define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); }
@@ -456,6 +576,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
456	#define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); }	576	#define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); }
457	#define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); }	577	#define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); }
458		578
		579	#define GetBe64a(p) GetBe64(p)
459	#define GetBe32a(p) GetBe32(p)	580	#define GetBe32a(p) GetBe32(p)
460	#define GetBe16a(p) GetBe16(p)	581	#define GetBe16a(p) GetBe16(p)
461	#define SetBe32a(p, v) SetBe32(p, v)	582	#define SetBe32a(p, v) SetBe32(p, v)
@@ -486,6 +607,7 @@ UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void);
486	BoolInt CPU_IsSupported_AES(void);	607	BoolInt CPU_IsSupported_AES(void);
487	BoolInt CPU_IsSupported_AVX(void);	608	BoolInt CPU_IsSupported_AVX(void);
488	BoolInt CPU_IsSupported_AVX2(void);	609	BoolInt CPU_IsSupported_AVX2(void);
		610	// BoolInt CPU_IsSupported_AVX512F_AVX512VL(void);
489	BoolInt CPU_IsSupported_VAES_AVX2(void);	611	BoolInt CPU_IsSupported_VAES_AVX2(void);
490	BoolInt CPU_IsSupported_CMOV(void);	612	BoolInt CPU_IsSupported_CMOV(void);
491	BoolInt CPU_IsSupported_SSE(void);	613	BoolInt CPU_IsSupported_SSE(void);