diff options
Diffstat (limited to 'C/CpuArch.h')
-rw-r--r-- | C/CpuArch.h | 144 |
1 files changed, 133 insertions, 11 deletions
diff --git a/C/CpuArch.h b/C/CpuArch.h index 8e5d8a5..dfc68f1 100644 --- a/C/CpuArch.h +++ b/C/CpuArch.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* CpuArch.h -- CPU specific code | 1 | /* CpuArch.h -- CPU specific code |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-05-13 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_CPU_ARCH_H | 4 | #ifndef ZIP7_INC_CPU_ARCH_H |
5 | #define ZIP7_INC_CPU_ARCH_H | 5 | #define ZIP7_INC_CPU_ARCH_H |
@@ -20,6 +20,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
20 | MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) | 20 | MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #if !defined(_M_ARM64EC) | ||
23 | #if defined(_M_X64) \ | 24 | #if defined(_M_X64) \ |
24 | || defined(_M_AMD64) \ | 25 | || defined(_M_AMD64) \ |
25 | || defined(__x86_64__) \ | 26 | || defined(__x86_64__) \ |
@@ -35,6 +36,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
35 | #endif | 36 | #endif |
36 | #define MY_CPU_64BIT | 37 | #define MY_CPU_64BIT |
37 | #endif | 38 | #endif |
39 | #endif | ||
38 | 40 | ||
39 | 41 | ||
40 | #if defined(_M_IX86) \ | 42 | #if defined(_M_IX86) \ |
@@ -47,17 +49,26 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
47 | 49 | ||
48 | 50 | ||
49 | #if defined(_M_ARM64) \ | 51 | #if defined(_M_ARM64) \ |
52 | || defined(_M_ARM64EC) \ | ||
50 | || defined(__AARCH64EL__) \ | 53 | || defined(__AARCH64EL__) \ |
51 | || defined(__AARCH64EB__) \ | 54 | || defined(__AARCH64EB__) \ |
52 | || defined(__aarch64__) | 55 | || defined(__aarch64__) |
53 | #define MY_CPU_ARM64 | 56 | #define MY_CPU_ARM64 |
54 | #ifdef __ILP32__ | 57 | #if defined(__ILP32__) \ |
58 | || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) | ||
55 | #define MY_CPU_NAME "arm64-32" | 59 | #define MY_CPU_NAME "arm64-32" |
56 | #define MY_CPU_SIZEOF_POINTER 4 | 60 | #define MY_CPU_SIZEOF_POINTER 4 |
57 | #else | 61 | #elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) |
62 | #define MY_CPU_NAME "arm64-128" | ||
63 | #define MY_CPU_SIZEOF_POINTER 16 | ||
64 | #else | ||
65 | #if defined(_M_ARM64EC) | ||
66 | #define MY_CPU_NAME "arm64ec" | ||
67 | #else | ||
58 | #define MY_CPU_NAME "arm64" | 68 | #define MY_CPU_NAME "arm64" |
69 | #endif | ||
59 | #define MY_CPU_SIZEOF_POINTER 8 | 70 | #define MY_CPU_SIZEOF_POINTER 8 |
60 | #endif | 71 | #endif |
61 | #define MY_CPU_64BIT | 72 | #define MY_CPU_64BIT |
62 | #endif | 73 | #endif |
63 | 74 | ||
@@ -133,8 +144,36 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
133 | #endif | 144 | #endif |
134 | 145 | ||
135 | 146 | ||
147 | #if defined(__sparc__) \ | ||
148 | || defined(__sparc) | ||
149 | #define MY_CPU_SPARC | ||
150 | #if defined(__LP64__) \ | ||
151 | || defined(_LP64) \ | ||
152 | || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) | ||
153 | #define MY_CPU_NAME "sparcv9" | ||
154 | #define MY_CPU_SIZEOF_POINTER 8 | ||
155 | #define MY_CPU_64BIT | ||
156 | #elif defined(__sparc_v9__) \ | ||
157 | || defined(__sparcv9) | ||
158 | #define MY_CPU_64BIT | ||
159 | #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) | ||
160 | #define MY_CPU_NAME "sparcv9-32" | ||
161 | #else | ||
162 | #define MY_CPU_NAME "sparcv9m" | ||
163 | #endif | ||
164 | #elif defined(__sparc_v8__) \ | ||
165 | || defined(__sparcv8) | ||
166 | #define MY_CPU_NAME "sparcv8" | ||
167 | #define MY_CPU_SIZEOF_POINTER 4 | ||
168 | #else | ||
169 | #define MY_CPU_NAME "sparc" | ||
170 | #endif | ||
171 | #endif | ||
172 | |||
173 | |||
136 | #if defined(__riscv) \ | 174 | #if defined(__riscv) \ |
137 | || defined(__riscv__) | 175 | || defined(__riscv__) |
176 | #define MY_CPU_RISCV | ||
138 | #if __riscv_xlen == 32 | 177 | #if __riscv_xlen == 32 |
139 | #define MY_CPU_NAME "riscv32" | 178 | #define MY_CPU_NAME "riscv32" |
140 | #elif __riscv_xlen == 64 | 179 | #elif __riscv_xlen == 64 |
@@ -145,6 +184,39 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
145 | #endif | 184 | #endif |
146 | 185 | ||
147 | 186 | ||
187 | #if defined(__loongarch__) | ||
188 | #define MY_CPU_LOONGARCH | ||
189 | #if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64) | ||
190 | #define MY_CPU_64BIT | ||
191 | #endif | ||
192 | #if defined(__loongarch64) | ||
193 | #define MY_CPU_NAME "loongarch64" | ||
194 | #define MY_CPU_LOONGARCH64 | ||
195 | #else | ||
196 | #define MY_CPU_NAME "loongarch" | ||
197 | #endif | ||
198 | #endif | ||
199 | |||
200 | |||
201 | // #undef MY_CPU_NAME | ||
202 | // #undef MY_CPU_SIZEOF_POINTER | ||
203 | // #define __e2k__ | ||
204 | // #define __SIZEOF_POINTER__ 4 | ||
205 | #if defined(__e2k__) | ||
206 | #define MY_CPU_E2K | ||
207 | #if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) | ||
208 | #define MY_CPU_NAME "e2k-32" | ||
209 | #define MY_CPU_SIZEOF_POINTER 4 | ||
210 | #else | ||
211 | #define MY_CPU_NAME "e2k" | ||
212 | #if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) | ||
213 | #define MY_CPU_SIZEOF_POINTER 8 | ||
214 | #endif | ||
215 | #endif | ||
216 | #define MY_CPU_64BIT | ||
217 | #endif | ||
218 | |||
219 | |||
148 | #if defined(MY_CPU_X86) || defined(MY_CPU_AMD64) | 220 | #if defined(MY_CPU_X86) || defined(MY_CPU_AMD64) |
149 | #define MY_CPU_X86_OR_AMD64 | 221 | #define MY_CPU_X86_OR_AMD64 |
150 | #endif | 222 | #endif |
@@ -175,6 +247,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
175 | || defined(MY_CPU_ARM_LE) \ | 247 | || defined(MY_CPU_ARM_LE) \ |
176 | || defined(MY_CPU_ARM64_LE) \ | 248 | || defined(MY_CPU_ARM64_LE) \ |
177 | || defined(MY_CPU_IA64_LE) \ | 249 | || defined(MY_CPU_IA64_LE) \ |
250 | || defined(_LITTLE_ENDIAN) \ | ||
178 | || defined(__LITTLE_ENDIAN__) \ | 251 | || defined(__LITTLE_ENDIAN__) \ |
179 | || defined(__ARMEL__) \ | 252 | || defined(__ARMEL__) \ |
180 | || defined(__THUMBEL__) \ | 253 | || defined(__THUMBEL__) \ |
@@ -251,6 +324,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
251 | 324 | ||
252 | 325 | ||
253 | #ifndef MY_CPU_NAME | 326 | #ifndef MY_CPU_NAME |
327 | // #define MY_CPU_IS_UNKNOWN | ||
254 | #ifdef MY_CPU_LE | 328 | #ifdef MY_CPU_LE |
255 | #define MY_CPU_NAME "LE" | 329 | #define MY_CPU_NAME "LE" |
256 | #elif defined(MY_CPU_BE) | 330 | #elif defined(MY_CPU_BE) |
@@ -295,9 +369,19 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
295 | #define Z7_BSWAP64(v) _byteswap_uint64(v) | 369 | #define Z7_BSWAP64(v) _byteswap_uint64(v) |
296 | #define Z7_CPU_FAST_BSWAP_SUPPORTED | 370 | #define Z7_CPU_FAST_BSWAP_SUPPORTED |
297 | 371 | ||
298 | #elif (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ | 372 | /* GCC can generate slow code that calls function for __builtin_bswap32() for: |
299 | || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) | 373 | - GCC for RISCV, if Zbb extension is not used. |
300 | 374 | - GCC for SPARC. | |
375 | The code from CLANG for SPARC is also not the fastest. | ||
376 | So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases. | ||
377 | */ | ||
378 | #elif (!defined(MY_CPU_RISCV) || defined (__riscv_zbb)) \ | ||
379 | && !defined(MY_CPU_SPARC) \ | ||
380 | && ( \ | ||
381 | (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ | ||
382 | || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \ | ||
383 | ) | ||
384 | |||
301 | #define Z7_BSWAP16(v) __builtin_bswap16(v) | 385 | #define Z7_BSWAP16(v) __builtin_bswap16(v) |
302 | #define Z7_BSWAP32(v) __builtin_bswap32(v) | 386 | #define Z7_BSWAP32(v) __builtin_bswap32(v) |
303 | #define Z7_BSWAP64(v) __builtin_bswap64(v) | 387 | #define Z7_BSWAP64(v) __builtin_bswap64(v) |
@@ -329,13 +413,48 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
329 | 413 | ||
330 | #ifdef MY_CPU_LE | 414 | #ifdef MY_CPU_LE |
331 | #if defined(MY_CPU_X86_OR_AMD64) \ | 415 | #if defined(MY_CPU_X86_OR_AMD64) \ |
332 | || defined(MY_CPU_ARM64) | 416 | || defined(MY_CPU_ARM64) \ |
417 | || defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \ | ||
418 | || defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6) | ||
333 | #define MY_CPU_LE_UNALIGN | 419 | #define MY_CPU_LE_UNALIGN |
334 | #define MY_CPU_LE_UNALIGN_64 | 420 | #define MY_CPU_LE_UNALIGN_64 |
335 | #elif defined(__ARM_FEATURE_UNALIGNED) | 421 | #elif defined(__ARM_FEATURE_UNALIGNED) |
336 | /* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment. | 422 | /* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions. |
337 | So we can't use unaligned 64-bit operations. */ | 423 | Description of problems: |
338 | #define MY_CPU_LE_UNALIGN | 424 | problem-1 : 32-bit ARM architecture: |
425 | multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM) | ||
426 | require 32-bit (WORD) alignment (by 32-bit ARM architecture). | ||
427 | So there is "Alignment fault exception", if data is not aligned for 32-bit. | ||
428 | |||
429 | problem-2 : 32-bit kernels and arm64 kernels: | ||
430 | 32-bit linux kernels provide a fixup for the "Alignment fault exception" raised by these "paired" instructions. | ||
431 | So unaligned paired-access instructions work via exception handler in kernel in 32-bit linux. | ||
432 | |||
433 | But some arm64 kernels do not handle these faults in 32-bit programs. | ||
434 | So we have unhandled exception for such instructions. | ||
435 | Probably some new arm64 kernels have fixed it, and unaligned | ||
436 | paired-access instructions work in new kernels? | ||
437 | |||
438 | problem-3 : compiler for 32-bit arm: | ||
439 | Compilers use LDRD/STRD/LDM/STM for UInt64 accesses | ||
440 | and for other cases where two 32-bit accesses are fused | ||
441 | to one multi-access instruction. | ||
442 | So UInt64 variables must be aligned for 32-bit, and each | ||
443 | 32-bit access must be aligned for 32-bit, if we want to | ||
444 | avoid "Alignment fault" exception (handled or unhandled). | ||
445 | |||
446 | problem-4 : performance: | ||
447 | Even if unaligned access is handled by kernel, it will be slow. | ||
448 | So if we allow unaligned access, we can get fast unaligned | ||
449 | single-access, and slow unaligned paired-access. | ||
450 | |||
451 | We don't allow unaligned access on 32-bit arm, because compiler | ||
452 | generates paired-access instructions that require 32-bit alignment, | ||
453 | and some arm64 kernels have no handler for these instructions. | ||
454 | Also unaligned paired-access instructions will be slow, if kernel handles them. | ||
455 | */ | ||
456 | // it must be disabled: | ||
457 | // #define MY_CPU_LE_UNALIGN | ||
339 | #endif | 458 | #endif |
340 | #endif | 459 | #endif |
341 | 460 | ||
@@ -439,6 +558,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
439 | 558 | ||
440 | #if defined(MY_CPU_BE) | 559 | #if defined(MY_CPU_BE) |
441 | 560 | ||
561 | #define GetBe64a(p) (*(const UInt64 *)(const void *)(p)) | ||
442 | #define GetBe32a(p) (*(const UInt32 *)(const void *)(p)) | 562 | #define GetBe32a(p) (*(const UInt32 *)(const void *)(p)) |
443 | #define GetBe16a(p) (*(const UInt16 *)(const void *)(p)) | 563 | #define GetBe16a(p) (*(const UInt16 *)(const void *)(p)) |
444 | #define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); } | 564 | #define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); } |
@@ -456,6 +576,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
456 | #define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); } | 576 | #define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); } |
457 | #define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); } | 577 | #define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); } |
458 | 578 | ||
579 | #define GetBe64a(p) GetBe64(p) | ||
459 | #define GetBe32a(p) GetBe32(p) | 580 | #define GetBe32a(p) GetBe32(p) |
460 | #define GetBe16a(p) GetBe16(p) | 581 | #define GetBe16a(p) GetBe16(p) |
461 | #define SetBe32a(p, v) SetBe32(p, v) | 582 | #define SetBe32a(p, v) SetBe32(p, v) |
@@ -486,6 +607,7 @@ UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void); | |||
486 | BoolInt CPU_IsSupported_AES(void); | 607 | BoolInt CPU_IsSupported_AES(void); |
487 | BoolInt CPU_IsSupported_AVX(void); | 608 | BoolInt CPU_IsSupported_AVX(void); |
488 | BoolInt CPU_IsSupported_AVX2(void); | 609 | BoolInt CPU_IsSupported_AVX2(void); |
610 | // BoolInt CPU_IsSupported_AVX512F_AVX512VL(void); | ||
489 | BoolInt CPU_IsSupported_VAES_AVX2(void); | 611 | BoolInt CPU_IsSupported_VAES_AVX2(void); |
490 | BoolInt CPU_IsSupported_CMOV(void); | 612 | BoolInt CPU_IsSupported_CMOV(void); |
491 | BoolInt CPU_IsSupported_SSE(void); | 613 | BoolInt CPU_IsSupported_SSE(void); |