diff options
| -rw-r--r-- | CMakeLists.txt | 2 | ||||
| -rw-r--r-- | Makefile.in | 11 | ||||
| -rwxr-xr-x | configure | 70 | ||||
| -rw-r--r-- | contrib/CMakeLists.txt | 9 | ||||
| -rw-r--r-- | contrib/README.contrib | 3 | ||||
| -rw-r--r-- | contrib/crc32vx/CMakeLists.txt | 67 | ||||
| -rw-r--r-- | contrib/crc32vx/README | 9 | ||||
| -rw-r--r-- | contrib/crc32vx/crc32_vx.c | 254 | ||||
| -rw-r--r-- | contrib/crc32vx/crc32_vx_hooks.h | 9 | ||||
| -rw-r--r-- | crc32.c | 7 |
10 files changed, 439 insertions, 2 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 8aa8751f..8a8fde79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt | |||
| @@ -103,7 +103,7 @@ check_include_file(unistd.h HAVE_UNISTD_H) | |||
| 103 | if(MSVC) | 103 | if(MSVC) |
| 104 | set(CMAKE_REQUIRED_FLAGS "-WX") | 104 | set(CMAKE_REQUIRED_FLAGS "-WX") |
| 105 | else(MSVC) | 105 | else(MSVC) |
| 106 | set(CMAKE_REQUIRED_FLAGS "-WError") | 106 | set(CMAKE_REQUIRED_FLAGS "-Werror") |
| 107 | endif(MSVC) | 107 | endif(MSVC) |
| 108 | 108 | ||
| 109 | check_c_source_compiles( | 109 | check_c_source_compiles( |
diff --git a/Makefile.in b/Makefile.in index 52d5c2c4..d9de5630 100644 --- a/Makefile.in +++ b/Makefile.in | |||
| @@ -27,6 +27,7 @@ LDFLAGS= | |||
| 27 | TEST_LIBS=-L. libz.a | 27 | TEST_LIBS=-L. libz.a |
| 28 | LDSHARED=$(CC) | 28 | LDSHARED=$(CC) |
| 29 | CPP=$(CC) -E | 29 | CPP=$(CC) -E |
| 30 | VGFMAFLAG= | ||
| 30 | 31 | ||
| 31 | STATICLIB=libz.a | 32 | STATICLIB=libz.a |
| 32 | SHAREDLIB=libz.so | 33 | SHAREDLIB=libz.so |
| @@ -164,6 +165,9 @@ adler32.o: $(SRCDIR)adler32.c | |||
| 164 | crc32.o: $(SRCDIR)crc32.c | 165 | crc32.o: $(SRCDIR)crc32.c |
| 165 | $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c | 166 | $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c |
| 166 | 167 | ||
| 168 | crc32_vx.o: $(SRCDIR)contrib/crc32vx/crc32_vx.c | ||
| 169 | $(CC) $(CFLAGS) $(VGFMAFLAG) $(ZINC) -c -o $@ $(SRCDIR)contrib/crc32vx/crc32_vx.c | ||
| 170 | |||
| 167 | deflate.o: $(SRCDIR)deflate.c | 171 | deflate.o: $(SRCDIR)deflate.c |
| 168 | $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c | 172 | $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c |
| 169 | 173 | ||
| @@ -214,6 +218,11 @@ crc32.lo: $(SRCDIR)crc32.c | |||
| 214 | $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c | 218 | $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c |
| 215 | -@mv objs/crc32.o $@ | 219 | -@mv objs/crc32.o $@ |
| 216 | 220 | ||
| 221 | crc32_vx.lo: $(SRCDIR)contrib/crc32vx/crc32_vx.c | ||
| 222 | -@mkdir objs 2>/dev/null || test -d objs | ||
| 223 | $(CC) $(SFLAGS) $(VGFMAFLAG) $(ZINC) -DPIC -c -o objs/crc32_vx.o $(SRCDIR)contrib/crc32vx/crc32_vx.c | ||
| 224 | -@mv objs/crc32_vx.o $@ | ||
| 225 | |||
| 217 | deflate.lo: $(SRCDIR)deflate.c | 226 | deflate.lo: $(SRCDIR)deflate.c |
| 218 | -@mkdir objs 2>/dev/null || test -d objs | 227 | -@mkdir objs 2>/dev/null || test -d objs |
| 219 | $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c | 228 | $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c |
| @@ -406,6 +415,7 @@ infback.o inflate.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees. | |||
| 406 | inffast.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h $(SRCDIR)inflate.h $(SRCDIR)inffast.h | 415 | inffast.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h $(SRCDIR)inflate.h $(SRCDIR)inffast.h |
| 407 | inftrees.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h | 416 | inftrees.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h |
| 408 | trees.o: $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)trees.h | 417 | trees.o: $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)trees.h |
| 418 | crc32_vx.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)contrib/crc32vx/crc32_vx_hooks.h | ||
| 409 | 419 | ||
| 410 | adler32.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h | 420 | adler32.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h |
| 411 | zutil.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)gzguts.h | 421 | zutil.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)gzguts.h |
| @@ -417,3 +427,4 @@ infback.lo inflate.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftree | |||
| 417 | inffast.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h $(SRCDIR)inflate.h $(SRCDIR)inffast.h | 427 | inffast.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h $(SRCDIR)inflate.h $(SRCDIR)inffast.h |
| 418 | inftrees.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h | 428 | inftrees.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h |
| 419 | trees.lo: $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)trees.h | 429 | trees.lo: $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)trees.h |
| 430 | crc32_vx.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)contrib/crc32vx/crc32_vx_hooks.h \ No newline at end of file | ||
| @@ -95,6 +95,7 @@ memory=0 | |||
| 95 | undefined=0 | 95 | undefined=0 |
| 96 | insecure=0 | 96 | insecure=0 |
| 97 | unknown=0 | 97 | unknown=0 |
| 98 | enable_crcvx=1 | ||
| 98 | old_cc="$CC" | 99 | old_cc="$CC" |
| 99 | old_cflags="$CFLAGS" | 100 | old_cflags="$CFLAGS" |
| 100 | OBJC='$(OBJZ) $(OBJG)' | 101 | OBJC='$(OBJZ) $(OBJG)' |
| @@ -122,6 +123,7 @@ case "$1" in | |||
| 122 | echo ' configure [--const] [--zprefix] [--prefix=PREFIX] [--eprefix=EXPREFIX]' | tee -a configure.log | 123 | echo ' configure [--const] [--zprefix] [--prefix=PREFIX] [--eprefix=EXPREFIX]' | tee -a configure.log |
| 123 | echo ' [--insecure] [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log | 124 | echo ' [--insecure] [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log |
| 124 | echo ' [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log | 125 | echo ' [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log |
| 126 | echo ' [--disable-crcvx]' | tee -a configure.log | ||
| 125 | exit 0 ;; | 127 | exit 0 ;; |
| 126 | -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;; | 128 | -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;; |
| 127 | -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;; | 129 | -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;; |
| @@ -150,6 +152,7 @@ case "$1" in | |||
| 150 | --memory) memory=1; shift ;; | 152 | --memory) memory=1; shift ;; |
| 151 | --undefined) undefined=1; shift ;; | 153 | --undefined) undefined=1; shift ;; |
| 152 | --insecure) insecure=1; shift ;; | 154 | --insecure) insecure=1; shift ;; |
| 155 | --disable-crcvx) enable_crcvx=0; shift ;; | ||
| 153 | *) unknown=1; echo "unknown option ignored: $1" | tee -a configure.log; shift;; | 156 | *) unknown=1; echo "unknown option ignored: $1" | tee -a configure.log; shift;; |
| 154 | esac | 157 | esac |
| 155 | done | 158 | done |
| @@ -888,6 +891,70 @@ EOF | |||
| 888 | fi | 891 | fi |
| 889 | fi | 892 | fi |
| 890 | 893 | ||
| 894 | # check for ibm s390x build | ||
| 895 | HAVE_S390X=0 | ||
| 896 | cat > $test.c << EOF | ||
| 897 | #ifndef __s390x__ | ||
| 898 | #error | ||
| 899 | #endif | ||
| 900 | EOF | ||
| 901 | if try $CC -c $CFLAGS $test.c; then | ||
| 902 | echo "Checking for s390x build ... Yes." | tee -a configure.log | ||
| 903 | HAVE_S390X=1 | ||
| 904 | else | ||
| 905 | echo "Checking for s390x build ... No." | tee -a configure.log | ||
| 906 | fi | ||
| 907 | |||
| 908 | # check for ibm s390x vx vector extensions | ||
| 909 | HAVE_S390X_VX=0 | ||
| 910 | if test $HAVE_S390X -eq 1 && test $enable_crcvx -eq 1 ; then | ||
| 911 | # preset the compiler specific flags | ||
| 912 | if test $clang -eq 1; then | ||
| 913 | VGFMAFLAG=-fzvector | ||
| 914 | else | ||
| 915 | VGFMAFLAG=-mzarch | ||
| 916 | fi | ||
| 917 | |||
| 918 | cat > $test.c <<EOF | ||
| 919 | #ifndef __s390x__ | ||
| 920 | #error | ||
| 921 | #endif | ||
| 922 | #include <vecintrin.h> | ||
| 923 | int main(void) { | ||
| 924 | unsigned long long a __attribute__((vector_size(16))) = { 0 }; | ||
| 925 | unsigned long long b __attribute__((vector_size(16))) = { 0 }; | ||
| 926 | unsigned char c __attribute__((vector_size(16))) = { 0 }; | ||
| 927 | c = vec_gfmsum_accum_128(a, b, c); | ||
| 928 | return c[0]; | ||
| 929 | } | ||
| 930 | EOF | ||
| 931 | |||
| 932 | # cflags already contains a valid march | ||
| 933 | if try $CC -c $CFLAGS $VGFMAFLAG $test.c; then | ||
| 934 | echo "Checking for s390x vx vector extension ... Yes." | tee -a configure.log | ||
| 935 | HAVE_S390X_VX=1 | ||
| 936 | # or set march for our compile units | ||
| 937 | elif try $CC -c $CFLAGS $VGFMAFLAG -march=z13 $test.c; then | ||
| 938 | echo "Checking for s390x vx vector extension (march=z13) ... Yes." | tee -a configure.log | ||
| 939 | HAVE_S390X_VX=1 | ||
| 940 | VGFMAFLAG="$VGFMAFLAG -march=z13" | ||
| 941 | # else the compiler cannot build the vx vector extension code | ||
| 942 | else | ||
| 943 | echo "Checking for s390x vx vector extension ... No." | tee -a configure.log | ||
| 944 | fi | ||
| 945 | |||
| 946 | # prepare compiling for s390x | ||
| 947 | if test $HAVE_S390X_VX -eq 1; then | ||
| 948 | CFLAGS="$CFLAGS -DHAVE_S390X_VX" | ||
| 949 | SFLAGS="$SFLAGS -DHAVE_S390X_VX" | ||
| 950 | OBJC="$OBJC crc32_vx.o" | ||
| 951 | PIC_OBJC="$PIC_OBJC crc32_vx.lo" | ||
| 952 | else | ||
| 953 | # target has no vx extension | ||
| 954 | VGFMAFLAG="" | ||
| 955 | fi | ||
| 956 | fi | ||
| 957 | |||
| 891 | # show the results in the log | 958 | # show the results in the log |
| 892 | echo >> configure.log | 959 | echo >> configure.log |
| 893 | echo ALL = $ALL >> configure.log | 960 | echo ALL = $ALL >> configure.log |
| @@ -919,6 +986,9 @@ echo mandir = $mandir >> configure.log | |||
| 919 | echo prefix = $prefix >> configure.log | 986 | echo prefix = $prefix >> configure.log |
| 920 | echo sharedlibdir = $sharedlibdir >> configure.log | 987 | echo sharedlibdir = $sharedlibdir >> configure.log |
| 921 | echo uname = $uname >> configure.log | 988 | echo uname = $uname >> configure.log |
| 989 | echo HAVE_S390X = $HAVE_S390X >> configure.log | ||
| 990 | echo HAVE_S390X_VX = $HAVE_S390X_VX >> configure.log | ||
| 991 | echo VGFMAFLAG = $VGFMAFLAG >> configure.log | ||
| 922 | 992 | ||
| 923 | # update Makefile with the configure results | 993 | # update Makefile with the configure results |
| 924 | sed < ${SRCDIR}Makefile.in " | 994 | sed < ${SRCDIR}Makefile.in " |
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 8ec3bbd9..37a3491f 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt | |||
| @@ -24,9 +24,15 @@ function(zlib_add_contrib_lib name description dir) | |||
| 24 | endfunction(zlib_add_contrib_lib name description dir) | 24 | endfunction(zlib_add_contrib_lib name description dir) |
| 25 | 25 | ||
| 26 | function(zlib_add_contrib_feature name description dir) | 26 | function(zlib_add_contrib_feature name description dir) |
| 27 | if(ARGC EQUAL 4) | ||
| 28 | set(default_on ${ARGV3}) | ||
| 29 | else() | ||
| 30 | set(default_on Off) | ||
| 31 | endif() | ||
| 32 | |||
| 27 | option(ZLIB_WITH_${name} | 33 | option(ZLIB_WITH_${name} |
| 28 | "Enable build ${description}" | 34 | "Enable build ${description}" |
| 29 | OFF) | 35 | ${default_on}) |
| 30 | 36 | ||
| 31 | if(ZLIB_WITH_${name}) | 37 | if(ZLIB_WITH_${name}) |
| 32 | add_subdirectory(${dir}/) | 38 | add_subdirectory(${dir}/) |
| @@ -38,6 +44,7 @@ zlib_add_contrib_feature("GVMAT64" | |||
| 38 | gcc_gvmat64) | 44 | gcc_gvmat64) |
| 39 | 45 | ||
| 40 | zlib_add_contrib_feature(INFBACK9 "with support for method 9 deflate" infback9) | 46 | zlib_add_contrib_feature(INFBACK9 "with support for method 9 deflate" infback9) |
| 47 | zlib_add_contrib_feature(CRC32VX "with S390X-CRC32VX implementation" crc32vx On) | ||
| 41 | zlib_add_contrib_lib(ADA "Ada bindings" ada) | 48 | zlib_add_contrib_lib(ADA "Ada bindings" ada) |
| 42 | zlib_add_contrib_lib(BLAST "blast binary" blast) | 49 | zlib_add_contrib_lib(BLAST "blast binary" blast) |
| 43 | zlib_add_contrib_lib(IOSTREAM3 "IOStream C++ bindings V3" iostream3) | 50 | zlib_add_contrib_lib(IOSTREAM3 "IOStream C++ bindings V3" iostream3) |
diff --git a/contrib/README.contrib b/contrib/README.contrib index d9480eea..173f1d48 100644 --- a/contrib/README.contrib +++ b/contrib/README.contrib | |||
| @@ -46,6 +46,9 @@ puff/ by Mark Adler <madler@alumni.caltech.edu> | |||
| 46 | Small, low memory usage inflate. Also serves to provide an | 46 | Small, low memory usage inflate. Also serves to provide an |
| 47 | unambiguous description of the deflate format. | 47 | unambiguous description of the deflate format. |
| 48 | 48 | ||
| 49 | crc32vx/ by Ilya Leoshkevich <iii@linux.ibm.com> | ||
| 50 | Hardware-accelerated CRC32 on IBM Z with Z13 VX extension. | ||
| 51 | |||
| 49 | testzlib/ by Gilles Vollant <info@winimage.com> | 52 | testzlib/ by Gilles Vollant <info@winimage.com> |
| 50 | Example of the use of zlib | 53 | Example of the use of zlib |
| 51 | 54 | ||
diff --git a/contrib/crc32vx/CMakeLists.txt b/contrib/crc32vx/CMakeLists.txt new file mode 100644 index 00000000..ee46fa3a --- /dev/null +++ b/contrib/crc32vx/CMakeLists.txt | |||
| @@ -0,0 +1,67 @@ | |||
| 1 | # check if we compile for IBM s390x | ||
| 2 | # | ||
| 3 | CHECK_C_SOURCE_COMPILES(" | ||
| 4 | #ifndef __s390x__ | ||
| 5 | #error | ||
| 6 | #endif | ||
| 7 | int main() {return 0;} | ||
| 8 | " HAS_S390X_SUPPORT) | ||
| 9 | |||
| 10 | # | ||
| 11 | # Check for IBM S390X - VX extensions | ||
| 12 | # | ||
| 13 | if(ZLIB_WITH_CRC32VX AND HAS_S390X_SUPPORT) | ||
| 14 | # preset the compiler specific flags | ||
| 15 | if (CMAKE_C_COMPILER_ID STREQUAL "Clang") | ||
| 16 | set(VGFMAFLAG "-fzvector") | ||
| 17 | else() | ||
| 18 | set(VGFMAFLAG "-mzarch") | ||
| 19 | endif(CMAKE_C_COMPILER_ID STREQUAL "Clang") | ||
| 20 | |||
| 21 | set(S390X_VX_TEST | ||
| 22 | "#ifndef __s390x__ \n\ | ||
| 23 | #error \n\ | ||
| 24 | #endif \n\ | ||
| 25 | #include <vecintrin.h> \n\ | ||
| 26 | int main(void) { \ | ||
| 27 | unsigned long long a __attribute__((vector_size(16))) = { 0 }; \ | ||
| 28 | unsigned long long b __attribute__((vector_size(16))) = { 0 }; \ | ||
| 29 | unsigned char c __attribute__((vector_size(16))) = { 0 }; \ | ||
| 30 | c = vec_gfmsum_accum_128(a, b, c); \ | ||
| 31 | return c[0]; \ | ||
| 32 | }") | ||
| 33 | |||
| 34 | # cflags already contains a valid march | ||
| 35 | set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG}") | ||
| 36 | check_c_source_compiles("${S390X_VX_TEST}" HAS_S390X_VX_SUPPORT) | ||
| 37 | unset(CMAKE_REQUIRED_FLAGS) | ||
| 38 | |||
| 39 | # or set march for our compile units | ||
| 40 | if(NOT HAS_S390X_VX_SUPPORT) | ||
| 41 | set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} -march=z13") | ||
| 42 | check_c_source_compiles("${S390X_VX_TEST}" HAS_Z13_S390X_VX_SUPPORT) | ||
| 43 | unset(CMAKE_REQUIRED_FLAGS ) | ||
| 44 | list(APPEND VGFMAFLAG "-march=z13") | ||
| 45 | endif(NOT HAS_S390X_VX_SUPPORT) | ||
| 46 | |||
| 47 | # prepare compiling for s390x | ||
| 48 | if(HAS_S390X_VX_SUPPORT OR HAS_Z13_S390X_VX_SUPPORT) | ||
| 49 | if(ZLIB_BUILD_SHARED) | ||
| 50 | target_sources(zlib | ||
| 51 | PRIVATE | ||
| 52 | crc32_vx.c | ||
| 53 | crc32_vx_hooks.h) | ||
| 54 | target_compile_definitions(zlib PUBLIC -DHAVE_S390X_VX=1) | ||
| 55 | endif(ZLIB_BUILD_SHARED) | ||
| 56 | if(ZLIB_BUILD_STATIC) | ||
| 57 | target_sources(zlibstatic | ||
| 58 | PRIVATE | ||
| 59 | crc32_vx.c | ||
| 60 | crc32_vx_hooks.h) | ||
| 61 | target_compile_definitions(zlibstatic PUBLIC -DHAVE_S390X_VX=1) | ||
| 62 | endif(ZLIB_BUILD_STATIC) | ||
| 63 | set_source_files_properties( | ||
| 64 | crc32_vx.c | ||
| 65 | PROPERTIES COMPILE_OPTIONS "${VGFMAFLAG}") | ||
| 66 | endif(HAS_S390X_VX_SUPPORT OR HAS_Z13_S390X_VX_SUPPORT) | ||
| 67 | endif(ZLIB_WITH_CRC32VX AND HAS_S390X_SUPPORT) | ||
diff --git a/contrib/crc32vx/README b/contrib/crc32vx/README new file mode 100644 index 00000000..329610d5 --- /dev/null +++ b/contrib/crc32vx/README | |||
| @@ -0,0 +1,9 @@ | |||
| 1 | IBM Z mainframes starting from version z13 provide vector instructions, which | ||
| 2 | allow vectorization of crc32. This extension is built by default when targeting | ||
| 3 | ibm s390x. However, this extension can be disabled if desired: | ||
| 4 | |||
| 5 | # for configure build | ||
| 6 | $ ./configure --disable-crcvx | ||
| 7 | |||
| 8 | # for cmake build | ||
| 9 | $ cmake .. -DZLIB_WITH_CRC32VX=off | ||
diff --git a/contrib/crc32vx/crc32_vx.c b/contrib/crc32vx/crc32_vx.c new file mode 100644 index 00000000..e718e340 --- /dev/null +++ b/contrib/crc32vx/crc32_vx.c | |||
| @@ -0,0 +1,254 @@ | |||
| 1 | /* | ||
| 2 | * Hardware-accelerated CRC-32 variants for Linux on z Systems | ||
| 3 | * | ||
| 4 | * Use the z/Architecture Vector Extension Facility to accelerate the | ||
| 5 | * computing of bitreflected CRC-32 checksums. | ||
| 6 | * | ||
| 7 | * This CRC-32 implementation algorithm is bitreflected and processes | ||
| 8 | * the least-significant bit first (Little-Endian). | ||
| 9 | * | ||
| 10 | * This code was originally written by Hendrik Brueckner | ||
| 11 | * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been | ||
| 12 | * relicensed under the zlib license. | ||
| 13 | */ | ||
| 14 | #define Z_ONCE | ||
| 15 | #include "../../zutil.h" | ||
| 16 | #include "crc32_vx_hooks.h" | ||
| 17 | |||
| 18 | #include <stdint.h> | ||
| 19 | #include <stdio.h> | ||
| 20 | #include <vecintrin.h> | ||
| 21 | #include <sys/auxv.h> | ||
| 22 | |||
| 23 | #ifdef __clang__ | ||
| 24 | # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) | ||
| 25 | # error crc32_vx optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. \ | ||
| 26 | Either disable the zlib crc32_vx optimization, or switch to another compiler/compiler version. | ||
| 27 | # endif | ||
| 28 | #endif | ||
| 29 | |||
| 30 | #define VX_MIN_LEN 64 | ||
| 31 | #define VX_ALIGNMENT 16L | ||
| 32 | #define VX_ALIGN_MASK (VX_ALIGNMENT - 1) | ||
| 33 | |||
| 34 | typedef unsigned char uv16qi __attribute__((vector_size(16))); | ||
| 35 | typedef unsigned int uv4si __attribute__((vector_size(16))); | ||
| 36 | typedef unsigned long long uv2di __attribute__((vector_size(16))); | ||
| 37 | |||
| 38 | local uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) { | ||
| 39 | /* | ||
| 40 | * The CRC-32 constant block contains reduction constants to fold and | ||
| 41 | * process particular chunks of the input data stream in parallel. | ||
| 42 | * | ||
| 43 | * For the CRC-32 variants, the constants are precomputed according to | ||
| 44 | * these definitions: | ||
| 45 | * | ||
| 46 | * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 | ||
| 47 | * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 | ||
| 48 | * R3 = [(x128+32 mod P'(x) << 32)]' << 1 | ||
| 49 | * R4 = [(x128-32 mod P'(x) << 32)]' << 1 | ||
| 50 | * R5 = [(x64 mod P'(x) << 32)]' << 1 | ||
| 51 | * R6 = [(x32 mod P'(x) << 32)]' << 1 | ||
| 52 | * | ||
| 53 | * The bitreflected Barrett reduction constant, u', is defined as | ||
| 54 | * the bit reversal of floor(x**64 / P(x)). | ||
| 55 | * | ||
| 56 | * where P(x) is the polynomial in the normal domain and the P'(x) is the | ||
| 57 | * polynomial in the reversed (bitreflected) domain. | ||
| 58 | * | ||
| 59 | * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: | ||
| 60 | * | ||
| 61 | * P(x) = 0x04C11DB7 | ||
| 62 | * P'(x) = 0xEDB88320 | ||
| 63 | */ | ||
| 64 | const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */ | ||
| 65 | const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */ | ||
| 66 | const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */ | ||
| 67 | const uv2di r5 = {0, 0x163CD6124}; /* R5 */ | ||
| 68 | const uv2di ru_poly = {0, 0x1F7011641}; /* u' */ | ||
| 69 | const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */ | ||
| 70 | |||
| 71 | /* | ||
| 72 | * Load the initial CRC value. | ||
| 73 | * | ||
| 74 | * The CRC value is loaded into the rightmost word of the | ||
| 75 | * vector register and is later XORed with the LSB portion | ||
| 76 | * of the loaded input data. | ||
| 77 | */ | ||
| 78 | uv2di v0 = {0, 0}; | ||
| 79 | v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3); | ||
| 80 | |||
| 81 | /* Load a 64-byte data chunk and XOR with CRC */ | ||
| 82 | uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be); | ||
| 83 | uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be); | ||
| 84 | uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be); | ||
| 85 | uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be); | ||
| 86 | |||
| 87 | v1 ^= v0; | ||
| 88 | buf += 64; | ||
| 89 | len -= 64; | ||
| 90 | |||
| 91 | while (len >= 64) { | ||
| 92 | /* Load the next 64-byte data chunk */ | ||
| 93 | uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be); | ||
| 94 | uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be); | ||
| 95 | uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be); | ||
| 96 | uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be); | ||
| 97 | |||
| 98 | /* | ||
| 99 | * Perform a GF(2) multiplication of the doublewords in V1 with | ||
| 100 | * the R1 and R2 reduction constants in V0. The intermediate result | ||
| 101 | * is then folded (accumulated) with the next data chunk in PART1 and | ||
| 102 | * stored in V1. Repeat this step for the register contents | ||
| 103 | * in V2, V3, and V4 respectively. | ||
| 104 | */ | ||
| 105 | v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1); | ||
| 106 | v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2); | ||
| 107 | v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3); | ||
| 108 | v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4); | ||
| 109 | |||
| 110 | buf += 64; | ||
| 111 | len -= 64; | ||
| 112 | } | ||
| 113 | |||
| 114 | /* | ||
| 115 | * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 | ||
| 116 | * and R4 and accumulate the next 128-bit chunk until a single 128-bit | ||
| 117 | * value remains. | ||
| 118 | */ | ||
| 119 | v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2); | ||
| 120 | v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3); | ||
| 121 | v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4); | ||
| 122 | |||
| 123 | while (len >= 16) { | ||
| 124 | /* Load next data chunk */ | ||
| 125 | v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be); | ||
| 126 | |||
| 127 | /* Fold next data chunk */ | ||
| 128 | v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2); | ||
| 129 | |||
| 130 | buf += 16; | ||
| 131 | len -= 16; | ||
| 132 | } | ||
| 133 | |||
| 134 | /* | ||
| 135 | * Set up a vector register for byte shifts. The shift value must | ||
| 136 | * be loaded in bits 1-4 in byte element 7 of a vector register. | ||
| 137 | * Shift by 8 bytes: 0x40 | ||
| 138 | * Shift by 4 bytes: 0x20 | ||
| 139 | */ | ||
| 140 | uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | ||
| 141 | v9 = vec_insert((unsigned char)0x40, v9, 7); | ||
| 142 | |||
| 143 | /* | ||
| 144 | * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes | ||
| 145 | * to move R4 into the rightmost doubleword and set the leftmost | ||
| 146 | * doubleword to 0x1. | ||
| 147 | */ | ||
| 148 | v0 = vec_srb(r4r3, (uv2di)v9); | ||
| 149 | v0[0] = 1; | ||
| 150 | |||
| 151 | /* | ||
| 152 | * Compute GF(2) product of V1 and V0. The rightmost doubleword | ||
| 153 | * of V1 is multiplied with R4. The leftmost doubleword of V1 is | ||
| 154 | * multiplied by 0x1 and is then XORed with rightmost product. | ||
| 155 | * Implicitly, the intermediate leftmost product becomes padded | ||
| 156 | */ | ||
| 157 | v1 = (uv2di)vec_gfmsum_128(v0, v1); | ||
| 158 | |||
| 159 | /* | ||
| 160 | * Now do the final 32-bit fold by multiplying the rightmost word | ||
| 161 | * in V1 with R5 and XOR the result with the remaining bits in V1. | ||
| 162 | * | ||
| 163 | * To achieve this by a single VGFMAG, right shift V1 by a word | ||
| 164 | * and store the result in V2 which is then accumulated. Use the | ||
| 165 | * vector unpack instruction to load the rightmost half of the | ||
| 166 | * doubleword into the rightmost doubleword element of V1; the other | ||
| 167 | * half is loaded in the leftmost doubleword. | ||
| 168 | * The vector register with CONST_R5 contains the R5 constant in the | ||
| 169 | * rightmost doubleword and the leftmost doubleword is zero to ignore | ||
| 170 | * the leftmost product of V1. | ||
| 171 | */ | ||
| 172 | v9 = vec_insert((unsigned char)0x20, v9, 7); | ||
| 173 | v2 = vec_srb(v1, (uv2di)v9); | ||
| 174 | v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */ | ||
| 175 | v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2); | ||
| 176 | |||
| 177 | /* | ||
| 178 | * Apply a Barrett reduction to compute the final 32-bit CRC value. | ||
| 179 | * | ||
| 180 | * The input values to the Barrett reduction are the degree-63 polynomial | ||
| 181 | * in V1 (R(x)), degree-32 generator polynomial, and the reduction | ||
| 182 | * constant u. The Barrett reduction result is the CRC value of R(x) mod | ||
| 183 | * P(x). | ||
| 184 | * | ||
| 185 | * The Barrett reduction algorithm is defined as: | ||
| 186 | * | ||
| 187 | * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u | ||
| 188 | * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) | ||
| 189 | * 3. C(x) = R(x) XOR T2(x) mod x^32 | ||
| 190 | * | ||
| 191 | * Note: The leftmost doubleword of vector register containing | ||
| 192 | * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product | ||
| 193 | * is zero and does not contribute to the final result. | ||
| 194 | */ | ||
| 195 | |||
| 196 | /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ | ||
| 197 | v2 = vec_unpackl((uv4si)v1); | ||
| 198 | v2 = (uv2di)vec_gfmsum_128(ru_poly, v2); | ||
| 199 | |||
| 200 | /* | ||
| 201 | * Compute the GF(2) product of the CRC polynomial with T1(x) in | ||
| 202 | * V2 and XOR the intermediate result, T2(x), with the value in V1. | ||
| 203 | * The final result is stored in word element 2 of V2. | ||
| 204 | */ | ||
| 205 | v2 = vec_unpackl((uv4si)v2); | ||
| 206 | v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1); | ||
| 207 | |||
| 208 | return ((uv4si)v2)[2]; | ||
| 209 | } | ||
| 210 | |||
| 211 | |||
| 212 | local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len) | ||
| 213 | { | ||
| 214 | uintptr_t prealign, aligned, remaining; | ||
| 215 | |||
| 216 | if (buf == Z_NULL) return 0UL; | ||
| 217 | |||
| 218 | if (len < VX_MIN_LEN + VX_ALIGN_MASK) | ||
| 219 | return crc32_z(crc, buf, len); | ||
| 220 | |||
| 221 | if ((uintptr_t)buf & VX_ALIGN_MASK) { | ||
| 222 | prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK); | ||
| 223 | len -= prealign; | ||
| 224 | crc = crc32_z(crc, buf, prealign); | ||
| 225 | buf += prealign; | ||
| 226 | } | ||
| 227 | aligned = len & ~VX_ALIGN_MASK; | ||
| 228 | remaining = len & VX_ALIGN_MASK; | ||
| 229 | |||
| 230 | crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff; | ||
| 231 | |||
| 232 | if (remaining) | ||
| 233 | crc = crc32_z(crc, buf + aligned, remaining); | ||
| 234 | |||
| 235 | return crc; | ||
| 236 | } | ||
| 237 | |||
| 238 | local z_once_t s390_crc32_made = Z_ONCE_INIT; | ||
| 239 | local void s390_crc32_setup() { | ||
| 240 | unsigned long hwcap = getauxval(AT_HWCAP); | ||
| 241 | |||
| 242 | if (hwcap & HWCAP_S390_VX) | ||
| 243 | crc32_z_hook = s390_crc32_vx; | ||
| 244 | else | ||
| 245 | crc32_z_hook = crc32_z; | ||
| 246 | } | ||
| 247 | |||
| 248 | local unsigned long s390_crc32_init(unsigned long crc, const unsigned char FAR *buf, z_size_t len) | ||
| 249 | { | ||
| 250 | z_once(&s390_crc32_made,s390_crc32_setup); | ||
| 251 | return crc32_z_hook(crc, buf, len); | ||
| 252 | } | ||
| 253 | |||
| 254 | ZLIB_INTERNAL unsigned long (*crc32_z_hook)(unsigned long crc, const unsigned char FAR *buf, z_size_t len) = s390_crc32_init; | ||
diff --git a/contrib/crc32vx/crc32_vx_hooks.h b/contrib/crc32vx/crc32_vx_hooks.h new file mode 100644 index 00000000..951c3188 --- /dev/null +++ b/contrib/crc32vx/crc32_vx_hooks.h | |||
| @@ -0,0 +1,9 @@ | |||
| 1 | #ifndef CRC32_VX_HOOKS_H | ||
| 2 | #define CRC32_VX_HOOKS_H | ||
| 3 | |||
| 4 | /** | ||
| 5 | * CRC HOOKS | ||
| 6 | */ | ||
| 7 | ZLIB_INTERNAL extern unsigned long (*crc32_z_hook)(unsigned long crc, const unsigned char FAR *buf, z_size_t len); | ||
| 8 | |||
| 9 | #endif /* CRC32_VX_HOOKS_H */ | ||
| @@ -32,6 +32,10 @@ | |||
| 32 | 32 | ||
| 33 | #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */ | 33 | #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */ |
| 34 | 34 | ||
| 35 | #ifdef HAVE_S390X_VX | ||
| 36 | # include "contrib/crc32vx/crc32_vx_hooks.h" | ||
| 37 | #endif | ||
| 38 | |||
| 35 | /* | 39 | /* |
| 36 | A CRC of a message is computed on N braids of words in the message, where | 40 | A CRC of a message is computed on N braids of words in the message, where |
| 37 | each word consists of W bytes (4 or 8). If N is 3, for example, then three | 41 | each word consists of W bytes (4 or 8). If N is 3, for example, then three |
| @@ -942,6 +946,9 @@ unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf, | |||
| 942 | /* ========================================================================= */ | 946 | /* ========================================================================= */ |
| 943 | unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf, | 947 | unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf, |
| 944 | uInt len) { | 948 | uInt len) { |
| 949 | #ifdef HAVE_S390X_VX | ||
| 950 | return crc32_z_hook(crc, buf, len); | ||
| 951 | #endif | ||
| 945 | return crc32_z(crc, buf, len); | 952 | return crc32_z(crc, buf, len); |
| 946 | } | 953 | } |
| 947 | 954 | ||
