aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIlya Leoshkevich <iii@linux.ibm.com>2025-09-10 11:28:03 +0200
committerMark Adler <git@madler.net>2026-01-27 20:22:17 -0800
commit07f2d4237eade624182b1cf11f1f516985aed620 (patch)
treefad27d61fc7e05c8bc08662fd1caa4fb2bea928e
parent3382ba45561ea82a1d8976578b2a41facff3b8bc (diff)
downloadzlib-07f2d4237eade624182b1cf11f1f516985aed620.tar.gz
zlib-07f2d4237eade624182b1cf11f1f516985aed620.tar.bz2
zlib-07f2d4237eade624182b1cf11f1f516985aed620.zip
Vectorize the CRC-32 calculation on the s390x.
Use vector extensions when compiling for s390x and binutils knows about them. At runtime, check whether kernel supports vector extensions (it has to be not just the CPU, but also the kernel) and choose between the regular and the vectorized implementations. Co-authored-by: Eduard Stefes <eddy@linux.ibm.com>
-rw-r--r--CMakeLists.txt2
-rw-r--r--Makefile.in11
-rwxr-xr-xconfigure70
-rw-r--r--contrib/CMakeLists.txt9
-rw-r--r--contrib/README.contrib3
-rw-r--r--contrib/crc32vx/CMakeLists.txt67
-rw-r--r--contrib/crc32vx/README9
-rw-r--r--contrib/crc32vx/crc32_vx.c254
-rw-r--r--contrib/crc32vx/crc32_vx_hooks.h9
-rw-r--r--crc32.c7
10 files changed, 439 insertions, 2 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8aa8751f..8a8fde79 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,7 +103,7 @@ check_include_file(unistd.h HAVE_UNISTD_H)
103if(MSVC) 103if(MSVC)
104 set(CMAKE_REQUIRED_FLAGS "-WX") 104 set(CMAKE_REQUIRED_FLAGS "-WX")
105else(MSVC) 105else(MSVC)
106 set(CMAKE_REQUIRED_FLAGS "-WError") 106 set(CMAKE_REQUIRED_FLAGS "-Werror")
107endif(MSVC) 107endif(MSVC)
108 108
109check_c_source_compiles( 109check_c_source_compiles(
diff --git a/Makefile.in b/Makefile.in
index 52d5c2c4..d9de5630 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -27,6 +27,7 @@ LDFLAGS=
27TEST_LIBS=-L. libz.a 27TEST_LIBS=-L. libz.a
28LDSHARED=$(CC) 28LDSHARED=$(CC)
29CPP=$(CC) -E 29CPP=$(CC) -E
30VGFMAFLAG=
30 31
31STATICLIB=libz.a 32STATICLIB=libz.a
32SHAREDLIB=libz.so 33SHAREDLIB=libz.so
@@ -164,6 +165,9 @@ adler32.o: $(SRCDIR)adler32.c
164crc32.o: $(SRCDIR)crc32.c 165crc32.o: $(SRCDIR)crc32.c
165 $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c 166 $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
166 167
168crc32_vx.o: $(SRCDIR)contrib/crc32vx/crc32_vx.c
169 $(CC) $(CFLAGS) $(VGFMAFLAG) $(ZINC) -c -o $@ $(SRCDIR)contrib/crc32vx/crc32_vx.c
170
167deflate.o: $(SRCDIR)deflate.c 171deflate.o: $(SRCDIR)deflate.c
168 $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c 172 $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
169 173
@@ -214,6 +218,11 @@ crc32.lo: $(SRCDIR)crc32.c
214 $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c 218 $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
215 -@mv objs/crc32.o $@ 219 -@mv objs/crc32.o $@
216 220
221crc32_vx.lo: $(SRCDIR)contrib/crc32vx/crc32_vx.c
222 -@mkdir objs 2>/dev/null || test -d objs
223 $(CC) $(SFLAGS) $(VGFMAFLAG) $(ZINC) -DPIC -c -o objs/crc32_vx.o $(SRCDIR)contrib/crc32vx/crc32_vx.c
224 -@mv objs/crc32_vx.o $@
225
217deflate.lo: $(SRCDIR)deflate.c 226deflate.lo: $(SRCDIR)deflate.c
218 -@mkdir objs 2>/dev/null || test -d objs 227 -@mkdir objs 2>/dev/null || test -d objs
219 $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c 228 $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
@@ -406,6 +415,7 @@ infback.o inflate.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.
406inffast.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h $(SRCDIR)inflate.h $(SRCDIR)inffast.h 415inffast.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h $(SRCDIR)inflate.h $(SRCDIR)inffast.h
407inftrees.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h 416inftrees.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h
408trees.o: $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)trees.h 417trees.o: $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)trees.h
418crc32_vx.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)contrib/crc32vx/crc32_vx_hooks.h
409 419
410adler32.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h 420adler32.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h
411zutil.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)gzguts.h 421zutil.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)gzguts.h
@@ -417,3 +427,4 @@ infback.lo inflate.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftree
417inffast.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h $(SRCDIR)inflate.h $(SRCDIR)inffast.h 427inffast.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h $(SRCDIR)inflate.h $(SRCDIR)inffast.h
418inftrees.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h 428inftrees.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h
419trees.lo: $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)trees.h 429trees.lo: $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)trees.h
430crc32_vx.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)contrib/crc32vx/crc32_vx_hooks.h \ No newline at end of file
diff --git a/configure b/configure
index 3770b03d..05bee1f4 100755
--- a/configure
+++ b/configure
@@ -95,6 +95,7 @@ memory=0
95undefined=0 95undefined=0
96insecure=0 96insecure=0
97unknown=0 97unknown=0
98enable_crcvx=1
98old_cc="$CC" 99old_cc="$CC"
99old_cflags="$CFLAGS" 100old_cflags="$CFLAGS"
100OBJC='$(OBJZ) $(OBJG)' 101OBJC='$(OBJZ) $(OBJG)'
@@ -122,6 +123,7 @@ case "$1" in
122 echo ' configure [--const] [--zprefix] [--prefix=PREFIX] [--eprefix=EXPREFIX]' | tee -a configure.log 123 echo ' configure [--const] [--zprefix] [--prefix=PREFIX] [--eprefix=EXPREFIX]' | tee -a configure.log
123 echo ' [--insecure] [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log 124 echo ' [--insecure] [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log
124 echo ' [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log 125 echo ' [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log
126 echo ' [--disable-crcvx]' | tee -a configure.log
125 exit 0 ;; 127 exit 0 ;;
126 -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;; 128 -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;;
127 -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;; 129 -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;;
@@ -150,6 +152,7 @@ case "$1" in
150 --memory) memory=1; shift ;; 152 --memory) memory=1; shift ;;
151 --undefined) undefined=1; shift ;; 153 --undefined) undefined=1; shift ;;
152 --insecure) insecure=1; shift ;; 154 --insecure) insecure=1; shift ;;
155 --disable-crcvx) enable_crcvx=0; shift ;;
153 *) unknown=1; echo "unknown option ignored: $1" | tee -a configure.log; shift;; 156 *) unknown=1; echo "unknown option ignored: $1" | tee -a configure.log; shift;;
154 esac 157 esac
155done 158done
@@ -888,6 +891,70 @@ EOF
888 fi 891 fi
889fi 892fi
890 893
894# check for ibm s390x build
895HAVE_S390X=0
896cat > $test.c << EOF
897#ifndef __s390x__
898 #error
899#endif
900EOF
901if try $CC -c $CFLAGS $test.c; then
902 echo "Checking for s390x build ... Yes." | tee -a configure.log
903 HAVE_S390X=1
904else
905 echo "Checking for s390x build ... No." | tee -a configure.log
906fi
907
908# check for ibm s390x vx vector extensions
909HAVE_S390X_VX=0
910if test $HAVE_S390X -eq 1 && test $enable_crcvx -eq 1 ; then
911 # preset the compiler specific flags
912 if test $clang -eq 1; then
913 VGFMAFLAG=-fzvector
914 else
915 VGFMAFLAG=-mzarch
916 fi
917
918 cat > $test.c <<EOF
919#ifndef __s390x__
920#error
921#endif
922#include <vecintrin.h>
923int main(void) {
924 unsigned long long a __attribute__((vector_size(16))) = { 0 };
925 unsigned long long b __attribute__((vector_size(16))) = { 0 };
926 unsigned char c __attribute__((vector_size(16))) = { 0 };
927 c = vec_gfmsum_accum_128(a, b, c);
928 return c[0];
929}
930EOF
931
932 # cflags already contains a valid march
933 if try $CC -c $CFLAGS $VGFMAFLAG $test.c; then
934 echo "Checking for s390x vx vector extension ... Yes." | tee -a configure.log
935 HAVE_S390X_VX=1
936 # or set march for our compile units
937 elif try $CC -c $CFLAGS $VGFMAFLAG -march=z13 $test.c; then
938 echo "Checking for s390x vx vector extension (march=z13) ... Yes." | tee -a configure.log
939 HAVE_S390X_VX=1
940 VGFMAFLAG="$VGFMAFLAG -march=z13"
941 # else the toolchain cannot build the s390x vx vector extension
942 else
943 echo "Checking for s390x vx vector extension ... No." | tee -a configure.log
944 fi
945
946 # prepare compiling for s390x
947 if test $HAVE_S390X_VX -eq 1; then
948 CFLAGS="$CFLAGS -DHAVE_S390X_VX"
949 SFLAGS="$SFLAGS -DHAVE_S390X_VX"
950 OBJC="$OBJC crc32_vx.o"
951 PIC_OBJC="$PIC_OBJC crc32_vx.lo"
952 else
953 # target has no vx extension
954 VGFMAFLAG=""
955 fi
956fi
957
891# show the results in the log 958# show the results in the log
892echo >> configure.log 959echo >> configure.log
893echo ALL = $ALL >> configure.log 960echo ALL = $ALL >> configure.log
@@ -919,6 +986,9 @@ echo mandir = $mandir >> configure.log
919echo prefix = $prefix >> configure.log 986echo prefix = $prefix >> configure.log
920echo sharedlibdir = $sharedlibdir >> configure.log 987echo sharedlibdir = $sharedlibdir >> configure.log
921echo uname = $uname >> configure.log 988echo uname = $uname >> configure.log
989echo HAVE_S390X = $HAVE_S390X >> configure.log
990echo HAVE_S390X_VX = $HAVE_S390X_VX >> configure.log
991echo VGFMAFLAG = $VGFMAFLAG >> configure.log
922 992
923# update Makefile with the configure results 993# update Makefile with the configure results
924sed < ${SRCDIR}Makefile.in " 994sed < ${SRCDIR}Makefile.in "
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index 8ec3bbd9..37a3491f 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -24,9 +24,15 @@ function(zlib_add_contrib_lib name description dir)
24endfunction(zlib_add_contrib_lib name description dir) 24endfunction(zlib_add_contrib_lib name description dir)
25 25
26function(zlib_add_contrib_feature name description dir) 26function(zlib_add_contrib_feature name description dir)
27 if(ARGC EQUAL 4)
28 set(default_on ${ARGV3})
29 else()
30 set(default_on Off)
31 endif()
32
27 option(ZLIB_WITH_${name} 33 option(ZLIB_WITH_${name}
28 "Enable build ${description}" 34 "Enable build ${description}"
29 OFF) 35 ${default_on})
30 36
31 if(ZLIB_WITH_${name}) 37 if(ZLIB_WITH_${name})
32 add_subdirectory(${dir}/) 38 add_subdirectory(${dir}/)
@@ -38,6 +44,7 @@ zlib_add_contrib_feature("GVMAT64"
38 gcc_gvmat64) 44 gcc_gvmat64)
39 45
40zlib_add_contrib_feature(INFBACK9 "with support for method 9 deflate" infback9) 46zlib_add_contrib_feature(INFBACK9 "with support for method 9 deflate" infback9)
47zlib_add_contrib_feature(CRC32VX "with S390X-CRC32VX implementation" crc32vx On)
41zlib_add_contrib_lib(ADA "Ada bindings" ada) 48zlib_add_contrib_lib(ADA "Ada bindings" ada)
42zlib_add_contrib_lib(BLAST "blast binary" blast) 49zlib_add_contrib_lib(BLAST "blast binary" blast)
43zlib_add_contrib_lib(IOSTREAM3 "IOStream C++ bindings V3" iostream3) 50zlib_add_contrib_lib(IOSTREAM3 "IOStream C++ bindings V3" iostream3)
diff --git a/contrib/README.contrib b/contrib/README.contrib
index d9480eea..173f1d48 100644
--- a/contrib/README.contrib
+++ b/contrib/README.contrib
@@ -46,6 +46,9 @@ puff/ by Mark Adler <madler@alumni.caltech.edu>
46 Small, low memory usage inflate. Also serves to provide an 46 Small, low memory usage inflate. Also serves to provide an
47 unambiguous description of the deflate format. 47 unambiguous description of the deflate format.
48 48
49crc32vx/ by Ilya Leoshkevich <iii@linux.ibm.com>
50 Hardware-accelerated CRC32 on IBM Z with Z13 VX extension.
51
49testzlib/ by Gilles Vollant <info@winimage.com> 52testzlib/ by Gilles Vollant <info@winimage.com>
50 Example of the use of zlib 53 Example of the use of zlib
51 54
diff --git a/contrib/crc32vx/CMakeLists.txt b/contrib/crc32vx/CMakeLists.txt
new file mode 100644
index 00000000..ee46fa3a
--- /dev/null
+++ b/contrib/crc32vx/CMakeLists.txt
@@ -0,0 +1,67 @@
1# check if we compile for IBM s390x
2#
3CHECK_C_SOURCE_COMPILES("
4#ifndef __s390x__
5 #error
6#endif
7int main() {return 0;}
8" HAS_S390X_SUPPORT)
9
10#
11# Check for IBM S390X - VX extensions
12#
13if(ZLIB_WITH_CRC32VX AND HAS_S390X_SUPPORT)
14 # preset the compiler specific flags
15 if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
16 set(VGFMAFLAG "-fzvector")
17 else()
18 set(VGFMAFLAG "-mzarch")
19 endif(CMAKE_C_COMPILER_ID STREQUAL "Clang")
20
21 set(S390X_VX_TEST
22 "#ifndef __s390x__ \n\
23 #error \n\
24 #endif \n\
25 #include <vecintrin.h> \n\
26 int main(void) { \
27 unsigned long long a __attribute__((vector_size(16))) = { 0 }; \
28 unsigned long long b __attribute__((vector_size(16))) = { 0 }; \
29 unsigned char c __attribute__((vector_size(16))) = { 0 }; \
30 c = vec_gfmsum_accum_128(a, b, c); \
31 return c[0]; \
32 }")
33
34 # cflags already contains a valid march
35 set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG}")
36 check_c_source_compiles("${S390X_VX_TEST}" HAS_S390X_VX_SUPPORT)
37 unset(CMAKE_REQUIRED_FLAGS)
38
39 # or set march for our compile units
40 if(NOT HAS_S390X_VX_SUPPORT)
41 set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} -march=z13")
42 check_c_source_compiles("${S390X_VX_TEST}" HAS_Z13_S390X_VX_SUPPORT)
43 unset(CMAKE_REQUIRED_FLAGS )
44 list(APPEND VGFMAFLAG "-march=z13")
45 endif(NOT HAS_S390X_VX_SUPPORT)
46
47 # prepare compiling for s390x
48 if(HAS_S390X_VX_SUPPORT OR HAS_Z13_S390X_VX_SUPPORT)
49 if(ZLIB_BUILD_SHARED)
50 target_sources(zlib
51 PRIVATE
52 crc32_vx.c
53 crc32_vx_hooks.h)
54 target_compile_definitions(zlib PUBLIC -DHAVE_S390X_VX=1)
55 endif(ZLIB_BUILD_SHARED)
56 if(ZLIB_BUILD_STATIC)
57 target_sources(zlibstatic
58 PRIVATE
59 crc32_vx.c
60 crc32_vx_hooks.h)
61 target_compile_definitions(zlibstatic PUBLIC -DHAVE_S390X_VX=1)
62 endif(ZLIB_BUILD_STATIC)
63 set_source_files_properties(
64 crc32_vx.c
65 PROPERTIES COMPILE_OPTIONS "${VGFMAFLAG}")
66 endif(HAS_S390X_VX_SUPPORT OR HAS_Z13_S390X_VX_SUPPORT)
67endif(ZLIB_WITH_CRC32VX AND HAS_S390X_SUPPORT)
diff --git a/contrib/crc32vx/README b/contrib/crc32vx/README
new file mode 100644
index 00000000..329610d5
--- /dev/null
+++ b/contrib/crc32vx/README
@@ -0,0 +1,9 @@
1IBM Z mainframes starting from version z13 provide vector instructions, which
2allow vectorization of crc32. This extension is built by default when targeting
3ibm s390x. However, this extension can be disabled if desired:
4
5 # for configure build
6 $ ./configure --disable-crcvx
7
8 # for cmake build
9 $ cmake .. -DZLIB_WITH_CRC32VX=off
diff --git a/contrib/crc32vx/crc32_vx.c b/contrib/crc32vx/crc32_vx.c
new file mode 100644
index 00000000..e718e340
--- /dev/null
+++ b/contrib/crc32vx/crc32_vx.c
@@ -0,0 +1,254 @@
1/*
2 * Hardware-accelerated CRC-32 variants for Linux on z Systems
3 *
4 * Use the z/Architecture Vector Extension Facility to accelerate the
5 * computing of bitreflected CRC-32 checksums.
6 *
7 * This CRC-32 implementation algorithm is bitreflected and processes
8 * the least-significant bit first (Little-Endian).
9 *
10 * This code was originally written by Hendrik Brueckner
11 * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
12 * relicensed under the zlib license.
13 */
14#define Z_ONCE
15#include "../../zutil.h"
16#include "crc32_vx_hooks.h"
17
18#include <stdint.h>
19#include <stdio.h>
20#include <vecintrin.h>
21#include <sys/auxv.h>
22
23#ifdef __clang__
24# if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2))))
25# error crc32_vx optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. \
26 Either disable the zlib crc32_vx optimization, or switch to another compiler/compiler version.
27# endif
28#endif
29
30#define VX_MIN_LEN 64
31#define VX_ALIGNMENT 16L
32#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
33
34typedef unsigned char uv16qi __attribute__((vector_size(16)));
35typedef unsigned int uv4si __attribute__((vector_size(16)));
36typedef unsigned long long uv2di __attribute__((vector_size(16)));
37
38local uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {
39 /*
40 * The CRC-32 constant block contains reduction constants to fold and
41 * process particular chunks of the input data stream in parallel.
42 *
43 * For the CRC-32 variants, the constants are precomputed according to
44 * these definitions:
45 *
46 * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
47 * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
48 * R3 = [(x128+32 mod P'(x) << 32)]' << 1
49 * R4 = [(x128-32 mod P'(x) << 32)]' << 1
50 * R5 = [(x64 mod P'(x) << 32)]' << 1
51 * R6 = [(x32 mod P'(x) << 32)]' << 1
52 *
53 * The bitreflected Barrett reduction constant, u', is defined as
54 * the bit reversal of floor(x**64 / P(x)).
55 *
56 * where P(x) is the polynomial in the normal domain and the P'(x) is the
57 * polynomial in the reversed (bitreflected) domain.
58 *
59 * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
60 *
61 * P(x) = 0x04C11DB7
62 * P'(x) = 0xEDB88320
63 */
64 const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
65 const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
66 const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
67 const uv2di r5 = {0, 0x163CD6124}; /* R5 */
68 const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
69 const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
70
71 /*
72 * Load the initial CRC value.
73 *
74 * The CRC value is loaded into the rightmost word of the
75 * vector register and is later XORed with the LSB portion
76 * of the loaded input data.
77 */
78 uv2di v0 = {0, 0};
79 v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
80
81 /* Load a 64-byte data chunk and XOR with CRC */
82 uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
83 uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
84 uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
85 uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
86
87 v1 ^= v0;
88 buf += 64;
89 len -= 64;
90
91 while (len >= 64) {
92 /* Load the next 64-byte data chunk */
93 uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
94 uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
95 uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
96 uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
97
98 /*
99 * Perform a GF(2) multiplication of the doublewords in V1 with
100 * the R1 and R2 reduction constants in R2R1. The intermediate result
101 * is then folded (accumulated) with the next data chunk in PART1 and
102 * stored in V1. Repeat this step for the register contents
103 * in V2, V3, and V4 respectively.
104 */
105 v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
106 v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
107 v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
108 v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
109
110 buf += 64;
111 len -= 64;
112 }
113
114 /*
115 * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
116 * and R4 and accumulate the next 128-bit chunk until a single 128-bit
117 * value remains.
118 */
119 v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
120 v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
121 v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
122
123 while (len >= 16) {
124 /* Load next data chunk */
125 v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
126
127 /* Fold next data chunk */
128 v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
129
130 buf += 16;
131 len -= 16;
132 }
133
134 /*
135 * Set up a vector register for byte shifts. The shift value must
136 * be loaded in bits 1-4 in byte element 7 of a vector register.
137 * Shift by 8 bytes: 0x40
138 * Shift by 4 bytes: 0x20
139 */
140 uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
141 v9 = vec_insert((unsigned char)0x40, v9, 7);
142
143 /*
144 * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
145 * to move R4 into the rightmost doubleword and set the leftmost
146 * doubleword to 0x1.
147 */
148 v0 = vec_srb(r4r3, (uv2di)v9);
149 v0[0] = 1;
150
151 /*
152 * Compute GF(2) product of V1 and V0. The rightmost doubleword
153 * of V1 is multiplied with R4. The leftmost doubleword of V1 is
154 * multiplied by 0x1 and is then XORed with rightmost product.
155 * Implicitly, the intermediate leftmost product becomes padded
156 */
157 v1 = (uv2di)vec_gfmsum_128(v0, v1);
158
159 /*
160 * Now do the final 32-bit fold by multiplying the rightmost word
161 * in V1 with R5 and XOR the result with the remaining bits in V1.
162 *
163 * To achieve this by a single VGFMAG, right shift V1 by a word
164 * and store the result in V2 which is then accumulated. Use the
165 * vector unpack instruction to load the rightmost half of the
166 * doubleword into the rightmost doubleword element of V1; the other
167 * half is loaded in the leftmost doubleword.
168 * The vector register with CONST_R5 contains the R5 constant in the
169 * rightmost doubleword and the leftmost doubleword is zero to ignore
170 * the leftmost product of V1.
171 */
172 v9 = vec_insert((unsigned char)0x20, v9, 7);
173 v2 = vec_srb(v1, (uv2di)v9);
174 v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
175 v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
176
177 /*
178 * Apply a Barrett reduction to compute the final 32-bit CRC value.
179 *
180 * The input values to the Barrett reduction are the degree-63 polynomial
181 * in V1 (R(x)), degree-32 generator polynomial, and the reduction
182 * constant u. The Barrett reduction result is the CRC value of R(x) mod
183 * P(x).
184 *
185 * The Barrett reduction algorithm is defined as:
186 *
187 * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
188 * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
189 * 3. C(x) = R(x) XOR T2(x) mod x^32
190 *
191 * Note: The leftmost doubleword of vector register containing
192 * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
193 * is zero and does not contribute to the final result.
194 */
195
196 /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
197 v2 = vec_unpackl((uv4si)v1);
198 v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
199
200 /*
201 * Compute the GF(2) product of the CRC polynomial with T1(x) in
202 * V2 and XOR the intermediate result, T2(x), with the value in V1.
203 * The final result is stored in word element 2 of V2.
204 */
205 v2 = vec_unpackl((uv4si)v2);
206 v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
207
208 return ((uv4si)v2)[2];
209}
210
211
212local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
213{
214 uintptr_t prealign, aligned, remaining;
215
216 if (buf == Z_NULL) return 0UL;
217
218 if (len < VX_MIN_LEN + VX_ALIGN_MASK)
219 return crc32_z(crc, buf, len);
220
221 if ((uintptr_t)buf & VX_ALIGN_MASK) {
222 prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
223 len -= prealign;
224 crc = crc32_z(crc, buf, prealign);
225 buf += prealign;
226 }
227 aligned = len & ~VX_ALIGN_MASK;
228 remaining = len & VX_ALIGN_MASK;
229
230 crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;
231
232 if (remaining)
233 crc = crc32_z(crc, buf + aligned, remaining);
234
235 return crc;
236}
237
238local z_once_t s390_crc32_made = Z_ONCE_INIT;
239local void s390_crc32_setup() {
240 unsigned long hwcap = getauxval(AT_HWCAP);
241
242 if (hwcap & HWCAP_S390_VX)
243 crc32_z_hook = s390_crc32_vx;
244 else
245 crc32_z_hook = crc32_z;
246}
247
248local unsigned long s390_crc32_init(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
249{
250 z_once(&s390_crc32_made,s390_crc32_setup);
251 return crc32_z_hook(crc, buf, len);
252}
253
254ZLIB_INTERNAL unsigned long (*crc32_z_hook)(unsigned long crc, const unsigned char FAR *buf, z_size_t len) = s390_crc32_init;
diff --git a/contrib/crc32vx/crc32_vx_hooks.h b/contrib/crc32vx/crc32_vx_hooks.h
new file mode 100644
index 00000000..951c3188
--- /dev/null
+++ b/contrib/crc32vx/crc32_vx_hooks.h
@@ -0,0 +1,9 @@
1#ifndef CRC32_VX_HOOKS_H
2#define CRC32_VX_HOOKS_H
3
4/**
5 * CRC HOOKS
6 */
7ZLIB_INTERNAL extern unsigned long (*crc32_z_hook)(unsigned long crc, const unsigned char FAR *buf, z_size_t len);
8
9#endif /* CRC32_VX_HOOKS_H */
diff --git a/crc32.c b/crc32.c
index 4cc573f8..4d5f5b23 100644
--- a/crc32.c
+++ b/crc32.c
@@ -32,6 +32,10 @@
32 32
33#include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */ 33#include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
34 34
35#ifdef HAVE_S390X_VX
36# include "contrib/crc32vx/crc32_vx_hooks.h"
37#endif
38
35 /* 39 /*
36 A CRC of a message is computed on N braids of words in the message, where 40 A CRC of a message is computed on N braids of words in the message, where
37 each word consists of W bytes (4 or 8). If N is 3, for example, then three 41 each word consists of W bytes (4 or 8). If N is 3, for example, then three
@@ -942,6 +946,9 @@ unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
942/* ========================================================================= */ 946/* ========================================================================= */
943unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf, 947unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
944 uInt len) { 948 uInt len) {
949 #ifdef HAVE_S390X_VX
950 return crc32_z_hook(crc, buf, len);
951 #endif
945 return crc32_z(crc, buf, len); 952 return crc32_z(crc, buf, len);
946} 953}
947 954