author    Ilya Leoshkevich <iii@linux.ibm.com>    2025-09-10 11:28:03 +0200
committer Mark Adler <git@madler.net>             2026-01-27 20:22:17 -0800
commit    07f2d4237eade624182b1cf11f1f516985aed620 (patch)
tree      fad27d61fc7e05c8bc08662fd1caa4fb2bea928e /contrib
parent    3382ba45561ea82a1d8976578b2a41facff3b8bc (diff)
Vectorize the CRC-32 calculation on the s390x.
Use vector extensions when compiling for s390x and binutils knows about them. At runtime, check whether the kernel supports vector extensions (it is not just the CPU that has to support them, but also the kernel) and choose between the regular and the vectorized implementations.

Co-authored-by: Eduard Stefes <eddy@linux.ibm.com>
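The runtime check described above comes down to one getauxval() call: the kernel advertises its vector-register support through the ELF auxiliary vector. A minimal standalone sketch of the idea (not part of this commit; it assumes glibc's <sys/auxv.h> on s390x, which is also what crc32_vx.c below relies on):

    #include <stdio.h>
    #include <sys/auxv.h>   /* getauxval(), AT_HWCAP */

    #ifndef HWCAP_S390_VX
    #define HWCAP_S390_VX 2048   /* bit 11; assumed fallback for older libc headers */
    #endif

    int main(void) {
        /* A VX-capable CPU alone is not enough: the kernel must also
           save/restore the vector registers, and it reports that via
           AT_HWCAP. */
        if (getauxval(AT_HWCAP) & HWCAP_S390_VX)
            puts("CPU and kernel support VX: use the vectorized CRC-32");
        else
            puts("no VX support: fall back to the regular CRC-32");
        return 0;
    }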
Diffstat (limited to 'contrib')
-rw-r--r--  contrib/CMakeLists.txt              9
-rw-r--r--  contrib/README.contrib              3
-rw-r--r--  contrib/crc32vx/CMakeLists.txt     67
-rw-r--r--  contrib/crc32vx/README              9
-rw-r--r--  contrib/crc32vx/crc32_vx.c        254
-rw-r--r--  contrib/crc32vx/crc32_vx_hooks.h    9
6 files changed, 350 insertions(+), 1 deletion(-)
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index 8ec3bbd9..37a3491f 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -24,9 +24,15 @@ function(zlib_add_contrib_lib name description dir)
 endfunction(zlib_add_contrib_lib name description dir)
 
 function(zlib_add_contrib_feature name description dir)
+  if(ARGC EQUAL 4)
+    set(default_on ${ARGV3})
+  else()
+    set(default_on Off)
+  endif()
+
   option(ZLIB_WITH_${name}
     "Enable build ${description}"
-    OFF)
+    ${default_on})
 
   if(ZLIB_WITH_${name})
     add_subdirectory(${dir}/)
@@ -38,6 +44,7 @@ zlib_add_contrib_feature("GVMAT64"
   gcc_gvmat64)
 
 zlib_add_contrib_feature(INFBACK9 "with support for method 9 deflate" infback9)
+zlib_add_contrib_feature(CRC32VX "with S390X-CRC32VX implementation" crc32vx On)
 zlib_add_contrib_lib(ADA "Ada bindings" ada)
 zlib_add_contrib_lib(BLAST "blast binary" blast)
 zlib_add_contrib_lib(IOSTREAM3 "IOStream C++ bindings V3" iostream3)
diff --git a/contrib/README.contrib b/contrib/README.contrib
index d9480eea..173f1d48 100644
--- a/contrib/README.contrib
+++ b/contrib/README.contrib
@@ -46,6 +46,9 @@ puff/ by Mark Adler <madler@alumni.caltech.edu>
         Small, low memory usage inflate. Also serves to provide an
         unambiguous description of the deflate format.
 
+crc32vx/    by Ilya Leoshkevich <iii@linux.ibm.com>
+        Hardware-accelerated CRC32 on IBM Z with Z13 VX extension.
+
 testzlib/   by Gilles Vollant <info@winimage.com>
         Example of the use of zlib
 
diff --git a/contrib/crc32vx/CMakeLists.txt b/contrib/crc32vx/CMakeLists.txt
new file mode 100644
index 00000000..ee46fa3a
--- /dev/null
+++ b/contrib/crc32vx/CMakeLists.txt
@@ -0,0 +1,67 @@
+# Check whether we are compiling for IBM s390x
+#
+CHECK_C_SOURCE_COMPILES("
+#ifndef __s390x__
+  #error
+#endif
+int main() {return 0;}
+" HAS_S390X_SUPPORT)
+
+#
+# Check for the IBM s390x VX extensions
+#
+if(ZLIB_WITH_CRC32VX AND HAS_S390X_SUPPORT)
+  # preset the compiler-specific flags
+  if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+    set(VGFMAFLAG "-fzvector")
+  else()
+    set(VGFMAFLAG "-mzarch")
+  endif(CMAKE_C_COMPILER_ID STREQUAL "Clang")
+
+  set(S390X_VX_TEST
+    "#ifndef __s390x__ \n\
+     #error \n\
+     #endif \n\
+     #include <vecintrin.h> \n\
+     int main(void) { \
+       unsigned long long a __attribute__((vector_size(16))) = { 0 }; \
+       unsigned long long b __attribute__((vector_size(16))) = { 0 }; \
+       unsigned char c __attribute__((vector_size(16))) = { 0 }; \
+       c = vec_gfmsum_accum_128(a, b, c); \
+       return c[0]; \
+     }")
+
+  # first try without -march: the cflags may already contain a valid march
+  set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG}")
+  check_c_source_compiles("${S390X_VX_TEST}" HAS_S390X_VX_SUPPORT)
+  unset(CMAKE_REQUIRED_FLAGS)
+
+  # otherwise, set -march=z13 explicitly for our compile units
+  if(NOT HAS_S390X_VX_SUPPORT)
+    set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} -march=z13")
+    check_c_source_compiles("${S390X_VX_TEST}" HAS_Z13_S390X_VX_SUPPORT)
+    unset(CMAKE_REQUIRED_FLAGS)
+    list(APPEND VGFMAFLAG "-march=z13")
+  endif(NOT HAS_S390X_VX_SUPPORT)
+
+  # prepare compiling for s390x
+  if(HAS_S390X_VX_SUPPORT OR HAS_Z13_S390X_VX_SUPPORT)
+    if(ZLIB_BUILD_SHARED)
+      target_sources(zlib
+        PRIVATE
+          crc32_vx.c
+          crc32_vx_hooks.h)
+      target_compile_definitions(zlib PUBLIC -DHAVE_S390X_VX=1)
+    endif(ZLIB_BUILD_SHARED)
+    if(ZLIB_BUILD_STATIC)
+      target_sources(zlibstatic
+        PRIVATE
+          crc32_vx.c
+          crc32_vx_hooks.h)
+      target_compile_definitions(zlibstatic PUBLIC -DHAVE_S390X_VX=1)
+    endif(ZLIB_BUILD_STATIC)
+    set_source_files_properties(
+      crc32_vx.c
+      PROPERTIES COMPILE_OPTIONS "${VGFMAFLAG}")
+  endif(HAS_S390X_VX_SUPPORT OR HAS_Z13_S390X_VX_SUPPORT)
+endif(ZLIB_WITH_CRC32VX AND HAS_S390X_SUPPORT)
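For reference, the S390X_VX_TEST probe that check_c_source_compiles() builds above is an ordinary C program; written out as a standalone file it looks like the sketch below, and compiles only when <vecintrin.h> and the vector galois-field intrinsics are usable (on s390x, with the flags chosen above, e.g. gcc -mzarch -march=z13 or clang -fzvector):

    /* Standalone equivalent of the S390X_VX_TEST probe above. */
    #ifndef __s390x__
    #error "s390x only"
    #endif
    #include <vecintrin.h>

    int main(void) {
        unsigned long long a __attribute__((vector_size(16))) = { 0 };
        unsigned long long b __attribute__((vector_size(16))) = { 0 };
        unsigned char c __attribute__((vector_size(16))) = { 0 };
        /* VGFMAG: GF(2) multiply-sum-and-accumulate, the workhorse of crc32_vx.c */
        c = vec_gfmsum_accum_128(a, b, c);
        return c[0];
    }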
diff --git a/contrib/crc32vx/README b/contrib/crc32vx/README
new file mode 100644
index 00000000..329610d5
--- /dev/null
+++ b/contrib/crc32vx/README
@@ -0,0 +1,9 @@
+IBM Z mainframes starting with the z13 provide vector instructions, which
+allow vectorization of the CRC-32 computation. This extension is built by
+default when targeting IBM s390x; it can be disabled if desired:
+
+    # for a configure build
+    $ ./configure --disable-crcvx
+
+    # for a CMake build
+    $ cmake .. -DZLIB_WITH_CRC32VX=Off
diff --git a/contrib/crc32vx/crc32_vx.c b/contrib/crc32vx/crc32_vx.c
new file mode 100644
index 00000000..e718e340
--- /dev/null
+++ b/contrib/crc32vx/crc32_vx.c
@@ -0,0 +1,254 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (little-endian).
+ *
+ * This code was originally written by Hendrik Brueckner
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
+ * relicensed under the zlib license.
+ */
+#define Z_ONCE
+#include "../../zutil.h"
+#include "crc32_vx_hooks.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <vecintrin.h>
+#include <sys/auxv.h>
+
+#ifdef __clang__
+# if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2))))
+#  error crc32_vx optimizations are broken due to a compiler bug in Clang versions 18.0.0 <= clang_version < 19.1.2. \
+          Either disable the zlib crc32_vx optimization, or switch to another compiler/compiler version.
+# endif
+#endif
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
+typedef unsigned int uv4si __attribute__((vector_size(16)));
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
+
+local uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {
+    /*
+     * The CRC-32 constant block contains reduction constants to fold and
+     * process particular chunks of the input data stream in parallel.
+     *
+     * For the CRC-32 variants, the constants are precomputed according to
+     * these definitions:
+     *
+     *     R1 = [(x^(4*128+32) mod P'(x) << 32)]' << 1
+     *     R2 = [(x^(4*128-32) mod P'(x) << 32)]' << 1
+     *     R3 = [(x^(128+32) mod P'(x) << 32)]' << 1
+     *     R4 = [(x^(128-32) mod P'(x) << 32)]' << 1
+     *     R5 = [(x^64 mod P'(x) << 32)]' << 1
+     *     R6 = [(x^32 mod P'(x) << 32)]' << 1
+     *
+     * The bitreflected Barrett reduction constant, u', is defined as
+     * the bit reversal of floor(x^64 / P(x)),
+     *
+     * where P(x) is the polynomial in the normal domain and P'(x) is the
+     * polynomial in the reversed (bitreflected) domain.
+     *
+     * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+     *
+     *     P(x)  = 0x04C11DB7
+     *     P'(x) = 0xEDB88320
+     */
+    const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};  /* BE->LE mask */
+    const uv2di r2r1 = {0x1C6E41596, 0x154442BD4};  /* R2, R1 */
+    const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0};  /* R4, R3 */
+    const uv2di r5 = {0, 0x163CD6124};              /* R5 */
+    const uv2di ru_poly = {0, 0x1F7011641};         /* u' */
+    const uv2di crc_poly = {0, 0x1DB710641};        /* P'(x) << 1 */
+
+    /*
+     * Load the initial CRC value.
+     *
+     * The CRC value is loaded into the rightmost word of the
+     * vector register and is later XORed with the LSB portion
+     * of the loaded input data.
+     */
+    uv2di v0 = {0, 0};
+    v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
+
+    /* Load a 64-byte data chunk and XOR with CRC */
+    uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
+    uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
+    uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
+    uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
+
+    v1 ^= v0;
+    buf += 64;
+    len -= 64;
+
+    while (len >= 64) {
+        /* Load the next 64-byte data chunk */
+        uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
+        uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
+        uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
+        uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
+
+        /*
+         * Perform a GF(2) multiplication of the doublewords in V1 with
+         * the R1 and R2 reduction constants in V0. The intermediate result
+         * is then folded (accumulated) with the next data chunk in PART1 and
+         * stored in V1. Repeat this step for the register contents
+         * in V2, V3, and V4 respectively.
+         */
+        v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
+        v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
+        v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
+        v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
+
+        buf += 64;
+        len -= 64;
+    }
+
+    /*
+     * Fold V1 to V4 into a single 128-bit value in V1: multiply V1 with
+     * R3 and R4 and accumulate the next 128-bit chunk until a single
+     * 128-bit value remains.
+     */
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
+
+    while (len >= 16) {
+        /* Load the next data chunk */
+        v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
+
+        /* Fold the next data chunk */
+        v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+
+        buf += 16;
+        len -= 16;
+    }
+
+    /*
+     * Set up a vector register for byte shifts. The shift value must
+     * be loaded in bits 1-4 in byte element 7 of a vector register.
+     * Shift by 8 bytes: 0x40
+     * Shift by 4 bytes: 0x20
+     */
+    uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    v9 = vec_insert((unsigned char)0x40, v9, 7);
+
+    /*
+     * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
+     * to move R4 into the rightmost doubleword and set the leftmost
+     * doubleword to 0x1.
+     */
+    v0 = vec_srb(r4r3, (uv2di)v9);
+    v0[0] = 1;
+
+    /*
+     * Compute the GF(2) product of V1 and V0. The rightmost doubleword
+     * of V1 is multiplied with R4. The leftmost doubleword of V1 is
+     * multiplied by 0x1 and is then XORed with the rightmost product.
+     * Implicitly, the intermediate leftmost product is zero-padded.
+     */
+    v1 = (uv2di)vec_gfmsum_128(v0, v1);
+
+    /*
+     * Now do the final 32-bit fold by multiplying the rightmost word
+     * in V1 with R5 and XORing the result with the remaining bits in V1.
+     *
+     * To achieve this with a single VGFMAG, right-shift V1 by a word
+     * and store the result in V2, which is then accumulated. Use the
+     * vector unpack instruction to load the rightmost half of the
+     * doubleword into the rightmost doubleword element of V1; the other
+     * half is loaded into the leftmost doubleword.
+     * The vector register with CONST_R5 contains the R5 constant in the
+     * rightmost doubleword, and the leftmost doubleword is zero to ignore
+     * the leftmost product of V1.
+     */
+    v9 = vec_insert((unsigned char)0x20, v9, 7);
+    v2 = vec_srb(v1, (uv2di)v9);
+    v1 = vec_unpackl((uv4si)v1);  /* Split the rightmost doubleword */
+    v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
+
+    /*
+     * Apply a Barrett reduction to compute the final 32-bit CRC value.
+     *
+     * The input values to the Barrett reduction are the degree-63 polynomial
+     * in V1 (R(x)), the degree-32 generator polynomial, and the reduction
+     * constant u. The Barrett reduction result is the CRC value of R(x) mod
+     * P(x).
+     *
+     * The Barrett reduction algorithm is defined as:
+     *
+     *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+     *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+     *    3. C(x)  = R(x) XOR T2(x) mod x^32
+     *
+     * Note: The leftmost doubleword of the vector register containing
+     * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
+     * is zero and does not contribute to the final result.
+     */
+
+    /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+    v2 = vec_unpackl((uv4si)v1);
+    v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
+
+    /*
+     * Compute the GF(2) product of the CRC polynomial with T1(x) in
+     * V2 and XOR the intermediate result, T2(x), with the value in V1.
+     * The final result is stored in word element 2 of V2.
+     */
+    v2 = vec_unpackl((uv4si)v2);
+    v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
+
+    return ((uv4si)v2)[2];
+}
+
+
+local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
+{
+    uintptr_t prealign, aligned, remaining;
+
+    if (buf == Z_NULL) return 0UL;
+
+    if (len < VX_MIN_LEN + VX_ALIGN_MASK)
+        return crc32_z(crc, buf, len);
+
+    if ((uintptr_t)buf & VX_ALIGN_MASK) {
+        prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
+        len -= prealign;
+        crc = crc32_z(crc, buf, prealign);
+        buf += prealign;
+    }
+    aligned = len & ~VX_ALIGN_MASK;
+    remaining = len & VX_ALIGN_MASK;
+
+    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;
+
+    if (remaining)
+        crc = crc32_z(crc, buf + aligned, remaining);
+
+    return crc;
+}
+
+local z_once_t s390_crc32_made = Z_ONCE_INIT;
+local void s390_crc32_setup(void) {
+    unsigned long hwcap = getauxval(AT_HWCAP);
+
+    if (hwcap & HWCAP_S390_VX)
+        crc32_z_hook = s390_crc32_vx;
+    else
+        crc32_z_hook = crc32_z;
+}
+
+local unsigned long s390_crc32_init(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
+{
+    z_once(&s390_crc32_made, s390_crc32_setup);
+    return crc32_z_hook(crc, buf, len);
+}
+
+ZLIB_INTERNAL unsigned long (*crc32_z_hook)(unsigned long crc, const unsigned char FAR *buf, z_size_t len) = s390_crc32_init;
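The dispatch at the bottom of crc32_vx.c is a self-installing function pointer: crc32_z_hook starts out pointing at the shim s390_crc32_init, whose first call picks the real implementation under z_once() and repoints the hook, so later calls go straight to the selected routine with no per-call check. A minimal sketch of the same pattern, with pthread_once() standing in for zlib's z_once() and dummy implementations (all names here are illustrative, not zlib's):

    #include <pthread.h>
    #include <stdio.h>

    static unsigned long impl_plain(unsigned long x) { return x * 2; }  /* stand-in */
    static unsigned long impl_fast(unsigned long x)  { return x * 2; }  /* stand-in */

    static unsigned long dispatch_init(unsigned long x);
    static unsigned long (*hook)(unsigned long) = dispatch_init;

    static pthread_once_t made = PTHREAD_ONCE_INIT;

    static void setup(void) {
        int have_fast = 0;          /* in crc32_vx.c: getauxval(AT_HWCAP) & HWCAP_S390_VX */
        hook = have_fast ? impl_fast : impl_plain;
    }

    static unsigned long dispatch_init(unsigned long x) {
        pthread_once(&made, setup); /* thread-safe one-time selection */
        return hook(x);             /* hook no longer points here */
    }

    int main(void) {
        printf("%lu\n", hook(21));  /* first call installs the real implementation */
        printf("%lu\n", hook(21));  /* subsequent calls bypass the shim */
        return 0;
    }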
diff --git a/contrib/crc32vx/crc32_vx_hooks.h b/contrib/crc32vx/crc32_vx_hooks.h
new file mode 100644
index 00000000..951c3188
--- /dev/null
+++ b/contrib/crc32vx/crc32_vx_hooks.h
@@ -0,0 +1,9 @@
+#ifndef CRC32_VX_HOOKS_H
+#define CRC32_VX_HOOKS_H
+
+/**
+ * CRC HOOKS
+ */
+ZLIB_INTERNAL extern unsigned long (*crc32_z_hook)(unsigned long crc, const unsigned char FAR *buf, z_size_t len);
+
+#endif /* CRC32_VX_HOOKS_H */
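Assuming zlib's crc32_z() is wired to go through crc32_z_hook when HAVE_S390X_VX is defined (that wiring is outside contrib/ and not shown in this diff), a simple external smoke test is the streaming property of the CRC API: splitting a buffer at an arbitrary, possibly unaligned offset must give the same checksum as one pass over the whole buffer, whichever implementation the hook selected. A hypothetical test program:

    /* Smoke test: link against a zlib built with ZLIB_WITH_CRC32VX=On on s390x. */
    #include <assert.h>
    #include <stdio.h>
    #include <zlib.h>

    int main(void) {
        unsigned char buf[4096];
        size_t i;
        for (i = 0; i < sizeof(buf); i++)
            buf[i] = (unsigned char)(i * 131u);

        /* One pass over the whole buffer... */
        unsigned long whole = crc32_z(0UL, buf, sizeof(buf));

        /* ...must match an incremental pass split at an odd offset,
           which exercises the unaligned-prefix path in s390_crc32_vx(). */
        unsigned long split = crc32_z(0UL, buf, 101);
        split = crc32_z(split, buf + 101, sizeof(buf) - 101);

        assert(whole == split);
        printf("crc32 = %08lx\n", whole);
        return 0;
    }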