path: root/crc32.c
author    Mark Adler <madler@alumni.caltech.edu>    2019-02-17 19:48:57 -0800
committer Mark Adler <madler@alumni.caltech.edu>    2019-02-17 19:48:57 -0800
commit    52fc78baf216dac4e76c5eb3bd940f68e87086da (patch)
tree      01eb3d1a091ef2e35fd2142061ca1efab0a1b590 /crc32.c
parent    aec89faa2e366e5cff501310771afaf16289c6ad (diff)
download  zlib-52fc78baf216dac4e76c5eb3bd940f68e87086da.tar.gz
          zlib-52fc78baf216dac4e76c5eb3bd940f68e87086da.tar.bz2
          zlib-52fc78baf216dac4e76c5eb3bd940f68e87086da.zip
Add use of the ARMv8 crc32 instructions when requested.
Define the macro Z_ARM_CRC32 at compile time to use the ARMv8 (aarch64) crc32x and crc32b instructions. This code does not check for the presence of the crc32 instructions. Those instructions are optional for ARMv8.0, though mandatory for ARMv8.1 and later. The use of the crc32 instructions is about ten times as fast as the software braided calculation of the CRC-32. This can noticeably speed up the decompression of gzip streams.
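As the message notes, the commit performs no runtime check for the crc32 instructions, so a zlib built with -DZ_ARM_CRC32 will take an illegal-instruction trap on an ARMv8.0 core that omits the optional CRC32 extension. Not part of this commit, but as a hedged illustration, a minimal Linux-only sketch of how a builder or application could confirm the extension before relying on such a build; it assumes glibc's getauxval() and the HWCAP_CRC32 bit exported by the AArch64 kernel in <asm/hwcap.h>:

#include <stdio.h>
#include <sys/auxv.h>      /* getauxval(), AT_HWCAP (glibc, Linux) */
#include <asm/hwcap.h>     /* HWCAP_CRC32 (AArch64 Linux kernel header) */

int main(void) {
    /* The kernel sets HWCAP_CRC32 when the CPU implements the CRC32
       extension (crc32b, crc32x and friends). */
    if (getauxval(AT_HWCAP) & HWCAP_CRC32)
        puts("CRC32 instructions available: a -DZ_ARM_CRC32 build is safe here");
    else
        puts("CRC32 instructions absent: do not run a -DZ_ARM_CRC32 build here");
    return 0;
}

Other operating systems expose the same information differently, which is presumably why the commit leaves the decision to whoever defines the macro ("Compile with care").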
Diffstat (limited to 'crc32.c')
-rw-r--r--    crc32.c    114
1 file changed, 114 insertions, 0 deletions
diff --git a/crc32.c b/crc32.c
index f6cd52f..2d20829 100644
--- a/crc32.c
+++ b/crc32.c
@@ -617,6 +617,118 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
     return (const z_crc_t FAR *)crc_table;
 }
 
+/* =========================================================================
+ * Use ARM machine instructions if requested. This will compute the CRC about
+ * ten times faster than the braided calculation. This code does not check for
+ * the presence of the CRC instruction. Compile with care.
+ */
+#if defined(Z_ARM_CRC32) && defined(__aarch64__) && W == 8
+
+/*
+  Constants empirically determined to maximize speed. These values are from
+  measurements on a Cortex-A57. Your mileage may vary.
+ */
+#define Z_BATCH 3990                /* number of words in a batch */
+#define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
+#define Z_BATCH_MIN 800             /* fewest words in a final batch */
+
+unsigned long ZEXPORT crc32_z(crc, buf, len)
+    unsigned long crc;
+    const unsigned char FAR *buf;
+    z_size_t len;
+{
+    z_crc_t val;
+    z_word_t crc1, crc2;
+    const z_word_t *word;
+    z_word_t val0, val1, val2;
+    z_size_t last, last2, i;
+    z_size_t num;
+
+    /* Return initial CRC, if requested. */
+    if (buf == Z_NULL) return 0;
+
+#ifdef DYNAMIC_CRC_TABLE
+    once(&made, make_crc_table);
+#endif /* DYNAMIC_CRC_TABLE */
+
+    /* Pre-condition the CRC */
+    crc ^= 0xffffffff;
+
+    /* Compute the CRC up to a word boundary. */
+    while (len && ((z_size_t)buf & 7) != 0) {
+        len--;
+        val = *buf++;
+        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
+    }
+
+    /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
+    word = (z_word_t const *)buf;
+    num = len >> 3;
+    len &= 7;
+
+    /* Do three interleaved CRCs to realize the throughput of one crc32x
+       instruction per cycle. Each CRC is calculated on Z_BATCH words. The
+       three CRCs are combined into a single CRC after each set of batches. */
+    while (num >= 3 * Z_BATCH) {
+        crc1 = 0;
+        crc2 = 0;
+        for (i = 0; i < Z_BATCH; i++) {
+            val0 = word[i];
+            val1 = word[i + Z_BATCH];
+            val2 = word[i + 2 * Z_BATCH];
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
+        }
+        word += 3 * Z_BATCH;
+        num -= 3 * Z_BATCH;
+        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
+        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
+    }
+
+    /* Do one last smaller batch with the remaining words, if there are enough
+       to pay for the combination of CRCs. */
+    last = num / 3;
+    if (last >= Z_BATCH_MIN) {
+        last2 = last << 1;
+        crc1 = 0;
+        crc2 = 0;
+        for (i = 0; i < last; i++) {
+            val0 = word[i];
+            val1 = word[i + last];
+            val2 = word[i + last2];
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
+            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
+        }
+        word += 3 * last;
+        num -= 3 * last;
+        val = x2nmodp(last, 6);
+        crc = multmodp(val, crc) ^ crc1;
+        crc = multmodp(val, crc) ^ crc2;
+    }
+
+    /* Compute the CRC on any remaining words. */
+    for (i = 0; i < num; i++) {
+        val0 = word[i];
+        __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
+    }
+    word += num;
+
+    /* Complete the CRC on any remaining bytes. */
+    buf = (const unsigned char FAR *)word;
+    while (len) {
+        len--;
+        val = *buf++;
+        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
+    }
+
+    /* Return the CRC, post-conditioned. */
+    return crc ^ 0xffffffff;
+}
+
+#else
+
 /* ========================================================================= */
 unsigned long ZEXPORT crc32_z(crc, buf, len)
     unsigned long crc;
@@ -939,6 +1051,8 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
     return crc ^ 0xffffffff;
 }
 
+#endif
+
 /* ========================================================================= */
 unsigned long ZEXPORT crc32(crc, buf, len)
     unsigned long crc;
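The exported interface is unchanged by this commit: crc32_z() keeps its prototype, and callers pick up the ARMv8 path automatically when the library was built with it. A minimal caller sketch, assuming zlib.h from any zlib recent enough to declare crc32_z() (1.2.9 or later); the payload is illustrative only:

#include <stdio.h>
#include "zlib.h"

int main(void) {
    const unsigned char data[] = "The quick brown fox jumps over the lazy dog";
    unsigned long crc;

    crc = crc32_z(0L, Z_NULL, 0);                       /* initial CRC value */
    crc = crc32_z(crc, data, (z_size_t)(sizeof(data) - 1));
    printf("CRC-32: 0x%08lx\n", crc);                   /* standard test vector: 0x414fa339 */
    return 0;
}

Linked against a -DZ_ARM_CRC32 build on aarch64, the same program simply runs faster; nothing at the call site changes.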