diff options
| -rw-r--r-- | crc32.c | 114 |
1 files changed, 114 insertions, 0 deletions
| @@ -617,6 +617,118 @@ const z_crc_t FAR * ZEXPORT get_crc_table() | |||
| 617 | return (const z_crc_t FAR *)crc_table; | 617 | return (const z_crc_t FAR *)crc_table; |
| 618 | } | 618 | } |
| 619 | 619 | ||
| 620 | /* ========================================================================= | ||
| 621 | * Use ARM machine instructions if requested. This will compute the CRC about | ||
| 622 | * ten times faster than the braided calculation. This code does not check for | ||
| 623 | * the presence of the CRC instruction. Compile with care. | ||
| 624 | */ | ||
| 625 | #if defined(Z_ARM_CRC32) && defined(__aarch64__) && W == 8 | ||
| 626 | |||
| 627 | /* | ||
| 628 | Constants empirically determined to maximize speed. These values are from | ||
| 629 | measurements on a Cortex-A57. Your mileage may vary. | ||
| 630 | */ | ||
| 631 | #define Z_BATCH 3990 /* number of 64-bit words each of the three interleaved CRCs covers per full batch */ | ||
| 632 | #define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 (recompute if Z_BATCH changes); multmodp() factor used to combine full-batch CRCs */ | ||
| 633 | #define Z_BATCH_MIN 800 /* fewest words per CRC for which a final, smaller three-way batch is still attempted */ | ||
| 634 | |||
| 635 | unsigned long ZEXPORT crc32_z(crc, buf, len) | ||
| 636 | unsigned long crc; | ||
| 637 | const unsigned char FAR *buf; | ||
| 638 | z_size_t len; | ||
| 639 | { | ||
| 640 | z_crc_t val; | ||
| 641 | z_word_t crc1, crc2; | ||
| 642 | const z_word_t *word; | ||
| 643 | z_word_t val0, val1, val2; | ||
| 644 | z_size_t last, last2, i; | ||
| 645 | z_size_t num; | ||
| 646 | |||
| 647 | /* A null buf requests the initial CRC value, which is zero. */ | ||
| 648 | if (buf == Z_NULL) return 0; | ||
| 649 | |||
| 650 | #ifdef DYNAMIC_CRC_TABLE | ||
| 651 | once(&made, make_crc_table); | ||
| 652 | #endif /* DYNAMIC_CRC_TABLE */ | ||
| 653 | |||
| 654 | /* Pre-condition the CRC, keeping only its low 32 bits: where unsigned long is wider than 32 bits, stray high bits in the caller's crc would otherwise be returned unchanged when len is 0. */ | ||
| 655 | crc = (~crc) & 0xffffffff; | ||
| 656 | |||
| 657 | /* Compute the CRC a byte at a time up to an 8-byte word boundary. */ | ||
| 658 | while (len && ((z_size_t)buf & 7) != 0) { | ||
| 659 | len--; | ||
| 660 | val = *buf++; | ||
| 661 | __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val)); | ||
| 662 | } | ||
| 663 | |||
| 664 | /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */ | ||
| 665 | word = (z_word_t const *)buf; | ||
| 666 | num = len >> 3; | ||
| 667 | len &= 7; | ||
| 668 | |||
| 669 | /* Do three interleaved CRCs to realize the throughput of one crc32x | ||
| 670 | instruction per cycle. Each CRC is calculated on Z_BATCH words. The three | ||
| 671 | CRCs are combined into a single CRC after each set of batches. */ | ||
| 672 | while (num >= 3 * Z_BATCH) { | ||
| 673 | crc1 = 0; | ||
| 674 | crc2 = 0; | ||
| 675 | for (i = 0; i < Z_BATCH; i++) { | ||
| 676 | val0 = word[i]; | ||
| 677 | val1 = word[i + Z_BATCH]; | ||
| 678 | val2 = word[i + 2 * Z_BATCH]; | ||
| 679 | __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0)); | ||
| 680 | __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1)); | ||
| 681 | __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2)); | ||
| 682 | } | ||
| 683 | word += 3 * Z_BATCH; | ||
| 684 | num -= 3 * Z_BATCH; | ||
| 685 | crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1; | ||
| 686 | crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2; | ||
| 687 | } | ||
| 688 | |||
| 689 | /* Do one last smaller batch with the remaining words, if there are enough | ||
| 690 | to pay for the combination of CRCs. */ | ||
| 691 | last = num / 3; | ||
| 692 | if (last >= Z_BATCH_MIN) { | ||
| 693 | last2 = last << 1; | ||
| 694 | crc1 = 0; | ||
| 695 | crc2 = 0; | ||
| 696 | for (i = 0; i < last; i++) { | ||
| 697 | val0 = word[i]; | ||
| 698 | val1 = word[i + last]; | ||
| 699 | val2 = word[i + last2]; | ||
| 700 | __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0)); | ||
| 701 | __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1)); | ||
| 702 | __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2)); | ||
| 703 | } | ||
| 704 | word += 3 * last; | ||
| 705 | num -= 3 * last; | ||
| 706 | val = x2nmodp(last, 6); /* combining factor for this batch size, used in place of Z_BATCH_ZEROS */ | ||
| 707 | crc = multmodp(val, crc) ^ crc1; | ||
| 708 | crc = multmodp(val, crc) ^ crc2; | ||
| 709 | } | ||
| 710 | |||
| 711 | /* Compute the CRC on any remaining words. */ | ||
| 712 | for (i = 0; i < num; i++) { | ||
| 713 | val0 = word[i]; | ||
| 714 | __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0)); | ||
| 715 | } | ||
| 716 | word += num; | ||
| 717 | |||
| 718 | /* Complete the CRC on any remaining bytes. */ | ||
| 719 | buf = (const unsigned char FAR *)word; | ||
| 720 | while (len) { | ||
| 721 | len--; | ||
| 722 | val = *buf++; | ||
| 723 | __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val)); | ||
| 724 | } | ||
| 725 | |||
| 726 | /* Return the CRC, post-conditioned. */ | ||
| 727 | return crc ^ 0xffffffff; | ||
| 728 | } | ||
| 729 | |||
| 730 | #else | ||
| 731 | |||
| 620 | /* ========================================================================= */ | 732 | /* ========================================================================= */ |
| 621 | unsigned long ZEXPORT crc32_z(crc, buf, len) | 733 | unsigned long ZEXPORT crc32_z(crc, buf, len) |
| 622 | unsigned long crc; | 734 | unsigned long crc; |
| @@ -939,6 +1051,8 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) | |||
| 939 | return crc ^ 0xffffffff; | 1051 | return crc ^ 0xffffffff; |
| 940 | } | 1052 | } |
| 941 | 1053 | ||
| 1054 | #endif | ||
| 1055 | |||
| 942 | /* ========================================================================= */ | 1056 | /* ========================================================================= */ |
| 943 | unsigned long ZEXPORT crc32(crc, buf, len) | 1057 | unsigned long ZEXPORT crc32(crc, buf, len) |
| 944 | unsigned long crc; | 1058 | unsigned long crc; |
