diff options
| author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-01 15:01:53 +0100 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-01 15:01:53 +0100 |
| commit | d643010feeef312c77d7f51c3dd476d4e605c982 (patch) | |
| tree | 16090cd94447527c057f953446e03cc7384c9c4e /libbb | |
| parent | 5f6817020467598868b7d1c9ca477d7ccd66b87d (diff) | |
| download | busybox-w32-d643010feeef312c77d7f51c3dd476d4e605c982.tar.gz busybox-w32-d643010feeef312c77d7f51c3dd476d4e605c982.tar.bz2 busybox-w32-d643010feeef312c77d7f51c3dd476d4e605c982.zip | |
libbb/sha1: shrink x86_64 version - use r8..15 for W[8..15]

function                                             old     new   delta
sha1_process_block64                                3683    3562    -121

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>

Diffstat (limited to 'libbb')
| -rw-r--r-- | libbb/Config.src | 2 | ||||
| -rw-r--r-- | libbb/hash_md5_sha.c | 299 |
2 files changed, 240 insertions, 61 deletions
diff --git a/libbb/Config.src b/libbb/Config.src
index e027c14a8..f66f65f81 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
| @@ -59,7 +59,7 @@ config SHA1_SMALL | |||
| 59 | Trade binary size versus speed for the sha1 algorithm. | 59 | Trade binary size versus speed for the sha1 algorithm. |
| 60 | throughput MB/s size of sha1_process_block64 | 60 | throughput MB/s size of sha1_process_block64 |
| 61 | value 486 x86-64 486 x86-64 | 61 | value 486 x86-64 486 x86-64 |
| 62 | 0 367 367 3657 3683 | 62 | 0 367 367 3657 3562 |
| 63 | 1 224 229 654 732 | 63 | 1 224 229 654 732 |
| 64 | 2,3 200 195 358 380 | 64 | 2,3 200 195 358 380 |
| 65 | 65 | ||
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 9de30dfe6..a4e36066a 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
| @@ -700,22 +700,194 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
| 700 | { | 700 | { |
| 701 | BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80); | 701 | BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80); |
| 702 | asm( | 702 | asm( |
| 703 | // TODO: store W[] in r8..r15? (r8..r11 are callee-clobbered, no need to save) | ||
| 704 | "\n\ | 703 | "\n\ |
| 705 | ##pushq %r15 # \n\ | 704 | pushq %r15 # \n\ |
| 706 | ##pushq %r14 # \n\ | 705 | pushq %r14 # \n\ |
| 707 | ##pushq %r13 # \n\ | 706 | pushq %r13 # \n\ |
| 708 | ##pushq %r12 # \n\ | 707 | pushq %r12 # \n\ |
| 709 | ##pushq %rbp # \n\ | 708 | pushq %rbp # \n\ |
| 710 | ##pushq %rbx # \n\ | 709 | pushq %rbx # \n\ |
| 711 | movq %rbp, %r8 # callee-saved \n\ | 710 | pushq %rdi # we need ctx at the end \n\ |
| 712 | movq %rbx, %r9 # callee-saved \n\ | 711 | \n\ |
| 713 | movq %rdi, %r10 # we need ctx at the end \n\ | 712 | #Register and stack use: \n\ |
| 714 | movl $15, %eax \n\ | 713 | # eax..edx: a..d \n\ |
| 714 | # ebp: e \n\ | ||
| 715 | # esi,edi: temps \n\ | ||
| 716 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] \n\ | ||
| 717 | .macro loadW n,r \n\ | ||
| 718 | .if \\n == 0 \n\ | ||
| 719 | movl -32+4*0(%rsp),\\r \n\ | ||
| 720 | .endif \n\ | ||
| 721 | .if \\n == 1 \n\ | ||
| 722 | movl -32+4*1(%rsp),\\r \n\ | ||
| 723 | .endif \n\ | ||
| 724 | .if \\n == 2 \n\ | ||
| 725 | movl -32+4*2(%rsp),\\r \n\ | ||
| 726 | .endif \n\ | ||
| 727 | .if \\n == 3 \n\ | ||
| 728 | movl -32+4*3(%rsp),\\r \n\ | ||
| 729 | .endif \n\ | ||
| 730 | .if \\n == 4 \n\ | ||
| 731 | movl -32+4*4(%rsp),\\r \n\ | ||
| 732 | .endif \n\ | ||
| 733 | .if \\n == 5 \n\ | ||
| 734 | movl -32+4*5(%rsp),\\r \n\ | ||
| 735 | .endif \n\ | ||
| 736 | .if \\n == 6 \n\ | ||
| 737 | movl -32+4*6(%rsp),\\r \n\ | ||
| 738 | .endif \n\ | ||
| 739 | .if \\n == 7 \n\ | ||
| 740 | movl -32+4*7(%rsp),\\r \n\ | ||
| 741 | .endif \n\ | ||
| 742 | .if \\n == 8 \n\ | ||
| 743 | movl %r8d,\\r \n\ | ||
| 744 | .endif \n\ | ||
| 745 | .if \\n == 9 \n\ | ||
| 746 | movl %r9d,\\r \n\ | ||
| 747 | .endif \n\ | ||
| 748 | .if \\n == 10 \n\ | ||
| 749 | movl %r10d,\\r \n\ | ||
| 750 | .endif \n\ | ||
| 751 | .if \\n == 11 \n\ | ||
| 752 | movl %r11d,\\r \n\ | ||
| 753 | .endif \n\ | ||
| 754 | .if \\n == 12 \n\ | ||
| 755 | movl %r12d,\\r \n\ | ||
| 756 | .endif \n\ | ||
| 757 | .if \\n == 13 \n\ | ||
| 758 | movl %r13d,\\r \n\ | ||
| 759 | .endif \n\ | ||
| 760 | .if \\n == 14 \n\ | ||
| 761 | movl %r14d,\\r \n\ | ||
| 762 | .endif \n\ | ||
| 763 | .if \\n == 15 \n\ | ||
| 764 | movl %r15d,\\r \n\ | ||
| 765 | .endif \n\ | ||
| 766 | .endm \n\ | ||
| 767 | \n\ | ||
| 768 | .macro storeW r,n \n\ | ||
| 769 | .if \\n == 0 \n\ | ||
| 770 | movl \\r,-32+4*0(%rsp) \n\ | ||
| 771 | .endif \n\ | ||
| 772 | .if \\n == 1 \n\ | ||
| 773 | movl \\r,-32+4*1(%rsp) \n\ | ||
| 774 | .endif \n\ | ||
| 775 | .if \\n == 2 \n\ | ||
| 776 | movl \\r,-32+4*2(%rsp) \n\ | ||
| 777 | .endif \n\ | ||
| 778 | .if \\n == 3 \n\ | ||
| 779 | movl \\r,-32+4*3(%rsp) \n\ | ||
| 780 | .endif \n\ | ||
| 781 | .if \\n == 4 \n\ | ||
| 782 | movl \\r,-32+4*4(%rsp) \n\ | ||
| 783 | .endif \n\ | ||
| 784 | .if \\n == 5 \n\ | ||
| 785 | movl \\r,-32+4*5(%rsp) \n\ | ||
| 786 | .endif \n\ | ||
| 787 | .if \\n == 6 \n\ | ||
| 788 | movl \\r,-32+4*6(%rsp) \n\ | ||
| 789 | .endif \n\ | ||
| 790 | .if \\n == 7 \n\ | ||
| 791 | movl \\r,-32+4*7(%rsp) \n\ | ||
| 792 | .endif \n\ | ||
| 793 | .if \\n == 8 \n\ | ||
| 794 | movl \\r,%r8d \n\ | ||
| 795 | .endif \n\ | ||
| 796 | .if \\n == 9 \n\ | ||
| 797 | movl \\r,%r9d \n\ | ||
| 798 | .endif \n\ | ||
| 799 | .if \\n == 10 \n\ | ||
| 800 | movl \\r,%r10d \n\ | ||
| 801 | .endif \n\ | ||
| 802 | .if \\n == 11 \n\ | ||
| 803 | movl \\r,%r11d \n\ | ||
| 804 | .endif \n\ | ||
| 805 | .if \\n == 12 \n\ | ||
| 806 | movl \\r,%r12d \n\ | ||
| 807 | .endif \n\ | ||
| 808 | .if \\n == 13 \n\ | ||
| 809 | movl \\r,%r13d \n\ | ||
| 810 | .endif \n\ | ||
| 811 | .if \\n == 14 \n\ | ||
| 812 | movl \\r,%r14d \n\ | ||
| 813 | .endif \n\ | ||
| 814 | .if \\n == 15 \n\ | ||
| 815 | movl \\r,%r15d \n\ | ||
| 816 | .endif \n\ | ||
| 817 | .endm \n\ | ||
| 818 | \n\ | ||
| 819 | .macro xorW n,r \n\ | ||
| 820 | .if \\n == 0 \n\ | ||
| 821 | xorl -32+4*0(%rsp),\\r \n\ | ||
| 822 | .endif \n\ | ||
| 823 | .if \\n == 1 \n\ | ||
| 824 | xorl -32+4*1(%rsp),\\r \n\ | ||
| 825 | .endif \n\ | ||
| 826 | .if \\n == 2 \n\ | ||
| 827 | xorl -32+4*2(%rsp),\\r \n\ | ||
| 828 | .endif \n\ | ||
| 829 | .if \\n == 3 \n\ | ||
| 830 | xorl -32+4*3(%rsp),\\r \n\ | ||
| 831 | .endif \n\ | ||
| 832 | .if \\n == 4 \n\ | ||
| 833 | xorl -32+4*4(%rsp),\\r \n\ | ||
| 834 | .endif \n\ | ||
| 835 | .if \\n == 5 \n\ | ||
| 836 | xorl -32+4*5(%rsp),\\r \n\ | ||
| 837 | .endif \n\ | ||
| 838 | .if \\n == 6 \n\ | ||
| 839 | xorl -32+4*6(%rsp),\\r \n\ | ||
| 840 | .endif \n\ | ||
| 841 | .if \\n == 7 \n\ | ||
| 842 | xorl -32+4*7(%rsp),\\r \n\ | ||
| 843 | .endif \n\ | ||
| 844 | .if \\n == 8 \n\ | ||
| 845 | xorl %r8d,\\r \n\ | ||
| 846 | .endif \n\ | ||
| 847 | .if \\n == 9 \n\ | ||
| 848 | xorl %r9d,\\r \n\ | ||
| 849 | .endif \n\ | ||
| 850 | .if \\n == 10 \n\ | ||
| 851 | xorl %r10d,\\r \n\ | ||
| 852 | .endif \n\ | ||
| 853 | .if \\n == 11 \n\ | ||
| 854 | xorl %r11d,\\r \n\ | ||
| 855 | .endif \n\ | ||
| 856 | .if \\n == 12 \n\ | ||
| 857 | xorl %r12d,\\r \n\ | ||
| 858 | .endif \n\ | ||
| 859 | .if \\n == 13 \n\ | ||
| 860 | xorl %r13d,\\r \n\ | ||
| 861 | .endif \n\ | ||
| 862 | .if \\n == 14 \n\ | ||
| 863 | xorl %r14d,\\r \n\ | ||
| 864 | .endif \n\ | ||
| 865 | .if \\n == 15 \n\ | ||
| 866 | xorl %r15d,\\r \n\ | ||
| 867 | .endif \n\ | ||
| 868 | .endm \n\ | ||
| 869 | \n\ | ||
| 870 | movl 4*8(%rdi), %r8d \n\ | ||
| 871 | bswap %r8d \n\ | ||
| 872 | movl 4*9(%rdi), %r9d \n\ | ||
| 873 | bswap %r9d \n\ | ||
| 874 | movl 4*10(%rdi), %r10d \n\ | ||
| 875 | bswap %r10d \n\ | ||
| 876 | movl 4*11(%rdi), %r11d \n\ | ||
| 877 | bswap %r11d \n\ | ||
| 878 | movl 4*12(%rdi), %r12d \n\ | ||
| 879 | bswap %r12d \n\ | ||
| 880 | movl 4*13(%rdi), %r13d \n\ | ||
| 881 | bswap %r13d \n\ | ||
| 882 | movl 4*14(%rdi), %r14d \n\ | ||
| 883 | bswap %r14d \n\ | ||
| 884 | movl 4*15(%rdi), %r15d \n\ | ||
| 885 | bswap %r15d \n\ | ||
| 886 | movl $7, %eax \n\ | ||
| 715 | 1: \n\ | 887 | 1: \n\ |
| 716 | movl (%rdi,%rax,4), %esi \n\ | 888 | movl (%rdi,%rax,4), %esi \n\ |
| 717 | bswap %esi \n\ | 889 | bswap %esi \n\ |
| 718 | movl %esi, -64(%rsp,%rax,4) \n\ | 890 | movl %esi, -32(%rsp,%rax,4) \n\ |
| 719 | decl %eax \n\ | 891 | decl %eax \n\ |
| 720 | jns 1b \n\ | 892 | jns 1b \n\ |
| 721 | movl 80(%rdi), %eax # a = ctx->hash[0] \n\ | 893 | movl 80(%rdi), %eax # a = ctx->hash[0] \n\ |
| @@ -723,15 +895,10 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
| 723 | movl 88(%rdi), %ecx # c = ctx->hash[2] \n\ | 895 | movl 88(%rdi), %ecx # c = ctx->hash[2] \n\ |
| 724 | movl 92(%rdi), %edx # d = ctx->hash[3] \n\ | 896 | movl 92(%rdi), %edx # d = ctx->hash[3] \n\ |
| 725 | movl 96(%rdi), %ebp # e = ctx->hash[4] \n\ | 897 | movl 96(%rdi), %ebp # e = ctx->hash[4] \n\ |
| 726 | #Register and stack use: \n\ | ||
| 727 | # eax..edx: a..d \n\ | ||
| 728 | # ebp: e \n\ | ||
| 729 | # esi,edi: temps \n\ | ||
| 730 | # -64+4*n(%rsp): W[n] \n\ | ||
| 731 | " | 898 | " |
| 732 | #define RD1As(a,b,c,d,e, n, RCONST) \ | 899 | #define RD1As(a,b,c,d,e, n, RCONST) \ |
| 733 | "\n\ | 900 | "\n\ |
| 734 | ##movl -64+4*"n"(%rsp), %esi # n=0, W[0] already in %esi \n\ | 901 | ##loadW "n", %esi # n=0, W[0] already in %esi \n\ |
| 735 | movl %e"c", %edi # c \n\ | 902 | movl %e"c", %edi # c \n\ |
| 736 | xorl %e"d", %edi # ^d \n\ | 903 | xorl %e"d", %edi # ^d \n\ |
| 737 | andl %e"b", %edi # &b \n\ | 904 | andl %e"b", %edi # &b \n\ |
| @@ -745,7 +912,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
| 745 | " | 912 | " |
| 746 | #define RD1Bs(a,b,c,d,e, n, RCONST) \ | 913 | #define RD1Bs(a,b,c,d,e, n, RCONST) \ |
| 747 | "\n\ | 914 | "\n\ |
| 748 | movl -64+4*"n"(%rsp), %esi # W[n] \n\ | 915 | loadW "n", %esi # W[n] \n\ |
| 749 | movl %e"c", %edi # c \n\ | 916 | movl %e"c", %edi # c \n\ |
| 750 | xorl %e"d", %edi # ^d \n\ | 917 | xorl %e"d", %edi # ^d \n\ |
| 751 | andl %e"b", %edi # &b \n\ | 918 | andl %e"b", %edi # &b \n\ |
| @@ -757,14 +924,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
| 757 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | 924 | addl %esi, %e"e" # e += rotl32(a,5) \n\ |
| 758 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | 925 | rorl $2, %e"b" # b = rotl32(b,30) \n\ |
| 759 | " | 926 | " |
| 760 | #define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | 927 | #define RD1Cs(a,b,c,d,e, n, RCONST) \ |
| 761 | "\n\ | 928 | "\n\ |
| 762 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | 929 | movl %e"c", %edi # c \n\ |
| 763 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | 930 | xorl %e"d", %edi # ^d \n\ |
| 764 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | 931 | andl %e"b", %edi # &b \n\ |
| 765 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | 932 | xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ |
| 933 | leal "RCONST"(%r"e",%r"n"), %e"e" # e += RCONST + W[n] \n\ | ||
| 934 | addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ | ||
| 935 | movl %e"a", %esi # \n\ | ||
| 936 | roll $5, %esi # rotl32(a,5) \n\ | ||
| 937 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | ||
| 938 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | ||
| 939 | " | ||
| 940 | #define RD1Ds(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
| 941 | "\n\ | ||
| 942 | loadW "n13", %esi # W[(n+13) & 15] \n\ | ||
| 943 | xorW "n8", %esi # ^W[(n+8) & 15] \n\ | ||
| 944 | xorW "n2", %esi # ^W[(n+2) & 15] \n\ | ||
| 945 | xorW "n", %esi # ^W[n & 15] \n\ | ||
| 766 | roll %esi # \n\ | 946 | roll %esi # \n\ |
| 767 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | 947 | storeW %esi, "n" # store to W[n & 15] \n\ |
| 768 | movl %e"c", %edi # c \n\ | 948 | movl %e"c", %edi # c \n\ |
| 769 | xorl %e"d", %edi # ^d \n\ | 949 | xorl %e"d", %edi # ^d \n\ |
| 770 | andl %e"b", %edi # &b \n\ | 950 | andl %e"b", %edi # &b \n\ |
| @@ -776,23 +956,24 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
| 776 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | 956 | addl %esi, %e"e" # e += rotl32(a,5) \n\ |
| 777 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | 957 | rorl $2, %e"b" # b = rotl32(b,30) \n\ |
| 778 | " | 958 | " |
| 779 | #define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST)) | 959 | #define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) |
| 780 | #define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST)) | 960 | #define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) |
| 781 | #define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) | 961 | #define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) |
| 962 | #define RD1D(a,b,c,d,e, n) RD1Ds(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) | ||
| 782 | #undef RCONST | 963 | #undef RCONST |
| 783 | #define RCONST 0x5A827999 | 964 | #define RCONST 0x5A827999 |
| 784 | RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) | 965 | RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) |
| 785 | RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9) | 966 | RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1C(cx,dx,bp,ax,bx, 8) RD1C(bx,cx,dx,bp,ax, 9) |
| 786 | RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14) | 967 | RD1C(ax,bx,cx,dx,bp,10) RD1C(bp,ax,bx,cx,dx,11) RD1C(dx,bp,ax,bx,cx,12) RD1C(cx,dx,bp,ax,bx,13) RD1C(bx,cx,dx,bp,ax,14) |
| 787 | RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19) | 968 | RD1C(ax,bx,cx,dx,bp,15) RD1D(bp,ax,bx,cx,dx,16) RD1D(dx,bp,ax,bx,cx,17) RD1D(cx,dx,bp,ax,bx,18) RD1D(bx,cx,dx,bp,ax,19) |
| 788 | #define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | 969 | #define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ |
| 789 | "\n\ | 970 | "\n\ |
| 790 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | 971 | loadW "n13", %esi # W[(n+13) & 15] \n\ |
| 791 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | 972 | xorW "n8", %esi # ^W[(n+8) & 15] \n\ |
| 792 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | 973 | xorW "n2", %esi # ^W[(n+2) & 15] \n\ |
| 793 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | 974 | xorW "n", %esi # ^W[n & 15] \n\ |
| 794 | roll %esi # \n\ | 975 | roll %esi # \n\ |
| 795 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | 976 | storeW %esi, "n" # store to W[n & 15] \n\ |
| 796 | movl %e"c", %edi # c \n\ | 977 | movl %e"c", %edi # c \n\ |
| 797 | xorl %e"d", %edi # ^d \n\ | 978 | xorl %e"d", %edi # ^d \n\ |
| 798 | xorl %e"b", %edi # ^b \n\ | 979 | xorl %e"b", %edi # ^b \n\ |
| @@ -819,12 +1000,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
| 819 | andl %e"c", %esi # si: b & c \n\ | 1000 | andl %e"c", %esi # si: b & c \n\ |
| 820 | andl %e"d", %edi # di: (b | c) & d \n\ | 1001 | andl %e"d", %edi # di: (b | c) & d \n\ |
| 821 | orl %esi, %edi # ((b | c) & d) | (b & c) \n\ | 1002 | orl %esi, %edi # ((b | c) & d) | (b & c) \n\ |
| 822 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | 1003 | loadW "n13", %esi # W[(n+13) & 15] \n\ |
| 823 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | 1004 | xorW "n8", %esi # ^W[(n+8) & 15] \n\ |
| 824 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | 1005 | xorW "n2", %esi # ^W[(n+2) & 15] \n\ |
| 825 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | 1006 | xorW "n", %esi # ^W[n & 15] \n\ |
| 826 | roll %esi # \n\ | 1007 | roll %esi # \n\ |
| 827 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | 1008 | storeW %esi, "n" # store to W[n & 15] \n\ |
| 828 | addl %edi, %e"e" # += ((b | c) & d) | (b & c)\n\ | 1009 | addl %edi, %e"e" # += ((b | c) & d) | (b & c)\n\ |
| 829 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ | 1010 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ |
| 830 | movl %e"a", %esi # \n\ | 1011 | movl %e"a", %esi # \n\ |
| @@ -843,12 +1024,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
| 843 | 1024 | ||
| 844 | #define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | 1025 | #define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \ |
| 845 | "\n\ | 1026 | "\n\ |
| 846 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | 1027 | loadW "n13", %esi # W[(n+13) & 15] \n\ |
| 847 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | 1028 | xorW "n8", %esi # ^W[(n+8) & 15] \n\ |
| 848 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | 1029 | xorW "n2", %esi # ^W[(n+2) & 15] \n\ |
| 849 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | 1030 | xorW "n", %esi # ^W[n & 15] \n\ |
| 850 | roll %esi # \n\ | 1031 | roll %esi # \n\ |
| 851 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | 1032 | storeW %esi, "n" # store to W[n & 15] \n\ |
| 852 | movl %e"c", %edi # c \n\ | 1033 | movl %e"c", %edi # c \n\ |
| 853 | xorl %e"d", %edi # ^d \n\ | 1034 | xorl %e"d", %edi # ^d \n\ |
| 854 | xorl %e"b", %edi # ^b \n\ | 1035 | xorl %e"b", %edi # ^b \n\ |
| @@ -861,12 +1042,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
| 861 | " | 1042 | " |
| 862 | #define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | 1043 | #define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ |
| 863 | "\n\ | 1044 | "\n\ |
| 864 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | 1045 | loadW "n13", %esi # W[(n+13) & 15] \n\ |
| 865 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | 1046 | xorW "n8", %esi # ^W[(n+8) & 15] \n\ |
| 866 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | 1047 | xorW "n2", %esi # ^W[(n+2) & 15] \n\ |
| 867 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | 1048 | xorW "n", %esi # ^W[n & 15] \n\ |
| 868 | roll %esi # \n\ | 1049 | roll %esi # \n\ |
| 869 | ##movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] elided \n\ | 1050 | #storeW %esi, "n" # store to W[n & 15] elided \n\ |
| 870 | movl %e"c", %edi # c \n\ | 1051 | movl %e"c", %edi # c \n\ |
| 871 | xorl %e"d", %edi # ^d \n\ | 1052 | xorl %e"d", %edi # ^d \n\ |
| 872 | xorl %e"b", %edi # ^b \n\ | 1053 | xorl %e"b", %edi # ^b \n\ |
| @@ -888,20 +1069,18 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
| 888 | RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19) | 1069 | RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19) |
| 889 | 1070 | ||
| 890 | "\n\ | 1071 | "\n\ |
| 891 | movq %r10, %rdi # \n\ | 1072 | popq %rdi # \n\ |
| 892 | addl %eax, 80(%rdi) # ctx->hash[0] += a \n\ | 1073 | addl %eax, 80(%rdi) # ctx->hash[0] += a \n\ |
| 893 | addl %ebx, 84(%rdi) # ctx->hash[1] += b \n\ | 1074 | addl %ebx, 84(%rdi) # ctx->hash[1] += b \n\ |
| 894 | addl %ecx, 88(%rdi) # ctx->hash[2] += c \n\ | 1075 | addl %ecx, 88(%rdi) # ctx->hash[2] += c \n\ |
| 895 | addl %edx, 92(%rdi) # ctx->hash[3] += d \n\ | 1076 | addl %edx, 92(%rdi) # ctx->hash[3] += d \n\ |
| 896 | addl %ebp, 96(%rdi) # ctx->hash[4] += e \n\ | 1077 | addl %ebp, 96(%rdi) # ctx->hash[4] += e \n\ |
| 897 | movq %r9, %rbx # callee-saved \n\ | 1078 | popq %rbx # \n\ |
| 898 | movq %r8, %rbp # callee-saved \n\ | 1079 | popq %rbp # \n\ |
| 899 | ##popq %rbx # \n\ | 1080 | popq %r12 # \n\ |
| 900 | ##popq %rbp # \n\ | 1081 | popq %r13 # \n\ |
| 901 | ##popq %r12 # \n\ | 1082 | popq %r14 # \n\ |
| 902 | ##popq %r13 # \n\ | 1083 | popq %r15 # \n\ |
| 903 | ##popq %r14 # \n\ | ||
| 904 | ##popq %r15 # \n\ | ||
| 905 | " | 1084 | " |
| 906 | ); /* asm */ | 1085 | ); /* asm */ |
| 907 | #undef RCONST | 1086 | #undef RCONST |
