diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-01 15:01:53 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-01 15:01:53 +0100 |
commit | d643010feeef312c77d7f51c3dd476d4e605c982 (patch) | |
tree | 16090cd94447527c057f953446e03cc7384c9c4e | |
parent | 5f6817020467598868b7d1c9ca477d7ccd66b87d (diff) | |
download | busybox-w32-d643010feeef312c77d7f51c3dd476d4e605c982.tar.gz busybox-w32-d643010feeef312c77d7f51c3dd476d4e605c982.tar.bz2 busybox-w32-d643010feeef312c77d7f51c3dd476d4e605c982.zip |
libbb/sha1: shrink x86_64 version - use r8..15 for W[8..15]
function old new delta
sha1_process_block64 3683 3562 -121
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | libbb/Config.src | 2 | ||||
-rw-r--r-- | libbb/hash_md5_sha.c | 299 |
2 files changed, 240 insertions, 61 deletions
diff --git a/libbb/Config.src b/libbb/Config.src index e027c14a8..f66f65f81 100644 --- a/libbb/Config.src +++ b/libbb/Config.src | |||
@@ -59,7 +59,7 @@ config SHA1_SMALL | |||
59 | Trade binary size versus speed for the sha1 algorithm. | 59 | Trade binary size versus speed for the sha1 algorithm. |
60 | throughput MB/s size of sha1_process_block64 | 60 | throughput MB/s size of sha1_process_block64 |
61 | value 486 x86-64 486 x86-64 | 61 | value 486 x86-64 486 x86-64 |
62 | 0 367 367 3657 3683 | 62 | 0 367 367 3657 3562 |
63 | 1 224 229 654 732 | 63 | 1 224 229 654 732 |
64 | 2,3 200 195 358 380 | 64 | 2,3 200 195 358 380 |
65 | 65 | ||
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index 9de30dfe6..a4e36066a 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c | |||
@@ -700,22 +700,194 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
700 | { | 700 | { |
701 | BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80); | 701 | BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80); |
702 | asm( | 702 | asm( |
703 | // TODO: store W[] in r8..r15? (r8..r11 are callee-clobbered, no need to save) | ||
704 | "\n\ | 703 | "\n\ |
705 | ##pushq %r15 # \n\ | 704 | pushq %r15 # \n\ |
706 | ##pushq %r14 # \n\ | 705 | pushq %r14 # \n\ |
707 | ##pushq %r13 # \n\ | 706 | pushq %r13 # \n\ |
708 | ##pushq %r12 # \n\ | 707 | pushq %r12 # \n\ |
709 | ##pushq %rbp # \n\ | 708 | pushq %rbp # \n\ |
710 | ##pushq %rbx # \n\ | 709 | pushq %rbx # \n\ |
711 | movq %rbp, %r8 # callee-saved \n\ | 710 | pushq %rdi # we need ctx at the end \n\ |
712 | movq %rbx, %r9 # callee-saved \n\ | 711 | \n\ |
713 | movq %rdi, %r10 # we need ctx at the end \n\ | 712 | #Register and stack use: \n\ |
714 | movl $15, %eax \n\ | 713 | # eax..edx: a..d \n\ |
714 | # ebp: e \n\ | ||
715 | # esi,edi: temps \n\ | ||
716 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] \n\ | ||
717 | .macro loadW n,r \n\ | ||
718 | .if \\n == 0 \n\ | ||
719 | movl -32+4*0(%rsp),\\r \n\ | ||
720 | .endif \n\ | ||
721 | .if \\n == 1 \n\ | ||
722 | movl -32+4*1(%rsp),\\r \n\ | ||
723 | .endif \n\ | ||
724 | .if \\n == 2 \n\ | ||
725 | movl -32+4*2(%rsp),\\r \n\ | ||
726 | .endif \n\ | ||
727 | .if \\n == 3 \n\ | ||
728 | movl -32+4*3(%rsp),\\r \n\ | ||
729 | .endif \n\ | ||
730 | .if \\n == 4 \n\ | ||
731 | movl -32+4*4(%rsp),\\r \n\ | ||
732 | .endif \n\ | ||
733 | .if \\n == 5 \n\ | ||
734 | movl -32+4*5(%rsp),\\r \n\ | ||
735 | .endif \n\ | ||
736 | .if \\n == 6 \n\ | ||
737 | movl -32+4*6(%rsp),\\r \n\ | ||
738 | .endif \n\ | ||
739 | .if \\n == 7 \n\ | ||
740 | movl -32+4*7(%rsp),\\r \n\ | ||
741 | .endif \n\ | ||
742 | .if \\n == 8 \n\ | ||
743 | movl %r8d,\\r \n\ | ||
744 | .endif \n\ | ||
745 | .if \\n == 9 \n\ | ||
746 | movl %r9d,\\r \n\ | ||
747 | .endif \n\ | ||
748 | .if \\n == 10 \n\ | ||
749 | movl %r10d,\\r \n\ | ||
750 | .endif \n\ | ||
751 | .if \\n == 11 \n\ | ||
752 | movl %r11d,\\r \n\ | ||
753 | .endif \n\ | ||
754 | .if \\n == 12 \n\ | ||
755 | movl %r12d,\\r \n\ | ||
756 | .endif \n\ | ||
757 | .if \\n == 13 \n\ | ||
758 | movl %r13d,\\r \n\ | ||
759 | .endif \n\ | ||
760 | .if \\n == 14 \n\ | ||
761 | movl %r14d,\\r \n\ | ||
762 | .endif \n\ | ||
763 | .if \\n == 15 \n\ | ||
764 | movl %r15d,\\r \n\ | ||
765 | .endif \n\ | ||
766 | .endm \n\ | ||
767 | \n\ | ||
768 | .macro storeW r,n \n\ | ||
769 | .if \\n == 0 \n\ | ||
770 | movl \\r,-32+4*0(%rsp) \n\ | ||
771 | .endif \n\ | ||
772 | .if \\n == 1 \n\ | ||
773 | movl \\r,-32+4*1(%rsp) \n\ | ||
774 | .endif \n\ | ||
775 | .if \\n == 2 \n\ | ||
776 | movl \\r,-32+4*2(%rsp) \n\ | ||
777 | .endif \n\ | ||
778 | .if \\n == 3 \n\ | ||
779 | movl \\r,-32+4*3(%rsp) \n\ | ||
780 | .endif \n\ | ||
781 | .if \\n == 4 \n\ | ||
782 | movl \\r,-32+4*4(%rsp) \n\ | ||
783 | .endif \n\ | ||
784 | .if \\n == 5 \n\ | ||
785 | movl \\r,-32+4*5(%rsp) \n\ | ||
786 | .endif \n\ | ||
787 | .if \\n == 6 \n\ | ||
788 | movl \\r,-32+4*6(%rsp) \n\ | ||
789 | .endif \n\ | ||
790 | .if \\n == 7 \n\ | ||
791 | movl \\r,-32+4*7(%rsp) \n\ | ||
792 | .endif \n\ | ||
793 | .if \\n == 8 \n\ | ||
794 | movl \\r,%r8d \n\ | ||
795 | .endif \n\ | ||
796 | .if \\n == 9 \n\ | ||
797 | movl \\r,%r9d \n\ | ||
798 | .endif \n\ | ||
799 | .if \\n == 10 \n\ | ||
800 | movl \\r,%r10d \n\ | ||
801 | .endif \n\ | ||
802 | .if \\n == 11 \n\ | ||
803 | movl \\r,%r11d \n\ | ||
804 | .endif \n\ | ||
805 | .if \\n == 12 \n\ | ||
806 | movl \\r,%r12d \n\ | ||
807 | .endif \n\ | ||
808 | .if \\n == 13 \n\ | ||
809 | movl \\r,%r13d \n\ | ||
810 | .endif \n\ | ||
811 | .if \\n == 14 \n\ | ||
812 | movl \\r,%r14d \n\ | ||
813 | .endif \n\ | ||
814 | .if \\n == 15 \n\ | ||
815 | movl \\r,%r15d \n\ | ||
816 | .endif \n\ | ||
817 | .endm \n\ | ||
818 | \n\ | ||
819 | .macro xorW n,r \n\ | ||
820 | .if \\n == 0 \n\ | ||
821 | xorl -32+4*0(%rsp),\\r \n\ | ||
822 | .endif \n\ | ||
823 | .if \\n == 1 \n\ | ||
824 | xorl -32+4*1(%rsp),\\r \n\ | ||
825 | .endif \n\ | ||
826 | .if \\n == 2 \n\ | ||
827 | xorl -32+4*2(%rsp),\\r \n\ | ||
828 | .endif \n\ | ||
829 | .if \\n == 3 \n\ | ||
830 | xorl -32+4*3(%rsp),\\r \n\ | ||
831 | .endif \n\ | ||
832 | .if \\n == 4 \n\ | ||
833 | xorl -32+4*4(%rsp),\\r \n\ | ||
834 | .endif \n\ | ||
835 | .if \\n == 5 \n\ | ||
836 | xorl -32+4*5(%rsp),\\r \n\ | ||
837 | .endif \n\ | ||
838 | .if \\n == 6 \n\ | ||
839 | xorl -32+4*6(%rsp),\\r \n\ | ||
840 | .endif \n\ | ||
841 | .if \\n == 7 \n\ | ||
842 | xorl -32+4*7(%rsp),\\r \n\ | ||
843 | .endif \n\ | ||
844 | .if \\n == 8 \n\ | ||
845 | xorl %r8d,\\r \n\ | ||
846 | .endif \n\ | ||
847 | .if \\n == 9 \n\ | ||
848 | xorl %r9d,\\r \n\ | ||
849 | .endif \n\ | ||
850 | .if \\n == 10 \n\ | ||
851 | xorl %r10d,\\r \n\ | ||
852 | .endif \n\ | ||
853 | .if \\n == 11 \n\ | ||
854 | xorl %r11d,\\r \n\ | ||
855 | .endif \n\ | ||
856 | .if \\n == 12 \n\ | ||
857 | xorl %r12d,\\r \n\ | ||
858 | .endif \n\ | ||
859 | .if \\n == 13 \n\ | ||
860 | xorl %r13d,\\r \n\ | ||
861 | .endif \n\ | ||
862 | .if \\n == 14 \n\ | ||
863 | xorl %r14d,\\r \n\ | ||
864 | .endif \n\ | ||
865 | .if \\n == 15 \n\ | ||
866 | xorl %r15d,\\r \n\ | ||
867 | .endif \n\ | ||
868 | .endm \n\ | ||
869 | \n\ | ||
870 | movl 4*8(%rdi), %r8d \n\ | ||
871 | bswap %r8d \n\ | ||
872 | movl 4*9(%rdi), %r9d \n\ | ||
873 | bswap %r9d \n\ | ||
874 | movl 4*10(%rdi), %r10d \n\ | ||
875 | bswap %r10d \n\ | ||
876 | movl 4*11(%rdi), %r11d \n\ | ||
877 | bswap %r11d \n\ | ||
878 | movl 4*12(%rdi), %r12d \n\ | ||
879 | bswap %r12d \n\ | ||
880 | movl 4*13(%rdi), %r13d \n\ | ||
881 | bswap %r13d \n\ | ||
882 | movl 4*14(%rdi), %r14d \n\ | ||
883 | bswap %r14d \n\ | ||
884 | movl 4*15(%rdi), %r15d \n\ | ||
885 | bswap %r15d \n\ | ||
886 | movl $7, %eax \n\ | ||
715 | 1: \n\ | 887 | 1: \n\ |
716 | movl (%rdi,%rax,4), %esi \n\ | 888 | movl (%rdi,%rax,4), %esi \n\ |
717 | bswap %esi \n\ | 889 | bswap %esi \n\ |
718 | movl %esi, -64(%rsp,%rax,4) \n\ | 890 | movl %esi, -32(%rsp,%rax,4) \n\ |
719 | decl %eax \n\ | 891 | decl %eax \n\ |
720 | jns 1b \n\ | 892 | jns 1b \n\ |
721 | movl 80(%rdi), %eax # a = ctx->hash[0] \n\ | 893 | movl 80(%rdi), %eax # a = ctx->hash[0] \n\ |
@@ -723,15 +895,10 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
723 | movl 88(%rdi), %ecx # c = ctx->hash[2] \n\ | 895 | movl 88(%rdi), %ecx # c = ctx->hash[2] \n\ |
724 | movl 92(%rdi), %edx # d = ctx->hash[3] \n\ | 896 | movl 92(%rdi), %edx # d = ctx->hash[3] \n\ |
725 | movl 96(%rdi), %ebp # e = ctx->hash[4] \n\ | 897 | movl 96(%rdi), %ebp # e = ctx->hash[4] \n\ |
726 | #Register and stack use: \n\ | ||
727 | # eax..edx: a..d \n\ | ||
728 | # ebp: e \n\ | ||
729 | # esi,edi: temps \n\ | ||
730 | # -64+4*n(%rsp): W[n] \n\ | ||
731 | " | 898 | " |
732 | #define RD1As(a,b,c,d,e, n, RCONST) \ | 899 | #define RD1As(a,b,c,d,e, n, RCONST) \ |
733 | "\n\ | 900 | "\n\ |
734 | ##movl -64+4*"n"(%rsp), %esi # n=0, W[0] already in %esi \n\ | 901 | ##loadW "n", %esi # n=0, W[0] already in %esi \n\ |
735 | movl %e"c", %edi # c \n\ | 902 | movl %e"c", %edi # c \n\ |
736 | xorl %e"d", %edi # ^d \n\ | 903 | xorl %e"d", %edi # ^d \n\ |
737 | andl %e"b", %edi # &b \n\ | 904 | andl %e"b", %edi # &b \n\ |
@@ -745,7 +912,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
745 | " | 912 | " |
746 | #define RD1Bs(a,b,c,d,e, n, RCONST) \ | 913 | #define RD1Bs(a,b,c,d,e, n, RCONST) \ |
747 | "\n\ | 914 | "\n\ |
748 | movl -64+4*"n"(%rsp), %esi # W[n] \n\ | 915 | loadW "n", %esi # W[n] \n\ |
749 | movl %e"c", %edi # c \n\ | 916 | movl %e"c", %edi # c \n\ |
750 | xorl %e"d", %edi # ^d \n\ | 917 | xorl %e"d", %edi # ^d \n\ |
751 | andl %e"b", %edi # &b \n\ | 918 | andl %e"b", %edi # &b \n\ |
@@ -757,14 +924,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
757 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | 924 | addl %esi, %e"e" # e += rotl32(a,5) \n\ |
758 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | 925 | rorl $2, %e"b" # b = rotl32(b,30) \n\ |
759 | " | 926 | " |
760 | #define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | 927 | #define RD1Cs(a,b,c,d,e, n, RCONST) \ |
761 | "\n\ | 928 | "\n\ |
762 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | 929 | movl %e"c", %edi # c \n\ |
763 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | 930 | xorl %e"d", %edi # ^d \n\ |
764 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | 931 | andl %e"b", %edi # &b \n\ |
765 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | 932 | xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ |
933 | leal "RCONST"(%r"e",%r"n"), %e"e" # e += RCONST + W[n] \n\ | ||
934 | addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ | ||
935 | movl %e"a", %esi # \n\ | ||
936 | roll $5, %esi # rotl32(a,5) \n\ | ||
937 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | ||
938 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | ||
939 | " | ||
940 | #define RD1Ds(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
941 | "\n\ | ||
942 | loadW "n13", %esi # W[(n+13) & 15] \n\ | ||
943 | xorW "n8", %esi # ^W[(n+8) & 15] \n\ | ||
944 | xorW "n2", %esi # ^W[(n+2) & 15] \n\ | ||
945 | xorW "n", %esi # ^W[n & 15] \n\ | ||
766 | roll %esi # \n\ | 946 | roll %esi # \n\ |
767 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | 947 | storeW %esi, "n" # store to W[n & 15] \n\ |
768 | movl %e"c", %edi # c \n\ | 948 | movl %e"c", %edi # c \n\ |
769 | xorl %e"d", %edi # ^d \n\ | 949 | xorl %e"d", %edi # ^d \n\ |
770 | andl %e"b", %edi # &b \n\ | 950 | andl %e"b", %edi # &b \n\ |
@@ -776,23 +956,24 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
776 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | 956 | addl %esi, %e"e" # e += rotl32(a,5) \n\ |
777 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | 957 | rorl $2, %e"b" # b = rotl32(b,30) \n\ |
778 | " | 958 | " |
779 | #define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST)) | 959 | #define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) |
780 | #define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST)) | 960 | #define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) |
781 | #define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) | 961 | #define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST)) |
962 | #define RD1D(a,b,c,d,e, n) RD1Ds(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) | ||
782 | #undef RCONST | 963 | #undef RCONST |
783 | #define RCONST 0x5A827999 | 964 | #define RCONST 0x5A827999 |
784 | RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) | 965 | RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) |
785 | RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9) | 966 | RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1C(cx,dx,bp,ax,bx, 8) RD1C(bx,cx,dx,bp,ax, 9) |
786 | RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14) | 967 | RD1C(ax,bx,cx,dx,bp,10) RD1C(bp,ax,bx,cx,dx,11) RD1C(dx,bp,ax,bx,cx,12) RD1C(cx,dx,bp,ax,bx,13) RD1C(bx,cx,dx,bp,ax,14) |
787 | RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19) | 968 | RD1C(ax,bx,cx,dx,bp,15) RD1D(bp,ax,bx,cx,dx,16) RD1D(dx,bp,ax,bx,cx,17) RD1D(cx,dx,bp,ax,bx,18) RD1D(bx,cx,dx,bp,ax,19) |
788 | #define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | 969 | #define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ |
789 | "\n\ | 970 | "\n\ |
790 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | 971 | loadW "n13", %esi # W[(n+13) & 15] \n\ |
791 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | 972 | xorW "n8", %esi # ^W[(n+8) & 15] \n\ |
792 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | 973 | xorW "n2", %esi # ^W[(n+2) & 15] \n\ |
793 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | 974 | xorW "n", %esi # ^W[n & 15] \n\ |
794 | roll %esi # \n\ | 975 | roll %esi # \n\ |
795 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | 976 | storeW %esi, "n" # store to W[n & 15] \n\ |
796 | movl %e"c", %edi # c \n\ | 977 | movl %e"c", %edi # c \n\ |
797 | xorl %e"d", %edi # ^d \n\ | 978 | xorl %e"d", %edi # ^d \n\ |
798 | xorl %e"b", %edi # ^b \n\ | 979 | xorl %e"b", %edi # ^b \n\ |
@@ -819,12 +1000,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
819 | andl %e"c", %esi # si: b & c \n\ | 1000 | andl %e"c", %esi # si: b & c \n\ |
820 | andl %e"d", %edi # di: (b | c) & d \n\ | 1001 | andl %e"d", %edi # di: (b | c) & d \n\ |
821 | orl %esi, %edi # ((b | c) & d) | (b & c) \n\ | 1002 | orl %esi, %edi # ((b | c) & d) | (b & c) \n\ |
822 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | 1003 | loadW "n13", %esi # W[(n+13) & 15] \n\ |
823 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | 1004 | xorW "n8", %esi # ^W[(n+8) & 15] \n\ |
824 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | 1005 | xorW "n2", %esi # ^W[(n+2) & 15] \n\ |
825 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | 1006 | xorW "n", %esi # ^W[n & 15] \n\ |
826 | roll %esi # \n\ | 1007 | roll %esi # \n\ |
827 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | 1008 | storeW %esi, "n" # store to W[n & 15] \n\ |
828 | addl %edi, %e"e" # += ((b | c) & d) | (b & c)\n\ | 1009 | addl %edi, %e"e" # += ((b | c) & d) | (b & c)\n\ |
829 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ | 1010 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ |
830 | movl %e"a", %esi # \n\ | 1011 | movl %e"a", %esi # \n\ |
@@ -843,12 +1024,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
843 | 1024 | ||
844 | #define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | 1025 | #define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \ |
845 | "\n\ | 1026 | "\n\ |
846 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | 1027 | loadW "n13", %esi # W[(n+13) & 15] \n\ |
847 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | 1028 | xorW "n8", %esi # ^W[(n+8) & 15] \n\ |
848 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | 1029 | xorW "n2", %esi # ^W[(n+2) & 15] \n\ |
849 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | 1030 | xorW "n", %esi # ^W[n & 15] \n\ |
850 | roll %esi # \n\ | 1031 | roll %esi # \n\ |
851 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | 1032 | storeW %esi, "n" # store to W[n & 15] \n\ |
852 | movl %e"c", %edi # c \n\ | 1033 | movl %e"c", %edi # c \n\ |
853 | xorl %e"d", %edi # ^d \n\ | 1034 | xorl %e"d", %edi # ^d \n\ |
854 | xorl %e"b", %edi # ^b \n\ | 1035 | xorl %e"b", %edi # ^b \n\ |
@@ -861,12 +1042,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
861 | " | 1042 | " |
862 | #define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | 1043 | #define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ |
863 | "\n\ | 1044 | "\n\ |
864 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | 1045 | loadW "n13", %esi # W[(n+13) & 15] \n\ |
865 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | 1046 | xorW "n8", %esi # ^W[(n+8) & 15] \n\ |
866 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | 1047 | xorW "n2", %esi # ^W[(n+2) & 15] \n\ |
867 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | 1048 | xorW "n", %esi # ^W[n & 15] \n\ |
868 | roll %esi # \n\ | 1049 | roll %esi # \n\ |
869 | ##movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] elided \n\ | 1050 | #storeW %esi, "n" # store to W[n & 15] elided \n\ |
870 | movl %e"c", %edi # c \n\ | 1051 | movl %e"c", %edi # c \n\ |
871 | xorl %e"d", %edi # ^d \n\ | 1052 | xorl %e"d", %edi # ^d \n\ |
872 | xorl %e"b", %edi # ^b \n\ | 1053 | xorl %e"b", %edi # ^b \n\ |
@@ -888,20 +1069,18 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
888 | RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19) | 1069 | RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19) |
889 | 1070 | ||
890 | "\n\ | 1071 | "\n\ |
891 | movq %r10, %rdi # \n\ | 1072 | popq %rdi # \n\ |
892 | addl %eax, 80(%rdi) # ctx->hash[0] += a \n\ | 1073 | addl %eax, 80(%rdi) # ctx->hash[0] += a \n\ |
893 | addl %ebx, 84(%rdi) # ctx->hash[1] += b \n\ | 1074 | addl %ebx, 84(%rdi) # ctx->hash[1] += b \n\ |
894 | addl %ecx, 88(%rdi) # ctx->hash[2] += c \n\ | 1075 | addl %ecx, 88(%rdi) # ctx->hash[2] += c \n\ |
895 | addl %edx, 92(%rdi) # ctx->hash[3] += d \n\ | 1076 | addl %edx, 92(%rdi) # ctx->hash[3] += d \n\ |
896 | addl %ebp, 96(%rdi) # ctx->hash[4] += e \n\ | 1077 | addl %ebp, 96(%rdi) # ctx->hash[4] += e \n\ |
897 | movq %r9, %rbx # callee-saved \n\ | 1078 | popq %rbx # \n\ |
898 | movq %r8, %rbp # callee-saved \n\ | 1079 | popq %rbp # \n\ |
899 | ##popq %rbx # \n\ | 1080 | popq %r12 # \n\ |
900 | ##popq %rbp # \n\ | 1081 | popq %r13 # \n\ |
901 | ##popq %r12 # \n\ | 1082 | popq %r14 # \n\ |
902 | ##popq %r13 # \n\ | 1083 | popq %r15 # \n\ |
903 | ##popq %r14 # \n\ | ||
904 | ##popq %r15 # \n\ | ||
905 | " | 1084 | " |
906 | ); /* asm */ | 1085 | ); /* asm */ |
907 | #undef RCONST | 1086 | #undef RCONST |