aboutsummaryrefslogtreecommitdiff
path: root/libbb
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2022-01-01 12:21:01 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2022-01-01 13:57:34 +0100
commit5f6817020467598868b7d1c9ca477d7ccd66b87d (patch)
tree931b244d18b1dba9afd8861a0474429e33fa7329 /libbb
parentf09d088fdf6eeeba902fb5627930145a3058a5f0 (diff)
downloadbusybox-w32-5f6817020467598868b7d1c9ca477d7ccd66b87d.tar.gz
busybox-w32-5f6817020467598868b7d1c9ca477d7ccd66b87d.tar.bz2
busybox-w32-5f6817020467598868b7d1c9ca477d7ccd66b87d.zip
libbb/sha1: assembly versions for x86
32 bits:
function old new delta
sha1_process_block64 3950 3657 -293

64 bits:
sha1_process_block64 4167 3683 -484

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb')
-rw-r--r--libbb/Config.src2
-rw-r--r--libbb/hash_md5_sha.c417
2 files changed, 418 insertions, 1 deletions
diff --git a/libbb/Config.src b/libbb/Config.src
index d2054dc63..e027c14a8 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -59,7 +59,7 @@ config SHA1_SMALL
59 Trade binary size versus speed for the sha1 algorithm. 59 Trade binary size versus speed for the sha1 algorithm.
60 throughput MB/s size of sha1_process_block64 60 throughput MB/s size of sha1_process_block64
61 value 486 x86-64 486 x86-64 61 value 486 x86-64 486 x86-64
62 0 360 374 3950 4167 62 0 367 367 3657 3683
63 1 224 229 654 732 63 1 224 229 654 732
64 2,3 200 195 358 380 64 2,3 200 195 358 380
65 65
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index faf485df5..9de30dfe6 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -8,6 +8,9 @@
8 */ 8 */
9#include "libbb.h" 9#include "libbb.h"
10 10
/* Two-level stringification: STR(x) lets x be macro-expanded first, then STR1 applies the # operator to the result. */
11#define STR1(s) #s
12#define STR(s) STR1(s)
13
11#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) 14#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
12 15
13/* gcc 4.2.1 optimizes rotr64 better with inline than with macro 16/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
@@ -491,6 +494,419 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
491 */ 494 */
492 495
493#if CONFIG_SHA1_SMALL == 0 496#if CONFIG_SHA1_SMALL == 0
497# if defined(__GNUC__) && defined(__i386__)
/*
 * SHA-1 transform of one 64-byte block for 32-bit x86, written as a single
 * asm statement with all 80 rounds unrolled.
 *
 * The asm statement has no operands: it takes the ctx pointer from %eax.
 * NOTE(review): this presumably relies on FAST_FUNC passing the first
 * argument in %eax on i386 - confirm against the FAST_FUNC definition.
 *
 * The 16 input words are read from offset 0 of ctx, byte-swapped and pushed
 * on the stack, where they form the 16-entry W[] ring at 4*n(%esp).
 * hash[0..4] is expected at offset 76 of ctx (enforced by BUILD_BUG_ON).
 * %ebp, %edi, %esi and %ebx are pushed on entry and popped on exit.
 */
498static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
499{
500 BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 76);
501 asm(
502"\n\
503 pushl %ebp # \n\
504 pushl %edi # \n\
505 pushl %esi # \n\
506 pushl %ebx # \n\
507 pushl %eax \n\
508 movl $15, %edi \n\
5091: \n\
510 movl (%eax,%edi,4), %esi \n\
511 bswap %esi \n\
512 pushl %esi \n\
513 decl %edi \n\
514 jns 1b \n\
515 movl 80(%eax), %ebx # b = ctx->hash[1] \n\
516 movl 84(%eax), %ecx # c = ctx->hash[2] \n\
517 movl 88(%eax), %edx # d = ctx->hash[3] \n\
518 movl 92(%eax), %ebp # e = ctx->hash[4] \n\
519 movl 76(%eax), %eax # a = ctx->hash[0] \n\
520#Register and stack use: \n\
521# eax..edx: a..d \n\
522# ebp: e \n\
523# esi,edi: temps \n\
524# 4*n(%esp): W[n] \n\
525"
/* Round 0 only: the load of W[0] is disabled because the byte-swap loop above ends with W[0] still in %esi. */
526#define RD1As(a,b,c,d,e, n, RCONST) \
527"\n\
528 ##movl 4*"n"(%esp), %esi # n=0, W[0] already in %esi \n\
529 movl "c", %edi # c \n\
530 xorl "d", %edi # ^d \n\
531 andl "b", %edi # &b \n\
532 xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
533 leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\
534 addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
535 movl "a", %esi # \n\
536 roll $5, %esi # rotl32(a,5) \n\
537 addl %esi, "e" # e += rotl32(a,5) \n\
538 rorl $2, "b" # b = rotl32(b,30) \n\
539"
/* Rounds 1..15: same as above, but W[n] is loaded from its stack slot. */
540#define RD1Bs(a,b,c,d,e, n, RCONST) \
541"\n\
542 movl 4*"n"(%esp), %esi # W[n] \n\
543 movl "c", %edi # c \n\
544 xorl "d", %edi # ^d \n\
545 andl "b", %edi # &b \n\
546 xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
547 leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\
548 addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
549 movl "a", %esi # \n\
550 roll $5, %esi # rotl32(a,5) \n\
551 addl %esi, "e" # e += rotl32(a,5) \n\
552 rorl $2, "b" # b = rotl32(b,30) \n\
553"
/* Rounds 16..19: the new W value is mixed from four ring entries, rotated left by one, and written back to slot n & 15. */
554#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
555"\n\
556 movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
557 xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
558 xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
559 xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
560 roll %esi # \n\
561 movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
562 movl "c", %edi # c \n\
563 xorl "d", %edi # ^d \n\
564 andl "b", %edi # &b \n\
565 xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
566 leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
567 addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
568 movl "a", %esi # \n\
569 roll $5, %esi # rotl32(a,5) \n\
570 addl %esi, "e" # e += rotl32(a,5) \n\
571 rorl $2, "b" # b = rotl32(b,30) \n\
572"
/*
 * Wrappers: build full register names from the two-letter suffixes and
 * stringify the ring indexes and RCONST. RCONST is expanded where a wrapper
 * is invoked, so each group of 20 rounds redefines it just before use.
 */
573#define RD1A(a,b,c,d,e, n) RD1As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
574#define RD1B(a,b,c,d,e, n) RD1Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
575#define RD1C(a,b,c,d,e, n) RD1Cs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
576#undef RCONST
577#define RCONST 0x5A827999
/* Rounds 0..19. The a..e roles rotate through the registers from one round to the next instead of moving values. */
578 RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
579 RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
580 RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
581 RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
/* Rounds 20..39: round function is c ^ d ^ b. */
582#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
583"\n\
584 movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
585 xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
586 xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
587 xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
588 roll %esi # \n\
589 movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
590 movl "c", %edi # c \n\
591 xorl "d", %edi # ^d \n\
592 xorl "b", %edi # ^b \n\
593 leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
594 addl %edi, "e" # e += (c ^ d ^ b) \n\
595 movl "a", %esi # \n\
596 roll $5, %esi # rotl32(a,5) \n\
597 addl %esi, "e" # e += rotl32(a,5) \n\
598 rorl $2, "b" # b = rotl32(b,30) \n\
599"
600#define RD2(a,b,c,d,e, n) RD2s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
601#undef RCONST
602#define RCONST 0x6ED9EBA1
603 RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
604 RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
605 RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
606 RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
607
/* Rounds 40..59: round function is ((b | c) & d) | (b & c); it is computed first because both temps are needed for it. */
608#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
609"\n\
610 movl "b", %edi # di: b \n\
611 movl "b", %esi # si: b \n\
612 orl "c", %edi # di: b | c \n\
613 andl "c", %esi # si: b & c \n\
614 andl "d", %edi # di: (b | c) & d \n\
615 orl %esi, %edi # ((b | c) & d) | (b & c) \n\
616 movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
617 xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
618 xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
619 xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
620 roll %esi # \n\
621 movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
622 addl %edi, "e" # += ((b | c) & d) | (b & c)\n\
623 leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
624 movl "a", %esi # \n\
625 roll $5, %esi # rotl32(a,5) \n\
626 addl %esi, "e" # e += rotl32(a,5) \n\
627 rorl $2, "b" # b = rotl32(b,30) \n\
628"
629#define RD3(a,b,c,d,e, n) RD3s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
630#undef RCONST
631#define RCONST 0x8F1BBCDC
632 RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
633 RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
634 RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
635 RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
636
/* Rounds 60..76: round function is c ^ d ^ b again, with the last round constant. */
637#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
638"\n\
639 movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
640 xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
641 xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
642 xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
643 roll %esi # \n\
644 movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
645 movl "c", %edi # c \n\
646 xorl "d", %edi # ^d \n\
647 xorl "b", %edi # ^b \n\
648 leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
649 addl %edi, "e" # e += (c ^ d ^ b) \n\
650 movl "a", %esi # \n\
651 roll $5, %esi # rotl32(a,5) \n\
652 addl %esi, "e" # e += rotl32(a,5) \n\
653 rorl $2, "b" # b = rotl32(b,30) \n\
654"
/* Rounds 77..79: identical to RD4As except that the write-back of W is disabled; no remaining round reads those slots. */
655#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
656"\n\
657 movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
658 xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
659 xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
660 xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
661 roll %esi # \n\
662 ##movl %esi, 4*"n"(%esp) # store to W[n & 15] elided \n\
663 movl "c", %edi # c \n\
664 xorl "d", %edi # ^d \n\
665 xorl "b", %edi # ^b \n\
666 leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
667 addl %edi, "e" # e += (c ^ d ^ b) \n\
668 movl "a", %esi # \n\
669 roll $5, %esi # rotl32(a,5) \n\
670 addl %esi, "e" # e += rotl32(a,5) \n\
671 rorl $2, "b" # b = rotl32(b,30) \n\
672"
673#define RD4A(a,b,c,d,e, n) RD4As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
674#define RD4B(a,b,c,d,e, n) RD4Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
675#undef RCONST
676#define RCONST 0xCA62C1D6
677 RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
678 RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
679 RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
680 RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
681
/*
 * Epilogue: the ctx pointer was pushed before the 16 W[] words, so it sits
 * at 4*16(%esp). Fetch it, drop W[] and the ctx slot, add a..e into
 * hash[0..4], then pop the four saved registers in reverse push order.
 */
682"\n\
683 movl 4*16(%esp), %esi # \n\
684 addl $4*(16+1), %esp # \n\
685 addl %eax, 76(%esi) # ctx->hash[0] += a \n\
686 addl %ebx, 80(%esi) # ctx->hash[1] += b \n\
687 addl %ecx, 84(%esi) # ctx->hash[2] += c \n\
688 addl %edx, 88(%esi) # ctx->hash[3] += d \n\
689 addl %ebp, 92(%esi) # ctx->hash[4] += e \n\
690 popl %ebx # \n\
691 popl %esi # \n\
692 popl %edi # \n\
693 popl %ebp # \n\
694"
695 ); /* asm */
696#undef RCONST
697}
698# elif defined(__GNUC__) && defined(__x86_64__)
/*
 * SHA-1 transform of one 64-byte block for x86-64, written as a single
 * asm statement with all 80 rounds unrolled.
 *
 * The asm statement has no operands: it takes the ctx pointer from %rdi.
 * NOTE(review): this presumably relies on the calling convention placing
 * the first argument in %rdi - confirm for every supported x86-64 target.
 *
 * The 16 input words are read from offset 0 of ctx, byte-swapped and stored
 * at -64(%rsp)..-4(%rsp) without moving %rsp; that area is the W[] ring.
 * NOTE(review): writing below %rsp looks like it depends on the SysV
 * red zone being available (no -mno-red-zone) - confirm.
 *
 * hash[0..4] is expected at offset 80 of ctx (enforced by BUILD_BUG_ON).
 * %rbp and %rbx are parked in %r8 and %r9, and ctx in %r10, for the
 * duration of the rounds; nothing is pushed on the stack.
 */
699static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
700{
701 BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80);
702 asm(
703// TODO: store W[] in r8..r15? (r8..r11 are callee-clobbered, no need to save)
704"\n\
705 ##pushq %r15 # \n\
706 ##pushq %r14 # \n\
707 ##pushq %r13 # \n\
708 ##pushq %r12 # \n\
709 ##pushq %rbp # \n\
710 ##pushq %rbx # \n\
711 movq %rbp, %r8 # callee-saved \n\
712 movq %rbx, %r9 # callee-saved \n\
713 movq %rdi, %r10 # we need ctx at the end \n\
714 movl $15, %eax \n\
7151: \n\
716 movl (%rdi,%rax,4), %esi \n\
717 bswap %esi \n\
718 movl %esi, -64(%rsp,%rax,4) \n\
719 decl %eax \n\
720 jns 1b \n\
721 movl 80(%rdi), %eax # a = ctx->hash[0] \n\
722 movl 84(%rdi), %ebx # b = ctx->hash[1] \n\
723 movl 88(%rdi), %ecx # c = ctx->hash[2] \n\
724 movl 92(%rdi), %edx # d = ctx->hash[3] \n\
725 movl 96(%rdi), %ebp # e = ctx->hash[4] \n\
726#Register and stack use: \n\
727# eax..edx: a..d \n\
728# ebp: e \n\
729# esi,edi: temps \n\
730# -64+4*n(%rsp): W[n] \n\
731"
/*
 * In this variant the macro arguments are bare two-letter register suffixes;
 * the templates prepend %e for 32-bit operands and %r where a 64-bit
 * register is needed as the base of the leal address.
 * Round 0 only: the load of W[0] is disabled because the byte-swap loop
 * above ends with W[0] still in %esi.
 */
732#define RD1As(a,b,c,d,e, n, RCONST) \
733"\n\
734 ##movl -64+4*"n"(%rsp), %esi # n=0, W[0] already in %esi \n\
735 movl %e"c", %edi # c \n\
736 xorl %e"d", %edi # ^d \n\
737 andl %e"b", %edi # &b \n\
738 xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\
739 leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n] \n\
740 addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\
741 movl %e"a", %esi # \n\
742 roll $5, %esi # rotl32(a,5) \n\
743 addl %esi, %e"e" # e += rotl32(a,5) \n\
744 rorl $2, %e"b" # b = rotl32(b,30) \n\
745"
/* Rounds 1..15: same as above, but W[n] is loaded from its slot below %rsp. */
746#define RD1Bs(a,b,c,d,e, n, RCONST) \
747"\n\
748 movl -64+4*"n"(%rsp), %esi # W[n] \n\
749 movl %e"c", %edi # c \n\
750 xorl %e"d", %edi # ^d \n\
751 andl %e"b", %edi # &b \n\
752 xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\
753 leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n] \n\
754 addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\
755 movl %e"a", %esi # \n\
756 roll $5, %esi # rotl32(a,5) \n\
757 addl %esi, %e"e" # e += rotl32(a,5) \n\
758 rorl $2, %e"b" # b = rotl32(b,30) \n\
759"
/* Rounds 16..19: the new W value is mixed from four ring entries, rotated left by one, and written back to slot n & 15. */
760#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
761"\n\
762 movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\
763 xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\
764 xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\
765 xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\
766 roll %esi # \n\
767 movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\
768 movl %e"c", %edi # c \n\
769 xorl %e"d", %edi # ^d \n\
770 andl %e"b", %edi # &b \n\
771 xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\
772 leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
773 addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\
774 movl %e"a", %esi # \n\
775 roll $5, %esi # rotl32(a,5) \n\
776 addl %esi, %e"e" # e += rotl32(a,5) \n\
777 rorl $2, %e"b" # b = rotl32(b,30) \n\
778"
/*
 * Wrappers: stringify the register suffixes, the ring indexes and RCONST.
 * RCONST is expanded where a wrapper is invoked, so each group of 20 rounds
 * redefines it just before use.
 */
779#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST))
780#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST))
781#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
782#undef RCONST
783#define RCONST 0x5A827999
/* Rounds 0..19. The a..e roles rotate through the registers from one round to the next instead of moving values. */
784 RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
785 RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
786 RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
787 RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
/* Rounds 20..39: round function is c ^ d ^ b. */
788#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
789"\n\
790 movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\
791 xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\
792 xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\
793 xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\
794 roll %esi # \n\
795 movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\
796 movl %e"c", %edi # c \n\
797 xorl %e"d", %edi # ^d \n\
798 xorl %e"b", %edi # ^b \n\
799 leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
800 addl %edi, %e"e" # e += (c ^ d ^ b) \n\
801 movl %e"a", %esi # \n\
802 roll $5, %esi # rotl32(a,5) \n\
803 addl %esi, %e"e" # e += rotl32(a,5) \n\
804 rorl $2, %e"b" # b = rotl32(b,30) \n\
805"
806#define RD2(a,b,c,d,e, n) RD2s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
807#undef RCONST
808#define RCONST 0x6ED9EBA1
809 RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
810 RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
811 RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
812 RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
813
/* Rounds 40..59: round function is ((b | c) & d) | (b & c); it is computed first because both temps are needed for it. */
814#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
815"\n\
816 movl %e"b", %edi # di: b \n\
817 movl %e"b", %esi # si: b \n\
818 orl %e"c", %edi # di: b | c \n\
819 andl %e"c", %esi # si: b & c \n\
820 andl %e"d", %edi # di: (b | c) & d \n\
821 orl %esi, %edi # ((b | c) & d) | (b & c) \n\
822 movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\
823 xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\
824 xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\
825 xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\
826 roll %esi # \n\
827 movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\
828 addl %edi, %e"e" # += ((b | c) & d) | (b & c)\n\
829 leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
830 movl %e"a", %esi # \n\
831 roll $5, %esi # rotl32(a,5) \n\
832 addl %esi, %e"e" # e += rotl32(a,5) \n\
833 rorl $2, %e"b" # b = rotl32(b,30) \n\
834"
835#define RD3(a,b,c,d,e, n) RD3s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
836#undef RCONST
/* The constant is spelled as a negative number with the same low 32 bits as 0x8F1BBCDC; only the low 32 bits of the leal result are kept. */
837//#define RCONST 0x8F1BBCDC "out of range for signed 32bit displacement"
838#define RCONST -0x70e44324
839 RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
840 RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
841 RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
842 RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
843
/* Rounds 60..76: round function is c ^ d ^ b again, with the last round constant. */
844#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
845"\n\
846 movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\
847 xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\
848 xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\
849 xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\
850 roll %esi # \n\
851 movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\
852 movl %e"c", %edi # c \n\
853 xorl %e"d", %edi # ^d \n\
854 xorl %e"b", %edi # ^b \n\
855 leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
856 addl %edi, %e"e" # e += (c ^ d ^ b) \n\
857 movl %e"a", %esi # \n\
858 roll $5, %esi # rotl32(a,5) \n\
859 addl %esi, %e"e" # e += rotl32(a,5) \n\
860 rorl $2, %e"b" # b = rotl32(b,30) \n\
861"
/* Rounds 77..79: identical to RD4As except that the write-back of W is disabled; no remaining round reads those slots. */
862#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
863"\n\
864 movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\
865 xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\
866 xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\
867 xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\
868 roll %esi # \n\
869 ##movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] elided \n\
870 movl %e"c", %edi # c \n\
871 xorl %e"d", %edi # ^d \n\
872 xorl %e"b", %edi # ^b \n\
873 leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
874 addl %edi, %e"e" # e += (c ^ d ^ b) \n\
875 movl %e"a", %esi # \n\
876 roll $5, %esi # rotl32(a,5) \n\
877 addl %esi, %e"e" # e += rotl32(a,5) \n\
878 rorl $2, %e"b" # b = rotl32(b,30) \n\
879"
880#define RD4A(a,b,c,d,e, n) RD4As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
881#define RD4B(a,b,c,d,e, n) RD4Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
882#undef RCONST
/* As for the previous group: a negative spelling with the same low 32 bits as 0xCA62C1D6. */
883//#define RCONST 0xCA62C1D6 "out of range for signed 32bit displacement"
884#define RCONST -0x359d3e2a
885 RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
886 RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
887 RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
888 RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
889
/*
 * Epilogue: %rdi was used as a temp during the rounds, so ctx is restored
 * from %r10 before a..e are added into hash[0..4]; then %rbx and %rbp are
 * restored from %r9 and %r8.
 */
890"\n\
891 movq %r10, %rdi # \n\
892 addl %eax, 80(%rdi) # ctx->hash[0] += a \n\
893 addl %ebx, 84(%rdi) # ctx->hash[1] += b \n\
894 addl %ecx, 88(%rdi) # ctx->hash[2] += c \n\
895 addl %edx, 92(%rdi) # ctx->hash[3] += d \n\
896 addl %ebp, 96(%rdi) # ctx->hash[4] += e \n\
897 movq %r9, %rbx # callee-saved \n\
898 movq %r8, %rbp # callee-saved \n\
899 ##popq %rbx # \n\
900 ##popq %rbp # \n\
901 ##popq %r12 # \n\
902 ##popq %r13 # \n\
903 ##popq %r14 # \n\
904 ##popq %r15 # \n\
905"
906 ); /* asm */
907#undef RCONST
908}
909# else
494/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. 910/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
495 * It seems further speedup can be achieved by handling more than 911 * It seems further speedup can be achieved by handling more than
496 * 64 bytes per one function call (coreutils does that). 912 * 64 bytes per one function call (coreutils does that).
@@ -571,6 +987,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
571 ctx->hash[3] += d; 987 ctx->hash[3] += d;
572 ctx->hash[4] += e; 988 ctx->hash[4] += e;
573} 989}
990# endif
574#elif CONFIG_SHA1_SMALL == 1 991#elif CONFIG_SHA1_SMALL == 1
575/* Middle-sized version, +300 bytes of code on x86. */ 992/* Middle-sized version, +300 bytes of code on x86. */
576static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) 993static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)