diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-01 12:21:01 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-01 13:57:34 +0100 |
commit | 5f6817020467598868b7d1c9ca477d7ccd66b87d (patch) | |
tree | 931b244d18b1dba9afd8861a0474429e33fa7329 /libbb | |
parent | f09d088fdf6eeeba902fb5627930145a3058a5f0 (diff) | |
download | busybox-w32-5f6817020467598868b7d1c9ca477d7ccd66b87d.tar.gz busybox-w32-5f6817020467598868b7d1c9ca477d7ccd66b87d.tar.bz2 busybox-w32-5f6817020467598868b7d1c9ca477d7ccd66b87d.zip |
libbb/sha1: assembly versions for x86
32 bits:
function old new delta
sha1_process_block64 3950 3657 -293
64 bits:
sha1_process_block64 4167 3683 -484
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb')
-rw-r--r-- | libbb/Config.src | 2 | ||||
-rw-r--r-- | libbb/hash_md5_sha.c | 417 |
2 files changed, 418 insertions, 1 deletions
diff --git a/libbb/Config.src b/libbb/Config.src index d2054dc63..e027c14a8 100644 --- a/libbb/Config.src +++ b/libbb/Config.src | |||
@@ -59,7 +59,7 @@ config SHA1_SMALL | |||
59 | Trade binary size versus speed for the sha1 algorithm. | 59 | Trade binary size versus speed for the sha1 algorithm. |
60 | throughput MB/s size of sha1_process_block64 | 60 | throughput MB/s size of sha1_process_block64 |
61 | value 486 x86-64 486 x86-64 | 61 | value 486 x86-64 486 x86-64 |
62 | 0 360 374 3950 4167 | 62 | 0 367 367 3657 3683 |
63 | 1 224 229 654 732 | 63 | 1 224 229 654 732 |
64 | 2,3 200 195 358 380 | 64 | 2,3 200 195 358 380 |
65 | 65 | ||
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index faf485df5..9de30dfe6 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c | |||
@@ -8,6 +8,9 @@ | |||
8 | */ | 8 | */ |
9 | #include "libbb.h" | 9 | #include "libbb.h" |
10 | 10 | ||
11 | #define STR1(s) #s | ||
12 | #define STR(s) STR1(s) | ||
13 | |||
11 | #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) | 14 | #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) |
12 | 15 | ||
13 | /* gcc 4.2.1 optimizes rotr64 better with inline than with macro | 16 | /* gcc 4.2.1 optimizes rotr64 better with inline than with macro |
@@ -491,6 +494,419 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf) | |||
491 | */ | 494 | */ |
492 | 495 | ||
493 | #if CONFIG_SHA1_SMALL == 0 | 496 | #if CONFIG_SHA1_SMALL == 0 |
497 | # if defined(__GNUC__) && defined(__i386__) | ||
498 | static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | ||
499 | { | ||
500 | BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 76); | ||
501 | asm( | ||
502 | "\n\ | ||
503 | pushl %ebp # \n\ | ||
504 | pushl %edi # \n\ | ||
505 | pushl %esi # \n\ | ||
506 | pushl %ebx # \n\ | ||
507 | pushl %eax \n\ | ||
508 | movl $15, %edi \n\ | ||
509 | 1: \n\ | ||
510 | movl (%eax,%edi,4), %esi \n\ | ||
511 | bswap %esi \n\ | ||
512 | pushl %esi \n\ | ||
513 | decl %edi \n\ | ||
514 | jns 1b \n\ | ||
515 | movl 80(%eax), %ebx # b = ctx->hash[1] \n\ | ||
516 | movl 84(%eax), %ecx # c = ctx->hash[2] \n\ | ||
517 | movl 88(%eax), %edx # d = ctx->hash[3] \n\ | ||
518 | movl 92(%eax), %ebp # e = ctx->hash[4] \n\ | ||
519 | movl 76(%eax), %eax # a = ctx->hash[0] \n\ | ||
520 | #Register and stack use: \n\ | ||
521 | # eax..edx: a..d \n\ | ||
522 | # ebp: e \n\ | ||
523 | # esi,edi: temps \n\ | ||
524 | # 4*n(%esp): W[n] \n\ | ||
525 | " | ||
526 | #define RD1As(a,b,c,d,e, n, RCONST) \ | ||
527 | "\n\ | ||
528 | ##movl 4*"n"(%esp), %esi # n=0, W[0] already in %esi \n\ | ||
529 | movl "c", %edi # c \n\ | ||
530 | xorl "d", %edi # ^d \n\ | ||
531 | andl "b", %edi # &b \n\ | ||
532 | xorl "d", %edi # (((c ^ d) & b) ^ d) \n\ | ||
533 | leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\ | ||
534 | addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\ | ||
535 | movl "a", %esi # \n\ | ||
536 | roll $5, %esi # rotl32(a,5) \n\ | ||
537 | addl %esi, "e" # e += rotl32(a,5) \n\ | ||
538 | rorl $2, "b" # b = rotl32(b,30) \n\ | ||
539 | " | ||
540 | #define RD1Bs(a,b,c,d,e, n, RCONST) \ | ||
541 | "\n\ | ||
542 | movl 4*"n"(%esp), %esi # W[n] \n\ | ||
543 | movl "c", %edi # c \n\ | ||
544 | xorl "d", %edi # ^d \n\ | ||
545 | andl "b", %edi # &b \n\ | ||
546 | xorl "d", %edi # (((c ^ d) & b) ^ d) \n\ | ||
547 | leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\ | ||
548 | addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\ | ||
549 | movl "a", %esi # \n\ | ||
550 | roll $5, %esi # rotl32(a,5) \n\ | ||
551 | addl %esi, "e" # e += rotl32(a,5) \n\ | ||
552 | rorl $2, "b" # b = rotl32(b,30) \n\ | ||
553 | " | ||
554 | #define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
555 | "\n\ | ||
556 | movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\ | ||
557 | xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\ | ||
558 | xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\ | ||
559 | xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\ | ||
560 | roll %esi # \n\ | ||
561 | movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\ | ||
562 | movl "c", %edi # c \n\ | ||
563 | xorl "d", %edi # ^d \n\ | ||
564 | andl "b", %edi # &b \n\ | ||
565 | xorl "d", %edi # (((c ^ d) & b) ^ d) \n\ | ||
566 | leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\ | ||
567 | addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\ | ||
568 | movl "a", %esi # \n\ | ||
569 | roll $5, %esi # rotl32(a,5) \n\ | ||
570 | addl %esi, "e" # e += rotl32(a,5) \n\ | ||
571 | rorl $2, "b" # b = rotl32(b,30) \n\ | ||
572 | " | ||
573 | #define RD1A(a,b,c,d,e, n) RD1As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST)) | ||
574 | #define RD1B(a,b,c,d,e, n) RD1Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST)) | ||
575 | #define RD1C(a,b,c,d,e, n) RD1Cs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) | ||
576 | #undef RCONST | ||
577 | #define RCONST 0x5A827999 | ||
578 | RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) | ||
579 | RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9) | ||
580 | RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14) | ||
581 | RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19) | ||
582 | #define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
583 | "\n\ | ||
584 | movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\ | ||
585 | xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\ | ||
586 | xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\ | ||
587 | xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\ | ||
588 | roll %esi # \n\ | ||
589 | movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\ | ||
590 | movl "c", %edi # c \n\ | ||
591 | xorl "d", %edi # ^d \n\ | ||
592 | xorl "b", %edi # ^b \n\ | ||
593 | leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\ | ||
594 | addl %edi, "e" # e += (c ^ d ^ b) \n\ | ||
595 | movl "a", %esi # \n\ | ||
596 | roll $5, %esi # rotl32(a,5) \n\ | ||
597 | addl %esi, "e" # e += rotl32(a,5) \n\ | ||
598 | rorl $2, "b" # b = rotl32(b,30) \n\ | ||
599 | " | ||
600 | #define RD2(a,b,c,d,e, n) RD2s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST)) | ||
601 | #undef RCONST | ||
602 | #define RCONST 0x6ED9EBA1 | ||
603 | RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4) | ||
604 | RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9) | ||
605 | RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14) | ||
606 | RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19) | ||
607 | |||
608 | #define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
609 | "\n\ | ||
610 | movl "b", %edi # di: b \n\ | ||
611 | movl "b", %esi # si: b \n\ | ||
612 | orl "c", %edi # di: b | c \n\ | ||
613 | andl "c", %esi # si: b & c \n\ | ||
614 | andl "d", %edi # di: (b | c) & d \n\ | ||
615 | orl %esi, %edi # ((b | c) & d) | (b & c) \n\ | ||
616 | movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\ | ||
617 | xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\ | ||
618 | xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\ | ||
619 | xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\ | ||
620 | roll %esi # \n\ | ||
621 | movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\ | ||
622 | addl %edi, "e" # += ((b | c) & d) | (b & c)\n\ | ||
623 | leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\ | ||
624 | movl "a", %esi # \n\ | ||
625 | roll $5, %esi # rotl32(a,5) \n\ | ||
626 | addl %esi, "e" # e += rotl32(a,5) \n\ | ||
627 | rorl $2, "b" # b = rotl32(b,30) \n\ | ||
628 | " | ||
629 | #define RD3(a,b,c,d,e, n) RD3s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST)) | ||
630 | #undef RCONST | ||
631 | #define RCONST 0x8F1BBCDC | ||
632 | RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4) | ||
633 | RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9) | ||
634 | RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14) | ||
635 | RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19) | ||
636 | |||
637 | #define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
638 | "\n\ | ||
639 | movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\ | ||
640 | xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\ | ||
641 | xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\ | ||
642 | xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\ | ||
643 | roll %esi # \n\ | ||
644 | movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\ | ||
645 | movl "c", %edi # c \n\ | ||
646 | xorl "d", %edi # ^d \n\ | ||
647 | xorl "b", %edi # ^b \n\ | ||
648 | leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\ | ||
649 | addl %edi, "e" # e += (c ^ d ^ b) \n\ | ||
650 | movl "a", %esi # \n\ | ||
651 | roll $5, %esi # rotl32(a,5) \n\ | ||
652 | addl %esi, "e" # e += rotl32(a,5) \n\ | ||
653 | rorl $2, "b" # b = rotl32(b,30) \n\ | ||
654 | " | ||
655 | #define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
656 | "\n\ | ||
657 | movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\ | ||
658 | xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\ | ||
659 | xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\ | ||
660 | xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\ | ||
661 | roll %esi # \n\ | ||
662 | ##movl %esi, 4*"n"(%esp) # store to W[n & 15] elided \n\ | ||
663 | movl "c", %edi # c \n\ | ||
664 | xorl "d", %edi # ^d \n\ | ||
665 | xorl "b", %edi # ^b \n\ | ||
666 | leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\ | ||
667 | addl %edi, "e" # e += (c ^ d ^ b) \n\ | ||
668 | movl "a", %esi # \n\ | ||
669 | roll $5, %esi # rotl32(a,5) \n\ | ||
670 | addl %esi, "e" # e += rotl32(a,5) \n\ | ||
671 | rorl $2, "b" # b = rotl32(b,30) \n\ | ||
672 | " | ||
673 | #define RD4A(a,b,c,d,e, n) RD4As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) | ||
674 | #define RD4B(a,b,c,d,e, n) RD4Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) | ||
675 | #undef RCONST | ||
676 | #define RCONST 0xCA62C1D6 | ||
677 | RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4) | ||
678 | RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9) | ||
679 | RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14) | ||
680 | RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19) | ||
681 | |||
682 | "\n\ | ||
683 | movl 4*16(%esp), %esi # \n\ | ||
684 | addl $4*(16+1), %esp # \n\ | ||
685 | addl %eax, 76(%esi) # ctx->hash[0] += a \n\ | ||
686 | addl %ebx, 80(%esi) # ctx->hash[1] += b \n\ | ||
687 | addl %ecx, 84(%esi) # ctx->hash[2] += c \n\ | ||
688 | addl %edx, 88(%esi) # ctx->hash[3] += d \n\ | ||
689 | addl %ebp, 92(%esi) # ctx->hash[4] += e \n\ | ||
690 | popl %ebx # \n\ | ||
691 | popl %esi # \n\ | ||
692 | popl %edi # \n\ | ||
693 | popl %ebp # \n\ | ||
694 | " | ||
695 | ); /* asm */ | ||
696 | #undef RCONST | ||
697 | } | ||
698 | # elif defined(__GNUC__) && defined(__x86_64__) | ||
699 | static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | ||
700 | { | ||
701 | BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80); | ||
702 | asm( | ||
703 | // TODO: store W[] in r8..r15? (r8..r11 are call-clobbered, i.e. caller-saved, so no need to save them) | ||
704 | "\n\ | ||
705 | ##pushq %r15 # \n\ | ||
706 | ##pushq %r14 # \n\ | ||
707 | ##pushq %r13 # \n\ | ||
708 | ##pushq %r12 # \n\ | ||
709 | ##pushq %rbp # \n\ | ||
710 | ##pushq %rbx # \n\ | ||
711 | movq %rbp, %r8 # callee-saved \n\ | ||
712 | movq %rbx, %r9 # callee-saved \n\ | ||
713 | movq %rdi, %r10 # we need ctx at the end \n\ | ||
714 | movl $15, %eax \n\ | ||
715 | 1: \n\ | ||
716 | movl (%rdi,%rax,4), %esi \n\ | ||
717 | bswap %esi \n\ | ||
718 | movl %esi, -64(%rsp,%rax,4) \n\ | ||
719 | decl %eax \n\ | ||
720 | jns 1b \n\ | ||
721 | movl 80(%rdi), %eax # a = ctx->hash[0] \n\ | ||
722 | movl 84(%rdi), %ebx # b = ctx->hash[1] \n\ | ||
723 | movl 88(%rdi), %ecx # c = ctx->hash[2] \n\ | ||
724 | movl 92(%rdi), %edx # d = ctx->hash[3] \n\ | ||
725 | movl 96(%rdi), %ebp # e = ctx->hash[4] \n\ | ||
726 | #Register and stack use: \n\ | ||
727 | # eax..edx: a..d \n\ | ||
728 | # ebp: e \n\ | ||
729 | # esi,edi: temps \n\ | ||
730 | # -64+4*n(%rsp): W[n] \n\ | ||
731 | " | ||
732 | #define RD1As(a,b,c,d,e, n, RCONST) \ | ||
733 | "\n\ | ||
734 | ##movl -64+4*"n"(%rsp), %esi # n=0, W[0] already in %esi \n\ | ||
735 | movl %e"c", %edi # c \n\ | ||
736 | xorl %e"d", %edi # ^d \n\ | ||
737 | andl %e"b", %edi # &b \n\ | ||
738 | xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ | ||
739 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n] \n\ | ||
740 | addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ | ||
741 | movl %e"a", %esi # \n\ | ||
742 | roll $5, %esi # rotl32(a,5) \n\ | ||
743 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | ||
744 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | ||
745 | " | ||
746 | #define RD1Bs(a,b,c,d,e, n, RCONST) \ | ||
747 | "\n\ | ||
748 | movl -64+4*"n"(%rsp), %esi # W[n] \n\ | ||
749 | movl %e"c", %edi # c \n\ | ||
750 | xorl %e"d", %edi # ^d \n\ | ||
751 | andl %e"b", %edi # &b \n\ | ||
752 | xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ | ||
753 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n] \n\ | ||
754 | addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ | ||
755 | movl %e"a", %esi # \n\ | ||
756 | roll $5, %esi # rotl32(a,5) \n\ | ||
757 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | ||
758 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | ||
759 | " | ||
760 | #define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
761 | "\n\ | ||
762 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | ||
763 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | ||
764 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | ||
765 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | ||
766 | roll %esi # \n\ | ||
767 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | ||
768 | movl %e"c", %edi # c \n\ | ||
769 | xorl %e"d", %edi # ^d \n\ | ||
770 | andl %e"b", %edi # &b \n\ | ||
771 | xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\ | ||
772 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ | ||
773 | addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\ | ||
774 | movl %e"a", %esi # \n\ | ||
775 | roll $5, %esi # rotl32(a,5) \n\ | ||
776 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | ||
777 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | ||
778 | " | ||
779 | #define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST)) | ||
780 | #define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST)) | ||
781 | #define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) | ||
782 | #undef RCONST | ||
783 | #define RCONST 0x5A827999 | ||
784 | RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) | ||
785 | RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9) | ||
786 | RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14) | ||
787 | RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19) | ||
788 | #define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
789 | "\n\ | ||
790 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | ||
791 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | ||
792 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | ||
793 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | ||
794 | roll %esi # \n\ | ||
795 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | ||
796 | movl %e"c", %edi # c \n\ | ||
797 | xorl %e"d", %edi # ^d \n\ | ||
798 | xorl %e"b", %edi # ^b \n\ | ||
799 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ | ||
800 | addl %edi, %e"e" # e += (c ^ d ^ b) \n\ | ||
801 | movl %e"a", %esi # \n\ | ||
802 | roll $5, %esi # rotl32(a,5) \n\ | ||
803 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | ||
804 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | ||
805 | " | ||
806 | #define RD2(a,b,c,d,e, n) RD2s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST)) | ||
807 | #undef RCONST | ||
808 | #define RCONST 0x6ED9EBA1 | ||
809 | RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4) | ||
810 | RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9) | ||
811 | RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14) | ||
812 | RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19) | ||
813 | |||
814 | #define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
815 | "\n\ | ||
816 | movl %e"b", %edi # di: b \n\ | ||
817 | movl %e"b", %esi # si: b \n\ | ||
818 | orl %e"c", %edi # di: b | c \n\ | ||
819 | andl %e"c", %esi # si: b & c \n\ | ||
820 | andl %e"d", %edi # di: (b | c) & d \n\ | ||
821 | orl %esi, %edi # ((b | c) & d) | (b & c) \n\ | ||
822 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | ||
823 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | ||
824 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | ||
825 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | ||
826 | roll %esi # \n\ | ||
827 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | ||
828 | addl %edi, %e"e" # += ((b | c) & d) | (b & c)\n\ | ||
829 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ | ||
830 | movl %e"a", %esi # \n\ | ||
831 | roll $5, %esi # rotl32(a,5) \n\ | ||
832 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | ||
833 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | ||
834 | " | ||
835 | #define RD3(a,b,c,d,e, n) RD3s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST)) | ||
836 | #undef RCONST | ||
837 | //#define RCONST 0x8F1BBCDC "out of range for signed 32bit displacement" | ||
838 | #define RCONST -0x70e44324 | ||
839 | RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4) | ||
840 | RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9) | ||
841 | RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14) | ||
842 | RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19) | ||
843 | |||
844 | #define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
845 | "\n\ | ||
846 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | ||
847 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | ||
848 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | ||
849 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | ||
850 | roll %esi # \n\ | ||
851 | movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\ | ||
852 | movl %e"c", %edi # c \n\ | ||
853 | xorl %e"d", %edi # ^d \n\ | ||
854 | xorl %e"b", %edi # ^b \n\ | ||
855 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ | ||
856 | addl %edi, %e"e" # e += (c ^ d ^ b) \n\ | ||
857 | movl %e"a", %esi # \n\ | ||
858 | roll $5, %esi # rotl32(a,5) \n\ | ||
859 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | ||
860 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | ||
861 | " | ||
862 | #define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \ | ||
863 | "\n\ | ||
864 | movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\ | ||
865 | xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\ | ||
866 | xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\ | ||
867 | xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\ | ||
868 | roll %esi # \n\ | ||
869 | ##movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] elided \n\ | ||
870 | movl %e"c", %edi # c \n\ | ||
871 | xorl %e"d", %edi # ^d \n\ | ||
872 | xorl %e"b", %edi # ^b \n\ | ||
873 | leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\ | ||
874 | addl %edi, %e"e" # e += (c ^ d ^ b) \n\ | ||
875 | movl %e"a", %esi # \n\ | ||
876 | roll $5, %esi # rotl32(a,5) \n\ | ||
877 | addl %esi, %e"e" # e += rotl32(a,5) \n\ | ||
878 | rorl $2, %e"b" # b = rotl32(b,30) \n\ | ||
879 | " | ||
880 | #define RD4A(a,b,c,d,e, n) RD4As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) | ||
881 | #define RD4B(a,b,c,d,e, n) RD4Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) | ||
882 | #undef RCONST | ||
883 | //#define RCONST 0xCA62C1D6 "out of range for signed 32bit displacement" | ||
884 | #define RCONST -0x359d3e2a | ||
885 | RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4) | ||
886 | RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9) | ||
887 | RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14) | ||
888 | RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19) | ||
889 | |||
890 | "\n\ | ||
891 | movq %r10, %rdi # \n\ | ||
892 | addl %eax, 80(%rdi) # ctx->hash[0] += a \n\ | ||
893 | addl %ebx, 84(%rdi) # ctx->hash[1] += b \n\ | ||
894 | addl %ecx, 88(%rdi) # ctx->hash[2] += c \n\ | ||
895 | addl %edx, 92(%rdi) # ctx->hash[3] += d \n\ | ||
896 | addl %ebp, 96(%rdi) # ctx->hash[4] += e \n\ | ||
897 | movq %r9, %rbx # callee-saved \n\ | ||
898 | movq %r8, %rbp # callee-saved \n\ | ||
899 | ##popq %rbx # \n\ | ||
900 | ##popq %rbp # \n\ | ||
901 | ##popq %r12 # \n\ | ||
902 | ##popq %r13 # \n\ | ||
903 | ##popq %r14 # \n\ | ||
904 | ##popq %r15 # \n\ | ||
905 | " | ||
906 | ); /* asm */ | ||
907 | #undef RCONST | ||
908 | } | ||
909 | # else | ||
494 | /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. | 910 | /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. |
495 | * It seems further speedup can be achieved by handling more than | 911 | * It seems further speedup can be achieved by handling more than |
496 | * 64 bytes per one function call (coreutils does that). | 912 | * 64 bytes per one function call (coreutils does that). |
@@ -571,6 +987,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | |||
571 | ctx->hash[3] += d; | 987 | ctx->hash[3] += d; |
572 | ctx->hash[4] += e; | 988 | ctx->hash[4] += e; |
573 | } | 989 | } |
990 | # endif | ||
574 | #elif CONFIG_SHA1_SMALL == 1 | 991 | #elif CONFIG_SHA1_SMALL == 1 |
575 | /* Middle-sized version, +300 bytes of code on x86. */ | 992 | /* Middle-sized version, +300 bytes of code on x86. */ |
576 | static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | 993 | static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) |