1 files changed, 242 insertions, 0 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
new file mode 100755
index 0000000000..f7ed67a726
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
@@ -0,0 +1,242 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# sha1_block procedure for x86_64.
+#
+# It was brought to my attention that on EM64T compiler-generated code
+# was far behind 32-bit assembler implementation. This is unlike on
+# Opteron where compiler-generated code was only 15% behind 32-bit
+# assembler, which originally made it hard to motivate the effort.
+# There was a suggestion to mechanically translate 32-bit code, but I
+# dismissed it, reasoning that x86_64 offers enough register bank
+# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
+# implementation:-) However! While 64-bit code does perform better
+# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
+# x86_64 does offer larger *addressable* bank, but out-of-order core
+# reaches for even more registers through dynamic aliasing, and EM64T
+# core must have managed to run-time optimize even 32-bit code just as
+# good as 64-bit one. Performance improvement is summarized in the
+# following table:
+#
+#               gcc 3.4         32-bit asm      cycles/byte
+# Opteron       +45%            +20%            6.8
+# Xeon P4       +65%            +0%             9.9
+# Core2         +60%            +10%            7.0
+# The single command-line argument names the output file; it is passed on
+# unchanged to the x86_64-xlate.pl translator below.
+$output=shift;
+# Look for x86_64-xlate.pl next to this script first, then in
+# ../../perlasm/ relative to it.  $dir keeps its trailing path separator; if
+# $0 has no directory part the match fails and $dir interpolates as an
+# empty string.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+# From here on everything printed to STDOUT is piped through the translator,
+# which is run with the same perl binary ($^X).  Abort if the pipe cannot be
+# set up; a failed open would otherwise go unnoticed.
+open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";
+# Registers in which the three arguments arrive (PROLOGUE copies them out of
+# %rdi/%rsi/%rdx).  These three assignments are overridden right below and
+# serve as documentation only.
+$ctx="%rdi";    # 1st arg
+$inp="%rsi";    # 2nd arg
+$num="%rdx";    # 3rd arg
+# reassign arguments in order to produce more compact code
+$ctx="%r8";
+$inp="%r9";
+$num="%r10";
+$xi="%eax";     # message word used by the current round
+$t0="%ebx";     # scratch for the round function
+$t1="%ecx";     # second scratch, used by BODY_40_59 only
+# Registers for the five state words loaded by PROLOGUE, plus a sixth
+# register ($T) that receives the word produced by the first round.
+$A="%edx";
+$B="%esi";
+$C="%edi";
+$D="%ebp";
+$E="%r11d";
+$T="%r12d";
+# The round generators take their registers from this list, which the main
+# loop rotates by one position after every round.
+@V=($A,$B,$C,$D,$E,$T);
+# PROLOGUE(name)
+#
+# Appends the entry sequence of the function `name' to $code:
+#   - declares the global symbol and pushes %rbx, %rbp and %r12;
+#   - remembers the post-push %rsp in %rax, reserves 8+16*4 bytes, aligns
+#     %rsp down to a multiple of 64 and stores the remembered %rsp at offset
+#     16*4 of the new frame (EPILOGUE reloads it from there);
+#   - copies the three arguments from %rdi/%rsi/%rdx into $ctx/$inp/$num;
+#   - loads the five 32-bit words at offsets 0..16 of $ctx into $A..$E.
+# The first 16*4 bytes of the frame are the word buffer addressed through
+# (%rsp) by the round generators.
+sub PROLOGUE {
+my ($func)=@_;
+$code.=<<___;
+.globl  $func
+.type   $func,\@function,3
+.align  16
+$func:
+        push    %rbx
+        push    %rbp
+        push    %r12
+        mov     %rsp,%rax
+        mov     %rdi,$ctx       # reassigned argument
+        sub     \$`8+16*4`,%rsp
+        mov     %rsi,$inp       # reassigned argument
+        and     \$-64,%rsp
+        mov     %rdx,$num       # reassigned argument
+        mov     %rax,`16*4`(%rsp)
+        mov     0($ctx),$A
+        mov     4($ctx),$B
+        mov     8($ctx),$C
+        mov     12($ctx),$D
+        mov     16($ctx),$E
+___
+}
+# EPILOGUE(name)
+#
+# Appends the exit sequence of the function `name' to $code: reloads the
+# stack pointer that PROLOGUE stored at offset 16*4 of the frame, pops
+# %r12, %rbp and %rbx (reverse of the push order), returns, and emits the
+# .size directive for the symbol.
+sub EPILOGUE {
+my ($func)=@_;
+$code.=<<___;
+        mov     `16*4`(%rsp),%rsp
+        pop     %r12
+        pop     %rbp
+        pop     %rbx
+        ret
+.size   $func,.-$func
+___
+}
+# BODY_00_19(i, a, b, c, d, e, f [, host])
+#
+# Appends the code for round i to $code (the driver calls it for i = 0..19).
+# a..e name the registers holding the state words going into the round and
+# f names the register that receives the new word:
+#       f = 0x5a827999 + X[i] + e + rol(a,5) + (((c ^ d) & b) ^ d)
+# b is rotated left by 30 in place and e is overwritten with rol(a,5).
+# While computing round i the code also prepares the next message word in
+# $xi: for i < 15 it is loaded from the input block, byte-swapped and saved
+# in the 16-word buffer at (%rsp); from i = 15 on it is the XOR of four
+# buffer entries rotated left by 1, written back to the buffer.
+#
+# NOTE(review): the heredocs interpolate $host before the back-ticked
+# expression is eval'ed at the end of the script, so `defined($host)' is
+# evaluated on the expanded text rather than on the variable.  No caller in
+# this file passes $host.
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
+my $j=$i+1;
+# Very first round: fetch X[0], since no earlier round has prepared it.
+if ($i==0) {
+$code.=<<___;
+        mov     `4*$i`($inp),$xi        
+        `"bswap $xi"    if(!defined($host))`
+        mov     $xi,`4*$i`(%rsp)
+___
+}
+if ($i<15) {
+# Next word still comes straight from the input block.
+$code.=<<___;
+        lea     0x5a827999($xi,$e),$f
+        mov     $c,$t0
+        mov     `4*$j`($inp),$xi
+        mov     $a,$e
+        xor     $d,$t0
+        `"bswap $xi"    if(!defined($host))`    
+        rol     \$5,$e
+        and     $b,$t0
+        mov     $xi,`4*$j`(%rsp)
+        add     $e,$f
+        xor     $d,$t0
+        rol     \$30,$b
+        add     $t0,$f
+___
+} else {
+# Next word is derived from the circular buffer.
+$code.=<<___;
+        lea     0x5a827999($xi,$e),$f
+        mov     `4*($j%16)`(%rsp),$xi
+        mov     $c,$t0
+        mov     $a,$e
+        xor     `4*(($j+2)%16)`(%rsp),$xi
+        xor     $d,$t0
+        rol     \$5,$e
+        xor     `4*(($j+8)%16)`(%rsp),$xi
+        and     $b,$t0
+        add     $e,$f
+        xor     `4*(($j+13)%16)`(%rsp),$xi
+        xor     $d,$t0
+        rol     \$30,$b
+        add     $t0,$f
+        rol     \$1,$xi
+        mov     $xi,`4*($j%16)`(%rsp)
+___
+}
+}
+# BODY_20_39(i, a, b, c, d, e, f)
+#
+# Appends the code for round i to $code.  The driver uses it for i = 20..39
+# and again for i = 60..79; the additive constant is 0x6ed9eba1 for i < 40
+# and 0xca62c1d6 otherwise.  Register roles are as in the other round
+# generators:
+#       f = K + X[i] + e + rol(a,5) + (b ^ c ^ d)
+# with b rotated left by 30 in place and e overwritten with rol(a,5).
+# For i < 79 the next message word is computed into $xi from four buffer
+# entries; it is written back to the buffer only while i < 76.  Round 79
+# prepares no further word.
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e,$f)=@_;
+my $j=$i+1;
+my $K;
+if ($i<40) {
+    $K=0x6ed9eba1;
+} else {
+    $K=0xca62c1d6;
+}
+if ($i<79) {
+$code.=<<___;
+        lea     $K($xi,$e),$f
+        mov     `4*($j%16)`(%rsp),$xi
+        mov     $c,$t0
+        mov     $a,$e
+        xor     `4*(($j+2)%16)`(%rsp),$xi
+        xor     $b,$t0
+        rol     \$5,$e
+        xor     `4*(($j+8)%16)`(%rsp),$xi
+        xor     $d,$t0
+        add     $e,$f
+        xor     `4*(($j+13)%16)`(%rsp),$xi
+        rol     \$30,$b
+        add     $t0,$f
+        rol     \$1,$xi
+___
+    if ($i<76) {
+$code.=<<___;
+        mov     $xi,`4*($j%16)`(%rsp)
+___
+    }
+} elsif ($i==79) {
+$code.=<<___;
+        lea     $K($xi,$e),$f
+        mov     $c,$t0
+        mov     $a,$e
+        xor     $b,$t0
+        rol     \$5,$e
+        xor     $d,$t0
+        add     $e,$f
+        rol     \$30,$b
+        add     $t0,$f
+___
+}
+}
+# BODY_40_59(i, a, b, c, d, e, f)
+#
+# Appends the code for round i to $code (the driver calls it for
+# i = 40..59).  Register roles are as in the other round generators:
+#       f = 0x8f1bbcdc + X[i] + e + rol(a,5) + ((b & c) | ((b | c) & d))
+# with b rotated left by 30 in place and e overwritten with rol(a,5).
+# This is the only generator that needs the second scratch register $t1.
+# The next message word is computed into $xi from four buffer entries,
+# rotated left by 1 and written back to the buffer.
+sub BODY_40_59 {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f)=@_;
+my $j=1+$i;
+$code.=<<___;
+        lea     0x8f1bbcdc($xi,$e),$f
+        mov     `4*($j%16)`(%rsp),$xi
+        mov     $b,$t0
+        mov     $b,$t1
+        xor     `4*(($j+2)%16)`(%rsp),$xi
+        mov     $a,$e
+        and     $c,$t0
+        xor     `4*(($j+8)%16)`(%rsp),$xi
+        or      $c,$t1
+        rol     \$5,$e
+        xor     `4*(($j+13)%16)`(%rsp),$xi
+        and     $d,$t1
+        add     $e,$f
+        rol     \$1,$xi
+        or      $t1,$t0
+        rol     \$30,$b
+        mov     $xi,`4*($j%16)`(%rsp)
+        add     $t0,$f
+___
+}
+# Assemble the whole function in $code: prologue, the 80 unrolled rounds
+# that form the body of .Lloop, the state update, and the epilogue.
+$code=".text\n";
+&PROLOGUE("sha1_block_data_order");
+$code.=".align  4\n.Lloop:\n";
+# After each round the register list is rotated right by one position, so
+# the register that just received the new word (the generator's $f) is
+# handed to the next round as its $a.
+for($i=0;$i<20;$i++)    { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+for(;$i<40;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for(;$i<60;$i++)        { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for(;$i<80;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+# 80 rotations of the 6-entry list leave it as ($E,$T,$A,$B,$C,$D), i.e. the
+# five working words now live in $E,$T,$A,$B,$C.  Add the saved state to
+# them, store the result back to $ctx, then shuffle the words back into
+# $A..$E for the next iteration.  The loop is bottom-tested, so at least one
+# 64-byte block is always processed.
+$code.=<<___;
+        add     0($ctx),$E
+        add     4($ctx),$T
+        add     8($ctx),$A
+        add     12($ctx),$B
+        add     16($ctx),$C
+        mov     $E,0($ctx)
+        mov     $T,4($ctx)
+        mov     $A,8($ctx)
+        mov     $B,12($ctx)
+        mov     $C,16($ctx)
+        xchg    $E,$A   # mov   $E,$A
+        xchg    $T,$B   # mov   $T,$B
+        xchg    $E,$C   # mov   $A,$C
+        xchg    $T,$D   # mov   $B,$D
+                        # mov   $C,$E
+        lea     `16*4`($inp),$inp
+        sub     \$1,$num
+        jnz     .Lloop
+___
+&EPILOGUE("sha1_block_data_order");
+$code.=<<___;
+.asciz  "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+####################################################################
+# Replace every back-ticked expression in the generated text with the
+# result of evaluating it as Perl (offsets such as 4*($j%16), the optional
+# bswap, ...), then write the result out.
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+# STDOUT is a pipe to the translator: close() returns false if flushing
+# fails or if the translator exits with a non-zero status (in which case $!
+# is 0 and $? holds the status).  Do not let either go unnoticed.
+close STDOUT or die "error closing STDOUT: $! (child status $?)";

diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl new file mode 100755 index 0000000000..f7ed67a726 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
@@ -0,0 +1,242 @@
	1	#!/usr/bin/env perl
	2	#
	3	# ====================================================================
	4	# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
	5	# project. The module is, however, dual licensed under OpenSSL and
	6	# CRYPTOGAMS licenses depending on where you obtain it. For further
	7	# details see http://www.openssl.org/~appro/cryptogams/.
	8	# ====================================================================
	9	#
	10	# sha1_block procedure for x86_64.
	11	#
	12	# It was brought to my attention that on EM64T compiler-generated code
	13	# was far behind 32-bit assembler implementation. This is unlike on
	14	# Opteron where compiler-generated code was only 15% behind 32-bit
	15	# assembler, which originally made it hard to motivate the effort.
	16	# There was a suggestion to mechanically translate 32-bit code, but I
	17	# dismissed it, reasoning that x86_64 offers enough register bank
	18	# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
	19	# implementation:-) However! While 64-bit code does perform better
	20	# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
	21	# x86_64 does offer larger addressable bank, but out-of-order core
	22	# reaches for even more registers through dynamic aliasing, and EM64T
	23	# core must have managed to run-time optimize even 32-bit code just as
	24	# good as 64-bit one. Performance improvement is summarized in the
	25	# following table:
	26	#
	27	#               gcc 3.4         32-bit asm      cycles/byte
	28	# Opteron       +45%            +20%            6.8
	29	# Xeon P4       +65%            +0%             9.9
	30	# Core2         +60%            +10%            7.0
	31
	32	$output=shift;
	33
	34	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	35	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	36	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	37	die "can't locate x86_64-xlate.pl";
	38
	39	open STDOUT,"\| $^X $xlate $output";
	40
	41	$ctx="%rdi"; # 1st arg
	42	$inp="%rsi"; # 2nd arg
	43	$num="%rdx"; # 3rd arg
	44
	45	# reassign arguments in order to produce more compact code
	46	$ctx="%r8";
	47	$inp="%r9";
	48	$num="%r10";
	49
	50	$xi="%eax";
	51	$t0="%ebx";
	52	$t1="%ecx";
	53	$A="%edx";
	54	$B="%esi";
	55	$C="%edi";
	56	$D="%ebp";
	57	$E="%r11d";
	58	$T="%r12d";
	59
	60	@V=($A,$B,$C,$D,$E,$T);
	61
	62	sub PROLOGUE {
	63	my $func=shift;
	64	$code.=<<___;
	65	.globl $func
	66	.type $func,\@function,3
	67	.align 16
	68	$func:
	69	push %rbx
	70	push %rbp
	71	push %r12
	72	mov %rsp,%rax
	73	mov %rdi,$ctx # reassigned argument
	74	sub \$`8+16*4`,%rsp
	75	mov %rsi,$inp # reassigned argument
	76	and \$-64,%rsp
	77	mov %rdx,$num # reassigned argument
	78	mov %rax,`16*4`(%rsp)
	79
	80	mov 0($ctx),$A
	81	mov 4($ctx),$B
	82	mov 8($ctx),$C
	83	mov 12($ctx),$D
	84	mov 16($ctx),$E
	85	___
	86	}
	87
	88	sub EPILOGUE {
	89	my $func=shift;
	90	$code.=<<___;
	91	mov `16*4`(%rsp),%rsp
	92	pop %r12
	93	pop %rbp
	94	pop %rbx
	95	ret
	96	.size $func,.-$func
	97	___
	98	}
	99
	100	sub BODY_00_19 {
	101	my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
	102	my $j=$i+1;
	103	$code.=<<___ if ($i==0);
	104	mov `4*$i`($inp),$xi
	105	`"bswap $xi" if(!defined($host))`
	106	mov $xi,`4*$i`(%rsp)
	107	___
	108	$code.=<<___ if ($i<15);
	109	lea 0x5a827999($xi,$e),$f
	110	mov $c,$t0
	111	mov `4*$j`($inp),$xi
	112	mov $a,$e
	113	xor $d,$t0
	114	`"bswap $xi" if(!defined($host))`
	115	rol \$5,$e
	116	and $b,$t0
	117	mov $xi,`4*$j`(%rsp)
	118	add $e,$f
	119	xor $d,$t0
	120	rol \$30,$b
	121	add $t0,$f
	122	___
	123	$code.=<<___ if ($i>=15);
	124	lea 0x5a827999($xi,$e),$f
	125	mov `4*($j%16)`(%rsp),$xi
	126	mov $c,$t0
	127	mov $a,$e
	128	xor `4*(($j+2)%16)`(%rsp),$xi
	129	xor $d,$t0
	130	rol \$5,$e
	131	xor `4*(($j+8)%16)`(%rsp),$xi
	132	and $b,$t0
	133	add $e,$f
	134	xor `4*(($j+13)%16)`(%rsp),$xi
	135	xor $d,$t0
	136	rol \$30,$b
	137	add $t0,$f
	138	rol \$1,$xi
	139	mov $xi,`4*($j%16)`(%rsp)
	140	___
	141	}
	142
	143	sub BODY_20_39 {
	144	my ($i,$a,$b,$c,$d,$e,$f)=@_;
	145	my $j=$i+1;
	146	my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
	147	$code.=<<___ if ($i<79);
	148	lea $K($xi,$e),$f
	149	mov `4*($j%16)`(%rsp),$xi
	150	mov $c,$t0
	151	mov $a,$e
	152	xor `4*(($j+2)%16)`(%rsp),$xi
	153	xor $b,$t0
	154	rol \$5,$e
	155	xor `4*(($j+8)%16)`(%rsp),$xi
	156	xor $d,$t0
	157	add $e,$f
	158	xor `4*(($j+13)%16)`(%rsp),$xi
	159	rol \$30,$b
	160	add $t0,$f
	161	rol \$1,$xi
	162	___
	163	$code.=<<___ if ($i<76);
	164	mov $xi,`4*($j%16)`(%rsp)
	165	___
	166	$code.=<<___ if ($i==79);
	167	lea $K($xi,$e),$f
	168	mov $c,$t0
	169	mov $a,$e
	170	xor $b,$t0
	171	rol \$5,$e
	172	xor $d,$t0
	173	add $e,$f
	174	rol \$30,$b
	175	add $t0,$f
	176	___
	177	}
	178
	179	sub BODY_40_59 {
	180	my ($i,$a,$b,$c,$d,$e,$f)=@_;
	181	my $j=$i+1;
	182	$code.=<<___;
	183	lea 0x8f1bbcdc($xi,$e),$f
	184	mov `4*($j%16)`(%rsp),$xi
	185	mov $b,$t0
	186	mov $b,$t1
	187	xor `4*(($j+2)%16)`(%rsp),$xi
	188	mov $a,$e
	189	and $c,$t0
	190	xor `4*(($j+8)%16)`(%rsp),$xi
	191	or $c,$t1
	192	rol \$5,$e
	193	xor `4*(($j+13)%16)`(%rsp),$xi
	194	and $d,$t1
	195	add $e,$f
	196	rol \$1,$xi
	197	or $t1,$t0
	198	rol \$30,$b
	199	mov $xi,`4*($j%16)`(%rsp)
	200	add $t0,$f
	201	___
	202	}
	203
	204	$code=".text\n";
	205
	206	&PROLOGUE("sha1_block_data_order");
	207	$code.=".align 4\n.Lloop:\n";
	208	for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
	209	for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
	210	for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
	211	for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
	212	$code.=<<___;
	213	add 0($ctx),$E
	214	add 4($ctx),$T
	215	add 8($ctx),$A
	216	add 12($ctx),$B
	217	add 16($ctx),$C
	218	mov $E,0($ctx)
	219	mov $T,4($ctx)
	220	mov $A,8($ctx)
	221	mov $B,12($ctx)
	222	mov $C,16($ctx)
	223
	224	xchg $E,$A # mov $E,$A
	225	xchg $T,$B # mov $T,$B
	226	xchg $E,$C # mov $A,$C
	227	xchg $T,$D # mov $B,$D
	228	# mov $C,$E
	229	lea `16*4`($inp),$inp
	230	sub \$1,$num
	231	jnz .Lloop
	232	___
	233	&EPILOGUE("sha1_block_data_order");
	234	$code.=<<___;
	235	.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
	236	___
	237
	238	####################################################################
	239
	240	$code =~ s/\`([^\`]*)\`/eval $1/gem;
	241	print $code;
	242	close STDOUT;