2 files changed, 456 insertions, 0 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl
new file mode 100644
index 0000000000..d6e98f0811
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -0,0 +1,229 @@
+#!/usr/local/bin/perl
+# At some point it became apparent that the original SSLeay RC4
+# assembler implementation performs suboptimally on latest IA-32
+# microarchitectures. After re-tuning performance has changed as
+# follows:
+#
+# Pentium       +0%
+# Pentium III   +17%
+# AMD           +52%(*)
+# P4            +180%(**)
+#
+# (*)   This number is actually a trade-off:-) It's possible to
+#       achieve +72%, but at the cost of -48% off PIII performance.
+#       In other words code performing further 13% faster on AMD
+#       would perform almost 2 times slower on Intel PIII...
+#       For reference! This code delivers ~80% of rc4-amd64.pl
+#       performance on the same Opteron machine.
+# (**)  This number requires compressed key schedule set up by
+#       RC4_set_key and therefore doesn't apply to 0.9.7 [option for
+#       compressed key schedule is implemented in 0.9.8 and later,
+#       see commentary section in rc4_skey.c for further details].
+#
+#                                       <appro@fy.chalmers.se>
+# Make the perlasm helpers reachable and start the output for this module.
+push(@INC,"perlasm","../../perlasm");
+require "x86asm.pl";
+&asm_init($ARGV[0],"rc4-586.pl");
+# Register roles shared by RC4_loop() and RC4():
+#   $x/$y   - the two RC4 state indices
+#   $tx/$ty - scratch for the table entries being swapped
+#   $in/$out- input and output pointers
+#   $d      - base address of the key table
+($x,$y)    = ("eax","ebx");
+($tx,$ty)  = ("ecx","edx");
+($in,$out) = ("esi","edi");
+$d         = "ebp";
+# Emit the one exported routine, then flush the generated assembly.
+&RC4("RC4");
+&asm_finish();
+# RC4_loop($n,$p,$char) - emit the instructions for a single RC4 round.
+#   $n    - round number inside the unrolled group; shown in the emitted
+#           comment and used as the byte offset of the result store
+#           (esp+$n in word mode, $out+$n in byte mode)
+#   $p    - position of the round in its group: negative for the first
+#           round, 0 for a middle round, >=1 for the last round
+#   $char - false: the keystream byte is parked in the stack buffer so the
+#           caller can xor eight bytes at once; true: one input byte is
+#           xor-ed with the keystream byte and written to the output
+# On entry $tx must already hold the table entry at index $x: that load is
+# issued by the previous round (or by the caller before the first round).
+# The odd one-column indentation of some calls is kept from the original
+# and appears to mark instructions that were paired when scheduling.
+sub RC4_loop
+        {
+        local($n,$p,$char)=@_;
+        &comment("Round $n");
+        if ($char)
+                {
+                # Byte mode: every round first checks whether input is exhausted.
+                if ($p >= 0)
+                        {
+                        # Later rounds: reload the end pointer saved in swtmp(2)
+                        # and leave once end <= in (unsigned compare).
+                         &mov($ty,      &swtmp(2));
+                        &cmp($ty,       $in);
+                         &jbe(&label("finished"));
+                        &inc($in);
+                        }
+                else
+                        {
+                        # First round: RC4() leaves (end - 8) in $ty, so add 8 to
+                        # get the real end pointer, bail out if there is no byte
+                        # left, and save the end pointer for the later rounds.
+                        &add($ty,       8);
+                         &inc($in);
+                        &cmp($ty,       $in);
+                         &jb(&label("finished"));
+                        &mov(&swtmp(2), $ty);
+                        }
+                }
+        # Moved out
+        # &mov( $tx,            &DWP(0,$d,$x,4)) if $p < 0;
+        &add(   &LB($y),        &LB($tx));      # y += table[x]  (mod 256)
+        &mov(   $ty,            &DWP(0,$d,$y,4)); # ty = table[y]
+         # XXX
+        &mov(   &DWP(0,$d,$x,4),$ty);           # table[x] = old table[y]
+         &add(  $ty,            $tx);           # ty = table[x] + table[y]
+        &mov(   &DWP(0,$d,$y,4),$tx);           # table[y] = old table[x]
+         &and(  $ty,            0xff);          # reduce the sum to a table index
+         &inc(  &LB($x));                       # NEXT ROUND
+        &mov(   $tx,            &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
+         &mov(  $ty,            &DWP(0,$d,$ty,4)); # ty = keystream value
+        if (!$char)
+                {
+                #moved up into last round
+                if ($p >= 1)
+                        {
+                        &add(   $out,   8)
+                        }
+                # park the keystream byte at offset $n of the stack buffer
+                &movb(  &BP($n,"esp","",0),     &LB($ty));
+                }
+        else
+                {
+                # $in was already incremented above, so the byte to process
+                # is at -1($in); it is loaded into the high byte of $ty.
+                &movb(  &HB($ty),       &BP(-1,$in,"",0));
+                 # XXX
+                &xorb(&LB($ty),         &HB($ty));
+                 # XXX
+                &movb(&BP($n,$out,"",0),&LB($ty));
+                }
+        }
+# RC4($name) - emit the complete RC4 routine under the label $name.
+# The generated code takes four stack arguments: wparam(0) = key,
+# wparam(1) = len, wparam(2) = input pointer, wparam(3) = output pointer.
+# Layout of the generated code:
+#   - return immediately when len is 0
+#   - "start": 8 bytes per iteration using the 32-bit-entry table
+#   - "end":   up to 7 trailing bytes, one per RC4_loop() byte-mode round
+#   - "RC4_CHAR": byte-per-iteration loop for the compressed (byte-entry)
+#     table, selected when the dword at offset 256 of the table is -1
+#   - "finished": write x and y back into the key structure and return
+sub RC4
+        {
+        local($name)=@_;
+        &function_begin_B($name,"");
+        # Zero-length request: return before any register has been pushed.
+        &mov($ty,&wparam(1));           # len
+        &cmp($ty,0);
+        &jne(&label("proceed"));
+        &ret();
+        &set_label("proceed");
+        &comment("");
+        # Save the four registers pushed here (ebp, ebx, esi, edi),
+        # interleaved with clearing x and y.
+        &push("ebp");
+         &push("ebx");
+        &push("esi");
+         &xor(  $x,     $x);            # avoid partial register stalls
+        &push("edi");
+         &xor(  $y,     $y);            # avoid partial register stalls
+        &mov(   $d,     &wparam(0));    # key
+         &mov(  $in,    &wparam(2));
+        # x is read from byte 0 and y from byte 4 of the key structure.
+        &movb(  &LB($x),        &BP(0,$d,"",1));
+         &movb( &LB($y),        &BP(4,$d,"",1));
+        &mov(   $out,   &wparam(3));
+         &inc(  &LB($x));               # x is kept one step ahead; undone at "finished"
+        &stack_push(3); # 3 temp variables
+         &add(  $d,     8);             # $d now addresses the table that follows x and y
+        # detect compressed schedule, see commentary section in rc4_skey.c...
+        # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
+        # as compressed key schedule is set up in 0.9.8 and later.
+        &cmp(&DWP(256,$d),-1);
+        &je(&label("RC4_CHAR"));
+         &lea(  $ty,    &DWP(-8,$ty,$in));      # ty = in + len - 8
+        # remember where the 8-byte loop has to stop
+         &mov(  &swtmp(2),      $ty);   # this is now address to exit at
+        &mov(   $tx,    &DWP(0,$d,$x,4));       # preload table[x] for the first round
+         &cmp(  $ty,    $in);
+        &jb(    &label("end")); # less than 8 bytes
+        &set_label("start");
+        # filling DELAY SLOT
+        &add(   $in,    8);
+        # Eight rounds; each stores its keystream byte at esp+0..7, i.e. in
+        # swtmp(0) and swtmp(1).  The last round also advances $out by 8.
+        &RC4_loop(0,-1,0);
+        &RC4_loop(1,0,0);
+        &RC4_loop(2,0,0);
+        &RC4_loop(3,0,0);
+        &RC4_loop(4,0,0);
+        &RC4_loop(5,0,0);
+        &RC4_loop(6,0,0);
+        &RC4_loop(7,1,0);
+        
+        &comment("apply the cipher text");
+        # xor the cipher data with input
+        #&add(  $out,   8); #moved up into last round
+        # $in and $out were both advanced by 8 already, hence the -8/-4 offsets.
+        &mov(   $tx,    &swtmp(0));
+         &mov(  $ty,    &DWP(-8,$in,"",0));
+        &xor(   $tx,    $ty);
+         &mov(  $ty,    &DWP(-4,$in,"",0)); 
+        &mov(   &DWP(-8,$out,"",0),     $tx);
+         &mov(  $tx,    &swtmp(1));
+        &xor(   $tx,    $ty);
+         &mov(  $ty,    &swtmp(2));     # load end ptr;
+        &mov(   &DWP(-4,$out,"",0),     $tx);
+         &mov(  $tx,            &DWP(0,$d,$x,4)); # preload table[x] for the next round
+        # keep going while in <= end - 8 (unsigned), i.e. 8 more bytes exist
+        &cmp($in,       $ty);
+         &jbe(&label("start"));
+        &set_label("end");
+        # There is quite a bit of extra crap in RC4_loop() for this
+        # first round
+        # Tail: at most seven byte-mode rounds; each one jumps to "finished"
+        # as soon as the input is exhausted.
+        &RC4_loop(0,-1,1);
+        &RC4_loop(1,0,1);
+        &RC4_loop(2,0,1);
+        &RC4_loop(3,0,1);
+        &RC4_loop(4,0,1);
+        &RC4_loop(5,0,1);
+        &RC4_loop(6,1,1);
+        &jmp(&label("finished"));
+        &align(16);
+        # this is essentially Intel P4 specific codepath, see rc4_skey.c,
+        # and is engaged in 0.9.8 and later context...
+        &set_label("RC4_CHAR");
+        &lea    ($ty,&DWP(0,$in,$ty));  # ty = in + len, the end pointer
+        &mov    (&swtmp(2),$ty);
+        # strangely enough unrolled loop performs over 20% slower...
+        # One byte per iteration; the table is indexed without scaling here,
+        # i.e. as one-byte entries.
+        &set_label("RC4_CHAR_loop");
+                &movz   ($tx,&BP(0,$d,$x));
+                &add    (&LB($y),&LB($tx));
+                &movz   ($ty,&BP(0,$d,$y));
+                &movb   (&BP(0,$d,$y),&LB($tx));
+                &movb   (&BP(0,$d,$x),&LB($ty));
+                &add    (&LB($ty),&LB($tx));
+                &movz   ($ty,&BP(0,$d,$ty));
+                &xorb   (&LB($ty),&BP(0,$in));
+                &movb   (&BP(0,$out),&LB($ty));
+                &inc    (&LB($x));
+                &inc    ($in);
+                &inc    ($out);
+                &cmp    ($in,&swtmp(2));
+        &jb     (&label("RC4_CHAR_loop"));      # loop while in < end
+        &set_label("finished");
+        # Undo the look-ahead increment of x, release the temporaries and
+        # store y and x back in front of the table ($d was advanced by 8).
+        &dec(   $x);
+         &stack_pop(3);
+        &movb(  &BP(-4,$d,"",0),&LB($y));
+         &movb( &BP(-8,$d,"",0),&LB($x));
+        &function_end($name);
+        }
diff --git a/src/lib/libcrypto/rc4/asm/rc4-amd64.pl b/src/lib/libcrypto/rc4/asm/rc4-amd64.pl
new file mode 100755
index 0000000000..9e0da8af99
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-amd64.pl
@@ -0,0 +1,227 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
+# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
+# "hand-coded assembler"] doesn't stand for the whole improvement
+# coefficient. It turned out that eliminating RC4_CHAR from config
+# line results in ~40% improvement (yes, even for C implementation).
+# Presumably it has everything to do with AMD cache architecture and
+# RAW or whatever penalties. Once again! The module *requires* config
+# line *without* RC4_CHAR! As for coding "secret," I bet on partial
+# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
+# I simply 'inc %r8b'. Even though optimization manual discourages
+# to operate on partial registers, it turned out to be the best bet.
+# At least for AMD... How IA32E would perform remains to be seen...
+# As was shown by Marc Bevand reordering of couple of load operations
+# results in even higher performance gain of 3.3x:-) At least on
+# Opteron... For reference, 1x in this case is RC4_CHAR C-code
+# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
+# Latter means that if you want to *estimate* what to expect from
+# *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.
+# Intel P4 EM64T core was found to run the AMD64 code really slow...
+# The only way to achieve comparable performance on P4 is to keep
+# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+# compose blended code, which would perform even within 30% marginal
+# on either AMD and Intel platforms, I implement both cases. See
+# rc4_skey.c for further details... This applies to 0.9.8 and later.
+# In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes
+# of code remain redundant.
+# The first command-line argument names the output file.  A name that
+# contains "win64a.s" or "win64a.asm" selects the Win64 flavour.
+$output=shift;
+# Escape the dot and use an alternation: a bracketed "[s|asm]" is a
+# character class matching one of the characters s, |, a or m.
+$win64a=1 if ($output =~ /win64a\.(s|asm)/);
+# open() is parenthesized so that "|| die" tests its return value; with
+# the list-operator form "||" applies to the file-name string instead and
+# a failed open goes unnoticed.
+open(STDOUT,">$output") || die "can't open $output: $!";
+# Registers holding the four arguments (key data, length, input, output).
+# In the Win64 flavour arg3/arg4 arrive in r8/r9 and are moved to
+# rsi/rdi by the generated prologue.
+($dat,$len,$inp,$out) = defined($win64a)
+    ? ("%rcx","%rdx","%rsi","%rdi")     # arg1, arg2, r8 -> rsi, r9 -> rdi
+    : ("%rdi","%rsi","%rdx","%rcx");    # arg1, arg2, arg3, arg4
+# Working registers: the x/y indices and the two table values in flight.
+($XX,$YY) = ("%r10","%r11");
+($TX,$TY) = ("%r8","%r9");
+# PTR(OPERAND) - rewrite one memory operand from the neutral template
+# notation "SIZE:disp[base+index*scale]" into the target syntax and
+# return it.  The "SIZE:" prefix is left alone here; it is dealt with by
+# the substitutions applied to $code at the end of the script.
+#   Win64 flavour: [a+b] -> [b+a], then :disp[ea] -> :[ea+disp]
+#   otherwise:     '+' and '*' -> ',', then [...] -> (...)
+# Declared without a "()" prototype: the sub takes one argument, and the
+# empty prototype only went unnoticed because every call is &PTR(...).
+sub PTR {
+    my $ret=shift;
+    if (defined($win64a)) {
+        $ret =~ s/\[([\S]+)\+([\S]+)\]/[$2+$1]/g;   # [%rN+%rM*4]->[%rM*4+%rN]
+        $ret =~ s/:([^\[]+)\[([^\]]+)\]/:[$2+$1]/g; # :off[ea]->:[ea+off]
+    } else {
+        $ret =~ s/[\+\*]/,/g;           # [%rN+%rM*4]->[%rN,%rM,4]
+        $ret =~ s/\[([^\]]+)\]/($1)/g;  # [%rN]->(%rN)
+    }
+    $ret;
+}
+# Function header, non-Win64 flavour: section/symbol directives and an
+# immediate return when the length argument is zero.  All templates are
+# written with source-first operand order; the Win64 flavour is converted
+# by the substitutions at the end of the script.
+$code=<<___ if (!defined($win64a));
+.text
+.globl  RC4
+.type   RC4,\@function
+.align  16
+RC4:    or      $len,$len
+        jne     .Lentry
+        repret
+.Lentry:
+___
+# Function header, Win64 flavour: same zero-length check, then save
+# rdi/rsi, reserve 40 bytes of stack and copy r8/r9 (arg3/arg4) into the
+# registers used for $inp/$out.
+$code=<<___ if (defined($win64a));
+_TEXT   SEGMENT
+PUBLIC  RC4
+ALIGN   16
+RC4     PROC
+        or      $len,$len
+        jne     .Lentry
+        repret
+.Lentry:
+        push    %rdi
+        push    %rsi
+        sub     \$40,%rsp
+        mov     %r8,$inp
+        mov     %r9,$out
+___
+# Common entry: advance $dat by 8 so that x and y sit at -8/-4 and the
+# table starts at 0, load x and y, branch to .LRC4_CHAR when the dword at
+# offset 256 is -1, and go to .Lloop1 when fewer than 8 bytes are
+# requested.  The first round of the 8-byte loop follows; it also
+# pre-loads the table entry for the next x.
+$code.=<<___;
+        add     \$8,$dat
+        movl    `&PTR("DWORD:-8[$dat]")`,$XX#d
+        movl    `&PTR("DWORD:-4[$dat]")`,$YY#d
+        cmpl    \$-1,`&PTR("DWORD:256[$dat]")`
+        je      .LRC4_CHAR
+        test    \$-8,$len
+        jz      .Lloop1
+.align  16
+.Lloop8:
+        inc     $XX#b
+        movl    `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+        add     $TX#b,$YY#b
+        movl    `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+        movl    $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+        movl    $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+        add     $TX#b,$TY#b
+        inc     $XX#b
+        movl    `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+        movb    `&PTR("BYTE:[$dat+$TY*4]")`,%al
+___
+# Six identical middle rounds: rotate the previous keystream byte out of
+# %al (ror by 8 on %rax), swap the two table entries, pre-load the entry
+# for the next x and fetch the next keystream byte into %al.
+for ($i=1;$i<=6;$i++) {
+$code.=<<___;
+        add     $TX#b,$YY#b
+        ror     \$8,%rax
+        movl    `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+        movl    $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+        movl    $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+        add     $TX#b,$TY#b
+        inc     $XX#b
+        movl    `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+        movb    `&PTR("BYTE:[$dat+$TY*4]")`,%al
+___
+}
+# Last round (no pre-load, x is not advanced), then the eight keystream
+# bytes collected in %rax are xor-ed with eight input bytes and stored.
+# The loop repeats while at least 8 bytes remain; 1..7 leftover bytes go
+# to .Lloop1.  .Lexit writes x and y back in front of the table.
+$code.=<<___;
+        add     $TX#b,$YY#b
+        ror     \$8,%rax
+        movl    `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+        movl    $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+        movl    $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+        sub     \$8,$len
+        add     $TY#b,$TX#b
+        movb    `&PTR("BYTE:[$dat+$TX*4]")`,%al
+        ror     \$8,%rax
+        add     \$8,$inp
+        add     \$8,$out
+        xor     `&PTR("QWORD:-8[$inp]")`,%rax
+        mov     %rax,`&PTR("QWORD:-8[$out]")`
+        test    \$-8,$len
+        jnz     .Lloop8
+        cmp     \$0,$len
+        jne     .Lloop1
+.Lexit:
+        movl    $XX#d,`&PTR("DWORD:-8[$dat]")`
+        movl    $YY#d,`&PTR("DWORD:-4[$dat]")`
+___
+# Win64 only: undo the prologue (release the 40 bytes, restore rsi/rdi).
+$code.=<<___ if (defined($win64a));
+        add     \$40,%rsp
+        pop     %rsi
+        pop     %rdi
+___
+# Return, followed by the two byte-at-a-time loops, both of which end by
+# jumping back to .Lexit:
+#   .Lloop1    - one byte per iteration using the 4-byte-entry table
+#   .LRC4_CHAR - one byte per iteration using the 1-byte-entry table
+$code.=<<___;
+        repret
+.align  16
+.Lloop1:
+        movzb   `&PTR("BYTE:[$inp]")`,%eax
+        inc     $XX#b
+        movl    `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+        add     $TX#b,$YY#b
+        movl    `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+        movl    $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+        movl    $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+        add     $TY#b,$TX#b
+        movl    `&PTR("DWORD:[$dat+$TX*4]")`,$TY#d
+        xor     $TY,%rax
+        inc     $inp
+        movb    %al,`&PTR("BYTE:[$out]")`
+        inc     $out
+        dec     $len
+        jnz     .Lloop1
+        jmp     .Lexit
+.align  16
+.LRC4_CHAR:
+        inc     $XX#b
+        movzb   `&PTR("BYTE:[$dat+$XX]")`,$TX#d
+        add     $TX#b,$YY#b
+        movzb   `&PTR("BYTE:[$dat+$YY]")`,$TY#d
+        movb    $TX#b,`&PTR("BYTE:[$dat+$YY]")`
+        movb    $TY#b,`&PTR("BYTE:[$dat+$XX]")`
+        add     $TX#b,$TY#b
+        movzb   `&PTR("BYTE:[$dat+$TY]")`,$TY#d
+        xorb    `&PTR("BYTE:[$inp]")`,$TY#b
+        movb    $TY#b,`&PTR("BYTE:[$out]")`
+        inc     $inp
+        inc     $out
+        dec     $len
+        jnz     .LRC4_CHAR
+        jmp     .Lexit
+___
+# Flavour-specific trailer: close the procedure/segment for Win64, or
+# record the symbol size otherwise.
+$code.=<<___ if (defined($win64a));
+RC4     ENDP
+_TEXT   ENDS
+END
+___
+$code.=<<___ if (!defined($win64a));
+.size   RC4,.-RC4
+___
+# Post-process the assembled template text.  The order of the
+# substitutions below matters.
+# "%r10#d" -> "%r10d": '#' only separates the register name from its
+# size suffix inside the templates.
+$code =~ s/#([bwd])/$1/gm;
+# Evaluate every back-quoted Perl expression (the &PTR(...) calls).
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+if (defined($win64a)) {
+    # Win64 flavour: rewrite directives, drop the '$' and '%' sigils,
+    # rename local labels, swap "src,dst" into "dst,src", turn the size
+    # prefixes into "... PTR" and strip size suffixes from the mnemonics
+    # used in the templates.
+    $code =~ s/\.align/ALIGN/gm;
+    $code =~ s/[\$%]//gm;
+    $code =~ s/\.L/\$L/gm;
+    $code =~ s/([\w]+)([\s]+)([\S]+),([\S]+)/$1$2$4,$3/gm;
+    $code =~ s/([QD]*WORD|BYTE):/$1 PTR/gm;
+    $code =~ s/mov[bwlq]/mov/gm;
+    $code =~ s/movzb/movzx/gm;
+    $code =~ s/repret/DB\t0F3h,0C3h/gm;     # "repret" is emitted as raw bytes F3 C3
+    $code =~ s/cmpl/cmp/gm;
+    $code =~ s/xorb/xor/gm;
+} else {
+    # Otherwise only the size prefixes are dropped and "repret" is
+    # emitted as the raw bytes F3 C3.
+    $code =~ s/([QD]*WORD|BYTE)://gm;
+    $code =~ s/repret/.byte\t0xF3,0xC3/gm;
+}
+# STDOUT was reopened onto the output file at the top of the script.
+print $code;

diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl new file mode 100644 index 0000000000..d6e98f0811 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -0,0 +1,229 @@
	1	#!/usr/local/bin/perl
	2
	3	# At some point it became apparent that the original SSLeay RC4
	4	# assembler implementation performs suboptimaly on latest IA-32
	5	# microarchitectures. After re-tuning performance has changed as
	6	# following:
	7	#
	8	# Pentium +0%
	9	# Pentium III +17%
	10	# AMD +52%(*)
	11	# P4 +180%(**)
	12	#
	13	# (*) This number is actually a trade-off:-) It's possible to
	14	# achieve +72%, but at the cost of -48% off PIII performance.
	15	# In other words code performing further 13% faster on AMD
	16	# would perform almost 2 times slower on Intel PIII...
	17	# For reference! This code delivers ~80% of rc4-amd64.pl
	18	# performance on the same Opteron machine.
	19	# (**) This number requires compressed key schedule set up by
	20	# RC4_set_key and therefore doesn't apply to 0.9.7 [option for
	21	# compressed key schedule is implemented in 0.9.8 and later,
	22	# see commentary section in rc4_skey.c for further details].
	23	#
	24	# <appro@fy.chalmers.se>
	25
	26	push(@INC,"perlasm","../../perlasm");
	27	require "x86asm.pl";
	28
	29	&asm_init($ARGV[0],"rc4-586.pl");
	30
	31	$x="eax";
	32	$y="ebx";
	33	$tx="ecx";
	34	$ty="edx";
	35	$in="esi";
	36	$out="edi";
	37	$d="ebp";
	38
	39	&RC4("RC4");
	40
	41	&asm_finish();
	42
	43	sub RC4_loop
	44	{
	45	local($n,$p,$char)=@_;
	46
	47	&comment("Round $n");
	48
	49	if ($char)
	50	{
	51	if ($p >= 0)
	52	{
	53	&mov($ty, &swtmp(2));
	54	&cmp($ty, $in);
	55	&jbe(&label("finished"));
	56	&inc($in);
	57	}
	58	else
	59	{
	60	&add($ty, 8);
	61	&inc($in);
	62	&cmp($ty, $in);
	63	&jb(&label("finished"));
	64	&mov(&swtmp(2), $ty);
	65	}
	66	}
	67	# Moved out
	68	# &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;
	69
	70	&add( &LB($y), &LB($tx));
	71	&mov( $ty, &DWP(0,$d,$y,4));
	72	# XXX
	73	&mov( &DWP(0,$d,$x,4),$ty);
	74	&add( $ty, $tx);
	75	&mov( &DWP(0,$d,$y,4),$tx);
	76	&and( $ty, 0xff);
	77	&inc( &LB($x)); # NEXT ROUND
	78	&mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
	79	&mov( $ty, &DWP(0,$d,$ty,4));
	80
	81	if (!$char)
	82	{
	83	#moved up into last round
	84	if ($p >= 1)
	85	{
	86	&add( $out, 8)
	87	}
	88	&movb( &BP($n,"esp","",0), &LB($ty));
	89	}
	90	else
	91	{
	92	# Note in+=8 has occured
	93	&movb( &HB($ty), &BP(-1,$in,"",0));
	94	# XXX
	95	&xorb(&LB($ty), &HB($ty));
	96	# XXX
	97	&movb(&BP($n,$out,"",0),&LB($ty));
	98	}
	99	}
	100
	101
	102	sub RC4
	103	{
	104	local($name)=@_;
	105
	106	&function_begin_B($name,"");
	107
	108	&mov($ty,&wparam(1)); # len
	109	&cmp($ty,0);
	110	&jne(&label("proceed"));
	111	&ret();
	112	&set_label("proceed");
	113
	114	&comment("");
	115
	116	&push("ebp");
	117	&push("ebx");
	118	&push("esi");
	119	&xor( $x, $x); # avoid partial register stalls
	120	&push("edi");
	121	&xor( $y, $y); # avoid partial register stalls
	122	&mov( $d, &wparam(0)); # key
	123	&mov( $in, &wparam(2));
	124
	125	&movb( &LB($x), &BP(0,$d,"",1));
	126	&movb( &LB($y), &BP(4,$d,"",1));
	127
	128	&mov( $out, &wparam(3));
	129	&inc( &LB($x));
	130
	131	&stack_push(3); # 3 temp variables
	132	&add( $d, 8);
	133
	134	# detect compressed schedule, see commentary section in rc4_skey.c...
	135	# in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
	136	# as compressed key schedule is set up in 0.9.8 and later.
	137	&cmp(&DWP(256,$d),-1);
	138	&je(&label("RC4_CHAR"));
	139
	140	&lea( $ty, &DWP(-8,$ty,$in));
	141
	142	# check for 0 length input
	143
	144	&mov( &swtmp(2), $ty); # this is now address to exit at
	145	&mov( $tx, &DWP(0,$d,$x,4));
	146
	147	&cmp( $ty, $in);
	148	&jb( &label("end")); # less than 8 bytes
	149
	150	&set_label("start");
	151
	152	# filling DELAY SLOT
	153	&add( $in, 8);
	154
	155	&RC4_loop(0,-1,0);
	156	&RC4_loop(1,0,0);
	157	&RC4_loop(2,0,0);
	158	&RC4_loop(3,0,0);
	159	&RC4_loop(4,0,0);
	160	&RC4_loop(5,0,0);
	161	&RC4_loop(6,0,0);
	162	&RC4_loop(7,1,0);
	163
	164	&comment("apply the cipher text");
	165	# xor the cipher data with input
	166
	167	#&add( $out, 8); #moved up into last round
	168
	169	&mov( $tx, &swtmp(0));
	170	&mov( $ty, &DWP(-8,$in,"",0));
	171	&xor( $tx, $ty);
	172	&mov( $ty, &DWP(-4,$in,"",0));
	173	&mov( &DWP(-8,$out,"",0), $tx);
	174	&mov( $tx, &swtmp(1));
	175	&xor( $tx, $ty);
	176	&mov( $ty, &swtmp(2)); # load end ptr;
	177	&mov( &DWP(-4,$out,"",0), $tx);
	178	&mov( $tx, &DWP(0,$d,$x,4));
	179	&cmp($in, $ty);
	180	&jbe(&label("start"));
	181
	182	&set_label("end");
	183
	184	# There is quite a bit of extra crap in RC4_loop() for this
	185	# first round
	186	&RC4_loop(0,-1,1);
	187	&RC4_loop(1,0,1);
	188	&RC4_loop(2,0,1);
	189	&RC4_loop(3,0,1);
	190	&RC4_loop(4,0,1);
	191	&RC4_loop(5,0,1);
	192	&RC4_loop(6,1,1);
	193
	194	&jmp(&label("finished"));
	195
	196	&align(16);
	197	# this is essentially Intel P4 specific codepath, see rc4_skey.c,
	198	# and is engaged in 0.9.8 and later context...
	199	&set_label("RC4_CHAR");
	200
	201	&lea ($ty,&DWP(0,$in,$ty));
	202	&mov (&swtmp(2),$ty);
	203
	204	# strangely enough unrolled loop performs over 20% slower...
	205	&set_label("RC4_CHAR_loop");
	206	&movz ($tx,&BP(0,$d,$x));
	207	&add (&LB($y),&LB($tx));
	208	&movz ($ty,&BP(0,$d,$y));
	209	&movb (&BP(0,$d,$y),&LB($tx));
	210	&movb (&BP(0,$d,$x),&LB($ty));
	211	&add (&LB($ty),&LB($tx));
	212	&movz ($ty,&BP(0,$d,$ty));
	213	&xorb (&LB($ty),&BP(0,$in));
	214	&movb (&BP(0,$out),&LB($ty));
	215	&inc (&LB($x));
	216	&inc ($in);
	217	&inc ($out);
	218	&cmp ($in,&swtmp(2));
	219	&jb (&label("RC4_CHAR_loop"));
	220
	221	&set_label("finished");
	222	&dec( $x);
	223	&stack_pop(3);
	224	&movb( &BP(-4,$d,"",0),&LB($y));
	225	&movb( &BP(-8,$d,"",0),&LB($x));
	226
	227	&function_end($name);
	228	}
	229


diff --git a/src/lib/libcrypto/rc4/asm/rc4-amd64.pl b/src/lib/libcrypto/rc4/asm/rc4-amd64.pl new file mode 100755 index 0000000000..9e0da8af99 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-amd64.pl
@@ -0,0 +1,227 @@
	1	#!/usr/bin/env perl
	2	#
	3	# ====================================================================
	4	# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
	5	# project. Rights for redistribution and usage in source and binary
	6	# forms are granted according to the OpenSSL license.
	7	# ====================================================================
	8	#
	9	# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
	10	# "hand-coded assembler"] doesn't stand for the whole improvement
	11	# coefficient. It turned out that eliminating RC4_CHAR from config
	12	# line results in ~40% improvement (yes, even for C implementation).
	13	# Presumably it has everything to do with AMD cache architecture and
	14	# RAW or whatever penalties. Once again! The module *requires* config
	15	# line *without* RC4_CHAR! As for coding "secret," I bet on partial
	16	# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
	17	# I simply 'inc %r8b'. Even though optimization manual discourages
	18	# to operate on partial registers, it turned out to be the best bet.
	19	# At least for AMD... How IA32E would perform remains to be seen...
	20
	21	# As was shown by Marc Bevand reordering of couple of load operations
	22	# results in even higher performance gain of 3.3x:-) At least on
	23	# Opteron... For reference, 1x in this case is RC4_CHAR C-code
	24	# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
	25	# Latter means that if you want to *estimate* what to expect from
	26	# *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.
	27
	28	# Intel P4 EM64T core was found to run the AMD64 code really slow...
	29	# The only way to achieve comparable performance on P4 is to keep
	30	# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
	31	# compose blended code, which would perform even within 30% marginal
	32	# on either AMD and Intel platforms, I implement both cases. See
	33	# rc4_skey.c for further details... This applies to 0.9.8 and later.
	34	# In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes
	35	# of code remain redundant.
	36
	37	$output=shift;
	38
	39	$win64a=1 if ($output =~ /win64a.[s|asm]/);
	40
	41	open STDOUT,">$output" || die "can't open $output: $!";
	42
	43	if (defined($win64a)) {
	44	$dat="%rcx"; # arg1
	45	$len="%rdx"; # arg2
	46	$inp="%rsi"; # r8, arg3 moves here
	47	$out="%rdi"; # r9, arg4 moves here
	48	} else {
	49	$dat="%rdi"; # arg1
	50	$len="%rsi"; # arg2
	51	$inp="%rdx"; # arg3
	52	$out="%rcx"; # arg4
	53	}
	54
	55	$XX="%r10";
	56	$TX="%r8";
	57	$YY="%r11";
	58	$TY="%r9";
	59
	60	sub PTR() {
	61	my $ret=shift;
	62	if (defined($win64a)) {
	63	$ret =~ s/\[([\S]+)\+([\S]+)\]/[$2+$1]/g; # [%rN+%rM*4]->[%rM*4+%rN]
	64	$ret =~ s/:([^\[]+)\[([^\]]+)\]/:[$2+$1]/g; # :off[ea]->:[ea+off]
	65	} else {
	66	$ret =~ s/[\+\*]/,/g; # [%rN+%rM*4]->[%rN,%rM,4]
	67	$ret =~ s/\[([^\]]+)\]/($1)/g; # [%rN]->(%rN)
	68	}
	69	$ret;
	70	}
	71
	72	$code=<<___ if (!defined($win64a));
	73	.text
	74
	75	.globl RC4
	76	.type RC4,\@function
	77	.align 16
	78	RC4: or $len,$len
	79	jne .Lentry
	80	repret
	81	.Lentry:
	82	___
	83	$code=<<___ if (defined($win64a));
	84	_TEXT SEGMENT
	85	PUBLIC RC4
	86	ALIGN 16
	87	RC4 PROC
	88	or $len,$len
	89	jne .Lentry
	90	repret
	91	.Lentry:
	92	push %rdi
	93	push %rsi
	94	sub \$40,%rsp
	95	mov %r8,$inp
	96	mov %r9,$out
	97	___
	98	$code.=<<___;
	99	add \$8,$dat
	100	movl `&PTR("DWORD:-8[$dat]")`,$XX#d
	101	movl `&PTR("DWORD:-4[$dat]")`,$YY#d
	102	cmpl \$-1,`&PTR("DWORD:256[$dat]")`
	103	je .LRC4_CHAR
	104	test \$-8,$len
	105	jz .Lloop1
	106	.align 16
	107	.Lloop8:
	108	inc $XX#b
	109	movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
	110	add $TX#b,$YY#b
	111	movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
	112	movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
	113	movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
	114	add $TX#b,$TY#b
	115	inc $XX#b
	116	movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
	117	movb `&PTR("BYTE:[$dat+$TY*4]")`,%al
	118	___
	119	for ($i=1;$i<=6;$i++) {
	120	$code.=<<___;
	121	add $TX#b,$YY#b
	122	ror \$8,%rax
	123	movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
	124	movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
	125	movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
	126	add $TX#b,$TY#b
	127	inc $XX#b
	128	movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
	129	movb `&PTR("BYTE:[$dat+$TY*4]")`,%al
	130	___
	131	}
	132	$code.=<<___;
	133	add $TX#b,$YY#b
	134	ror \$8,%rax
	135	movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
	136	movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
	137	movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
	138	sub \$8,$len
	139	add $TY#b,$TX#b
	140	movb `&PTR("BYTE:[$dat+$TX*4]")`,%al
	141	ror \$8,%rax
	142	add \$8,$inp
	143	add \$8,$out
	144
	145	xor `&PTR("QWORD:-8[$inp]")`,%rax
	146	mov %rax,`&PTR("QWORD:-8[$out]")`
	147
	148	test \$-8,$len
	149	jnz .Lloop8
	150	cmp \$0,$len
	151	jne .Lloop1
	152	.Lexit:
	153	movl $XX#d,`&PTR("DWORD:-8[$dat]")`
	154	movl $YY#d,`&PTR("DWORD:-4[$dat]")`
	155	___
	156	$code.=<<___ if (defined($win64a));
	157	add \$40,%rsp
	158	pop %rsi
	159	pop %rdi
	160	___
	161	$code.=<<___;
	162	repret
	163	.align 16
	164	.Lloop1:
	165	movzb `&PTR("BYTE:[$inp]")`,%eax
	166	inc $XX#b
	167	movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
	168	add $TX#b,$YY#b
	169	movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
	170	movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
	171	movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
	172	add $TY#b,$TX#b
	173	movl `&PTR("DWORD:[$dat+$TX*4]")`,$TY#d
	174	xor $TY,%rax
	175	inc $inp
	176	movb %al,`&PTR("BYTE:[$out]")`
	177	inc $out
	178	dec $len
	179	jnz .Lloop1
	180	jmp .Lexit
	181
	182	.align 16
	183	.LRC4_CHAR:
	184	inc $XX#b
	185	movzb `&PTR("BYTE:[$dat+$XX]")`,$TX#d
	186	add $TX#b,$YY#b
	187	movzb `&PTR("BYTE:[$dat+$YY]")`,$TY#d
	188	movb $TX#b,`&PTR("BYTE:[$dat+$YY]")`
	189	movb $TY#b,`&PTR("BYTE:[$dat+$XX]")`
	190	add $TX#b,$TY#b
	191	movzb `&PTR("BYTE:[$dat+$TY]")`,$TY#d
	192	xorb `&PTR("BYTE:[$inp]")`,$TY#b
	193	movb $TY#b,`&PTR("BYTE:[$out]")`
	194	inc $inp
	195	inc $out
	196	dec $len
	197	jnz .LRC4_CHAR
	198	jmp .Lexit
	199	___
	200	$code.=<<___ if (defined($win64a));
	201	RC4 ENDP
	202	_TEXT ENDS
	203	END
	204	___
	205	$code.=<<___ if (!defined($win64a));
	206	.size RC4,.-RC4
	207	___
	208
	209	$code =~ s/#([bwd])/$1/gm;
	210	$code =~ s/\`([^\`]*)\`/eval $1/gem;
	211
	212	if (defined($win64a)) {
	213	$code =~ s/\.align/ALIGN/gm;
	214	$code =~ s/[\$%]//gm;
	215	$code =~ s/\.L/\$L/gm;
	216	$code =~ s/([\w]+)([\s]+)([\S]+),([\S]+)/$1$2$4,$3/gm;
	217	$code =~ s/([QD]*WORD|BYTE):/$1 PTR/gm;
	218	$code =~ s/mov[bwlq]/mov/gm;
	219	$code =~ s/movzb/movzx/gm;
	220	$code =~ s/repret/DB\t0F3h,0C3h/gm;
	221	$code =~ s/cmpl/cmp/gm;
	222	$code =~ s/xorb/xor/gm;
	223	} else {
	224	$code =~ s/([QD]*WORD|BYTE)://gm;
	225	$code =~ s/repret/.byte\t0xF3,0xC3/gm;
	226	}
	227	print $code;