1 files changed, 504 insertions, 0 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
new file mode 100755
index 0000000000..544386bf53
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -0,0 +1,504 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
+# "hand-coded assembler"] doesn't stand for the whole improvement
+# coefficient. It turned out that eliminating RC4_CHAR from config
+# line results in ~40% improvement (yes, even for C implementation).
+# Presumably it has everything to do with AMD cache architecture and
+# RAW or whatever penalties. Once again! The module *requires* config
+# line *without* RC4_CHAR! As for coding "secret," I bet on partial
+# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
+# I simply 'inc %r8b'. Even though optimization manual discourages
+# to operate on partial registers, it turned out to be the best bet.
+# At least for AMD... How IA32E would perform remains to be seen...
+# As was shown by Marc Bevand reordering of couple of load operations
+# results in even higher performance gain of 3.3x:-) At least on
+# Opteron... For reference, 1x in this case is RC4_CHAR C-code
+# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
+# Latter means that if you want to *estimate* what to expect from
+# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
+# Intel P4 EM64T core was found to run the AMD64 code really slow...
+# The only way to achieve comparable performance on P4 was to keep
+# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+# compose blended code, which would perform even within a 30% margin
+# on both AMD and Intel platforms, I implement both cases. See
+# rc4_skey.c for further details...
+# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing 
+# those with add/sub results in 50% performance improvement of folded
+# loop...
+# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
+# performance by >30% [unlike P4 32-bit case that is]. But this is
+# provided that loads are reordered even more aggressively! Both code
+# paths, AMD64 and EM64T, reorder loads in essentially same manner
+# as my IA-64 implementation. On Opteron this resulted in modest 5%
+# improvement [I had to test it], while final Intel P4 performance
+# achieves respectable 432MBps on 2.8GHz processor now. For reference.
+# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
+# RC4_INT code-path. While if executed on Opteron, it's only 25%
+# slower than the RC4_INT one [meaning that if CPU µ-arch detection
+# is not implemented, then this final RC4_CHAR code-path should be
+# preferred, as it provides better *all-round* performance].
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as virtually identical 32-bit loop was
+# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
+# cloop1 boosts its performance by 80%! This loop appears to be optimal
+# fit for Core2 and therefore the code was modified to skip cloop8 on
+# this CPU.
+# Command line: [flavour] output. A single argument containing a dot is
+# taken to be the output file name, leaving the flavour undefined.
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# Win64 output (and the SEH tables appended below) is selected by a
+# nasm/masm/mingw64 flavour or by an output file ending in ".asm".
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+# Look for the translator next to this script, then in ../../perlasm/.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+# Everything printed from here on is piped through the translator; fail
+# loudly if the child process cannot be started.
+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
+# Register map for RC4(); RC4_set_key() reuses $dat, $len and $inp.
+$dat="%rdi";        # arg1: key state (x index, y index, then the table)
+$len="%rsi";        # arg2: byte count
+$inp="%rdx";        # arg3: input pointer
+$out="%rcx";        # arg4: output pointer
+@XX=("%r8","%r10"); # current / next x index; swapped after each unrolled step
+@TX=("%r9","%r11"); # table value at the current / next x index
+$YY="%r12";         # y index
+$TY="%r13";         # table value at y, then the keystream lookup index
+# RC4: the stream routine, declared with four arguments (see the register
+# map above). Layout of the key state as this code accesses it:
+#   offset 0   dword x index
+#   offset 4   dword y index
+#   offset 8   the table; $dat is advanced by 8 so x and y sit at -8/-4
+# The table is either 256 dwords, or 256 bytes followed by a dword -1
+# marker at 256($dat); the marker selects the .LRC4_CHAR path.
+# The routine returns at once for a zero length. Otherwise it pushes
+# %rbx, %r12 and %r13, pre-increments x and preloads the table entry at x
+# before entering a loop; .Lexit undoes the pre-increment when storing x.
+$code=<<___;
+.text
+.globl  RC4
+.type   RC4,\@function,4
+.align  16
+RC4:    or      $len,$len
+        jne     .Lentry
+        ret
+.Lentry:
+        push    %rbx
+        push    %r12
+        push    %r13
+.Lprologue:
+        add     \$8,$dat
+        movl    -8($dat),$XX[0]#d
+        movl    -4($dat),$YY#d
+        cmpl    \$-1,256($dat)
+        je      .LRC4_CHAR
+        inc     $XX[0]#b
+        movl    ($dat,$XX[0],4),$TX[0]#d
+        test    \$-8,$len
+        jz      .Lloop1
+        jmp     .Lloop8
+.align  16
+.Lloop8:
+___
+# Body of .Lloop8 (dword table): eight unrolled steps. Every step rotates
+# %rax right by 8 and then loads one keystream byte into %al, so after
+# the extra rotate in the tail %rax holds eight keystream bytes in order.
+# The next x and its table entry are loaded into $XX[1]/$TX[1] before the
+# swap is written back; when the next x equals y, the cmove replaces that
+# stale preload with the value just stored at y.
+for ($i=0;$i<8;$i++) {
+$code.=<<___;
+        add     $TX[0]#b,$YY#b
+        mov     $XX[0],$XX[1]
+        movl    ($dat,$YY,4),$TY#d
+        ror     \$8,%rax                        # ror is redundant when $i=0
+        inc     $XX[1]#b
+        movl    ($dat,$XX[1],4),$TX[1]#d
+        cmp     $XX[1],$YY
+        movl    $TX[0]#d,($dat,$YY,4)
+        cmove   $TX[0],$TX[1]
+        movl    $TY#d,($dat,$XX[0],4)
+        add     $TX[0]#b,$TY#b
+        movb    ($dat,$TY,4),%al
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));     # "rotate" registers
+}
+# Tail of .Lloop8: final rotate, XOR eight input bytes with %rax, store
+# eight output bytes, and repeat while at least 8 bytes remain; a
+# non-zero remainder continues in .Lloop1.
+# .Lloop1: one byte per iteration on the dword table.
+# .LRC4_CHAR: entry for the byte table. It goes to .Lcloop1 when fewer
+# than 8 bytes are requested or when the dword at 260($dat) is non-zero,
+# otherwise to .Lcloop8, which begins by loading eight input bytes into
+# %eax and %ebx.
+$code.=<<___;
+        ror     \$8,%rax
+        sub     \$8,$len
+        xor     ($inp),%rax
+        add     \$8,$inp
+        mov     %rax,($out)
+        add     \$8,$out
+        test    \$-8,$len
+        jnz     .Lloop8
+        cmp     \$0,$len
+        jne     .Lloop1
+        jmp     .Lexit
+.align  16
+.Lloop1:
+        add     $TX[0]#b,$YY#b
+        movl    ($dat,$YY,4),$TY#d
+        movl    $TX[0]#d,($dat,$YY,4)
+        movl    $TY#d,($dat,$XX[0],4)
+        add     $TY#b,$TX[0]#b
+        inc     $XX[0]#b
+        movl    ($dat,$TX[0],4),$TY#d
+        movl    ($dat,$XX[0],4),$TX[0]#d
+        xorb    ($inp),$TY#b
+        inc     $inp
+        movb    $TY#b,($out)
+        inc     $out
+        dec     $len
+        jnz     .Lloop1
+        jmp     .Lexit
+.align  16
+.LRC4_CHAR:
+        add     \$1,$XX[0]#b
+        movzb   ($dat,$XX[0]),$TX[0]#d
+        test    \$-8,$len
+        jz      .Lcloop1
+        cmpl    \$0,260($dat)
+        jnz     .Lcloop1
+        jmp     .Lcloop8
+.align  16
+.Lcloop8:
+        mov     ($inp),%eax
+        mov     4($inp),%ebx
+___
+# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
+# First four steps: the keystream byte is XORed straight into %al and
+# %eax is rotated by 8. The stale-preload case (next x equals y) is
+# handled with a compare-and-branch here instead of a cmove; the .Lcmov
+# labels are numbered by the Perl loop counter.
+for ($i=0;$i<4;$i++) {
+$code.=<<___;
+        add     $TX[0]#b,$YY#b
+        lea     1($XX[0]),$XX[1]
+        movzb   ($dat,$YY),$TY#d
+        movzb   $XX[1]#b,$XX[1]#d
+        movzb   ($dat,$XX[1]),$TX[1]#d
+        movb    $TX[0]#b,($dat,$YY)
+        cmp     $XX[1],$YY
+        movb    $TY#b,($dat,$XX[0])
+        jne     .Lcmov$i                        # Intel cmov is sloooow...
+        mov     $TX[0],$TX[1]
+.Lcmov$i:
+        add     $TX[0]#b,$TY#b
+        xor     ($dat,$TY),%al
+        ror     \$8,%eax
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));     # "rotate" registers
+}
+# Last four steps: identical, but accumulating into %bl/%ebx.
+for ($i=4;$i<8;$i++) {
+$code.=<<___;
+        add     $TX[0]#b,$YY#b
+        lea     1($XX[0]),$XX[1]
+        movzb   ($dat,$YY),$TY#d
+        movzb   $XX[1]#b,$XX[1]#d
+        movzb   ($dat,$XX[1]),$TX[1]#d
+        movb    $TX[0]#b,($dat,$YY)
+        cmp     $XX[1],$YY
+        movb    $TY#b,($dat,$XX[0])
+        jne     .Lcmov$i                        # Intel cmov is sloooow...
+        mov     $TX[0],$TX[1]
+.Lcmov$i:
+        add     $TX[0]#b,$TY#b
+        xor     ($dat,$TY),%bl
+        ror     \$8,%ebx
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));     # "rotate" registers
+}
+# Tail of .Lcloop8: store both 32-bit halves, advance the pointers,
+# subtract 8 from the length and repeat while at least 8 bytes remain; a
+# non-zero remainder continues in .Lcloop1.
+$code.=<<___;
+        lea     -8($len),$len
+        mov     %eax,($out)
+        lea     8($inp),$inp
+        mov     %ebx,4($out)
+        lea     8($out),$out
+        test    \$-8,$len
+        jnz     .Lcloop8
+        cmp     \$0,$len
+        jne     .Lcloop1
+        jmp     .Lexit
+___
+# .Lcloop1: one byte per iteration on the byte table (this is the loop
+# carrying the two extra movzb mentioned in the header notes).
+# .Lexit: common exit. Undoes the x pre-increment, writes x and y back,
+# reloads %r13/%r12/%rbx from the stack and releases the 24 bytes that
+# the three pushes at .Lentry occupied.
+$code.=<<___;
+.align  16
+.Lcloop1:
+        add     $TX[0]#b,$YY#b
+        movzb   ($dat,$YY),$TY#d
+        movb    $TX[0]#b,($dat,$YY)
+        movb    $TY#b,($dat,$XX[0])
+        add     $TX[0]#b,$TY#b
+        add     \$1,$XX[0]#b
+        movzb   $TY#b,$TY#d
+        movzb   $XX[0]#b,$XX[0]#d
+        movzb   ($dat,$TY),$TY#d
+        movzb   ($dat,$XX[0]),$TX[0]#d
+        xorb    ($inp),$TY#b
+        lea     1($inp),$inp
+        movb    $TY#b,($out)
+        lea     1($out),$out
+        sub     \$1,$len
+        jnz     .Lcloop1
+        jmp     .Lexit
+.align  16
+.Lexit:
+        sub     \$1,$XX[0]#b
+        movl    $XX[0]#d,-8($dat)
+        movl    $YY#d,-4($dat)
+        mov     (%rsp),%r13
+        mov     8(%rsp),%r12
+        mov     16(%rsp),%rbx
+        add     \$24,%rsp
+.Lepilogue:
+        ret
+.size   RC4,.-RC4
+___
+# Scratch registers for RC4_set_key: $idx is the running swap index (it
+# first holds the OPENSSL_ia32cap_P word), $ido counts 0..255.
+$idx="%r8";
+$ido="%r9";
+# RC4_set_key: declared with three arguments, used here as key state
+# ($dat), key length ($len) and key bytes ($inp).
+#  - $dat is advanced past the two index dwords; $inp is moved to the end
+#    of the key and $len negated, so ($inp,$len) walks the key from its
+#    first byte while $len counts up to zero. %rcx keeps the negated
+#    length and is reloaded into $len whenever $len reaches zero, which
+#    repeats the key.
+#  - Bit 20 of OPENSSL_ia32cap_P chooses the table layout: clear selects
+#    the dword loops (.Lw1stloop/.Lw2ndloop), set selects the byte loops
+#    (.Lc1stloop/.Lc2ndloop). On the byte path bit 30 is stored as 0/1 in
+#    the dword at 260($dat), which RC4 tests to skip .Lcloop8.
+#    NOTE(review): what bits 20 and 30 stand for is defined where
+#    OPENSSL_ia32cap_P is set up, not in this file - confirm there.
+#  - The first loop of each pair fills the table with 0..255 (it ends when
+#    the add to %al carries); the second adds the table entry and a key
+#    byte to $idx and swaps the entries at $ido and $idx, 256 times.
+#  - The byte path finishes by writing -1 at 256($dat), the marker RC4
+#    checks to pick .LRC4_CHAR.
+#  - .Lexit_key zeroes the x and y indices; %eax is zero on return.
+#
+# RC4_options: returns in %rax a pointer into .Lopts. It starts at
+# "rc4(8x,int)", advances by 12 (the length of that string including its
+# NUL) to "rc4(8x,char)" when bit 20 is set, and by a further 13 to
+# "rc4(1x,char)" when bit 30 is set as well.
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl  RC4_set_key
+.type   RC4_set_key,\@function,3
+.align  16
+RC4_set_key:
+        lea     8($dat),$dat
+        lea     ($inp,$len),$inp
+        neg     $len
+        mov     $len,%rcx
+        xor     %eax,%eax
+        xor     $ido,$ido
+        xor     %r10,%r10
+        xor     %r11,%r11
+        mov     PIC_GOT(OPENSSL_ia32cap_P),$idx#d
+        bt      \$20,$idx#d
+        jnc     .Lw1stloop
+        bt      \$30,$idx#d
+        setc    $ido#b
+        mov     $ido#d,260($dat)
+        jmp     .Lc1stloop
+.align  16
+.Lw1stloop:
+        mov     %eax,($dat,%rax,4)
+        add     \$1,%al
+        jnc     .Lw1stloop
+        xor     $ido,$ido
+        xor     $idx,$idx
+.align  16
+.Lw2ndloop:
+        mov     ($dat,$ido,4),%r10d
+        add     ($inp,$len,1),$idx#b
+        add     %r10b,$idx#b
+        add     \$1,$len
+        mov     ($dat,$idx,4),%r11d
+        cmovz   %rcx,$len
+        mov     %r10d,($dat,$idx,4)
+        mov     %r11d,($dat,$ido,4)
+        add     \$1,$ido#b
+        jnc     .Lw2ndloop
+        jmp     .Lexit_key
+.align  16
+.Lc1stloop:
+        mov     %al,($dat,%rax)
+        add     \$1,%al
+        jnc     .Lc1stloop
+        xor     $ido,$ido
+        xor     $idx,$idx
+.align  16
+.Lc2ndloop:
+        mov     ($dat,$ido),%r10b
+        add     ($inp,$len),$idx#b
+        add     %r10b,$idx#b
+        add     \$1,$len
+        mov     ($dat,$idx),%r11b
+        jnz     .Lcnowrap
+        mov     %rcx,$len
+.Lcnowrap:
+        mov     %r10b,($dat,$idx)
+        mov     %r11b,($dat,$ido)
+        add     \$1,$ido#b
+        jnc     .Lc2ndloop
+        movl    \$-1,256($dat)
+.align  16
+.Lexit_key:
+        xor     %eax,%eax
+        mov     %eax,-8($dat)
+        mov     %eax,-4($dat)
+        ret
+.size   RC4_set_key,.-RC4_set_key
+.globl  RC4_options
+.type   RC4_options,\@abi-omnipotent
+.align  16
+RC4_options:
+        lea     .Lopts(%rip),%rax
+        mov     PIC_GOT(OPENSSL_ia32cap_P),%edx
+        bt      \$20,%edx
+        jnc     .Ldone
+        add     \$12,%rax
+        bt      \$30,%edx
+        jnc     .Ldone
+        add     \$13,%rax
+.Ldone:
+        ret
+.align  64
+.Lopts:
+.asciz  "rc4(8x,int)"
+.asciz  "rc4(8x,char)"
+.asciz  "rc4(1x,char)"
+.asciz  "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align  64
+.size   RC4_options,.-RC4_options
+___
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
+#
+# Win64-only structured-exception support, appended when $win64 is set.
+# Both handlers save %rsi, %rdi, %rbx, %rbp, %r12-%r15 and the flags,
+# reserve 64 bytes of stack, patch the CONTEXT record and then share
+# .Lcommon_seh_exit.
+#
+# stream_se_handler (attached to RC4 through .LSEH_info_RC4) chooses a
+# frame base from the faulting Rip:
+#   - before .Lprologue: context->Rax is used as the base;
+#   - at or after .Lepilogue: context->Rsp is used as is;
+#   - in between: context->Rsp plus 24, and Rbx/R12/R13 are reloaded from
+#     the three slots just below it, matching the three pushes at .Lentry.
+# It then reloads Rdi and Rsi from 8 and 16 bytes above the base and
+# writes Rsp, Rsi and Rdi back into the context.
+# NOTE(review): the Rdi/Rsi slots and the use of Rax as a frame pointer
+# presumably mirror the Win64 prologue that x86_64-xlate.pl wraps around
+# "function" symbols - confirm against the translator.
+#
+# key_se_handler (attached to RC4_set_key) only reloads Rdi and Rsi
+# relative to context->Rsp, since RC4_set_key pushes nothing itself.
+#
+# .Lcommon_seh_exit copies 154 quadwords of CONTEXT into
+# disp->ContextRecord, calls RtlVirtualUnwind with the arguments noted on
+# each line, and returns 1 (ExceptionContinueSearch).
+#
+# The .pdata entries refer to .LSEH_begin_*/.LSEH_end_* labels that are
+# not defined in this file; presumably the translator emits them.
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type   stream_se_handler,\@abi-omnipotent
+.align  16
+stream_se_handler:
+        push    %rsi
+        push    %rdi
+        push    %rbx
+        push    %rbp
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %r15
+        pushfq
+        sub     \$64,%rsp
+        mov     120($context),%rax      # pull context->Rax
+        mov     248($context),%rbx      # pull context->Rip
+        lea     .Lprologue(%rip),%r10
+        cmp     %r10,%rbx               # context->Rip<prologue label
+        jb      .Lin_prologue
+        mov     152($context),%rax      # pull context->Rsp
+        lea     .Lepilogue(%rip),%r10
+        cmp     %r10,%rbx               # context->Rip>=epilogue label
+        jae     .Lin_prologue
+        lea     24(%rax),%rax
+        mov     -8(%rax),%rbx
+        mov     -16(%rax),%r12
+        mov     -24(%rax),%r13
+        mov     %rbx,144($context)      # restore context->Rbx
+        mov     %r12,216($context)      # restore context->R12
+        mov     %r13,224($context)      # restore context->R13
+.Lin_prologue:
+        mov     8(%rax),%rdi
+        mov     16(%rax),%rsi
+        mov     %rax,152($context)      # restore context->Rsp
+        mov     %rsi,168($context)      # restore context->Rsi
+        mov     %rdi,176($context)      # restore context->Rdi
+        jmp     .Lcommon_seh_exit
+.size   stream_se_handler,.-stream_se_handler
+.type   key_se_handler,\@abi-omnipotent
+.align  16
+key_se_handler:
+        push    %rsi
+        push    %rdi
+        push    %rbx
+        push    %rbp
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %r15
+        pushfq
+        sub     \$64,%rsp
+        mov     152($context),%rax      # pull context->Rsp
+        mov     8(%rax),%rdi
+        mov     16(%rax),%rsi
+        mov     %rsi,168($context)      # restore context->Rsi
+        mov     %rdi,176($context)      # restore context->Rdi
+.Lcommon_seh_exit:
+        mov     40($disp),%rdi          # disp->ContextRecord
+        mov     $context,%rsi           # context
+        mov     \$154,%ecx              # sizeof(CONTEXT)
+        .long   0xa548f3fc              # cld; rep movsq
+        mov     $disp,%rsi
+        xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+        mov     0(%rsi),%r8             # arg3, disp->ControlPc
+        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+        mov     40(%rsi),%r10           # disp->ContextRecord
+        lea     56(%rsi),%r11           # &disp->HandlerData
+        lea     24(%rsi),%r12           # &disp->EstablisherFrame
+        mov     %r10,32(%rsp)           # arg5
+        mov     %r11,40(%rsp)           # arg6
+        mov     %r12,48(%rsp)           # arg7
+        mov     %rcx,56(%rsp)           # arg8, (NULL)
+        call    *__imp_RtlVirtualUnwind(%rip)
+        mov     \$1,%eax                # ExceptionContinueSearch
+        add     \$64,%rsp
+        popfq
+        pop     %r15
+        pop     %r14
+        pop     %r13
+        pop     %r12
+        pop     %rbp
+        pop     %rbx
+        pop     %rdi
+        pop     %rsi
+        ret
+.size   key_se_handler,.-key_se_handler
+.section        .pdata
+.align  4
+        .rva    .LSEH_begin_RC4
+        .rva    .LSEH_end_RC4
+        .rva    .LSEH_info_RC4
+        .rva    .LSEH_begin_RC4_set_key
+        .rva    .LSEH_end_RC4_set_key
+        .rva    .LSEH_info_RC4_set_key
+.section        .xdata
+.align  8
+.LSEH_info_RC4:
+        .byte   9,0,0,0
+        .rva    stream_se_handler
+.LSEH_info_RC4_set_key:
+        .byte   9,0,0,0
+        .rva    key_se_handler
+___
+}
+# Collapse the "#b", "#w" and "#d" size markers used in the templates
+# into plain register suffixes (for example "%r8#d" becomes "%r8d").
+$code =~ s/#([bwd])/$1/gm;
+print $code;
+# STDOUT is a pipe to the translator: close waits for it and returns
+# false if it exited with a non-zero status, so a failed translation
+# must not pass silently.
+close STDOUT or die "error closing pipe to $xlate: $! (status $?)";

diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl new file mode 100755 index 0000000000..544386bf53 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -0,0 +1,504 @@
	1	#!/usr/bin/env perl
	2	#
	3	# ====================================================================
	4	# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
	5	# project. The module is, however, dual licensed under OpenSSL and
	6	# CRYPTOGAMS licenses depending on where you obtain it. For further
	7	# details see http://www.openssl.org/~appro/cryptogams/.
	8	# ====================================================================
	9	#
	10	# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
	11	# "hand-coded assembler"] doesn't stand for the whole improvement
	12	# coefficient. It turned out that eliminating RC4_CHAR from config
	13	# line results in ~40% improvement (yes, even for C implementation).
	14	# Presumably it has everything to do with AMD cache architecture and
	15	# RAW or whatever penalties. Once again! The module requires config
	16	# line without RC4_CHAR! As for coding "secret," I bet on partial
	17	# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
	18	# I simply 'inc %r8b'. Even though optimization manual discourages
	19	# to operate on partial registers, it turned out to be the best bet.
	20	# At least for AMD... How IA32E would perform remains to be seen...
	21
	22	# As was shown by Marc Bevand reordering of couple of load operations
	23	# results in even higher performance gain of 3.3x:-) At least on
	24	# Opteron... For reference, 1x in this case is RC4_CHAR C-code
	25	# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
	26	# Latter means that if you want to estimate what to expect from
	27	# your Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
	28
	29	# Intel P4 EM64T core was found to run the AMD64 code really slow...
	30	# The only way to achieve comparable performance on P4 was to keep
	31	# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
	32	# compose blended code, which would perform even within 30% marginal
	33	# on either AMD and Intel platforms, I implement both cases. See
	34	# rc4_skey.c for further details...
	35
	36	# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
	37	# those with add/sub results in 50% performance improvement of folded
	38	# loop...
	39
	40	# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
	41	# performance by >30% [unlike P4 32-bit case that is]. But this is
	42	# provided that loads are reordered even more aggressively! Both code
	43	# pathes, AMD64 and EM64T, reorder loads in essentially same manner
	44	# as my IA-64 implementation. On Opteron this resulted in modest 5%
	45	# improvement [I had to test it], while final Intel P4 performance
	46	# achieves respectful 432MBps on 2.8GHz processor now. For reference.
	47	# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
	48	# RC4_INT code-path. While if executed on Opteron, it's only 25%
	49	# slower than the RC4_INT one [meaning that if CPU µ-arch detection
	50	# is not implemented, then this final RC4_CHAR code-path should be
	51	# preferred, as it provides better all-round performance].
	52
	53	# Intel Core2 was observed to perform poorly on both code paths:-( It
	54	# apparently suffers from some kind of partial register stall, which
	55	# occurs in 64-bit mode only [as virtually identical 32-bit loop was
	56	# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
	57	# cloop1 boosts its performance by 80%! This loop appears to be optimal
	58	# fit for Core2 and therefore the code was modified to skip cloop8 on
	59	# this CPU.
	60
	61	$flavour = shift;
	62	$output = shift;
	63	if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }	# lone argument with a dot is the output file
	64
	65	$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);	# Win64: nasm/masm/mingw64 flavour or .asm output
	66
	67	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	68	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	69	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	70	die "can't locate x86_64-xlate.pl";
	71
	72	open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";	# all output is piped through the translator
	73
	74	$dat="%rdi"; # arg1
	75	$len="%rsi"; # arg2
	76	$inp="%rdx"; # arg3
	77	$out="%rcx"; # arg4
	78
	79	@XX=("%r8","%r10");
	80	@TX=("%r9","%r11");
	81	$YY="%r12";
	82	$TY="%r13";
	83
	84	$code=<<___;
	85	.text
	86
	87	.globl RC4
	88	.type RC4,\@function,4
	89	.align 16
	90	RC4: or $len,$len
	91	jne .Lentry
	92	ret
	93	.Lentry:
	94	push %rbx
	95	push %r12
	96	push %r13
	97	.Lprologue:
	98
	99	add \$8,$dat
	100	movl -8($dat),$XX[0]#d
	101	movl -4($dat),$YY#d
	102	cmpl \$-1,256($dat)
	103	je .LRC4_CHAR
	104	inc $XX[0]#b
	105	movl ($dat,$XX[0],4),$TX[0]#d
	106	test \$-8,$len
	107	jz .Lloop1
	108	jmp .Lloop8
	109	.align 16
	110	.Lloop8:
	111	___
	112	for ($i=0;$i<8;$i++) {
	113	$code.=<<___;
	114	add $TX[0]#b,$YY#b
	115	mov $XX[0],$XX[1]
	116	movl ($dat,$YY,4),$TY#d
	117	ror \$8,%rax # ror is redundant when $i=0
	118	inc $XX[1]#b
	119	movl ($dat,$XX[1],4),$TX[1]#d
	120	cmp $XX[1],$YY
	121	movl $TX[0]#d,($dat,$YY,4)
	122	cmove $TX[0],$TX[1]
	123	movl $TY#d,($dat,$XX[0],4)
	124	add $TX[0]#b,$TY#b
	125	movb ($dat,$TY,4),%al
	126	___
	127	push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
	128	}
	129	$code.=<<___;
	130	ror \$8,%rax
	131	sub \$8,$len
	132
	133	xor ($inp),%rax
	134	add \$8,$inp
	135	mov %rax,($out)
	136	add \$8,$out
	137
	138	test \$-8,$len
	139	jnz .Lloop8
	140	cmp \$0,$len
	141	jne .Lloop1
	142	jmp .Lexit
	143
	144	.align 16
	145	.Lloop1:
	146	add $TX[0]#b,$YY#b
	147	movl ($dat,$YY,4),$TY#d
	148	movl $TX[0]#d,($dat,$YY,4)
	149	movl $TY#d,($dat,$XX[0],4)
	150	add $TY#b,$TX[0]#b
	151	inc $XX[0]#b
	152	movl ($dat,$TX[0],4),$TY#d
	153	movl ($dat,$XX[0],4),$TX[0]#d
	154	xorb ($inp),$TY#b
	155	inc $inp
	156	movb $TY#b,($out)
	157	inc $out
	158	dec $len
	159	jnz .Lloop1
	160	jmp .Lexit
	161
	162	.align 16
	163	.LRC4_CHAR:
	164	add \$1,$XX[0]#b
	165	movzb ($dat,$XX[0]),$TX[0]#d
	166	test \$-8,$len
	167	jz .Lcloop1
	168	cmpl \$0,260($dat)
	169	jnz .Lcloop1
	170	jmp .Lcloop8
	171	.align 16
	172	.Lcloop8:
	173	mov ($inp),%eax
	174	mov 4($inp),%ebx
	175	___
	176	# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
	177	for ($i=0;$i<4;$i++) {
	178	$code.=<<___;
	179	add $TX[0]#b,$YY#b
	180	lea 1($XX[0]),$XX[1]
	181	movzb ($dat,$YY),$TY#d
	182	movzb $XX[1]#b,$XX[1]#d
	183	movzb ($dat,$XX[1]),$TX[1]#d
	184	movb $TX[0]#b,($dat,$YY)
	185	cmp $XX[1],$YY
	186	movb $TY#b,($dat,$XX[0])
	187	jne .Lcmov$i # Intel cmov is sloooow...
	188	mov $TX[0],$TX[1]
	189	.Lcmov$i:
	190	add $TX[0]#b,$TY#b
	191	xor ($dat,$TY),%al
	192	ror \$8,%eax
	193	___
	194	push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
	195	}
	196	for ($i=4;$i<8;$i++) {
	197	$code.=<<___;
	198	add $TX[0]#b,$YY#b
	199	lea 1($XX[0]),$XX[1]
	200	movzb ($dat,$YY),$TY#d
	201	movzb $XX[1]#b,$XX[1]#d
	202	movzb ($dat,$XX[1]),$TX[1]#d
	203	movb $TX[0]#b,($dat,$YY)
	204	cmp $XX[1],$YY
	205	movb $TY#b,($dat,$XX[0])
	206	jne .Lcmov$i # Intel cmov is sloooow...
	207	mov $TX[0],$TX[1]
	208	.Lcmov$i:
	209	add $TX[0]#b,$TY#b
	210	xor ($dat,$TY),%bl
	211	ror \$8,%ebx
	212	___
	213	push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
	214	}
	215	$code.=<<___;
	216	lea -8($len),$len
	217	mov %eax,($out)
	218	lea 8($inp),$inp
	219	mov %ebx,4($out)
	220	lea 8($out),$out
	221
	222	test \$-8,$len
	223	jnz .Lcloop8
	224	cmp \$0,$len
	225	jne .Lcloop1
	226	jmp .Lexit
	227	___
	228	$code.=<<___;
	229	.align 16
	230	.Lcloop1:
	231	add $TX[0]#b,$YY#b
	232	movzb ($dat,$YY),$TY#d
	233	movb $TX[0]#b,($dat,$YY)
	234	movb $TY#b,($dat,$XX[0])
	235	add $TX[0]#b,$TY#b
	236	add \$1,$XX[0]#b
	237	movzb $TY#b,$TY#d
	238	movzb $XX[0]#b,$XX[0]#d
	239	movzb ($dat,$TY),$TY#d
	240	movzb ($dat,$XX[0]),$TX[0]#d
	241	xorb ($inp),$TY#b
	242	lea 1($inp),$inp
	243	movb $TY#b,($out)
	244	lea 1($out),$out
	245	sub \$1,$len
	246	jnz .Lcloop1
	247	jmp .Lexit
	248
	249	.align 16
	250	.Lexit:
	251	sub \$1,$XX[0]#b
	252	movl $XX[0]#d,-8($dat)
	253	movl $YY#d,-4($dat)
	254
	255	mov (%rsp),%r13
	256	mov 8(%rsp),%r12
	257	mov 16(%rsp),%rbx
	258	add \$24,%rsp
	259	.Lepilogue:
	260	ret
	261	.size RC4,.-RC4
	262	___
	263
	264	$idx="%r8";
	265	$ido="%r9";
	266
	267	$code.=<<___;
	268	.extern OPENSSL_ia32cap_P
	269	.globl RC4_set_key
	270	.type RC4_set_key,\@function,3
	271	.align 16
	272	RC4_set_key:
	273	lea 8($dat),$dat
	274	lea ($inp,$len),$inp
	275	neg $len
	276	mov $len,%rcx
	277	xor %eax,%eax
	278	xor $ido,$ido
	279	xor %r10,%r10
	280	xor %r11,%r11
	281
	282	mov PIC_GOT(OPENSSL_ia32cap_P),$idx#d
	283	bt \$20,$idx#d
	284	jnc .Lw1stloop
	285	bt \$30,$idx#d
	286	setc $ido#b
	287	mov $ido#d,260($dat)
	288	jmp .Lc1stloop
	289
	290	.align 16
	291	.Lw1stloop:
	292	mov %eax,($dat,%rax,4)
	293	add \$1,%al
	294	jnc .Lw1stloop
	295
	296	xor $ido,$ido
	297	xor $idx,$idx
	298	.align 16
	299	.Lw2ndloop:
	300	mov ($dat,$ido,4),%r10d
	301	add ($inp,$len,1),$idx#b
	302	add %r10b,$idx#b
	303	add \$1,$len
	304	mov ($dat,$idx,4),%r11d
	305	cmovz %rcx,$len
	306	mov %r10d,($dat,$idx,4)
	307	mov %r11d,($dat,$ido,4)
	308	add \$1,$ido#b
	309	jnc .Lw2ndloop
	310	jmp .Lexit_key
	311
	312	.align 16
	313	.Lc1stloop:
	314	mov %al,($dat,%rax)
	315	add \$1,%al
	316	jnc .Lc1stloop
	317
	318	xor $ido,$ido
	319	xor $idx,$idx
	320	.align 16
	321	.Lc2ndloop:
	322	mov ($dat,$ido),%r10b
	323	add ($inp,$len),$idx#b
	324	add %r10b,$idx#b
	325	add \$1,$len
	326	mov ($dat,$idx),%r11b
	327	jnz .Lcnowrap
	328	mov %rcx,$len
	329	.Lcnowrap:
	330	mov %r10b,($dat,$idx)
	331	mov %r11b,($dat,$ido)
	332	add \$1,$ido#b
	333	jnc .Lc2ndloop
	334	movl \$-1,256($dat)
	335
	336	.align 16
	337	.Lexit_key:
	338	xor %eax,%eax
	339	mov %eax,-8($dat)
	340	mov %eax,-4($dat)
	341	ret
	342	.size RC4_set_key,.-RC4_set_key
	343
	344	.globl RC4_options
	345	.type RC4_options,\@abi-omnipotent
	346	.align 16
	347	RC4_options:
	348	lea .Lopts(%rip),%rax
	349	mov PIC_GOT(OPENSSL_ia32cap_P),%edx
	350	bt \$20,%edx
	351	jnc .Ldone
	352	add \$12,%rax
	353	bt \$30,%edx
	354	jnc .Ldone
	355	add \$13,%rax
	356	.Ldone:
	357	ret
	358	.align 64
	359	.Lopts:
	360	.asciz "rc4(8x,int)"
	361	.asciz "rc4(8x,char)"
	362	.asciz "rc4(1x,char)"
	363	.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
	364	.align 64
	365	.size RC4_options,.-RC4_options
	366	___
	367
	368	# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
	369	# CONTEXT *context,DISPATCHER_CONTEXT *disp)
	370	if ($win64) {
	371	$rec="%rcx";
	372	$frame="%rdx";
	373	$context="%r8";
	374	$disp="%r9";
	375
	376	$code.=<<___;
	377	.extern __imp_RtlVirtualUnwind
	378	.type stream_se_handler,\@abi-omnipotent
	379	.align 16
	380	stream_se_handler:
	381	push %rsi
	382	push %rdi
	383	push %rbx
	384	push %rbp
	385	push %r12
	386	push %r13
	387	push %r14
	388	push %r15
	389	pushfq
	390	sub \$64,%rsp
	391
	392	mov 120($context),%rax # pull context->Rax
	393	mov 248($context),%rbx # pull context->Rip
	394
	395	lea .Lprologue(%rip),%r10
	396	cmp %r10,%rbx # context->Rip<prologue label
	397	jb .Lin_prologue
	398
	399	mov 152($context),%rax # pull context->Rsp
	400
	401	lea .Lepilogue(%rip),%r10
	402	cmp %r10,%rbx # context->Rip>=epilogue label
	403	jae .Lin_prologue
	404
	405	lea 24(%rax),%rax
	406
	407	mov -8(%rax),%rbx
	408	mov -16(%rax),%r12
	409	mov -24(%rax),%r13
	410	mov %rbx,144($context) # restore context->Rbx
	411	mov %r12,216($context) # restore context->R12
	412	mov %r13,224($context) # restore context->R13
	413
	414	.Lin_prologue:
	415	mov 8(%rax),%rdi
	416	mov 16(%rax),%rsi
	417	mov %rax,152($context) # restore context->Rsp
	418	mov %rsi,168($context) # restore context->Rsi
	419	mov %rdi,176($context) # restore context->Rdi
	420
	421	jmp .Lcommon_seh_exit
	422	.size stream_se_handler,.-stream_se_handler
	423
	424	.type key_se_handler,\@abi-omnipotent
	425	.align 16
	426	key_se_handler:
	427	push %rsi
	428	push %rdi
	429	push %rbx
	430	push %rbp
	431	push %r12
	432	push %r13
	433	push %r14
	434	push %r15
	435	pushfq
	436	sub \$64,%rsp
	437
	438	mov 152($context),%rax # pull context->Rsp
	439	mov 8(%rax),%rdi
	440	mov 16(%rax),%rsi
	441	mov %rsi,168($context) # restore context->Rsi
	442	mov %rdi,176($context) # restore context->Rdi
	443
	444	.Lcommon_seh_exit:
	445
	446	mov 40($disp),%rdi # disp->ContextRecord
	447	mov $context,%rsi # context
	448	mov \$154,%ecx # sizeof(CONTEXT)
	449	.long 0xa548f3fc # cld; rep movsq
	450
	451	mov $disp,%rsi
	452	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	453	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	454	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	455	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	456	mov 40(%rsi),%r10 # disp->ContextRecord
	457	lea 56(%rsi),%r11 # &disp->HandlerData
	458	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	459	mov %r10,32(%rsp) # arg5
	460	mov %r11,40(%rsp) # arg6
	461	mov %r12,48(%rsp) # arg7
	462	mov %rcx,56(%rsp) # arg8, (NULL)
	463	call *__imp_RtlVirtualUnwind(%rip)
	464
	465	mov \$1,%eax # ExceptionContinueSearch
	466	add \$64,%rsp
	467	popfq
	468	pop %r15
	469	pop %r14
	470	pop %r13
	471	pop %r12
	472	pop %rbp
	473	pop %rbx
	474	pop %rdi
	475	pop %rsi
	476	ret
	477	.size key_se_handler,.-key_se_handler
	478
	479	.section .pdata
	480	.align 4
	481	.rva .LSEH_begin_RC4
	482	.rva .LSEH_end_RC4
	483	.rva .LSEH_info_RC4
	484
	485	.rva .LSEH_begin_RC4_set_key
	486	.rva .LSEH_end_RC4_set_key
	487	.rva .LSEH_info_RC4_set_key
	488
	489	.section .xdata
	490	.align 8
	491	.LSEH_info_RC4:
	492	.byte 9,0,0,0
	493	.rva stream_se_handler
	494	.LSEH_info_RC4_set_key:
	495	.byte 9,0,0,0
	496	.rva key_se_handler
	497	___
	498	}
	499
	500	$code =~ s/#([bwd])/$1/gm;
	501
	502	print $code;
	503
	504	close STDOUT;