This commit was generated by cvs2git to track changes on a CVS vendor branch.
author: djm <> 2006-06-27 05:05:42 +0000
committer: djm <> 2006-06-27 05:05:42 +0000
commit: 3f764f48d2626a43b6eeef7652c28303269d1204 (patch)
tree: 764d513589e09d2d10dbe70039b5f3bf58a36803 /src/lib/libcrypto/rc4
parent: 0d2f07cb82812dd6f9e33c493104f4c24e5b13a3 (diff)
parent: f6198d4d0ab97685dc56be2d48715ed39fcc74b9 (diff)
download: openbsd-3f764f48d2626a43b6eeef7652c28303269d1204.tar.gz
openbsd-3f764f48d2626a43b6eeef7652c28303269d1204.tar.bz2
openbsd-3f764f48d2626a43b6eeef7652c28303269d1204.zip
1 files changed, 150 insertions, 0 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
new file mode 100755
index 0000000000..b628daca70
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -0,0 +1,150 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
+# Unlike 0.9.7f this code expects RC4_CHAR back in config line! See
+# commentary section in corresponding script in development branch
+# for background information about this option carousel. For those
+# who don't have energy to figure out these gory details, here is
+# basis in form of performance matrix relative to the original
+# 0.9.7e C code-base:
+#
+#               0.9.7e  0.9.7f  this
+# AMD64         1x      3.3x    2.4x
+# EM64T         1x      0.8x    1.5x
+#
+# In other words idea is to trade -25% AMD64 performance to compensate
+# for deterioration and gain +90% on EM64T core. Development branch
+# maintains best performance for either target, i.e. 3.3x for AMD64
+# and 1.5x for EM64T.
+$output=shift;      # path of the assembly file to generate (first command-line argument)
+open STDOUT,">$output" or die "can't open $output: $!";      # 'or', not '||': '||' bound die to the always-true string ">$output", so a failed open went unnoticed
+$dat="%rdi";        # arg1: state block; bytes 0 and 1 hold the x and y indices, the byte table starts at offset 2
+$len="%rsi";        # arg2: number of bytes to process
+$inp="%rdx";        # arg3: input pointer
+$out="%rcx";        # arg4: output pointer
+@XX=("%r8","%r10"); # current / next x index; the two entries are swapped after every unrolled step
+@TX=("%r9","%r11"); # table bytes S[x] belonging to @XX, swapped in step with it
+$YY="%r12";         # y index; pushed and popped by the generated code
+$TY="%r13";         # holds S[y], then the keystream lookup index; pushed and popped by the generated code
+$code=<<___;;       # entry: return at once if len == 0; save %r12/%r13; load x, y from the two bytes before the table; x++; preload S[x]; len < 8 goes to .Lcloop1, else save %rbx and fall into .Lcloop8, which starts by loading 8 input bytes into %eax/%ebx
+.text
+.globl  RC4
+.type   RC4,\@function
+.align  16
+RC4:    or      $len,$len
+        jne     .Lentry
+        repret
+.Lentry:
+        push    %r12
+        push    %r13
+        add     \$2,$dat
+        movzb   -2($dat),$XX[0]#d
+        movzb   -1($dat),$YY#d
+        add     \$1,$XX[0]#b
+        movzb   ($dat,$XX[0]),$TX[0]#d
+        test    \$-8,$len
+        jz      .Lcloop1
+        push    %rbx
+.align  16      # incidentally aligned already
+.Lcloop8:
+        mov     ($inp),%eax
+        mov     4($inp),%ebx
+___
+# Unroll 2x4-wise (two 32-bit words of four bytes each), because 64-bit rotates kill Intel P4...
+for ($i=0;$i<4;$i++) {      # steps 0..3 work on the input word held in %eax; $i also keeps the .Lcmov labels unique
+$code.=<<___;       # one step: y += S[x]; swap S[x] and S[y]; next x and its table byte are fetched early and patched when next x == y; XOR S[S[x]+S[y]] into %al, then rotate the next input byte into %al
+        add     $TX[0]#b,$YY#b
+        lea     1($XX[0]),$XX[1]
+        movzb   ($dat,$YY),$TY#d
+        movzb   $XX[1]#b,$XX[1]#d
+        movzb   ($dat,$XX[1]),$TX[1]#d
+        movb    $TX[0]#b,($dat,$YY)
+        cmp     $XX[1],$YY
+        movb    $TY#b,($dat,$XX[0])
+        jne     .Lcmov$i                        # Intel cmov is sloooow...
+        mov     $TX[0],$TX[1]
+.Lcmov$i:
+        add     $TX[0]#b,$TY#b
+        xor     ($dat,$TY),%al
+        ror     \$8,%eax
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));     # "rotate" registers: the next-x pair becomes the current pair for the following step
+}
+for ($i=4;$i<8;$i++) {      # steps 4..7: same step as above, but working on the second input word held in %ebx
+$code.=<<___;       # one step: y += S[x]; swap S[x] and S[y]; next x and its table byte are fetched early and patched when next x == y; XOR S[S[x]+S[y]] into %bl, then rotate the next input byte into %bl
+        add     $TX[0]#b,$YY#b
+        lea     1($XX[0]),$XX[1]
+        movzb   ($dat,$YY),$TY#d
+        movzb   $XX[1]#b,$XX[1]#d
+        movzb   ($dat,$XX[1]),$TX[1]#d
+        movb    $TX[0]#b,($dat,$YY)
+        cmp     $XX[1],$YY
+        movb    $TY#b,($dat,$XX[0])
+        jne     .Lcmov$i                        # Intel cmov is sloooow...
+        mov     $TX[0],$TX[1]
+.Lcmov$i:
+        add     $TX[0]#b,$TY#b
+        xor     ($dat,$TY),%bl
+        ror     \$8,%ebx
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));     # "rotate" registers; after all eight swaps the two-element arrays are back in their initial order
+}
+$code.=<<___;       # loop tail: len -= 8, store both finished words, advance both pointers by 8, repeat while len >= 8; then restore %rbx and finish leftovers in .Lcloop1 (one byte per pass); .Lexit undoes the x pre-increment, writes x and y back and restores %r13/%r12
+        lea     -8($len),$len
+        mov     %eax,($out)
+        lea     8($inp),$inp
+        mov     %ebx,4($out)
+        lea     8($out),$out
+        test    \$-8,$len
+        jnz     .Lcloop8
+        pop     %rbx
+        cmp     \$0,$len
+        jne     .Lcloop1
+.Lexit:
+        sub     \$1,$XX[0]#b
+        movb    $XX[0]#b,-2($dat)
+        movb    $YY#b,-1($dat)
+        pop     %r13
+        pop     %r12
+        repret
+.align  16
+.Lcloop1:
+        add     $TX[0]#b,$YY#b
+        movzb   ($dat,$YY),$TY#d
+        movb    $TX[0]#b,($dat,$YY)
+        movb    $TY#b,($dat,$XX[0])
+        add     $TX[0]#b,$TY#b
+        add     \$1,$XX[0]#b
+        movzb   ($dat,$TY),$TY#d
+        movzb   ($dat,$XX[0]),$TX[0]#d
+        xorb    ($inp),$TY#b
+        lea     1($inp),$inp
+        movb    $TY#b,($out)
+        lea     1($out),$out
+        sub     \$1,$len
+        jnz     .Lcloop1
+        jmp     .Lexit
+.size   RC4,.-RC4
+___
+$code =~ s/#([bwd])/$1/gm;      # drop the '#' from reg#b / reg#w / reg#d so the letter becomes the register-size suffix (e.g. %r8#d -> %r8d)
+$code =~ s/repret/.byte\t0xF3,0xC3/gm;      # replace the "repret" placeholder with the raw bytes 0xF3,0xC3 (a REP-prefixed RET)
+print $code;        # write the generated assembly to STDOUT, which was reopened onto the output file
author	djm <>	2006-06-27 05:05:42 +0000
committer	djm <>	2006-06-27 05:05:42 +0000
commit	3f764f48d2626a43b6eeef7652c28303269d1204 (patch)
tree	764d513589e09d2d10dbe70039b5f3bf58a36803 /src/lib/libcrypto/rc4
parent	0d2f07cb82812dd6f9e33c493104f4c24e5b13a3 (diff)
parent	f6198d4d0ab97685dc56be2d48715ed39fcc74b9 (diff)
download	openbsd-3f764f48d2626a43b6eeef7652c28303269d1204.tar.gz openbsd-3f764f48d2626a43b6eeef7652c28303269d1204.tar.bz2 openbsd-3f764f48d2626a43b6eeef7652c28303269d1204.zip

diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl new file mode 100755 index 0000000000..b628daca70 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -0,0 +1,150 @@
	1	#!/usr/bin/env perl
	2	#
	3	# ====================================================================
	4	# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
	5	# project. Rights for redistribution and usage in source and binary
	6	# forms are granted according to the OpenSSL license.
	7	# ====================================================================
	8	#
	9	# Unlike 0.9.7f this code expects RC4_CHAR back in config line! See
	10	# commentary section in corresponding script in development branch
	11	# for background information about this option carousel. For those
	12	# who don't have energy to figure out these gory details, here is
	13	# basis in form of performance matrix relative to the original
	14	# 0.9.7e C code-base:
	15	#
	16	# 0.9.7e 0.9.7f this
	17	# AMD64 1x 3.3x 2.4x
	18	# EM64T 1x 0.8x 1.5x
	19	#
	20	# In other words idea is to trade -25% AMD64 performance to compensate
	21	# for deterioration and gain +90% on EM64T core. Development branch
	22	# maintains best performance for either target, i.e. 3.3x for AMD64
	23	# and 1.5x for EM64T.
	24
	25	$output=shift; # path of the assembly file to generate (first command-line argument)
	26
	27	open STDOUT,">$output" or die "can't open $output: $!"; # 'or', not '||': '||' bound die to the always-true string ">$output", so a failed open went unnoticed
	28
	29	$dat="%rdi"; # arg1: state block; bytes 0 and 1 hold the x and y indices, the byte table starts at offset 2
	30	$len="%rsi"; # arg2: number of bytes to process
	31	$inp="%rdx"; # arg3: input pointer
	32	$out="%rcx"; # arg4: output pointer
	33
	34	@XX=("%r8","%r10"); # current / next x index; the two entries are swapped after every unrolled step
	35	@TX=("%r9","%r11"); # table bytes S[x] belonging to @XX, swapped in step with it
	36	$YY="%r12"; # y index; pushed and popped by the generated code
	37	$TY="%r13"; # holds S[y], then the keystream lookup index; pushed and popped by the generated code
	38
	39	$code=<<___;; # entry: return at once if len == 0; save %r12/%r13; load x, y from the two bytes before the table; x++; preload S[x]; len < 8 goes to .Lcloop1, else save %rbx and fall into .Lcloop8, which starts by loading 8 input bytes into %eax/%ebx
	40	.text
	41
	42	.globl RC4
	43	.type RC4,\@function
	44	.align 16
	45	RC4: or $len,$len
	46	jne .Lentry
	47	repret
	48	.Lentry:
	49	push %r12
	50	push %r13
	51
	52	add \$2,$dat
	53	movzb -2($dat),$XX[0]#d
	54	movzb -1($dat),$YY#d
	55
	56	add \$1,$XX[0]#b
	57	movzb ($dat,$XX[0]),$TX[0]#d
	58	test \$-8,$len
	59	jz .Lcloop1
	60	push %rbx
	61	.align 16 # incidentally aligned already
	62	.Lcloop8:
	63	mov ($inp),%eax
	64	mov 4($inp),%ebx
	65	___
	66	# Unroll 2x4-wise (two 32-bit words of four bytes each), because 64-bit rotates kill Intel P4...
	67	for ($i=0;$i<4;$i++) { # steps 0..3 work on the input word held in %eax; $i also keeps the .Lcmov labels unique
	68	$code.=<<___; # one step: y += S[x]; swap S[x] and S[y]; next x and its table byte are fetched early and patched when next x == y; XOR S[S[x]+S[y]] into %al, then rotate the next input byte into %al
	69	add $TX[0]#b,$YY#b
	70	lea 1($XX[0]),$XX[1]
	71	movzb ($dat,$YY),$TY#d
	72	movzb $XX[1]#b,$XX[1]#d
	73	movzb ($dat,$XX[1]),$TX[1]#d
	74	movb $TX[0]#b,($dat,$YY)
	75	cmp $XX[1],$YY
	76	movb $TY#b,($dat,$XX[0])
	77	jne .Lcmov$i # Intel cmov is sloooow...
	78	mov $TX[0],$TX[1]
	79	.Lcmov$i:
	80	add $TX[0]#b,$TY#b
	81	xor ($dat,$TY),%al
	82	ror \$8,%eax
	83	___
	84	push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers: the next-x pair becomes the current pair for the following step
	85	}
	86	for ($i=4;$i<8;$i++) { # steps 4..7: same step as above, but working on the second input word held in %ebx
	87	$code.=<<___; # one step: y += S[x]; swap S[x] and S[y]; next x and its table byte are fetched early and patched when next x == y; XOR S[S[x]+S[y]] into %bl, then rotate the next input byte into %bl
	88	add $TX[0]#b,$YY#b
	89	lea 1($XX[0]),$XX[1]
	90	movzb ($dat,$YY),$TY#d
	91	movzb $XX[1]#b,$XX[1]#d
	92	movzb ($dat,$XX[1]),$TX[1]#d
	93	movb $TX[0]#b,($dat,$YY)
	94	cmp $XX[1],$YY
	95	movb $TY#b,($dat,$XX[0])
	96	jne .Lcmov$i # Intel cmov is sloooow...
	97	mov $TX[0],$TX[1]
	98	.Lcmov$i:
	99	add $TX[0]#b,$TY#b
	100	xor ($dat,$TY),%bl
	101	ror \$8,%ebx
	102	___
	103	push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers; after all eight swaps the two-element arrays are back in their initial order
	104	}
	105	$code.=<<___; # loop tail: len -= 8, store both finished words, advance both pointers by 8, repeat while len >= 8; then restore %rbx and finish leftovers in .Lcloop1 (one byte per pass); .Lexit undoes the x pre-increment, writes x and y back and restores %r13/%r12
	106	lea -8($len),$len
	107	mov %eax,($out)
	108	lea 8($inp),$inp
	109	mov %ebx,4($out)
	110	lea 8($out),$out
	111
	112	test \$-8,$len
	113	jnz .Lcloop8
	114	pop %rbx
	115	cmp \$0,$len
	116	jne .Lcloop1
	117	.Lexit:
	118	sub \$1,$XX[0]#b
	119	movb $XX[0]#b,-2($dat)
	120	movb $YY#b,-1($dat)
	121
	122	pop %r13
	123	pop %r12
	124	repret
	125
	126	.align 16
	127	.Lcloop1:
	128	add $TX[0]#b,$YY#b
	129	movzb ($dat,$YY),$TY#d
	130	movb $TX[0]#b,($dat,$YY)
	131	movb $TY#b,($dat,$XX[0])
	132	add $TX[0]#b,$TY#b
	133	add \$1,$XX[0]#b
	134	movzb ($dat,$TY),$TY#d
	135	movzb ($dat,$XX[0]),$TX[0]#d
	136	xorb ($inp),$TY#b
	137	lea 1($inp),$inp
	138	movb $TY#b,($out)
	139	lea 1($out),$out
	140	sub \$1,$len
	141	jnz .Lcloop1
	142	jmp .Lexit
	143	.size RC4,.-RC4
	144	___
	145
	146	$code =~ s/#([bwd])/$1/gm; # drop the '#' from reg#b / reg#w / reg#d so the letter becomes the register-size suffix (e.g. %r8#d -> %r8d)
	147
	148	$code =~ s/repret/.byte\t0xF3,0xC3/gm; # replace the "repret" placeholder with the raw bytes 0xF3,0xC3 (a REP-prefixed RET)
	149
	150	print $code; # write the generated assembly to STDOUT, which was reopened onto the output file