summary refs log tree commit diff
path: root/src/lib/libcrypto/sha
diff options
context:
space:
mode:
author djm <> 2011-11-03 02:32:23 +0000
committer djm <> 2011-11-03 02:32:23 +0000
commit 113f799ec7d1728f0a5d7ab5b0e3b42e3de56407 (patch)
tree 26d712b25a8fa580b8f2dfc6df470ba5ffea9eb7 /src/lib/libcrypto/sha
parent 829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2 (diff)
download openbsd-113f799ec7d1728f0a5d7ab5b0e3b42e3de56407.tar.gz
openbsd-113f799ec7d1728f0a5d7ab5b0e3b42e3de56407.tar.bz2
openbsd-113f799ec7d1728f0a5d7ab5b0e3b42e3de56407.zip
import OpenSSL 1.0.0e
Diffstat (limited to 'src/lib/libcrypto/sha')
-rw-r--r-- src/lib/libcrypto/sha/asm/sha1-armv4-large.pl 76
-rw-r--r-- src/lib/libcrypto/sha/asm/sha1-sparcv9.pl 1
-rw-r--r-- src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl 1
-rw-r--r-- src/lib/libcrypto/sha/asm/sha256-armv4.pl 33
-rw-r--r-- src/lib/libcrypto/sha/asm/sha512-armv4.pl 32
-rw-r--r-- src/lib/libcrypto/sha/asm/sha512-sparcv9.pl 1
6 files changed, 75 insertions, 69 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
index 88861af641..6e65fe3e01 100644
--- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
@@ -37,9 +37,18 @@
37# modes are limited. As result it takes more instructions to do 37# modes are limited. As result it takes more instructions to do
38# the same job in Thumb, therefore the code is never twice as 38# the same job in Thumb, therefore the code is never twice as
39# small and always slower. 39# small and always slower.
40# [***] which is also ~35% better than compiler generated code. 40# [***] which is also ~35% better than compiler generated code. Dual-
41# issue Cortex A8 core was measured to process input block in
42# ~990 cycles.
41 43
42$output=shift; 44# August 2010.
45#
46# Rescheduling for dual-issue pipeline resulted in 13% improvement on
47# Cortex A8 core and in absolute terms ~870 cycles per input block
48# [or 13.6 cycles per byte].
49
50
51while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
43open STDOUT,">$output"; 52open STDOUT,">$output";
44 53
45$ctx="r0"; 54$ctx="r0";
@@ -58,43 +67,22 @@ $t3="r12";
58$Xi="r14"; 67$Xi="r14";
59@V=($a,$b,$c,$d,$e); 68@V=($a,$b,$c,$d,$e);
60 69
61# One can optimize this for aligned access on big-endian architecture,
62# but code's endian neutrality makes it too pretty:-)
63sub Xload {
64my ($a,$b,$c,$d,$e)=@_;
65$code.=<<___;
66 ldrb $t0,[$inp],#4
67 ldrb $t1,[$inp,#-3]
68 ldrb $t2,[$inp,#-2]
69 ldrb $t3,[$inp,#-1]
70 add $e,$K,$e,ror#2 @ E+=K_00_19
71 orr $t0,$t1,$t0,lsl#8
72 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
73 orr $t0,$t2,$t0,lsl#8
74 eor $t1,$c,$d @ F_xx_xx
75 orr $t0,$t3,$t0,lsl#8
76 add $e,$e,$t0 @ E+=X[i]
77 str $t0,[$Xi,#-4]!
78___
79}
80sub Xupdate { 70sub Xupdate {
81my ($a,$b,$c,$d,$e,$flag)=@_; 71my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
82$code.=<<___; 72$code.=<<___;
83 ldr $t0,[$Xi,#15*4] 73 ldr $t0,[$Xi,#15*4]
84 ldr $t1,[$Xi,#13*4] 74 ldr $t1,[$Xi,#13*4]
85 ldr $t2,[$Xi,#7*4] 75 ldr $t2,[$Xi,#7*4]
86 ldr $t3,[$Xi,#2*4]
87 add $e,$K,$e,ror#2 @ E+=K_xx_xx 76 add $e,$K,$e,ror#2 @ E+=K_xx_xx
77 ldr $t3,[$Xi,#2*4]
88 eor $t0,$t0,$t1 78 eor $t0,$t0,$t1
89 eor $t0,$t0,$t2 79 eor $t2,$t2,$t3
90 eor $t0,$t0,$t3 80 eor $t1,$c,$d @ F_xx_xx
91 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
92___
93$code.=<<___ if (!defined($flag));
94 eor $t1,$c,$d @ F_xx_xx, but not in 40_59
95___
96$code.=<<___;
97 mov $t0,$t0,ror#31 81 mov $t0,$t0,ror#31
82 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
83 eor $t0,$t0,$t2,ror#31
84 $opt1 @ F_xx_xx
85 $opt2 @ F_xx_xx
98 add $e,$e,$t0 @ E+=X[i] 86 add $e,$e,$t0 @ E+=X[i]
99 str $t0,[$Xi,#-4]! 87 str $t0,[$Xi,#-4]!
100___ 88___
@@ -102,19 +90,29 @@ ___
102 90
103sub BODY_00_15 { 91sub BODY_00_15 {
104my ($a,$b,$c,$d,$e)=@_; 92my ($a,$b,$c,$d,$e)=@_;
105 &Xload(@_);
106$code.=<<___; 93$code.=<<___;
94 ldrb $t0,[$inp],#4
95 ldrb $t1,[$inp,#-1]
96 ldrb $t2,[$inp,#-2]
97 add $e,$K,$e,ror#2 @ E+=K_00_19
98 ldrb $t3,[$inp,#-3]
99 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
100 orr $t0,$t1,$t0,lsl#24
101 eor $t1,$c,$d @ F_xx_xx
102 orr $t0,$t0,$t2,lsl#8
103 orr $t0,$t0,$t3,lsl#16
107 and $t1,$b,$t1,ror#2 104 and $t1,$b,$t1,ror#2
105 add $e,$e,$t0 @ E+=X[i]
108 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) 106 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
107 str $t0,[$Xi,#-4]!
109 add $e,$e,$t1 @ E+=F_00_19(B,C,D) 108 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
110___ 109___
111} 110}
112 111
113sub BODY_16_19 { 112sub BODY_16_19 {
114my ($a,$b,$c,$d,$e)=@_; 113my ($a,$b,$c,$d,$e)=@_;
115 &Xupdate(@_); 114 &Xupdate(@_,"and $t1,$b,$t1,ror#2");
116$code.=<<___; 115$code.=<<___;
117 and $t1,$b,$t1,ror#2
118 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) 116 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
119 add $e,$e,$t1 @ E+=F_00_19(B,C,D) 117 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
120___ 118___
@@ -122,22 +120,18 @@ ___
122 120
123sub BODY_20_39 { 121sub BODY_20_39 {
124my ($a,$b,$c,$d,$e)=@_; 122my ($a,$b,$c,$d,$e)=@_;
125 &Xupdate(@_); 123 &Xupdate(@_,"eor $t1,$b,$t1,ror#2");
126$code.=<<___; 124$code.=<<___;
127 eor $t1,$b,$t1,ror#2 @ F_20_39(B,C,D)
128 add $e,$e,$t1 @ E+=F_20_39(B,C,D) 125 add $e,$e,$t1 @ E+=F_20_39(B,C,D)
129___ 126___
130} 127}
131 128
132sub BODY_40_59 { 129sub BODY_40_59 {
133my ($a,$b,$c,$d,$e)=@_; 130my ($a,$b,$c,$d,$e)=@_;
134 &Xupdate(@_,1); 131 &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
135$code.=<<___; 132$code.=<<___;
136 and $t1,$b,$c,ror#2
137 orr $t2,$b,$c,ror#2
138 and $t2,$t2,$d,ror#2
139 orr $t1,$t1,$t2 @ F_40_59(B,C,D)
140 add $e,$e,$t1 @ E+=F_40_59(B,C,D) 133 add $e,$e,$t1 @ E+=F_40_59(B,C,D)
134 add $e,$e,$t2,ror#2
141___ 135___
142} 136}
143 137
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
index 8306fc88cc..5c161cecd6 100644
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
@@ -276,6 +276,7 @@ $code.=<<___;
276.type sha1_block_data_order,#function 276.type sha1_block_data_order,#function
277.size sha1_block_data_order,(.-sha1_block_data_order) 277.size sha1_block_data_order,(.-sha1_block_data_order)
278.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" 278.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
279.align 4
279___ 280___
280 281
281$code =~ s/\`([^\`]*)\`/eval $1/gem; 282$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
index 15eb854bad..85e8d68086 100644
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
@@ -539,6 +539,7 @@ $code.=<<___;
539.type sha1_block_data_order,#function 539.type sha1_block_data_order,#function
540.size sha1_block_data_order,(.-sha1_block_data_order) 540.size sha1_block_data_order,(.-sha1_block_data_order)
541.asciz "SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>" 541.asciz "SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
542.align 4
542___ 543___
543 544
544# Purpose of these subroutines is to explicitly encode VIS instructions, 545# Purpose of these subroutines is to explicitly encode VIS instructions,
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
index 48d846deec..492cb62bc0 100644
--- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
@@ -11,9 +11,14 @@
11 11
12# Performance is ~2x better than gcc 3.4 generated code and in "abso- 12# Performance is ~2x better than gcc 3.4 generated code and in "abso-
13# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per 13# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14# byte. 14# byte [on single-issue Xscale PXA250 core].
15 15
16$output=shift; 16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 22% improvement on
19# Cortex A8 core and ~20 cycles per processed byte.
20
21while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
17open STDOUT,">$output"; 22open STDOUT,">$output";
18 23
19$ctx="r0"; $t0="r0"; 24$ctx="r0"; $t0="r0";
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16);
52___ 57___
53$code.=<<___; 58$code.=<<___;
54 ldr $t2,[$Ktbl],#4 @ *K256++ 59 ldr $t2,[$Ktbl],#4 @ *K256++
55 str $T1,[sp,#`$i%16`*4]
56 mov $t0,$e,ror#$Sigma1[0] 60 mov $t0,$e,ror#$Sigma1[0]
61 str $T1,[sp,#`$i%16`*4]
57 eor $t0,$t0,$e,ror#$Sigma1[1] 62 eor $t0,$t0,$e,ror#$Sigma1[1]
58 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
59 add $T1,$T1,$t0
60 eor $t1,$f,$g 63 eor $t1,$f,$g
64 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
61 and $t1,$t1,$e 65 and $t1,$t1,$e
66 add $T1,$T1,$t0
62 eor $t1,$t1,$g @ Ch(e,f,g) 67 eor $t1,$t1,$g @ Ch(e,f,g)
63 add $T1,$T1,$t1
64 add $T1,$T1,$h 68 add $T1,$T1,$h
65 add $T1,$T1,$t2
66 mov $h,$a,ror#$Sigma0[0] 69 mov $h,$a,ror#$Sigma0[0]
70 add $T1,$T1,$t1
67 eor $h,$h,$a,ror#$Sigma0[1] 71 eor $h,$h,$a,ror#$Sigma0[1]
72 add $T1,$T1,$t2
68 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) 73 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
69 orr $t0,$a,$b 74 orr $t0,$a,$b
70 and $t0,$t0,$c
71 and $t1,$a,$b 75 and $t1,$a,$b
76 and $t0,$t0,$c
77 add $h,$h,$T1
72 orr $t0,$t0,$t1 @ Maj(a,b,c) 78 orr $t0,$t0,$t1 @ Maj(a,b,c)
73 add $h,$h,$t0
74 add $d,$d,$T1 79 add $d,$d,$T1
75 add $h,$h,$T1 80 add $h,$h,$t0
76___ 81___
77} 82}
78 83
@@ -80,19 +85,19 @@ sub BODY_16_XX {
80my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 85my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
81 86
82$code.=<<___; 87$code.=<<___;
83 ldr $t1,[sp,#`($i+1)%16`*4] @ $i 88 ldr $t1,[sp,#`($i+1)%16`*4] @ $i
84 ldr $t2,[sp,#`($i+14)%16`*4] 89 ldr $t2,[sp,#`($i+14)%16`*4]
85 ldr $T1,[sp,#`($i+0)%16`*4] 90 ldr $T1,[sp,#`($i+0)%16`*4]
86 ldr $inp,[sp,#`($i+9)%16`*4]
87 mov $t0,$t1,ror#$sigma0[0] 91 mov $t0,$t1,ror#$sigma0[0]
92 ldr $inp,[sp,#`($i+9)%16`*4]
88 eor $t0,$t0,$t1,ror#$sigma0[1] 93 eor $t0,$t0,$t1,ror#$sigma0[1]
89 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) 94 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
90 mov $t1,$t2,ror#$sigma1[0] 95 mov $t1,$t2,ror#$sigma1[0]
96 add $T1,$T1,$t0
91 eor $t1,$t1,$t2,ror#$sigma1[1] 97 eor $t1,$t1,$t2,ror#$sigma1[1]
98 add $T1,$T1,$inp
92 eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) 99 eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
93 add $T1,$T1,$t0
94 add $T1,$T1,$t1 100 add $T1,$T1,$t1
95 add $T1,$T1,$inp
96___ 101___
97 &BODY_00_15(@_); 102 &BODY_00_15(@_);
98} 103}
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
index 4fbb94a914..3a35861ac6 100644
--- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
@@ -10,7 +10,13 @@
10# SHA512 block procedure for ARMv4. September 2007. 10# SHA512 block procedure for ARMv4. September 2007.
11 11
12# This code is ~4.5 (four and a half) times faster than code generated 12# This code is ~4.5 (four and a half) times faster than code generated
13# by gcc 3.4 and it spends ~72 clock cycles per byte. 13# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14# Xscale PXA250 core].
15#
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 6% improvement on
19# Cortex A8 core and ~40 cycles per processed byte.
14 20
15# Byte order [in]dependence. ========================================= 21# Byte order [in]dependence. =========================================
16# 22#
@@ -22,7 +28,7 @@ $hi=0;
22$lo=4; 28$lo=4;
23# ==================================================================== 29# ====================================================================
24 30
25$output=shift; 31while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
26open STDOUT,">$output"; 32open STDOUT,">$output";
27 33
28$ctx="r0"; 34$ctx="r0";
@@ -73,33 +79,31 @@ $code.=<<___;
73 eor $t0,$t0,$Elo,lsl#23 79 eor $t0,$t0,$Elo,lsl#23
74 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) 80 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
75 adds $Tlo,$Tlo,$t0 81 adds $Tlo,$Tlo,$t0
76 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
77 adds $Tlo,$Tlo,$t2
78 adc $Thi,$Thi,$t3 @ T += h
79
80 ldr $t0,[sp,#$Foff+0] @ f.lo 82 ldr $t0,[sp,#$Foff+0] @ f.lo
83 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
81 ldr $t1,[sp,#$Foff+4] @ f.hi 84 ldr $t1,[sp,#$Foff+4] @ f.hi
85 adds $Tlo,$Tlo,$t2
82 ldr $t2,[sp,#$Goff+0] @ g.lo 86 ldr $t2,[sp,#$Goff+0] @ g.lo
87 adc $Thi,$Thi,$t3 @ T += h
83 ldr $t3,[sp,#$Goff+4] @ g.hi 88 ldr $t3,[sp,#$Goff+4] @ g.hi
84 str $Elo,[sp,#$Eoff+0]
85 str $Ehi,[sp,#$Eoff+4]
86 str $Alo,[sp,#$Aoff+0]
87 str $Ahi,[sp,#$Aoff+4]
88 89
89 eor $t0,$t0,$t2 90 eor $t0,$t0,$t2
91 str $Elo,[sp,#$Eoff+0]
90 eor $t1,$t1,$t3 92 eor $t1,$t1,$t3
93 str $Ehi,[sp,#$Eoff+4]
91 and $t0,$t0,$Elo 94 and $t0,$t0,$Elo
95 str $Alo,[sp,#$Aoff+0]
92 and $t1,$t1,$Ehi 96 and $t1,$t1,$Ehi
97 str $Ahi,[sp,#$Aoff+4]
93 eor $t0,$t0,$t2 98 eor $t0,$t0,$t2
94 eor $t1,$t1,$t3 @ Ch(e,f,g)
95
96 ldr $t2,[$Ktbl,#4] @ K[i].lo 99 ldr $t2,[$Ktbl,#4] @ K[i].lo
100 eor $t1,$t1,$t3 @ Ch(e,f,g)
97 ldr $t3,[$Ktbl,#0] @ K[i].hi 101 ldr $t3,[$Ktbl,#0] @ K[i].hi
98 ldr $Elo,[sp,#$Doff+0] @ d.lo
99 ldr $Ehi,[sp,#$Doff+4] @ d.hi
100 102
101 adds $Tlo,$Tlo,$t0 103 adds $Tlo,$Tlo,$t0
104 ldr $Elo,[sp,#$Doff+0] @ d.lo
102 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) 105 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
106 ldr $Ehi,[sp,#$Doff+4] @ d.hi
103 adds $Tlo,$Tlo,$t2 107 adds $Tlo,$Tlo,$t2
104 adc $Thi,$Thi,$t3 @ T += K[i] 108 adc $Thi,$Thi,$t3 @ T += K[i]
105 adds $Elo,$Elo,$Tlo 109 adds $Elo,$Elo,$Tlo
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
index 54241aab50..ec5d78135e 100644
--- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
@@ -586,6 +586,7 @@ $code.=<<___;
586.type sha${label}_block_data_order,#function 586.type sha${label}_block_data_order,#function
587.size sha${label}_block_data_order,(.-sha${label}_block_data_order) 587.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
588.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" 588.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
589.align 4
589___ 590___
590 591
591$code =~ s/\`([^\`]*)\`/eval $1/gem; 592$code =~ s/\`([^\`]*)\`/eval $1/gem;