Diffstat (limited to 'src/lib/libcrypto/sha/asm/sha1-armv4-large.pl')
-rw-r--r--	src/lib/libcrypto/sha/asm/sha1-armv4-large.pl	76
1 file changed, 35 insertions(+), 41 deletions(-)
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
index 88861af641..6e65fe3e01 100644
--- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
@@ -37,9 +37,18 @@
 # modes are limited. As result it takes more instructions to do
 # the same job in Thumb, therefore the code is never twice as
 # small and always slower.
-# [***] which is also ~35% better than compiler generated code.
+# [***] which is also ~35% better than compiler generated code. Dual-
+#	issue Cortex A8 core was measured to process input block in
+#	~990 cycles.
 
-$output=shift;
+# August 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 13% improvement on
+# Cortex A8 core and in absolute terms ~870 cycles per input block
+# [or 13.6 cycles per byte].
+
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $ctx="r0";
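
For reference, the cycles-per-byte figures in the new comments follow directly from the per-block numbers: a SHA-1 input block is 64 bytes, so ~870 cycles/block ≈ 13.6 cycles/byte after the rescheduling, versus ~990/64 ≈ 15.5 cycles/byte before it.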
@@ -58,43 +67,22 @@ $t3="r12";
 $Xi="r14";
 @V=($a,$b,$c,$d,$e);
 
-# One can optimize this for aligned access on big-endian architecture,
-# but code's endian neutrality makes it too pretty:-)
-sub Xload {
-my ($a,$b,$c,$d,$e)=@_;
-$code.=<<___;
-	ldrb	$t0,[$inp],#4
-	ldrb	$t1,[$inp,#-3]
-	ldrb	$t2,[$inp,#-2]
-	ldrb	$t3,[$inp,#-1]
-	add	$e,$K,$e,ror#2			@ E+=K_00_19
-	orr	$t0,$t1,$t0,lsl#8
-	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
-	orr	$t0,$t2,$t0,lsl#8
-	eor	$t1,$c,$d			@ F_xx_xx
-	orr	$t0,$t3,$t0,lsl#8
-	add	$e,$e,$t0			@ E+=X[i]
-	str	$t0,[$Xi,#-4]!
-___
-}
 sub Xupdate {
-my ($a,$b,$c,$d,$e,$flag)=@_;
+my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
 $code.=<<___;
 	ldr	$t0,[$Xi,#15*4]
 	ldr	$t1,[$Xi,#13*4]
 	ldr	$t2,[$Xi,#7*4]
-	ldr	$t3,[$Xi,#2*4]
 	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
+	ldr	$t3,[$Xi,#2*4]
 	eor	$t0,$t0,$t1
-	eor	$t0,$t0,$t2
-	eor	$t0,$t0,$t3
-	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
-___
-$code.=<<___ if (!defined($flag));
-	eor	$t1,$c,$d			@ F_xx_xx, but not in 40_59
-___
-$code.=<<___;
+	eor	$t2,$t2,$t3
+	eor	$t1,$c,$d			@ F_xx_xx
 	mov	$t0,$t0,ror#31
+	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
+	eor	$t0,$t0,$t2,ror#31
+	$opt1					@ F_xx_xx
+	$opt2					@ F_xx_xx
 	add	$e,$e,$t0			@ E+=X[i]
 	str	$t0,[$Xi,#-4]!
 ___
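
The rewritten Xupdate interleaves the schedule loads with the round arithmetic for the dual-issue pipeline and takes the round-specific F logic as the new $opt1/$opt2 string arguments instead of the old $flag. The value it produces is unchanged: the standard SHA-1 message expansion W[i] = ROTL1(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]). A minimal C sketch of that recurrence follows; names and the flat circular buffer are illustrative, the generated assembly instead walks the buffer through the post-decremented $Xi pointer.

#include <stdint.h>

/* Sketch of what Xupdate computes: the SHA-1 schedule word
 * W[i] = ROTL1(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]),
 * kept in a 16-word circular buffer (valid for i >= 16). */
static inline uint32_t rotl1(uint32_t x) { return (x << 1) | (x >> 31); }

static uint32_t xupdate(uint32_t W[16], int i)
{
    uint32_t t0 = W[(i - 3) & 15] ^ W[(i - 8) & 15];
    uint32_t t2 = W[(i - 14) & 15] ^ W[(i - 16) & 15];
    /* ror #31 is a rotate-left by 1, and rotation distributes over XOR:
     * rotl1(t0) ^ rotl1(t2) == rotl1(t0 ^ t2).  That is why the assembly
     * may rotate the two XOR halves separately (mov/eor ...,ror#31). */
    uint32_t x = rotl1(t0) ^ rotl1(t2);
    W[i & 15] = x;                       /* mirrors str $t0,[$Xi,#-4]! */
    return x;
}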
@@ -102,19 +90,29 @@ ___
 
 sub BODY_00_15 {
 my ($a,$b,$c,$d,$e)=@_;
-	&Xload(@_);
 $code.=<<___;
+	ldrb	$t0,[$inp],#4
+	ldrb	$t1,[$inp,#-1]
+	ldrb	$t2,[$inp,#-2]
+	add	$e,$K,$e,ror#2			@ E+=K_00_19
+	ldrb	$t3,[$inp,#-3]
+	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
+	orr	$t0,$t1,$t0,lsl#24
+	eor	$t1,$c,$d			@ F_xx_xx
+	orr	$t0,$t0,$t2,lsl#8
+	orr	$t0,$t0,$t3,lsl#16
 	and	$t1,$b,$t1,ror#2
+	add	$e,$e,$t0			@ E+=X[i]
 	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
+	str	$t0,[$Xi,#-4]!
 	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
 ___
 }
 
 sub BODY_16_19 {
 my ($a,$b,$c,$d,$e)=@_;
-	&Xupdate(@_);
+	&Xupdate(@_,"and $t1,$b,$t1,ror#2");
 $code.=<<___;
-	and	$t1,$b,$t1,ror#2
 	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
 	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
 ___
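
BODY_00_15 now inlines what Xload used to emit: the input word is still gathered one byte at a time, so the code keeps working on unaligned input and on either endianness, but the bytes are fetched in an order that lets the orr/lsl merging interleave with the round arithmetic. The assembled value is simply the big-endian word formed from the four input bytes, and rounds 0..19 evaluate their F through the identity (B & C) | (~B & D) == D ^ (B & (C ^ D)). A hedged C sketch of both, with names of my own choosing rather than the script's:

#include <stdint.h>

/* The ldrb/orr sequence in BODY_00_15 builds X[i] as the big-endian
 * interpretation of the next four message bytes, independent of the
 * host's endianness and of the buffer's alignment. */
static uint32_t load_be32(const unsigned char *p)
{
    return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
           ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
}

/* F for rounds 0..19: (B & C) | (~B & D), computed the way the
 * eor/and/eor instruction pattern does it. */
static uint32_t F_00_19(uint32_t b, uint32_t c, uint32_t d)
{
    return d ^ (b & (c ^ d));
}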
@@ -122,22 +120,18 @@ ___
 
 sub BODY_20_39 {
 my ($a,$b,$c,$d,$e)=@_;
-	&Xupdate(@_);
+	&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
 $code.=<<___;
-	eor	$t1,$b,$t1,ror#2		@ F_20_39(B,C,D)
 	add	$e,$e,$t1			@ E+=F_20_39(B,C,D)
 ___
 }
 
 sub BODY_40_59 {
 my ($a,$b,$c,$d,$e)=@_;
-	&Xupdate(@_,1);
+	&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
 $code.=<<___;
-	and	$t1,$b,$c,ror#2
-	orr	$t2,$b,$c,ror#2
-	and	$t2,$t2,$d,ror#2
-	orr	$t1,$t1,$t2			@ F_40_59(B,C,D)
 	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
+	add	$e,$e,$t2,ror#2
 ___
 }
 
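
The largest change is in BODY_40_59: the old four-instruction majority computation becomes two ANDs injected into Xupdate via $opt1/$opt2 plus two adds into E. That relies on MAJ(B,C,D) = (B&C)|(B&D)|(C&D) splitting into B&(C^D) and C&D, two terms that can never both have a 1 in the same bit position, so they may be summed into E one at a time instead of being OR'd together first (the ror#2 operands appear to compensate for the rotated form in which this implementation keeps its working variables). A small self-contained C check of that identity, illustrative and not part of the script:

#include <assert.h>
#include <stdint.h>

/* Textbook majority function used in SHA-1 rounds 40..59. */
static uint32_t maj(uint32_t b, uint32_t c, uint32_t d)
{
    return (b & c) | (b & d) | (c & d);
}

int main(void)
{
    uint32_t b = 0xdeadbeef, c = 0x01234567, d = 0x89abcdef;
    /* The two terms the rescheduled code adds into E separately. */
    uint32_t t1 = b & (c ^ d);
    uint32_t t2 = c & d;
    /* t1 is nonzero only where C and D differ, t2 only where both are 1,
     * so the terms are disjoint and their sum equals their OR ...      */
    assert((t1 & t2) == 0);
    /* ... which in turn equals the majority function.                  */
    assert((t1 + t2) == maj(b, c, d));
    return 0;
}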