diff options
Diffstat (limited to 'src/lib/libcrypto/sha/asm/sha1-armv4-large.pl')
-rw-r--r-- | src/lib/libcrypto/sha/asm/sha1-armv4-large.pl | 76 |
1 file changed, 35 insertions, 41 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
index 88861af641..6e65fe3e01 100644
--- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl | |||
@@ -37,9 +37,18 @@ | |||
37 | # modes are limited. As result it takes more instructions to do | 37 | # modes are limited. As result it takes more instructions to do |
38 | # the same job in Thumb, therefore the code is never twice as | 38 | # the same job in Thumb, therefore the code is never twice as |
39 | # small and always slower. | 39 | # small and always slower. |
40 | # [***] which is also ~35% better than compiler generated code. | 40 | # [***] which is also ~35% better than compiler generated code. Dual- |
41 | # issue Cortex A8 core was measured to process input block in | ||
42 | # ~990 cycles. | ||
41 | 43 | ||
42 | $output=shift; | 44 | # August 2010. |
45 | # | ||
46 | # Rescheduling for dual-issue pipeline resulted in 13% improvement on | ||
47 | # Cortex A8 core and in absolute terms ~870 cycles per input block | ||
48 | # [or 13.6 cycles per byte]. | ||
49 | |||
50 | |||
51 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
43 | open STDOUT,">$output"; | 52 | open STDOUT,">$output"; |
44 | 53 | ||
45 | $ctx="r0"; | 54 | $ctx="r0"; |
@@ -58,43 +67,22 @@ $t3="r12"; | |||
58 | $Xi="r14"; | 67 | $Xi="r14"; |
59 | @V=($a,$b,$c,$d,$e); | 68 | @V=($a,$b,$c,$d,$e); |
60 | 69 | ||
61 | # One can optimize this for aligned access on big-endian architecture, | ||
62 | # but code's endian neutrality makes it too pretty:-) | ||
63 | sub Xload { | ||
64 | my ($a,$b,$c,$d,$e)=@_; | ||
65 | $code.=<<___; | ||
66 | ldrb $t0,[$inp],#4 | ||
67 | ldrb $t1,[$inp,#-3] | ||
68 | ldrb $t2,[$inp,#-2] | ||
69 | ldrb $t3,[$inp,#-1] | ||
70 | add $e,$K,$e,ror#2 @ E+=K_00_19 | ||
71 | orr $t0,$t1,$t0,lsl#8 | ||
72 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) | ||
73 | orr $t0,$t2,$t0,lsl#8 | ||
74 | eor $t1,$c,$d @ F_xx_xx | ||
75 | orr $t0,$t3,$t0,lsl#8 | ||
76 | add $e,$e,$t0 @ E+=X[i] | ||
77 | str $t0,[$Xi,#-4]! | ||
78 | ___ | ||
79 | } | ||
80 | sub Xupdate { | 70 | sub Xupdate { |
81 | my ($a,$b,$c,$d,$e,$flag)=@_; | 71 | my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_; |
82 | $code.=<<___; | 72 | $code.=<<___; |
83 | ldr $t0,[$Xi,#15*4] | 73 | ldr $t0,[$Xi,#15*4] |
84 | ldr $t1,[$Xi,#13*4] | 74 | ldr $t1,[$Xi,#13*4] |
85 | ldr $t2,[$Xi,#7*4] | 75 | ldr $t2,[$Xi,#7*4] |
86 | ldr $t3,[$Xi,#2*4] | ||
87 | add $e,$K,$e,ror#2 @ E+=K_xx_xx | 76 | add $e,$K,$e,ror#2 @ E+=K_xx_xx |
77 | ldr $t3,[$Xi,#2*4] | ||
88 | eor $t0,$t0,$t1 | 78 | eor $t0,$t0,$t1 |
89 | eor $t0,$t0,$t2 | 79 | eor $t2,$t2,$t3 |
90 | eor $t0,$t0,$t3 | 80 | eor $t1,$c,$d @ F_xx_xx |
91 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) | ||
92 | ___ | ||
93 | $code.=<<___ if (!defined($flag)); | ||
94 | eor $t1,$c,$d @ F_xx_xx, but not in 40_59 | ||
95 | ___ | ||
96 | $code.=<<___; | ||
97 | mov $t0,$t0,ror#31 | 81 | mov $t0,$t0,ror#31 |
82 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) | ||
83 | eor $t0,$t0,$t2,ror#31 | ||
84 | $opt1 @ F_xx_xx | ||
85 | $opt2 @ F_xx_xx | ||
98 | add $e,$e,$t0 @ E+=X[i] | 86 | add $e,$e,$t0 @ E+=X[i] |
99 | str $t0,[$Xi,#-4]! | 87 | str $t0,[$Xi,#-4]! |
100 | ___ | 88 | ___ |
@@ -102,19 +90,29 @@ ___ | |||
102 | 90 | ||
103 | sub BODY_00_15 { | 91 | sub BODY_00_15 { |
104 | my ($a,$b,$c,$d,$e)=@_; | 92 | my ($a,$b,$c,$d,$e)=@_; |
105 | &Xload(@_); | ||
106 | $code.=<<___; | 93 | $code.=<<___; |
94 | ldrb $t0,[$inp],#4 | ||
95 | ldrb $t1,[$inp,#-1] | ||
96 | ldrb $t2,[$inp,#-2] | ||
97 | add $e,$K,$e,ror#2 @ E+=K_00_19 | ||
98 | ldrb $t3,[$inp,#-3] | ||
99 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) | ||
100 | orr $t0,$t1,$t0,lsl#24 | ||
101 | eor $t1,$c,$d @ F_xx_xx | ||
102 | orr $t0,$t0,$t2,lsl#8 | ||
103 | orr $t0,$t0,$t3,lsl#16 | ||
107 | and $t1,$b,$t1,ror#2 | 104 | and $t1,$b,$t1,ror#2 |
105 | add $e,$e,$t0 @ E+=X[i] | ||
108 | eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) | 106 | eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) |
107 | str $t0,[$Xi,#-4]! | ||
109 | add $e,$e,$t1 @ E+=F_00_19(B,C,D) | 108 | add $e,$e,$t1 @ E+=F_00_19(B,C,D) |
110 | ___ | 109 | ___ |
111 | } | 110 | } |
112 | 111 | ||
113 | sub BODY_16_19 { | 112 | sub BODY_16_19 { |
114 | my ($a,$b,$c,$d,$e)=@_; | 113 | my ($a,$b,$c,$d,$e)=@_; |
115 | &Xupdate(@_); | 114 | &Xupdate(@_,"and $t1,$b,$t1,ror#2"); |
116 | $code.=<<___; | 115 | $code.=<<___; |
117 | and $t1,$b,$t1,ror#2 | ||
118 | eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) | 116 | eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) |
119 | add $e,$e,$t1 @ E+=F_00_19(B,C,D) | 117 | add $e,$e,$t1 @ E+=F_00_19(B,C,D) |
120 | ___ | 118 | ___ |
@@ -122,22 +120,18 @@ ___ | |||
122 | 120 | ||
123 | sub BODY_20_39 { | 121 | sub BODY_20_39 { |
124 | my ($a,$b,$c,$d,$e)=@_; | 122 | my ($a,$b,$c,$d,$e)=@_; |
125 | &Xupdate(@_); | 123 | &Xupdate(@_,"eor $t1,$b,$t1,ror#2"); |
126 | $code.=<<___; | 124 | $code.=<<___; |
127 | eor $t1,$b,$t1,ror#2 @ F_20_39(B,C,D) | ||
128 | add $e,$e,$t1 @ E+=F_20_39(B,C,D) | 125 | add $e,$e,$t1 @ E+=F_20_39(B,C,D) |
129 | ___ | 126 | ___ |
130 | } | 127 | } |
131 | 128 | ||
132 | sub BODY_40_59 { | 129 | sub BODY_40_59 { |
133 | my ($a,$b,$c,$d,$e)=@_; | 130 | my ($a,$b,$c,$d,$e)=@_; |
134 | &Xupdate(@_,1); | 131 | &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d"); |
135 | $code.=<<___; | 132 | $code.=<<___; |
136 | and $t1,$b,$c,ror#2 | ||
137 | orr $t2,$b,$c,ror#2 | ||
138 | and $t2,$t2,$d,ror#2 | ||
139 | orr $t1,$t1,$t2 @ F_40_59(B,C,D) | ||
140 | add $e,$e,$t1 @ E+=F_40_59(B,C,D) | 133 | add $e,$e,$t1 @ E+=F_40_59(B,C,D) |
134 | add $e,$e,$t2,ror#2 | ||
141 | ___ | 135 | ___ |
142 | } | 136 | } |
143 | 137 | ||