Diffstat:
 -rw-r--r--  src/lib/libcrypto/sha/asm/sha256-armv4.pl | 55 ++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 15 deletions(-)
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
index 492cb62bc0..9c84e8d93c 100644
--- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
@@ -18,11 +18,16 @@
 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
 # Cortex A8 core and ~20 cycles per processed byte.
 
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~17 cycles per processed byte.
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $ctx="r0";	$t0="r0";
-$inp="r1";
+$inp="r1";	$t3="r1";
 $len="r2";	$t1="r2";
 $T1="r3";
 $A="r4";
@@ -46,6 +51,9 @@ sub BODY_00_15 {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+	ldr	$T1,[$inp],#4
+#else
 	ldrb	$T1,[$inp,#3]			@ $i
 	ldrb	$t2,[$inp,#2]
 	ldrb	$t1,[$inp,#1]
@@ -53,16 +61,24 @@ $code.=<<___ if ($i<16);
 	orr	$T1,$T1,$t2,lsl#8
 	orr	$T1,$T1,$t1,lsl#16
 	orr	$T1,$T1,$t0,lsl#24
-	`"str	$inp,[sp,#17*4]"	if ($i==15)`
+#endif
 ___
 $code.=<<___;
-	ldr	$t2,[$Ktbl],#4			@ *K256++
 	mov	$t0,$e,ror#$Sigma1[0]
-	str	$T1,[sp,#`$i%16`*4]
+	ldr	$t2,[$Ktbl],#4			@ *K256++
 	eor	$t0,$t0,$e,ror#$Sigma1[1]
 	eor	$t1,$f,$g
+#if $i>=16
+	add	$T1,$T1,$t3			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	$T1,$T1
+#endif
+#if $i==15
+	str	$inp,[sp,#17*4]			@ leave room for $t3
+#endif
 	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
 	and	$t1,$t1,$e
+	str	$T1,[sp,#`$i%16`*4]
 	add	$T1,$T1,$t0
 	eor	$t1,$t1,$g			@ Ch(e,f,g)
 	add	$T1,$T1,$h
@@ -71,6 +87,9 @@ $code.=<<___;
 	eor	$h,$h,$a,ror#$Sigma0[1]
 	add	$T1,$T1,$t2
 	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
+#if $i>=15
+	ldr	$t3,[sp,#`($i+2)%16`*4]		@ from BODY_16_xx
+#endif
 	orr	$t0,$a,$b
 	and	$t1,$a,$b
 	and	$t0,$t0,$c
@@ -85,24 +104,26 @@ sub BODY_16_XX {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
+	@ ldr	$t3,[sp,#`($i+1)%16`*4]		@ $i
 	ldr	$t2,[sp,#`($i+14)%16`*4]
+	mov	$t0,$t3,ror#$sigma0[0]
 	ldr	$T1,[sp,#`($i+0)%16`*4]
-	mov	$t0,$t1,ror#$sigma0[0]
-	ldr	$inp,[sp,#`($i+9)%16`*4]
-	eor	$t0,$t0,$t1,ror#$sigma0[1]
-	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
-	mov	$t1,$t2,ror#$sigma1[0]
+	eor	$t0,$t0,$t3,ror#$sigma0[1]
+	ldr	$t1,[sp,#`($i+9)%16`*4]
+	eor	$t0,$t0,$t3,lsr#$sigma0[2]	@ sigma0(X[i+1])
+	mov	$t3,$t2,ror#$sigma1[0]
 	add	$T1,$T1,$t0
-	eor	$t1,$t1,$t2,ror#$sigma1[1]
-	add	$T1,$T1,$inp
-	eor	$t1,$t1,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	eor	$t3,$t3,$t2,ror#$sigma1[1]
 	add	$T1,$T1,$t1
+	eor	$t3,$t3,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	@ add	$T1,$T1,$t3
 ___
 &BODY_00_15(@_);
 }
 
 $code=<<___;
+#include "arm_arch.h"
+
 .text
 .code 32
 
@@ -132,7 +153,7 @@ K256:
 sha256_block_data_order:
 	sub	r3,pc,#8		@ sha256_block_data_order
 	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
-	stmdb	sp!,{$ctx,$inp,$len,r4-r12,lr}
+	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
 	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
 	sub	$Ktbl,r3,#256		@ K256
 	sub	sp,sp,#16*4		@ alloca(X[16])
@@ -171,10 +192,14 @@ $code.=<<___;
 	bne	.Loop
 
 	add	sp,sp,#`16+3`*4	@ destroy frame
-	ldmia	sp!,{r4-r12,lr}
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	sha256_block_data_order,.-sha256_block_data_order
 .asciz	"SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
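
Note: the heart of this patch is how BODY_00_15 fetches each big-endian message word. On __ARM_ARCH__>=7 the four ldrb/orr byte loads collapse into a single ldr, followed by a rev byte swap when assembling for little-endian (__ARMEL__). The new $t3 register (aliasing r1) then carries the next scheduled X[i] word between rounds so BODY_16_XX no longer reloads it, r12 drops out of the saved register set, and on ARMv5+ the epilogue pops straight into pc instead of the lr/tst/moveq/bx interworking sequence. Below is a minimal C sketch of the load equivalence only; it is not part of the patch, and the function names and host-endianness check are illustrative assumptions.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* ARMv4 path: four ldrb loads merged with orr; works on any byte order. */
	static uint32_t load_be32_bytewise(const unsigned char *p)
	{
		return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
		       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
	}

	/* ARMv7 path: one word load (ldr $T1,[$inp],#4); on a little-endian
	 * target a rev instruction swaps the bytes, on big-endian the word
	 * is already in SHA-256 order. */
	static uint32_t load_be32_wordwise(const unsigned char *p)
	{
		uint32_t w;
		memcpy(&w, p, 4);		/* ldr $T1,[$inp],#4 */
	#if defined(__ARMEL__) || \
	    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
		w = __builtin_bswap32(w);	/* rev $T1,$T1 */
	#endif
		return w;
	}

	int main(void)
	{
		const unsigned char msg[4] = { 0xde, 0xad, 0xbe, 0xef };
		/* both print deadbeef */
		printf("%08x %08x\n", (unsigned)load_be32_bytewise(msg),
		       (unsigned)load_be32_wordwise(msg));
		return 0;
	}

Both paths yield the same word; the word-wide path simply trades four loads and three orrs per message word for one load and, on little-endian cores, one rev.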