diff options
Diffstat (limited to 'src/lib/libcrypto/sha/asm/sha256-armv4.pl')
-rw-r--r-- | src/lib/libcrypto/sha/asm/sha256-armv4.pl | 33 |
1 files changed, 19 insertions, 14 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
index 48d846deec..492cb62bc0 100644
--- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl | |||
@@ -11,9 +11,14 @@ | |||
11 | 11 | ||
12 | # Performance is ~2x better than gcc 3.4 generated code and in "abso- | 12 | # Performance is ~2x better than gcc 3.4 generated code and in "abso- |
13 | # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per | 13 | # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per |
14 | # byte. | 14 | # byte [on single-issue Xscale PXA250 core]. |
15 | 15 | ||
16 | $output=shift; | 16 | # July 2010. |
17 | # | ||
18 | # Rescheduling for dual-issue pipeline resulted in 22% improvement on | ||
19 | # Cortex A8 core and ~20 cycles per processed byte. | ||
20 | |||
21 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
17 | open STDOUT,">$output"; | 22 | open STDOUT,">$output"; |
18 | 23 | ||
19 | $ctx="r0"; $t0="r0"; | 24 | $ctx="r0"; $t0="r0"; |
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16); | |||
52 | ___ | 57 | ___ |
53 | $code.=<<___; | 58 | $code.=<<___; |
54 | ldr $t2,[$Ktbl],#4 @ *K256++ | 59 | ldr $t2,[$Ktbl],#4 @ *K256++ |
55 | str $T1,[sp,#`$i%16`*4] | ||
56 | mov $t0,$e,ror#$Sigma1[0] | 60 | mov $t0,$e,ror#$Sigma1[0] |
61 | str $T1,[sp,#`$i%16`*4] | ||
57 | eor $t0,$t0,$e,ror#$Sigma1[1] | 62 | eor $t0,$t0,$e,ror#$Sigma1[1] |
58 | eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) | ||
59 | add $T1,$T1,$t0 | ||
60 | eor $t1,$f,$g | 63 | eor $t1,$f,$g |
64 | eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) | ||
61 | and $t1,$t1,$e | 65 | and $t1,$t1,$e |
66 | add $T1,$T1,$t0 | ||
62 | eor $t1,$t1,$g @ Ch(e,f,g) | 67 | eor $t1,$t1,$g @ Ch(e,f,g) |
63 | add $T1,$T1,$t1 | ||
64 | add $T1,$T1,$h | 68 | add $T1,$T1,$h |
65 | add $T1,$T1,$t2 | ||
66 | mov $h,$a,ror#$Sigma0[0] | 69 | mov $h,$a,ror#$Sigma0[0] |
70 | add $T1,$T1,$t1 | ||
67 | eor $h,$h,$a,ror#$Sigma0[1] | 71 | eor $h,$h,$a,ror#$Sigma0[1] |
72 | add $T1,$T1,$t2 | ||
68 | eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) | 73 | eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) |
69 | orr $t0,$a,$b | 74 | orr $t0,$a,$b |
70 | and $t0,$t0,$c | ||
71 | and $t1,$a,$b | 75 | and $t1,$a,$b |
76 | and $t0,$t0,$c | ||
77 | add $h,$h,$T1 | ||
72 | orr $t0,$t0,$t1 @ Maj(a,b,c) | 78 | orr $t0,$t0,$t1 @ Maj(a,b,c) |
73 | add $h,$h,$t0 | ||
74 | add $d,$d,$T1 | 79 | add $d,$d,$T1 |
75 | add $h,$h,$T1 | 80 | add $h,$h,$t0 |
76 | ___ | 81 | ___ |
77 | } | 82 | } |
78 | 83 | ||
@@ -80,19 +85,19 @@ sub BODY_16_XX { | |||
80 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 85 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
81 | 86 | ||
82 | $code.=<<___; | 87 | $code.=<<___; |
83 | ldr $t1,[sp,#`($i+1)%16`*4] @ $i | 88 | ldr $t1,[sp,#`($i+1)%16`*4] @ $i |
84 | ldr $t2,[sp,#`($i+14)%16`*4] | 89 | ldr $t2,[sp,#`($i+14)%16`*4] |
85 | ldr $T1,[sp,#`($i+0)%16`*4] | 90 | ldr $T1,[sp,#`($i+0)%16`*4] |
86 | ldr $inp,[sp,#`($i+9)%16`*4] | ||
87 | mov $t0,$t1,ror#$sigma0[0] | 91 | mov $t0,$t1,ror#$sigma0[0] |
92 | ldr $inp,[sp,#`($i+9)%16`*4] | ||
88 | eor $t0,$t0,$t1,ror#$sigma0[1] | 93 | eor $t0,$t0,$t1,ror#$sigma0[1] |
89 | eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) | 94 | eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) |
90 | mov $t1,$t2,ror#$sigma1[0] | 95 | mov $t1,$t2,ror#$sigma1[0] |
96 | add $T1,$T1,$t0 | ||
91 | eor $t1,$t1,$t2,ror#$sigma1[1] | 97 | eor $t1,$t1,$t2,ror#$sigma1[1] |
98 | add $T1,$T1,$inp | ||
92 | eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) | 99 | eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) |
93 | add $T1,$T1,$t0 | ||
94 | add $T1,$T1,$t1 | 100 | add $T1,$T1,$t1 |
95 | add $T1,$T1,$inp | ||
96 | ___ | 101 | ___ |
97 | &BODY_00_15(@_); | 102 | &BODY_00_15(@_); |
98 | } | 103 | } |