diff options
Diffstat (limited to 'src/lib/libcrypto/sha/asm/sha512-armv4.pl')
-rw-r--r-- | src/lib/libcrypto/sha/asm/sha512-armv4.pl | 32 |
1 files changed, 18 insertions, 14 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl index 4fbb94a914..3a35861ac6 100644 --- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl +++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl | |||
@@ -10,7 +10,13 @@ | |||
10 | # SHA512 block procedure for ARMv4. September 2007. | 10 | # SHA512 block procedure for ARMv4. September 2007. |
11 | 11 | ||
12 | # This code is ~4.5 (four and a half) times faster than code generated | 12 | # This code is ~4.5 (four and a half) times faster than code generated |
13 | # by gcc 3.4 and it spends ~72 clock cycles per byte. | 13 | # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue |
14 | # Xscale PXA250 core]. | ||
15 | # | ||
16 | # July 2010. | ||
17 | # | ||
18 | # Rescheduling for dual-issue pipeline resulted in 6% improvement on | ||
19 | # Cortex A8 core and ~40 cycles per processed byte. | ||
14 | 20 | ||
15 | # Byte order [in]dependence. ========================================= | 21 | # Byte order [in]dependence. ========================================= |
16 | # | 22 | # |
@@ -22,7 +28,7 @@ $hi=0; | |||
22 | $lo=4; | 28 | $lo=4; |
23 | # ==================================================================== | 29 | # ==================================================================== |
24 | 30 | ||
25 | $output=shift; | 31 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
26 | open STDOUT,">$output"; | 32 | open STDOUT,">$output"; |
27 | 33 | ||
28 | $ctx="r0"; | 34 | $ctx="r0"; |
@@ -73,33 +79,31 @@ $code.=<<___; | |||
73 | eor $t0,$t0,$Elo,lsl#23 | 79 | eor $t0,$t0,$Elo,lsl#23 |
74 | eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) | 80 | eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) |
75 | adds $Tlo,$Tlo,$t0 | 81 | adds $Tlo,$Tlo,$t0 |
76 | adc $Thi,$Thi,$t1 @ T += Sigma1(e) | ||
77 | adds $Tlo,$Tlo,$t2 | ||
78 | adc $Thi,$Thi,$t3 @ T += h | ||
79 | |||
80 | ldr $t0,[sp,#$Foff+0] @ f.lo | 82 | ldr $t0,[sp,#$Foff+0] @ f.lo |
83 | adc $Thi,$Thi,$t1 @ T += Sigma1(e) | ||
81 | ldr $t1,[sp,#$Foff+4] @ f.hi | 84 | ldr $t1,[sp,#$Foff+4] @ f.hi |
85 | adds $Tlo,$Tlo,$t2 | ||
82 | ldr $t2,[sp,#$Goff+0] @ g.lo | 86 | ldr $t2,[sp,#$Goff+0] @ g.lo |
87 | adc $Thi,$Thi,$t3 @ T += h | ||
83 | ldr $t3,[sp,#$Goff+4] @ g.hi | 88 | ldr $t3,[sp,#$Goff+4] @ g.hi |
84 | str $Elo,[sp,#$Eoff+0] | ||
85 | str $Ehi,[sp,#$Eoff+4] | ||
86 | str $Alo,[sp,#$Aoff+0] | ||
87 | str $Ahi,[sp,#$Aoff+4] | ||
88 | 89 | ||
89 | eor $t0,$t0,$t2 | 90 | eor $t0,$t0,$t2 |
91 | str $Elo,[sp,#$Eoff+0] | ||
90 | eor $t1,$t1,$t3 | 92 | eor $t1,$t1,$t3 |
93 | str $Ehi,[sp,#$Eoff+4] | ||
91 | and $t0,$t0,$Elo | 94 | and $t0,$t0,$Elo |
95 | str $Alo,[sp,#$Aoff+0] | ||
92 | and $t1,$t1,$Ehi | 96 | and $t1,$t1,$Ehi |
97 | str $Ahi,[sp,#$Aoff+4] | ||
93 | eor $t0,$t0,$t2 | 98 | eor $t0,$t0,$t2 |
94 | eor $t1,$t1,$t3 @ Ch(e,f,g) | ||
95 | |||
96 | ldr $t2,[$Ktbl,#4] @ K[i].lo | 99 | ldr $t2,[$Ktbl,#4] @ K[i].lo |
100 | eor $t1,$t1,$t3 @ Ch(e,f,g) | ||
97 | ldr $t3,[$Ktbl,#0] @ K[i].hi | 101 | ldr $t3,[$Ktbl,#0] @ K[i].hi |
98 | ldr $Elo,[sp,#$Doff+0] @ d.lo | ||
99 | ldr $Ehi,[sp,#$Doff+4] @ d.hi | ||
100 | 102 | ||
101 | adds $Tlo,$Tlo,$t0 | 103 | adds $Tlo,$Tlo,$t0 |
104 | ldr $Elo,[sp,#$Doff+0] @ d.lo | ||
102 | adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) | 105 | adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) |
106 | ldr $Ehi,[sp,#$Doff+4] @ d.hi | ||
103 | adds $Tlo,$Tlo,$t2 | 107 | adds $Tlo,$Tlo,$t2 |
104 | adc $Thi,$Thi,$t3 @ T += K[i] | 108 | adc $Thi,$Thi,$t3 @ T += K[i] |
105 | adds $Elo,$Elo,$Tlo | 109 | adds $Elo,$Elo,$Tlo |