Diffstat (limited to '')
 src/lib/libcrypto/sha/asm/sha512-armv4.pl | 357 ++++++++++++++++++++++--------
 1 file changed, 268 insertions(+), 89 deletions(-)
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
index 3a35861ac6..7faf37b147 100644
--- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
@@ -18,22 +18,33 @@
 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
 # Cortex A8 core and ~40 cycles per processed byte.
 
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 25.5 cycles or 47% faster than integer-only code.
+
 # Byte order [in]dependence. =========================================
 #
-# Caller is expected to maintain specific *dword* order in h[0-7],
-# namely with most significant dword at *lower* address, which is
-# reflected in below two parameters. *Byte* order within these dwords
-# in turn is whatever *native* byte order on current platform.
-$hi=0;
-$lo=4;
+# Originally caller was expected to maintain specific *dword* order in
+# h[0-7], namely with most significant dword at *lower* address, which
+# was reflected in below two parameters as 0 and 4. Now caller is
+# expected to maintain native byte order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
 # ====================================================================
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
-$ctx="r0";
+$ctx="r0";	# parameter block
 $inp="r1";
 $len="r2";
+
 $Tlo="r3";
 $Thi="r4";
 $Alo="r5";
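
The byte-order note above is the crux of this change: once the caller keeps whole 64-bit values in native order, the offset of each 32-bit half depends on endianness, which is why `$hi`/`$lo` become the symbolic `HI`/`LO` resolved by the preprocessor further down in this patch. A minimal C illustration of that addressing rule (standalone sketch, not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Where do the high/low 32-bit halves of a natively stored uint64_t
 * live in memory? On little-endian the low half comes first (LO=0,
 * HI=4); on big-endian it is the opposite (HI=0, LO=4). */
int main(void)
{
	uint64_t x = 0x0123456789abcdefULL;
	uint32_t w[2];

	memcpy(w, &x, sizeof(x));	/* reinterpret as two 32-bit words */
	if (w[0] == (uint32_t)x)
		printf("little-endian: LO=0, HI=4\n");
	else
		printf("big-endian:    HI=0, LO=4\n");
	return 0;
}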
@@ -61,15 +72,17 @@ $Xoff=8*8;
 sub BODY_00_15() {
 my $magic = shift;
 $code.=<<___;
-	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
-	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
 	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
 	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
 	mov	$t0,$Elo,lsr#14
+	str	$Tlo,[sp,#$Xoff+0]
 	mov	$t1,$Ehi,lsr#14
+	str	$Thi,[sp,#$Xoff+4]
 	eor	$t0,$t0,$Ehi,lsl#18
+	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
 	eor	$t1,$t1,$Elo,lsl#18
+	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
 	eor	$t0,$t0,$Elo,lsr#18
 	eor	$t1,$t1,$Ehi,lsr#18
 	eor	$t0,$t0,$Ehi,lsl#14
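
The `@ LO`/`@ HI` comments above record how each 64-bit rotation in Sigma1 splits into shifts of the two 32-bit halves (XOR and OR coincide here because each pair of parts is bit-disjoint). A small C cross-check of that decomposition, assuming nothing beyond the comments; the function names are ours:

#include <stdint.h>
#include <assert.h>

#define ROTR64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))

/* Reference: Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41) */
static uint64_t Sigma1_64(uint64_t x)
{
	return ROTR64(x,14) ^ ROTR64(x,18) ^ ROTR64(x,41);
}

/* Same function on 32-bit halves, mirroring the @ LO/@ HI comments:
 * LO = lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 * HI = hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 */
static uint64_t Sigma1_32x2(uint32_t hi, uint32_t lo)
{
	uint32_t rlo = (lo>>14 ^ hi<<18) ^ (lo>>18 ^ hi<<14) ^ (hi>>9 ^ lo<<23);
	uint32_t rhi = (hi>>14 ^ lo<<18) ^ (hi>>18 ^ lo<<14) ^ (lo>>9 ^ hi<<23);
	return (uint64_t)rhi << 32 | rlo;
}

int main(void)
{
	uint64_t x = 0x510e527fade682d1ULL;	/* SHA-512 IV word e */
	assert(Sigma1_64(x) == Sigma1_32x2((uint32_t)(x >> 32), (uint32_t)x));
	return 0;
}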
@@ -96,25 +109,24 @@ $code.=<<___;
 	and	$t1,$t1,$Ehi
 	str	$Ahi,[sp,#$Aoff+4]
 	eor	$t0,$t0,$t2
-	ldr	$t2,[$Ktbl,#4]	@ K[i].lo
+	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
 	eor	$t1,$t1,$t3	@ Ch(e,f,g)
-	ldr	$t3,[$Ktbl,#0]	@ K[i].hi
+	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi
 
 	adds	$Tlo,$Tlo,$t0
 	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
 	adc	$Thi,$Thi,$t1	@ T += Ch(e,f,g)
 	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 	adds	$Tlo,$Tlo,$t2
+	and	$t0,$t2,#0xff
 	adc	$Thi,$Thi,$t3	@ T += K[i]
 	adds	$Elo,$Elo,$Tlo
+	ldr	$t2,[sp,#$Boff+0]	@ b.lo
 	adc	$Ehi,$Ehi,$Thi	@ d += T
-
-	and	$t0,$t2,#0xff
 	teq	$t0,#$magic
-	orreq	$Ktbl,$Ktbl,#1
 
-	ldr	$t2,[sp,#$Boff+0]	@ b.lo
 	ldr	$t3,[sp,#$Coff+0]	@ c.lo
+	orreq	$Ktbl,$Ktbl,#1
 	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
 	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
 	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
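
For context, the `eor`/`and`/`eor` sequence at the top of this hunk appears to evaluate Ch(e,f,g) in the reduced form ((f^g)&e)^g rather than the textbook (e&f)^(~e&g), saving the separate complement. A hedged C sketch verifying the identity (helper names are illustrative):

#include <stdint.h>
#include <assert.h>

/* Textbook: Ch(e,f,g) = (e & f) ^ (~e & g).
 * Cheaper form used in register-starved round code: ((f ^ g) & e) ^ g. */
static uint32_t ch_ref(uint32_t e, uint32_t f, uint32_t g)
{
	return (e & f) ^ (~e & g);
}

static uint32_t ch_opt(uint32_t e, uint32_t f, uint32_t g)
{
	return ((f ^ g) & e) ^ g;
}

int main(void)
{
	/* spot-check the identity on a few bit patterns */
	uint32_t v[] = { 0, 0xffffffffu, 0xa5a5a5a5u, 0x12345678u };
	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
			for (int k = 0; k < 4; k++)
				assert(ch_ref(v[i],v[j],v[k]) == ch_opt(v[i],v[j],v[k]));
	return 0;
}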
@@ -131,80 +143,100 @@ $code.=<<___;
 	eor	$t0,$t0,$Alo,lsl#25
 	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
 	adds	$Tlo,$Tlo,$t0
+	and	$t0,$Alo,$t2
 	adc	$Thi,$Thi,$t1	@ T += Sigma0(a)
 
-	and	$t0,$Alo,$t2
-	orr	$Alo,$Alo,$t2
 	ldr	$t1,[sp,#$Boff+4]	@ b.hi
+	orr	$Alo,$Alo,$t2
 	ldr	$t2,[sp,#$Coff+4]	@ c.hi
 	and	$Alo,$Alo,$t3
-	orr	$Alo,$Alo,$t0	@ Maj(a,b,c).lo
 	and	$t3,$Ahi,$t1
 	orr	$Ahi,$Ahi,$t1
+	orr	$Alo,$Alo,$t0	@ Maj(a,b,c).lo
 	and	$Ahi,$Ahi,$t2
-	orr	$Ahi,$Ahi,$t3	@ Maj(a,b,c).hi
 	adds	$Alo,$Alo,$Tlo
-	adc	$Ahi,$Ahi,$Thi	@ h += T
-
+	orr	$Ahi,$Ahi,$t3	@ Maj(a,b,c).hi
 	sub	sp,sp,#8
+	adc	$Ahi,$Ahi,$Thi	@ h += T
+	tst	$Ktbl,#1
 	add	$Ktbl,$Ktbl,#8
 ___
 }
 $code=<<___;
+#include "arm_arch.h"
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
+
 .text
 .code	32
 .type	K512,%object
 .align	5
 K512:
-.word	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
-.word	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
-.word	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
-.word	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
-.word	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
-.word	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
-.word	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
-.word	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
-.word	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
-.word	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
-.word	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
-.word	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
-.word	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
-.word	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
-.word	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
-.word	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
-.word	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
-.word	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
-.word	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
-.word	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
-.word	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
-.word	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
-.word	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
-.word	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
-.word	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
-.word	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
-.word	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
-.word	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
-.word	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
-.word	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
-.word	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
-.word	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
-.word	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
-.word	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
-.word	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
-.word	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
-.word	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
-.word	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
-.word	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
-.word	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .size	K512,.-K512
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha512_block_data_order
+.skip	32-4
 
 .global	sha512_block_data_order
 .type	sha512_block_data_order,%function
 sha512_block_data_order:
 	sub	r3,pc,#8		@ sha512_block_data_order
 	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+#if __ARM_ARCH__>=7
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#1
+	bne	.LNEON
+#endif
 	stmdb	sp!,{r4-r12,lr}
-	sub	$Ktbl,r3,#640		@ K512
+	sub	$Ktbl,r3,#672		@ K512
 	sub	sp,sp,#9*8
 
 	ldr	$Elo,[$ctx,#$Eoff+$lo]
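
The Maj(a,b,c) lines in this hunk overwrite a in place using the equivalent form ((a|c)&b)|(a&c), as the `@ Maj(a,b,c).lo/.hi` comments indicate; a was already saved to the stack earlier in the round. A short C sketch of that identity (names ours, for illustration):

#include <stdint.h>
#include <assert.h>

/* Textbook: Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c).
 * The rounds compute the equivalent ((a | c) & b) | (a & c),
 * which maps onto ARM's two-operand and/orr flow. */
static uint32_t maj_ref(uint32_t a, uint32_t b, uint32_t c)
{
	return (a & b) ^ (a & c) ^ (b & c);
}

static uint32_t maj_opt(uint32_t a, uint32_t b, uint32_t c)
{
	uint32_t t = a & c;	/* saved first, as in the asm */
	a |= c;
	a &= b;
	return a | t;		/* Maj(a,b,c) */
}

int main(void)
{
	uint32_t v[] = { 0, 0xffffffffu, 0x0f0f0f0fu, 0xdeadbeefu };
	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
			for (int k = 0; k < 4; k++)
				assert(maj_ref(v[i],v[j],v[k]) == maj_opt(v[i],v[j],v[k]));
	return 0;
}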
@@ -238,6 +270,7 @@ sha512_block_data_order:
 	str	$Thi,[sp,#$Foff+4]
 
 .L00_15:
+#if __ARM_ARCH__<7
 	ldrb	$Tlo,[$inp,#7]
 	ldrb	$t0, [$inp,#6]
 	ldrb	$t1, [$inp,#5]
@@ -252,26 +285,30 @@ sha512_block_data_order:
 	orr	$Thi,$Thi,$t3,lsl#8
 	orr	$Thi,$Thi,$t0,lsl#16
 	orr	$Thi,$Thi,$t1,lsl#24
-	str	$Tlo,[sp,#$Xoff+0]
-	str	$Thi,[sp,#$Xoff+4]
+#else
+	ldr	$Tlo,[$inp,#4]
+	ldr	$Thi,[$inp],#8
+#ifdef __ARMEL__
+	rev	$Tlo,$Tlo
+	rev	$Thi,$Thi
+#endif
+#endif
 ___
 	&BODY_00_15(0x94);
 $code.=<<___;
 	tst	$Ktbl,#1
 	beq	.L00_15
-	bic	$Ktbl,$Ktbl,#1
-
-.L16_79:
 	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
 	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
-	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
-	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
-
+	bic	$Ktbl,$Ktbl,#1
+.L16_79:
 	@ sigma0(x)	(ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
 	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
 	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
 	mov	$Tlo,$t0,lsr#1
+	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
 	mov	$Thi,$t1,lsr#1
+	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
 	eor	$Tlo,$Tlo,$t1,lsl#31
 	eor	$Thi,$Thi,$t0,lsl#31
 	eor	$Tlo,$Tlo,$t0,lsr#8
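
The `#if __ARM_ARCH__<7` split above chooses between assembling the big-endian input bytewise (no alignment assumptions) and doing word loads followed by `rev` on little-endian ARMv7. Roughly the same trade-off in C, assuming a GCC/clang toolchain; these helpers are illustrative, not from the patch:

#include <stdint.h>
#include <string.h>

/* Bytewise assembly of a big-endian 64-bit word: no alignment or
 * endianness assumptions, matching the pre-ARMv7 ldrb path. */
static uint64_t load_be64_bytes(const unsigned char *p)
{
	uint64_t v = 0;
	for (int i = 0; i < 8; i++)
		v = v << 8 | p[i];
	return v;
}

/* Word load plus byte swap, matching the ARMv7 ldr+rev path;
 * assumes a little-endian host and a GCC/clang builtin. */
static uint64_t load_be64_swap(const unsigned char *p)
{
	uint64_t v;
	memcpy(&v, p, 8);	/* lets the compiler emit plain loads */
	return __builtin_bswap64(v);
}

int main(void)
{
	unsigned char buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	return load_be64_bytes(buf) == load_be64_swap(buf) ? 0 : 1;
#else
	return load_be64_bytes(buf) == 0x0102030405060708ULL ? 0 : 1;
#endif
}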
@@ -295,25 +332,24 @@ $code.=<<___;
 	eor	$t1,$t1,$t3,lsl#3
 	eor	$t0,$t0,$t2,lsr#6
 	eor	$t1,$t1,$t3,lsr#6
+	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
 	eor	$t0,$t0,$t3,lsl#26
 
-	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
 	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
 	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#`$Xoff+8*16`+0]
 	adc	$Thi,$Thi,$t1
 
-	ldr	$t0,[sp,#`$Xoff+8*16`+0]
 	ldr	$t1,[sp,#`$Xoff+8*16`+4]
 	adds	$Tlo,$Tlo,$t2
 	adc	$Thi,$Thi,$t3
 	adds	$Tlo,$Tlo,$t0
 	adc	$Thi,$Thi,$t1
-	str	$Tlo,[sp,#$Xoff+0]
-	str	$Thi,[sp,#$Xoff+4]
 ___
 	&BODY_00_15(0x17);
 $code.=<<___;
-	tst	$Ktbl,#1
+	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
 	beq	.L16_79
 	bic	$Ktbl,$Ktbl,#1
 
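
The loads being shuffled here feed the SHA-512 message schedule, W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16], which the assembly keeps as a 16-word ring on the stack rather than a full 80-word array. The recurrence in plain C, as a reference sketch:

#include <stdint.h>
#include <stdio.h>

#define ROTR64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))

/* sigma0(x) = ROTR(x,1) ^ ROTR(x,8) ^ (x >> 7)   (as commented above)
 * sigma1(x) = ROTR(x,19) ^ ROTR(x,61) ^ (x >> 6) */
static uint64_t sigma0(uint64_t x) { return ROTR64(x,1) ^ ROTR64(x,8) ^ (x >> 7); }
static uint64_t sigma1(uint64_t x) { return ROTR64(x,19) ^ ROTR64(x,61) ^ (x >> 6); }

int main(void)
{
	/* one padded block of the empty message: 0x80 byte, then zero length */
	uint64_t W[80] = { 0x8000000000000000ULL };

	/* the recurrence the .L16_79 loop evaluates in its 16-word ring */
	for (int i = 16; i < 80; i++)
		W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16];
	printf("W[16] = %016llx\n", (unsigned long long)W[16]);
	return 0;
}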
@@ -324,12 +360,12 @@ $code.=<<___;
 	ldr	$t2, [$ctx,#$Boff+$lo]
 	ldr	$t3, [$ctx,#$Boff+$hi]
 	adds	$t0,$Alo,$t0
-	adc	$t1,$Ahi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$t0, [$ctx,#$Aoff+$lo]
+	adc	$t1,$Ahi,$t1
 	str	$t1, [$ctx,#$Aoff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Boff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Boff+$hi]
 
 	ldr	$Alo,[sp,#$Coff+0]
@@ -341,12 +377,12 @@ $code.=<<___;
 	ldr	$t2, [$ctx,#$Doff+$lo]
 	ldr	$t3, [$ctx,#$Doff+$hi]
 	adds	$t0,$Alo,$t0
-	adc	$t1,$Ahi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$t0, [$ctx,#$Coff+$lo]
+	adc	$t1,$Ahi,$t1
 	str	$t1, [$ctx,#$Coff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Doff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Doff+$hi]
 
 	ldr	$Tlo,[sp,#$Foff+0]
@@ -356,12 +392,12 @@ $code.=<<___;
 	ldr	$t2, [$ctx,#$Foff+$lo]
 	ldr	$t3, [$ctx,#$Foff+$hi]
 	adds	$Elo,$Elo,$t0
-	adc	$Ehi,$Ehi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$Elo,[$ctx,#$Eoff+$lo]
+	adc	$Ehi,$Ehi,$t1
 	str	$Ehi,[$ctx,#$Eoff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Foff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Foff+$hi]
 
 	ldr	$Alo,[sp,#$Goff+0]
@@ -373,12 +409,12 @@ $code.=<<___;
 	ldr	$t2, [$ctx,#$Hoff+$lo]
 	ldr	$t3, [$ctx,#$Hoff+$hi]
 	adds	$t0,$Alo,$t0
-	adc	$t1,$Ahi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$t0, [$ctx,#$Goff+$lo]
+	adc	$t1,$Ahi,$t1
 	str	$t1, [$ctx,#$Goff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Hoff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Hoff+$hi]
 
 	add	sp,sp,#640
@@ -388,13 +424,156 @@ $code.=<<___;
 	bne	.Loop
 
 	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia	sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
-.size	sha512_block_data_order,.-sha512_block_data_order
-.asciz	"SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+#endif
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
+
+$code.=<<___ if ($i<16 || $i&1);
+	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
+#if $i<16
+	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
+#endif
+	vshr.u64	$t1,$e,#@Sigma1[1]
+	vshr.u64	$t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
+	vsli.64		$t0,$e,#`64-@Sigma1[0]`
+	vsli.64		$t1,$e,#`64-@Sigma1[1]`
+	vsli.64		$t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+	vrev64.8	@X[$i],@X[$i]
+#endif
+	vadd.i64	$T1,$K,$h
+	veor		$Ch,$f,$g
+	veor		$t0,$t1
+	vand		$Ch,$e
+	veor		$t0,$t2			@ Sigma1(e)
+	veor		$Ch,$g			@ Ch(e,f,g)
+	vadd.i64	$T1,$t0
+	vshr.u64	$t0,$a,#@Sigma0[0]
+	vadd.i64	$T1,$Ch
+	vshr.u64	$t1,$a,#@Sigma0[1]
+	vshr.u64	$t2,$a,#@Sigma0[2]
+	vsli.64		$t0,$a,#`64-@Sigma0[0]`
+	vsli.64		$t1,$a,#`64-@Sigma0[1]`
+	vsli.64		$t2,$a,#`64-@Sigma0[2]`
+	vadd.i64	$T1,@X[$i%16]
+	vorr		$Maj,$a,$c
+	vand		$Ch,$a,$c
+	veor		$h,$t0,$t1
+	vand		$Maj,$b
+	veor		$h,$t2			@ Sigma0(a)
+	vorr		$Maj,$Ch		@ Maj(a,b,c)
+	vadd.i64	$h,$T1
+	vadd.i64	$d,$T1
+	vadd.i64	$h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1)	{ &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7));			# view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
+my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
+my $e=@_[4];					# $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
+	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
+	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
+	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
+	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
+	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
+	veor		$s1,$t0
+	vshr.u64	$t0,$s0,#@sigma0[0]
+	veor		$s1,$t1				@ sigma1(X[i+14])
+	vshr.u64	$t1,$s0,#@sigma0[1]
+	vadd.i64	@X[$i%8],$s1
+	vshr.u64	$s1,$s0,#@sigma0[2]
+	vsli.64		$t0,$s0,#`64-@sigma0[0]`
+	vsli.64		$t1,$s0,#`64-@sigma0[1]`
+	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
+	veor		$s1,$t0
+	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s0
+	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
+	veor		$s1,$t1				@ sigma0(X[i+1])
+	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s1
+___
+	&NEON_00_15(2*$i,@_);
+}
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.align	4
+.LNEON:
+	dmb				@ errata #451034 on early Cortex A8
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+	sub	$Ktbl,r3,#672		@ K512
+	vldmia	$ctx,{$A-$H}		@ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	mov	$cnt,#4
+.L16_79_neon:
+	subs	$cnt,#1
+___
+for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bne	.L16_79_neon
+
+	vldmia	$ctx,{d24-d31}		@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia	$ctx,{$A-$H}		@ save context
+	teq	$inp,$len
+	sub	$Ktbl,#640	@ rewind K512
+	bne	.Loop_neon
+
+	vldmia	sp!,{d8-d15}		@ epilogue
+	bx	lr
+#endif
+___
+}
+$code.=<<___;
+.size	sha512_block_data_order,.-sha512_block_data_order
+.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
+.comm	OPENSSL_armcap_P,4,4
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
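
A closing note on the NEON rounds added above: each `vshr.u64` plus `vsli.64` pair synthesizes a 64-bit rotate, which ARMv7 NEON lacks as a single instruction. A C model of that pairing, with `vsli64` as our stand-in for the instruction's semantics (shift left and insert, preserving the destination's low bits):

#include <stdint.h>
#include <assert.h>

/* Model of NEON VSLI: shift x left by m and insert into t,
 * preserving the low m bits of t (valid for 0 < m < 64). */
static uint64_t vsli64(uint64_t t, uint64_t x, unsigned m)
{
	uint64_t keep = ((uint64_t)1 << m) - 1;
	return (x << m) | (t & keep);
}

int main(void)
{
	uint64_t x = 0x0123456789abcdefULL;
	unsigned n[] = { 14, 18, 41 };	/* Sigma1 rotate amounts */

	for (int i = 0; i < 3; i++) {
		uint64_t t = x >> n[i];		/* vshr.u64 t,x,#n      */
		t = vsli64(t, x, 64 - n[i]);	/* vsli.64  t,x,#(64-n) */
		/* the pair yields ROTR(x,n) */
		assert(t == ((x >> n[i]) | (x << (64 - n[i]))));
	}
	return 0;
}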