diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/armv4-gf2m.pl')
-rw-r--r--  src/lib/libcrypto/bn/asm/armv4-gf2m.pl  278
1 file changed, 278 insertions, 0 deletions

diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl
new file mode 100644
index 0000000000..c52e0b75b5
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl
@@ -0,0 +1,278 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU and NEON code
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
# Take the first command-line argument that looks like a file name
# ("word.ext", e.g. "armv4-gf2m.S") as the output file; any other
# arguments the build system passes through are skipped.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the output file.  Three-argument open with an
# error check replaces the original unchecked two-argument
# open STDOUT,">$output" — that form is open to mode injection via the
# file name and, on failure, silently left the script writing nowhere.
# When no file-name argument was found, keep writing to the existing
# STDOUT instead of attempting to open an empty path.
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
26 | |||
# Map a NEON quad register name to the D register aliasing its low
# half: "qN" -> "d(2N)" (e.g. "q1" -> "d2").  Returns "" when the
# argument does not name a q register.  The original empty prototype
# "()" was dropped: it falsely declared "takes no arguments" and was
# only harmless because callers invoke the helper as &Dlo(...) inside
# backtick substitutions, which bypasses prototype checking.
sub Dlo { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
# Map a NEON quad register name to the D register aliasing its high
# half: "qN" -> "d(2N+1)" (e.g. "q1" -> "d3").  Returns "" when the
# argument does not name a q register.  Empty prototype dropped for
# the same reason as in Dlo: the sub does take an argument, and the
# &Dhi(...) call sites bypass prototypes anyway.
sub Dhi { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
# Inverse of Dlo: map an even-numbered D register to the quad register
# containing it: "d(2N)" -> "qN" (e.g. "d4" -> "q2").  Odd-numbered or
# non-d arguments yield "".  Empty prototype dropped — the sub consumes
# an argument; call sites use &Q(...), which ignores prototypes.
sub Q { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
30 | |||
# Generated code, part 1: file prologue plus the NEON 1x1 helper,
# emitted only for ARMv7 builds (#if __ARM_ARCH__>=7).
#
# mul_1x1_neon: carry-less (polynomial) multiplication of the 32-bit
# operands supplied in d16 (a) and d17 (b — the callers load two
# copies of b into d17); the 64-bit product is returned in d0.
# vmull.p8 multiplies 8-bit lanes as GF(2) polynomials; a is
# pre-shifted left by 8/16/24 bits into q1-q3 so four partial
# products can be xor-folded (veor) into d0.
#
# The backtick expressions call the Dlo/Dhi/Q helpers above and are
# expanded by the s/\`...\`/eval/ pass at the end of the script.
# NOTE(review): this heredoc interpolates, and $a/$b are only assigned
# below (L"private interface" section), so "$a" in the first comment
# expands to an empty string here — affects assembler comments only,
# but worth confirming against upstream intent.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.fpu	neon

.type	mul_1x1_neon,%function
.align	5
mul_1x1_neon:
	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are slided $a
	vmull.p8	`&Q("d0")`,d16,d17	@ a·bb
	vshl.u64	`&Dlo("q2")`,d16,#16
	vmull.p8	q1,`&Dlo("q1")`,d17	@ a<<8·bb
	vshl.u64	`&Dlo("q3")`,d16,#24
	vmull.p8	q2,`&Dlo("q2")`,d17	@ a<<16·bb
	vshr.u64	`&Dlo("q1")`,#8
	vmull.p8	q3,`&Dlo("q3")`,d17	@ a<<24·bb
	vshl.u64	`&Dhi("q1")`,#24
	veor		d0,`&Dlo("q1")`
	vshr.u64	`&Dlo("q2")`,#16
	veor		d0,`&Dhi("q1")`
	vshl.u64	`&Dhi("q2")`,#16
	veor		d0,`&Dlo("q2")`
	vshr.u64	`&Dlo("q3")`,#24
	veor		d0,`&Dhi("q2")`
	vshl.u64	`&Dhi("q3")`,#8
	veor		d0,`&Dlo("q3")`
	veor		d0,`&Dhi("q3")`
	bx	lr
.size	mul_1x1_neon,.-mul_1x1_neon
#endif
___
################
# private interface to mul_1x1_ialu
#
$a="r1";	# second 32-bit operand (also second C argument register)
$b="r0";	# first 32-bit operand (also first C argument register)

# Two views of the same scratch registers r4-r9: the $a0..$a14 names
# are used while building the lookup table (per the assembler comments
# below: a1 = a & 0x3fffffff, a2 = a1<<1, a4 = a1<<2), while the
# $hi/$lo/$t0/$t1/$i0/$i1 names are used when accumulating the product.
# The map produces seven values (r4-r9 and r12) for six names; the
# surplus r12 is named separately as $mask just below.
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);

$mask="r12";	# holds 7<<2 at call time — loaded by bn_GF2m_mul_2x2 below
76 | |||
# Generated code, part 2: mul_1x1_ialu — pure-integer carry-less 1x1
# (32x32 -> 64 bit) multiplication for any ARMv4+ core.
#
# Inputs: $a (r1), $b (r0); $mask (r12) must hold 7<<2 and sp must
# point at 32 bytes of scratch (tab[8]) — both set up by the caller.
# Output: 64-bit product in $hi:$lo (r4:r5).
#
# Strategy (mirrors the C code in bn_gf2m.c): build an 8-entry table
# tab[i] = (a & 0x3fffffff) carry-less-multiplied by i, then walk b
# three bits at a time, xor-accumulating shifted table entries into
# $lo/$hi.  The top two bits of a were masked out of the table, so
# they are folded in separately at the end via tst/eorne on b.
# Loads are interleaved with eors to hide load-use latency (see the
# "@ stall" note).
$code.=<<___;
.type	mul_1x1_ialu,%function
.align	5
mul_1x1_ialu:
	mov	$a0,#0
	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
	str	$a0,[sp,#0]		@ tab[0]=0
	add	$a2,$a1,$a1		@ a2=a1<<1
	str	$a1,[sp,#4]		@ tab[1]=a1
	eor	$a12,$a1,$a2		@ a1^a2
	str	$a2,[sp,#8]		@ tab[2]=a2
	mov	$a4,$a1,lsl#2		@ a4=a1<<2
	str	$a12,[sp,#12]		@ tab[3]=a1^a2
	eor	$a14,$a1,$a4		@ a1^a4
	str	$a4,[sp,#16]		@ tab[4]=a4
	eor	$a0,$a2,$a4		@ a2^a4
	str	$a14,[sp,#20]		@ tab[5]=a1^a4
	eor	$a12,$a12,$a4		@ a1^a2^a4
	str	$a0,[sp,#24]		@ tab[6]=a2^a4
	and	$i0,$mask,$b,lsl#2
	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4

	and	$i1,$mask,$b,lsr#1
	ldr	$lo,[sp,$i0]		@ tab[b & 0x7]
	and	$i0,$mask,$b,lsr#4
	ldr	$t1,[sp,$i1]		@ tab[b >> 3 & 0x7]
	and	$i1,$mask,$b,lsr#7
	ldr	$t0,[sp,$i0]		@ tab[b >> 6 & 0x7]
	eor	$lo,$lo,$t1,lsl#3	@ stall
	mov	$hi,$t1,lsr#29
	ldr	$t1,[sp,$i1]		@ tab[b >> 9 & 0x7]

	and	$i0,$mask,$b,lsr#10
	eor	$lo,$lo,$t0,lsl#6
	eor	$hi,$hi,$t0,lsr#26
	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]

	and	$i1,$mask,$b,lsr#13
	eor	$lo,$lo,$t1,lsl#9
	eor	$hi,$hi,$t1,lsr#23
	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]

	and	$i0,$mask,$b,lsr#16
	eor	$lo,$lo,$t0,lsl#12
	eor	$hi,$hi,$t0,lsr#20
	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]

	and	$i1,$mask,$b,lsr#19
	eor	$lo,$lo,$t1,lsl#15
	eor	$hi,$hi,$t1,lsr#17
	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]

	and	$i0,$mask,$b,lsr#22
	eor	$lo,$lo,$t0,lsl#18
	eor	$hi,$hi,$t0,lsr#14
	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]

	and	$i1,$mask,$b,lsr#25
	eor	$lo,$lo,$t1,lsl#21
	eor	$hi,$hi,$t1,lsr#11
	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]

	tst	$a,#1<<30
	and	$i0,$mask,$b,lsr#28
	eor	$lo,$lo,$t0,lsl#24
	eor	$hi,$hi,$t0,lsr#8
	ldr	$t0,[sp,$i0]		@ tab[b >> 30 ]

	eorne	$lo,$lo,$b,lsl#30
	eorne	$hi,$hi,$b,lsr#2
	tst	$a,#1<<31
	eor	$lo,$lo,$t1,lsl#27
	eor	$hi,$hi,$t1,lsr#5
	eorne	$lo,$lo,$b,lsl#31
	eorne	$hi,$hi,$b,lsr#1
	eor	$lo,$lo,$t0,lsl#30
	eor	$hi,$hi,$t0,lsr#2

	mov	pc,lr
.size	mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
#                      BN_ULONG a1,BN_ULONG a0,
#                      BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0

# Symbolic names for the NEON D registers d18-d23, assigned in order:
# the two operand pairs and the two 1x1 partial products.
($A1, $B1, $A0, $B0, $A1B1, $A0B0) = map { "d$_" } 18 .. 23;
# Generated code, part 3: the bn_GF2m_mul_2x2 entry point and its NEON
# path.  On ARMv7 builds the code tests bit 0 of OPENSSL_armcap_P at
# run time (PIC-relative load via .Lpic/.LOPENSSL_armcap below) and
# falls through to .Lialu when NEON is not available.
#
# The NEON path is one level of Karatsuba over GF(2) ("+" is xor):
# three calls to mul_1x1_neon compute a1·b1, a0·b0 and
# (a0+a1)·(b0+b1); the middle term minus the outer products is then
# split (vshl/vshr by 32) and xor-merged into the 128-bit result,
# which is stored to r[0..3] via post-incremented vst1.32.
# lr is parked in r12 across the bl calls and used for the return.
$code.=<<___;
.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
.align	5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	beq	.Lialu

	veor	$A1,$A1
	vmov.32	$B1,r3,r3		@ two copies of b1
	vmov.32	${A1}[0],r1		@ a1

	veor	$A0,$A0
	vld1.32	${B0}[],[sp,:32]	@ two copies of b0
	vmov.32	${A0}[0],r2		@ a0
	mov	r12,lr

	vmov	d16,$A1
	vmov	d17,$B1
	bl	mul_1x1_neon		@ a1·b1
	vmov	$A1B1,d0

	vmov	d16,$A0
	vmov	d17,$B0
	bl	mul_1x1_neon		@ a0·b0
	vmov	$A0B0,d0

	veor	d16,$A0,$A1
	veor	d17,$B0,$B1
	veor	$A0,$A0B0,$A1B1
	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1)

	veor	d0,$A0			@ (a0+a1)·(b0+b1)-a0·b0-a1·b1
	vshl.u64	d1,d0,#32
	vshr.u64	d0,d0,#32
	veor	$A0B0,d1
	veor	$A1B1,d0
	vst1.32	{${A0B0}[0]},[r0,:32]!
	vst1.32	{${A0B0}[1]},[r0,:32]!
	vst1.32	{${A1B1}[0]},[r0,:32]!
	vst1.32	{${A1B1}[1]},[r0,:32]
	bx	r12
.align	4
.Lialu:
#endif
___
$ret="r10";	# reassigned 1st argument (result pointer r)
# Generated code, part 4: the pure-integer path.  r0 is moved to $ret
# so r0 can carry $b into mul_1x1_ialu; b0 (the fifth C argument) is
# fetched from the caller's stack at [sp,#32] (after the 8-register
# stmdb).  $mask gets the constant 7<<2 expected by mul_1x1_ialu, and
# 32 bytes are reserved for its tab[8].
#
# Same Karatsuba scheme as the NEON path: a1·b1 is stored to r[2..3],
# then (a,b) are exchanged with (r2,r3) by xor-swap triples (no spare
# register needed) to compute a0·b0 into r[0..1], and finally
# a1^a0 / b1^b0 are formed for the middle product.
$code.=<<___;
	stmdb	sp!,{r4-r10,lr}
	mov	$ret,r0			@ reassign 1st argument
	mov	$b,r3			@ $b=b1
	ldr	r3,[sp,#32]		@ load b0
	mov	$mask,#7<<2
	sub	sp,sp,#32		@ allocate tab[8]

	bl	mul_1x1_ialu		@ a1·b1
	str	$lo,[$ret,#8]
	str	$hi,[$ret,#12]

	eor	$b,$b,r3		@ flip b0 and b1
	eor	$a,$a,r2		@ flip a0 and a1
	eor	r3,r3,$b
	eor	r2,r2,$a
	eor	$b,$b,r3
	eor	$a,$a,r2
	bl	mul_1x1_ialu		@ a0·b0
	str	$lo,[$ret]
	str	$hi,[$ret,#4]

	eor	$a,$a,r2
	eor	$b,$b,r3
	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
___
# r6-r9 (the t/i scratch names) are free after the last mul_1x1_ialu
# call; reuse them to reload the partial products already stored at r.
@r=map("r$_",(6..9));
# Generated code, part 5: merge the middle Karatsuba term
# ($hi:$lo = (a1+a0)·(b1+b0)) with the stored outer products via xor,
# write back r[1..2], tear down tab[8], and return.  The epilogue pops
# straight into pc on ARMv5+; the ARMv4-compatible variant tests lr's
# Thumb bit so the code stays interoperable with Thumb callers.
# The trailing .LOPENSSL_armcap word holds the PIC offset used by the
# .Lpic load above; OPENSSL_armcap_P itself is a 4-byte common symbol.
$code.=<<___;
	ldmia	$ret,{@r[0]-@r[3]}
	eor	$lo,$lo,$hi
	eor	$hi,$hi,@r[1]
	eor	$lo,$lo,@r[0]
	eor	$hi,$hi,@r[2]
	eor	$lo,$lo,@r[3]
	eor	$hi,$hi,@r[3]
	str	$hi,[$ret,#8]
	eor	$lo,$lo,$hi
	add	sp,sp,#32		@ destroy tab[8]
	str	$lo,[$ret,#4]

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	5

.comm	OPENSSL_armcap_P,4,4
___
274 | |||
# Post-process and emit the assembly.
# First pass: expand every `...` span by eval'ing it as Perl — this is
# what turns the &Dlo/&Dhi/&Q helper calls into concrete register names.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# Second pass: rewrite "bx lr" as its raw ARM opcode so the output
# also assembles with toolchains restricted to -march=armv4.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# Check close: STDOUT is buffered, so a disk-full/write error may only
# surface here — the original unchecked close would silently drop it.
close STDOUT or die "error closing output: $!";	# enforce flush