diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/armv4-mont.pl')
-rw-r--r-- | src/lib/libcrypto/bn/asm/armv4-mont.pl | 201 |
1 files changed, 201 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl new file mode 100644 index 0000000000..14e0d2d1dd --- /dev/null +++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl | |||
@@ -0,0 +1,201 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # January 2007. | ||
11 | |||
12 | # Montgomery multiplication for ARMv4. | ||
13 | # | ||
14 | # Performance improvement naturally varies among CPU implementations | ||
15 | # and compilers. The code was observed to provide +65-35% improvement | ||
16 | # [depending on key length, less for longer keys] on ARM920T, and | ||
17 | # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code | ||
18 | # base and compiler generated code with in-lined umull and even umlal | ||
19 | # instructions. The latter means that this code didn't really have an | ||
20 | # "advantage" of utilizing some "secret" instruction. | ||
21 | # | ||
22 | # The code is interoperable with Thumb ISA and is rather compact, less | ||
23 | # than 1/2KB. A Windows CE port would be trivial, as it's exclusively | ||
24 | # about decorations; the ABI and instruction syntax are identical. | ||
25 | |||
26 | $num = 'r0'; # arrives holding rp; reloaded with num, then holds &tp[num-1] | ||
27 | $ap = 'r1'; | ||
28 | $bp = $bi = $rp = 'r2'; # one register, three successive roles | ||
29 | $np = 'r3'; | ||
30 | $tp = 'r4'; | ||
31 | $aj = 'r5'; | ||
32 | $nj = 'r6'; | ||
33 | $tj = 'r7'; | ||
34 | $n0 = 'r8'; | ||
35 | # r9 is left alone: reserved by ELF as platform specific, e.g. TLS pointer | ||
36 | $alo = 'r10'; # sl, gcc uses it to keep @GOT | ||
37 | $ahi = 'r11'; # fp | ||
38 | $nlo = 'r12'; # ip | ||
39 | # r13 is left alone: stack pointer | ||
40 | $nhi = 'r14'; # lr | ||
41 | # r15 is left alone: program counter | ||
42 | |||
43 | # Argument block: "base,#offset" operand strings, base being $num (&tp[num-1]) | ||
44 | $_rp = $num . ',#12*4'; | ||
45 | # ap permanently resides in r1 | ||
46 | $_bp = $num . ',#13*4'; | ||
47 | # np permanently resides in r3 | ||
48 | $_n0 = $num . ',#14*4'; | ||
49 | $_bpend = $_num = $num . ',#15*4'; # two names, one stack slot | ||
50 | |||
# ----------------------------------------------------------------------
# $code: assembler text for the global function bn_mul_mont.  The
# register variables assigned earlier in this script are interpolated
# into the here-document, so "$num" is emitted as "r0", "[$_rp]" as
# "[r0,#12*4]", and so on.
#
# NOTE(review): presumably this implements OpenSSL's
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
# -- confirm against the bn headers.  What the code itself shows:
#
# Inputs, as the prologue consumes them
#   r0            pushed; reloaded near the end ("pull rp") as the
#                 address the result words are stored to
#   r1            ap, read in place, rewound at the top of each outer pass
#   r2            bp, pushed; the running bp pointer lives in that slot
#   r3            np, read in place, rewound at the top of each outer pass
#   entry sp+0    pointer to n0: loaded, dereferenced, and the slot is
#                 then overwritten with the n0 value itself
#   entry sp+4    num (word count); the slot is then overwritten with the
#                 end-of-bp sentinel ($_bpend is the same operand as $_num)
#
# Result
#   r0 = 0 when num < 2 (signed compare); that path only pops the two
#          words pushed at entry and returns.
#   r0 = 1 otherwise.
#
# Stack frame while the main body runs (ascending addresses)
#   sp            tp[0] .. tp[num]   (4*num + 4 bytes)
#   $num+2*4      r4-r12, lr         (10 words)
#   $num+12*4     saved r0, saved r2
#   $num+14*4     caller's stack arguments (n0 slot, num slot)
# $num points at &tp[num-1] for most of the function and is bumped to
# &tp[num] just before the final subtraction.
#
# Phases
#   .L1st    first pass: tp = ap[]*bp[0] + np[]*m, where m is the low
#            word of (ap[0]*bp[0])*n0.  Each sum word is stored one slot
#            lower than its column index ("tp[j-1]=").
#   .Louter  one pass per remaining bp word; .Linner is the same
#   .Linner  accumulation with the previous tp[] added in.
#   .Lsub    rp[j] = tp[j] - np[j] with a borrow chain; the final
#            "sbcs $nhi,$nhi,#0" folds tp[num] (still held in $nhi) and
#            the borrow into a value used as a mask.
#   select   and/bic/orr on the two pointers pick tp as the copy source
#            when the mask is all ones and rp when it is zero.
#   .Lcopy   copies num words from the selected source to rp and
#            overwrites tp[0..num-1] with the current value of sp; the
#            load is issued before the overwrite, which matters when
#            the source is tp itself.
#
# Things that are easy to misread
#   * The value stored at [$_bpend] is bp + 4*(num-1), i.e. &bp[num-1],
#     although the inline comments call it "&bp[num]".  The outer loop
#     pre-increments bp and exits once the pointer equals this value.
#   * .Linner relies on the carry produced by "adds $alo,$ahi,$tj"
#     still being live at "adc $ahi,$ahi,#0"; the instructions between
#     them are all non-flag-setting forms.  Do not reorder or add an
#     S suffix there.
#   * .Lsub uses teq instead of cmp for the loop test so that the borrow
#     chain in the carry flag survives ("preserve carry").
#   * "subs $tj,$tj,$tj" leaves the carry flag set, which is the
#     no-borrow state that the first sbcs expects.
#   * The "bx lr" at .Labrt is rewritten to a .word by the substitution
#     applied to $code after this statement.
# ----------------------------------------------------------------------
51 | $code=<<___; | ||
52 | .text | ||
53 | |||
54 | .global bn_mul_mont | ||
55 | .type bn_mul_mont,%function | ||
56 | |||
57 | .align 2 | ||
58 | bn_mul_mont: | ||
59 | stmdb sp!,{r0,r2} @ sp points at argument block | ||
60 | ldr $num,[sp,#3*4] @ load num | ||
61 | cmp $num,#2 | ||
62 | movlt r0,#0 | ||
63 | addlt sp,sp,#2*4 | ||
64 | blt .Labrt | ||
65 | |||
66 | stmdb sp!,{r4-r12,lr} @ save 10 registers | ||
67 | |||
68 | mov $num,$num,lsl#2 @ rescale $num for byte count | ||
69 | sub sp,sp,$num @ alloca(4*num) | ||
70 | sub sp,sp,#4 @ +extra dword | ||
71 | sub $num,$num,#4 @ "num=num-1" | ||
72 | add $tp,$bp,$num @ &bp[num-1] | ||
73 | |||
74 | add $num,sp,$num @ $num to point at &tp[num-1] | ||
75 | ldr $n0,[$_n0] @ &n0 | ||
76 | ldr $bi,[$bp] @ bp[0] | ||
77 | ldr $aj,[$ap],#4 @ ap[0],ap++ | ||
78 | ldr $nj,[$np],#4 @ np[0],np++ | ||
79 | ldr $n0,[$n0] @ *n0 | ||
80 | str $tp,[$_bpend] @ save &bp[num] | ||
81 | |||
82 | umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] | ||
83 | str $n0,[$_n0] @ save n0 value | ||
84 | mul $n0,$alo,$n0 @ "tp[0]"*n0 | ||
85 | mov $nlo,#0 | ||
86 | umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" | ||
87 | mov $tp,sp | ||
88 | |||
89 | .L1st: | ||
90 | ldr $aj,[$ap],#4 @ ap[j],ap++ | ||
91 | mov $alo,$ahi | ||
92 | mov $ahi,#0 | ||
93 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] | ||
94 | ldr $nj,[$np],#4 @ np[j],np++ | ||
95 | mov $nhi,#0 | ||
96 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | ||
97 | adds $nlo,$nlo,$alo | ||
98 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | ||
99 | adc $nlo,$nhi,#0 | ||
100 | cmp $tp,$num | ||
101 | bne .L1st | ||
102 | |||
103 | adds $nlo,$nlo,$ahi | ||
104 | mov $nhi,#0 | ||
105 | adc $nhi,$nhi,#0 | ||
106 | ldr $tp,[$_bp] @ restore bp | ||
107 | str $nlo,[$num] @ tp[num-1]= | ||
108 | ldr $n0,[$_n0] @ restore n0 | ||
109 | str $nhi,[$num,#4] @ tp[num]= | ||
110 | |||
111 | .Louter: | ||
112 | sub $tj,$num,sp @ "original" $num-1 value | ||
113 | sub $ap,$ap,$tj @ "rewind" ap to &ap[1] | ||
114 | sub $np,$np,$tj @ "rewind" np to &np[1] | ||
115 | ldr $bi,[$tp,#4]! @ *(++bp) | ||
116 | ldr $aj,[$ap,#-4] @ ap[0] | ||
117 | ldr $nj,[$np,#-4] @ np[0] | ||
118 | ldr $alo,[sp] @ tp[0] | ||
119 | ldr $tj,[sp,#4] @ tp[1] | ||
120 | |||
121 | mov $ahi,#0 | ||
122 | umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] | ||
123 | str $tp,[$_bp] @ save bp | ||
124 | mul $n0,$alo,$n0 | ||
125 | mov $nlo,#0 | ||
126 | umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" | ||
127 | mov $tp,sp | ||
128 | |||
129 | .Linner: | ||
130 | ldr $aj,[$ap],#4 @ ap[j],ap++ | ||
131 | adds $alo,$ahi,$tj @ +=tp[j] | ||
132 | mov $ahi,#0 | ||
133 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] | ||
134 | ldr $nj,[$np],#4 @ np[j],np++ | ||
135 | mov $nhi,#0 | ||
136 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | ||
137 | ldr $tj,[$tp,#8] @ tp[j+1] | ||
138 | adc $ahi,$ahi,#0 | ||
139 | adds $nlo,$nlo,$alo | ||
140 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | ||
141 | adc $nlo,$nhi,#0 | ||
142 | cmp $tp,$num | ||
143 | bne .Linner | ||
144 | |||
145 | adds $nlo,$nlo,$ahi | ||
146 | mov $nhi,#0 | ||
147 | adc $nhi,$nhi,#0 | ||
148 | adds $nlo,$nlo,$tj | ||
149 | adc $nhi,$nhi,#0 | ||
150 | ldr $tp,[$_bp] @ restore bp | ||
151 | ldr $tj,[$_bpend] @ restore &bp[num] | ||
152 | str $nlo,[$num] @ tp[num-1]= | ||
153 | ldr $n0,[$_n0] @ restore n0 | ||
154 | str $nhi,[$num,#4] @ tp[num]= | ||
155 | |||
156 | cmp $tp,$tj | ||
157 | bne .Louter | ||
158 | |||
159 | ldr $rp,[$_rp] @ pull rp | ||
160 | add $num,$num,#4 @ $num to point at &tp[num] | ||
161 | sub $aj,$num,sp @ "original" num value | ||
162 | mov $tp,sp @ "rewind" $tp | ||
163 | mov $ap,$tp @ "borrow" $ap | ||
164 | sub $np,$np,$aj @ "rewind" $np to &np[0] | ||
165 | |||
166 | subs $tj,$tj,$tj @ "clear" carry flag | ||
167 | .Lsub: ldr $tj,[$tp],#4 | ||
168 | ldr $nj,[$np],#4 | ||
169 | sbcs $tj,$tj,$nj @ tp[j]-np[j] | ||
170 | str $tj,[$rp],#4 @ rp[j]= | ||
171 | teq $tp,$num @ preserve carry | ||
172 | bne .Lsub | ||
173 | sbcs $nhi,$nhi,#0 @ upmost carry | ||
174 | mov $tp,sp @ "rewind" $tp | ||
175 | sub $rp,$rp,$aj @ "rewind" $rp | ||
176 | |||
177 | and $ap,$tp,$nhi | ||
178 | bic $np,$rp,$nhi | ||
179 | orr $ap,$ap,$np @ ap=borrow?tp:rp | ||
180 | |||
181 | .Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh | ||
182 | str sp,[$tp],#4 @ zap tp | ||
183 | str $tj,[$rp],#4 | ||
184 | cmp $tp,$num | ||
185 | bne .Lcopy | ||
186 | |||
187 | add sp,$num,#4 @ skip over tp[num+1] | ||
188 | ldmia sp!,{r4-r12,lr} @ restore registers | ||
189 | add sp,sp,#2*4 @ skip over {r0,r2} | ||
190 | mov r0,#1 | ||
191 | .Labrt: tst lr,#1 | ||
192 | moveq pc,lr @ be binary compatible with V4, yet | ||
193 | bx lr @ interoperable with Thumb ISA:-) | ||
194 | .size bn_mul_mont,.-bn_mul_mont | ||
195 | .asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | ||
196 | .align 2 | ||
197 | ___ | ||
198 | |||
# Post-process and emit the generated assembly.
# Every "bx lr" in the text is replaced by a .word holding its raw
# encoding, so the output still assembles when the assembler is
# restricted to plain ARMv4.
199 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
200 | print $code; | ||
# STDOUT is buffered, so a failed write (full disk, closed pipe) may
# only be reported when the handle is closed.  Abort with a non-zero
# status rather than silently leaving a truncated assembly file.
201 | close STDOUT or die "error closing STDOUT: $!"; | ||