diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/s390x-mont.pl')
-rw-r--r-- | src/lib/libcrypto/bn/asm/s390x-mont.pl | 225 |
1 files changed, 225 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl new file mode 100644 index 0000000000..d23251033b --- /dev/null +++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl | |||
@@ -0,0 +1,225 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # April 2007. | ||
11 | # | ||
12 | # Performance improvement over vanilla C code varies from 85% to 45% | ||
13 | # depending on key length and benchmark. Unfortunately in this context | ||
14 | # these are not very impressive results [for code that utilizes "wide" | ||
15 | # 64x64=128-bit multiplication, which is not commonly available to C | ||
16 | # programmers], at least hand-coded bn_asm.c replacement is known to | ||
17 | # provide 30-40% better results for longest keys. Well, on second | ||
18 | # thought it's not very surprising, because z-CPUs are single-issue | ||
19 | # and _strictly_ in-order execution, while bn_mul_mont is more or less | ||
20 | # dependent on CPU ability to pipe-line instructions and have several | ||
21 | # of them "in-flight" at the same time. I mean while other methods, | ||
22 | # for example Karatsuba, aim to minimize the number of multiplications | ||
23 | # at the cost of an increase in other operations, bn_mul_mont aims to | ||
24 | # neatly "overlap" multiplications and the other operations [and on | ||
25 | # most platforms even minimize the amount of the other operations, in | ||
26 | # particular references to memory]. But it's possible to improve this | ||
27 | # module performance by implementing dedicated squaring code-path and | ||
28 | # possibly by unrolling loops... | ||
29 | |||
30 | # January 2009. | ||
31 | # | ||
32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, | ||
33 | # make inner loops counter-based. | ||
34 | |||
# Register assignments for the bn_mul_mont template below. Each scalar
# holds an s390x register name that is interpolated into the assembly text.
#
# $mn0 receives tp[0]*n0 (see the msgr in the template) and is then used as
# the multiplier for every np[j]. The template also uses %r0 directly: once
# to hold the caller's stack pointer for the back chain, and as the base in
# "la $j,8(%r0)" / "la $j,0(%r0)", which the template's own comments treat
# as loading a plain constant (j=1).
# $num is the word count pulled from the stack; %r1 is reused as a scratch
# address register in the epilogue.
35 | $mn0="%r0"; | ||
36 | $num="%r1"; | ||
37 | |||
# Argument registers, in prototype order. The last argument (num) is not in
# a register; the template loads it from the caller's stack frame.
38 | # int bn_mul_mont( | ||
39 | $rp="%r2"; # BN_ULONG *rp, | ||
40 | $ap="%r3"; # const BN_ULONG *ap, | ||
41 | $bp="%r4"; # const BN_ULONG *bp, | ||
42 | $np="%r5"; # const BN_ULONG *np, | ||
43 | $n0="%r6"; # const BN_ULONG *n0, | ||
44 | #$num="160(%r15)" # int num); | ||
45 | |||
# $bi aliases %r2, so the template spills rp to the stack before the first
# load of bp[0] and reloads it ("reincarnate rp") after the outer loop.
# $j is a byte offset into ap/np/tp and advances by 8 per word.
46 | $bi="%r2"; # zaps rp | ||
47 | $j="%r7"; | ||
48 | |||
# $ahi/$alo and $nhi/$nlo receive the mlgr products ap[j]*bp[i] and
# np[j]*m1; $AHI/$NHI carry the accumulated high words from one word
# position to the next.
# $count is the brct loop counter. It shares %r14 with the return address
# used by "br %r14", and %r14 is inside the stmg save / lmg restore range.
# $sp is the stack pointer.
49 | $ahi="%r8"; | ||
50 | $alo="%r9"; | ||
51 | $nhi="%r10"; | ||
52 | $nlo="%r11"; | ||
53 | $AHI="%r12"; | ||
54 | $NHI="%r13"; | ||
55 | $count="%r14"; | ||
56 | $sp="%r15"; | ||
57 | |||
# Assembly template for bn_mul_mont(rp, ap, bp, np, n0, num), appended to
# $code. The heredoc is interpolating, so the register scalars defined above
# are substituted and literal '@' characters are written as '\@'.
#
# Outline of the generated routine:
#  - Prologue: num is pulled from 164($sp) and scaled by 8 to a byte count.
#    If that count is below 16 or above 128 (num < 2 or num > 16 words) the
#    routine returns 0 in %r2 without touching rp. Otherwise rp is stored at
#    16($sp), %r3-%r15 at 24($sp), and a frame of 160+8+num*8 bytes is
#    carved below the old stack pointer with a back chain at 0($sp). The
#    temporary vector tp lives at 160($sp); the extra 8 bytes hold the
#    top-most overflow word.
#  - .L1st: first pass, tp = (ap*bp[0] + np*m1) with each result word stored
#    one position down (tp[j-1]), where m1 = tp[0]*n0.
#  - .Louter/.Linner: the same pass for each further bp[i], additionally
#    accumulating the previous tp[j] and the previous overflow word. The
#    outer loop ends when $bp equals the saved %r4 slot, which holds
#    &bp[num] because $bp had been advanced by num*8 before the stmg.
#  - .Lsub: rp = tp - np with borrow propagation; the final slbgr folds the
#    overflow word into the borrow, turning $AHI into a select mask.
#  - Mask select + .Lcopy: chooses tp or rp as the source ("ap=borrow?tp:rp"),
#    copies it to rp, and overwrites each tp word with the current byte
#    offset $j (the "zap tp" store writes $j, not zero).
#  - Epilogue: reloads %r6-%r15 from the caller's save area and returns 1.
#
# The instruction order inside the loops was scheduled to avoid Address
# Generation Interlock stalls (see the January 2009 note at the top of the
# file); keep that in mind before reordering anything.
58 | $code.=<<___; | ||
59 | .text | ||
60 | .globl bn_mul_mont | ||
61 | .type bn_mul_mont,\@function | ||
62 | bn_mul_mont: | ||
63 | lgf $num,164($sp) # pull $num | ||
64 | sla $num,3 # $num to enumerate bytes | ||
65 | la $bp,0($num,$bp) | ||
66 | |||
67 | stg %r2,16($sp) | ||
68 | |||
69 | cghi $num,16 # | ||
70 | lghi %r2,0 # | ||
71 | blr %r14 # if($num<16) return 0; | ||
72 | cghi $num,128 # | ||
73 | bhr %r14 # if($num>128) return 0; | ||
74 | |||
75 | stmg %r3,%r15,24($sp) | ||
76 | |||
77 | lghi $rp,-160-8 # leave room for carry bit | ||
78 | lcgr $j,$num # -$num | ||
79 | lgr %r0,$sp | ||
80 | la $rp,0($rp,$sp) | ||
81 | la $sp,0($j,$rp) # alloca | ||
82 | stg %r0,0($sp) # back chain | ||
83 | |||
84 | sra $num,3 # restore $num | ||
85 | la $bp,0($j,$bp) # restore $bp | ||
86 | ahi $num,-1 # adjust $num for inner loop | ||
87 | lg $n0,0($n0) # pull n0 | ||
88 | |||
89 | lg $bi,0($bp) | ||
90 | lg $alo,0($ap) | ||
91 | mlgr $ahi,$bi # ap[0]*bp[0] | ||
92 | lgr $AHI,$ahi | ||
93 | |||
94 | lgr $mn0,$alo # "tp[0]"*n0 | ||
95 | msgr $mn0,$n0 | ||
96 | |||
97 | lg $nlo,0($np) # | ||
98 | mlgr $nhi,$mn0 # np[0]*m1 | ||
99 | algr $nlo,$alo # +="tp[0]" | ||
100 | lghi $NHI,0 | ||
101 | alcgr $NHI,$nhi | ||
102 | |||
103 | la $j,8(%r0) # j=1 | ||
104 | lr $count,$num | ||
105 | |||
106 | .align 16 | ||
107 | .L1st: | ||
108 | lg $alo,0($j,$ap) | ||
109 | mlgr $ahi,$bi # ap[j]*bp[0] | ||
110 | algr $alo,$AHI | ||
111 | lghi $AHI,0 | ||
112 | alcgr $AHI,$ahi | ||
113 | |||
114 | lg $nlo,0($j,$np) | ||
115 | mlgr $nhi,$mn0 # np[j]*m1 | ||
116 | algr $nlo,$NHI | ||
117 | lghi $NHI,0 | ||
118 | alcgr $nhi,$NHI # +="tp[j]" | ||
119 | algr $nlo,$alo | ||
120 | alcgr $NHI,$nhi | ||
121 | |||
122 | stg $nlo,160-8($j,$sp) # tp[j-1]= | ||
123 | la $j,8($j) # j++ | ||
124 | brct $count,.L1st | ||
125 | |||
126 | algr $NHI,$AHI | ||
127 | lghi $AHI,0 | ||
128 | alcgr $AHI,$AHI # upmost overflow bit | ||
129 | stg $NHI,160-8($j,$sp) | ||
130 | stg $AHI,160($j,$sp) | ||
131 | la $bp,8($bp) # bp++ | ||
132 | |||
133 | .Louter: | ||
134 | lg $bi,0($bp) # bp[i] | ||
135 | lg $alo,0($ap) | ||
136 | mlgr $ahi,$bi # ap[0]*bp[i] | ||
137 | alg $alo,160($sp) # +=tp[0] | ||
138 | lghi $AHI,0 | ||
139 | alcgr $AHI,$ahi | ||
140 | |||
141 | lgr $mn0,$alo | ||
142 | msgr $mn0,$n0 # tp[0]*n0 | ||
143 | |||
144 | lg $nlo,0($np) # np[0] | ||
145 | mlgr $nhi,$mn0 # np[0]*m1 | ||
146 | algr $nlo,$alo # +="tp[0]" | ||
147 | lghi $NHI,0 | ||
148 | alcgr $NHI,$nhi | ||
149 | |||
150 | la $j,8(%r0) # j=1 | ||
151 | lr $count,$num | ||
152 | |||
153 | .align 16 | ||
154 | .Linner: | ||
155 | lg $alo,0($j,$ap) | ||
156 | mlgr $ahi,$bi # ap[j]*bp[i] | ||
157 | algr $alo,$AHI | ||
158 | lghi $AHI,0 | ||
159 | alcgr $ahi,$AHI | ||
160 | alg $alo,160($j,$sp)# +=tp[j] | ||
161 | alcgr $AHI,$ahi | ||
162 | |||
163 | lg $nlo,0($j,$np) | ||
164 | mlgr $nhi,$mn0 # np[j]*m1 | ||
165 | algr $nlo,$NHI | ||
166 | lghi $NHI,0 | ||
167 | alcgr $nhi,$NHI | ||
168 | algr $nlo,$alo # +="tp[j]" | ||
169 | alcgr $NHI,$nhi | ||
170 | |||
171 | stg $nlo,160-8($j,$sp) # tp[j-1]= | ||
172 | la $j,8($j) # j++ | ||
173 | brct $count,.Linner | ||
174 | |||
175 | algr $NHI,$AHI | ||
176 | lghi $AHI,0 | ||
177 | alcgr $AHI,$AHI | ||
178 | alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit | ||
179 | lghi $ahi,0 | ||
180 | alcgr $AHI,$ahi # new upmost overflow bit | ||
181 | stg $NHI,160-8($j,$sp) | ||
182 | stg $AHI,160($j,$sp) | ||
183 | |||
184 | la $bp,8($bp) # bp++ | ||
185 | clg $bp,160+8+32($j,$sp) # compare to &bp[num] | ||
186 | jne .Louter | ||
187 | |||
188 | lg $rp,160+8+16($j,$sp) # reincarnate rp | ||
189 | la $ap,160($sp) | ||
190 | ahi $num,1 # restore $num, incidentally clears "borrow" | ||
191 | |||
192 | la $j,0(%r0) | ||
193 | lr $count,$num | ||
194 | .Lsub: lg $alo,0($j,$ap) | ||
195 | slbg $alo,0($j,$np) | ||
196 | stg $alo,0($j,$rp) | ||
197 | la $j,8($j) | ||
198 | brct $count,.Lsub | ||
199 | lghi $ahi,0 | ||
200 | slbgr $AHI,$ahi # handle upmost carry | ||
201 | |||
202 | ngr $ap,$AHI | ||
203 | lghi $np,-1 | ||
204 | xgr $np,$AHI | ||
205 | ngr $np,$rp | ||
206 | ogr $ap,$np # ap=borrow?tp:rp | ||
207 | |||
208 | la $j,0(%r0) | ||
209 | lgr $count,$num | ||
210 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh | ||
211 | stg $j,160($j,$sp) # zap tp | ||
212 | stg $alo,0($j,$rp) | ||
213 | la $j,8($j) | ||
214 | brct $count,.Lcopy | ||
215 | |||
216 | la %r1,160+8+48($j,$sp) | ||
217 | lmg %r6,%r15,0(%r1) | ||
218 | lghi %r2,1 # signal "processed" | ||
219 | br %r14 | ||
220 | .size bn_mul_mont,.-bn_mul_mont | ||
221 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
222 | ___ | ||
223 | |||
# Emit the generated assembly on STDOUT. Buffered write failures (full disk,
# closed pipe) are only reported when the handle is closed, so the result of
# close is checked and the script dies instead of leaving a silently
# truncated assembly file for the build to pick up.
224 | print $code; | ||
225 | close STDOUT or die "error closing STDOUT: $!"; | ||