1 files changed, 0 insertions, 277 deletions
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl
deleted file mode 100644
index 9fd64e81ee..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x-mont.pl
+++ /dev/null
@@ -1,277 +0,0 @@
-#!/usr/bin/env perl
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-# April 2007.
-#
-# Performance improvement over vanilla C code varies from 85% to 45%
-# depending on key length and benchmark. Unfortunately in this context
-# these are not very impressive results [for code that utilizes "wide"
-# 64x64=128-bit multiplication, which is not commonly available to C
-# programmers], at least hand-coded bn_asm.c replacement is known to
-# provide 30-40% better results for longest keys. Well, on a second
-# thought it's not very surprising, because z-CPUs are single-issue
-# and _strictly_ in-order execution, while bn_mul_mont is more or less
-# dependent on CPU ability to pipe-line instructions and have several
-# of them "in-flight" at the same time. I mean while other methods,
-# for example Karatsuba, aim to minimize amount of multiplications at
-# the cost of other operations increase, bn_mul_mont aim to neatly
-# "overlap" multiplications and the other operations [and on most
-# platforms even minimize the amount of the other operations, in
-# particular references to memory]. But it's possible to improve this
-# module performance by implementing dedicated squaring code-path and
-# possibly by unrolling loops...
-# January 2009.
-#
-# Reschedule to minimize/avoid Address Generation Interlock hazard,
-# make inner loops counter-based.
-# November 2010.
-#
-# Adapt for -m31 build. If kernel supports what's called "highgprs"
-# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
-# instructions and achieve "64-bit" performance even in 31-bit legacy
-# application context. The feature is not specific to any particular
-# processor, as long as it's "z-CPU". Latter implies that the code
-# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
-# is achieved by swapping words after 64-bit loads, follow _dswap-s.
-# On z990 it was measured to perform 2.6-2.2 times better than
-# compiler-generated code, less for longer keys...
-$flavour = shift;
-if ($flavour =~ /3[12]/) {
-        $SIZE_T=4;
-        $g="";
-} else {
-        $SIZE_T=8;
-        $g="g";
-}
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-$stdframe=16*$SIZE_T+4*8;
-$mn0="%r0";
-$num="%r1";
-# int bn_mul_mont(
-$rp="%r2";              # BN_ULONG *rp,
-$ap="%r3";              # const BN_ULONG *ap,
-$bp="%r4";              # const BN_ULONG *bp,
-$np="%r5";              # const BN_ULONG *np,
-$n0="%r6";              # const BN_ULONG *n0,
-#$num="160(%r15)"       # int num);
-$bi="%r2";      # zaps rp
-$j="%r7";
-$ahi="%r8";
-$alo="%r9";
-$nhi="%r10";
-$nlo="%r11";
-$AHI="%r12";
-$NHI="%r13";
-$count="%r14";
-$sp="%r15";
-$code.=<<___;
-.text
-.globl  bn_mul_mont
-.type   bn_mul_mont,\@function
-bn_mul_mont:
-        lgf     $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
-        sla     $num,`log($SIZE_T)/log(2)`      # $num to enumerate bytes
-        la      $bp,0($num,$bp)
-        st${g}  %r2,2*$SIZE_T($sp)
-        cghi    $num,16         #
-        lghi    %r2,0           #
-        blr     %r14            # if($num<16) return 0;
-___
-$code.=<<___ if ($flavour =~ /3[12]/);
-        tmll    $num,4
-        bnzr    %r14            # if ($num&1) return 0;
-___
-$code.=<<___ if ($flavour !~ /3[12]/);
-        cghi    $num,96         #
-        bhr     %r14            # if($num>96) return 0;
-___
-$code.=<<___;
-        stm${g} %r3,%r15,3*$SIZE_T($sp)
-        lghi    $rp,-$stdframe-8        # leave room for carry bit
-        lcgr    $j,$num         # -$num
-        lgr     %r0,$sp
-        la      $rp,0($rp,$sp)
-        la      $sp,0($j,$rp)   # alloca
-        st${g}  %r0,0($sp)      # back chain
-        sra     $num,3          # restore $num
-        la      $bp,0($j,$bp)   # restore $bp
-        ahi     $num,-1         # adjust $num for inner loop
-        lg      $n0,0($n0)      # pull n0
-        _dswap  $n0
-        lg      $bi,0($bp)
-        _dswap  $bi
-        lg      $alo,0($ap)
-        _dswap  $alo
-        mlgr    $ahi,$bi        # ap[0]*bp[0]
-        lgr     $AHI,$ahi
-        lgr     $mn0,$alo       # "tp[0]"*n0
-        msgr    $mn0,$n0
-        lg      $nlo,0($np)     #
-        _dswap  $nlo
-        mlgr    $nhi,$mn0       # np[0]*m1
-        algr    $nlo,$alo       # +="tp[0]"
-        lghi    $NHI,0
-        alcgr   $NHI,$nhi
-        la      $j,8(%r0)       # j=1
-        lr      $count,$num
-.align  16
-.L1st:
-        lg      $alo,0($j,$ap)
-        _dswap  $alo
-        mlgr    $ahi,$bi        # ap[j]*bp[0]
-        algr    $alo,$AHI
-        lghi    $AHI,0
-        alcgr   $AHI,$ahi
-        lg      $nlo,0($j,$np)
-        _dswap  $nlo
-        mlgr    $nhi,$mn0       # np[j]*m1
-        algr    $nlo,$NHI
-        lghi    $NHI,0
-        alcgr   $nhi,$NHI       # +="tp[j]"
-        algr    $nlo,$alo
-        alcgr   $NHI,$nhi
-        stg     $nlo,$stdframe-8($j,$sp)        # tp[j-1]=
-        la      $j,8($j)        # j++
-        brct    $count,.L1st
-        algr    $NHI,$AHI
-        lghi    $AHI,0
-        alcgr   $AHI,$AHI       # upmost overflow bit
-        stg     $NHI,$stdframe-8($j,$sp)
-        stg     $AHI,$stdframe($j,$sp)
-        la      $bp,8($bp)      # bp++
-.Louter:
-        lg      $bi,0($bp)      # bp[i]
-        _dswap  $bi
-        lg      $alo,0($ap)
-        _dswap  $alo
-        mlgr    $ahi,$bi        # ap[0]*bp[i]
-        alg     $alo,$stdframe($sp)     # +=tp[0]
-        lghi    $AHI,0
-        alcgr   $AHI,$ahi
-        lgr     $mn0,$alo
-        msgr    $mn0,$n0        # tp[0]*n0
-        lg      $nlo,0($np)     # np[0]
-        _dswap  $nlo
-        mlgr    $nhi,$mn0       # np[0]*m1
-        algr    $nlo,$alo       # +="tp[0]"
-        lghi    $NHI,0
-        alcgr   $NHI,$nhi
-        la      $j,8(%r0)       # j=1
-        lr      $count,$num
-.align  16
-.Linner:
-        lg      $alo,0($j,$ap)
-        _dswap  $alo
-        mlgr    $ahi,$bi        # ap[j]*bp[i]
-        algr    $alo,$AHI
-        lghi    $AHI,0
-        alcgr   $ahi,$AHI
-        alg     $alo,$stdframe($j,$sp)# +=tp[j]
-        alcgr   $AHI,$ahi
-        lg      $nlo,0($j,$np)
-        _dswap  $nlo
-        mlgr    $nhi,$mn0       # np[j]*m1
-        algr    $nlo,$NHI
-        lghi    $NHI,0
-        alcgr   $nhi,$NHI
-        algr    $nlo,$alo       # +="tp[j]"
-        alcgr   $NHI,$nhi
-        stg     $nlo,$stdframe-8($j,$sp)        # tp[j-1]=
-        la      $j,8($j)        # j++
-        brct    $count,.Linner
-        algr    $NHI,$AHI
-        lghi    $AHI,0
-        alcgr   $AHI,$AHI
-        alg     $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
-        lghi    $ahi,0
-        alcgr   $AHI,$ahi       # new upmost overflow bit
-        stg     $NHI,$stdframe-8($j,$sp)
-        stg     $AHI,$stdframe($j,$sp)
-        la      $bp,8($bp)      # bp++
-        cl${g}  $bp,`$stdframe+8+4*$SIZE_T`($j,$sp)     # compare to &bp[num]
-        jne     .Louter
-        l${g}   $rp,`$stdframe+8+2*$SIZE_T`($j,$sp)     # reincarnate rp
-        la      $ap,$stdframe($sp)
-        ahi     $num,1          # restore $num, incidentally clears "borrow"
-        la      $j,0(%r0)
-        lr      $count,$num
-.Lsub:  lg      $alo,0($j,$ap)
-        lg      $nlo,0($j,$np)
-        _dswap  $nlo
-        slbgr   $alo,$nlo
-        stg     $alo,0($j,$rp)
-        la      $j,8($j)
-        brct    $count,.Lsub
-        lghi    $ahi,0
-        slbgr   $AHI,$ahi       # handle upmost carry
-        ngr     $ap,$AHI
-        lghi    $np,-1
-        xgr     $np,$AHI
-        ngr     $np,$rp
-        ogr     $ap,$np         # ap=borrow?tp:rp
-        la      $j,0(%r0)
-        lgr     $count,$num
-.Lcopy: lg      $alo,0($j,$ap)          # copy or in-place refresh
-        _dswap  $alo
-        stg     $j,$stdframe($j,$sp)    # zap tp
-        stg     $alo,0($j,$rp)
-        la      $j,8($j)
-        brct    $count,.Lcopy
-        la      %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
-        lm${g}  %r6,%r15,0(%r1)
-        lghi    %r2,1           # signal "processed"
-        br      %r14
-.size   bn_mul_mont,.-bn_mul_mont
-.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-___
-foreach (split("\n",$code)) {
-        s/\`([^\`]*)\`/eval $1/ge;
-        s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
-        print $_,"\n";
-}
-close STDOUT;

diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl
deleted file mode 100644
index 9fd64e81ee..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x-mont.pl
+++ /dev/null
@@ -1,277 +0,0 @@
1	#!/usr/bin/env perl
2
3	# ====================================================================
4	# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5	# project. The module is, however, dual licensed under OpenSSL and
6	# CRYPTOGAMS licenses depending on where you obtain it. For further
7	# details see http://www.openssl.org/~appro/cryptogams/.
8	# ====================================================================
9
10	# April 2007.
11	#
12	# Performance improvement over vanilla C code varies from 85% to 45%
13	# depending on key length and benchmark. Unfortunately in this context
14	# these are not very impressive results [for code that utilizes "wide"
15	# 64x64=128-bit multiplication, which is not commonly available to C
16	# programmers], at least hand-coded bn_asm.c replacement is known to
17	# provide 30-40% better results for longest keys. Well, on a second
18	# thought it's not very surprising, because z-CPUs are single-issue
19	# and _strictly_ in-order execution, while bn_mul_mont is more or less
20	# dependent on CPU ability to pipe-line instructions and have several
21	# of them "in-flight" at the same time. I mean while other methods,
22	# for example Karatsuba, aim to minimize amount of multiplications at
23	# the cost of other operations increase, bn_mul_mont aim to neatly
24	# "overlap" multiplications and the other operations [and on most
25	# platforms even minimize the amount of the other operations, in
26	# particular references to memory]. But it's possible to improve this
27	# module performance by implementing dedicated squaring code-path and
28	# possibly by unrolling loops...
29
30	# January 2009.
31	#
32	# Reschedule to minimize/avoid Address Generation Interlock hazard,
33	# make inner loops counter-based.
34
35	# November 2010.
36	#
37	# Adapt for -m31 build. If kernel supports what's called "highgprs"
38	# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
39	# instructions and achieve "64-bit" performance even in 31-bit legacy
40	# application context. The feature is not specific to any particular
41	# processor, as long as it's "z-CPU". Latter implies that the code
42	# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
43	# is achieved by swapping words after 64-bit loads, follow _dswap-s.
44	# On z990 it was measured to perform 2.6-2.2 times better than
45	# compiler-generated code, less for longer keys...
46
47	$flavour = shift;
48
49	if ($flavour =~ /3[12]/) {
50	$SIZE_T=4;
51	$g="";
52	} else {
53	$SIZE_T=8;
54	$g="g";
55	}
56
57	while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
58	open STDOUT,">$output";
59
60	$stdframe=16*$SIZE_T+4*8;
61
62	$mn0="%r0";
63	$num="%r1";
64
65	# int bn_mul_mont(
66	$rp="%r2"; # BN_ULONG *rp,
67	$ap="%r3"; # const BN_ULONG *ap,
68	$bp="%r4"; # const BN_ULONG *bp,
69	$np="%r5"; # const BN_ULONG *np,
70	$n0="%r6"; # const BN_ULONG *n0,
71	#$num="160(%r15)" # int num);
72
73	$bi="%r2"; # zaps rp
74	$j="%r7";
75
76	$ahi="%r8";
77	$alo="%r9";
78	$nhi="%r10";
79	$nlo="%r11";
80	$AHI="%r12";
81	$NHI="%r13";
82	$count="%r14";
83	$sp="%r15";
84
85	$code.=<<___;
86	.text
87	.globl bn_mul_mont
88	.type bn_mul_mont,\@function
89	bn_mul_mont:
90	lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
91	sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
92	la $bp,0($num,$bp)
93
94	st${g} %r2,2*$SIZE_T($sp)
95
96	cghi $num,16 #
97	lghi %r2,0 #
98	blr %r14 # if($num<16) return 0;
99	___
100	$code.=<<___ if ($flavour =~ /3[12]/);
101	tmll $num,4
102	bnzr %r14 # if ($num&1) return 0;
103	___
104	$code.=<<___ if ($flavour !~ /3[12]/);
105	cghi $num,96 #
106	bhr %r14 # if($num>96) return 0;
107	___
108	$code.=<<___;
109	stm${g} %r3,%r15,3*$SIZE_T($sp)
110
111	lghi $rp,-$stdframe-8 # leave room for carry bit
112	lcgr $j,$num # -$num
113	lgr %r0,$sp
114	la $rp,0($rp,$sp)
115	la $sp,0($j,$rp) # alloca
116	st${g} %r0,0($sp) # back chain
117
118	sra $num,3 # restore $num
119	la $bp,0($j,$bp) # restore $bp
120	ahi $num,-1 # adjust $num for inner loop
121	lg $n0,0($n0) # pull n0
122	_dswap $n0
123
124	lg $bi,0($bp)
125	_dswap $bi
126	lg $alo,0($ap)
127	_dswap $alo
128	mlgr $ahi,$bi # ap[0]*bp[0]
129	lgr $AHI,$ahi
130
131	lgr $mn0,$alo # "tp[0]"*n0
132	msgr $mn0,$n0
133
134	lg $nlo,0($np) #
135	_dswap $nlo
136	mlgr $nhi,$mn0 # np[0]*m1
137	algr $nlo,$alo # +="tp[0]"
138	lghi $NHI,0
139	alcgr $NHI,$nhi
140
141	la $j,8(%r0) # j=1
142	lr $count,$num
143
144	.align 16
145	.L1st:
146	lg $alo,0($j,$ap)
147	_dswap $alo
148	mlgr $ahi,$bi # ap[j]*bp[0]
149	algr $alo,$AHI
150	lghi $AHI,0
151	alcgr $AHI,$ahi
152
153	lg $nlo,0($j,$np)
154	_dswap $nlo
155	mlgr $nhi,$mn0 # np[j]*m1
156	algr $nlo,$NHI
157	lghi $NHI,0
158	alcgr $nhi,$NHI # +="tp[j]"
159	algr $nlo,$alo
160	alcgr $NHI,$nhi
161
162	stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
163	la $j,8($j) # j++
164	brct $count,.L1st
165
166	algr $NHI,$AHI
167	lghi $AHI,0
168	alcgr $AHI,$AHI # upmost overflow bit
169	stg $NHI,$stdframe-8($j,$sp)
170	stg $AHI,$stdframe($j,$sp)
171	la $bp,8($bp) # bp++
172
173	.Louter:
174	lg $bi,0($bp) # bp[i]
175	_dswap $bi
176	lg $alo,0($ap)
177	_dswap $alo
178	mlgr $ahi,$bi # ap[0]*bp[i]
179	alg $alo,$stdframe($sp) # +=tp[0]
180	lghi $AHI,0
181	alcgr $AHI,$ahi
182
183	lgr $mn0,$alo
184	msgr $mn0,$n0 # tp[0]*n0
185
186	lg $nlo,0($np) # np[0]
187	_dswap $nlo
188	mlgr $nhi,$mn0 # np[0]*m1
189	algr $nlo,$alo # +="tp[0]"
190	lghi $NHI,0
191	alcgr $NHI,$nhi
192
193	la $j,8(%r0) # j=1
194	lr $count,$num
195
196	.align 16
197	.Linner:
198	lg $alo,0($j,$ap)
199	_dswap $alo
200	mlgr $ahi,$bi # ap[j]*bp[i]
201	algr $alo,$AHI
202	lghi $AHI,0
203	alcgr $ahi,$AHI
204	alg $alo,$stdframe($j,$sp)# +=tp[j]
205	alcgr $AHI,$ahi
206
207	lg $nlo,0($j,$np)
208	_dswap $nlo
209	mlgr $nhi,$mn0 # np[j]*m1
210	algr $nlo,$NHI
211	lghi $NHI,0
212	alcgr $nhi,$NHI
213	algr $nlo,$alo # +="tp[j]"
214	alcgr $NHI,$nhi
215
216	stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
217	la $j,8($j) # j++
218	brct $count,.Linner
219
220	algr $NHI,$AHI
221	lghi $AHI,0
222	alcgr $AHI,$AHI
223	alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
224	lghi $ahi,0
225	alcgr $AHI,$ahi # new upmost overflow bit
226	stg $NHI,$stdframe-8($j,$sp)
227	stg $AHI,$stdframe($j,$sp)
228
229	la $bp,8($bp) # bp++
230	cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
231	jne .Louter
232
233	l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
234	la $ap,$stdframe($sp)
235	ahi $num,1 # restore $num, incidentally clears "borrow"
236
237	la $j,0(%r0)
238	lr $count,$num
239	.Lsub: lg $alo,0($j,$ap)
240	lg $nlo,0($j,$np)
241	_dswap $nlo
242	slbgr $alo,$nlo
243	stg $alo,0($j,$rp)
244	la $j,8($j)
245	brct $count,.Lsub
246	lghi $ahi,0
247	slbgr $AHI,$ahi # handle upmost carry
248
249	ngr $ap,$AHI
250	lghi $np,-1
251	xgr $np,$AHI
252	ngr $np,$rp
253	ogr $ap,$np # ap=borrow?tp:rp
254
255	la $j,0(%r0)
256	lgr $count,$num
257	.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
258	_dswap $alo
259	stg $j,$stdframe($j,$sp) # zap tp
260	stg $alo,0($j,$rp)
261	la $j,8($j)
262	brct $count,.Lcopy
263
264	la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
265	lm${g} %r6,%r15,0(%r1)
266	lghi %r2,1 # signal "processed"
267	br %r14
268	.size bn_mul_mont,.-bn_mul_mont
269	.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
270	___
271
272	foreach (split("\n",$code)) {
273	s/\`([^\`]*)\`/eval $1/ge;
274	s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
275	print $_,"\n";
276	}
277	close STDOUT;