diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/s390x-gf2m.pl')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/s390x-gf2m.pl | 221 |
1 files changed, 0 insertions, 221 deletions
diff --git a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl deleted file mode 100644 index cd9f13eca2..0000000000 --- a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl +++ /dev/null | |||
| @@ -1,221 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
| 14 | # the time being... gcc 4.3 appeared to generate poor code, therefore | ||
| 15 | # the effort. And indeed, the module delivers 55%-90%(*) improvement | ||
| 16 | # on heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit | ||
| 17 | # key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196. | ||
| 18 | # This is for 64-bit build. In 32-bit "highgprs" case improvement is | ||
| 19 | # even higher, for example on z990 it was measured 80%-150%. ECDSA | ||
| 20 | # sign is a modest 9%-12% faster. Keep in mind that these coefficients | ||
| 21 | # are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is | ||
| 22 | # burnt in it... | ||
| 23 | # | ||
| 24 | # (*) gcc 4.1 was observed to deliver better results than gcc 4.3, | ||
| 25 | # so that improvement coefficients can vary from one specific | ||
| 26 | # setup to another. | ||
| 27 | |||
| 28 | $flavour = shift; | ||
| 29 | |||
| 30 | if ($flavour =~ /3[12]/) { | ||
| 31 | $SIZE_T=4; | ||
| 32 | $g=""; | ||
| 33 | } else { | ||
| 34 | $SIZE_T=8; | ||
| 35 | $g="g"; | ||
| 36 | } | ||
| 37 | |||
| 38 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 39 | open STDOUT,">$output"; | ||
| 40 | |||
| 41 | $stdframe=16*$SIZE_T+4*8; | ||
| 42 | |||
| 43 | $rp="%r2"; | ||
| 44 | $a1="%r3"; | ||
| 45 | $a0="%r4"; | ||
| 46 | $b1="%r5"; | ||
| 47 | $b0="%r6"; | ||
| 48 | |||
| 49 | $ra="%r14"; | ||
| 50 | $sp="%r15"; | ||
| 51 | |||
| 52 | @T=("%r0","%r1"); | ||
| 53 | @i=("%r12","%r13"); | ||
| 54 | |||
| 55 | ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11)); | ||
| 56 | ($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8; | ||
| 57 | |||
| 58 | $code.=<<___; | ||
| 59 | .text | ||
| 60 | |||
| 61 | .type _mul_1x1,\@function | ||
| 62 | .align 16 | ||
| 63 | _mul_1x1: | ||
| 64 | lgr $a1,$a | ||
| 65 | sllg $a2,$a,1 | ||
| 66 | sllg $a4,$a,2 | ||
| 67 | sllg $a8,$a,3 | ||
| 68 | |||
| 69 | srag $lo,$a1,63 # broadcast 63rd bit | ||
| 70 | nihh $a1,0x1fff | ||
| 71 | srag @i[0],$a2,63 # broadcast 62nd bit | ||
| 72 | nihh $a2,0x3fff | ||
| 73 | srag @i[1],$a4,63 # broadcast 61st bit | ||
| 74 | nihh $a4,0x7fff | ||
| 75 | ngr $lo,$b | ||
| 76 | ngr @i[0],$b | ||
| 77 | ngr @i[1],$b | ||
| 78 | |||
| 79 | lghi @T[0],0 | ||
| 80 | lgr $a12,$a1 | ||
| 81 | stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0 | ||
| 82 | xgr $a12,$a2 | ||
| 83 | stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1 | ||
| 84 | lgr $a48,$a4 | ||
| 85 | stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2 | ||
| 86 | xgr $a48,$a8 | ||
| 87 | stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2 | ||
| 88 | xgr $a1,$a4 | ||
| 89 | |||
| 90 | stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4 | ||
| 91 | xgr $a2,$a4 | ||
| 92 | stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4 | ||
| 93 | xgr $a12,$a4 | ||
| 94 | stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4 | ||
| 95 | xgr $a1,$a48 | ||
| 96 | stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4 | ||
| 97 | xgr $a2,$a48 | ||
| 98 | |||
| 99 | stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8 | ||
| 100 | xgr $a12,$a48 | ||
| 101 | stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8 | ||
| 102 | xgr $a1,$a4 | ||
| 103 | stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8 | ||
| 104 | xgr $a2,$a4 | ||
| 105 | stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8 | ||
| 106 | |||
| 107 | xgr $a12,$a4 | ||
| 108 | stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8 | ||
| 109 | srlg $hi,$lo,1 | ||
| 110 | stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8 | ||
| 111 | sllg $lo,$lo,63 | ||
| 112 | stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8 | ||
| 113 | srlg @T[0],@i[0],2 | ||
| 114 | stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8 | ||
| 115 | |||
| 116 | lghi $mask,`0xf<<3` | ||
| 117 | sllg $a1,@i[0],62 | ||
| 118 | sllg @i[0],$b,3 | ||
| 119 | srlg @T[1],@i[1],3 | ||
| 120 | ngr @i[0],$mask | ||
| 121 | sllg $a2,@i[1],61 | ||
| 122 | srlg @i[1],$b,4-3 | ||
| 123 | xgr $hi,@T[0] | ||
| 124 | ngr @i[1],$mask | ||
| 125 | xgr $lo,$a1 | ||
| 126 | xgr $hi,@T[1] | ||
| 127 | xgr $lo,$a2 | ||
| 128 | |||
| 129 | xg $lo,$stdframe(@i[0],$sp) | ||
| 130 | srlg @i[0],$b,8-3 | ||
| 131 | ngr @i[0],$mask | ||
| 132 | ___ | ||
| 133 | for($n=1;$n<14;$n++) { | ||
| 134 | $code.=<<___; | ||
| 135 | lg @T[1],$stdframe(@i[1],$sp) | ||
| 136 | srlg @i[1],$b,`($n+2)*4`-3 | ||
| 137 | sllg @T[0],@T[1],`$n*4` | ||
| 138 | ngr @i[1],$mask | ||
| 139 | srlg @T[1],@T[1],`64-$n*4` | ||
| 140 | xgr $lo,@T[0] | ||
| 141 | xgr $hi,@T[1] | ||
| 142 | ___ | ||
| 143 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
| 144 | } | ||
| 145 | $code.=<<___; | ||
| 146 | lg @T[1],$stdframe(@i[1],$sp) | ||
| 147 | sllg @T[0],@T[1],`$n*4` | ||
| 148 | srlg @T[1],@T[1],`64-$n*4` | ||
| 149 | xgr $lo,@T[0] | ||
| 150 | xgr $hi,@T[1] | ||
| 151 | |||
| 152 | lg @T[0],$stdframe(@i[0],$sp) | ||
| 153 | sllg @T[1],@T[0],`($n+1)*4` | ||
| 154 | srlg @T[0],@T[0],`64-($n+1)*4` | ||
| 155 | xgr $lo,@T[1] | ||
| 156 | xgr $hi,@T[0] | ||
| 157 | |||
| 158 | br $ra | ||
| 159 | .size _mul_1x1,.-_mul_1x1 | ||
| 160 | |||
| 161 | .globl bn_GF2m_mul_2x2 | ||
| 162 | .type bn_GF2m_mul_2x2,\@function | ||
| 163 | .align 16 | ||
| 164 | bn_GF2m_mul_2x2: | ||
| 165 | stm${g} %r3,%r15,3*$SIZE_T($sp) | ||
| 166 | |||
| 167 | lghi %r1,-$stdframe-128 | ||
| 168 | la %r0,0($sp) | ||
| 169 | la $sp,0(%r1,$sp) # alloca | ||
| 170 | st${g} %r0,0($sp) # back chain | ||
| 171 | ___ | ||
| 172 | if ($SIZE_T==8) { | ||
| 173 | my @r=map("%r$_",(6..9)); | ||
| 174 | $code.=<<___; | ||
| 175 | bras $ra,_mul_1x1 # a1·b1 | ||
| 176 | stmg $lo,$hi,16($rp) | ||
| 177 | |||
| 178 | lg $a,`$stdframe+128+4*$SIZE_T`($sp) | ||
| 179 | lg $b,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 180 | bras $ra,_mul_1x1 # a0·b0 | ||
| 181 | stmg $lo,$hi,0($rp) | ||
| 182 | |||
| 183 | lg $a,`$stdframe+128+3*$SIZE_T`($sp) | ||
| 184 | lg $b,`$stdframe+128+5*$SIZE_T`($sp) | ||
| 185 | xg $a,`$stdframe+128+4*$SIZE_T`($sp) | ||
| 186 | xg $b,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 187 | bras $ra,_mul_1x1 # (a0+a1)·(b0+b1) | ||
| 188 | lmg @r[0],@r[3],0($rp) | ||
| 189 | |||
| 190 | xgr $lo,$hi | ||
| 191 | xgr $hi,@r[1] | ||
| 192 | xgr $lo,@r[0] | ||
| 193 | xgr $hi,@r[2] | ||
| 194 | xgr $lo,@r[3] | ||
| 195 | xgr $hi,@r[3] | ||
| 196 | xgr $lo,$hi | ||
| 197 | stg $hi,16($rp) | ||
| 198 | stg $lo,8($rp) | ||
| 199 | ___ | ||
| 200 | } else { | ||
| 201 | $code.=<<___; | ||
| 202 | sllg %r3,%r3,32 | ||
| 203 | sllg %r5,%r5,32 | ||
| 204 | or %r3,%r4 | ||
| 205 | or %r5,%r6 | ||
| 206 | bras $ra,_mul_1x1 | ||
| 207 | rllg $lo,$lo,32 | ||
| 208 | rllg $hi,$hi,32 | ||
| 209 | stmg $lo,$hi,0($rp) | ||
| 210 | ___ | ||
| 211 | } | ||
| 212 | $code.=<<___; | ||
| 213 | lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 214 | br $ra | ||
| 215 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 216 | .string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 217 | ___ | ||
| 218 | |||
| 219 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 220 | print $code; | ||
| 221 | close STDOUT; | ||
