diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib/libcrypto/aes/asm/bsaes-x86_64.pl | 3123 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/asm/vpaes-x86.pl | 911 | ||||
| -rw-r--r-- | src/lib/libcrypto/aes/asm/vpaes-x86_64.pl | 1222 |
3 files changed, 0 insertions, 5256 deletions
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl deleted file mode 100644 index c44a338114..0000000000 --- a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl +++ /dev/null | |||
| @@ -1,3123 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | ################################################################### | ||
| 4 | ### AES-128 [originally in CTR mode] ### | ||
| 5 | ### bitsliced implementation for Intel Core 2 processors ### | ||
| 6 | ### requires support of SSE extensions up to SSSE3 ### | ||
| 7 | ### Author: Emilia Käsper and Peter Schwabe ### | ||
| 8 | ### Date: 2009-03-19 ### | ||
| 9 | ### Public domain ### | ||
| 10 | ### ### | ||
| 11 | ### See http://homes.esat.kuleuven.be/~ekasper/#software for ### | ||
| 12 | ### further information. ### | ||
| 13 | ################################################################### | ||
| 14 | # | ||
| 15 | # September 2011. | ||
| 16 | # | ||
| 17 | # Started as a transliteration to "perlasm"; the original code has | ||
| 18 | # undergone the following changes: | ||
| 19 | # | ||
| 20 | # - code was made position-independent; | ||
| 21 | # - rounds were folded into a loop resulting in >5x size reduction | ||
| 22 | # from 12.5KB to 2.2KB; | ||
| 23 | # - the above was possible thanks to a mixcolumns() modification that | ||
| 24 | # allows its output to be fed back to aesenc[last]; this was | ||
| 25 | # achieved at the cost of two additional inter-register moves; | ||
| 26 | # - some instruction reordering and interleaving; | ||
| 27 | # - this module doesn't implement key setup subroutine, instead it | ||
| 28 | # relies on conversion of "conventional" key schedule as returned | ||
| 29 | # by AES_set_encrypt_key (see discussion below); | ||
| 30 | # - first and last round keys are treated differently, which made it | ||
| 31 | # possible to skip one shiftrows(), reduce the bit-sliced key schedule | ||
| 32 | # and speed up conversion by 22%; | ||
| 33 | # - support for 192- and 256-bit keys was added; | ||
| 34 | # | ||
| 35 | # Resulting performance in CPU cycles spent to encrypt one byte out | ||
| 36 | # of 4096-byte buffer with 128-bit key is: | ||
| 37 | # | ||
| 38 | # Emilia's this(*) difference | ||
| 39 | # | ||
| 40 | # Core 2 9.30 8.69 +7% | ||
| 41 | # Nehalem(**) 7.63 6.98 +9% | ||
| 42 | # Atom 17.1 17.4 -2%(***) | ||
| 43 | # | ||
| 44 | # (*) Comparison is not completely fair, because "this" is ECB, | ||
| 45 | # i.e. no extra processing such as counter values calculation | ||
| 46 | # and xor-ing input as in Emilia's CTR implementation is | ||
| 47 | # performed. However, the CTR calculations account for no more | ||
| 48 | # than 1% of total time, so the comparison is *rather* fair. | ||
| 49 | # | ||
| 50 | # (**) Results were collected on Westmere, which is considered to | ||
| 51 | # be equivalent to Nehalem for this code. | ||
| 52 | # | ||
| 53 | # (***) The slowdown on Atom is rather strange per se, because the original | ||
| 54 | # implementation has a number of 9+-byte instructions, which | ||
| 55 | # are bad for the Atom front-end, and which I eliminated completely. | ||
| 56 | # In an attempt to address the deterioration, sbox() was tested in the FP | ||
| 57 | # SIMD "domain" (movaps instead of movdqa, xorps instead of | ||
| 58 | # pxor, etc.). While it resulted in a nominal 4% improvement on | ||
| 59 | # Atom, it hurt Westmere by more than a factor of 2. | ||
| 60 | # | ||
| 61 | # As for the key schedule conversion subroutine: the interface to OpenSSL | ||
| 62 | # relies on per-invocation on-the-fly conversion. This naturally | ||
| 63 | # has an impact on performance, especially for short inputs. Conversion | ||
| 64 | # time in CPU cycles and its ratio to CPU cycles spent in the 8x block | ||
| 65 | # function are: | ||
| 66 | # | ||
| 67 | # conversion conversion/8x block | ||
| 68 | # Core 2 240 0.22 | ||
| 69 | # Nehalem 180 0.20 | ||
| 70 | # Atom 430 0.19 | ||
| 71 | # | ||
| 72 | # The ratio values mean that 128-byte blocks will be processed | ||
| 73 | # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, | ||
| 74 | # etc. Then keep in mind that input sizes not divisible by 128 are | ||
| 75 | # *effectively* slower, especially shortest ones, e.g. consecutive | ||
| 76 | # 144-byte blocks are processed 44% slower than one would expect, | ||
| 77 | # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" | ||
| 78 | # it's still faster than ["hyper-threading-safe" code path in] | ||
| 79 | # aes-x86_64.pl on all lengths above 64 bytes... | ||
| 80 | # | ||
| 81 | # October 2011. | ||
| 82 | # | ||
| 83 | # Add decryption procedure. Performance in CPU cycles spent to decrypt | ||
| 84 | # one byte out of 4096-byte buffer with 128-bit key is: | ||
| 85 | # | ||
| 86 | # Core 2 9.83 | ||
| 87 | # Nehalem 7.74 | ||
| 88 | # Atom 19.0 | ||
| 89 | # | ||
| 90 | # November 2011. | ||
| 91 | # | ||
| 92 | # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is | ||
| 93 | # suboptimal, but XTS is meant to be used with larger blocks... | ||
| 94 | # | ||
| 95 | # <appro@openssl.org> | ||
| 96 | |||
| 97 | $flavour = shift; | ||
| 98 | $output = shift; | ||
| 99 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 100 | |||
| 101 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 102 | |||
| 103 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 104 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 105 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 106 | die "can't locate x86_64-xlate.pl"; | ||
| 107 | |||
| 108 | open OUT,"| \"$^X\" $xlate $flavour $output"; | ||
| 109 | *STDOUT=*OUT; | ||
| 110 | |||
| 111 | my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); | ||
| 112 | my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) | ||
| 113 | my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... | ||
| 114 | |||
| 115 | { | ||
| 116 | my ($key,$rounds,$const)=("%rax","%r10d","%r11"); | ||
| 117 | |||
| 118 | sub Sbox { | ||
| 119 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 120 | # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb | ||
| 121 | my @b=@_[0..7]; | ||
| 122 | my @t=@_[8..11]; | ||
| 123 | my @s=@_[12..15]; | ||
| 124 | &InBasisChange (@b); | ||
| 125 | &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); | ||
| 126 | &OutBasisChange (@b[7,1,4,2,6,5,0,3]); | ||
| 127 | } | ||
| 128 | |||
| 129 | sub InBasisChange { | ||
| 130 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 131 | # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb | ||
| 132 | my @b=@_[0..7]; | ||
| 133 | $code.=<<___; | ||
| 134 | pxor @b[6], @b[5] | ||
| 135 | pxor @b[1], @b[2] | ||
| 136 | pxor @b[0], @b[3] | ||
| 137 | pxor @b[2], @b[6] | ||
| 138 | pxor @b[0], @b[5] | ||
| 139 | |||
| 140 | pxor @b[3], @b[6] | ||
| 141 | pxor @b[7], @b[3] | ||
| 142 | pxor @b[5], @b[7] | ||
| 143 | pxor @b[4], @b[3] | ||
| 144 | pxor @b[5], @b[4] | ||
| 145 | pxor @b[1], @b[3] | ||
| 146 | |||
| 147 | pxor @b[7], @b[2] | ||
| 148 | pxor @b[5], @b[1] | ||
| 149 | ___ | ||
| 150 | } | ||
| 151 | |||
| 152 | sub OutBasisChange { | ||
| 153 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 154 | # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb | ||
| 155 | my @b=@_[0..7]; | ||
| 156 | $code.=<<___; | ||
| 157 | pxor @b[6], @b[0] | ||
| 158 | pxor @b[4], @b[1] | ||
| 159 | pxor @b[0], @b[2] | ||
| 160 | pxor @b[6], @b[4] | ||
| 161 | pxor @b[1], @b[6] | ||
| 162 | |||
| 163 | pxor @b[5], @b[1] | ||
| 164 | pxor @b[3], @b[5] | ||
| 165 | pxor @b[7], @b[3] | ||
| 166 | pxor @b[5], @b[7] | ||
| 167 | pxor @b[5], @b[2] | ||
| 168 | |||
| 169 | pxor @b[7], @b[4] | ||
| 170 | ___ | ||
| 171 | } | ||
| 172 | |||
| 173 | sub InvSbox { | ||
| 174 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 175 | # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb | ||
| 176 | my @b=@_[0..7]; | ||
| 177 | my @t=@_[8..11]; | ||
| 178 | my @s=@_[12..15]; | ||
| 179 | &InvInBasisChange (@b); | ||
| 180 | &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); | ||
| 181 | &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); | ||
| 182 | } | ||
| 183 | |||
| 184 | sub InvInBasisChange { # OutBasisChange in reverse | ||
| 185 | my @b=@_[5,1,2,6,3,7,0,4]; | ||
| 186 | $code.=<<___ | ||
| 187 | pxor @b[7], @b[4] | ||
| 188 | |||
| 189 | pxor @b[5], @b[7] | ||
| 190 | pxor @b[5], @b[2] | ||
| 191 | pxor @b[7], @b[3] | ||
| 192 | pxor @b[3], @b[5] | ||
| 193 | pxor @b[5], @b[1] | ||
| 194 | |||
| 195 | pxor @b[1], @b[6] | ||
| 196 | pxor @b[0], @b[2] | ||
| 197 | pxor @b[6], @b[4] | ||
| 198 | pxor @b[6], @b[0] | ||
| 199 | pxor @b[4], @b[1] | ||
| 200 | ___ | ||
| 201 | } | ||
| 202 | |||
| 203 | sub InvOutBasisChange { # InBasisChange in reverse | ||
| 204 | my @b=@_[2,5,7,3,6,1,0,4]; | ||
| 205 | $code.=<<___; | ||
| 206 | pxor @b[5], @b[1] | ||
| 207 | pxor @b[7], @b[2] | ||
| 208 | |||
| 209 | pxor @b[1], @b[3] | ||
| 210 | pxor @b[5], @b[4] | ||
| 211 | pxor @b[5], @b[7] | ||
| 212 | pxor @b[4], @b[3] | ||
| 213 | pxor @b[0], @b[5] | ||
| 214 | pxor @b[7], @b[3] | ||
| 215 | pxor @b[2], @b[6] | ||
| 216 | pxor @b[1], @b[2] | ||
| 217 | pxor @b[3], @b[6] | ||
| 218 | |||
| 219 | pxor @b[0], @b[3] | ||
| 220 | pxor @b[6], @b[5] | ||
| 221 | ___ | ||
| 222 | } | ||
| 223 | |||
| 224 | sub Mul_GF4 { | ||
| 225 | #;************************************************************* | ||
| 226 | #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * | ||
| 227 | #;************************************************************* | ||
| 228 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
| 229 | $code.=<<___; | ||
| 230 | movdqa $y0, $t0 | ||
| 231 | pxor $y1, $t0 | ||
| 232 | pand $x0, $t0 | ||
| 233 | pxor $x1, $x0 | ||
| 234 | pand $y0, $x1 | ||
| 235 | pand $y1, $x0 | ||
| 236 | pxor $x1, $x0 | ||
| 237 | pxor $t0, $x1 | ||
| 238 | ___ | ||
| 239 | } | ||
| 240 | |||
| 241 | sub Mul_GF4_N { # not used, see next subroutine | ||
| 242 | # multiply and scale by N | ||
| 243 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
| 244 | $code.=<<___; | ||
| 245 | movdqa $y0, $t0 | ||
| 246 | pxor $y1, $t0 | ||
| 247 | pand $x0, $t0 | ||
| 248 | pxor $x1, $x0 | ||
| 249 | pand $y0, $x1 | ||
| 250 | pand $y1, $x0 | ||
| 251 | pxor $x0, $x1 | ||
| 252 | pxor $t0, $x0 | ||
| 253 | ___ | ||
| 254 | } | ||
| 255 | |||
| 256 | sub Mul_GF4_N_GF4 { | ||
| 257 | # interleaved Mul_GF4_N and Mul_GF4 | ||
| 258 | my ($x0,$x1,$y0,$y1,$t0, | ||
| 259 | $x2,$x3,$y2,$y3,$t1)=@_; | ||
| 260 | $code.=<<___; | ||
| 261 | movdqa $y0, $t0 | ||
| 262 | movdqa $y2, $t1 | ||
| 263 | pxor $y1, $t0 | ||
| 264 | pxor $y3, $t1 | ||
| 265 | pand $x0, $t0 | ||
| 266 | pand $x2, $t1 | ||
| 267 | pxor $x1, $x0 | ||
| 268 | pxor $x3, $x2 | ||
| 269 | pand $y0, $x1 | ||
| 270 | pand $y2, $x3 | ||
| 271 | pand $y1, $x0 | ||
| 272 | pand $y3, $x2 | ||
| 273 | pxor $x0, $x1 | ||
| 274 | pxor $x3, $x2 | ||
| 275 | pxor $t0, $x0 | ||
| 276 | pxor $t1, $x3 | ||
| 277 | ___ | ||
| 278 | } | ||
| 279 | sub Mul_GF16_2 { | ||
| 280 | my @x=@_[0..7]; | ||
| 281 | my @y=@_[8..11]; | ||
| 282 | my @t=@_[12..15]; | ||
| 283 | $code.=<<___; | ||
| 284 | movdqa @x[0], @t[0] | ||
| 285 | movdqa @x[1], @t[1] | ||
| 286 | ___ | ||
| 287 | &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); | ||
| 288 | $code.=<<___; | ||
| 289 | pxor @x[2], @t[0] | ||
| 290 | pxor @x[3], @t[1] | ||
| 291 | pxor @y[2], @y[0] | ||
| 292 | pxor @y[3], @y[1] | ||
| 293 | ___ | ||
| 294 | Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
| 295 | @x[2], @x[3], @y[2], @y[3], @t[2]); | ||
| 296 | $code.=<<___; | ||
| 297 | pxor @t[0], @x[0] | ||
| 298 | pxor @t[0], @x[2] | ||
| 299 | pxor @t[1], @x[1] | ||
| 300 | pxor @t[1], @x[3] | ||
| 301 | |||
| 302 | movdqa @x[4], @t[0] | ||
| 303 | movdqa @x[5], @t[1] | ||
| 304 | pxor @x[6], @t[0] | ||
| 305 | pxor @x[7], @t[1] | ||
| 306 | ___ | ||
| 307 | &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
| 308 | @x[6], @x[7], @y[2], @y[3], @t[2]); | ||
| 309 | $code.=<<___; | ||
| 310 | pxor @y[2], @y[0] | ||
| 311 | pxor @y[3], @y[1] | ||
| 312 | ___ | ||
| 313 | &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); | ||
| 314 | $code.=<<___; | ||
| 315 | pxor @t[0], @x[4] | ||
| 316 | pxor @t[0], @x[6] | ||
| 317 | pxor @t[1], @x[5] | ||
| 318 | pxor @t[1], @x[7] | ||
| 319 | ___ | ||
| 320 | } | ||
| 321 | sub Inv_GF256 { | ||
| 322 | #;******************************************************************** | ||
| 323 | #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * | ||
| 324 | #;******************************************************************** | ||
| 325 | my @x=@_[0..7]; | ||
| 326 | my @t=@_[8..11]; | ||
| 327 | my @s=@_[12..15]; | ||
| 328 | # direct optimizations from hardware | ||
| 329 | $code.=<<___; | ||
| 330 | movdqa @x[4], @t[3] | ||
| 331 | movdqa @x[5], @t[2] | ||
| 332 | movdqa @x[1], @t[1] | ||
| 333 | movdqa @x[7], @s[1] | ||
| 334 | movdqa @x[0], @s[0] | ||
| 335 | |||
| 336 | pxor @x[6], @t[3] | ||
| 337 | pxor @x[7], @t[2] | ||
| 338 | pxor @x[3], @t[1] | ||
| 339 | movdqa @t[3], @s[2] | ||
| 340 | pxor @x[6], @s[1] | ||
| 341 | movdqa @t[2], @t[0] | ||
| 342 | pxor @x[2], @s[0] | ||
| 343 | movdqa @t[3], @s[3] | ||
| 344 | |||
| 345 | por @t[1], @t[2] | ||
| 346 | por @s[0], @t[3] | ||
| 347 | pxor @t[0], @s[3] | ||
| 348 | pand @s[0], @s[2] | ||
| 349 | pxor @t[1], @s[0] | ||
| 350 | pand @t[1], @t[0] | ||
| 351 | pand @s[0], @s[3] | ||
| 352 | movdqa @x[3], @s[0] | ||
| 353 | pxor @x[2], @s[0] | ||
| 354 | pand @s[0], @s[1] | ||
| 355 | pxor @s[1], @t[3] | ||
| 356 | pxor @s[1], @t[2] | ||
| 357 | movdqa @x[4], @s[1] | ||
| 358 | movdqa @x[1], @s[0] | ||
| 359 | pxor @x[5], @s[1] | ||
| 360 | pxor @x[0], @s[0] | ||
| 361 | movdqa @s[1], @t[1] | ||
| 362 | pand @s[0], @s[1] | ||
| 363 | por @s[0], @t[1] | ||
| 364 | pxor @s[1], @t[0] | ||
| 365 | pxor @s[3], @t[3] | ||
| 366 | pxor @s[2], @t[2] | ||
| 367 | pxor @s[3], @t[1] | ||
| 368 | movdqa @x[7], @s[0] | ||
| 369 | pxor @s[2], @t[0] | ||
| 370 | movdqa @x[6], @s[1] | ||
| 371 | pxor @s[2], @t[1] | ||
| 372 | movdqa @x[5], @s[2] | ||
| 373 | pand @x[3], @s[0] | ||
| 374 | movdqa @x[4], @s[3] | ||
| 375 | pand @x[2], @s[1] | ||
| 376 | pand @x[1], @s[2] | ||
| 377 | por @x[0], @s[3] | ||
| 378 | pxor @s[0], @t[3] | ||
| 379 | pxor @s[1], @t[2] | ||
| 380 | pxor @s[2], @t[1] | ||
| 381 | pxor @s[3], @t[0] | ||
| 382 | |||
| 383 | #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 | ||
| 384 | |||
| 385 | # new smaller inversion | ||
| 386 | |||
| 387 | movdqa @t[3], @s[0] | ||
| 388 | pand @t[1], @t[3] | ||
| 389 | pxor @t[2], @s[0] | ||
| 390 | |||
| 391 | movdqa @t[0], @s[2] | ||
| 392 | movdqa @s[0], @s[3] | ||
| 393 | pxor @t[3], @s[2] | ||
| 394 | pand @s[2], @s[3] | ||
| 395 | |||
| 396 | movdqa @t[1], @s[1] | ||
| 397 | pxor @t[2], @s[3] | ||
| 398 | pxor @t[0], @s[1] | ||
| 399 | |||
| 400 | pxor @t[2], @t[3] | ||
| 401 | |||
| 402 | pand @t[3], @s[1] | ||
| 403 | |||
| 404 | movdqa @s[2], @t[2] | ||
| 405 | pxor @t[0], @s[1] | ||
| 406 | |||
| 407 | pxor @s[1], @t[2] | ||
| 408 | pxor @s[1], @t[1] | ||
| 409 | |||
| 410 | pand @t[0], @t[2] | ||
| 411 | |||
| 412 | pxor @t[2], @s[2] | ||
| 413 | pxor @t[2], @t[1] | ||
| 414 | |||
| 415 | pand @s[3], @s[2] | ||
| 416 | |||
| 417 | pxor @s[0], @s[2] | ||
| 418 | ___ | ||
| 419 | # output in s3, s2, s1, t1 | ||
| 420 | |||
| 421 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 | ||
| 422 | |||
| 423 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 | ||
| 424 | &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); | ||
| 425 | |||
| 426 | ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb | ||
| 427 | } | ||
| 428 | |||
| 429 | # AES linear components | ||
| 430 | |||
| 431 | sub ShiftRows { | ||
| 432 | my @x=@_[0..7]; | ||
| 433 | my $mask=pop; | ||
| 434 | $code.=<<___; | ||
| 435 | pxor 0x00($key),@x[0] | ||
| 436 | pxor 0x10($key),@x[1] | ||
| 437 | pshufb $mask,@x[0] | ||
| 438 | pxor 0x20($key),@x[2] | ||
| 439 | pshufb $mask,@x[1] | ||
| 440 | pxor 0x30($key),@x[3] | ||
| 441 | pshufb $mask,@x[2] | ||
| 442 | pxor 0x40($key),@x[4] | ||
| 443 | pshufb $mask,@x[3] | ||
| 444 | pxor 0x50($key),@x[5] | ||
| 445 | pshufb $mask,@x[4] | ||
| 446 | pxor 0x60($key),@x[6] | ||
| 447 | pshufb $mask,@x[5] | ||
| 448 | pxor 0x70($key),@x[7] | ||
| 449 | pshufb $mask,@x[6] | ||
| 450 | lea 0x80($key),$key | ||
| 451 | pshufb $mask,@x[7] | ||
| 452 | ___ | ||
| 453 | } | ||
| 454 | |||
| 455 | sub MixColumns { | ||
| 456 | # modified to emit output in order suitable for feeding back to aesenc[last] | ||
| 457 | my @x=@_[0..7]; | ||
| 458 | my @t=@_[8..15]; | ||
| 459 | my $inv=@_[16]; # optional | ||
| 460 | $code.=<<___; | ||
| 461 | pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 | ||
| 462 | pshufd \$0x93, @x[1], @t[1] | ||
| 463 | pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) | ||
| 464 | pshufd \$0x93, @x[2], @t[2] | ||
| 465 | pxor @t[1], @x[1] | ||
| 466 | pshufd \$0x93, @x[3], @t[3] | ||
| 467 | pxor @t[2], @x[2] | ||
| 468 | pshufd \$0x93, @x[4], @t[4] | ||
| 469 | pxor @t[3], @x[3] | ||
| 470 | pshufd \$0x93, @x[5], @t[5] | ||
| 471 | pxor @t[4], @x[4] | ||
| 472 | pshufd \$0x93, @x[6], @t[6] | ||
| 473 | pxor @t[5], @x[5] | ||
| 474 | pshufd \$0x93, @x[7], @t[7] | ||
| 475 | pxor @t[6], @x[6] | ||
| 476 | pxor @t[7], @x[7] | ||
| 477 | |||
| 478 | pxor @x[0], @t[1] | ||
| 479 | pxor @x[7], @t[0] | ||
| 480 | pxor @x[7], @t[1] | ||
| 481 | pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) | ||
| 482 | pxor @x[1], @t[2] | ||
| 483 | pshufd \$0x4E, @x[1], @x[1] | ||
| 484 | pxor @x[4], @t[5] | ||
| 485 | pxor @t[0], @x[0] | ||
| 486 | pxor @x[5], @t[6] | ||
| 487 | pxor @t[1], @x[1] | ||
| 488 | pxor @x[3], @t[4] | ||
| 489 | pshufd \$0x4E, @x[4], @t[0] | ||
| 490 | pxor @x[6], @t[7] | ||
| 491 | pshufd \$0x4E, @x[5], @t[1] | ||
| 492 | pxor @x[2], @t[3] | ||
| 493 | pshufd \$0x4E, @x[3], @x[4] | ||
| 494 | pxor @x[7], @t[3] | ||
| 495 | pshufd \$0x4E, @x[7], @x[5] | ||
| 496 | pxor @x[7], @t[4] | ||
| 497 | pshufd \$0x4E, @x[6], @x[3] | ||
| 498 | pxor @t[4], @t[0] | ||
| 499 | pshufd \$0x4E, @x[2], @x[6] | ||
| 500 | pxor @t[5], @t[1] | ||
| 501 | ___ | ||
| 502 | $code.=<<___ if (!$inv); | ||
| 503 | pxor @t[3], @x[4] | ||
| 504 | pxor @t[7], @x[5] | ||
| 505 | pxor @t[6], @x[3] | ||
| 506 | movdqa @t[0], @x[2] | ||
| 507 | pxor @t[2], @x[6] | ||
| 508 | movdqa @t[1], @x[7] | ||
| 509 | ___ | ||
| 510 | $code.=<<___ if ($inv); | ||
| 511 | pxor @x[4], @t[3] | ||
| 512 | pxor @t[7], @x[5] | ||
| 513 | pxor @x[3], @t[6] | ||
| 514 | movdqa @t[0], @x[3] | ||
| 515 | pxor @t[2], @x[6] | ||
| 516 | movdqa @t[6], @x[2] | ||
| 517 | movdqa @t[1], @x[7] | ||
| 518 | movdqa @x[6], @x[4] | ||
| 519 | movdqa @t[3], @x[6] | ||
| 520 | ___ | ||
| 521 | } | ||
| 522 | |||
| 523 | sub InvMixColumns_orig { | ||
| 524 | my @x=@_[0..7]; | ||
| 525 | my @t=@_[8..15]; | ||
| 526 | |||
| 527 | $code.=<<___; | ||
| 528 | # multiplication by 0x0e | ||
| 529 | pshufd \$0x93, @x[7], @t[7] | ||
| 530 | movdqa @x[2], @t[2] | ||
| 531 | pxor @x[5], @x[7] # 7 5 | ||
| 532 | pxor @x[5], @x[2] # 2 5 | ||
| 533 | pshufd \$0x93, @x[0], @t[0] | ||
| 534 | movdqa @x[5], @t[5] | ||
| 535 | pxor @x[0], @x[5] # 5 0 [1] | ||
| 536 | pxor @x[1], @x[0] # 0 1 | ||
| 537 | pshufd \$0x93, @x[1], @t[1] | ||
| 538 | pxor @x[2], @x[1] # 1 25 | ||
| 539 | pxor @x[6], @x[0] # 01 6 [2] | ||
| 540 | pxor @x[3], @x[1] # 125 3 [4] | ||
| 541 | pshufd \$0x93, @x[3], @t[3] | ||
| 542 | pxor @x[0], @x[2] # 25 016 [3] | ||
| 543 | pxor @x[7], @x[3] # 3 75 | ||
| 544 | pxor @x[6], @x[7] # 75 6 [0] | ||
| 545 | pshufd \$0x93, @x[6], @t[6] | ||
| 546 | movdqa @x[4], @t[4] | ||
| 547 | pxor @x[4], @x[6] # 6 4 | ||
| 548 | pxor @x[3], @x[4] # 4 375 [6] | ||
| 549 | pxor @x[7], @x[3] # 375 756=36 | ||
| 550 | pxor @t[5], @x[6] # 64 5 [7] | ||
| 551 | pxor @t[2], @x[3] # 36 2 | ||
| 552 | pxor @t[4], @x[3] # 362 4 [5] | ||
| 553 | pshufd \$0x93, @t[5], @t[5] | ||
| 554 | ___ | ||
| 555 | my @y = @x[7,5,0,2,1,3,4,6]; | ||
| 556 | $code.=<<___; | ||
| 557 | # multiplication by 0x0b | ||
| 558 | pxor @y[0], @y[1] | ||
| 559 | pxor @t[0], @y[0] | ||
| 560 | pxor @t[1], @y[1] | ||
| 561 | pshufd \$0x93, @t[2], @t[2] | ||
| 562 | pxor @t[5], @y[0] | ||
| 563 | pxor @t[6], @y[1] | ||
| 564 | pxor @t[7], @y[0] | ||
| 565 | pshufd \$0x93, @t[4], @t[4] | ||
| 566 | pxor @t[6], @t[7] # clobber t[7] | ||
| 567 | pxor @y[0], @y[1] | ||
| 568 | |||
| 569 | pxor @t[0], @y[3] | ||
| 570 | pshufd \$0x93, @t[0], @t[0] | ||
| 571 | pxor @t[1], @y[2] | ||
| 572 | pxor @t[1], @y[4] | ||
| 573 | pxor @t[2], @y[2] | ||
| 574 | pshufd \$0x93, @t[1], @t[1] | ||
| 575 | pxor @t[2], @y[3] | ||
| 576 | pxor @t[2], @y[5] | ||
| 577 | pxor @t[7], @y[2] | ||
| 578 | pshufd \$0x93, @t[2], @t[2] | ||
| 579 | pxor @t[3], @y[3] | ||
| 580 | pxor @t[3], @y[6] | ||
| 581 | pxor @t[3], @y[4] | ||
| 582 | pshufd \$0x93, @t[3], @t[3] | ||
| 583 | pxor @t[4], @y[7] | ||
| 584 | pxor @t[4], @y[5] | ||
| 585 | pxor @t[7], @y[7] | ||
| 586 | pxor @t[5], @y[3] | ||
| 587 | pxor @t[4], @y[4] | ||
| 588 | pxor @t[5], @t[7] # clobber t[7] even more | ||
| 589 | |||
| 590 | pxor @t[7], @y[5] | ||
| 591 | pshufd \$0x93, @t[4], @t[4] | ||
| 592 | pxor @t[7], @y[6] | ||
| 593 | pxor @t[7], @y[4] | ||
| 594 | |||
| 595 | pxor @t[5], @t[7] | ||
| 596 | pshufd \$0x93, @t[5], @t[5] | ||
| 597 | pxor @t[6], @t[7] # restore t[7] | ||
| 598 | |||
| 599 | # multiplication by 0x0d | ||
| 600 | pxor @y[7], @y[4] | ||
| 601 | pxor @t[4], @y[7] | ||
| 602 | pshufd \$0x93, @t[6], @t[6] | ||
| 603 | pxor @t[0], @y[2] | ||
| 604 | pxor @t[5], @y[7] | ||
| 605 | pxor @t[2], @y[2] | ||
| 606 | pshufd \$0x93, @t[7], @t[7] | ||
| 607 | |||
| 608 | pxor @y[1], @y[3] | ||
| 609 | pxor @t[1], @y[1] | ||
| 610 | pxor @t[0], @y[0] | ||
| 611 | pxor @t[0], @y[3] | ||
| 612 | pxor @t[5], @y[1] | ||
| 613 | pxor @t[5], @y[0] | ||
| 614 | pxor @t[7], @y[1] | ||
| 615 | pshufd \$0x93, @t[0], @t[0] | ||
| 616 | pxor @t[6], @y[0] | ||
| 617 | pxor @y[1], @y[3] | ||
| 618 | pxor @t[1], @y[4] | ||
| 619 | pshufd \$0x93, @t[1], @t[1] | ||
| 620 | |||
| 621 | pxor @t[7], @y[7] | ||
| 622 | pxor @t[2], @y[4] | ||
| 623 | pxor @t[2], @y[5] | ||
| 624 | pshufd \$0x93, @t[2], @t[2] | ||
| 625 | pxor @t[6], @y[2] | ||
| 626 | pxor @t[3], @t[6] # clobber t[6] | ||
| 627 | pxor @y[7], @y[4] | ||
| 628 | pxor @t[6], @y[3] | ||
| 629 | |||
| 630 | pxor @t[6], @y[6] | ||
| 631 | pxor @t[5], @y[5] | ||
| 632 | pxor @t[4], @y[6] | ||
| 633 | pshufd \$0x93, @t[4], @t[4] | ||
| 634 | pxor @t[6], @y[5] | ||
| 635 | pxor @t[7], @y[6] | ||
| 636 | pxor @t[3], @t[6] # restore t[6] | ||
| 637 | |||
| 638 | pshufd \$0x93, @t[5], @t[5] | ||
| 639 | pshufd \$0x93, @t[6], @t[6] | ||
| 640 | pshufd \$0x93, @t[7], @t[7] | ||
| 641 | pshufd \$0x93, @t[3], @t[3] | ||
| 642 | |||
| 643 | # multiplication by 0x09 | ||
| 644 | pxor @y[1], @y[4] | ||
| 645 | pxor @y[1], @t[1] # t[1]=y[1] | ||
| 646 | pxor @t[5], @t[0] # clobber t[0] | ||
| 647 | pxor @t[5], @t[1] | ||
| 648 | pxor @t[0], @y[3] | ||
| 649 | pxor @y[0], @t[0] # t[0]=y[0] | ||
| 650 | pxor @t[6], @t[1] | ||
| 651 | pxor @t[7], @t[6] # clobber t[6] | ||
| 652 | pxor @t[1], @y[4] | ||
| 653 | pxor @t[4], @y[7] | ||
| 654 | pxor @y[4], @t[4] # t[4]=y[4] | ||
| 655 | pxor @t[3], @y[6] | ||
| 656 | pxor @y[3], @t[3] # t[3]=y[3] | ||
| 657 | pxor @t[2], @y[5] | ||
| 658 | pxor @y[2], @t[2] # t[2]=y[2] | ||
| 659 | pxor @t[7], @t[3] | ||
| 660 | pxor @y[5], @t[5] # t[5]=y[5] | ||
| 661 | pxor @t[6], @t[2] | ||
| 662 | pxor @t[6], @t[5] | ||
| 663 | pxor @y[6], @t[6] # t[6]=y[6] | ||
| 664 | pxor @y[7], @t[7] # t[7]=y[7] | ||
| 665 | |||
| 666 | movdqa @t[0],@XMM[0] | ||
| 667 | movdqa @t[1],@XMM[1] | ||
| 668 | movdqa @t[2],@XMM[2] | ||
| 669 | movdqa @t[3],@XMM[3] | ||
| 670 | movdqa @t[4],@XMM[4] | ||
| 671 | movdqa @t[5],@XMM[5] | ||
| 672 | movdqa @t[6],@XMM[6] | ||
| 673 | movdqa @t[7],@XMM[7] | ||
| 674 | ___ | ||
| 675 | } | ||
| 676 | |||
| 677 | sub InvMixColumns { | ||
| 678 | my @x=@_[0..7]; | ||
| 679 | my @t=@_[8..15]; | ||
| 680 | |||
| 681 | # Thanks to Jussi Kivilinna for providing pointer to | ||
| 682 | # | ||
| 683 | # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | | ||
| 684 | # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | | ||
| 685 | # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | | ||
| 686 | # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | | ||
| 687 | |||
| 688 | $code.=<<___; | ||
| 689 | # multiplication by 0x05-0x00-0x04-0x00 | ||
| 690 | pshufd \$0x4E, @x[0], @t[0] | ||
| 691 | pshufd \$0x4E, @x[6], @t[6] | ||
| 692 | pxor @x[0], @t[0] | ||
| 693 | pshufd \$0x4E, @x[7], @t[7] | ||
| 694 | pxor @x[6], @t[6] | ||
| 695 | pshufd \$0x4E, @x[1], @t[1] | ||
| 696 | pxor @x[7], @t[7] | ||
| 697 | pshufd \$0x4E, @x[2], @t[2] | ||
| 698 | pxor @x[1], @t[1] | ||
| 699 | pshufd \$0x4E, @x[3], @t[3] | ||
| 700 | pxor @x[2], @t[2] | ||
| 701 | pxor @t[6], @x[0] | ||
| 702 | pxor @t[6], @x[1] | ||
| 703 | pshufd \$0x4E, @x[4], @t[4] | ||
| 704 | pxor @x[3], @t[3] | ||
| 705 | pxor @t[0], @x[2] | ||
| 706 | pxor @t[1], @x[3] | ||
| 707 | pshufd \$0x4E, @x[5], @t[5] | ||
| 708 | pxor @x[4], @t[4] | ||
| 709 | pxor @t[7], @x[1] | ||
| 710 | pxor @t[2], @x[4] | ||
| 711 | pxor @x[5], @t[5] | ||
| 712 | |||
| 713 | pxor @t[7], @x[2] | ||
| 714 | pxor @t[6], @x[3] | ||
| 715 | pxor @t[6], @x[4] | ||
| 716 | pxor @t[3], @x[5] | ||
| 717 | pxor @t[4], @x[6] | ||
| 718 | pxor @t[7], @x[4] | ||
| 719 | pxor @t[7], @x[5] | ||
| 720 | pxor @t[5], @x[7] | ||
| 721 | ___ | ||
| 722 | &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 | ||
| 723 | } | ||
| 724 | |||
| 725 | sub aesenc { # not used | ||
| 726 | my @b=@_[0..7]; | ||
| 727 | my @t=@_[8..15]; | ||
| 728 | $code.=<<___; | ||
| 729 | movdqa 0x30($const),@t[0] # .LSR | ||
| 730 | ___ | ||
| 731 | &ShiftRows (@b,@t[0]); | ||
| 732 | &Sbox (@b,@t); | ||
| 733 | &MixColumns (@b[0,1,4,6,3,7,2,5],@t); | ||
| 734 | } | ||
| 735 | |||
| 736 | sub aesenclast { # not used | ||
| 737 | my @b=@_[0..7]; | ||
| 738 | my @t=@_[8..15]; | ||
| 739 | $code.=<<___; | ||
| 740 | movdqa 0x40($const),@t[0] # .LSRM0 | ||
| 741 | ___ | ||
| 742 | &ShiftRows (@b,@t[0]); | ||
| 743 | &Sbox (@b,@t); | ||
| 744 | $code.=<<___ | ||
| 745 | pxor 0x00($key),@b[0] | ||
| 746 | pxor 0x10($key),@b[1] | ||
| 747 | pxor 0x20($key),@b[4] | ||
| 748 | pxor 0x30($key),@b[6] | ||
| 749 | pxor 0x40($key),@b[3] | ||
| 750 | pxor 0x50($key),@b[7] | ||
| 751 | pxor 0x60($key),@b[2] | ||
| 752 | pxor 0x70($key),@b[5] | ||
| 753 | ___ | ||
| 754 | } | ||
| 755 | |||
| 756 | sub swapmove { | ||
| 757 | my ($a,$b,$n,$mask,$t)=@_; | ||
| 758 | $code.=<<___; | ||
| 759 | movdqa $b,$t | ||
| 760 | psrlq \$$n,$b | ||
| 761 | pxor $a,$b | ||
| 762 | pand $mask,$b | ||
| 763 | pxor $b,$a | ||
| 764 | psllq \$$n,$b | ||
| 765 | pxor $t,$b | ||
| 766 | ___ | ||
| 767 | } | ||
| 768 | sub swapmove2x { | ||
| 769 | my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; | ||
| 770 | $code.=<<___; | ||
| 771 | movdqa $b0,$t0 | ||
| 772 | psrlq \$$n,$b0 | ||
| 773 | movdqa $b1,$t1 | ||
| 774 | psrlq \$$n,$b1 | ||
| 775 | pxor $a0,$b0 | ||
| 776 | pxor $a1,$b1 | ||
| 777 | pand $mask,$b0 | ||
| 778 | pand $mask,$b1 | ||
| 779 | pxor $b0,$a0 | ||
| 780 | psllq \$$n,$b0 | ||
| 781 | pxor $b1,$a1 | ||
| 782 | psllq \$$n,$b1 | ||
| 783 | pxor $t0,$b0 | ||
| 784 | pxor $t1,$b1 | ||
| 785 | ___ | ||
| 786 | } | ||
| 787 | |||
| 788 | sub bitslice { | ||
| 789 | my @x=reverse(@_[0..7]); | ||
| 790 | my ($t0,$t1,$t2,$t3)=@_[8..11]; | ||
| 791 | $code.=<<___; | ||
| 792 | movdqa 0x00($const),$t0 # .LBS0 | ||
| 793 | movdqa 0x10($const),$t1 # .LBS1 | ||
| 794 | ___ | ||
| 795 | &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); | ||
| 796 | &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
| 797 | $code.=<<___; | ||
| 798 | movdqa 0x20($const),$t0 # .LBS2 | ||
| 799 | ___ | ||
| 800 | &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); | ||
| 801 | &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
| 802 | |||
| 803 | &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); | ||
| 804 | &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); | ||
| 805 | } | ||
| 806 | |||
| 807 | $code.=<<___; | ||
| 808 | .text | ||
| 809 | |||
| 810 | .extern asm_AES_encrypt | ||
| 811 | .extern asm_AES_decrypt | ||
| 812 | |||
| 813 | .type _bsaes_encrypt8,\@abi-omnipotent | ||
| 814 | .align 64 | ||
| 815 | _bsaes_encrypt8: | ||
| 816 | _CET_ENDBR | ||
| 817 | lea .LBS0(%rip), $const # constants table | ||
| 818 | |||
| 819 | movdqa ($key), @XMM[9] # round 0 key | ||
| 820 | lea 0x10($key), $key | ||
| 821 | movdqa 0x50($const), @XMM[8] # .LM0SR | ||
| 822 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
| 823 | pxor @XMM[9], @XMM[1] | ||
| 824 | pshufb @XMM[8], @XMM[0] | ||
| 825 | pxor @XMM[9], @XMM[2] | ||
| 826 | pshufb @XMM[8], @XMM[1] | ||
| 827 | pxor @XMM[9], @XMM[3] | ||
| 828 | pshufb @XMM[8], @XMM[2] | ||
| 829 | pxor @XMM[9], @XMM[4] | ||
| 830 | pshufb @XMM[8], @XMM[3] | ||
| 831 | pxor @XMM[9], @XMM[5] | ||
| 832 | pshufb @XMM[8], @XMM[4] | ||
| 833 | pxor @XMM[9], @XMM[6] | ||
| 834 | pshufb @XMM[8], @XMM[5] | ||
| 835 | pxor @XMM[9], @XMM[7] | ||
| 836 | pshufb @XMM[8], @XMM[6] | ||
| 837 | pshufb @XMM[8], @XMM[7] | ||
| 838 | _bsaes_encrypt8_bitslice: | ||
| 839 | ___ | ||
| 840 | &bitslice (@XMM[0..7, 8..11]); | ||
| 841 | $code.=<<___; | ||
| 842 | dec $rounds | ||
| 843 | jmp .Lenc_sbox | ||
| 844 | .align 16 | ||
| 845 | .Lenc_loop: | ||
| 846 | ___ | ||
| 847 | &ShiftRows (@XMM[0..7, 8]); | ||
| 848 | $code.=".Lenc_sbox:\n"; | ||
| 849 | &Sbox (@XMM[0..7, 8..15]); | ||
| 850 | $code.=<<___; | ||
| 851 | dec $rounds | ||
| 852 | jl .Lenc_done | ||
| 853 | ___ | ||
| 854 | &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); | ||
| 855 | $code.=<<___; | ||
| 856 | movdqa 0x30($const), @XMM[8] # .LSR | ||
| 857 | jnz .Lenc_loop | ||
| 858 | movdqa 0x40($const), @XMM[8] # .LSRM0 | ||
| 859 | jmp .Lenc_loop | ||
| 860 | .align 16 | ||
| 861 | .Lenc_done: | ||
| 862 | ___ | ||
| 863 | # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb | ||
| 864 | &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); | ||
| 865 | $code.=<<___; | ||
| 866 | movdqa ($key), @XMM[8] # last round key | ||
| 867 | pxor @XMM[8], @XMM[4] | ||
| 868 | pxor @XMM[8], @XMM[6] | ||
| 869 | pxor @XMM[8], @XMM[3] | ||
| 870 | pxor @XMM[8], @XMM[7] | ||
| 871 | pxor @XMM[8], @XMM[2] | ||
| 872 | pxor @XMM[8], @XMM[5] | ||
| 873 | pxor @XMM[8], @XMM[0] | ||
| 874 | pxor @XMM[8], @XMM[1] | ||
| 875 | ret | ||
| 876 | .size _bsaes_encrypt8,.-_bsaes_encrypt8 | ||
| 877 | |||
| 878 | .type _bsaes_decrypt8,\@abi-omnipotent | ||
| 879 | .align 64 | ||
| 880 | _bsaes_decrypt8: | ||
| 881 | _CET_ENDBR | ||
| 882 | lea .LBS0(%rip), $const # constants table | ||
| 883 | |||
| 884 | movdqa ($key), @XMM[9] # round 0 key | ||
| 885 | lea 0x10($key), $key | ||
| 886 | movdqa -0x30($const), @XMM[8] # .LM0ISR | ||
| 887 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
| 888 | pxor @XMM[9], @XMM[1] | ||
| 889 | pshufb @XMM[8], @XMM[0] | ||
| 890 | pxor @XMM[9], @XMM[2] | ||
| 891 | pshufb @XMM[8], @XMM[1] | ||
| 892 | pxor @XMM[9], @XMM[3] | ||
| 893 | pshufb @XMM[8], @XMM[2] | ||
| 894 | pxor @XMM[9], @XMM[4] | ||
| 895 | pshufb @XMM[8], @XMM[3] | ||
| 896 | pxor @XMM[9], @XMM[5] | ||
| 897 | pshufb @XMM[8], @XMM[4] | ||
| 898 | pxor @XMM[9], @XMM[6] | ||
| 899 | pshufb @XMM[8], @XMM[5] | ||
| 900 | pxor @XMM[9], @XMM[7] | ||
| 901 | pshufb @XMM[8], @XMM[6] | ||
| 902 | pshufb @XMM[8], @XMM[7] | ||
| 903 | ___ | ||
| 904 | &bitslice (@XMM[0..7, 8..11]); | ||
| 905 | $code.=<<___; | ||
| 906 | dec $rounds | ||
| 907 | jmp .Ldec_sbox | ||
| 908 | .align 16 | ||
| 909 | .Ldec_loop: | ||
| 910 | ___ | ||
| 911 | &ShiftRows (@XMM[0..7, 8]); | ||
| 912 | $code.=".Ldec_sbox:\n"; | ||
| 913 | &InvSbox (@XMM[0..7, 8..15]); | ||
| 914 | $code.=<<___; | ||
| 915 | dec $rounds | ||
| 916 | jl .Ldec_done | ||
| 917 | ___ | ||
| 918 | &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); | ||
| 919 | $code.=<<___; | ||
| 920 | movdqa -0x10($const), @XMM[8] # .LISR | ||
| 921 | jnz .Ldec_loop | ||
| 922 | movdqa -0x20($const), @XMM[8] # .LISRM0 | ||
| 923 | jmp .Ldec_loop | ||
| 924 | .align 16 | ||
| 925 | .Ldec_done: | ||
| 926 | ___ | ||
| 927 | &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); | ||
| 928 | $code.=<<___; | ||
| 929 | movdqa ($key), @XMM[8] # last round key | ||
| 930 | pxor @XMM[8], @XMM[6] | ||
| 931 | pxor @XMM[8], @XMM[4] | ||
| 932 | pxor @XMM[8], @XMM[2] | ||
| 933 | pxor @XMM[8], @XMM[7] | ||
| 934 | pxor @XMM[8], @XMM[3] | ||
| 935 | pxor @XMM[8], @XMM[5] | ||
| 936 | pxor @XMM[8], @XMM[0] | ||
| 937 | pxor @XMM[8], @XMM[1] | ||
| 938 | ret | ||
| 939 | .size _bsaes_decrypt8,.-_bsaes_decrypt8 | ||
| 940 | ___ | ||
| 941 | } | ||
| 942 | { | ||
| 943 | my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); | ||
| 944 | |||
| 945 | sub bitslice_key { | ||
| 946 | my @x=reverse(@_[0..7]); | ||
| 947 | my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; | ||
| 948 | |||
| 949 | &swapmove (@x[0,1],1,$bs0,$t2,$t3); | ||
| 950 | $code.=<<___; | ||
| 951 | #&swapmove(@x[2,3],1,$t0,$t2,$t3); | ||
| 952 | movdqa @x[0], @x[2] | ||
| 953 | movdqa @x[1], @x[3] | ||
| 954 | ___ | ||
| 955 | #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
| 956 | |||
| 957 | &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); | ||
| 958 | $code.=<<___; | ||
| 959 | #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
| 960 | movdqa @x[0], @x[4] | ||
| 961 | movdqa @x[2], @x[6] | ||
| 962 | movdqa @x[1], @x[5] | ||
| 963 | movdqa @x[3], @x[7] | ||
| 964 | ___ | ||
| 965 | &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); | ||
| 966 | &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); | ||
| 967 | } | ||
| 968 | |||
| 969 | $code.=<<___; | ||
| 970 | .type _bsaes_key_convert,\@abi-omnipotent | ||
| 971 | .align 16 | ||
| 972 | _bsaes_key_convert: | ||
| 973 | _CET_ENDBR | ||
| 974 | lea .Lmasks(%rip), $const | ||
| 975 | movdqu ($inp), %xmm7 # load round 0 key | ||
| 976 | lea 0x10($inp), $inp | ||
| 977 | movdqa 0x00($const), %xmm0 # 0x01... | ||
| 978 | movdqa 0x10($const), %xmm1 # 0x02... | ||
| 979 | movdqa 0x20($const), %xmm2 # 0x04... | ||
| 980 | movdqa 0x30($const), %xmm3 # 0x08... | ||
| 981 | movdqa 0x40($const), %xmm4 # .LM0 | ||
| 982 | pcmpeqd %xmm5, %xmm5 # .LNOT | ||
| 983 | |||
| 984 | movdqu ($inp), %xmm6 # load round 1 key | ||
| 985 | movdqa %xmm7, ($out) # save round 0 key | ||
| 986 | lea 0x10($out), $out | ||
| 987 | dec $rounds | ||
| 988 | jmp .Lkey_loop | ||
| 989 | .align 16 | ||
| 990 | .Lkey_loop: | ||
| 991 | pshufb %xmm4, %xmm6 # .LM0 | ||
| 992 | |||
| 993 | movdqa %xmm0, %xmm8 | ||
| 994 | movdqa %xmm1, %xmm9 | ||
| 995 | |||
| 996 | pand %xmm6, %xmm8 | ||
| 997 | pand %xmm6, %xmm9 | ||
| 998 | movdqa %xmm2, %xmm10 | ||
| 999 | pcmpeqb %xmm0, %xmm8 | ||
| 1000 | psllq \$4, %xmm0 # 0x10... | ||
| 1001 | movdqa %xmm3, %xmm11 | ||
| 1002 | pcmpeqb %xmm1, %xmm9 | ||
| 1003 | psllq \$4, %xmm1 # 0x20... | ||
| 1004 | |||
| 1005 | pand %xmm6, %xmm10 | ||
| 1006 | pand %xmm6, %xmm11 | ||
| 1007 | movdqa %xmm0, %xmm12 | ||
| 1008 | pcmpeqb %xmm2, %xmm10 | ||
| 1009 | psllq \$4, %xmm2 # 0x40... | ||
| 1010 | movdqa %xmm1, %xmm13 | ||
| 1011 | pcmpeqb %xmm3, %xmm11 | ||
| 1012 | psllq \$4, %xmm3 # 0x80... | ||
| 1013 | |||
| 1014 | movdqa %xmm2, %xmm14 | ||
| 1015 | movdqa %xmm3, %xmm15 | ||
| 1016 | pxor %xmm5, %xmm8 # "pnot" | ||
| 1017 | pxor %xmm5, %xmm9 | ||
| 1018 | |||
| 1019 | pand %xmm6, %xmm12 | ||
| 1020 | pand %xmm6, %xmm13 | ||
| 1021 | movdqa %xmm8, 0x00($out) # write bit-sliced round key | ||
| 1022 | pcmpeqb %xmm0, %xmm12 | ||
| 1023 | psrlq \$4, %xmm0 # 0x01... | ||
| 1024 | movdqa %xmm9, 0x10($out) | ||
| 1025 | pcmpeqb %xmm1, %xmm13 | ||
| 1026 | psrlq \$4, %xmm1 # 0x02... | ||
| 1027 | lea 0x10($inp), $inp | ||
| 1028 | |||
| 1029 | pand %xmm6, %xmm14 | ||
| 1030 | pand %xmm6, %xmm15 | ||
| 1031 | movdqa %xmm10, 0x20($out) | ||
| 1032 | pcmpeqb %xmm2, %xmm14 | ||
| 1033 | psrlq \$4, %xmm2 # 0x04... | ||
| 1034 | movdqa %xmm11, 0x30($out) | ||
| 1035 | pcmpeqb %xmm3, %xmm15 | ||
| 1036 | psrlq \$4, %xmm3 # 0x08... | ||
| 1037 | movdqu ($inp), %xmm6 # load next round key | ||
| 1038 | |||
| 1039 | pxor %xmm5, %xmm13 # "pnot" | ||
| 1040 | pxor %xmm5, %xmm14 | ||
| 1041 | movdqa %xmm12, 0x40($out) | ||
| 1042 | movdqa %xmm13, 0x50($out) | ||
| 1043 | movdqa %xmm14, 0x60($out) | ||
| 1044 | movdqa %xmm15, 0x70($out) | ||
| 1045 | lea 0x80($out),$out | ||
| 1046 | dec $rounds | ||
| 1047 | jnz .Lkey_loop | ||
| 1048 | |||
| 1049 | movdqa 0x50($const), %xmm7 # .L63 | ||
| 1050 | #movdqa %xmm6, ($out) # don't save last round key | ||
| 1051 | ret | ||
| 1052 | .size _bsaes_key_convert,.-_bsaes_key_convert | ||
| 1053 | ___ | ||
| 1054 | } | ||
| 1055 | |||
| 1056 | if (0 && !$win64) { # following four functions are unsupported interface | ||
| 1057 | # used for benchmarking... | ||
| 1058 | $code.=<<___; | ||
| 1059 | .globl bsaes_enc_key_convert | ||
| 1060 | .type bsaes_enc_key_convert,\@function,2 | ||
| 1061 | .align 16 | ||
| 1062 | bsaes_enc_key_convert: | ||
| 1063 | _CET_ENDBR | ||
| 1064 | mov 240($inp),%r10d # pass rounds | ||
| 1065 | mov $inp,%rcx # pass key | ||
| 1066 | mov $out,%rax # pass key schedule | ||
| 1067 | call _bsaes_key_convert | ||
| 1068 | pxor %xmm6,%xmm7 # fix up last round key | ||
| 1069 | movdqa %xmm7,(%rax) # save last round key | ||
| 1070 | ret | ||
| 1071 | .size bsaes_enc_key_convert,.-bsaes_enc_key_convert | ||
| 1072 | |||
| 1073 | .globl bsaes_encrypt_128 | ||
| 1074 | .type bsaes_encrypt_128,\@function,4 | ||
| 1075 | .align 16 | ||
| 1076 | bsaes_encrypt_128: | ||
| 1077 | _CET_ENDBR | ||
| 1078 | .Lenc128_loop: # landing pad stays above the loop label, as in bsaes_decrypt_128 | ||
| 1079 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1080 | movdqu 0x10($inp), @XMM[1] | ||
| 1081 | movdqu 0x20($inp), @XMM[2] | ||
| 1082 | movdqu 0x30($inp), @XMM[3] | ||
| 1083 | movdqu 0x40($inp), @XMM[4] | ||
| 1084 | movdqu 0x50($inp), @XMM[5] | ||
| 1085 | movdqu 0x60($inp), @XMM[6] | ||
| 1086 | movdqu 0x70($inp), @XMM[7] | ||
| 1087 | mov $key, %rax # pass the $key | ||
| 1088 | lea 0x80($inp), $inp | ||
| 1089 | mov \$10,%r10d | ||
| 1090 | |||
| 1091 | call _bsaes_encrypt8 | ||
| 1092 | |||
| 1093 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1094 | movdqu @XMM[1], 0x10($out) | ||
| 1095 | movdqu @XMM[4], 0x20($out) | ||
| 1096 | movdqu @XMM[6], 0x30($out) | ||
| 1097 | movdqu @XMM[3], 0x40($out) | ||
| 1098 | movdqu @XMM[7], 0x50($out) | ||
| 1099 | movdqu @XMM[2], 0x60($out) | ||
| 1100 | movdqu @XMM[5], 0x70($out) | ||
| 1101 | lea 0x80($out), $out | ||
| 1102 | sub \$0x80,$len | ||
| 1103 | ja .Lenc128_loop | ||
| 1104 | ret | ||
| 1105 | .size bsaes_encrypt_128,.-bsaes_encrypt_128 | ||
| 1106 | |||
| 1107 | .globl bsaes_dec_key_convert | ||
| 1108 | .type bsaes_dec_key_convert,\@function,2 | ||
| 1109 | .align 16 | ||
| 1110 | bsaes_dec_key_convert: | ||
| 1111 | _CET_ENDBR | ||
| 1112 | mov 240($inp),%r10d # pass rounds | ||
| 1113 | mov $inp,%rcx # pass key | ||
| 1114 | mov $out,%rax # pass key schedule | ||
| 1115 | call _bsaes_key_convert | ||
| 1116 | pxor ($out),%xmm7 # fix up round 0 key | ||
| 1117 | movdqa %xmm6,(%rax) # save last round key | ||
| 1118 | movdqa %xmm7,($out) | ||
| 1119 | ret | ||
| 1120 | .size bsaes_dec_key_convert,.-bsaes_dec_key_convert | ||
| 1121 | |||
| 1122 | .globl bsaes_decrypt_128 | ||
| 1123 | .type bsaes_decrypt_128,\@function,4 | ||
| 1124 | .align 16 | ||
| 1125 | bsaes_decrypt_128: | ||
| 1126 | _CET_ENDBR | ||
| 1127 | .Ldec128_loop: | ||
| 1128 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1129 | movdqu 0x10($inp), @XMM[1] | ||
| 1130 | movdqu 0x20($inp), @XMM[2] | ||
| 1131 | movdqu 0x30($inp), @XMM[3] | ||
| 1132 | movdqu 0x40($inp), @XMM[4] | ||
| 1133 | movdqu 0x50($inp), @XMM[5] | ||
| 1134 | movdqu 0x60($inp), @XMM[6] | ||
| 1135 | movdqu 0x70($inp), @XMM[7] | ||
| 1136 | mov $key, %rax # pass the $key | ||
| 1137 | lea 0x80($inp), $inp | ||
| 1138 | mov \$10,%r10d | ||
| 1139 | |||
| 1140 | call _bsaes_decrypt8 | ||
| 1141 | |||
| 1142 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1143 | movdqu @XMM[1], 0x10($out) | ||
| 1144 | movdqu @XMM[6], 0x20($out) | ||
| 1145 | movdqu @XMM[4], 0x30($out) | ||
| 1146 | movdqu @XMM[2], 0x40($out) | ||
| 1147 | movdqu @XMM[7], 0x50($out) | ||
| 1148 | movdqu @XMM[3], 0x60($out) | ||
| 1149 | movdqu @XMM[5], 0x70($out) | ||
| 1150 | lea 0x80($out), $out | ||
| 1151 | sub \$0x80,$len | ||
| 1152 | ja .Ldec128_loop | ||
| 1153 | ret | ||
| 1154 | .size bsaes_decrypt_128,.-bsaes_decrypt_128 | ||
| 1155 | ___ | ||
| 1156 | } | ||
| 1157 | { | ||
| 1158 | ###################################################################### | ||
| 1159 | # | ||
| 1160 | # OpenSSL interface | ||
| 1161 | # | ||
| 1162 | my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") | ||
| 1163 | : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | ||
| 1164 | my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); | ||
| 1165 | |||
| 1166 | if ($ecb) { | ||
| 1167 | $code.=<<___; | ||
| 1168 | .globl bsaes_ecb_encrypt_blocks | ||
| 1169 | .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent | ||
| 1170 | .align 16 | ||
| 1171 | bsaes_ecb_encrypt_blocks: | ||
| 1172 | _CET_ENDBR | ||
| 1173 | mov %rsp, %rax | ||
| 1174 | .Lecb_enc_prologue: | ||
| 1175 | push %rbp | ||
| 1176 | push %rbx | ||
| 1177 | push %r12 | ||
| 1178 | push %r13 | ||
| 1179 | push %r14 | ||
| 1180 | push %r15 | ||
| 1181 | lea -0x48(%rsp),%rsp | ||
| 1182 | ___ | ||
| 1183 | $code.=<<___ if ($win64); | ||
| 1184 | lea -0xa0(%rsp), %rsp | ||
| 1185 | movaps %xmm6, 0x40(%rsp) | ||
| 1186 | movaps %xmm7, 0x50(%rsp) | ||
| 1187 | movaps %xmm8, 0x60(%rsp) | ||
| 1188 | movaps %xmm9, 0x70(%rsp) | ||
| 1189 | movaps %xmm10, 0x80(%rsp) | ||
| 1190 | movaps %xmm11, 0x90(%rsp) | ||
| 1191 | movaps %xmm12, 0xa0(%rsp) | ||
| 1192 | movaps %xmm13, 0xb0(%rsp) | ||
| 1193 | movaps %xmm14, 0xc0(%rsp) | ||
| 1194 | movaps %xmm15, 0xd0(%rsp) | ||
| 1195 | .Lecb_enc_body: | ||
| 1196 | ___ | ||
| 1197 | $code.=<<___; | ||
| 1198 | mov %rsp,%rbp # backup %rsp | ||
| 1199 | mov 240($arg4),%eax # rounds | ||
| 1200 | mov $arg1,$inp # backup arguments | ||
| 1201 | mov $arg2,$out | ||
| 1202 | mov $arg3,$len | ||
| 1203 | mov $arg4,$key | ||
| 1204 | cmp \$8,$arg3 | ||
| 1205 | jb .Lecb_enc_short | ||
| 1206 | |||
| 1207 | mov %eax,%ebx # backup rounds | ||
| 1208 | shl \$7,%rax # 128 bytes per inner round key | ||
| 1209 | sub \$`128-32`,%rax # size of bit-sliced key schedule | ||
| 1210 | sub %rax,%rsp | ||
| 1211 | mov %rsp,%rax # pass key schedule | ||
| 1212 | mov $key,%rcx # pass key | ||
| 1213 | mov %ebx,%r10d # pass rounds | ||
| 1214 | call _bsaes_key_convert | ||
| 1215 | pxor %xmm6,%xmm7 # fix up last round key | ||
| 1216 | movdqa %xmm7,(%rax) # save last round key | ||
| 1217 | |||
| 1218 | sub \$8,$len | ||
| 1219 | .Lecb_enc_loop: | ||
| 1220 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1221 | movdqu 0x10($inp), @XMM[1] | ||
| 1222 | movdqu 0x20($inp), @XMM[2] | ||
| 1223 | movdqu 0x30($inp), @XMM[3] | ||
| 1224 | movdqu 0x40($inp), @XMM[4] | ||
| 1225 | movdqu 0x50($inp), @XMM[5] | ||
| 1226 | mov %rsp, %rax # pass key schedule | ||
| 1227 | movdqu 0x60($inp), @XMM[6] | ||
| 1228 | mov %ebx,%r10d # pass rounds | ||
| 1229 | movdqu 0x70($inp), @XMM[7] | ||
| 1230 | lea 0x80($inp), $inp | ||
| 1231 | |||
| 1232 | call _bsaes_encrypt8 | ||
| 1233 | |||
| 1234 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1235 | movdqu @XMM[1], 0x10($out) | ||
| 1236 | movdqu @XMM[4], 0x20($out) | ||
| 1237 | movdqu @XMM[6], 0x30($out) | ||
| 1238 | movdqu @XMM[3], 0x40($out) | ||
| 1239 | movdqu @XMM[7], 0x50($out) | ||
| 1240 | movdqu @XMM[2], 0x60($out) | ||
| 1241 | movdqu @XMM[5], 0x70($out) | ||
| 1242 | lea 0x80($out), $out | ||
| 1243 | sub \$8,$len | ||
| 1244 | jnc .Lecb_enc_loop | ||
| 1245 | |||
| 1246 | add \$8,$len | ||
| 1247 | jz .Lecb_enc_done | ||
| 1248 | |||
| 1249 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1250 | mov %rsp, %rax # pass key schedule | ||
| 1251 | mov %ebx,%r10d # pass rounds | ||
| 1252 | cmp \$2,$len | ||
| 1253 | jb .Lecb_enc_one | ||
| 1254 | movdqu 0x10($inp), @XMM[1] | ||
| 1255 | je .Lecb_enc_two | ||
| 1256 | movdqu 0x20($inp), @XMM[2] | ||
| 1257 | cmp \$4,$len | ||
| 1258 | jb .Lecb_enc_three | ||
| 1259 | movdqu 0x30($inp), @XMM[3] | ||
| 1260 | je .Lecb_enc_four | ||
| 1261 | movdqu 0x40($inp), @XMM[4] | ||
| 1262 | cmp \$6,$len | ||
| 1263 | jb .Lecb_enc_five | ||
| 1264 | movdqu 0x50($inp), @XMM[5] | ||
| 1265 | je .Lecb_enc_six | ||
| 1266 | movdqu 0x60($inp), @XMM[6] | ||
| 1267 | call _bsaes_encrypt8 | ||
| 1268 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1269 | movdqu @XMM[1], 0x10($out) | ||
| 1270 | movdqu @XMM[4], 0x20($out) | ||
| 1271 | movdqu @XMM[6], 0x30($out) | ||
| 1272 | movdqu @XMM[3], 0x40($out) | ||
| 1273 | movdqu @XMM[7], 0x50($out) | ||
| 1274 | movdqu @XMM[2], 0x60($out) | ||
| 1275 | jmp .Lecb_enc_done | ||
| 1276 | .align 16 | ||
| 1277 | .Lecb_enc_six: | ||
| 1278 | call _bsaes_encrypt8 | ||
| 1279 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1280 | movdqu @XMM[1], 0x10($out) | ||
| 1281 | movdqu @XMM[4], 0x20($out) | ||
| 1282 | movdqu @XMM[6], 0x30($out) | ||
| 1283 | movdqu @XMM[3], 0x40($out) | ||
| 1284 | movdqu @XMM[7], 0x50($out) | ||
| 1285 | jmp .Lecb_enc_done | ||
| 1286 | .align 16 | ||
| 1287 | .Lecb_enc_five: | ||
| 1288 | call _bsaes_encrypt8 | ||
| 1289 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1290 | movdqu @XMM[1], 0x10($out) | ||
| 1291 | movdqu @XMM[4], 0x20($out) | ||
| 1292 | movdqu @XMM[6], 0x30($out) | ||
| 1293 | movdqu @XMM[3], 0x40($out) | ||
| 1294 | jmp .Lecb_enc_done | ||
| 1295 | .align 16 | ||
| 1296 | .Lecb_enc_four: | ||
| 1297 | call _bsaes_encrypt8 | ||
| 1298 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1299 | movdqu @XMM[1], 0x10($out) | ||
| 1300 | movdqu @XMM[4], 0x20($out) | ||
| 1301 | movdqu @XMM[6], 0x30($out) | ||
| 1302 | jmp .Lecb_enc_done | ||
| 1303 | .align 16 | ||
| 1304 | .Lecb_enc_three: | ||
| 1305 | call _bsaes_encrypt8 | ||
| 1306 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1307 | movdqu @XMM[1], 0x10($out) | ||
| 1308 | movdqu @XMM[4], 0x20($out) | ||
| 1309 | jmp .Lecb_enc_done | ||
| 1310 | .align 16 | ||
| 1311 | .Lecb_enc_two: | ||
| 1312 | call _bsaes_encrypt8 | ||
| 1313 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1314 | movdqu @XMM[1], 0x10($out) | ||
| 1315 | jmp .Lecb_enc_done | ||
| 1316 | .align 16 | ||
| 1317 | .Lecb_enc_one: | ||
| 1318 | call _bsaes_encrypt8 | ||
| 1319 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1320 | jmp .Lecb_enc_done | ||
| 1321 | .align 16 | ||
| 1322 | .Lecb_enc_short: | ||
| 1323 | lea ($inp), $arg1 | ||
| 1324 | lea ($out), $arg2 | ||
| 1325 | lea ($key), $arg3 | ||
| 1326 | call asm_AES_encrypt | ||
| 1327 | lea 16($inp), $inp | ||
| 1328 | lea 16($out), $out | ||
| 1329 | dec $len | ||
| 1330 | jnz .Lecb_enc_short | ||
| 1331 | |||
| 1332 | .Lecb_enc_done: | ||
| 1333 | lea (%rsp),%rax # wipe from current stack top up to %rbp | ||
| 1334 | pxor %xmm0, %xmm0 | ||
| 1335 | .Lecb_enc_bzero: # wipe key schedule [if any] | ||
| 1336 | movdqa %xmm0, 0x00(%rax) | ||
| 1337 | movdqa %xmm0, 0x10(%rax) | ||
| 1338 | lea 0x20(%rax), %rax | ||
| 1339 | cmp %rax, %rbp | ||
| 1340 | ja .Lecb_enc_bzero # loop while %rbp > %rax (same test as .Lcbc_dec_bzero) | ||
| 1341 | |||
| 1342 | lea (%rbp),%rsp # restore %rsp | ||
| 1343 | ___ | ||
| 1344 | $code.=<<___ if ($win64); | ||
| 1345 | movaps 0x40(%rbp), %xmm6 | ||
| 1346 | movaps 0x50(%rbp), %xmm7 | ||
| 1347 | movaps 0x60(%rbp), %xmm8 | ||
| 1348 | movaps 0x70(%rbp), %xmm9 | ||
| 1349 | movaps 0x80(%rbp), %xmm10 | ||
| 1350 | movaps 0x90(%rbp), %xmm11 | ||
| 1351 | movaps 0xa0(%rbp), %xmm12 | ||
| 1352 | movaps 0xb0(%rbp), %xmm13 | ||
| 1353 | movaps 0xc0(%rbp), %xmm14 | ||
| 1354 | movaps 0xd0(%rbp), %xmm15 | ||
| 1355 | lea 0xa0(%rbp), %rsp | ||
| 1356 | ___ | ||
| 1357 | $code.=<<___; | ||
| 1358 | mov 0x48(%rsp), %r15 | ||
| 1359 | mov 0x50(%rsp), %r14 | ||
| 1360 | mov 0x58(%rsp), %r13 | ||
| 1361 | mov 0x60(%rsp), %r12 | ||
| 1362 | mov 0x68(%rsp), %rbx | ||
| 1363 | mov 0x70(%rsp), %rax | ||
| 1364 | lea 0x78(%rsp), %rsp | ||
| 1365 | mov %rax, %rbp | ||
| 1366 | .Lecb_enc_epilogue: | ||
| 1367 | ret | ||
| 1368 | .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks | ||
| 1369 | |||
| 1370 | .globl bsaes_ecb_decrypt_blocks | ||
| 1371 | .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent | ||
| 1372 | .align 16 | ||
| 1373 | bsaes_ecb_decrypt_blocks: | ||
| 1374 | _CET_ENDBR | ||
| 1375 | mov %rsp, %rax | ||
| 1376 | .Lecb_dec_prologue: | ||
| 1377 | push %rbp | ||
| 1378 | push %rbx | ||
| 1379 | push %r12 | ||
| 1380 | push %r13 | ||
| 1381 | push %r14 | ||
| 1382 | push %r15 | ||
| 1383 | lea -0x48(%rsp),%rsp | ||
| 1384 | ___ | ||
| 1385 | $code.=<<___ if ($win64); | ||
| 1386 | lea -0xa0(%rsp), %rsp | ||
| 1387 | movaps %xmm6, 0x40(%rsp) | ||
| 1388 | movaps %xmm7, 0x50(%rsp) | ||
| 1389 | movaps %xmm8, 0x60(%rsp) | ||
| 1390 | movaps %xmm9, 0x70(%rsp) | ||
| 1391 | movaps %xmm10, 0x80(%rsp) | ||
| 1392 | movaps %xmm11, 0x90(%rsp) | ||
| 1393 | movaps %xmm12, 0xa0(%rsp) | ||
| 1394 | movaps %xmm13, 0xb0(%rsp) | ||
| 1395 | movaps %xmm14, 0xc0(%rsp) | ||
| 1396 | movaps %xmm15, 0xd0(%rsp) | ||
| 1397 | .Lecb_dec_body: | ||
| 1398 | ___ | ||
| 1399 | $code.=<<___; | ||
| 1400 | mov %rsp,%rbp # backup %rsp | ||
| 1401 | mov 240($arg4),%eax # rounds | ||
| 1402 | mov $arg1,$inp # backup arguments | ||
| 1403 | mov $arg2,$out | ||
| 1404 | mov $arg3,$len | ||
| 1405 | mov $arg4,$key | ||
| 1406 | cmp \$8,$arg3 | ||
| 1407 | jb .Lecb_dec_short | ||
| 1408 | |||
| 1409 | mov %eax,%ebx # backup rounds | ||
| 1410 | shl \$7,%rax # 128 bytes per inner round key | ||
| 1411 | sub \$`128-32`,%rax # size of bit-sliced key schedule | ||
| 1412 | sub %rax,%rsp | ||
| 1413 | mov %rsp,%rax # pass key schedule | ||
| 1414 | mov $key,%rcx # pass key | ||
| 1415 | mov %ebx,%r10d # pass rounds | ||
| 1416 | call _bsaes_key_convert | ||
| 1417 | pxor (%rsp),%xmm7 # fix up 0 round key | ||
| 1418 | movdqa %xmm6,(%rax) # save last round key | ||
| 1419 | movdqa %xmm7,(%rsp) | ||
| 1420 | |||
| 1421 | sub \$8,$len | ||
| 1422 | .Lecb_dec_loop: | ||
| 1423 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1424 | movdqu 0x10($inp), @XMM[1] | ||
| 1425 | movdqu 0x20($inp), @XMM[2] | ||
| 1426 | movdqu 0x30($inp), @XMM[3] | ||
| 1427 | movdqu 0x40($inp), @XMM[4] | ||
| 1428 | movdqu 0x50($inp), @XMM[5] | ||
| 1429 | mov %rsp, %rax # pass key schedule | ||
| 1430 | movdqu 0x60($inp), @XMM[6] | ||
| 1431 | mov %ebx,%r10d # pass rounds | ||
| 1432 | movdqu 0x70($inp), @XMM[7] | ||
| 1433 | lea 0x80($inp), $inp | ||
| 1434 | |||
| 1435 | call _bsaes_decrypt8 | ||
| 1436 | |||
| 1437 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1438 | movdqu @XMM[1], 0x10($out) | ||
| 1439 | movdqu @XMM[6], 0x20($out) | ||
| 1440 | movdqu @XMM[4], 0x30($out) | ||
| 1441 | movdqu @XMM[2], 0x40($out) | ||
| 1442 | movdqu @XMM[7], 0x50($out) | ||
| 1443 | movdqu @XMM[3], 0x60($out) | ||
| 1444 | movdqu @XMM[5], 0x70($out) | ||
| 1445 | lea 0x80($out), $out | ||
| 1446 | sub \$8,$len | ||
| 1447 | jnc .Lecb_dec_loop | ||
| 1448 | |||
| 1449 | add \$8,$len | ||
| 1450 | jz .Lecb_dec_done | ||
| 1451 | |||
| 1452 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1453 | mov %rsp, %rax # pass key schedule | ||
| 1454 | mov %ebx,%r10d # pass rounds | ||
| 1455 | cmp \$2,$len | ||
| 1456 | jb .Lecb_dec_one | ||
| 1457 | movdqu 0x10($inp), @XMM[1] | ||
| 1458 | je .Lecb_dec_two | ||
| 1459 | movdqu 0x20($inp), @XMM[2] | ||
| 1460 | cmp \$4,$len | ||
| 1461 | jb .Lecb_dec_three | ||
| 1462 | movdqu 0x30($inp), @XMM[3] | ||
| 1463 | je .Lecb_dec_four | ||
| 1464 | movdqu 0x40($inp), @XMM[4] | ||
| 1465 | cmp \$6,$len | ||
| 1466 | jb .Lecb_dec_five | ||
| 1467 | movdqu 0x50($inp), @XMM[5] | ||
| 1468 | je .Lecb_dec_six | ||
| 1469 | movdqu 0x60($inp), @XMM[6] | ||
| 1470 | call _bsaes_decrypt8 | ||
| 1471 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1472 | movdqu @XMM[1], 0x10($out) | ||
| 1473 | movdqu @XMM[6], 0x20($out) | ||
| 1474 | movdqu @XMM[4], 0x30($out) | ||
| 1475 | movdqu @XMM[2], 0x40($out) | ||
| 1476 | movdqu @XMM[7], 0x50($out) | ||
| 1477 | movdqu @XMM[3], 0x60($out) | ||
| 1478 | jmp .Lecb_dec_done | ||
| 1479 | .align 16 | ||
| 1480 | .Lecb_dec_six: | ||
| 1481 | call _bsaes_decrypt8 | ||
| 1482 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1483 | movdqu @XMM[1], 0x10($out) | ||
| 1484 | movdqu @XMM[6], 0x20($out) | ||
| 1485 | movdqu @XMM[4], 0x30($out) | ||
| 1486 | movdqu @XMM[2], 0x40($out) | ||
| 1487 | movdqu @XMM[7], 0x50($out) | ||
| 1488 | jmp .Lecb_dec_done | ||
| 1489 | .align 16 | ||
| 1490 | .Lecb_dec_five: | ||
| 1491 | call _bsaes_decrypt8 | ||
| 1492 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1493 | movdqu @XMM[1], 0x10($out) | ||
| 1494 | movdqu @XMM[6], 0x20($out) | ||
| 1495 | movdqu @XMM[4], 0x30($out) | ||
| 1496 | movdqu @XMM[2], 0x40($out) | ||
| 1497 | jmp .Lecb_dec_done | ||
| 1498 | .align 16 | ||
| 1499 | .Lecb_dec_four: | ||
| 1500 | call _bsaes_decrypt8 | ||
| 1501 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1502 | movdqu @XMM[1], 0x10($out) | ||
| 1503 | movdqu @XMM[6], 0x20($out) | ||
| 1504 | movdqu @XMM[4], 0x30($out) | ||
| 1505 | jmp .Lecb_dec_done | ||
| 1506 | .align 16 | ||
| 1507 | .Lecb_dec_three: | ||
| 1508 | call _bsaes_decrypt8 | ||
| 1509 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1510 | movdqu @XMM[1], 0x10($out) | ||
| 1511 | movdqu @XMM[6], 0x20($out) | ||
| 1512 | jmp .Lecb_dec_done | ||
| 1513 | .align 16 | ||
| 1514 | .Lecb_dec_two: | ||
| 1515 | call _bsaes_decrypt8 | ||
| 1516 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1517 | movdqu @XMM[1], 0x10($out) | ||
| 1518 | jmp .Lecb_dec_done | ||
| 1519 | .align 16 | ||
| 1520 | .Lecb_dec_one: | ||
| 1521 | call _bsaes_decrypt8 | ||
| 1522 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1523 | jmp .Lecb_dec_done | ||
| 1524 | .align 16 | ||
| 1525 | .Lecb_dec_short: | ||
| 1526 | lea ($inp), $arg1 | ||
| 1527 | lea ($out), $arg2 | ||
| 1528 | lea ($key), $arg3 | ||
| 1529 | call asm_AES_decrypt | ||
| 1530 | lea 16($inp), $inp | ||
| 1531 | lea 16($out), $out | ||
| 1532 | dec $len | ||
| 1533 | jnz .Lecb_dec_short | ||
| 1534 | |||
| 1535 | .Lecb_dec_done: | ||
| 1536 | lea (%rsp),%rax # wipe from current stack top up to %rbp | ||
| 1537 | pxor %xmm0, %xmm0 | ||
| 1538 | .Lecb_dec_bzero: # wipe key schedule [if any] | ||
| 1539 | movdqa %xmm0, 0x00(%rax) | ||
| 1540 | movdqa %xmm0, 0x10(%rax) | ||
| 1541 | lea 0x20(%rax), %rax | ||
| 1542 | cmp %rax, %rbp | ||
| 1543 | ja .Lecb_dec_bzero # loop while %rbp > %rax (same test as .Lcbc_dec_bzero) | ||
| 1544 | |||
| 1545 | lea (%rbp),%rsp # restore %rsp | ||
| 1546 | ___ | ||
| 1547 | $code.=<<___ if ($win64); | ||
| 1548 | movaps 0x40(%rbp), %xmm6 | ||
| 1549 | movaps 0x50(%rbp), %xmm7 | ||
| 1550 | movaps 0x60(%rbp), %xmm8 | ||
| 1551 | movaps 0x70(%rbp), %xmm9 | ||
| 1552 | movaps 0x80(%rbp), %xmm10 | ||
| 1553 | movaps 0x90(%rbp), %xmm11 | ||
| 1554 | movaps 0xa0(%rbp), %xmm12 | ||
| 1555 | movaps 0xb0(%rbp), %xmm13 | ||
| 1556 | movaps 0xc0(%rbp), %xmm14 | ||
| 1557 | movaps 0xd0(%rbp), %xmm15 | ||
| 1558 | lea 0xa0(%rbp), %rsp | ||
| 1559 | ___ | ||
| 1560 | $code.=<<___; | ||
| 1561 | mov 0x48(%rsp), %r15 | ||
| 1562 | mov 0x50(%rsp), %r14 | ||
| 1563 | mov 0x58(%rsp), %r13 | ||
| 1564 | mov 0x60(%rsp), %r12 | ||
| 1565 | mov 0x68(%rsp), %rbx | ||
| 1566 | mov 0x70(%rsp), %rax | ||
| 1567 | lea 0x78(%rsp), %rsp | ||
| 1568 | mov %rax, %rbp | ||
| 1569 | .Lecb_dec_epilogue: | ||
| 1570 | ret | ||
| 1571 | .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks | ||
| 1572 | ___ | ||
| 1573 | } | ||
| 1574 | $code.=<<___; | ||
| 1575 | .extern asm_AES_cbc_encrypt | ||
| 1576 | .globl bsaes_cbc_encrypt | ||
| 1577 | .type bsaes_cbc_encrypt,\@abi-omnipotent | ||
| 1578 | .align 16 | ||
| 1579 | bsaes_cbc_encrypt: | ||
| 1580 | _CET_ENDBR | ||
| 1581 | ___ | ||
| 1582 | $code.=<<___ if ($win64); | ||
| 1583 | mov 48(%rsp),$arg6 # pull direction flag | ||
| 1584 | ___ | ||
| 1585 | $code.=<<___; | ||
| 1586 | cmp \$0,$arg6 | ||
| 1587 | jne asm_AES_cbc_encrypt | ||
| 1588 | cmp \$128,$arg3 | ||
| 1589 | jb asm_AES_cbc_encrypt | ||
| 1590 | |||
| 1591 | mov %rsp, %rax | ||
| 1592 | .Lcbc_dec_prologue: | ||
| 1593 | push %rbp | ||
| 1594 | push %rbx | ||
| 1595 | push %r12 | ||
| 1596 | push %r13 | ||
| 1597 | push %r14 | ||
| 1598 | push %r15 | ||
| 1599 | lea -0x48(%rsp), %rsp | ||
| 1600 | ___ | ||
| 1601 | $code.=<<___ if ($win64); | ||
| 1602 | mov 0xa0(%rsp),$arg5 # pull ivp | ||
| 1603 | lea -0xa0(%rsp), %rsp | ||
| 1604 | movaps %xmm6, 0x40(%rsp) | ||
| 1605 | movaps %xmm7, 0x50(%rsp) | ||
| 1606 | movaps %xmm8, 0x60(%rsp) | ||
| 1607 | movaps %xmm9, 0x70(%rsp) | ||
| 1608 | movaps %xmm10, 0x80(%rsp) | ||
| 1609 | movaps %xmm11, 0x90(%rsp) | ||
| 1610 | movaps %xmm12, 0xa0(%rsp) | ||
| 1611 | movaps %xmm13, 0xb0(%rsp) | ||
| 1612 | movaps %xmm14, 0xc0(%rsp) | ||
| 1613 | movaps %xmm15, 0xd0(%rsp) | ||
| 1614 | .Lcbc_dec_body: | ||
| 1615 | ___ | ||
| 1616 | $code.=<<___; | ||
| 1617 | mov %rsp, %rbp # backup %rsp | ||
| 1618 | mov 240($arg4), %eax # rounds | ||
| 1619 | mov $arg1, $inp # backup arguments | ||
| 1620 | mov $arg2, $out | ||
| 1621 | mov $arg3, $len | ||
| 1622 | mov $arg4, $key | ||
| 1623 | mov $arg5, %rbx | ||
| 1624 | shr \$4, $len # bytes to blocks | ||
| 1625 | |||
| 1626 | mov %eax, %edx # rounds | ||
| 1627 | shl \$7, %rax # 128 bytes per inner round key | ||
| 1628 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 1629 | sub %rax, %rsp | ||
| 1630 | |||
| 1631 | mov %rsp, %rax # pass key schedule | ||
| 1632 | mov $key, %rcx # pass key | ||
| 1633 | mov %edx, %r10d # pass rounds | ||
| 1634 | call _bsaes_key_convert | ||
| 1635 | pxor (%rsp),%xmm7 # fix up 0 round key | ||
| 1636 | movdqa %xmm6,(%rax) # save last round key | ||
| 1637 | movdqa %xmm7,(%rsp) | ||
| 1638 | |||
| 1639 | movdqu (%rbx), @XMM[15] # load IV | ||
| 1640 | sub \$8,$len | ||
| 1641 | .Lcbc_dec_loop: | ||
| 1642 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1643 | movdqu 0x10($inp), @XMM[1] | ||
| 1644 | movdqu 0x20($inp), @XMM[2] | ||
| 1645 | movdqu 0x30($inp), @XMM[3] | ||
| 1646 | movdqu 0x40($inp), @XMM[4] | ||
| 1647 | movdqu 0x50($inp), @XMM[5] | ||
| 1648 | mov %rsp, %rax # pass key schedule | ||
| 1649 | movdqu 0x60($inp), @XMM[6] | ||
| 1650 | mov %edx,%r10d # pass rounds | ||
| 1651 | movdqu 0x70($inp), @XMM[7] | ||
| 1652 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1653 | |||
| 1654 | call _bsaes_decrypt8 | ||
| 1655 | |||
| 1656 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1657 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1658 | movdqu 0x10($inp), @XMM[9] | ||
| 1659 | pxor @XMM[8], @XMM[1] | ||
| 1660 | movdqu 0x20($inp), @XMM[10] | ||
| 1661 | pxor @XMM[9], @XMM[6] | ||
| 1662 | movdqu 0x30($inp), @XMM[11] | ||
| 1663 | pxor @XMM[10], @XMM[4] | ||
| 1664 | movdqu 0x40($inp), @XMM[12] | ||
| 1665 | pxor @XMM[11], @XMM[2] | ||
| 1666 | movdqu 0x50($inp), @XMM[13] | ||
| 1667 | pxor @XMM[12], @XMM[7] | ||
| 1668 | movdqu 0x60($inp), @XMM[14] | ||
| 1669 | pxor @XMM[13], @XMM[3] | ||
| 1670 | movdqu 0x70($inp), @XMM[15] # IV | ||
| 1671 | pxor @XMM[14], @XMM[5] | ||
| 1672 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1673 | lea 0x80($inp), $inp | ||
| 1674 | movdqu @XMM[1], 0x10($out) | ||
| 1675 | movdqu @XMM[6], 0x20($out) | ||
| 1676 | movdqu @XMM[4], 0x30($out) | ||
| 1677 | movdqu @XMM[2], 0x40($out) | ||
| 1678 | movdqu @XMM[7], 0x50($out) | ||
| 1679 | movdqu @XMM[3], 0x60($out) | ||
| 1680 | movdqu @XMM[5], 0x70($out) | ||
| 1681 | lea 0x80($out), $out | ||
| 1682 | sub \$8,$len | ||
| 1683 | jnc .Lcbc_dec_loop | ||
| 1684 | |||
| 1685 | add \$8,$len | ||
| 1686 | jz .Lcbc_dec_done | ||
| 1687 | |||
| 1688 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1689 | mov %rsp, %rax # pass key schedule | ||
| 1690 | mov %edx, %r10d # pass rounds | ||
| 1691 | cmp \$2,$len | ||
| 1692 | jb .Lcbc_dec_one | ||
| 1693 | movdqu 0x10($inp), @XMM[1] | ||
| 1694 | je .Lcbc_dec_two | ||
| 1695 | movdqu 0x20($inp), @XMM[2] | ||
| 1696 | cmp \$4,$len | ||
| 1697 | jb .Lcbc_dec_three | ||
| 1698 | movdqu 0x30($inp), @XMM[3] | ||
| 1699 | je .Lcbc_dec_four | ||
| 1700 | movdqu 0x40($inp), @XMM[4] | ||
| 1701 | cmp \$6,$len | ||
| 1702 | jb .Lcbc_dec_five | ||
| 1703 | movdqu 0x50($inp), @XMM[5] | ||
| 1704 | je .Lcbc_dec_six | ||
| 1705 | movdqu 0x60($inp), @XMM[6] | ||
| 1706 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1707 | call _bsaes_decrypt8 | ||
| 1708 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1709 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1710 | movdqu 0x10($inp), @XMM[9] | ||
| 1711 | pxor @XMM[8], @XMM[1] | ||
| 1712 | movdqu 0x20($inp), @XMM[10] | ||
| 1713 | pxor @XMM[9], @XMM[6] | ||
| 1714 | movdqu 0x30($inp), @XMM[11] | ||
| 1715 | pxor @XMM[10], @XMM[4] | ||
| 1716 | movdqu 0x40($inp), @XMM[12] | ||
| 1717 | pxor @XMM[11], @XMM[2] | ||
| 1718 | movdqu 0x50($inp), @XMM[13] | ||
| 1719 | pxor @XMM[12], @XMM[7] | ||
| 1720 | movdqu 0x60($inp), @XMM[15] # IV | ||
| 1721 | pxor @XMM[13], @XMM[3] | ||
| 1722 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1723 | movdqu @XMM[1], 0x10($out) | ||
| 1724 | movdqu @XMM[6], 0x20($out) | ||
| 1725 | movdqu @XMM[4], 0x30($out) | ||
| 1726 | movdqu @XMM[2], 0x40($out) | ||
| 1727 | movdqu @XMM[7], 0x50($out) | ||
| 1728 | movdqu @XMM[3], 0x60($out) | ||
| 1729 | jmp .Lcbc_dec_done | ||
| 1730 | .align 16 | ||
| 1731 | .Lcbc_dec_six: | ||
| 1732 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1733 | call _bsaes_decrypt8 | ||
| 1734 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1735 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1736 | movdqu 0x10($inp), @XMM[9] | ||
| 1737 | pxor @XMM[8], @XMM[1] | ||
| 1738 | movdqu 0x20($inp), @XMM[10] | ||
| 1739 | pxor @XMM[9], @XMM[6] | ||
| 1740 | movdqu 0x30($inp), @XMM[11] | ||
| 1741 | pxor @XMM[10], @XMM[4] | ||
| 1742 | movdqu 0x40($inp), @XMM[12] | ||
| 1743 | pxor @XMM[11], @XMM[2] | ||
| 1744 | movdqu 0x50($inp), @XMM[15] # IV | ||
| 1745 | pxor @XMM[12], @XMM[7] | ||
| 1746 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1747 | movdqu @XMM[1], 0x10($out) | ||
| 1748 | movdqu @XMM[6], 0x20($out) | ||
| 1749 | movdqu @XMM[4], 0x30($out) | ||
| 1750 | movdqu @XMM[2], 0x40($out) | ||
| 1751 | movdqu @XMM[7], 0x50($out) | ||
| 1752 | jmp .Lcbc_dec_done | ||
| 1753 | .align 16 | ||
| 1754 | .Lcbc_dec_five: | ||
| 1755 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1756 | call _bsaes_decrypt8 | ||
| 1757 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1758 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1759 | movdqu 0x10($inp), @XMM[9] | ||
| 1760 | pxor @XMM[8], @XMM[1] | ||
| 1761 | movdqu 0x20($inp), @XMM[10] | ||
| 1762 | pxor @XMM[9], @XMM[6] | ||
| 1763 | movdqu 0x30($inp), @XMM[11] | ||
| 1764 | pxor @XMM[10], @XMM[4] | ||
| 1765 | movdqu 0x40($inp), @XMM[15] # IV | ||
| 1766 | pxor @XMM[11], @XMM[2] | ||
| 1767 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1768 | movdqu @XMM[1], 0x10($out) | ||
| 1769 | movdqu @XMM[6], 0x20($out) | ||
| 1770 | movdqu @XMM[4], 0x30($out) | ||
| 1771 | movdqu @XMM[2], 0x40($out) | ||
| 1772 | jmp .Lcbc_dec_done | ||
| 1773 | .align 16 | ||
| 1774 | .Lcbc_dec_four: | ||
| 1775 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1776 | call _bsaes_decrypt8 | ||
| 1777 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1778 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1779 | movdqu 0x10($inp), @XMM[9] | ||
| 1780 | pxor @XMM[8], @XMM[1] | ||
| 1781 | movdqu 0x20($inp), @XMM[10] | ||
| 1782 | pxor @XMM[9], @XMM[6] | ||
| 1783 | movdqu 0x30($inp), @XMM[15] # IV | ||
| 1784 | pxor @XMM[10], @XMM[4] | ||
| 1785 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1786 | movdqu @XMM[1], 0x10($out) | ||
| 1787 | movdqu @XMM[6], 0x20($out) | ||
| 1788 | movdqu @XMM[4], 0x30($out) | ||
| 1789 | jmp .Lcbc_dec_done | ||
| 1790 | .align 16 | ||
| 1791 | .Lcbc_dec_three: | ||
| 1792 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1793 | call _bsaes_decrypt8 | ||
| 1794 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1795 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1796 | movdqu 0x10($inp), @XMM[9] | ||
| 1797 | pxor @XMM[8], @XMM[1] | ||
| 1798 | movdqu 0x20($inp), @XMM[15] # IV | ||
| 1799 | pxor @XMM[9], @XMM[6] | ||
| 1800 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1801 | movdqu @XMM[1], 0x10($out) | ||
| 1802 | movdqu @XMM[6], 0x20($out) | ||
| 1803 | jmp .Lcbc_dec_done | ||
| 1804 | .align 16 | ||
| 1805 | .Lcbc_dec_two: | ||
| 1806 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1807 | call _bsaes_decrypt8 | ||
| 1808 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1809 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1810 | movdqu 0x10($inp), @XMM[15] # IV | ||
| 1811 | pxor @XMM[8], @XMM[1] | ||
| 1812 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1813 | movdqu @XMM[1], 0x10($out) | ||
| 1814 | jmp .Lcbc_dec_done | ||
| 1815 | .align 16 | ||
| 1816 | .Lcbc_dec_one: | ||
| 1817 | lea ($inp), $arg1 | ||
| 1818 | lea 0x20(%rbp), $arg2 # buffer output | ||
| 1819 | lea ($key), $arg3 | ||
| 1820 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 1821 | pxor 0x20(%rbp), @XMM[15] # ^= IV | ||
| 1822 | movdqu @XMM[15], ($out) # write output | ||
| 1823 | movdqa @XMM[0], @XMM[15] # IV | ||
| 1824 | |||
| 1825 | .Lcbc_dec_done: | ||
| 1826 | movdqu @XMM[15], (%rbx) # return IV | ||
| 1827 | lea (%rsp), %rax | ||
| 1828 | pxor %xmm0, %xmm0 | ||
| 1829 | .Lcbc_dec_bzero: # wipe key schedule [if any] | ||
| 1830 | movdqa %xmm0, 0x00(%rax) | ||
| 1831 | movdqa %xmm0, 0x10(%rax) | ||
| 1832 | lea 0x20(%rax), %rax | ||
| 1833 | cmp %rax, %rbp | ||
| 1834 | ja .Lcbc_dec_bzero | ||
| 1835 | |||
| 1836 | lea (%rbp),%rsp # restore %rsp | ||
| 1837 | ___ | ||
| 1838 | $code.=<<___ if ($win64); | ||
| 1839 | movaps 0x40(%rbp), %xmm6 | ||
| 1840 | movaps 0x50(%rbp), %xmm7 | ||
| 1841 | movaps 0x60(%rbp), %xmm8 | ||
| 1842 | movaps 0x70(%rbp), %xmm9 | ||
| 1843 | movaps 0x80(%rbp), %xmm10 | ||
| 1844 | movaps 0x90(%rbp), %xmm11 | ||
| 1845 | movaps 0xa0(%rbp), %xmm12 | ||
| 1846 | movaps 0xb0(%rbp), %xmm13 | ||
| 1847 | movaps 0xc0(%rbp), %xmm14 | ||
| 1848 | movaps 0xd0(%rbp), %xmm15 | ||
| 1849 | lea 0xa0(%rbp), %rsp | ||
| 1850 | ___ | ||
| 1851 | $code.=<<___; | ||
| 1852 | mov 0x48(%rsp), %r15 | ||
| 1853 | mov 0x50(%rsp), %r14 | ||
| 1854 | mov 0x58(%rsp), %r13 | ||
| 1855 | mov 0x60(%rsp), %r12 | ||
| 1856 | mov 0x68(%rsp), %rbx | ||
| 1857 | mov 0x70(%rsp), %rax | ||
| 1858 | lea 0x78(%rsp), %rsp | ||
| 1859 | mov %rax, %rbp | ||
| 1860 | .Lcbc_dec_epilogue: | ||
| 1861 | ret | ||
| 1862 | .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt | ||
| 1863 | |||
# bsaes_ctr32_encrypt_blocks: CTR-mode encryption with a 32-bit counter word.
# Arguments, as consumed by the code below:
#   arg1 = input pointer, arg2 = output pointer,
#   arg3 = length counted in 16-byte blocks (8 is subtracted per 0x80 bytes),
#   arg4 = key (the round count is read from offset 240),
#   arg5 = pointer to the 16-byte counter block.
# Calls with fewer than eight blocks in total take .Lctr_enc_short and
# encrypt one block at a time through asm_AES_encrypt; everything else goes
# through the bit-sliced eight-block loop.
| 1864 | .globl bsaes_ctr32_encrypt_blocks | ||
| 1865 | .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent | ||
| 1866 | .align 16 | ||
| 1867 | bsaes_ctr32_encrypt_blocks: | ||
| 1868 | _CET_ENDBR | ||
| 1869 | mov %rsp, %rax | ||
| 1870 | .Lctr_enc_prologue: | ||
# Frame layout after the six pushes and the 0x48-byte reservation:
# r15 at 0x48, r14 at 0x50, r13 at 0x58, r12 at 0x60, rbx at 0x68 and
# rbp at 0x70 from %rsp; the epilogue reloads them from the same offsets.
| 1871 | push %rbp | ||
| 1872 | push %rbx | ||
| 1873 | push %r12 | ||
| 1874 | push %r13 | ||
| 1875 | push %r14 | ||
| 1876 | push %r15 | ||
| 1877 | lea -0x48(%rsp), %rsp | ||
| 1878 | ___ | ||
# Win64 only: fetch the fifth argument from the stack, reserve a further
# 0xa0 bytes and save xmm6-xmm15 at 0x40..0xd0 from the new %rsp.
| 1879 | $code.=<<___ if ($win64); | ||
| 1880 | mov 0xa0(%rsp),$arg5 # pull ivp | ||
| 1881 | lea -0xa0(%rsp), %rsp | ||
| 1882 | movaps %xmm6, 0x40(%rsp) | ||
| 1883 | movaps %xmm7, 0x50(%rsp) | ||
| 1884 | movaps %xmm8, 0x60(%rsp) | ||
| 1885 | movaps %xmm9, 0x70(%rsp) | ||
| 1886 | movaps %xmm10, 0x80(%rsp) | ||
| 1887 | movaps %xmm11, 0x90(%rsp) | ||
| 1888 | movaps %xmm12, 0xa0(%rsp) | ||
| 1889 | movaps %xmm13, 0xb0(%rsp) | ||
| 1890 | movaps %xmm14, 0xc0(%rsp) | ||
| 1891 | movaps %xmm15, 0xd0(%rsp) | ||
| 1892 | .Lctr_enc_body: | ||
| 1893 | ___ | ||
| 1894 | $code.=<<___; | ||
# %rbp anchors the frame from here on: 0x20(%rbp) holds the working copy of
# the counter and 0x30(%rbp) is the one-block scratch used by the short path.
| 1895 | mov %rsp, %rbp # backup %rsp | ||
| 1896 | movdqu ($arg5), %xmm0 # load counter | ||
| 1897 | mov 240($arg4), %eax # rounds | ||
| 1898 | mov $arg1, $inp # backup arguments | ||
| 1899 | mov $arg2, $out | ||
| 1900 | mov $arg3, $len | ||
| 1901 | mov $arg4, $key | ||
| 1902 | movdqa %xmm0, 0x20(%rbp) # copy counter | ||
| 1903 | cmp \$8, $arg3 | ||
| 1904 | jb .Lctr_enc_short | ||
| 1905 | |||
# Reserve rounds*128 - 96 bytes below %rbp for the converted key schedule.
| 1906 | mov %eax, %ebx # rounds | ||
| 1907 | shl \$7, %rax # 128 bytes per inner round key | ||
| 1908 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 1909 | sub %rax, %rsp | ||
| 1910 | |||
| 1911 | mov %rsp, %rax # pass key schedule | ||
| 1912 | mov $key, %rcx # pass key | ||
| 1913 | mov %ebx, %r10d # pass rounds | ||
| 1914 | call _bsaes_key_convert | ||
| 1915 | pxor %xmm6,%xmm7 # fix up last round key | ||
| 1916 | movdqa %xmm7,(%rax) # save last round key | ||
| 1917 | |||
# Pre-shuffle both the round-0 key and the counter with the mask stored
# 0x20 bytes before .LADD1. NOTE(review): presumably this lets the loop bump
# the big-endian counter word with paddd; the tables are outside this view.
| 1918 | movdqa (%rsp), @XMM[9] # load round0 key | ||
| 1919 | lea .LADD1(%rip), %r11 | ||
| 1920 | movdqa 0x20(%rbp), @XMM[0] # counter copy | ||
| 1921 | movdqa -0x20(%r11), @XMM[8] # .LSWPUP | ||
| 1922 | pshufb @XMM[8], @XMM[9] # byte swap upper part | ||
| 1923 | pshufb @XMM[8], @XMM[0] | ||
| 1924 | movdqa @XMM[9], (%rsp) # save adjusted round0 key | ||
| 1925 | jmp .Lctr_enc_loop | ||
| 1926 | .align 16 | ||
# Main loop: one pass produces eight keystream blocks. It is only entered
# with at least one block left, so a final pass may compute more keystream
# than is consumed.
| 1927 | .Lctr_enc_loop: | ||
| 1928 | movdqa @XMM[0], 0x20(%rbp) # save counter | ||
| 1929 | movdqa @XMM[0], @XMM[1] # prepare 8 counter values | ||
| 1930 | movdqa @XMM[0], @XMM[2] | ||
| 1931 | paddd 0x00(%r11), @XMM[1] # .LADD1 | ||
| 1932 | movdqa @XMM[0], @XMM[3] | ||
| 1933 | paddd 0x10(%r11), @XMM[2] # .LADD2 | ||
| 1934 | movdqa @XMM[0], @XMM[4] | ||
| 1935 | paddd 0x20(%r11), @XMM[3] # .LADD3 | ||
| 1936 | movdqa @XMM[0], @XMM[5] | ||
| 1937 | paddd 0x30(%r11), @XMM[4] # .LADD4 | ||
| 1938 | movdqa @XMM[0], @XMM[6] | ||
| 1939 | paddd 0x40(%r11), @XMM[5] # .LADD5 | ||
| 1940 | movdqa @XMM[0], @XMM[7] | ||
| 1941 | paddd 0x50(%r11), @XMM[6] # .LADD6 | ||
| 1942 | paddd 0x60(%r11), @XMM[7] # .LADD7 | ||
| 1943 | |||
| 1944 | # Borrow prologue from _bsaes_encrypt8 to use the opportunity | ||
| 1945 | # to flip byte order in 32-bit counter | ||
| 1946 | movdqa (%rsp), @XMM[9] # round 0 key | ||
| 1947 | lea 0x10(%rsp), %rax # pass key schedule | ||
| 1948 | movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR | ||
| 1949 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
| 1950 | pxor @XMM[9], @XMM[1] | ||
| 1951 | pshufb @XMM[8], @XMM[0] | ||
| 1952 | pxor @XMM[9], @XMM[2] | ||
| 1953 | pshufb @XMM[8], @XMM[1] | ||
| 1954 | pxor @XMM[9], @XMM[3] | ||
| 1955 | pshufb @XMM[8], @XMM[2] | ||
| 1956 | pxor @XMM[9], @XMM[4] | ||
| 1957 | pshufb @XMM[8], @XMM[3] | ||
| 1958 | pxor @XMM[9], @XMM[5] | ||
| 1959 | pshufb @XMM[8], @XMM[4] | ||
| 1960 | pxor @XMM[9], @XMM[6] | ||
| 1961 | pshufb @XMM[8], @XMM[5] | ||
| 1962 | pxor @XMM[9], @XMM[7] | ||
| 1963 | pshufb @XMM[8], @XMM[6] | ||
| 1964 | lea .LBS0(%rip), %r11 # constants table | ||
| 1965 | pshufb @XMM[8], @XMM[7] | ||
| 1966 | mov %ebx,%r10d # pass rounds | ||
| 1967 | |||
| 1968 | call _bsaes_encrypt8_bitslice | ||
| 1969 | |||
# Carry set means fewer than eight blocks were left: finish in
# .Lctr_enc_loop_done. The zero flag from this sub is also what the jnz at
# the bottom of the loop tests; the SSE moves and xors, lea and paddd in
# between do not modify the flags.
| 1970 | sub \$8,$len | ||
| 1971 | jc .Lctr_enc_loop_done | ||
| 1972 | |||
# Keystream block k comes back in registers 0,1,4,6,3,7,2,5 (for k = 0..7),
# which is why the xor and store pairings below are not in register order.
| 1973 | movdqu 0x00($inp), @XMM[8] # load input | ||
| 1974 | movdqu 0x10($inp), @XMM[9] | ||
| 1975 | movdqu 0x20($inp), @XMM[10] | ||
| 1976 | movdqu 0x30($inp), @XMM[11] | ||
| 1977 | movdqu 0x40($inp), @XMM[12] | ||
| 1978 | movdqu 0x50($inp), @XMM[13] | ||
| 1979 | movdqu 0x60($inp), @XMM[14] | ||
| 1980 | movdqu 0x70($inp), @XMM[15] | ||
| 1981 | lea 0x80($inp),$inp | ||
| 1982 | pxor @XMM[0], @XMM[8] | ||
| 1983 | movdqa 0x20(%rbp), @XMM[0] # load counter | ||
| 1984 | pxor @XMM[9], @XMM[1] | ||
| 1985 | movdqu @XMM[8], 0x00($out) # write output | ||
| 1986 | pxor @XMM[10], @XMM[4] | ||
| 1987 | movdqu @XMM[1], 0x10($out) | ||
| 1988 | pxor @XMM[11], @XMM[6] | ||
| 1989 | movdqu @XMM[4], 0x20($out) | ||
| 1990 | pxor @XMM[12], @XMM[3] | ||
| 1991 | movdqu @XMM[6], 0x30($out) | ||
| 1992 | pxor @XMM[13], @XMM[7] | ||
| 1993 | movdqu @XMM[3], 0x40($out) | ||
| 1994 | pxor @XMM[14], @XMM[2] | ||
| 1995 | movdqu @XMM[7], 0x50($out) | ||
| 1996 | pxor @XMM[15], @XMM[5] | ||
| 1997 | movdqu @XMM[2], 0x60($out) | ||
# %r11 was repointed at .LBS0 before the call; restore it to .LADD1 for the
# next pass and advance the saved counter by the entry at offset 0x70.
| 1998 | lea .LADD1(%rip), %r11 | ||
| 1999 | movdqu @XMM[5], 0x70($out) | ||
| 2000 | lea 0x80($out), $out | ||
| 2001 | paddd 0x70(%r11), @XMM[0] # .LADD8 | ||
| 2002 | jnz .Lctr_enc_loop | ||
| 2003 | |||
| 2004 | jmp .Lctr_enc_done | ||
| 2005 | .align 16 | ||
# Tail of the bit-sliced path: between one and seven blocks remain. The
# first block is handled unconditionally. Each cmp then serves two
# branches: the jb right after it and the je that follows the next block.
| 2006 | .Lctr_enc_loop_done: | ||
| 2007 | add \$8, $len | ||
| 2008 | movdqu 0x00($inp), @XMM[8] # load input | ||
| 2009 | pxor @XMM[8], @XMM[0] | ||
| 2010 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2011 | cmp \$2,$len | ||
| 2012 | jb .Lctr_enc_done | ||
| 2013 | movdqu 0x10($inp), @XMM[9] | ||
| 2014 | pxor @XMM[9], @XMM[1] | ||
| 2015 | movdqu @XMM[1], 0x10($out) | ||
| 2016 | je .Lctr_enc_done | ||
| 2017 | movdqu 0x20($inp), @XMM[10] | ||
| 2018 | pxor @XMM[10], @XMM[4] | ||
| 2019 | movdqu @XMM[4], 0x20($out) | ||
| 2020 | cmp \$4,$len | ||
| 2021 | jb .Lctr_enc_done | ||
| 2022 | movdqu 0x30($inp), @XMM[11] | ||
| 2023 | pxor @XMM[11], @XMM[6] | ||
| 2024 | movdqu @XMM[6], 0x30($out) | ||
| 2025 | je .Lctr_enc_done | ||
| 2026 | movdqu 0x40($inp), @XMM[12] | ||
| 2027 | pxor @XMM[12], @XMM[3] | ||
| 2028 | movdqu @XMM[3], 0x40($out) | ||
| 2029 | cmp \$6,$len | ||
| 2030 | jb .Lctr_enc_done | ||
| 2031 | movdqu 0x50($inp), @XMM[13] | ||
| 2032 | pxor @XMM[13], @XMM[7] | ||
| 2033 | movdqu @XMM[7], 0x50($out) | ||
| 2034 | je .Lctr_enc_done | ||
| 2035 | movdqu 0x60($inp), @XMM[14] | ||
| 2036 | pxor @XMM[14], @XMM[2] | ||
| 2037 | movdqu @XMM[2], 0x60($out) | ||
| 2038 | jmp .Lctr_enc_done | ||
| 2039 | |||
| 2040 | .align 16 | ||
# Short path (fewer than eight blocks in total): encrypt the counter block
# at 0x20(%rbp) into the scratch at 0x30(%rbp), xor it with one input block,
# then increment the last 32-bit word of the counter as a big-endian value
# (bswap, inc, bswap).
# The counter word is loaded through %rbp but stored through %rsp. No key
# schedule has been allocated on this path, so %rsp still equals %rbp and
# both forms address the same slot.
| 2041 | .Lctr_enc_short: | ||
| 2042 | lea 0x20(%rbp), $arg1 | ||
| 2043 | lea 0x30(%rbp), $arg2 | ||
| 2044 | lea ($key), $arg3 | ||
| 2045 | call asm_AES_encrypt | ||
| 2046 | movdqu ($inp), @XMM[1] | ||
| 2047 | lea 16($inp), $inp | ||
| 2048 | mov 0x2c(%rbp), %eax # load 32-bit counter | ||
| 2049 | bswap %eax | ||
| 2050 | pxor 0x30(%rbp), @XMM[1] | ||
| 2051 | inc %eax # increment | ||
| 2052 | movdqu @XMM[1], ($out) | ||
| 2053 | bswap %eax | ||
| 2054 | lea 16($out), $out | ||
| 2055 | mov %eax, 0x2c(%rsp) # save 32-bit counter | ||
| 2056 | dec $len | ||
| 2057 | jnz .Lctr_enc_short | ||
| 2058 | |||
# Common exit: zero the stack from %rsp up to %rbp in 32-byte steps. The
# loop body runs before the comparison, so when %rsp equals %rbp (short
# path) the 32 bytes at %rbp are still cleared once.
| 2059 | .Lctr_enc_done: | ||
| 2060 | lea (%rsp), %rax | ||
| 2061 | pxor %xmm0, %xmm0 | ||
| 2062 | .Lctr_enc_bzero: # wipe key schedule [if any] | ||
| 2063 | movdqa %xmm0, 0x00(%rax) | ||
| 2064 | movdqa %xmm0, 0x10(%rax) | ||
| 2065 | lea 0x20(%rax), %rax | ||
| 2066 | cmp %rax, %rbp | ||
| 2067 | ja .Lctr_enc_bzero | ||
| 2068 | |||
| 2069 | lea (%rbp),%rsp # restore %rsp | ||
| 2070 | ___ | ||
# Win64 only: restore xmm6-xmm15 and release the extra 0xa0 bytes reserved
# in the prologue.
| 2071 | $code.=<<___ if ($win64); | ||
| 2072 | movaps 0x40(%rbp), %xmm6 | ||
| 2073 | movaps 0x50(%rbp), %xmm7 | ||
| 2074 | movaps 0x60(%rbp), %xmm8 | ||
| 2075 | movaps 0x70(%rbp), %xmm9 | ||
| 2076 | movaps 0x80(%rbp), %xmm10 | ||
| 2077 | movaps 0x90(%rbp), %xmm11 | ||
| 2078 | movaps 0xa0(%rbp), %xmm12 | ||
| 2079 | movaps 0xb0(%rbp), %xmm13 | ||
| 2080 | movaps 0xc0(%rbp), %xmm14 | ||
| 2081 | movaps 0xd0(%rbp), %xmm15 | ||
| 2082 | lea 0xa0(%rbp), %rsp | ||
| 2083 | ___ | ||
| 2084 | $code.=<<___; | ||
# Reload the registers pushed in the prologue. The saved %rbp travels
# through %rax so that %rsp can be adjusted before %rbp is overwritten.
| 2085 | mov 0x48(%rsp), %r15 | ||
| 2086 | mov 0x50(%rsp), %r14 | ||
| 2087 | mov 0x58(%rsp), %r13 | ||
| 2088 | mov 0x60(%rsp), %r12 | ||
| 2089 | mov 0x68(%rsp), %rbx | ||
| 2090 | mov 0x70(%rsp), %rax | ||
| 2091 | lea 0x78(%rsp), %rsp | ||
| 2092 | mov %rax, %rbp | ||
| 2093 | .Lctr_enc_epilogue: | ||
| 2094 | ret | ||
| 2095 | .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks | ||
| 2096 | ___ | ||
| 2097 | ###################################################################### | ||
| 2098 | # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
| 2099 | # const AES_KEY *key1, const AES_KEY *key2, | ||
| 2100 | # const unsigned char iv[16]); | ||
| 2101 | # | ||
# Register aliases used by the tweak-update sequence: XMM[13] holds the
# mask loaded from .Lxts_magic, XMM[14] the carry/residue term and XMM[15]
# the broadcast sign bits.
# The substitution strips a trailing d from the register name held in arg6.
# NOTE(review): presumably this turns a 32-bit register name into its 64-bit
# form so it can be used as a pointer; confirm against the definition of
# arg6, which is outside this view.
| 2102 | my ($twmask,$twres,$twtmp)=@XMM[13..15]; | ||
| 2103 | $arg6=~s/d$//; | ||
| 2104 | |||
# bsaes_xts_encrypt: len is in bytes here. Whole 16-byte blocks are
# processed up to eight at a time; a trailing partial block, if any, is
# handled by the byte-swapping loop at .Lxts_enc_steal.
| 2105 | $code.=<<___; | ||
| 2106 | .globl bsaes_xts_encrypt | ||
| 2107 | .type bsaes_xts_encrypt,\@abi-omnipotent | ||
| 2108 | .align 16 | ||
| 2109 | bsaes_xts_encrypt: | ||
| 2110 | _CET_ENDBR | ||
| 2111 | mov %rsp, %rax | ||
| 2112 | .Lxts_enc_prologue: | ||
# Frame layout after the six pushes and the 0x48-byte reservation:
# r15 at 0x48, r14 at 0x50, r13 at 0x58, r12 at 0x60, rbx at 0x68 and
# rbp at 0x70 from %rsp; the epilogue reloads them from the same offsets.
| 2113 | push %rbp | ||
| 2114 | push %rbx | ||
| 2115 | push %r12 | ||
| 2116 | push %r13 | ||
| 2117 | push %r14 | ||
| 2118 | push %r15 | ||
| 2119 | lea -0x48(%rsp), %rsp | ||
| 2120 | ___ | ||
# Win64 only: fetch the fifth and sixth arguments from the stack, reserve a
# further 0xa0 bytes and save xmm6-xmm15 at 0x40..0xd0 from the new %rsp.
| 2121 | $code.=<<___ if ($win64); | ||
| 2122 | mov 0xa0(%rsp),$arg5 # pull key2 | ||
| 2123 | mov 0xa8(%rsp),$arg6 # pull ivp | ||
| 2124 | lea -0xa0(%rsp), %rsp | ||
| 2125 | movaps %xmm6, 0x40(%rsp) | ||
| 2126 | movaps %xmm7, 0x50(%rsp) | ||
| 2127 | movaps %xmm8, 0x60(%rsp) | ||
| 2128 | movaps %xmm9, 0x70(%rsp) | ||
| 2129 | movaps %xmm10, 0x80(%rsp) | ||
| 2130 | movaps %xmm11, 0x90(%rsp) | ||
| 2131 | movaps %xmm12, 0xa0(%rsp) | ||
| 2132 | movaps %xmm13, 0xb0(%rsp) | ||
| 2133 | movaps %xmm14, 0xc0(%rsp) | ||
| 2134 | movaps %xmm15, 0xd0(%rsp) | ||
| 2135 | .Lxts_enc_body: | ||
| 2136 | ___ | ||
| 2137 | $code.=<<___; | ||
| 2138 | mov %rsp, %rbp # backup %rsp | ||
| 2139 | mov $arg1, $inp # backup arguments | ||
| 2140 | mov $arg2, $out | ||
| 2141 | mov $arg3, $len | ||
| 2142 | mov $arg4, $key | ||
| 2143 | |||
# Initial tweak: encrypt the 16 bytes at the sixth argument (iv) with the
# fifth argument (key2) and leave the result at 0x20(%rbp).
| 2144 | lea ($arg6), $arg1 | ||
| 2145 | lea 0x20(%rbp), $arg2 | ||
| 2146 | lea ($arg5), $arg3 | ||
| 2147 | call asm_AES_encrypt # generate initial tweak | ||
| 2148 | |||
# %rbx keeps the original byte length; its low four bits are examined at
# .Lxts_enc_done to decide whether a partial block remains.
| 2149 | mov 240($key), %eax # rounds | ||
| 2150 | mov $len, %rbx # backup $len | ||
| 2151 | |||
# Reserve rounds*128 - 96 bytes below %rbp for the converted key schedule.
| 2152 | mov %eax, %edx # rounds | ||
| 2153 | shl \$7, %rax # 128 bytes per inner round key | ||
| 2154 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 2155 | sub %rax, %rsp | ||
| 2156 | |||
| 2157 | mov %rsp, %rax # pass key schedule | ||
| 2158 | mov $key, %rcx # pass key | ||
| 2159 | mov %edx, %r10d # pass rounds | ||
| 2160 | call _bsaes_key_convert | ||
| 2161 | pxor %xmm6, %xmm7 # fix up last round key | ||
| 2162 | movdqa %xmm7, (%rax) # save last round key | ||
| 2163 | |||
# Round the working length down to whole blocks, then carve a 0x80-byte
# area for eight tweaks below the key schedule. From here on the schedule
# is found at 0x80(%rsp) and tweak number k at 0x10*k(%rsp).
| 2164 | and \$-16, $len | ||
| 2165 | sub \$0x80, %rsp # place for tweak[8] | ||
| 2166 | movdqa 0x20(%rbp), @XMM[7] # initial tweak | ||
| 2167 | |||
| 2168 | pxor $twtmp, $twtmp | ||
| 2169 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2170 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2171 | |||
| 2172 | sub \$0x80, $len | ||
| 2173 | jc .Lxts_enc_short | ||
| 2174 | jmp .Lxts_enc_loop | ||
| 2175 | |||
| 2176 | .align 16 | ||
| 2177 | .Lxts_enc_loop: | ||
| 2178 | ___ | ||
# Full eight-block pass. Each generated step copies the running tweak held
# in XMM[7] into XMM[i], saves it as tweak[i], then advances XMM[7] with the
# paddq / pand / pcmpgtd / pxor sequence (the constant comes from
# .Lxts_magic, which is outside this view). Input loads trail by one step
# and the input-xor-tweak trails by two, so the work is interleaved.
| 2179 | for ($i=0;$i<7;$i++) { | ||
| 2180 | $code.=<<___; | ||
| 2181 | pshufd \$0x13, $twtmp, $twres | ||
| 2182 | pxor $twtmp, $twtmp | ||
| 2183 | movdqa @XMM[7], @XMM[$i] | ||
| 2184 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2185 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2186 | pand $twmask, $twres # isolate carry and residue | ||
| 2187 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2188 | pxor $twres, @XMM[7] | ||
| 2189 | ___ | ||
| 2190 | $code.=<<___ if ($i>=1); | ||
| 2191 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2192 | ___ | ||
| 2193 | $code.=<<___ if ($i>=2); | ||
| 2194 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2195 | ___ | ||
| 2196 | } | ||
| 2197 | $code.=<<___; | ||
# Finish the pipeline for blocks 5, 6 and 7. XMM[7] still holds the eighth
# tweak; it is saved at 0x70(%rsp) before being xored with the last input.
| 2198 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2199 | pxor @XMM[8+5], @XMM[5] | ||
| 2200 | movdqu 0x70($inp), @XMM[8+7] | ||
| 2201 | lea 0x80($inp), $inp | ||
| 2202 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2203 | pxor @XMM[8+6], @XMM[6] | ||
| 2204 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2205 | pxor @XMM[8+7], @XMM[7] | ||
| 2206 | mov %edx, %r10d # pass rounds | ||
| 2207 | |||
| 2208 | call _bsaes_encrypt8 | ||
| 2209 | |||
# Block k comes back in registers 0,1,4,6,3,7,2,5 (for k = 0..7), hence the
# permuted pairing of tweak slots and registers below.
| 2210 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2211 | pxor 0x10(%rsp), @XMM[1] | ||
| 2212 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2213 | pxor 0x20(%rsp), @XMM[4] | ||
| 2214 | movdqu @XMM[1], 0x10($out) | ||
| 2215 | pxor 0x30(%rsp), @XMM[6] | ||
| 2216 | movdqu @XMM[4], 0x20($out) | ||
| 2217 | pxor 0x40(%rsp), @XMM[3] | ||
| 2218 | movdqu @XMM[6], 0x30($out) | ||
| 2219 | pxor 0x50(%rsp), @XMM[7] | ||
| 2220 | movdqu @XMM[3], 0x40($out) | ||
| 2221 | pxor 0x60(%rsp), @XMM[2] | ||
| 2222 | movdqu @XMM[7], 0x50($out) | ||
| 2223 | pxor 0x70(%rsp), @XMM[5] | ||
| 2224 | movdqu @XMM[2], 0x60($out) | ||
| 2225 | movdqu @XMM[5], 0x70($out) | ||
| 2226 | lea 0x80($out), $out | ||
| 2227 | |||
# Reload the eighth tweak and advance it once more so that XMM[7] holds the
# first tweak of the next pass; the mask and sign-broadcast registers are
# rebuilt as well.
| 2228 | movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak | ||
| 2229 | pxor $twtmp, $twtmp | ||
| 2230 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2231 | pcmpgtd @XMM[7], $twtmp | ||
| 2232 | pshufd \$0x13, $twtmp, $twres | ||
| 2233 | pxor $twtmp, $twtmp | ||
| 2234 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2235 | pand $twmask, $twres # isolate carry and residue | ||
| 2236 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2237 | pxor $twres, @XMM[7] | ||
| 2238 | |||
| 2239 | sub \$0x80,$len | ||
| 2240 | jnc .Lxts_enc_loop | ||
| 2241 | |||
| 2242 | .Lxts_enc_short: | ||
| 2243 | add \$0x80, $len | ||
| 2244 | jz .Lxts_enc_done | ||
| 2245 | ___ | ||
# Remainder of one to seven whole blocks. Same generated pipeline as the
# main loop, but after each input load the remaining length is compared
# with 0x10*i and control leaves for .Lxts_enc_i when they match. At that
# point blocks i-2 and i-1 have been loaded but not yet xored with their
# tweaks, so each target label performs those xors itself. Falling through
# all seven steps handles the seven-block case inline.
| 2246 | for ($i=0;$i<7;$i++) { | ||
| 2247 | $code.=<<___; | ||
| 2248 | pshufd \$0x13, $twtmp, $twres | ||
| 2249 | pxor $twtmp, $twtmp | ||
| 2250 | movdqa @XMM[7], @XMM[$i] | ||
| 2251 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2252 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2253 | pand $twmask, $twres # isolate carry and residue | ||
| 2254 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2255 | pxor $twres, @XMM[7] | ||
| 2256 | ___ | ||
| 2257 | $code.=<<___ if ($i>=1); | ||
| 2258 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2259 | cmp \$`0x10*$i`,$len | ||
| 2260 | je .Lxts_enc_$i | ||
| 2261 | ___ | ||
| 2262 | $code.=<<___ if ($i>=2); | ||
| 2263 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2264 | ___ | ||
| 2265 | } | ||
| 2266 | $code.=<<___; | ||
# Seven blocks remain. _bsaes_encrypt8 still processes all eight registers;
# only the seven results that correspond to real input are stored. The
# tweak saved at 0x70(%rsp) is the one handed to the stealing code.
| 2267 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2268 | pxor @XMM[8+5], @XMM[5] | ||
| 2269 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2270 | lea 0x70($inp), $inp | ||
| 2271 | pxor @XMM[8+6], @XMM[6] | ||
| 2272 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2273 | mov %edx, %r10d # pass rounds | ||
| 2274 | |||
| 2275 | call _bsaes_encrypt8 | ||
| 2276 | |||
| 2277 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2278 | pxor 0x10(%rsp), @XMM[1] | ||
| 2279 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2280 | pxor 0x20(%rsp), @XMM[4] | ||
| 2281 | movdqu @XMM[1], 0x10($out) | ||
| 2282 | pxor 0x30(%rsp), @XMM[6] | ||
| 2283 | movdqu @XMM[4], 0x20($out) | ||
| 2284 | pxor 0x40(%rsp), @XMM[3] | ||
| 2285 | movdqu @XMM[6], 0x30($out) | ||
| 2286 | pxor 0x50(%rsp), @XMM[7] | ||
| 2287 | movdqu @XMM[3], 0x40($out) | ||
| 2288 | pxor 0x60(%rsp), @XMM[2] | ||
| 2289 | movdqu @XMM[7], 0x50($out) | ||
| 2290 | movdqu @XMM[2], 0x60($out) | ||
| 2291 | lea 0x70($out), $out | ||
| 2292 | |||
| 2293 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
| 2294 | jmp .Lxts_enc_done | ||
| 2295 | .align 16 | ||
# Six blocks remain: finish the xors for blocks 4 and 5, encrypt, store six
# results and pick up tweak[6] for the stealing code.
| 2296 | .Lxts_enc_6: | ||
| 2297 | pxor @XMM[8+4], @XMM[4] | ||
| 2298 | lea 0x60($inp), $inp | ||
| 2299 | pxor @XMM[8+5], @XMM[5] | ||
| 2300 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2301 | mov %edx, %r10d # pass rounds | ||
| 2302 | |||
| 2303 | call _bsaes_encrypt8 | ||
| 2304 | |||
| 2305 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2306 | pxor 0x10(%rsp), @XMM[1] | ||
| 2307 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2308 | pxor 0x20(%rsp), @XMM[4] | ||
| 2309 | movdqu @XMM[1], 0x10($out) | ||
| 2310 | pxor 0x30(%rsp), @XMM[6] | ||
| 2311 | movdqu @XMM[4], 0x20($out) | ||
| 2312 | pxor 0x40(%rsp), @XMM[3] | ||
| 2313 | movdqu @XMM[6], 0x30($out) | ||
| 2314 | pxor 0x50(%rsp), @XMM[7] | ||
| 2315 | movdqu @XMM[3], 0x40($out) | ||
| 2316 | movdqu @XMM[7], 0x50($out) | ||
| 2317 | lea 0x60($out), $out | ||
| 2318 | |||
| 2319 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
| 2320 | jmp .Lxts_enc_done | ||
| 2321 | .align 16 | ||
# Five blocks remain: finish the xors for blocks 3 and 4.
| 2322 | .Lxts_enc_5: | ||
| 2323 | pxor @XMM[8+3], @XMM[3] | ||
| 2324 | lea 0x50($inp), $inp | ||
| 2325 | pxor @XMM[8+4], @XMM[4] | ||
| 2326 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2327 | mov %edx, %r10d # pass rounds | ||
| 2328 | |||
| 2329 | call _bsaes_encrypt8 | ||
| 2330 | |||
| 2331 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2332 | pxor 0x10(%rsp), @XMM[1] | ||
| 2333 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2334 | pxor 0x20(%rsp), @XMM[4] | ||
| 2335 | movdqu @XMM[1], 0x10($out) | ||
| 2336 | pxor 0x30(%rsp), @XMM[6] | ||
| 2337 | movdqu @XMM[4], 0x20($out) | ||
| 2338 | pxor 0x40(%rsp), @XMM[3] | ||
| 2339 | movdqu @XMM[6], 0x30($out) | ||
| 2340 | movdqu @XMM[3], 0x40($out) | ||
| 2341 | lea 0x50($out), $out | ||
| 2342 | |||
| 2343 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
| 2344 | jmp .Lxts_enc_done | ||
| 2345 | .align 16 | ||
# Four blocks remain: finish the xors for blocks 2 and 3.
| 2346 | .Lxts_enc_4: | ||
| 2347 | pxor @XMM[8+2], @XMM[2] | ||
| 2348 | lea 0x40($inp), $inp | ||
| 2349 | pxor @XMM[8+3], @XMM[3] | ||
| 2350 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2351 | mov %edx, %r10d # pass rounds | ||
| 2352 | |||
| 2353 | call _bsaes_encrypt8 | ||
| 2354 | |||
| 2355 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2356 | pxor 0x10(%rsp), @XMM[1] | ||
| 2357 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2358 | pxor 0x20(%rsp), @XMM[4] | ||
| 2359 | movdqu @XMM[1], 0x10($out) | ||
| 2360 | pxor 0x30(%rsp), @XMM[6] | ||
| 2361 | movdqu @XMM[4], 0x20($out) | ||
| 2362 | movdqu @XMM[6], 0x30($out) | ||
| 2363 | lea 0x40($out), $out | ||
| 2364 | |||
| 2365 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
| 2366 | jmp .Lxts_enc_done | ||
| 2367 | .align 16 | ||
# Three blocks remain: finish the xors for blocks 1 and 2.
| 2368 | .Lxts_enc_3: | ||
| 2369 | pxor @XMM[8+1], @XMM[1] | ||
| 2370 | lea 0x30($inp), $inp | ||
| 2371 | pxor @XMM[8+2], @XMM[2] | ||
| 2372 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2373 | mov %edx, %r10d # pass rounds | ||
| 2374 | |||
| 2375 | call _bsaes_encrypt8 | ||
| 2376 | |||
| 2377 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2378 | pxor 0x10(%rsp), @XMM[1] | ||
| 2379 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2380 | pxor 0x20(%rsp), @XMM[4] | ||
| 2381 | movdqu @XMM[1], 0x10($out) | ||
| 2382 | movdqu @XMM[4], 0x20($out) | ||
| 2383 | lea 0x30($out), $out | ||
| 2384 | |||
| 2385 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
| 2386 | jmp .Lxts_enc_done | ||
| 2387 | .align 16 | ||
# Two blocks remain: finish the xors for blocks 0 and 1.
| 2388 | .Lxts_enc_2: | ||
| 2389 | pxor @XMM[8+0], @XMM[0] | ||
| 2390 | lea 0x20($inp), $inp | ||
| 2391 | pxor @XMM[8+1], @XMM[1] | ||
| 2392 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2393 | mov %edx, %r10d # pass rounds | ||
| 2394 | |||
| 2395 | call _bsaes_encrypt8 | ||
| 2396 | |||
| 2397 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2398 | pxor 0x10(%rsp), @XMM[1] | ||
| 2399 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2400 | movdqu @XMM[1], 0x10($out) | ||
| 2401 | lea 0x20($out), $out | ||
| 2402 | |||
| 2403 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
| 2404 | jmp .Lxts_enc_done | ||
| 2405 | .align 16 | ||
# One block remains: skip the eight-way routine and run the single block
# through asm_AES_encrypt, using 0x20(%rbp) as an in-place buffer. XMM[0]
# keeps tweak[0] across the call, so the xor afterwards yields
# tweak ^ E(input ^ tweak). The commented-out lines are the bit-sliced
# alternative that this replaces.
| 2406 | .Lxts_enc_1: | ||
| 2407 | pxor @XMM[0], @XMM[8] | ||
| 2408 | lea 0x10($inp), $inp | ||
| 2409 | movdqa @XMM[8], 0x20(%rbp) | ||
| 2410 | lea 0x20(%rbp), $arg1 | ||
| 2411 | lea 0x20(%rbp), $arg2 | ||
| 2412 | lea ($key), $arg3 | ||
| 2413 | call asm_AES_encrypt # doesn't touch %xmm | ||
| 2414 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
| 2415 | #pxor @XMM[8], @XMM[0] | ||
| 2416 | #lea 0x80(%rsp), %rax # pass key schedule | ||
| 2417 | #mov %edx, %r10d # pass rounds | ||
| 2418 | #call _bsaes_encrypt8 | ||
| 2419 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2420 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2421 | lea 0x10($out), $out | ||
| 2422 | |||
| 2423 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
| 2424 | |||
# All whole blocks are written. The low four bits of the original length
# (kept in %rbx) give the size of a trailing partial block; zero means
# there is nothing left to do.
| 2425 | .Lxts_enc_done: | ||
| 2426 | and \$15, %ebx | ||
| 2427 | jz .Lxts_enc_ret | ||
| 2428 | mov $out, %rdx | ||
| 2429 | |||
# Ciphertext stealing, one byte per iteration: the leading bytes of the
# last full output block move forward to become the final partial output,
# and the remaining input bytes take their place in that block.
| 2430 | .Lxts_enc_steal: | ||
| 2431 | movzb ($inp), %eax | ||
| 2432 | movzb -16(%rdx), %ecx | ||
| 2433 | lea 1($inp), $inp | ||
| 2434 | mov %al, -16(%rdx) | ||
| 2435 | mov %cl, 0(%rdx) | ||
| 2436 | lea 1(%rdx), %rdx | ||
| 2437 | sub \$1,%ebx | ||
| 2438 | jnz .Lxts_enc_steal | ||
| 2439 | |||
# Re-encrypt the patched block in place with the tweak held in XMM[7]:
# xor with the tweak, single-block encrypt through the 0x20(%rbp) buffer,
# xor with the tweak again, and write it back over the last full block.
| 2440 | movdqu -16($out), @XMM[0] | ||
| 2441 | lea 0x20(%rbp), $arg1 | ||
| 2442 | pxor @XMM[7], @XMM[0] | ||
| 2443 | lea 0x20(%rbp), $arg2 | ||
| 2444 | movdqa @XMM[0], 0x20(%rbp) | ||
| 2445 | lea ($key), $arg3 | ||
| 2446 | call asm_AES_encrypt # doesn't touch %xmm | ||
| 2447 | pxor 0x20(%rbp), @XMM[7] | ||
| 2448 | movdqu @XMM[7], -16($out) | ||
| 2449 | |||
# Common exit: zero everything between %rsp and %rbp (the tweak area and
# the key schedule) in 32-byte steps before releasing the frame.
| 2450 | .Lxts_enc_ret: | ||
| 2451 | lea (%rsp), %rax | ||
| 2452 | pxor %xmm0, %xmm0 | ||
| 2453 | .Lxts_enc_bzero: # wipe key schedule [if any] | ||
| 2454 | movdqa %xmm0, 0x00(%rax) | ||
| 2455 | movdqa %xmm0, 0x10(%rax) | ||
| 2456 | lea 0x20(%rax), %rax | ||
| 2457 | cmp %rax, %rbp | ||
| 2458 | ja .Lxts_enc_bzero | ||
| 2459 | |||
| 2460 | lea (%rbp),%rsp # restore %rsp | ||
| 2461 | ___ | ||
# Win64 only: restore xmm6-xmm15 and release the extra 0xa0 bytes reserved
# in the prologue.
| 2462 | $code.=<<___ if ($win64); | ||
| 2463 | movaps 0x40(%rbp), %xmm6 | ||
| 2464 | movaps 0x50(%rbp), %xmm7 | ||
| 2465 | movaps 0x60(%rbp), %xmm8 | ||
| 2466 | movaps 0x70(%rbp), %xmm9 | ||
| 2467 | movaps 0x80(%rbp), %xmm10 | ||
| 2468 | movaps 0x90(%rbp), %xmm11 | ||
| 2469 | movaps 0xa0(%rbp), %xmm12 | ||
| 2470 | movaps 0xb0(%rbp), %xmm13 | ||
| 2471 | movaps 0xc0(%rbp), %xmm14 | ||
| 2472 | movaps 0xd0(%rbp), %xmm15 | ||
| 2473 | lea 0xa0(%rbp), %rsp | ||
| 2474 | ___ | ||
| 2475 | $code.=<<___; | ||
# Reload the registers pushed in the prologue. The saved %rbp travels
# through %rax so that %rsp can be adjusted before %rbp is overwritten.
| 2476 | mov 0x48(%rsp), %r15 | ||
| 2477 | mov 0x50(%rsp), %r14 | ||
| 2478 | mov 0x58(%rsp), %r13 | ||
| 2479 | mov 0x60(%rsp), %r12 | ||
| 2480 | mov 0x68(%rsp), %rbx | ||
| 2481 | mov 0x70(%rsp), %rax | ||
| 2482 | lea 0x78(%rsp), %rsp | ||
| 2483 | mov %rax, %rbp | ||
| 2484 | .Lxts_enc_epilogue: | ||
| 2485 | ret | ||
| 2486 | .size bsaes_xts_encrypt,.-bsaes_xts_encrypt | ||
| 2487 | |||
| 2488 | .globl bsaes_xts_decrypt | ||
| 2489 | .type bsaes_xts_decrypt,\@abi-omnipotent | ||
| 2490 | .align 16 | ||
| 2491 | bsaes_xts_decrypt: | ||
| 2492 | _CET_ENDBR | ||
| 2493 | mov %rsp, %rax | ||
| 2494 | .Lxts_dec_prologue: | ||
| 2495 | push %rbp | ||
| 2496 | push %rbx | ||
| 2497 | push %r12 | ||
| 2498 | push %r13 | ||
| 2499 | push %r14 | ||
| 2500 | push %r15 | ||
| 2501 | lea -0x48(%rsp), %rsp | ||
| 2502 | ___ | ||
| 2503 | $code.=<<___ if ($win64); | ||
| 2504 | mov 0xa0(%rsp),$arg5 # pull key2 | ||
| 2505 | mov 0xa8(%rsp),$arg6 # pull ivp | ||
| 2506 | lea -0xa0(%rsp), %rsp | ||
| 2507 | movaps %xmm6, 0x40(%rsp) | ||
| 2508 | movaps %xmm7, 0x50(%rsp) | ||
| 2509 | movaps %xmm8, 0x60(%rsp) | ||
| 2510 | movaps %xmm9, 0x70(%rsp) | ||
| 2511 | movaps %xmm10, 0x80(%rsp) | ||
| 2512 | movaps %xmm11, 0x90(%rsp) | ||
| 2513 | movaps %xmm12, 0xa0(%rsp) | ||
| 2514 | movaps %xmm13, 0xb0(%rsp) | ||
| 2515 | movaps %xmm14, 0xc0(%rsp) | ||
| 2516 | movaps %xmm15, 0xd0(%rsp) | ||
| 2517 | .Lxts_dec_body: | ||
| 2518 | ___ | ||
| 2519 | $code.=<<___; | ||
| 2520 | mov %rsp, %rbp # backup %rsp | ||
| 2521 | mov $arg1, $inp # backup arguments | ||
| 2522 | mov $arg2, $out | ||
| 2523 | mov $arg3, $len | ||
| 2524 | mov $arg4, $key | ||
| 2525 | |||
| 2526 | lea ($arg6), $arg1 | ||
| 2527 | lea 0x20(%rbp), $arg2 | ||
| 2528 | lea ($arg5), $arg3 | ||
| 2529 | call asm_AES_encrypt # generate initial tweak | ||
| 2530 | |||
| 2531 | mov 240($key), %eax # rounds | ||
| 2532 | mov $len, %rbx # backup $len | ||
| 2533 | |||
| 2534 | mov %eax, %edx # rounds | ||
| 2535 | shl \$7, %rax # 128 bytes per inner round key | ||
| 2536 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 2537 | sub %rax, %rsp | ||
| 2538 | |||
| 2539 | mov %rsp, %rax # pass key schedule | ||
| 2540 | mov $key, %rcx # pass key | ||
| 2541 | mov %edx, %r10d # pass rounds | ||
| 2542 | call _bsaes_key_convert | ||
| 2543 | pxor (%rsp), %xmm7 # fix up round 0 key | ||
| 2544 | movdqa %xmm6, (%rax) # save last round key | ||
| 2545 | movdqa %xmm7, (%rsp) | ||
| 2546 | |||
| 2547 | xor %eax, %eax # if ($len%16) len-=16; | ||
| 2548 | and \$-16, $len | ||
| 2549 | test \$15, %ebx | ||
| 2550 | setnz %al | ||
| 2551 | shl \$4, %rax | ||
| 2552 | sub %rax, $len | ||
| 2553 | |||
| 2554 | sub \$0x80, %rsp # place for tweak[8] | ||
| 2555 | movdqa 0x20(%rbp), @XMM[7] # initial tweak | ||
| 2556 | |||
| 2557 | pxor $twtmp, $twtmp | ||
| 2558 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2559 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2560 | |||
| 2561 | sub \$0x80, $len | ||
| 2562 | jc .Lxts_dec_short | ||
| 2563 | jmp .Lxts_dec_loop | ||
| 2564 | |||
| 2565 | .align 16 | ||
| 2566 | .Lxts_dec_loop: | ||
| 2567 | ___ | ||
| 2568 | for ($i=0;$i<7;$i++) { | ||
| 2569 | $code.=<<___; | ||
| 2570 | pshufd \$0x13, $twtmp, $twres | ||
| 2571 | pxor $twtmp, $twtmp | ||
| 2572 | movdqa @XMM[7], @XMM[$i] | ||
| 2573 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2574 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2575 | pand $twmask, $twres # isolate carry and residue | ||
| 2576 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2577 | pxor $twres, @XMM[7] | ||
| 2578 | ___ | ||
| 2579 | $code.=<<___ if ($i>=1); | ||
| 2580 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2581 | ___ | ||
| 2582 | $code.=<<___ if ($i>=2); | ||
| 2583 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2584 | ___ | ||
| 2585 | } | ||
| 2586 | $code.=<<___; | ||
| 2587 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2588 | pxor @XMM[8+5], @XMM[5] | ||
| 2589 | movdqu 0x70($inp), @XMM[8+7] | ||
| 2590 | lea 0x80($inp), $inp | ||
| 2591 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2592 | pxor @XMM[8+6], @XMM[6] | ||
| 2593 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2594 | pxor @XMM[8+7], @XMM[7] | ||
| 2595 | mov %edx, %r10d # pass rounds | ||
| 2596 | |||
| 2597 | call _bsaes_decrypt8 | ||
| 2598 | |||
| 2599 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2600 | pxor 0x10(%rsp), @XMM[1] | ||
| 2601 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2602 | pxor 0x20(%rsp), @XMM[6] | ||
| 2603 | movdqu @XMM[1], 0x10($out) | ||
| 2604 | pxor 0x30(%rsp), @XMM[4] | ||
| 2605 | movdqu @XMM[6], 0x20($out) | ||
| 2606 | pxor 0x40(%rsp), @XMM[2] | ||
| 2607 | movdqu @XMM[4], 0x30($out) | ||
| 2608 | pxor 0x50(%rsp), @XMM[7] | ||
| 2609 | movdqu @XMM[2], 0x40($out) | ||
| 2610 | pxor 0x60(%rsp), @XMM[3] | ||
| 2611 | movdqu @XMM[7], 0x50($out) | ||
| 2612 | pxor 0x70(%rsp), @XMM[5] | ||
| 2613 | movdqu @XMM[3], 0x60($out) | ||
| 2614 | movdqu @XMM[5], 0x70($out) | ||
| 2615 | lea 0x80($out), $out | ||
| 2616 | |||
| 2617 | movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak | ||
| 2618 | pxor $twtmp, $twtmp | ||
| 2619 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2620 | pcmpgtd @XMM[7], $twtmp | ||
| 2621 | pshufd \$0x13, $twtmp, $twres | ||
| 2622 | pxor $twtmp, $twtmp | ||
| 2623 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2624 | pand $twmask, $twres # isolate carry and residue | ||
| 2625 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2626 | pxor $twres, @XMM[7] | ||
| 2627 | |||
| 2628 | sub \$0x80,$len | ||
| 2629 | jnc .Lxts_dec_loop | ||
| 2630 | |||
| 2631 | .Lxts_dec_short: | ||
| 2632 | add \$0x80, $len | ||
| 2633 | jz .Lxts_dec_done | ||
| 2634 | ___ | ||
| 2635 | for ($i=0;$i<7;$i++) { | ||
| 2636 | $code.=<<___; | ||
| 2637 | pshufd \$0x13, $twtmp, $twres | ||
| 2638 | pxor $twtmp, $twtmp | ||
| 2639 | movdqa @XMM[7], @XMM[$i] | ||
| 2640 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2641 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2642 | pand $twmask, $twres # isolate carry and residue | ||
| 2643 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2644 | pxor $twres, @XMM[7] | ||
| 2645 | ___ | ||
| 2646 | $code.=<<___ if ($i>=1); | ||
| 2647 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2648 | cmp \$`0x10*$i`,$len | ||
| 2649 | je .Lxts_dec_$i | ||
| 2650 | ___ | ||
| 2651 | $code.=<<___ if ($i>=2); | ||
| 2652 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2653 | ___ | ||
| 2654 | } | ||
| 2655 | $code.=<<___; | ||
| 2656 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2657 | pxor @XMM[8+5], @XMM[5] | ||
| 2658 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2659 | lea 0x70($inp), $inp | ||
| 2660 | pxor @XMM[8+6], @XMM[6] | ||
| 2661 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2662 | mov %edx, %r10d # pass rounds | ||
| 2663 | |||
| 2664 | call _bsaes_decrypt8 | ||
| 2665 | |||
| 2666 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2667 | pxor 0x10(%rsp), @XMM[1] | ||
| 2668 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2669 | pxor 0x20(%rsp), @XMM[6] | ||
| 2670 | movdqu @XMM[1], 0x10($out) | ||
| 2671 | pxor 0x30(%rsp), @XMM[4] | ||
| 2672 | movdqu @XMM[6], 0x20($out) | ||
| 2673 | pxor 0x40(%rsp), @XMM[2] | ||
| 2674 | movdqu @XMM[4], 0x30($out) | ||
| 2675 | pxor 0x50(%rsp), @XMM[7] | ||
| 2676 | movdqu @XMM[2], 0x40($out) | ||
| 2677 | pxor 0x60(%rsp), @XMM[3] | ||
| 2678 | movdqu @XMM[7], 0x50($out) | ||
| 2679 | movdqu @XMM[3], 0x60($out) | ||
| 2680 | lea 0x70($out), $out | ||
| 2681 | |||
| 2682 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
| 2683 | jmp .Lxts_dec_done | ||
| 2684 | .align 16 | ||
| 2685 | .Lxts_dec_6: | ||
| 2686 | pxor @XMM[8+4], @XMM[4] | ||
| 2687 | lea 0x60($inp), $inp | ||
| 2688 | pxor @XMM[8+5], @XMM[5] | ||
| 2689 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2690 | mov %edx, %r10d # pass rounds | ||
| 2691 | |||
| 2692 | call _bsaes_decrypt8 | ||
| 2693 | |||
| 2694 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2695 | pxor 0x10(%rsp), @XMM[1] | ||
| 2696 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2697 | pxor 0x20(%rsp), @XMM[6] | ||
| 2698 | movdqu @XMM[1], 0x10($out) | ||
| 2699 | pxor 0x30(%rsp), @XMM[4] | ||
| 2700 | movdqu @XMM[6], 0x20($out) | ||
| 2701 | pxor 0x40(%rsp), @XMM[2] | ||
| 2702 | movdqu @XMM[4], 0x30($out) | ||
| 2703 | pxor 0x50(%rsp), @XMM[7] | ||
| 2704 | movdqu @XMM[2], 0x40($out) | ||
| 2705 | movdqu @XMM[7], 0x50($out) | ||
| 2706 | lea 0x60($out), $out | ||
| 2707 | |||
| 2708 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
| 2709 | jmp .Lxts_dec_done | ||
| 2710 | .align 16 | ||
| 2711 | .Lxts_dec_5: | ||
| 2712 | pxor @XMM[8+3], @XMM[3] | ||
| 2713 | lea 0x50($inp), $inp | ||
| 2714 | pxor @XMM[8+4], @XMM[4] | ||
| 2715 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2716 | mov %edx, %r10d # pass rounds | ||
| 2717 | |||
| 2718 | call _bsaes_decrypt8 | ||
| 2719 | |||
| 2720 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2721 | pxor 0x10(%rsp), @XMM[1] | ||
| 2722 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2723 | pxor 0x20(%rsp), @XMM[6] | ||
| 2724 | movdqu @XMM[1], 0x10($out) | ||
| 2725 | pxor 0x30(%rsp), @XMM[4] | ||
| 2726 | movdqu @XMM[6], 0x20($out) | ||
| 2727 | pxor 0x40(%rsp), @XMM[2] | ||
| 2728 | movdqu @XMM[4], 0x30($out) | ||
| 2729 | movdqu @XMM[2], 0x40($out) | ||
| 2730 | lea 0x50($out), $out | ||
| 2731 | |||
| 2732 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
| 2733 | jmp .Lxts_dec_done | ||
| 2734 | .align 16 | ||
| 2735 | .Lxts_dec_4: | ||
| 2736 | pxor @XMM[8+2], @XMM[2] | ||
| 2737 | lea 0x40($inp), $inp | ||
| 2738 | pxor @XMM[8+3], @XMM[3] | ||
| 2739 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2740 | mov %edx, %r10d # pass rounds | ||
| 2741 | |||
| 2742 | call _bsaes_decrypt8 | ||
| 2743 | |||
| 2744 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2745 | pxor 0x10(%rsp), @XMM[1] | ||
| 2746 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2747 | pxor 0x20(%rsp), @XMM[6] | ||
| 2748 | movdqu @XMM[1], 0x10($out) | ||
| 2749 | pxor 0x30(%rsp), @XMM[4] | ||
| 2750 | movdqu @XMM[6], 0x20($out) | ||
| 2751 | movdqu @XMM[4], 0x30($out) | ||
| 2752 | lea 0x40($out), $out | ||
| 2753 | |||
| 2754 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
| 2755 | jmp .Lxts_dec_done | ||
| 2756 | .align 16 | ||
| 2757 | .Lxts_dec_3: | ||
| 2758 | pxor @XMM[8+1], @XMM[1] | ||
| 2759 | lea 0x30($inp), $inp | ||
| 2760 | pxor @XMM[8+2], @XMM[2] | ||
| 2761 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2762 | mov %edx, %r10d # pass rounds | ||
| 2763 | |||
| 2764 | call _bsaes_decrypt8 | ||
| 2765 | |||
| 2766 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2767 | pxor 0x10(%rsp), @XMM[1] | ||
| 2768 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2769 | pxor 0x20(%rsp), @XMM[6] | ||
| 2770 | movdqu @XMM[1], 0x10($out) | ||
| 2771 | movdqu @XMM[6], 0x20($out) | ||
| 2772 | lea 0x30($out), $out | ||
| 2773 | |||
| 2774 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
| 2775 | jmp .Lxts_dec_done | ||
| 2776 | .align 16 | ||
| 2777 | .Lxts_dec_2: | ||
| 2778 | pxor @XMM[8+0], @XMM[0] | ||
| 2779 | lea 0x20($inp), $inp | ||
| 2780 | pxor @XMM[8+1], @XMM[1] | ||
| 2781 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2782 | mov %edx, %r10d # pass rounds | ||
| 2783 | |||
| 2784 | call _bsaes_decrypt8 | ||
| 2785 | |||
| 2786 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2787 | pxor 0x10(%rsp), @XMM[1] | ||
| 2788 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2789 | movdqu @XMM[1], 0x10($out) | ||
| 2790 | lea 0x20($out), $out | ||
| 2791 | |||
| 2792 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
| 2793 | jmp .Lxts_dec_done | ||
| 2794 | .align 16 | ||
| 2795 | .Lxts_dec_1: | ||
| 2796 | pxor @XMM[0], @XMM[8] | ||
| 2797 | lea 0x10($inp), $inp | ||
| 2798 | movdqa @XMM[8], 0x20(%rbp) | ||
| 2799 | lea 0x20(%rbp), $arg1 | ||
| 2800 | lea 0x20(%rbp), $arg2 | ||
| 2801 | lea ($key), $arg3 | ||
| 2802 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 2803 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
| 2804 | #pxor @XMM[8], @XMM[0] | ||
| 2805 | #lea 0x80(%rsp), %rax # pass key schedule | ||
| 2806 | #mov %edx, %r10d # pass rounds | ||
| 2807 | #call _bsaes_decrypt8 | ||
| 2808 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2809 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2810 | lea 0x10($out), $out | ||
| 2811 | |||
| 2812 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
| 2813 | |||
| 2814 | .Lxts_dec_done: | ||
| 2815 | and \$15, %ebx | ||
| 2816 | jz .Lxts_dec_ret | ||
| 2817 | |||
| 2818 | pxor $twtmp, $twtmp | ||
| 2819 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2820 | pcmpgtd @XMM[7], $twtmp | ||
| 2821 | pshufd \$0x13, $twtmp, $twres | ||
| 2822 | movdqa @XMM[7], @XMM[6] | ||
| 2823 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2824 | pand $twmask, $twres # isolate carry and residue | ||
| 2825 | movdqu ($inp), @XMM[0] | ||
| 2826 | pxor $twres, @XMM[7] | ||
| 2827 | |||
| 2828 | lea 0x20(%rbp), $arg1 | ||
| 2829 | pxor @XMM[7], @XMM[0] | ||
| 2830 | lea 0x20(%rbp), $arg2 | ||
| 2831 | movdqa @XMM[0], 0x20(%rbp) | ||
| 2832 | lea ($key), $arg3 | ||
| 2833 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 2834 | pxor 0x20(%rbp), @XMM[7] | ||
| 2835 | mov $out, %rdx | ||
| 2836 | movdqu @XMM[7], ($out) | ||
| 2837 | |||
| 2838 | .Lxts_dec_steal: | ||
| 2839 | movzb 16($inp), %eax | ||
| 2840 | movzb (%rdx), %ecx | ||
| 2841 | lea 1($inp), $inp | ||
| 2842 | mov %al, (%rdx) | ||
| 2843 | mov %cl, 16(%rdx) | ||
| 2844 | lea 1(%rdx), %rdx | ||
| 2845 | sub \$1,%ebx | ||
| 2846 | jnz .Lxts_dec_steal | ||
| 2847 | |||
| 2848 | movdqu ($out), @XMM[0] | ||
| 2849 | lea 0x20(%rbp), $arg1 | ||
| 2850 | pxor @XMM[6], @XMM[0] | ||
| 2851 | lea 0x20(%rbp), $arg2 | ||
| 2852 | movdqa @XMM[0], 0x20(%rbp) | ||
| 2853 | lea ($key), $arg3 | ||
| 2854 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 2855 | pxor 0x20(%rbp), @XMM[6] | ||
| 2856 | movdqu @XMM[6], ($out) | ||
| 2857 | |||
| 2858 | .Lxts_dec_ret: | ||
| 2859 | lea (%rsp), %rax | ||
| 2860 | pxor %xmm0, %xmm0 | ||
| 2861 | .Lxts_dec_bzero: # wipe key schedule [if any] | ||
| 2862 | movdqa %xmm0, 0x00(%rax) | ||
| 2863 | movdqa %xmm0, 0x10(%rax) | ||
| 2864 | lea 0x20(%rax), %rax | ||
| 2865 | cmp %rax, %rbp | ||
| 2866 | ja .Lxts_dec_bzero | ||
| 2867 | |||
| 2868 | lea (%rbp),%rsp # restore %rsp | ||
| 2869 | ___ | ||
| 2870 | $code.=<<___ if ($win64); | ||
| 2871 | movaps 0x40(%rbp), %xmm6 | ||
| 2872 | movaps 0x50(%rbp), %xmm7 | ||
| 2873 | movaps 0x60(%rbp), %xmm8 | ||
| 2874 | movaps 0x70(%rbp), %xmm9 | ||
| 2875 | movaps 0x80(%rbp), %xmm10 | ||
| 2876 | movaps 0x90(%rbp), %xmm11 | ||
| 2877 | movaps 0xa0(%rbp), %xmm12 | ||
| 2878 | movaps 0xb0(%rbp), %xmm13 | ||
| 2879 | movaps 0xc0(%rbp), %xmm14 | ||
| 2880 | movaps 0xd0(%rbp), %xmm15 | ||
| 2881 | lea 0xa0(%rbp), %rsp | ||
| 2882 | ___ | ||
| 2883 | $code.=<<___; | ||
| 2884 | mov 0x48(%rsp), %r15 | ||
| 2885 | mov 0x50(%rsp), %r14 | ||
| 2886 | mov 0x58(%rsp), %r13 | ||
| 2887 | mov 0x60(%rsp), %r12 | ||
| 2888 | mov 0x68(%rsp), %rbx | ||
| 2889 | mov 0x70(%rsp), %rax | ||
| 2890 | lea 0x78(%rsp), %rsp | ||
| 2891 | mov %rax, %rbp | ||
| 2892 | .Lxts_dec_epilogue: | ||
| 2893 | ret | ||
| 2894 | .size bsaes_xts_decrypt,.-bsaes_xts_decrypt | ||
| 2895 | ___ | ||
| 2896 | } | ||
| | # Append the read-only constant pool (_bsaes_const) to the generated code. | ||
| | # It is emitted into .rodata with 64-byte alignment, and the heredoc ends by | ||
| | # switching back to .text. The assembly comments inside name each group: | ||
| | # (Inv)ShiftRows shuffle masks, bit-slice masks, the byte-swap mask and the | ||
| | # .LADD1-.LADD8 counter increments. .Lxts_magic is the constant the XTS | ||
| | # tweak-update code loads as its mask; .Lmasks, .LM0 and .L63 are referenced | ||
| | # outside this part of the file. | ||
| 2897 | $code.=<<___; | ||
| 2898 | .section .rodata | ||
| 2899 | .type _bsaes_const,\@object | ||
| 2900 | .align 64 | ||
| 2901 | _bsaes_const: | ||
| 2902 | .LM0ISR: # InvShiftRows constants | ||
| 2903 | .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 | ||
| 2904 | .LISRM0: | ||
| 2905 | .quad 0x01040b0e0205080f, 0x0306090c00070a0d | ||
| 2906 | .LISR: | ||
| 2907 | .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 | ||
| 2908 | .LBS0: # bit-slice constants | ||
| 2909 | .quad 0x5555555555555555, 0x5555555555555555 | ||
| 2910 | .LBS1: | ||
| 2911 | .quad 0x3333333333333333, 0x3333333333333333 | ||
| 2912 | .LBS2: | ||
| 2913 | .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f | ||
| 2914 | .LSR: # shiftrows constants | ||
| 2915 | .quad 0x0504070600030201, 0x0f0e0d0c0a09080b | ||
| 2916 | .LSRM0: | ||
| 2917 | .quad 0x0304090e00050a0f, 0x01060b0c0207080d | ||
| 2918 | .LM0SR: | ||
| 2919 | .quad 0x0a0e02060f03070b, 0x0004080c05090d01 | ||
| 2920 | .LSWPUP: # byte-swap upper dword | ||
| 2921 | .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 | ||
| 2922 | .LSWPUPM0SR: | ||
| 2923 | .quad 0x0a0d02060c03070b, 0x0004080f05090e01 | ||
| 2924 | .LADD1: # counter increment constants | ||
| 2925 | .quad 0x0000000000000000, 0x0000000100000000 | ||
| 2926 | .LADD2: | ||
| 2927 | .quad 0x0000000000000000, 0x0000000200000000 | ||
| 2928 | .LADD3: | ||
| 2929 | .quad 0x0000000000000000, 0x0000000300000000 | ||
| 2930 | .LADD4: | ||
| 2931 | .quad 0x0000000000000000, 0x0000000400000000 | ||
| 2932 | .LADD5: | ||
| 2933 | .quad 0x0000000000000000, 0x0000000500000000 | ||
| 2934 | .LADD6: | ||
| 2935 | .quad 0x0000000000000000, 0x0000000600000000 | ||
| 2936 | .LADD7: | ||
| 2937 | .quad 0x0000000000000000, 0x0000000700000000 | ||
| 2938 | .LADD8: | ||
| 2939 | .quad 0x0000000000000000, 0x0000000800000000 | ||
| 2940 | .Lxts_magic: | ||
| 2941 | .long 0x87,0,1,0 | ||
| 2942 | .Lmasks: | ||
| 2943 | .quad 0x0101010101010101, 0x0101010101010101 | ||
| 2944 | .quad 0x0202020202020202, 0x0202020202020202 | ||
| 2945 | .quad 0x0404040404040404, 0x0404040404040404 | ||
| 2946 | .quad 0x0808080808080808, 0x0808080808080808 | ||
| 2947 | .LM0: | ||
| 2948 | .quad 0x02060a0e03070b0f, 0x0004080c0105090d | ||
| 2949 | .L63: | ||
| 2950 | .quad 0x6363636363636363, 0x6363636363636363 | ||
| 2951 | .align 64 | ||
| 2952 | .size _bsaes_const,.-_bsaes_const | ||
| 2953 | .text | ||
| 2954 | ___ | ||
| 2955 | |||
| 2956 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 2957 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| | # | ||
| | # Win64-only: emits se_handler, the handler named by every .xdata record | ||
| | # below, followed by the .pdata/.xdata tables themselves. HandlerData[] | ||
| | # for each function is the pair (body label, epilogue label). | ||
| | # - context->Rip below HandlerData[0]: context->Rax is stored as | ||
| | # context->Rsp and nothing else is restored. | ||
| | # - context->Rip at or past HandlerData[1]: context->Rsp is kept as is. | ||
| | # - otherwise: the ten saved %xmm registers are copied from 0x40(Rbp) to | ||
| | # &context.Xmm6, then %rbx/%rbp/%r12-%r15 are reloaded from the frame | ||
| | # and written back into the CONTEXT together with the adjusted Rsp. | ||
| | # In all cases the CONTEXT is then copied to disp->ContextRecord, | ||
| | # RtlVirtualUnwind is called, and ExceptionContinueSearch (1) is returned. | ||
| 2958 | if ($win64) { | ||
| 2959 | $rec="%rcx"; | ||
| 2960 | $frame="%rdx"; | ||
| 2961 | $context="%r8"; | ||
| 2962 | $disp="%r9"; | ||
| 2963 | |||
| 2964 | $code.=<<___; | ||
| 2965 | .extern __imp_RtlVirtualUnwind | ||
| 2966 | .type se_handler,\@abi-omnipotent | ||
| 2967 | .align 16 | ||
| 2968 | se_handler: | ||
| 2969 | _CET_ENDBR | ||
| 2970 | push %rsi | ||
| 2971 | push %rdi | ||
| 2972 | push %rbx | ||
| 2973 | push %rbp | ||
| 2974 | push %r12 | ||
| 2975 | push %r13 | ||
| 2976 | push %r14 | ||
| 2977 | push %r15 | ||
| 2978 | pushfq | ||
| 2979 | sub \$64,%rsp | ||
| 2980 | |||
| 2981 | mov 120($context),%rax # pull context->Rax | ||
| 2982 | mov 248($context),%rbx # pull context->Rip | ||
| 2983 | |||
| 2984 | mov 8($disp),%rsi # disp->ImageBase | ||
| 2985 | mov 56($disp),%r11 # disp->HandlerData | ||
| 2986 | |||
| 2987 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 2988 | lea (%rsi,%r10),%r10 # prologue label | ||
| 2989 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 2990 | jb .Lin_prologue | ||
| 2991 | |||
| 2992 | mov 152($context),%rax # pull context->Rsp | ||
| 2993 | |||
| 2994 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 2995 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 2996 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 2997 | jae .Lin_prologue | ||
| 2998 | |||
| 2999 | mov 160($context),%rax # pull context->Rbp | ||
| 3000 | |||
| 3001 | lea 0x40(%rax),%rsi # %xmm save area | ||
| 3002 | lea 512($context),%rdi # &context.Xmm6 | ||
| 3003 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
| 3004 | .long 0xa548f3fc # cld; rep movsq | ||
| 3005 | lea 0xa0(%rax),%rax # adjust stack pointer | ||
| 3006 | |||
| 3007 | mov 0x70(%rax),%rbp | ||
| 3008 | mov 0x68(%rax),%rbx | ||
| 3009 | mov 0x60(%rax),%r12 | ||
| 3010 | mov 0x58(%rax),%r13 | ||
| 3011 | mov 0x50(%rax),%r14 | ||
| 3012 | mov 0x48(%rax),%r15 | ||
| 3013 | lea 0x78(%rax),%rax # adjust stack pointer | ||
| 3014 | mov %rbx,144($context) # restore context->Rbx | ||
| 3015 | mov %rbp,160($context) # restore context->Rbp | ||
| 3016 | mov %r12,216($context) # restore context->R12 | ||
| 3017 | mov %r13,224($context) # restore context->R13 | ||
| 3018 | mov %r14,232($context) # restore context->R14 | ||
| 3019 | mov %r15,240($context) # restore context->R15 | ||
| 3020 | |||
| 3021 | .Lin_prologue: | ||
| 3022 | mov %rax,152($context) # restore context->Rsp | ||
| 3023 | |||
| 3024 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 3025 | mov $context,%rsi # context | ||
| 3026 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
| 3027 | .long 0xa548f3fc # cld; rep movsq | ||
| 3028 | |||
| 3029 | mov $disp,%rsi | ||
| 3030 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 3031 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 3032 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 3033 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 3034 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 3035 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 3036 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 3037 | mov %r10,32(%rsp) # arg5 | ||
| 3038 | mov %r11,40(%rsp) # arg6 | ||
| 3039 | mov %r12,48(%rsp) # arg7 | ||
| 3040 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 3041 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 3042 | |||
| 3043 | mov \$1,%eax # ExceptionContinueSearch | ||
| 3044 | add \$64,%rsp | ||
| 3045 | popfq | ||
| 3046 | pop %r15 | ||
| 3047 | pop %r14 | ||
| 3048 | pop %r13 | ||
| 3049 | pop %r12 | ||
| 3050 | pop %rbp | ||
| 3051 | pop %rbx | ||
| 3052 | pop %rdi | ||
| 3053 | pop %rsi | ||
| 3054 | ret | ||
| 3055 | .size se_handler,.-se_handler | ||
| 3056 | |||
| 3057 | .section .pdata | ||
| 3058 | .align 4 | ||
| 3059 | ___ | ||
| 3060 | $code.=<<___ if ($ecb); | ||
| 3061 | .rva .Lecb_enc_prologue | ||
| 3062 | .rva .Lecb_enc_epilogue | ||
| 3063 | .rva .Lecb_enc_info | ||
| 3064 | |||
| 3065 | .rva .Lecb_dec_prologue | ||
| 3066 | .rva .Lecb_dec_epilogue | ||
| 3067 | .rva .Lecb_dec_info | ||
| 3068 | ___ | ||
| 3069 | $code.=<<___; | ||
| 3070 | .rva .Lcbc_dec_prologue | ||
| 3071 | .rva .Lcbc_dec_epilogue | ||
| 3072 | .rva .Lcbc_dec_info | ||
| 3073 | |||
| 3074 | .rva .Lctr_enc_prologue | ||
| 3075 | .rva .Lctr_enc_epilogue | ||
| 3076 | .rva .Lctr_enc_info | ||
| 3077 | |||
| 3078 | .rva .Lxts_enc_prologue | ||
| 3079 | .rva .Lxts_enc_epilogue | ||
| 3080 | .rva .Lxts_enc_info | ||
| 3081 | |||
| 3082 | .rva .Lxts_dec_prologue | ||
| 3083 | .rva .Lxts_dec_epilogue | ||
| 3084 | .rva .Lxts_dec_info | ||
| 3085 | |||
| 3086 | .section .xdata | ||
| 3087 | .align 8 | ||
| 3088 | ___ | ||
| 3089 | $code.=<<___ if ($ecb); | ||
| 3090 | .Lecb_enc_info: | ||
| 3091 | .byte 9,0,0,0 | ||
| 3092 | .rva se_handler | ||
| 3093 | .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] | ||
| 3094 | .Lecb_dec_info: | ||
| 3095 | .byte 9,0,0,0 | ||
| 3096 | .rva se_handler | ||
| 3097 | .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] | ||
| 3098 | ___ | ||
| 3099 | $code.=<<___; | ||
| 3100 | .Lcbc_dec_info: | ||
| 3101 | .byte 9,0,0,0 | ||
| 3102 | .rva se_handler | ||
| 3103 | .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] | ||
| 3104 | .Lctr_enc_info: | ||
| 3105 | .byte 9,0,0,0 | ||
| 3106 | .rva se_handler | ||
| 3107 | .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] | ||
| 3108 | .Lxts_enc_info: | ||
| 3109 | .byte 9,0,0,0 | ||
| 3110 | .rva se_handler | ||
| 3111 | .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] | ||
| 3112 | .Lxts_dec_info: | ||
| 3113 | .byte 9,0,0,0 | ||
| 3114 | .rva se_handler | ||
| 3115 | .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] | ||
| 3116 | ___ | ||
| 3117 | } | ||
| 3118 | |||
| | # Expand every `...` span left in the generated text by evaluating it as a | ||
| | # Perl expression (the computed stack offsets and sizes written in backticks | ||
| | # inside the heredocs) and substituting the result. | ||
| 3119 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 3120 | |||
| 3121 | print $code; | ||
| 3122 | |||
| | # STDOUT is buffered, so a failed write (full disk, closed pipe) may only be | ||
| | # reported when the handle is closed. Die in that case instead of exiting | ||
| | # successfully with truncated assembly output. | ||
| 3123 | close STDOUT or die "error closing STDOUT: $!"; | ||
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl deleted file mode 100644 index 6e7bd36d05..0000000000 --- a/src/lib/libcrypto/aes/asm/vpaes-x86.pl +++ /dev/null | |||
| @@ -1,911 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | ###################################################################### | ||
| 4 | ## Constant-time SSSE3 AES core implementation. | ||
| 5 | ## version 0.1 | ||
| 6 | ## | ||
| 7 | ## By Mike Hamburg (Stanford University), 2009 | ||
| 8 | ## Public domain. | ||
| 9 | ## | ||
| 10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
| 11 | ## http://crypto.stanford.edu/vpaes/. | ||
| 12 | |||
| 13 | ###################################################################### | ||
| 14 | # September 2011. | ||
| 15 | # | ||
| 16 | # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for | ||
| 17 | # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
| 18 | # doesn't handle partial vectors (doesn't have to if called from | ||
| 19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
| 20 | # schedule structure with the original nor does it make any assumption | ||
| 21 | # about its alignment... | ||
| 22 | # | ||
| 23 | # Performance summary. aes-586.pl column lists large-block CBC | ||
| 24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
| 25 | # byte processed with 128-bit key, and vpaes-x86.pl column - [also | ||
| 26 | # large-block CBC] encrypt/decrypt. | ||
| 27 | # | ||
| 28 | # aes-586.pl vpaes-x86.pl | ||
| 29 | # | ||
| 30 | # Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) | ||
| 31 | # Nehalem 27.9/40.4/18.1 10.3/12.0 | ||
| 32 | # Atom 102./119./60.1 64.5/85.3(***) | ||
| 33 | # | ||
| 34 | # (*) "Hyper-threading" in this context refers to cache being shared | ||
| 35 | # among multiple cores rather than specifically to Intel HTT. As the | ||
| 36 | # vast majority of contemporary cores share cache, the slower code | ||
| 37 | # path is commonplace. In other words, "with-hyper-threading-off" | ||
| 38 | # results are presented mostly for reference purposes. | ||
| 39 | # | ||
| 40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
| 41 | # | ||
| 42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
| 43 | # pshufb, yet it's respectable +32%/65% improvement on Core 2 | ||
| 44 | # and +58%/40% on Atom (as implied, over "hyper-threading-safe" | ||
| 45 | # code path). | ||
| 46 | # | ||
| 47 | # <appro@openssl.org> | ||
| 48 | |||
| | # Take the directory part of this script's path ($0) and add it, plus its | ||
| | # ../../perlasm sibling, to @INC so that x86asm.pl can be loaded. | ||
| 49 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 50 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 51 | require "x86asm.pl"; | ||
| 52 | |||
| | # $x86only is true when the last command-line argument is the string "386". | ||
| 53 | &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); | ||
| 54 | |||
| 55 | $PREFIX="vpaes"; | ||
| 56 | |||
| | # Symbolic names for the general-purpose registers used by the code below. | ||
| 57 | my ($round, $base, $magic, $key, $const, $inp, $out)= | ||
| 58 | ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); | ||
| 59 | |||
| | # Constant tables, emitted between &rodataseg() and &previous() under the | ||
| | # 64-aligned label _vpaes_consts. Each $k_* variable is the byte offset of | ||
| | # its table relative to $k_ipt (0x00); tables placed before k_ipt therefore | ||
| | # have negative offsets. The offsets advance by 0x10 per &data_word() row, | ||
| | # and the code below addresses tables as &QWP($k_xxx,$const). | ||
| | # NOTE(review): this presumes $const is loaded with the address of the | ||
| | # k_ipt table; that setup is outside this part of the file - confirm. | ||
| 60 | &rodataseg(); | ||
| 61 | &static_label("_vpaes_consts"); | ||
| 62 | &static_label("_vpaes_schedule_low_round"); | ||
| 63 | |||
| 64 | &set_label("_vpaes_consts",64); | ||
| 65 | $k_inv=-0x30; # inv, inva | ||
| 66 | &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); | ||
| 67 | &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); | ||
| 68 | |||
| 69 | $k_s0F=-0x10; # s0F | ||
| 70 | &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); | ||
| 71 | |||
| 72 | $k_ipt=0x00; # input transform (lo, hi) | ||
| 73 | &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); | ||
| 74 | &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); | ||
| 75 | |||
| 76 | $k_sb1=0x20; # sb1u, sb1t | ||
| 77 | &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); | ||
| 78 | &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); | ||
| 79 | $k_sb2=0x40; # sb2u, sb2t | ||
| 80 | &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); | ||
| 81 | &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); | ||
| 82 | $k_sbo=0x60; # sbou, sbot | ||
| 83 | &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); | ||
| 84 | &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); | ||
| 85 | |||
| 86 | $k_mc_forward=0x80; # mc_forward | ||
| 87 | &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); | ||
| 88 | &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); | ||
| 89 | &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); | ||
| 90 | &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); | ||
| 91 | |||
| 92 | $k_mc_backward=0xc0; # mc_backward | ||
| 93 | &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); | ||
| 94 | &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); | ||
| 95 | &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); | ||
| 96 | &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); | ||
| 97 | |||
| 98 | $k_sr=0x100; # sr | ||
| 99 | &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); | ||
| 100 | &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); | ||
| 101 | &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); | ||
| 102 | &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); | ||
| 103 | |||
| 104 | $k_rcon=0x140; # rcon | ||
| 105 | &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); | ||
| 106 | |||
| 107 | $k_s63=0x150; # s63: all equal to 0x63 transformed | ||
| 108 | &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); | ||
| 109 | |||
| 110 | $k_opt=0x160; # output transform | ||
| 111 | &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); | ||
| 112 | &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); | ||
| 113 | |||
| 114 | $k_deskew=0x180; # deskew tables: inverts the sbox's "skew" | ||
| 115 | &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); | ||
| 116 | &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); | ||
| 117 | ## | ||
| 118 | ## Decryption stuff | ||
| 119 | ## Key schedule constants | ||
| 120 | ## | ||
| 121 | $k_dksd=0x1a0; # decryption key schedule: invskew x*D | ||
| 122 | &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4); | ||
| 123 | &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA); | ||
| 124 | $k_dksb=0x1c0; # decryption key schedule: invskew x*B | ||
| 125 | &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386); | ||
| 126 | &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F); | ||
| 127 | $k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63 | ||
| 128 | &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C); | ||
| 129 | &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A); | ||
| 130 | $k_dks9=0x200; # decryption key schedule: invskew x*9 | ||
| 131 | &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334); | ||
| 132 | &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC); | ||
| 133 | |||
| 134 | ## | ||
| 135 | ## Decryption stuff | ||
| 136 | ## Round function constants | ||
| 137 | ## | ||
| 138 | $k_dipt=0x220; # decryption input transform | ||
| 139 | &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E); | ||
| 140 | &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772); | ||
| 141 | |||
| 142 | $k_dsb9=0x240; # decryption sbox output *9*u, *9*t | ||
| 143 | &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50); | ||
| 144 | &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E); | ||
| 145 | $k_dsbd=0x260; # decryption sbox output *D*u, *D*t | ||
| 146 | &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13); | ||
| 147 | &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D); | ||
| 148 | $k_dsbb=0x280; # decryption sbox output *B*u, *B*t | ||
| 149 | &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6); | ||
| 150 | &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E); | ||
| 151 | $k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t | ||
| 152 | &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004); | ||
| 153 | &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B); | ||
| 154 | $k_dsbo=0x2c0; # decryption sbox final output | ||
| 155 | &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9); | ||
| 156 | &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159); | ||
| 157 | &previous(); | ||
| 158 | |||
| | ## | ||
| | ## _vpaes_preheat | ||
| | ## | ||
| | ## Loads the two constants the cores keep resident in registers: | ||
| | ## %xmm7 = first 16 bytes of the k_inv table, %xmm6 = k_s0F. | ||
| | ## Both are addressed relative to $const, which the caller must have | ||
| | ## set up beforehand. | ||
| | ## | ||
| 159 | &function_begin_B("_vpaes_preheat"); | ||
| 160 | &movdqa ("xmm7",&QWP($k_inv,$const)); | ||
| 161 | &movdqa ("xmm6",&QWP($k_s0F,$const)); | ||
| 162 | &ret (); | ||
| 163 | &function_end_B("_vpaes_preheat"); | ||
| 164 | |||
| 165 | ## | ||
| 166 | ## _vpaes_encrypt_core | ||
| 167 | ## | ||
| 168 | ## AES-encrypt %xmm0. | ||
| 169 | ## | ||
| 170 | ## Inputs: | ||
| 171 | ## %xmm0 = input | ||
| 172 | ## %xmm6-%xmm7 as in _vpaes_preheat | ||
| 173 | ## (%edx) = scheduled keys | ||
| 174 | ## | ||
| 175 | ## Output in %xmm0 | ||
| 176 | ## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx | ||
| 177 | ## The constant tables are addressed through $const (%ebp), and the | ||
| 178 | ## round count is read from offset 240 of the key schedule. | ||
| 179 | &function_begin_B("_vpaes_encrypt_core"); | ||
| 180 | &mov ($magic,16); | ||
| 181 | &mov ($round,&DWP(240,$key)); | ||
| 182 | &movdqa ("xmm1","xmm6"); | ||
| 183 | &movdqa ("xmm2",&QWP($k_ipt,$const)); | ||
| 184 | &pandn ("xmm1","xmm0"); | ||
| 185 | &movdqu ("xmm5",&QWP(0,$key)); | ||
| 186 | &psrld ("xmm1",4); | ||
| 187 | &pand ("xmm0","xmm6"); | ||
| 188 | &pshufb ("xmm2","xmm0"); | ||
| 189 | &movdqa ("xmm0",&QWP($k_ipt+16,$const)); | ||
| 190 | &pshufb ("xmm0","xmm1"); | ||
| 191 | &pxor ("xmm2","xmm5"); | ||
| 192 | &pxor ("xmm0","xmm2"); | ||
| 193 | &add ($key,16); | ||
| 194 | &lea ($base,&DWP($k_mc_backward,$const)); | ||
| 195 | &jmp (&label("enc_entry")); | ||
| 196 | |||
| 197 | |||
| 198 | &set_label("enc_loop",16); | ||
| 199 | # middle of middle round | ||
| 200 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u | ||
| 201 | &pshufb ("xmm4","xmm2"); # 4 = sb1u | ||
| 202 | &pxor ("xmm4","xmm5"); # 4 = sb1u + k | ||
| 203 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t | ||
| 204 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 205 | &pxor ("xmm0","xmm4"); # 0 = A | ||
| 206 | &movdqa ("xmm5",&QWP($k_sb2,$const)); # 5 : sb2u | ||
| 207 | &pshufb ("xmm5","xmm2"); # 5 = sb2u | ||
| 208 | &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] | ||
| 209 | &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t | ||
| 210 | &pshufb ("xmm2","xmm3"); # 2 = sb2t | ||
| 211 | &pxor ("xmm2","xmm5"); # 2 = 2A | ||
| 212 | &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] | ||
| 213 | &movdqa ("xmm3","xmm0"); # 3 = A | ||
| 214 | &pshufb ("xmm0","xmm1"); # 0 = B | ||
| 215 | &add ($key,16); # next key | ||
| 216 | &pxor ("xmm0","xmm2"); # 0 = 2A+B | ||
| 217 | &pshufb ("xmm3","xmm4"); # 3 = D | ||
| 218 | &add ($magic,16); # next mc | ||
| 219 | &pxor ("xmm3","xmm0"); # 3 = 2A+B+D | ||
| 220 | &pshufb ("xmm0","xmm1"); # 0 = 2B+C | ||
| 221 | &and ($magic,0x30); # ... mod 4 | ||
| 222 | &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D | ||
| 223 | &sub ($round,1); # nr-- | ||
| 224 | |||
| 225 | &set_label("enc_entry"); | ||
| 226 | # top of round | ||
| 227 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
| 228 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
| 229 | &psrld ("xmm1",4); # 1 = i | ||
| 230 | &pand ("xmm0","xmm6"); # 0 = k | ||
| 231 | &movdqa ("xmm5",&QWP($k_inv+16,$const));# 5 : a/k | ||
| 232 | &pshufb ("xmm5","xmm0"); # 5 = a/k | ||
| 233 | &pxor ("xmm0","xmm1"); # 0 = j | ||
| 234 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
| 235 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
| 236 | &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k | ||
| 237 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
| 238 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
| 239 | &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k | ||
| 240 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
| 241 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
| 242 | &pxor ("xmm2","xmm0"); # 2 = io | ||
| 243 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
| 244 | &movdqu ("xmm5",&QWP(0,$key)); # 5 = next round key | ||
| 245 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
| 246 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
| 247 | &jnz (&label("enc_loop")); # flags: sub above, or the add before the first jmp here | ||
| 248 | |||
| 249 | # middle of last round | ||
| 250 | &movdqa ("xmm4",&QWP($k_sbo,$const)); # 4 : sbou .Lk_sbo | ||
| 251 | &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot .Lk_sbo+16 | ||
| 252 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
| 253 | &pxor ("xmm4","xmm5"); # 4 = sbou + k | ||
| 254 | &pshufb ("xmm0","xmm3"); # 0 = sbot | ||
| 255 | &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] | ||
| 256 | &pxor ("xmm0","xmm4"); # 0 = A | ||
| 257 | &pshufb ("xmm0","xmm1"); | ||
| 258 | &ret (); | ||
| 259 | &function_end_B("_vpaes_encrypt_core"); | ||
| 260 | |||
| 261 | ## | ||
| 262 | ## Decryption core | ||
| 263 | ## | ||
| 264 | ## Same API as encryption core. | ||
| 265 | ## | ||
| 266 | &function_begin_B("_vpaes_decrypt_core"); | ||
| 267 | &mov ($round,&DWP(240,$key)); # round count, stored at offset 240 of the key schedule | ||
| 268 | &lea ($base,&DWP($k_dsbd,$const)); # $base -> k_dsbd; table loads below are relative to it | ||
| 269 | &movdqa ("xmm1","xmm6"); # 1 : nibble mask kept in xmm6 (presumably set up by _vpaes_preheat - confirm) | ||
| 270 | &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); # 2 : k_dipt, first 16 bytes | ||
| 271 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
| 272 | &mov ($magic,$round); | ||
| 273 | &psrld ("xmm1",4); # 1 = i | ||
| 274 | &movdqu ("xmm5",&QWP(0,$key)); # 5 : round-0 key (unaligned load) | ||
| 275 | &shl ($magic,4); | ||
| 276 | &pand ("xmm0","xmm6"); # 0 = k | ||
| 277 | &pshufb ("xmm2","xmm0"); | ||
| 278 | &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); # 0 : k_dipt, second 16 bytes | ||
| 279 | &xor ($magic,0x30); | ||
| 280 | &pshufb ("xmm0","xmm1"); | ||
| 281 | &and ($magic,0x30); # $magic = ((rounds<<4)^0x30)&0x30 | ||
| 282 | &pxor ("xmm2","xmm5"); # add round-0 key | ||
| 283 | &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); # 5 : shuffle mask for the "MC" steps, rotated once per round | ||
| 284 | &pxor ("xmm0","xmm2"); | ||
| 285 | &add ($key,16); # next round key | ||
| 286 | &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); # $magic = address of the k_sr entry used for the final permutation | ||
| 287 | &jmp (&label("dec_entry")); | ||
| 288 | |||
| 289 | &set_label("dec_loop",16); | ||
| 290 | ## | ||
| 291 | ## Inverse mix columns | ||
| 292 | ## | ||
| 293 | &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u | ||
| 294 | &pshufb ("xmm4","xmm2"); # 4 = sb9u | ||
| 295 | &pxor ("xmm4","xmm0"); # 4 ^= round key loaded at the end of dec_entry | ||
| 296 | &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t | ||
| 297 | &pshufb ("xmm0","xmm3"); # 0 = sb9t | ||
| 298 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 299 | &add ($key,16); # next round key | ||
| 300 | |||
| 301 | &pshufb ("xmm0","xmm5"); # MC ch | ||
| 302 | &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu | ||
| 303 | &pshufb ("xmm4","xmm2"); # 4 = sbdu | ||
| 304 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
| 305 | &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt | ||
| 306 | &pshufb ("xmm0","xmm3"); # 0 = sbdt | ||
| 307 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 308 | &sub ($round,1); # nr--; the jnz at the end of dec_entry tests this result | ||
| 309 | |||
| 310 | &pshufb ("xmm0","xmm5"); # MC ch | ||
| 311 | &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu | ||
| 312 | &pshufb ("xmm4","xmm2"); # 4 = sbbu | ||
| 313 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
| 314 | &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt | ||
| 315 | &pshufb ("xmm0","xmm3"); # 0 = sbbt | ||
| 316 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 317 | |||
| 318 | &pshufb ("xmm0","xmm5"); # MC ch | ||
| 319 | &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu | ||
| 320 | &pshufb ("xmm4","xmm2"); # 4 = sbeu | ||
| 321 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
| 322 | &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet | ||
| 323 | &pshufb ("xmm0","xmm3"); # 0 = sbet | ||
| 324 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 325 | |||
| 326 | &palignr("xmm5","xmm5",12); # rotate the MC shuffle mask for the next round | ||
| 327 | |||
| 328 | &set_label("dec_entry"); | ||
| 329 | # top of round | ||
| 330 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
| 331 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
| 332 | &psrld ("xmm1",4); # 1 = i | ||
| 333 | &pand ("xmm0","xmm6"); # 0 = k | ||
| 334 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
| 335 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
| 336 | &pxor ("xmm0","xmm1"); # 0 = j | ||
| 337 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
| 338 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
| 339 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
| 340 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
| 341 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
| 342 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
| 343 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
| 344 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
| 345 | &pxor ("xmm2","xmm0"); # 2 = io | ||
| 346 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
| 347 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
| 348 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
| 349 | &movdqu ("xmm0",&QWP(0,$key)); # 0 : current round key | ||
| 350 | &jnz (&label("dec_loop")); # flags: "sub $round,1" inside the loop, "add $key,16" on first entry | ||
| 351 | |||
| 352 | # middle of last round | ||
| 353 | &movdqa ("xmm4",&QWP(0x60,$base)); # 4 : sbou | ||
| 354 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
| 355 | &pxor ("xmm4","xmm0"); # 4 = sb1u + k | ||
| 356 | &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot | ||
| 357 | &movdqa ("xmm2",&QWP(0,$magic)); # 2 : k_sr entry selected before the loop | ||
| 358 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 359 | &pxor ("xmm0","xmm4"); # 0 = A | ||
| 360 | &pshufb ("xmm0","xmm2"); # final permutation; result is returned in xmm0 | ||
| 361 | &ret (); | ||
| 362 | &function_end_B("_vpaes_decrypt_core"); | ||
| 363 | |||
| 364 | ######################################################## | ||
| 365 | ## ## | ||
| 366 | ## AES key schedule ## | ||
| 367 | ## $inp: user key, $round: key bits, $key: output, $out: 0=encrypt / non-zero=decrypt ## | ||
| 368 | ######################################################## | ||
| 369 | &function_begin_B("_vpaes_schedule_core"); | ||
| 370 | &movdqu ("xmm0",&QWP(0,$inp)); # load first 16 bytes of the user key (unaligned) | ||
| 371 | &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon vector (k_rcon) | ||
| 372 | |||
| 373 | # input transform through the k_ipt tables; an untransformed copy stays in xmm3 | ||
| 374 | &movdqa ("xmm3","xmm0"); | ||
| 375 | &lea ($base,&DWP($k_ipt,$const)); | ||
| 376 | &movdqa (&QWP(4,"esp"),"xmm2"); # rcon goes to the stack slot standing in for xmm8 (8(%esp) as seen from _vpaes_schedule_round) | ||
| 377 | &call ("_vpaes_schedule_transform"); | ||
| 378 | &movdqa ("xmm7","xmm0"); | ||
| 379 | |||
| 380 | &test ($out,$out); | ||
| 381 | &jnz (&label("schedule_am_decrypting")); | ||
| 382 | |||
| 383 | # encrypting, output zeroth round key after transform | ||
| 384 | &movdqu (&QWP(0,$key),"xmm0"); | ||
| 385 | &jmp (&label("schedule_go")); | ||
| 386 | |||
| 387 | &set_label("schedule_am_decrypting"); | ||
| 388 | # decrypting, output zeroth round key after shiftrows (untransformed key in xmm3, permuted by the k_sr entry at $magic) | ||
| 389 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
| 390 | &pshufb ("xmm3","xmm1"); | ||
| 391 | &movdqu (&QWP(0,$key),"xmm3"); | ||
| 392 | &xor ($magic,0x30); | ||
| 393 | |||
| 394 | &set_label("schedule_go"); | ||
| 395 | &cmp ($round,192); | ||
| 396 | &ja (&label("schedule_256")); | ||
| 397 | &je (&label("schedule_192")); | ||
| 398 | # 128: fall through | ||
| 399 | |||
| 400 | ## | ||
| 401 | ## .schedule_128 | ||
| 402 | ## | ||
| 403 | ## 128-bit specific part of key schedule. | ||
| 404 | ## | ||
| 405 | ## This schedule is really simple, because all its parts | ||
| 406 | ## are accomplished by the subroutines. | ||
| 407 | ## | ||
| 408 | &set_label("schedule_128"); | ||
| 409 | &mov ($round,10); | ||
| 410 | |||
| 411 | &set_label("loop_schedule_128"); | ||
| 412 | &call ("_vpaes_schedule_round"); | ||
| 413 | &dec ($round); | ||
| 414 | &jz (&label("schedule_mangle_last")); | ||
| 415 | &call ("_vpaes_schedule_mangle"); # write output | ||
| 416 | &jmp (&label("loop_schedule_128")); | ||
| 417 | |||
| 418 | ## | ||
| 419 | ## .aes_schedule_192 | ||
| 420 | ## | ||
| 421 | ## 192-bit specific part of key schedule. | ||
| 422 | ## | ||
| 423 | ## The main body of this schedule is the same as the 128-bit | ||
| 424 | ## schedule, but with more smearing. The long, high side is | ||
| 425 | ## stored in %xmm7 as before, and the short, low side is in | ||
| 426 | ## the high bits of %xmm6. | ||
| 427 | ## | ||
| 428 | ## This schedule is somewhat nastier, however, because each | ||
| 429 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
| 430 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
| 431 | ## keys. | ||
| 432 | ## | ||
| 433 | &set_label("schedule_192",16); | ||
| 434 | &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) | ||
| 435 | &call ("_vpaes_schedule_transform"); # input transform | ||
| 436 | &movdqa ("xmm6","xmm0"); # save short part | ||
| 437 | &pxor ("xmm4","xmm4"); # clear 4 | ||
| 438 | &movhlps("xmm6","xmm4"); # clobber low side with zeros | ||
| 439 | &mov ($round,4); | ||
| 440 | |||
| 441 | &set_label("loop_schedule_192"); | ||
| 442 | &call ("_vpaes_schedule_round"); | ||
| 443 | &palignr("xmm0","xmm6",8); | ||
| 444 | &call ("_vpaes_schedule_mangle"); # save key n | ||
| 445 | &call ("_vpaes_schedule_192_smear"); | ||
| 446 | &call ("_vpaes_schedule_mangle"); # save key n+1 | ||
| 447 | &call ("_vpaes_schedule_round"); | ||
| 448 | &dec ($round); | ||
| 449 | &jz (&label("schedule_mangle_last")); | ||
| 450 | &call ("_vpaes_schedule_mangle"); # save key n+2 | ||
| 451 | &call ("_vpaes_schedule_192_smear"); | ||
| 452 | &jmp (&label("loop_schedule_192")); | ||
| 453 | |||
| 454 | ## | ||
| 455 | ## .aes_schedule_256 | ||
| 456 | ## | ||
| 457 | ## 256-bit specific part of key schedule. | ||
| 458 | ## | ||
| 459 | ## The structure here is very similar to the 128-bit | ||
| 460 | ## schedule, but with an additional "low side" in | ||
| 461 | ## %xmm6. The low side's rounds are the same as the | ||
| 462 | ## high side's, except no rcon and no rotation. | ||
| 463 | ## | ||
| 464 | &set_label("schedule_256",16); | ||
| 465 | &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) | ||
| 466 | &call ("_vpaes_schedule_transform"); # input transform | ||
| 467 | &mov ($round,7); | ||
| 468 | |||
| 469 | &set_label("loop_schedule_256"); | ||
| 470 | &call ("_vpaes_schedule_mangle"); # output low result | ||
| 471 | &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 | ||
| 472 | |||
| 473 | # high round | ||
| 474 | &call ("_vpaes_schedule_round"); | ||
| 475 | &dec ($round); | ||
| 476 | &jz (&label("schedule_mangle_last")); | ||
| 477 | &call ("_vpaes_schedule_mangle"); | ||
| 478 | |||
| 479 | # low round. swap xmm7 and xmm6 (xmm7 is parked at 20(%esp) and restored after the call) | ||
| 480 | &pshufd ("xmm0","xmm0",0xFF); | ||
| 481 | &movdqa (&QWP(20,"esp"),"xmm7"); | ||
| 482 | &movdqa ("xmm7","xmm6"); | ||
| 483 | &call ("_vpaes_schedule_low_round"); | ||
| 484 | &movdqa ("xmm7",&QWP(20,"esp")); | ||
| 485 | |||
| 486 | &jmp (&label("loop_schedule_256")); | ||
| 487 | |||
| 488 | ## | ||
| 489 | ## .aes_schedule_mangle_last | ||
| 490 | ## | ||
| 491 | ## Mangler for last round of key schedule | ||
| 492 | ## Mangles %xmm0 | ||
| 493 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
| 494 | ## when decrypting, outputs unskew(%xmm0) | ||
| 495 | ## | ||
| 496 | ## Always called right before return... jumps to cleanup and exits | ||
| 497 | ## | ||
| 498 | &set_label("schedule_mangle_last",16); | ||
| 499 | # schedule last round key from xmm0; k_deskew is the default table, replaced by k_opt when encrypting | ||
| 500 | &lea ($base,&DWP($k_deskew,$const)); | ||
| 501 | &test ($out,$out); | ||
| 502 | &jnz (&label("schedule_mangle_last_dec")); | ||
| 503 | |||
| 504 | # encrypting | ||
| 505 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
| 506 | &pshufb ("xmm0","xmm1"); # output permute | ||
| 507 | &lea ($base,&DWP($k_opt,$const)); # prepare to output transform | ||
| 508 | &add ($key,32); | ||
| 509 | |||
| 510 | &set_label("schedule_mangle_last_dec"); | ||
| 511 | &add ($key,-16); | ||
| 512 | &pxor ("xmm0",&QWP($k_s63,$const)); | ||
| 513 | &call ("_vpaes_schedule_transform"); # output transform | ||
| 514 | &movdqu (&QWP(0,$key),"xmm0"); # save last key | ||
| 515 | |||
| 516 | # cleanup: zero xmm0-xmm7 before returning | ||
| 517 | &pxor ("xmm0","xmm0"); | ||
| 518 | &pxor ("xmm1","xmm1"); | ||
| 519 | &pxor ("xmm2","xmm2"); | ||
| 520 | &pxor ("xmm3","xmm3"); | ||
| 521 | &pxor ("xmm4","xmm4"); | ||
| 522 | &pxor ("xmm5","xmm5"); | ||
| 523 | &pxor ("xmm6","xmm6"); | ||
| 524 | &pxor ("xmm7","xmm7"); | ||
| 525 | &ret (); | ||
| 526 | &function_end_B("_vpaes_schedule_core"); | ||
| 527 | |||
| 528 | ## | ||
| 529 | ## .aes_schedule_192_smear | ||
| 530 | ## | ||
| 531 | ## Smear the short, low side in the 192-bit key schedule. | ||
| 532 | ## | ||
| 533 | ## Inputs: | ||
| 534 | ## %xmm7: high side, b a x y | ||
| 535 | ## %xmm6: low side, d c 0 0 | ||
| 536 | ## (no %xmm13 here: this version builds the zero vector in %xmm1) | ||
| 537 | ## | ||
| 538 | ## Outputs: | ||
| 539 | ## %xmm6: b+c+d b+c 0 0 | ||
| 540 | ## %xmm0: b+c+d b+c b a | ||
| 541 | ## Clobbers %xmm1. | ||
| 542 | &function_begin_B("_vpaes_schedule_192_smear"); | ||
| 543 | &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 | ||
| 544 | &pxor ("xmm6","xmm0"); # -> c+d c 0 0 | ||
| 545 | &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a | ||
| 546 | &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a | ||
| 547 | &movdqa ("xmm0","xmm6"); | ||
| 548 | &pxor ("xmm1","xmm1"); | ||
| 549 | &movhlps("xmm6","xmm1"); # clobber low side with zeros | ||
| 550 | &ret (); | ||
| 551 | &function_end_B("_vpaes_schedule_192_smear"); | ||
| 552 | |||
| 553 | ## | ||
| 554 | ## .aes_schedule_round | ||
| 555 | ## | ||
| 556 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
| 557 | ## | ||
| 558 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
| 559 | ## then rotates it by one byte and xors into the low dword of | ||
| 560 | ## %xmm7. | ||
| 561 | ## | ||
| 562 | ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for | ||
| 563 | ## next rcon. | ||
| 564 | ## (In this version "%xmm8" is a stack slot, addressed as 8(%esp) inside this routine.) | ||
| 565 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
| 566 | ## second low, result into third, result into highest. | ||
| 567 | ## | ||
| 568 | ## Returns results in %xmm7 = %xmm0. | ||
| 569 | ## Clobbers %xmm1-%xmm5. | ||
| 570 | ## | ||
| 571 | &function_begin_B("_vpaes_schedule_round"); | ||
| 572 | # extract rcon from xmm8 | ||
| 573 | &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 (stack slot) | ||
| 574 | &pxor ("xmm1","xmm1"); | ||
| 575 | &palignr("xmm1","xmm2",15); | ||
| 576 | &palignr("xmm2","xmm2",15); | ||
| 577 | &pxor ("xmm7","xmm1"); | ||
| 578 | |||
| 579 | # rotate | ||
| 580 | &pshufd ("xmm0","xmm0",0xFF); | ||
| 581 | &palignr("xmm0","xmm0",1); | ||
| 582 | |||
| 583 | # fall through... | ||
| 584 | &movdqa (&QWP(8,"esp"),"xmm2"); # write the rotated rcon back to the xmm8 slot | ||
| 585 | |||
| 586 | # low round: same as high round, but no rotation and no rcon. The label below is also called directly by the 256-bit schedule loop. | ||
| 587 | &set_label("_vpaes_schedule_low_round"); | ||
| 588 | # smear xmm7 | ||
| 589 | &movdqa ("xmm1","xmm7"); | ||
| 590 | &pslldq ("xmm7",4); | ||
| 591 | &pxor ("xmm7","xmm1"); | ||
| 592 | &movdqa ("xmm1","xmm7"); | ||
| 593 | &pslldq ("xmm7",8); | ||
| 594 | &pxor ("xmm7","xmm1"); | ||
| 595 | &pxor ("xmm7",&QWP($k_s63,$const)); | ||
| 596 | |||
| 597 | # subbyte | ||
| 598 | &movdqa ("xmm4",&QWP($k_s0F,$const)); | ||
| 599 | &movdqa ("xmm5",&QWP($k_inv,$const)); # 5 : k_inv, the table used for the 1/i, 1/j lookups below | ||
| 600 | &movdqa ("xmm1","xmm4"); | ||
| 601 | &pandn ("xmm1","xmm0"); | ||
| 602 | &psrld ("xmm1",4); # 1 = i | ||
| 603 | &pand ("xmm0","xmm4"); # 0 = k | ||
| 604 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
| 605 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
| 606 | &pxor ("xmm0","xmm1"); # 0 = j | ||
| 607 | &movdqa ("xmm3","xmm5"); # 3 : 1/i | ||
| 608 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
| 609 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
| 610 | &movdqa ("xmm4","xmm5"); # 4 : 1/j | ||
| 611 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
| 612 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
| 613 | &movdqa ("xmm2","xmm5"); # 2 : 1/iak | ||
| 614 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
| 615 | &pxor ("xmm2","xmm0"); # 2 = io | ||
| 616 | &movdqa ("xmm3","xmm5"); # 3 : 1/jak | ||
| 617 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
| 618 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
| 619 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou (loaded from k_sb1) | ||
| 620 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
| 621 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot (loaded from k_sb1+16) | ||
| 622 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 623 | &pxor ("xmm0","xmm4"); # 0 = sbox output | ||
| 624 | |||
| 625 | # add in smeared stuff; the result is left in both xmm0 and xmm7 | ||
| 626 | &pxor ("xmm0","xmm7"); | ||
| 627 | &movdqa ("xmm7","xmm0"); | ||
| 628 | &ret (); | ||
| 629 | &function_end_B("_vpaes_schedule_round"); | ||
| 630 | |||
| 631 | ## | ||
| 632 | ## .aes_schedule_transform | ||
| 633 | ## | ||
| 634 | ## Linear-transform %xmm0 according to tables at (%ebx) | ||
| 635 | ## Two 16-byte tables are read through $base: 0($base) is indexed by %xmm0 masked with k_s0F, 16($base) by the remaining bits shifted right by 4; the two lookups are xored. | ||
| 636 | ## Output in %xmm0 | ||
| 637 | ## Clobbers %xmm1, %xmm2 | ||
| 638 | ## | ||
| 639 | &function_begin_B("_vpaes_schedule_transform"); | ||
| 640 | &movdqa ("xmm2",&QWP($k_s0F,$const)); | ||
| 641 | &movdqa ("xmm1","xmm2"); | ||
| 642 | &pandn ("xmm1","xmm0"); | ||
| 643 | &psrld ("xmm1",4); | ||
| 644 | &pand ("xmm0","xmm2"); | ||
| 645 | &movdqa ("xmm2",&QWP(0,$base)); | ||
| 646 | &pshufb ("xmm2","xmm0"); | ||
| 647 | &movdqa ("xmm0",&QWP(16,$base)); | ||
| 648 | &pshufb ("xmm0","xmm1"); | ||
| 649 | &pxor ("xmm0","xmm2"); | ||
| 650 | &ret (); | ||
| 651 | &function_end_B("_vpaes_schedule_transform"); | ||
| 652 | |||
| 653 | ## | ||
| 654 | ## .aes_schedule_mangle | ||
| 655 | ## | ||
| 656 | ## Mangle xmm0 from (basis-transformed) standard version | ||
| 657 | ## to our version. | ||
| 658 | ## | ||
| 659 | ## On encrypt, | ||
| 660 | ## xor with 0x63 | ||
| 661 | ## multiply by circulant 0,1,1,1 | ||
| 662 | ## apply shiftrows transform | ||
| 663 | ## | ||
| 664 | ## On decrypt, | ||
| 665 | ## xor with 0x63 | ||
| 666 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
| 667 | ## deskew | ||
| 668 | ## apply shiftrows transform | ||
| 669 | ## | ||
| 670 | ## Direction is selected by $out: zero = encrypt path, non-zero = decrypt path. | ||
| 671 | ## Writes out to (%edx), and increments or decrements it | ||
| 672 | ## Keeps track of round number mod 4 in %ecx | ||
| 673 | ## Preserves xmm0 | ||
| 674 | ## Clobbers xmm1-xmm5 | ||
| 675 | ## On the decrypt path $inp is also overwritten (it is pointed at k_dksd). | ||
| 676 | &function_begin_B("_vpaes_schedule_mangle"); | ||
| 677 | &movdqa ("xmm4","xmm0"); # work on a copy in xmm4 so that xmm0 is preserved | ||
| 678 | &movdqa ("xmm5",&QWP($k_mc_forward,$const)); | ||
| 679 | &test ($out,$out); | ||
| 680 | &jnz (&label("schedule_mangle_dec")); | ||
| 681 | |||
| 682 | # encrypting: $key advances by 16 before the store | ||
| 683 | &add ($key,16); | ||
| 684 | &pxor ("xmm4",&QWP($k_s63,$const)); | ||
| 685 | &pshufb ("xmm4","xmm5"); | ||
| 686 | &movdqa ("xmm3","xmm4"); | ||
| 687 | &pshufb ("xmm4","xmm5"); | ||
| 688 | &pxor ("xmm3","xmm4"); | ||
| 689 | &pshufb ("xmm4","xmm5"); | ||
| 690 | &pxor ("xmm3","xmm4"); | ||
| 691 | |||
| 692 | &jmp (&label("schedule_mangle_both")); | ||
| 693 | |||
| 694 | &set_label("schedule_mangle_dec",16); | ||
| 695 | # inverse mix columns: four table-pair lookups from k_dksd, with a k_mc_forward shuffle between them | ||
| 696 | &movdqa ("xmm2",&QWP($k_s0F,$const)); | ||
| 697 | &lea ($inp,&DWP($k_dksd,$const)); | ||
| 698 | &movdqa ("xmm1","xmm2"); | ||
| 699 | &pandn ("xmm1","xmm4"); | ||
| 700 | &psrld ("xmm1",4); # 1 = hi | ||
| 701 | &pand ("xmm4","xmm2"); # 4 = lo | ||
| 702 | |||
| 703 | &movdqa ("xmm2",&QWP(0,$inp)); | ||
| 704 | &pshufb ("xmm2","xmm4"); | ||
| 705 | &movdqa ("xmm3",&QWP(0x10,$inp)); | ||
| 706 | &pshufb ("xmm3","xmm1"); | ||
| 707 | &pxor ("xmm3","xmm2"); | ||
| 708 | &pshufb ("xmm3","xmm5"); | ||
| 709 | |||
| 710 | &movdqa ("xmm2",&QWP(0x20,$inp)); | ||
| 711 | &pshufb ("xmm2","xmm4"); | ||
| 712 | &pxor ("xmm2","xmm3"); | ||
| 713 | &movdqa ("xmm3",&QWP(0x30,$inp)); | ||
| 714 | &pshufb ("xmm3","xmm1"); | ||
| 715 | &pxor ("xmm3","xmm2"); | ||
| 716 | &pshufb ("xmm3","xmm5"); | ||
| 717 | |||
| 718 | &movdqa ("xmm2",&QWP(0x40,$inp)); | ||
| 719 | &pshufb ("xmm2","xmm4"); | ||
| 720 | &pxor ("xmm2","xmm3"); | ||
| 721 | &movdqa ("xmm3",&QWP(0x50,$inp)); | ||
| 722 | &pshufb ("xmm3","xmm1"); | ||
| 723 | &pxor ("xmm3","xmm2"); | ||
| 724 | &pshufb ("xmm3","xmm5"); | ||
| 725 | |||
| 726 | &movdqa ("xmm2",&QWP(0x60,$inp)); | ||
| 727 | &pshufb ("xmm2","xmm4"); | ||
| 728 | &pxor ("xmm2","xmm3"); | ||
| 729 | &movdqa ("xmm3",&QWP(0x70,$inp)); | ||
| 730 | &pshufb ("xmm3","xmm1"); | ||
| 731 | &pxor ("xmm3","xmm2"); | ||
| 732 | |||
| 733 | &add ($key,-16); | ||
| 734 | |||
| 735 | &set_label("schedule_mangle_both"); | ||
| 736 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
| 737 | &pshufb ("xmm3","xmm1"); | ||
| 738 | &add ($magic,-16); | ||
| 739 | &and ($magic,0x30); | ||
| 740 | &movdqu (&QWP(0,$key),"xmm3"); | ||
| 741 | &ret (); | ||
| 742 | &function_end_B("_vpaes_schedule_mangle"); | ||
| 743 | |||
| 744 | # | ||
| 745 | # Interface to OpenSSL | ||
| 746 | # | ||
| 747 | &function_begin("${PREFIX}_set_encrypt_key"); | ||
| 748 | &mov ($inp,&wparam(0)); # inp: key material read by _vpaes_schedule_core | ||
| 749 | &lea ($base,&DWP(-56,"esp")); # scratch area: 56 bytes below the current esp ... | ||
| 750 | &mov ($round,&wparam(1)); # bits | ||
| 751 | &and ($base,-16); # ... rounded down to a 16-byte boundary | ||
| 752 | &mov ($key,&wparam(2)); # key: schedule being written | ||
| 753 | &xchg ($base,"esp"); # alloca; $base now holds the previous esp | ||
| 754 | &mov (&DWP(48,"esp"),$base); # saved for the epilogue | ||
| 755 | |||
| 756 | &mov ($base,$round); | ||
| 757 | &shr ($base,5); | ||
| 758 | &add ($base,5); | ||
| 759 | &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; | ||
| 760 | &mov ($magic,0x30); # initial k_sr offset | ||
| 761 | &mov ($out,0); # 0 selects the encrypt paths in the schedule code | ||
| 762 | |||
| 763 | &picsetup($const); | ||
| 764 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
| 765 | &lea ($const,&DWP(0x30,$const)); # bias $const by 0x30 (presumably relative to _vpaes_consts - confirm against picsymbol) | ||
| 766 | |||
| 767 | &call ("_vpaes_schedule_core"); | ||
| 768 | |||
| 769 | &mov ("esp",&DWP(48,"esp")); # drop the scratch area | ||
| 770 | &xor ("eax","eax"); # eax = 0 | ||
| 771 | &function_end("${PREFIX}_set_encrypt_key"); | ||
| 772 | |||
| 773 | &function_begin("${PREFIX}_set_decrypt_key"); | ||
| 774 | &mov ($inp,&wparam(0)); # inp: key material read by _vpaes_schedule_core | ||
| 775 | &lea ($base,&DWP(-56,"esp")); # scratch area: 56 bytes below the current esp ... | ||
| 776 | &mov ($round,&wparam(1)); # bits | ||
| 777 | &and ($base,-16); # ... rounded down to a 16-byte boundary | ||
| 778 | &mov ($key,&wparam(2)); # key: schedule being written | ||
| 779 | &xchg ($base,"esp"); # alloca; $base now holds the previous esp | ||
| 780 | &mov (&DWP(48,"esp"),$base); # saved for the epilogue | ||
| 781 | |||
| 782 | &mov ($base,$round); | ||
| 783 | &shr ($base,5); | ||
| 784 | &add ($base,5); | ||
| 785 | &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; | ||
| 786 | &shl ($base,4); | ||
| 787 | &lea ($key,&DWP(16,$key,$base)); # $key += 16*(rounds+1); the decrypt paths step $key downwards | ||
| 788 | |||
| 789 | &mov ($out,1); # non-zero selects the decrypt paths in the schedule code | ||
| 790 | &mov ($magic,$round); | ||
| 791 | &shr ($magic,1); | ||
| 792 | &and ($magic,32); | ||
| 793 | &xor ($magic,32); # nbits==192?0:32; | ||
| 794 | |||
| 795 | &picsetup($const); | ||
| 796 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
| 797 | &lea ($const,&DWP(0x30,$const)); # bias $const by 0x30 (presumably relative to _vpaes_consts - confirm against picsymbol) | ||
| 798 | |||
| 799 | &call ("_vpaes_schedule_core"); | ||
| 800 | |||
| 801 | &mov ("esp",&DWP(48,"esp")); # drop the scratch area | ||
| 802 | &xor ("eax","eax"); # eax = 0 | ||
| 803 | &function_end("${PREFIX}_set_decrypt_key"); | ||
| 804 | |||
| 805 | &function_begin("${PREFIX}_encrypt"); | ||
| 806 | &picsetup($const); | ||
| 807 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
| 808 | &lea ($const,&DWP(0x30,$const)); # bias $const by 0x30 (presumably relative to _vpaes_consts - confirm against picsymbol) | ||
| 809 | |||
| 810 | &call ("_vpaes_preheat"); | ||
| 811 | &mov ($inp,&wparam(0)); # inp | ||
| 812 | &lea ($base,&DWP(-56,"esp")); # scratch area: 56 bytes below the current esp ... | ||
| 813 | &mov ($out,&wparam(1)); # out | ||
| 814 | &and ($base,-16); # ... rounded down to a 16-byte boundary | ||
| 815 | &mov ($key,&wparam(2)); # key | ||
| 816 | &xchg ($base,"esp"); # alloca; $base now holds the previous esp | ||
| 817 | &mov (&DWP(48,"esp"),$base); # saved for the epilogue | ||
| 818 | |||
| 819 | &movdqu ("xmm0",&QWP(0,$inp)); # load one 16-byte block (unaligned) | ||
| 820 | &call ("_vpaes_encrypt_core"); | ||
| 821 | &movdqu (&QWP(0,$out),"xmm0"); # store the result block (unaligned) | ||
| 822 | |||
| 823 | &mov ("esp",&DWP(48,"esp")); # drop the scratch area | ||
| 824 | &function_end("${PREFIX}_encrypt"); | ||
| 825 | |||
| 826 | &function_begin("${PREFIX}_decrypt"); | ||
| 827 | &picsetup($const); | ||
| 828 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
| 829 | &lea ($const,&DWP(0x30,$const)); # bias $const by 0x30 (presumably relative to _vpaes_consts - confirm against picsymbol) | ||
| 830 | |||
| 831 | &call ("_vpaes_preheat"); | ||
| 832 | &mov ($inp,&wparam(0)); # inp | ||
| 833 | &lea ($base,&DWP(-56,"esp")); # scratch area: 56 bytes below the current esp ... | ||
| 834 | &mov ($out,&wparam(1)); # out | ||
| 835 | &and ($base,-16); # ... rounded down to a 16-byte boundary | ||
| 836 | &mov ($key,&wparam(2)); # key | ||
| 837 | &xchg ($base,"esp"); # alloca; $base now holds the previous esp | ||
| 838 | &mov (&DWP(48,"esp"),$base); # saved for the epilogue | ||
| 839 | |||
| 840 | &movdqu ("xmm0",&QWP(0,$inp)); # load one 16-byte block (unaligned) | ||
| 841 | &call ("_vpaes_decrypt_core"); | ||
| 842 | &movdqu (&QWP(0,$out),"xmm0"); # store the result block (unaligned) | ||
| 843 | |||
| 844 | &mov ("esp",&DWP(48,"esp")); # drop the scratch area | ||
| 845 | &function_end("${PREFIX}_decrypt"); | ||
| 846 | |||
| 847 | &function_begin("${PREFIX}_cbc_encrypt"); | ||
| 848 | &mov ($inp,&wparam(0)); # inp | ||
| 849 | &mov ($out,&wparam(1)); # out | ||
| 850 | &mov ($round,&wparam(2)); # len | ||
| 851 | &mov ($key,&wparam(3)); # key | ||
| 852 | &sub ($round,16); | ||
| 853 | &jc (&label("cbc_abort")); # len < 16: nothing to do, IV is left untouched | ||
| 854 | &lea ($base,&DWP(-56,"esp")); # scratch area: 56 bytes below the current esp ... | ||
| 855 | &mov ($const,&wparam(4)); # ivp | ||
| 856 | &and ($base,-16); # ... rounded down to a 16-byte boundary | ||
| 857 | &mov ($magic,&wparam(5)); # enc | ||
| 858 | &xchg ($base,"esp"); # alloca | ||
| 859 | &movdqu ("xmm1",&QWP(0,$const)); # load IV | ||
| 860 | &sub ($out,$inp); # keep out as an offset from inp; stores below use ($base,$inp) | ||
| 861 | &mov (&DWP(48,"esp"),$base); # previous esp, restored at cbc_done | ||
| 862 | |||
| 863 | &mov (&DWP(0,"esp"),$out); # save out | ||
| 864 | &mov (&DWP(4,"esp"),$key); # save key | ||
| 865 | &mov (&DWP(8,"esp"),$const); # save ivp | ||
| 866 | &mov ($out,$round); # $out works as $len | ||
| 867 | |||
| 868 | &picsetup($const); | ||
| 869 | &picsymbol($const, &label("_vpaes_consts"), $const); | ||
| 870 | &lea ($const,&DWP(0x30,$const)); # bias $const by 0x30 (presumably relative to _vpaes_consts - confirm against picsymbol) | ||
| 871 | |||
| 872 | &call ("_vpaes_preheat"); | ||
| 873 | &cmp ($magic,0); | ||
| 874 | &je (&label("cbc_dec_loop")); # enc == 0 selects decryption | ||
| 875 | &jmp (&label("cbc_enc_loop")); | ||
| 876 | |||
| 877 | &set_label("cbc_enc_loop",16); | ||
| 878 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
| 879 | &pxor ("xmm0","xmm1"); # inp^=iv | ||
| 880 | &call ("_vpaes_encrypt_core"); | ||
| 881 | &mov ($base,&DWP(0,"esp")); # restore out | ||
| 882 | &mov ($key,&DWP(4,"esp")); # restore key | ||
| 883 | &movdqa ("xmm1","xmm0"); # ciphertext becomes the next IV | ||
| 884 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
| 885 | &lea ($inp,&DWP(16,$inp)); | ||
| 886 | &sub ($out,16); | ||
| 887 | &jnc (&label("cbc_enc_loop")); # loop until the remaining length borrows; a trailing partial block is not processed | ||
| 888 | &jmp (&label("cbc_done")); | ||
| 889 | |||
| 890 | &set_label("cbc_dec_loop",16); | ||
| 891 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
| 892 | &movdqa (&QWP(16,"esp"),"xmm1"); # save IV | ||
| 893 | &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV | ||
| 894 | &call ("_vpaes_decrypt_core"); | ||
| 895 | &mov ($base,&DWP(0,"esp")); # restore out | ||
| 896 | &mov ($key,&DWP(4,"esp")); # restore key (the decrypt core advances $key) | ||
| 897 | &pxor ("xmm0",&QWP(16,"esp")); # out^=iv | ||
| 898 | &movdqa ("xmm1",&QWP(32,"esp")); # load next IV | ||
| 899 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
| 900 | &lea ($inp,&DWP(16,$inp)); | ||
| 901 | &sub ($out,16); | ||
| 902 | &jnc (&label("cbc_dec_loop")); # loop until the remaining length borrows; a trailing partial block is not processed | ||
| 903 | |||
| 904 | &set_label("cbc_done"); | ||
| 905 | &mov ($base,&DWP(8,"esp")); # restore ivp | ||
| 906 | &mov ("esp",&DWP(48,"esp")); # drop the scratch area | ||
| 907 | &movdqu (&QWP(0,$base),"xmm1"); # write IV | ||
| 908 | &set_label("cbc_abort"); | ||
| 909 | &function_end("${PREFIX}_cbc_encrypt"); | ||
| 910 | |||
| 911 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl deleted file mode 100644 index 7d92e8d8ca..0000000000 --- a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl +++ /dev/null | |||
| @@ -1,1222 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | ###################################################################### | ||
| 4 | ## Constant-time SSSE3 AES core implementation. | ||
| 5 | ## version 0.1 | ||
| 6 | ## | ||
| 7 | ## By Mike Hamburg (Stanford University), 2009 | ||
| 8 | ## Public domain. | ||
| 9 | ## | ||
| 10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
| 11 | ## http://crypto.stanford.edu/vpaes/. | ||
| 12 | |||
| 13 | ###################################################################### | ||
| 14 | # September 2011. | ||
| 15 | # | ||
| 16 | # Interface to OpenSSL as "almost" drop-in replacement for | ||
| 17 | # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
| 18 | # doesn't handle partial vectors (doesn't have to if called from | ||
| 19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
| 20 | # schedule structure with the original nor does it make assumption | ||
| 21 | # about its alignment... | ||
| 22 | # | ||
| 23 | # Performance summary. aes-x86_64.pl column lists large-block CBC | ||
| 24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
| 25 | # byte processed with 128-bit key, and vpaes-x86_64.pl column - | ||
| 26 | # [also large-block CBC] encrypt/decrypt. | ||
| 27 | # | ||
| 28 | # aes-x86_64.pl vpaes-x86_64.pl | ||
| 29 | # | ||
| 30 | # Core 2(**) 30.5/43.7/14.3 21.8/25.7(***) | ||
| 31 | # Nehalem 30.5/42.2/14.6 9.8/11.8 | ||
| 32 | # Atom 63.9/79.0/32.1 64.0/84.8(***) | ||
| 33 | # | ||
| 34 | # (*) "Hyper-threading" in this context refers to cache shared | ||
| 35 | # among multiple cores rather than specifically to Intel HTT. As the vast | ||
| 36 | # majority of contemporary cores share cache, the slower code path | ||
| 37 | # is commonplace. In other words, "with-hyper-threading-off" | ||
| 38 | # results are presented mostly for reference purposes. | ||
| 39 | # | ||
| 40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
| 41 | # | ||
| 42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
| 43 | # pshufb, yet it's respectable +40%/78% improvement on Core 2 | ||
| 44 | # (as implied, over "hyper-threading-safe" code path). | ||
| 45 | # | ||
| 46 | # <appro@openssl.org> | ||
| 47 | |||
| 48 | $flavour = shift; # first command-line argument | ||
| 49 | $output = shift; # second command-line argument | ||
| 50 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a first argument containing "." is taken as the output file | ||
| 51 | |||
| 52 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 53 | |||
| 54 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path | ||
| 55 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 56 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 57 | die "can't locate x86_64-xlate.pl"; | ||
| 58 | |||
| 59 | open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # pipe through the translator; fail loudly if it cannot be started | ||
| 60 | *STDOUT=*OUT; # everything printed to STDOUT from here on goes to the translator | ||
| 61 | |||
| 62 | $PREFIX="vpaes"; | ||
| 63 | |||
| 64 | $code.=<<___; | ||
| 65 | .text | ||
| 66 | |||
| 67 | ## | ||
| 68 | ## _aes_encrypt_core | ||
| 69 | ## | ||
| 70 | ## AES-encrypt %xmm0. | ||
| 71 | ## | ||
| 72 | ## Inputs: | ||
| 73 | ## %xmm0 = input | ||
| 74 | ## %xmm9-%xmm15 as in _vpaes_preheat | ||
| 75 | ## (%rdx) = scheduled keys | ||
| 76 | ## | ||
| 77 | ## Output in %xmm0 | ||
| 78 | ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax | ||
| 79 | ## Preserves %xmm6 - %xmm8 so you get some local vectors | ||
| 80 | ## | ||
| 81 | ## | ||
| 82 | .type _vpaes_encrypt_core,\@abi-omnipotent | ||
| 83 | .align 16 | ||
| 84 | _vpaes_encrypt_core: | ||
| 85 | _CET_ENDBR | ||
| 86 | mov %rdx, %r9 | ||
| 87 | mov \$16, %r11 | ||
| 88 | mov 240(%rdx),%eax | ||
| 89 | movdqa %xmm9, %xmm1 | ||
| 90 | movdqa .Lk_ipt(%rip), %xmm2 # iptlo | ||
| 91 | pandn %xmm0, %xmm1 | ||
| 92 | movdqu (%r9), %xmm5 # round0 key | ||
| 93 | psrld \$4, %xmm1 | ||
| 94 | pand %xmm9, %xmm0 | ||
| 95 | pshufb %xmm0, %xmm2 | ||
| 96 | movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi | ||
| 97 | pshufb %xmm1, %xmm0 | ||
| 98 | pxor %xmm5, %xmm2 | ||
| 99 | pxor %xmm2, %xmm0 | ||
| 100 | add \$16, %r9 | ||
| 101 | lea .Lk_mc_backward(%rip),%r10 | ||
| 102 | jmp .Lenc_entry | ||
| 103 | |||
| 104 | .align 16 | ||
| 105 | .Lenc_loop: | ||
| 106 | # middle of middle round | ||
| 107 | movdqa %xmm13, %xmm4 # 4 : sb1u | ||
| 108 | pshufb %xmm2, %xmm4 # 4 = sb1u | ||
| 109 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
| 110 | movdqa %xmm12, %xmm0 # 0 : sb1t | ||
| 111 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 112 | pxor %xmm4, %xmm0 # 0 = A | ||
| 113 | movdqa %xmm15, %xmm5 # 4 : sb2u | ||
| 114 | pshufb %xmm2, %xmm5 # 4 = sb2u | ||
| 115 | movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] | ||
| 116 | movdqa %xmm14, %xmm2 # 2 : sb2t | ||
| 117 | pshufb %xmm3, %xmm2 # 2 = sb2t | ||
| 118 | pxor %xmm5, %xmm2 # 2 = 2A | ||
| 119 | movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] | ||
| 120 | movdqa %xmm0, %xmm3 # 3 = A | ||
| 121 | pshufb %xmm1, %xmm0 # 0 = B | ||
| 122 | add \$16, %r9 # next key | ||
| 123 | pxor %xmm2, %xmm0 # 0 = 2A+B | ||
| 124 | pshufb %xmm4, %xmm3 # 3 = D | ||
| 125 | add \$16, %r11 # next mc | ||
| 126 | pxor %xmm0, %xmm3 # 3 = 2A+B+D | ||
| 127 | pshufb %xmm1, %xmm0 # 0 = 2B+C | ||
| 128 | and \$0x30, %r11 # ... mod 4 | ||
| 129 | pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D | ||
| 130 | sub \$1,%rax # nr-- | ||
| 131 | |||
| 132 | .Lenc_entry: | ||
| 133 | # top of round | ||
| 134 | movdqa %xmm9, %xmm1 # 1 : i | ||
| 135 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
| 136 | psrld \$4, %xmm1 # 1 = i | ||
| 137 | pand %xmm9, %xmm0 # 0 = k | ||
| 138 | movdqa %xmm11, %xmm5 # 2 : a/k | ||
| 139 | pshufb %xmm0, %xmm5 # 2 = a/k | ||
| 140 | pxor %xmm1, %xmm0 # 0 = j | ||
| 141 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 142 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 143 | pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k | ||
| 144 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 145 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 146 | pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k | ||
| 147 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 148 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 149 | pxor %xmm0, %xmm2 # 2 = io | ||
| 150 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 151 | movdqu (%r9), %xmm5 | ||
| 152 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 153 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 154 | jnz .Lenc_loop | ||
| 155 | |||
| 156 | # middle of last round | ||
| 157 | movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo | ||
| 158 | movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 | ||
| 159 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 160 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
| 161 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 162 | movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] | ||
| 163 | pxor %xmm4, %xmm0 # 0 = A | ||
| 164 | pshufb %xmm1, %xmm0 | ||
| 165 | ret | ||
| 166 | .size _vpaes_encrypt_core,.-_vpaes_encrypt_core | ||
| 167 | |||
| 168 | ## | ||
| 169 | ## Decryption core | ||
| 170 | ## | ||
| 171 | ## Same API as encryption core. | ||
| 172 | ## As used here: %xmm0 = block in/out, %rdx = key schedule (round count at 240(%rdx)), %xmm9-%xmm11 preheated; clobbers %xmm1-%xmm5, %rax, %r9-%r11. | ||
| 173 | .type _vpaes_decrypt_core,\@abi-omnipotent | ||
| 174 | .align 16 | ||
| 175 | _vpaes_decrypt_core: | ||
| 176 | _CET_ENDBR | ||
| 177 | mov %rdx, %r9 # r9 = key schedule pointer | ||
| 178 | mov 240(%rdx),%eax | ||
| 179 | movdqa %xmm9, %xmm1 | ||
| 180 | movdqa .Lk_dipt(%rip), %xmm2 # dipt lo (decryption input transform, low-nibble table) | ||
| 181 | pandn %xmm0, %xmm1 | ||
| 182 | mov %rax, %r11 | ||
| 183 | psrld \$4, %xmm1 | ||
| 184 | movdqu (%r9), %xmm5 # round0 key | ||
| 185 | shl \$4, %r11 | ||
| 186 | pand %xmm9, %xmm0 | ||
| 187 | pshufb %xmm0, %xmm2 | ||
| 188 | movdqa .Lk_dipt+16(%rip), %xmm0 # dipt hi (high-nibble table) | ||
| 189 | xor \$0x30, %r11 | ||
| 190 | lea .Lk_dsbd(%rip),%r10 | ||
| 191 | pshufb %xmm1, %xmm0 | ||
| 192 | and \$0x30, %r11 | ||
| 193 | pxor %xmm5, %xmm2 | ||
| 194 | movdqa .Lk_mc_forward+48(%rip), %xmm5 | ||
| 195 | pxor %xmm2, %xmm0 | ||
| 196 | add \$16, %r9 | ||
| 197 | add %r10, %r11 | ||
| 198 | jmp .Ldec_entry | ||
| 199 | |||
| 200 | .align 16 | ||
| 201 | .Ldec_loop: | ||
| 202 | ## | ||
| 203 | ## Inverse mix columns | ||
| 204 | ## | ||
| 205 | movdqa -0x20(%r10),%xmm4 # 4 : sb9u | ||
| 206 | pshufb %xmm2, %xmm4 # 4 = sb9u | ||
| 207 | pxor %xmm0, %xmm4 | ||
| 208 | movdqa -0x10(%r10),%xmm0 # 0 : sb9t | ||
| 209 | pshufb %xmm3, %xmm0 # 0 = sb9t | ||
| 210 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 211 | add \$16, %r9 # next round key | ||
| 212 | |||
| 213 | pshufb %xmm5, %xmm0 # MC ch | ||
| 214 | movdqa 0x00(%r10),%xmm4 # 4 : sbdu | ||
| 215 | pshufb %xmm2, %xmm4 # 4 = sbdu | ||
| 216 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 217 | movdqa 0x10(%r10),%xmm0 # 0 : sbdt | ||
| 218 | pshufb %xmm3, %xmm0 # 0 = sbdt | ||
| 219 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 220 | sub \$1,%rax # nr--; sets ZF for the jnz after .Ldec_entry (the SSE ops in between leave flags alone) | ||
| 221 | |||
| 222 | pshufb %xmm5, %xmm0 # MC ch | ||
| 223 | movdqa 0x20(%r10),%xmm4 # 4 : sbbu | ||
| 224 | pshufb %xmm2, %xmm4 # 4 = sbbu | ||
| 225 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 226 | movdqa 0x30(%r10),%xmm0 # 0 : sbbt | ||
| 227 | pshufb %xmm3, %xmm0 # 0 = sbbt | ||
| 228 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 229 | |||
| 230 | pshufb %xmm5, %xmm0 # MC ch | ||
| 231 | movdqa 0x40(%r10),%xmm4 # 4 : sbeu | ||
| 232 | pshufb %xmm2, %xmm4 # 4 = sbeu | ||
| 233 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 234 | movdqa 0x50(%r10),%xmm0 # 0 : sbet | ||
| 235 | pshufb %xmm3, %xmm0 # 0 = sbet | ||
| 236 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 237 | |||
| 238 | palignr \$12, %xmm5, %xmm5 | ||
| 239 | |||
| 240 | .Ldec_entry: | ||
| 241 | # top of round | ||
| 242 | movdqa %xmm9, %xmm1 # 1 : i | ||
| 243 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
| 244 | psrld \$4, %xmm1 # 1 = i | ||
| 245 | pand %xmm9, %xmm0 # 0 = k | ||
| 246 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
| 247 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
| 248 | pxor %xmm1, %xmm0 # 0 = j | ||
| 249 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 250 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 251 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
| 252 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 253 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 254 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
| 255 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 256 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 257 | pxor %xmm0, %xmm2 # 2 = io | ||
| 258 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 259 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 260 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 261 | movdqu (%r9), %xmm0 | ||
| 262 | jnz .Ldec_loop | ||
| 263 | |||
| 264 | # middle of last round | ||
| 265 | movdqa 0x60(%r10), %xmm4 # 4 : sbou (.Lk_dsbo) | ||
| 266 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 267 | pxor %xmm0, %xmm4 # 4 = sbou + k | ||
| 268 | movdqa 0x70(%r10), %xmm0 # 0 : sbot | ||
| 269 | movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 | ||
| 270 | pshufb %xmm3, %xmm0 # 0 = sbot | ||
| 271 | pxor %xmm4, %xmm0 # 0 = A | ||
| 272 | pshufb %xmm2, %xmm0 | ||
| 273 | ret | ||
| 274 | .size _vpaes_decrypt_core,.-_vpaes_decrypt_core | ||
| 275 | |||
| 276 | ######################################################## | ||
| 277 | ## ## | ||
| 278 | ## AES key schedule ## | ||
| 279 | ## ## | ||
| 280 | ######################################################## | ||
| 281 | .type _vpaes_schedule_core,\@abi-omnipotent | ||
| 282 | .align 16 | ||
| 283 | _vpaes_schedule_core: | ||
| 284 | _CET_ENDBR | ||
| 285 | # rdi = key | ||
| 286 | # rsi = size in bits | ||
| 287 | # rdx = buffer | ||
| 288 | # rcx = direction. 0=encrypt, 1=decrypt (r8 = starting offset into .Lk_sr, set by the caller) | ||
| 289 | |||
| 290 | call _vpaes_preheat # load the tables | ||
| 291 | movdqa .Lk_rcon(%rip), %xmm8 # load rcon | ||
| 292 | movdqu (%rdi), %xmm0 # load key (unaligned) | ||
| 293 | |||
| 294 | # input transform | ||
| 295 | movdqa %xmm0, %xmm3 | ||
| 296 | lea .Lk_ipt(%rip), %r11 | ||
| 297 | call _vpaes_schedule_transform | ||
| 298 | movdqa %xmm0, %xmm7 | ||
| 299 | |||
| 300 | lea .Lk_sr(%rip),%r10 | ||
| 301 | test %rcx, %rcx | ||
| 302 | jnz .Lschedule_am_decrypting | ||
| 303 | |||
| 304 | # encrypting, output zeroth round key after transform | ||
| 305 | movdqu %xmm0, (%rdx) | ||
| 306 | jmp .Lschedule_go | ||
| 307 | |||
| 308 | .Lschedule_am_decrypting: | ||
| 309 | # decrypting, output zeroth round key after shiftrows | ||
| 310 | movdqa (%r8,%r10),%xmm1 | ||
| 311 | pshufb %xmm1, %xmm3 | ||
| 312 | movdqu %xmm3, (%rdx) | ||
| 313 | xor \$0x30, %r8 | ||
| 314 | |||
| 315 | .Lschedule_go: | ||
| 316 | cmp \$192, %esi | ||
| 317 | ja .Lschedule_256 | ||
| 318 | je .Lschedule_192 | ||
| 319 | # 128: fall through | ||
| 320 | |||
| 321 | ## | ||
| 322 | ## .schedule_128 | ||
| 323 | ## | ||
| 324 | ## 128-bit specific part of key schedule. | ||
| 325 | ## | ||
| 326 | ## This schedule is really simple, because all its parts | ||
| 327 | ## are accomplished by the subroutines. | ||
| 328 | ## | ||
| 329 | .Lschedule_128: | ||
| 330 | mov \$10, %esi | ||
| 331 | |||
| 332 | .Loop_schedule_128: | ||
| 333 | call _vpaes_schedule_round | ||
| 334 | dec %rsi | ||
| 335 | jz .Lschedule_mangle_last | ||
| 336 | call _vpaes_schedule_mangle # write output | ||
| 337 | jmp .Loop_schedule_128 | ||
| 338 | |||
| 339 | ## | ||
| 340 | ## .aes_schedule_192 | ||
| 341 | ## | ||
| 342 | ## 192-bit specific part of key schedule. | ||
| 343 | ## | ||
| 344 | ## The main body of this schedule is the same as the 128-bit | ||
| 345 | ## schedule, but with more smearing. The long, high side is | ||
| 346 | ## stored in %xmm7 as before, and the short, low side is in | ||
| 347 | ## the high bits of %xmm6. | ||
| 348 | ## | ||
| 349 | ## This schedule is somewhat nastier, however, because each | ||
| 350 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
| 351 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
| 352 | ## keys. | ||
| 353 | ## | ||
| 354 | .align 16 | ||
| 355 | .Lschedule_192: | ||
| 356 | movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) | ||
| 357 | call _vpaes_schedule_transform # input transform | ||
| 358 | movdqa %xmm0, %xmm6 # save short part | ||
| 359 | pxor %xmm4, %xmm4 # clear 4 | ||
| 360 | movhlps %xmm4, %xmm6 # clobber low side with zeros | ||
| 361 | mov \$4, %esi | ||
| 362 | |||
| 363 | .Loop_schedule_192: | ||
| 364 | call _vpaes_schedule_round | ||
| 365 | palignr \$8,%xmm6,%xmm0 | ||
| 366 | call _vpaes_schedule_mangle # save key n | ||
| 367 | call _vpaes_schedule_192_smear | ||
| 368 | call _vpaes_schedule_mangle # save key n+1 | ||
| 369 | call _vpaes_schedule_round | ||
| 370 | dec %rsi | ||
| 371 | jz .Lschedule_mangle_last | ||
| 372 | call _vpaes_schedule_mangle # save key n+2 | ||
| 373 | call _vpaes_schedule_192_smear | ||
| 374 | jmp .Loop_schedule_192 | ||
| 375 | |||
| 376 | ## | ||
| 377 | ## .aes_schedule_256 | ||
| 378 | ## | ||
| 379 | ## 256-bit specific part of key schedule. | ||
| 380 | ## | ||
| 381 | ## The structure here is very similar to the 128-bit | ||
| 382 | ## schedule, but with an additional "low side" in | ||
| 383 | ## %xmm6. The low side's rounds are the same as the | ||
| 384 | ## high side's, except no rcon and no rotation. | ||
| 385 | ## | ||
| 386 | .align 16 | ||
| 387 | .Lschedule_256: | ||
| 388 | movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) | ||
| 389 | call _vpaes_schedule_transform # input transform | ||
| 390 | mov \$7, %esi | ||
| 391 | |||
| 392 | .Loop_schedule_256: | ||
| 393 | call _vpaes_schedule_mangle # output low result | ||
| 394 | movdqa %xmm0, %xmm6 # save cur_lo in xmm6 | ||
| 395 | |||
| 396 | # high round | ||
| 397 | call _vpaes_schedule_round | ||
| 398 | dec %rsi | ||
| 399 | jz .Lschedule_mangle_last | ||
| 400 | call _vpaes_schedule_mangle | ||
| 401 | |||
| 402 | # low round. swap xmm7 and xmm6 | ||
| 403 | pshufd \$0xFF, %xmm0, %xmm0 | ||
| 404 | movdqa %xmm7, %xmm5 | ||
| 405 | movdqa %xmm6, %xmm7 | ||
| 406 | call _vpaes_schedule_low_round | ||
| 407 | movdqa %xmm5, %xmm7 | ||
| 408 | |||
| 409 | jmp .Loop_schedule_256 | ||
| 410 | |||
| 411 | |||
| 412 | ## | ||
| 413 | ## .aes_schedule_mangle_last | ||
| 414 | ## | ||
| 415 | ## Mangler for last round of key schedule | ||
| 416 | ## Mangles %xmm0 | ||
| 417 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
| 418 | ## when decrypting, outputs unskew(%xmm0) | ||
| 419 | ## | ||
| 420 | ## Always called right before return... jumps to cleanup and exits | ||
| 421 | ## | ||
| 422 | .align 16 | ||
| 423 | .Lschedule_mangle_last: | ||
| 424 | # schedule last round key from xmm0 | ||
| 425 | lea .Lk_deskew(%rip),%r11 # prepare to deskew | ||
| 426 | test %rcx, %rcx | ||
| 427 | jnz .Lschedule_mangle_last_dec | ||
| 428 | |||
| 429 | # encrypting | ||
| 430 | movdqa (%r8,%r10),%xmm1 | ||
| 431 | pshufb %xmm1, %xmm0 # output permute | ||
| 432 | lea .Lk_opt(%rip), %r11 # prepare to output transform | ||
| 433 | add \$32, %rdx | ||
| 434 | |||
| 435 | .Lschedule_mangle_last_dec: | ||
| 436 | add \$-16, %rdx | ||
| 437 | pxor .Lk_s63(%rip), %xmm0 | ||
| 438 | call _vpaes_schedule_transform # output transform | ||
| 439 | movdqu %xmm0, (%rdx) # save last key | ||
| 440 | |||
| 441 | # cleanup: zero %xmm0-%xmm7, which held key material | ||
| 442 | pxor %xmm0, %xmm0 | ||
| 443 | pxor %xmm1, %xmm1 | ||
| 444 | pxor %xmm2, %xmm2 | ||
| 445 | pxor %xmm3, %xmm3 | ||
| 446 | pxor %xmm4, %xmm4 | ||
| 447 | pxor %xmm5, %xmm5 | ||
| 448 | pxor %xmm6, %xmm6 | ||
| 449 | pxor %xmm7, %xmm7 | ||
| 450 | ret | ||
| 451 | .size _vpaes_schedule_core,.-_vpaes_schedule_core | ||
| 452 | |||
| 453 | ## | ||
| 454 | ## .aes_schedule_192_smear | ||
| 455 | ## | ||
| 456 | ## Smear the short, low side in the 192-bit key schedule. | ||
| 457 | ## | ||
| 458 | ## Inputs: | ||
| 459 | ## %xmm7: high side, b a x y | ||
| 460 | ## %xmm6: low side, d c 0 0 | ||
| 461 | ## %xmm13: not read by this code (the zero is generated locally in %xmm1) | ||
| 462 | ## | ||
| 463 | ## Outputs: | ||
| 464 | ## %xmm6: b+c+d b+c 0 0 | ||
| 465 | ## %xmm0: b+c+d b+c b a | ||
| 466 | ## Clobbers %xmm1. | ||
| 467 | .type _vpaes_schedule_192_smear,\@abi-omnipotent | ||
| 468 | .align 16 | ||
| 469 | _vpaes_schedule_192_smear: | ||
| 470 | _CET_ENDBR | ||
| 471 | pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 | ||
| 472 | pxor %xmm0, %xmm6 # -> c+d c 0 0 | ||
| 473 | pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a | ||
| 474 | pxor %xmm0, %xmm6 # -> b+c+d b+c b a | ||
| 475 | movdqa %xmm6, %xmm0 | ||
| 476 | pxor %xmm1, %xmm1 | ||
| 477 | movhlps %xmm1, %xmm6 # clobber low side with zeros | ||
| 478 | ret | ||
| 479 | .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear | ||
| 480 | |||
| 481 | ## | ||
| 482 | ## .aes_schedule_round | ||
| 483 | ## | ||
| 484 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
| 485 | ## | ||
| 486 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
| 487 | ## then rotates it by one byte and xors into the low dword of | ||
| 488 | ## %xmm7. | ||
| 489 | ## | ||
| 490 | ## Adds rcon, taken from the top byte of %xmm8, into the low byte of | ||
| 491 | ## %xmm7, then rotates %xmm8 by one byte for the next rcon. | ||
| 492 | ## | ||
| 493 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
| 494 | ## second low, result into third, result into highest. | ||
| 495 | ## | ||
| 496 | ## Returns results in %xmm7 = %xmm0. | ||
| 497 | ## Clobbers %xmm1-%xmm4 (%r11 is not touched here). | ||
| 498 | ## | ||
| 499 | .type _vpaes_schedule_round,\@abi-omnipotent | ||
| 500 | .align 16 | ||
| 501 | _vpaes_schedule_round: | ||
| 502 | _CET_ENDBR | ||
| 503 | # extract rcon from xmm8 | ||
| 504 | pxor %xmm1, %xmm1 | ||
| 505 | palignr \$15, %xmm8, %xmm1 | ||
| 506 | palignr \$15, %xmm8, %xmm8 | ||
| 507 | pxor %xmm1, %xmm7 | ||
| 508 | |||
| 509 | # rotate | ||
| 510 | pshufd \$0xFF, %xmm0, %xmm0 | ||
| 511 | palignr \$1, %xmm0, %xmm0 | ||
| 512 | |||
| 513 | # fall through... | ||
| 514 | |||
| 515 | # low round: same as high round, but no rotation and no rcon. | ||
| 516 | _vpaes_schedule_low_round: | ||
| 517 | # smear xmm7 | ||
| 518 | movdqa %xmm7, %xmm1 | ||
| 519 | pslldq \$4, %xmm7 | ||
| 520 | pxor %xmm1, %xmm7 | ||
| 521 | movdqa %xmm7, %xmm1 | ||
| 522 | pslldq \$8, %xmm7 | ||
| 523 | pxor %xmm1, %xmm7 | ||
| 524 | pxor .Lk_s63(%rip), %xmm7 | ||
| 525 | |||
| 526 | # subbytes | ||
| 527 | movdqa %xmm9, %xmm1 | ||
| 528 | pandn %xmm0, %xmm1 | ||
| 529 | psrld \$4, %xmm1 # 1 = i | ||
| 530 | pand %xmm9, %xmm0 # 0 = k | ||
| 531 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
| 532 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
| 533 | pxor %xmm1, %xmm0 # 0 = j | ||
| 534 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 535 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 536 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
| 537 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 538 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 539 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
| 540 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 541 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 542 | pxor %xmm0, %xmm2 # 2 = io | ||
| 543 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 544 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 545 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 546 | movdqa %xmm13, %xmm4 # 4 : sb1u (xmm13 = .Lk_sb1 from preheat) | ||
| 547 | pshufb %xmm2, %xmm4 # 4 = sb1u | ||
| 548 | movdqa %xmm12, %xmm0 # 0 : sb1t (xmm12 = .Lk_sb1+16 from preheat) | ||
| 549 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 550 | pxor %xmm4, %xmm0 # 0 = sbox output | ||
| 551 | |||
| 552 | # add in smeared stuff | ||
| 553 | pxor %xmm7, %xmm0 | ||
| 554 | movdqa %xmm0, %xmm7 | ||
| 555 | ret | ||
| 556 | .size _vpaes_schedule_round,.-_vpaes_schedule_round | ||
| 557 | |||
| 558 | ## | ||
| 559 | ## .aes_schedule_transform | ||
| 560 | ## | ||
| 561 | ## Linear-transform %xmm0 according to tables at (%r11) | ||
| 562 | ## Low nibbles of each byte index the table at (%r11), high nibbles the table at 16(%r11); the two lookups are xored. | ||
| 563 | ## Requires that %xmm9 = 0x0F0F... as in preheat | ||
| 564 | ## Output in %xmm0 | ||
| 565 | ## Clobbers %xmm1, %xmm2 | ||
| 566 | ## | ||
| 567 | .type _vpaes_schedule_transform,\@abi-omnipotent | ||
| 568 | .align 16 | ||
| 569 | _vpaes_schedule_transform: | ||
| 570 | _CET_ENDBR | ||
| 571 | movdqa %xmm9, %xmm1 | ||
| 572 | pandn %xmm0, %xmm1 | ||
| 573 | psrld \$4, %xmm1 | ||
| 574 | pand %xmm9, %xmm0 | ||
| 575 | movdqa (%r11), %xmm2 # lo table, indexed by low nibbles | ||
| 576 | pshufb %xmm0, %xmm2 | ||
| 577 | movdqa 16(%r11), %xmm0 # hi table, indexed by high nibbles | ||
| 578 | pshufb %xmm1, %xmm0 | ||
| 579 | pxor %xmm2, %xmm0 | ||
| 580 | ret | ||
| 581 | .size _vpaes_schedule_transform,.-_vpaes_schedule_transform | ||
| 582 | |||
| 583 | ## | ||
| 584 | ## .aes_schedule_mangle | ||
| 585 | ## | ||
| 586 | ## Mangle xmm0 from (basis-transformed) standard version | ||
| 587 | ## to our version. | ||
| 588 | ## | ||
| 589 | ## On encrypt, | ||
| 590 | ## xor with 0x63 | ||
| 591 | ## multiply by circulant 0,1,1,1 | ||
| 592 | ## apply shiftrows transform | ||
| 593 | ## | ||
| 594 | ## On decrypt, | ||
| 595 | ## xor with 0x63 | ||
| 596 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
| 597 | ## deskew | ||
| 598 | ## apply shiftrows transform | ||
| 599 | ## | ||
| 600 | ## | ||
| 601 | ## Writes out to (%rdx), and increments or decrements it | ||
| 602 | ## Keeps track of round number mod 4 in %r8 (a 16-byte-step offset into the table at %r10) | ||
| 603 | ## Preserves xmm0 | ||
| 604 | ## Clobbers xmm1-xmm5 (and %r11 when decrypting) | ||
| 605 | ## | ||
| 606 | .type _vpaes_schedule_mangle,\@abi-omnipotent | ||
| 607 | .align 16 | ||
| 608 | _vpaes_schedule_mangle: | ||
| 609 | _CET_ENDBR | ||
| 610 | movdqa %xmm0, %xmm4 # save xmm0 for later | ||
| 611 | movdqa .Lk_mc_forward(%rip),%xmm5 | ||
| 612 | test %rcx, %rcx | ||
| 613 | jnz .Lschedule_mangle_dec | ||
| 614 | |||
| 615 | # encrypting | ||
| 616 | add \$16, %rdx | ||
| 617 | pxor .Lk_s63(%rip),%xmm4 | ||
| 618 | pshufb %xmm5, %xmm4 | ||
| 619 | movdqa %xmm4, %xmm3 | ||
| 620 | pshufb %xmm5, %xmm4 | ||
| 621 | pxor %xmm4, %xmm3 | ||
| 622 | pshufb %xmm5, %xmm4 | ||
| 623 | pxor %xmm4, %xmm3 | ||
| 624 | |||
| 625 | jmp .Lschedule_mangle_both | ||
| 626 | .align 16 | ||
| 627 | .Lschedule_mangle_dec: | ||
| 628 | # inverse mix columns | ||
| 629 | lea .Lk_dksd(%rip),%r11 | ||
| 630 | movdqa %xmm9, %xmm1 | ||
| 631 | pandn %xmm4, %xmm1 | ||
| 632 | psrld \$4, %xmm1 # 1 = hi | ||
| 633 | pand %xmm9, %xmm4 # 4 = lo | ||
| 634 | |||
| 635 | movdqa 0x00(%r11), %xmm2 | ||
| 636 | pshufb %xmm4, %xmm2 | ||
| 637 | movdqa 0x10(%r11), %xmm3 | ||
| 638 | pshufb %xmm1, %xmm3 | ||
| 639 | pxor %xmm2, %xmm3 | ||
| 640 | pshufb %xmm5, %xmm3 | ||
| 641 | |||
| 642 | movdqa 0x20(%r11), %xmm2 | ||
| 643 | pshufb %xmm4, %xmm2 | ||
| 644 | pxor %xmm3, %xmm2 | ||
| 645 | movdqa 0x30(%r11), %xmm3 | ||
| 646 | pshufb %xmm1, %xmm3 | ||
| 647 | pxor %xmm2, %xmm3 | ||
| 648 | pshufb %xmm5, %xmm3 | ||
| 649 | |||
| 650 | movdqa 0x40(%r11), %xmm2 | ||
| 651 | pshufb %xmm4, %xmm2 | ||
| 652 | pxor %xmm3, %xmm2 | ||
| 653 | movdqa 0x50(%r11), %xmm3 | ||
| 654 | pshufb %xmm1, %xmm3 | ||
| 655 | pxor %xmm2, %xmm3 | ||
| 656 | pshufb %xmm5, %xmm3 | ||
| 657 | |||
| 658 | movdqa 0x60(%r11), %xmm2 | ||
| 659 | pshufb %xmm4, %xmm2 | ||
| 660 | pxor %xmm3, %xmm2 | ||
| 661 | movdqa 0x70(%r11), %xmm3 | ||
| 662 | pshufb %xmm1, %xmm3 | ||
| 663 | pxor %xmm2, %xmm3 | ||
| 664 | |||
| 665 | add \$-16, %rdx | ||
| 666 | |||
| 667 | .Lschedule_mangle_both: | ||
| 668 | movdqa (%r8,%r10),%xmm1 | ||
| 669 | pshufb %xmm1,%xmm3 | ||
| 670 | add \$-16, %r8 | ||
| 671 | and \$0x30, %r8 | ||
| 672 | movdqu %xmm3, (%rdx) | ||
| 673 | ret | ||
| 674 | .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle | ||
| 675 | |||
| 676 | # | ||
| 677 | # Interface to OpenSSL | ||
| 678 | # set_encrypt_key: %rdi = user key, %esi = key size in bits, %rdx = AES_KEY; always returns 0 in %eax. | ||
| 679 | .globl ${PREFIX}_set_encrypt_key | ||
| 680 | .type ${PREFIX}_set_encrypt_key,\@function,3 | ||
| 681 | .align 16 | ||
| 682 | ${PREFIX}_set_encrypt_key: | ||
| 683 | _CET_ENDBR | ||
| 684 | ___ | ||
| 685 | $code.=<<___ if ($win64); | ||
| 686 | lea -0xb8(%rsp),%rsp | ||
| 687 | movaps %xmm6,0x10(%rsp) | ||
| 688 | movaps %xmm7,0x20(%rsp) | ||
| 689 | movaps %xmm8,0x30(%rsp) | ||
| 690 | movaps %xmm9,0x40(%rsp) | ||
| 691 | movaps %xmm10,0x50(%rsp) | ||
| 692 | movaps %xmm11,0x60(%rsp) | ||
| 693 | movaps %xmm12,0x70(%rsp) | ||
| 694 | movaps %xmm13,0x80(%rsp) | ||
| 695 | movaps %xmm14,0x90(%rsp) | ||
| 696 | movaps %xmm15,0xa0(%rsp) | ||
| 697 | .Lenc_key_body: | ||
| 698 | ___ | ||
| 699 | $code.=<<___; | ||
| 700 | mov %esi,%eax | ||
| 701 | shr \$5,%eax | ||
| 702 | add \$5,%eax | ||
| 703 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; (stored at byte offset 240) | ||
| 704 | |||
| 705 | mov \$0,%ecx | ||
| 706 | mov \$0x30,%r8d | ||
| 707 | call _vpaes_schedule_core | ||
| 708 | ___ | ||
| 709 | $code.=<<___ if ($win64); | ||
| 710 | movaps 0x10(%rsp),%xmm6 | ||
| 711 | movaps 0x20(%rsp),%xmm7 | ||
| 712 | movaps 0x30(%rsp),%xmm8 | ||
| 713 | movaps 0x40(%rsp),%xmm9 | ||
| 714 | movaps 0x50(%rsp),%xmm10 | ||
| 715 | movaps 0x60(%rsp),%xmm11 | ||
| 716 | movaps 0x70(%rsp),%xmm12 | ||
| 717 | movaps 0x80(%rsp),%xmm13 | ||
| 718 | movaps 0x90(%rsp),%xmm14 | ||
| 719 | movaps 0xa0(%rsp),%xmm15 | ||
| 720 | lea 0xb8(%rsp),%rsp | ||
| 721 | .Lenc_key_epilogue: | ||
| 722 | ___ | ||
| 723 | $code.=<<___; | ||
| 724 | xor %eax,%eax | ||
| 725 | ret | ||
| 726 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key | ||
| 727 | |||
| 728 | .globl ${PREFIX}_set_decrypt_key | ||
| 729 | .type ${PREFIX}_set_decrypt_key,\@function,3 | ||
| 730 | .align 16 | ||
| 731 | ${PREFIX}_set_decrypt_key: | ||
| 732 | _CET_ENDBR | ||
| 733 | ___ | ||
| 734 | $code.=<<___ if ($win64); | ||
| 735 | lea -0xb8(%rsp),%rsp | ||
| 736 | movaps %xmm6,0x10(%rsp) | ||
| 737 | movaps %xmm7,0x20(%rsp) | ||
| 738 | movaps %xmm8,0x30(%rsp) | ||
| 739 | movaps %xmm9,0x40(%rsp) | ||
| 740 | movaps %xmm10,0x50(%rsp) | ||
| 741 | movaps %xmm11,0x60(%rsp) | ||
| 742 | movaps %xmm12,0x70(%rsp) | ||
| 743 | movaps %xmm13,0x80(%rsp) | ||
| 744 | movaps %xmm14,0x90(%rsp) | ||
| 745 | movaps %xmm15,0xa0(%rsp) | ||
| 746 | .Ldec_key_body: | ||
| 747 | ___ | ||
| 748 | $code.=<<___; | ||
| 749 | mov %esi,%eax | ||
| 750 | shr \$5,%eax | ||
| 751 | add \$5,%eax | ||
| 752 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; next two insns move %rdx to key+16+16*rounds, the schedule is then written downwards | ||
| 753 | shl \$4,%eax | ||
| 754 | lea 16(%rdx,%rax),%rdx | ||
| 755 | |||
| 756 | mov \$1,%ecx | ||
| 757 | mov %esi,%r8d | ||
| 758 | shr \$1,%r8d | ||
| 759 | and \$32,%r8d | ||
| 760 | xor \$32,%r8d # r8 = nbits==192?0:32 (starting offset into .Lk_sr for the schedule core) | ||
| 761 | call _vpaes_schedule_core | ||
| 762 | ___ | ||
| 763 | $code.=<<___ if ($win64); | ||
| 764 | movaps 0x10(%rsp),%xmm6 | ||
| 765 | movaps 0x20(%rsp),%xmm7 | ||
| 766 | movaps 0x30(%rsp),%xmm8 | ||
| 767 | movaps 0x40(%rsp),%xmm9 | ||
| 768 | movaps 0x50(%rsp),%xmm10 | ||
| 769 | movaps 0x60(%rsp),%xmm11 | ||
| 770 | movaps 0x70(%rsp),%xmm12 | ||
| 771 | movaps 0x80(%rsp),%xmm13 | ||
| 772 | movaps 0x90(%rsp),%xmm14 | ||
| 773 | movaps 0xa0(%rsp),%xmm15 | ||
| 774 | lea 0xb8(%rsp),%rsp | ||
| 775 | .Ldec_key_epilogue: | ||
| 776 | ___ | ||
| 777 | $code.=<<___; | ||
| 778 | xor %eax,%eax | ||
| 779 | ret | ||
| 780 | .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key | ||
| 781 | |||
| | # Encrypt one 16-byte block: %rdi = input, %rsi = output, %rdx = key schedule (read by _vpaes_encrypt_core). | ||
| 782 | .globl ${PREFIX}_encrypt | ||
| 783 | .type ${PREFIX}_encrypt,\@function,3 | ||
| 784 | .align 16 | ||
| 785 | ${PREFIX}_encrypt: | ||
| 786 | _CET_ENDBR | ||
| 787 | ___ | ||
| 788 | $code.=<<___ if ($win64); | ||
| 789 | lea -0xb8(%rsp),%rsp | ||
| 790 | movaps %xmm6,0x10(%rsp) | ||
| 791 | movaps %xmm7,0x20(%rsp) | ||
| 792 | movaps %xmm8,0x30(%rsp) | ||
| 793 | movaps %xmm9,0x40(%rsp) | ||
| 794 | movaps %xmm10,0x50(%rsp) | ||
| 795 | movaps %xmm11,0x60(%rsp) | ||
| 796 | movaps %xmm12,0x70(%rsp) | ||
| 797 | movaps %xmm13,0x80(%rsp) | ||
| 798 | movaps %xmm14,0x90(%rsp) | ||
| 799 | movaps %xmm15,0xa0(%rsp) | ||
| 800 | .Lenc_body: | ||
| 801 | ___ | ||
| 802 | $code.=<<___; | ||
| 803 | movdqu (%rdi),%xmm0 | ||
| 804 | call _vpaes_preheat | ||
| 805 | call _vpaes_encrypt_core | ||
| 806 | movdqu %xmm0,(%rsi) | ||
| 807 | ___ | ||
| 808 | $code.=<<___ if ($win64); | ||
| 809 | movaps 0x10(%rsp),%xmm6 | ||
| 810 | movaps 0x20(%rsp),%xmm7 | ||
| 811 | movaps 0x30(%rsp),%xmm8 | ||
| 812 | movaps 0x40(%rsp),%xmm9 | ||
| 813 | movaps 0x50(%rsp),%xmm10 | ||
| 814 | movaps 0x60(%rsp),%xmm11 | ||
| 815 | movaps 0x70(%rsp),%xmm12 | ||
| 816 | movaps 0x80(%rsp),%xmm13 | ||
| 817 | movaps 0x90(%rsp),%xmm14 | ||
| 818 | movaps 0xa0(%rsp),%xmm15 | ||
| 819 | lea 0xb8(%rsp),%rsp | ||
| 820 | .Lenc_epilogue: | ||
| 821 | ___ | ||
| 822 | $code.=<<___; | ||
| 823 | ret | ||
| 824 | .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt | ||
| 825 | |||
| | # Decrypt one 16-byte block: %rdi = input, %rsi = output, %rdx = key schedule (read by _vpaes_decrypt_core). | ||
| 826 | .globl ${PREFIX}_decrypt | ||
| 827 | .type ${PREFIX}_decrypt,\@function,3 | ||
| 828 | .align 16 | ||
| 829 | ${PREFIX}_decrypt: | ||
| 830 | _CET_ENDBR | ||
| 831 | ___ | ||
| 832 | $code.=<<___ if ($win64); | ||
| 833 | lea -0xb8(%rsp),%rsp | ||
| 834 | movaps %xmm6,0x10(%rsp) | ||
| 835 | movaps %xmm7,0x20(%rsp) | ||
| 836 | movaps %xmm8,0x30(%rsp) | ||
| 837 | movaps %xmm9,0x40(%rsp) | ||
| 838 | movaps %xmm10,0x50(%rsp) | ||
| 839 | movaps %xmm11,0x60(%rsp) | ||
| 840 | movaps %xmm12,0x70(%rsp) | ||
| 841 | movaps %xmm13,0x80(%rsp) | ||
| 842 | movaps %xmm14,0x90(%rsp) | ||
| 843 | movaps %xmm15,0xa0(%rsp) | ||
| 844 | .Ldec_body: | ||
| 845 | ___ | ||
| 846 | $code.=<<___; | ||
| 847 | movdqu (%rdi),%xmm0 | ||
| 848 | call _vpaes_preheat | ||
| 849 | call _vpaes_decrypt_core | ||
| 850 | movdqu %xmm0,(%rsi) | ||
| 851 | ___ | ||
| 852 | $code.=<<___ if ($win64); | ||
| 853 | movaps 0x10(%rsp),%xmm6 | ||
| 854 | movaps 0x20(%rsp),%xmm7 | ||
| 855 | movaps 0x30(%rsp),%xmm8 | ||
| 856 | movaps 0x40(%rsp),%xmm9 | ||
| 857 | movaps 0x50(%rsp),%xmm10 | ||
| 858 | movaps 0x60(%rsp),%xmm11 | ||
| 859 | movaps 0x70(%rsp),%xmm12 | ||
| 860 | movaps 0x80(%rsp),%xmm13 | ||
| 861 | movaps 0x90(%rsp),%xmm14 | ||
| 862 | movaps 0xa0(%rsp),%xmm15 | ||
| 863 | lea 0xb8(%rsp),%rsp | ||
| 864 | .Ldec_epilogue: | ||
| 865 | ___ | ||
| 866 | $code.=<<___; | ||
| 867 | ret | ||
| 868 | .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt | ||
| 869 | ___ | ||
| 870 | { | ||
| 871 | my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||
| 872 | # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, | ||
| 873 | # size_t length, const AES_KEY *key, | ||
| 874 | # unsigned char *ivp,const int enc); | ||
| 875 | $code.=<<___; | ||
| 876 | .globl ${PREFIX}_cbc_encrypt | ||
| 877 | .type ${PREFIX}_cbc_encrypt,\@function,6 | ||
| 878 | .align 16 | ||
| 879 | ${PREFIX}_cbc_encrypt: | ||
| 880 | _CET_ENDBR | ||
| 881 | xchg $key,$len | ||
| 882 | ___ | ||
| 883 | ($len,$key)=($key,$len); | ||
| 884 | $code.=<<___; | ||
| 885 | sub \$16,$len | ||
| 886 | jc .Lcbc_abort | ||
| 887 | ___ | ||
| 888 | $code.=<<___ if ($win64); | ||
| 889 | lea -0xb8(%rsp),%rsp | ||
| 890 | movaps %xmm6,0x10(%rsp) | ||
| 891 | movaps %xmm7,0x20(%rsp) | ||
| 892 | movaps %xmm8,0x30(%rsp) | ||
| 893 | movaps %xmm9,0x40(%rsp) | ||
| 894 | movaps %xmm10,0x50(%rsp) | ||
| 895 | movaps %xmm11,0x60(%rsp) | ||
| 896 | movaps %xmm12,0x70(%rsp) | ||
| 897 | movaps %xmm13,0x80(%rsp) | ||
| 898 | movaps %xmm14,0x90(%rsp) | ||
| 899 | movaps %xmm15,0xa0(%rsp) | ||
| 900 | .Lcbc_body: | ||
| 901 | ___ | ||
| 902 | $code.=<<___; | ||
| 903 | movdqu ($ivp),%xmm6 # load IV; xmm6 carries the chaining value through both loops | ||
| 904 | sub $inp,$out | ||
| 905 | call _vpaes_preheat | ||
| 906 | cmp \$0,${enc}d | ||
| 907 | je .Lcbc_dec_loop | ||
| 908 | jmp .Lcbc_enc_loop | ||
| 909 | .align 16 | ||
| 910 | .Lcbc_enc_loop: | ||
| 911 | movdqu ($inp),%xmm0 | ||
| 912 | pxor %xmm6,%xmm0 | ||
| 913 | call _vpaes_encrypt_core | ||
| 914 | movdqa %xmm0,%xmm6 | ||
| 915 | movdqu %xmm0,($out,$inp) | ||
| 916 | lea 16($inp),$inp | ||
| 917 | sub \$16,$len | ||
| 918 | jnc .Lcbc_enc_loop | ||
| 919 | jmp .Lcbc_done | ||
| 920 | .align 16 | ||
| 921 | .Lcbc_dec_loop: | ||
| 922 | movdqu ($inp),%xmm0 | ||
| 923 | movdqa %xmm0,%xmm7 | ||
| 924 | call _vpaes_decrypt_core | ||
| 925 | pxor %xmm6,%xmm0 | ||
| 926 | movdqa %xmm7,%xmm6 | ||
| 927 | movdqu %xmm0,($out,$inp) | ||
| 928 | lea 16($inp),$inp | ||
| 929 | sub \$16,$len | ||
| 930 | jnc .Lcbc_dec_loop | ||
| 931 | .Lcbc_done: | ||
| 932 | movdqu %xmm6,($ivp) # save IV: write the final chaining value back to ivp | ||
| 933 | ___ | ||
| 934 | $code.=<<___ if ($win64); | ||
| 935 | movaps 0x10(%rsp),%xmm6 | ||
| 936 | movaps 0x20(%rsp),%xmm7 | ||
| 937 | movaps 0x30(%rsp),%xmm8 | ||
| 938 | movaps 0x40(%rsp),%xmm9 | ||
| 939 | movaps 0x50(%rsp),%xmm10 | ||
| 940 | movaps 0x60(%rsp),%xmm11 | ||
| 941 | movaps 0x70(%rsp),%xmm12 | ||
| 942 | movaps 0x80(%rsp),%xmm13 | ||
| 943 | movaps 0x90(%rsp),%xmm14 | ||
| 944 | movaps 0xa0(%rsp),%xmm15 | ||
| 945 | lea 0xb8(%rsp),%rsp | ||
| 946 | .Lcbc_epilogue: | ||
| 947 | ___ | ||
| 948 | $code.=<<___; | ||
| 949 | .Lcbc_abort: | ||
| 950 | ret | ||
| 951 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt | ||
| 952 | ___ | ||
| 953 | } | ||
| 954 | $code.=<<___; | ||
| 955 | ## | ||
| 956 | ## _vpaes_preheat | ||
| 957 | ## | ||
| 958 | ## Fills register %r10 -> .Lk_s0F (rip-relative, so you can -fPIC) | ||
| 959 | ## and %xmm9-%xmm15 as specified below. | ||
| 960 | ## | ||
| 961 | .type _vpaes_preheat,\@abi-omnipotent | ||
| 962 | .align 16 | ||
| 963 | _vpaes_preheat: | ||
| 964 | _CET_ENDBR | ||
| 965 | lea .Lk_s0F(%rip), %r10 | ||
| 966 | movdqa -0x20(%r10), %xmm10 # .Lk_inv | ||
| 967 | movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 | ||
| 968 | movdqa 0x00(%r10), %xmm9 # .Lk_s0F | ||
| 969 | movdqa 0x30(%r10), %xmm13 # .Lk_sb1 | ||
| 970 | movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 | ||
| 971 | movdqa 0x50(%r10), %xmm15 # .Lk_sb2 | ||
| 972 | movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 | ||
| 973 | ret | ||
| 974 | .size _vpaes_preheat,.-_vpaes_preheat | ||
| 975 | ######################################################## | ||
| 976 | ## ## | ||
| 977 | ## Constants ## | ||
| 978 | ## Table order is load-bearing: code uses fixed offsets between labels ## | ||
| 979 | ######################################################## | ||
| 980 | .section .rodata | ||
| 981 | .type _vpaes_consts,\@object | ||
| 982 | .align 64 | ||
| 983 | _vpaes_consts: | ||
| 984 | .Lk_inv: # inv, inva | ||
| 985 | .quad 0x0E05060F0D080180, 0x040703090A0B0C02 | ||
| 986 | .quad 0x01040A060F0B0780, 0x030D0E0C02050809 | ||
| 987 | |||
| 988 | .Lk_s0F: # s0F: low-nibble mask (0x0F in every byte) | ||
| 989 | .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F | ||
| 990 | |||
| 991 | .Lk_ipt: # input transform (lo, hi) | ||
| 992 | .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 | ||
| 993 | .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 | ||
| 994 | |||
| 995 | .Lk_sb1: # sb1u, sb1t | ||
| 996 | .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 | ||
| 997 | .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF | ||
| 998 | .Lk_sb2: # sb2u, sb2t | ||
| 999 | .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD | ||
| 1000 | .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A | ||
| 1001 | .Lk_sbo: # sbou, sbot | ||
| 1002 | .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 | ||
| 1003 | .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA | ||
| 1004 | |||
| 1005 | .Lk_mc_forward: # mc_forward | ||
| 1006 | .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 | ||
| 1007 | .quad 0x080B0A0904070605, 0x000302010C0F0E0D | ||
| 1008 | .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 | ||
| 1009 | .quad 0x000302010C0F0E0D, 0x080B0A0904070605 | ||
| 1010 | |||
| 1011 | .Lk_mc_backward:# mc_backward | ||
| 1012 | .quad 0x0605040702010003, 0x0E0D0C0F0A09080B | ||
| 1013 | .quad 0x020100030E0D0C0F, 0x0A09080B06050407 | ||
| 1014 | .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 | ||
| 1015 | .quad 0x0A09080B06050407, 0x020100030E0D0C0F | ||
| 1016 | |||
| 1017 | .Lk_sr: # sr: shiftrows permutations, one 16-byte entry per (round mod 4) | ||
| 1018 | .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 | ||
| 1019 | .quad 0x030E09040F0A0500, 0x0B06010C07020D08 | ||
| 1020 | .quad 0x0F060D040B020900, 0x070E050C030A0108 | ||
| 1021 | .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 | ||
| 1022 | |||
| 1023 | .Lk_rcon: # rcon: round constants, one byte consumed per _vpaes_schedule_round | ||
| 1024 | .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 | ||
| 1025 | |||
| 1026 | .Lk_s63: # s63: all equal to 0x63 transformed | ||
| 1027 | .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B | ||
| 1028 | |||
| 1029 | .Lk_opt: # output transform | ||
| 1030 | .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 | ||
| 1031 | .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 | ||
| 1032 | |||
| 1033 | .Lk_deskew: # deskew tables: inverts the sbox's "skew" | ||
| 1034 | .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A | ||
| 1035 | .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 | ||
| 1036 | |||
| 1037 | ## | ||
| 1038 | ## Decryption stuff | ||
| 1039 | ## Key schedule constants | ||
| 1040 | ## | ||
| 1041 | .Lk_dksd: # decryption key schedule: invskew x*D | ||
| 1042 | .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 | ||
| 1043 | .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E | ||
| 1044 | .Lk_dksb: # decryption key schedule: invskew x*B | ||
| 1045 | .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 | ||
| 1046 | .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 | ||
| 1047 | .Lk_dkse: # decryption key schedule: invskew x*E + 0x63 | ||
| 1048 | .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 | ||
| 1049 | .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 | ||
| 1050 | .Lk_dks9: # decryption key schedule: invskew x*9 | ||
| 1051 | .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC | ||
| 1052 | .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE | ||
| 1053 | |||
| 1054 | ## | ||
| 1055 | ## Decryption stuff | ||
| 1056 | ## Round function constants | ||
| 1057 | ## | ||
| 1058 | .Lk_dipt: # decryption input transform | ||
| 1059 | .quad 0x0F505B040B545F00, 0x154A411E114E451A | ||
| 1060 | .quad 0x86E383E660056500, 0x12771772F491F194 | ||
| 1061 | |||
| 1062 | .Lk_dsb9: # decryption sbox output *9*u, *9*t | ||
| 1063 | .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 | ||
| 1064 | .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 | ||
| 1065 | .Lk_dsbd: # decryption sbox output *D*u, *D*t | ||
| 1066 | .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 | ||
| 1067 | .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 | ||
| 1068 | .Lk_dsbb: # decryption sbox output *B*u, *B*t | ||
| 1069 | .quad 0xD022649296B44200, 0x602646F6B0F2D404 | ||
| 1070 | .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B | ||
| 1071 | .Lk_dsbe: # decryption sbox output *E*u, *E*t | ||
| 1072 | .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 | ||
| 1073 | .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 | ||
| 1074 | .Lk_dsbo: # decryption sbox final output | ||
| 1075 | .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D | ||
| 1076 | .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C | ||
| 1077 | .align 64 | ||
| 1078 | .size _vpaes_consts,.-_vpaes_consts | ||
| 1079 | .text | ||
| 1080 | ___ | ||
| 1081 | |||
| 1082 | if ($win64) { | ||
| 1083 | # Win64 only: emit se_handler and the .pdata/.xdata entries that attach it to the five ${PREFIX}_* routines. C prototype: EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 1084 | # CONTEXT *context,DISPATCHER_CONTEXT *disp); the variables below name the argument registers (only $context and $disp are used in this block). When context->Rip is at or past HandlerData[0] and before HandlerData[1], 20 quadwords are copied from context->Rsp+16 to context+512 and 0xb8 is added to that stack pointer; in every case Rdi/Rsi are reloaded from offsets 8/16 of %rax (context->Rax when Rip is before HandlerData[0], otherwise the stack pointer), the CONTEXT is copied to disp->ContextRecord, RtlVirtualUnwind is called and 1 is returned. | ||
| 1085 | $rec="%rcx"; | ||
| 1086 | $frame="%rdx"; | ||
| 1087 | $context="%r8"; | ||
| 1088 | $disp="%r9"; | ||
| 1089 | |||
| 1090 | $code.=<<___; | ||
| 1091 | .extern __imp_RtlVirtualUnwind | ||
| 1092 | .type se_handler,\@abi-omnipotent | ||
| 1093 | .align 16 | ||
| 1094 | se_handler: | ||
| 1095 | _CET_ENDBR | ||
| 1096 | push %rsi | ||
| 1097 | push %rdi | ||
| 1098 | push %rbx | ||
| 1099 | push %rbp | ||
| 1100 | push %r12 | ||
| 1101 | push %r13 | ||
| 1102 | push %r14 | ||
| 1103 | push %r15 | ||
| 1104 | pushfq | ||
| 1105 | sub \$64,%rsp | ||
| 1106 | |||
| 1107 | mov 120($context),%rax # pull context->Rax | ||
| 1108 | mov 248($context),%rbx # pull context->Rip | ||
| 1109 | |||
| 1110 | mov 8($disp),%rsi # disp->ImageBase | ||
| 1111 | mov 56($disp),%r11 # disp->HandlerData | ||
| 1112 | |||
| 1113 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 1114 | lea (%rsi,%r10),%r10 # prologue label | ||
| 1115 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 1116 | jb .Lin_prologue | ||
| 1117 | |||
| 1118 | mov 152($context),%rax # pull context->Rsp | ||
| 1119 | |||
| 1120 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 1121 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 1122 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 1123 | jae .Lin_prologue | ||
| 1124 | |||
| 1125 | lea 16(%rax),%rsi # %xmm save area | ||
| 1126 | lea 512($context),%rdi # &context.Xmm6 | ||
| 1127 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
| 1128 | .long 0xa548f3fc # cld; rep movsq | ||
| 1129 | lea 0xb8(%rax),%rax # adjust stack pointer | ||
| 1130 | |||
| 1131 | .Lin_prologue: | ||
| 1132 | mov 8(%rax),%rdi | ||
| 1133 | mov 16(%rax),%rsi | ||
| 1134 | mov %rax,152($context) # restore context->Rsp | ||
| 1135 | mov %rsi,168($context) # restore context->Rsi | ||
| 1136 | mov %rdi,176($context) # restore context->Rdi | ||
| 1137 | |||
| 1138 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 1139 | mov $context,%rsi # context | ||
| 1140 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
| 1141 | .long 0xa548f3fc # cld; rep movsq | ||
| 1142 | |||
| 1143 | mov $disp,%rsi | ||
| 1144 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 1145 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 1146 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 1147 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 1148 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 1149 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 1150 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 1151 | mov %r10,32(%rsp) # arg5 | ||
| 1152 | mov %r11,40(%rsp) # arg6 | ||
| 1153 | mov %r12,48(%rsp) # arg7 | ||
| 1154 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 1155 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 1156 | |||
| 1157 | mov \$1,%eax # ExceptionContinueSearch | ||
| 1158 | add \$64,%rsp | ||
| 1159 | popfq | ||
| 1160 | pop %r15 | ||
| 1161 | pop %r14 | ||
| 1162 | pop %r13 | ||
| 1163 | pop %r12 | ||
| 1164 | pop %rbp | ||
| 1165 | pop %rbx | ||
| 1166 | pop %rdi | ||
| 1167 | pop %rsi | ||
| 1168 | ret | ||
| 1169 | .size se_handler,.-se_handler | ||
| 1170 | |||
| 1171 | .section .pdata | ||
| 1172 | .align 4 | ||
| 1173 | .rva .LSEH_begin_${PREFIX}_set_encrypt_key | ||
| 1174 | .rva .LSEH_end_${PREFIX}_set_encrypt_key | ||
| 1175 | .rva .LSEH_info_${PREFIX}_set_encrypt_key | ||
| 1176 | |||
| 1177 | .rva .LSEH_begin_${PREFIX}_set_decrypt_key | ||
| 1178 | .rva .LSEH_end_${PREFIX}_set_decrypt_key | ||
| 1179 | .rva .LSEH_info_${PREFIX}_set_decrypt_key | ||
| 1180 | |||
| 1181 | .rva .LSEH_begin_${PREFIX}_encrypt | ||
| 1182 | .rva .LSEH_end_${PREFIX}_encrypt | ||
| 1183 | .rva .LSEH_info_${PREFIX}_encrypt | ||
| 1184 | |||
| 1185 | .rva .LSEH_begin_${PREFIX}_decrypt | ||
| 1186 | .rva .LSEH_end_${PREFIX}_decrypt | ||
| 1187 | .rva .LSEH_info_${PREFIX}_decrypt | ||
| 1188 | |||
| 1189 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt | ||
| 1190 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | ||
| 1191 | .rva .LSEH_info_${PREFIX}_cbc_encrypt | ||
| 1192 | |||
| 1193 | .section .xdata | ||
| 1194 | .align 8 | ||
| 1195 | .LSEH_info_${PREFIX}_set_encrypt_key: | ||
| 1196 | .byte 9,0,0,0 | ||
| 1197 | .rva se_handler | ||
| 1198 | .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] | ||
| 1199 | .LSEH_info_${PREFIX}_set_decrypt_key: | ||
| 1200 | .byte 9,0,0,0 | ||
| 1201 | .rva se_handler | ||
| 1202 | .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[] | ||
| 1203 | .LSEH_info_${PREFIX}_encrypt: | ||
| 1204 | .byte 9,0,0,0 | ||
| 1205 | .rva se_handler | ||
| 1206 | .rva .Lenc_body,.Lenc_epilogue # HandlerData[] | ||
| 1207 | .LSEH_info_${PREFIX}_decrypt: | ||
| 1208 | .byte 9,0,0,0 | ||
| 1209 | .rva se_handler | ||
| 1210 | .rva .Ldec_body,.Ldec_epilogue # HandlerData[] | ||
| 1211 | .LSEH_info_${PREFIX}_cbc_encrypt: | ||
| 1212 | .byte 9,0,0,0 | ||
| 1213 | .rva se_handler | ||
| 1214 | .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] | ||
| 1215 | ___ | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace every backtick-quoted expression in the generated text (such as the 1232/8 quadword count) with its evaluated value | ||
| 1219 | |||
| 1220 | print $code; # write the finished assembly to STDOUT | ||
| 1221 | |||
| 1222 | close STDOUT or die "error closing STDOUT: $!"; # write errors may only surface when the buffer is flushed at close; fail loudly instead of leaving truncated output behind | ||
