diff options
Diffstat (limited to 'src/lib/libcrypto/aes/asm/vpaes-x86_64.pl')
| -rw-r--r-- | src/lib/libcrypto/aes/asm/vpaes-x86_64.pl | 1207 |
1 files changed, 0 insertions, 1207 deletions
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl deleted file mode 100644 index bd7f45b850..0000000000 --- a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl +++ /dev/null | |||
| @@ -1,1207 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | ###################################################################### | ||
| 4 | ## Constant-time SSSE3 AES core implementation. | ||
| 5 | ## version 0.1 | ||
| 6 | ## | ||
| 7 | ## By Mike Hamburg (Stanford University), 2009 | ||
| 8 | ## Public domain. | ||
| 9 | ## | ||
| 10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
| 11 | ## http://crypto.stanford.edu/vpaes/. | ||
| 12 | |||
| 13 | ###################################################################### | ||
| 14 | # September 2011. | ||
| 15 | # | ||
| 16 | # Interface to OpenSSL as "almost" drop-in replacement for | ||
| 17 | # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
| 18 | # doesn't handle partial vectors (doesn't have to if called from | ||
| 19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
| 20 | # schedule structure with the original nor does it make assumptions | ||
| 21 | # about its alignment... | ||
| 22 | # | ||
| 23 | # Performance summary. aes-x86_64.pl column lists large-block CBC | ||
| 24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
| 25 | # byte processed with 128-bit key, and vpaes-x86_64.pl column - | ||
| 26 | # [also large-block CBC] encrypt/decrypt. | ||
| 27 | # | ||
| 28 | # aes-x86_64.pl vpaes-x86_64.pl | ||
| 29 | # | ||
| 30 | # Core 2(**) 30.5/43.7/14.3 21.8/25.7(***) | ||
| 31 | # Nehalem 30.5/42.2/14.6 9.8/11.8 | ||
| 32 | # Atom 63.9/79.0/32.1 64.0/84.8(***) | ||
| 33 | # | ||
| 34 | # (*) "Hyper-threading" in this context refers to cache shared | ||
| 35 | # among multiple cores rather than specifically to Intel HTT. As the vast | ||
| 36 | # majority of contemporary cores share cache, the slower code path | ||
| 37 | # is commonplace. In other words "with-hyper-threading-off" | ||
| 38 | # results are presented mostly for reference purposes. | ||
| 39 | # | ||
| 40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
| 41 | # | ||
| 42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
| 43 | # pshufb, yet it's respectable +40%/78% improvement on Core 2 | ||
| 44 | # (as implied, over "hyper-threading-safe" code path). | ||
| 45 | # | ||
| 46 | # <appro@openssl.org> | ||
| 47 | |||
| 48 | $flavour = shift; | ||
| 49 | $output = shift; | ||
| 50 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 51 | |||
| 52 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 53 | |||
| 54 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 55 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 56 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 57 | die "can't locate x86_64-xlate.pl"; | ||
| 58 | |||
| 59 | open OUT,"| \"$^X\" $xlate $flavour $output"; | ||
| 60 | *STDOUT=*OUT; | ||
| 61 | |||
| 62 | $PREFIX="vpaes"; | ||
| 63 | |||
| 64 | $code.=<<___; | ||
| 65 | .text | ||
| 66 | |||
| 67 | ## | ||
| 68 | ## _vpaes_encrypt_core | ||
| 69 | ## | ||
| 70 | ## AES-encrypt %xmm0. | ||
| 71 | ## | ||
| 72 | ## Inputs: | ||
| 73 | ## %xmm0 = input | ||
| 74 | ## %xmm9-%xmm15 as in _vpaes_preheat | ||
| 75 | ## (%rdx) = scheduled keys | ||
| 76 | ## | ||
| 77 | ## Output in %xmm0 | ||
| 78 | ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax | ||
| 79 | ## Preserves %xmm6 - %xmm8 so you get some local vectors | ||
| 80 | ## | ||
| 81 | ## | ||
| 82 | .type _vpaes_encrypt_core,\@abi-omnipotent | ||
| 83 | .align 16 | ||
| 84 | _vpaes_encrypt_core: | ||
| 85 | mov %rdx, %r9 | ||
| 86 | mov \$16, %r11 | ||
| 87 | mov 240(%rdx),%eax | ||
| 88 | movdqa %xmm9, %xmm1 | ||
| 89 | movdqa .Lk_ipt(%rip), %xmm2 # iptlo | ||
| 90 | pandn %xmm0, %xmm1 | ||
| 91 | movdqu (%r9), %xmm5 # round0 key | ||
| 92 | psrld \$4, %xmm1 | ||
| 93 | pand %xmm9, %xmm0 | ||
| 94 | pshufb %xmm0, %xmm2 | ||
| 95 | movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi | ||
| 96 | pshufb %xmm1, %xmm0 | ||
| 97 | pxor %xmm5, %xmm2 | ||
| 98 | pxor %xmm2, %xmm0 | ||
| 99 | add \$16, %r9 | ||
| 100 | lea .Lk_mc_backward(%rip),%r10 | ||
| 101 | jmp .Lenc_entry | ||
| 102 | |||
| 103 | .align 16 | ||
| 104 | .Lenc_loop: | ||
| 105 | # middle of middle round | ||
| 106 | movdqa %xmm13, %xmm4 # 4 : sb1u | ||
| 107 | pshufb %xmm2, %xmm4 # 4 = sb1u | ||
| 108 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
| 109 | movdqa %xmm12, %xmm0 # 0 : sb1t | ||
| 110 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 111 | pxor %xmm4, %xmm0 # 0 = A | ||
| 112 | movdqa %xmm15, %xmm5 # 4 : sb2u | ||
| 113 | pshufb %xmm2, %xmm5 # 4 = sb2u | ||
| 114 | movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] | ||
| 115 | movdqa %xmm14, %xmm2 # 2 : sb2t | ||
| 116 | pshufb %xmm3, %xmm2 # 2 = sb2t | ||
| 117 | pxor %xmm5, %xmm2 # 2 = 2A | ||
| 118 | movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] | ||
| 119 | movdqa %xmm0, %xmm3 # 3 = A | ||
| 120 | pshufb %xmm1, %xmm0 # 0 = B | ||
| 121 | add \$16, %r9 # next key | ||
| 122 | pxor %xmm2, %xmm0 # 0 = 2A+B | ||
| 123 | pshufb %xmm4, %xmm3 # 3 = D | ||
| 124 | add \$16, %r11 # next mc | ||
| 125 | pxor %xmm0, %xmm3 # 3 = 2A+B+D | ||
| 126 | pshufb %xmm1, %xmm0 # 0 = 2B+C | ||
| 127 | and \$0x30, %r11 # ... mod 4 | ||
| 128 | pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D | ||
| 129 | sub \$1,%rax # nr-- | ||
| 130 | |||
| 131 | .Lenc_entry: | ||
| 132 | # top of round | ||
| 133 | movdqa %xmm9, %xmm1 # 1 : i | ||
| 134 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
| 135 | psrld \$4, %xmm1 # 1 = i | ||
| 136 | pand %xmm9, %xmm0 # 0 = k | ||
| 137 | movdqa %xmm11, %xmm5 # 2 : a/k | ||
| 138 | pshufb %xmm0, %xmm5 # 2 = a/k | ||
| 139 | pxor %xmm1, %xmm0 # 0 = j | ||
| 140 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 141 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 142 | pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k | ||
| 143 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 144 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 145 | pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k | ||
| 146 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 147 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 148 | pxor %xmm0, %xmm2 # 2 = io | ||
| 149 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 150 | movdqu (%r9), %xmm5 | ||
| 151 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 152 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 153 | jnz .Lenc_loop | ||
| 154 | |||
| 155 | # middle of last round | ||
| 156 | movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo | ||
| 157 | movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 | ||
| 158 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 159 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
| 160 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 161 | movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] | ||
| 162 | pxor %xmm4, %xmm0 # 0 = A | ||
| 163 | pshufb %xmm1, %xmm0 | ||
| 164 | ret | ||
| 165 | .size _vpaes_encrypt_core,.-_vpaes_encrypt_core | ||
| 166 | |||
| 167 | ## | ||
| 168 | ## Decryption core | ||
| 169 | ## | ||
| 170 | ## Same API as encryption core. | ||
| 171 | ## | ||
| 172 | .type _vpaes_decrypt_core,\@abi-omnipotent | ||
| 173 | .align 16 | ||
| 174 | _vpaes_decrypt_core: | ||
| 175 | mov %rdx, %r9 # load key | ||
| 176 | mov 240(%rdx),%eax | ||
| 177 | movdqa %xmm9, %xmm1 | ||
| 178 | movdqa .Lk_dipt(%rip), %xmm2 # iptlo | ||
| 179 | pandn %xmm0, %xmm1 | ||
| 180 | mov %rax, %r11 | ||
| 181 | psrld \$4, %xmm1 | ||
| 182 | movdqu (%r9), %xmm5 # round0 key | ||
| 183 | shl \$4, %r11 | ||
| 184 | pand %xmm9, %xmm0 | ||
| 185 | pshufb %xmm0, %xmm2 | ||
| 186 | movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi | ||
| 187 | xor \$0x30, %r11 | ||
| 188 | lea .Lk_dsbd(%rip),%r10 | ||
| 189 | pshufb %xmm1, %xmm0 | ||
| 190 | and \$0x30, %r11 | ||
| 191 | pxor %xmm5, %xmm2 | ||
| 192 | movdqa .Lk_mc_forward+48(%rip), %xmm5 | ||
| 193 | pxor %xmm2, %xmm0 | ||
| 194 | add \$16, %r9 | ||
| 195 | add %r10, %r11 | ||
| 196 | jmp .Ldec_entry | ||
| 197 | |||
| 198 | .align 16 | ||
| 199 | .Ldec_loop: | ||
| 200 | ## | ||
| 201 | ## Inverse mix columns | ||
| 202 | ## | ||
| 203 | movdqa -0x20(%r10),%xmm4 # 4 : sb9u | ||
| 204 | pshufb %xmm2, %xmm4 # 4 = sb9u | ||
| 205 | pxor %xmm0, %xmm4 | ||
| 206 | movdqa -0x10(%r10),%xmm0 # 0 : sb9t | ||
| 207 | pshufb %xmm3, %xmm0 # 0 = sb9t | ||
| 208 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 209 | add \$16, %r9 # next round key | ||
| 210 | |||
| 211 | pshufb %xmm5, %xmm0 # MC ch | ||
| 212 | movdqa 0x00(%r10),%xmm4 # 4 : sbdu | ||
| 213 | pshufb %xmm2, %xmm4 # 4 = sbdu | ||
| 214 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 215 | movdqa 0x10(%r10),%xmm0 # 0 : sbdt | ||
| 216 | pshufb %xmm3, %xmm0 # 0 = sbdt | ||
| 217 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 218 | sub \$1,%rax # nr-- | ||
| 219 | |||
| 220 | pshufb %xmm5, %xmm0 # MC ch | ||
| 221 | movdqa 0x20(%r10),%xmm4 # 4 : sbbu | ||
| 222 | pshufb %xmm2, %xmm4 # 4 = sbbu | ||
| 223 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 224 | movdqa 0x30(%r10),%xmm0 # 0 : sbbt | ||
| 225 | pshufb %xmm3, %xmm0 # 0 = sbbt | ||
| 226 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 227 | |||
| 228 | pshufb %xmm5, %xmm0 # MC ch | ||
| 229 | movdqa 0x40(%r10),%xmm4 # 4 : sbeu | ||
| 230 | pshufb %xmm2, %xmm4 # 4 = sbeu | ||
| 231 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 232 | movdqa 0x50(%r10),%xmm0 # 0 : sbet | ||
| 233 | pshufb %xmm3, %xmm0 # 0 = sbet | ||
| 234 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 235 | |||
| 236 | palignr \$12, %xmm5, %xmm5 | ||
| 237 | |||
| 238 | .Ldec_entry: | ||
| 239 | # top of round | ||
| 240 | movdqa %xmm9, %xmm1 # 1 : i | ||
| 241 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
| 242 | psrld \$4, %xmm1 # 1 = i | ||
| 243 | pand %xmm9, %xmm0 # 0 = k | ||
| 244 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
| 245 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
| 246 | pxor %xmm1, %xmm0 # 0 = j | ||
| 247 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 248 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 249 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
| 250 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 251 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 252 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
| 253 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 254 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 255 | pxor %xmm0, %xmm2 # 2 = io | ||
| 256 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 257 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 258 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 259 | movdqu (%r9), %xmm0 | ||
| 260 | jnz .Ldec_loop | ||
| 261 | |||
| 262 | # middle of last round | ||
| 263 | movdqa 0x60(%r10), %xmm4 # 3 : sbou | ||
| 264 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 265 | pxor %xmm0, %xmm4 # 4 = sb1u + k | ||
| 266 | movdqa 0x70(%r10), %xmm0 # 0 : sbot | ||
| 267 | movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 | ||
| 268 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 269 | pxor %xmm4, %xmm0 # 0 = A | ||
| 270 | pshufb %xmm2, %xmm0 | ||
| 271 | ret | ||
| 272 | .size _vpaes_decrypt_core,.-_vpaes_decrypt_core | ||
| 273 | |||
| 274 | ######################################################## | ||
| 275 | ## ## | ||
| 276 | ## AES key schedule ## | ||
| 277 | ## ## | ||
| 278 | ######################################################## | ||
| 279 | .type _vpaes_schedule_core,\@abi-omnipotent | ||
| 280 | .align 16 | ||
| 281 | _vpaes_schedule_core: | ||
| 282 | # rdi = key | ||
| 283 | # rsi = size in bits | ||
| 284 | # rdx = buffer | ||
| 285 | # rcx = direction. 0=encrypt, 1=decrypt | ||
| 286 | |||
| 287 | call _vpaes_preheat # load the tables | ||
| 288 | movdqa .Lk_rcon(%rip), %xmm8 # load rcon | ||
| 289 | movdqu (%rdi), %xmm0 # load key (unaligned) | ||
| 290 | |||
| 291 | # input transform | ||
| 292 | movdqa %xmm0, %xmm3 | ||
| 293 | lea .Lk_ipt(%rip), %r11 | ||
| 294 | call _vpaes_schedule_transform | ||
| 295 | movdqa %xmm0, %xmm7 | ||
| 296 | |||
| 297 | lea .Lk_sr(%rip),%r10 | ||
| 298 | test %rcx, %rcx | ||
| 299 | jnz .Lschedule_am_decrypting | ||
| 300 | |||
| 301 | # encrypting, output zeroth round key after transform | ||
| 302 | movdqu %xmm0, (%rdx) | ||
| 303 | jmp .Lschedule_go | ||
| 304 | |||
| 305 | .Lschedule_am_decrypting: | ||
| 306 | # decrypting, output zeroth round key after shiftrows | ||
| 307 | movdqa (%r8,%r10),%xmm1 | ||
| 308 | pshufb %xmm1, %xmm3 | ||
| 309 | movdqu %xmm3, (%rdx) | ||
| 310 | xor \$0x30, %r8 | ||
| 311 | |||
| 312 | .Lschedule_go: | ||
| 313 | cmp \$192, %esi | ||
| 314 | ja .Lschedule_256 | ||
| 315 | je .Lschedule_192 | ||
| 316 | # 128: fall through | ||
| 317 | |||
| 318 | ## | ||
| 319 | ## .schedule_128 | ||
| 320 | ## | ||
| 321 | ## 128-bit specific part of key schedule. | ||
| 322 | ## | ||
| 323 | ## This schedule is really simple, because all its parts | ||
| 324 | ## are accomplished by the subroutines. | ||
| 325 | ## | ||
| 326 | .Lschedule_128: | ||
| 327 | mov \$10, %esi | ||
| 328 | |||
| 329 | .Loop_schedule_128: | ||
| 330 | call _vpaes_schedule_round | ||
| 331 | dec %rsi | ||
| 332 | jz .Lschedule_mangle_last | ||
| 333 | call _vpaes_schedule_mangle # write output | ||
| 334 | jmp .Loop_schedule_128 | ||
| 335 | |||
| 336 | ## | ||
| 337 | ## .aes_schedule_192 | ||
| 338 | ## | ||
| 339 | ## 192-bit specific part of key schedule. | ||
| 340 | ## | ||
| 341 | ## The main body of this schedule is the same as the 128-bit | ||
| 342 | ## schedule, but with more smearing. The long, high side is | ||
| 343 | ## stored in %xmm7 as before, and the short, low side is in | ||
| 344 | ## the high bits of %xmm6. | ||
| 345 | ## | ||
| 346 | ## This schedule is somewhat nastier, however, because each | ||
| 347 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
| 348 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
| 349 | ## keys. | ||
| 350 | ## | ||
| 351 | .align 16 | ||
| 352 | .Lschedule_192: | ||
| 353 | movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) | ||
| 354 | call _vpaes_schedule_transform # input transform | ||
| 355 | movdqa %xmm0, %xmm6 # save short part | ||
| 356 | pxor %xmm4, %xmm4 # clear 4 | ||
| 357 | movhlps %xmm4, %xmm6 # clobber low side with zeros | ||
| 358 | mov \$4, %esi | ||
| 359 | |||
| 360 | .Loop_schedule_192: | ||
| 361 | call _vpaes_schedule_round | ||
| 362 | palignr \$8,%xmm6,%xmm0 | ||
| 363 | call _vpaes_schedule_mangle # save key n | ||
| 364 | call _vpaes_schedule_192_smear | ||
| 365 | call _vpaes_schedule_mangle # save key n+1 | ||
| 366 | call _vpaes_schedule_round | ||
| 367 | dec %rsi | ||
| 368 | jz .Lschedule_mangle_last | ||
| 369 | call _vpaes_schedule_mangle # save key n+2 | ||
| 370 | call _vpaes_schedule_192_smear | ||
| 371 | jmp .Loop_schedule_192 | ||
| 372 | |||
| 373 | ## | ||
| 374 | ## .aes_schedule_256 | ||
| 375 | ## | ||
| 376 | ## 256-bit specific part of key schedule. | ||
| 377 | ## | ||
| 378 | ## The structure here is very similar to the 128-bit | ||
| 379 | ## schedule, but with an additional "low side" in | ||
| 380 | ## %xmm6. The low side's rounds are the same as the | ||
| 381 | ## high side's, except no rcon and no rotation. | ||
| 382 | ## | ||
| 383 | .align 16 | ||
| 384 | .Lschedule_256: | ||
| 385 | movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) | ||
| 386 | call _vpaes_schedule_transform # input transform | ||
| 387 | mov \$7, %esi | ||
| 388 | |||
| 389 | .Loop_schedule_256: | ||
| 390 | call _vpaes_schedule_mangle # output low result | ||
| 391 | movdqa %xmm0, %xmm6 # save cur_lo in xmm6 | ||
| 392 | |||
| 393 | # high round | ||
| 394 | call _vpaes_schedule_round | ||
| 395 | dec %rsi | ||
| 396 | jz .Lschedule_mangle_last | ||
| 397 | call _vpaes_schedule_mangle | ||
| 398 | |||
| 399 | # low round. swap xmm7 and xmm6 | ||
| 400 | pshufd \$0xFF, %xmm0, %xmm0 | ||
| 401 | movdqa %xmm7, %xmm5 | ||
| 402 | movdqa %xmm6, %xmm7 | ||
| 403 | call _vpaes_schedule_low_round | ||
| 404 | movdqa %xmm5, %xmm7 | ||
| 405 | |||
| 406 | jmp .Loop_schedule_256 | ||
| 407 | |||
| 408 | |||
| 409 | ## | ||
| 410 | ## .aes_schedule_mangle_last | ||
| 411 | ## | ||
| 412 | ## Mangler for last round of key schedule | ||
| 413 | ## Mangles %xmm0 | ||
| 414 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
| 415 | ## when decrypting, outputs unskew(%xmm0) | ||
| 416 | ## | ||
| 417 | ## Always called right before return... jumps to cleanup and exits | ||
| 418 | ## | ||
| 419 | .align 16 | ||
| 420 | .Lschedule_mangle_last: | ||
| 421 | # schedule last round key from xmm0 | ||
| 422 | lea .Lk_deskew(%rip),%r11 # prepare to deskew | ||
| 423 | test %rcx, %rcx | ||
| 424 | jnz .Lschedule_mangle_last_dec | ||
| 425 | |||
| 426 | # encrypting | ||
| 427 | movdqa (%r8,%r10),%xmm1 | ||
| 428 | pshufb %xmm1, %xmm0 # output permute | ||
| 429 | lea .Lk_opt(%rip), %r11 # prepare to output transform | ||
| 430 | add \$32, %rdx | ||
| 431 | |||
| 432 | .Lschedule_mangle_last_dec: | ||
| 433 | add \$-16, %rdx | ||
| 434 | pxor .Lk_s63(%rip), %xmm0 | ||
| 435 | call _vpaes_schedule_transform # output transform | ||
| 436 | movdqu %xmm0, (%rdx) # save last key | ||
| 437 | |||
| 438 | # cleanup | ||
| 439 | pxor %xmm0, %xmm0 | ||
| 440 | pxor %xmm1, %xmm1 | ||
| 441 | pxor %xmm2, %xmm2 | ||
| 442 | pxor %xmm3, %xmm3 | ||
| 443 | pxor %xmm4, %xmm4 | ||
| 444 | pxor %xmm5, %xmm5 | ||
| 445 | pxor %xmm6, %xmm6 | ||
| 446 | pxor %xmm7, %xmm7 | ||
| 447 | ret | ||
| 448 | .size _vpaes_schedule_core,.-_vpaes_schedule_core | ||
| 449 | |||
| 450 | ## | ||
| 451 | ## .aes_schedule_192_smear | ||
| 452 | ## | ||
| 453 | ## Smear the short, low side in the 192-bit key schedule. | ||
| 454 | ## | ||
| 455 | ## Inputs: | ||
| 456 | ## %xmm7: high side, b a x y | ||
| 457 | ## %xmm6: low side, d c 0 0 | ||
| 458 | ## %xmm13: 0 | ||
| 459 | ## | ||
| 460 | ## Outputs: | ||
| 461 | ## %xmm6: b+c+d b+c 0 0 | ||
| 462 | ## %xmm0: b+c+d b+c b a | ||
| 463 | ## | ||
| 464 | .type _vpaes_schedule_192_smear,\@abi-omnipotent | ||
| 465 | .align 16 | ||
| 466 | _vpaes_schedule_192_smear: | ||
| 467 | pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 | ||
| 468 | pxor %xmm0, %xmm6 # -> c+d c 0 0 | ||
| 469 | pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a | ||
| 470 | pxor %xmm0, %xmm6 # -> b+c+d b+c b a | ||
| 471 | movdqa %xmm6, %xmm0 | ||
| 472 | pxor %xmm1, %xmm1 | ||
| 473 | movhlps %xmm1, %xmm6 # clobber low side with zeros | ||
| 474 | ret | ||
| 475 | .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear | ||
| 476 | |||
| 477 | ## | ||
| 478 | ## .aes_schedule_round | ||
| 479 | ## | ||
| 480 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
| 481 | ## | ||
| 482 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
| 483 | ## then rotates it by one byte and xors into the low dword of | ||
| 484 | ## %xmm7. | ||
| 485 | ## | ||
| 486 | ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for | ||
| 487 | ## next rcon. | ||
| 488 | ## | ||
| 489 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
| 490 | ## second low, result into third, result into highest. | ||
| 491 | ## | ||
| 492 | ## Returns results in %xmm7 = %xmm0. | ||
| 493 | ## Clobbers %xmm1-%xmm4, %r11. | ||
| 494 | ## | ||
| 495 | .type _vpaes_schedule_round,\@abi-omnipotent | ||
| 496 | .align 16 | ||
| 497 | _vpaes_schedule_round: | ||
| 498 | # extract rcon from xmm8 | ||
| 499 | pxor %xmm1, %xmm1 | ||
| 500 | palignr \$15, %xmm8, %xmm1 | ||
| 501 | palignr \$15, %xmm8, %xmm8 | ||
| 502 | pxor %xmm1, %xmm7 | ||
| 503 | |||
| 504 | # rotate | ||
| 505 | pshufd \$0xFF, %xmm0, %xmm0 | ||
| 506 | palignr \$1, %xmm0, %xmm0 | ||
| 507 | |||
| 508 | # fall through... | ||
| 509 | |||
| 510 | # low round: same as high round, but no rotation and no rcon. | ||
| 511 | _vpaes_schedule_low_round: | ||
| 512 | # smear xmm7 | ||
| 513 | movdqa %xmm7, %xmm1 | ||
| 514 | pslldq \$4, %xmm7 | ||
| 515 | pxor %xmm1, %xmm7 | ||
| 516 | movdqa %xmm7, %xmm1 | ||
| 517 | pslldq \$8, %xmm7 | ||
| 518 | pxor %xmm1, %xmm7 | ||
| 519 | pxor .Lk_s63(%rip), %xmm7 | ||
| 520 | |||
| 521 | # subbytes | ||
| 522 | movdqa %xmm9, %xmm1 | ||
| 523 | pandn %xmm0, %xmm1 | ||
| 524 | psrld \$4, %xmm1 # 1 = i | ||
| 525 | pand %xmm9, %xmm0 # 0 = k | ||
| 526 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
| 527 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
| 528 | pxor %xmm1, %xmm0 # 0 = j | ||
| 529 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 530 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 531 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
| 532 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 533 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 534 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
| 535 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 536 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 537 | pxor %xmm0, %xmm2 # 2 = io | ||
| 538 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 539 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 540 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 541 | movdqa %xmm13, %xmm4 # 4 : sbou | ||
| 542 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 543 | movdqa %xmm12, %xmm0 # 0 : sbot | ||
| 544 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 545 | pxor %xmm4, %xmm0 # 0 = sbox output | ||
| 546 | |||
| 547 | # add in smeared stuff | ||
| 548 | pxor %xmm7, %xmm0 | ||
| 549 | movdqa %xmm0, %xmm7 | ||
| 550 | ret | ||
| 551 | .size _vpaes_schedule_round,.-_vpaes_schedule_round | ||
| 552 | |||
| 553 | ## | ||
| 554 | ## .aes_schedule_transform | ||
| 555 | ## | ||
| 556 | ## Linear-transform %xmm0 according to tables at (%r11) | ||
| 557 | ## | ||
| 558 | ## Requires that %xmm9 = 0x0F0F... as in preheat | ||
| 559 | ## Output in %xmm0 | ||
| 560 | ## Clobbers %xmm1, %xmm2 | ||
| 561 | ## | ||
| 562 | .type _vpaes_schedule_transform,\@abi-omnipotent | ||
| 563 | .align 16 | ||
| 564 | _vpaes_schedule_transform: | ||
| 565 | movdqa %xmm9, %xmm1 | ||
| 566 | pandn %xmm0, %xmm1 | ||
| 567 | psrld \$4, %xmm1 | ||
| 568 | pand %xmm9, %xmm0 | ||
| 569 | movdqa (%r11), %xmm2 # lo | ||
| 570 | pshufb %xmm0, %xmm2 | ||
| 571 | movdqa 16(%r11), %xmm0 # hi | ||
| 572 | pshufb %xmm1, %xmm0 | ||
| 573 | pxor %xmm2, %xmm0 | ||
| 574 | ret | ||
| 575 | .size _vpaes_schedule_transform,.-_vpaes_schedule_transform | ||
| 576 | |||
| 577 | ## | ||
| 578 | ## .aes_schedule_mangle | ||
| 579 | ## | ||
| 580 | ## Mangle xmm0 from (basis-transformed) standard version | ||
| 581 | ## to our version. | ||
| 582 | ## | ||
| 583 | ## On encrypt, | ||
| 584 | ## xor with 0x63 | ||
| 585 | ## multiply by circulant 0,1,1,1 | ||
| 586 | ## apply shiftrows transform | ||
| 587 | ## | ||
| 588 | ## On decrypt, | ||
| 589 | ## xor with 0x63 | ||
| 590 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
| 591 | ## deskew | ||
| 592 | ## apply shiftrows transform | ||
| 593 | ## | ||
| 594 | ## | ||
| 595 | ## Writes out to (%rdx), and increments or decrements it | ||
| 596 | ## Keeps track of round number mod 4 in %r8 | ||
| 597 | ## Preserves xmm0 | ||
| 598 | ## Clobbers xmm1-xmm5 | ||
| 599 | ## | ||
| 600 | .type _vpaes_schedule_mangle,\@abi-omnipotent | ||
| 601 | .align 16 | ||
| 602 | _vpaes_schedule_mangle: | ||
| 603 | movdqa %xmm0, %xmm4 # save xmm0 for later | ||
| 604 | movdqa .Lk_mc_forward(%rip),%xmm5 | ||
| 605 | test %rcx, %rcx | ||
| 606 | jnz .Lschedule_mangle_dec | ||
| 607 | |||
| 608 | # encrypting | ||
| 609 | add \$16, %rdx | ||
| 610 | pxor .Lk_s63(%rip),%xmm4 | ||
| 611 | pshufb %xmm5, %xmm4 | ||
| 612 | movdqa %xmm4, %xmm3 | ||
| 613 | pshufb %xmm5, %xmm4 | ||
| 614 | pxor %xmm4, %xmm3 | ||
| 615 | pshufb %xmm5, %xmm4 | ||
| 616 | pxor %xmm4, %xmm3 | ||
| 617 | |||
| 618 | jmp .Lschedule_mangle_both | ||
| 619 | .align 16 | ||
| 620 | .Lschedule_mangle_dec: | ||
| 621 | # inverse mix columns | ||
| 622 | lea .Lk_dksd(%rip),%r11 | ||
| 623 | movdqa %xmm9, %xmm1 | ||
| 624 | pandn %xmm4, %xmm1 | ||
| 625 | psrld \$4, %xmm1 # 1 = hi | ||
| 626 | pand %xmm9, %xmm4 # 4 = lo | ||
| 627 | |||
| 628 | movdqa 0x00(%r11), %xmm2 | ||
| 629 | pshufb %xmm4, %xmm2 | ||
| 630 | movdqa 0x10(%r11), %xmm3 | ||
| 631 | pshufb %xmm1, %xmm3 | ||
| 632 | pxor %xmm2, %xmm3 | ||
| 633 | pshufb %xmm5, %xmm3 | ||
| 634 | |||
| 635 | movdqa 0x20(%r11), %xmm2 | ||
| 636 | pshufb %xmm4, %xmm2 | ||
| 637 | pxor %xmm3, %xmm2 | ||
| 638 | movdqa 0x30(%r11), %xmm3 | ||
| 639 | pshufb %xmm1, %xmm3 | ||
| 640 | pxor %xmm2, %xmm3 | ||
| 641 | pshufb %xmm5, %xmm3 | ||
| 642 | |||
| 643 | movdqa 0x40(%r11), %xmm2 | ||
| 644 | pshufb %xmm4, %xmm2 | ||
| 645 | pxor %xmm3, %xmm2 | ||
| 646 | movdqa 0x50(%r11), %xmm3 | ||
| 647 | pshufb %xmm1, %xmm3 | ||
| 648 | pxor %xmm2, %xmm3 | ||
| 649 | pshufb %xmm5, %xmm3 | ||
| 650 | |||
| 651 | movdqa 0x60(%r11), %xmm2 | ||
| 652 | pshufb %xmm4, %xmm2 | ||
| 653 | pxor %xmm3, %xmm2 | ||
| 654 | movdqa 0x70(%r11), %xmm3 | ||
| 655 | pshufb %xmm1, %xmm3 | ||
| 656 | pxor %xmm2, %xmm3 | ||
| 657 | |||
| 658 | add \$-16, %rdx | ||
| 659 | |||
| 660 | .Lschedule_mangle_both: | ||
| 661 | movdqa (%r8,%r10),%xmm1 | ||
| 662 | pshufb %xmm1,%xmm3 | ||
| 663 | add \$-16, %r8 | ||
| 664 | and \$0x30, %r8 | ||
| 665 | movdqu %xmm3, (%rdx) | ||
| 666 | ret | ||
| 667 | .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle | ||
| 668 | |||
| 669 | # | ||
| 670 | # Interface to OpenSSL | ||
| 671 | # | ||
| 672 | .globl ${PREFIX}_set_encrypt_key | ||
| 673 | .type ${PREFIX}_set_encrypt_key,\@function,3 | ||
| 674 | .align 16 | ||
| 675 | ${PREFIX}_set_encrypt_key: | ||
| 676 | ___ | ||
| 677 | $code.=<<___ if ($win64); | ||
| 678 | lea -0xb8(%rsp),%rsp | ||
| 679 | movaps %xmm6,0x10(%rsp) | ||
| 680 | movaps %xmm7,0x20(%rsp) | ||
| 681 | movaps %xmm8,0x30(%rsp) | ||
| 682 | movaps %xmm9,0x40(%rsp) | ||
| 683 | movaps %xmm10,0x50(%rsp) | ||
| 684 | movaps %xmm11,0x60(%rsp) | ||
| 685 | movaps %xmm12,0x70(%rsp) | ||
| 686 | movaps %xmm13,0x80(%rsp) | ||
| 687 | movaps %xmm14,0x90(%rsp) | ||
| 688 | movaps %xmm15,0xa0(%rsp) | ||
| 689 | .Lenc_key_body: | ||
| 690 | ___ | ||
| 691 | $code.=<<___; | ||
| 692 | mov %esi,%eax | ||
| 693 | shr \$5,%eax | ||
| 694 | add \$5,%eax | ||
| 695 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; | ||
| 696 | |||
| 697 | mov \$0,%ecx | ||
| 698 | mov \$0x30,%r8d | ||
| 699 | call _vpaes_schedule_core | ||
| 700 | ___ | ||
| 701 | $code.=<<___ if ($win64); | ||
| 702 | movaps 0x10(%rsp),%xmm6 | ||
| 703 | movaps 0x20(%rsp),%xmm7 | ||
| 704 | movaps 0x30(%rsp),%xmm8 | ||
| 705 | movaps 0x40(%rsp),%xmm9 | ||
| 706 | movaps 0x50(%rsp),%xmm10 | ||
| 707 | movaps 0x60(%rsp),%xmm11 | ||
| 708 | movaps 0x70(%rsp),%xmm12 | ||
| 709 | movaps 0x80(%rsp),%xmm13 | ||
| 710 | movaps 0x90(%rsp),%xmm14 | ||
| 711 | movaps 0xa0(%rsp),%xmm15 | ||
| 712 | lea 0xb8(%rsp),%rsp | ||
| 713 | .Lenc_key_epilogue: | ||
| 714 | ___ | ||
| 715 | $code.=<<___; | ||
| 716 | xor %eax,%eax | ||
| 717 | ret | ||
| 718 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key | ||
| 719 | |||
| 720 | .globl ${PREFIX}_set_decrypt_key | ||
| 721 | .type ${PREFIX}_set_decrypt_key,\@function,3 | ||
| 722 | .align 16 | ||
| 723 | ${PREFIX}_set_decrypt_key: | ||
| 724 | ___ | ||
| 725 | $code.=<<___ if ($win64); | ||
| 726 | lea -0xb8(%rsp),%rsp | ||
| 727 | movaps %xmm6,0x10(%rsp) | ||
| 728 | movaps %xmm7,0x20(%rsp) | ||
| 729 | movaps %xmm8,0x30(%rsp) | ||
| 730 | movaps %xmm9,0x40(%rsp) | ||
| 731 | movaps %xmm10,0x50(%rsp) | ||
| 732 | movaps %xmm11,0x60(%rsp) | ||
| 733 | movaps %xmm12,0x70(%rsp) | ||
| 734 | movaps %xmm13,0x80(%rsp) | ||
| 735 | movaps %xmm14,0x90(%rsp) | ||
| 736 | movaps %xmm15,0xa0(%rsp) | ||
| 737 | .Ldec_key_body: | ||
| 738 | ___ | ||
| 739 | $code.=<<___; | ||
| 740 | mov %esi,%eax | ||
| 741 | shr \$5,%eax | ||
| 742 | add \$5,%eax | ||
| 743 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; | ||
| 744 | shl \$4,%eax | ||
| 745 | lea 16(%rdx,%rax),%rdx | ||
| 746 | |||
| 747 | mov \$1,%ecx | ||
| 748 | mov %esi,%r8d | ||
| 749 | shr \$1,%r8d | ||
| 750 | and \$32,%r8d | ||
| 751 | xor \$32,%r8d # nbits==192?0:32 | ||
| 752 | call _vpaes_schedule_core | ||
| 753 | ___ | ||
| 754 | $code.=<<___ if ($win64); | ||
| 755 | movaps 0x10(%rsp),%xmm6 | ||
| 756 | movaps 0x20(%rsp),%xmm7 | ||
| 757 | movaps 0x30(%rsp),%xmm8 | ||
| 758 | movaps 0x40(%rsp),%xmm9 | ||
| 759 | movaps 0x50(%rsp),%xmm10 | ||
| 760 | movaps 0x60(%rsp),%xmm11 | ||
| 761 | movaps 0x70(%rsp),%xmm12 | ||
| 762 | movaps 0x80(%rsp),%xmm13 | ||
| 763 | movaps 0x90(%rsp),%xmm14 | ||
| 764 | movaps 0xa0(%rsp),%xmm15 | ||
| 765 | lea 0xb8(%rsp),%rsp | ||
| 766 | .Ldec_key_epilogue: | ||
| 767 | ___ | ||
| 768 | $code.=<<___; | ||
| 769 | xor %eax,%eax | ||
| 770 | ret | ||
| 771 | .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key | ||
| 772 | |||
| 773 | .globl ${PREFIX}_encrypt | ||
| 774 | .type ${PREFIX}_encrypt,\@function,3 | ||
| 775 | .align 16 | ||
| 776 | ${PREFIX}_encrypt: | ||
| 777 | ___ | ||
| 778 | $code.=<<___ if ($win64); | ||
| 779 | lea -0xb8(%rsp),%rsp | ||
| 780 | movaps %xmm6,0x10(%rsp) | ||
| 781 | movaps %xmm7,0x20(%rsp) | ||
| 782 | movaps %xmm8,0x30(%rsp) | ||
| 783 | movaps %xmm9,0x40(%rsp) | ||
| 784 | movaps %xmm10,0x50(%rsp) | ||
| 785 | movaps %xmm11,0x60(%rsp) | ||
| 786 | movaps %xmm12,0x70(%rsp) | ||
| 787 | movaps %xmm13,0x80(%rsp) | ||
| 788 | movaps %xmm14,0x90(%rsp) | ||
| 789 | movaps %xmm15,0xa0(%rsp) | ||
| 790 | .Lenc_body: | ||
| 791 | ___ | ||
| 792 | $code.=<<___; | ||
| 793 | movdqu (%rdi),%xmm0 | ||
| 794 | call _vpaes_preheat | ||
| 795 | call _vpaes_encrypt_core | ||
| 796 | movdqu %xmm0,(%rsi) | ||
| 797 | ___ | ||
| 798 | $code.=<<___ if ($win64); | ||
| 799 | movaps 0x10(%rsp),%xmm6 | ||
| 800 | movaps 0x20(%rsp),%xmm7 | ||
| 801 | movaps 0x30(%rsp),%xmm8 | ||
| 802 | movaps 0x40(%rsp),%xmm9 | ||
| 803 | movaps 0x50(%rsp),%xmm10 | ||
| 804 | movaps 0x60(%rsp),%xmm11 | ||
| 805 | movaps 0x70(%rsp),%xmm12 | ||
| 806 | movaps 0x80(%rsp),%xmm13 | ||
| 807 | movaps 0x90(%rsp),%xmm14 | ||
| 808 | movaps 0xa0(%rsp),%xmm15 | ||
| 809 | lea 0xb8(%rsp),%rsp | ||
| 810 | .Lenc_epilogue: | ||
| 811 | ___ | ||
| 812 | $code.=<<___; | ||
| 813 | ret | ||
| 814 | .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt | ||
| 815 | |||
| 816 | .globl ${PREFIX}_decrypt | ||
| 817 | .type ${PREFIX}_decrypt,\@function,3 | ||
| 818 | .align 16 | ||
| 819 | ${PREFIX}_decrypt: | ||
| 820 | ___ | ||
| 821 | $code.=<<___ if ($win64); | ||
| 822 | lea -0xb8(%rsp),%rsp | ||
| 823 | movaps %xmm6,0x10(%rsp) | ||
| 824 | movaps %xmm7,0x20(%rsp) | ||
| 825 | movaps %xmm8,0x30(%rsp) | ||
| 826 | movaps %xmm9,0x40(%rsp) | ||
| 827 | movaps %xmm10,0x50(%rsp) | ||
| 828 | movaps %xmm11,0x60(%rsp) | ||
| 829 | movaps %xmm12,0x70(%rsp) | ||
| 830 | movaps %xmm13,0x80(%rsp) | ||
| 831 | movaps %xmm14,0x90(%rsp) | ||
| 832 | movaps %xmm15,0xa0(%rsp) | ||
| 833 | .Ldec_body: | ||
| 834 | ___ | ||
| 835 | $code.=<<___; | ||
| 836 | movdqu (%rdi),%xmm0 | ||
| 837 | call _vpaes_preheat | ||
| 838 | call _vpaes_decrypt_core | ||
| 839 | movdqu %xmm0,(%rsi) | ||
| 840 | ___ | ||
| 841 | $code.=<<___ if ($win64); | ||
| 842 | movaps 0x10(%rsp),%xmm6 | ||
| 843 | movaps 0x20(%rsp),%xmm7 | ||
| 844 | movaps 0x30(%rsp),%xmm8 | ||
| 845 | movaps 0x40(%rsp),%xmm9 | ||
| 846 | movaps 0x50(%rsp),%xmm10 | ||
| 847 | movaps 0x60(%rsp),%xmm11 | ||
| 848 | movaps 0x70(%rsp),%xmm12 | ||
| 849 | movaps 0x80(%rsp),%xmm13 | ||
| 850 | movaps 0x90(%rsp),%xmm14 | ||
| 851 | movaps 0xa0(%rsp),%xmm15 | ||
| 852 | lea 0xb8(%rsp),%rsp | ||
| 853 | .Ldec_epilogue: | ||
| 854 | ___ | ||
| 855 | $code.=<<___; | ||
| 856 | ret | ||
| 857 | .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt | ||
| 858 | ___ | ||
| 859 | { | ||
| 860 | my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||
| 861 | # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, | ||
| 862 | # size_t length, const AES_KEY *key, | ||
| 863 | # unsigned char *ivp,const int enc); | ||
| 864 | $code.=<<___; | ||
| 865 | .globl ${PREFIX}_cbc_encrypt | ||
| 866 | .type ${PREFIX}_cbc_encrypt,\@function,6 | ||
| 867 | .align 16 | ||
| 868 | ${PREFIX}_cbc_encrypt: | ||
| 869 | xchg $key,$len | ||
| 870 | ___ | ||
| 871 | ($len,$key)=($key,$len); | ||
| 872 | $code.=<<___; | ||
| 873 | sub \$16,$len | ||
| 874 | jc .Lcbc_abort | ||
| 875 | ___ | ||
| 876 | $code.=<<___ if ($win64); | ||
| 877 | lea -0xb8(%rsp),%rsp | ||
| 878 | movaps %xmm6,0x10(%rsp) | ||
| 879 | movaps %xmm7,0x20(%rsp) | ||
| 880 | movaps %xmm8,0x30(%rsp) | ||
| 881 | movaps %xmm9,0x40(%rsp) | ||
| 882 | movaps %xmm10,0x50(%rsp) | ||
| 883 | movaps %xmm11,0x60(%rsp) | ||
| 884 | movaps %xmm12,0x70(%rsp) | ||
| 885 | movaps %xmm13,0x80(%rsp) | ||
| 886 | movaps %xmm14,0x90(%rsp) | ||
| 887 | movaps %xmm15,0xa0(%rsp) | ||
| 888 | .Lcbc_body: | ||
| 889 | ___ | ||
| 890 | $code.=<<___; | ||
| 891 | movdqu ($ivp),%xmm6 # load IV | ||
| 892 | sub $inp,$out | ||
| 893 | call _vpaes_preheat | ||
| 894 | cmp \$0,${enc}d | ||
| 895 | je .Lcbc_dec_loop | ||
| 896 | jmp .Lcbc_enc_loop | ||
| 897 | .align 16 | ||
| 898 | .Lcbc_enc_loop: | ||
| 899 | movdqu ($inp),%xmm0 | ||
| 900 | pxor %xmm6,%xmm0 | ||
| 901 | call _vpaes_encrypt_core | ||
| 902 | movdqa %xmm0,%xmm6 | ||
| 903 | movdqu %xmm0,($out,$inp) | ||
| 904 | lea 16($inp),$inp | ||
| 905 | sub \$16,$len | ||
| 906 | jnc .Lcbc_enc_loop | ||
| 907 | jmp .Lcbc_done | ||
| 908 | .align 16 | ||
| 909 | .Lcbc_dec_loop: | ||
| 910 | movdqu ($inp),%xmm0 | ||
| 911 | movdqa %xmm0,%xmm7 | ||
| 912 | call _vpaes_decrypt_core | ||
| 913 | pxor %xmm6,%xmm0 | ||
| 914 | movdqa %xmm7,%xmm6 | ||
| 915 | movdqu %xmm0,($out,$inp) | ||
| 916 | lea 16($inp),$inp | ||
| 917 | sub \$16,$len | ||
| 918 | jnc .Lcbc_dec_loop | ||
| 919 | .Lcbc_done: | ||
| 920 | movdqu %xmm6,($ivp) # save IV | ||
| 921 | ___ | ||
| 922 | $code.=<<___ if ($win64); | ||
| 923 | movaps 0x10(%rsp),%xmm6 | ||
| 924 | movaps 0x20(%rsp),%xmm7 | ||
| 925 | movaps 0x30(%rsp),%xmm8 | ||
| 926 | movaps 0x40(%rsp),%xmm9 | ||
| 927 | movaps 0x50(%rsp),%xmm10 | ||
| 928 | movaps 0x60(%rsp),%xmm11 | ||
| 929 | movaps 0x70(%rsp),%xmm12 | ||
| 930 | movaps 0x80(%rsp),%xmm13 | ||
| 931 | movaps 0x90(%rsp),%xmm14 | ||
| 932 | movaps 0xa0(%rsp),%xmm15 | ||
| 933 | lea 0xb8(%rsp),%rsp | ||
| 934 | .Lcbc_epilogue: | ||
| 935 | ___ | ||
| 936 | $code.=<<___; | ||
| 937 | .Lcbc_abort: | ||
| 938 | ret | ||
| 939 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt | ||
| 940 | ___ | ||
| 941 | } | ||
| 942 | $code.=<<___; | ||
| 943 | ## | ||
| 944 | ## _vpaes_preheat | ||
| 945 | ## | ||
| 946 | ## Fills register %r10 -> .Lk_s0F inside _vpaes_consts (so you can -fPIC) | ||
| 947 | ## and %xmm9-%xmm15 as specified below. | ||
| 948 | ## | ||
| 949 | .type _vpaes_preheat,\@abi-omnipotent | ||
| 950 | .align 16 | ||
| 951 | _vpaes_preheat: # loads the table base into %r10 and seven 16-byte tables into %xmm9-%xmm15 | ||
| 952 | lea .Lk_s0F(%rip), %r10 # RIP-relative base; every load below is a fixed offset from .Lk_s0F | ||
| 953 | movdqa -0x20(%r10), %xmm10 # .Lk_inv | ||
| 954 | movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 | ||
| 955 | movdqa 0x00(%r10), %xmm9 # .Lk_s0F | ||
| 956 | movdqa 0x30(%r10), %xmm13 # .Lk_sb1 | ||
| 957 | movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 | ||
| 958 | movdqa 0x50(%r10), %xmm15 # .Lk_sb2 | ||
| 959 | movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 | ||
| 960 | ret | ||
| 961 | .size _vpaes_preheat,.-_vpaes_preheat | ||
| 962 | ######################################################## | ||
| 963 | ## ## | ||
| 964 | ## Constants ## | ||
| 965 | ## ## | ||
| 966 | ######################################################## | ||
| 967 | .type _vpaes_consts,\@object | ||
| 968 | .align 64 | ||
| 969 | _vpaes_consts: # layout matters: _vpaes_preheat reaches several tables by fixed offsets from .Lk_s0F, so keep order and sizes | ||
| 970 | .Lk_inv: # inv, inva (read by _vpaes_preheat at -0x20/-0x10 from .Lk_s0F) | ||
| 971 | .quad 0x0E05060F0D080180, 0x040703090A0B0C02 | ||
| 972 | .quad 0x01040A060F0B0780, 0x030D0E0C02050809 | ||
| 973 | |||
| 974 | .Lk_s0F: # s0F (base address that _vpaes_preheat leaves in %r10) | ||
| 975 | .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F | ||
| 976 | |||
| 977 | .Lk_ipt: # input transform (lo, hi) | ||
| 978 | .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 | ||
| 979 | .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 | ||
| 980 | |||
| 981 | .Lk_sb1: # sb1u, sb1t (read by _vpaes_preheat at 0x30/0x40 from .Lk_s0F) | ||
| 982 | .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 | ||
| 983 | .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF | ||
| 984 | .Lk_sb2: # sb2u, sb2t (read by _vpaes_preheat at 0x50/0x60 from .Lk_s0F) | ||
| 985 | .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD | ||
| 986 | .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A | ||
| 987 | .Lk_sbo: # sbou, sbot | ||
| 988 | .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 | ||
| 989 | .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA | ||
| 990 | |||
| 991 | .Lk_mc_forward: # mc_forward | ||
| 992 | .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 | ||
| 993 | .quad 0x080B0A0904070605, 0x000302010C0F0E0D | ||
| 994 | .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 | ||
| 995 | .quad 0x000302010C0F0E0D, 0x080B0A0904070605 | ||
| 996 | |||
| 997 | .Lk_mc_backward:# mc_backward | ||
| 998 | .quad 0x0605040702010003, 0x0E0D0C0F0A09080B | ||
| 999 | .quad 0x020100030E0D0C0F, 0x0A09080B06050407 | ||
| 1000 | .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 | ||
| 1001 | .quad 0x0A09080B06050407, 0x020100030E0D0C0F | ||
| 1002 | |||
| 1003 | .Lk_sr: # sr | ||
| 1004 | .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 | ||
| 1005 | .quad 0x030E09040F0A0500, 0x0B06010C07020D08 | ||
| 1006 | .quad 0x0F060D040B020900, 0x070E050C030A0108 | ||
| 1007 | .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 | ||
| 1008 | |||
| 1009 | .Lk_rcon: # rcon | ||
| 1010 | .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 | ||
| 1011 | |||
| 1012 | .Lk_s63: # s63: all equal to 0x63 transformed | ||
| 1013 | .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B | ||
| 1014 | |||
| 1015 | .Lk_opt: # output transform | ||
| 1016 | .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 | ||
| 1017 | .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 | ||
| 1018 | |||
| 1019 | .Lk_deskew: # deskew tables: inverts the sbox's "skew" | ||
| 1020 | .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A | ||
| 1021 | .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 | ||
| 1022 | |||
| 1023 | ## | ||
| 1024 | ## Decryption stuff | ||
| 1025 | ## Key schedule constants | ||
| 1026 | ## | ||
| 1027 | .Lk_dksd: # decryption key schedule: invskew x*D | ||
| 1028 | .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 | ||
| 1029 | .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E | ||
| 1030 | .Lk_dksb: # decryption key schedule: invskew x*B | ||
| 1031 | .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 | ||
| 1032 | .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 | ||
| 1033 | .Lk_dkse: # decryption key schedule: invskew x*E + 0x63 | ||
| 1034 | .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 | ||
| 1035 | .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 | ||
| 1036 | .Lk_dks9: # decryption key schedule: invskew x*9 | ||
| 1037 | .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC | ||
| 1038 | .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE | ||
| 1039 | |||
| 1040 | ## | ||
| 1041 | ## Decryption stuff | ||
| 1042 | ## Round function constants | ||
| 1043 | ## | ||
| 1044 | .Lk_dipt: # decryption input transform | ||
| 1045 | .quad 0x0F505B040B545F00, 0x154A411E114E451A | ||
| 1046 | .quad 0x86E383E660056500, 0x12771772F491F194 | ||
| 1047 | |||
| 1048 | .Lk_dsb9: # decryption sbox output *9*u, *9*t | ||
| 1049 | .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 | ||
| 1050 | .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 | ||
| 1051 | .Lk_dsbd: # decryption sbox output *D*u, *D*t | ||
| 1052 | .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 | ||
| 1053 | .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 | ||
| 1054 | .Lk_dsbb: # decryption sbox output *B*u, *B*t | ||
| 1055 | .quad 0xD022649296B44200, 0x602646F6B0F2D404 | ||
| 1056 | .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B | ||
| 1057 | .Lk_dsbe: # decryption sbox output *E*u, *E*t | ||
| 1058 | .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 | ||
| 1059 | .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 | ||
| 1060 | .Lk_dsbo: # decryption sbox final output | ||
| 1061 | .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D | ||
| 1062 | .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C | ||
| 1063 | .asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" | ||
| 1064 | .align 64 | ||
| 1065 | .size _vpaes_consts,.-_vpaes_consts | ||
| 1066 | ___ | ||
| 1067 | |||
| 1068 | if ($win64) { # emit the SEH handler and the .pdata/.xdata unwind tables only when $win64 is set | ||
| 1069 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 1070 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 1071 | $rec="%rcx"; # not referenced in the handler text below | ||
| 1072 | $frame="%rdx"; # not referenced in the handler text below | ||
| 1073 | $context="%r8"; | ||
| 1074 | $disp="%r9"; | ||
| 1075 | |||
| 1076 | $code.=<<___; # handler body followed by one .pdata triple and one .xdata record per public routine | ||
| 1077 | .extern __imp_RtlVirtualUnwind | ||
| 1078 | .type se_handler,\@abi-omnipotent | ||
| 1079 | .align 16 | ||
| 1080 | se_handler: # shared by all five routines listed in .xdata; HandlerData supplies each routine's body/epilogue labels | ||
| 1081 | push %rsi | ||
| 1082 | push %rdi | ||
| 1083 | push %rbx | ||
| 1084 | push %rbp | ||
| 1085 | push %r12 | ||
| 1086 | push %r13 | ||
| 1087 | push %r14 | ||
| 1088 | push %r15 | ||
| 1089 | pushfq | ||
| 1090 | sub \$64,%rsp # scratch area; the stack-passed arguments are stored at 32..56(%rsp) further down | ||
| 1091 | |||
| 1092 | mov 120($context),%rax # pull context->Rax | ||
| 1093 | mov 248($context),%rbx # pull context->Rip | ||
| 1094 | |||
| 1095 | mov 8($disp),%rsi # disp->ImageBase | ||
| 1096 | mov 56($disp),%r11 # disp->HandlerData | ||
| 1097 | |||
| 1098 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 1099 | lea (%rsi,%r10),%r10 # prologue label | ||
| 1100 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 1101 | jb .Lin_prologue # fault before the body label: skip the xmm restore | ||
| 1102 | |||
| 1103 | mov 152($context),%rax # pull context->Rsp | ||
| 1104 | |||
| 1105 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 1106 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 1107 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 1108 | jae .Lin_prologue # fault at or past the epilogue label: skip the xmm restore | ||
| 1109 | |||
| 1110 | lea 16(%rax),%rsi # %xmm save area | ||
| 1111 | lea 512($context),%rdi # &context.Xmm6 | ||
| 1112 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
| 1113 | .long 0xa548f3fc # cld; rep movsq | ||
| 1114 | lea 0xb8(%rax),%rax # adjust stack pointer | ||
| 1115 | |||
| 1116 | .Lin_prologue: # common tail for all three paths above | ||
| 1117 | mov 8(%rax),%rdi # NOTE(review): presumably slots filled by the generated Win64 entry sequence - confirm against x86_64-xlate.pl | ||
| 1118 | mov 16(%rax),%rsi | ||
| 1119 | mov %rax,152($context) # restore context->Rsp | ||
| 1120 | mov %rsi,168($context) # restore context->Rsi | ||
| 1121 | mov %rdi,176($context) # restore context->Rdi | ||
| 1122 | |||
| 1123 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 1124 | mov $context,%rsi # context | ||
| 1125 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
| 1126 | .long 0xa548f3fc # cld; rep movsq | ||
| 1127 | |||
| 1128 | mov $disp,%rsi | ||
| 1129 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 1130 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 1131 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 1132 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 1133 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 1134 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 1135 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 1136 | mov %r10,32(%rsp) # arg5 | ||
| 1137 | mov %r11,40(%rsp) # arg6 | ||
| 1138 | mov %r12,48(%rsp) # arg7 | ||
| 1139 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 1140 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 1141 | |||
| 1142 | mov \$1,%eax # ExceptionContinueSearch | ||
| 1143 | add \$64,%rsp | ||
| 1144 | popfq | ||
| 1145 | pop %r15 | ||
| 1146 | pop %r14 | ||
| 1147 | pop %r13 | ||
| 1148 | pop %r12 | ||
| 1149 | pop %rbp | ||
| 1150 | pop %rbx | ||
| 1151 | pop %rdi | ||
| 1152 | pop %rsi | ||
| 1153 | ret | ||
| 1154 | .size se_handler,.-se_handler | ||
| 1155 | |||
| 1156 | .section .pdata | ||
| 1157 | .align 4 | ||
| 1158 | .rva .LSEH_begin_${PREFIX}_set_encrypt_key | ||
| 1159 | .rva .LSEH_end_${PREFIX}_set_encrypt_key | ||
| 1160 | .rva .LSEH_info_${PREFIX}_set_encrypt_key | ||
| 1161 | |||
| 1162 | .rva .LSEH_begin_${PREFIX}_set_decrypt_key | ||
| 1163 | .rva .LSEH_end_${PREFIX}_set_decrypt_key | ||
| 1164 | .rva .LSEH_info_${PREFIX}_set_decrypt_key | ||
| 1165 | |||
| 1166 | .rva .LSEH_begin_${PREFIX}_encrypt | ||
| 1167 | .rva .LSEH_end_${PREFIX}_encrypt | ||
| 1168 | .rva .LSEH_info_${PREFIX}_encrypt | ||
| 1169 | |||
| 1170 | .rva .LSEH_begin_${PREFIX}_decrypt | ||
| 1171 | .rva .LSEH_end_${PREFIX}_decrypt | ||
| 1172 | .rva .LSEH_info_${PREFIX}_decrypt | ||
| 1173 | |||
| 1174 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt | ||
| 1175 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | ||
| 1176 | .rva .LSEH_info_${PREFIX}_cbc_encrypt | ||
| 1177 | |||
| 1178 | .section .xdata | ||
| 1179 | .align 8 | ||
| 1180 | .LSEH_info_${PREFIX}_set_encrypt_key: | ||
| 1181 | .byte 9,0,0,0 # NOTE(review): looks like UNWIND_INFO version 1 with the exception-handler flag - confirm | ||
| 1182 | .rva se_handler | ||
| 1183 | .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] | ||
| 1184 | .LSEH_info_${PREFIX}_set_decrypt_key: | ||
| 1185 | .byte 9,0,0,0 | ||
| 1186 | .rva se_handler | ||
| 1187 | .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[] | ||
| 1188 | .LSEH_info_${PREFIX}_encrypt: | ||
| 1189 | .byte 9,0,0,0 | ||
| 1190 | .rva se_handler | ||
| 1191 | .rva .Lenc_body,.Lenc_epilogue # HandlerData[] | ||
| 1192 | .LSEH_info_${PREFIX}_decrypt: | ||
| 1193 | .byte 9,0,0,0 | ||
| 1194 | .rva se_handler | ||
| 1195 | .rva .Ldec_body,.Ldec_epilogue # HandlerData[] | ||
| 1196 | .LSEH_info_${PREFIX}_cbc_encrypt: | ||
| 1197 | .byte 9,0,0,0 | ||
| 1198 | .rva se_handler | ||
| 1199 | .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] | ||
| 1200 | ___ | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace each backtick-quoted span in the generated text with the result of evaluating it as Perl | ||
| 1204 | |||
| 1205 | print $code; # write the finished assembly text to STDOUT | ||
| 1206 | |||
| 1207 | close STDOUT or die "error closing STDOUT: $!"; # buffered write failures only surface here; abort rather than exit 0 with truncated output | ||
