diff options
Diffstat (limited to '')
160 files changed, 48653 insertions, 1420 deletions
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl index c51ee1fbf6..86b86c4a0f 100644 --- a/src/lib/libcrypto/aes/asm/aes-armv4.pl +++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl | |||
| @@ -27,6 +27,11 @@ | |||
| 27 | # Rescheduling for dual-issue pipeline resulted in 12% improvement on | 27 | # Rescheduling for dual-issue pipeline resulted in 12% improvement on |
| 28 | # Cortex A8 core and ~25 cycles per byte processed with 128-bit key. | 28 | # Cortex A8 core and ~25 cycles per byte processed with 128-bit key. |
| 29 | 29 | ||
| 30 | # February 2011. | ||
| 31 | # | ||
| 32 | # Profiler-assisted and platform-specific optimization resulted in 16% | ||
| 33 | # improvement on Cortex A8 core and ~21.5 cycles per byte. | ||
| 34 | |||
| 30 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 35 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| 31 | open STDOUT,">$output"; | 36 | open STDOUT,">$output"; |
| 32 | 37 | ||
| @@ -46,6 +51,7 @@ $key="r11"; | |||
| 46 | $rounds="r12"; | 51 | $rounds="r12"; |
| 47 | 52 | ||
| 48 | $code=<<___; | 53 | $code=<<___; |
| 54 | #include "arm_arch.h" | ||
| 49 | .text | 55 | .text |
| 50 | .code 32 | 56 | .code 32 |
| 51 | 57 | ||
| @@ -166,7 +172,7 @@ AES_encrypt: | |||
| 166 | mov $rounds,r0 @ inp | 172 | mov $rounds,r0 @ inp |
| 167 | mov $key,r2 | 173 | mov $key,r2 |
| 168 | sub $tbl,r3,#AES_encrypt-AES_Te @ Te | 174 | sub $tbl,r3,#AES_encrypt-AES_Te @ Te |
| 169 | 175 | #if __ARM_ARCH__<7 | |
| 170 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | 176 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral |
| 171 | ldrb $t1,[$rounds,#2] @ manner... | 177 | ldrb $t1,[$rounds,#2] @ manner... |
| 172 | ldrb $t2,[$rounds,#1] | 178 | ldrb $t2,[$rounds,#1] |
| @@ -195,10 +201,33 @@ AES_encrypt: | |||
| 195 | orr $s3,$s3,$t1,lsl#8 | 201 | orr $s3,$s3,$t1,lsl#8 |
| 196 | orr $s3,$s3,$t2,lsl#16 | 202 | orr $s3,$s3,$t2,lsl#16 |
| 197 | orr $s3,$s3,$t3,lsl#24 | 203 | orr $s3,$s3,$t3,lsl#24 |
| 198 | 204 | #else | |
| 205 | ldr $s0,[$rounds,#0] | ||
| 206 | ldr $s1,[$rounds,#4] | ||
| 207 | ldr $s2,[$rounds,#8] | ||
| 208 | ldr $s3,[$rounds,#12] | ||
| 209 | #ifdef __ARMEL__ | ||
| 210 | rev $s0,$s0 | ||
| 211 | rev $s1,$s1 | ||
| 212 | rev $s2,$s2 | ||
| 213 | rev $s3,$s3 | ||
| 214 | #endif | ||
| 215 | #endif | ||
| 199 | bl _armv4_AES_encrypt | 216 | bl _armv4_AES_encrypt |
| 200 | 217 | ||
| 201 | ldr $rounds,[sp],#4 @ pop out | 218 | ldr $rounds,[sp],#4 @ pop out |
| 219 | #if __ARM_ARCH__>=7 | ||
| 220 | #ifdef __ARMEL__ | ||
| 221 | rev $s0,$s0 | ||
| 222 | rev $s1,$s1 | ||
| 223 | rev $s2,$s2 | ||
| 224 | rev $s3,$s3 | ||
| 225 | #endif | ||
| 226 | str $s0,[$rounds,#0] | ||
| 227 | str $s1,[$rounds,#4] | ||
| 228 | str $s2,[$rounds,#8] | ||
| 229 | str $s3,[$rounds,#12] | ||
| 230 | #else | ||
| 202 | mov $t1,$s0,lsr#24 @ write output in endian-neutral | 231 | mov $t1,$s0,lsr#24 @ write output in endian-neutral |
| 203 | mov $t2,$s0,lsr#16 @ manner... | 232 | mov $t2,$s0,lsr#16 @ manner... |
| 204 | mov $t3,$s0,lsr#8 | 233 | mov $t3,$s0,lsr#8 |
| @@ -227,11 +256,15 @@ AES_encrypt: | |||
| 227 | strb $t2,[$rounds,#13] | 256 | strb $t2,[$rounds,#13] |
| 228 | strb $t3,[$rounds,#14] | 257 | strb $t3,[$rounds,#14] |
| 229 | strb $s3,[$rounds,#15] | 258 | strb $s3,[$rounds,#15] |
| 230 | 259 | #endif | |
| 260 | #if __ARM_ARCH__>=5 | ||
| 261 | ldmia sp!,{r4-r12,pc} | ||
| 262 | #else | ||
| 231 | ldmia sp!,{r4-r12,lr} | 263 | ldmia sp!,{r4-r12,lr} |
| 232 | tst lr,#1 | 264 | tst lr,#1 |
| 233 | moveq pc,lr @ be binary compatible with V4, yet | 265 | moveq pc,lr @ be binary compatible with V4, yet |
| 234 | bx lr @ interoperable with Thumb ISA:-) | 266 | bx lr @ interoperable with Thumb ISA:-) |
| 267 | #endif | ||
| 235 | .size AES_encrypt,.-AES_encrypt | 268 | .size AES_encrypt,.-AES_encrypt |
| 236 | 269 | ||
| 237 | .type _armv4_AES_encrypt,%function | 270 | .type _armv4_AES_encrypt,%function |
| @@ -271,11 +304,11 @@ _armv4_AES_encrypt: | |||
| 271 | and $i2,lr,$s2,lsr#16 @ i1 | 304 | and $i2,lr,$s2,lsr#16 @ i1 |
| 272 | eor $t3,$t3,$i3,ror#8 | 305 | eor $t3,$t3,$i3,ror#8 |
| 273 | and $i3,lr,$s2 | 306 | and $i3,lr,$s2 |
| 274 | eor $s1,$s1,$t1,ror#24 | ||
| 275 | ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] | 307 | ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] |
| 308 | eor $s1,$s1,$t1,ror#24 | ||
| 309 | ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] | ||
| 276 | mov $s2,$s2,lsr#24 | 310 | mov $s2,$s2,lsr#24 |
| 277 | 311 | ||
| 278 | ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] | ||
| 279 | ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] | 312 | ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] |
| 280 | eor $s0,$s0,$i1,ror#16 | 313 | eor $s0,$s0,$i1,ror#16 |
| 281 | ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] | 314 | ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] |
| @@ -284,16 +317,16 @@ _armv4_AES_encrypt: | |||
| 284 | and $i2,lr,$s3,lsr#8 @ i1 | 317 | and $i2,lr,$s3,lsr#8 @ i1 |
| 285 | eor $t3,$t3,$i3,ror#16 | 318 | eor $t3,$t3,$i3,ror#16 |
| 286 | and $i3,lr,$s3,lsr#16 @ i2 | 319 | and $i3,lr,$s3,lsr#16 @ i2 |
| 287 | eor $s2,$s2,$t2,ror#16 | ||
| 288 | ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] | 320 | ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] |
| 321 | eor $s2,$s2,$t2,ror#16 | ||
| 322 | ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] | ||
| 289 | mov $s3,$s3,lsr#24 | 323 | mov $s3,$s3,lsr#24 |
| 290 | 324 | ||
| 291 | ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] | ||
| 292 | ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] | 325 | ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] |
| 293 | eor $s0,$s0,$i1,ror#24 | 326 | eor $s0,$s0,$i1,ror#24 |
| 294 | ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] | ||
| 295 | eor $s1,$s1,$i2,ror#16 | ||
| 296 | ldr $i1,[$key],#16 | 327 | ldr $i1,[$key],#16 |
| 328 | eor $s1,$s1,$i2,ror#16 | ||
| 329 | ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] | ||
| 297 | eor $s2,$s2,$i3,ror#8 | 330 | eor $s2,$s2,$i3,ror#8 |
| 298 | ldr $t1,[$key,#-12] | 331 | ldr $t1,[$key,#-12] |
| 299 | eor $s3,$s3,$t3,ror#8 | 332 | eor $s3,$s3,$t3,ror#8 |
| @@ -333,11 +366,11 @@ _armv4_AES_encrypt: | |||
| 333 | and $i2,lr,$s2,lsr#16 @ i1 | 366 | and $i2,lr,$s2,lsr#16 @ i1 |
| 334 | eor $t3,$i3,$t3,lsl#8 | 367 | eor $t3,$i3,$t3,lsl#8 |
| 335 | and $i3,lr,$s2 | 368 | and $i3,lr,$s2 |
| 336 | eor $s1,$t1,$s1,lsl#24 | ||
| 337 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] | 369 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] |
| 370 | eor $s1,$t1,$s1,lsl#24 | ||
| 371 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] | ||
| 338 | mov $s2,$s2,lsr#24 | 372 | mov $s2,$s2,lsr#24 |
| 339 | 373 | ||
| 340 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] | ||
| 341 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] | 374 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] |
| 342 | eor $s0,$i1,$s0,lsl#8 | 375 | eor $s0,$i1,$s0,lsl#8 |
| 343 | ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] | 376 | ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] |
| @@ -346,15 +379,15 @@ _armv4_AES_encrypt: | |||
| 346 | and $i2,lr,$s3,lsr#8 @ i1 | 379 | and $i2,lr,$s3,lsr#8 @ i1 |
| 347 | eor $t3,$i3,$t3,lsl#8 | 380 | eor $t3,$i3,$t3,lsl#8 |
| 348 | and $i3,lr,$s3,lsr#16 @ i2 | 381 | and $i3,lr,$s3,lsr#16 @ i2 |
| 349 | eor $s2,$t2,$s2,lsl#24 | ||
| 350 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] | 382 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] |
| 383 | eor $s2,$t2,$s2,lsl#24 | ||
| 384 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] | ||
| 351 | mov $s3,$s3,lsr#24 | 385 | mov $s3,$s3,lsr#24 |
| 352 | 386 | ||
| 353 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] | ||
| 354 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] | 387 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] |
| 355 | eor $s0,$i1,$s0,lsl#8 | 388 | eor $s0,$i1,$s0,lsl#8 |
| 356 | ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] | ||
| 357 | ldr $i1,[$key,#0] | 389 | ldr $i1,[$key,#0] |
| 390 | ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] | ||
| 358 | eor $s1,$s1,$i2,lsl#8 | 391 | eor $s1,$s1,$i2,lsl#8 |
| 359 | ldr $t1,[$key,#4] | 392 | ldr $t1,[$key,#4] |
| 360 | eor $s2,$s2,$i3,lsl#16 | 393 | eor $s2,$s2,$i3,lsl#16 |
| @@ -371,10 +404,11 @@ _armv4_AES_encrypt: | |||
| 371 | ldr pc,[sp],#4 @ pop and return | 404 | ldr pc,[sp],#4 @ pop and return |
| 372 | .size _armv4_AES_encrypt,.-_armv4_AES_encrypt | 405 | .size _armv4_AES_encrypt,.-_armv4_AES_encrypt |
| 373 | 406 | ||
| 374 | .global AES_set_encrypt_key | 407 | .global private_AES_set_encrypt_key |
| 375 | .type AES_set_encrypt_key,%function | 408 | .type private_AES_set_encrypt_key,%function |
| 376 | .align 5 | 409 | .align 5 |
| 377 | AES_set_encrypt_key: | 410 | private_AES_set_encrypt_key: |
| 411 | _armv4_AES_set_encrypt_key: | ||
| 378 | sub r3,pc,#8 @ AES_set_encrypt_key | 412 | sub r3,pc,#8 @ AES_set_encrypt_key |
| 379 | teq r0,#0 | 413 | teq r0,#0 |
| 380 | moveq r0,#-1 | 414 | moveq r0,#-1 |
| @@ -392,12 +426,13 @@ AES_set_encrypt_key: | |||
| 392 | bne .Labrt | 426 | bne .Labrt |
| 393 | 427 | ||
| 394 | .Lok: stmdb sp!,{r4-r12,lr} | 428 | .Lok: stmdb sp!,{r4-r12,lr} |
| 395 | sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4 | 429 | sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 |
| 396 | 430 | ||
| 397 | mov $rounds,r0 @ inp | 431 | mov $rounds,r0 @ inp |
| 398 | mov lr,r1 @ bits | 432 | mov lr,r1 @ bits |
| 399 | mov $key,r2 @ key | 433 | mov $key,r2 @ key |
| 400 | 434 | ||
| 435 | #if __ARM_ARCH__<7 | ||
| 401 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | 436 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral |
| 402 | ldrb $t1,[$rounds,#2] @ manner... | 437 | ldrb $t1,[$rounds,#2] @ manner... |
| 403 | ldrb $t2,[$rounds,#1] | 438 | ldrb $t2,[$rounds,#1] |
| @@ -430,6 +465,22 @@ AES_set_encrypt_key: | |||
| 430 | orr $s3,$s3,$t3,lsl#24 | 465 | orr $s3,$s3,$t3,lsl#24 |
| 431 | str $s2,[$key,#-8] | 466 | str $s2,[$key,#-8] |
| 432 | str $s3,[$key,#-4] | 467 | str $s3,[$key,#-4] |
| 468 | #else | ||
| 469 | ldr $s0,[$rounds,#0] | ||
| 470 | ldr $s1,[$rounds,#4] | ||
| 471 | ldr $s2,[$rounds,#8] | ||
| 472 | ldr $s3,[$rounds,#12] | ||
| 473 | #ifdef __ARMEL__ | ||
| 474 | rev $s0,$s0 | ||
| 475 | rev $s1,$s1 | ||
| 476 | rev $s2,$s2 | ||
| 477 | rev $s3,$s3 | ||
| 478 | #endif | ||
| 479 | str $s0,[$key],#16 | ||
| 480 | str $s1,[$key,#-12] | ||
| 481 | str $s2,[$key,#-8] | ||
| 482 | str $s3,[$key,#-4] | ||
| 483 | #endif | ||
| 433 | 484 | ||
| 434 | teq lr,#128 | 485 | teq lr,#128 |
| 435 | bne .Lnot128 | 486 | bne .Lnot128 |
| @@ -466,6 +517,7 @@ AES_set_encrypt_key: | |||
| 466 | b .Ldone | 517 | b .Ldone |
| 467 | 518 | ||
| 468 | .Lnot128: | 519 | .Lnot128: |
| 520 | #if __ARM_ARCH__<7 | ||
| 469 | ldrb $i2,[$rounds,#19] | 521 | ldrb $i2,[$rounds,#19] |
| 470 | ldrb $t1,[$rounds,#18] | 522 | ldrb $t1,[$rounds,#18] |
| 471 | ldrb $t2,[$rounds,#17] | 523 | ldrb $t2,[$rounds,#17] |
| @@ -482,6 +534,16 @@ AES_set_encrypt_key: | |||
| 482 | str $i2,[$key],#8 | 534 | str $i2,[$key],#8 |
| 483 | orr $i3,$i3,$t3,lsl#24 | 535 | orr $i3,$i3,$t3,lsl#24 |
| 484 | str $i3,[$key,#-4] | 536 | str $i3,[$key,#-4] |
| 537 | #else | ||
| 538 | ldr $i2,[$rounds,#16] | ||
| 539 | ldr $i3,[$rounds,#20] | ||
| 540 | #ifdef __ARMEL__ | ||
| 541 | rev $i2,$i2 | ||
| 542 | rev $i3,$i3 | ||
| 543 | #endif | ||
| 544 | str $i2,[$key],#8 | ||
| 545 | str $i3,[$key,#-4] | ||
| 546 | #endif | ||
| 485 | 547 | ||
| 486 | teq lr,#192 | 548 | teq lr,#192 |
| 487 | bne .Lnot192 | 549 | bne .Lnot192 |
| @@ -526,6 +588,7 @@ AES_set_encrypt_key: | |||
| 526 | b .L192_loop | 588 | b .L192_loop |
| 527 | 589 | ||
| 528 | .Lnot192: | 590 | .Lnot192: |
| 591 | #if __ARM_ARCH__<7 | ||
| 529 | ldrb $i2,[$rounds,#27] | 592 | ldrb $i2,[$rounds,#27] |
| 530 | ldrb $t1,[$rounds,#26] | 593 | ldrb $t1,[$rounds,#26] |
| 531 | ldrb $t2,[$rounds,#25] | 594 | ldrb $t2,[$rounds,#25] |
| @@ -542,6 +605,16 @@ AES_set_encrypt_key: | |||
| 542 | str $i2,[$key],#8 | 605 | str $i2,[$key],#8 |
| 543 | orr $i3,$i3,$t3,lsl#24 | 606 | orr $i3,$i3,$t3,lsl#24 |
| 544 | str $i3,[$key,#-4] | 607 | str $i3,[$key,#-4] |
| 608 | #else | ||
| 609 | ldr $i2,[$rounds,#24] | ||
| 610 | ldr $i3,[$rounds,#28] | ||
| 611 | #ifdef __ARMEL__ | ||
| 612 | rev $i2,$i2 | ||
| 613 | rev $i3,$i3 | ||
| 614 | #endif | ||
| 615 | str $i2,[$key],#8 | ||
| 616 | str $i3,[$key,#-4] | ||
| 617 | #endif | ||
| 545 | 618 | ||
| 546 | mov $rounds,#14 | 619 | mov $rounds,#14 |
| 547 | str $rounds,[$key,#240-32] | 620 | str $rounds,[$key,#240-32] |
| @@ -606,14 +679,14 @@ AES_set_encrypt_key: | |||
| 606 | .Labrt: tst lr,#1 | 679 | .Labrt: tst lr,#1 |
| 607 | moveq pc,lr @ be binary compatible with V4, yet | 680 | moveq pc,lr @ be binary compatible with V4, yet |
| 608 | bx lr @ interoperable with Thumb ISA:-) | 681 | bx lr @ interoperable with Thumb ISA:-) |
| 609 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | 682 | .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key |
| 610 | 683 | ||
| 611 | .global AES_set_decrypt_key | 684 | .global private_AES_set_decrypt_key |
| 612 | .type AES_set_decrypt_key,%function | 685 | .type private_AES_set_decrypt_key,%function |
| 613 | .align 5 | 686 | .align 5 |
| 614 | AES_set_decrypt_key: | 687 | private_AES_set_decrypt_key: |
| 615 | str lr,[sp,#-4]! @ push lr | 688 | str lr,[sp,#-4]! @ push lr |
| 616 | bl AES_set_encrypt_key | 689 | bl _armv4_AES_set_encrypt_key |
| 617 | teq r0,#0 | 690 | teq r0,#0 |
| 618 | ldrne lr,[sp],#4 @ pop lr | 691 | ldrne lr,[sp],#4 @ pop lr |
| 619 | bne .Labrt | 692 | bne .Labrt |
| @@ -692,11 +765,15 @@ $code.=<<___; | |||
| 692 | bne .Lmix | 765 | bne .Lmix |
| 693 | 766 | ||
| 694 | mov r0,#0 | 767 | mov r0,#0 |
| 768 | #if __ARM_ARCH__>=5 | ||
| 769 | ldmia sp!,{r4-r12,pc} | ||
| 770 | #else | ||
| 695 | ldmia sp!,{r4-r12,lr} | 771 | ldmia sp!,{r4-r12,lr} |
| 696 | tst lr,#1 | 772 | tst lr,#1 |
| 697 | moveq pc,lr @ be binary compatible with V4, yet | 773 | moveq pc,lr @ be binary compatible with V4, yet |
| 698 | bx lr @ interoperable with Thumb ISA:-) | 774 | bx lr @ interoperable with Thumb ISA:-) |
| 699 | .size AES_set_decrypt_key,.-AES_set_decrypt_key | 775 | #endif |
| 776 | .size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key | ||
| 700 | 777 | ||
| 701 | .type AES_Td,%object | 778 | .type AES_Td,%object |
| 702 | .align 5 | 779 | .align 5 |
| @@ -811,7 +888,7 @@ AES_decrypt: | |||
| 811 | mov $rounds,r0 @ inp | 888 | mov $rounds,r0 @ inp |
| 812 | mov $key,r2 | 889 | mov $key,r2 |
| 813 | sub $tbl,r3,#AES_decrypt-AES_Td @ Td | 890 | sub $tbl,r3,#AES_decrypt-AES_Td @ Td |
| 814 | 891 | #if __ARM_ARCH__<7 | |
| 815 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | 892 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral |
| 816 | ldrb $t1,[$rounds,#2] @ manner... | 893 | ldrb $t1,[$rounds,#2] @ manner... |
| 817 | ldrb $t2,[$rounds,#1] | 894 | ldrb $t2,[$rounds,#1] |
| @@ -840,10 +917,33 @@ AES_decrypt: | |||
| 840 | orr $s3,$s3,$t1,lsl#8 | 917 | orr $s3,$s3,$t1,lsl#8 |
| 841 | orr $s3,$s3,$t2,lsl#16 | 918 | orr $s3,$s3,$t2,lsl#16 |
| 842 | orr $s3,$s3,$t3,lsl#24 | 919 | orr $s3,$s3,$t3,lsl#24 |
| 843 | 920 | #else | |
| 921 | ldr $s0,[$rounds,#0] | ||
| 922 | ldr $s1,[$rounds,#4] | ||
| 923 | ldr $s2,[$rounds,#8] | ||
| 924 | ldr $s3,[$rounds,#12] | ||
| 925 | #ifdef __ARMEL__ | ||
| 926 | rev $s0,$s0 | ||
| 927 | rev $s1,$s1 | ||
| 928 | rev $s2,$s2 | ||
| 929 | rev $s3,$s3 | ||
| 930 | #endif | ||
| 931 | #endif | ||
| 844 | bl _armv4_AES_decrypt | 932 | bl _armv4_AES_decrypt |
| 845 | 933 | ||
| 846 | ldr $rounds,[sp],#4 @ pop out | 934 | ldr $rounds,[sp],#4 @ pop out |
| 935 | #if __ARM_ARCH__>=7 | ||
| 936 | #ifdef __ARMEL__ | ||
| 937 | rev $s0,$s0 | ||
| 938 | rev $s1,$s1 | ||
| 939 | rev $s2,$s2 | ||
| 940 | rev $s3,$s3 | ||
| 941 | #endif | ||
| 942 | str $s0,[$rounds,#0] | ||
| 943 | str $s1,[$rounds,#4] | ||
| 944 | str $s2,[$rounds,#8] | ||
| 945 | str $s3,[$rounds,#12] | ||
| 946 | #else | ||
| 847 | mov $t1,$s0,lsr#24 @ write output in endian-neutral | 947 | mov $t1,$s0,lsr#24 @ write output in endian-neutral |
| 848 | mov $t2,$s0,lsr#16 @ manner... | 948 | mov $t2,$s0,lsr#16 @ manner... |
| 849 | mov $t3,$s0,lsr#8 | 949 | mov $t3,$s0,lsr#8 |
| @@ -872,11 +972,15 @@ AES_decrypt: | |||
| 872 | strb $t2,[$rounds,#13] | 972 | strb $t2,[$rounds,#13] |
| 873 | strb $t3,[$rounds,#14] | 973 | strb $t3,[$rounds,#14] |
| 874 | strb $s3,[$rounds,#15] | 974 | strb $s3,[$rounds,#15] |
| 875 | 975 | #endif | |
| 976 | #if __ARM_ARCH__>=5 | ||
| 977 | ldmia sp!,{r4-r12,pc} | ||
| 978 | #else | ||
| 876 | ldmia sp!,{r4-r12,lr} | 979 | ldmia sp!,{r4-r12,lr} |
| 877 | tst lr,#1 | 980 | tst lr,#1 |
| 878 | moveq pc,lr @ be binary compatible with V4, yet | 981 | moveq pc,lr @ be binary compatible with V4, yet |
| 879 | bx lr @ interoperable with Thumb ISA:-) | 982 | bx lr @ interoperable with Thumb ISA:-) |
| 983 | #endif | ||
| 880 | .size AES_decrypt,.-AES_decrypt | 984 | .size AES_decrypt,.-AES_decrypt |
| 881 | 985 | ||
| 882 | .type _armv4_AES_decrypt,%function | 986 | .type _armv4_AES_decrypt,%function |
| @@ -916,11 +1020,11 @@ _armv4_AES_decrypt: | |||
| 916 | and $i2,lr,$s2 @ i1 | 1020 | and $i2,lr,$s2 @ i1 |
| 917 | eor $t3,$i3,$t3,ror#8 | 1021 | eor $t3,$i3,$t3,ror#8 |
| 918 | and $i3,lr,$s2,lsr#16 | 1022 | and $i3,lr,$s2,lsr#16 |
| 919 | eor $s1,$s1,$t1,ror#8 | ||
| 920 | ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] | 1023 | ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] |
| 1024 | eor $s1,$s1,$t1,ror#8 | ||
| 1025 | ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] | ||
| 921 | mov $s2,$s2,lsr#24 | 1026 | mov $s2,$s2,lsr#24 |
| 922 | 1027 | ||
| 923 | ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] | ||
| 924 | ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] | 1028 | ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] |
| 925 | eor $s0,$s0,$i1,ror#16 | 1029 | eor $s0,$s0,$i1,ror#16 |
| 926 | ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] | 1030 | ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] |
| @@ -929,22 +1033,22 @@ _armv4_AES_decrypt: | |||
| 929 | and $i2,lr,$s3,lsr#8 @ i1 | 1033 | and $i2,lr,$s3,lsr#8 @ i1 |
| 930 | eor $t3,$i3,$t3,ror#8 | 1034 | eor $t3,$i3,$t3,ror#8 |
| 931 | and $i3,lr,$s3 @ i2 | 1035 | and $i3,lr,$s3 @ i2 |
| 932 | eor $s2,$s2,$t2,ror#8 | ||
| 933 | ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] | 1036 | ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] |
| 1037 | eor $s2,$s2,$t2,ror#8 | ||
| 1038 | ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] | ||
| 934 | mov $s3,$s3,lsr#24 | 1039 | mov $s3,$s3,lsr#24 |
| 935 | 1040 | ||
| 936 | ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] | ||
| 937 | ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] | 1041 | ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] |
| 938 | eor $s0,$s0,$i1,ror#8 | 1042 | eor $s0,$s0,$i1,ror#8 |
| 939 | ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] | 1043 | ldr $i1,[$key],#16 |
| 940 | eor $s1,$s1,$i2,ror#16 | 1044 | eor $s1,$s1,$i2,ror#16 |
| 1045 | ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] | ||
| 941 | eor $s2,$s2,$i3,ror#24 | 1046 | eor $s2,$s2,$i3,ror#24 |
| 942 | ldr $i1,[$key],#16 | ||
| 943 | eor $s3,$s3,$t3,ror#8 | ||
| 944 | 1047 | ||
| 945 | ldr $t1,[$key,#-12] | 1048 | ldr $t1,[$key,#-12] |
| 946 | ldr $t2,[$key,#-8] | ||
| 947 | eor $s0,$s0,$i1 | 1049 | eor $s0,$s0,$i1 |
| 1050 | ldr $t2,[$key,#-8] | ||
| 1051 | eor $s3,$s3,$t3,ror#8 | ||
| 948 | ldr $t3,[$key,#-4] | 1052 | ldr $t3,[$key,#-4] |
| 949 | and $i1,lr,$s0,lsr#16 | 1053 | and $i1,lr,$s0,lsr#16 |
| 950 | eor $s1,$s1,$t1 | 1054 | eor $s1,$s1,$t1 |
| @@ -985,11 +1089,11 @@ _armv4_AES_decrypt: | |||
| 985 | and $i1,lr,$s2,lsr#8 @ i0 | 1089 | and $i1,lr,$s2,lsr#8 @ i0 |
| 986 | eor $t2,$t2,$i2,lsl#8 | 1090 | eor $t2,$t2,$i2,lsl#8 |
| 987 | and $i2,lr,$s2 @ i1 | 1091 | and $i2,lr,$s2 @ i1 |
| 988 | eor $t3,$t3,$i3,lsl#8 | ||
| 989 | ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] | 1092 | ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] |
| 1093 | eor $t3,$t3,$i3,lsl#8 | ||
| 1094 | ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] | ||
| 990 | and $i3,lr,$s2,lsr#16 | 1095 | and $i3,lr,$s2,lsr#16 |
| 991 | 1096 | ||
| 992 | ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] | ||
| 993 | ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] | 1097 | ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] |
| 994 | eor $s0,$s0,$i1,lsl#8 | 1098 | eor $s0,$s0,$i1,lsl#8 |
| 995 | ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] | 1099 | ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] |
| @@ -997,11 +1101,11 @@ _armv4_AES_decrypt: | |||
| 997 | and $i1,lr,$s3,lsr#16 @ i0 | 1101 | and $i1,lr,$s3,lsr#16 @ i0 |
| 998 | eor $s2,$t2,$s2,lsl#16 | 1102 | eor $s2,$t2,$s2,lsl#16 |
| 999 | and $i2,lr,$s3,lsr#8 @ i1 | 1103 | and $i2,lr,$s3,lsr#8 @ i1 |
| 1000 | eor $t3,$t3,$i3,lsl#16 | ||
| 1001 | ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] | 1104 | ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] |
| 1105 | eor $t3,$t3,$i3,lsl#16 | ||
| 1106 | ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] | ||
| 1002 | and $i3,lr,$s3 @ i2 | 1107 | and $i3,lr,$s3 @ i2 |
| 1003 | 1108 | ||
| 1004 | ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] | ||
| 1005 | ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] | 1109 | ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] |
| 1006 | ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] | 1110 | ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] |
| 1007 | eor $s0,$s0,$i1,lsl#16 | 1111 | eor $s0,$s0,$i1,lsl#16 |
diff --git a/src/lib/libcrypto/aes/asm/aes-mips.pl b/src/lib/libcrypto/aes/asm/aes-mips.pl new file mode 100644 index 0000000000..2ce6deffc8 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aes-mips.pl | |||
| @@ -0,0 +1,1611 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # AES for MIPS | ||
| 11 | |||
| 12 | # October 2010 | ||
| 13 | # | ||
| 14 | # Code uses 1K[+256B] S-box and on single-issue core [such as R5000] | ||
| 15 | # spends ~68 cycles per byte processed with 128-bit key. This is ~16% | ||
| 16 | # faster than gcc-generated code, which is not very impressive. But | ||
| 17 | # recall that compressed S-box requires extra processing, namely | ||
| 18 | # additional rotations. Rotations are implemented with lwl/lwr pairs, | ||
| 19 | # which is normally used for loading unaligned data. Another cool | ||
| 20 | # thing about this module is its endian neutrality, which means that | ||
| 21 | # it processes data without ever changing byte order... | ||
| 22 | |||
| 23 | ###################################################################### | ||
| 24 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
| 25 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
| 26 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
| 27 | # manner. Therefore let's stick to NUBI register layout: | ||
| 28 | # | ||
| 29 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
| 30 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 31 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
| 32 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
| 33 | # | ||
| 34 | # The return value is placed in $a0. Following coding rules facilitate | ||
| 35 | # interoperability: | ||
| 36 | # | ||
| 37 | # - never ever touch $tp, "thread pointer", former $gp; | ||
| 38 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
| 39 | # old code]; | ||
| 40 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
| 41 | # | ||
| 42 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
| 43 | # | ||
| 44 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
| 45 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 46 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
| 47 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
| 48 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
| 49 | # | ||
| 50 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
| 51 | |||
| 52 | if ($flavour =~ /64|n32/i) { | ||
| 53 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
| 54 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
| 55 | $REG_S="sd"; | ||
| 56 | $REG_L="ld"; | ||
| 57 | $PTR_SLL="dsll"; # incidentally works even on n32 | ||
| 58 | $SZREG=8; | ||
| 59 | } else { | ||
| 60 | $PTR_ADD="add"; | ||
| 61 | $PTR_SUB="sub"; | ||
| 62 | $REG_S="sw"; | ||
| 63 | $REG_L="lw"; | ||
| 64 | $PTR_SLL="sll"; | ||
| 65 | $SZREG=4; | ||
| 66 | } | ||
| 67 | $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; | ||
| 68 | # | ||
| 69 | # <appro@openssl.org> | ||
| 70 | # | ||
| 71 | ###################################################################### | ||
| 72 | |||
| 73 | $big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; | ||
| 74 | |||
| 75 | for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } | ||
| 76 | open STDOUT,">$output"; | ||
| 77 | |||
| 78 | if (!defined($big_endian)) | ||
| 79 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 80 | |||
| 81 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 82 | open STDOUT,">$output"; | ||
| 83 | |||
| 84 | my ($MSB,$LSB)=(0,3); # automatically converted to little-endian | ||
| 85 | |||
| 86 | $code.=<<___; | ||
| 87 | .text | ||
| 88 | #ifdef OPENSSL_FIPSCANISTER | ||
| 89 | # include <openssl/fipssyms.h> | ||
| 90 | #endif | ||
| 91 | |||
| 92 | #if !defined(__vxworks) || defined(__pic__) | ||
| 93 | .option pic2 | ||
| 94 | #endif | ||
| 95 | .set noat | ||
| 96 | ___ | ||
| 97 | |||
| 98 | {{{ | ||
| 99 | my $FRAMESIZE=16*$SZREG; | ||
| 100 | my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; | ||
| 101 | |||
| 102 | my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7); | ||
| 103 | my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); | ||
| 104 | my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23)); | ||
| 105 | my ($key0,$cnt)=($gp,$fp); | ||
| 106 | |||
| 107 | # instuction ordering is "stolen" from output from MIPSpro assembler | ||
| 108 | # invoked with -mips3 -O3 arguments... | ||
| 109 | $code.=<<___; | ||
| 110 | .align 5 | ||
| 111 | .ent _mips_AES_encrypt | ||
| 112 | _mips_AES_encrypt: | ||
| 113 | .frame $sp,0,$ra | ||
| 114 | .set reorder | ||
| 115 | lw $t0,0($key) | ||
| 116 | lw $t1,4($key) | ||
| 117 | lw $t2,8($key) | ||
| 118 | lw $t3,12($key) | ||
| 119 | lw $cnt,240($key) | ||
| 120 | $PTR_ADD $key0,$key,16 | ||
| 121 | |||
| 122 | xor $s0,$t0 | ||
| 123 | xor $s1,$t1 | ||
| 124 | xor $s2,$t2 | ||
| 125 | xor $s3,$t3 | ||
| 126 | |||
| 127 | sub $cnt,1 | ||
| 128 | _xtr $i0,$s1,16-2 | ||
| 129 | .Loop_enc: | ||
| 130 | _xtr $i1,$s2,16-2 | ||
| 131 | _xtr $i2,$s3,16-2 | ||
| 132 | _xtr $i3,$s0,16-2 | ||
| 133 | and $i0,0x3fc | ||
| 134 | and $i1,0x3fc | ||
| 135 | and $i2,0x3fc | ||
| 136 | and $i3,0x3fc | ||
| 137 | $PTR_ADD $i0,$Tbl | ||
| 138 | $PTR_ADD $i1,$Tbl | ||
| 139 | $PTR_ADD $i2,$Tbl | ||
| 140 | $PTR_ADD $i3,$Tbl | ||
| 141 | lwl $t0,3($i0) # Te1[s1>>16] | ||
| 142 | lwl $t1,3($i1) # Te1[s2>>16] | ||
| 143 | lwl $t2,3($i2) # Te1[s3>>16] | ||
| 144 | lwl $t3,3($i3) # Te1[s0>>16] | ||
| 145 | lwr $t0,2($i0) # Te1[s1>>16] | ||
| 146 | lwr $t1,2($i1) # Te1[s2>>16] | ||
| 147 | lwr $t2,2($i2) # Te1[s3>>16] | ||
| 148 | lwr $t3,2($i3) # Te1[s0>>16] | ||
| 149 | |||
| 150 | _xtr $i0,$s2,8-2 | ||
| 151 | _xtr $i1,$s3,8-2 | ||
| 152 | _xtr $i2,$s0,8-2 | ||
| 153 | _xtr $i3,$s1,8-2 | ||
| 154 | and $i0,0x3fc | ||
| 155 | and $i1,0x3fc | ||
| 156 | and $i2,0x3fc | ||
| 157 | and $i3,0x3fc | ||
| 158 | $PTR_ADD $i0,$Tbl | ||
| 159 | $PTR_ADD $i1,$Tbl | ||
| 160 | $PTR_ADD $i2,$Tbl | ||
| 161 | $PTR_ADD $i3,$Tbl | ||
| 162 | lwl $t4,2($i0) # Te2[s2>>8] | ||
| 163 | lwl $t5,2($i1) # Te2[s3>>8] | ||
| 164 | lwl $t6,2($i2) # Te2[s0>>8] | ||
| 165 | lwl $t7,2($i3) # Te2[s1>>8] | ||
| 166 | lwr $t4,1($i0) # Te2[s2>>8] | ||
| 167 | lwr $t5,1($i1) # Te2[s3>>8] | ||
| 168 | lwr $t6,1($i2) # Te2[s0>>8] | ||
| 169 | lwr $t7,1($i3) # Te2[s1>>8] | ||
| 170 | |||
| 171 | _xtr $i0,$s3,0-2 | ||
| 172 | _xtr $i1,$s0,0-2 | ||
| 173 | _xtr $i2,$s1,0-2 | ||
| 174 | _xtr $i3,$s2,0-2 | ||
| 175 | and $i0,0x3fc | ||
| 176 | and $i1,0x3fc | ||
| 177 | and $i2,0x3fc | ||
| 178 | and $i3,0x3fc | ||
| 179 | $PTR_ADD $i0,$Tbl | ||
| 180 | $PTR_ADD $i1,$Tbl | ||
| 181 | $PTR_ADD $i2,$Tbl | ||
| 182 | $PTR_ADD $i3,$Tbl | ||
| 183 | lwl $t8,1($i0) # Te3[s3] | ||
| 184 | lwl $t9,1($i1) # Te3[s0] | ||
| 185 | lwl $t10,1($i2) # Te3[s1] | ||
| 186 | lwl $t11,1($i3) # Te3[s2] | ||
| 187 | lwr $t8,0($i0) # Te3[s3] | ||
| 188 | lwr $t9,0($i1) # Te3[s0] | ||
| 189 | lwr $t10,0($i2) # Te3[s1] | ||
| 190 | lwr $t11,0($i3) # Te3[s2] | ||
| 191 | |||
| 192 | _xtr $i0,$s0,24-2 | ||
| 193 | _xtr $i1,$s1,24-2 | ||
| 194 | _xtr $i2,$s2,24-2 | ||
| 195 | _xtr $i3,$s3,24-2 | ||
| 196 | and $i0,0x3fc | ||
| 197 | and $i1,0x3fc | ||
| 198 | and $i2,0x3fc | ||
| 199 | and $i3,0x3fc | ||
| 200 | $PTR_ADD $i0,$Tbl | ||
| 201 | $PTR_ADD $i1,$Tbl | ||
| 202 | $PTR_ADD $i2,$Tbl | ||
| 203 | $PTR_ADD $i3,$Tbl | ||
| 204 | xor $t0,$t4 | ||
| 205 | xor $t1,$t5 | ||
| 206 | xor $t2,$t6 | ||
| 207 | xor $t3,$t7 | ||
| 208 | lw $t4,0($i0) # Te0[s0>>24] | ||
| 209 | lw $t5,0($i1) # Te0[s1>>24] | ||
| 210 | lw $t6,0($i2) # Te0[s2>>24] | ||
| 211 | lw $t7,0($i3) # Te0[s3>>24] | ||
| 212 | |||
| 213 | lw $s0,0($key0) | ||
| 214 | lw $s1,4($key0) | ||
| 215 | lw $s2,8($key0) | ||
| 216 | lw $s3,12($key0) | ||
| 217 | |||
| 218 | xor $t0,$t8 | ||
| 219 | xor $t1,$t9 | ||
| 220 | xor $t2,$t10 | ||
| 221 | xor $t3,$t11 | ||
| 222 | |||
| 223 | xor $t0,$t4 | ||
| 224 | xor $t1,$t5 | ||
| 225 | xor $t2,$t6 | ||
| 226 | xor $t3,$t7 | ||
| 227 | |||
| 228 | sub $cnt,1 | ||
| 229 | $PTR_ADD $key0,16 | ||
| 230 | xor $s0,$t0 | ||
| 231 | xor $s1,$t1 | ||
| 232 | xor $s2,$t2 | ||
| 233 | xor $s3,$t3 | ||
| 234 | .set noreorder | ||
| 235 | bnez $cnt,.Loop_enc | ||
| 236 | _xtr $i0,$s1,16-2 | ||
| 237 | |||
| 238 | .set reorder | ||
| 239 | _xtr $i1,$s2,16-2 | ||
| 240 | _xtr $i2,$s3,16-2 | ||
| 241 | _xtr $i3,$s0,16-2 | ||
| 242 | and $i0,0x3fc | ||
| 243 | and $i1,0x3fc | ||
| 244 | and $i2,0x3fc | ||
| 245 | and $i3,0x3fc | ||
| 246 | $PTR_ADD $i0,$Tbl | ||
| 247 | $PTR_ADD $i1,$Tbl | ||
| 248 | $PTR_ADD $i2,$Tbl | ||
| 249 | $PTR_ADD $i3,$Tbl | ||
| 250 | lbu $t0,2($i0) # Te4[s1>>16] | ||
| 251 | lbu $t1,2($i1) # Te4[s2>>16] | ||
| 252 | lbu $t2,2($i2) # Te4[s3>>16] | ||
| 253 | lbu $t3,2($i3) # Te4[s0>>16] | ||
| 254 | |||
| 255 | _xtr $i0,$s2,8-2 | ||
| 256 | _xtr $i1,$s3,8-2 | ||
| 257 | _xtr $i2,$s0,8-2 | ||
| 258 | _xtr $i3,$s1,8-2 | ||
| 259 | and $i0,0x3fc | ||
| 260 | and $i1,0x3fc | ||
| 261 | and $i2,0x3fc | ||
| 262 | and $i3,0x3fc | ||
| 263 | $PTR_ADD $i0,$Tbl | ||
| 264 | $PTR_ADD $i1,$Tbl | ||
| 265 | $PTR_ADD $i2,$Tbl | ||
| 266 | $PTR_ADD $i3,$Tbl | ||
| 267 | lbu $t4,2($i0) # Te4[s2>>8] | ||
| 268 | lbu $t5,2($i1) # Te4[s3>>8] | ||
| 269 | lbu $t6,2($i2) # Te4[s0>>8] | ||
| 270 | lbu $t7,2($i3) # Te4[s1>>8] | ||
| 271 | |||
| 272 | _xtr $i0,$s0,24-2 | ||
| 273 | _xtr $i1,$s1,24-2 | ||
| 274 | _xtr $i2,$s2,24-2 | ||
| 275 | _xtr $i3,$s3,24-2 | ||
| 276 | and $i0,0x3fc | ||
| 277 | and $i1,0x3fc | ||
| 278 | and $i2,0x3fc | ||
| 279 | and $i3,0x3fc | ||
| 280 | $PTR_ADD $i0,$Tbl | ||
| 281 | $PTR_ADD $i1,$Tbl | ||
| 282 | $PTR_ADD $i2,$Tbl | ||
| 283 | $PTR_ADD $i3,$Tbl | ||
| 284 | lbu $t8,2($i0) # Te4[s0>>24] | ||
| 285 | lbu $t9,2($i1) # Te4[s1>>24] | ||
| 286 | lbu $t10,2($i2) # Te4[s2>>24] | ||
| 287 | lbu $t11,2($i3) # Te4[s3>>24] | ||
| 288 | |||
| 289 | _xtr $i0,$s3,0-2 | ||
| 290 | _xtr $i1,$s0,0-2 | ||
| 291 | _xtr $i2,$s1,0-2 | ||
| 292 | _xtr $i3,$s2,0-2 | ||
| 293 | and $i0,0x3fc | ||
| 294 | and $i1,0x3fc | ||
| 295 | and $i2,0x3fc | ||
| 296 | and $i3,0x3fc | ||
| 297 | |||
| 298 | _ins $t0,16 | ||
| 299 | _ins $t1,16 | ||
| 300 | _ins $t2,16 | ||
| 301 | _ins $t3,16 | ||
| 302 | |||
| 303 | _ins $t4,8 | ||
| 304 | _ins $t5,8 | ||
| 305 | _ins $t6,8 | ||
| 306 | _ins $t7,8 | ||
| 307 | |||
| 308 | xor $t0,$t4 | ||
| 309 | xor $t1,$t5 | ||
| 310 | xor $t2,$t6 | ||
| 311 | xor $t3,$t7 | ||
| 312 | |||
| 313 | $PTR_ADD $i0,$Tbl | ||
| 314 | $PTR_ADD $i1,$Tbl | ||
| 315 | $PTR_ADD $i2,$Tbl | ||
| 316 | $PTR_ADD $i3,$Tbl | ||
| 317 | lbu $t4,2($i0) # Te4[s3] | ||
| 318 | lbu $t5,2($i1) # Te4[s0] | ||
| 319 | lbu $t6,2($i2) # Te4[s1] | ||
| 320 | lbu $t7,2($i3) # Te4[s2] | ||
| 321 | |||
| 322 | _ins $t8,24 | ||
| 323 | _ins $t9,24 | ||
| 324 | _ins $t10,24 | ||
| 325 | _ins $t11,24 | ||
| 326 | |||
| 327 | lw $s0,0($key0) | ||
| 328 | lw $s1,4($key0) | ||
| 329 | lw $s2,8($key0) | ||
| 330 | lw $s3,12($key0) | ||
| 331 | |||
| 332 | xor $t0,$t8 | ||
| 333 | xor $t1,$t9 | ||
| 334 | xor $t2,$t10 | ||
| 335 | xor $t3,$t11 | ||
| 336 | |||
| 337 | _ins $t4,0 | ||
| 338 | _ins $t5,0 | ||
| 339 | _ins $t6,0 | ||
| 340 | _ins $t7,0 | ||
| 341 | |||
| 342 | xor $t0,$t4 | ||
| 343 | xor $t1,$t5 | ||
| 344 | xor $t2,$t6 | ||
| 345 | xor $t3,$t7 | ||
| 346 | |||
| 347 | xor $s0,$t0 | ||
| 348 | xor $s1,$t1 | ||
| 349 | xor $s2,$t2 | ||
| 350 | xor $s3,$t3 | ||
| 351 | |||
| 352 | jr $ra | ||
| 353 | .end _mips_AES_encrypt | ||
| 354 | |||
| 355 | .align 5 | ||
| 356 | .globl AES_encrypt | ||
| 357 | .ent AES_encrypt | ||
| 358 | AES_encrypt: | ||
| 359 | .frame $sp,$FRAMESIZE,$ra | ||
| 360 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 361 | .set noreorder | ||
| 362 | ___ | ||
| 363 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
| 364 | .cpload $pf | ||
| 365 | ___ | ||
| 366 | $code.=<<___; | ||
| 367 | $PTR_SUB $sp,$FRAMESIZE | ||
| 368 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 369 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 370 | $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 371 | $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 372 | $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 373 | $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 374 | $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 375 | $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 376 | $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 377 | $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 378 | ___ | ||
| 379 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 380 | $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) | ||
| 381 | $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) | ||
| 382 | $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) | ||
| 383 | $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) | ||
| 384 | $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 385 | ___ | ||
| 386 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
| 387 | .cplocal $Tbl | ||
| 388 | .cpsetup $pf,$zero,AES_encrypt | ||
| 389 | ___ | ||
| 390 | $code.=<<___; | ||
| 391 | .set reorder | ||
| 392 | la $Tbl,AES_Te # PIC-ified 'load address' | ||
| 393 | |||
| 394 | lwl $s0,0+$MSB($inp) | ||
| 395 | lwl $s1,4+$MSB($inp) | ||
| 396 | lwl $s2,8+$MSB($inp) | ||
| 397 | lwl $s3,12+$MSB($inp) | ||
| 398 | lwr $s0,0+$LSB($inp) | ||
| 399 | lwr $s1,4+$LSB($inp) | ||
| 400 | lwr $s2,8+$LSB($inp) | ||
| 401 | lwr $s3,12+$LSB($inp) | ||
| 402 | |||
| 403 | bal _mips_AES_encrypt | ||
| 404 | |||
| 405 | swr $s0,0+$LSB($out) | ||
| 406 | swr $s1,4+$LSB($out) | ||
| 407 | swr $s2,8+$LSB($out) | ||
| 408 | swr $s3,12+$LSB($out) | ||
| 409 | swl $s0,0+$MSB($out) | ||
| 410 | swl $s1,4+$MSB($out) | ||
| 411 | swl $s2,8+$MSB($out) | ||
| 412 | swl $s3,12+$MSB($out) | ||
| 413 | |||
| 414 | .set noreorder | ||
| 415 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 416 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 417 | $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 418 | $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 419 | $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 420 | $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 421 | $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 422 | $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 423 | $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 424 | $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 425 | ___ | ||
| 426 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 427 | $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) | ||
| 428 | $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) | ||
| 429 | $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) | ||
| 430 | $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) | ||
| 431 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 432 | ___ | ||
| 433 | $code.=<<___; | ||
| 434 | jr $ra | ||
| 435 | $PTR_ADD $sp,$FRAMESIZE | ||
| 436 | .end AES_encrypt | ||
| 437 | ___ | ||
| 438 | |||
| 439 | $code.=<<___; | ||
| 440 | .align 5 | ||
| 441 | .ent _mips_AES_decrypt | ||
| 442 | _mips_AES_decrypt: | ||
| 443 | .frame $sp,0,$ra | ||
| 444 | .set reorder | ||
| 445 | lw $t0,0($key) | ||
| 446 | lw $t1,4($key) | ||
| 447 | lw $t2,8($key) | ||
| 448 | lw $t3,12($key) | ||
| 449 | lw $cnt,240($key) | ||
| 450 | $PTR_ADD $key0,$key,16 | ||
| 451 | |||
| 452 | xor $s0,$t0 | ||
| 453 | xor $s1,$t1 | ||
| 454 | xor $s2,$t2 | ||
| 455 | xor $s3,$t3 | ||
| 456 | |||
| 457 | sub $cnt,1 | ||
| 458 | _xtr $i0,$s3,16-2 | ||
| 459 | .Loop_dec: | ||
| 460 | _xtr $i1,$s0,16-2 | ||
| 461 | _xtr $i2,$s1,16-2 | ||
| 462 | _xtr $i3,$s2,16-2 | ||
| 463 | and $i0,0x3fc | ||
| 464 | and $i1,0x3fc | ||
| 465 | and $i2,0x3fc | ||
| 466 | and $i3,0x3fc | ||
| 467 | $PTR_ADD $i0,$Tbl | ||
| 468 | $PTR_ADD $i1,$Tbl | ||
| 469 | $PTR_ADD $i2,$Tbl | ||
| 470 | $PTR_ADD $i3,$Tbl | ||
| 471 | lwl $t0,3($i0) # Td1[s3>>16] | ||
| 472 | lwl $t1,3($i1) # Td1[s0>>16] | ||
| 473 | lwl $t2,3($i2) # Td1[s1>>16] | ||
| 474 | lwl $t3,3($i3) # Td1[s2>>16] | ||
| 475 | lwr $t0,2($i0) # Td1[s3>>16] | ||
| 476 | lwr $t1,2($i1) # Td1[s0>>16] | ||
| 477 | lwr $t2,2($i2) # Td1[s1>>16] | ||
| 478 | lwr $t3,2($i3) # Td1[s2>>16] | ||
| 479 | |||
| 480 | _xtr $i0,$s2,8-2 | ||
| 481 | _xtr $i1,$s3,8-2 | ||
| 482 | _xtr $i2,$s0,8-2 | ||
| 483 | _xtr $i3,$s1,8-2 | ||
| 484 | and $i0,0x3fc | ||
| 485 | and $i1,0x3fc | ||
| 486 | and $i2,0x3fc | ||
| 487 | and $i3,0x3fc | ||
| 488 | $PTR_ADD $i0,$Tbl | ||
| 489 | $PTR_ADD $i1,$Tbl | ||
| 490 | $PTR_ADD $i2,$Tbl | ||
| 491 | $PTR_ADD $i3,$Tbl | ||
| 492 | lwl $t4,2($i0) # Td2[s2>>8] | ||
| 493 | lwl $t5,2($i1) # Td2[s3>>8] | ||
| 494 | lwl $t6,2($i2) # Td2[s0>>8] | ||
| 495 | lwl $t7,2($i3) # Td2[s1>>8] | ||
| 496 | lwr $t4,1($i0) # Td2[s2>>8] | ||
| 497 | lwr $t5,1($i1) # Td2[s3>>8] | ||
| 498 | lwr $t6,1($i2) # Td2[s0>>8] | ||
| 499 | lwr $t7,1($i3) # Td2[s1>>8] | ||
| 500 | |||
| 501 | _xtr $i0,$s1,0-2 | ||
| 502 | _xtr $i1,$s2,0-2 | ||
| 503 | _xtr $i2,$s3,0-2 | ||
| 504 | _xtr $i3,$s0,0-2 | ||
| 505 | and $i0,0x3fc | ||
| 506 | and $i1,0x3fc | ||
| 507 | and $i2,0x3fc | ||
| 508 | and $i3,0x3fc | ||
| 509 | $PTR_ADD $i0,$Tbl | ||
| 510 | $PTR_ADD $i1,$Tbl | ||
| 511 | $PTR_ADD $i2,$Tbl | ||
| 512 | $PTR_ADD $i3,$Tbl | ||
| 513 | lwl $t8,1($i0) # Td3[s1] | ||
| 514 | lwl $t9,1($i1) # Td3[s2] | ||
| 515 | lwl $t10,1($i2) # Td3[s3] | ||
| 516 | lwl $t11,1($i3) # Td3[s0] | ||
| 517 | lwr $t8,0($i0) # Td3[s1] | ||
| 518 | lwr $t9,0($i1) # Td3[s2] | ||
| 519 | lwr $t10,0($i2) # Td3[s3] | ||
| 520 | lwr $t11,0($i3) # Td3[s0] | ||
| 521 | |||
| 522 | _xtr $i0,$s0,24-2 | ||
| 523 | _xtr $i1,$s1,24-2 | ||
| 524 | _xtr $i2,$s2,24-2 | ||
| 525 | _xtr $i3,$s3,24-2 | ||
| 526 | and $i0,0x3fc | ||
| 527 | and $i1,0x3fc | ||
| 528 | and $i2,0x3fc | ||
| 529 | and $i3,0x3fc | ||
| 530 | $PTR_ADD $i0,$Tbl | ||
| 531 | $PTR_ADD $i1,$Tbl | ||
| 532 | $PTR_ADD $i2,$Tbl | ||
| 533 | $PTR_ADD $i3,$Tbl | ||
| 534 | |||
| 535 | xor $t0,$t4 | ||
| 536 | xor $t1,$t5 | ||
| 537 | xor $t2,$t6 | ||
| 538 | xor $t3,$t7 | ||
| 539 | |||
| 540 | |||
| 541 | lw $t4,0($i0) # Td0[s0>>24] | ||
| 542 | lw $t5,0($i1) # Td0[s1>>24] | ||
| 543 | lw $t6,0($i2) # Td0[s2>>24] | ||
| 544 | lw $t7,0($i3) # Td0[s3>>24] | ||
| 545 | |||
| 546 | lw $s0,0($key0) | ||
| 547 | lw $s1,4($key0) | ||
| 548 | lw $s2,8($key0) | ||
| 549 | lw $s3,12($key0) | ||
| 550 | |||
| 551 | xor $t0,$t8 | ||
| 552 | xor $t1,$t9 | ||
| 553 | xor $t2,$t10 | ||
| 554 | xor $t3,$t11 | ||
| 555 | |||
| 556 | xor $t0,$t4 | ||
| 557 | xor $t1,$t5 | ||
| 558 | xor $t2,$t6 | ||
| 559 | xor $t3,$t7 | ||
| 560 | |||
| 561 | sub $cnt,1 | ||
| 562 | $PTR_ADD $key0,16 | ||
| 563 | xor $s0,$t0 | ||
| 564 | xor $s1,$t1 | ||
| 565 | xor $s2,$t2 | ||
| 566 | xor $s3,$t3 | ||
| 567 | .set noreorder | ||
| 568 | bnez $cnt,.Loop_dec | ||
| 569 | _xtr $i0,$s3,16-2 | ||
| 570 | |||
| 571 | .set reorder | ||
| 572 | lw $t4,1024($Tbl) # prefetch Td4 | ||
| 573 | lw $t5,1024+32($Tbl) | ||
| 574 | lw $t6,1024+64($Tbl) | ||
| 575 | lw $t7,1024+96($Tbl) | ||
| 576 | lw $t8,1024+128($Tbl) | ||
| 577 | lw $t9,1024+160($Tbl) | ||
| 578 | lw $t10,1024+192($Tbl) | ||
| 579 | lw $t11,1024+224($Tbl) | ||
| 580 | |||
| 581 | _xtr $i0,$s3,16 | ||
| 582 | _xtr $i1,$s0,16 | ||
| 583 | _xtr $i2,$s1,16 | ||
| 584 | _xtr $i3,$s2,16 | ||
| 585 | and $i0,0xff | ||
| 586 | and $i1,0xff | ||
| 587 | and $i2,0xff | ||
| 588 | and $i3,0xff | ||
| 589 | $PTR_ADD $i0,$Tbl | ||
| 590 | $PTR_ADD $i1,$Tbl | ||
| 591 | $PTR_ADD $i2,$Tbl | ||
| 592 | $PTR_ADD $i3,$Tbl | ||
| 593 | lbu $t0,1024($i0) # Td4[s3>>16] | ||
| 594 | lbu $t1,1024($i1) # Td4[s0>>16] | ||
| 595 | lbu $t2,1024($i2) # Td4[s1>>16] | ||
| 596 | lbu $t3,1024($i3) # Td4[s2>>16] | ||
| 597 | |||
| 598 | _xtr $i0,$s2,8 | ||
| 599 | _xtr $i1,$s3,8 | ||
| 600 | _xtr $i2,$s0,8 | ||
| 601 | _xtr $i3,$s1,8 | ||
| 602 | and $i0,0xff | ||
| 603 | and $i1,0xff | ||
| 604 | and $i2,0xff | ||
| 605 | and $i3,0xff | ||
| 606 | $PTR_ADD $i0,$Tbl | ||
| 607 | $PTR_ADD $i1,$Tbl | ||
| 608 | $PTR_ADD $i2,$Tbl | ||
| 609 | $PTR_ADD $i3,$Tbl | ||
| 610 | lbu $t4,1024($i0) # Td4[s2>>8] | ||
| 611 | lbu $t5,1024($i1) # Td4[s3>>8] | ||
| 612 | lbu $t6,1024($i2) # Td4[s0>>8] | ||
| 613 | lbu $t7,1024($i3) # Td4[s1>>8] | ||
| 614 | |||
| 615 | _xtr $i0,$s0,24 | ||
| 616 | _xtr $i1,$s1,24 | ||
| 617 | _xtr $i2,$s2,24 | ||
| 618 | _xtr $i3,$s3,24 | ||
| 619 | $PTR_ADD $i0,$Tbl | ||
| 620 | $PTR_ADD $i1,$Tbl | ||
| 621 | $PTR_ADD $i2,$Tbl | ||
| 622 | $PTR_ADD $i3,$Tbl | ||
| 623 | lbu $t8,1024($i0) # Td4[s0>>24] | ||
| 624 | lbu $t9,1024($i1) # Td4[s1>>24] | ||
| 625 | lbu $t10,1024($i2) # Td4[s2>>24] | ||
| 626 | lbu $t11,1024($i3) # Td4[s3>>24] | ||
| 627 | |||
| 628 | _xtr $i0,$s1,0 | ||
| 629 | _xtr $i1,$s2,0 | ||
| 630 | _xtr $i2,$s3,0 | ||
| 631 | _xtr $i3,$s0,0 | ||
| 632 | |||
| 633 | _ins $t0,16 | ||
| 634 | _ins $t1,16 | ||
| 635 | _ins $t2,16 | ||
| 636 | _ins $t3,16 | ||
| 637 | |||
| 638 | _ins $t4,8 | ||
| 639 | _ins $t5,8 | ||
| 640 | _ins $t6,8 | ||
| 641 | _ins $t7,8 | ||
| 642 | |||
| 643 | xor $t0,$t4 | ||
| 644 | xor $t1,$t5 | ||
| 645 | xor $t2,$t6 | ||
| 646 | xor $t3,$t7 | ||
| 647 | |||
| 648 | $PTR_ADD $i0,$Tbl | ||
| 649 | $PTR_ADD $i1,$Tbl | ||
| 650 | $PTR_ADD $i2,$Tbl | ||
| 651 | $PTR_ADD $i3,$Tbl | ||
| 652 | lbu $t4,1024($i0) # Td4[s1] | ||
| 653 | lbu $t5,1024($i1) # Td4[s2] | ||
| 654 | lbu $t6,1024($i2) # Td4[s3] | ||
| 655 | lbu $t7,1024($i3) # Td4[s0] | ||
| 656 | |||
| 657 | _ins $t8,24 | ||
| 658 | _ins $t9,24 | ||
| 659 | _ins $t10,24 | ||
| 660 | _ins $t11,24 | ||
| 661 | |||
| 662 | lw $s0,0($key0) | ||
| 663 | lw $s1,4($key0) | ||
| 664 | lw $s2,8($key0) | ||
| 665 | lw $s3,12($key0) | ||
| 666 | |||
| 667 | _ins $t4,0 | ||
| 668 | _ins $t5,0 | ||
| 669 | _ins $t6,0 | ||
| 670 | _ins $t7,0 | ||
| 671 | |||
| 672 | |||
| 673 | xor $t0,$t8 | ||
| 674 | xor $t1,$t9 | ||
| 675 | xor $t2,$t10 | ||
| 676 | xor $t3,$t11 | ||
| 677 | |||
| 678 | xor $t0,$t4 | ||
| 679 | xor $t1,$t5 | ||
| 680 | xor $t2,$t6 | ||
| 681 | xor $t3,$t7 | ||
| 682 | |||
| 683 | xor $s0,$t0 | ||
| 684 | xor $s1,$t1 | ||
| 685 | xor $s2,$t2 | ||
| 686 | xor $s3,$t3 | ||
| 687 | |||
| 688 | jr $ra | ||
| 689 | .end _mips_AES_decrypt | ||
| 690 | |||
| 691 | .align 5 | ||
| 692 | .globl AES_decrypt | ||
| 693 | .ent AES_decrypt | ||
| 694 | AES_decrypt: | ||
| 695 | .frame $sp,$FRAMESIZE,$ra | ||
| 696 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 697 | .set noreorder | ||
| 698 | ___ | ||
| 699 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
| 700 | .cpload $pf | ||
| 701 | ___ | ||
| 702 | $code.=<<___; | ||
| 703 | $PTR_SUB $sp,$FRAMESIZE | ||
| 704 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 705 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 706 | $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 707 | $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 708 | $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 709 | $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 710 | $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 711 | $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 712 | $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 713 | $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 714 | ___ | ||
| 715 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 716 | $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) | ||
| 717 | $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) | ||
| 718 | $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) | ||
| 719 | $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) | ||
| 720 | $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 721 | ___ | ||
| 722 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
| 723 | .cplocal $Tbl | ||
| 724 | .cpsetup $pf,$zero,AES_decrypt | ||
| 725 | ___ | ||
| 726 | $code.=<<___; | ||
| 727 | .set reorder | ||
| 728 | la $Tbl,AES_Td # PIC-ified 'load address' | ||
| 729 | |||
| 730 | lwl $s0,0+$MSB($inp) | ||
| 731 | lwl $s1,4+$MSB($inp) | ||
| 732 | lwl $s2,8+$MSB($inp) | ||
| 733 | lwl $s3,12+$MSB($inp) | ||
| 734 | lwr $s0,0+$LSB($inp) | ||
| 735 | lwr $s1,4+$LSB($inp) | ||
| 736 | lwr $s2,8+$LSB($inp) | ||
| 737 | lwr $s3,12+$LSB($inp) | ||
| 738 | |||
| 739 | bal _mips_AES_decrypt | ||
| 740 | |||
| 741 | swr $s0,0+$LSB($out) | ||
| 742 | swr $s1,4+$LSB($out) | ||
| 743 | swr $s2,8+$LSB($out) | ||
| 744 | swr $s3,12+$LSB($out) | ||
| 745 | swl $s0,0+$MSB($out) | ||
| 746 | swl $s1,4+$MSB($out) | ||
| 747 | swl $s2,8+$MSB($out) | ||
| 748 | swl $s3,12+$MSB($out) | ||
| 749 | |||
| 750 | .set noreorder | ||
| 751 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 752 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 753 | $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 754 | $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 755 | $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 756 | $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 757 | $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 758 | $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 759 | $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 760 | $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 761 | ___ | ||
| 762 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 763 | $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) | ||
| 764 | $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) | ||
| 765 | $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) | ||
| 766 | $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) | ||
| 767 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 768 | ___ | ||
| 769 | $code.=<<___; | ||
| 770 | jr $ra | ||
| 771 | $PTR_ADD $sp,$FRAMESIZE | ||
| 772 | .end AES_decrypt | ||
| 773 | ___ | ||
| 774 | }}} | ||
| 775 | |||
| 776 | {{{ | ||
| 777 | my $FRAMESIZE=8*$SZREG; | ||
| 778 | my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000; | ||
| 779 | |||
| 780 | my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3); | ||
| 781 | my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); | ||
| 782 | my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); | ||
| 783 | my ($rcon,$cnt)=($gp,$fp); | ||
| 784 | |||
| 785 | $code.=<<___; | ||
| 786 | .align 5 | ||
| 787 | .ent _mips_AES_set_encrypt_key | ||
| 788 | _mips_AES_set_encrypt_key: | ||
| 789 | .frame $sp,0,$ra | ||
| 790 | .set noreorder | ||
| 791 | beqz $inp,.Lekey_done | ||
| 792 | li $t0,-1 | ||
| 793 | beqz $key,.Lekey_done | ||
| 794 | $PTR_ADD $rcon,$Tbl,1024+256 | ||
| 795 | |||
| 796 | .set reorder | ||
| 797 | lwl $rk0,0+$MSB($inp) # load 128 bits | ||
| 798 | lwl $rk1,4+$MSB($inp) | ||
| 799 | lwl $rk2,8+$MSB($inp) | ||
| 800 | lwl $rk3,12+$MSB($inp) | ||
| 801 | li $at,128 | ||
| 802 | lwr $rk0,0+$LSB($inp) | ||
| 803 | lwr $rk1,4+$LSB($inp) | ||
| 804 | lwr $rk2,8+$LSB($inp) | ||
| 805 | lwr $rk3,12+$LSB($inp) | ||
| 806 | .set noreorder | ||
| 807 | beq $bits,$at,.L128bits | ||
| 808 | li $cnt,10 | ||
| 809 | |||
| 810 | .set reorder | ||
| 811 | lwl $rk4,16+$MSB($inp) # load 192 bits | ||
| 812 | lwl $rk5,20+$MSB($inp) | ||
| 813 | li $at,192 | ||
| 814 | lwr $rk4,16+$LSB($inp) | ||
| 815 | lwr $rk5,20+$LSB($inp) | ||
| 816 | .set noreorder | ||
| 817 | beq $bits,$at,.L192bits | ||
| 818 | li $cnt,8 | ||
| 819 | |||
| 820 | .set reorder | ||
| 821 | lwl $rk6,24+$MSB($inp) # load 256 bits | ||
| 822 | lwl $rk7,28+$MSB($inp) | ||
| 823 | li $at,256 | ||
| 824 | lwr $rk6,24+$LSB($inp) | ||
| 825 | lwr $rk7,28+$LSB($inp) | ||
| 826 | .set noreorder | ||
| 827 | beq $bits,$at,.L256bits | ||
| 828 | li $cnt,7 | ||
| 829 | |||
| 830 | b .Lekey_done | ||
| 831 | li $t0,-2 | ||
| 832 | |||
| 833 | .align 4 | ||
| 834 | .L128bits: | ||
| 835 | .set reorder | ||
| 836 | srl $i0,$rk3,16 | ||
| 837 | srl $i1,$rk3,8 | ||
| 838 | and $i0,0xff | ||
| 839 | and $i1,0xff | ||
| 840 | and $i2,$rk3,0xff | ||
| 841 | srl $i3,$rk3,24 | ||
| 842 | $PTR_ADD $i0,$Tbl | ||
| 843 | $PTR_ADD $i1,$Tbl | ||
| 844 | $PTR_ADD $i2,$Tbl | ||
| 845 | $PTR_ADD $i3,$Tbl | ||
| 846 | lbu $i0,1024($i0) | ||
| 847 | lbu $i1,1024($i1) | ||
| 848 | lbu $i2,1024($i2) | ||
| 849 | lbu $i3,1024($i3) | ||
| 850 | |||
| 851 | sw $rk0,0($key) | ||
| 852 | sw $rk1,4($key) | ||
| 853 | sw $rk2,8($key) | ||
| 854 | sw $rk3,12($key) | ||
| 855 | sub $cnt,1 | ||
| 856 | $PTR_ADD $key,16 | ||
| 857 | |||
| 858 | _bias $i0,24 | ||
| 859 | _bias $i1,16 | ||
| 860 | _bias $i2,8 | ||
| 861 | _bias $i3,0 | ||
| 862 | |||
| 863 | xor $rk0,$i0 | ||
| 864 | lw $i0,0($rcon) | ||
| 865 | xor $rk0,$i1 | ||
| 866 | xor $rk0,$i2 | ||
| 867 | xor $rk0,$i3 | ||
| 868 | xor $rk0,$i0 | ||
| 869 | |||
| 870 | xor $rk1,$rk0 | ||
| 871 | xor $rk2,$rk1 | ||
| 872 | xor $rk3,$rk2 | ||
| 873 | |||
| 874 | .set noreorder | ||
| 875 | bnez $cnt,.L128bits | ||
| 876 | $PTR_ADD $rcon,4 | ||
| 877 | |||
| 878 | sw $rk0,0($key) | ||
| 879 | sw $rk1,4($key) | ||
| 880 | sw $rk2,8($key) | ||
| 881 | li $cnt,10 | ||
| 882 | sw $rk3,12($key) | ||
| 883 | li $t0,0 | ||
| 884 | sw $cnt,80($key) | ||
| 885 | b .Lekey_done | ||
| 886 | $PTR_SUB $key,10*16 | ||
| 887 | |||
| 888 | .align 4 | ||
| 889 | .L192bits: | ||
| 890 | .set reorder | ||
| 891 | srl $i0,$rk5,16 | ||
| 892 | srl $i1,$rk5,8 | ||
| 893 | and $i0,0xff | ||
| 894 | and $i1,0xff | ||
| 895 | and $i2,$rk5,0xff | ||
| 896 | srl $i3,$rk5,24 | ||
| 897 | $PTR_ADD $i0,$Tbl | ||
| 898 | $PTR_ADD $i1,$Tbl | ||
| 899 | $PTR_ADD $i2,$Tbl | ||
| 900 | $PTR_ADD $i3,$Tbl | ||
| 901 | lbu $i0,1024($i0) | ||
| 902 | lbu $i1,1024($i1) | ||
| 903 | lbu $i2,1024($i2) | ||
| 904 | lbu $i3,1024($i3) | ||
| 905 | |||
| 906 | sw $rk0,0($key) | ||
| 907 | sw $rk1,4($key) | ||
| 908 | sw $rk2,8($key) | ||
| 909 | sw $rk3,12($key) | ||
| 910 | sw $rk4,16($key) | ||
| 911 | sw $rk5,20($key) | ||
| 912 | sub $cnt,1 | ||
| 913 | $PTR_ADD $key,24 | ||
| 914 | |||
| 915 | _bias $i0,24 | ||
| 916 | _bias $i1,16 | ||
| 917 | _bias $i2,8 | ||
| 918 | _bias $i3,0 | ||
| 919 | |||
| 920 | xor $rk0,$i0 | ||
| 921 | lw $i0,0($rcon) | ||
| 922 | xor $rk0,$i1 | ||
| 923 | xor $rk0,$i2 | ||
| 924 | xor $rk0,$i3 | ||
| 925 | xor $rk0,$i0 | ||
| 926 | |||
| 927 | xor $rk1,$rk0 | ||
| 928 | xor $rk2,$rk1 | ||
| 929 | xor $rk3,$rk2 | ||
| 930 | xor $rk4,$rk3 | ||
| 931 | xor $rk5,$rk4 | ||
| 932 | |||
| 933 | .set noreorder | ||
| 934 | bnez $cnt,.L192bits | ||
| 935 | $PTR_ADD $rcon,4 | ||
| 936 | |||
| 937 | sw $rk0,0($key) | ||
| 938 | sw $rk1,4($key) | ||
| 939 | sw $rk2,8($key) | ||
| 940 | li $cnt,12 | ||
| 941 | sw $rk3,12($key) | ||
| 942 | li $t0,0 | ||
| 943 | sw $cnt,48($key) | ||
| 944 | b .Lekey_done | ||
| 945 | $PTR_SUB $key,12*16 | ||
| 946 | |||
| 947 | .align 4 | ||
| 948 | .L256bits: | ||
| 949 | .set reorder | ||
| 950 | srl $i0,$rk7,16 | ||
| 951 | srl $i1,$rk7,8 | ||
| 952 | and $i0,0xff | ||
| 953 | and $i1,0xff | ||
| 954 | and $i2,$rk7,0xff | ||
| 955 | srl $i3,$rk7,24 | ||
| 956 | $PTR_ADD $i0,$Tbl | ||
| 957 | $PTR_ADD $i1,$Tbl | ||
| 958 | $PTR_ADD $i2,$Tbl | ||
| 959 | $PTR_ADD $i3,$Tbl | ||
| 960 | lbu $i0,1024($i0) | ||
| 961 | lbu $i1,1024($i1) | ||
| 962 | lbu $i2,1024($i2) | ||
| 963 | lbu $i3,1024($i3) | ||
| 964 | |||
| 965 | sw $rk0,0($key) | ||
| 966 | sw $rk1,4($key) | ||
| 967 | sw $rk2,8($key) | ||
| 968 | sw $rk3,12($key) | ||
| 969 | sw $rk4,16($key) | ||
| 970 | sw $rk5,20($key) | ||
| 971 | sw $rk6,24($key) | ||
| 972 | sw $rk7,28($key) | ||
| 973 | sub $cnt,1 | ||
| 974 | |||
| 975 | _bias $i0,24 | ||
| 976 | _bias $i1,16 | ||
| 977 | _bias $i2,8 | ||
| 978 | _bias $i3,0 | ||
| 979 | |||
| 980 | xor $rk0,$i0 | ||
| 981 | lw $i0,0($rcon) | ||
| 982 | xor $rk0,$i1 | ||
| 983 | xor $rk0,$i2 | ||
| 984 | xor $rk0,$i3 | ||
| 985 | xor $rk0,$i0 | ||
| 986 | |||
| 987 | xor $rk1,$rk0 | ||
| 988 | xor $rk2,$rk1 | ||
| 989 | xor $rk3,$rk2 | ||
| 990 | beqz $cnt,.L256bits_done | ||
| 991 | |||
| 992 | srl $i0,$rk3,24 | ||
| 993 | srl $i1,$rk3,16 | ||
| 994 | srl $i2,$rk3,8 | ||
| 995 | and $i3,$rk3,0xff | ||
| 996 | and $i1,0xff | ||
| 997 | and $i2,0xff | ||
| 998 | $PTR_ADD $i0,$Tbl | ||
| 999 | $PTR_ADD $i1,$Tbl | ||
| 1000 | $PTR_ADD $i2,$Tbl | ||
| 1001 | $PTR_ADD $i3,$Tbl | ||
| 1002 | lbu $i0,1024($i0) | ||
| 1003 | lbu $i1,1024($i1) | ||
| 1004 | lbu $i2,1024($i2) | ||
| 1005 | lbu $i3,1024($i3) | ||
| 1006 | sll $i0,24 | ||
| 1007 | sll $i1,16 | ||
| 1008 | sll $i2,8 | ||
| 1009 | |||
| 1010 | xor $rk4,$i0 | ||
| 1011 | xor $rk4,$i1 | ||
| 1012 | xor $rk4,$i2 | ||
| 1013 | xor $rk4,$i3 | ||
| 1014 | |||
| 1015 | xor $rk5,$rk4 | ||
| 1016 | xor $rk6,$rk5 | ||
| 1017 | xor $rk7,$rk6 | ||
| 1018 | |||
| 1019 | $PTR_ADD $key,32 | ||
| 1020 | .set noreorder | ||
| 1021 | b .L256bits | ||
| 1022 | $PTR_ADD $rcon,4 | ||
| 1023 | |||
| 1024 | .L256bits_done: | ||
| 1025 | sw $rk0,32($key) | ||
| 1026 | sw $rk1,36($key) | ||
| 1027 | sw $rk2,40($key) | ||
| 1028 | li $cnt,14 | ||
| 1029 | sw $rk3,44($key) | ||
| 1030 | li $t0,0 | ||
| 1031 | sw $cnt,48($key) | ||
| 1032 | $PTR_SUB $key,12*16 | ||
| 1033 | |||
| 1034 | .Lekey_done: | ||
| 1035 | jr $ra | ||
| 1036 | nop | ||
| 1037 | .end _mips_AES_set_encrypt_key | ||
| 1038 | |||
| 1039 | .globl AES_set_encrypt_key | ||
| 1040 | .ent AES_set_encrypt_key | ||
| 1041 | AES_set_encrypt_key: | ||
| 1042 | .frame $sp,$FRAMESIZE,$ra | ||
| 1043 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 1044 | .set noreorder | ||
| 1045 | ___ | ||
| 1046 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
| 1047 | .cpload $pf | ||
| 1048 | ___ | ||
| 1049 | $code.=<<___; | ||
| 1050 | $PTR_SUB $sp,$FRAMESIZE | ||
| 1051 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 1052 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 1053 | ___ | ||
| 1054 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 1055 | $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) | ||
| 1056 | $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) | ||
| 1057 | $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) | ||
| 1058 | $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) | ||
| 1059 | $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) | ||
| 1060 | ___ | ||
| 1061 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
| 1062 | .cplocal $Tbl | ||
| 1063 | .cpsetup $pf,$zero,AES_set_encrypt_key | ||
| 1064 | ___ | ||
| 1065 | $code.=<<___; | ||
| 1066 | .set reorder | ||
| 1067 | la $Tbl,AES_Te # PIC-ified 'load address' | ||
| 1068 | |||
| 1069 | bal _mips_AES_set_encrypt_key | ||
| 1070 | |||
| 1071 | .set noreorder | ||
| 1072 | move $a0,$t0 | ||
| 1073 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 1074 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 1075 | ___ | ||
| 1076 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 1077 | $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) | ||
| 1078 | $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) | ||
| 1079 | $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) | ||
| 1080 | $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) | ||
| 1081 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 1082 | ___ | ||
| 1083 | $code.=<<___; | ||
| 1084 | jr $ra | ||
| 1085 | $PTR_ADD $sp,$FRAMESIZE | ||
| 1086 | .end AES_set_encrypt_key | ||
| 1087 | ___ | ||
| 1088 | |||
| 1089 | my ($head,$tail)=($inp,$bits); | ||
| 1090 | my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); | ||
| 1091 | my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2); | ||
| 1092 | $code.=<<___; | ||
| 1093 | .align 5 | ||
| 1094 | .globl AES_set_decrypt_key | ||
| 1095 | .ent AES_set_decrypt_key | ||
| 1096 | AES_set_decrypt_key: | ||
| 1097 | .frame $sp,$FRAMESIZE,$ra | ||
| 1098 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 1099 | .set noreorder | ||
| 1100 | ___ | ||
| 1101 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
| 1102 | .cpload $pf | ||
| 1103 | ___ | ||
| 1104 | $code.=<<___; | ||
| 1105 | $PTR_SUB $sp,$FRAMESIZE | ||
| 1106 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 1107 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 1108 | ___ | ||
| 1109 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 1110 | $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) | ||
| 1111 | $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) | ||
| 1112 | $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) | ||
| 1113 | $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) | ||
| 1114 | $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) | ||
| 1115 | ___ | ||
| 1116 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
| 1117 | .cplocal $Tbl | ||
| 1118 | .cpsetup $pf,$zero,AES_set_decrypt_key | ||
| 1119 | ___ | ||
| 1120 | $code.=<<___; | ||
| 1121 | .set reorder | ||
| 1122 | la $Tbl,AES_Te # PIC-ified 'load address' | ||
| 1123 | |||
| 1124 | bal _mips_AES_set_encrypt_key | ||
| 1125 | |||
| 1126 | bltz $t0,.Ldkey_done | ||
| 1127 | |||
| 1128 | sll $at,$cnt,4 | ||
| 1129 | $PTR_ADD $head,$key,0 | ||
| 1130 | $PTR_ADD $tail,$key,$at | ||
| 1131 | .align 4 | ||
| 1132 | .Lswap: | ||
| 1133 | lw $rk0,0($head) | ||
| 1134 | lw $rk1,4($head) | ||
| 1135 | lw $rk2,8($head) | ||
| 1136 | lw $rk3,12($head) | ||
| 1137 | lw $rk4,0($tail) | ||
| 1138 | lw $rk5,4($tail) | ||
| 1139 | lw $rk6,8($tail) | ||
| 1140 | lw $rk7,12($tail) | ||
| 1141 | sw $rk0,0($tail) | ||
| 1142 | sw $rk1,4($tail) | ||
| 1143 | sw $rk2,8($tail) | ||
| 1144 | sw $rk3,12($tail) | ||
| 1145 | $PTR_ADD $head,16 | ||
| 1146 | $PTR_SUB $tail,16 | ||
| 1147 | sw $rk4,-16($head) | ||
| 1148 | sw $rk5,-12($head) | ||
| 1149 | sw $rk6,-8($head) | ||
| 1150 | sw $rk7,-4($head) | ||
| 1151 | bne $head,$tail,.Lswap | ||
| 1152 | |||
| 1153 | lw $tp1,16($key) # modulo-scheduled | ||
| 1154 | lui $x80808080,0x8080 | ||
| 1155 | sub $cnt,1 | ||
| 1156 | or $x80808080,0x8080 | ||
| 1157 | sll $cnt,2 | ||
| 1158 | $PTR_ADD $key,16 | ||
| 1159 | lui $x1b1b1b1b,0x1b1b | ||
| 1160 | nor $x7f7f7f7f,$zero,$x80808080 | ||
| 1161 | or $x1b1b1b1b,0x1b1b | ||
| 1162 | .align 4 | ||
| 1163 | .Lmix: | ||
| 1164 | and $m,$tp1,$x80808080 | ||
| 1165 | and $tp2,$tp1,$x7f7f7f7f | ||
| 1166 | srl $tp4,$m,7 | ||
| 1167 | addu $tp2,$tp2 # tp2<<1 | ||
| 1168 | subu $m,$tp4 | ||
| 1169 | and $m,$x1b1b1b1b | ||
| 1170 | xor $tp2,$m | ||
| 1171 | |||
| 1172 | and $m,$tp2,$x80808080 | ||
| 1173 | and $tp4,$tp2,$x7f7f7f7f | ||
| 1174 | srl $tp8,$m,7 | ||
| 1175 | addu $tp4,$tp4 # tp4<<1 | ||
| 1176 | subu $m,$tp8 | ||
| 1177 | and $m,$x1b1b1b1b | ||
| 1178 | xor $tp4,$m | ||
| 1179 | |||
| 1180 | and $m,$tp4,$x80808080 | ||
| 1181 | and $tp8,$tp4,$x7f7f7f7f | ||
| 1182 | srl $tp9,$m,7 | ||
| 1183 | addu $tp8,$tp8 # tp8<<1 | ||
| 1184 | subu $m,$tp9 | ||
| 1185 | and $m,$x1b1b1b1b | ||
| 1186 | xor $tp8,$m | ||
| 1187 | |||
| 1188 | xor $tp9,$tp8,$tp1 | ||
| 1189 | xor $tpe,$tp8,$tp4 | ||
| 1190 | xor $tpb,$tp9,$tp2 | ||
| 1191 | xor $tpd,$tp9,$tp4 | ||
| 1192 | |||
| 1193 | _ror $tp1,$tpd,16 | ||
| 1194 | xor $tpe,$tp2 | ||
| 1195 | _ror $tp2,$tpd,-16 | ||
| 1196 | xor $tpe,$tp1 | ||
| 1197 | _ror $tp1,$tp9,8 | ||
| 1198 | xor $tpe,$tp2 | ||
| 1199 | _ror $tp2,$tp9,-24 | ||
| 1200 | xor $tpe,$tp1 | ||
| 1201 | _ror $tp1,$tpb,24 | ||
| 1202 | xor $tpe,$tp2 | ||
| 1203 | _ror $tp2,$tpb,-8 | ||
| 1204 | xor $tpe,$tp1 | ||
| 1205 | lw $tp1,4($key) # modulo-scheduled | ||
| 1206 | xor $tpe,$tp2 | ||
| 1207 | sub $cnt,1 | ||
| 1208 | sw $tpe,0($key) | ||
| 1209 | $PTR_ADD $key,4 | ||
| 1210 | bnez $cnt,.Lmix | ||
| 1211 | |||
| 1212 | li $t0,0 | ||
| 1213 | .Ldkey_done: | ||
| 1214 | .set noreorder | ||
| 1215 | move $a0,$t0 | ||
| 1216 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 1217 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 1218 | ___ | ||
| 1219 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 1220 | $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) | ||
| 1221 | $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) | ||
| 1222 | $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) | ||
| 1223 | $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) | ||
| 1224 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 1225 | ___ | ||
| 1226 | $code.=<<___; | ||
| 1227 | jr $ra | ||
| 1228 | $PTR_ADD $sp,$FRAMESIZE | ||
| 1229 | .end AES_set_decrypt_key | ||
| 1230 | ___ | ||
| 1231 | }}} | ||
| 1232 | |||
| 1233 | ###################################################################### | ||
| 1234 | # Tables are kept in endian-neutral manner | ||
| 1235 | $code.=<<___; | ||
| 1236 | .rdata | ||
| 1237 | .align 6 | ||
| 1238 | AES_Te: | ||
| 1239 | .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 | ||
| 1240 | .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d | ||
| 1241 | .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd | ||
| 1242 | .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 | ||
| 1243 | .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 | ||
| 1244 | .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d | ||
| 1245 | .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 | ||
| 1246 | .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a | ||
| 1247 | .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d | ||
| 1248 | .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 | ||
| 1249 | .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb | ||
| 1250 | .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b | ||
| 1251 | .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 | ||
| 1252 | .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea | ||
| 1253 | .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 | ||
| 1254 | .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b | ||
| 1255 | .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c | ||
| 1256 | .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a | ||
| 1257 | .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 | ||
| 1258 | .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f | ||
| 1259 | .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 | ||
| 1260 | .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 | ||
| 1261 | .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 | ||
| 1262 | .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f | ||
| 1263 | .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 | ||
| 1264 | .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e | ||
| 1265 | .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 | ||
| 1266 | .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 | ||
| 1267 | .byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 | ||
| 1268 | .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d | ||
| 1269 | .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 | ||
| 1270 | .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f | ||
| 1271 | .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e | ||
| 1272 | .byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e | ||
| 1273 | .byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 | ||
| 1274 | .byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb | ||
| 1275 | .byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d | ||
| 1276 | .byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce | ||
| 1277 | .byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e | ||
| 1278 | .byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 | ||
| 1279 | .byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 | ||
| 1280 | .byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c | ||
| 1281 | .byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f | ||
| 1282 | .byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed | ||
| 1283 | .byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 | ||
| 1284 | .byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b | ||
| 1285 | .byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 | ||
| 1286 | .byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a | ||
| 1287 | .byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a | ||
| 1288 | .byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 | ||
| 1289 | .byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 | ||
| 1290 | .byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 | ||
| 1291 | .byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 | ||
| 1292 | .byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 | ||
| 1293 | .byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 | ||
| 1294 | .byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 | ||
| 1295 | .byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe | ||
| 1296 | .byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a | ||
| 1297 | .byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc | ||
| 1298 | .byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 | ||
| 1299 | .byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 | ||
| 1300 | .byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 | ||
| 1301 | .byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a | ||
| 1302 | .byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d | ||
| 1303 | .byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 | ||
| 1304 | .byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f | ||
| 1305 | .byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 | ||
| 1306 | .byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 | ||
| 1307 | .byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 | ||
| 1308 | .byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 | ||
| 1309 | .byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 | ||
| 1310 | .byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 | ||
| 1311 | .byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 | ||
| 1312 | .byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f | ||
| 1313 | .byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e | ||
| 1314 | .byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 | ||
| 1315 | .byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 | ||
| 1316 | .byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c | ||
| 1317 | .byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 | ||
| 1318 | .byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 | ||
| 1319 | .byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 | ||
| 1320 | .byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e | ||
| 1321 | .byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a | ||
| 1322 | .byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 | ||
| 1323 | .byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e | ||
| 1324 | .byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 | ||
| 1325 | .byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 | ||
| 1326 | .byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b | ||
| 1327 | .byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 | ||
| 1328 | .byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 | ||
| 1329 | .byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 | ||
| 1330 | .byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 | ||
| 1331 | .byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa | ||
| 1332 | .byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 | ||
| 1333 | .byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e | ||
| 1334 | .byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 | ||
| 1335 | .byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 | ||
| 1336 | .byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 | ||
| 1337 | .byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 | ||
| 1338 | .byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 | ||
| 1339 | .byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c | ||
| 1340 | .byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 | ||
| 1341 | .byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc | ||
| 1342 | .byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 | ||
| 1343 | .byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 | ||
| 1344 | .byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa | ||
| 1345 | .byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 | ||
| 1346 | .byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 | ||
| 1347 | .byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f | ||
| 1348 | .byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 | ||
| 1349 | .byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 | ||
| 1350 | .byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 | ||
| 1351 | .byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 | ||
| 1352 | .byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 | ||
| 1353 | .byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 | ||
| 1354 | .byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 | ||
| 1355 | .byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 | ||
| 1356 | .byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 | ||
| 1357 | .byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff | ||
| 1358 | .byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a | ||
| 1359 | .byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 | ||
| 1360 | .byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 | ||
| 1361 | .byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 | ||
| 1362 | .byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 | ||
| 1363 | .byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 | ||
| 1364 | .byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 | ||
| 1365 | .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc | ||
| 1366 | .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a | ||
| 1367 | |||
| 1368 | .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 | ||
| 1369 | .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
| 1370 | .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
| 1371 | .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
| 1372 | .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
| 1373 | .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
| 1374 | .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
| 1375 | .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
| 1376 | .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
| 1377 | .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
| 1378 | .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
| 1379 | .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
| 1380 | .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
| 1381 | .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
| 1382 | .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
| 1383 | .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
| 1384 | .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
| 1385 | .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
| 1386 | .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
| 1387 | .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
| 1388 | .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
| 1389 | .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
| 1390 | .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
| 1391 | .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
| 1392 | .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
| 1393 | .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
| 1394 | .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
| 1395 | .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
| 1396 | .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
| 1397 | .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
| 1398 | .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
| 1399 | .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
| 1400 | |||
| 1401 | .byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon | ||
| 1402 | .byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 | ||
| 1403 | .byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 | ||
| 1404 | .byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 | ||
| 1405 | .byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 | ||
| 1406 | |||
| 1407 | .align 6 | ||
| 1408 | AES_Td: | ||
| 1409 | .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 | ||
| 1410 | .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 | ||
| 1411 | .byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 | ||
| 1412 | .byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 | ||
| 1413 | .byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 | ||
| 1414 | .byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 | ||
| 1415 | .byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 | ||
| 1416 | .byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f | ||
| 1417 | .byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 | ||
| 1418 | .byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 | ||
| 1419 | .byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 | ||
| 1420 | .byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 | ||
| 1421 | .byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 | ||
| 1422 | .byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda | ||
| 1423 | .byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 | ||
| 1424 | .byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 | ||
| 1425 | .byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 | ||
| 1426 | .byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd | ||
| 1427 | .byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 | ||
| 1428 | .byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 | ||
| 1429 | .byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 | ||
| 1430 | .byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 | ||
| 1431 | .byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 | ||
| 1432 | .byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 | ||
| 1433 | .byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 | ||
| 1434 | .byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 | ||
| 1435 | .byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 | ||
| 1436 | .byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a | ||
| 1437 | .byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 | ||
| 1438 | .byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 | ||
| 1439 | .byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 | ||
| 1440 | .byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c | ||
| 1441 | .byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 | ||
| 1442 | .byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 | ||
| 1443 | .byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 | ||
| 1444 | .byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a | ||
| 1445 | .byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 | ||
| 1446 | .byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 | ||
| 1447 | .byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa | ||
| 1448 | .byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 | ||
| 1449 | .byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d | ||
| 1450 | .byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 | ||
| 1451 | .byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 | ||
| 1452 | .byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff | ||
| 1453 | .byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 | ||
| 1454 | .byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 | ||
| 1455 | .byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 | ||
| 1456 | .byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb | ||
| 1457 | .byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 | ||
| 1458 | .byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 | ||
| 1459 | .byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 | ||
| 1460 | .byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e | ||
| 1461 | .byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 | ||
| 1462 | .byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 | ||
| 1463 | .byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 | ||
| 1464 | .byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a | ||
| 1465 | .byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f | ||
| 1466 | .byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e | ||
| 1467 | .byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 | ||
| 1468 | .byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 | ||
| 1469 | .byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 | ||
| 1470 | .byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d | ||
| 1471 | .byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad | ||
| 1472 | .byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 | ||
| 1473 | .byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c | ||
| 1474 | .byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd | ||
| 1475 | .byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc | ||
| 1476 | .byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 | ||
| 1477 | .byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc | ||
| 1478 | .byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 | ||
| 1479 | .byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 | ||
| 1480 | .byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 | ||
| 1481 | .byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 | ||
| 1482 | .byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d | ||
| 1483 | .byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 | ||
| 1484 | .byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 | ||
| 1485 | .byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 | ||
| 1486 | .byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 | ||
| 1487 | .byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a | ||
| 1488 | .byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef | ||
| 1489 | .byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 | ||
| 1490 | .byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 | ||
| 1491 | .byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 | ||
| 1492 | .byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 | ||
| 1493 | .byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d | ||
| 1494 | .byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 | ||
| 1495 | .byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 | ||
| 1496 | .byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 | ||
| 1497 | .byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c | ||
| 1498 | .byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 | ||
| 1499 | .byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 | ||
| 1500 | .byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b | ||
| 1501 | .byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 | ||
| 1502 | .byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 | ||
| 1503 | .byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e | ||
| 1504 | .byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 | ||
| 1505 | .byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce | ||
| 1506 | .byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 | ||
| 1507 | .byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 | ||
| 1508 | .byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 | ||
| 1509 | .byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 | ||
| 1510 | .byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 | ||
| 1511 | .byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 | ||
| 1512 | .byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f | ||
| 1513 | .byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d | ||
| 1514 | .byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf | ||
| 1515 | .byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b | ||
| 1516 | .byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f | ||
| 1517 | .byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d | ||
| 1518 | .byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e | ||
| 1519 | .byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 | ||
| 1520 | .byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 | ||
| 1521 | .byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a | ||
| 1522 | .byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 | ||
| 1523 | .byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 | ||
| 1524 | .byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c | ||
| 1525 | .byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f | ||
| 1526 | .byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf | ||
| 1527 | .byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b | ||
| 1528 | .byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 | ||
| 1529 | .byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e | ||
| 1530 | .byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f | ||
| 1531 | .byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c | ||
| 1532 | .byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 | ||
| 1533 | .byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde | ||
| 1534 | .byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 | ||
| 1535 | .byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 | ||
| 1536 | .byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 | ||
| 1537 | |||
| 1538 | .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4 | ||
| 1539 | .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
| 1540 | .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
| 1541 | .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
| 1542 | .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
| 1543 | .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
| 1544 | .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
| 1545 | .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
| 1546 | .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
| 1547 | .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
| 1548 | .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
| 1549 | .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
| 1550 | .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
| 1551 | .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
| 1552 | .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
| 1553 | .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
| 1554 | .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
| 1555 | .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
| 1556 | .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
| 1557 | .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
| 1558 | .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
| 1559 | .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
| 1560 | .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
| 1561 | .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
| 1562 | .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
| 1563 | .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
| 1564 | .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
| 1565 | .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
| 1566 | .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
| 1567 | .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
| 1568 | .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
| 1569 | .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
| 1570 | ___ | ||
| 1571 | |||
| 1572 | foreach (split("\n",$code)) { | ||
| 1573 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 1574 | |||
| 1575 | # made-up _instructions, _xtr, _ins, _ror and _bias, cope | ||
| 1576 | # with byte order dependencies... | ||
| 1577 | if (/^\s+_/) { | ||
| 1578 | s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/; | ||
| 1579 | |||
| 1580 | s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/ | ||
| 1581 | sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) | ||
| 1582 | : eval("24-$3"))/e or | ||
| 1583 | s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ | ||
| 1584 | sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) | ||
| 1585 | : eval("24-$3"))/e or | ||
| 1586 | s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/ | ||
| 1587 | sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) | ||
| 1588 | : eval("$3*-1"))/e or | ||
| 1589 | s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ | ||
| 1590 | sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) | ||
| 1591 | : eval("($3-16)&31"))/e; | ||
| 1592 | |||
| 1593 | s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/ | ||
| 1594 | sprintf("sll\t$1,$2,$3")/e or | ||
| 1595 | s/srl\s+(\$[0-9]+),(\$[0-9]+),0/ | ||
| 1596 | sprintf("and\t$1,$2,0xff")/e or | ||
| 1597 | s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/; | ||
| 1598 | } | ||
| 1599 | |||
| 1600 | # convert lwl/lwr and swr/swl to little-endian order | ||
| 1601 | if (!$big_endian && /^\s+[sl]w[lr]\s+/) { | ||
| 1602 | s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/ | ||
| 1603 | sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or | ||
| 1604 | s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/ | ||
| 1605 | sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e; | ||
| 1606 | } | ||
| 1607 | |||
| 1608 | print $_,"\n"; | ||
| 1609 | } | ||
| 1610 | |||
| 1611 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/aes-parisc.pl b/src/lib/libcrypto/aes/asm/aes-parisc.pl new file mode 100644 index 0000000000..c36b6a2270 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aes-parisc.pl | |||
| @@ -0,0 +1,1021 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # AES for PA-RISC. | ||
| 11 | # | ||
| 12 | # June 2009. | ||
| 13 | # | ||
| 14 | # The module is mechanical transliteration of aes-sparcv9.pl, but with | ||
| 15 | # a twist: S-boxes are compressed even further down to 1K+256B. On | ||
| 16 | # PA-7100LC performance is ~40% better than gcc 3.2 generated code and | ||
| 17 | # is about 33 cycles per byte processed with 128-bit key. Newer CPUs | ||
| 18 | # perform at 16 cycles per byte. It's not faster than code generated | ||
| 19 | # by vendor compiler, but recall that it has compressed S-boxes, which | ||
| 20 | # requires extra processing. | ||
| 21 | # | ||
| 22 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 23 | |||
| 24 | $flavour = shift; | ||
| 25 | $output = shift; | ||
| 26 | open STDOUT,">$output"; | ||
| 27 | |||
| 28 | if ($flavour =~ /64/) { | ||
| 29 | $LEVEL ="2.0W"; | ||
| 30 | $SIZE_T =8; | ||
| 31 | $FRAME_MARKER =80; | ||
| 32 | $SAVED_RP =16; | ||
| 33 | $PUSH ="std"; | ||
| 34 | $PUSHMA ="std,ma"; | ||
| 35 | $POP ="ldd"; | ||
| 36 | $POPMB ="ldd,mb"; | ||
| 37 | } else { | ||
| 38 | $LEVEL ="1.0"; | ||
| 39 | $SIZE_T =4; | ||
| 40 | $FRAME_MARKER =48; | ||
| 41 | $SAVED_RP =20; | ||
| 42 | $PUSH ="stw"; | ||
| 43 | $PUSHMA ="stwm"; | ||
| 44 | $POP ="ldw"; | ||
| 45 | $POPMB ="ldwm"; | ||
| 46 | } | ||
| 47 | |||
| 48 | $FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker | ||
| 49 | # [+ argument transfer] | ||
| 50 | $inp="%r26"; # arg0 | ||
| 51 | $out="%r25"; # arg1 | ||
| 52 | $key="%r24"; # arg2 | ||
| 53 | |||
| 54 | ($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4"); | ||
| 55 | ($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8"); | ||
| 56 | |||
| 57 | ($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7, | ||
| 58 | $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) = | ||
| 59 | ("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16", | ||
| 60 | "%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26"); | ||
| 61 | |||
| 62 | $tbl="%r28"; | ||
| 63 | $rounds="%r29"; | ||
| 64 | |||
| 65 | $code=<<___; | ||
| 66 | .LEVEL $LEVEL | ||
| 67 | .SPACE \$TEXT\$ | ||
| 68 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 69 | |||
| 70 | .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 71 | .ALIGN 64 | ||
| 72 | AES_encrypt | ||
| 73 | .PROC | ||
| 74 | .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 | ||
| 75 | .ENTRY | ||
| 76 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 77 | $PUSHMA %r3,$FRAME(%sp) | ||
| 78 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 79 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 80 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 81 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 82 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 83 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 84 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 85 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 86 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
| 87 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
| 88 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
| 89 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
| 90 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
| 91 | $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) | ||
| 92 | $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) | ||
| 93 | |||
| 94 | blr %r0,$tbl | ||
| 95 | ldi 3,$t0 | ||
| 96 | L\$enc_pic | ||
| 97 | andcm $tbl,$t0,$tbl | ||
| 98 | ldo L\$AES_Te-L\$enc_pic($tbl),$tbl | ||
| 99 | |||
| 100 | and $inp,$t0,$t0 | ||
| 101 | sub $inp,$t0,$inp | ||
| 102 | ldw 0($inp),$s0 | ||
| 103 | ldw 4($inp),$s1 | ||
| 104 | ldw 8($inp),$s2 | ||
| 105 | comib,= 0,$t0,L\$enc_inp_aligned | ||
| 106 | ldw 12($inp),$s3 | ||
| 107 | |||
| 108 | sh3addl $t0,%r0,$t0 | ||
| 109 | subi 32,$t0,$t0 | ||
| 110 | mtctl $t0,%cr11 | ||
| 111 | ldw 16($inp),$t1 | ||
| 112 | vshd $s0,$s1,$s0 | ||
| 113 | vshd $s1,$s2,$s1 | ||
| 114 | vshd $s2,$s3,$s2 | ||
| 115 | vshd $s3,$t1,$s3 | ||
| 116 | |||
| 117 | L\$enc_inp_aligned | ||
| 118 | bl _parisc_AES_encrypt,%r31 | ||
| 119 | nop | ||
| 120 | |||
| 121 | extru,<> $out,31,2,%r0 | ||
| 122 | b L\$enc_out_aligned | ||
| 123 | nop | ||
| 124 | |||
| 125 | _srm $s0,24,$acc0 | ||
| 126 | _srm $s0,16,$acc1 | ||
| 127 | stb $acc0,0($out) | ||
| 128 | _srm $s0,8,$acc2 | ||
| 129 | stb $acc1,1($out) | ||
| 130 | _srm $s1,24,$acc4 | ||
| 131 | stb $acc2,2($out) | ||
| 132 | _srm $s1,16,$acc5 | ||
| 133 | stb $s0,3($out) | ||
| 134 | _srm $s1,8,$acc6 | ||
| 135 | stb $acc4,4($out) | ||
| 136 | _srm $s2,24,$acc0 | ||
| 137 | stb $acc5,5($out) | ||
| 138 | _srm $s2,16,$acc1 | ||
| 139 | stb $acc6,6($out) | ||
| 140 | _srm $s2,8,$acc2 | ||
| 141 | stb $s1,7($out) | ||
| 142 | _srm $s3,24,$acc4 | ||
| 143 | stb $acc0,8($out) | ||
| 144 | _srm $s3,16,$acc5 | ||
| 145 | stb $acc1,9($out) | ||
| 146 | _srm $s3,8,$acc6 | ||
| 147 | stb $acc2,10($out) | ||
| 148 | stb $s2,11($out) | ||
| 149 | stb $acc4,12($out) | ||
| 150 | stb $acc5,13($out) | ||
| 151 | stb $acc6,14($out) | ||
| 152 | b L\$enc_done | ||
| 153 | stb $s3,15($out) | ||
| 154 | |||
| 155 | L\$enc_out_aligned | ||
| 156 | stw $s0,0($out) | ||
| 157 | stw $s1,4($out) | ||
| 158 | stw $s2,8($out) | ||
| 159 | stw $s3,12($out) | ||
| 160 | |||
| 161 | L\$enc_done | ||
| 162 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 163 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 164 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 165 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 166 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 167 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 168 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 169 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 170 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 171 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
| 172 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
| 173 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
| 174 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
| 175 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
| 176 | $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 | ||
| 177 | $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 | ||
| 178 | bv (%r2) | ||
| 179 | .EXIT | ||
| 180 | $POPMB -$FRAME(%sp),%r3 | ||
| 181 | .PROCEND | ||
| 182 | |||
| 183 | .ALIGN 16 | ||
| 184 | _parisc_AES_encrypt | ||
| 185 | .PROC | ||
| 186 | .CALLINFO MILLICODE | ||
| 187 | .ENTRY | ||
| 188 | ldw 240($key),$rounds | ||
| 189 | ldw 0($key),$t0 | ||
| 190 | ldw 4($key),$t1 | ||
| 191 | ldw 8($key),$t2 | ||
| 192 | _srm $rounds,1,$rounds | ||
| 193 | xor $t0,$s0,$s0 | ||
| 194 | ldw 12($key),$t3 | ||
| 195 | _srm $s0,24,$acc0 | ||
| 196 | xor $t1,$s1,$s1 | ||
| 197 | ldw 16($key),$t0 | ||
| 198 | _srm $s1,16,$acc1 | ||
| 199 | xor $t2,$s2,$s2 | ||
| 200 | ldw 20($key),$t1 | ||
| 201 | xor $t3,$s3,$s3 | ||
| 202 | ldw 24($key),$t2 | ||
| 203 | ldw 28($key),$t3 | ||
| 204 | L\$enc_loop | ||
| 205 | _srm $s2,8,$acc2 | ||
| 206 | ldwx,s $acc0($tbl),$acc0 | ||
| 207 | _srm $s3,0,$acc3 | ||
| 208 | ldwx,s $acc1($tbl),$acc1 | ||
| 209 | _srm $s1,24,$acc4 | ||
| 210 | ldwx,s $acc2($tbl),$acc2 | ||
| 211 | _srm $s2,16,$acc5 | ||
| 212 | ldwx,s $acc3($tbl),$acc3 | ||
| 213 | _srm $s3,8,$acc6 | ||
| 214 | ldwx,s $acc4($tbl),$acc4 | ||
| 215 | _srm $s0,0,$acc7 | ||
| 216 | ldwx,s $acc5($tbl),$acc5 | ||
| 217 | _srm $s2,24,$acc8 | ||
| 218 | ldwx,s $acc6($tbl),$acc6 | ||
| 219 | _srm $s3,16,$acc9 | ||
| 220 | ldwx,s $acc7($tbl),$acc7 | ||
| 221 | _srm $s0,8,$acc10 | ||
| 222 | ldwx,s $acc8($tbl),$acc8 | ||
| 223 | _srm $s1,0,$acc11 | ||
| 224 | ldwx,s $acc9($tbl),$acc9 | ||
| 225 | _srm $s3,24,$acc12 | ||
| 226 | ldwx,s $acc10($tbl),$acc10 | ||
| 227 | _srm $s0,16,$acc13 | ||
| 228 | ldwx,s $acc11($tbl),$acc11 | ||
| 229 | _srm $s1,8,$acc14 | ||
| 230 | ldwx,s $acc12($tbl),$acc12 | ||
| 231 | _srm $s2,0,$acc15 | ||
| 232 | ldwx,s $acc13($tbl),$acc13 | ||
| 233 | ldwx,s $acc14($tbl),$acc14 | ||
| 234 | ldwx,s $acc15($tbl),$acc15 | ||
| 235 | addib,= -1,$rounds,L\$enc_last | ||
| 236 | ldo 32($key),$key | ||
| 237 | |||
| 238 | _ror $acc1,8,$acc1 | ||
| 239 | xor $acc0,$t0,$t0 | ||
| 240 | ldw 0($key),$s0 | ||
| 241 | _ror $acc2,16,$acc2 | ||
| 242 | xor $acc1,$t0,$t0 | ||
| 243 | ldw 4($key),$s1 | ||
| 244 | _ror $acc3,24,$acc3 | ||
| 245 | xor $acc2,$t0,$t0 | ||
| 246 | ldw 8($key),$s2 | ||
| 247 | _ror $acc5,8,$acc5 | ||
| 248 | xor $acc3,$t0,$t0 | ||
| 249 | ldw 12($key),$s3 | ||
| 250 | _ror $acc6,16,$acc6 | ||
| 251 | xor $acc4,$t1,$t1 | ||
| 252 | _ror $acc7,24,$acc7 | ||
| 253 | xor $acc5,$t1,$t1 | ||
| 254 | _ror $acc9,8,$acc9 | ||
| 255 | xor $acc6,$t1,$t1 | ||
| 256 | _ror $acc10,16,$acc10 | ||
| 257 | xor $acc7,$t1,$t1 | ||
| 258 | _ror $acc11,24,$acc11 | ||
| 259 | xor $acc8,$t2,$t2 | ||
| 260 | _ror $acc13,8,$acc13 | ||
| 261 | xor $acc9,$t2,$t2 | ||
| 262 | _ror $acc14,16,$acc14 | ||
| 263 | xor $acc10,$t2,$t2 | ||
| 264 | _ror $acc15,24,$acc15 | ||
| 265 | xor $acc11,$t2,$t2 | ||
| 266 | xor $acc12,$acc14,$acc14 | ||
| 267 | xor $acc13,$t3,$t3 | ||
| 268 | _srm $t0,24,$acc0 | ||
| 269 | xor $acc14,$t3,$t3 | ||
| 270 | _srm $t1,16,$acc1 | ||
| 271 | xor $acc15,$t3,$t3 | ||
| 272 | |||
| 273 | _srm $t2,8,$acc2 | ||
| 274 | ldwx,s $acc0($tbl),$acc0 | ||
| 275 | _srm $t3,0,$acc3 | ||
| 276 | ldwx,s $acc1($tbl),$acc1 | ||
| 277 | _srm $t1,24,$acc4 | ||
| 278 | ldwx,s $acc2($tbl),$acc2 | ||
| 279 | _srm $t2,16,$acc5 | ||
| 280 | ldwx,s $acc3($tbl),$acc3 | ||
| 281 | _srm $t3,8,$acc6 | ||
| 282 | ldwx,s $acc4($tbl),$acc4 | ||
| 283 | _srm $t0,0,$acc7 | ||
| 284 | ldwx,s $acc5($tbl),$acc5 | ||
| 285 | _srm $t2,24,$acc8 | ||
| 286 | ldwx,s $acc6($tbl),$acc6 | ||
| 287 | _srm $t3,16,$acc9 | ||
| 288 | ldwx,s $acc7($tbl),$acc7 | ||
| 289 | _srm $t0,8,$acc10 | ||
| 290 | ldwx,s $acc8($tbl),$acc8 | ||
| 291 | _srm $t1,0,$acc11 | ||
| 292 | ldwx,s $acc9($tbl),$acc9 | ||
| 293 | _srm $t3,24,$acc12 | ||
| 294 | ldwx,s $acc10($tbl),$acc10 | ||
| 295 | _srm $t0,16,$acc13 | ||
| 296 | ldwx,s $acc11($tbl),$acc11 | ||
| 297 | _srm $t1,8,$acc14 | ||
| 298 | ldwx,s $acc12($tbl),$acc12 | ||
| 299 | _srm $t2,0,$acc15 | ||
| 300 | ldwx,s $acc13($tbl),$acc13 | ||
| 301 | _ror $acc1,8,$acc1 | ||
| 302 | ldwx,s $acc14($tbl),$acc14 | ||
| 303 | |||
| 304 | _ror $acc2,16,$acc2 | ||
| 305 | xor $acc0,$s0,$s0 | ||
| 306 | ldwx,s $acc15($tbl),$acc15 | ||
| 307 | _ror $acc3,24,$acc3 | ||
| 308 | xor $acc1,$s0,$s0 | ||
| 309 | ldw 16($key),$t0 | ||
| 310 | _ror $acc5,8,$acc5 | ||
| 311 | xor $acc2,$s0,$s0 | ||
| 312 | ldw 20($key),$t1 | ||
| 313 | _ror $acc6,16,$acc6 | ||
| 314 | xor $acc3,$s0,$s0 | ||
| 315 | ldw 24($key),$t2 | ||
| 316 | _ror $acc7,24,$acc7 | ||
| 317 | xor $acc4,$s1,$s1 | ||
| 318 | ldw 28($key),$t3 | ||
| 319 | _ror $acc9,8,$acc9 | ||
| 320 | xor $acc5,$s1,$s1 | ||
| 321 | ldw 1024+0($tbl),%r0 ; prefetch te4 | ||
| 322 | _ror $acc10,16,$acc10 | ||
| 323 | xor $acc6,$s1,$s1 | ||
| 324 | ldw 1024+32($tbl),%r0 ; prefetch te4 | ||
| 325 | _ror $acc11,24,$acc11 | ||
| 326 | xor $acc7,$s1,$s1 | ||
| 327 | ldw 1024+64($tbl),%r0 ; prefetch te4 | ||
| 328 | _ror $acc13,8,$acc13 | ||
| 329 | xor $acc8,$s2,$s2 | ||
| 330 | ldw 1024+96($tbl),%r0 ; prefetch te4 | ||
| 331 | _ror $acc14,16,$acc14 | ||
| 332 | xor $acc9,$s2,$s2 | ||
| 333 | ldw 1024+128($tbl),%r0 ; prefetch te4 | ||
| 334 | _ror $acc15,24,$acc15 | ||
| 335 | xor $acc10,$s2,$s2 | ||
| 336 | ldw 1024+160($tbl),%r0 ; prefetch te4 | ||
| 337 | _srm $s0,24,$acc0 | ||
| 338 | xor $acc11,$s2,$s2 | ||
| 339 | ldw 1024+192($tbl),%r0 ; prefetch te4 | ||
| 340 | xor $acc12,$acc14,$acc14 | ||
| 341 | xor $acc13,$s3,$s3 | ||
| 342 | ldw 1024+224($tbl),%r0 ; prefetch te4 | ||
| 343 | _srm $s1,16,$acc1 | ||
| 344 | xor $acc14,$s3,$s3 | ||
| 345 | b L\$enc_loop | ||
| 346 | xor $acc15,$s3,$s3 | ||
| 347 | |||
| 348 | .ALIGN 16 | ||
| 349 | L\$enc_last | ||
| 350 | ldo 1024($tbl),$rounds | ||
| 351 | _ror $acc1,8,$acc1 | ||
| 352 | xor $acc0,$t0,$t0 | ||
| 353 | ldw 0($key),$s0 | ||
| 354 | _ror $acc2,16,$acc2 | ||
| 355 | xor $acc1,$t0,$t0 | ||
| 356 | ldw 4($key),$s1 | ||
| 357 | _ror $acc3,24,$acc3 | ||
| 358 | xor $acc2,$t0,$t0 | ||
| 359 | ldw 8($key),$s2 | ||
| 360 | _ror $acc5,8,$acc5 | ||
| 361 | xor $acc3,$t0,$t0 | ||
| 362 | ldw 12($key),$s3 | ||
| 363 | _ror $acc6,16,$acc6 | ||
| 364 | xor $acc4,$t1,$t1 | ||
| 365 | _ror $acc7,24,$acc7 | ||
| 366 | xor $acc5,$t1,$t1 | ||
| 367 | _ror $acc9,8,$acc9 | ||
| 368 | xor $acc6,$t1,$t1 | ||
| 369 | _ror $acc10,16,$acc10 | ||
| 370 | xor $acc7,$t1,$t1 | ||
| 371 | _ror $acc11,24,$acc11 | ||
| 372 | xor $acc8,$t2,$t2 | ||
| 373 | _ror $acc13,8,$acc13 | ||
| 374 | xor $acc9,$t2,$t2 | ||
| 375 | _ror $acc14,16,$acc14 | ||
| 376 | xor $acc10,$t2,$t2 | ||
| 377 | _ror $acc15,24,$acc15 | ||
| 378 | xor $acc11,$t2,$t2 | ||
| 379 | xor $acc12,$acc14,$acc14 | ||
| 380 | xor $acc13,$t3,$t3 | ||
| 381 | _srm $t0,24,$acc0 | ||
| 382 | xor $acc14,$t3,$t3 | ||
| 383 | _srm $t1,16,$acc1 | ||
| 384 | xor $acc15,$t3,$t3 | ||
| 385 | |||
| 386 | _srm $t2,8,$acc2 | ||
| 387 | ldbx $acc0($rounds),$acc0 | ||
| 388 | _srm $t1,24,$acc4 | ||
| 389 | ldbx $acc1($rounds),$acc1 | ||
| 390 | _srm $t2,16,$acc5 | ||
| 391 | _srm $t3,0,$acc3 | ||
| 392 | ldbx $acc2($rounds),$acc2 | ||
| 393 | ldbx $acc3($rounds),$acc3 | ||
| 394 | _srm $t3,8,$acc6 | ||
| 395 | ldbx $acc4($rounds),$acc4 | ||
| 396 | _srm $t2,24,$acc8 | ||
| 397 | ldbx $acc5($rounds),$acc5 | ||
| 398 | _srm $t3,16,$acc9 | ||
| 399 | _srm $t0,0,$acc7 | ||
| 400 | ldbx $acc6($rounds),$acc6 | ||
| 401 | ldbx $acc7($rounds),$acc7 | ||
| 402 | _srm $t0,8,$acc10 | ||
| 403 | ldbx $acc8($rounds),$acc8 | ||
| 404 | _srm $t3,24,$acc12 | ||
| 405 | ldbx $acc9($rounds),$acc9 | ||
| 406 | _srm $t0,16,$acc13 | ||
| 407 | _srm $t1,0,$acc11 | ||
| 408 | ldbx $acc10($rounds),$acc10 | ||
| 409 | _srm $t1,8,$acc14 | ||
| 410 | ldbx $acc11($rounds),$acc11 | ||
| 411 | ldbx $acc12($rounds),$acc12 | ||
| 412 | ldbx $acc13($rounds),$acc13 | ||
| 413 | _srm $t2,0,$acc15 | ||
| 414 | ldbx $acc14($rounds),$acc14 | ||
| 415 | |||
| 416 | dep $acc0,7,8,$acc3 | ||
| 417 | ldbx $acc15($rounds),$acc15 | ||
| 418 | dep $acc4,7,8,$acc7 | ||
| 419 | dep $acc1,15,8,$acc3 | ||
| 420 | dep $acc5,15,8,$acc7 | ||
| 421 | dep $acc2,23,8,$acc3 | ||
| 422 | dep $acc6,23,8,$acc7 | ||
| 423 | xor $acc3,$s0,$s0 | ||
| 424 | xor $acc7,$s1,$s1 | ||
| 425 | dep $acc8,7,8,$acc11 | ||
| 426 | dep $acc12,7,8,$acc15 | ||
| 427 | dep $acc9,15,8,$acc11 | ||
| 428 | dep $acc13,15,8,$acc15 | ||
| 429 | dep $acc10,23,8,$acc11 | ||
| 430 | dep $acc14,23,8,$acc15 | ||
| 431 | xor $acc11,$s2,$s2 | ||
| 432 | |||
| 433 | bv (%r31) | ||
| 434 | .EXIT | ||
| 435 | xor $acc15,$s3,$s3 | ||
| 436 | .PROCEND | ||
| 437 | |||
| 438 | .ALIGN 64 | ||
| 439 | L\$AES_Te | ||
| 440 | .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d | ||
| 441 | .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 | ||
| 442 | .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d | ||
| 443 | .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a | ||
| 444 | .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 | ||
| 445 | .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b | ||
| 446 | .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea | ||
| 447 | .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b | ||
| 448 | .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a | ||
| 449 | .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f | ||
| 450 | .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 | ||
| 451 | .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f | ||
| 452 | .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e | ||
| 453 | .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 | ||
| 454 | .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d | ||
| 455 | .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f | ||
| 456 | .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e | ||
| 457 | .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb | ||
| 458 | .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce | ||
| 459 | .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 | ||
| 460 | .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c | ||
| 461 | .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed | ||
| 462 | .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b | ||
| 463 | .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a | ||
| 464 | .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 | ||
| 465 | .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 | ||
| 466 | .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 | ||
| 467 | .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 | ||
| 468 | .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a | ||
| 469 | .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 | ||
| 470 | .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 | ||
| 471 | .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d | ||
| 472 | .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f | ||
| 473 | .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 | ||
| 474 | .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 | ||
| 475 | .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 | ||
| 476 | .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f | ||
| 477 | .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 | ||
| 478 | .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c | ||
| 479 | .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 | ||
| 480 | .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e | ||
| 481 | .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 | ||
| 482 | .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 | ||
| 483 | .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b | ||
| 484 | .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 | ||
| 485 | .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 | ||
| 486 | .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 | ||
| 487 | .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 | ||
| 488 | .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 | ||
| 489 | .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 | ||
| 490 | .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 | ||
| 491 | .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 | ||
| 492 | .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa | ||
| 493 | .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 | ||
| 494 | .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 | ||
| 495 | .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 | ||
| 496 | .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 | ||
| 497 | .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 | ||
| 498 | .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 | ||
| 499 | .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a | ||
| 500 | .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 | ||
| 501 | .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 | ||
| 502 | .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 | ||
| 503 | .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a | ||
| 504 | .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | ||
| 505 | .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
| 506 | .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
| 507 | .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
| 508 | .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
| 509 | .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
| 510 | .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
| 511 | .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
| 512 | .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
| 513 | .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
| 514 | .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
| 515 | .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
| 516 | .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
| 517 | .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
| 518 | .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
| 519 | .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
| 520 | .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
| 521 | .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
| 522 | .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
| 523 | .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
| 524 | .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
| 525 | .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
| 526 | .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
| 527 | .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
| 528 | .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
| 529 | .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
| 530 | .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
| 531 | .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
| 532 | .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
| 533 | .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
| 534 | .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
| 535 | .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
| 536 | ___ | ||
| 537 | |||
| 538 | $code.=<<___; | ||
| 539 | .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 540 | .ALIGN 16 | ||
| 541 | AES_decrypt | ||
| 542 | .PROC | ||
| 543 | .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 | ||
| 544 | .ENTRY | ||
| 545 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 546 | $PUSHMA %r3,$FRAME(%sp) | ||
| 547 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 548 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 549 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 550 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 551 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 552 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 553 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 554 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 555 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
| 556 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
| 557 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
| 558 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
| 559 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
| 560 | $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) | ||
| 561 | $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) | ||
| 562 | |||
| 563 | blr %r0,$tbl | ||
| 564 | ldi 3,$t0 | ||
| 565 | L\$dec_pic | ||
| 566 | andcm $tbl,$t0,$tbl | ||
| 567 | ldo L\$AES_Td-L\$dec_pic($tbl),$tbl | ||
| 568 | |||
| 569 | and $inp,$t0,$t0 | ||
| 570 | sub $inp,$t0,$inp | ||
| 571 | ldw 0($inp),$s0 | ||
| 572 | ldw 4($inp),$s1 | ||
| 573 | ldw 8($inp),$s2 | ||
| 574 | comib,= 0,$t0,L\$dec_inp_aligned | ||
| 575 | ldw 12($inp),$s3 | ||
| 576 | |||
| 577 | sh3addl $t0,%r0,$t0 | ||
| 578 | subi 32,$t0,$t0 | ||
| 579 | mtctl $t0,%cr11 | ||
| 580 | ldw 16($inp),$t1 | ||
| 581 | vshd $s0,$s1,$s0 | ||
| 582 | vshd $s1,$s2,$s1 | ||
| 583 | vshd $s2,$s3,$s2 | ||
| 584 | vshd $s3,$t1,$s3 | ||
| 585 | |||
| 586 | L\$dec_inp_aligned | ||
| 587 | bl _parisc_AES_decrypt,%r31 | ||
| 588 | nop | ||
| 589 | |||
| 590 | extru,<> $out,31,2,%r0 | ||
| 591 | b L\$dec_out_aligned | ||
| 592 | nop | ||
| 593 | |||
| 594 | _srm $s0,24,$acc0 | ||
| 595 | _srm $s0,16,$acc1 | ||
| 596 | stb $acc0,0($out) | ||
| 597 | _srm $s0,8,$acc2 | ||
| 598 | stb $acc1,1($out) | ||
| 599 | _srm $s1,24,$acc4 | ||
| 600 | stb $acc2,2($out) | ||
| 601 | _srm $s1,16,$acc5 | ||
| 602 | stb $s0,3($out) | ||
| 603 | _srm $s1,8,$acc6 | ||
| 604 | stb $acc4,4($out) | ||
| 605 | _srm $s2,24,$acc0 | ||
| 606 | stb $acc5,5($out) | ||
| 607 | _srm $s2,16,$acc1 | ||
| 608 | stb $acc6,6($out) | ||
| 609 | _srm $s2,8,$acc2 | ||
| 610 | stb $s1,7($out) | ||
| 611 | _srm $s3,24,$acc4 | ||
| 612 | stb $acc0,8($out) | ||
| 613 | _srm $s3,16,$acc5 | ||
| 614 | stb $acc1,9($out) | ||
| 615 | _srm $s3,8,$acc6 | ||
| 616 | stb $acc2,10($out) | ||
| 617 | stb $s2,11($out) | ||
| 618 | stb $acc4,12($out) | ||
| 619 | stb $acc5,13($out) | ||
| 620 | stb $acc6,14($out) | ||
| 621 | b L\$dec_done | ||
| 622 | stb $s3,15($out) | ||
| 623 | |||
| 624 | L\$dec_out_aligned | ||
| 625 | stw $s0,0($out) | ||
| 626 | stw $s1,4($out) | ||
| 627 | stw $s2,8($out) | ||
| 628 | stw $s3,12($out) | ||
| 629 | |||
| 630 | L\$dec_done | ||
| 631 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 632 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 633 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 634 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 635 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 636 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 637 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 638 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 639 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 640 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
| 641 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
| 642 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
| 643 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
| 644 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
| 645 | $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 | ||
| 646 | $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 | ||
| 647 | bv (%r2) | ||
| 648 | .EXIT | ||
| 649 | $POPMB -$FRAME(%sp),%r3 | ||
| 650 | .PROCEND | ||
| 651 | |||
| 652 | .ALIGN 16 | ||
| 653 | _parisc_AES_decrypt | ||
| 654 | .PROC | ||
| 655 | .CALLINFO MILLICODE | ||
| 656 | .ENTRY | ||
| 657 | ldw 240($key),$rounds | ||
| 658 | ldw 0($key),$t0 | ||
| 659 | ldw 4($key),$t1 | ||
| 660 | ldw 8($key),$t2 | ||
| 661 | ldw 12($key),$t3 | ||
| 662 | _srm $rounds,1,$rounds | ||
| 663 | xor $t0,$s0,$s0 | ||
| 664 | ldw 16($key),$t0 | ||
| 665 | xor $t1,$s1,$s1 | ||
| 666 | ldw 20($key),$t1 | ||
| 667 | _srm $s0,24,$acc0 | ||
| 668 | xor $t2,$s2,$s2 | ||
| 669 | ldw 24($key),$t2 | ||
| 670 | xor $t3,$s3,$s3 | ||
| 671 | ldw 28($key),$t3 | ||
| 672 | _srm $s3,16,$acc1 | ||
| 673 | L\$dec_loop | ||
| 674 | _srm $s2,8,$acc2 | ||
| 675 | ldwx,s $acc0($tbl),$acc0 | ||
| 676 | _srm $s1,0,$acc3 | ||
| 677 | ldwx,s $acc1($tbl),$acc1 | ||
| 678 | _srm $s1,24,$acc4 | ||
| 679 | ldwx,s $acc2($tbl),$acc2 | ||
| 680 | _srm $s0,16,$acc5 | ||
| 681 | ldwx,s $acc3($tbl),$acc3 | ||
| 682 | _srm $s3,8,$acc6 | ||
| 683 | ldwx,s $acc4($tbl),$acc4 | ||
| 684 | _srm $s2,0,$acc7 | ||
| 685 | ldwx,s $acc5($tbl),$acc5 | ||
| 686 | _srm $s2,24,$acc8 | ||
| 687 | ldwx,s $acc6($tbl),$acc6 | ||
| 688 | _srm $s1,16,$acc9 | ||
| 689 | ldwx,s $acc7($tbl),$acc7 | ||
| 690 | _srm $s0,8,$acc10 | ||
| 691 | ldwx,s $acc8($tbl),$acc8 | ||
| 692 | _srm $s3,0,$acc11 | ||
| 693 | ldwx,s $acc9($tbl),$acc9 | ||
| 694 | _srm $s3,24,$acc12 | ||
| 695 | ldwx,s $acc10($tbl),$acc10 | ||
| 696 | _srm $s2,16,$acc13 | ||
| 697 | ldwx,s $acc11($tbl),$acc11 | ||
| 698 | _srm $s1,8,$acc14 | ||
| 699 | ldwx,s $acc12($tbl),$acc12 | ||
| 700 | _srm $s0,0,$acc15 | ||
| 701 | ldwx,s $acc13($tbl),$acc13 | ||
| 702 | ldwx,s $acc14($tbl),$acc14 | ||
| 703 | ldwx,s $acc15($tbl),$acc15 | ||
| 704 | addib,= -1,$rounds,L\$dec_last | ||
| 705 | ldo 32($key),$key | ||
| 706 | |||
| 707 | _ror $acc1,8,$acc1 | ||
| 708 | xor $acc0,$t0,$t0 | ||
| 709 | ldw 0($key),$s0 | ||
| 710 | _ror $acc2,16,$acc2 | ||
| 711 | xor $acc1,$t0,$t0 | ||
| 712 | ldw 4($key),$s1 | ||
| 713 | _ror $acc3,24,$acc3 | ||
| 714 | xor $acc2,$t0,$t0 | ||
| 715 | ldw 8($key),$s2 | ||
| 716 | _ror $acc5,8,$acc5 | ||
| 717 | xor $acc3,$t0,$t0 | ||
| 718 | ldw 12($key),$s3 | ||
| 719 | _ror $acc6,16,$acc6 | ||
| 720 | xor $acc4,$t1,$t1 | ||
| 721 | _ror $acc7,24,$acc7 | ||
| 722 | xor $acc5,$t1,$t1 | ||
| 723 | _ror $acc9,8,$acc9 | ||
| 724 | xor $acc6,$t1,$t1 | ||
| 725 | _ror $acc10,16,$acc10 | ||
| 726 | xor $acc7,$t1,$t1 | ||
| 727 | _ror $acc11,24,$acc11 | ||
| 728 | xor $acc8,$t2,$t2 | ||
| 729 | _ror $acc13,8,$acc13 | ||
| 730 | xor $acc9,$t2,$t2 | ||
| 731 | _ror $acc14,16,$acc14 | ||
| 732 | xor $acc10,$t2,$t2 | ||
| 733 | _ror $acc15,24,$acc15 | ||
| 734 | xor $acc11,$t2,$t2 | ||
| 735 | xor $acc12,$acc14,$acc14 | ||
| 736 | xor $acc13,$t3,$t3 | ||
| 737 | _srm $t0,24,$acc0 | ||
| 738 | xor $acc14,$t3,$t3 | ||
| 739 | xor $acc15,$t3,$t3 | ||
| 740 | _srm $t3,16,$acc1 | ||
| 741 | |||
| 742 | _srm $t2,8,$acc2 | ||
| 743 | ldwx,s $acc0($tbl),$acc0 | ||
| 744 | _srm $t1,0,$acc3 | ||
| 745 | ldwx,s $acc1($tbl),$acc1 | ||
| 746 | _srm $t1,24,$acc4 | ||
| 747 | ldwx,s $acc2($tbl),$acc2 | ||
| 748 | _srm $t0,16,$acc5 | ||
| 749 | ldwx,s $acc3($tbl),$acc3 | ||
| 750 | _srm $t3,8,$acc6 | ||
| 751 | ldwx,s $acc4($tbl),$acc4 | ||
| 752 | _srm $t2,0,$acc7 | ||
| 753 | ldwx,s $acc5($tbl),$acc5 | ||
| 754 | _srm $t2,24,$acc8 | ||
| 755 | ldwx,s $acc6($tbl),$acc6 | ||
| 756 | _srm $t1,16,$acc9 | ||
| 757 | ldwx,s $acc7($tbl),$acc7 | ||
| 758 | _srm $t0,8,$acc10 | ||
| 759 | ldwx,s $acc8($tbl),$acc8 | ||
| 760 | _srm $t3,0,$acc11 | ||
| 761 | ldwx,s $acc9($tbl),$acc9 | ||
| 762 | _srm $t3,24,$acc12 | ||
| 763 | ldwx,s $acc10($tbl),$acc10 | ||
| 764 | _srm $t2,16,$acc13 | ||
| 765 | ldwx,s $acc11($tbl),$acc11 | ||
| 766 | _srm $t1,8,$acc14 | ||
| 767 | ldwx,s $acc12($tbl),$acc12 | ||
| 768 | _srm $t0,0,$acc15 | ||
| 769 | ldwx,s $acc13($tbl),$acc13 | ||
| 770 | _ror $acc1,8,$acc1 | ||
| 771 | ldwx,s $acc14($tbl),$acc14 | ||
| 772 | |||
| 773 | _ror $acc2,16,$acc2 | ||
| 774 | xor $acc0,$s0,$s0 | ||
| 775 | ldwx,s $acc15($tbl),$acc15 | ||
| 776 | _ror $acc3,24,$acc3 | ||
| 777 | xor $acc1,$s0,$s0 | ||
| 778 | ldw 16($key),$t0 | ||
| 779 | _ror $acc5,8,$acc5 | ||
| 780 | xor $acc2,$s0,$s0 | ||
| 781 | ldw 20($key),$t1 | ||
| 782 | _ror $acc6,16,$acc6 | ||
| 783 | xor $acc3,$s0,$s0 | ||
| 784 | ldw 24($key),$t2 | ||
| 785 | _ror $acc7,24,$acc7 | ||
| 786 | xor $acc4,$s1,$s1 | ||
| 787 | ldw 28($key),$t3 | ||
| 788 | _ror $acc9,8,$acc9 | ||
| 789 | xor $acc5,$s1,$s1 | ||
| 790 | ldw 1024+0($tbl),%r0 ; prefetch td4 | ||
| 791 | _ror $acc10,16,$acc10 | ||
| 792 | xor $acc6,$s1,$s1 | ||
| 793 | ldw 1024+32($tbl),%r0 ; prefetch td4 | ||
| 794 | _ror $acc11,24,$acc11 | ||
| 795 | xor $acc7,$s1,$s1 | ||
| 796 | ldw 1024+64($tbl),%r0 ; prefetch td4 | ||
| 797 | _ror $acc13,8,$acc13 | ||
| 798 | xor $acc8,$s2,$s2 | ||
| 799 | ldw 1024+96($tbl),%r0 ; prefetch td4 | ||
| 800 | _ror $acc14,16,$acc14 | ||
| 801 | xor $acc9,$s2,$s2 | ||
| 802 | ldw 1024+128($tbl),%r0 ; prefetch td4 | ||
| 803 | _ror $acc15,24,$acc15 | ||
| 804 | xor $acc10,$s2,$s2 | ||
| 805 | ldw 1024+160($tbl),%r0 ; prefetch td4 | ||
| 806 | _srm $s0,24,$acc0 | ||
| 807 | xor $acc11,$s2,$s2 | ||
| 808 | ldw 1024+192($tbl),%r0 ; prefetch td4 | ||
| 809 | xor $acc12,$acc14,$acc14 | ||
| 810 | xor $acc13,$s3,$s3 | ||
| 811 | ldw 1024+224($tbl),%r0 ; prefetch td4 | ||
| 812 | xor $acc14,$s3,$s3 | ||
| 813 | xor $acc15,$s3,$s3 | ||
| 814 | b L\$dec_loop | ||
| 815 | _srm $s3,16,$acc1 | ||
| 816 | |||
| 817 | .ALIGN 16 | ||
| 818 | L\$dec_last | ||
| 819 | ldo 1024($tbl),$rounds | ||
| 820 | _ror $acc1,8,$acc1 | ||
| 821 | xor $acc0,$t0,$t0 | ||
| 822 | ldw 0($key),$s0 | ||
| 823 | _ror $acc2,16,$acc2 | ||
| 824 | xor $acc1,$t0,$t0 | ||
| 825 | ldw 4($key),$s1 | ||
| 826 | _ror $acc3,24,$acc3 | ||
| 827 | xor $acc2,$t0,$t0 | ||
| 828 | ldw 8($key),$s2 | ||
| 829 | _ror $acc5,8,$acc5 | ||
| 830 | xor $acc3,$t0,$t0 | ||
| 831 | ldw 12($key),$s3 | ||
| 832 | _ror $acc6,16,$acc6 | ||
| 833 | xor $acc4,$t1,$t1 | ||
| 834 | _ror $acc7,24,$acc7 | ||
| 835 | xor $acc5,$t1,$t1 | ||
| 836 | _ror $acc9,8,$acc9 | ||
| 837 | xor $acc6,$t1,$t1 | ||
| 838 | _ror $acc10,16,$acc10 | ||
| 839 | xor $acc7,$t1,$t1 | ||
| 840 | _ror $acc11,24,$acc11 | ||
| 841 | xor $acc8,$t2,$t2 | ||
| 842 | _ror $acc13,8,$acc13 | ||
| 843 | xor $acc9,$t2,$t2 | ||
| 844 | _ror $acc14,16,$acc14 | ||
| 845 | xor $acc10,$t2,$t2 | ||
| 846 | _ror $acc15,24,$acc15 | ||
| 847 | xor $acc11,$t2,$t2 | ||
| 848 | xor $acc12,$acc14,$acc14 | ||
| 849 | xor $acc13,$t3,$t3 | ||
| 850 | _srm $t0,24,$acc0 | ||
| 851 | xor $acc14,$t3,$t3 | ||
| 852 | xor $acc15,$t3,$t3 | ||
| 853 | _srm $t3,16,$acc1 | ||
| 854 | |||
| 855 | _srm $t2,8,$acc2 | ||
| 856 | ldbx $acc0($rounds),$acc0 | ||
| 857 | _srm $t1,24,$acc4 | ||
| 858 | ldbx $acc1($rounds),$acc1 | ||
| 859 | _srm $t0,16,$acc5 | ||
| 860 | _srm $t1,0,$acc3 | ||
| 861 | ldbx $acc2($rounds),$acc2 | ||
| 862 | ldbx $acc3($rounds),$acc3 | ||
| 863 | _srm $t3,8,$acc6 | ||
| 864 | ldbx $acc4($rounds),$acc4 | ||
| 865 | _srm $t2,24,$acc8 | ||
| 866 | ldbx $acc5($rounds),$acc5 | ||
| 867 | _srm $t1,16,$acc9 | ||
| 868 | _srm $t2,0,$acc7 | ||
| 869 | ldbx $acc6($rounds),$acc6 | ||
| 870 | ldbx $acc7($rounds),$acc7 | ||
| 871 | _srm $t0,8,$acc10 | ||
| 872 | ldbx $acc8($rounds),$acc8 | ||
| 873 | _srm $t3,24,$acc12 | ||
| 874 | ldbx $acc9($rounds),$acc9 | ||
| 875 | _srm $t2,16,$acc13 | ||
| 876 | _srm $t3,0,$acc11 | ||
| 877 | ldbx $acc10($rounds),$acc10 | ||
| 878 | _srm $t1,8,$acc14 | ||
| 879 | ldbx $acc11($rounds),$acc11 | ||
| 880 | ldbx $acc12($rounds),$acc12 | ||
| 881 | ldbx $acc13($rounds),$acc13 | ||
| 882 | _srm $t0,0,$acc15 | ||
| 883 | ldbx $acc14($rounds),$acc14 | ||
| 884 | |||
| 885 | dep $acc0,7,8,$acc3 | ||
| 886 | ldbx $acc15($rounds),$acc15 | ||
| 887 | dep $acc4,7,8,$acc7 | ||
| 888 | dep $acc1,15,8,$acc3 | ||
| 889 | dep $acc5,15,8,$acc7 | ||
| 890 | dep $acc2,23,8,$acc3 | ||
| 891 | dep $acc6,23,8,$acc7 | ||
| 892 | xor $acc3,$s0,$s0 | ||
| 893 | xor $acc7,$s1,$s1 | ||
| 894 | dep $acc8,7,8,$acc11 | ||
| 895 | dep $acc12,7,8,$acc15 | ||
| 896 | dep $acc9,15,8,$acc11 | ||
| 897 | dep $acc13,15,8,$acc15 | ||
| 898 | dep $acc10,23,8,$acc11 | ||
| 899 | dep $acc14,23,8,$acc15 | ||
| 900 | xor $acc11,$s2,$s2 | ||
| 901 | |||
| 902 | bv (%r31) | ||
| 903 | .EXIT | ||
| 904 | xor $acc15,$s3,$s3 | ||
| 905 | .PROCEND | ||
| 906 | |||
| 907 | .ALIGN 64 | ||
| 908 | L\$AES_Td | ||
| 909 | .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 | ||
| 910 | .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 | ||
| 911 | .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 | ||
| 912 | .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f | ||
| 913 | .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 | ||
| 914 | .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 | ||
| 915 | .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da | ||
| 916 | .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 | ||
| 917 | .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd | ||
| 918 | .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 | ||
| 919 | .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 | ||
| 920 | .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 | ||
| 921 | .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 | ||
| 922 | .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a | ||
| 923 | .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 | ||
| 924 | .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c | ||
| 925 | .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 | ||
| 926 | .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a | ||
| 927 | .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 | ||
| 928 | .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 | ||
| 929 | .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 | ||
| 930 | .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff | ||
| 931 | .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 | ||
| 932 | .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb | ||
| 933 | .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 | ||
| 934 | .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e | ||
| 935 | .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 | ||
| 936 | .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a | ||
| 937 | .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e | ||
| 938 | .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 | ||
| 939 | .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d | ||
| 940 | .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 | ||
| 941 | .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd | ||
| 942 | .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 | ||
| 943 | .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 | ||
| 944 | .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 | ||
| 945 | .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d | ||
| 946 | .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 | ||
| 947 | .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 | ||
| 948 | .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef | ||
| 949 | .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 | ||
| 950 | .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 | ||
| 951 | .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 | ||
| 952 | .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 | ||
| 953 | .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 | ||
| 954 | .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b | ||
| 955 | .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 | ||
| 956 | .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 | ||
| 957 | .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 | ||
| 958 | .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 | ||
| 959 | .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 | ||
| 960 | .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f | ||
| 961 | .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df | ||
| 962 | .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f | ||
| 963 | .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e | ||
| 964 | .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 | ||
| 965 | .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 | ||
| 966 | .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c | ||
| 967 | .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf | ||
| 968 | .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 | ||
| 969 | .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f | ||
| 970 | .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 | ||
| 971 | .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 | ||
| 972 | .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 | ||
| 973 | .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | ||
| 974 | .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
| 975 | .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
| 976 | .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
| 977 | .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
| 978 | .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
| 979 | .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
| 980 | .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
| 981 | .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
| 982 | .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
| 983 | .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
| 984 | .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
| 985 | .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
| 986 | .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
| 987 | .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
| 988 | .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
| 989 | .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
| 990 | .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
| 991 | .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
| 992 | .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
| 993 | .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
| 994 | .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
| 995 | .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
| 996 | .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
| 997 | .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
| 998 | .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
| 999 | .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
| 1000 | .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
| 1001 | .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
| 1002 | .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
| 1003 | .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
| 1004 | .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
| 1005 | .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 1006 | ___ | ||
| 1007 | |||
| 1008 | foreach (split("\n",$code)) { | ||
| 1009 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 1010 | |||
| 1011 | # translate made up instructons: _ror, _srm | ||
| 1012 | s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or | ||
| 1013 | |||
| 1014 | s/_srm(\s+%r[0-9]+),([0-9]+),/ | ||
| 1015 | $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2) | ||
| 1016 | : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e; | ||
| 1017 | |||
| 1018 | s/,\*/,/ if ($SIZE_T==4); | ||
| 1019 | print $_,"\n"; | ||
| 1020 | } | ||
| 1021 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl index f82c5e1814..7c52cbe5f9 100644 --- a/src/lib/libcrypto/aes/asm/aes-ppc.pl +++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | 7 | # details see http://www.openssl.org/~appro/cryptogams/. |
| 8 | # ==================================================================== | 8 | # ==================================================================== |
| 9 | 9 | ||
| 10 | # Needs more work: key setup, page boundaries, CBC routine... | 10 | # Needs more work: key setup, CBC routine... |
| 11 | # | 11 | # |
| 12 | # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with | 12 | # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with |
| 13 | # 128-bit key, which is ~40% better than 64-bit code generated by gcc | 13 | # 128-bit key, which is ~40% better than 64-bit code generated by gcc |
| @@ -18,7 +18,7 @@ | |||
| 18 | 18 | ||
| 19 | # February 2010 | 19 | # February 2010 |
| 20 | # | 20 | # |
| 21 | # Rescheduling instructions to favour Power6 pipeline gives 10% | 21 | # Rescheduling instructions to favour Power6 pipeline gave 10% |
| 22 | # performance improvement on the platfrom in question (and marginal | 22 | # performance improvement on the platfrom in question (and marginal |
| 23 | # improvement even on others). It should be noted that Power6 fails | 23 | # improvement even on others). It should be noted that Power6 fails |
| 24 | # to process byte in 18 cycles, only in 23, because it fails to issue | 24 | # to process byte in 18 cycles, only in 23, because it fails to issue |
| @@ -33,11 +33,13 @@ $flavour = shift; | |||
| 33 | 33 | ||
| 34 | if ($flavour =~ /64/) { | 34 | if ($flavour =~ /64/) { |
| 35 | $SIZE_T =8; | 35 | $SIZE_T =8; |
| 36 | $LRSAVE =2*$SIZE_T; | ||
| 36 | $STU ="stdu"; | 37 | $STU ="stdu"; |
| 37 | $POP ="ld"; | 38 | $POP ="ld"; |
| 38 | $PUSH ="std"; | 39 | $PUSH ="std"; |
| 39 | } elsif ($flavour =~ /32/) { | 40 | } elsif ($flavour =~ /32/) { |
| 40 | $SIZE_T =4; | 41 | $SIZE_T =4; |
| 42 | $LRSAVE =$SIZE_T; | ||
| 41 | $STU ="stwu"; | 43 | $STU ="stwu"; |
| 42 | $POP ="lwz"; | 44 | $POP ="lwz"; |
| 43 | $PUSH ="stw"; | 45 | $PUSH ="stw"; |
| @@ -116,15 +118,19 @@ LAES_Te: | |||
| 116 | addi $Tbl0,$Tbl0,`128-8` | 118 | addi $Tbl0,$Tbl0,`128-8` |
| 117 | mtlr r0 | 119 | mtlr r0 |
| 118 | blr | 120 | blr |
| 119 | .space `32-24` | 121 | .long 0 |
| 122 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 123 | .space `64-9*4` | ||
| 120 | LAES_Td: | 124 | LAES_Td: |
| 121 | mflr r0 | 125 | mflr r0 |
| 122 | bcl 20,31,\$+4 | 126 | bcl 20,31,\$+4 |
| 123 | mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry | 127 | mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry |
| 124 | addi $Tbl0,$Tbl0,`128-8-32+2048+256` | 128 | addi $Tbl0,$Tbl0,`128-64-8+2048+256` |
| 125 | mtlr r0 | 129 | mtlr r0 |
| 126 | blr | 130 | blr |
| 127 | .space `128-32-24` | 131 | .long 0 |
| 132 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 133 | .space `128-64-9*4` | ||
| 128 | ___ | 134 | ___ |
| 129 | &_data_word( | 135 | &_data_word( |
| 130 | 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, | 136 | 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, |
| @@ -328,10 +334,9 @@ $code.=<<___; | |||
| 328 | .globl .AES_encrypt | 334 | .globl .AES_encrypt |
| 329 | .align 7 | 335 | .align 7 |
| 330 | .AES_encrypt: | 336 | .AES_encrypt: |
| 331 | mflr r0 | ||
| 332 | $STU $sp,-$FRAME($sp) | 337 | $STU $sp,-$FRAME($sp) |
| 338 | mflr r0 | ||
| 333 | 339 | ||
| 334 | $PUSH r0,`$FRAME-$SIZE_T*21`($sp) | ||
| 335 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | 340 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) |
| 336 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | 341 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) |
| 337 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | 342 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
| @@ -352,7 +357,14 @@ $code.=<<___; | |||
| 352 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 357 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 353 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 358 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 354 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 359 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 360 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
| 361 | |||
| 362 | andi. $t0,$inp,3 | ||
| 363 | andi. $t1,$out,3 | ||
| 364 | or. $t0,$t0,$t1 | ||
| 365 | bne Lenc_unaligned | ||
| 355 | 366 | ||
| 367 | Lenc_unaligned_ok: | ||
| 356 | lwz $s0,0($inp) | 368 | lwz $s0,0($inp) |
| 357 | lwz $s1,4($inp) | 369 | lwz $s1,4($inp) |
| 358 | lwz $s2,8($inp) | 370 | lwz $s2,8($inp) |
| @@ -363,8 +375,80 @@ $code.=<<___; | |||
| 363 | stw $s1,4($out) | 375 | stw $s1,4($out) |
| 364 | stw $s2,8($out) | 376 | stw $s2,8($out) |
| 365 | stw $s3,12($out) | 377 | stw $s3,12($out) |
| 378 | b Lenc_done | ||
| 379 | |||
| 380 | Lenc_unaligned: | ||
| 381 | subfic $t0,$inp,4096 | ||
| 382 | subfic $t1,$out,4096 | ||
| 383 | andi. $t0,$t0,4096-16 | ||
| 384 | beq Lenc_xpage | ||
| 385 | andi. $t1,$t1,4096-16 | ||
| 386 | bne Lenc_unaligned_ok | ||
| 387 | |||
| 388 | Lenc_xpage: | ||
| 389 | lbz $acc00,0($inp) | ||
| 390 | lbz $acc01,1($inp) | ||
| 391 | lbz $acc02,2($inp) | ||
| 392 | lbz $s0,3($inp) | ||
| 393 | lbz $acc04,4($inp) | ||
| 394 | lbz $acc05,5($inp) | ||
| 395 | lbz $acc06,6($inp) | ||
| 396 | lbz $s1,7($inp) | ||
| 397 | lbz $acc08,8($inp) | ||
| 398 | lbz $acc09,9($inp) | ||
| 399 | lbz $acc10,10($inp) | ||
| 400 | insrwi $s0,$acc00,8,0 | ||
| 401 | lbz $s2,11($inp) | ||
| 402 | insrwi $s1,$acc04,8,0 | ||
| 403 | lbz $acc12,12($inp) | ||
| 404 | insrwi $s0,$acc01,8,8 | ||
| 405 | lbz $acc13,13($inp) | ||
| 406 | insrwi $s1,$acc05,8,8 | ||
| 407 | lbz $acc14,14($inp) | ||
| 408 | insrwi $s0,$acc02,8,16 | ||
| 409 | lbz $s3,15($inp) | ||
| 410 | insrwi $s1,$acc06,8,16 | ||
| 411 | insrwi $s2,$acc08,8,0 | ||
| 412 | insrwi $s3,$acc12,8,0 | ||
| 413 | insrwi $s2,$acc09,8,8 | ||
| 414 | insrwi $s3,$acc13,8,8 | ||
| 415 | insrwi $s2,$acc10,8,16 | ||
| 416 | insrwi $s3,$acc14,8,16 | ||
| 417 | |||
| 418 | bl LAES_Te | ||
| 419 | bl Lppc_AES_encrypt_compact | ||
| 420 | |||
| 421 | extrwi $acc00,$s0,8,0 | ||
| 422 | extrwi $acc01,$s0,8,8 | ||
| 423 | stb $acc00,0($out) | ||
| 424 | extrwi $acc02,$s0,8,16 | ||
| 425 | stb $acc01,1($out) | ||
| 426 | stb $acc02,2($out) | ||
| 427 | extrwi $acc04,$s1,8,0 | ||
| 428 | stb $s0,3($out) | ||
| 429 | extrwi $acc05,$s1,8,8 | ||
| 430 | stb $acc04,4($out) | ||
| 431 | extrwi $acc06,$s1,8,16 | ||
| 432 | stb $acc05,5($out) | ||
| 433 | stb $acc06,6($out) | ||
| 434 | extrwi $acc08,$s2,8,0 | ||
| 435 | stb $s1,7($out) | ||
| 436 | extrwi $acc09,$s2,8,8 | ||
| 437 | stb $acc08,8($out) | ||
| 438 | extrwi $acc10,$s2,8,16 | ||
| 439 | stb $acc09,9($out) | ||
| 440 | stb $acc10,10($out) | ||
| 441 | extrwi $acc12,$s3,8,0 | ||
| 442 | stb $s2,11($out) | ||
| 443 | extrwi $acc13,$s3,8,8 | ||
| 444 | stb $acc12,12($out) | ||
| 445 | extrwi $acc14,$s3,8,16 | ||
| 446 | stb $acc13,13($out) | ||
| 447 | stb $acc14,14($out) | ||
| 448 | stb $s3,15($out) | ||
| 366 | 449 | ||
| 367 | $POP r0,`$FRAME-$SIZE_T*21`($sp) | 450 | Lenc_done: |
| 451 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
| 368 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | 452 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) |
| 369 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | 453 | $POP r13,`$FRAME-$SIZE_T*19`($sp) |
| 370 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | 454 | $POP r14,`$FRAME-$SIZE_T*18`($sp) |
| @@ -388,18 +472,21 @@ $code.=<<___; | |||
| 388 | mtlr r0 | 472 | mtlr r0 |
| 389 | addi $sp,$sp,$FRAME | 473 | addi $sp,$sp,$FRAME |
| 390 | blr | 474 | blr |
| 475 | .long 0 | ||
| 476 | .byte 0,12,4,1,0x80,18,3,0 | ||
| 477 | .long 0 | ||
| 391 | 478 | ||
| 392 | .align 5 | 479 | .align 5 |
| 393 | Lppc_AES_encrypt: | 480 | Lppc_AES_encrypt: |
| 394 | lwz $acc00,240($key) | 481 | lwz $acc00,240($key) |
| 395 | lwz $t0,0($key) | ||
| 396 | lwz $t1,4($key) | ||
| 397 | lwz $t2,8($key) | ||
| 398 | lwz $t3,12($key) | ||
| 399 | addi $Tbl1,$Tbl0,3 | 482 | addi $Tbl1,$Tbl0,3 |
| 483 | lwz $t0,0($key) | ||
| 400 | addi $Tbl2,$Tbl0,2 | 484 | addi $Tbl2,$Tbl0,2 |
| 485 | lwz $t1,4($key) | ||
| 401 | addi $Tbl3,$Tbl0,1 | 486 | addi $Tbl3,$Tbl0,1 |
| 487 | lwz $t2,8($key) | ||
| 402 | addi $acc00,$acc00,-1 | 488 | addi $acc00,$acc00,-1 |
| 489 | lwz $t3,12($key) | ||
| 403 | addi $key,$key,16 | 490 | addi $key,$key,16 |
| 404 | xor $s0,$s0,$t0 | 491 | xor $s0,$s0,$t0 |
| 405 | xor $s1,$s1,$t1 | 492 | xor $s1,$s1,$t1 |
| @@ -413,44 +500,44 @@ Lenc_loop: | |||
| 413 | rlwinm $acc02,$s2,`32-24+3`,21,28 | 500 | rlwinm $acc02,$s2,`32-24+3`,21,28 |
| 414 | rlwinm $acc03,$s3,`32-24+3`,21,28 | 501 | rlwinm $acc03,$s3,`32-24+3`,21,28 |
| 415 | lwz $t0,0($key) | 502 | lwz $t0,0($key) |
| 416 | lwz $t1,4($key) | ||
| 417 | rlwinm $acc04,$s1,`32-16+3`,21,28 | 503 | rlwinm $acc04,$s1,`32-16+3`,21,28 |
| 504 | lwz $t1,4($key) | ||
| 418 | rlwinm $acc05,$s2,`32-16+3`,21,28 | 505 | rlwinm $acc05,$s2,`32-16+3`,21,28 |
| 419 | lwz $t2,8($key) | 506 | lwz $t2,8($key) |
| 420 | lwz $t3,12($key) | ||
| 421 | rlwinm $acc06,$s3,`32-16+3`,21,28 | 507 | rlwinm $acc06,$s3,`32-16+3`,21,28 |
| 508 | lwz $t3,12($key) | ||
| 422 | rlwinm $acc07,$s0,`32-16+3`,21,28 | 509 | rlwinm $acc07,$s0,`32-16+3`,21,28 |
| 423 | lwzx $acc00,$Tbl0,$acc00 | 510 | lwzx $acc00,$Tbl0,$acc00 |
| 424 | lwzx $acc01,$Tbl0,$acc01 | ||
| 425 | rlwinm $acc08,$s2,`32-8+3`,21,28 | 511 | rlwinm $acc08,$s2,`32-8+3`,21,28 |
| 512 | lwzx $acc01,$Tbl0,$acc01 | ||
| 426 | rlwinm $acc09,$s3,`32-8+3`,21,28 | 513 | rlwinm $acc09,$s3,`32-8+3`,21,28 |
| 427 | lwzx $acc02,$Tbl0,$acc02 | 514 | lwzx $acc02,$Tbl0,$acc02 |
| 428 | lwzx $acc03,$Tbl0,$acc03 | ||
| 429 | rlwinm $acc10,$s0,`32-8+3`,21,28 | 515 | rlwinm $acc10,$s0,`32-8+3`,21,28 |
| 516 | lwzx $acc03,$Tbl0,$acc03 | ||
| 430 | rlwinm $acc11,$s1,`32-8+3`,21,28 | 517 | rlwinm $acc11,$s1,`32-8+3`,21,28 |
| 431 | lwzx $acc04,$Tbl1,$acc04 | 518 | lwzx $acc04,$Tbl1,$acc04 |
| 432 | lwzx $acc05,$Tbl1,$acc05 | ||
| 433 | rlwinm $acc12,$s3,`0+3`,21,28 | 519 | rlwinm $acc12,$s3,`0+3`,21,28 |
| 520 | lwzx $acc05,$Tbl1,$acc05 | ||
| 434 | rlwinm $acc13,$s0,`0+3`,21,28 | 521 | rlwinm $acc13,$s0,`0+3`,21,28 |
| 435 | lwzx $acc06,$Tbl1,$acc06 | 522 | lwzx $acc06,$Tbl1,$acc06 |
| 436 | lwzx $acc07,$Tbl1,$acc07 | ||
| 437 | rlwinm $acc14,$s1,`0+3`,21,28 | 523 | rlwinm $acc14,$s1,`0+3`,21,28 |
| 524 | lwzx $acc07,$Tbl1,$acc07 | ||
| 438 | rlwinm $acc15,$s2,`0+3`,21,28 | 525 | rlwinm $acc15,$s2,`0+3`,21,28 |
| 439 | lwzx $acc08,$Tbl2,$acc08 | 526 | lwzx $acc08,$Tbl2,$acc08 |
| 440 | lwzx $acc09,$Tbl2,$acc09 | ||
| 441 | xor $t0,$t0,$acc00 | 527 | xor $t0,$t0,$acc00 |
| 528 | lwzx $acc09,$Tbl2,$acc09 | ||
| 442 | xor $t1,$t1,$acc01 | 529 | xor $t1,$t1,$acc01 |
| 443 | lwzx $acc10,$Tbl2,$acc10 | 530 | lwzx $acc10,$Tbl2,$acc10 |
| 444 | lwzx $acc11,$Tbl2,$acc11 | ||
| 445 | xor $t2,$t2,$acc02 | 531 | xor $t2,$t2,$acc02 |
| 532 | lwzx $acc11,$Tbl2,$acc11 | ||
| 446 | xor $t3,$t3,$acc03 | 533 | xor $t3,$t3,$acc03 |
| 447 | lwzx $acc12,$Tbl3,$acc12 | 534 | lwzx $acc12,$Tbl3,$acc12 |
| 448 | lwzx $acc13,$Tbl3,$acc13 | ||
| 449 | xor $t0,$t0,$acc04 | 535 | xor $t0,$t0,$acc04 |
| 536 | lwzx $acc13,$Tbl3,$acc13 | ||
| 450 | xor $t1,$t1,$acc05 | 537 | xor $t1,$t1,$acc05 |
| 451 | lwzx $acc14,$Tbl3,$acc14 | 538 | lwzx $acc14,$Tbl3,$acc14 |
| 452 | lwzx $acc15,$Tbl3,$acc15 | ||
| 453 | xor $t2,$t2,$acc06 | 539 | xor $t2,$t2,$acc06 |
| 540 | lwzx $acc15,$Tbl3,$acc15 | ||
| 454 | xor $t3,$t3,$acc07 | 541 | xor $t3,$t3,$acc07 |
| 455 | xor $t0,$t0,$acc08 | 542 | xor $t0,$t0,$acc08 |
| 456 | xor $t1,$t1,$acc09 | 543 | xor $t1,$t1,$acc09 |
| @@ -466,60 +553,60 @@ Lenc_loop: | |||
| 466 | addi $Tbl2,$Tbl0,2048 | 553 | addi $Tbl2,$Tbl0,2048 |
| 467 | nop | 554 | nop |
| 468 | lwz $t0,0($key) | 555 | lwz $t0,0($key) |
| 469 | lwz $t1,4($key) | ||
| 470 | rlwinm $acc00,$s0,`32-24`,24,31 | 556 | rlwinm $acc00,$s0,`32-24`,24,31 |
| 557 | lwz $t1,4($key) | ||
| 471 | rlwinm $acc01,$s1,`32-24`,24,31 | 558 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 472 | lwz $t2,8($key) | 559 | lwz $t2,8($key) |
| 473 | lwz $t3,12($key) | ||
| 474 | rlwinm $acc02,$s2,`32-24`,24,31 | 560 | rlwinm $acc02,$s2,`32-24`,24,31 |
| 561 | lwz $t3,12($key) | ||
| 475 | rlwinm $acc03,$s3,`32-24`,24,31 | 562 | rlwinm $acc03,$s3,`32-24`,24,31 |
| 476 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 | 563 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 |
| 477 | lwz $acc09,`2048+32`($Tbl0) | ||
| 478 | rlwinm $acc04,$s1,`32-16`,24,31 | 564 | rlwinm $acc04,$s1,`32-16`,24,31 |
| 565 | lwz $acc09,`2048+32`($Tbl0) | ||
| 479 | rlwinm $acc05,$s2,`32-16`,24,31 | 566 | rlwinm $acc05,$s2,`32-16`,24,31 |
| 480 | lwz $acc10,`2048+64`($Tbl0) | 567 | lwz $acc10,`2048+64`($Tbl0) |
| 481 | lwz $acc11,`2048+96`($Tbl0) | ||
| 482 | rlwinm $acc06,$s3,`32-16`,24,31 | 568 | rlwinm $acc06,$s3,`32-16`,24,31 |
| 569 | lwz $acc11,`2048+96`($Tbl0) | ||
| 483 | rlwinm $acc07,$s0,`32-16`,24,31 | 570 | rlwinm $acc07,$s0,`32-16`,24,31 |
| 484 | lwz $acc12,`2048+128`($Tbl0) | 571 | lwz $acc12,`2048+128`($Tbl0) |
| 485 | lwz $acc13,`2048+160`($Tbl0) | ||
| 486 | rlwinm $acc08,$s2,`32-8`,24,31 | 572 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 573 | lwz $acc13,`2048+160`($Tbl0) | ||
| 487 | rlwinm $acc09,$s3,`32-8`,24,31 | 574 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 488 | lwz $acc14,`2048+192`($Tbl0) | 575 | lwz $acc14,`2048+192`($Tbl0) |
| 489 | lwz $acc15,`2048+224`($Tbl0) | ||
| 490 | rlwinm $acc10,$s0,`32-8`,24,31 | 576 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 577 | lwz $acc15,`2048+224`($Tbl0) | ||
| 491 | rlwinm $acc11,$s1,`32-8`,24,31 | 578 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 492 | lbzx $acc00,$Tbl2,$acc00 | 579 | lbzx $acc00,$Tbl2,$acc00 |
| 493 | lbzx $acc01,$Tbl2,$acc01 | ||
| 494 | rlwinm $acc12,$s3,`0`,24,31 | 580 | rlwinm $acc12,$s3,`0`,24,31 |
| 581 | lbzx $acc01,$Tbl2,$acc01 | ||
| 495 | rlwinm $acc13,$s0,`0`,24,31 | 582 | rlwinm $acc13,$s0,`0`,24,31 |
| 496 | lbzx $acc02,$Tbl2,$acc02 | 583 | lbzx $acc02,$Tbl2,$acc02 |
| 497 | lbzx $acc03,$Tbl2,$acc03 | ||
| 498 | rlwinm $acc14,$s1,`0`,24,31 | 584 | rlwinm $acc14,$s1,`0`,24,31 |
| 585 | lbzx $acc03,$Tbl2,$acc03 | ||
| 499 | rlwinm $acc15,$s2,`0`,24,31 | 586 | rlwinm $acc15,$s2,`0`,24,31 |
| 500 | lbzx $acc04,$Tbl2,$acc04 | 587 | lbzx $acc04,$Tbl2,$acc04 |
| 501 | lbzx $acc05,$Tbl2,$acc05 | ||
| 502 | rlwinm $s0,$acc00,24,0,7 | 588 | rlwinm $s0,$acc00,24,0,7 |
| 589 | lbzx $acc05,$Tbl2,$acc05 | ||
| 503 | rlwinm $s1,$acc01,24,0,7 | 590 | rlwinm $s1,$acc01,24,0,7 |
| 504 | lbzx $acc06,$Tbl2,$acc06 | 591 | lbzx $acc06,$Tbl2,$acc06 |
| 505 | lbzx $acc07,$Tbl2,$acc07 | ||
| 506 | rlwinm $s2,$acc02,24,0,7 | 592 | rlwinm $s2,$acc02,24,0,7 |
| 593 | lbzx $acc07,$Tbl2,$acc07 | ||
| 507 | rlwinm $s3,$acc03,24,0,7 | 594 | rlwinm $s3,$acc03,24,0,7 |
| 508 | lbzx $acc08,$Tbl2,$acc08 | 595 | lbzx $acc08,$Tbl2,$acc08 |
| 509 | lbzx $acc09,$Tbl2,$acc09 | ||
| 510 | rlwimi $s0,$acc04,16,8,15 | 596 | rlwimi $s0,$acc04,16,8,15 |
| 597 | lbzx $acc09,$Tbl2,$acc09 | ||
| 511 | rlwimi $s1,$acc05,16,8,15 | 598 | rlwimi $s1,$acc05,16,8,15 |
| 512 | lbzx $acc10,$Tbl2,$acc10 | 599 | lbzx $acc10,$Tbl2,$acc10 |
| 513 | lbzx $acc11,$Tbl2,$acc11 | ||
| 514 | rlwimi $s2,$acc06,16,8,15 | 600 | rlwimi $s2,$acc06,16,8,15 |
| 601 | lbzx $acc11,$Tbl2,$acc11 | ||
| 515 | rlwimi $s3,$acc07,16,8,15 | 602 | rlwimi $s3,$acc07,16,8,15 |
| 516 | lbzx $acc12,$Tbl2,$acc12 | 603 | lbzx $acc12,$Tbl2,$acc12 |
| 517 | lbzx $acc13,$Tbl2,$acc13 | ||
| 518 | rlwimi $s0,$acc08,8,16,23 | 604 | rlwimi $s0,$acc08,8,16,23 |
| 605 | lbzx $acc13,$Tbl2,$acc13 | ||
| 519 | rlwimi $s1,$acc09,8,16,23 | 606 | rlwimi $s1,$acc09,8,16,23 |
| 520 | lbzx $acc14,$Tbl2,$acc14 | 607 | lbzx $acc14,$Tbl2,$acc14 |
| 521 | lbzx $acc15,$Tbl2,$acc15 | ||
| 522 | rlwimi $s2,$acc10,8,16,23 | 608 | rlwimi $s2,$acc10,8,16,23 |
| 609 | lbzx $acc15,$Tbl2,$acc15 | ||
| 523 | rlwimi $s3,$acc11,8,16,23 | 610 | rlwimi $s3,$acc11,8,16,23 |
| 524 | or $s0,$s0,$acc12 | 611 | or $s0,$s0,$acc12 |
| 525 | or $s1,$s1,$acc13 | 612 | or $s1,$s1,$acc13 |
| @@ -530,29 +617,31 @@ Lenc_loop: | |||
| 530 | xor $s2,$s2,$t2 | 617 | xor $s2,$s2,$t2 |
| 531 | xor $s3,$s3,$t3 | 618 | xor $s3,$s3,$t3 |
| 532 | blr | 619 | blr |
| 620 | .long 0 | ||
| 621 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 533 | 622 | ||
| 534 | .align 4 | 623 | .align 4 |
| 535 | Lppc_AES_encrypt_compact: | 624 | Lppc_AES_encrypt_compact: |
| 536 | lwz $acc00,240($key) | 625 | lwz $acc00,240($key) |
| 537 | lwz $t0,0($key) | ||
| 538 | lwz $t1,4($key) | ||
| 539 | lwz $t2,8($key) | ||
| 540 | lwz $t3,12($key) | ||
| 541 | addi $Tbl1,$Tbl0,2048 | 626 | addi $Tbl1,$Tbl0,2048 |
| 627 | lwz $t0,0($key) | ||
| 542 | lis $mask80,0x8080 | 628 | lis $mask80,0x8080 |
| 629 | lwz $t1,4($key) | ||
| 543 | lis $mask1b,0x1b1b | 630 | lis $mask1b,0x1b1b |
| 544 | addi $key,$key,16 | 631 | lwz $t2,8($key) |
| 545 | ori $mask80,$mask80,0x8080 | 632 | ori $mask80,$mask80,0x8080 |
| 633 | lwz $t3,12($key) | ||
| 546 | ori $mask1b,$mask1b,0x1b1b | 634 | ori $mask1b,$mask1b,0x1b1b |
| 635 | addi $key,$key,16 | ||
| 547 | mtctr $acc00 | 636 | mtctr $acc00 |
| 548 | .align 4 | 637 | .align 4 |
| 549 | Lenc_compact_loop: | 638 | Lenc_compact_loop: |
| 550 | xor $s0,$s0,$t0 | 639 | xor $s0,$s0,$t0 |
| 551 | xor $s1,$s1,$t1 | 640 | xor $s1,$s1,$t1 |
| 552 | xor $s2,$s2,$t2 | ||
| 553 | xor $s3,$s3,$t3 | ||
| 554 | rlwinm $acc00,$s0,`32-24`,24,31 | 641 | rlwinm $acc00,$s0,`32-24`,24,31 |
| 642 | xor $s2,$s2,$t2 | ||
| 555 | rlwinm $acc01,$s1,`32-24`,24,31 | 643 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 644 | xor $s3,$s3,$t3 | ||
| 556 | rlwinm $acc02,$s2,`32-24`,24,31 | 645 | rlwinm $acc02,$s2,`32-24`,24,31 |
| 557 | rlwinm $acc03,$s3,`32-24`,24,31 | 646 | rlwinm $acc03,$s3,`32-24`,24,31 |
| 558 | rlwinm $acc04,$s1,`32-16`,24,31 | 647 | rlwinm $acc04,$s1,`32-16`,24,31 |
| @@ -560,48 +649,48 @@ Lenc_compact_loop: | |||
| 560 | rlwinm $acc06,$s3,`32-16`,24,31 | 649 | rlwinm $acc06,$s3,`32-16`,24,31 |
| 561 | rlwinm $acc07,$s0,`32-16`,24,31 | 650 | rlwinm $acc07,$s0,`32-16`,24,31 |
| 562 | lbzx $acc00,$Tbl1,$acc00 | 651 | lbzx $acc00,$Tbl1,$acc00 |
| 563 | lbzx $acc01,$Tbl1,$acc01 | ||
| 564 | rlwinm $acc08,$s2,`32-8`,24,31 | 652 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 653 | lbzx $acc01,$Tbl1,$acc01 | ||
| 565 | rlwinm $acc09,$s3,`32-8`,24,31 | 654 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 566 | lbzx $acc02,$Tbl1,$acc02 | 655 | lbzx $acc02,$Tbl1,$acc02 |
| 567 | lbzx $acc03,$Tbl1,$acc03 | ||
| 568 | rlwinm $acc10,$s0,`32-8`,24,31 | 656 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 657 | lbzx $acc03,$Tbl1,$acc03 | ||
| 569 | rlwinm $acc11,$s1,`32-8`,24,31 | 658 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 570 | lbzx $acc04,$Tbl1,$acc04 | 659 | lbzx $acc04,$Tbl1,$acc04 |
| 571 | lbzx $acc05,$Tbl1,$acc05 | ||
| 572 | rlwinm $acc12,$s3,`0`,24,31 | 660 | rlwinm $acc12,$s3,`0`,24,31 |
| 661 | lbzx $acc05,$Tbl1,$acc05 | ||
| 573 | rlwinm $acc13,$s0,`0`,24,31 | 662 | rlwinm $acc13,$s0,`0`,24,31 |
| 574 | lbzx $acc06,$Tbl1,$acc06 | 663 | lbzx $acc06,$Tbl1,$acc06 |
| 575 | lbzx $acc07,$Tbl1,$acc07 | ||
| 576 | rlwinm $acc14,$s1,`0`,24,31 | 664 | rlwinm $acc14,$s1,`0`,24,31 |
| 665 | lbzx $acc07,$Tbl1,$acc07 | ||
| 577 | rlwinm $acc15,$s2,`0`,24,31 | 666 | rlwinm $acc15,$s2,`0`,24,31 |
| 578 | lbzx $acc08,$Tbl1,$acc08 | 667 | lbzx $acc08,$Tbl1,$acc08 |
| 579 | lbzx $acc09,$Tbl1,$acc09 | ||
| 580 | rlwinm $s0,$acc00,24,0,7 | 668 | rlwinm $s0,$acc00,24,0,7 |
| 669 | lbzx $acc09,$Tbl1,$acc09 | ||
| 581 | rlwinm $s1,$acc01,24,0,7 | 670 | rlwinm $s1,$acc01,24,0,7 |
| 582 | lbzx $acc10,$Tbl1,$acc10 | 671 | lbzx $acc10,$Tbl1,$acc10 |
| 583 | lbzx $acc11,$Tbl1,$acc11 | ||
| 584 | rlwinm $s2,$acc02,24,0,7 | 672 | rlwinm $s2,$acc02,24,0,7 |
| 673 | lbzx $acc11,$Tbl1,$acc11 | ||
| 585 | rlwinm $s3,$acc03,24,0,7 | 674 | rlwinm $s3,$acc03,24,0,7 |
| 586 | lbzx $acc12,$Tbl1,$acc12 | 675 | lbzx $acc12,$Tbl1,$acc12 |
| 587 | lbzx $acc13,$Tbl1,$acc13 | ||
| 588 | rlwimi $s0,$acc04,16,8,15 | 676 | rlwimi $s0,$acc04,16,8,15 |
| 677 | lbzx $acc13,$Tbl1,$acc13 | ||
| 589 | rlwimi $s1,$acc05,16,8,15 | 678 | rlwimi $s1,$acc05,16,8,15 |
| 590 | lbzx $acc14,$Tbl1,$acc14 | 679 | lbzx $acc14,$Tbl1,$acc14 |
| 591 | lbzx $acc15,$Tbl1,$acc15 | ||
| 592 | rlwimi $s2,$acc06,16,8,15 | 680 | rlwimi $s2,$acc06,16,8,15 |
| 681 | lbzx $acc15,$Tbl1,$acc15 | ||
| 593 | rlwimi $s3,$acc07,16,8,15 | 682 | rlwimi $s3,$acc07,16,8,15 |
| 594 | rlwimi $s0,$acc08,8,16,23 | 683 | rlwimi $s0,$acc08,8,16,23 |
| 595 | rlwimi $s1,$acc09,8,16,23 | 684 | rlwimi $s1,$acc09,8,16,23 |
| 596 | rlwimi $s2,$acc10,8,16,23 | 685 | rlwimi $s2,$acc10,8,16,23 |
| 597 | rlwimi $s3,$acc11,8,16,23 | 686 | rlwimi $s3,$acc11,8,16,23 |
| 598 | lwz $t0,0($key) | 687 | lwz $t0,0($key) |
| 599 | lwz $t1,4($key) | ||
| 600 | or $s0,$s0,$acc12 | 688 | or $s0,$s0,$acc12 |
| 689 | lwz $t1,4($key) | ||
| 601 | or $s1,$s1,$acc13 | 690 | or $s1,$s1,$acc13 |
| 602 | lwz $t2,8($key) | 691 | lwz $t2,8($key) |
| 603 | lwz $t3,12($key) | ||
| 604 | or $s2,$s2,$acc14 | 692 | or $s2,$s2,$acc14 |
| 693 | lwz $t3,12($key) | ||
| 605 | or $s3,$s3,$acc15 | 694 | or $s3,$s3,$acc15 |
| 606 | 695 | ||
| 607 | addi $key,$key,16 | 696 | addi $key,$key,16 |
| @@ -612,12 +701,12 @@ Lenc_compact_loop: | |||
| 612 | and $acc02,$s2,$mask80 | 701 | and $acc02,$s2,$mask80 |
| 613 | and $acc03,$s3,$mask80 | 702 | and $acc03,$s3,$mask80 |
| 614 | srwi $acc04,$acc00,7 # r1>>7 | 703 | srwi $acc04,$acc00,7 # r1>>7 |
| 615 | srwi $acc05,$acc01,7 | ||
| 616 | srwi $acc06,$acc02,7 | ||
| 617 | srwi $acc07,$acc03,7 | ||
| 618 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f | 704 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f |
| 705 | srwi $acc05,$acc01,7 | ||
| 619 | andc $acc09,$s1,$mask80 | 706 | andc $acc09,$s1,$mask80 |
| 707 | srwi $acc06,$acc02,7 | ||
| 620 | andc $acc10,$s2,$mask80 | 708 | andc $acc10,$s2,$mask80 |
| 709 | srwi $acc07,$acc03,7 | ||
| 621 | andc $acc11,$s3,$mask80 | 710 | andc $acc11,$s3,$mask80 |
| 622 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) | 711 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) |
| 623 | sub $acc01,$acc01,$acc05 | 712 | sub $acc01,$acc01,$acc05 |
| @@ -633,32 +722,32 @@ Lenc_compact_loop: | |||
| 633 | and $acc03,$acc03,$mask1b | 722 | and $acc03,$acc03,$mask1b |
| 634 | xor $acc00,$acc00,$acc08 # r2 | 723 | xor $acc00,$acc00,$acc08 # r2 |
| 635 | xor $acc01,$acc01,$acc09 | 724 | xor $acc01,$acc01,$acc09 |
| 725 | rotlwi $acc12,$s0,16 # ROTATE(r0,16) | ||
| 636 | xor $acc02,$acc02,$acc10 | 726 | xor $acc02,$acc02,$acc10 |
| 727 | rotlwi $acc13,$s1,16 | ||
| 637 | xor $acc03,$acc03,$acc11 | 728 | xor $acc03,$acc03,$acc11 |
| 729 | rotlwi $acc14,$s2,16 | ||
| 638 | 730 | ||
| 639 | rotlwi $acc12,$s0,16 # ROTATE(r0,16) | ||
| 640 | rotlwi $acc13,$s1,16 | ||
| 641 | rotlwi $acc14,$s2,16 | ||
| 642 | rotlwi $acc15,$s3,16 | ||
| 643 | xor $s0,$s0,$acc00 # r0^r2 | 731 | xor $s0,$s0,$acc00 # r0^r2 |
| 732 | rotlwi $acc15,$s3,16 | ||
| 644 | xor $s1,$s1,$acc01 | 733 | xor $s1,$s1,$acc01 |
| 645 | xor $s2,$s2,$acc02 | ||
| 646 | xor $s3,$s3,$acc03 | ||
| 647 | rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) | 734 | rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) |
| 735 | xor $s2,$s2,$acc02 | ||
| 648 | rotrwi $s1,$s1,24 | 736 | rotrwi $s1,$s1,24 |
| 737 | xor $s3,$s3,$acc03 | ||
| 649 | rotrwi $s2,$s2,24 | 738 | rotrwi $s2,$s2,24 |
| 650 | rotrwi $s3,$s3,24 | ||
| 651 | xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 | 739 | xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 |
| 740 | rotrwi $s3,$s3,24 | ||
| 652 | xor $s1,$s1,$acc01 | 741 | xor $s1,$s1,$acc01 |
| 653 | xor $s2,$s2,$acc02 | 742 | xor $s2,$s2,$acc02 |
| 654 | xor $s3,$s3,$acc03 | 743 | xor $s3,$s3,$acc03 |
| 655 | rotlwi $acc08,$acc12,8 # ROTATE(r0,24) | 744 | rotlwi $acc08,$acc12,8 # ROTATE(r0,24) |
| 656 | rotlwi $acc09,$acc13,8 | ||
| 657 | rotlwi $acc10,$acc14,8 | ||
| 658 | rotlwi $acc11,$acc15,8 | ||
| 659 | xor $s0,$s0,$acc12 # | 745 | xor $s0,$s0,$acc12 # |
| 746 | rotlwi $acc09,$acc13,8 | ||
| 660 | xor $s1,$s1,$acc13 | 747 | xor $s1,$s1,$acc13 |
| 748 | rotlwi $acc10,$acc14,8 | ||
| 661 | xor $s2,$s2,$acc14 | 749 | xor $s2,$s2,$acc14 |
| 750 | rotlwi $acc11,$acc15,8 | ||
| 662 | xor $s3,$s3,$acc15 | 751 | xor $s3,$s3,$acc15 |
| 663 | xor $s0,$s0,$acc08 # | 752 | xor $s0,$s0,$acc08 # |
| 664 | xor $s1,$s1,$acc09 | 753 | xor $s1,$s1,$acc09 |
| @@ -673,14 +762,15 @@ Lenc_compact_done: | |||
| 673 | xor $s2,$s2,$t2 | 762 | xor $s2,$s2,$t2 |
| 674 | xor $s3,$s3,$t3 | 763 | xor $s3,$s3,$t3 |
| 675 | blr | 764 | blr |
| 765 | .long 0 | ||
| 766 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 676 | 767 | ||
| 677 | .globl .AES_decrypt | 768 | .globl .AES_decrypt |
| 678 | .align 7 | 769 | .align 7 |
| 679 | .AES_decrypt: | 770 | .AES_decrypt: |
| 680 | mflr r0 | ||
| 681 | $STU $sp,-$FRAME($sp) | 771 | $STU $sp,-$FRAME($sp) |
| 772 | mflr r0 | ||
| 682 | 773 | ||
| 683 | $PUSH r0,`$FRAME-$SIZE_T*21`($sp) | ||
| 684 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | 774 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) |
| 685 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | 775 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) |
| 686 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | 776 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
| @@ -701,7 +791,14 @@ Lenc_compact_done: | |||
| 701 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 791 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 702 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 792 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 703 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 793 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 794 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
| 704 | 795 | ||
| 796 | andi. $t0,$inp,3 | ||
| 797 | andi. $t1,$out,3 | ||
| 798 | or. $t0,$t0,$t1 | ||
| 799 | bne Ldec_unaligned | ||
| 800 | |||
| 801 | Ldec_unaligned_ok: | ||
| 705 | lwz $s0,0($inp) | 802 | lwz $s0,0($inp) |
| 706 | lwz $s1,4($inp) | 803 | lwz $s1,4($inp) |
| 707 | lwz $s2,8($inp) | 804 | lwz $s2,8($inp) |
| @@ -712,8 +809,80 @@ Lenc_compact_done: | |||
| 712 | stw $s1,4($out) | 809 | stw $s1,4($out) |
| 713 | stw $s2,8($out) | 810 | stw $s2,8($out) |
| 714 | stw $s3,12($out) | 811 | stw $s3,12($out) |
| 812 | b Ldec_done | ||
| 813 | |||
| 814 | Ldec_unaligned: | ||
| 815 | subfic $t0,$inp,4096 | ||
| 816 | subfic $t1,$out,4096 | ||
| 817 | andi. $t0,$t0,4096-16 | ||
| 818 | beq Ldec_xpage | ||
| 819 | andi. $t1,$t1,4096-16 | ||
| 820 | bne Ldec_unaligned_ok | ||
| 821 | |||
| 822 | Ldec_xpage: | ||
| 823 | lbz $acc00,0($inp) | ||
| 824 | lbz $acc01,1($inp) | ||
| 825 | lbz $acc02,2($inp) | ||
| 826 | lbz $s0,3($inp) | ||
| 827 | lbz $acc04,4($inp) | ||
| 828 | lbz $acc05,5($inp) | ||
| 829 | lbz $acc06,6($inp) | ||
| 830 | lbz $s1,7($inp) | ||
| 831 | lbz $acc08,8($inp) | ||
| 832 | lbz $acc09,9($inp) | ||
| 833 | lbz $acc10,10($inp) | ||
| 834 | insrwi $s0,$acc00,8,0 | ||
| 835 | lbz $s2,11($inp) | ||
| 836 | insrwi $s1,$acc04,8,0 | ||
| 837 | lbz $acc12,12($inp) | ||
| 838 | insrwi $s0,$acc01,8,8 | ||
| 839 | lbz $acc13,13($inp) | ||
| 840 | insrwi $s1,$acc05,8,8 | ||
| 841 | lbz $acc14,14($inp) | ||
| 842 | insrwi $s0,$acc02,8,16 | ||
| 843 | lbz $s3,15($inp) | ||
| 844 | insrwi $s1,$acc06,8,16 | ||
| 845 | insrwi $s2,$acc08,8,0 | ||
| 846 | insrwi $s3,$acc12,8,0 | ||
| 847 | insrwi $s2,$acc09,8,8 | ||
| 848 | insrwi $s3,$acc13,8,8 | ||
| 849 | insrwi $s2,$acc10,8,16 | ||
| 850 | insrwi $s3,$acc14,8,16 | ||
| 851 | |||
| 852 | bl LAES_Td | ||
| 853 | bl Lppc_AES_decrypt_compact | ||
| 715 | 854 | ||
| 716 | $POP r0,`$FRAME-$SIZE_T*21`($sp) | 855 | extrwi $acc00,$s0,8,0 |
| 856 | extrwi $acc01,$s0,8,8 | ||
| 857 | stb $acc00,0($out) | ||
| 858 | extrwi $acc02,$s0,8,16 | ||
| 859 | stb $acc01,1($out) | ||
| 860 | stb $acc02,2($out) | ||
| 861 | extrwi $acc04,$s1,8,0 | ||
| 862 | stb $s0,3($out) | ||
| 863 | extrwi $acc05,$s1,8,8 | ||
| 864 | stb $acc04,4($out) | ||
| 865 | extrwi $acc06,$s1,8,16 | ||
| 866 | stb $acc05,5($out) | ||
| 867 | stb $acc06,6($out) | ||
| 868 | extrwi $acc08,$s2,8,0 | ||
| 869 | stb $s1,7($out) | ||
| 870 | extrwi $acc09,$s2,8,8 | ||
| 871 | stb $acc08,8($out) | ||
| 872 | extrwi $acc10,$s2,8,16 | ||
| 873 | stb $acc09,9($out) | ||
| 874 | stb $acc10,10($out) | ||
| 875 | extrwi $acc12,$s3,8,0 | ||
| 876 | stb $s2,11($out) | ||
| 877 | extrwi $acc13,$s3,8,8 | ||
| 878 | stb $acc12,12($out) | ||
| 879 | extrwi $acc14,$s3,8,16 | ||
| 880 | stb $acc13,13($out) | ||
| 881 | stb $acc14,14($out) | ||
| 882 | stb $s3,15($out) | ||
| 883 | |||
| 884 | Ldec_done: | ||
| 885 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
| 717 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | 886 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) |
| 718 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | 887 | $POP r13,`$FRAME-$SIZE_T*19`($sp) |
| 719 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | 888 | $POP r14,`$FRAME-$SIZE_T*18`($sp) |
| @@ -737,18 +906,21 @@ Lenc_compact_done: | |||
| 737 | mtlr r0 | 906 | mtlr r0 |
| 738 | addi $sp,$sp,$FRAME | 907 | addi $sp,$sp,$FRAME |
| 739 | blr | 908 | blr |
| 909 | .long 0 | ||
| 910 | .byte 0,12,4,1,0x80,18,3,0 | ||
| 911 | .long 0 | ||
| 740 | 912 | ||
| 741 | .align 5 | 913 | .align 5 |
| 742 | Lppc_AES_decrypt: | 914 | Lppc_AES_decrypt: |
| 743 | lwz $acc00,240($key) | 915 | lwz $acc00,240($key) |
| 744 | lwz $t0,0($key) | ||
| 745 | lwz $t1,4($key) | ||
| 746 | lwz $t2,8($key) | ||
| 747 | lwz $t3,12($key) | ||
| 748 | addi $Tbl1,$Tbl0,3 | 916 | addi $Tbl1,$Tbl0,3 |
| 917 | lwz $t0,0($key) | ||
| 749 | addi $Tbl2,$Tbl0,2 | 918 | addi $Tbl2,$Tbl0,2 |
| 919 | lwz $t1,4($key) | ||
| 750 | addi $Tbl3,$Tbl0,1 | 920 | addi $Tbl3,$Tbl0,1 |
| 921 | lwz $t2,8($key) | ||
| 751 | addi $acc00,$acc00,-1 | 922 | addi $acc00,$acc00,-1 |
| 923 | lwz $t3,12($key) | ||
| 752 | addi $key,$key,16 | 924 | addi $key,$key,16 |
| 753 | xor $s0,$s0,$t0 | 925 | xor $s0,$s0,$t0 |
| 754 | xor $s1,$s1,$t1 | 926 | xor $s1,$s1,$t1 |
| @@ -762,44 +934,44 @@ Ldec_loop: | |||
| 762 | rlwinm $acc02,$s2,`32-24+3`,21,28 | 934 | rlwinm $acc02,$s2,`32-24+3`,21,28 |
| 763 | rlwinm $acc03,$s3,`32-24+3`,21,28 | 935 | rlwinm $acc03,$s3,`32-24+3`,21,28 |
| 764 | lwz $t0,0($key) | 936 | lwz $t0,0($key) |
| 765 | lwz $t1,4($key) | ||
| 766 | rlwinm $acc04,$s3,`32-16+3`,21,28 | 937 | rlwinm $acc04,$s3,`32-16+3`,21,28 |
| 938 | lwz $t1,4($key) | ||
| 767 | rlwinm $acc05,$s0,`32-16+3`,21,28 | 939 | rlwinm $acc05,$s0,`32-16+3`,21,28 |
| 768 | lwz $t2,8($key) | 940 | lwz $t2,8($key) |
| 769 | lwz $t3,12($key) | ||
| 770 | rlwinm $acc06,$s1,`32-16+3`,21,28 | 941 | rlwinm $acc06,$s1,`32-16+3`,21,28 |
| 942 | lwz $t3,12($key) | ||
| 771 | rlwinm $acc07,$s2,`32-16+3`,21,28 | 943 | rlwinm $acc07,$s2,`32-16+3`,21,28 |
| 772 | lwzx $acc00,$Tbl0,$acc00 | 944 | lwzx $acc00,$Tbl0,$acc00 |
| 773 | lwzx $acc01,$Tbl0,$acc01 | ||
| 774 | rlwinm $acc08,$s2,`32-8+3`,21,28 | 945 | rlwinm $acc08,$s2,`32-8+3`,21,28 |
| 946 | lwzx $acc01,$Tbl0,$acc01 | ||
| 775 | rlwinm $acc09,$s3,`32-8+3`,21,28 | 947 | rlwinm $acc09,$s3,`32-8+3`,21,28 |
| 776 | lwzx $acc02,$Tbl0,$acc02 | 948 | lwzx $acc02,$Tbl0,$acc02 |
| 777 | lwzx $acc03,$Tbl0,$acc03 | ||
| 778 | rlwinm $acc10,$s0,`32-8+3`,21,28 | 949 | rlwinm $acc10,$s0,`32-8+3`,21,28 |
| 950 | lwzx $acc03,$Tbl0,$acc03 | ||
| 779 | rlwinm $acc11,$s1,`32-8+3`,21,28 | 951 | rlwinm $acc11,$s1,`32-8+3`,21,28 |
| 780 | lwzx $acc04,$Tbl1,$acc04 | 952 | lwzx $acc04,$Tbl1,$acc04 |
| 781 | lwzx $acc05,$Tbl1,$acc05 | ||
| 782 | rlwinm $acc12,$s1,`0+3`,21,28 | 953 | rlwinm $acc12,$s1,`0+3`,21,28 |
| 954 | lwzx $acc05,$Tbl1,$acc05 | ||
| 783 | rlwinm $acc13,$s2,`0+3`,21,28 | 955 | rlwinm $acc13,$s2,`0+3`,21,28 |
| 784 | lwzx $acc06,$Tbl1,$acc06 | 956 | lwzx $acc06,$Tbl1,$acc06 |
| 785 | lwzx $acc07,$Tbl1,$acc07 | ||
| 786 | rlwinm $acc14,$s3,`0+3`,21,28 | 957 | rlwinm $acc14,$s3,`0+3`,21,28 |
| 958 | lwzx $acc07,$Tbl1,$acc07 | ||
| 787 | rlwinm $acc15,$s0,`0+3`,21,28 | 959 | rlwinm $acc15,$s0,`0+3`,21,28 |
| 788 | lwzx $acc08,$Tbl2,$acc08 | 960 | lwzx $acc08,$Tbl2,$acc08 |
| 789 | lwzx $acc09,$Tbl2,$acc09 | ||
| 790 | xor $t0,$t0,$acc00 | 961 | xor $t0,$t0,$acc00 |
| 962 | lwzx $acc09,$Tbl2,$acc09 | ||
| 791 | xor $t1,$t1,$acc01 | 963 | xor $t1,$t1,$acc01 |
| 792 | lwzx $acc10,$Tbl2,$acc10 | 964 | lwzx $acc10,$Tbl2,$acc10 |
| 793 | lwzx $acc11,$Tbl2,$acc11 | ||
| 794 | xor $t2,$t2,$acc02 | 965 | xor $t2,$t2,$acc02 |
| 966 | lwzx $acc11,$Tbl2,$acc11 | ||
| 795 | xor $t3,$t3,$acc03 | 967 | xor $t3,$t3,$acc03 |
| 796 | lwzx $acc12,$Tbl3,$acc12 | 968 | lwzx $acc12,$Tbl3,$acc12 |
| 797 | lwzx $acc13,$Tbl3,$acc13 | ||
| 798 | xor $t0,$t0,$acc04 | 969 | xor $t0,$t0,$acc04 |
| 970 | lwzx $acc13,$Tbl3,$acc13 | ||
| 799 | xor $t1,$t1,$acc05 | 971 | xor $t1,$t1,$acc05 |
| 800 | lwzx $acc14,$Tbl3,$acc14 | 972 | lwzx $acc14,$Tbl3,$acc14 |
| 801 | lwzx $acc15,$Tbl3,$acc15 | ||
| 802 | xor $t2,$t2,$acc06 | 973 | xor $t2,$t2,$acc06 |
| 974 | lwzx $acc15,$Tbl3,$acc15 | ||
| 803 | xor $t3,$t3,$acc07 | 975 | xor $t3,$t3,$acc07 |
| 804 | xor $t0,$t0,$acc08 | 976 | xor $t0,$t0,$acc08 |
| 805 | xor $t1,$t1,$acc09 | 977 | xor $t1,$t1,$acc09 |
| @@ -815,56 +987,56 @@ Ldec_loop: | |||
| 815 | addi $Tbl2,$Tbl0,2048 | 987 | addi $Tbl2,$Tbl0,2048 |
| 816 | nop | 988 | nop |
| 817 | lwz $t0,0($key) | 989 | lwz $t0,0($key) |
| 818 | lwz $t1,4($key) | ||
| 819 | rlwinm $acc00,$s0,`32-24`,24,31 | 990 | rlwinm $acc00,$s0,`32-24`,24,31 |
| 991 | lwz $t1,4($key) | ||
| 820 | rlwinm $acc01,$s1,`32-24`,24,31 | 992 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 821 | lwz $t2,8($key) | 993 | lwz $t2,8($key) |
| 822 | lwz $t3,12($key) | ||
| 823 | rlwinm $acc02,$s2,`32-24`,24,31 | 994 | rlwinm $acc02,$s2,`32-24`,24,31 |
| 995 | lwz $t3,12($key) | ||
| 824 | rlwinm $acc03,$s3,`32-24`,24,31 | 996 | rlwinm $acc03,$s3,`32-24`,24,31 |
| 825 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 | 997 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 |
| 826 | lwz $acc09,`2048+32`($Tbl0) | ||
| 827 | rlwinm $acc04,$s3,`32-16`,24,31 | 998 | rlwinm $acc04,$s3,`32-16`,24,31 |
| 999 | lwz $acc09,`2048+32`($Tbl0) | ||
| 828 | rlwinm $acc05,$s0,`32-16`,24,31 | 1000 | rlwinm $acc05,$s0,`32-16`,24,31 |
| 829 | lwz $acc10,`2048+64`($Tbl0) | 1001 | lwz $acc10,`2048+64`($Tbl0) |
| 830 | lwz $acc11,`2048+96`($Tbl0) | ||
| 831 | lbzx $acc00,$Tbl2,$acc00 | 1002 | lbzx $acc00,$Tbl2,$acc00 |
| 1003 | lwz $acc11,`2048+96`($Tbl0) | ||
| 832 | lbzx $acc01,$Tbl2,$acc01 | 1004 | lbzx $acc01,$Tbl2,$acc01 |
| 833 | lwz $acc12,`2048+128`($Tbl0) | 1005 | lwz $acc12,`2048+128`($Tbl0) |
| 834 | lwz $acc13,`2048+160`($Tbl0) | ||
| 835 | rlwinm $acc06,$s1,`32-16`,24,31 | 1006 | rlwinm $acc06,$s1,`32-16`,24,31 |
| 1007 | lwz $acc13,`2048+160`($Tbl0) | ||
| 836 | rlwinm $acc07,$s2,`32-16`,24,31 | 1008 | rlwinm $acc07,$s2,`32-16`,24,31 |
| 837 | lwz $acc14,`2048+192`($Tbl0) | 1009 | lwz $acc14,`2048+192`($Tbl0) |
| 838 | lwz $acc15,`2048+224`($Tbl0) | ||
| 839 | rlwinm $acc08,$s2,`32-8`,24,31 | 1010 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 1011 | lwz $acc15,`2048+224`($Tbl0) | ||
| 840 | rlwinm $acc09,$s3,`32-8`,24,31 | 1012 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 841 | lbzx $acc02,$Tbl2,$acc02 | 1013 | lbzx $acc02,$Tbl2,$acc02 |
| 842 | lbzx $acc03,$Tbl2,$acc03 | ||
| 843 | rlwinm $acc10,$s0,`32-8`,24,31 | 1014 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 1015 | lbzx $acc03,$Tbl2,$acc03 | ||
| 844 | rlwinm $acc11,$s1,`32-8`,24,31 | 1016 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 845 | lbzx $acc04,$Tbl2,$acc04 | 1017 | lbzx $acc04,$Tbl2,$acc04 |
| 846 | lbzx $acc05,$Tbl2,$acc05 | ||
| 847 | rlwinm $acc12,$s1,`0`,24,31 | 1018 | rlwinm $acc12,$s1,`0`,24,31 |
| 1019 | lbzx $acc05,$Tbl2,$acc05 | ||
| 848 | rlwinm $acc13,$s2,`0`,24,31 | 1020 | rlwinm $acc13,$s2,`0`,24,31 |
| 849 | lbzx $acc06,$Tbl2,$acc06 | 1021 | lbzx $acc06,$Tbl2,$acc06 |
| 850 | lbzx $acc07,$Tbl2,$acc07 | ||
| 851 | rlwinm $acc14,$s3,`0`,24,31 | 1022 | rlwinm $acc14,$s3,`0`,24,31 |
| 1023 | lbzx $acc07,$Tbl2,$acc07 | ||
| 852 | rlwinm $acc15,$s0,`0`,24,31 | 1024 | rlwinm $acc15,$s0,`0`,24,31 |
| 853 | lbzx $acc08,$Tbl2,$acc08 | 1025 | lbzx $acc08,$Tbl2,$acc08 |
| 854 | lbzx $acc09,$Tbl2,$acc09 | ||
| 855 | rlwinm $s0,$acc00,24,0,7 | 1026 | rlwinm $s0,$acc00,24,0,7 |
| 1027 | lbzx $acc09,$Tbl2,$acc09 | ||
| 856 | rlwinm $s1,$acc01,24,0,7 | 1028 | rlwinm $s1,$acc01,24,0,7 |
| 857 | lbzx $acc10,$Tbl2,$acc10 | 1029 | lbzx $acc10,$Tbl2,$acc10 |
| 858 | lbzx $acc11,$Tbl2,$acc11 | ||
| 859 | rlwinm $s2,$acc02,24,0,7 | 1030 | rlwinm $s2,$acc02,24,0,7 |
| 1031 | lbzx $acc11,$Tbl2,$acc11 | ||
| 860 | rlwinm $s3,$acc03,24,0,7 | 1032 | rlwinm $s3,$acc03,24,0,7 |
| 861 | lbzx $acc12,$Tbl2,$acc12 | 1033 | lbzx $acc12,$Tbl2,$acc12 |
| 862 | lbzx $acc13,$Tbl2,$acc13 | ||
| 863 | rlwimi $s0,$acc04,16,8,15 | 1034 | rlwimi $s0,$acc04,16,8,15 |
| 1035 | lbzx $acc13,$Tbl2,$acc13 | ||
| 864 | rlwimi $s1,$acc05,16,8,15 | 1036 | rlwimi $s1,$acc05,16,8,15 |
| 865 | lbzx $acc14,$Tbl2,$acc14 | 1037 | lbzx $acc14,$Tbl2,$acc14 |
| 866 | lbzx $acc15,$Tbl2,$acc15 | ||
| 867 | rlwimi $s2,$acc06,16,8,15 | 1038 | rlwimi $s2,$acc06,16,8,15 |
| 1039 | lbzx $acc15,$Tbl2,$acc15 | ||
| 868 | rlwimi $s3,$acc07,16,8,15 | 1040 | rlwimi $s3,$acc07,16,8,15 |
| 869 | rlwimi $s0,$acc08,8,16,23 | 1041 | rlwimi $s0,$acc08,8,16,23 |
| 870 | rlwimi $s1,$acc09,8,16,23 | 1042 | rlwimi $s1,$acc09,8,16,23 |
| @@ -879,20 +1051,22 @@ Ldec_loop: | |||
| 879 | xor $s2,$s2,$t2 | 1051 | xor $s2,$s2,$t2 |
| 880 | xor $s3,$s3,$t3 | 1052 | xor $s3,$s3,$t3 |
| 881 | blr | 1053 | blr |
| 1054 | .long 0 | ||
| 1055 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 882 | 1056 | ||
| 883 | .align 4 | 1057 | .align 4 |
| 884 | Lppc_AES_decrypt_compact: | 1058 | Lppc_AES_decrypt_compact: |
| 885 | lwz $acc00,240($key) | 1059 | lwz $acc00,240($key) |
| 886 | lwz $t0,0($key) | ||
| 887 | lwz $t1,4($key) | ||
| 888 | lwz $t2,8($key) | ||
| 889 | lwz $t3,12($key) | ||
| 890 | addi $Tbl1,$Tbl0,2048 | 1060 | addi $Tbl1,$Tbl0,2048 |
| 1061 | lwz $t0,0($key) | ||
| 891 | lis $mask80,0x8080 | 1062 | lis $mask80,0x8080 |
| 1063 | lwz $t1,4($key) | ||
| 892 | lis $mask1b,0x1b1b | 1064 | lis $mask1b,0x1b1b |
| 893 | addi $key,$key,16 | 1065 | lwz $t2,8($key) |
| 894 | ori $mask80,$mask80,0x8080 | 1066 | ori $mask80,$mask80,0x8080 |
| 1067 | lwz $t3,12($key) | ||
| 895 | ori $mask1b,$mask1b,0x1b1b | 1068 | ori $mask1b,$mask1b,0x1b1b |
| 1069 | addi $key,$key,16 | ||
| 896 | ___ | 1070 | ___ |
| 897 | $code.=<<___ if ($SIZE_T==8); | 1071 | $code.=<<___ if ($SIZE_T==8); |
| 898 | insrdi $mask80,$mask80,32,0 | 1072 | insrdi $mask80,$mask80,32,0 |
| @@ -904,10 +1078,10 @@ $code.=<<___; | |||
| 904 | Ldec_compact_loop: | 1078 | Ldec_compact_loop: |
| 905 | xor $s0,$s0,$t0 | 1079 | xor $s0,$s0,$t0 |
| 906 | xor $s1,$s1,$t1 | 1080 | xor $s1,$s1,$t1 |
| 907 | xor $s2,$s2,$t2 | ||
| 908 | xor $s3,$s3,$t3 | ||
| 909 | rlwinm $acc00,$s0,`32-24`,24,31 | 1081 | rlwinm $acc00,$s0,`32-24`,24,31 |
| 1082 | xor $s2,$s2,$t2 | ||
| 910 | rlwinm $acc01,$s1,`32-24`,24,31 | 1083 | rlwinm $acc01,$s1,`32-24`,24,31 |
| 1084 | xor $s3,$s3,$t3 | ||
| 911 | rlwinm $acc02,$s2,`32-24`,24,31 | 1085 | rlwinm $acc02,$s2,`32-24`,24,31 |
| 912 | rlwinm $acc03,$s3,`32-24`,24,31 | 1086 | rlwinm $acc03,$s3,`32-24`,24,31 |
| 913 | rlwinm $acc04,$s3,`32-16`,24,31 | 1087 | rlwinm $acc04,$s3,`32-16`,24,31 |
| @@ -915,48 +1089,48 @@ Ldec_compact_loop: | |||
| 915 | rlwinm $acc06,$s1,`32-16`,24,31 | 1089 | rlwinm $acc06,$s1,`32-16`,24,31 |
| 916 | rlwinm $acc07,$s2,`32-16`,24,31 | 1090 | rlwinm $acc07,$s2,`32-16`,24,31 |
| 917 | lbzx $acc00,$Tbl1,$acc00 | 1091 | lbzx $acc00,$Tbl1,$acc00 |
| 918 | lbzx $acc01,$Tbl1,$acc01 | ||
| 919 | rlwinm $acc08,$s2,`32-8`,24,31 | 1092 | rlwinm $acc08,$s2,`32-8`,24,31 |
| 1093 | lbzx $acc01,$Tbl1,$acc01 | ||
| 920 | rlwinm $acc09,$s3,`32-8`,24,31 | 1094 | rlwinm $acc09,$s3,`32-8`,24,31 |
| 921 | lbzx $acc02,$Tbl1,$acc02 | 1095 | lbzx $acc02,$Tbl1,$acc02 |
| 922 | lbzx $acc03,$Tbl1,$acc03 | ||
| 923 | rlwinm $acc10,$s0,`32-8`,24,31 | 1096 | rlwinm $acc10,$s0,`32-8`,24,31 |
| 1097 | lbzx $acc03,$Tbl1,$acc03 | ||
| 924 | rlwinm $acc11,$s1,`32-8`,24,31 | 1098 | rlwinm $acc11,$s1,`32-8`,24,31 |
| 925 | lbzx $acc04,$Tbl1,$acc04 | 1099 | lbzx $acc04,$Tbl1,$acc04 |
| 926 | lbzx $acc05,$Tbl1,$acc05 | ||
| 927 | rlwinm $acc12,$s1,`0`,24,31 | 1100 | rlwinm $acc12,$s1,`0`,24,31 |
| 1101 | lbzx $acc05,$Tbl1,$acc05 | ||
| 928 | rlwinm $acc13,$s2,`0`,24,31 | 1102 | rlwinm $acc13,$s2,`0`,24,31 |
| 929 | lbzx $acc06,$Tbl1,$acc06 | 1103 | lbzx $acc06,$Tbl1,$acc06 |
| 930 | lbzx $acc07,$Tbl1,$acc07 | ||
| 931 | rlwinm $acc14,$s3,`0`,24,31 | 1104 | rlwinm $acc14,$s3,`0`,24,31 |
| 1105 | lbzx $acc07,$Tbl1,$acc07 | ||
| 932 | rlwinm $acc15,$s0,`0`,24,31 | 1106 | rlwinm $acc15,$s0,`0`,24,31 |
| 933 | lbzx $acc08,$Tbl1,$acc08 | 1107 | lbzx $acc08,$Tbl1,$acc08 |
| 934 | lbzx $acc09,$Tbl1,$acc09 | ||
| 935 | rlwinm $s0,$acc00,24,0,7 | 1108 | rlwinm $s0,$acc00,24,0,7 |
| 1109 | lbzx $acc09,$Tbl1,$acc09 | ||
| 936 | rlwinm $s1,$acc01,24,0,7 | 1110 | rlwinm $s1,$acc01,24,0,7 |
| 937 | lbzx $acc10,$Tbl1,$acc10 | 1111 | lbzx $acc10,$Tbl1,$acc10 |
| 938 | lbzx $acc11,$Tbl1,$acc11 | ||
| 939 | rlwinm $s2,$acc02,24,0,7 | 1112 | rlwinm $s2,$acc02,24,0,7 |
| 1113 | lbzx $acc11,$Tbl1,$acc11 | ||
| 940 | rlwinm $s3,$acc03,24,0,7 | 1114 | rlwinm $s3,$acc03,24,0,7 |
| 941 | lbzx $acc12,$Tbl1,$acc12 | 1115 | lbzx $acc12,$Tbl1,$acc12 |
| 942 | lbzx $acc13,$Tbl1,$acc13 | ||
| 943 | rlwimi $s0,$acc04,16,8,15 | 1116 | rlwimi $s0,$acc04,16,8,15 |
| 1117 | lbzx $acc13,$Tbl1,$acc13 | ||
| 944 | rlwimi $s1,$acc05,16,8,15 | 1118 | rlwimi $s1,$acc05,16,8,15 |
| 945 | lbzx $acc14,$Tbl1,$acc14 | 1119 | lbzx $acc14,$Tbl1,$acc14 |
| 946 | lbzx $acc15,$Tbl1,$acc15 | ||
| 947 | rlwimi $s2,$acc06,16,8,15 | 1120 | rlwimi $s2,$acc06,16,8,15 |
| 1121 | lbzx $acc15,$Tbl1,$acc15 | ||
| 948 | rlwimi $s3,$acc07,16,8,15 | 1122 | rlwimi $s3,$acc07,16,8,15 |
| 949 | rlwimi $s0,$acc08,8,16,23 | 1123 | rlwimi $s0,$acc08,8,16,23 |
| 950 | rlwimi $s1,$acc09,8,16,23 | 1124 | rlwimi $s1,$acc09,8,16,23 |
| 951 | rlwimi $s2,$acc10,8,16,23 | 1125 | rlwimi $s2,$acc10,8,16,23 |
| 952 | rlwimi $s3,$acc11,8,16,23 | 1126 | rlwimi $s3,$acc11,8,16,23 |
| 953 | lwz $t0,0($key) | 1127 | lwz $t0,0($key) |
| 954 | lwz $t1,4($key) | ||
| 955 | or $s0,$s0,$acc12 | 1128 | or $s0,$s0,$acc12 |
| 1129 | lwz $t1,4($key) | ||
| 956 | or $s1,$s1,$acc13 | 1130 | or $s1,$s1,$acc13 |
| 957 | lwz $t2,8($key) | 1131 | lwz $t2,8($key) |
| 958 | lwz $t3,12($key) | ||
| 959 | or $s2,$s2,$acc14 | 1132 | or $s2,$s2,$acc14 |
| 1133 | lwz $t3,12($key) | ||
| 960 | or $s3,$s3,$acc15 | 1134 | or $s3,$s3,$acc15 |
| 961 | 1135 | ||
| 962 | addi $key,$key,16 | 1136 | addi $key,$key,16 |
| @@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4); | |||
| 1030 | and $acc02,$s2,$mask80 | 1204 | and $acc02,$s2,$mask80 |
| 1031 | and $acc03,$s3,$mask80 | 1205 | and $acc03,$s3,$mask80 |
| 1032 | srwi $acc04,$acc00,7 # r1>>7 | 1206 | srwi $acc04,$acc00,7 # r1>>7 |
| 1033 | srwi $acc05,$acc01,7 | ||
| 1034 | srwi $acc06,$acc02,7 | ||
| 1035 | srwi $acc07,$acc03,7 | ||
| 1036 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f | 1207 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f |
| 1208 | srwi $acc05,$acc01,7 | ||
| 1037 | andc $acc09,$s1,$mask80 | 1209 | andc $acc09,$s1,$mask80 |
| 1210 | srwi $acc06,$acc02,7 | ||
| 1038 | andc $acc10,$s2,$mask80 | 1211 | andc $acc10,$s2,$mask80 |
| 1212 | srwi $acc07,$acc03,7 | ||
| 1039 | andc $acc11,$s3,$mask80 | 1213 | andc $acc11,$s3,$mask80 |
| 1040 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) | 1214 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) |
| 1041 | sub $acc01,$acc01,$acc05 | 1215 | sub $acc01,$acc01,$acc05 |
| @@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4); | |||
| 1059 | and $acc06,$acc02,$mask80 | 1233 | and $acc06,$acc02,$mask80 |
| 1060 | and $acc07,$acc03,$mask80 | 1234 | and $acc07,$acc03,$mask80 |
| 1061 | srwi $acc08,$acc04,7 # r1>>7 | 1235 | srwi $acc08,$acc04,7 # r1>>7 |
| 1062 | srwi $acc09,$acc05,7 | ||
| 1063 | srwi $acc10,$acc06,7 | ||
| 1064 | srwi $acc11,$acc07,7 | ||
| 1065 | andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f | 1236 | andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f |
| 1237 | srwi $acc09,$acc05,7 | ||
| 1066 | andc $acc13,$acc01,$mask80 | 1238 | andc $acc13,$acc01,$mask80 |
| 1239 | srwi $acc10,$acc06,7 | ||
| 1067 | andc $acc14,$acc02,$mask80 | 1240 | andc $acc14,$acc02,$mask80 |
| 1241 | srwi $acc11,$acc07,7 | ||
| 1068 | andc $acc15,$acc03,$mask80 | 1242 | andc $acc15,$acc03,$mask80 |
| 1069 | sub $acc04,$acc04,$acc08 # r1-(r1>>7) | 1243 | sub $acc04,$acc04,$acc08 # r1-(r1>>7) |
| 1070 | sub $acc05,$acc05,$acc09 | 1244 | sub $acc05,$acc05,$acc09 |
| @@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4); | |||
| 1085 | 1259 | ||
| 1086 | and $acc08,$acc04,$mask80 # r1=r4&0x80808080 | 1260 | and $acc08,$acc04,$mask80 # r1=r4&0x80808080 |
| 1087 | and $acc09,$acc05,$mask80 | 1261 | and $acc09,$acc05,$mask80 |
| 1088 | and $acc10,$acc06,$mask80 | ||
| 1089 | and $acc11,$acc07,$mask80 | ||
| 1090 | srwi $acc12,$acc08,7 # r1>>7 | 1262 | srwi $acc12,$acc08,7 # r1>>7 |
| 1263 | and $acc10,$acc06,$mask80 | ||
| 1091 | srwi $acc13,$acc09,7 | 1264 | srwi $acc13,$acc09,7 |
| 1265 | and $acc11,$acc07,$mask80 | ||
| 1092 | srwi $acc14,$acc10,7 | 1266 | srwi $acc14,$acc10,7 |
| 1093 | srwi $acc15,$acc11,7 | ||
| 1094 | sub $acc08,$acc08,$acc12 # r1-(r1>>7) | 1267 | sub $acc08,$acc08,$acc12 # r1-(r1>>7) |
| 1268 | srwi $acc15,$acc11,7 | ||
| 1095 | sub $acc09,$acc09,$acc13 | 1269 | sub $acc09,$acc09,$acc13 |
| 1096 | sub $acc10,$acc10,$acc14 | 1270 | sub $acc10,$acc10,$acc14 |
| 1097 | sub $acc11,$acc11,$acc15 | 1271 | sub $acc11,$acc11,$acc15 |
| @@ -1124,10 +1298,10 @@ ___ | |||
| 1124 | $code.=<<___; | 1298 | $code.=<<___; |
| 1125 | rotrwi $s0,$s0,8 # = ROTATE(r0,8) | 1299 | rotrwi $s0,$s0,8 # = ROTATE(r0,8) |
| 1126 | rotrwi $s1,$s1,8 | 1300 | rotrwi $s1,$s1,8 |
| 1127 | rotrwi $s2,$s2,8 | ||
| 1128 | rotrwi $s3,$s3,8 | ||
| 1129 | xor $s0,$s0,$acc00 # ^= r2^r0 | 1301 | xor $s0,$s0,$acc00 # ^= r2^r0 |
| 1302 | rotrwi $s2,$s2,8 | ||
| 1130 | xor $s1,$s1,$acc01 | 1303 | xor $s1,$s1,$acc01 |
| 1304 | rotrwi $s3,$s3,8 | ||
| 1131 | xor $s2,$s2,$acc02 | 1305 | xor $s2,$s2,$acc02 |
| 1132 | xor $s3,$s3,$acc03 | 1306 | xor $s3,$s3,$acc03 |
| 1133 | xor $acc00,$acc00,$acc08 | 1307 | xor $acc00,$acc00,$acc08 |
| @@ -1135,32 +1309,32 @@ $code.=<<___; | |||
| 1135 | xor $acc02,$acc02,$acc10 | 1309 | xor $acc02,$acc02,$acc10 |
| 1136 | xor $acc03,$acc03,$acc11 | 1310 | xor $acc03,$acc03,$acc11 |
| 1137 | xor $s0,$s0,$acc04 # ^= r4^r0 | 1311 | xor $s0,$s0,$acc04 # ^= r4^r0 |
| 1138 | xor $s1,$s1,$acc05 | ||
| 1139 | xor $s2,$s2,$acc06 | ||
| 1140 | xor $s3,$s3,$acc07 | ||
| 1141 | rotrwi $acc00,$acc00,24 | 1312 | rotrwi $acc00,$acc00,24 |
| 1313 | xor $s1,$s1,$acc05 | ||
| 1142 | rotrwi $acc01,$acc01,24 | 1314 | rotrwi $acc01,$acc01,24 |
| 1315 | xor $s2,$s2,$acc06 | ||
| 1143 | rotrwi $acc02,$acc02,24 | 1316 | rotrwi $acc02,$acc02,24 |
| 1317 | xor $s3,$s3,$acc07 | ||
| 1144 | rotrwi $acc03,$acc03,24 | 1318 | rotrwi $acc03,$acc03,24 |
| 1145 | xor $acc04,$acc04,$acc08 | 1319 | xor $acc04,$acc04,$acc08 |
| 1146 | xor $acc05,$acc05,$acc09 | 1320 | xor $acc05,$acc05,$acc09 |
| 1147 | xor $acc06,$acc06,$acc10 | 1321 | xor $acc06,$acc06,$acc10 |
| 1148 | xor $acc07,$acc07,$acc11 | 1322 | xor $acc07,$acc07,$acc11 |
| 1149 | xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] | 1323 | xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] |
| 1150 | xor $s1,$s1,$acc09 | ||
| 1151 | xor $s2,$s2,$acc10 | ||
| 1152 | xor $s3,$s3,$acc11 | ||
| 1153 | rotrwi $acc04,$acc04,16 | 1324 | rotrwi $acc04,$acc04,16 |
| 1325 | xor $s1,$s1,$acc09 | ||
| 1154 | rotrwi $acc05,$acc05,16 | 1326 | rotrwi $acc05,$acc05,16 |
| 1327 | xor $s2,$s2,$acc10 | ||
| 1155 | rotrwi $acc06,$acc06,16 | 1328 | rotrwi $acc06,$acc06,16 |
| 1329 | xor $s3,$s3,$acc11 | ||
| 1156 | rotrwi $acc07,$acc07,16 | 1330 | rotrwi $acc07,$acc07,16 |
| 1157 | xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) | 1331 | xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) |
| 1158 | xor $s1,$s1,$acc01 | ||
| 1159 | xor $s2,$s2,$acc02 | ||
| 1160 | xor $s3,$s3,$acc03 | ||
| 1161 | rotrwi $acc08,$acc08,8 | 1332 | rotrwi $acc08,$acc08,8 |
| 1333 | xor $s1,$s1,$acc01 | ||
| 1162 | rotrwi $acc09,$acc09,8 | 1334 | rotrwi $acc09,$acc09,8 |
| 1335 | xor $s2,$s2,$acc02 | ||
| 1163 | rotrwi $acc10,$acc10,8 | 1336 | rotrwi $acc10,$acc10,8 |
| 1337 | xor $s3,$s3,$acc03 | ||
| 1164 | rotrwi $acc11,$acc11,8 | 1338 | rotrwi $acc11,$acc11,8 |
| 1165 | xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) | 1339 | xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) |
| 1166 | xor $s1,$s1,$acc05 | 1340 | xor $s1,$s1,$acc05 |
| @@ -1179,7 +1353,9 @@ Ldec_compact_done: | |||
| 1179 | xor $s2,$s2,$t2 | 1353 | xor $s2,$s2,$t2 |
| 1180 | xor $s3,$s3,$t3 | 1354 | xor $s3,$s3,$t3 |
| 1181 | blr | 1355 | blr |
| 1182 | .long 0 | 1356 | .long 0 |
| 1357 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 1358 | |||
| 1183 | .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" | 1359 | .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" |
| 1184 | .align 7 | 1360 | .align 7 |
| 1185 | ___ | 1361 | ___ |
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl index 7e01889298..445a1e6762 100644 --- a/src/lib/libcrypto/aes/asm/aes-s390x.pl +++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl | |||
| @@ -44,12 +44,57 @@ | |||
| 44 | # Unlike previous version hardware support detection takes place only | 44 | # Unlike previous version hardware support detection takes place only |
| 45 | # at the moment of key schedule setup, which is denoted in key->rounds. | 45 | # at the moment of key schedule setup, which is denoted in key->rounds. |
| 46 | # This is done, because deferred key setup can't be made MT-safe, not | 46 | # This is done, because deferred key setup can't be made MT-safe, not |
| 47 | # for key lengthes longer than 128 bits. | 47 | # for keys longer than 128 bits. |
| 48 | # | 48 | # |
| 49 | # Add AES_cbc_encrypt, which gives incredible performance improvement, | 49 | # Add AES_cbc_encrypt, which gives incredible performance improvement, |
| 50 | # it was measured to be ~6.6x. It's less than previously mentioned 8x, | 50 | # it was measured to be ~6.6x. It's less than previously mentioned 8x, |
| 51 | # because software implementation was optimized. | 51 | # because software implementation was optimized. |
| 52 | 52 | ||
| 53 | # May 2010. | ||
| 54 | # | ||
| 55 | # Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x | ||
| 56 | # performance improvement over "generic" counter mode routine relying | ||
| 57 | # on single-block, also hardware-assisted, AES_encrypt. "Up to" refers | ||
| 58 | # to the fact that exact throughput value depends on current stack | ||
| 59 | # frame alignment within 4KB page. In worst case you get ~75% of the | ||
| 60 | # maximum, but *on average* it would be as much as ~98%. Meaning that | ||
| 61 | # worst case is unlike, it's like hitting ravine on plateau. | ||
| 62 | |||
| 63 | # November 2010. | ||
| 64 | # | ||
| 65 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 66 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 67 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 68 | # application context. The feature is not specific to any particular | ||
| 69 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 70 | # remains z/Architecture specific. On z990 it was measured to perform | ||
| 71 | # 2x better than code generated by gcc 4.3. | ||
| 72 | |||
| 73 | # December 2010. | ||
| 74 | # | ||
| 75 | # Add support for z196 "cipher message with counter" instruction. | ||
| 76 | # Note however that it's disengaged, because it was measured to | ||
| 77 | # perform ~12% worse than vanilla km-based code... | ||
| 78 | |||
| 79 | # February 2011. | ||
| 80 | # | ||
| 81 | # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes | ||
| 82 | # instructions, which deliver ~70% improvement at 8KB block size over | ||
| 83 | # vanilla km-based code, 37% - at most like 512-bytes block size. | ||
| 84 | |||
| 85 | $flavour = shift; | ||
| 86 | |||
| 87 | if ($flavour =~ /3[12]/) { | ||
| 88 | $SIZE_T=4; | ||
| 89 | $g=""; | ||
| 90 | } else { | ||
| 91 | $SIZE_T=8; | ||
| 92 | $g="g"; | ||
| 93 | } | ||
| 94 | |||
| 95 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 96 | open STDOUT,">$output"; | ||
| 97 | |||
| 53 | $softonly=0; # allow hardware support | 98 | $softonly=0; # allow hardware support |
| 54 | 99 | ||
| 55 | $t0="%r0"; $mask="%r0"; | 100 | $t0="%r0"; $mask="%r0"; |
| @@ -69,6 +114,8 @@ $rounds="%r13"; | |||
| 69 | $ra="%r14"; | 114 | $ra="%r14"; |
| 70 | $sp="%r15"; | 115 | $sp="%r15"; |
| 71 | 116 | ||
| 117 | $stdframe=16*$SIZE_T+4*8; | ||
| 118 | |||
| 72 | sub _data_word() | 119 | sub _data_word() |
| 73 | { my $i; | 120 | { my $i; |
| 74 | while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } | 121 | while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } |
| @@ -210,7 +257,7 @@ $code.=<<___ if (!$softonly); | |||
| 210 | .Lesoft: | 257 | .Lesoft: |
| 211 | ___ | 258 | ___ |
| 212 | $code.=<<___; | 259 | $code.=<<___; |
| 213 | stmg %r3,$ra,24($sp) | 260 | stm${g} %r3,$ra,3*$SIZE_T($sp) |
| 214 | 261 | ||
| 215 | llgf $s0,0($inp) | 262 | llgf $s0,0($inp) |
| 216 | llgf $s1,4($inp) | 263 | llgf $s1,4($inp) |
| @@ -220,20 +267,20 @@ $code.=<<___; | |||
| 220 | larl $tbl,AES_Te | 267 | larl $tbl,AES_Te |
| 221 | bras $ra,_s390x_AES_encrypt | 268 | bras $ra,_s390x_AES_encrypt |
| 222 | 269 | ||
| 223 | lg $out,24($sp) | 270 | l${g} $out,3*$SIZE_T($sp) |
| 224 | st $s0,0($out) | 271 | st $s0,0($out) |
| 225 | st $s1,4($out) | 272 | st $s1,4($out) |
| 226 | st $s2,8($out) | 273 | st $s2,8($out) |
| 227 | st $s3,12($out) | 274 | st $s3,12($out) |
| 228 | 275 | ||
| 229 | lmg %r6,$ra,48($sp) | 276 | lm${g} %r6,$ra,6*$SIZE_T($sp) |
| 230 | br $ra | 277 | br $ra |
| 231 | .size AES_encrypt,.-AES_encrypt | 278 | .size AES_encrypt,.-AES_encrypt |
| 232 | 279 | ||
| 233 | .type _s390x_AES_encrypt,\@function | 280 | .type _s390x_AES_encrypt,\@function |
| 234 | .align 16 | 281 | .align 16 |
| 235 | _s390x_AES_encrypt: | 282 | _s390x_AES_encrypt: |
| 236 | stg $ra,152($sp) | 283 | st${g} $ra,15*$SIZE_T($sp) |
| 237 | x $s0,0($key) | 284 | x $s0,0($key) |
| 238 | x $s1,4($key) | 285 | x $s1,4($key) |
| 239 | x $s2,8($key) | 286 | x $s2,8($key) |
| @@ -397,7 +444,7 @@ _s390x_AES_encrypt: | |||
| 397 | or $s2,$i3 | 444 | or $s2,$i3 |
| 398 | or $s3,$t3 | 445 | or $s3,$t3 |
| 399 | 446 | ||
| 400 | lg $ra,152($sp) | 447 | l${g} $ra,15*$SIZE_T($sp) |
| 401 | xr $s0,$t0 | 448 | xr $s0,$t0 |
| 402 | xr $s1,$t2 | 449 | xr $s1,$t2 |
| 403 | x $s2,24($key) | 450 | x $s2,24($key) |
| @@ -536,7 +583,7 @@ $code.=<<___ if (!$softonly); | |||
| 536 | .Ldsoft: | 583 | .Ldsoft: |
| 537 | ___ | 584 | ___ |
| 538 | $code.=<<___; | 585 | $code.=<<___; |
| 539 | stmg %r3,$ra,24($sp) | 586 | stm${g} %r3,$ra,3*$SIZE_T($sp) |
| 540 | 587 | ||
| 541 | llgf $s0,0($inp) | 588 | llgf $s0,0($inp) |
| 542 | llgf $s1,4($inp) | 589 | llgf $s1,4($inp) |
| @@ -546,20 +593,20 @@ $code.=<<___; | |||
| 546 | larl $tbl,AES_Td | 593 | larl $tbl,AES_Td |
| 547 | bras $ra,_s390x_AES_decrypt | 594 | bras $ra,_s390x_AES_decrypt |
| 548 | 595 | ||
| 549 | lg $out,24($sp) | 596 | l${g} $out,3*$SIZE_T($sp) |
| 550 | st $s0,0($out) | 597 | st $s0,0($out) |
| 551 | st $s1,4($out) | 598 | st $s1,4($out) |
| 552 | st $s2,8($out) | 599 | st $s2,8($out) |
| 553 | st $s3,12($out) | 600 | st $s3,12($out) |
| 554 | 601 | ||
| 555 | lmg %r6,$ra,48($sp) | 602 | lm${g} %r6,$ra,6*$SIZE_T($sp) |
| 556 | br $ra | 603 | br $ra |
| 557 | .size AES_decrypt,.-AES_decrypt | 604 | .size AES_decrypt,.-AES_decrypt |
| 558 | 605 | ||
| 559 | .type _s390x_AES_decrypt,\@function | 606 | .type _s390x_AES_decrypt,\@function |
| 560 | .align 16 | 607 | .align 16 |
| 561 | _s390x_AES_decrypt: | 608 | _s390x_AES_decrypt: |
| 562 | stg $ra,152($sp) | 609 | st${g} $ra,15*$SIZE_T($sp) |
| 563 | x $s0,0($key) | 610 | x $s0,0($key) |
| 564 | x $s1,4($key) | 611 | x $s1,4($key) |
| 565 | x $s2,8($key) | 612 | x $s2,8($key) |
| @@ -703,7 +750,7 @@ _s390x_AES_decrypt: | |||
| 703 | nr $i1,$mask | 750 | nr $i1,$mask |
| 704 | nr $i2,$mask | 751 | nr $i2,$mask |
| 705 | 752 | ||
| 706 | lg $ra,152($sp) | 753 | l${g} $ra,15*$SIZE_T($sp) |
| 707 | or $s1,$t1 | 754 | or $s1,$t1 |
| 708 | l $t0,16($key) | 755 | l $t0,16($key) |
| 709 | l $t1,20($key) | 756 | l $t1,20($key) |
| @@ -732,14 +779,15 @@ ___ | |||
| 732 | $code.=<<___; | 779 | $code.=<<___; |
| 733 | # void AES_set_encrypt_key(const unsigned char *in, int bits, | 780 | # void AES_set_encrypt_key(const unsigned char *in, int bits, |
| 734 | # AES_KEY *key) { | 781 | # AES_KEY *key) { |
| 735 | .globl AES_set_encrypt_key | 782 | .globl private_AES_set_encrypt_key |
| 736 | .type AES_set_encrypt_key,\@function | 783 | .type private_AES_set_encrypt_key,\@function |
| 737 | .align 16 | 784 | .align 16 |
| 738 | AES_set_encrypt_key: | 785 | private_AES_set_encrypt_key: |
| 786 | _s390x_AES_set_encrypt_key: | ||
| 739 | lghi $t0,0 | 787 | lghi $t0,0 |
| 740 | clgr $inp,$t0 | 788 | cl${g}r $inp,$t0 |
| 741 | je .Lminus1 | 789 | je .Lminus1 |
| 742 | clgr $key,$t0 | 790 | cl${g}r $key,$t0 |
| 743 | je .Lminus1 | 791 | je .Lminus1 |
| 744 | 792 | ||
| 745 | lghi $t0,128 | 793 | lghi $t0,128 |
| @@ -789,7 +837,8 @@ $code.=<<___ if (!$softonly); | |||
| 789 | je 1f | 837 | je 1f |
| 790 | lg %r1,24($inp) | 838 | lg %r1,24($inp) |
| 791 | stg %r1,24($key) | 839 | stg %r1,24($key) |
| 792 | 1: st $bits,236($key) # save bits | 840 | 1: st $bits,236($key) # save bits [for debugging purposes] |
| 841 | lgr $t0,%r5 | ||
| 793 | st %r5,240($key) # save km code | 842 | st %r5,240($key) # save km code |
| 794 | lghi %r2,0 | 843 | lghi %r2,0 |
| 795 | br %r14 | 844 | br %r14 |
| @@ -797,7 +846,7 @@ ___ | |||
| 797 | $code.=<<___; | 846 | $code.=<<___; |
| 798 | .align 16 | 847 | .align 16 |
| 799 | .Lekey_internal: | 848 | .Lekey_internal: |
| 800 | stmg %r6,%r13,48($sp) # all non-volatile regs | 849 | stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key |
| 801 | 850 | ||
| 802 | larl $tbl,AES_Te+2048 | 851 | larl $tbl,AES_Te+2048 |
| 803 | 852 | ||
| @@ -857,8 +906,9 @@ $code.=<<___; | |||
| 857 | la $key,16($key) # key+=4 | 906 | la $key,16($key) # key+=4 |
| 858 | la $t3,4($t3) # i++ | 907 | la $t3,4($t3) # i++ |
| 859 | brct $rounds,.L128_loop | 908 | brct $rounds,.L128_loop |
| 909 | lghi $t0,10 | ||
| 860 | lghi %r2,0 | 910 | lghi %r2,0 |
| 861 | lmg %r6,%r13,48($sp) | 911 | lm${g} %r4,%r13,4*$SIZE_T($sp) |
| 862 | br $ra | 912 | br $ra |
| 863 | 913 | ||
| 864 | .align 16 | 914 | .align 16 |
| @@ -905,8 +955,9 @@ $code.=<<___; | |||
| 905 | st $s2,32($key) | 955 | st $s2,32($key) |
| 906 | st $s3,36($key) | 956 | st $s3,36($key) |
| 907 | brct $rounds,.L192_continue | 957 | brct $rounds,.L192_continue |
| 958 | lghi $t0,12 | ||
| 908 | lghi %r2,0 | 959 | lghi %r2,0 |
| 909 | lmg %r6,%r13,48($sp) | 960 | lm${g} %r4,%r13,4*$SIZE_T($sp) |
| 910 | br $ra | 961 | br $ra |
| 911 | 962 | ||
| 912 | .align 16 | 963 | .align 16 |
| @@ -967,8 +1018,9 @@ $code.=<<___; | |||
| 967 | st $s2,40($key) | 1018 | st $s2,40($key) |
| 968 | st $s3,44($key) | 1019 | st $s3,44($key) |
| 969 | brct $rounds,.L256_continue | 1020 | brct $rounds,.L256_continue |
| 1021 | lghi $t0,14 | ||
| 970 | lghi %r2,0 | 1022 | lghi %r2,0 |
| 971 | lmg %r6,%r13,48($sp) | 1023 | lm${g} %r4,%r13,4*$SIZE_T($sp) |
| 972 | br $ra | 1024 | br $ra |
| 973 | 1025 | ||
| 974 | .align 16 | 1026 | .align 16 |
| @@ -1011,42 +1063,34 @@ $code.=<<___; | |||
| 1011 | .Lminus1: | 1063 | .Lminus1: |
| 1012 | lghi %r2,-1 | 1064 | lghi %r2,-1 |
| 1013 | br $ra | 1065 | br $ra |
| 1014 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | 1066 | .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key |
| 1015 | 1067 | ||
| 1016 | # void AES_set_decrypt_key(const unsigned char *in, int bits, | 1068 | # void AES_set_decrypt_key(const unsigned char *in, int bits, |
| 1017 | # AES_KEY *key) { | 1069 | # AES_KEY *key) { |
| 1018 | .globl AES_set_decrypt_key | 1070 | .globl private_AES_set_decrypt_key |
| 1019 | .type AES_set_decrypt_key,\@function | 1071 | .type private_AES_set_decrypt_key,\@function |
| 1020 | .align 16 | 1072 | .align 16 |
| 1021 | AES_set_decrypt_key: | 1073 | private_AES_set_decrypt_key: |
| 1022 | stg $key,32($sp) # I rely on AES_set_encrypt_key to | 1074 | #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to |
| 1023 | stg $ra,112($sp) # save non-volatile registers! | 1075 | st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key! |
| 1024 | bras $ra,AES_set_encrypt_key | 1076 | bras $ra,_s390x_AES_set_encrypt_key |
| 1025 | lg $key,32($sp) | 1077 | #l${g} $key,4*$SIZE_T($sp) |
| 1026 | lg $ra,112($sp) | 1078 | l${g} $ra,14*$SIZE_T($sp) |
| 1027 | ltgr %r2,%r2 | 1079 | ltgr %r2,%r2 |
| 1028 | bnzr $ra | 1080 | bnzr $ra |
| 1029 | ___ | 1081 | ___ |
| 1030 | $code.=<<___ if (!$softonly); | 1082 | $code.=<<___ if (!$softonly); |
| 1031 | l $t0,240($key) | 1083 | #l $t0,240($key) |
| 1032 | lhi $t1,16 | 1084 | lhi $t1,16 |
| 1033 | cr $t0,$t1 | 1085 | cr $t0,$t1 |
| 1034 | jl .Lgo | 1086 | jl .Lgo |
| 1035 | oill $t0,0x80 # set "decrypt" bit | 1087 | oill $t0,0x80 # set "decrypt" bit |
| 1036 | st $t0,240($key) | 1088 | st $t0,240($key) |
| 1037 | br $ra | 1089 | br $ra |
| 1038 | |||
| 1039 | .align 16 | ||
| 1040 | .Ldkey_internal: | ||
| 1041 | stg $key,32($sp) | ||
| 1042 | stg $ra,40($sp) | ||
| 1043 | bras $ra,.Lekey_internal | ||
| 1044 | lg $key,32($sp) | ||
| 1045 | lg $ra,40($sp) | ||
| 1046 | ___ | 1090 | ___ |
| 1047 | $code.=<<___; | 1091 | $code.=<<___; |
| 1048 | 1092 | .align 16 | |
| 1049 | .Lgo: llgf $rounds,240($key) | 1093 | .Lgo: lgr $rounds,$t0 #llgf $rounds,240($key) |
| 1050 | la $i1,0($key) | 1094 | la $i1,0($key) |
| 1051 | sllg $i2,$rounds,4 | 1095 | sllg $i2,$rounds,4 |
| 1052 | la $i2,0($i2,$key) | 1096 | la $i2,0($i2,$key) |
| @@ -1123,13 +1167,14 @@ $code.=<<___; | |||
| 1123 | la $key,4($key) | 1167 | la $key,4($key) |
| 1124 | brct $rounds,.Lmix | 1168 | brct $rounds,.Lmix |
| 1125 | 1169 | ||
| 1126 | lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! | 1170 | lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! |
| 1127 | lghi %r2,0 | 1171 | lghi %r2,0 |
| 1128 | br $ra | 1172 | br $ra |
| 1129 | .size AES_set_decrypt_key,.-AES_set_decrypt_key | 1173 | .size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key |
| 1130 | ___ | 1174 | ___ |
| 1131 | 1175 | ||
| 1132 | #void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | 1176 | ######################################################################## |
| 1177 | # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | ||
| 1133 | # size_t length, const AES_KEY *key, | 1178 | # size_t length, const AES_KEY *key, |
| 1134 | # unsigned char *ivec, const int enc) | 1179 | # unsigned char *ivec, const int enc) |
| 1135 | { | 1180 | { |
| @@ -1163,7 +1208,7 @@ $code.=<<___ if (!$softonly); | |||
| 1163 | l %r0,240($key) # load kmc code | 1208 | l %r0,240($key) # load kmc code |
| 1164 | lghi $key,15 # res=len%16, len-=res; | 1209 | lghi $key,15 # res=len%16, len-=res; |
| 1165 | ngr $key,$len | 1210 | ngr $key,$len |
| 1166 | slgr $len,$key | 1211 | sl${g}r $len,$key |
| 1167 | la %r1,16($sp) # parameter block - ivec || key | 1212 | la %r1,16($sp) # parameter block - ivec || key |
| 1168 | jz .Lkmc_truncated | 1213 | jz .Lkmc_truncated |
| 1169 | .long 0xb92f0042 # kmc %r4,%r2 | 1214 | .long 0xb92f0042 # kmc %r4,%r2 |
| @@ -1181,34 +1226,34 @@ $code.=<<___ if (!$softonly); | |||
| 1181 | tmll %r0,0x80 | 1226 | tmll %r0,0x80 |
| 1182 | jnz .Lkmc_truncated_dec | 1227 | jnz .Lkmc_truncated_dec |
| 1183 | lghi %r1,0 | 1228 | lghi %r1,0 |
| 1184 | stg %r1,128($sp) | 1229 | stg %r1,16*$SIZE_T($sp) |
| 1185 | stg %r1,136($sp) | 1230 | stg %r1,16*$SIZE_T+8($sp) |
| 1186 | bras %r1,1f | 1231 | bras %r1,1f |
| 1187 | mvc 128(1,$sp),0($inp) | 1232 | mvc 16*$SIZE_T(1,$sp),0($inp) |
| 1188 | 1: ex $key,0(%r1) | 1233 | 1: ex $key,0(%r1) |
| 1189 | la %r1,16($sp) # restore parameter block | 1234 | la %r1,16($sp) # restore parameter block |
| 1190 | la $inp,128($sp) | 1235 | la $inp,16*$SIZE_T($sp) |
| 1191 | lghi $len,16 | 1236 | lghi $len,16 |
| 1192 | .long 0xb92f0042 # kmc %r4,%r2 | 1237 | .long 0xb92f0042 # kmc %r4,%r2 |
| 1193 | j .Lkmc_done | 1238 | j .Lkmc_done |
| 1194 | .align 16 | 1239 | .align 16 |
| 1195 | .Lkmc_truncated_dec: | 1240 | .Lkmc_truncated_dec: |
| 1196 | stg $out,64($sp) | 1241 | st${g} $out,4*$SIZE_T($sp) |
| 1197 | la $out,128($sp) | 1242 | la $out,16*$SIZE_T($sp) |
| 1198 | lghi $len,16 | 1243 | lghi $len,16 |
| 1199 | .long 0xb92f0042 # kmc %r4,%r2 | 1244 | .long 0xb92f0042 # kmc %r4,%r2 |
| 1200 | lg $out,64($sp) | 1245 | l${g} $out,4*$SIZE_T($sp) |
| 1201 | bras %r1,2f | 1246 | bras %r1,2f |
| 1202 | mvc 0(1,$out),128($sp) | 1247 | mvc 0(1,$out),16*$SIZE_T($sp) |
| 1203 | 2: ex $key,0(%r1) | 1248 | 2: ex $key,0(%r1) |
| 1204 | j .Lkmc_done | 1249 | j .Lkmc_done |
| 1205 | .align 16 | 1250 | .align 16 |
| 1206 | .Lcbc_software: | 1251 | .Lcbc_software: |
| 1207 | ___ | 1252 | ___ |
| 1208 | $code.=<<___; | 1253 | $code.=<<___; |
| 1209 | stmg $key,$ra,40($sp) | 1254 | stm${g} $key,$ra,5*$SIZE_T($sp) |
| 1210 | lhi %r0,0 | 1255 | lhi %r0,0 |
| 1211 | cl %r0,164($sp) | 1256 | cl %r0,`$stdframe+$SIZE_T-4`($sp) |
| 1212 | je .Lcbc_decrypt | 1257 | je .Lcbc_decrypt |
| 1213 | 1258 | ||
| 1214 | larl $tbl,AES_Te | 1259 | larl $tbl,AES_Te |
| @@ -1219,10 +1264,10 @@ $code.=<<___; | |||
| 1219 | llgf $s3,12($ivp) | 1264 | llgf $s3,12($ivp) |
| 1220 | 1265 | ||
| 1221 | lghi $t0,16 | 1266 | lghi $t0,16 |
| 1222 | slgr $len,$t0 | 1267 | sl${g}r $len,$t0 |
| 1223 | brc 4,.Lcbc_enc_tail # if borrow | 1268 | brc 4,.Lcbc_enc_tail # if borrow |
| 1224 | .Lcbc_enc_loop: | 1269 | .Lcbc_enc_loop: |
| 1225 | stmg $inp,$out,16($sp) | 1270 | stm${g} $inp,$out,2*$SIZE_T($sp) |
| 1226 | x $s0,0($inp) | 1271 | x $s0,0($inp) |
| 1227 | x $s1,4($inp) | 1272 | x $s1,4($inp) |
| 1228 | x $s2,8($inp) | 1273 | x $s2,8($inp) |
| @@ -1231,7 +1276,7 @@ $code.=<<___; | |||
| 1231 | 1276 | ||
| 1232 | bras $ra,_s390x_AES_encrypt | 1277 | bras $ra,_s390x_AES_encrypt |
| 1233 | 1278 | ||
| 1234 | lmg $inp,$key,16($sp) | 1279 | lm${g} $inp,$key,2*$SIZE_T($sp) |
| 1235 | st $s0,0($out) | 1280 | st $s0,0($out) |
| 1236 | st $s1,4($out) | 1281 | st $s1,4($out) |
| 1237 | st $s2,8($out) | 1282 | st $s2,8($out) |
| @@ -1240,33 +1285,33 @@ $code.=<<___; | |||
| 1240 | la $inp,16($inp) | 1285 | la $inp,16($inp) |
| 1241 | la $out,16($out) | 1286 | la $out,16($out) |
| 1242 | lghi $t0,16 | 1287 | lghi $t0,16 |
| 1243 | ltgr $len,$len | 1288 | lt${g}r $len,$len |
| 1244 | jz .Lcbc_enc_done | 1289 | jz .Lcbc_enc_done |
| 1245 | slgr $len,$t0 | 1290 | sl${g}r $len,$t0 |
| 1246 | brc 4,.Lcbc_enc_tail # if borrow | 1291 | brc 4,.Lcbc_enc_tail # if borrow |
| 1247 | j .Lcbc_enc_loop | 1292 | j .Lcbc_enc_loop |
| 1248 | .align 16 | 1293 | .align 16 |
| 1249 | .Lcbc_enc_done: | 1294 | .Lcbc_enc_done: |
| 1250 | lg $ivp,48($sp) | 1295 | l${g} $ivp,6*$SIZE_T($sp) |
| 1251 | st $s0,0($ivp) | 1296 | st $s0,0($ivp) |
| 1252 | st $s1,4($ivp) | 1297 | st $s1,4($ivp) |
| 1253 | st $s2,8($ivp) | 1298 | st $s2,8($ivp) |
| 1254 | st $s3,12($ivp) | 1299 | st $s3,12($ivp) |
| 1255 | 1300 | ||
| 1256 | lmg %r7,$ra,56($sp) | 1301 | lm${g} %r7,$ra,7*$SIZE_T($sp) |
| 1257 | br $ra | 1302 | br $ra |
| 1258 | 1303 | ||
| 1259 | .align 16 | 1304 | .align 16 |
| 1260 | .Lcbc_enc_tail: | 1305 | .Lcbc_enc_tail: |
| 1261 | aghi $len,15 | 1306 | aghi $len,15 |
| 1262 | lghi $t0,0 | 1307 | lghi $t0,0 |
| 1263 | stg $t0,128($sp) | 1308 | stg $t0,16*$SIZE_T($sp) |
| 1264 | stg $t0,136($sp) | 1309 | stg $t0,16*$SIZE_T+8($sp) |
| 1265 | bras $t1,3f | 1310 | bras $t1,3f |
| 1266 | mvc 128(1,$sp),0($inp) | 1311 | mvc 16*$SIZE_T(1,$sp),0($inp) |
| 1267 | 3: ex $len,0($t1) | 1312 | 3: ex $len,0($t1) |
| 1268 | lghi $len,0 | 1313 | lghi $len,0 |
| 1269 | la $inp,128($sp) | 1314 | la $inp,16*$SIZE_T($sp) |
| 1270 | j .Lcbc_enc_loop | 1315 | j .Lcbc_enc_loop |
| 1271 | 1316 | ||
| 1272 | .align 16 | 1317 | .align 16 |
| @@ -1275,10 +1320,10 @@ $code.=<<___; | |||
| 1275 | 1320 | ||
| 1276 | lg $t0,0($ivp) | 1321 | lg $t0,0($ivp) |
| 1277 | lg $t1,8($ivp) | 1322 | lg $t1,8($ivp) |
| 1278 | stmg $t0,$t1,128($sp) | 1323 | stmg $t0,$t1,16*$SIZE_T($sp) |
| 1279 | 1324 | ||
| 1280 | .Lcbc_dec_loop: | 1325 | .Lcbc_dec_loop: |
| 1281 | stmg $inp,$out,16($sp) | 1326 | stm${g} $inp,$out,2*$SIZE_T($sp) |
| 1282 | llgf $s0,0($inp) | 1327 | llgf $s0,0($inp) |
| 1283 | llgf $s1,4($inp) | 1328 | llgf $s1,4($inp) |
| 1284 | llgf $s2,8($inp) | 1329 | llgf $s2,8($inp) |
| @@ -1287,7 +1332,7 @@ $code.=<<___; | |||
| 1287 | 1332 | ||
| 1288 | bras $ra,_s390x_AES_decrypt | 1333 | bras $ra,_s390x_AES_decrypt |
| 1289 | 1334 | ||
| 1290 | lmg $inp,$key,16($sp) | 1335 | lm${g} $inp,$key,2*$SIZE_T($sp) |
| 1291 | sllg $s0,$s0,32 | 1336 | sllg $s0,$s0,32 |
| 1292 | sllg $s2,$s2,32 | 1337 | sllg $s2,$s2,32 |
| 1293 | lr $s0,$s1 | 1338 | lr $s0,$s1 |
| @@ -1295,15 +1340,15 @@ $code.=<<___; | |||
| 1295 | 1340 | ||
| 1296 | lg $t0,0($inp) | 1341 | lg $t0,0($inp) |
| 1297 | lg $t1,8($inp) | 1342 | lg $t1,8($inp) |
| 1298 | xg $s0,128($sp) | 1343 | xg $s0,16*$SIZE_T($sp) |
| 1299 | xg $s2,136($sp) | 1344 | xg $s2,16*$SIZE_T+8($sp) |
| 1300 | lghi $s1,16 | 1345 | lghi $s1,16 |
| 1301 | slgr $len,$s1 | 1346 | sl${g}r $len,$s1 |
| 1302 | brc 4,.Lcbc_dec_tail # if borrow | 1347 | brc 4,.Lcbc_dec_tail # if borrow |
| 1303 | brc 2,.Lcbc_dec_done # if zero | 1348 | brc 2,.Lcbc_dec_done # if zero |
| 1304 | stg $s0,0($out) | 1349 | stg $s0,0($out) |
| 1305 | stg $s2,8($out) | 1350 | stg $s2,8($out) |
| 1306 | stmg $t0,$t1,128($sp) | 1351 | stmg $t0,$t1,16*$SIZE_T($sp) |
| 1307 | 1352 | ||
| 1308 | la $inp,16($inp) | 1353 | la $inp,16($inp) |
| 1309 | la $out,16($out) | 1354 | la $out,16($out) |
| @@ -1313,7 +1358,7 @@ $code.=<<___; | |||
| 1313 | stg $s0,0($out) | 1358 | stg $s0,0($out) |
| 1314 | stg $s2,8($out) | 1359 | stg $s2,8($out) |
| 1315 | .Lcbc_dec_exit: | 1360 | .Lcbc_dec_exit: |
| 1316 | lmg $ivp,$ra,48($sp) | 1361 | lm${g} %r6,$ra,6*$SIZE_T($sp) |
| 1317 | stmg $t0,$t1,0($ivp) | 1362 | stmg $t0,$t1,0($ivp) |
| 1318 | 1363 | ||
| 1319 | br $ra | 1364 | br $ra |
| @@ -1321,19 +1366,889 @@ $code.=<<___; | |||
| 1321 | .align 16 | 1366 | .align 16 |
| 1322 | .Lcbc_dec_tail: | 1367 | .Lcbc_dec_tail: |
| 1323 | aghi $len,15 | 1368 | aghi $len,15 |
| 1324 | stg $s0,128($sp) | 1369 | stg $s0,16*$SIZE_T($sp) |
| 1325 | stg $s2,136($sp) | 1370 | stg $s2,16*$SIZE_T+8($sp) |
| 1326 | bras $s1,4f | 1371 | bras $s1,4f |
| 1327 | mvc 0(1,$out),128($sp) | 1372 | mvc 0(1,$out),16*$SIZE_T($sp) |
| 1328 | 4: ex $len,0($s1) | 1373 | 4: ex $len,0($s1) |
| 1329 | j .Lcbc_dec_exit | 1374 | j .Lcbc_dec_exit |
| 1330 | .size AES_cbc_encrypt,.-AES_cbc_encrypt | 1375 | .size AES_cbc_encrypt,.-AES_cbc_encrypt |
| 1331 | .comm OPENSSL_s390xcap_P,8,8 | 1376 | ___ |
| 1377 | } | ||
| 1378 | ######################################################################## | ||
| 1379 | # void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out, | ||
| 1380 | # size_t blocks, const AES_KEY *key, | ||
| 1381 | # const unsigned char *ivec) | ||
| 1382 | { | ||
| 1383 | my $inp="%r2"; | ||
| 1384 | my $out="%r4"; # blocks and out are swapped | ||
| 1385 | my $len="%r3"; | ||
| 1386 | my $key="%r5"; my $iv0="%r5"; | ||
| 1387 | my $ivp="%r6"; | ||
| 1388 | my $fp ="%r7"; | ||
| 1389 | |||
| 1390 | $code.=<<___; | ||
| 1391 | .globl AES_ctr32_encrypt | ||
| 1392 | .type AES_ctr32_encrypt,\@function | ||
| 1393 | .align 16 | ||
| 1394 | AES_ctr32_encrypt: | ||
| 1395 | xgr %r3,%r4 # flip %r3 and %r4, $out and $len | ||
| 1396 | xgr %r4,%r3 | ||
| 1397 | xgr %r3,%r4 | ||
| 1398 | llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case | ||
| 1399 | ___ | ||
| 1400 | $code.=<<___ if (!$softonly); | ||
| 1401 | l %r0,240($key) | ||
| 1402 | lhi %r1,16 | ||
| 1403 | clr %r0,%r1 | ||
| 1404 | jl .Lctr32_software | ||
| 1405 | |||
| 1406 | stm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1407 | |||
| 1408 | slgr $out,$inp | ||
| 1409 | la %r1,0($key) # %r1 is permanent copy of $key | ||
| 1410 | lg $iv0,0($ivp) # load ivec | ||
| 1411 | lg $ivp,8($ivp) | ||
| 1412 | |||
| 1413 | # prepare and allocate stack frame at the top of 4K page | ||
| 1414 | # with 1K reserved for eventual signal handling | ||
| 1415 | lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer | ||
| 1416 | lghi $s1,-4096 | ||
| 1417 | algr $s0,$sp | ||
| 1418 | lgr $fp,$sp | ||
| 1419 | ngr $s0,$s1 # align at page boundary | ||
| 1420 | slgr $fp,$s0 # total buffer size | ||
| 1421 | lgr $s2,$sp | ||
| 1422 | lghi $s1,1024+16 # sl[g]fi is extended-immediate facility | ||
| 1423 | slgr $fp,$s1 # deduct reservation to get usable buffer size | ||
| 1424 | # buffer size is at lest 256 and at most 3072+256-16 | ||
| 1425 | |||
| 1426 | la $sp,1024($s0) # alloca | ||
| 1427 | srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 | ||
| 1428 | st${g} $s2,0($sp) # back-chain | ||
| 1429 | st${g} $fp,$SIZE_T($sp) | ||
| 1430 | |||
| 1431 | slgr $len,$fp | ||
| 1432 | brc 1,.Lctr32_hw_switch # not zero, no borrow | ||
| 1433 | algr $fp,$len # input is shorter than allocated buffer | ||
| 1434 | lghi $len,0 | ||
| 1435 | st${g} $fp,$SIZE_T($sp) | ||
| 1436 | |||
| 1437 | .Lctr32_hw_switch: | ||
| 1438 | ___ | ||
| 1439 | $code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower | ||
| 1440 | larl $s0,OPENSSL_s390xcap_P | ||
| 1441 | lg $s0,8($s0) | ||
| 1442 | tmhh $s0,0x0004 # check for message_security-assist-4 | ||
| 1443 | jz .Lctr32_km_loop | ||
| 1444 | |||
| 1445 | llgfr $s0,%r0 | ||
| 1446 | lgr $s1,%r1 | ||
| 1447 | lghi %r0,0 | ||
| 1448 | la %r1,16($sp) | ||
| 1449 | .long 0xb92d2042 # kmctr %r4,%r2,%r2 | ||
| 1450 | |||
| 1451 | llihh %r0,0x8000 # check if kmctr supports the function code | ||
| 1452 | srlg %r0,%r0,0($s0) | ||
| 1453 | ng %r0,16($sp) | ||
| 1454 | lgr %r0,$s0 | ||
| 1455 | lgr %r1,$s1 | ||
| 1456 | jz .Lctr32_km_loop | ||
| 1457 | |||
| 1458 | ####### kmctr code | ||
| 1459 | algr $out,$inp # restore $out | ||
| 1460 | lgr $s1,$len # $s1 undertakes $len | ||
| 1461 | j .Lctr32_kmctr_loop | ||
| 1462 | .align 16 | ||
| 1463 | .Lctr32_kmctr_loop: | ||
| 1464 | la $s2,16($sp) | ||
| 1465 | lgr $s3,$fp | ||
| 1466 | .Lctr32_kmctr_prepare: | ||
| 1467 | stg $iv0,0($s2) | ||
| 1468 | stg $ivp,8($s2) | ||
| 1469 | la $s2,16($s2) | ||
| 1470 | ahi $ivp,1 # 32-bit increment, preserves upper half | ||
| 1471 | brct $s3,.Lctr32_kmctr_prepare | ||
| 1472 | |||
| 1473 | #la $inp,0($inp) # inp | ||
| 1474 | sllg $len,$fp,4 # len | ||
| 1475 | #la $out,0($out) # out | ||
| 1476 | la $s2,16($sp) # iv | ||
| 1477 | .long 0xb92da042 # kmctr $out,$s2,$inp | ||
| 1478 | brc 1,.-4 # pay attention to "partial completion" | ||
| 1479 | |||
| 1480 | slgr $s1,$fp | ||
| 1481 | brc 1,.Lctr32_kmctr_loop # not zero, no borrow | ||
| 1482 | algr $fp,$s1 | ||
| 1483 | lghi $s1,0 | ||
| 1484 | brc 4+1,.Lctr32_kmctr_loop # not zero | ||
| 1485 | |||
| 1486 | l${g} $sp,0($sp) | ||
| 1487 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1488 | br $ra | ||
| 1489 | .align 16 | ||
| 1490 | ___ | ||
| 1491 | $code.=<<___; | ||
| 1492 | .Lctr32_km_loop: | ||
| 1493 | la $s2,16($sp) | ||
| 1494 | lgr $s3,$fp | ||
| 1495 | .Lctr32_km_prepare: | ||
| 1496 | stg $iv0,0($s2) | ||
| 1497 | stg $ivp,8($s2) | ||
| 1498 | la $s2,16($s2) | ||
| 1499 | ahi $ivp,1 # 32-bit increment, preserves upper half | ||
| 1500 | brct $s3,.Lctr32_km_prepare | ||
| 1501 | |||
| 1502 | la $s0,16($sp) # inp | ||
| 1503 | sllg $s1,$fp,4 # len | ||
| 1504 | la $s2,16($sp) # out | ||
| 1505 | .long 0xb92e00a8 # km %r10,%r8 | ||
| 1506 | brc 1,.-4 # pay attention to "partial completion" | ||
| 1507 | |||
| 1508 | la $s2,16($sp) | ||
| 1509 | lgr $s3,$fp | ||
| 1510 | slgr $s2,$inp | ||
| 1511 | .Lctr32_km_xor: | ||
| 1512 | lg $s0,0($inp) | ||
| 1513 | lg $s1,8($inp) | ||
| 1514 | xg $s0,0($s2,$inp) | ||
| 1515 | xg $s1,8($s2,$inp) | ||
| 1516 | stg $s0,0($out,$inp) | ||
| 1517 | stg $s1,8($out,$inp) | ||
| 1518 | la $inp,16($inp) | ||
| 1519 | brct $s3,.Lctr32_km_xor | ||
| 1520 | |||
| 1521 | slgr $len,$fp | ||
| 1522 | brc 1,.Lctr32_km_loop # not zero, no borrow | ||
| 1523 | algr $fp,$len | ||
| 1524 | lghi $len,0 | ||
| 1525 | brc 4+1,.Lctr32_km_loop # not zero | ||
| 1526 | |||
| 1527 | l${g} $s0,0($sp) | ||
| 1528 | l${g} $s1,$SIZE_T($sp) | ||
| 1529 | la $s2,16($sp) | ||
| 1530 | .Lctr32_km_zap: | ||
| 1531 | stg $s0,0($s2) | ||
| 1532 | stg $s0,8($s2) | ||
| 1533 | la $s2,16($s2) | ||
| 1534 | brct $s1,.Lctr32_km_zap | ||
| 1535 | |||
| 1536 | la $sp,0($s0) | ||
| 1537 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1538 | br $ra | ||
| 1539 | .align 16 | ||
| 1540 | .Lctr32_software: | ||
| 1541 | ___ | ||
| 1542 | $code.=<<___; | ||
| 1543 | stm${g} $key,$ra,5*$SIZE_T($sp) | ||
| 1544 | sl${g}r $inp,$out | ||
| 1545 | larl $tbl,AES_Te | ||
| 1546 | llgf $t1,12($ivp) | ||
| 1547 | |||
| 1548 | .Lctr32_loop: | ||
| 1549 | stm${g} $inp,$out,2*$SIZE_T($sp) | ||
| 1550 | llgf $s0,0($ivp) | ||
| 1551 | llgf $s1,4($ivp) | ||
| 1552 | llgf $s2,8($ivp) | ||
| 1553 | lgr $s3,$t1 | ||
| 1554 | st $t1,16*$SIZE_T($sp) | ||
| 1555 | lgr %r4,$key | ||
| 1556 | |||
| 1557 | bras $ra,_s390x_AES_encrypt | ||
| 1558 | |||
| 1559 | lm${g} $inp,$ivp,2*$SIZE_T($sp) | ||
| 1560 | llgf $t1,16*$SIZE_T($sp) | ||
| 1561 | x $s0,0($inp,$out) | ||
| 1562 | x $s1,4($inp,$out) | ||
| 1563 | x $s2,8($inp,$out) | ||
| 1564 | x $s3,12($inp,$out) | ||
| 1565 | stm $s0,$s3,0($out) | ||
| 1566 | |||
| 1567 | la $out,16($out) | ||
| 1568 | ahi $t1,1 # 32-bit increment | ||
| 1569 | brct $len,.Lctr32_loop | ||
| 1570 | |||
| 1571 | lm${g} %r6,$ra,6*$SIZE_T($sp) | ||
| 1572 | br $ra | ||
| 1573 | .size AES_ctr32_encrypt,.-AES_ctr32_encrypt | ||
| 1574 | ___ | ||
| 1575 | } | ||
| 1576 | |||
| 1577 | ######################################################################## | ||
| 1578 | # void AES_xts_encrypt(const char *inp,char *out,size_t len, | ||
| 1579 | # const AES_KEY *key1, const AES_KEY *key2, | ||
| 1580 | # const unsigned char iv[16]); | ||
| 1581 | # | ||
| 1582 | { | ||
| 1583 | my $inp="%r2"; | ||
| 1584 | my $out="%r4"; # len and out are swapped | ||
| 1585 | my $len="%r3"; | ||
| 1586 | my $key1="%r5"; # $i1 | ||
| 1587 | my $key2="%r6"; # $i2 | ||
| 1588 | my $fp="%r7"; # $i3 | ||
| 1589 | my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame... | ||
| 1590 | |||
| 1591 | $code.=<<___; | ||
| 1592 | .type _s390x_xts_km,\@function | ||
| 1593 | .align 16 | ||
| 1594 | _s390x_xts_km: | ||
| 1595 | ___ | ||
| 1596 | $code.=<<___ if(1); | ||
| 1597 | llgfr $s0,%r0 # put aside the function code | ||
| 1598 | lghi $s1,0x7f | ||
| 1599 | nr $s1,%r0 | ||
| 1600 | lghi %r0,0 # query capability vector | ||
| 1601 | la %r1,2*$SIZE_T($sp) | ||
| 1602 | .long 0xb92e0042 # km %r4,%r2 | ||
| 1603 | llihh %r1,0x8000 | ||
| 1604 | srlg %r1,%r1,32($s1) # check for 32+function code | ||
| 1605 | ng %r1,2*$SIZE_T($sp) | ||
| 1606 | lgr %r0,$s0 # restore the function code | ||
| 1607 | la %r1,0($key1) # restore $key1 | ||
| 1608 | jz .Lxts_km_vanilla | ||
| 1609 | |||
| 1610 | lmg $i2,$i3,$tweak($sp) # put aside the tweak value | ||
| 1611 | algr $out,$inp | ||
| 1612 | |||
| 1613 | oill %r0,32 # switch to xts function code | ||
| 1614 | aghi $s1,-18 # | ||
| 1615 | sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16 | ||
| 1616 | la %r1,$tweak-16($sp) | ||
| 1617 | slgr %r1,$s1 # parameter block position | ||
| 1618 | lmg $s0,$s3,0($key1) # load 256 bits of key material, | ||
| 1619 | stmg $s0,$s3,0(%r1) # and copy it to parameter block. | ||
| 1620 | # yes, it contains junk and overlaps | ||
| 1621 | # with the tweak in 128-bit case. | ||
| 1622 | # it's done to avoid conditional | ||
| 1623 | # branch. | ||
| 1624 | stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value | ||
| 1625 | |||
| 1626 | .long 0xb92e0042 # km %r4,%r2 | ||
| 1627 | brc 1,.-4 # pay attention to "partial completion" | ||
| 1628 | |||
| 1629 | lrvg $s0,$tweak+0($sp) # load the last tweak | ||
| 1630 | lrvg $s1,$tweak+8($sp) | ||
| 1631 | stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key | ||
| 1632 | |||
| 1633 | nill %r0,0xffdf # switch back to original function code | ||
| 1634 | la %r1,0($key1) # restore pointer to $key1 | ||
| 1635 | slgr $out,$inp | ||
| 1636 | |||
| 1637 | llgc $len,2*$SIZE_T-1($sp) | ||
| 1638 | nill $len,0x0f # $len%=16 | ||
| 1639 | br $ra | ||
| 1640 | |||
| 1641 | .align 16 | ||
| 1642 | .Lxts_km_vanilla: | ||
| 1643 | ___ | ||
| 1644 | $code.=<<___; | ||
| 1645 | # prepare and allocate stack frame at the top of 4K page | ||
| 1646 | # with 1K reserved for eventual signal handling | ||
| 1647 | lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer | ||
| 1648 | lghi $s1,-4096 | ||
| 1649 | algr $s0,$sp | ||
| 1650 | lgr $fp,$sp | ||
| 1651 | ngr $s0,$s1 # align at page boundary | ||
| 1652 | slgr $fp,$s0 # total buffer size | ||
| 1653 | lgr $s2,$sp | ||
| 1654 | lghi $s1,1024+16 # sl[g]fi is extended-immediate facility | ||
| 1655 | slgr $fp,$s1 # deduct reservation to get usable buffer size | ||
| 1656 | # buffer size is at lest 256 and at most 3072+256-16 | ||
| 1657 | |||
| 1658 | la $sp,1024($s0) # alloca | ||
| 1659 | nill $fp,0xfff0 # round to 16*n | ||
| 1660 | st${g} $s2,0($sp) # back-chain | ||
| 1661 | nill $len,0xfff0 # redundant | ||
| 1662 | st${g} $fp,$SIZE_T($sp) | ||
| 1663 | |||
| 1664 | slgr $len,$fp | ||
| 1665 | brc 1,.Lxts_km_go # not zero, no borrow | ||
| 1666 | algr $fp,$len # input is shorter than allocated buffer | ||
| 1667 | lghi $len,0 | ||
| 1668 | st${g} $fp,$SIZE_T($sp) | ||
| 1669 | |||
| 1670 | .Lxts_km_go: | ||
| 1671 | lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian | ||
| 1672 | lrvg $s1,$tweak+8($s2) | ||
| 1673 | |||
| 1674 | la $s2,16($sp) # vector of ascending tweak values | ||
| 1675 | slgr $s2,$inp | ||
| 1676 | srlg $s3,$fp,4 | ||
| 1677 | j .Lxts_km_start | ||
| 1678 | |||
| 1679 | .Lxts_km_loop: | ||
| 1680 | la $s2,16($sp) | ||
| 1681 | slgr $s2,$inp | ||
| 1682 | srlg $s3,$fp,4 | ||
| 1683 | .Lxts_km_prepare: | ||
| 1684 | lghi $i1,0x87 | ||
| 1685 | srag $i2,$s1,63 # broadcast upper bit | ||
| 1686 | ngr $i1,$i2 # rem | ||
| 1687 | srlg $i2,$s0,63 # carry bit from lower half | ||
| 1688 | sllg $s0,$s0,1 | ||
| 1689 | sllg $s1,$s1,1 | ||
| 1690 | xgr $s0,$i1 | ||
| 1691 | ogr $s1,$i2 | ||
| 1692 | .Lxts_km_start: | ||
| 1693 | lrvgr $i1,$s0 # flip byte order | ||
| 1694 | lrvgr $i2,$s1 | ||
| 1695 | stg $i1,0($s2,$inp) | ||
| 1696 | stg $i2,8($s2,$inp) | ||
| 1697 | xg $i1,0($inp) | ||
| 1698 | xg $i2,8($inp) | ||
| 1699 | stg $i1,0($out,$inp) | ||
| 1700 | stg $i2,8($out,$inp) | ||
| 1701 | la $inp,16($inp) | ||
| 1702 | brct $s3,.Lxts_km_prepare | ||
| 1703 | |||
| 1704 | slgr $inp,$fp # rewind $inp | ||
| 1705 | la $s2,0($out,$inp) | ||
| 1706 | lgr $s3,$fp | ||
| 1707 | .long 0xb92e00aa # km $s2,$s2 | ||
| 1708 | brc 1,.-4 # pay attention to "partial completion" | ||
| 1709 | |||
| 1710 | la $s2,16($sp) | ||
| 1711 | slgr $s2,$inp | ||
| 1712 | srlg $s3,$fp,4 | ||
| 1713 | .Lxts_km_xor: | ||
| 1714 | lg $i1,0($out,$inp) | ||
| 1715 | lg $i2,8($out,$inp) | ||
| 1716 | xg $i1,0($s2,$inp) | ||
| 1717 | xg $i2,8($s2,$inp) | ||
| 1718 | stg $i1,0($out,$inp) | ||
| 1719 | stg $i2,8($out,$inp) | ||
| 1720 | la $inp,16($inp) | ||
| 1721 | brct $s3,.Lxts_km_xor | ||
| 1722 | |||
| 1723 | slgr $len,$fp | ||
| 1724 | brc 1,.Lxts_km_loop # not zero, no borrow | ||
| 1725 | algr $fp,$len | ||
| 1726 | lghi $len,0 | ||
| 1727 | brc 4+1,.Lxts_km_loop # not zero | ||
| 1728 | |||
| 1729 | l${g} $i1,0($sp) # back-chain | ||
| 1730 | llgf $fp,`2*$SIZE_T-4`($sp) # bytes used | ||
| 1731 | la $i2,16($sp) | ||
| 1732 | srlg $fp,$fp,4 | ||
| 1733 | .Lxts_km_zap: | ||
| 1734 | stg $i1,0($i2) | ||
| 1735 | stg $i1,8($i2) | ||
| 1736 | la $i2,16($i2) | ||
| 1737 | brct $fp,.Lxts_km_zap | ||
| 1738 | |||
| 1739 | la $sp,0($i1) | ||
| 1740 | llgc $len,2*$SIZE_T-1($i1) | ||
| 1741 | nill $len,0x0f # $len%=16 | ||
| 1742 | bzr $ra | ||
| 1743 | |||
| 1744 | # generate one more tweak... | ||
| 1745 | lghi $i1,0x87 | ||
| 1746 | srag $i2,$s1,63 # broadcast upper bit | ||
| 1747 | ngr $i1,$i2 # rem | ||
| 1748 | srlg $i2,$s0,63 # carry bit from lower half | ||
| 1749 | sllg $s0,$s0,1 | ||
| 1750 | sllg $s1,$s1,1 | ||
| 1751 | xgr $s0,$i1 | ||
| 1752 | ogr $s1,$i2 | ||
| 1753 | |||
| 1754 | ltr $len,$len # clear zero flag | ||
| 1755 | br $ra | ||
| 1756 | .size _s390x_xts_km,.-_s390x_xts_km | ||
| 1757 | |||
| 1758 | .globl AES_xts_encrypt | ||
| 1759 | .type AES_xts_encrypt,\@function | ||
| 1760 | .align 16 | ||
| 1761 | AES_xts_encrypt: | ||
| 1762 | xgr %r3,%r4 # flip %r3 and %r4, $out and $len | ||
| 1763 | xgr %r4,%r3 | ||
| 1764 | xgr %r3,%r4 | ||
| 1765 | ___ | ||
| 1766 | $code.=<<___ if ($SIZE_T==4); | ||
| 1767 | llgfr $len,$len | ||
| 1768 | ___ | ||
| 1769 | $code.=<<___; | ||
| 1770 | st${g} $len,1*$SIZE_T($sp) # save copy of $len | ||
| 1771 | srag $len,$len,4 # formally wrong, because it expands | ||
| 1772 | # sign byte, but who can afford asking | ||
| 1773 | # to process more than 2^63-1 bytes? | ||
| 1774 | # I use it, because it sets condition | ||
| 1775 | # code... | ||
| 1776 | bcr 8,$ra # abort if zero (i.e. less than 16) | ||
| 1777 | ___ | ||
| 1778 | $code.=<<___ if (!$softonly); | ||
| 1779 | llgf %r0,240($key2) | ||
| 1780 | lhi %r1,16 | ||
| 1781 | clr %r0,%r1 | ||
| 1782 | jl .Lxts_enc_software | ||
| 1783 | |||
| 1784 | stm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1785 | st${g} $ra,14*$SIZE_T($sp) | ||
| 1786 | |||
| 1787 | sllg $len,$len,4 # $len&=~15 | ||
| 1788 | slgr $out,$inp | ||
| 1789 | |||
| 1790 | # generate the tweak value | ||
| 1791 | l${g} $s3,$stdframe($sp) # pointer to iv | ||
| 1792 | la $s2,$tweak($sp) | ||
| 1793 | lmg $s0,$s1,0($s3) | ||
| 1794 | lghi $s3,16 | ||
| 1795 | stmg $s0,$s1,0($s2) | ||
| 1796 | la %r1,0($key2) # $key2 is not needed anymore | ||
| 1797 | .long 0xb92e00aa # km $s2,$s2, generate the tweak | ||
| 1798 | brc 1,.-4 # can this happen? | ||
| 1799 | |||
| 1800 | l %r0,240($key1) | ||
| 1801 | la %r1,0($key1) # $key1 is not needed anymore | ||
| 1802 | bras $ra,_s390x_xts_km | ||
| 1803 | jz .Lxts_enc_km_done | ||
| 1804 | |||
| 1805 | aghi $inp,-16 # take one step back | ||
| 1806 | la $i3,0($out,$inp) # put aside real $out | ||
| 1807 | .Lxts_enc_km_steal: | ||
| 1808 | llgc $i1,16($inp) | ||
| 1809 | llgc $i2,0($out,$inp) | ||
| 1810 | stc $i1,0($out,$inp) | ||
| 1811 | stc $i2,16($out,$inp) | ||
| 1812 | la $inp,1($inp) | ||
| 1813 | brct $len,.Lxts_enc_km_steal | ||
| 1814 | |||
| 1815 | la $s2,0($i3) | ||
| 1816 | lghi $s3,16 | ||
| 1817 | lrvgr $i1,$s0 # flip byte order | ||
| 1818 | lrvgr $i2,$s1 | ||
| 1819 | xg $i1,0($s2) | ||
| 1820 | xg $i2,8($s2) | ||
| 1821 | stg $i1,0($s2) | ||
| 1822 | stg $i2,8($s2) | ||
| 1823 | .long 0xb92e00aa # km $s2,$s2 | ||
| 1824 | brc 1,.-4 # can this happen? | ||
| 1825 | lrvgr $i1,$s0 # flip byte order | ||
| 1826 | lrvgr $i2,$s1 | ||
| 1827 | xg $i1,0($i3) | ||
| 1828 | xg $i2,8($i3) | ||
| 1829 | stg $i1,0($i3) | ||
| 1830 | stg $i2,8($i3) | ||
| 1831 | |||
| 1832 | .Lxts_enc_km_done: | ||
| 1833 | l${g} $ra,14*$SIZE_T($sp) | ||
| 1834 | st${g} $sp,$tweak($sp) # wipe tweak | ||
| 1835 | st${g} $sp,$tweak($sp) | ||
| 1836 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1837 | br $ra | ||
| 1838 | .align 16 | ||
| 1839 | .Lxts_enc_software: | ||
| 1840 | ___ | ||
| 1841 | $code.=<<___; | ||
| 1842 | stm${g} %r6,$ra,6*$SIZE_T($sp) | ||
| 1843 | |||
| 1844 | slgr $out,$inp | ||
| 1845 | |||
| 1846 | xgr $s0,$s0 # clear upper half | ||
| 1847 | xgr $s1,$s1 | ||
| 1848 | lrv $s0,$stdframe+4($sp) # load secno | ||
| 1849 | lrv $s1,$stdframe+0($sp) | ||
| 1850 | xgr $s2,$s2 | ||
| 1851 | xgr $s3,$s3 | ||
| 1852 | stm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 1853 | la $key,0($key2) | ||
| 1854 | larl $tbl,AES_Te | ||
| 1855 | bras $ra,_s390x_AES_encrypt # generate the tweak | ||
| 1856 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 1857 | stm $s0,$s3,$tweak($sp) # save the tweak | ||
| 1858 | j .Lxts_enc_enter | ||
| 1859 | |||
| 1860 | .align 16 | ||
| 1861 | .Lxts_enc_loop: | ||
| 1862 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
| 1863 | lrvg $s3,$tweak+8($sp) | ||
| 1864 | lghi %r1,0x87 | ||
| 1865 | srag %r0,$s3,63 # broadcast upper bit | ||
| 1866 | ngr %r1,%r0 # rem | ||
| 1867 | srlg %r0,$s1,63 # carry bit from lower half | ||
| 1868 | sllg $s1,$s1,1 | ||
| 1869 | sllg $s3,$s3,1 | ||
| 1870 | xgr $s1,%r1 | ||
| 1871 | ogr $s3,%r0 | ||
| 1872 | lrvgr $s1,$s1 # flip byte order | ||
| 1873 | lrvgr $s3,$s3 | ||
| 1874 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
| 1875 | stg $s1,$tweak+0($sp) # save the tweak | ||
| 1876 | llgfr $s1,$s1 | ||
| 1877 | srlg $s2,$s3,32 | ||
| 1878 | stg $s3,$tweak+8($sp) | ||
| 1879 | llgfr $s3,$s3 | ||
| 1880 | la $inp,16($inp) # $inp+=16 | ||
| 1881 | .Lxts_enc_enter: | ||
| 1882 | x $s0,0($inp) # ^=*($inp) | ||
| 1883 | x $s1,4($inp) | ||
| 1884 | x $s2,8($inp) | ||
| 1885 | x $s3,12($inp) | ||
| 1886 | stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing | ||
| 1887 | la $key,0($key1) | ||
| 1888 | bras $ra,_s390x_AES_encrypt | ||
| 1889 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 1890 | x $s0,$tweak+0($sp) # ^=tweak | ||
| 1891 | x $s1,$tweak+4($sp) | ||
| 1892 | x $s2,$tweak+8($sp) | ||
| 1893 | x $s3,$tweak+12($sp) | ||
| 1894 | st $s0,0($out,$inp) | ||
| 1895 | st $s1,4($out,$inp) | ||
| 1896 | st $s2,8($out,$inp) | ||
| 1897 | st $s3,12($out,$inp) | ||
| 1898 | brct${g} $len,.Lxts_enc_loop | ||
| 1899 | |||
| 1900 | llgc $len,`2*$SIZE_T-1`($sp) | ||
| 1901 | nill $len,0x0f # $len%16 | ||
| 1902 | jz .Lxts_enc_done | ||
| 1903 | |||
| 1904 | la $i3,0($inp,$out) # put aside real $out | ||
| 1905 | .Lxts_enc_steal: | ||
| 1906 | llgc %r0,16($inp) | ||
| 1907 | llgc %r1,0($out,$inp) | ||
| 1908 | stc %r0,0($out,$inp) | ||
| 1909 | stc %r1,16($out,$inp) | ||
| 1910 | la $inp,1($inp) | ||
| 1911 | brct $len,.Lxts_enc_steal | ||
| 1912 | la $out,0($i3) # restore real $out | ||
| 1913 | |||
| 1914 | # generate last tweak... | ||
| 1915 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
| 1916 | lrvg $s3,$tweak+8($sp) | ||
| 1917 | lghi %r1,0x87 | ||
| 1918 | srag %r0,$s3,63 # broadcast upper bit | ||
| 1919 | ngr %r1,%r0 # rem | ||
| 1920 | srlg %r0,$s1,63 # carry bit from lower half | ||
| 1921 | sllg $s1,$s1,1 | ||
| 1922 | sllg $s3,$s3,1 | ||
| 1923 | xgr $s1,%r1 | ||
| 1924 | ogr $s3,%r0 | ||
| 1925 | lrvgr $s1,$s1 # flip byte order | ||
| 1926 | lrvgr $s3,$s3 | ||
| 1927 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
| 1928 | stg $s1,$tweak+0($sp) # save the tweak | ||
| 1929 | llgfr $s1,$s1 | ||
| 1930 | srlg $s2,$s3,32 | ||
| 1931 | stg $s3,$tweak+8($sp) | ||
| 1932 | llgfr $s3,$s3 | ||
| 1933 | |||
| 1934 | x $s0,0($out) # ^=*(inp)|stolen cipther-text | ||
| 1935 | x $s1,4($out) | ||
| 1936 | x $s2,8($out) | ||
| 1937 | x $s3,12($out) | ||
| 1938 | st${g} $out,4*$SIZE_T($sp) | ||
| 1939 | la $key,0($key1) | ||
| 1940 | bras $ra,_s390x_AES_encrypt | ||
| 1941 | l${g} $out,4*$SIZE_T($sp) | ||
| 1942 | x $s0,`$tweak+0`($sp) # ^=tweak | ||
| 1943 | x $s1,`$tweak+4`($sp) | ||
| 1944 | x $s2,`$tweak+8`($sp) | ||
| 1945 | x $s3,`$tweak+12`($sp) | ||
| 1946 | st $s0,0($out) | ||
| 1947 | st $s1,4($out) | ||
| 1948 | st $s2,8($out) | ||
| 1949 | st $s3,12($out) | ||
| 1950 | |||
| 1951 | .Lxts_enc_done: | ||
| 1952 | stg $sp,$tweak+0($sp) # wipe tweak | ||
| 1953 | stg $sp,$twesk+8($sp) | ||
| 1954 | lm${g} %r6,$ra,6*$SIZE_T($sp) | ||
| 1955 | br $ra | ||
| 1956 | .size AES_xts_encrypt,.-AES_xts_encrypt | ||
| 1957 | ___ | ||
| 1958 | # void AES_xts_decrypt(const char *inp,char *out,size_t len, | ||
| 1959 | # const AES_KEY *key1, const AES_KEY *key2,u64 secno); | ||
| 1960 | # | ||
| 1961 | $code.=<<___; | ||
| 1962 | .globl AES_xts_decrypt | ||
| 1963 | .type AES_xts_decrypt,\@function | ||
| 1964 | .align 16 | ||
| 1965 | AES_xts_decrypt: | ||
| 1966 | xgr %r3,%r4 # flip %r3 and %r4, $out and $len | ||
| 1967 | xgr %r4,%r3 | ||
| 1968 | xgr %r3,%r4 | ||
| 1969 | ___ | ||
| 1970 | $code.=<<___ if ($SIZE_T==4); | ||
| 1971 | llgfr $len,$len | ||
| 1972 | ___ | ||
| 1973 | $code.=<<___; | ||
| 1974 | st${g} $len,1*$SIZE_T($sp) # save copy of $len | ||
| 1975 | aghi $len,-16 | ||
| 1976 | bcr 4,$ra # abort if less than zero. formally | ||
| 1977 | # wrong, because $len is unsigned, | ||
| 1978 | # but who can afford asking to | ||
| 1979 | # process more than 2^63-1 bytes? | ||
| 1980 | tmll $len,0x0f | ||
| 1981 | jnz .Lxts_dec_proceed | ||
| 1982 | aghi $len,16 | ||
| 1983 | .Lxts_dec_proceed: | ||
| 1984 | ___ | ||
| 1985 | $code.=<<___ if (!$softonly); | ||
| 1986 | llgf %r0,240($key2) | ||
| 1987 | lhi %r1,16 | ||
| 1988 | clr %r0,%r1 | ||
| 1989 | jl .Lxts_dec_software | ||
| 1990 | |||
| 1991 | stm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 1992 | st${g} $ra,14*$SIZE_T($sp) | ||
| 1993 | |||
| 1994 | nill $len,0xfff0 # $len&=~15 | ||
| 1995 | slgr $out,$inp | ||
| 1996 | |||
| 1997 | # generate the tweak value | ||
| 1998 | l${g} $s3,$stdframe($sp) # pointer to iv | ||
| 1999 | la $s2,$tweak($sp) | ||
| 2000 | lmg $s0,$s1,0($s3) | ||
| 2001 | lghi $s3,16 | ||
| 2002 | stmg $s0,$s1,0($s2) | ||
| 2003 | la %r1,0($key2) # $key2 is not needed past this point | ||
| 2004 | .long 0xb92e00aa # km $s2,$s2, generate the tweak | ||
| 2005 | brc 1,.-4 # can this happen? | ||
| 2006 | |||
| 2007 | l %r0,240($key1) | ||
| 2008 | la %r1,0($key1) # $key1 is not needed anymore | ||
| 2009 | |||
| 2010 | ltgr $len,$len | ||
| 2011 | jz .Lxts_dec_km_short | ||
| 2012 | bras $ra,_s390x_xts_km | ||
| 2013 | jz .Lxts_dec_km_done | ||
| 2014 | |||
| 2015 | lrvgr $s2,$s0 # make copy in reverse byte order | ||
| 2016 | lrvgr $s3,$s1 | ||
| 2017 | j .Lxts_dec_km_2ndtweak | ||
| 2018 | |||
| 2019 | .Lxts_dec_km_short: | ||
| 2020 | llgc $len,`2*$SIZE_T-1`($sp) | ||
| 2021 | nill $len,0x0f # $len%=16 | ||
| 2022 | lrvg $s0,$tweak+0($sp) # load the tweak | ||
| 2023 | lrvg $s1,$tweak+8($sp) | ||
| 2024 | lrvgr $s2,$s0 # make copy in reverse byte order | ||
| 2025 | lrvgr $s3,$s1 | ||
| 2026 | |||
| 2027 | .Lxts_dec_km_2ndtweak: | ||
| 2028 | lghi $i1,0x87 | ||
| 2029 | srag $i2,$s1,63 # broadcast upper bit | ||
| 2030 | ngr $i1,$i2 # rem | ||
| 2031 | srlg $i2,$s0,63 # carry bit from lower half | ||
| 2032 | sllg $s0,$s0,1 | ||
| 2033 | sllg $s1,$s1,1 | ||
| 2034 | xgr $s0,$i1 | ||
| 2035 | ogr $s1,$i2 | ||
| 2036 | lrvgr $i1,$s0 # flip byte order | ||
| 2037 | lrvgr $i2,$s1 | ||
| 2038 | |||
| 2039 | xg $i1,0($inp) | ||
| 2040 | xg $i2,8($inp) | ||
| 2041 | stg $i1,0($out,$inp) | ||
| 2042 | stg $i2,8($out,$inp) | ||
| 2043 | la $i2,0($out,$inp) | ||
| 2044 | lghi $i3,16 | ||
| 2045 | .long 0xb92e0066 # km $i2,$i2 | ||
| 2046 | brc 1,.-4 # can this happen? | ||
| 2047 | lrvgr $i1,$s0 | ||
| 2048 | lrvgr $i2,$s1 | ||
| 2049 | xg $i1,0($out,$inp) | ||
| 2050 | xg $i2,8($out,$inp) | ||
| 2051 | stg $i1,0($out,$inp) | ||
| 2052 | stg $i2,8($out,$inp) | ||
| 2053 | |||
| 2054 | la $i3,0($out,$inp) # put aside real $out | ||
| 2055 | .Lxts_dec_km_steal: | ||
| 2056 | llgc $i1,16($inp) | ||
| 2057 | llgc $i2,0($out,$inp) | ||
| 2058 | stc $i1,0($out,$inp) | ||
| 2059 | stc $i2,16($out,$inp) | ||
| 2060 | la $inp,1($inp) | ||
| 2061 | brct $len,.Lxts_dec_km_steal | ||
| 2062 | |||
| 2063 | lgr $s0,$s2 | ||
| 2064 | lgr $s1,$s3 | ||
| 2065 | xg $s0,0($i3) | ||
| 2066 | xg $s1,8($i3) | ||
| 2067 | stg $s0,0($i3) | ||
| 2068 | stg $s1,8($i3) | ||
| 2069 | la $s0,0($i3) | ||
| 2070 | lghi $s1,16 | ||
| 2071 | .long 0xb92e0088 # km $s0,$s0 | ||
| 2072 | brc 1,.-4 # can this happen? | ||
| 2073 | xg $s2,0($i3) | ||
| 2074 | xg $s3,8($i3) | ||
| 2075 | stg $s2,0($i3) | ||
| 2076 | stg $s3,8($i3) | ||
| 2077 | .Lxts_dec_km_done: | ||
| 2078 | l${g} $ra,14*$SIZE_T($sp) | ||
| 2079 | st${g} $sp,$tweak($sp) # wipe tweak | ||
| 2080 | st${g} $sp,$tweak($sp) | ||
| 2081 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
| 2082 | br $ra | ||
| 2083 | .align 16 | ||
| 2084 | .Lxts_dec_software: | ||
| 2085 | ___ | ||
| 2086 | $code.=<<___; | ||
| 2087 | stm${g} %r6,$ra,6*$SIZE_T($sp) | ||
| 2088 | |||
| 2089 | srlg $len,$len,4 | ||
| 2090 | slgr $out,$inp | ||
| 2091 | |||
| 2092 | xgr $s0,$s0 # clear upper half | ||
| 2093 | xgr $s1,$s1 | ||
| 2094 | lrv $s0,$stdframe+4($sp) # load secno | ||
| 2095 | lrv $s1,$stdframe+0($sp) | ||
| 2096 | xgr $s2,$s2 | ||
| 2097 | xgr $s3,$s3 | ||
| 2098 | stm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 2099 | la $key,0($key2) | ||
| 2100 | larl $tbl,AES_Te | ||
| 2101 | bras $ra,_s390x_AES_encrypt # generate the tweak | ||
| 2102 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 2103 | larl $tbl,AES_Td | ||
| 2104 | lt${g}r $len,$len | ||
| 2105 | stm $s0,$s3,$tweak($sp) # save the tweak | ||
| 2106 | jz .Lxts_dec_short | ||
| 2107 | j .Lxts_dec_enter | ||
| 2108 | |||
| 2109 | .align 16 | ||
| 2110 | .Lxts_dec_loop: | ||
| 2111 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
| 2112 | lrvg $s3,$tweak+8($sp) | ||
| 2113 | lghi %r1,0x87 | ||
| 2114 | srag %r0,$s3,63 # broadcast upper bit | ||
| 2115 | ngr %r1,%r0 # rem | ||
| 2116 | srlg %r0,$s1,63 # carry bit from lower half | ||
| 2117 | sllg $s1,$s1,1 | ||
| 2118 | sllg $s3,$s3,1 | ||
| 2119 | xgr $s1,%r1 | ||
| 2120 | ogr $s3,%r0 | ||
| 2121 | lrvgr $s1,$s1 # flip byte order | ||
| 2122 | lrvgr $s3,$s3 | ||
| 2123 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
| 2124 | stg $s1,$tweak+0($sp) # save the tweak | ||
| 2125 | llgfr $s1,$s1 | ||
| 2126 | srlg $s2,$s3,32 | ||
| 2127 | stg $s3,$tweak+8($sp) | ||
| 2128 | llgfr $s3,$s3 | ||
| 2129 | .Lxts_dec_enter: | ||
| 2130 | x $s0,0($inp) # tweak^=*(inp) | ||
| 2131 | x $s1,4($inp) | ||
| 2132 | x $s2,8($inp) | ||
| 2133 | x $s3,12($inp) | ||
| 2134 | stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing | ||
| 2135 | la $key,0($key1) | ||
| 2136 | bras $ra,_s390x_AES_decrypt | ||
| 2137 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 2138 | x $s0,$tweak+0($sp) # ^=tweak | ||
| 2139 | x $s1,$tweak+4($sp) | ||
| 2140 | x $s2,$tweak+8($sp) | ||
| 2141 | x $s3,$tweak+12($sp) | ||
| 2142 | st $s0,0($out,$inp) | ||
| 2143 | st $s1,4($out,$inp) | ||
| 2144 | st $s2,8($out,$inp) | ||
| 2145 | st $s3,12($out,$inp) | ||
| 2146 | la $inp,16($inp) | ||
| 2147 | brct${g} $len,.Lxts_dec_loop | ||
| 2148 | |||
| 2149 | llgc $len,`2*$SIZE_T-1`($sp) | ||
| 2150 | nill $len,0x0f # $len%16 | ||
| 2151 | jz .Lxts_dec_done | ||
| 2152 | |||
| 2153 | # generate pair of tweaks... | ||
| 2154 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
| 2155 | lrvg $s3,$tweak+8($sp) | ||
| 2156 | lghi %r1,0x87 | ||
| 2157 | srag %r0,$s3,63 # broadcast upper bit | ||
| 2158 | ngr %r1,%r0 # rem | ||
| 2159 | srlg %r0,$s1,63 # carry bit from lower half | ||
| 2160 | sllg $s1,$s1,1 | ||
| 2161 | sllg $s3,$s3,1 | ||
| 2162 | xgr $s1,%r1 | ||
| 2163 | ogr $s3,%r0 | ||
| 2164 | lrvgr $i2,$s1 # flip byte order | ||
| 2165 | lrvgr $i3,$s3 | ||
| 2166 | stmg $i2,$i3,$tweak($sp) # save the 1st tweak | ||
| 2167 | j .Lxts_dec_2ndtweak | ||
| 2168 | |||
| 2169 | .align 16 | ||
| 2170 | .Lxts_dec_short: | ||
| 2171 | llgc $len,`2*$SIZE_T-1`($sp) | ||
| 2172 | nill $len,0x0f # $len%16 | ||
| 2173 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
| 2174 | lrvg $s3,$tweak+8($sp) | ||
| 2175 | .Lxts_dec_2ndtweak: | ||
| 2176 | lghi %r1,0x87 | ||
| 2177 | srag %r0,$s3,63 # broadcast upper bit | ||
| 2178 | ngr %r1,%r0 # rem | ||
| 2179 | srlg %r0,$s1,63 # carry bit from lower half | ||
| 2180 | sllg $s1,$s1,1 | ||
| 2181 | sllg $s3,$s3,1 | ||
| 2182 | xgr $s1,%r1 | ||
| 2183 | ogr $s3,%r0 | ||
| 2184 | lrvgr $s1,$s1 # flip byte order | ||
| 2185 | lrvgr $s3,$s3 | ||
| 2186 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
| 2187 | stg $s1,$tweak-16+0($sp) # save the 2nd tweak | ||
| 2188 | llgfr $s1,$s1 | ||
| 2189 | srlg $s2,$s3,32 | ||
| 2190 | stg $s3,$tweak-16+8($sp) | ||
| 2191 | llgfr $s3,$s3 | ||
| 2192 | |||
| 2193 | x $s0,0($inp) # tweak_the_2nd^=*(inp) | ||
| 2194 | x $s1,4($inp) | ||
| 2195 | x $s2,8($inp) | ||
| 2196 | x $s3,12($inp) | ||
| 2197 | stm${g} %r2,%r3,2*$SIZE_T($sp) | ||
| 2198 | la $key,0($key1) | ||
| 2199 | bras $ra,_s390x_AES_decrypt | ||
| 2200 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
| 2201 | x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd | ||
| 2202 | x $s1,$tweak-16+4($sp) | ||
| 2203 | x $s2,$tweak-16+8($sp) | ||
| 2204 | x $s3,$tweak-16+12($sp) | ||
| 2205 | st $s0,0($out,$inp) | ||
| 2206 | st $s1,4($out,$inp) | ||
| 2207 | st $s2,8($out,$inp) | ||
| 2208 | st $s3,12($out,$inp) | ||
| 2209 | |||
| 2210 | la $i3,0($out,$inp) # put aside real $out | ||
| 2211 | .Lxts_dec_steal: | ||
| 2212 | llgc %r0,16($inp) | ||
| 2213 | llgc %r1,0($out,$inp) | ||
| 2214 | stc %r0,0($out,$inp) | ||
| 2215 | stc %r1,16($out,$inp) | ||
| 2216 | la $inp,1($inp) | ||
| 2217 | brct $len,.Lxts_dec_steal | ||
| 2218 | la $out,0($i3) # restore real $out | ||
| 2219 | |||
| 2220 | lm $s0,$s3,$tweak($sp) # load the 1st tweak | ||
| 2221 | x $s0,0($out) # tweak^=*(inp)|stolen cipher-text | ||
| 2222 | x $s1,4($out) | ||
| 2223 | x $s2,8($out) | ||
| 2224 | x $s3,12($out) | ||
| 2225 | st${g} $out,4*$SIZE_T($sp) | ||
| 2226 | la $key,0($key1) | ||
| 2227 | bras $ra,_s390x_AES_decrypt | ||
| 2228 | l${g} $out,4*$SIZE_T($sp) | ||
| 2229 | x $s0,$tweak+0($sp) # ^=tweak | ||
| 2230 | x $s1,$tweak+4($sp) | ||
| 2231 | x $s2,$tweak+8($sp) | ||
| 2232 | x $s3,$tweak+12($sp) | ||
| 2233 | st $s0,0($out) | ||
| 2234 | st $s1,4($out) | ||
| 2235 | st $s2,8($out) | ||
| 2236 | st $s3,12($out) | ||
| 2237 | stg $sp,$tweak-16+0($sp) # wipe 2nd tweak | ||
| 2238 | stg $sp,$tweak-16+8($sp) | ||
| 2239 | .Lxts_dec_done: | ||
| 2240 | stg $sp,$tweak+0($sp) # wipe tweak | ||
| 2241 | stg $sp,$tweak+8($sp) | ||
| 2242 | lm${g} %r6,$ra,6*$SIZE_T($sp) | ||
| 2243 | br $ra | ||
| 2244 | .size AES_xts_decrypt,.-AES_xts_decrypt | ||
| 1332 | ___ | 2245 | ___ |
| 1333 | } | 2246 | } |
| 1334 | $code.=<<___; | 2247 | $code.=<<___; |
| 1335 | .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 2248 | .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| 2249 | .comm OPENSSL_s390xcap_P,16,8 | ||
| 1336 | ___ | 2250 | ___ |
| 1337 | 2251 | ||
| 1338 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 2252 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
| 1339 | print $code; | 2253 | print $code; |
| 2254 | close STDOUT; # force flush | ||
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl index c57b3a2d6d..403c4d1290 100755 --- a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl +++ b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl | |||
| @@ -1176,6 +1176,7 @@ ___ | |||
| 1176 | # As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have | 1176 | # As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have |
| 1177 | # undesired effect, so just omit them and sacrifice some portion of | 1177 | # undesired effect, so just omit them and sacrifice some portion of |
| 1178 | # percent in performance... | 1178 | # percent in performance... |
| 1179 | $code =~ s/fmovs.*$//gem; | 1179 | $code =~ s/fmovs.*$//gm; |
| 1180 | 1180 | ||
| 1181 | print $code; | 1181 | print $code; |
| 1182 | close STDOUT; # ensure flush | ||
diff --git a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl new file mode 100644 index 0000000000..c6f6b3334a --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl | |||
| @@ -0,0 +1,1249 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # June 2011 | ||
| 11 | # | ||
| 12 | # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled | ||
| 13 | # in http://download.intel.com/design/intarch/papers/323686.pdf, is | ||
| 14 | # that since AESNI-CBC encrypt exhibit *very* low instruction-level | ||
| 15 | # parallelism, interleaving it with another algorithm would allow to | ||
| 16 | # utilize processor resources better and achieve better performance. | ||
| 17 | # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and | ||
| 18 | # AESNI code is weaved into it. Below are performance numbers in | ||
| 19 | # cycles per processed byte, less is better, for standalone AESNI-CBC | ||
| 20 | # encrypt, sum of the latter and standalone SHA1, and "stitched" | ||
| 21 | # subroutine: | ||
| 22 | # | ||
| 23 | # AES-128-CBC +SHA1 stitch gain | ||
| 24 | # Westmere 3.77[+5.6] 9.37 6.65 +41% | ||
| 25 | # Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%) | ||
| 26 | # | ||
| 27 | # AES-192-CBC | ||
| 28 | # Westmere 4.51 10.11 6.97 +45% | ||
| 29 | # Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%) | ||
| 30 | # | ||
| 31 | # AES-256-CBC | ||
| 32 | # Westmere 5.25 10.85 7.25 +50% | ||
| 33 | # Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%) | ||
| 34 | # | ||
| 35 | # (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for | ||
| 36 | # background information. Above numbers in parentheses are SSSE3 | ||
| 37 | # results collected on AVX-capable CPU, i.e. apply on OSes that | ||
| 38 | # don't support AVX. | ||
| 39 | # | ||
| 40 | # Needless to mention that it makes no sense to implement "stitched" | ||
| 41 | # *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1 | ||
| 42 | # fully utilize parallelism, so stitching would not give any gain | ||
| 43 | # anyway. Well, there might be some, e.g. because of better cache | ||
| 44 | # locality... For reference, here are performance results for | ||
| 45 | # standalone AESNI-CBC decrypt: | ||
| 46 | # | ||
| 47 | # AES-128-CBC AES-192-CBC AES-256-CBC | ||
| 48 | # Westmere 1.31 1.55 1.80 | ||
| 49 | # Sandy Bridge 0.93 1.06 1.22 | ||
| 50 | |||
| 51 | $flavour = shift; | ||
| 52 | $output = shift; | ||
| 53 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 54 | |||
| 55 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 56 | |||
| 57 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 58 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 59 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 60 | die "can't locate x86_64-xlate.pl"; | ||
| 61 | |||
| 62 | $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | ||
| 63 | =~ /GNU assembler version ([2-9]\.[0-9]+)/ && | ||
| 64 | $1>=2.19); | ||
| 65 | $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | ||
| 66 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && | ||
| 67 | $1>=2.09); | ||
| 68 | $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | ||
| 69 | `ml64 2>&1` =~ /Version ([0-9]+)\./ && | ||
| 70 | $1>=10); | ||
| 71 | |||
| 72 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 73 | |||
| 74 | # void aesni_cbc_sha1_enc(const void *inp, | ||
| 75 | # void *out, | ||
| 76 | # size_t length, | ||
| 77 | # const AES_KEY *key, | ||
| 78 | # unsigned char *iv, | ||
| 79 | # SHA_CTX *ctx, | ||
| 80 | # const void *in0); | ||
| 81 | |||
| 82 | $code.=<<___; | ||
| 83 | .text | ||
| 84 | .extern OPENSSL_ia32cap_P | ||
| 85 | |||
| 86 | .globl aesni_cbc_sha1_enc | ||
| 87 | .type aesni_cbc_sha1_enc,\@abi-omnipotent | ||
| 88 | .align 16 | ||
| 89 | aesni_cbc_sha1_enc: | ||
| 90 | # caller should check for SSSE3 and AES-NI bits | ||
| 91 | mov OPENSSL_ia32cap_P+0(%rip),%r10d | ||
| 92 | mov OPENSSL_ia32cap_P+4(%rip),%r11d | ||
| 93 | ___ | ||
| 94 | $code.=<<___ if ($avx); | ||
| 95 | and \$`1<<28`,%r11d # mask AVX bit | ||
| 96 | and \$`1<<30`,%r10d # mask "Intel CPU" bit | ||
| 97 | or %r11d,%r10d | ||
| 98 | cmp \$`1<<28|1<<30`,%r10d | ||
| 99 | je aesni_cbc_sha1_enc_avx | ||
| 100 | ___ | ||
| 101 | $code.=<<___; | ||
| 102 | jmp aesni_cbc_sha1_enc_ssse3 | ||
| 103 | ret | ||
| 104 | .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc | ||
| 105 | ___ | ||
| 106 | |||
| 107 | my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); | ||
| 108 | |||
| 109 | my $Xi=4; | ||
| 110 | my @X=map("%xmm$_",(4..7,0..3)); | ||
| 111 | my @Tx=map("%xmm$_",(8..10)); | ||
| 112 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
| 113 | my @T=("%esi","%edi"); | ||
| 114 | my $j=0; my $jj=0; my $r=0; my $sn=0; | ||
| 115 | my $K_XX_XX="%r11"; | ||
| 116 | my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13)); | ||
| 117 | my @rndkey=("%xmm14","%xmm15"); | ||
| 118 | |||
| 119 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm | ||
| 120 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; | ||
| 121 | my $arg = pop; | ||
| 122 | $arg = "\$$arg" if ($arg*1 eq $arg); | ||
| 123 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; | ||
| 124 | } | ||
| 125 | |||
| 126 | my $_rol=sub { &rol(@_) }; | ||
| 127 | my $_ror=sub { &ror(@_) }; | ||
| 128 | |||
| 129 | $code.=<<___; | ||
| 130 | .type aesni_cbc_sha1_enc_ssse3,\@function,6 | ||
| 131 | .align 16 | ||
| 132 | aesni_cbc_sha1_enc_ssse3: | ||
| 133 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument | ||
| 134 | #shr \$6,$len # debugging artefact | ||
| 135 | #jz .Lepilogue_ssse3 # debugging artefact | ||
| 136 | push %rbx | ||
| 137 | push %rbp | ||
| 138 | push %r12 | ||
| 139 | push %r13 | ||
| 140 | push %r14 | ||
| 141 | push %r15 | ||
| 142 | lea `-104-($win64?10*16:0)`(%rsp),%rsp | ||
| 143 | #mov $in0,$inp # debugging artefact | ||
| 144 | #lea 64(%rsp),$ctx # debugging artefact | ||
| 145 | ___ | ||
| 146 | $code.=<<___ if ($win64); | ||
| 147 | movaps %xmm6,96+0(%rsp) | ||
| 148 | movaps %xmm7,96+16(%rsp) | ||
| 149 | movaps %xmm8,96+32(%rsp) | ||
| 150 | movaps %xmm9,96+48(%rsp) | ||
| 151 | movaps %xmm10,96+64(%rsp) | ||
| 152 | movaps %xmm11,96+80(%rsp) | ||
| 153 | movaps %xmm12,96+96(%rsp) | ||
| 154 | movaps %xmm13,96+112(%rsp) | ||
| 155 | movaps %xmm14,96+128(%rsp) | ||
| 156 | movaps %xmm15,96+144(%rsp) | ||
| 157 | .Lprologue_ssse3: | ||
| 158 | ___ | ||
| 159 | $code.=<<___; | ||
| 160 | mov $in0,%r12 # reassign arguments | ||
| 161 | mov $out,%r13 | ||
| 162 | mov $len,%r14 | ||
| 163 | mov $key,%r15 | ||
| 164 | movdqu ($ivp),$iv # load IV | ||
| 165 | mov $ivp,88(%rsp) # save $ivp | ||
| 166 | ___ | ||
| 167 | my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments | ||
| 168 | my $rounds="${ivp}d"; | ||
| 169 | $code.=<<___; | ||
| 170 | shl \$6,$len | ||
| 171 | sub $in0,$out | ||
| 172 | mov 240($key),$rounds | ||
| 173 | add $inp,$len # end of input | ||
| 174 | |||
| 175 | lea K_XX_XX(%rip),$K_XX_XX | ||
| 176 | mov 0($ctx),$A # load context | ||
| 177 | mov 4($ctx),$B | ||
| 178 | mov 8($ctx),$C | ||
| 179 | mov 12($ctx),$D | ||
| 180 | mov $B,@T[0] # magic seed | ||
| 181 | mov 16($ctx),$E | ||
| 182 | |||
| 183 | movdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
| 184 | movdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
| 185 | movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
| 186 | movdqu 16($inp),@X[-3&7] | ||
| 187 | movdqu 32($inp),@X[-2&7] | ||
| 188 | movdqu 48($inp),@X[-1&7] | ||
| 189 | pshufb @X[2],@X[-4&7] # byte swap | ||
| 190 | add \$64,$inp | ||
| 191 | pshufb @X[2],@X[-3&7] | ||
| 192 | pshufb @X[2],@X[-2&7] | ||
| 193 | pshufb @X[2],@X[-1&7] | ||
| 194 | paddd @Tx[1],@X[-4&7] # add K_00_19 | ||
| 195 | paddd @Tx[1],@X[-3&7] | ||
| 196 | paddd @Tx[1],@X[-2&7] | ||
| 197 | movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU | ||
| 198 | psubd @Tx[1],@X[-4&7] # restore X[] | ||
| 199 | movdqa @X[-3&7],16(%rsp) | ||
| 200 | psubd @Tx[1],@X[-3&7] | ||
| 201 | movdqa @X[-2&7],32(%rsp) | ||
| 202 | psubd @Tx[1],@X[-2&7] | ||
| 203 | movups ($key),$rndkey0 # $key[0] | ||
| 204 | movups 16($key),$rndkey[0] # forward reference | ||
| 205 | jmp .Loop_ssse3 | ||
| 206 | ___ | ||
| 207 | |||
| 208 | my $aesenc=sub { | ||
| 209 | use integer; | ||
| 210 | my ($n,$k)=($r/10,$r%10); | ||
| 211 | if ($k==0) { | ||
| 212 | $code.=<<___; | ||
| 213 | movups `16*$n`($in0),$in # load input | ||
| 214 | xorps $rndkey0,$in | ||
| 215 | ___ | ||
| 216 | $code.=<<___ if ($n); | ||
| 217 | movups $iv,`16*($n-1)`($out,$in0) # write output | ||
| 218 | ___ | ||
| 219 | $code.=<<___; | ||
| 220 | xorps $in,$iv | ||
| 221 | aesenc $rndkey[0],$iv | ||
| 222 | movups `32+16*$k`($key),$rndkey[1] | ||
| 223 | ___ | ||
| 224 | } elsif ($k==9) { | ||
| 225 | $sn++; | ||
| 226 | $code.=<<___; | ||
| 227 | cmp \$11,$rounds | ||
| 228 | jb .Laesenclast$sn | ||
| 229 | movups `32+16*($k+0)`($key),$rndkey[1] | ||
| 230 | aesenc $rndkey[0],$iv | ||
| 231 | movups `32+16*($k+1)`($key),$rndkey[0] | ||
| 232 | aesenc $rndkey[1],$iv | ||
| 233 | je .Laesenclast$sn | ||
| 234 | movups `32+16*($k+2)`($key),$rndkey[1] | ||
| 235 | aesenc $rndkey[0],$iv | ||
| 236 | movups `32+16*($k+3)`($key),$rndkey[0] | ||
| 237 | aesenc $rndkey[1],$iv | ||
| 238 | .Laesenclast$sn: | ||
| 239 | aesenclast $rndkey[0],$iv | ||
| 240 | movups 16($key),$rndkey[1] # forward reference | ||
| 241 | ___ | ||
| 242 | } else { | ||
| 243 | $code.=<<___; | ||
| 244 | aesenc $rndkey[0],$iv | ||
| 245 | movups `32+16*$k`($key),$rndkey[1] | ||
| 246 | ___ | ||
| 247 | } | ||
| 248 | $r++; unshift(@rndkey,pop(@rndkey)); | ||
| 249 | }; | ||
| 250 | |||
| 251 | sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 | ||
| 252 | { use integer; | ||
| 253 | my $body = shift; | ||
| 254 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
| 255 | my ($a,$b,$c,$d,$e); | ||
| 256 | |||
| 257 | &movdqa (@X[0],@X[-3&7]); | ||
| 258 | eval(shift(@insns)); | ||
| 259 | eval(shift(@insns)); | ||
| 260 | &movdqa (@Tx[0],@X[-1&7]); | ||
| 261 | &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
| 262 | eval(shift(@insns)); | ||
| 263 | eval(shift(@insns)); | ||
| 264 | |||
| 265 | &paddd (@Tx[1],@X[-1&7]); | ||
| 266 | eval(shift(@insns)); | ||
| 267 | eval(shift(@insns)); | ||
| 268 | &psrldq (@Tx[0],4); # "X[-3]", 3 dwords | ||
| 269 | eval(shift(@insns)); | ||
| 270 | eval(shift(@insns)); | ||
| 271 | &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
| 272 | eval(shift(@insns)); | ||
| 273 | eval(shift(@insns)); | ||
| 274 | |||
| 275 | &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
| 276 | eval(shift(@insns)); | ||
| 277 | eval(shift(@insns)); | ||
| 278 | eval(shift(@insns)); | ||
| 279 | eval(shift(@insns)); | ||
| 280 | |||
| 281 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
| 282 | eval(shift(@insns)); | ||
| 283 | eval(shift(@insns)); | ||
| 284 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 285 | eval(shift(@insns)); | ||
| 286 | eval(shift(@insns)); | ||
| 287 | |||
| 288 | &movdqa (@Tx[2],@X[0]); | ||
| 289 | &movdqa (@Tx[0],@X[0]); | ||
| 290 | eval(shift(@insns)); | ||
| 291 | eval(shift(@insns)); | ||
| 292 | eval(shift(@insns)); | ||
| 293 | eval(shift(@insns)); | ||
| 294 | |||
| 295 | &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword | ||
| 296 | &paddd (@X[0],@X[0]); | ||
| 297 | eval(shift(@insns)); | ||
| 298 | eval(shift(@insns)); | ||
| 299 | eval(shift(@insns)); | ||
| 300 | eval(shift(@insns)); | ||
| 301 | |||
| 302 | &psrld (@Tx[0],31); | ||
| 303 | eval(shift(@insns)); | ||
| 304 | eval(shift(@insns)); | ||
| 305 | &movdqa (@Tx[1],@Tx[2]); | ||
| 306 | eval(shift(@insns)); | ||
| 307 | eval(shift(@insns)); | ||
| 308 | |||
| 309 | &psrld (@Tx[2],30); | ||
| 310 | &por (@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
| 311 | eval(shift(@insns)); | ||
| 312 | eval(shift(@insns)); | ||
| 313 | eval(shift(@insns)); | ||
| 314 | eval(shift(@insns)); | ||
| 315 | |||
| 316 | &pslld (@Tx[1],2); | ||
| 317 | &pxor (@X[0],@Tx[2]); | ||
| 318 | eval(shift(@insns)); | ||
| 319 | eval(shift(@insns)); | ||
| 320 | &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
| 321 | eval(shift(@insns)); | ||
| 322 | eval(shift(@insns)); | ||
| 323 | |||
| 324 | &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 | ||
| 325 | |||
| 326 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
| 327 | |||
| 328 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 329 | push(@Tx,shift(@Tx)); | ||
| 330 | } | ||
| 331 | |||
| 332 | sub Xupdate_ssse3_32_79() | ||
| 333 | { use integer; | ||
| 334 | my $body = shift; | ||
| 335 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
| 336 | my ($a,$b,$c,$d,$e); | ||
| 337 | |||
| 338 | &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); | ||
| 339 | eval(shift(@insns)); # body_20_39 | ||
| 340 | &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
| 341 | &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" | ||
| 342 | eval(shift(@insns)); | ||
| 343 | eval(shift(@insns)); | ||
| 344 | eval(shift(@insns)); # rol | ||
| 345 | |||
| 346 | &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
| 347 | eval(shift(@insns)); | ||
| 348 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
| 349 | if ($Xi%5) { | ||
| 350 | &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
| 351 | } else { # ... or load next one | ||
| 352 | &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
| 353 | } | ||
| 354 | &paddd (@Tx[1],@X[-1&7]); | ||
| 355 | eval(shift(@insns)); # ror | ||
| 356 | eval(shift(@insns)); | ||
| 357 | |||
| 358 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
| 359 | eval(shift(@insns)); # body_20_39 | ||
| 360 | eval(shift(@insns)); | ||
| 361 | eval(shift(@insns)); | ||
| 362 | eval(shift(@insns)); # rol | ||
| 363 | |||
| 364 | &movdqa (@Tx[0],@X[0]); | ||
| 365 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 366 | eval(shift(@insns)); | ||
| 367 | eval(shift(@insns)); | ||
| 368 | eval(shift(@insns)); # ror | ||
| 369 | eval(shift(@insns)); | ||
| 370 | |||
| 371 | &pslld (@X[0],2); | ||
| 372 | eval(shift(@insns)); # body_20_39 | ||
| 373 | eval(shift(@insns)); | ||
| 374 | &psrld (@Tx[0],30); | ||
| 375 | eval(shift(@insns)); | ||
| 376 | eval(shift(@insns)); # rol | ||
| 377 | eval(shift(@insns)); | ||
| 378 | eval(shift(@insns)); | ||
| 379 | eval(shift(@insns)); # ror | ||
| 380 | eval(shift(@insns)); | ||
| 381 | |||
| 382 | &por (@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
| 383 | eval(shift(@insns)); # body_20_39 | ||
| 384 | eval(shift(@insns)); | ||
| 385 | &movdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
| 386 | eval(shift(@insns)); | ||
| 387 | eval(shift(@insns)); # rol | ||
| 388 | eval(shift(@insns)); | ||
| 389 | eval(shift(@insns)); | ||
| 390 | eval(shift(@insns)); # rol | ||
| 391 | eval(shift(@insns)); | ||
| 392 | |||
| 393 | foreach (@insns) { eval; } # remaining instructions | ||
| 394 | |||
| 395 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 396 | push(@Tx,shift(@Tx)); | ||
| 397 | } | ||
| 398 | |||
| 399 | sub Xuplast_ssse3_80() | ||
| 400 | { use integer; | ||
| 401 | my $body = shift; | ||
| 402 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 403 | my ($a,$b,$c,$d,$e); | ||
| 404 | |||
| 405 | eval(shift(@insns)); | ||
| 406 | &paddd (@Tx[1],@X[-1&7]); | ||
| 407 | eval(shift(@insns)); | ||
| 408 | eval(shift(@insns)); | ||
| 409 | eval(shift(@insns)); | ||
| 410 | eval(shift(@insns)); | ||
| 411 | |||
| 412 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
| 413 | |||
| 414 | foreach (@insns) { eval; } # remaining instructions | ||
| 415 | |||
| 416 | &cmp ($inp,$len); | ||
| 417 | &je (".Ldone_ssse3"); | ||
| 418 | |||
| 419 | unshift(@Tx,pop(@Tx)); | ||
| 420 | |||
| 421 | &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask | ||
| 422 | &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
| 423 | &movdqu (@X[-4&7],"0($inp)"); # load input | ||
| 424 | &movdqu (@X[-3&7],"16($inp)"); | ||
| 425 | &movdqu (@X[-2&7],"32($inp)"); | ||
| 426 | &movdqu (@X[-1&7],"48($inp)"); | ||
| 427 | &pshufb (@X[-4&7],@X[2]); # byte swap | ||
| 428 | &add ($inp,64); | ||
| 429 | |||
| 430 | $Xi=0; | ||
| 431 | } | ||
| 432 | |||
| 433 | sub Xloop_ssse3() | ||
| 434 | { use integer; | ||
| 435 | my $body = shift; | ||
| 436 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 437 | my ($a,$b,$c,$d,$e); | ||
| 438 | |||
| 439 | eval(shift(@insns)); | ||
| 440 | eval(shift(@insns)); | ||
| 441 | &pshufb (@X[($Xi-3)&7],@X[2]); | ||
| 442 | eval(shift(@insns)); | ||
| 443 | eval(shift(@insns)); | ||
| 444 | &paddd (@X[($Xi-4)&7],@Tx[1]); | ||
| 445 | eval(shift(@insns)); | ||
| 446 | eval(shift(@insns)); | ||
| 447 | eval(shift(@insns)); | ||
| 448 | eval(shift(@insns)); | ||
| 449 | &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU | ||
| 450 | eval(shift(@insns)); | ||
| 451 | eval(shift(@insns)); | ||
| 452 | &psubd (@X[($Xi-4)&7],@Tx[1]); | ||
| 453 | |||
| 454 | foreach (@insns) { eval; } | ||
| 455 | $Xi++; | ||
| 456 | } | ||
| 457 | |||
| 458 | sub Xtail_ssse3() | ||
| 459 | { use integer; | ||
| 460 | my $body = shift; | ||
| 461 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 462 | my ($a,$b,$c,$d,$e); | ||
| 463 | |||
| 464 | foreach (@insns) { eval; } | ||
| 465 | } | ||
| 466 | |||
| 467 | sub body_00_19 () { | ||
| 468 | use integer; | ||
| 469 | my ($k,$n); | ||
| 470 | my @r=( | ||
| 471 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 472 | '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer | ||
| 473 | '&xor ($c,$d);', | ||
| 474 | '&mov (@T[1],$a);', # $b in next round | ||
| 475 | '&$_rol ($a,5);', | ||
| 476 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
| 477 | '&xor ($c,$d);', # restore $c | ||
| 478 | '&xor (@T[0],$d);', | ||
| 479 | '&add ($e,$a);', | ||
| 480 | '&$_ror ($b,$j?7:2);', # $b>>>2 | ||
| 481 | '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 482 | ); | ||
| 483 | $n = scalar(@r); | ||
| 484 | $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds | ||
| 485 | @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); | ||
| 486 | $jj++; | ||
| 487 | return @r; | ||
| 488 | } | ||
| 489 | |||
| 490 | sub body_20_39 () { | ||
| 491 | use integer; | ||
| 492 | my ($k,$n); | ||
| 493 | my @r=( | ||
| 494 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 495 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
| 496 | '&xor (@T[0],$d);', # ($b^$d) | ||
| 497 | '&mov (@T[1],$a);', # $b in next round | ||
| 498 | '&$_rol ($a,5);', | ||
| 499 | '&xor (@T[0],$c);', # ($b^$d^$c) | ||
| 500 | '&add ($e,$a);', | ||
| 501 | '&$_ror ($b,7);', # $b>>>2 | ||
| 502 | '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 503 | ); | ||
| 504 | $n = scalar(@r); | ||
| 505 | $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds | ||
| 506 | @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); | ||
| 507 | $jj++; | ||
| 508 | return @r; | ||
| 509 | } | ||
| 510 | |||
| 511 | sub body_40_59 () { | ||
| 512 | use integer; | ||
| 513 | my ($k,$n); | ||
| 514 | my @r=( | ||
| 515 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 516 | '&mov (@T[1],$c);', | ||
| 517 | '&xor ($c,$d);', | ||
| 518 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
| 519 | '&and (@T[1],$d);', | ||
| 520 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
| 521 | '&$_ror ($b,7);', # $b>>>2 | ||
| 522 | '&add ($e,@T[1]);', | ||
| 523 | '&mov (@T[1],$a);', # $b in next round | ||
| 524 | '&$_rol ($a,5);', | ||
| 525 | '&add ($e,@T[0]);', | ||
| 526 | '&xor ($c,$d);', # restore $c | ||
| 527 | '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 528 | ); | ||
| 529 | $n = scalar(@r); | ||
| 530 | $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds | ||
| 531 | @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); | ||
| 532 | $jj++; | ||
| 533 | return @r; | ||
| 534 | } | ||
| 535 | $code.=<<___; | ||
| 536 | .align 16 | ||
| 537 | .Loop_ssse3: | ||
| 538 | ___ | ||
| 539 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 540 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 541 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 542 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 543 | &Xupdate_ssse3_32_79(\&body_00_19); | ||
| 544 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 545 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 546 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 547 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 548 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 549 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 550 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 551 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 552 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 553 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 554 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 555 | &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" | ||
| 556 | |||
| 557 | $saved_j=$j; @saved_V=@V; | ||
| 558 | $saved_r=$r; @saved_rndkey=@rndkey; | ||
| 559 | |||
| 560 | &Xloop_ssse3(\&body_20_39); | ||
| 561 | &Xloop_ssse3(\&body_20_39); | ||
| 562 | &Xloop_ssse3(\&body_20_39); | ||
| 563 | |||
| 564 | $code.=<<___; | ||
| 565 | movups $iv,48($out,$in0) # write output | ||
| 566 | lea 64($in0),$in0 | ||
| 567 | |||
| 568 | add 0($ctx),$A # update context | ||
| 569 | add 4($ctx),@T[0] | ||
| 570 | add 8($ctx),$C | ||
| 571 | add 12($ctx),$D | ||
| 572 | mov $A,0($ctx) | ||
| 573 | add 16($ctx),$E | ||
| 574 | mov @T[0],4($ctx) | ||
| 575 | mov @T[0],$B # magic seed | ||
| 576 | mov $C,8($ctx) | ||
| 577 | mov $D,12($ctx) | ||
| 578 | mov $E,16($ctx) | ||
| 579 | jmp .Loop_ssse3 | ||
| 580 | |||
| 581 | .align 16 | ||
| 582 | .Ldone_ssse3: | ||
| 583 | ___ | ||
| 584 | $jj=$j=$saved_j; @V=@saved_V; | ||
| 585 | $r=$saved_r; @rndkey=@saved_rndkey; | ||
| 586 | |||
| 587 | &Xtail_ssse3(\&body_20_39); | ||
| 588 | &Xtail_ssse3(\&body_20_39); | ||
| 589 | &Xtail_ssse3(\&body_20_39); | ||
| 590 | |||
| 591 | $code.=<<___; | ||
| 592 | movups $iv,48($out,$in0) # write output | ||
| 593 | mov 88(%rsp),$ivp # restore $ivp | ||
| 594 | |||
| 595 | add 0($ctx),$A # update context | ||
| 596 | add 4($ctx),@T[0] | ||
| 597 | add 8($ctx),$C | ||
| 598 | mov $A,0($ctx) | ||
| 599 | add 12($ctx),$D | ||
| 600 | mov @T[0],4($ctx) | ||
| 601 | add 16($ctx),$E | ||
| 602 | mov $C,8($ctx) | ||
| 603 | mov $D,12($ctx) | ||
| 604 | mov $E,16($ctx) | ||
| 605 | movups $iv,($ivp) # write IV | ||
| 606 | ___ | ||
| 607 | $code.=<<___ if ($win64); | ||
| 608 | movaps 96+0(%rsp),%xmm6 | ||
| 609 | movaps 96+16(%rsp),%xmm7 | ||
| 610 | movaps 96+32(%rsp),%xmm8 | ||
| 611 | movaps 96+48(%rsp),%xmm9 | ||
| 612 | movaps 96+64(%rsp),%xmm10 | ||
| 613 | movaps 96+80(%rsp),%xmm11 | ||
| 614 | movaps 96+96(%rsp),%xmm12 | ||
| 615 | movaps 96+112(%rsp),%xmm13 | ||
| 616 | movaps 96+128(%rsp),%xmm14 | ||
| 617 | movaps 96+144(%rsp),%xmm15 | ||
| 618 | ___ | ||
| 619 | $code.=<<___; | ||
| 620 | lea `104+($win64?10*16:0)`(%rsp),%rsi | ||
| 621 | mov 0(%rsi),%r15 | ||
| 622 | mov 8(%rsi),%r14 | ||
| 623 | mov 16(%rsi),%r13 | ||
| 624 | mov 24(%rsi),%r12 | ||
| 625 | mov 32(%rsi),%rbp | ||
| 626 | mov 40(%rsi),%rbx | ||
| 627 | lea 48(%rsi),%rsp | ||
| 628 | .Lepilogue_ssse3: | ||
| 629 | ret | ||
| 630 | .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 | ||
| 631 | ___ | ||
| 632 | |||
| 633 | $j=$jj=$r=$sn=0; | ||
| 634 | |||
| 635 | if ($avx) { | ||
| 636 | my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); | ||
| 637 | |||
| 638 | my $Xi=4; | ||
| 639 | my @X=map("%xmm$_",(4..7,0..3)); | ||
| 640 | my @Tx=map("%xmm$_",(8..10)); | ||
| 641 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
| 642 | my @T=("%esi","%edi"); | ||
| 643 | |||
| 644 | my $_rol=sub { &shld(@_[0],@_) }; | ||
| 645 | my $_ror=sub { &shrd(@_[0],@_) }; | ||
| 646 | |||
| 647 | $code.=<<___; | ||
| 648 | .type aesni_cbc_sha1_enc_avx,\@function,6 | ||
| 649 | .align 16 | ||
| 650 | aesni_cbc_sha1_enc_avx: | ||
| 651 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument | ||
| 652 | #shr \$6,$len # debugging artefact | ||
| 653 | #jz .Lepilogue_avx # debugging artefact | ||
| 654 | push %rbx | ||
| 655 | push %rbp | ||
| 656 | push %r12 | ||
| 657 | push %r13 | ||
| 658 | push %r14 | ||
| 659 | push %r15 | ||
| 660 | lea `-104-($win64?10*16:0)`(%rsp),%rsp | ||
| 661 | #mov $in0,$inp # debugging artefact | ||
| 662 | #lea 64(%rsp),$ctx # debugging artefact | ||
| 663 | ___ | ||
| 664 | $code.=<<___ if ($win64); | ||
| 665 | movaps %xmm6,96+0(%rsp) | ||
| 666 | movaps %xmm7,96+16(%rsp) | ||
| 667 | movaps %xmm8,96+32(%rsp) | ||
| 668 | movaps %xmm9,96+48(%rsp) | ||
| 669 | movaps %xmm10,96+64(%rsp) | ||
| 670 | movaps %xmm11,96+80(%rsp) | ||
| 671 | movaps %xmm12,96+96(%rsp) | ||
| 672 | movaps %xmm13,96+112(%rsp) | ||
| 673 | movaps %xmm14,96+128(%rsp) | ||
| 674 | movaps %xmm15,96+144(%rsp) | ||
| 675 | .Lprologue_avx: | ||
| 676 | ___ | ||
| 677 | $code.=<<___; | ||
| 678 | vzeroall | ||
| 679 | mov $in0,%r12 # reassign arguments | ||
| 680 | mov $out,%r13 | ||
| 681 | mov $len,%r14 | ||
| 682 | mov $key,%r15 | ||
| 683 | vmovdqu ($ivp),$iv # load IV | ||
| 684 | mov $ivp,88(%rsp) # save $ivp | ||
| 685 | ___ | ||
| 686 | my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments | ||
| 687 | my $rounds="${ivp}d"; | ||
| 688 | $code.=<<___; | ||
| 689 | shl \$6,$len | ||
| 690 | sub $in0,$out | ||
| 691 | mov 240($key),$rounds | ||
| 692 | add \$112,$key # size optimization | ||
| 693 | add $inp,$len # end of input | ||
| 694 | |||
| 695 | lea K_XX_XX(%rip),$K_XX_XX | ||
| 696 | mov 0($ctx),$A # load context | ||
| 697 | mov 4($ctx),$B | ||
| 698 | mov 8($ctx),$C | ||
| 699 | mov 12($ctx),$D | ||
| 700 | mov $B,@T[0] # magic seed | ||
| 701 | mov 16($ctx),$E | ||
| 702 | |||
| 703 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
| 704 | vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
| 705 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
| 706 | vmovdqu 16($inp),@X[-3&7] | ||
| 707 | vmovdqu 32($inp),@X[-2&7] | ||
| 708 | vmovdqu 48($inp),@X[-1&7] | ||
| 709 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap | ||
| 710 | add \$64,$inp | ||
| 711 | vpshufb @X[2],@X[-3&7],@X[-3&7] | ||
| 712 | vpshufb @X[2],@X[-2&7],@X[-2&7] | ||
| 713 | vpshufb @X[2],@X[-1&7],@X[-1&7] | ||
| 714 | vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 | ||
| 715 | vpaddd @Tx[1],@X[-3&7],@X[1] | ||
| 716 | vpaddd @Tx[1],@X[-2&7],@X[2] | ||
| 717 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU | ||
| 718 | vmovdqa @X[1],16(%rsp) | ||
| 719 | vmovdqa @X[2],32(%rsp) | ||
| 720 | vmovups -112($key),$rndkey0 # $key[0] | ||
| 721 | vmovups 16-112($key),$rndkey[0] # forward reference | ||
| 722 | jmp .Loop_avx | ||
| 723 | ___ | ||
| 724 | |||
| 725 | my $aesenc=sub { | ||
| 726 | use integer; | ||
| 727 | my ($n,$k)=($r/10,$r%10); | ||
| 728 | if ($k==0) { | ||
| 729 | $code.=<<___; | ||
| 730 | vmovups `16*$n`($in0),$in # load input | ||
| 731 | vxorps $rndkey0,$in,$in | ||
| 732 | ___ | ||
| 733 | $code.=<<___ if ($n); | ||
| 734 | vmovups $iv,`16*($n-1)`($out,$in0) # write output | ||
| 735 | ___ | ||
| 736 | $code.=<<___; | ||
| 737 | vxorps $in,$iv,$iv | ||
| 738 | vaesenc $rndkey[0],$iv,$iv | ||
| 739 | vmovups `32+16*$k-112`($key),$rndkey[1] | ||
| 740 | ___ | ||
| 741 | } elsif ($k==9) { | ||
| 742 | $sn++; | ||
| 743 | $code.=<<___; | ||
| 744 | cmp \$11,$rounds | ||
| 745 | jb .Lvaesenclast$sn | ||
| 746 | vaesenc $rndkey[0],$iv,$iv | ||
| 747 | vmovups `32+16*($k+0)-112`($key),$rndkey[1] | ||
| 748 | vaesenc $rndkey[1],$iv,$iv | ||
| 749 | vmovups `32+16*($k+1)-112`($key),$rndkey[0] | ||
| 750 | je .Lvaesenclast$sn | ||
| 751 | vaesenc $rndkey[0],$iv,$iv | ||
| 752 | vmovups `32+16*($k+2)-112`($key),$rndkey[1] | ||
| 753 | vaesenc $rndkey[1],$iv,$iv | ||
| 754 | vmovups `32+16*($k+3)-112`($key),$rndkey[0] | ||
| 755 | .Lvaesenclast$sn: | ||
| 756 | vaesenclast $rndkey[0],$iv,$iv | ||
| 757 | vmovups 16-112($key),$rndkey[1] # forward reference | ||
| 758 | ___ | ||
| 759 | } else { | ||
| 760 | $code.=<<___; | ||
| 761 | vaesenc $rndkey[0],$iv,$iv | ||
| 762 | vmovups `32+16*$k-112`($key),$rndkey[1] | ||
| 763 | ___ | ||
| 764 | } | ||
| 765 | $r++; unshift(@rndkey,pop(@rndkey)); | ||
| 766 | }; | ||
| 767 | |||
| 768 | sub Xupdate_avx_16_31() # recall that $Xi starts with 4 | ||
| 769 | { use integer; | ||
| 770 | my $body = shift; | ||
| 771 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
| 772 | my ($a,$b,$c,$d,$e); | ||
| 773 | |||
| 774 | eval(shift(@insns)); | ||
| 775 | eval(shift(@insns)); | ||
| 776 | &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
| 777 | eval(shift(@insns)); | ||
| 778 | eval(shift(@insns)); | ||
| 779 | |||
| 780 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 781 | eval(shift(@insns)); | ||
| 782 | eval(shift(@insns)); | ||
| 783 | &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords | ||
| 784 | eval(shift(@insns)); | ||
| 785 | eval(shift(@insns)); | ||
| 786 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
| 787 | eval(shift(@insns)); | ||
| 788 | eval(shift(@insns)); | ||
| 789 | |||
| 790 | &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
| 791 | eval(shift(@insns)); | ||
| 792 | eval(shift(@insns)); | ||
| 793 | eval(shift(@insns)); | ||
| 794 | eval(shift(@insns)); | ||
| 795 | |||
| 796 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
| 797 | eval(shift(@insns)); | ||
| 798 | eval(shift(@insns)); | ||
| 799 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 800 | eval(shift(@insns)); | ||
| 801 | eval(shift(@insns)); | ||
| 802 | |||
| 803 | &vpsrld (@Tx[0],@X[0],31); | ||
| 804 | eval(shift(@insns)); | ||
| 805 | eval(shift(@insns)); | ||
| 806 | eval(shift(@insns)); | ||
| 807 | eval(shift(@insns)); | ||
| 808 | |||
| 809 | &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword | ||
| 810 | &vpaddd (@X[0],@X[0],@X[0]); | ||
| 811 | eval(shift(@insns)); | ||
| 812 | eval(shift(@insns)); | ||
| 813 | eval(shift(@insns)); | ||
| 814 | eval(shift(@insns)); | ||
| 815 | |||
| 816 | &vpsrld (@Tx[1],@Tx[2],30); | ||
| 817 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
| 818 | eval(shift(@insns)); | ||
| 819 | eval(shift(@insns)); | ||
| 820 | eval(shift(@insns)); | ||
| 821 | eval(shift(@insns)); | ||
| 822 | |||
| 823 | &vpslld (@Tx[2],@Tx[2],2); | ||
| 824 | &vpxor (@X[0],@X[0],@Tx[1]); | ||
| 825 | eval(shift(@insns)); | ||
| 826 | eval(shift(@insns)); | ||
| 827 | eval(shift(@insns)); | ||
| 828 | eval(shift(@insns)); | ||
| 829 | |||
| 830 | &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 | ||
| 831 | eval(shift(@insns)); | ||
| 832 | eval(shift(@insns)); | ||
| 833 | &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
| 834 | eval(shift(@insns)); | ||
| 835 | eval(shift(@insns)); | ||
| 836 | |||
| 837 | |||
| 838 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
| 839 | |||
| 840 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 841 | push(@Tx,shift(@Tx)); | ||
| 842 | } | ||
| 843 | |||
| 844 | sub Xupdate_avx_32_79() | ||
| 845 | { use integer; | ||
| 846 | my $body = shift; | ||
| 847 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
| 848 | my ($a,$b,$c,$d,$e); | ||
| 849 | |||
| 850 | &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" | ||
| 851 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
| 852 | eval(shift(@insns)); # body_20_39 | ||
| 853 | eval(shift(@insns)); | ||
| 854 | eval(shift(@insns)); | ||
| 855 | eval(shift(@insns)); # rol | ||
| 856 | |||
| 857 | &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
| 858 | eval(shift(@insns)); | ||
| 859 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
| 860 | if ($Xi%5) { | ||
| 861 | &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
| 862 | } else { # ... or load next one | ||
| 863 | &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
| 864 | } | ||
| 865 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 866 | eval(shift(@insns)); # ror | ||
| 867 | eval(shift(@insns)); | ||
| 868 | |||
| 869 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
| 870 | eval(shift(@insns)); # body_20_39 | ||
| 871 | eval(shift(@insns)); | ||
| 872 | eval(shift(@insns)); | ||
| 873 | eval(shift(@insns)); # rol | ||
| 874 | |||
| 875 | &vpsrld (@Tx[0],@X[0],30); | ||
| 876 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 877 | eval(shift(@insns)); | ||
| 878 | eval(shift(@insns)); | ||
| 879 | eval(shift(@insns)); # ror | ||
| 880 | eval(shift(@insns)); | ||
| 881 | |||
| 882 | &vpslld (@X[0],@X[0],2); | ||
| 883 | eval(shift(@insns)); # body_20_39 | ||
| 884 | eval(shift(@insns)); | ||
| 885 | eval(shift(@insns)); | ||
| 886 | eval(shift(@insns)); # rol | ||
| 887 | eval(shift(@insns)); | ||
| 888 | eval(shift(@insns)); | ||
| 889 | eval(shift(@insns)); # ror | ||
| 890 | eval(shift(@insns)); | ||
| 891 | |||
| 892 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
| 893 | eval(shift(@insns)); # body_20_39 | ||
| 894 | eval(shift(@insns)); | ||
| 895 | &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
| 896 | eval(shift(@insns)); | ||
| 897 | eval(shift(@insns)); # rol | ||
| 898 | eval(shift(@insns)); | ||
| 899 | eval(shift(@insns)); | ||
| 900 | eval(shift(@insns)); # rol | ||
| 901 | eval(shift(@insns)); | ||
| 902 | |||
| 903 | foreach (@insns) { eval; } # remaining instructions | ||
| 904 | |||
| 905 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 906 | push(@Tx,shift(@Tx)); | ||
| 907 | } | ||
| 908 | |||
| 909 | sub Xuplast_avx_80() | ||
| 910 | { use integer; | ||
| 911 | my $body = shift; | ||
| 912 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 913 | my ($a,$b,$c,$d,$e); | ||
| 914 | |||
| 915 | eval(shift(@insns)); | ||
| 916 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 917 | eval(shift(@insns)); | ||
| 918 | eval(shift(@insns)); | ||
| 919 | eval(shift(@insns)); | ||
| 920 | eval(shift(@insns)); | ||
| 921 | |||
| 922 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
| 923 | |||
| 924 | foreach (@insns) { eval; } # remaining instructions | ||
| 925 | |||
| 926 | &cmp ($inp,$len); | ||
| 927 | &je (".Ldone_avx"); | ||
| 928 | |||
| 929 | unshift(@Tx,pop(@Tx)); | ||
| 930 | |||
| 931 | &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask | ||
| 932 | &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
| 933 | &vmovdqu(@X[-4&7],"0($inp)"); # load input | ||
| 934 | &vmovdqu(@X[-3&7],"16($inp)"); | ||
| 935 | &vmovdqu(@X[-2&7],"32($inp)"); | ||
| 936 | &vmovdqu(@X[-1&7],"48($inp)"); | ||
| 937 | &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap | ||
| 938 | &add ($inp,64); | ||
| 939 | |||
| 940 | $Xi=0; | ||
| 941 | } | ||
| 942 | |||
| 943 | sub Xloop_avx() | ||
| 944 | { use integer; | ||
| 945 | my $body = shift; | ||
| 946 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 947 | my ($a,$b,$c,$d,$e); | ||
| 948 | |||
| 949 | eval(shift(@insns)); | ||
| 950 | eval(shift(@insns)); | ||
| 951 | &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); | ||
| 952 | eval(shift(@insns)); | ||
| 953 | eval(shift(@insns)); | ||
| 954 | &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); | ||
| 955 | eval(shift(@insns)); | ||
| 956 | eval(shift(@insns)); | ||
| 957 | eval(shift(@insns)); | ||
| 958 | eval(shift(@insns)); | ||
| 959 | &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU | ||
| 960 | eval(shift(@insns)); | ||
| 961 | eval(shift(@insns)); | ||
| 962 | |||
| 963 | foreach (@insns) { eval; } | ||
| 964 | $Xi++; | ||
| 965 | } | ||
| 966 | |||
| 967 | sub Xtail_avx() | ||
| 968 | { use integer; | ||
| 969 | my $body = shift; | ||
| 970 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 971 | my ($a,$b,$c,$d,$e); | ||
| 972 | |||
| 973 | foreach (@insns) { eval; } | ||
| 974 | } | ||
| 975 | |||
| 976 | $code.=<<___; | ||
| 977 | .align 16 | ||
| 978 | .Loop_avx: | ||
| 979 | ___ | ||
| 980 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 981 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 982 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 983 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 984 | &Xupdate_avx_32_79(\&body_00_19); | ||
| 985 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 986 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 987 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 988 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 989 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 990 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 991 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 992 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 993 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 994 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 995 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 996 | &Xuplast_avx_80(\&body_20_39); # can jump to "done" | ||
| 997 | |||
| 998 | $saved_j=$j; @saved_V=@V; | ||
| 999 | $saved_r=$r; @saved_rndkey=@rndkey; | ||
| 1000 | |||
| 1001 | &Xloop_avx(\&body_20_39); | ||
| 1002 | &Xloop_avx(\&body_20_39); | ||
| 1003 | &Xloop_avx(\&body_20_39); | ||
| 1004 | |||
| 1005 | $code.=<<___; | ||
| 1006 | vmovups $iv,48($out,$in0) # write output | ||
| 1007 | lea 64($in0),$in0 | ||
| 1008 | |||
| 1009 | add 0($ctx),$A # update context | ||
| 1010 | add 4($ctx),@T[0] | ||
| 1011 | add 8($ctx),$C | ||
| 1012 | add 12($ctx),$D | ||
| 1013 | mov $A,0($ctx) | ||
| 1014 | add 16($ctx),$E | ||
| 1015 | mov @T[0],4($ctx) | ||
| 1016 | mov @T[0],$B # magic seed | ||
| 1017 | mov $C,8($ctx) | ||
| 1018 | mov $D,12($ctx) | ||
| 1019 | mov $E,16($ctx) | ||
| 1020 | jmp .Loop_avx | ||
| 1021 | |||
| 1022 | .align 16 | ||
| 1023 | .Ldone_avx: | ||
| 1024 | ___ | ||
| 1025 | $jj=$j=$saved_j; @V=@saved_V; | ||
| 1026 | $r=$saved_r; @rndkey=@saved_rndkey; | ||
| 1027 | |||
| 1028 | &Xtail_avx(\&body_20_39); | ||
| 1029 | &Xtail_avx(\&body_20_39); | ||
| 1030 | &Xtail_avx(\&body_20_39); | ||
| 1031 | |||
| 1032 | $code.=<<___; | ||
| 1033 | vmovups $iv,48($out,$in0) # write output | ||
| 1034 | mov 88(%rsp),$ivp # restore $ivp | ||
| 1035 | |||
| 1036 | add 0($ctx),$A # update context | ||
| 1037 | add 4($ctx),@T[0] | ||
| 1038 | add 8($ctx),$C | ||
| 1039 | mov $A,0($ctx) | ||
| 1040 | add 12($ctx),$D | ||
| 1041 | mov @T[0],4($ctx) | ||
| 1042 | add 16($ctx),$E | ||
| 1043 | mov $C,8($ctx) | ||
| 1044 | mov $D,12($ctx) | ||
| 1045 | mov $E,16($ctx) | ||
| 1046 | vmovups $iv,($ivp) # write IV | ||
| 1047 | vzeroall | ||
| 1048 | ___ | ||
| 1049 | $code.=<<___ if ($win64); | ||
| 1050 | movaps 96+0(%rsp),%xmm6 | ||
| 1051 | movaps 96+16(%rsp),%xmm7 | ||
| 1052 | movaps 96+32(%rsp),%xmm8 | ||
| 1053 | movaps 96+48(%rsp),%xmm9 | ||
| 1054 | movaps 96+64(%rsp),%xmm10 | ||
| 1055 | movaps 96+80(%rsp),%xmm11 | ||
| 1056 | movaps 96+96(%rsp),%xmm12 | ||
| 1057 | movaps 96+112(%rsp),%xmm13 | ||
| 1058 | movaps 96+128(%rsp),%xmm14 | ||
| 1059 | movaps 96+144(%rsp),%xmm15 | ||
| 1060 | ___ | ||
| 1061 | $code.=<<___; | ||
| 1062 | lea `104+($win64?10*16:0)`(%rsp),%rsi | ||
| 1063 | mov 0(%rsi),%r15 | ||
| 1064 | mov 8(%rsi),%r14 | ||
| 1065 | mov 16(%rsi),%r13 | ||
| 1066 | mov 24(%rsi),%r12 | ||
| 1067 | mov 32(%rsi),%rbp | ||
| 1068 | mov 40(%rsi),%rbx | ||
| 1069 | lea 48(%rsi),%rsp | ||
| 1070 | .Lepilogue_avx: | ||
| 1071 | ret | ||
| 1072 | .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx | ||
| 1073 | ___ | ||
| 1074 | } | ||
| 1075 | $code.=<<___; | ||
| 1076 | .align 64 | ||
| 1077 | K_XX_XX: | ||
| 1078 | .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 | ||
| 1079 | .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 | ||
| 1080 | .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 | ||
| 1081 | .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 | ||
| 1082 | .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask | ||
| 1083 | |||
| 1084 | .asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 1085 | .align 64 | ||
| 1086 | ___ | ||
| 1087 | |||
| 1088 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 1089 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 1090 | if ($win64) { | ||
| 1091 | $rec="%rcx"; | ||
| 1092 | $frame="%rdx"; | ||
| 1093 | $context="%r8"; | ||
| 1094 | $disp="%r9"; | ||
| 1095 | |||
| 1096 | $code.=<<___; | ||
| 1097 | .extern __imp_RtlVirtualUnwind | ||
| 1098 | .type ssse3_handler,\@abi-omnipotent | ||
| 1099 | .align 16 | ||
| 1100 | ssse3_handler: | ||
| 1101 | push %rsi | ||
| 1102 | push %rdi | ||
| 1103 | push %rbx | ||
| 1104 | push %rbp | ||
| 1105 | push %r12 | ||
| 1106 | push %r13 | ||
| 1107 | push %r14 | ||
| 1108 | push %r15 | ||
| 1109 | pushfq | ||
| 1110 | sub \$64,%rsp | ||
| 1111 | |||
| 1112 | mov 120($context),%rax # pull context->Rax | ||
| 1113 | mov 248($context),%rbx # pull context->Rip | ||
| 1114 | |||
| 1115 | mov 8($disp),%rsi # disp->ImageBase | ||
| 1116 | mov 56($disp),%r11 # disp->HandlerData | ||
| 1117 | |||
| 1118 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 1119 | lea (%rsi,%r10),%r10 # prologue label | ||
| 1120 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 1121 | jb .Lcommon_seh_tail | ||
| 1122 | |||
| 1123 | mov 152($context),%rax # pull context->Rsp | ||
| 1124 | |||
| 1125 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 1126 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 1127 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 1128 | jae .Lcommon_seh_tail | ||
| 1129 | |||
| 1130 | lea 96(%rax),%rsi | ||
| 1131 | lea 512($context),%rdi # &context.Xmm6 | ||
| 1132 | mov \$20,%ecx | ||
| 1133 | .long 0xa548f3fc # cld; rep movsq | ||
| 1134 | lea `104+10*16`(%rax),%rax # adjust stack pointer | ||
| 1135 | |||
| 1136 | mov 0(%rax),%r15 | ||
| 1137 | mov 8(%rax),%r14 | ||
| 1138 | mov 16(%rax),%r13 | ||
| 1139 | mov 24(%rax),%r12 | ||
| 1140 | mov 32(%rax),%rbp | ||
| 1141 | mov 40(%rax),%rbx | ||
| 1142 | lea 48(%rax),%rax | ||
| 1143 | mov %rbx,144($context) # restore context->Rbx | ||
| 1144 | mov %rbp,160($context) # restore context->Rbp | ||
| 1145 | mov %r12,216($context) # restore context->R12 | ||
| 1146 | mov %r13,224($context) # restore context->R13 | ||
| 1147 | mov %r14,232($context) # restore context->R14 | ||
| 1148 | mov %r15,240($context) # restore context->R15 | ||
| 1149 | |||
| 1150 | .Lcommon_seh_tail: | ||
| 1151 | mov 8(%rax),%rdi | ||
| 1152 | mov 16(%rax),%rsi | ||
| 1153 | mov %rax,152($context) # restore context->Rsp | ||
| 1154 | mov %rsi,168($context) # restore context->Rsi | ||
| 1155 | mov %rdi,176($context) # restore context->Rdi | ||
| 1156 | |||
| 1157 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 1158 | mov $context,%rsi # context | ||
| 1159 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 1160 | .long 0xa548f3fc # cld; rep movsq | ||
| 1161 | |||
| 1162 | mov $disp,%rsi | ||
| 1163 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 1164 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 1165 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 1166 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 1167 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 1168 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 1169 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 1170 | mov %r10,32(%rsp) # arg5 | ||
| 1171 | mov %r11,40(%rsp) # arg6 | ||
| 1172 | mov %r12,48(%rsp) # arg7 | ||
| 1173 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 1174 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 1175 | |||
| 1176 | mov \$1,%eax # ExceptionContinueSearch | ||
| 1177 | add \$64,%rsp | ||
| 1178 | popfq | ||
| 1179 | pop %r15 | ||
| 1180 | pop %r14 | ||
| 1181 | pop %r13 | ||
| 1182 | pop %r12 | ||
| 1183 | pop %rbp | ||
| 1184 | pop %rbx | ||
| 1185 | pop %rdi | ||
| 1186 | pop %rsi | ||
| 1187 | ret | ||
| 1188 | .size ssse3_handler,.-ssse3_handler | ||
| 1189 | |||
| 1190 | .section .pdata | ||
| 1191 | .align 4 | ||
| 1192 | .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3 | ||
| 1193 | .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3 | ||
| 1194 | .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3 | ||
| 1195 | ___ | ||
| 1196 | $code.=<<___ if ($avx); | ||
| 1197 | .rva .LSEH_begin_aesni_cbc_sha1_enc_avx | ||
| 1198 | .rva .LSEH_end_aesni_cbc_sha1_enc_avx | ||
| 1199 | .rva .LSEH_info_aesni_cbc_sha1_enc_avx | ||
| 1200 | ___ | ||
| 1201 | $code.=<<___; | ||
| 1202 | .section .xdata | ||
| 1203 | .align 8 | ||
| 1204 | .LSEH_info_aesni_cbc_sha1_enc_ssse3: | ||
| 1205 | .byte 9,0,0,0 | ||
| 1206 | .rva ssse3_handler | ||
| 1207 | .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] | ||
| 1208 | ___ | ||
| 1209 | $code.=<<___ if ($avx); | ||
| 1210 | .LSEH_info_aesni_cbc_sha1_enc_avx: | ||
| 1211 | .byte 9,0,0,0 | ||
| 1212 | .rva ssse3_handler | ||
| 1213 | .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] | ||
| 1214 | ___ | ||
| 1215 | } | ||
| 1216 | |||
| 1217 | #################################################################### | ||
| 1218 | sub rex { | ||
| 1219 | local *opcode=shift; | ||
| 1220 | my ($dst,$src)=@_; | ||
| 1221 | my $rex=0; | ||
| 1222 | |||
| 1223 | $rex|=0x04 if($dst>=8); | ||
| 1224 | $rex|=0x01 if($src>=8); | ||
| 1225 | push @opcode,$rex|0x40 if($rex); | ||
| 1226 | } | ||
| 1227 | |||
| 1228 | sub aesni { | ||
| 1229 | my $line=shift; | ||
| 1230 | my @opcode=(0x66); | ||
| 1231 | |||
| 1232 | if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { | ||
| 1233 | my %opcodelet = ( | ||
| 1234 | "aesenc" => 0xdc, "aesenclast" => 0xdd | ||
| 1235 | ); | ||
| 1236 | return undef if (!defined($opcodelet{$1})); | ||
| 1237 | rex(\@opcode,$3,$2); | ||
| 1238 | push @opcode,0x0f,0x38,$opcodelet{$1}; | ||
| 1239 | push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M | ||
| 1240 | return ".byte\t".join(',',@opcode); | ||
| 1241 | } | ||
| 1242 | return $line; | ||
| 1243 | } | ||
| 1244 | |||
| 1245 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 1246 | $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; | ||
| 1247 | |||
| 1248 | print $code; | ||
| 1249 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86.pl b/src/lib/libcrypto/aes/asm/aesni-x86.pl new file mode 100644 index 0000000000..3dc345b585 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aesni-x86.pl | |||
| @@ -0,0 +1,2189 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # This module implements support for Intel AES-NI extension. In | ||
| 11 | # OpenSSL context it's used with Intel engine, but can also be used as | ||
| 12 | # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for | ||
| 13 | # details]. | ||
| 14 | # | ||
| 15 | # Performance. | ||
| 16 | # | ||
| 17 | # To start with see corresponding paragraph in aesni-x86_64.pl... | ||
| 18 | # Instead of filling table similar to one found there I've chosen to | ||
| 19 | # summarize *comparison* results for raw ECB, CTR and CBC benchmarks. | ||
| 20 | # The simplified table below represents 32-bit performance relative | ||
| 21 | # to 64-bit one in every given point. Ratios vary for different | ||
| 22 | # encryption modes, therefore interval values. | ||
| 23 | # | ||
| 24 | # 16-byte 64-byte 256-byte 1-KB 8-KB | ||
| 25 | # 53-67% 67-84% 91-94% 95-98% 97-99.5% | ||
| 26 | # | ||
| 27 | # Lower ratios for smaller block sizes are perfectly understandable, | ||
| 28 | # because function call overhead is higher in 32-bit mode. Largest | ||
| 29 | # 8-KB block performance is virtually same: 32-bit code is less than | ||
| 30 | # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. | ||
| 31 | |||
| 32 | # January 2011 | ||
| 33 | # | ||
| 34 | # See aesni-x86_64.pl for details. Unlike x86_64 version this module | ||
| 35 | # interleaves at most 6 aes[enc|dec] instructions, because there are | ||
| 36 | # not enough registers for 8x interleave [which should be optimal for | ||
| 37 | # Sandy Bridge]. Actually, performance results for 6x interleave | ||
| 38 | # factor presented in aesni-x86_64.pl (except for CTR) are for this | ||
| 39 | # module. | ||
| 40 | |||
| 41 | # April 2011 | ||
| 42 | # | ||
| 43 | # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing | ||
| 44 | # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. | ||
| 45 | |||
| 46 | $PREFIX="aesni"; # if $PREFIX is set to "AES", the script | ||
| 47 | # generates drop-in replacement for | ||
| 48 | # crypto/aes/asm/aes-586.pl:-) | ||
| 49 | $inline=1; # inline _aesni_[en|de]crypt | ||
| 50 | |||
| 51 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 52 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 53 | require "x86asm.pl"; | ||
| 54 | |||
| 55 | &asm_init($ARGV[0],$0); | ||
| 56 | |||
| 57 | if ($PREFIX eq "aesni") { $movekey=*movups; } | ||
| 58 | else { $movekey=*movups; } | ||
| 59 | |||
| 60 | $len="eax"; | ||
| 61 | $rounds="ecx"; | ||
| 62 | $key="edx"; | ||
| 63 | $inp="esi"; | ||
| 64 | $out="edi"; | ||
| 65 | $rounds_="ebx"; # backup copy for $rounds | ||
| 66 | $key_="ebp"; # backup copy for $key | ||
| 67 | |||
| 68 | $rndkey0="xmm0"; | ||
| 69 | $rndkey1="xmm1"; | ||
| 70 | $inout0="xmm2"; | ||
| 71 | $inout1="xmm3"; | ||
| 72 | $inout2="xmm4"; | ||
| 73 | $inout3="xmm5"; $in1="xmm5"; | ||
| 74 | $inout4="xmm6"; $in0="xmm6"; | ||
| 75 | $inout5="xmm7"; $ivec="xmm7"; | ||
| 76 | |||
| 77 | # AESNI extension | ||
| 78 | sub aeskeygenassist | ||
| 79 | { my($dst,$src,$imm)=@_; | ||
| 80 | if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) | ||
| 81 | { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } | ||
| 82 | } | ||
| 83 | sub aescommon | ||
| 84 | { my($opcodelet,$dst,$src)=@_; | ||
| 85 | if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) | ||
| 86 | { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} | ||
| 87 | } | ||
| 88 | sub aesimc { aescommon(0xdb,@_); } | ||
| 89 | sub aesenc { aescommon(0xdc,@_); } | ||
| 90 | sub aesenclast { aescommon(0xdd,@_); } | ||
| 91 | sub aesdec { aescommon(0xde,@_); } | ||
| 92 | sub aesdeclast { aescommon(0xdf,@_); } | ||
| 93 | |||
| 94 | # Inline version of internal aesni_[en|de]crypt1 | ||
| 95 | { my $sn; | ||
| 96 | sub aesni_inline_generate1 | ||
| 97 | { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); | ||
| 98 | $sn++; | ||
| 99 | |||
| 100 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 101 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
| 102 | &xorps ($ivec,$rndkey0) if (defined($ivec)); | ||
| 103 | &lea ($key,&DWP(32,$key)); | ||
| 104 | &xorps ($inout,$ivec) if (defined($ivec)); | ||
| 105 | &xorps ($inout,$rndkey0) if (!defined($ivec)); | ||
| 106 | &set_label("${p}1_loop_$sn"); | ||
| 107 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 108 | &dec ($rounds); | ||
| 109 | &$movekey ($rndkey1,&QWP(0,$key)); | ||
| 110 | &lea ($key,&DWP(16,$key)); | ||
| 111 | &jnz (&label("${p}1_loop_$sn")); | ||
| 112 | eval"&aes${p}last ($inout,$rndkey1)"; | ||
| 113 | }} | ||
| 114 | |||
| 115 | sub aesni_generate1 # fully unrolled loop | ||
| 116 | { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); | ||
| 117 | |||
| 118 | &function_begin_B("_aesni_${p}rypt1"); | ||
| 119 | &movups ($rndkey0,&QWP(0,$key)); | ||
| 120 | &$movekey ($rndkey1,&QWP(0x10,$key)); | ||
| 121 | &xorps ($inout,$rndkey0); | ||
| 122 | &$movekey ($rndkey0,&QWP(0x20,$key)); | ||
| 123 | &lea ($key,&DWP(0x30,$key)); | ||
| 124 | &cmp ($rounds,11); | ||
| 125 | &jb (&label("${p}128")); | ||
| 126 | &lea ($key,&DWP(0x20,$key)); | ||
| 127 | &je (&label("${p}192")); | ||
| 128 | &lea ($key,&DWP(0x20,$key)); | ||
| 129 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 130 | &$movekey ($rndkey1,&QWP(-0x40,$key)); | ||
| 131 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 132 | &$movekey ($rndkey0,&QWP(-0x30,$key)); | ||
| 133 | &set_label("${p}192"); | ||
| 134 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 135 | &$movekey ($rndkey1,&QWP(-0x20,$key)); | ||
| 136 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 137 | &$movekey ($rndkey0,&QWP(-0x10,$key)); | ||
| 138 | &set_label("${p}128"); | ||
| 139 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 140 | &$movekey ($rndkey1,&QWP(0,$key)); | ||
| 141 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 142 | &$movekey ($rndkey0,&QWP(0x10,$key)); | ||
| 143 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 144 | &$movekey ($rndkey1,&QWP(0x20,$key)); | ||
| 145 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 146 | &$movekey ($rndkey0,&QWP(0x30,$key)); | ||
| 147 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 148 | &$movekey ($rndkey1,&QWP(0x40,$key)); | ||
| 149 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 150 | &$movekey ($rndkey0,&QWP(0x50,$key)); | ||
| 151 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 152 | &$movekey ($rndkey1,&QWP(0x60,$key)); | ||
| 153 | eval"&aes${p} ($inout,$rndkey0)"; | ||
| 154 | &$movekey ($rndkey0,&QWP(0x70,$key)); | ||
| 155 | eval"&aes${p} ($inout,$rndkey1)"; | ||
| 156 | eval"&aes${p}last ($inout,$rndkey0)"; | ||
| 157 | &ret(); | ||
| 158 | &function_end_B("_aesni_${p}rypt1"); | ||
| 159 | } | ||
| 160 | |||
| 161 | # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); | ||
| 162 | &aesni_generate1("enc") if (!$inline); | ||
| 163 | &function_begin_B("${PREFIX}_encrypt"); | ||
| 164 | &mov ("eax",&wparam(0)); | ||
| 165 | &mov ($key,&wparam(2)); | ||
| 166 | &movups ($inout0,&QWP(0,"eax")); | ||
| 167 | &mov ($rounds,&DWP(240,$key)); | ||
| 168 | &mov ("eax",&wparam(1)); | ||
| 169 | if ($inline) | ||
| 170 | { &aesni_inline_generate1("enc"); } | ||
| 171 | else | ||
| 172 | { &call ("_aesni_encrypt1"); } | ||
| 173 | &movups (&QWP(0,"eax"),$inout0); | ||
| 174 | &ret (); | ||
| 175 | &function_end_B("${PREFIX}_encrypt"); | ||
| 176 | |||
| 177 | # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); | ||
| 178 | &aesni_generate1("dec") if(!$inline); | ||
| 179 | &function_begin_B("${PREFIX}_decrypt"); | ||
| 180 | &mov ("eax",&wparam(0)); | ||
| 181 | &mov ($key,&wparam(2)); | ||
| 182 | &movups ($inout0,&QWP(0,"eax")); | ||
| 183 | &mov ($rounds,&DWP(240,$key)); | ||
| 184 | &mov ("eax",&wparam(1)); | ||
| 185 | if ($inline) | ||
| 186 | { &aesni_inline_generate1("dec"); } | ||
| 187 | else | ||
| 188 | { &call ("_aesni_decrypt1"); } | ||
| 189 | &movups (&QWP(0,"eax"),$inout0); | ||
| 190 | &ret (); | ||
| 191 | &function_end_B("${PREFIX}_decrypt"); | ||
| 192 | |||
| 193 | # _aesni_[en|de]cryptN are private interfaces, N denotes interleave | ||
| 194 | # factor. Why 3x subroutine were originally used in loops? Even though | ||
| 195 | # aes[enc|dec] latency was originally 6, it could be scheduled only | ||
| 196 | # every *2nd* cycle. Thus 3x interleave was the one providing optimal | ||
| 197 | # utilization, i.e. when subroutine's throughput is virtually same as | ||
| 198 | # of non-interleaved subroutine [for number of input blocks up to 3]. | ||
| 199 | # This is why it makes no sense to implement 2x subroutine. | ||
| 200 | # aes[enc|dec] latency in next processor generation is 8, but the | ||
| 201 | # instructions can be scheduled every cycle. Optimal interleave for | ||
| 202 | # new processor is therefore 8x, but it's unfeasible to accommodate it | ||
| 203 | # in XMM registers addressable in 32-bit mode and therefore 6x is | ||
| 204 | # used instead... | ||
| 205 | |||
| 206 | sub aesni_generate3 | ||
| 207 | { my $p=shift; | ||
| 208 | |||
| 209 | &function_begin_B("_aesni_${p}rypt3"); | ||
| 210 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 211 | &shr ($rounds,1); | ||
| 212 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
| 213 | &lea ($key,&DWP(32,$key)); | ||
| 214 | &xorps ($inout0,$rndkey0); | ||
| 215 | &pxor ($inout1,$rndkey0); | ||
| 216 | &pxor ($inout2,$rndkey0); | ||
| 217 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 218 | |||
| 219 | &set_label("${p}3_loop"); | ||
| 220 | eval"&aes${p} ($inout0,$rndkey1)"; | ||
| 221 | eval"&aes${p} ($inout1,$rndkey1)"; | ||
| 222 | &dec ($rounds); | ||
| 223 | eval"&aes${p} ($inout2,$rndkey1)"; | ||
| 224 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
| 225 | eval"&aes${p} ($inout0,$rndkey0)"; | ||
| 226 | eval"&aes${p} ($inout1,$rndkey0)"; | ||
| 227 | &lea ($key,&DWP(32,$key)); | ||
| 228 | eval"&aes${p} ($inout2,$rndkey0)"; | ||
| 229 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 230 | &jnz (&label("${p}3_loop")); | ||
| 231 | eval"&aes${p} ($inout0,$rndkey1)"; | ||
| 232 | eval"&aes${p} ($inout1,$rndkey1)"; | ||
| 233 | eval"&aes${p} ($inout2,$rndkey1)"; | ||
| 234 | eval"&aes${p}last ($inout0,$rndkey0)"; | ||
| 235 | eval"&aes${p}last ($inout1,$rndkey0)"; | ||
| 236 | eval"&aes${p}last ($inout2,$rndkey0)"; | ||
| 237 | &ret(); | ||
| 238 | &function_end_B("_aesni_${p}rypt3"); | ||
| 239 | } | ||
| 240 | |||
| 241 | # 4x interleave is implemented to improve small block performance, | ||
| 242 | # most notably [and naturally] 4 block by ~30%. One can argue that one | ||
| 243 | # should have implemented 5x as well, but improvement would be <20%, | ||
| 244 | # so it's not worth it... | ||
| 245 | sub aesni_generate4 | ||
| 246 | { my $p=shift; | ||
| 247 | |||
| 248 | &function_begin_B("_aesni_${p}rypt4"); | ||
| 249 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 250 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
| 251 | &shr ($rounds,1); | ||
| 252 | &lea ($key,&DWP(32,$key)); | ||
| 253 | &xorps ($inout0,$rndkey0); | ||
| 254 | &pxor ($inout1,$rndkey0); | ||
| 255 | &pxor ($inout2,$rndkey0); | ||
| 256 | &pxor ($inout3,$rndkey0); | ||
| 257 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 258 | |||
| 259 | &set_label("${p}4_loop"); | ||
| 260 | eval"&aes${p} ($inout0,$rndkey1)"; | ||
| 261 | eval"&aes${p} ($inout1,$rndkey1)"; | ||
| 262 | &dec ($rounds); | ||
| 263 | eval"&aes${p} ($inout2,$rndkey1)"; | ||
| 264 | eval"&aes${p} ($inout3,$rndkey1)"; | ||
| 265 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
| 266 | eval"&aes${p} ($inout0,$rndkey0)"; | ||
| 267 | eval"&aes${p} ($inout1,$rndkey0)"; | ||
| 268 | &lea ($key,&DWP(32,$key)); | ||
| 269 | eval"&aes${p} ($inout2,$rndkey0)"; | ||
| 270 | eval"&aes${p} ($inout3,$rndkey0)"; | ||
| 271 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 272 | &jnz (&label("${p}4_loop")); | ||
| 273 | |||
| 274 | eval"&aes${p} ($inout0,$rndkey1)"; | ||
| 275 | eval"&aes${p} ($inout1,$rndkey1)"; | ||
| 276 | eval"&aes${p} ($inout2,$rndkey1)"; | ||
| 277 | eval"&aes${p} ($inout3,$rndkey1)"; | ||
| 278 | eval"&aes${p}last ($inout0,$rndkey0)"; | ||
| 279 | eval"&aes${p}last ($inout1,$rndkey0)"; | ||
| 280 | eval"&aes${p}last ($inout2,$rndkey0)"; | ||
| 281 | eval"&aes${p}last ($inout3,$rndkey0)"; | ||
| 282 | &ret(); | ||
| 283 | &function_end_B("_aesni_${p}rypt4"); | ||
| 284 | } | ||
| 285 | |||
sub aesni_generate6
{ my $p=shift;	# "enc" or "dec" — selects the aesenc/aesdec instruction family

    # Emit _aesni_[en|de]crypt6: process six blocks ($inout0..$inout5) in
    # parallel with the key schedule at $key ($rounds = round count).
    # Six independent AES pipelines are interleaved to hide the latency of
    # the aes* instructions.
    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);		# main loop does two rounds per iteration
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	# Prologue: whiten each block with round key 0, already interleaved
	# with the first aes* round so the pipelines start filling early.
	eval"&aes${p}	($inout0,$rndkey1)";
	&pxor		($inout2,$rndkey0);
	eval"&aes${p}	($inout1,$rndkey1)";
	&pxor		($inout3,$rndkey0);
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	&pxor		($inout4,$rndkey0);
	eval"&aes${p}	($inout3,$rndkey1)";
	&pxor		($inout5,$rndkey0);
	eval"&aes${p}	($inout4,$rndkey1)";
	&$movekey	($rndkey0,&QWP(0,$key));
	eval"&aes${p}	($inout5,$rndkey1)";
	&jmp		(&label("_aesni_${p}rypt6_enter"));

    # Loop body: one aes* round for all six blocks with $rndkey1, then fall
    # through to the _enter point for the matching $rndkey0 round.
    &set_label("${p}6_loop",16);
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
    &set_label("_aesni_${p}rypt6_enter",16);
	&$movekey	($rndkey1,&QWP(16,$key));
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	eval"&aes${p}	($inout4,$rndkey0)";
	eval"&aes${p}	($inout5,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("${p}6_loop"));

	# Epilogue: penultimate round with $rndkey1, then the aes*last round.
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
	eval"&aes${p}last ($inout0,$rndkey0)";
	eval"&aes${p}last ($inout1,$rndkey0)";
	eval"&aes${p}last ($inout2,$rndkey0)";
	eval"&aes${p}last ($inout3,$rndkey0)";
	eval"&aes${p}last ($inout4,$rndkey0)";
	eval"&aes${p}last ($inout5,$rndkey0)";
	&ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Emit the 3-, 4- and 6-block helper routines.  The encrypt flavours are
# only needed in the AES-NI build ($PREFIX eq "aesni"); the decrypt
# flavours are emitted unconditionally.
for my $generator (\&aesni_generate3, \&aesni_generate4, \&aesni_generate6) {
	$generator->("enc") if ($PREFIX eq "aesni");
	$generator->("dec");
}
| 352 | |||
| 353 | if ($PREFIX eq "aesni") { | ||
| 354 | ###################################################################### | ||
| 355 | # void aesni_ecb_encrypt (const void *in, void *out, | ||
| 356 | # size_t length, const AES_KEY *key, | ||
| 357 | # int enc); | ||
# aesni_ecb_encrypt: ECB-[en|de]crypt $len bytes (truncated to whole
# 16-byte blocks) from $inp to $out with the schedule at $key; wparam(4)
# selects encrypt (non-zero) or decrypt (zero).  Six blocks are processed
# per bulk-loop iteration; 1..5-block tails go to the narrower helpers.
#
# Fix: restored the '&' on the jmp after the five-block encrypt tail
# (was bare "jmp", inconsistent with every other perlasm call here).
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);			# whole blocks only
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));	# key->rounds
	&test	($rounds_,$rounds_);		# enc flag
	&jz	(&label("ecb_decrypt"));

	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds
	&cmp	($len,0x60);			# fewer than six blocks?
	&jb	(&label("ecb_enc_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_enc_loop6_enter"));

&set_label("ecb_enc_loop6",16);
	# Interleave stores of the previous six blocks with loads of the next six.
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

	&call	("_aesni_encrypt6");

	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_enc_loop6"));

	# Flush the last six blocks of the bulk loop.
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);			# undo the loop's final subtraction
	&jz	(&label("ecb_ret"));

&set_label("ecb_enc_tail");
	# 1..5 blocks remain; dispatch on $len.
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_enc_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);		# five blocks: pad pipeline with zero block
	&call	("_aesni_encrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));		# FIX: was "jmp" — leading '&' restored

&set_label("ecb_enc_one",16);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_two",16);
	&xorps	($inout2,$inout2);		# two blocks: pad with zero block
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_three",16);
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_four",16);
	&call	("_aesni_encrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&jmp	(&label("ecb_ret"));
######################################################################
# Decrypt path: same structure as the encrypt path with _aesni_decryptN.
&set_label("ecb_decrypt",16);
	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_dec_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_dec_loop6_enter"));

&set_label("ecb_dec_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

	&call	("_aesni_decrypt6");

	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_dec_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_dec_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_dec_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_dec_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_dec_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_two",16);
	&xorps	($inout2,$inout2);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_three",16);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	# falls through to ecb_ret

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
| 572 | |||
| 573 | ###################################################################### | ||
| 574 | # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, | ||
| 575 | # size_t blocks, const AES_KEY *key, | ||
| 576 | # const char *ivec,char *cmac); | ||
| 577 | # | ||
| 578 | # Handles only complete blocks, operates on 64-bit counter and | ||
| 579 | # does not update *ivec! Nor does it finalize CMAC value | ||
| 580 | # (see engine/eng_aesni.c for details) | ||
| 581 | # | ||
{ my $cmac=$inout1;	# the running CMAC block lives in $inout1 throughout

# aesni_ccm64_encrypt_blocks: CCM with 64-bit counter.  Encrypts the
# counter block and the CMAC block side by side (two interleaved AES
# pipelines), XORs plaintext into the CMAC before encryption and into
# E(ivec) to produce ciphertext.  The final CMAC is stored to wparam(5).
&function_begin("aesni_ccm64_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);	# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&shr	($rounds,1);			# two rounds per inner-loop iteration
	&lea	($key_,&DWP(0,$key));		# $key_ keeps the schedule base
	&movdqa	($inout3,&QWP(0,"esp"));	# $inout3 keeps the bswap mask
	&movdqa	($inout0,$ivec);
	&mov	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);		# $ivec held byte-swapped for paddq

&set_label("ccm64_enc_outer");
	# Per block: whiten counter block, fold input into CMAC, then run the
	# two pipelines through the schedule together.
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov	($rounds,$rounds_);
	&movups	($in0,&QWP(0,$inp));

	&xorps	($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps	($rndkey0,$in0);
	&lea	($key,&DWP(32,$key_));
	&xorps	($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(0,$key));

&set_label("ccm64_enc2_loop");
	&aesenc	($inout0,$rndkey1);
	&dec	($rounds);
	&aesenc	($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&aesenc	($inout0,$rndkey0);
	&lea	($key,&DWP(32,$key));
	&aesenc	($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz	(&label("ccm64_enc2_loop"));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&paddq	($ivec,&QWP(16,"esp"));		# bump 64-bit counter meanwhile
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&dec	($len);
	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);		# counter back to big-endian layout
	&jnz	(&label("ccm64_enc_outer"));

	&mov	("esp",&DWP(48,"esp"));		# restore %esp
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);		# store final CMAC
&function_end("aesni_ccm64_encrypt_blocks");

# aesni_ccm64_decrypt_blocks: as above, but the CMAC absorbs the
# *decrypted* output, so E(counter) for the next block is computed while
# the previous CMAC update is still in flight.
&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);	# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	# Encrypt the first counter block up front.
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&mov	($rounds,$rounds_);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));	# last block: CMAC update only

	&$movekey	($rndkey0,&QWP(0,$key_));
	&shr	($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps	($in0,$rndkey0);
	&lea	($key,&DWP(32,$key_));
	&xorps	($inout0,$rndkey0);
	&xorps	($cmac,$in0);			# cmac^=out
	&$movekey	($rndkey0,&QWP(0,$key));

&set_label("ccm64_dec2_loop");
	&aesenc	($inout0,$rndkey1);
	&dec	($rounds);
	&aesenc	($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&aesenc	($inout0,$rndkey0);
	&lea	($key,&DWP(32,$key));
	&aesenc	($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz	(&label("ccm64_dec2_loop"));
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&aesenc	($inout0,$rndkey1);
	&aesenc	($cmac,$rndkey1);
	&lea	($inp,&QWP(16,$inp));
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	# Fold the final plaintext block into the CMAC.
	&mov	($key,$key_);
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   &call	("_aesni_encrypt1",$cmac);	}

	&mov	("esp",&DWP(48,"esp"));		# restore %esp
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);		# store final CMAC
&function_end("aesni_ccm64_decrypt_blocks");
}
| 759 | |||
| 760 | ###################################################################### | ||
| 761 | # void aesni_ctr32_encrypt_blocks (const void *in, void *out, | ||
| 762 | # size_t blocks, const AES_KEY *key, | ||
| 763 | # const char *ivec); | ||
| 764 | # | ||
| 765 | # Handles only complete blocks, operates on 32-bit counter and | ||
| 766 | # does not update *ivec! (see engine/eng_aesni.c for details) | ||
| 767 | # | ||
| 768 | # stack layout: | ||
| 769 | # 0 pshufb mask | ||
| 770 | # 16 vector addend: 0,6,6,6 | ||
| 771 | # 32 counter-less ivec | ||
| 772 | # 48 1st triplet of counter vector | ||
| 773 | # 64 2nd triplet of counter vector | ||
| 774 | # 80 saved %esp | ||
| 775 | |||
# aesni_ctr32_encrypt_blocks: CTR mode with a 32-bit big-endian counter
# in the last dword of ivec.  Six counter blocks are kept in flight; the
# six 32-bit counters are maintained as two 3x32-bit "triplets" (at
# esp+48 / esp+64) that are bulk-incremented with paddd and byte-swapped
# with pshufb, avoiding six scalar bswap/inc sequences per iteration.
&function_begin("aesni_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($key_,"esp");
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);		# save original %esp

	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));	# single block: skip all setup

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack (adds 6 to each triplet lane)
	&mov	($rounds,6);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds);
	&mov	(&DWP(20,"esp"),$rounds);
	&mov	(&DWP(24,"esp"),$rounds);
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters: ctr..ctr+2 and ctr+3..ctr+5
	&bswap	($rounds_);			# counter to host order for arithmetic
	&pxor	($rndkey1,$rndkey1);
	&pxor	($rndkey0,$rndkey0);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey1,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));	# ctr+3 starts the 2nd triplet
	&pinsrd	($rndkey0,$key_,0);
	&inc	($rounds_);
	&pinsrd	($rndkey1,$rounds_,1);
	&inc	($key_);
	&pinsrd	($rndkey0,$key_,1);
	&inc	($rounds_);
	&pinsrd	($rndkey1,$rounds_,2);
	&inc	($key_);
	&pinsrd	($rndkey0,$key_,2);
	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
	&pshufb	($rndkey0,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey1,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey1,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec
	&shr	($rounds,1);
	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
	# Build the six counter blocks: broadcast each triplet lane into the
	# top dword, then OR in the counter-less ivec.
	&pshufd	($inout2,$rndkey1,1<<6);
	&movdqa	($rndkey1,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey0,3<<6);
	&por	($inout0,$rndkey1);		# merge counter-less ivec
	&pshufd	($inout4,$rndkey0,2<<6);
	&por	($inout1,$rndkey1);
	&pshufd	($inout5,$rndkey0,1<<6);
	&por	($inout2,$rndkey1);
	&por	($inout3,$rndkey1);
	&por	($inout4,$rndkey1);
	&por	($inout5,$rndkey1);

	# inlining _aesni_encrypt6's prologue gives ~4% improvement...
	&$movekey	($rndkey0,&QWP(0,$key_));
	&$movekey	($rndkey1,&QWP(16,$key_));
	&lea	($key,&DWP(32,$key_));
	&dec	($rounds);
	&pxor	($inout0,$rndkey0);
	&pxor	($inout1,$rndkey0);
	&aesenc	($inout0,$rndkey1);
	&pxor	($inout2,$rndkey0);
	&aesenc	($inout1,$rndkey1);
	&pxor	($inout3,$rndkey0);
	&aesenc	($inout2,$rndkey1);
	&pxor	($inout4,$rndkey0);
	&aesenc	($inout3,$rndkey1);
	&pxor	($inout5,$rndkey0);
	&aesenc	($inout4,$rndkey1);
	&$movekey	($rndkey0,&QWP(0,$key));
	&aesenc	($inout5,$rndkey1);

	&call	(&label("_aesni_encrypt6_enter"));	# join the shared round loop

	# XOR keystream with input and store, interleaved with the counter
	# bookkeeping for the next iteration.
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(48,"esp"));	# load 1st triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 1st triplet increment
	&paddd	($rndkey0,&QWP(64,"esp"));	# 2nd triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey1,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&mov	($rounds,$rounds_);
	&pshufd	($inout1,$rndkey1,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));

	&add	($len,6);
	&jz	(&label("ctr32_ret"));
	&mov	($key,$key_);
	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds (undo shr by 1)
	&movdqa	($inout5,&QWP(32,"esp"));	# pull count-less ivec

&set_label("ctr32_tail");
	# 1..5 blocks left; counter blocks already staged in $inout0..$inout4.
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey1,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));

	&pshufd	($inout3,$rndkey0,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey0,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));

	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));

&set_label("ctr32_one");
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	# falls through to ctr32_ret

&set_label("ctr32_ret");
	&mov	("esp",&DWP(80,"esp"));		# restore %esp
&function_end("aesni_ctr32_encrypt_blocks");
| 1013 | |||
| 1014 | ###################################################################### | ||
| 1015 | # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
| 1016 | # const AES_KEY *key1, const AES_KEY *key2 | ||
| 1017 | # const unsigned char iv[16]); | ||
| 1018 | # | ||
| 1019 | { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); | ||
| 1020 | |||
| 1021 | &function_begin("aesni_xts_encrypt"); | ||
| 1022 | &mov ($key,&wparam(4)); # key2 | ||
| 1023 | &mov ($inp,&wparam(5)); # clear-text tweak | ||
| 1024 | |||
| 1025 | &mov ($rounds,&DWP(240,$key)); # key2->rounds | ||
| 1026 | &movups ($inout0,&QWP(0,$inp)); | ||
| 1027 | if ($inline) | ||
| 1028 | { &aesni_inline_generate1("enc"); } | ||
| 1029 | else | ||
| 1030 | { &call ("_aesni_encrypt1"); } | ||
| 1031 | |||
| 1032 | &mov ($inp,&wparam(0)); | ||
| 1033 | &mov ($out,&wparam(1)); | ||
| 1034 | &mov ($len,&wparam(2)); | ||
| 1035 | &mov ($key,&wparam(3)); # key1 | ||
| 1036 | |||
| 1037 | &mov ($key_,"esp"); | ||
| 1038 | &sub ("esp",16*7+8); | ||
| 1039 | &mov ($rounds,&DWP(240,$key)); # key1->rounds | ||
| 1040 | &and ("esp",-16); # align stack | ||
| 1041 | |||
| 1042 | &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant | ||
| 1043 | &mov (&DWP(16*6+4,"esp"),0); | ||
| 1044 | &mov (&DWP(16*6+8,"esp"),1); | ||
| 1045 | &mov (&DWP(16*6+12,"esp"),0); | ||
| 1046 | &mov (&DWP(16*7+0,"esp"),$len); # save original $len | ||
| 1047 | &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp | ||
| 1048 | |||
| 1049 | &movdqa ($tweak,$inout0); | ||
| 1050 | &pxor ($twtmp,$twtmp); | ||
| 1051 | &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 | ||
| 1052 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1053 | |||
| 1054 | &and ($len,-16); | ||
| 1055 | &mov ($key_,$key); # backup $key | ||
| 1056 | &mov ($rounds_,$rounds); # backup $rounds | ||
| 1057 | &sub ($len,16*6); | ||
| 1058 | &jc (&label("xts_enc_short")); | ||
| 1059 | |||
| 1060 | &shr ($rounds,1); | ||
| 1061 | &mov ($rounds_,$rounds); | ||
| 1062 | &jmp (&label("xts_enc_loop6")); | ||
| 1063 | |||
| 1064 | &set_label("xts_enc_loop6",16); | ||
| 1065 | for ($i=0;$i<4;$i++) { | ||
| 1066 | &pshufd ($twres,$twtmp,0x13); | ||
| 1067 | &pxor ($twtmp,$twtmp); | ||
| 1068 | &movdqa (&QWP(16*$i,"esp"),$tweak); | ||
| 1069 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1070 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1071 | &pcmpgtd ($twtmp,$tweak); # broadcast upper bits | ||
| 1072 | &pxor ($tweak,$twres); | ||
| 1073 | } | ||
| 1074 | &pshufd ($inout5,$twtmp,0x13); | ||
| 1075 | &movdqa (&QWP(16*$i++,"esp"),$tweak); | ||
| 1076 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1077 | &$movekey ($rndkey0,&QWP(0,$key_)); | ||
| 1078 | &pand ($inout5,$twmask); # isolate carry and residue | ||
| 1079 | &movups ($inout0,&QWP(0,$inp)); # load input | ||
| 1080 | &pxor ($inout5,$tweak); | ||
| 1081 | |||
| 1082 | # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] | ||
| 1083 | &movdqu ($inout1,&QWP(16*1,$inp)); | ||
| 1084 | &xorps ($inout0,$rndkey0); # input^=rndkey[0] | ||
| 1085 | &movdqu ($inout2,&QWP(16*2,$inp)); | ||
| 1086 | &pxor ($inout1,$rndkey0); | ||
| 1087 | &movdqu ($inout3,&QWP(16*3,$inp)); | ||
| 1088 | &pxor ($inout2,$rndkey0); | ||
| 1089 | &movdqu ($inout4,&QWP(16*4,$inp)); | ||
| 1090 | &pxor ($inout3,$rndkey0); | ||
| 1091 | &movdqu ($rndkey1,&QWP(16*5,$inp)); | ||
| 1092 | &pxor ($inout4,$rndkey0); | ||
| 1093 | &lea ($inp,&DWP(16*6,$inp)); | ||
| 1094 | &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1095 | &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak | ||
| 1096 | &pxor ($inout5,$rndkey1); | ||
| 1097 | |||
| 1098 | &$movekey ($rndkey1,&QWP(16,$key_)); | ||
| 1099 | &lea ($key,&DWP(32,$key_)); | ||
| 1100 | &pxor ($inout1,&QWP(16*1,"esp")); | ||
| 1101 | &aesenc ($inout0,$rndkey1); | ||
| 1102 | &pxor ($inout2,&QWP(16*2,"esp")); | ||
| 1103 | &aesenc ($inout1,$rndkey1); | ||
| 1104 | &pxor ($inout3,&QWP(16*3,"esp")); | ||
| 1105 | &dec ($rounds); | ||
| 1106 | &aesenc ($inout2,$rndkey1); | ||
| 1107 | &pxor ($inout4,&QWP(16*4,"esp")); | ||
| 1108 | &aesenc ($inout3,$rndkey1); | ||
| 1109 | &pxor ($inout5,$rndkey0); | ||
| 1110 | &aesenc ($inout4,$rndkey1); | ||
| 1111 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 1112 | &aesenc ($inout5,$rndkey1); | ||
| 1113 | &call (&label("_aesni_encrypt6_enter")); | ||
| 1114 | |||
| 1115 | &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak | ||
| 1116 | &pxor ($twtmp,$twtmp); | ||
| 1117 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1118 | &pcmpgtd ($twtmp,$tweak); # broadcast upper bits | ||
| 1119 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1120 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1121 | &xorps ($inout2,&QWP(16*2,"esp")); | ||
| 1122 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1123 | &xorps ($inout3,&QWP(16*3,"esp")); | ||
| 1124 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1125 | &xorps ($inout4,&QWP(16*4,"esp")); | ||
| 1126 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1127 | &xorps ($inout5,$tweak); | ||
| 1128 | &movups (&QWP(16*4,$out),$inout4); | ||
| 1129 | &pshufd ($twres,$twtmp,0x13); | ||
| 1130 | &movups (&QWP(16*5,$out),$inout5); | ||
| 1131 | &lea ($out,&DWP(16*6,$out)); | ||
| 1132 | &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 | ||
| 1133 | |||
| 1134 | &pxor ($twtmp,$twtmp); | ||
| 1135 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1136 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1137 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1138 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1139 | &pxor ($tweak,$twres); | ||
| 1140 | |||
| 1141 | &sub ($len,16*6); | ||
| 1142 | &jnc (&label("xts_enc_loop6")); | ||
| 1143 | |||
| 1144 | &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds | ||
| 1145 | &mov ($key,$key_); # restore $key | ||
| 1146 | &mov ($rounds_,$rounds); | ||
| 1147 | |||
| 1148 | &set_label("xts_enc_short"); | ||
| 1149 | &add ($len,16*6); | ||
| 1150 | &jz (&label("xts_enc_done6x")); | ||
| 1151 | |||
| 1152 | &movdqa ($inout3,$tweak); # put aside previous tweak | ||
| 1153 | &cmp ($len,0x20); | ||
| 1154 | &jb (&label("xts_enc_one")); | ||
| 1155 | |||
| 1156 | &pshufd ($twres,$twtmp,0x13); | ||
| 1157 | &pxor ($twtmp,$twtmp); | ||
| 1158 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1159 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1160 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1161 | &pxor ($tweak,$twres); | ||
| 1162 | &je (&label("xts_enc_two")); | ||
| 1163 | |||
| 1164 | &pshufd ($twres,$twtmp,0x13); | ||
| 1165 | &pxor ($twtmp,$twtmp); | ||
| 1166 | &movdqa ($inout4,$tweak); # put aside previous tweak | ||
| 1167 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1168 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1169 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1170 | &pxor ($tweak,$twres); | ||
| 1171 | &cmp ($len,0x40); | ||
| 1172 | &jb (&label("xts_enc_three")); | ||
| 1173 | |||
| 1174 | &pshufd ($twres,$twtmp,0x13); | ||
| 1175 | &pxor ($twtmp,$twtmp); | ||
| 1176 | &movdqa ($inout5,$tweak); # put aside previous tweak | ||
| 1177 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1178 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1179 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1180 | &pxor ($tweak,$twres); | ||
| 1181 | &movdqa (&QWP(16*0,"esp"),$inout3); | ||
| 1182 | &movdqa (&QWP(16*1,"esp"),$inout4); | ||
| 1183 | &je (&label("xts_enc_four")); | ||
| 1184 | |||
| 1185 | &movdqa (&QWP(16*2,"esp"),$inout5); | ||
| 1186 | &pshufd ($inout5,$twtmp,0x13); | ||
| 1187 | &movdqa (&QWP(16*3,"esp"),$tweak); | ||
| 1188 | &paddq ($tweak,$tweak); # &psllq($inout0,1); | ||
| 1189 | &pand ($inout5,$twmask); # isolate carry and residue | ||
| 1190 | &pxor ($inout5,$tweak); | ||
| 1191 | |||
| 1192 | &movdqu ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1193 | &movdqu ($inout1,&QWP(16*1,$inp)); | ||
| 1194 | &movdqu ($inout2,&QWP(16*2,$inp)); | ||
| 1195 | &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1196 | &movdqu ($inout3,&QWP(16*3,$inp)); | ||
| 1197 | &pxor ($inout1,&QWP(16*1,"esp")); | ||
| 1198 | &movdqu ($inout4,&QWP(16*4,$inp)); | ||
| 1199 | &pxor ($inout2,&QWP(16*2,"esp")); | ||
| 1200 | &lea ($inp,&DWP(16*5,$inp)); | ||
| 1201 | &pxor ($inout3,&QWP(16*3,"esp")); | ||
| 1202 | &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak | ||
| 1203 | &pxor ($inout4,$inout5); | ||
| 1204 | |||
| 1205 | &call ("_aesni_encrypt6"); | ||
| 1206 | |||
| 1207 | &movaps ($tweak,&QWP(16*4,"esp")); # last tweak | ||
| 1208 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1209 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1210 | &xorps ($inout2,&QWP(16*2,"esp")); | ||
| 1211 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1212 | &xorps ($inout3,&QWP(16*3,"esp")); | ||
| 1213 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1214 | &xorps ($inout4,$tweak); | ||
| 1215 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1216 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1217 | &movups (&QWP(16*4,$out),$inout4); | ||
| 1218 | &lea ($out,&DWP(16*5,$out)); | ||
| 1219 | &jmp (&label("xts_enc_done")); | ||
| 1220 | |||
| 1221 | &set_label("xts_enc_one",16); | ||
| 1222 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1223 | &lea ($inp,&DWP(16*1,$inp)); | ||
| 1224 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1225 | if ($inline) | ||
| 1226 | { &aesni_inline_generate1("enc"); } | ||
| 1227 | else | ||
| 1228 | { &call ("_aesni_encrypt1"); } | ||
| 1229 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1230 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1231 | &lea ($out,&DWP(16*1,$out)); | ||
| 1232 | |||
| 1233 | &movdqa ($tweak,$inout3); # last tweak | ||
| 1234 | &jmp (&label("xts_enc_done")); | ||
| 1235 | |||
| 1236 | &set_label("xts_enc_two",16); | ||
| 1237 | &movaps ($inout4,$tweak); # put aside last tweak | ||
| 1238 | |||
| 1239 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1240 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1241 | &lea ($inp,&DWP(16*2,$inp)); | ||
| 1242 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1243 | &xorps ($inout1,$inout4); | ||
| 1244 | &xorps ($inout2,$inout2); | ||
| 1245 | |||
| 1246 | &call ("_aesni_encrypt3"); | ||
| 1247 | |||
| 1248 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1249 | &xorps ($inout1,$inout4); | ||
| 1250 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1251 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1252 | &lea ($out,&DWP(16*2,$out)); | ||
| 1253 | |||
| 1254 | &movdqa ($tweak,$inout4); # last tweak | ||
| 1255 | &jmp (&label("xts_enc_done")); | ||
| 1256 | |||
| 1257 | &set_label("xts_enc_three",16); | ||
| 1258 | &movaps ($inout5,$tweak); # put aside last tweak | ||
| 1259 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1260 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1261 | &movups ($inout2,&QWP(16*2,$inp)); | ||
| 1262 | &lea ($inp,&DWP(16*3,$inp)); | ||
| 1263 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1264 | &xorps ($inout1,$inout4); | ||
| 1265 | &xorps ($inout2,$inout5); | ||
| 1266 | |||
| 1267 | &call ("_aesni_encrypt3"); | ||
| 1268 | |||
| 1269 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1270 | &xorps ($inout1,$inout4); | ||
| 1271 | &xorps ($inout2,$inout5); | ||
| 1272 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1273 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1274 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1275 | &lea ($out,&DWP(16*3,$out)); | ||
| 1276 | |||
| 1277 | &movdqa ($tweak,$inout5); # last tweak | ||
| 1278 | &jmp (&label("xts_enc_done")); | ||
| 1279 | |||
| 1280 | &set_label("xts_enc_four",16); | ||
| 1281 | &movaps ($inout4,$tweak); # put aside last tweak | ||
| 1282 | |||
| 1283 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1284 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1285 | &movups ($inout2,&QWP(16*2,$inp)); | ||
| 1286 | &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1287 | &movups ($inout3,&QWP(16*3,$inp)); | ||
| 1288 | &lea ($inp,&DWP(16*4,$inp)); | ||
| 1289 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1290 | &xorps ($inout2,$inout5); | ||
| 1291 | &xorps ($inout3,$inout4); | ||
| 1292 | |||
| 1293 | &call ("_aesni_encrypt4"); | ||
| 1294 | |||
| 1295 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1296 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1297 | &xorps ($inout2,$inout5); | ||
| 1298 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1299 | &xorps ($inout3,$inout4); | ||
| 1300 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1301 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1302 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1303 | &lea ($out,&DWP(16*4,$out)); | ||
| 1304 | |||
| 1305 | &movdqa ($tweak,$inout4); # last tweak | ||
| 1306 | &jmp (&label("xts_enc_done")); | ||
| 1307 | |||
| 1308 | &set_label("xts_enc_done6x",16); # $tweak is pre-calculated | ||
| 1309 | &mov ($len,&DWP(16*7+0,"esp")); # restore original $len | ||
| 1310 | &and ($len,15); | ||
| 1311 | &jz (&label("xts_enc_ret")); | ||
| 1312 | &movdqa ($inout3,$tweak); | ||
| 1313 | &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 | ||
| 1314 | &jmp (&label("xts_enc_steal")); | ||
| 1315 | |||
| 1316 | &set_label("xts_enc_done",16); | ||
| 1317 | &mov ($len,&DWP(16*7+0,"esp")); # restore original $len | ||
| 1318 | &pxor ($twtmp,$twtmp); | ||
| 1319 | &and ($len,15); | ||
| 1320 | &jz (&label("xts_enc_ret")); | ||
| 1321 | |||
| 1322 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1323 | &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 | ||
| 1324 | &pshufd ($inout3,$twtmp,0x13); | ||
| 1325 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1326 | &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue | ||
| 1327 | &pxor ($inout3,$tweak); | ||
| 1328 | |||
| 1329 | &set_label("xts_enc_steal"); | ||
| 1330 | &movz ($rounds,&BP(0,$inp)); | ||
| 1331 | &movz ($key,&BP(-16,$out)); | ||
| 1332 | &lea ($inp,&DWP(1,$inp)); | ||
| 1333 | &mov (&BP(-16,$out),&LB($rounds)); | ||
| 1334 | &mov (&BP(0,$out),&LB($key)); | ||
| 1335 | &lea ($out,&DWP(1,$out)); | ||
| 1336 | &sub ($len,1); | ||
| 1337 | &jnz (&label("xts_enc_steal")); | ||
| 1338 | |||
| 1339 | &sub ($out,&DWP(16*7+0,"esp")); # rewind $out | ||
| 1340 | &mov ($key,$key_); # restore $key | ||
| 1341 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1342 | |||
| 1343 | &movups ($inout0,&QWP(-16,$out)); # load input | ||
| 1344 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1345 | if ($inline) | ||
| 1346 | { &aesni_inline_generate1("enc"); } | ||
| 1347 | else | ||
| 1348 | { &call ("_aesni_encrypt1"); } | ||
| 1349 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1350 | &movups (&QWP(-16,$out),$inout0); # write output | ||
| 1351 | |||
| 1352 | &set_label("xts_enc_ret"); | ||
| 1353 | &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp | ||
| 1354 | &function_end("aesni_xts_encrypt"); | ||
| 1355 | |||
| 1356 | &function_begin("aesni_xts_decrypt"); | ||
| 1357 | &mov ($key,&wparam(4)); # key2 | ||
| 1358 | &mov ($inp,&wparam(5)); # clear-text tweak | ||
| 1359 | |||
| 1360 | &mov ($rounds,&DWP(240,$key)); # key2->rounds | ||
| 1361 | &movups ($inout0,&QWP(0,$inp)); | ||
| 1362 | if ($inline) | ||
| 1363 | { &aesni_inline_generate1("enc"); } | ||
| 1364 | else | ||
| 1365 | { &call ("_aesni_encrypt1"); } | ||
| 1366 | |||
| 1367 | &mov ($inp,&wparam(0)); | ||
| 1368 | &mov ($out,&wparam(1)); | ||
| 1369 | &mov ($len,&wparam(2)); | ||
| 1370 | &mov ($key,&wparam(3)); # key1 | ||
| 1371 | |||
| 1372 | &mov ($key_,"esp"); | ||
| 1373 | &sub ("esp",16*7+8); | ||
| 1374 | &and ("esp",-16); # align stack | ||
| 1375 | |||
| 1376 | &xor ($rounds_,$rounds_); # if(len%16) len-=16; | ||
| 1377 | &test ($len,15); | ||
| 1378 | &setnz (&LB($rounds_)); | ||
| 1379 | &shl ($rounds_,4); | ||
| 1380 | &sub ($len,$rounds_); | ||
| 1381 | |||
| 1382 | &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant | ||
| 1383 | &mov (&DWP(16*6+4,"esp"),0); | ||
| 1384 | &mov (&DWP(16*6+8,"esp"),1); | ||
| 1385 | &mov (&DWP(16*6+12,"esp"),0); | ||
| 1386 | &mov (&DWP(16*7+0,"esp"),$len); # save original $len | ||
| 1387 | &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp | ||
| 1388 | |||
| 1389 | &mov ($rounds,&DWP(240,$key)); # key1->rounds | ||
| 1390 | &mov ($key_,$key); # backup $key | ||
| 1391 | &mov ($rounds_,$rounds); # backup $rounds | ||
| 1392 | |||
| 1393 | &movdqa ($tweak,$inout0); | ||
| 1394 | &pxor ($twtmp,$twtmp); | ||
| 1395 | &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 | ||
| 1396 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1397 | |||
| 1398 | &and ($len,-16); | ||
| 1399 | &sub ($len,16*6); | ||
| 1400 | &jc (&label("xts_dec_short")); | ||
| 1401 | |||
| 1402 | &shr ($rounds,1); | ||
| 1403 | &mov ($rounds_,$rounds); | ||
| 1404 | &jmp (&label("xts_dec_loop6")); | ||
| 1405 | |||
| 1406 | &set_label("xts_dec_loop6",16); | ||
| 1407 | for ($i=0;$i<4;$i++) { | ||
| 1408 | &pshufd ($twres,$twtmp,0x13); | ||
| 1409 | &pxor ($twtmp,$twtmp); | ||
| 1410 | &movdqa (&QWP(16*$i,"esp"),$tweak); | ||
| 1411 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1412 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1413 | &pcmpgtd ($twtmp,$tweak); # broadcast upper bits | ||
| 1414 | &pxor ($tweak,$twres); | ||
| 1415 | } | ||
| 1416 | &pshufd ($inout5,$twtmp,0x13); | ||
| 1417 | &movdqa (&QWP(16*$i++,"esp"),$tweak); | ||
| 1418 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1419 | &$movekey ($rndkey0,&QWP(0,$key_)); | ||
| 1420 | &pand ($inout5,$twmask); # isolate carry and residue | ||
| 1421 | &movups ($inout0,&QWP(0,$inp)); # load input | ||
| 1422 | &pxor ($inout5,$tweak); | ||
| 1423 | |||
| 1424 | # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] | ||
| 1425 | &movdqu ($inout1,&QWP(16*1,$inp)); | ||
| 1426 | &xorps ($inout0,$rndkey0); # input^=rndkey[0] | ||
| 1427 | &movdqu ($inout2,&QWP(16*2,$inp)); | ||
| 1428 | &pxor ($inout1,$rndkey0); | ||
| 1429 | &movdqu ($inout3,&QWP(16*3,$inp)); | ||
| 1430 | &pxor ($inout2,$rndkey0); | ||
| 1431 | &movdqu ($inout4,&QWP(16*4,$inp)); | ||
| 1432 | &pxor ($inout3,$rndkey0); | ||
| 1433 | &movdqu ($rndkey1,&QWP(16*5,$inp)); | ||
| 1434 | &pxor ($inout4,$rndkey0); | ||
| 1435 | &lea ($inp,&DWP(16*6,$inp)); | ||
| 1436 | &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1437 | &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak | ||
| 1438 | &pxor ($inout5,$rndkey1); | ||
| 1439 | |||
| 1440 | &$movekey ($rndkey1,&QWP(16,$key_)); | ||
| 1441 | &lea ($key,&DWP(32,$key_)); | ||
| 1442 | &pxor ($inout1,&QWP(16*1,"esp")); | ||
| 1443 | &aesdec ($inout0,$rndkey1); | ||
| 1444 | &pxor ($inout2,&QWP(16*2,"esp")); | ||
| 1445 | &aesdec ($inout1,$rndkey1); | ||
| 1446 | &pxor ($inout3,&QWP(16*3,"esp")); | ||
| 1447 | &dec ($rounds); | ||
| 1448 | &aesdec ($inout2,$rndkey1); | ||
| 1449 | &pxor ($inout4,&QWP(16*4,"esp")); | ||
| 1450 | &aesdec ($inout3,$rndkey1); | ||
| 1451 | &pxor ($inout5,$rndkey0); | ||
| 1452 | &aesdec ($inout4,$rndkey1); | ||
| 1453 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
| 1454 | &aesdec ($inout5,$rndkey1); | ||
| 1455 | &call (&label("_aesni_decrypt6_enter")); | ||
| 1456 | |||
| 1457 | &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak | ||
| 1458 | &pxor ($twtmp,$twtmp); | ||
| 1459 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1460 | &pcmpgtd ($twtmp,$tweak); # broadcast upper bits | ||
| 1461 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1462 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1463 | &xorps ($inout2,&QWP(16*2,"esp")); | ||
| 1464 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1465 | &xorps ($inout3,&QWP(16*3,"esp")); | ||
| 1466 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1467 | &xorps ($inout4,&QWP(16*4,"esp")); | ||
| 1468 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1469 | &xorps ($inout5,$tweak); | ||
| 1470 | &movups (&QWP(16*4,$out),$inout4); | ||
| 1471 | &pshufd ($twres,$twtmp,0x13); | ||
| 1472 | &movups (&QWP(16*5,$out),$inout5); | ||
| 1473 | &lea ($out,&DWP(16*6,$out)); | ||
| 1474 | &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 | ||
| 1475 | |||
| 1476 | &pxor ($twtmp,$twtmp); | ||
| 1477 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1478 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1479 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1480 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1481 | &pxor ($tweak,$twres); | ||
| 1482 | |||
| 1483 | &sub ($len,16*6); | ||
| 1484 | &jnc (&label("xts_dec_loop6")); | ||
| 1485 | |||
| 1486 | &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds | ||
| 1487 | &mov ($key,$key_); # restore $key | ||
| 1488 | &mov ($rounds_,$rounds); | ||
| 1489 | |||
| 1490 | &set_label("xts_dec_short"); | ||
| 1491 | &add ($len,16*6); | ||
| 1492 | &jz (&label("xts_dec_done6x")); | ||
| 1493 | |||
| 1494 | &movdqa ($inout3,$tweak); # put aside previous tweak | ||
| 1495 | &cmp ($len,0x20); | ||
| 1496 | &jb (&label("xts_dec_one")); | ||
| 1497 | |||
| 1498 | &pshufd ($twres,$twtmp,0x13); | ||
| 1499 | &pxor ($twtmp,$twtmp); | ||
| 1500 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1501 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1502 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1503 | &pxor ($tweak,$twres); | ||
| 1504 | &je (&label("xts_dec_two")); | ||
| 1505 | |||
| 1506 | &pshufd ($twres,$twtmp,0x13); | ||
| 1507 | &pxor ($twtmp,$twtmp); | ||
| 1508 | &movdqa ($inout4,$tweak); # put aside previous tweak | ||
| 1509 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1510 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1511 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1512 | &pxor ($tweak,$twres); | ||
| 1513 | &cmp ($len,0x40); | ||
| 1514 | &jb (&label("xts_dec_three")); | ||
| 1515 | |||
| 1516 | &pshufd ($twres,$twtmp,0x13); | ||
| 1517 | &pxor ($twtmp,$twtmp); | ||
| 1518 | &movdqa ($inout5,$tweak); # put aside previous tweak | ||
| 1519 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1520 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1521 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1522 | &pxor ($tweak,$twres); | ||
| 1523 | &movdqa (&QWP(16*0,"esp"),$inout3); | ||
| 1524 | &movdqa (&QWP(16*1,"esp"),$inout4); | ||
| 1525 | &je (&label("xts_dec_four")); | ||
| 1526 | |||
| 1527 | &movdqa (&QWP(16*2,"esp"),$inout5); | ||
| 1528 | &pshufd ($inout5,$twtmp,0x13); | ||
| 1529 | &movdqa (&QWP(16*3,"esp"),$tweak); | ||
| 1530 | &paddq ($tweak,$tweak); # &psllq($inout0,1); | ||
| 1531 | &pand ($inout5,$twmask); # isolate carry and residue | ||
| 1532 | &pxor ($inout5,$tweak); | ||
| 1533 | |||
| 1534 | &movdqu ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1535 | &movdqu ($inout1,&QWP(16*1,$inp)); | ||
| 1536 | &movdqu ($inout2,&QWP(16*2,$inp)); | ||
| 1537 | &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1538 | &movdqu ($inout3,&QWP(16*3,$inp)); | ||
| 1539 | &pxor ($inout1,&QWP(16*1,"esp")); | ||
| 1540 | &movdqu ($inout4,&QWP(16*4,$inp)); | ||
| 1541 | &pxor ($inout2,&QWP(16*2,"esp")); | ||
| 1542 | &lea ($inp,&DWP(16*5,$inp)); | ||
| 1543 | &pxor ($inout3,&QWP(16*3,"esp")); | ||
| 1544 | &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak | ||
| 1545 | &pxor ($inout4,$inout5); | ||
| 1546 | |||
| 1547 | &call ("_aesni_decrypt6"); | ||
| 1548 | |||
| 1549 | &movaps ($tweak,&QWP(16*4,"esp")); # last tweak | ||
| 1550 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1551 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1552 | &xorps ($inout2,&QWP(16*2,"esp")); | ||
| 1553 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1554 | &xorps ($inout3,&QWP(16*3,"esp")); | ||
| 1555 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1556 | &xorps ($inout4,$tweak); | ||
| 1557 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1558 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1559 | &movups (&QWP(16*4,$out),$inout4); | ||
| 1560 | &lea ($out,&DWP(16*5,$out)); | ||
| 1561 | &jmp (&label("xts_dec_done")); | ||
| 1562 | |||
| 1563 | &set_label("xts_dec_one",16); | ||
| 1564 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1565 | &lea ($inp,&DWP(16*1,$inp)); | ||
| 1566 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1567 | if ($inline) | ||
| 1568 | { &aesni_inline_generate1("dec"); } | ||
| 1569 | else | ||
| 1570 | { &call ("_aesni_decrypt1"); } | ||
| 1571 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1572 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1573 | &lea ($out,&DWP(16*1,$out)); | ||
| 1574 | |||
| 1575 | &movdqa ($tweak,$inout3); # last tweak | ||
| 1576 | &jmp (&label("xts_dec_done")); | ||
| 1577 | |||
| 1578 | &set_label("xts_dec_two",16); | ||
| 1579 | &movaps ($inout4,$tweak); # put aside last tweak | ||
| 1580 | |||
| 1581 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1582 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1583 | &lea ($inp,&DWP(16*2,$inp)); | ||
| 1584 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1585 | &xorps ($inout1,$inout4); | ||
| 1586 | |||
| 1587 | &call ("_aesni_decrypt3"); | ||
| 1588 | |||
| 1589 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1590 | &xorps ($inout1,$inout4); | ||
| 1591 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1592 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1593 | &lea ($out,&DWP(16*2,$out)); | ||
| 1594 | |||
| 1595 | &movdqa ($tweak,$inout4); # last tweak | ||
| 1596 | &jmp (&label("xts_dec_done")); | ||
| 1597 | |||
| 1598 | &set_label("xts_dec_three",16); | ||
| 1599 | &movaps ($inout5,$tweak); # put aside last tweak | ||
| 1600 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1601 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1602 | &movups ($inout2,&QWP(16*2,$inp)); | ||
| 1603 | &lea ($inp,&DWP(16*3,$inp)); | ||
| 1604 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1605 | &xorps ($inout1,$inout4); | ||
| 1606 | &xorps ($inout2,$inout5); | ||
| 1607 | |||
| 1608 | &call ("_aesni_decrypt3"); | ||
| 1609 | |||
| 1610 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1611 | &xorps ($inout1,$inout4); | ||
| 1612 | &xorps ($inout2,$inout5); | ||
| 1613 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1614 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1615 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1616 | &lea ($out,&DWP(16*3,$out)); | ||
| 1617 | |||
| 1618 | &movdqa ($tweak,$inout5); # last tweak | ||
| 1619 | &jmp (&label("xts_dec_done")); | ||
| 1620 | |||
| 1621 | &set_label("xts_dec_four",16); | ||
| 1622 | &movaps ($inout4,$tweak); # put aside last tweak | ||
| 1623 | |||
| 1624 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
| 1625 | &movups ($inout1,&QWP(16*1,$inp)); | ||
| 1626 | &movups ($inout2,&QWP(16*2,$inp)); | ||
| 1627 | &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
| 1628 | &movups ($inout3,&QWP(16*3,$inp)); | ||
| 1629 | &lea ($inp,&DWP(16*4,$inp)); | ||
| 1630 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1631 | &xorps ($inout2,$inout5); | ||
| 1632 | &xorps ($inout3,$inout4); | ||
| 1633 | |||
| 1634 | &call ("_aesni_decrypt4"); | ||
| 1635 | |||
| 1636 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
| 1637 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
| 1638 | &xorps ($inout2,$inout5); | ||
| 1639 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
| 1640 | &xorps ($inout3,$inout4); | ||
| 1641 | &movups (&QWP(16*1,$out),$inout1); | ||
| 1642 | &movups (&QWP(16*2,$out),$inout2); | ||
| 1643 | &movups (&QWP(16*3,$out),$inout3); | ||
| 1644 | &lea ($out,&DWP(16*4,$out)); | ||
| 1645 | |||
| 1646 | &movdqa ($tweak,$inout4); # last tweak | ||
| 1647 | &jmp (&label("xts_dec_done")); | ||
| 1648 | |||
| 1649 | &set_label("xts_dec_done6x",16); # $tweak is pre-calculated | ||
| 1650 | &mov ($len,&DWP(16*7+0,"esp")); # restore original $len | ||
| 1651 | &and ($len,15); | ||
| 1652 | &jz (&label("xts_dec_ret")); | ||
| 1653 | &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 | ||
| 1654 | &jmp (&label("xts_dec_only_one_more")); | ||
| 1655 | |||
| 1656 | &set_label("xts_dec_done",16); | ||
| 1657 | &mov ($len,&DWP(16*7+0,"esp")); # restore original $len | ||
| 1658 | &pxor ($twtmp,$twtmp); | ||
| 1659 | &and ($len,15); | ||
| 1660 | &jz (&label("xts_dec_ret")); | ||
| 1661 | |||
| 1662 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1663 | &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 | ||
| 1664 | &pshufd ($twres,$twtmp,0x13); | ||
| 1665 | &pxor ($twtmp,$twtmp); | ||
| 1666 | &movdqa ($twmask,&QWP(16*6,"esp")); | ||
| 1667 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1668 | &pand ($twres,$twmask); # isolate carry and residue | ||
| 1669 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
| 1670 | &pxor ($tweak,$twres); | ||
| 1671 | |||
| 1672 | &set_label("xts_dec_only_one_more"); | ||
| 1673 | &pshufd ($inout3,$twtmp,0x13); | ||
| 1674 | &movdqa ($inout4,$tweak); # put aside previous tweak | ||
| 1675 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
| 1676 | &pand ($inout3,$twmask); # isolate carry and residue | ||
| 1677 | &pxor ($inout3,$tweak); | ||
| 1678 | |||
| 1679 | &mov ($key,$key_); # restore $key | ||
| 1680 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1681 | |||
| 1682 | &movups ($inout0,&QWP(0,$inp)); # load input | ||
| 1683 | &xorps ($inout0,$inout3); # input^=tweak | ||
| 1684 | if ($inline) | ||
| 1685 | { &aesni_inline_generate1("dec"); } | ||
| 1686 | else | ||
| 1687 | { &call ("_aesni_decrypt1"); } | ||
| 1688 | &xorps ($inout0,$inout3); # output^=tweak | ||
| 1689 | &movups (&QWP(0,$out),$inout0); # write output | ||
| 1690 | |||
| 1691 | &set_label("xts_dec_steal"); | ||
| 1692 | &movz ($rounds,&BP(16,$inp)); | ||
| 1693 | &movz ($key,&BP(0,$out)); | ||
| 1694 | &lea ($inp,&DWP(1,$inp)); | ||
| 1695 | &mov (&BP(0,$out),&LB($rounds)); | ||
| 1696 | &mov (&BP(16,$out),&LB($key)); | ||
| 1697 | &lea ($out,&DWP(1,$out)); | ||
| 1698 | &sub ($len,1); | ||
| 1699 | &jnz (&label("xts_dec_steal")); | ||
| 1700 | |||
| 1701 | &sub ($out,&DWP(16*7+0,"esp")); # rewind $out | ||
| 1702 | &mov ($key,$key_); # restore $key | ||
| 1703 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1704 | |||
| 1705 | &movups ($inout0,&QWP(0,$out)); # load input | ||
| 1706 | &xorps ($inout0,$inout4); # input^=tweak | ||
| 1707 | if ($inline) | ||
| 1708 | { &aesni_inline_generate1("dec"); } | ||
| 1709 | else | ||
| 1710 | { &call ("_aesni_decrypt1"); } | ||
| 1711 | &xorps ($inout0,$inout4); # output^=tweak | ||
| 1712 | &movups (&QWP(0,$out),$inout0); # write output | ||
| 1713 | |||
| 1714 | &set_label("xts_dec_ret"); | ||
| 1715 | &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp | ||
| 1716 | &function_end("aesni_xts_decrypt"); | ||
| 1717 | } | ||
| 1718 | } | ||
| 1719 | |||
| 1720 | ###################################################################### | ||
| 1721 | # void $PREFIX_cbc_encrypt (const void *inp, void *out, | ||
| 1722 | # size_t length, const AES_KEY *key, | ||
| 1723 | # unsigned char *ivp,const int enc); | ||
| 1724 | &function_begin("${PREFIX}_cbc_encrypt"); | ||
| 1725 | &mov ($inp,&wparam(0)); | ||
| 1726 | &mov ($rounds_,"esp"); | ||
| 1727 | &mov ($out,&wparam(1)); | ||
| 1728 | &sub ($rounds_,24); | ||
| 1729 | &mov ($len,&wparam(2)); | ||
| 1730 | &and ($rounds_,-16); | ||
| 1731 | &mov ($key,&wparam(3)); | ||
| 1732 | &mov ($key_,&wparam(4)); | ||
| 1733 | &test ($len,$len); | ||
| 1734 | &jz (&label("cbc_abort")); | ||
| 1735 | |||
| 1736 | &cmp (&wparam(5),0); | ||
| 1737 | &xchg ($rounds_,"esp"); # alloca | ||
| 1738 | &movups ($ivec,&QWP(0,$key_)); # load IV | ||
| 1739 | &mov ($rounds,&DWP(240,$key)); | ||
| 1740 | &mov ($key_,$key); # backup $key | ||
| 1741 | &mov (&DWP(16,"esp"),$rounds_); # save original %esp | ||
| 1742 | &mov ($rounds_,$rounds); # backup $rounds | ||
| 1743 | &je (&label("cbc_decrypt")); | ||
| 1744 | |||
| 1745 | &movaps ($inout0,$ivec); | ||
| 1746 | &cmp ($len,16); | ||
| 1747 | &jb (&label("cbc_enc_tail")); | ||
| 1748 | &sub ($len,16); | ||
| 1749 | &jmp (&label("cbc_enc_loop")); | ||
| 1750 | |||
| 1751 | &set_label("cbc_enc_loop",16); | ||
| 1752 | &movups ($ivec,&QWP(0,$inp)); # input actually | ||
| 1753 | &lea ($inp,&DWP(16,$inp)); | ||
| 1754 | if ($inline) | ||
| 1755 | { &aesni_inline_generate1("enc",$inout0,$ivec); } | ||
| 1756 | else | ||
| 1757 | { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } | ||
| 1758 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1759 | &mov ($key,$key_); # restore $key | ||
| 1760 | &movups (&QWP(0,$out),$inout0); # store output | ||
| 1761 | &lea ($out,&DWP(16,$out)); | ||
| 1762 | &sub ($len,16); | ||
| 1763 | &jnc (&label("cbc_enc_loop")); | ||
| 1764 | &add ($len,16); | ||
| 1765 | &jnz (&label("cbc_enc_tail")); | ||
| 1766 | &movaps ($ivec,$inout0); | ||
| 1767 | &jmp (&label("cbc_ret")); | ||
| 1768 | |||
| 1769 | &set_label("cbc_enc_tail"); | ||
| 1770 | &mov ("ecx",$len); # zaps $rounds | ||
| 1771 | &data_word(0xA4F3F689); # rep movsb | ||
| 1772 | &mov ("ecx",16); # zero tail | ||
| 1773 | &sub ("ecx",$len); | ||
| 1774 | &xor ("eax","eax"); # zaps $len | ||
| 1775 | &data_word(0xAAF3F689); # rep stosb | ||
| 1776 | &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block | ||
| 1777 | &mov ($rounds,$rounds_); # restore $rounds | ||
| 1778 | &mov ($inp,$out); # $inp and $out are the same | ||
| 1779 | &mov ($key,$key_); # restore $key | ||
| 1780 | &jmp (&label("cbc_enc_loop")); | ||
| 1781 | ###################################################################### | ||
| 1782 | &set_label("cbc_decrypt",16); | ||
| 1783 | &cmp ($len,0x50); | ||
| 1784 | &jbe (&label("cbc_dec_tail")); | ||
| 1785 | &movaps (&QWP(0,"esp"),$ivec); # save IV | ||
| 1786 | &sub ($len,0x50); | ||
| 1787 | &jmp (&label("cbc_dec_loop6_enter")); | ||
| 1788 | |||
| 1789 | &set_label("cbc_dec_loop6",16); | ||
| 1790 | &movaps (&QWP(0,"esp"),$rndkey0); # save IV | ||
| 1791 | &movups (&QWP(0,$out),$inout5); | ||
| 1792 | &lea ($out,&DWP(0x10,$out)); | ||
| 1793 | &set_label("cbc_dec_loop6_enter"); | ||
| 1794 | &movdqu ($inout0,&QWP(0,$inp)); | ||
| 1795 | &movdqu ($inout1,&QWP(0x10,$inp)); | ||
| 1796 | &movdqu ($inout2,&QWP(0x20,$inp)); | ||
| 1797 | &movdqu ($inout3,&QWP(0x30,$inp)); | ||
| 1798 | &movdqu ($inout4,&QWP(0x40,$inp)); | ||
| 1799 | &movdqu ($inout5,&QWP(0x50,$inp)); | ||
| 1800 | |||
| 1801 | &call ("_aesni_decrypt6"); | ||
| 1802 | |||
| 1803 | &movups ($rndkey1,&QWP(0,$inp)); | ||
| 1804 | &movups ($rndkey0,&QWP(0x10,$inp)); | ||
| 1805 | &xorps ($inout0,&QWP(0,"esp")); # ^=IV | ||
| 1806 | &xorps ($inout1,$rndkey1); | ||
| 1807 | &movups ($rndkey1,&QWP(0x20,$inp)); | ||
| 1808 | &xorps ($inout2,$rndkey0); | ||
| 1809 | &movups ($rndkey0,&QWP(0x30,$inp)); | ||
| 1810 | &xorps ($inout3,$rndkey1); | ||
| 1811 | &movups ($rndkey1,&QWP(0x40,$inp)); | ||
| 1812 | &xorps ($inout4,$rndkey0); | ||
| 1813 | &movups ($rndkey0,&QWP(0x50,$inp)); # IV | ||
| 1814 | &xorps ($inout5,$rndkey1); | ||
| 1815 | &movups (&QWP(0,$out),$inout0); | ||
| 1816 | &movups (&QWP(0x10,$out),$inout1); | ||
| 1817 | &lea ($inp,&DWP(0x60,$inp)); | ||
| 1818 | &movups (&QWP(0x20,$out),$inout2); | ||
| 1819 | &mov ($rounds,$rounds_) # restore $rounds | ||
| 1820 | &movups (&QWP(0x30,$out),$inout3); | ||
| 1821 | &mov ($key,$key_); # restore $key | ||
| 1822 | &movups (&QWP(0x40,$out),$inout4); | ||
| 1823 | &lea ($out,&DWP(0x50,$out)); | ||
| 1824 | &sub ($len,0x60); | ||
| 1825 | &ja (&label("cbc_dec_loop6")); | ||
| 1826 | |||
| 1827 | &movaps ($inout0,$inout5); | ||
| 1828 | &movaps ($ivec,$rndkey0); | ||
| 1829 | &add ($len,0x50); | ||
| 1830 | &jle (&label("cbc_dec_tail_collected")); | ||
| 1831 | &movups (&QWP(0,$out),$inout0); | ||
| 1832 | &lea ($out,&DWP(0x10,$out)); | ||
| 1833 | &set_label("cbc_dec_tail"); | ||
| 1834 | &movups ($inout0,&QWP(0,$inp)); | ||
| 1835 | &movaps ($in0,$inout0); | ||
| 1836 | &cmp ($len,0x10); | ||
| 1837 | &jbe (&label("cbc_dec_one")); | ||
| 1838 | |||
| 1839 | &movups ($inout1,&QWP(0x10,$inp)); | ||
| 1840 | &movaps ($in1,$inout1); | ||
| 1841 | &cmp ($len,0x20); | ||
| 1842 | &jbe (&label("cbc_dec_two")); | ||
| 1843 | |||
| 1844 | &movups ($inout2,&QWP(0x20,$inp)); | ||
| 1845 | &cmp ($len,0x30); | ||
| 1846 | &jbe (&label("cbc_dec_three")); | ||
| 1847 | |||
| 1848 | &movups ($inout3,&QWP(0x30,$inp)); | ||
| 1849 | &cmp ($len,0x40); | ||
| 1850 | &jbe (&label("cbc_dec_four")); | ||
| 1851 | |||
| 1852 | &movups ($inout4,&QWP(0x40,$inp)); | ||
| 1853 | &movaps (&QWP(0,"esp"),$ivec); # save IV | ||
| 1854 | &movups ($inout0,&QWP(0,$inp)); | ||
| 1855 | &xorps ($inout5,$inout5); | ||
| 1856 | &call ("_aesni_decrypt6"); | ||
| 1857 | &movups ($rndkey1,&QWP(0,$inp)); | ||
| 1858 | &movups ($rndkey0,&QWP(0x10,$inp)); | ||
| 1859 | &xorps ($inout0,&QWP(0,"esp")); # ^= IV | ||
| 1860 | &xorps ($inout1,$rndkey1); | ||
| 1861 | &movups ($rndkey1,&QWP(0x20,$inp)); | ||
| 1862 | &xorps ($inout2,$rndkey0); | ||
| 1863 | &movups ($rndkey0,&QWP(0x30,$inp)); | ||
| 1864 | &xorps ($inout3,$rndkey1); | ||
| 1865 | &movups ($ivec,&QWP(0x40,$inp)); # IV | ||
| 1866 | &xorps ($inout4,$rndkey0); | ||
| 1867 | &movups (&QWP(0,$out),$inout0); | ||
| 1868 | &movups (&QWP(0x10,$out),$inout1); | ||
| 1869 | &movups (&QWP(0x20,$out),$inout2); | ||
| 1870 | &movups (&QWP(0x30,$out),$inout3); | ||
| 1871 | &lea ($out,&DWP(0x40,$out)); | ||
| 1872 | &movaps ($inout0,$inout4); | ||
| 1873 | &sub ($len,0x50); | ||
| 1874 | &jmp (&label("cbc_dec_tail_collected")); | ||
| 1875 | |||
| 1876 | &set_label("cbc_dec_one",16); | ||
| 1877 | if ($inline) | ||
| 1878 | { &aesni_inline_generate1("dec"); } | ||
| 1879 | else | ||
| 1880 | { &call ("_aesni_decrypt1"); } | ||
| 1881 | &xorps ($inout0,$ivec); | ||
| 1882 | &movaps ($ivec,$in0); | ||
| 1883 | &sub ($len,0x10); | ||
| 1884 | &jmp (&label("cbc_dec_tail_collected")); | ||
| 1885 | |||
| 1886 | &set_label("cbc_dec_two",16); | ||
| 1887 | &xorps ($inout2,$inout2); | ||
| 1888 | &call ("_aesni_decrypt3"); | ||
| 1889 | &xorps ($inout0,$ivec); | ||
| 1890 | &xorps ($inout1,$in0); | ||
| 1891 | &movups (&QWP(0,$out),$inout0); | ||
| 1892 | &movaps ($inout0,$inout1); | ||
| 1893 | &lea ($out,&DWP(0x10,$out)); | ||
| 1894 | &movaps ($ivec,$in1); | ||
| 1895 | &sub ($len,0x20); | ||
| 1896 | &jmp (&label("cbc_dec_tail_collected")); | ||
| 1897 | |||
| 1898 | &set_label("cbc_dec_three",16); | ||
| 1899 | &call ("_aesni_decrypt3"); | ||
| 1900 | &xorps ($inout0,$ivec); | ||
| 1901 | &xorps ($inout1,$in0); | ||
| 1902 | &xorps ($inout2,$in1); | ||
| 1903 | &movups (&QWP(0,$out),$inout0); | ||
| 1904 | &movaps ($inout0,$inout2); | ||
| 1905 | &movups (&QWP(0x10,$out),$inout1); | ||
| 1906 | &lea ($out,&DWP(0x20,$out)); | ||
| 1907 | &movups ($ivec,&QWP(0x20,$inp)); | ||
| 1908 | &sub ($len,0x30); | ||
| 1909 | &jmp (&label("cbc_dec_tail_collected")); | ||
| 1910 | |||
| 1911 | &set_label("cbc_dec_four",16); | ||
| 1912 | &call ("_aesni_decrypt4"); | ||
| 1913 | &movups ($rndkey1,&QWP(0x10,$inp)); | ||
| 1914 | &movups ($rndkey0,&QWP(0x20,$inp)); | ||
| 1915 | &xorps ($inout0,$ivec); | ||
| 1916 | &movups ($ivec,&QWP(0x30,$inp)); | ||
| 1917 | &xorps ($inout1,$in0); | ||
| 1918 | &movups (&QWP(0,$out),$inout0); | ||
| 1919 | &xorps ($inout2,$rndkey1); | ||
| 1920 | &movups (&QWP(0x10,$out),$inout1); | ||
| 1921 | &xorps ($inout3,$rndkey0); | ||
| 1922 | &movups (&QWP(0x20,$out),$inout2); | ||
| 1923 | &lea ($out,&DWP(0x30,$out)); | ||
| 1924 | &movaps ($inout0,$inout3); | ||
| 1925 | &sub ($len,0x40); | ||
| 1926 | |||
| 1927 | &set_label("cbc_dec_tail_collected"); | ||
| 1928 | &and ($len,15); | ||
| 1929 | &jnz (&label("cbc_dec_tail_partial")); | ||
| 1930 | &movups (&QWP(0,$out),$inout0); | ||
| 1931 | &jmp (&label("cbc_ret")); | ||
| 1932 | |||
| 1933 | &set_label("cbc_dec_tail_partial",16); | ||
| 1934 | &movaps (&QWP(0,"esp"),$inout0); | ||
| 1935 | &mov ("ecx",16); | ||
| 1936 | &mov ($inp,"esp"); | ||
| 1937 | &sub ("ecx",$len); | ||
| 1938 | &data_word(0xA4F3F689); # rep movsb | ||
| 1939 | |||
| 1940 | &set_label("cbc_ret"); | ||
| 1941 | &mov ("esp",&DWP(16,"esp")); # pull original %esp | ||
| 1942 | &mov ($key_,&wparam(4)); | ||
| 1943 | &movups (&QWP(0,$key_),$ivec); # output IV | ||
| 1944 | &set_label("cbc_abort"); | ||
| 1945 | &function_end("${PREFIX}_cbc_encrypt"); | ||
| 1946 | |||
| 1947 | ###################################################################### | ||
| 1948 | # Mechanical port from aesni-x86_64.pl. | ||
| 1949 | # | ||
| 1950 | # _aesni_set_encrypt_key is private interface, | ||
| 1951 | # input: | ||
| 1952 | # "eax" const unsigned char *userKey | ||
| 1953 | # $rounds int bits | ||
| 1954 | # $key AES_KEY *key | ||
| 1955 | # output: | ||
| 1956 | # "eax" return code | ||
| 1957 | # $round rounds | ||
| 1958 | |||
| 1959 | &function_begin_B("_aesni_set_encrypt_key"); | ||
| 1960 | &test ("eax","eax"); | ||
| 1961 | &jz (&label("bad_pointer")); | ||
| 1962 | &test ($key,$key); | ||
| 1963 | &jz (&label("bad_pointer")); | ||
| 1964 | |||
| 1965 | &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey | ||
| 1966 | &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 | ||
| 1967 | &lea ($key,&DWP(16,$key)); | ||
| 1968 | &cmp ($rounds,256); | ||
| 1969 | &je (&label("14rounds")); | ||
| 1970 | &cmp ($rounds,192); | ||
| 1971 | &je (&label("12rounds")); | ||
| 1972 | &cmp ($rounds,128); | ||
| 1973 | &jne (&label("bad_keybits")); | ||
| 1974 | |||
| 1975 | &set_label("10rounds",16); | ||
| 1976 | &mov ($rounds,9); | ||
| 1977 | &$movekey (&QWP(-16,$key),"xmm0"); # round 0 | ||
| 1978 | &aeskeygenassist("xmm1","xmm0",0x01); # round 1 | ||
| 1979 | &call (&label("key_128_cold")); | ||
| 1980 | &aeskeygenassist("xmm1","xmm0",0x2); # round 2 | ||
| 1981 | &call (&label("key_128")); | ||
| 1982 | &aeskeygenassist("xmm1","xmm0",0x04); # round 3 | ||
| 1983 | &call (&label("key_128")); | ||
| 1984 | &aeskeygenassist("xmm1","xmm0",0x08); # round 4 | ||
| 1985 | &call (&label("key_128")); | ||
| 1986 | &aeskeygenassist("xmm1","xmm0",0x10); # round 5 | ||
| 1987 | &call (&label("key_128")); | ||
| 1988 | &aeskeygenassist("xmm1","xmm0",0x20); # round 6 | ||
| 1989 | &call (&label("key_128")); | ||
| 1990 | &aeskeygenassist("xmm1","xmm0",0x40); # round 7 | ||
| 1991 | &call (&label("key_128")); | ||
| 1992 | &aeskeygenassist("xmm1","xmm0",0x80); # round 8 | ||
| 1993 | &call (&label("key_128")); | ||
| 1994 | &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 | ||
| 1995 | &call (&label("key_128")); | ||
| 1996 | &aeskeygenassist("xmm1","xmm0",0x36); # round 10 | ||
| 1997 | &call (&label("key_128")); | ||
| 1998 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 1999 | &mov (&DWP(80,$key),$rounds); | ||
| 2000 | &xor ("eax","eax"); | ||
| 2001 | &ret(); | ||
| 2002 | |||
| 2003 | &set_label("key_128",16); | ||
| 2004 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2005 | &lea ($key,&DWP(16,$key)); | ||
| 2006 | &set_label("key_128_cold"); | ||
| 2007 | &shufps ("xmm4","xmm0",0b00010000); | ||
| 2008 | &xorps ("xmm0","xmm4"); | ||
| 2009 | &shufps ("xmm4","xmm0",0b10001100); | ||
| 2010 | &xorps ("xmm0","xmm4"); | ||
| 2011 | &shufps ("xmm1","xmm1",0b11111111); # critical path | ||
| 2012 | &xorps ("xmm0","xmm1"); | ||
| 2013 | &ret(); | ||
| 2014 | |||
| 2015 | &set_label("12rounds",16); | ||
| 2016 | &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey | ||
| 2017 | &mov ($rounds,11); | ||
| 2018 | &$movekey (&QWP(-16,$key),"xmm0") # round 0 | ||
| 2019 | &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 | ||
| 2020 | &call (&label("key_192a_cold")); | ||
| 2021 | &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 | ||
| 2022 | &call (&label("key_192b")); | ||
| 2023 | &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 | ||
| 2024 | &call (&label("key_192a")); | ||
| 2025 | &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 | ||
| 2026 | &call (&label("key_192b")); | ||
| 2027 | &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 | ||
| 2028 | &call (&label("key_192a")); | ||
| 2029 | &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 | ||
| 2030 | &call (&label("key_192b")); | ||
| 2031 | &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 | ||
| 2032 | &call (&label("key_192a")); | ||
| 2033 | &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 | ||
| 2034 | &call (&label("key_192b")); | ||
| 2035 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2036 | &mov (&DWP(48,$key),$rounds); | ||
| 2037 | &xor ("eax","eax"); | ||
| 2038 | &ret(); | ||
| 2039 | |||
| 2040 | &set_label("key_192a",16); | ||
| 2041 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2042 | &lea ($key,&DWP(16,$key)); | ||
| 2043 | &set_label("key_192a_cold",16); | ||
| 2044 | &movaps ("xmm5","xmm2"); | ||
| 2045 | &set_label("key_192b_warm"); | ||
| 2046 | &shufps ("xmm4","xmm0",0b00010000); | ||
| 2047 | &movdqa ("xmm3","xmm2"); | ||
| 2048 | &xorps ("xmm0","xmm4"); | ||
| 2049 | &shufps ("xmm4","xmm0",0b10001100); | ||
| 2050 | &pslldq ("xmm3",4); | ||
| 2051 | &xorps ("xmm0","xmm4"); | ||
| 2052 | &pshufd ("xmm1","xmm1",0b01010101); # critical path | ||
| 2053 | &pxor ("xmm2","xmm3"); | ||
| 2054 | &pxor ("xmm0","xmm1"); | ||
| 2055 | &pshufd ("xmm3","xmm0",0b11111111); | ||
| 2056 | &pxor ("xmm2","xmm3"); | ||
| 2057 | &ret(); | ||
| 2058 | |||
| 2059 | &set_label("key_192b",16); | ||
| 2060 | &movaps ("xmm3","xmm0"); | ||
| 2061 | &shufps ("xmm5","xmm0",0b01000100); | ||
| 2062 | &$movekey (&QWP(0,$key),"xmm5"); | ||
| 2063 | &shufps ("xmm3","xmm2",0b01001110); | ||
| 2064 | &$movekey (&QWP(16,$key),"xmm3"); | ||
| 2065 | &lea ($key,&DWP(32,$key)); | ||
| 2066 | &jmp (&label("key_192b_warm")); | ||
| 2067 | |||
| 2068 | &set_label("14rounds",16); | ||
| 2069 | &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey | ||
| 2070 | &mov ($rounds,13); | ||
| 2071 | &lea ($key,&DWP(16,$key)); | ||
| 2072 | &$movekey (&QWP(-32,$key),"xmm0"); # round 0 | ||
| 2073 | &$movekey (&QWP(-16,$key),"xmm2"); # round 1 | ||
| 2074 | &aeskeygenassist("xmm1","xmm2",0x01); # round 2 | ||
| 2075 | &call (&label("key_256a_cold")); | ||
| 2076 | &aeskeygenassist("xmm1","xmm0",0x01); # round 3 | ||
| 2077 | &call (&label("key_256b")); | ||
| 2078 | &aeskeygenassist("xmm1","xmm2",0x02); # round 4 | ||
| 2079 | &call (&label("key_256a")); | ||
| 2080 | &aeskeygenassist("xmm1","xmm0",0x02); # round 5 | ||
| 2081 | &call (&label("key_256b")); | ||
| 2082 | &aeskeygenassist("xmm1","xmm2",0x04); # round 6 | ||
| 2083 | &call (&label("key_256a")); | ||
| 2084 | &aeskeygenassist("xmm1","xmm0",0x04); # round 7 | ||
| 2085 | &call (&label("key_256b")); | ||
| 2086 | &aeskeygenassist("xmm1","xmm2",0x08); # round 8 | ||
| 2087 | &call (&label("key_256a")); | ||
| 2088 | &aeskeygenassist("xmm1","xmm0",0x08); # round 9 | ||
| 2089 | &call (&label("key_256b")); | ||
| 2090 | &aeskeygenassist("xmm1","xmm2",0x10); # round 10 | ||
| 2091 | &call (&label("key_256a")); | ||
| 2092 | &aeskeygenassist("xmm1","xmm0",0x10); # round 11 | ||
| 2093 | &call (&label("key_256b")); | ||
| 2094 | &aeskeygenassist("xmm1","xmm2",0x20); # round 12 | ||
| 2095 | &call (&label("key_256a")); | ||
| 2096 | &aeskeygenassist("xmm1","xmm0",0x20); # round 13 | ||
| 2097 | &call (&label("key_256b")); | ||
| 2098 | &aeskeygenassist("xmm1","xmm2",0x40); # round 14 | ||
| 2099 | &call (&label("key_256a")); | ||
| 2100 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2101 | &mov (&DWP(16,$key),$rounds); | ||
| 2102 | &xor ("eax","eax"); | ||
| 2103 | &ret(); | ||
| 2104 | |||
| 2105 | &set_label("key_256a",16); | ||
| 2106 | &$movekey (&QWP(0,$key),"xmm2"); | ||
| 2107 | &lea ($key,&DWP(16,$key)); | ||
| 2108 | &set_label("key_256a_cold"); | ||
| 2109 | &shufps ("xmm4","xmm0",0b00010000); | ||
| 2110 | &xorps ("xmm0","xmm4"); | ||
| 2111 | &shufps ("xmm4","xmm0",0b10001100); | ||
| 2112 | &xorps ("xmm0","xmm4"); | ||
| 2113 | &shufps ("xmm1","xmm1",0b11111111); # critical path | ||
| 2114 | &xorps ("xmm0","xmm1"); | ||
| 2115 | &ret(); | ||
| 2116 | |||
| 2117 | &set_label("key_256b",16); | ||
| 2118 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2119 | &lea ($key,&DWP(16,$key)); | ||
| 2120 | |||
| 2121 | &shufps ("xmm4","xmm2",0b00010000); | ||
| 2122 | &xorps ("xmm2","xmm4"); | ||
| 2123 | &shufps ("xmm4","xmm2",0b10001100); | ||
| 2124 | &xorps ("xmm2","xmm4"); | ||
| 2125 | &shufps ("xmm1","xmm1",0b10101010); # critical path | ||
| 2126 | &xorps ("xmm2","xmm1"); | ||
| 2127 | &ret(); | ||
| 2128 | |||
| 2129 | &set_label("bad_pointer",4); | ||
| 2130 | &mov ("eax",-1); | ||
| 2131 | &ret (); | ||
| 2132 | &set_label("bad_keybits",4); | ||
| 2133 | &mov ("eax",-2); | ||
| 2134 | &ret (); | ||
| 2135 | &function_end_B("_aesni_set_encrypt_key"); | ||
| 2136 | |||
| 2137 | # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, | ||
| 2138 | # AES_KEY *key) | ||
| 2139 | &function_begin_B("${PREFIX}_set_encrypt_key"); | ||
| 2140 | &mov ("eax",&wparam(0)); | ||
| 2141 | &mov ($rounds,&wparam(1)); | ||
| 2142 | &mov ($key,&wparam(2)); | ||
| 2143 | &call ("_aesni_set_encrypt_key"); | ||
| 2144 | &ret (); | ||
| 2145 | &function_end_B("${PREFIX}_set_encrypt_key"); | ||
| 2146 | |||
| 2147 | # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, | ||
| 2148 | # AES_KEY *key) | ||
| 2149 | &function_begin_B("${PREFIX}_set_decrypt_key"); | ||
| 2150 | &mov ("eax",&wparam(0)); | ||
| 2151 | &mov ($rounds,&wparam(1)); | ||
| 2152 | &mov ($key,&wparam(2)); | ||
| 2153 | &call ("_aesni_set_encrypt_key"); | ||
| 2154 | &mov ($key,&wparam(2)); | ||
| 2155 | &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key | ||
| 2156 | &test ("eax","eax"); | ||
| 2157 | &jnz (&label("dec_key_ret")); | ||
| 2158 | &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule | ||
| 2159 | |||
| 2160 | &$movekey ("xmm0",&QWP(0,$key)); # just swap | ||
| 2161 | &$movekey ("xmm1",&QWP(0,"eax")); | ||
| 2162 | &$movekey (&QWP(0,"eax"),"xmm0"); | ||
| 2163 | &$movekey (&QWP(0,$key),"xmm1"); | ||
| 2164 | &lea ($key,&DWP(16,$key)); | ||
| 2165 | &lea ("eax",&DWP(-16,"eax")); | ||
| 2166 | |||
| 2167 | &set_label("dec_key_inverse"); | ||
| 2168 | &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse | ||
| 2169 | &$movekey ("xmm1",&QWP(0,"eax")); | ||
| 2170 | &aesimc ("xmm0","xmm0"); | ||
| 2171 | &aesimc ("xmm1","xmm1"); | ||
| 2172 | &lea ($key,&DWP(16,$key)); | ||
| 2173 | &lea ("eax",&DWP(-16,"eax")); | ||
| 2174 | &$movekey (&QWP(16,"eax"),"xmm0"); | ||
| 2175 | &$movekey (&QWP(-16,$key),"xmm1"); | ||
| 2176 | &cmp ("eax",$key); | ||
| 2177 | &ja (&label("dec_key_inverse")); | ||
| 2178 | |||
| 2179 | &$movekey ("xmm0",&QWP(0,$key)); # inverse middle | ||
| 2180 | &aesimc ("xmm0","xmm0"); | ||
| 2181 | &$movekey (&QWP(0,$key),"xmm0"); | ||
| 2182 | |||
| 2183 | &xor ("eax","eax"); # return success | ||
| 2184 | &set_label("dec_key_ret"); | ||
| 2185 | &ret (); | ||
| 2186 | &function_end_B("${PREFIX}_set_decrypt_key"); | ||
| 2187 | &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 2188 | |||
| 2189 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl new file mode 100644 index 0000000000..c9c6312fa7 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl | |||
| @@ -0,0 +1,3044 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | ################################################################### | ||
| 4 | ### AES-128 [originally in CTR mode] ### | ||
| 5 | ### bitsliced implementation for Intel Core 2 processors ### | ||
| 6 | ### requires support of SSE extensions up to SSSE3 ### | ||
| 7 | ### Author: Emilia Käsper and Peter Schwabe ### | ||
| 8 | ### Date: 2009-03-19 ### | ||
| 9 | ### Public domain ### | ||
| 10 | ### ### | ||
| 11 | ### See http://homes.esat.kuleuven.be/~ekasper/#software for ### | ||
| 12 | ### further information. ### | ||
| 13 | ################################################################### | ||
| 14 | # | ||
| 15 | # September 2011. | ||
| 16 | # | ||
| 17 | # Started as transliteration to "perlasm" the original code has | ||
| 18 | # undergone following changes: | ||
| 19 | # | ||
| 20 | # - code was made position-independent; | ||
| 21 | # - rounds were folded into a loop resulting in >5x size reduction | ||
| 22 | # from 12.5KB to 2.2KB; | ||
| 23 | # - above was possibile thanks to mixcolumns() modification that | ||
| 24 | # allowed to feed its output back to aesenc[last], this was | ||
| 25 | # achieved at cost of two additional inter-registers moves; | ||
| 26 | # - some instruction reordering and interleaving; | ||
| 27 | # - this module doesn't implement key setup subroutine, instead it | ||
| 28 | # relies on conversion of "conventional" key schedule as returned | ||
| 29 | # by AES_set_encrypt_key (see discussion below); | ||
| 30 | # - first and last round keys are treated differently, which allowed | ||
| 31 | # to skip one shiftrows(), reduce bit-sliced key schedule and | ||
| 32 | # speed-up conversion by 22%; | ||
| 33 | # - support for 192- and 256-bit keys was added; | ||
| 34 | # | ||
| 35 | # Resulting performance in CPU cycles spent to encrypt one byte out | ||
| 36 | # of 4096-byte buffer with 128-bit key is: | ||
| 37 | # | ||
| 38 | # Emilia's this(*) difference | ||
| 39 | # | ||
| 40 | # Core 2 9.30 8.69 +7% | ||
| 41 | # Nehalem(**) 7.63 6.98 +9% | ||
| 42 | # Atom 17.1 17.4 -2%(***) | ||
| 43 | # | ||
| 44 | # (*) Comparison is not completely fair, because "this" is ECB, | ||
| 45 | # i.e. no extra processing such as counter values calculation | ||
| 46 | # and xor-ing input as in Emilia's CTR implementation is | ||
| 47 | # performed. However, the CTR calculations stand for not more | ||
| 48 | # than 1% of total time, so comparison is *rather* fair. | ||
| 49 | # | ||
| 50 | # (**) Results were collected on Westmere, which is considered to | ||
| 51 | # be equivalent to Nehalem for this code. | ||
| 52 | # | ||
| 53 | # (***) Slowdown on Atom is rather strange per se, because original | ||
| 54 | # implementation has a number of 9+-bytes instructions, which | ||
| 55 | # are bad for Atom front-end, and which I eliminated completely. | ||
| 56 | # In attempt to address deterioration sbox() was tested in FP | ||
| 57 | # SIMD "domain" (movaps instead of movdqa, xorps instead of | ||
| 58 | # pxor, etc.). While it resulted in nominal 4% improvement on | ||
| 59 | # Atom, it hurted Westmere by more than 2x factor. | ||
| 60 | # | ||
| 61 | # As for key schedule conversion subroutine. Interface to OpenSSL | ||
| 62 | # relies on per-invocation on-the-fly conversion. This naturally | ||
| 63 | # has impact on performance, especially for short inputs. Conversion | ||
| 64 | # time in CPU cycles and its ratio to CPU cycles spent in 8x block | ||
| 65 | # function is: | ||
| 66 | # | ||
| 67 | # conversion conversion/8x block | ||
| 68 | # Core 2 240 0.22 | ||
| 69 | # Nehalem 180 0.20 | ||
| 70 | # Atom 430 0.19 | ||
| 71 | # | ||
| 72 | # The ratio values mean that 128-byte blocks will be processed | ||
| 73 | # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, | ||
| 74 | # etc. Then keep in mind that input sizes not divisible by 128 are | ||
| 75 | # *effectively* slower, especially shortest ones, e.g. consecutive | ||
| 76 | # 144-byte blocks are processed 44% slower than one would expect, | ||
| 77 | # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" | ||
| 78 | # it's still faster than ["hyper-threading-safe" code path in] | ||
| 79 | # aes-x86_64.pl on all lengths above 64 bytes... | ||
| 80 | # | ||
| 81 | # October 2011. | ||
| 82 | # | ||
| 83 | # Add decryption procedure. Performance in CPU cycles spent to decrypt | ||
| 84 | # one byte out of 4096-byte buffer with 128-bit key is: | ||
| 85 | # | ||
| 86 | # Core 2 11.0 | ||
| 87 | # Nehalem 9.16 | ||
| 88 | # Atom 20.9 | ||
| 89 | # | ||
| 90 | # November 2011. | ||
| 91 | # | ||
| 92 | # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is | ||
| 93 | # suboptimal, but XTS is meant to be used with larger blocks... | ||
| 94 | # | ||
| 95 | # <appro@openssl.org> | ||
| 96 | |||
| 97 | $flavour = shift; | ||
| 98 | $output = shift; | ||
| 99 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 100 | |||
| 101 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 102 | |||
| 103 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 104 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 105 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 106 | die "can't locate x86_64-xlate.pl"; | ||
| 107 | |||
| 108 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 109 | |||
| 110 | my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); | ||
| 111 | my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) | ||
| 112 | my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... | ||
| 113 | |||
| 114 | { | ||
| 115 | my ($key,$rounds,$const)=("%rax","%r10d","%r11"); | ||
| 116 | |||
| 117 | sub Sbox { | ||
| 118 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 119 | # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb | ||
| 120 | my @b=@_[0..7]; | ||
| 121 | my @t=@_[8..11]; | ||
| 122 | my @s=@_[12..15]; | ||
| 123 | &InBasisChange (@b); | ||
| 124 | &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); | ||
| 125 | &OutBasisChange (@b[7,1,4,2,6,5,0,3]); | ||
| 126 | } | ||
| 127 | |||
| 128 | sub InBasisChange { | ||
| 129 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 130 | # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb | ||
| 131 | my @b=@_[0..7]; | ||
| 132 | $code.=<<___; | ||
| 133 | pxor @b[6], @b[5] | ||
| 134 | pxor @b[1], @b[2] | ||
| 135 | pxor @b[0], @b[3] | ||
| 136 | pxor @b[2], @b[6] | ||
| 137 | pxor @b[0], @b[5] | ||
| 138 | |||
| 139 | pxor @b[3], @b[6] | ||
| 140 | pxor @b[7], @b[3] | ||
| 141 | pxor @b[5], @b[7] | ||
| 142 | pxor @b[4], @b[3] | ||
| 143 | pxor @b[5], @b[4] | ||
| 144 | pxor @b[1], @b[3] | ||
| 145 | |||
| 146 | pxor @b[7], @b[2] | ||
| 147 | pxor @b[5], @b[1] | ||
| 148 | ___ | ||
| 149 | } | ||
| 150 | |||
| 151 | sub OutBasisChange { | ||
| 152 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 153 | # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb | ||
| 154 | my @b=@_[0..7]; | ||
| 155 | $code.=<<___; | ||
| 156 | pxor @b[6], @b[0] | ||
| 157 | pxor @b[4], @b[1] | ||
| 158 | pxor @b[0], @b[2] | ||
| 159 | pxor @b[6], @b[4] | ||
| 160 | pxor @b[1], @b[6] | ||
| 161 | |||
| 162 | pxor @b[5], @b[1] | ||
| 163 | pxor @b[3], @b[5] | ||
| 164 | pxor @b[7], @b[3] | ||
| 165 | pxor @b[5], @b[7] | ||
| 166 | pxor @b[5], @b[2] | ||
| 167 | |||
| 168 | pxor @b[7], @b[4] | ||
| 169 | ___ | ||
| 170 | } | ||
| 171 | |||
| 172 | sub InvSbox { | ||
| 173 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
| 174 | # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb | ||
| 175 | my @b=@_[0..7]; | ||
| 176 | my @t=@_[8..11]; | ||
| 177 | my @s=@_[12..15]; | ||
| 178 | &InvInBasisChange (@b); | ||
| 179 | &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); | ||
| 180 | &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); | ||
| 181 | } | ||
| 182 | |||
| 183 | sub InvInBasisChange { # OutBasisChange in reverse | ||
| 184 | my @b=@_[5,1,2,6,3,7,0,4]; | ||
| 185 | $code.=<<___ | ||
| 186 | pxor @b[7], @b[4] | ||
| 187 | |||
| 188 | pxor @b[5], @b[7] | ||
| 189 | pxor @b[5], @b[2] | ||
| 190 | pxor @b[7], @b[3] | ||
| 191 | pxor @b[3], @b[5] | ||
| 192 | pxor @b[5], @b[1] | ||
| 193 | |||
| 194 | pxor @b[1], @b[6] | ||
| 195 | pxor @b[0], @b[2] | ||
| 196 | pxor @b[6], @b[4] | ||
| 197 | pxor @b[6], @b[0] | ||
| 198 | pxor @b[4], @b[1] | ||
| 199 | ___ | ||
| 200 | } | ||
| 201 | |||
| 202 | sub InvOutBasisChange { # InBasisChange in reverse | ||
| 203 | my @b=@_[2,5,7,3,6,1,0,4]; | ||
| 204 | $code.=<<___; | ||
| 205 | pxor @b[5], @b[1] | ||
| 206 | pxor @b[7], @b[2] | ||
| 207 | |||
| 208 | pxor @b[1], @b[3] | ||
| 209 | pxor @b[5], @b[4] | ||
| 210 | pxor @b[5], @b[7] | ||
| 211 | pxor @b[4], @b[3] | ||
| 212 | pxor @b[0], @b[5] | ||
| 213 | pxor @b[7], @b[3] | ||
| 214 | pxor @b[2], @b[6] | ||
| 215 | pxor @b[1], @b[2] | ||
| 216 | pxor @b[3], @b[6] | ||
| 217 | |||
| 218 | pxor @b[0], @b[3] | ||
| 219 | pxor @b[6], @b[5] | ||
| 220 | ___ | ||
| 221 | } | ||
| 222 | |||
| 223 | sub Mul_GF4 { | ||
| 224 | #;************************************************************* | ||
| 225 | #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * | ||
| 226 | #;************************************************************* | ||
| 227 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
| 228 | $code.=<<___; | ||
| 229 | movdqa $y0, $t0 | ||
| 230 | pxor $y1, $t0 | ||
| 231 | pand $x0, $t0 | ||
| 232 | pxor $x1, $x0 | ||
| 233 | pand $y0, $x1 | ||
| 234 | pand $y1, $x0 | ||
| 235 | pxor $x1, $x0 | ||
| 236 | pxor $t0, $x1 | ||
| 237 | ___ | ||
| 238 | } | ||
| 239 | |||
| 240 | sub Mul_GF4_N { # not used, see next subroutine | ||
| 241 | # multiply and scale by N | ||
| 242 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
| 243 | $code.=<<___; | ||
| 244 | movdqa $y0, $t0 | ||
| 245 | pxor $y1, $t0 | ||
| 246 | pand $x0, $t0 | ||
| 247 | pxor $x1, $x0 | ||
| 248 | pand $y0, $x1 | ||
| 249 | pand $y1, $x0 | ||
| 250 | pxor $x0, $x1 | ||
| 251 | pxor $t0, $x0 | ||
| 252 | ___ | ||
| 253 | } | ||
| 254 | |||
| 255 | sub Mul_GF4_N_GF4 { | ||
| 256 | # interleaved Mul_GF4_N and Mul_GF4 | ||
| 257 | my ($x0,$x1,$y0,$y1,$t0, | ||
| 258 | $x2,$x3,$y2,$y3,$t1)=@_; | ||
| 259 | $code.=<<___; | ||
| 260 | movdqa $y0, $t0 | ||
| 261 | movdqa $y2, $t1 | ||
| 262 | pxor $y1, $t0 | ||
| 263 | pxor $y3, $t1 | ||
| 264 | pand $x0, $t0 | ||
| 265 | pand $x2, $t1 | ||
| 266 | pxor $x1, $x0 | ||
| 267 | pxor $x3, $x2 | ||
| 268 | pand $y0, $x1 | ||
| 269 | pand $y2, $x3 | ||
| 270 | pand $y1, $x0 | ||
| 271 | pand $y3, $x2 | ||
| 272 | pxor $x0, $x1 | ||
| 273 | pxor $x3, $x2 | ||
| 274 | pxor $t0, $x0 | ||
| 275 | pxor $t1, $x3 | ||
| 276 | ___ | ||
| 277 | } | ||
| 278 | sub Mul_GF16_2 { | ||
| 279 | my @x=@_[0..7]; | ||
| 280 | my @y=@_[8..11]; | ||
| 281 | my @t=@_[12..15]; | ||
| 282 | $code.=<<___; | ||
| 283 | movdqa @x[0], @t[0] | ||
| 284 | movdqa @x[1], @t[1] | ||
| 285 | ___ | ||
| 286 | &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); | ||
| 287 | $code.=<<___; | ||
| 288 | pxor @x[2], @t[0] | ||
| 289 | pxor @x[3], @t[1] | ||
| 290 | pxor @y[2], @y[0] | ||
| 291 | pxor @y[3], @y[1] | ||
| 292 | ___ | ||
| 293 | Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
| 294 | @x[2], @x[3], @y[2], @y[3], @t[2]); | ||
| 295 | $code.=<<___; | ||
| 296 | pxor @t[0], @x[0] | ||
| 297 | pxor @t[0], @x[2] | ||
| 298 | pxor @t[1], @x[1] | ||
| 299 | pxor @t[1], @x[3] | ||
| 300 | |||
| 301 | movdqa @x[4], @t[0] | ||
| 302 | movdqa @x[5], @t[1] | ||
| 303 | pxor @x[6], @t[0] | ||
| 304 | pxor @x[7], @t[1] | ||
| 305 | ___ | ||
| 306 | &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
| 307 | @x[6], @x[7], @y[2], @y[3], @t[2]); | ||
| 308 | $code.=<<___; | ||
| 309 | pxor @y[2], @y[0] | ||
| 310 | pxor @y[3], @y[1] | ||
| 311 | ___ | ||
| 312 | &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); | ||
| 313 | $code.=<<___; | ||
| 314 | pxor @t[0], @x[4] | ||
| 315 | pxor @t[0], @x[6] | ||
| 316 | pxor @t[1], @x[5] | ||
| 317 | pxor @t[1], @x[7] | ||
| 318 | ___ | ||
| 319 | } | ||
| 320 | sub Inv_GF256 { | ||
| 321 | #;******************************************************************** | ||
| 322 | #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * | ||
| 323 | #;******************************************************************** | ||
| 324 | my @x=@_[0..7]; | ||
| 325 | my @t=@_[8..11]; | ||
| 326 | my @s=@_[12..15]; | ||
| 327 | # direct optimizations from hardware | ||
| 328 | $code.=<<___; | ||
| 329 | movdqa @x[4], @t[3] | ||
| 330 | movdqa @x[5], @t[2] | ||
| 331 | movdqa @x[1], @t[1] | ||
| 332 | movdqa @x[7], @s[1] | ||
| 333 | movdqa @x[0], @s[0] | ||
| 334 | |||
| 335 | pxor @x[6], @t[3] | ||
| 336 | pxor @x[7], @t[2] | ||
| 337 | pxor @x[3], @t[1] | ||
| 338 | movdqa @t[3], @s[2] | ||
| 339 | pxor @x[6], @s[1] | ||
| 340 | movdqa @t[2], @t[0] | ||
| 341 | pxor @x[2], @s[0] | ||
| 342 | movdqa @t[3], @s[3] | ||
| 343 | |||
| 344 | por @t[1], @t[2] | ||
| 345 | por @s[0], @t[3] | ||
| 346 | pxor @t[0], @s[3] | ||
| 347 | pand @s[0], @s[2] | ||
| 348 | pxor @t[1], @s[0] | ||
| 349 | pand @t[1], @t[0] | ||
| 350 | pand @s[0], @s[3] | ||
| 351 | movdqa @x[3], @s[0] | ||
| 352 | pxor @x[2], @s[0] | ||
| 353 | pand @s[0], @s[1] | ||
| 354 | pxor @s[1], @t[3] | ||
| 355 | pxor @s[1], @t[2] | ||
| 356 | movdqa @x[4], @s[1] | ||
| 357 | movdqa @x[1], @s[0] | ||
| 358 | pxor @x[5], @s[1] | ||
| 359 | pxor @x[0], @s[0] | ||
| 360 | movdqa @s[1], @t[1] | ||
| 361 | pand @s[0], @s[1] | ||
| 362 | por @s[0], @t[1] | ||
| 363 | pxor @s[1], @t[0] | ||
| 364 | pxor @s[3], @t[3] | ||
| 365 | pxor @s[2], @t[2] | ||
| 366 | pxor @s[3], @t[1] | ||
| 367 | movdqa @x[7], @s[0] | ||
| 368 | pxor @s[2], @t[0] | ||
| 369 | movdqa @x[6], @s[1] | ||
| 370 | pxor @s[2], @t[1] | ||
| 371 | movdqa @x[5], @s[2] | ||
| 372 | pand @x[3], @s[0] | ||
| 373 | movdqa @x[4], @s[3] | ||
| 374 | pand @x[2], @s[1] | ||
| 375 | pand @x[1], @s[2] | ||
| 376 | por @x[0], @s[3] | ||
| 377 | pxor @s[0], @t[3] | ||
| 378 | pxor @s[1], @t[2] | ||
| 379 | pxor @s[2], @t[1] | ||
| 380 | pxor @s[3], @t[0] | ||
| 381 | |||
| 382 | #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 | ||
| 383 | |||
| 384 | # new smaller inversion | ||
| 385 | |||
| 386 | movdqa @t[3], @s[0] | ||
| 387 | pand @t[1], @t[3] | ||
| 388 | pxor @t[2], @s[0] | ||
| 389 | |||
| 390 | movdqa @t[0], @s[2] | ||
| 391 | movdqa @s[0], @s[3] | ||
| 392 | pxor @t[3], @s[2] | ||
| 393 | pand @s[2], @s[3] | ||
| 394 | |||
| 395 | movdqa @t[1], @s[1] | ||
| 396 | pxor @t[2], @s[3] | ||
| 397 | pxor @t[0], @s[1] | ||
| 398 | |||
| 399 | pxor @t[2], @t[3] | ||
| 400 | |||
| 401 | pand @t[3], @s[1] | ||
| 402 | |||
| 403 | movdqa @s[2], @t[2] | ||
| 404 | pxor @t[0], @s[1] | ||
| 405 | |||
| 406 | pxor @s[1], @t[2] | ||
| 407 | pxor @s[1], @t[1] | ||
| 408 | |||
| 409 | pand @t[0], @t[2] | ||
| 410 | |||
| 411 | pxor @t[2], @s[2] | ||
| 412 | pxor @t[2], @t[1] | ||
| 413 | |||
| 414 | pand @s[3], @s[2] | ||
| 415 | |||
| 416 | pxor @s[0], @s[2] | ||
| 417 | ___ | ||
| 418 | # output in s3, s2, s1, t1 | ||
| 419 | |||
| 420 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 | ||
| 421 | |||
| 422 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 | ||
| 423 | &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); | ||
| 424 | |||
| 425 | ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb | ||
| 426 | } | ||
| 427 | |||
| 428 | # AES linear components | ||
| 429 | |||
| 430 | sub ShiftRows { | ||
| 431 | my @x=@_[0..7]; | ||
| 432 | my $mask=pop; | ||
| 433 | $code.=<<___; | ||
| 434 | pxor 0x00($key),@x[0] | ||
| 435 | pxor 0x10($key),@x[1] | ||
| 436 | pshufb $mask,@x[0] | ||
| 437 | pxor 0x20($key),@x[2] | ||
| 438 | pshufb $mask,@x[1] | ||
| 439 | pxor 0x30($key),@x[3] | ||
| 440 | pshufb $mask,@x[2] | ||
| 441 | pxor 0x40($key),@x[4] | ||
| 442 | pshufb $mask,@x[3] | ||
| 443 | pxor 0x50($key),@x[5] | ||
| 444 | pshufb $mask,@x[4] | ||
| 445 | pxor 0x60($key),@x[6] | ||
| 446 | pshufb $mask,@x[5] | ||
| 447 | pxor 0x70($key),@x[7] | ||
| 448 | pshufb $mask,@x[6] | ||
| 449 | lea 0x80($key),$key | ||
| 450 | pshufb $mask,@x[7] | ||
| 451 | ___ | ||
| 452 | } | ||
| 453 | |||
| 454 | sub MixColumns { | ||
| 455 | # modified to emit output in order suitable for feeding back to aesenc[last] | ||
| 456 | my @x=@_[0..7]; | ||
| 457 | my @t=@_[8..15]; | ||
| 458 | $code.=<<___; | ||
| 459 | pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 | ||
| 460 | pshufd \$0x93, @x[1], @t[1] | ||
| 461 | pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) | ||
| 462 | pshufd \$0x93, @x[2], @t[2] | ||
| 463 | pxor @t[1], @x[1] | ||
| 464 | pshufd \$0x93, @x[3], @t[3] | ||
| 465 | pxor @t[2], @x[2] | ||
| 466 | pshufd \$0x93, @x[4], @t[4] | ||
| 467 | pxor @t[3], @x[3] | ||
| 468 | pshufd \$0x93, @x[5], @t[5] | ||
| 469 | pxor @t[4], @x[4] | ||
| 470 | pshufd \$0x93, @x[6], @t[6] | ||
| 471 | pxor @t[5], @x[5] | ||
| 472 | pshufd \$0x93, @x[7], @t[7] | ||
| 473 | pxor @t[6], @x[6] | ||
| 474 | pxor @t[7], @x[7] | ||
| 475 | |||
| 476 | pxor @x[0], @t[1] | ||
| 477 | pxor @x[7], @t[0] | ||
| 478 | pxor @x[7], @t[1] | ||
| 479 | pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) | ||
| 480 | pxor @x[1], @t[2] | ||
| 481 | pshufd \$0x4E, @x[1], @x[1] | ||
| 482 | pxor @x[4], @t[5] | ||
| 483 | pxor @t[0], @x[0] | ||
| 484 | pxor @x[5], @t[6] | ||
| 485 | pxor @t[1], @x[1] | ||
| 486 | pxor @x[3], @t[4] | ||
| 487 | pshufd \$0x4E, @x[4], @t[0] | ||
| 488 | pxor @x[6], @t[7] | ||
| 489 | pshufd \$0x4E, @x[5], @t[1] | ||
| 490 | pxor @x[2], @t[3] | ||
| 491 | pshufd \$0x4E, @x[3], @x[4] | ||
| 492 | pxor @x[7], @t[3] | ||
| 493 | pshufd \$0x4E, @x[7], @x[5] | ||
| 494 | pxor @x[7], @t[4] | ||
| 495 | pshufd \$0x4E, @x[6], @x[3] | ||
| 496 | pxor @t[4], @t[0] | ||
| 497 | pshufd \$0x4E, @x[2], @x[6] | ||
| 498 | pxor @t[5], @t[1] | ||
| 499 | |||
| 500 | pxor @t[3], @x[4] | ||
| 501 | pxor @t[7], @x[5] | ||
| 502 | pxor @t[6], @x[3] | ||
| 503 | movdqa @t[0], @x[2] | ||
| 504 | pxor @t[2], @x[6] | ||
| 505 | movdqa @t[1], @x[7] | ||
| 506 | ___ | ||
| 507 | } | ||
| 508 | |||
| 509 | sub InvMixColumns { | ||
| 510 | my @x=@_[0..7]; | ||
| 511 | my @t=@_[8..15]; | ||
| 512 | |||
| 513 | $code.=<<___; | ||
| 514 | # multiplication by 0x0e | ||
| 515 | pshufd \$0x93, @x[7], @t[7] | ||
| 516 | movdqa @x[2], @t[2] | ||
| 517 | pxor @x[5], @x[7] # 7 5 | ||
| 518 | pxor @x[5], @x[2] # 2 5 | ||
| 519 | pshufd \$0x93, @x[0], @t[0] | ||
| 520 | movdqa @x[5], @t[5] | ||
| 521 | pxor @x[0], @x[5] # 5 0 [1] | ||
| 522 | pxor @x[1], @x[0] # 0 1 | ||
| 523 | pshufd \$0x93, @x[1], @t[1] | ||
| 524 | pxor @x[2], @x[1] # 1 25 | ||
| 525 | pxor @x[6], @x[0] # 01 6 [2] | ||
| 526 | pxor @x[3], @x[1] # 125 3 [4] | ||
| 527 | pshufd \$0x93, @x[3], @t[3] | ||
| 528 | pxor @x[0], @x[2] # 25 016 [3] | ||
| 529 | pxor @x[7], @x[3] # 3 75 | ||
| 530 | pxor @x[6], @x[7] # 75 6 [0] | ||
| 531 | pshufd \$0x93, @x[6], @t[6] | ||
| 532 | movdqa @x[4], @t[4] | ||
| 533 | pxor @x[4], @x[6] # 6 4 | ||
| 534 | pxor @x[3], @x[4] # 4 375 [6] | ||
| 535 | pxor @x[7], @x[3] # 375 756=36 | ||
| 536 | pxor @t[5], @x[6] # 64 5 [7] | ||
| 537 | pxor @t[2], @x[3] # 36 2 | ||
| 538 | pxor @t[4], @x[3] # 362 4 [5] | ||
| 539 | pshufd \$0x93, @t[5], @t[5] | ||
| 540 | ___ | ||
| 541 | my @y = @x[7,5,0,2,1,3,4,6]; | ||
| 542 | $code.=<<___; | ||
| 543 | # multiplication by 0x0b | ||
| 544 | pxor @y[0], @y[1] | ||
| 545 | pxor @t[0], @y[0] | ||
| 546 | pxor @t[1], @y[1] | ||
| 547 | pshufd \$0x93, @t[2], @t[2] | ||
| 548 | pxor @t[5], @y[0] | ||
| 549 | pxor @t[6], @y[1] | ||
| 550 | pxor @t[7], @y[0] | ||
| 551 | pshufd \$0x93, @t[4], @t[4] | ||
| 552 | pxor @t[6], @t[7] # clobber t[7] | ||
| 553 | pxor @y[0], @y[1] | ||
| 554 | |||
| 555 | pxor @t[0], @y[3] | ||
| 556 | pshufd \$0x93, @t[0], @t[0] | ||
| 557 | pxor @t[1], @y[2] | ||
| 558 | pxor @t[1], @y[4] | ||
| 559 | pxor @t[2], @y[2] | ||
| 560 | pshufd \$0x93, @t[1], @t[1] | ||
| 561 | pxor @t[2], @y[3] | ||
| 562 | pxor @t[2], @y[5] | ||
| 563 | pxor @t[7], @y[2] | ||
| 564 | pshufd \$0x93, @t[2], @t[2] | ||
| 565 | pxor @t[3], @y[3] | ||
| 566 | pxor @t[3], @y[6] | ||
| 567 | pxor @t[3], @y[4] | ||
| 568 | pshufd \$0x93, @t[3], @t[3] | ||
| 569 | pxor @t[4], @y[7] | ||
| 570 | pxor @t[4], @y[5] | ||
| 571 | pxor @t[7], @y[7] | ||
| 572 | pxor @t[5], @y[3] | ||
| 573 | pxor @t[4], @y[4] | ||
| 574 | pxor @t[5], @t[7] # clobber t[7] even more | ||
| 575 | |||
| 576 | pxor @t[7], @y[5] | ||
| 577 | pshufd \$0x93, @t[4], @t[4] | ||
| 578 | pxor @t[7], @y[6] | ||
| 579 | pxor @t[7], @y[4] | ||
| 580 | |||
| 581 | pxor @t[5], @t[7] | ||
| 582 | pshufd \$0x93, @t[5], @t[5] | ||
| 583 | pxor @t[6], @t[7] # restore t[7] | ||
| 584 | |||
| 585 | # multiplication by 0x0d | ||
| 586 | pxor @y[7], @y[4] | ||
| 587 | pxor @t[4], @y[7] | ||
| 588 | pshufd \$0x93, @t[6], @t[6] | ||
| 589 | pxor @t[0], @y[2] | ||
| 590 | pxor @t[5], @y[7] | ||
| 591 | pxor @t[2], @y[2] | ||
| 592 | pshufd \$0x93, @t[7], @t[7] | ||
| 593 | |||
| 594 | pxor @y[1], @y[3] | ||
| 595 | pxor @t[1], @y[1] | ||
| 596 | pxor @t[0], @y[0] | ||
| 597 | pxor @t[0], @y[3] | ||
| 598 | pxor @t[5], @y[1] | ||
| 599 | pxor @t[5], @y[0] | ||
| 600 | pxor @t[7], @y[1] | ||
| 601 | pshufd \$0x93, @t[0], @t[0] | ||
| 602 | pxor @t[6], @y[0] | ||
| 603 | pxor @y[1], @y[3] | ||
| 604 | pxor @t[1], @y[4] | ||
| 605 | pshufd \$0x93, @t[1], @t[1] | ||
| 606 | |||
| 607 | pxor @t[7], @y[7] | ||
| 608 | pxor @t[2], @y[4] | ||
| 609 | pxor @t[2], @y[5] | ||
| 610 | pshufd \$0x93, @t[2], @t[2] | ||
| 611 | pxor @t[6], @y[2] | ||
| 612 | pxor @t[3], @t[6] # clobber t[6] | ||
| 613 | pxor @y[7], @y[4] | ||
| 614 | pxor @t[6], @y[3] | ||
| 615 | |||
| 616 | pxor @t[6], @y[6] | ||
| 617 | pxor @t[5], @y[5] | ||
| 618 | pxor @t[4], @y[6] | ||
| 619 | pshufd \$0x93, @t[4], @t[4] | ||
| 620 | pxor @t[6], @y[5] | ||
| 621 | pxor @t[7], @y[6] | ||
| 622 | pxor @t[3], @t[6] # restore t[6] | ||
| 623 | |||
| 624 | pshufd \$0x93, @t[5], @t[5] | ||
| 625 | pshufd \$0x93, @t[6], @t[6] | ||
| 626 | pshufd \$0x93, @t[7], @t[7] | ||
| 627 | pshufd \$0x93, @t[3], @t[3] | ||
| 628 | |||
| 629 | # multiplication by 0x09 | ||
| 630 | pxor @y[1], @y[4] | ||
| 631 | pxor @y[1], @t[1] # t[1]=y[1] | ||
| 632 | pxor @t[5], @t[0] # clobber t[0] | ||
| 633 | pxor @t[5], @t[1] | ||
| 634 | pxor @t[0], @y[3] | ||
| 635 | pxor @y[0], @t[0] # t[0]=y[0] | ||
| 636 | pxor @t[6], @t[1] | ||
| 637 | pxor @t[7], @t[6] # clobber t[6] | ||
| 638 | pxor @t[1], @y[4] | ||
| 639 | pxor @t[4], @y[7] | ||
| 640 | pxor @y[4], @t[4] # t[4]=y[4] | ||
| 641 | pxor @t[3], @y[6] | ||
| 642 | pxor @y[3], @t[3] # t[3]=y[3] | ||
| 643 | pxor @t[2], @y[5] | ||
| 644 | pxor @y[2], @t[2] # t[2]=y[2] | ||
| 645 | pxor @t[7], @t[3] | ||
| 646 | pxor @y[5], @t[5] # t[5]=y[5] | ||
| 647 | pxor @t[6], @t[2] | ||
| 648 | pxor @t[6], @t[5] | ||
| 649 | pxor @y[6], @t[6] # t[6]=y[6] | ||
| 650 | pxor @y[7], @t[7] # t[7]=y[7] | ||
| 651 | |||
| 652 | movdqa @t[0],@XMM[0] | ||
| 653 | movdqa @t[1],@XMM[1] | ||
| 654 | movdqa @t[2],@XMM[2] | ||
| 655 | movdqa @t[3],@XMM[3] | ||
| 656 | movdqa @t[4],@XMM[4] | ||
| 657 | movdqa @t[5],@XMM[5] | ||
| 658 | movdqa @t[6],@XMM[6] | ||
| 659 | movdqa @t[7],@XMM[7] | ||
| 660 | ___ | ||
| 661 | } | ||
| 662 | |||
| 663 | sub aesenc { # not used | ||
| 664 | my @b=@_[0..7]; | ||
| 665 | my @t=@_[8..15]; | ||
| 666 | $code.=<<___; | ||
| 667 | movdqa 0x30($const),@t[0] # .LSR | ||
| 668 | ___ | ||
| 669 | &ShiftRows (@b,@t[0]); | ||
| 670 | &Sbox (@b,@t); | ||
| 671 | &MixColumns (@b[0,1,4,6,3,7,2,5],@t); | ||
| 672 | } | ||
| 673 | |||
| 674 | sub aesenclast { # not used | ||
| 675 | my @b=@_[0..7]; | ||
| 676 | my @t=@_[8..15]; | ||
| 677 | $code.=<<___; | ||
| 678 | movdqa 0x40($const),@t[0] # .LSRM0 | ||
| 679 | ___ | ||
| 680 | &ShiftRows (@b,@t[0]); | ||
| 681 | &Sbox (@b,@t); | ||
| 682 | $code.=<<___ | ||
| 683 | pxor 0x00($key),@b[0] | ||
| 684 | pxor 0x10($key),@b[1] | ||
| 685 | pxor 0x20($key),@b[4] | ||
| 686 | pxor 0x30($key),@b[6] | ||
| 687 | pxor 0x40($key),@b[3] | ||
| 688 | pxor 0x50($key),@b[7] | ||
| 689 | pxor 0x60($key),@b[2] | ||
| 690 | pxor 0x70($key),@b[5] | ||
| 691 | ___ | ||
| 692 | } | ||
| 693 | |||
| 694 | sub swapmove { | ||
| 695 | my ($a,$b,$n,$mask,$t)=@_; | ||
| 696 | $code.=<<___; | ||
| 697 | movdqa $b,$t | ||
| 698 | psrlq \$$n,$b | ||
| 699 | pxor $a,$b | ||
| 700 | pand $mask,$b | ||
| 701 | pxor $b,$a | ||
| 702 | psllq \$$n,$b | ||
| 703 | pxor $t,$b | ||
| 704 | ___ | ||
| 705 | } | ||
| 706 | sub swapmove2x { | ||
| 707 | my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; | ||
| 708 | $code.=<<___; | ||
| 709 | movdqa $b0,$t0 | ||
| 710 | psrlq \$$n,$b0 | ||
| 711 | movdqa $b1,$t1 | ||
| 712 | psrlq \$$n,$b1 | ||
| 713 | pxor $a0,$b0 | ||
| 714 | pxor $a1,$b1 | ||
| 715 | pand $mask,$b0 | ||
| 716 | pand $mask,$b1 | ||
| 717 | pxor $b0,$a0 | ||
| 718 | psllq \$$n,$b0 | ||
| 719 | pxor $b1,$a1 | ||
| 720 | psllq \$$n,$b1 | ||
| 721 | pxor $t0,$b0 | ||
| 722 | pxor $t1,$b1 | ||
| 723 | ___ | ||
| 724 | } | ||
| 725 | |||
| 726 | sub bitslice { | ||
| 727 | my @x=reverse(@_[0..7]); | ||
| 728 | my ($t0,$t1,$t2,$t3)=@_[8..11]; | ||
| 729 | $code.=<<___; | ||
| 730 | movdqa 0x00($const),$t0 # .LBS0 | ||
| 731 | movdqa 0x10($const),$t1 # .LBS1 | ||
| 732 | ___ | ||
| 733 | &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); | ||
| 734 | &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
| 735 | $code.=<<___; | ||
| 736 | movdqa 0x20($const),$t0 # .LBS2 | ||
| 737 | ___ | ||
| 738 | &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); | ||
| 739 | &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
| 740 | |||
| 741 | &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); | ||
| 742 | &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); | ||
| 743 | } | ||
| 744 | |||
| 745 | $code.=<<___; | ||
| 746 | .text | ||
| 747 | |||
| 748 | .extern asm_AES_encrypt | ||
| 749 | .extern asm_AES_decrypt | ||
| 750 | |||
| 751 | .type _bsaes_encrypt8,\@abi-omnipotent | ||
| 752 | .align 64 | ||
| 753 | _bsaes_encrypt8: | ||
| 754 | lea .LBS0(%rip), $const # constants table | ||
| 755 | |||
| 756 | movdqa ($key), @XMM[9] # round 0 key | ||
| 757 | lea 0x10($key), $key | ||
| 758 | movdqa 0x50($const), @XMM[8] # .LM0SR | ||
| 759 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
| 760 | pxor @XMM[9], @XMM[1] | ||
| 761 | pshufb @XMM[8], @XMM[0] | ||
| 762 | pxor @XMM[9], @XMM[2] | ||
| 763 | pshufb @XMM[8], @XMM[1] | ||
| 764 | pxor @XMM[9], @XMM[3] | ||
| 765 | pshufb @XMM[8], @XMM[2] | ||
| 766 | pxor @XMM[9], @XMM[4] | ||
| 767 | pshufb @XMM[8], @XMM[3] | ||
| 768 | pxor @XMM[9], @XMM[5] | ||
| 769 | pshufb @XMM[8], @XMM[4] | ||
| 770 | pxor @XMM[9], @XMM[6] | ||
| 771 | pshufb @XMM[8], @XMM[5] | ||
| 772 | pxor @XMM[9], @XMM[7] | ||
| 773 | pshufb @XMM[8], @XMM[6] | ||
| 774 | pshufb @XMM[8], @XMM[7] | ||
| 775 | _bsaes_encrypt8_bitslice: | ||
| 776 | ___ | ||
| 777 | &bitslice (@XMM[0..7, 8..11]); | ||
| 778 | $code.=<<___; | ||
| 779 | dec $rounds | ||
| 780 | jmp .Lenc_sbox | ||
| 781 | .align 16 | ||
| 782 | .Lenc_loop: | ||
| 783 | ___ | ||
| 784 | &ShiftRows (@XMM[0..7, 8]); | ||
| 785 | $code.=".Lenc_sbox:\n"; | ||
| 786 | &Sbox (@XMM[0..7, 8..15]); | ||
| 787 | $code.=<<___; | ||
| 788 | dec $rounds | ||
| 789 | jl .Lenc_done | ||
| 790 | ___ | ||
| 791 | &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); | ||
| 792 | $code.=<<___; | ||
| 793 | movdqa 0x30($const), @XMM[8] # .LSR | ||
| 794 | jnz .Lenc_loop | ||
| 795 | movdqa 0x40($const), @XMM[8] # .LSRM0 | ||
| 796 | jmp .Lenc_loop | ||
| 797 | .align 16 | ||
| 798 | .Lenc_done: | ||
| 799 | ___ | ||
| 800 | # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb | ||
| 801 | &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); | ||
| 802 | $code.=<<___; | ||
| 803 | movdqa ($key), @XMM[8] # last round key | ||
| 804 | pxor @XMM[8], @XMM[4] | ||
| 805 | pxor @XMM[8], @XMM[6] | ||
| 806 | pxor @XMM[8], @XMM[3] | ||
| 807 | pxor @XMM[8], @XMM[7] | ||
| 808 | pxor @XMM[8], @XMM[2] | ||
| 809 | pxor @XMM[8], @XMM[5] | ||
| 810 | pxor @XMM[8], @XMM[0] | ||
| 811 | pxor @XMM[8], @XMM[1] | ||
| 812 | ret | ||
| 813 | .size _bsaes_encrypt8,.-_bsaes_encrypt8 | ||
| 814 | |||
| 815 | .type _bsaes_decrypt8,\@abi-omnipotent | ||
| 816 | .align 64 | ||
| 817 | _bsaes_decrypt8: | ||
| 818 | lea .LBS0(%rip), $const # constants table | ||
| 819 | |||
| 820 | movdqa ($key), @XMM[9] # round 0 key | ||
| 821 | lea 0x10($key), $key | ||
| 822 | movdqa -0x30($const), @XMM[8] # .LM0ISR | ||
| 823 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
| 824 | pxor @XMM[9], @XMM[1] | ||
| 825 | pshufb @XMM[8], @XMM[0] | ||
| 826 | pxor @XMM[9], @XMM[2] | ||
| 827 | pshufb @XMM[8], @XMM[1] | ||
| 828 | pxor @XMM[9], @XMM[3] | ||
| 829 | pshufb @XMM[8], @XMM[2] | ||
| 830 | pxor @XMM[9], @XMM[4] | ||
| 831 | pshufb @XMM[8], @XMM[3] | ||
| 832 | pxor @XMM[9], @XMM[5] | ||
| 833 | pshufb @XMM[8], @XMM[4] | ||
| 834 | pxor @XMM[9], @XMM[6] | ||
| 835 | pshufb @XMM[8], @XMM[5] | ||
| 836 | pxor @XMM[9], @XMM[7] | ||
| 837 | pshufb @XMM[8], @XMM[6] | ||
| 838 | pshufb @XMM[8], @XMM[7] | ||
| 839 | ___ | ||
| 840 | &bitslice (@XMM[0..7, 8..11]); | ||
| 841 | $code.=<<___; | ||
| 842 | dec $rounds | ||
| 843 | jmp .Ldec_sbox | ||
| 844 | .align 16 | ||
| 845 | .Ldec_loop: | ||
| 846 | ___ | ||
| 847 | &ShiftRows (@XMM[0..7, 8]); | ||
| 848 | $code.=".Ldec_sbox:\n"; | ||
| 849 | &InvSbox (@XMM[0..7, 8..15]); | ||
| 850 | $code.=<<___; | ||
| 851 | dec $rounds | ||
| 852 | jl .Ldec_done | ||
| 853 | ___ | ||
| 854 | &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); | ||
| 855 | $code.=<<___; | ||
| 856 | movdqa -0x10($const), @XMM[8] # .LISR | ||
| 857 | jnz .Ldec_loop | ||
| 858 | movdqa -0x20($const), @XMM[8] # .LISRM0 | ||
| 859 | jmp .Ldec_loop | ||
| 860 | .align 16 | ||
| 861 | .Ldec_done: | ||
| 862 | ___ | ||
| 863 | &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); | ||
| 864 | $code.=<<___; | ||
| 865 | movdqa ($key), @XMM[8] # last round key | ||
| 866 | pxor @XMM[8], @XMM[6] | ||
| 867 | pxor @XMM[8], @XMM[4] | ||
| 868 | pxor @XMM[8], @XMM[2] | ||
| 869 | pxor @XMM[8], @XMM[7] | ||
| 870 | pxor @XMM[8], @XMM[3] | ||
| 871 | pxor @XMM[8], @XMM[5] | ||
| 872 | pxor @XMM[8], @XMM[0] | ||
| 873 | pxor @XMM[8], @XMM[1] | ||
| 874 | ret | ||
| 875 | .size _bsaes_decrypt8,.-_bsaes_decrypt8 | ||
| 876 | ___ | ||
| 877 | } | ||
| 878 | { | ||
| 879 | my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); | ||
| 880 | |||
| 881 | sub bitslice_key { | ||
| 882 | my @x=reverse(@_[0..7]); | ||
| 883 | my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; | ||
| 884 | |||
| 885 | &swapmove (@x[0,1],1,$bs0,$t2,$t3); | ||
| 886 | $code.=<<___; | ||
| 887 | #&swapmove(@x[2,3],1,$t0,$t2,$t3); | ||
| 888 | movdqa @x[0], @x[2] | ||
| 889 | movdqa @x[1], @x[3] | ||
| 890 | ___ | ||
| 891 | #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
| 892 | |||
| 893 | &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); | ||
| 894 | $code.=<<___; | ||
| 895 | #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
| 896 | movdqa @x[0], @x[4] | ||
| 897 | movdqa @x[2], @x[6] | ||
| 898 | movdqa @x[1], @x[5] | ||
| 899 | movdqa @x[3], @x[7] | ||
| 900 | ___ | ||
| 901 | &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); | ||
| 902 | &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); | ||
| 903 | } | ||
| 904 | |||
| 905 | $code.=<<___; | ||
| 906 | .type _bsaes_key_convert,\@abi-omnipotent | ||
| 907 | .align 16 | ||
| 908 | _bsaes_key_convert: | ||
| 909 | lea .Lmasks(%rip), $const | ||
| 910 | movdqu ($inp), %xmm7 # load round 0 key | ||
| 911 | lea 0x10($inp), $inp | ||
| 912 | movdqa 0x00($const), %xmm0 # 0x01... | ||
| 913 | movdqa 0x10($const), %xmm1 # 0x02... | ||
| 914 | movdqa 0x20($const), %xmm2 # 0x04... | ||
| 915 | movdqa 0x30($const), %xmm3 # 0x08... | ||
| 916 | movdqa 0x40($const), %xmm4 # .LM0 | ||
| 917 | pcmpeqd %xmm5, %xmm5 # .LNOT | ||
| 918 | |||
| 919 | movdqu ($inp), %xmm6 # load round 1 key | ||
| 920 | movdqa %xmm7, ($out) # save round 0 key | ||
| 921 | lea 0x10($out), $out | ||
| 922 | dec $rounds | ||
| 923 | jmp .Lkey_loop | ||
| 924 | .align 16 | ||
| 925 | .Lkey_loop: | ||
| 926 | pshufb %xmm4, %xmm6 # .LM0 | ||
| 927 | |||
| 928 | movdqa %xmm0, %xmm8 | ||
| 929 | movdqa %xmm1, %xmm9 | ||
| 930 | |||
| 931 | pand %xmm6, %xmm8 | ||
| 932 | pand %xmm6, %xmm9 | ||
| 933 | movdqa %xmm2, %xmm10 | ||
| 934 | pcmpeqb %xmm0, %xmm8 | ||
| 935 | psllq \$4, %xmm0 # 0x10... | ||
| 936 | movdqa %xmm3, %xmm11 | ||
| 937 | pcmpeqb %xmm1, %xmm9 | ||
| 938 | psllq \$4, %xmm1 # 0x20... | ||
| 939 | |||
| 940 | pand %xmm6, %xmm10 | ||
| 941 | pand %xmm6, %xmm11 | ||
| 942 | movdqa %xmm0, %xmm12 | ||
| 943 | pcmpeqb %xmm2, %xmm10 | ||
| 944 | psllq \$4, %xmm2 # 0x40... | ||
| 945 | movdqa %xmm1, %xmm13 | ||
| 946 | pcmpeqb %xmm3, %xmm11 | ||
| 947 | psllq \$4, %xmm3 # 0x80... | ||
| 948 | |||
| 949 | movdqa %xmm2, %xmm14 | ||
| 950 | movdqa %xmm3, %xmm15 | ||
| 951 | pxor %xmm5, %xmm8 # "pnot" | ||
| 952 | pxor %xmm5, %xmm9 | ||
| 953 | |||
| 954 | pand %xmm6, %xmm12 | ||
| 955 | pand %xmm6, %xmm13 | ||
| 956 | movdqa %xmm8, 0x00($out) # write bit-sliced round key | ||
| 957 | pcmpeqb %xmm0, %xmm12 | ||
| 958 | psrlq \$4, %xmm0 # 0x01... | ||
| 959 | movdqa %xmm9, 0x10($out) | ||
| 960 | pcmpeqb %xmm1, %xmm13 | ||
| 961 | psrlq \$4, %xmm1 # 0x02... | ||
| 962 | lea 0x10($inp), $inp | ||
| 963 | |||
| 964 | pand %xmm6, %xmm14 | ||
| 965 | pand %xmm6, %xmm15 | ||
| 966 | movdqa %xmm10, 0x20($out) | ||
| 967 | pcmpeqb %xmm2, %xmm14 | ||
| 968 | psrlq \$4, %xmm2 # 0x04... | ||
| 969 | movdqa %xmm11, 0x30($out) | ||
| 970 | pcmpeqb %xmm3, %xmm15 | ||
| 971 | psrlq \$4, %xmm3 # 0x08... | ||
| 972 | movdqu ($inp), %xmm6 # load next round key | ||
| 973 | |||
| 974 | pxor %xmm5, %xmm13 # "pnot" | ||
| 975 | pxor %xmm5, %xmm14 | ||
| 976 | movdqa %xmm12, 0x40($out) | ||
| 977 | movdqa %xmm13, 0x50($out) | ||
| 978 | movdqa %xmm14, 0x60($out) | ||
| 979 | movdqa %xmm15, 0x70($out) | ||
| 980 | lea 0x80($out),$out | ||
| 981 | dec $rounds | ||
| 982 | jnz .Lkey_loop | ||
| 983 | |||
| 984 | movdqa 0x50($const), %xmm7 # .L63 | ||
| 985 | #movdqa %xmm6, ($out) # don't save last round key | ||
| 986 | ret | ||
| 987 | .size _bsaes_key_convert,.-_bsaes_key_convert | ||
| 988 | ___ | ||
| 989 | } | ||
| 990 | |||
| 991 | if (0 && !$win64) { # following four functions are unsupported interface | ||
| 992 | # used for benchmarking... | ||
| 993 | $code.=<<___; | ||
| 994 | .globl bsaes_enc_key_convert | ||
| 995 | .type bsaes_enc_key_convert,\@function,2 | ||
| 996 | .align 16 | ||
| 997 | bsaes_enc_key_convert: | ||
| 998 | mov 240($inp),%r10d # pass rounds | ||
| 999 | mov $inp,%rcx # pass key | ||
| 1000 | mov $out,%rax # pass key schedule | ||
| 1001 | call _bsaes_key_convert | ||
| 1002 | pxor %xmm6,%xmm7 # fix up last round key | ||
| 1003 | movdqa %xmm7,(%rax) # save last round key | ||
| 1004 | ret | ||
| 1005 | .size bsaes_enc_key_convert,.-bsaes_enc_key_convert | ||
| 1006 | |||
| 1007 | .globl bsaes_encrypt_128 | ||
| 1008 | .type bsaes_encrypt_128,\@function,4 | ||
| 1009 | .align 16 | ||
| 1010 | bsaes_encrypt_128: | ||
| 1011 | .Lenc128_loop: | ||
| 1012 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1013 | movdqu 0x10($inp), @XMM[1] | ||
| 1014 | movdqu 0x20($inp), @XMM[2] | ||
| 1015 | movdqu 0x30($inp), @XMM[3] | ||
| 1016 | movdqu 0x40($inp), @XMM[4] | ||
| 1017 | movdqu 0x50($inp), @XMM[5] | ||
| 1018 | movdqu 0x60($inp), @XMM[6] | ||
| 1019 | movdqu 0x70($inp), @XMM[7] | ||
| 1020 | mov $key, %rax # pass the $key | ||
| 1021 | lea 0x80($inp), $inp | ||
| 1022 | mov \$10,%r10d | ||
| 1023 | |||
| 1024 | call _bsaes_encrypt8 | ||
| 1025 | |||
| 1026 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1027 | movdqu @XMM[1], 0x10($out) | ||
| 1028 | movdqu @XMM[4], 0x20($out) | ||
| 1029 | movdqu @XMM[6], 0x30($out) | ||
| 1030 | movdqu @XMM[3], 0x40($out) | ||
| 1031 | movdqu @XMM[7], 0x50($out) | ||
| 1032 | movdqu @XMM[2], 0x60($out) | ||
| 1033 | movdqu @XMM[5], 0x70($out) | ||
| 1034 | lea 0x80($out), $out | ||
| 1035 | sub \$0x80,$len | ||
| 1036 | ja .Lenc128_loop | ||
| 1037 | ret | ||
| 1038 | .size bsaes_encrypt_128,.-bsaes_encrypt_128 | ||
| 1039 | |||
| 1040 | .globl bsaes_dec_key_convert | ||
| 1041 | .type bsaes_dec_key_convert,\@function,2 | ||
| 1042 | .align 16 | ||
| 1043 | bsaes_dec_key_convert: | ||
| 1044 | mov 240($inp),%r10d # pass rounds | ||
| 1045 | mov $inp,%rcx # pass key | ||
| 1046 | mov $out,%rax # pass key schedule | ||
| 1047 | call _bsaes_key_convert | ||
| 1048 | pxor ($out),%xmm7 # fix up round 0 key | ||
| 1049 | movdqa %xmm6,(%rax) # save last round key | ||
| 1050 | movdqa %xmm7,($out) | ||
| 1051 | ret | ||
| 1052 | .size bsaes_dec_key_convert,.-bsaes_dec_key_convert | ||
| 1053 | |||
| 1054 | .globl bsaes_decrypt_128 | ||
| 1055 | .type bsaes_decrypt_128,\@function,4 | ||
| 1056 | .align 16 | ||
| 1057 | bsaes_decrypt_128: | ||
| 1058 | .Ldec128_loop: | ||
| 1059 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1060 | movdqu 0x10($inp), @XMM[1] | ||
| 1061 | movdqu 0x20($inp), @XMM[2] | ||
| 1062 | movdqu 0x30($inp), @XMM[3] | ||
| 1063 | movdqu 0x40($inp), @XMM[4] | ||
| 1064 | movdqu 0x50($inp), @XMM[5] | ||
| 1065 | movdqu 0x60($inp), @XMM[6] | ||
| 1066 | movdqu 0x70($inp), @XMM[7] | ||
| 1067 | mov $key, %rax # pass the $key | ||
| 1068 | lea 0x80($inp), $inp | ||
| 1069 | mov \$10,%r10d | ||
| 1070 | |||
| 1071 | call _bsaes_decrypt8 | ||
| 1072 | |||
| 1073 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1074 | movdqu @XMM[1], 0x10($out) | ||
| 1075 | movdqu @XMM[6], 0x20($out) | ||
| 1076 | movdqu @XMM[4], 0x30($out) | ||
| 1077 | movdqu @XMM[2], 0x40($out) | ||
| 1078 | movdqu @XMM[7], 0x50($out) | ||
| 1079 | movdqu @XMM[3], 0x60($out) | ||
| 1080 | movdqu @XMM[5], 0x70($out) | ||
| 1081 | lea 0x80($out), $out | ||
| 1082 | sub \$0x80,$len | ||
| 1083 | ja .Ldec128_loop | ||
| 1084 | ret | ||
| 1085 | .size bsaes_decrypt_128,.-bsaes_decrypt_128 | ||
| 1086 | ___ | ||
| 1087 | } | ||
| 1088 | { | ||
| 1089 | ###################################################################### | ||
| 1090 | # | ||
| 1091 | # OpenSSL interface | ||
| 1092 | # | ||
| 1093 | my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") | ||
| 1094 | : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); | ||
| 1095 | my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); | ||
| 1096 | |||
| 1097 | if ($ecb) { | ||
| 1098 | $code.=<<___; | ||
| 1099 | .globl bsaes_ecb_encrypt_blocks | ||
| 1100 | .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent | ||
| 1101 | .align 16 | ||
| 1102 | bsaes_ecb_encrypt_blocks: | ||
| 1103 | mov %rsp, %rax | ||
| 1104 | .Lecb_enc_prologue: | ||
| 1105 | push %rbp | ||
| 1106 | push %rbx | ||
| 1107 | push %r12 | ||
| 1108 | push %r13 | ||
| 1109 | push %r14 | ||
| 1110 | push %r15 | ||
| 1111 | lea -0x48(%rsp),%rsp | ||
| 1112 | ___ | ||
| 1113 | $code.=<<___ if ($win64); | ||
| 1114 | lea -0xa0(%rsp), %rsp | ||
| 1115 | movaps %xmm6, 0x40(%rsp) | ||
| 1116 | movaps %xmm7, 0x50(%rsp) | ||
| 1117 | movaps %xmm8, 0x60(%rsp) | ||
| 1118 | movaps %xmm9, 0x70(%rsp) | ||
| 1119 | movaps %xmm10, 0x80(%rsp) | ||
| 1120 | movaps %xmm11, 0x90(%rsp) | ||
| 1121 | movaps %xmm12, 0xa0(%rsp) | ||
| 1122 | movaps %xmm13, 0xb0(%rsp) | ||
| 1123 | movaps %xmm14, 0xc0(%rsp) | ||
| 1124 | movaps %xmm15, 0xd0(%rsp) | ||
| 1125 | .Lecb_enc_body: | ||
| 1126 | ___ | ||
| 1127 | $code.=<<___; | ||
| 1128 | mov %rsp,%rbp # backup %rsp | ||
| 1129 | mov 240($arg4),%eax # rounds | ||
| 1130 | mov $arg1,$inp # backup arguments | ||
| 1131 | mov $arg2,$out | ||
| 1132 | mov $arg3,$len | ||
| 1133 | mov $arg4,$key | ||
| 1134 | cmp \$8,$arg3 | ||
| 1135 | jb .Lecb_enc_short | ||
| 1136 | |||
| 1137 | mov %eax,%ebx # backup rounds | ||
| 1138 | shl \$7,%rax # 128 bytes per inner round key | ||
| 1139 | sub \$`128-32`,%rax # size of bit-sliced key schedule | ||
| 1140 | sub %rax,%rsp | ||
| 1141 | mov %rsp,%rax # pass key schedule | ||
| 1142 | mov $key,%rcx # pass key | ||
| 1143 | mov %ebx,%r10d # pass rounds | ||
| 1144 | call _bsaes_key_convert | ||
| 1145 | pxor %xmm6,%xmm7 # fix up last round key | ||
| 1146 | movdqa %xmm7,(%rax) # save last round key | ||
| 1147 | |||
| 1148 | sub \$8,$len | ||
| 1149 | .Lecb_enc_loop: | ||
| 1150 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1151 | movdqu 0x10($inp), @XMM[1] | ||
| 1152 | movdqu 0x20($inp), @XMM[2] | ||
| 1153 | movdqu 0x30($inp), @XMM[3] | ||
| 1154 | movdqu 0x40($inp), @XMM[4] | ||
| 1155 | movdqu 0x50($inp), @XMM[5] | ||
| 1156 | mov %rsp, %rax # pass key schedule | ||
| 1157 | movdqu 0x60($inp), @XMM[6] | ||
| 1158 | mov %ebx,%r10d # pass rounds | ||
| 1159 | movdqu 0x70($inp), @XMM[7] | ||
| 1160 | lea 0x80($inp), $inp | ||
| 1161 | |||
| 1162 | call _bsaes_encrypt8 | ||
| 1163 | |||
| 1164 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1165 | movdqu @XMM[1], 0x10($out) | ||
| 1166 | movdqu @XMM[4], 0x20($out) | ||
| 1167 | movdqu @XMM[6], 0x30($out) | ||
| 1168 | movdqu @XMM[3], 0x40($out) | ||
| 1169 | movdqu @XMM[7], 0x50($out) | ||
| 1170 | movdqu @XMM[2], 0x60($out) | ||
| 1171 | movdqu @XMM[5], 0x70($out) | ||
| 1172 | lea 0x80($out), $out | ||
| 1173 | sub \$8,$len | ||
| 1174 | jnc .Lecb_enc_loop | ||
| 1175 | |||
| 1176 | add \$8,$len | ||
| 1177 | jz .Lecb_enc_done | ||
| 1178 | |||
| 1179 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1180 | mov %rsp, %rax # pass key schedule | ||
| 1181 | mov %ebx,%r10d # pass rounds | ||
| 1182 | cmp \$2,$len | ||
| 1183 | jb .Lecb_enc_one | ||
| 1184 | movdqu 0x10($inp), @XMM[1] | ||
| 1185 | je .Lecb_enc_two | ||
| 1186 | movdqu 0x20($inp), @XMM[2] | ||
| 1187 | cmp \$4,$len | ||
| 1188 | jb .Lecb_enc_three | ||
| 1189 | movdqu 0x30($inp), @XMM[3] | ||
| 1190 | je .Lecb_enc_four | ||
| 1191 | movdqu 0x40($inp), @XMM[4] | ||
| 1192 | cmp \$6,$len | ||
| 1193 | jb .Lecb_enc_five | ||
| 1194 | movdqu 0x50($inp), @XMM[5] | ||
| 1195 | je .Lecb_enc_six | ||
| 1196 | movdqu 0x60($inp), @XMM[6] | ||
| 1197 | call _bsaes_encrypt8 | ||
| 1198 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1199 | movdqu @XMM[1], 0x10($out) | ||
| 1200 | movdqu @XMM[4], 0x20($out) | ||
| 1201 | movdqu @XMM[6], 0x30($out) | ||
| 1202 | movdqu @XMM[3], 0x40($out) | ||
| 1203 | movdqu @XMM[7], 0x50($out) | ||
| 1204 | movdqu @XMM[2], 0x60($out) | ||
| 1205 | jmp .Lecb_enc_done | ||
| 1206 | .align 16 | ||
| 1207 | .Lecb_enc_six: | ||
| 1208 | call _bsaes_encrypt8 | ||
| 1209 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1210 | movdqu @XMM[1], 0x10($out) | ||
| 1211 | movdqu @XMM[4], 0x20($out) | ||
| 1212 | movdqu @XMM[6], 0x30($out) | ||
| 1213 | movdqu @XMM[3], 0x40($out) | ||
| 1214 | movdqu @XMM[7], 0x50($out) | ||
| 1215 | jmp .Lecb_enc_done | ||
| 1216 | .align 16 | ||
| 1217 | .Lecb_enc_five: | ||
| 1218 | call _bsaes_encrypt8 | ||
| 1219 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1220 | movdqu @XMM[1], 0x10($out) | ||
| 1221 | movdqu @XMM[4], 0x20($out) | ||
| 1222 | movdqu @XMM[6], 0x30($out) | ||
| 1223 | movdqu @XMM[3], 0x40($out) | ||
| 1224 | jmp .Lecb_enc_done | ||
| 1225 | .align 16 | ||
| 1226 | .Lecb_enc_four: | ||
| 1227 | call _bsaes_encrypt8 | ||
| 1228 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1229 | movdqu @XMM[1], 0x10($out) | ||
| 1230 | movdqu @XMM[4], 0x20($out) | ||
| 1231 | movdqu @XMM[6], 0x30($out) | ||
| 1232 | jmp .Lecb_enc_done | ||
| 1233 | .align 16 | ||
| 1234 | .Lecb_enc_three: | ||
| 1235 | call _bsaes_encrypt8 | ||
| 1236 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1237 | movdqu @XMM[1], 0x10($out) | ||
| 1238 | movdqu @XMM[4], 0x20($out) | ||
| 1239 | jmp .Lecb_enc_done | ||
| 1240 | .align 16 | ||
| 1241 | .Lecb_enc_two: | ||
| 1242 | call _bsaes_encrypt8 | ||
| 1243 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1244 | movdqu @XMM[1], 0x10($out) | ||
| 1245 | jmp .Lecb_enc_done | ||
| 1246 | .align 16 | ||
| 1247 | .Lecb_enc_one: | ||
| 1248 | call _bsaes_encrypt8 | ||
| 1249 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1250 | jmp .Lecb_enc_done | ||
| 1251 | .align 16 | ||
| 1252 | .Lecb_enc_short: | ||
| 1253 | lea ($inp), $arg1 | ||
| 1254 | lea ($out), $arg2 | ||
| 1255 | lea ($key), $arg3 | ||
| 1256 | call asm_AES_encrypt | ||
| 1257 | lea 16($inp), $inp | ||
| 1258 | lea 16($out), $out | ||
| 1259 | dec $len | ||
| 1260 | jnz .Lecb_enc_short | ||
| 1261 | |||
| 1262 | .Lecb_enc_done: | ||
| 1263 | lea (%rsp),%rax | ||
| 1264 | pxor %xmm0, %xmm0 | ||
| 1265 | .Lecb_enc_bzero: # wipe key schedule [if any] | ||
| 1266 | movdqa %xmm0, 0x00(%rax) | ||
| 1267 | movdqa %xmm0, 0x10(%rax) | ||
| 1268 | lea 0x20(%rax), %rax | ||
| 1269 | cmp %rax, %rbp | ||
| 1270 | jb .Lecb_enc_bzero | ||
| 1271 | |||
| 1272 | lea (%rbp),%rsp # restore %rsp | ||
| 1273 | ___ | ||
| 1274 | $code.=<<___ if ($win64); | ||
| 1275 | movaps 0x40(%rbp), %xmm6 | ||
| 1276 | movaps 0x50(%rbp), %xmm7 | ||
| 1277 | movaps 0x60(%rbp), %xmm8 | ||
| 1278 | movaps 0x70(%rbp), %xmm9 | ||
| 1279 | movaps 0x80(%rbp), %xmm10 | ||
| 1280 | movaps 0x90(%rbp), %xmm11 | ||
| 1281 | movaps 0xa0(%rbp), %xmm12 | ||
| 1282 | movaps 0xb0(%rbp), %xmm13 | ||
| 1283 | movaps 0xc0(%rbp), %xmm14 | ||
| 1284 | movaps 0xd0(%rbp), %xmm15 | ||
| 1285 | lea 0xa0(%rbp), %rsp | ||
| 1286 | ___ | ||
| 1287 | $code.=<<___; | ||
| 1288 | mov 0x48(%rsp), %r15 | ||
| 1289 | mov 0x50(%rsp), %r14 | ||
| 1290 | mov 0x58(%rsp), %r13 | ||
| 1291 | mov 0x60(%rsp), %r12 | ||
| 1292 | mov 0x68(%rsp), %rbx | ||
| 1293 | mov 0x70(%rsp), %rax | ||
| 1294 | lea 0x78(%rsp), %rsp | ||
| 1295 | mov %rax, %rbp | ||
| 1296 | .Lecb_enc_epilogue: | ||
| 1297 | ret | ||
| 1298 | .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks | ||
| 1299 | |||
| 1300 | .globl bsaes_ecb_decrypt_blocks | ||
| 1301 | .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent | ||
| 1302 | .align 16 | ||
| 1303 | bsaes_ecb_decrypt_blocks: | ||
| 1304 | mov %rsp, %rax | ||
| 1305 | .Lecb_dec_prologue: | ||
| 1306 | push %rbp | ||
| 1307 | push %rbx | ||
| 1308 | push %r12 | ||
| 1309 | push %r13 | ||
| 1310 | push %r14 | ||
| 1311 | push %r15 | ||
| 1312 | lea -0x48(%rsp),%rsp | ||
| 1313 | ___ | ||
| 1314 | $code.=<<___ if ($win64); | ||
| 1315 | lea -0xa0(%rsp), %rsp | ||
| 1316 | movaps %xmm6, 0x40(%rsp) | ||
| 1317 | movaps %xmm7, 0x50(%rsp) | ||
| 1318 | movaps %xmm8, 0x60(%rsp) | ||
| 1319 | movaps %xmm9, 0x70(%rsp) | ||
| 1320 | movaps %xmm10, 0x80(%rsp) | ||
| 1321 | movaps %xmm11, 0x90(%rsp) | ||
| 1322 | movaps %xmm12, 0xa0(%rsp) | ||
| 1323 | movaps %xmm13, 0xb0(%rsp) | ||
| 1324 | movaps %xmm14, 0xc0(%rsp) | ||
| 1325 | movaps %xmm15, 0xd0(%rsp) | ||
| 1326 | .Lecb_dec_body: | ||
| 1327 | ___ | ||
| 1328 | $code.=<<___; | ||
| 1329 | mov %rsp,%rbp # backup %rsp | ||
| 1330 | mov 240($arg4),%eax # rounds | ||
| 1331 | mov $arg1,$inp # backup arguments | ||
| 1332 | mov $arg2,$out | ||
| 1333 | mov $arg3,$len | ||
| 1334 | mov $arg4,$key | ||
| 1335 | cmp \$8,$arg3 | ||
| 1336 | jb .Lecb_dec_short | ||
| 1337 | |||
| 1338 | mov %eax,%ebx # backup rounds | ||
| 1339 | shl \$7,%rax # 128 bytes per inner round key | ||
| 1340 | sub \$`128-32`,%rax # size of bit-sliced key schedule | ||
| 1341 | sub %rax,%rsp | ||
| 1342 | mov %rsp,%rax # pass key schedule | ||
| 1343 | mov $key,%rcx # pass key | ||
| 1344 | mov %ebx,%r10d # pass rounds | ||
| 1345 | call _bsaes_key_convert | ||
| 1346 | pxor (%rsp),%xmm7 # fix up 0 round key | ||
| 1347 | movdqa %xmm6,(%rax) # save last round key | ||
| 1348 | movdqa %xmm7,(%rsp) | ||
| 1349 | |||
| 1350 | sub \$8,$len | ||
| 1351 | .Lecb_dec_loop: | ||
| 1352 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1353 | movdqu 0x10($inp), @XMM[1] | ||
| 1354 | movdqu 0x20($inp), @XMM[2] | ||
| 1355 | movdqu 0x30($inp), @XMM[3] | ||
| 1356 | movdqu 0x40($inp), @XMM[4] | ||
| 1357 | movdqu 0x50($inp), @XMM[5] | ||
| 1358 | mov %rsp, %rax # pass key schedule | ||
| 1359 | movdqu 0x60($inp), @XMM[6] | ||
| 1360 | mov %ebx,%r10d # pass rounds | ||
| 1361 | movdqu 0x70($inp), @XMM[7] | ||
| 1362 | lea 0x80($inp), $inp | ||
| 1363 | |||
| 1364 | call _bsaes_decrypt8 | ||
| 1365 | |||
| 1366 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1367 | movdqu @XMM[1], 0x10($out) | ||
| 1368 | movdqu @XMM[6], 0x20($out) | ||
| 1369 | movdqu @XMM[4], 0x30($out) | ||
| 1370 | movdqu @XMM[2], 0x40($out) | ||
| 1371 | movdqu @XMM[7], 0x50($out) | ||
| 1372 | movdqu @XMM[3], 0x60($out) | ||
| 1373 | movdqu @XMM[5], 0x70($out) | ||
| 1374 | lea 0x80($out), $out | ||
| 1375 | sub \$8,$len | ||
| 1376 | jnc .Lecb_dec_loop | ||
| 1377 | |||
| 1378 | add \$8,$len | ||
| 1379 | jz .Lecb_dec_done | ||
| 1380 | |||
| 1381 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1382 | mov %rsp, %rax # pass key schedule | ||
| 1383 | mov %ebx,%r10d # pass rounds | ||
| 1384 | cmp \$2,$len | ||
| 1385 | jb .Lecb_dec_one | ||
| 1386 | movdqu 0x10($inp), @XMM[1] | ||
| 1387 | je .Lecb_dec_two | ||
| 1388 | movdqu 0x20($inp), @XMM[2] | ||
| 1389 | cmp \$4,$len | ||
| 1390 | jb .Lecb_dec_three | ||
| 1391 | movdqu 0x30($inp), @XMM[3] | ||
| 1392 | je .Lecb_dec_four | ||
| 1393 | movdqu 0x40($inp), @XMM[4] | ||
| 1394 | cmp \$6,$len | ||
| 1395 | jb .Lecb_dec_five | ||
| 1396 | movdqu 0x50($inp), @XMM[5] | ||
| 1397 | je .Lecb_dec_six | ||
| 1398 | movdqu 0x60($inp), @XMM[6] | ||
| 1399 | call _bsaes_decrypt8 | ||
| 1400 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1401 | movdqu @XMM[1], 0x10($out) | ||
| 1402 | movdqu @XMM[6], 0x20($out) | ||
| 1403 | movdqu @XMM[4], 0x30($out) | ||
| 1404 | movdqu @XMM[2], 0x40($out) | ||
| 1405 | movdqu @XMM[7], 0x50($out) | ||
| 1406 | movdqu @XMM[3], 0x60($out) | ||
| 1407 | jmp .Lecb_dec_done | ||
| 1408 | .align 16 | ||
| 1409 | .Lecb_dec_six: | ||
| 1410 | call _bsaes_decrypt8 | ||
| 1411 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1412 | movdqu @XMM[1], 0x10($out) | ||
| 1413 | movdqu @XMM[6], 0x20($out) | ||
| 1414 | movdqu @XMM[4], 0x30($out) | ||
| 1415 | movdqu @XMM[2], 0x40($out) | ||
| 1416 | movdqu @XMM[7], 0x50($out) | ||
| 1417 | jmp .Lecb_dec_done | ||
| 1418 | .align 16 | ||
| 1419 | .Lecb_dec_five: | ||
| 1420 | call _bsaes_decrypt8 | ||
| 1421 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1422 | movdqu @XMM[1], 0x10($out) | ||
| 1423 | movdqu @XMM[6], 0x20($out) | ||
| 1424 | movdqu @XMM[4], 0x30($out) | ||
| 1425 | movdqu @XMM[2], 0x40($out) | ||
| 1426 | jmp .Lecb_dec_done | ||
| 1427 | .align 16 | ||
| 1428 | .Lecb_dec_four: | ||
| 1429 | call _bsaes_decrypt8 | ||
| 1430 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1431 | movdqu @XMM[1], 0x10($out) | ||
| 1432 | movdqu @XMM[6], 0x20($out) | ||
| 1433 | movdqu @XMM[4], 0x30($out) | ||
| 1434 | jmp .Lecb_dec_done | ||
| 1435 | .align 16 | ||
| 1436 | .Lecb_dec_three: | ||
| 1437 | call _bsaes_decrypt8 | ||
| 1438 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1439 | movdqu @XMM[1], 0x10($out) | ||
| 1440 | movdqu @XMM[6], 0x20($out) | ||
| 1441 | jmp .Lecb_dec_done | ||
| 1442 | .align 16 | ||
| 1443 | .Lecb_dec_two: | ||
| 1444 | call _bsaes_decrypt8 | ||
| 1445 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1446 | movdqu @XMM[1], 0x10($out) | ||
| 1447 | jmp .Lecb_dec_done | ||
| 1448 | .align 16 | ||
| 1449 | .Lecb_dec_one: | ||
| 1450 | call _bsaes_decrypt8 | ||
| 1451 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1452 | jmp .Lecb_dec_done | ||
| 1453 | .align 16 | ||
| 1454 | .Lecb_dec_short: | ||
| 1455 | lea ($inp), $arg1 | ||
| 1456 | lea ($out), $arg2 | ||
| 1457 | lea ($key), $arg3 | ||
| 1458 | call asm_AES_decrypt | ||
| 1459 | lea 16($inp), $inp | ||
| 1460 | lea 16($out), $out | ||
| 1461 | dec $len | ||
| 1462 | jnz .Lecb_dec_short | ||
| 1463 | |||
| 1464 | .Lecb_dec_done: | ||
| 1465 | lea (%rsp),%rax | ||
| 1466 | pxor %xmm0, %xmm0 | ||
| 1467 | .Lecb_dec_bzero: # wipe key schedule [if any] | ||
| 1468 | movdqa %xmm0, 0x00(%rax) | ||
| 1469 | movdqa %xmm0, 0x10(%rax) | ||
| 1470 | lea 0x20(%rax), %rax | ||
| 1471 | cmp %rax, %rbp | ||
| 1472 | jb .Lecb_dec_bzero | ||
| 1473 | |||
| 1474 | lea (%rbp),%rsp # restore %rsp | ||
| 1475 | ___ | ||
| 1476 | $code.=<<___ if ($win64); | ||
| 1477 | movaps 0x40(%rbp), %xmm6 | ||
| 1478 | movaps 0x50(%rbp), %xmm7 | ||
| 1479 | movaps 0x60(%rbp), %xmm8 | ||
| 1480 | movaps 0x70(%rbp), %xmm9 | ||
| 1481 | movaps 0x80(%rbp), %xmm10 | ||
| 1482 | movaps 0x90(%rbp), %xmm11 | ||
| 1483 | movaps 0xa0(%rbp), %xmm12 | ||
| 1484 | movaps 0xb0(%rbp), %xmm13 | ||
| 1485 | movaps 0xc0(%rbp), %xmm14 | ||
| 1486 | movaps 0xd0(%rbp), %xmm15 | ||
| 1487 | lea 0xa0(%rbp), %rsp | ||
| 1488 | ___ | ||
| 1489 | $code.=<<___; | ||
| 1490 | mov 0x48(%rsp), %r15 | ||
| 1491 | mov 0x50(%rsp), %r14 | ||
| 1492 | mov 0x58(%rsp), %r13 | ||
| 1493 | mov 0x60(%rsp), %r12 | ||
| 1494 | mov 0x68(%rsp), %rbx | ||
| 1495 | mov 0x70(%rsp), %rax | ||
| 1496 | lea 0x78(%rsp), %rsp | ||
| 1497 | mov %rax, %rbp | ||
| 1498 | .Lecb_dec_epilogue: | ||
| 1499 | ret | ||
| 1500 | .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks | ||
| 1501 | ___ | ||
| 1502 | } | ||
| 1503 | $code.=<<___; | ||
| 1504 | .extern asm_AES_cbc_encrypt | ||
| 1505 | .globl bsaes_cbc_encrypt | ||
| 1506 | .type bsaes_cbc_encrypt,\@abi-omnipotent | ||
| 1507 | .align 16 | ||
| 1508 | bsaes_cbc_encrypt: | ||
| 1509 | ___ | ||
| 1510 | $code.=<<___ if ($win64); | ||
| 1511 | mov 48(%rsp),$arg6 # pull direction flag | ||
| 1512 | ___ | ||
| 1513 | $code.=<<___; | ||
| 1514 | cmp \$0,$arg6 | ||
| 1515 | jne asm_AES_cbc_encrypt | ||
| 1516 | cmp \$128,$arg3 | ||
| 1517 | jb asm_AES_cbc_encrypt | ||
| 1518 | |||
| 1519 | mov %rsp, %rax | ||
| 1520 | .Lcbc_dec_prologue: | ||
| 1521 | push %rbp | ||
| 1522 | push %rbx | ||
| 1523 | push %r12 | ||
| 1524 | push %r13 | ||
| 1525 | push %r14 | ||
| 1526 | push %r15 | ||
| 1527 | lea -0x48(%rsp), %rsp | ||
| 1528 | ___ | ||
| 1529 | $code.=<<___ if ($win64); | ||
| 1530 | mov 0xa0(%rsp),$arg5 # pull ivp | ||
| 1531 | lea -0xa0(%rsp), %rsp | ||
| 1532 | movaps %xmm6, 0x40(%rsp) | ||
| 1533 | movaps %xmm7, 0x50(%rsp) | ||
| 1534 | movaps %xmm8, 0x60(%rsp) | ||
| 1535 | movaps %xmm9, 0x70(%rsp) | ||
| 1536 | movaps %xmm10, 0x80(%rsp) | ||
| 1537 | movaps %xmm11, 0x90(%rsp) | ||
| 1538 | movaps %xmm12, 0xa0(%rsp) | ||
| 1539 | movaps %xmm13, 0xb0(%rsp) | ||
| 1540 | movaps %xmm14, 0xc0(%rsp) | ||
| 1541 | movaps %xmm15, 0xd0(%rsp) | ||
| 1542 | .Lcbc_dec_body: | ||
| 1543 | ___ | ||
| 1544 | $code.=<<___; | ||
| 1545 | mov %rsp, %rbp # backup %rsp | ||
| 1546 | mov 240($arg4), %eax # rounds | ||
| 1547 | mov $arg1, $inp # backup arguments | ||
| 1548 | mov $arg2, $out | ||
| 1549 | mov $arg3, $len | ||
| 1550 | mov $arg4, $key | ||
| 1551 | mov $arg5, %rbx | ||
| 1552 | shr \$4, $len # bytes to blocks | ||
| 1553 | |||
| 1554 | mov %eax, %edx # rounds | ||
| 1555 | shl \$7, %rax # 128 bytes per inner round key | ||
| 1556 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 1557 | sub %rax, %rsp | ||
| 1558 | |||
| 1559 | mov %rsp, %rax # pass key schedule | ||
| 1560 | mov $key, %rcx # pass key | ||
| 1561 | mov %edx, %r10d # pass rounds | ||
| 1562 | call _bsaes_key_convert | ||
| 1563 | pxor (%rsp),%xmm7 # fix up 0 round key | ||
| 1564 | movdqa %xmm6,(%rax) # save last round key | ||
| 1565 | movdqa %xmm7,(%rsp) | ||
| 1566 | |||
| 1567 | movdqu (%rbx), @XMM[15] # load IV | ||
| 1568 | sub \$8,$len | ||
| 1569 | .Lcbc_dec_loop: | ||
| 1570 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1571 | movdqu 0x10($inp), @XMM[1] | ||
| 1572 | movdqu 0x20($inp), @XMM[2] | ||
| 1573 | movdqu 0x30($inp), @XMM[3] | ||
| 1574 | movdqu 0x40($inp), @XMM[4] | ||
| 1575 | movdqu 0x50($inp), @XMM[5] | ||
| 1576 | mov %rsp, %rax # pass key schedule | ||
| 1577 | movdqu 0x60($inp), @XMM[6] | ||
| 1578 | mov %edx,%r10d # pass rounds | ||
| 1579 | movdqu 0x70($inp), @XMM[7] | ||
| 1580 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1581 | |||
| 1582 | call _bsaes_decrypt8 | ||
| 1583 | |||
| 1584 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1585 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1586 | movdqu 0x10($inp), @XMM[9] | ||
| 1587 | pxor @XMM[8], @XMM[1] | ||
| 1588 | movdqu 0x20($inp), @XMM[10] | ||
| 1589 | pxor @XMM[9], @XMM[6] | ||
| 1590 | movdqu 0x30($inp), @XMM[11] | ||
| 1591 | pxor @XMM[10], @XMM[4] | ||
| 1592 | movdqu 0x40($inp), @XMM[12] | ||
| 1593 | pxor @XMM[11], @XMM[2] | ||
| 1594 | movdqu 0x50($inp), @XMM[13] | ||
| 1595 | pxor @XMM[12], @XMM[7] | ||
| 1596 | movdqu 0x60($inp), @XMM[14] | ||
| 1597 | pxor @XMM[13], @XMM[3] | ||
| 1598 | movdqu 0x70($inp), @XMM[15] # IV | ||
| 1599 | pxor @XMM[14], @XMM[5] | ||
| 1600 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1601 | lea 0x80($inp), $inp | ||
| 1602 | movdqu @XMM[1], 0x10($out) | ||
| 1603 | movdqu @XMM[6], 0x20($out) | ||
| 1604 | movdqu @XMM[4], 0x30($out) | ||
| 1605 | movdqu @XMM[2], 0x40($out) | ||
| 1606 | movdqu @XMM[7], 0x50($out) | ||
| 1607 | movdqu @XMM[3], 0x60($out) | ||
| 1608 | movdqu @XMM[5], 0x70($out) | ||
| 1609 | lea 0x80($out), $out | ||
| 1610 | sub \$8,$len | ||
| 1611 | jnc .Lcbc_dec_loop | ||
| 1612 | |||
| 1613 | add \$8,$len | ||
| 1614 | jz .Lcbc_dec_done | ||
| 1615 | |||
| 1616 | movdqu 0x00($inp), @XMM[0] # load input | ||
| 1617 | mov %rsp, %rax # pass key schedule | ||
| 1618 | mov %edx, %r10d # pass rounds | ||
| 1619 | cmp \$2,$len | ||
| 1620 | jb .Lcbc_dec_one | ||
| 1621 | movdqu 0x10($inp), @XMM[1] | ||
| 1622 | je .Lcbc_dec_two | ||
| 1623 | movdqu 0x20($inp), @XMM[2] | ||
| 1624 | cmp \$4,$len | ||
| 1625 | jb .Lcbc_dec_three | ||
| 1626 | movdqu 0x30($inp), @XMM[3] | ||
| 1627 | je .Lcbc_dec_four | ||
| 1628 | movdqu 0x40($inp), @XMM[4] | ||
| 1629 | cmp \$6,$len | ||
| 1630 | jb .Lcbc_dec_five | ||
| 1631 | movdqu 0x50($inp), @XMM[5] | ||
| 1632 | je .Lcbc_dec_six | ||
| 1633 | movdqu 0x60($inp), @XMM[6] | ||
| 1634 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1635 | call _bsaes_decrypt8 | ||
| 1636 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1637 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1638 | movdqu 0x10($inp), @XMM[9] | ||
| 1639 | pxor @XMM[8], @XMM[1] | ||
| 1640 | movdqu 0x20($inp), @XMM[10] | ||
| 1641 | pxor @XMM[9], @XMM[6] | ||
| 1642 | movdqu 0x30($inp), @XMM[11] | ||
| 1643 | pxor @XMM[10], @XMM[4] | ||
| 1644 | movdqu 0x40($inp), @XMM[12] | ||
| 1645 | pxor @XMM[11], @XMM[2] | ||
| 1646 | movdqu 0x50($inp), @XMM[13] | ||
| 1647 | pxor @XMM[12], @XMM[7] | ||
| 1648 | movdqu 0x60($inp), @XMM[15] # IV | ||
| 1649 | pxor @XMM[13], @XMM[3] | ||
| 1650 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1651 | movdqu @XMM[1], 0x10($out) | ||
| 1652 | movdqu @XMM[6], 0x20($out) | ||
| 1653 | movdqu @XMM[4], 0x30($out) | ||
| 1654 | movdqu @XMM[2], 0x40($out) | ||
| 1655 | movdqu @XMM[7], 0x50($out) | ||
| 1656 | movdqu @XMM[3], 0x60($out) | ||
| 1657 | jmp .Lcbc_dec_done | ||
| 1658 | .align 16 | ||
| 1659 | .Lcbc_dec_six: | ||
| 1660 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1661 | call _bsaes_decrypt8 | ||
| 1662 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1663 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1664 | movdqu 0x10($inp), @XMM[9] | ||
| 1665 | pxor @XMM[8], @XMM[1] | ||
| 1666 | movdqu 0x20($inp), @XMM[10] | ||
| 1667 | pxor @XMM[9], @XMM[6] | ||
| 1668 | movdqu 0x30($inp), @XMM[11] | ||
| 1669 | pxor @XMM[10], @XMM[4] | ||
| 1670 | movdqu 0x40($inp), @XMM[12] | ||
| 1671 | pxor @XMM[11], @XMM[2] | ||
| 1672 | movdqu 0x50($inp), @XMM[15] # IV | ||
| 1673 | pxor @XMM[12], @XMM[7] | ||
| 1674 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1675 | movdqu @XMM[1], 0x10($out) | ||
| 1676 | movdqu @XMM[6], 0x20($out) | ||
| 1677 | movdqu @XMM[4], 0x30($out) | ||
| 1678 | movdqu @XMM[2], 0x40($out) | ||
| 1679 | movdqu @XMM[7], 0x50($out) | ||
| 1680 | jmp .Lcbc_dec_done | ||
| 1681 | .align 16 | ||
| 1682 | .Lcbc_dec_five: | ||
| 1683 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1684 | call _bsaes_decrypt8 | ||
| 1685 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1686 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1687 | movdqu 0x10($inp), @XMM[9] | ||
| 1688 | pxor @XMM[8], @XMM[1] | ||
| 1689 | movdqu 0x20($inp), @XMM[10] | ||
| 1690 | pxor @XMM[9], @XMM[6] | ||
| 1691 | movdqu 0x30($inp), @XMM[11] | ||
| 1692 | pxor @XMM[10], @XMM[4] | ||
| 1693 | movdqu 0x40($inp), @XMM[15] # IV | ||
| 1694 | pxor @XMM[11], @XMM[2] | ||
| 1695 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1696 | movdqu @XMM[1], 0x10($out) | ||
| 1697 | movdqu @XMM[6], 0x20($out) | ||
| 1698 | movdqu @XMM[4], 0x30($out) | ||
| 1699 | movdqu @XMM[2], 0x40($out) | ||
| 1700 | jmp .Lcbc_dec_done | ||
| 1701 | .align 16 | ||
| 1702 | .Lcbc_dec_four: | ||
| 1703 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1704 | call _bsaes_decrypt8 | ||
| 1705 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1706 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1707 | movdqu 0x10($inp), @XMM[9] | ||
| 1708 | pxor @XMM[8], @XMM[1] | ||
| 1709 | movdqu 0x20($inp), @XMM[10] | ||
| 1710 | pxor @XMM[9], @XMM[6] | ||
| 1711 | movdqu 0x30($inp), @XMM[15] # IV | ||
| 1712 | pxor @XMM[10], @XMM[4] | ||
| 1713 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1714 | movdqu @XMM[1], 0x10($out) | ||
| 1715 | movdqu @XMM[6], 0x20($out) | ||
| 1716 | movdqu @XMM[4], 0x30($out) | ||
| 1717 | jmp .Lcbc_dec_done | ||
| 1718 | .align 16 | ||
| 1719 | .Lcbc_dec_three: | ||
| 1720 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1721 | call _bsaes_decrypt8 | ||
| 1722 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1723 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1724 | movdqu 0x10($inp), @XMM[9] | ||
| 1725 | pxor @XMM[8], @XMM[1] | ||
| 1726 | movdqu 0x20($inp), @XMM[15] # IV | ||
| 1727 | pxor @XMM[9], @XMM[6] | ||
| 1728 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1729 | movdqu @XMM[1], 0x10($out) | ||
| 1730 | movdqu @XMM[6], 0x20($out) | ||
| 1731 | jmp .Lcbc_dec_done | ||
| 1732 | .align 16 | ||
| 1733 | .Lcbc_dec_two: | ||
| 1734 | movdqa @XMM[15], 0x20(%rbp) # put aside IV | ||
| 1735 | call _bsaes_decrypt8 | ||
| 1736 | pxor 0x20(%rbp), @XMM[0] # ^= IV | ||
| 1737 | movdqu 0x00($inp), @XMM[8] # re-load input | ||
| 1738 | movdqu 0x10($inp), @XMM[15] # IV | ||
| 1739 | pxor @XMM[8], @XMM[1] | ||
| 1740 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1741 | movdqu @XMM[1], 0x10($out) | ||
| 1742 | jmp .Lcbc_dec_done | ||
| 1743 | .align 16 | ||
| 1744 | .Lcbc_dec_one: | ||
| 1745 | lea ($inp), $arg1 | ||
| 1746 | lea 0x20(%rbp), $arg2 # buffer output | ||
| 1747 | lea ($key), $arg3 | ||
| 1748 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 1749 | pxor 0x20(%rbp), @XMM[15] # ^= IV | ||
| 1750 | movdqu @XMM[15], ($out) # write output | ||
| 1751 | movdqa @XMM[0], @XMM[15] # IV | ||
| 1752 | |||
| 1753 | .Lcbc_dec_done: | ||
| 1754 | movdqu @XMM[15], (%rbx) # return IV | ||
| 1755 | lea (%rsp), %rax | ||
| 1756 | pxor %xmm0, %xmm0 | ||
| 1757 | .Lcbc_dec_bzero: # wipe key schedule [if any] | ||
| 1758 | movdqa %xmm0, 0x00(%rax) | ||
| 1759 | movdqa %xmm0, 0x10(%rax) | ||
| 1760 | lea 0x20(%rax), %rax | ||
| 1761 | cmp %rax, %rbp | ||
| 1762 | ja .Lcbc_dec_bzero | ||
| 1763 | |||
| 1764 | lea (%rbp),%rsp # restore %rsp | ||
| 1765 | ___ | ||
| 1766 | $code.=<<___ if ($win64); | ||
| 1767 | movaps 0x40(%rbp), %xmm6 | ||
| 1768 | movaps 0x50(%rbp), %xmm7 | ||
| 1769 | movaps 0x60(%rbp), %xmm8 | ||
| 1770 | movaps 0x70(%rbp), %xmm9 | ||
| 1771 | movaps 0x80(%rbp), %xmm10 | ||
| 1772 | movaps 0x90(%rbp), %xmm11 | ||
| 1773 | movaps 0xa0(%rbp), %xmm12 | ||
| 1774 | movaps 0xb0(%rbp), %xmm13 | ||
| 1775 | movaps 0xc0(%rbp), %xmm14 | ||
| 1776 | movaps 0xd0(%rbp), %xmm15 | ||
| 1777 | lea 0xa0(%rbp), %rsp | ||
| 1778 | ___ | ||
| 1779 | $code.=<<___; | ||
| 1780 | mov 0x48(%rsp), %r15 | ||
| 1781 | mov 0x50(%rsp), %r14 | ||
| 1782 | mov 0x58(%rsp), %r13 | ||
| 1783 | mov 0x60(%rsp), %r12 | ||
| 1784 | mov 0x68(%rsp), %rbx | ||
| 1785 | mov 0x70(%rsp), %rax | ||
| 1786 | lea 0x78(%rsp), %rsp | ||
| 1787 | mov %rax, %rbp | ||
| 1788 | .Lcbc_dec_epilogue: | ||
| 1789 | ret | ||
| 1790 | .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt | ||
| 1791 | |||
| 1792 | .globl bsaes_ctr32_encrypt_blocks | ||
| 1793 | .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent | ||
| 1794 | .align 16 | ||
| 1795 | bsaes_ctr32_encrypt_blocks: | ||
| 1796 | mov %rsp, %rax | ||
| 1797 | .Lctr_enc_prologue: | ||
| 1798 | push %rbp | ||
| 1799 | push %rbx | ||
| 1800 | push %r12 | ||
| 1801 | push %r13 | ||
| 1802 | push %r14 | ||
| 1803 | push %r15 | ||
| 1804 | lea -0x48(%rsp), %rsp | ||
| 1805 | ___ | ||
| 1806 | $code.=<<___ if ($win64); | ||
| 1807 | mov 0xa0(%rsp),$arg5 # pull ivp | ||
| 1808 | lea -0xa0(%rsp), %rsp | ||
| 1809 | movaps %xmm6, 0x40(%rsp) | ||
| 1810 | movaps %xmm7, 0x50(%rsp) | ||
| 1811 | movaps %xmm8, 0x60(%rsp) | ||
| 1812 | movaps %xmm9, 0x70(%rsp) | ||
| 1813 | movaps %xmm10, 0x80(%rsp) | ||
| 1814 | movaps %xmm11, 0x90(%rsp) | ||
| 1815 | movaps %xmm12, 0xa0(%rsp) | ||
| 1816 | movaps %xmm13, 0xb0(%rsp) | ||
| 1817 | movaps %xmm14, 0xc0(%rsp) | ||
| 1818 | movaps %xmm15, 0xd0(%rsp) | ||
| 1819 | .Lctr_enc_body: | ||
| 1820 | ___ | ||
| 1821 | $code.=<<___; | ||
| 1822 | mov %rsp, %rbp # backup %rsp | ||
| 1823 | movdqu ($arg5), %xmm0 # load counter | ||
| 1824 | mov 240($arg4), %eax # rounds | ||
| 1825 | mov $arg1, $inp # backup arguments | ||
| 1826 | mov $arg2, $out | ||
| 1827 | mov $arg3, $len | ||
| 1828 | mov $arg4, $key | ||
| 1829 | movdqa %xmm0, 0x20(%rbp) # copy counter | ||
| 1830 | cmp \$8, $arg3 | ||
| 1831 | jb .Lctr_enc_short | ||
| 1832 | |||
| 1833 | mov %eax, %ebx # rounds | ||
| 1834 | shl \$7, %rax # 128 bytes per inner round key | ||
| 1835 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 1836 | sub %rax, %rsp | ||
| 1837 | |||
| 1838 | mov %rsp, %rax # pass key schedule | ||
| 1839 | mov $key, %rcx # pass key | ||
| 1840 | mov %ebx, %r10d # pass rounds | ||
| 1841 | call _bsaes_key_convert | ||
| 1842 | pxor %xmm6,%xmm7 # fix up last round key | ||
| 1843 | movdqa %xmm7,(%rax) # save last round key | ||
| 1844 | |||
| 1845 | movdqa (%rsp), @XMM[9] # load round0 key | ||
| 1846 | lea .LADD1(%rip), %r11 | ||
| 1847 | movdqa 0x20(%rbp), @XMM[0] # counter copy | ||
| 1848 | movdqa -0x20(%r11), @XMM[8] # .LSWPUP | ||
| 1849 | pshufb @XMM[8], @XMM[9] # byte swap upper part | ||
| 1850 | pshufb @XMM[8], @XMM[0] | ||
| 1851 | movdqa @XMM[9], (%rsp) # save adjusted round0 key | ||
| 1852 | jmp .Lctr_enc_loop | ||
| 1853 | .align 16 | ||
| 1854 | .Lctr_enc_loop: | ||
| 1855 | movdqa @XMM[0], 0x20(%rbp) # save counter | ||
| 1856 | movdqa @XMM[0], @XMM[1] # prepare 8 counter values | ||
| 1857 | movdqa @XMM[0], @XMM[2] | ||
| 1858 | paddd 0x00(%r11), @XMM[1] # .LADD1 | ||
| 1859 | movdqa @XMM[0], @XMM[3] | ||
| 1860 | paddd 0x10(%r11), @XMM[2] # .LADD2 | ||
| 1861 | movdqa @XMM[0], @XMM[4] | ||
| 1862 | paddd 0x20(%r11), @XMM[3] # .LADD3 | ||
| 1863 | movdqa @XMM[0], @XMM[5] | ||
| 1864 | paddd 0x30(%r11), @XMM[4] # .LADD4 | ||
| 1865 | movdqa @XMM[0], @XMM[6] | ||
| 1866 | paddd 0x40(%r11), @XMM[5] # .LADD5 | ||
| 1867 | movdqa @XMM[0], @XMM[7] | ||
| 1868 | paddd 0x50(%r11), @XMM[6] # .LADD6 | ||
| 1869 | paddd 0x60(%r11), @XMM[7] # .LADD7 | ||
| 1870 | |||
| 1871 | # Borrow prologue from _bsaes_encrypt8 to use the opportunity | ||
| 1872 | # to flip byte order in 32-bit counter | ||
| 1873 | movdqa (%rsp), @XMM[9] # round 0 key | ||
| 1874 | lea 0x10(%rsp), %rax # pass key schedule | ||
| 1875 | movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR | ||
| 1876 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
| 1877 | pxor @XMM[9], @XMM[1] | ||
| 1878 | pshufb @XMM[8], @XMM[0] | ||
| 1879 | pxor @XMM[9], @XMM[2] | ||
| 1880 | pshufb @XMM[8], @XMM[1] | ||
| 1881 | pxor @XMM[9], @XMM[3] | ||
| 1882 | pshufb @XMM[8], @XMM[2] | ||
| 1883 | pxor @XMM[9], @XMM[4] | ||
| 1884 | pshufb @XMM[8], @XMM[3] | ||
| 1885 | pxor @XMM[9], @XMM[5] | ||
| 1886 | pshufb @XMM[8], @XMM[4] | ||
| 1887 | pxor @XMM[9], @XMM[6] | ||
| 1888 | pshufb @XMM[8], @XMM[5] | ||
| 1889 | pxor @XMM[9], @XMM[7] | ||
| 1890 | pshufb @XMM[8], @XMM[6] | ||
| 1891 | lea .LBS0(%rip), %r11 # constants table | ||
| 1892 | pshufb @XMM[8], @XMM[7] | ||
| 1893 | mov %ebx,%r10d # pass rounds | ||
| 1894 | |||
| 1895 | call _bsaes_encrypt8_bitslice | ||
| 1896 | |||
| 1897 | sub \$8,$len | ||
| 1898 | jc .Lctr_enc_loop_done | ||
| 1899 | |||
| 1900 | movdqu 0x00($inp), @XMM[8] # load input | ||
| 1901 | movdqu 0x10($inp), @XMM[9] | ||
| 1902 | movdqu 0x20($inp), @XMM[10] | ||
| 1903 | movdqu 0x30($inp), @XMM[11] | ||
| 1904 | movdqu 0x40($inp), @XMM[12] | ||
| 1905 | movdqu 0x50($inp), @XMM[13] | ||
| 1906 | movdqu 0x60($inp), @XMM[14] | ||
| 1907 | movdqu 0x70($inp), @XMM[15] | ||
| 1908 | lea 0x80($inp),$inp | ||
| 1909 | pxor @XMM[0], @XMM[8] | ||
| 1910 | movdqa 0x20(%rbp), @XMM[0] # load counter | ||
| 1911 | pxor @XMM[9], @XMM[1] | ||
| 1912 | movdqu @XMM[8], 0x00($out) # write output | ||
| 1913 | pxor @XMM[10], @XMM[4] | ||
| 1914 | movdqu @XMM[1], 0x10($out) | ||
| 1915 | pxor @XMM[11], @XMM[6] | ||
| 1916 | movdqu @XMM[4], 0x20($out) | ||
| 1917 | pxor @XMM[12], @XMM[3] | ||
| 1918 | movdqu @XMM[6], 0x30($out) | ||
| 1919 | pxor @XMM[13], @XMM[7] | ||
| 1920 | movdqu @XMM[3], 0x40($out) | ||
| 1921 | pxor @XMM[14], @XMM[2] | ||
| 1922 | movdqu @XMM[7], 0x50($out) | ||
| 1923 | pxor @XMM[15], @XMM[5] | ||
| 1924 | movdqu @XMM[2], 0x60($out) | ||
| 1925 | lea .LADD1(%rip), %r11 | ||
| 1926 | movdqu @XMM[5], 0x70($out) | ||
| 1927 | lea 0x80($out), $out | ||
| 1928 | paddd 0x70(%r11), @XMM[0] # .LADD8 | ||
| 1929 | jnz .Lctr_enc_loop | ||
| 1930 | |||
| 1931 | jmp .Lctr_enc_done | ||
| 1932 | .align 16 | ||
| 1933 | .Lctr_enc_loop_done: | ||
| 1934 | add \$8, $len | ||
| 1935 | movdqu 0x00($inp), @XMM[8] # load input | ||
| 1936 | pxor @XMM[8], @XMM[0] | ||
| 1937 | movdqu @XMM[0], 0x00($out) # write output | ||
| 1938 | cmp \$2,$len | ||
| 1939 | jb .Lctr_enc_done | ||
| 1940 | movdqu 0x10($inp), @XMM[9] | ||
| 1941 | pxor @XMM[9], @XMM[1] | ||
| 1942 | movdqu @XMM[1], 0x10($out) | ||
| 1943 | je .Lctr_enc_done | ||
| 1944 | movdqu 0x20($inp), @XMM[10] | ||
| 1945 | pxor @XMM[10], @XMM[4] | ||
| 1946 | movdqu @XMM[4], 0x20($out) | ||
| 1947 | cmp \$4,$len | ||
| 1948 | jb .Lctr_enc_done | ||
| 1949 | movdqu 0x30($inp), @XMM[11] | ||
| 1950 | pxor @XMM[11], @XMM[6] | ||
| 1951 | movdqu @XMM[6], 0x30($out) | ||
| 1952 | je .Lctr_enc_done | ||
| 1953 | movdqu 0x40($inp), @XMM[12] | ||
| 1954 | pxor @XMM[12], @XMM[3] | ||
| 1955 | movdqu @XMM[3], 0x40($out) | ||
| 1956 | cmp \$6,$len | ||
| 1957 | jb .Lctr_enc_done | ||
| 1958 | movdqu 0x50($inp), @XMM[13] | ||
| 1959 | pxor @XMM[13], @XMM[7] | ||
| 1960 | movdqu @XMM[7], 0x50($out) | ||
| 1961 | je .Lctr_enc_done | ||
| 1962 | movdqu 0x60($inp), @XMM[14] | ||
| 1963 | pxor @XMM[14], @XMM[2] | ||
| 1964 | movdqu @XMM[2], 0x60($out) | ||
| 1965 | jmp .Lctr_enc_done | ||
| 1966 | |||
| 1967 | .align 16 | ||
| 1968 | .Lctr_enc_short: | ||
| 1969 | lea 0x20(%rbp), $arg1 | ||
| 1970 | lea 0x30(%rbp), $arg2 | ||
| 1971 | lea ($key), $arg3 | ||
| 1972 | call asm_AES_encrypt | ||
| 1973 | movdqu ($inp), @XMM[1] | ||
| 1974 | lea 16($inp), $inp | ||
| 1975 | mov 0x2c(%rbp), %eax # load 32-bit counter | ||
| 1976 | bswap %eax | ||
| 1977 | pxor 0x30(%rbp), @XMM[1] | ||
| 1978 | inc %eax # increment | ||
| 1979 | movdqu @XMM[1], ($out) | ||
| 1980 | bswap %eax | ||
| 1981 | lea 16($out), $out | ||
| 1982 | mov %eax, 0x2c(%rsp) # save 32-bit counter | ||
| 1983 | dec $len | ||
| 1984 | jnz .Lctr_enc_short | ||
| 1985 | |||
| 1986 | .Lctr_enc_done: | ||
| 1987 | lea (%rsp), %rax | ||
| 1988 | pxor %xmm0, %xmm0 | ||
| 1989 | .Lctr_enc_bzero: # wipe key schedule [if any] | ||
| 1990 | movdqa %xmm0, 0x00(%rax) | ||
| 1991 | movdqa %xmm0, 0x10(%rax) | ||
| 1992 | lea 0x20(%rax), %rax | ||
| 1993 | cmp %rax, %rbp | ||
| 1994 | ja .Lctr_enc_bzero | ||
| 1995 | |||
| 1996 | lea (%rbp),%rsp # restore %rsp | ||
| 1997 | ___ | ||
| 1998 | $code.=<<___ if ($win64); | ||
| 1999 | movaps 0x40(%rbp), %xmm6 | ||
| 2000 | movaps 0x50(%rbp), %xmm7 | ||
| 2001 | movaps 0x60(%rbp), %xmm8 | ||
| 2002 | movaps 0x70(%rbp), %xmm9 | ||
| 2003 | movaps 0x80(%rbp), %xmm10 | ||
| 2004 | movaps 0x90(%rbp), %xmm11 | ||
| 2005 | movaps 0xa0(%rbp), %xmm12 | ||
| 2006 | movaps 0xb0(%rbp), %xmm13 | ||
| 2007 | movaps 0xc0(%rbp), %xmm14 | ||
| 2008 | movaps 0xd0(%rbp), %xmm15 | ||
| 2009 | lea 0xa0(%rbp), %rsp | ||
| 2010 | ___ | ||
| 2011 | $code.=<<___; | ||
| 2012 | mov 0x48(%rsp), %r15 | ||
| 2013 | mov 0x50(%rsp), %r14 | ||
| 2014 | mov 0x58(%rsp), %r13 | ||
| 2015 | mov 0x60(%rsp), %r12 | ||
| 2016 | mov 0x68(%rsp), %rbx | ||
| 2017 | mov 0x70(%rsp), %rax | ||
| 2018 | lea 0x78(%rsp), %rsp | ||
| 2019 | mov %rax, %rbp | ||
| 2020 | .Lctr_enc_epilogue: | ||
| 2021 | ret | ||
| 2022 | .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks | ||
| 2023 | ___ | ||
| 2024 | ###################################################################### | ||
| 2025 | # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
| 2026 | # const AES_KEY *key1, const AES_KEY *key2, | ||
| 2027 | # const unsigned char iv[16]); | ||
| 2028 | # | ||
| 2029 | my ($twmask,$twres,$twtmp)=@XMM[13..15]; | ||
| 2030 | $code.=<<___; | ||
| 2031 | .globl bsaes_xts_encrypt | ||
| 2032 | .type bsaes_xts_encrypt,\@abi-omnipotent | ||
| 2033 | .align 16 | ||
| 2034 | bsaes_xts_encrypt: | ||
| 2035 | mov %rsp, %rax | ||
| 2036 | .Lxts_enc_prologue: | ||
| 2037 | push %rbp | ||
| 2038 | push %rbx | ||
| 2039 | push %r12 | ||
| 2040 | push %r13 | ||
| 2041 | push %r14 | ||
| 2042 | push %r15 | ||
| 2043 | lea -0x48(%rsp), %rsp | ||
| 2044 | ___ | ||
| 2045 | $code.=<<___ if ($win64); | ||
| 2046 | mov 0xa0(%rsp),$arg5 # pull key2 | ||
| 2047 | mov 0xa8(%rsp),$arg6 # pull ivp | ||
| 2048 | lea -0xa0(%rsp), %rsp | ||
| 2049 | movaps %xmm6, 0x40(%rsp) | ||
| 2050 | movaps %xmm7, 0x50(%rsp) | ||
| 2051 | movaps %xmm8, 0x60(%rsp) | ||
| 2052 | movaps %xmm9, 0x70(%rsp) | ||
| 2053 | movaps %xmm10, 0x80(%rsp) | ||
| 2054 | movaps %xmm11, 0x90(%rsp) | ||
| 2055 | movaps %xmm12, 0xa0(%rsp) | ||
| 2056 | movaps %xmm13, 0xb0(%rsp) | ||
| 2057 | movaps %xmm14, 0xc0(%rsp) | ||
| 2058 | movaps %xmm15, 0xd0(%rsp) | ||
| 2059 | .Lxts_enc_body: | ||
| 2060 | ___ | ||
| 2061 | $code.=<<___; | ||
| 2062 | mov %rsp, %rbp # backup %rsp | ||
| 2063 | mov $arg1, $inp # backup arguments | ||
| 2064 | mov $arg2, $out | ||
| 2065 | mov $arg3, $len | ||
| 2066 | mov $arg4, $key | ||
| 2067 | |||
| 2068 | lea ($arg6), $arg1 | ||
| 2069 | lea 0x20(%rbp), $arg2 | ||
| 2070 | lea ($arg5), $arg3 | ||
| 2071 | call asm_AES_encrypt # generate initial tweak | ||
| 2072 | |||
| 2073 | mov 240($key), %eax # rounds | ||
| 2074 | mov $len, %rbx # backup $len | ||
| 2075 | |||
| 2076 | mov %eax, %edx # rounds | ||
| 2077 | shl \$7, %rax # 128 bytes per inner round key | ||
| 2078 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 2079 | sub %rax, %rsp | ||
| 2080 | |||
| 2081 | mov %rsp, %rax # pass key schedule | ||
| 2082 | mov $key, %rcx # pass key | ||
| 2083 | mov %edx, %r10d # pass rounds | ||
| 2084 | call _bsaes_key_convert | ||
| 2085 | pxor %xmm6, %xmm7 # fix up last round key | ||
| 2086 | movdqa %xmm7, (%rax) # save last round key | ||
| 2087 | |||
| 2088 | and \$-16, $len | ||
| 2089 | sub \$0x80, %rsp # place for tweak[8] | ||
| 2090 | movdqa 0x20(%rbp), @XMM[7] # initial tweak | ||
| 2091 | |||
| 2092 | pxor $twtmp, $twtmp | ||
| 2093 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2094 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2095 | |||
| 2096 | sub \$0x80, $len | ||
| 2097 | jc .Lxts_enc_short | ||
| 2098 | jmp .Lxts_enc_loop | ||
| 2099 | |||
| 2100 | .align 16 | ||
| 2101 | .Lxts_enc_loop: | ||
| 2102 | ___ | ||
| 2103 | for ($i=0;$i<7;$i++) { | ||
| 2104 | $code.=<<___; | ||
| 2105 | pshufd \$0x13, $twtmp, $twres | ||
| 2106 | pxor $twtmp, $twtmp | ||
| 2107 | movdqa @XMM[7], @XMM[$i] | ||
| 2108 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2109 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2110 | pand $twmask, $twres # isolate carry and residue | ||
| 2111 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2112 | pxor $twres, @XMM[7] | ||
| 2113 | ___ | ||
| 2114 | $code.=<<___ if ($i>=1); | ||
| 2115 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2116 | ___ | ||
| 2117 | $code.=<<___ if ($i>=2); | ||
| 2118 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2119 | ___ | ||
| 2120 | } | ||
| 2121 | $code.=<<___; | ||
| 2122 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2123 | pxor @XMM[8+5], @XMM[5] | ||
| 2124 | movdqu 0x70($inp), @XMM[8+7] | ||
| 2125 | lea 0x80($inp), $inp | ||
| 2126 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2127 | pxor @XMM[8+6], @XMM[6] | ||
| 2128 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2129 | pxor @XMM[8+7], @XMM[7] | ||
| 2130 | mov %edx, %r10d # pass rounds | ||
| 2131 | |||
| 2132 | call _bsaes_encrypt8 | ||
| 2133 | |||
| 2134 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2135 | pxor 0x10(%rsp), @XMM[1] | ||
| 2136 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2137 | pxor 0x20(%rsp), @XMM[4] | ||
| 2138 | movdqu @XMM[1], 0x10($out) | ||
| 2139 | pxor 0x30(%rsp), @XMM[6] | ||
| 2140 | movdqu @XMM[4], 0x20($out) | ||
| 2141 | pxor 0x40(%rsp), @XMM[3] | ||
| 2142 | movdqu @XMM[6], 0x30($out) | ||
| 2143 | pxor 0x50(%rsp), @XMM[7] | ||
| 2144 | movdqu @XMM[3], 0x40($out) | ||
| 2145 | pxor 0x60(%rsp), @XMM[2] | ||
| 2146 | movdqu @XMM[7], 0x50($out) | ||
| 2147 | pxor 0x70(%rsp), @XMM[5] | ||
| 2148 | movdqu @XMM[2], 0x60($out) | ||
| 2149 | movdqu @XMM[5], 0x70($out) | ||
| 2150 | lea 0x80($out), $out | ||
| 2151 | |||
| 2152 | movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak | ||
| 2153 | pxor $twtmp, $twtmp | ||
| 2154 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2155 | pcmpgtd @XMM[7], $twtmp | ||
| 2156 | pshufd \$0x13, $twtmp, $twres | ||
| 2157 | pxor $twtmp, $twtmp | ||
| 2158 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2159 | pand $twmask, $twres # isolate carry and residue | ||
| 2160 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2161 | pxor $twres, @XMM[7] | ||
| 2162 | |||
| 2163 | sub \$0x80,$len | ||
| 2164 | jnc .Lxts_enc_loop | ||
| 2165 | |||
| 2166 | .Lxts_enc_short: | ||
| 2167 | add \$0x80, $len | ||
| 2168 | jz .Lxts_enc_done | ||
| 2169 | ___ | ||
| 2170 | for ($i=0;$i<7;$i++) { | ||
| 2171 | $code.=<<___; | ||
| 2172 | pshufd \$0x13, $twtmp, $twres | ||
| 2173 | pxor $twtmp, $twtmp | ||
| 2174 | movdqa @XMM[7], @XMM[$i] | ||
| 2175 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2176 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2177 | pand $twmask, $twres # isolate carry and residue | ||
| 2178 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2179 | pxor $twres, @XMM[7] | ||
| 2180 | ___ | ||
| 2181 | $code.=<<___ if ($i>=1); | ||
| 2182 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2183 | cmp \$`0x10*$i`,$len | ||
| 2184 | je .Lxts_enc_$i | ||
| 2185 | ___ | ||
| 2186 | $code.=<<___ if ($i>=2); | ||
| 2187 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2188 | ___ | ||
| 2189 | } | ||
| 2190 | $code.=<<___; | ||
| 2191 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2192 | pxor @XMM[8+5], @XMM[5] | ||
| 2193 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2194 | lea 0x70($inp), $inp | ||
| 2195 | pxor @XMM[8+6], @XMM[6] | ||
| 2196 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2197 | mov %edx, %r10d # pass rounds | ||
| 2198 | |||
| 2199 | call _bsaes_encrypt8 | ||
| 2200 | |||
| 2201 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2202 | pxor 0x10(%rsp), @XMM[1] | ||
| 2203 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2204 | pxor 0x20(%rsp), @XMM[4] | ||
| 2205 | movdqu @XMM[1], 0x10($out) | ||
| 2206 | pxor 0x30(%rsp), @XMM[6] | ||
| 2207 | movdqu @XMM[4], 0x20($out) | ||
| 2208 | pxor 0x40(%rsp), @XMM[3] | ||
| 2209 | movdqu @XMM[6], 0x30($out) | ||
| 2210 | pxor 0x50(%rsp), @XMM[7] | ||
| 2211 | movdqu @XMM[3], 0x40($out) | ||
| 2212 | pxor 0x60(%rsp), @XMM[2] | ||
| 2213 | movdqu @XMM[7], 0x50($out) | ||
| 2214 | movdqu @XMM[2], 0x60($out) | ||
| 2215 | lea 0x70($out), $out | ||
| 2216 | |||
| 2217 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
| 2218 | jmp .Lxts_enc_done | ||
| 2219 | .align 16 | ||
| 2220 | .Lxts_enc_6: | ||
| 2221 | pxor @XMM[8+4], @XMM[4] | ||
| 2222 | lea 0x60($inp), $inp | ||
| 2223 | pxor @XMM[8+5], @XMM[5] | ||
| 2224 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2225 | mov %edx, %r10d # pass rounds | ||
| 2226 | |||
| 2227 | call _bsaes_encrypt8 | ||
| 2228 | |||
| 2229 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2230 | pxor 0x10(%rsp), @XMM[1] | ||
| 2231 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2232 | pxor 0x20(%rsp), @XMM[4] | ||
| 2233 | movdqu @XMM[1], 0x10($out) | ||
| 2234 | pxor 0x30(%rsp), @XMM[6] | ||
| 2235 | movdqu @XMM[4], 0x20($out) | ||
| 2236 | pxor 0x40(%rsp), @XMM[3] | ||
| 2237 | movdqu @XMM[6], 0x30($out) | ||
| 2238 | pxor 0x50(%rsp), @XMM[7] | ||
| 2239 | movdqu @XMM[3], 0x40($out) | ||
| 2240 | movdqu @XMM[7], 0x50($out) | ||
| 2241 | lea 0x60($out), $out | ||
| 2242 | |||
| 2243 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
| 2244 | jmp .Lxts_enc_done | ||
| 2245 | .align 16 | ||
| 2246 | .Lxts_enc_5: | ||
| 2247 | pxor @XMM[8+3], @XMM[3] | ||
| 2248 | lea 0x50($inp), $inp | ||
| 2249 | pxor @XMM[8+4], @XMM[4] | ||
| 2250 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2251 | mov %edx, %r10d # pass rounds | ||
| 2252 | |||
| 2253 | call _bsaes_encrypt8 | ||
| 2254 | |||
| 2255 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2256 | pxor 0x10(%rsp), @XMM[1] | ||
| 2257 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2258 | pxor 0x20(%rsp), @XMM[4] | ||
| 2259 | movdqu @XMM[1], 0x10($out) | ||
| 2260 | pxor 0x30(%rsp), @XMM[6] | ||
| 2261 | movdqu @XMM[4], 0x20($out) | ||
| 2262 | pxor 0x40(%rsp), @XMM[3] | ||
| 2263 | movdqu @XMM[6], 0x30($out) | ||
| 2264 | movdqu @XMM[3], 0x40($out) | ||
| 2265 | lea 0x50($out), $out | ||
| 2266 | |||
| 2267 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
| 2268 | jmp .Lxts_enc_done | ||
| 2269 | .align 16 | ||
| 2270 | .Lxts_enc_4: | ||
| 2271 | pxor @XMM[8+2], @XMM[2] | ||
| 2272 | lea 0x40($inp), $inp | ||
| 2273 | pxor @XMM[8+3], @XMM[3] | ||
| 2274 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2275 | mov %edx, %r10d # pass rounds | ||
| 2276 | |||
| 2277 | call _bsaes_encrypt8 | ||
| 2278 | |||
| 2279 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2280 | pxor 0x10(%rsp), @XMM[1] | ||
| 2281 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2282 | pxor 0x20(%rsp), @XMM[4] | ||
| 2283 | movdqu @XMM[1], 0x10($out) | ||
| 2284 | pxor 0x30(%rsp), @XMM[6] | ||
| 2285 | movdqu @XMM[4], 0x20($out) | ||
| 2286 | movdqu @XMM[6], 0x30($out) | ||
| 2287 | lea 0x40($out), $out | ||
| 2288 | |||
| 2289 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
| 2290 | jmp .Lxts_enc_done | ||
| 2291 | .align 16 | ||
| 2292 | .Lxts_enc_3: | ||
| 2293 | pxor @XMM[8+1], @XMM[1] | ||
| 2294 | lea 0x30($inp), $inp | ||
| 2295 | pxor @XMM[8+2], @XMM[2] | ||
| 2296 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2297 | mov %edx, %r10d # pass rounds | ||
| 2298 | |||
| 2299 | call _bsaes_encrypt8 | ||
| 2300 | |||
| 2301 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2302 | pxor 0x10(%rsp), @XMM[1] | ||
| 2303 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2304 | pxor 0x20(%rsp), @XMM[4] | ||
| 2305 | movdqu @XMM[1], 0x10($out) | ||
| 2306 | movdqu @XMM[4], 0x20($out) | ||
| 2307 | lea 0x30($out), $out | ||
| 2308 | |||
| 2309 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
| 2310 | jmp .Lxts_enc_done | ||
| 2311 | .align 16 | ||
| 2312 | .Lxts_enc_2: | ||
| 2313 | pxor @XMM[8+0], @XMM[0] | ||
| 2314 | lea 0x20($inp), $inp | ||
| 2315 | pxor @XMM[8+1], @XMM[1] | ||
| 2316 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2317 | mov %edx, %r10d # pass rounds | ||
| 2318 | |||
| 2319 | call _bsaes_encrypt8 | ||
| 2320 | |||
| 2321 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2322 | pxor 0x10(%rsp), @XMM[1] | ||
| 2323 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2324 | movdqu @XMM[1], 0x10($out) | ||
| 2325 | lea 0x20($out), $out | ||
| 2326 | |||
| 2327 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
| 2328 | jmp .Lxts_enc_done | ||
| 2329 | .align 16 | ||
| 2330 | .Lxts_enc_1: | ||
| 2331 | pxor @XMM[0], @XMM[8] | ||
| 2332 | lea 0x10($inp), $inp | ||
| 2333 | movdqa @XMM[8], 0x20(%rbp) | ||
| 2334 | lea 0x20(%rbp), $arg1 | ||
| 2335 | lea 0x20(%rbp), $arg2 | ||
| 2336 | lea ($key), $arg3 | ||
| 2337 | call asm_AES_encrypt # doesn't touch %xmm | ||
| 2338 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
| 2339 | #pxor @XMM[8], @XMM[0] | ||
| 2340 | #lea 0x80(%rsp), %rax # pass key schedule | ||
| 2341 | #mov %edx, %r10d # pass rounds | ||
| 2342 | #call _bsaes_encrypt8 | ||
| 2343 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2344 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2345 | lea 0x10($out), $out | ||
| 2346 | |||
| 2347 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
| 2348 | |||
| 2349 | .Lxts_enc_done: | ||
| 2350 | and \$15, %ebx | ||
| 2351 | jz .Lxts_enc_ret | ||
| 2352 | mov $out, %rdx | ||
| 2353 | |||
| 2354 | .Lxts_enc_steal: | ||
| 2355 | movzb ($inp), %eax | ||
| 2356 | movzb -16(%rdx), %ecx | ||
| 2357 | lea 1($inp), $inp | ||
| 2358 | mov %al, -16(%rdx) | ||
| 2359 | mov %cl, 0(%rdx) | ||
| 2360 | lea 1(%rdx), %rdx | ||
| 2361 | sub \$1,%ebx | ||
| 2362 | jnz .Lxts_enc_steal | ||
| 2363 | |||
| 2364 | movdqu -16($out), @XMM[0] | ||
| 2365 | lea 0x20(%rbp), $arg1 | ||
| 2366 | pxor @XMM[7], @XMM[0] | ||
| 2367 | lea 0x20(%rbp), $arg2 | ||
| 2368 | movdqa @XMM[0], 0x20(%rbp) | ||
| 2369 | lea ($key), $arg3 | ||
| 2370 | call asm_AES_encrypt # doesn't touch %xmm | ||
| 2371 | pxor 0x20(%rbp), @XMM[7] | ||
| 2372 | movdqu @XMM[7], -16($out) | ||
| 2373 | |||
| 2374 | .Lxts_enc_ret: | ||
| 2375 | lea (%rsp), %rax | ||
| 2376 | pxor %xmm0, %xmm0 | ||
| 2377 | .Lxts_enc_bzero: # wipe key schedule [if any] | ||
| 2378 | movdqa %xmm0, 0x00(%rax) | ||
| 2379 | movdqa %xmm0, 0x10(%rax) | ||
| 2380 | lea 0x20(%rax), %rax | ||
| 2381 | cmp %rax, %rbp | ||
| 2382 | ja .Lxts_enc_bzero | ||
| 2383 | |||
| 2384 | lea (%rbp),%rsp # restore %rsp | ||
| 2385 | ___ | ||
| 2386 | $code.=<<___ if ($win64); | ||
| 2387 | movaps 0x40(%rbp), %xmm6 | ||
| 2388 | movaps 0x50(%rbp), %xmm7 | ||
| 2389 | movaps 0x60(%rbp), %xmm8 | ||
| 2390 | movaps 0x70(%rbp), %xmm9 | ||
| 2391 | movaps 0x80(%rbp), %xmm10 | ||
| 2392 | movaps 0x90(%rbp), %xmm11 | ||
| 2393 | movaps 0xa0(%rbp), %xmm12 | ||
| 2394 | movaps 0xb0(%rbp), %xmm13 | ||
| 2395 | movaps 0xc0(%rbp), %xmm14 | ||
| 2396 | movaps 0xd0(%rbp), %xmm15 | ||
| 2397 | lea 0xa0(%rbp), %rsp | ||
| 2398 | ___ | ||
| 2399 | $code.=<<___; | ||
| 2400 | mov 0x48(%rsp), %r15 | ||
| 2401 | mov 0x50(%rsp), %r14 | ||
| 2402 | mov 0x58(%rsp), %r13 | ||
| 2403 | mov 0x60(%rsp), %r12 | ||
| 2404 | mov 0x68(%rsp), %rbx | ||
| 2405 | mov 0x70(%rsp), %rax | ||
| 2406 | lea 0x78(%rsp), %rsp | ||
| 2407 | mov %rax, %rbp | ||
| 2408 | .Lxts_enc_epilogue: | ||
| 2409 | ret | ||
| 2410 | .size bsaes_xts_encrypt,.-bsaes_xts_encrypt | ||
| 2411 | |||
| 2412 | .globl bsaes_xts_decrypt | ||
| 2413 | .type bsaes_xts_decrypt,\@abi-omnipotent | ||
| 2414 | .align 16 | ||
| 2415 | bsaes_xts_decrypt: | ||
| 2416 | mov %rsp, %rax | ||
| 2417 | .Lxts_dec_prologue: | ||
| 2418 | push %rbp | ||
| 2419 | push %rbx | ||
| 2420 | push %r12 | ||
| 2421 | push %r13 | ||
| 2422 | push %r14 | ||
| 2423 | push %r15 | ||
| 2424 | lea -0x48(%rsp), %rsp | ||
| 2425 | ___ | ||
| 2426 | $code.=<<___ if ($win64); | ||
| 2427 | mov 0xa0(%rsp),$arg5 # pull key2 | ||
| 2428 | mov 0xa8(%rsp),$arg6 # pull ivp | ||
| 2429 | lea -0xa0(%rsp), %rsp | ||
| 2430 | movaps %xmm6, 0x40(%rsp) | ||
| 2431 | movaps %xmm7, 0x50(%rsp) | ||
| 2432 | movaps %xmm8, 0x60(%rsp) | ||
| 2433 | movaps %xmm9, 0x70(%rsp) | ||
| 2434 | movaps %xmm10, 0x80(%rsp) | ||
| 2435 | movaps %xmm11, 0x90(%rsp) | ||
| 2436 | movaps %xmm12, 0xa0(%rsp) | ||
| 2437 | movaps %xmm13, 0xb0(%rsp) | ||
| 2438 | movaps %xmm14, 0xc0(%rsp) | ||
| 2439 | movaps %xmm15, 0xd0(%rsp) | ||
| 2440 | .Lxts_dec_body: | ||
| 2441 | ___ | ||
| 2442 | $code.=<<___; | ||
| 2443 | mov %rsp, %rbp # backup %rsp | ||
| 2444 | mov $arg1, $inp # backup arguments | ||
| 2445 | mov $arg2, $out | ||
| 2446 | mov $arg3, $len | ||
| 2447 | mov $arg4, $key | ||
| 2448 | |||
| 2449 | lea ($arg6), $arg1 | ||
| 2450 | lea 0x20(%rbp), $arg2 | ||
| 2451 | lea ($arg5), $arg3 | ||
| 2452 | call asm_AES_encrypt # generate initial tweak | ||
| 2453 | |||
| 2454 | mov 240($key), %eax # rounds | ||
| 2455 | mov $len, %rbx # backup $len | ||
| 2456 | |||
| 2457 | mov %eax, %edx # rounds | ||
| 2458 | shl \$7, %rax # 128 bytes per inner round key | ||
| 2459 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
| 2460 | sub %rax, %rsp | ||
| 2461 | |||
| 2462 | mov %rsp, %rax # pass key schedule | ||
| 2463 | mov $key, %rcx # pass key | ||
| 2464 | mov %edx, %r10d # pass rounds | ||
| 2465 | call _bsaes_key_convert | ||
| 2466 | pxor (%rsp), %xmm7 # fix up round 0 key | ||
| 2467 | movdqa %xmm6, (%rax) # save last round key | ||
| 2468 | movdqa %xmm7, (%rsp) | ||
| 2469 | |||
| 2470 | xor %eax, %eax # if ($len%16) len-=16; | ||
| 2471 | and \$-16, $len | ||
| 2472 | test \$15, %ebx | ||
| 2473 | setnz %al | ||
| 2474 | shl \$4, %rax | ||
| 2475 | sub %rax, $len | ||
| 2476 | |||
| 2477 | sub \$0x80, %rsp # place for tweak[8] | ||
| 2478 | movdqa 0x20(%rbp), @XMM[7] # initial tweak | ||
| 2479 | |||
| 2480 | pxor $twtmp, $twtmp | ||
| 2481 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2482 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2483 | |||
| 2484 | sub \$0x80, $len | ||
| 2485 | jc .Lxts_dec_short | ||
| 2486 | jmp .Lxts_dec_loop | ||
| 2487 | |||
| 2488 | .align 16 | ||
| 2489 | .Lxts_dec_loop: | ||
| 2490 | ___ | ||
| 2491 | for ($i=0;$i<7;$i++) { | ||
| 2492 | $code.=<<___; | ||
| 2493 | pshufd \$0x13, $twtmp, $twres | ||
| 2494 | pxor $twtmp, $twtmp | ||
| 2495 | movdqa @XMM[7], @XMM[$i] | ||
| 2496 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2497 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2498 | pand $twmask, $twres # isolate carry and residue | ||
| 2499 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2500 | pxor $twres, @XMM[7] | ||
| 2501 | ___ | ||
| 2502 | $code.=<<___ if ($i>=1); | ||
| 2503 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2504 | ___ | ||
| 2505 | $code.=<<___ if ($i>=2); | ||
| 2506 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2507 | ___ | ||
| 2508 | } | ||
| 2509 | $code.=<<___; | ||
| 2510 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2511 | pxor @XMM[8+5], @XMM[5] | ||
| 2512 | movdqu 0x70($inp), @XMM[8+7] | ||
| 2513 | lea 0x80($inp), $inp | ||
| 2514 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2515 | pxor @XMM[8+6], @XMM[6] | ||
| 2516 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2517 | pxor @XMM[8+7], @XMM[7] | ||
| 2518 | mov %edx, %r10d # pass rounds | ||
| 2519 | |||
| 2520 | call _bsaes_decrypt8 | ||
| 2521 | |||
| 2522 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2523 | pxor 0x10(%rsp), @XMM[1] | ||
| 2524 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2525 | pxor 0x20(%rsp), @XMM[6] | ||
| 2526 | movdqu @XMM[1], 0x10($out) | ||
| 2527 | pxor 0x30(%rsp), @XMM[4] | ||
| 2528 | movdqu @XMM[6], 0x20($out) | ||
| 2529 | pxor 0x40(%rsp), @XMM[2] | ||
| 2530 | movdqu @XMM[4], 0x30($out) | ||
| 2531 | pxor 0x50(%rsp), @XMM[7] | ||
| 2532 | movdqu @XMM[2], 0x40($out) | ||
| 2533 | pxor 0x60(%rsp), @XMM[3] | ||
| 2534 | movdqu @XMM[7], 0x50($out) | ||
| 2535 | pxor 0x70(%rsp), @XMM[5] | ||
| 2536 | movdqu @XMM[3], 0x60($out) | ||
| 2537 | movdqu @XMM[5], 0x70($out) | ||
| 2538 | lea 0x80($out), $out | ||
| 2539 | |||
| 2540 | movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak | ||
| 2541 | pxor $twtmp, $twtmp | ||
| 2542 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2543 | pcmpgtd @XMM[7], $twtmp | ||
| 2544 | pshufd \$0x13, $twtmp, $twres | ||
| 2545 | pxor $twtmp, $twtmp | ||
| 2546 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2547 | pand $twmask, $twres # isolate carry and residue | ||
| 2548 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2549 | pxor $twres, @XMM[7] | ||
| 2550 | |||
| 2551 | sub \$0x80,$len | ||
| 2552 | jnc .Lxts_dec_loop | ||
| 2553 | |||
| 2554 | .Lxts_dec_short: | ||
| 2555 | add \$0x80, $len | ||
| 2556 | jz .Lxts_dec_done | ||
| 2557 | ___ | ||
| 2558 | for ($i=0;$i<7;$i++) { | ||
| 2559 | $code.=<<___; | ||
| 2560 | pshufd \$0x13, $twtmp, $twres | ||
| 2561 | pxor $twtmp, $twtmp | ||
| 2562 | movdqa @XMM[7], @XMM[$i] | ||
| 2563 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
| 2564 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2565 | pand $twmask, $twres # isolate carry and residue | ||
| 2566 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
| 2567 | pxor $twres, @XMM[7] | ||
| 2568 | ___ | ||
| 2569 | $code.=<<___ if ($i>=1); | ||
| 2570 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
| 2571 | cmp \$`0x10*$i`,$len | ||
| 2572 | je .Lxts_dec_$i | ||
| 2573 | ___ | ||
| 2574 | $code.=<<___ if ($i>=2); | ||
| 2575 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
| 2576 | ___ | ||
| 2577 | } | ||
| 2578 | $code.=<<___; | ||
| 2579 | movdqu 0x60($inp), @XMM[8+6] | ||
| 2580 | pxor @XMM[8+5], @XMM[5] | ||
| 2581 | movdqa @XMM[7], 0x70(%rsp) | ||
| 2582 | lea 0x70($inp), $inp | ||
| 2583 | pxor @XMM[8+6], @XMM[6] | ||
| 2584 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2585 | mov %edx, %r10d # pass rounds | ||
| 2586 | |||
| 2587 | call _bsaes_decrypt8 | ||
| 2588 | |||
| 2589 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2590 | pxor 0x10(%rsp), @XMM[1] | ||
| 2591 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2592 | pxor 0x20(%rsp), @XMM[6] | ||
| 2593 | movdqu @XMM[1], 0x10($out) | ||
| 2594 | pxor 0x30(%rsp), @XMM[4] | ||
| 2595 | movdqu @XMM[6], 0x20($out) | ||
| 2596 | pxor 0x40(%rsp), @XMM[2] | ||
| 2597 | movdqu @XMM[4], 0x30($out) | ||
| 2598 | pxor 0x50(%rsp), @XMM[7] | ||
| 2599 | movdqu @XMM[2], 0x40($out) | ||
| 2600 | pxor 0x60(%rsp), @XMM[3] | ||
| 2601 | movdqu @XMM[7], 0x50($out) | ||
| 2602 | movdqu @XMM[3], 0x60($out) | ||
| 2603 | lea 0x70($out), $out | ||
| 2604 | |||
| 2605 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
| 2606 | jmp .Lxts_dec_done | ||
| 2607 | .align 16 | ||
| 2608 | .Lxts_dec_6: | ||
| 2609 | pxor @XMM[8+4], @XMM[4] | ||
| 2610 | lea 0x60($inp), $inp | ||
| 2611 | pxor @XMM[8+5], @XMM[5] | ||
| 2612 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2613 | mov %edx, %r10d # pass rounds | ||
| 2614 | |||
| 2615 | call _bsaes_decrypt8 | ||
| 2616 | |||
| 2617 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2618 | pxor 0x10(%rsp), @XMM[1] | ||
| 2619 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2620 | pxor 0x20(%rsp), @XMM[6] | ||
| 2621 | movdqu @XMM[1], 0x10($out) | ||
| 2622 | pxor 0x30(%rsp), @XMM[4] | ||
| 2623 | movdqu @XMM[6], 0x20($out) | ||
| 2624 | pxor 0x40(%rsp), @XMM[2] | ||
| 2625 | movdqu @XMM[4], 0x30($out) | ||
| 2626 | pxor 0x50(%rsp), @XMM[7] | ||
| 2627 | movdqu @XMM[2], 0x40($out) | ||
| 2628 | movdqu @XMM[7], 0x50($out) | ||
| 2629 | lea 0x60($out), $out | ||
| 2630 | |||
| 2631 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
| 2632 | jmp .Lxts_dec_done | ||
| 2633 | .align 16 | ||
| 2634 | .Lxts_dec_5: | ||
| 2635 | pxor @XMM[8+3], @XMM[3] | ||
| 2636 | lea 0x50($inp), $inp | ||
| 2637 | pxor @XMM[8+4], @XMM[4] | ||
| 2638 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2639 | mov %edx, %r10d # pass rounds | ||
| 2640 | |||
| 2641 | call _bsaes_decrypt8 | ||
| 2642 | |||
| 2643 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2644 | pxor 0x10(%rsp), @XMM[1] | ||
| 2645 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2646 | pxor 0x20(%rsp), @XMM[6] | ||
| 2647 | movdqu @XMM[1], 0x10($out) | ||
| 2648 | pxor 0x30(%rsp), @XMM[4] | ||
| 2649 | movdqu @XMM[6], 0x20($out) | ||
| 2650 | pxor 0x40(%rsp), @XMM[2] | ||
| 2651 | movdqu @XMM[4], 0x30($out) | ||
| 2652 | movdqu @XMM[2], 0x40($out) | ||
| 2653 | lea 0x50($out), $out | ||
| 2654 | |||
| 2655 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
| 2656 | jmp .Lxts_dec_done | ||
| 2657 | .align 16 | ||
| 2658 | .Lxts_dec_4: | ||
| 2659 | pxor @XMM[8+2], @XMM[2] | ||
| 2660 | lea 0x40($inp), $inp | ||
| 2661 | pxor @XMM[8+3], @XMM[3] | ||
| 2662 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2663 | mov %edx, %r10d # pass rounds | ||
| 2664 | |||
| 2665 | call _bsaes_decrypt8 | ||
| 2666 | |||
| 2667 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2668 | pxor 0x10(%rsp), @XMM[1] | ||
| 2669 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2670 | pxor 0x20(%rsp), @XMM[6] | ||
| 2671 | movdqu @XMM[1], 0x10($out) | ||
| 2672 | pxor 0x30(%rsp), @XMM[4] | ||
| 2673 | movdqu @XMM[6], 0x20($out) | ||
| 2674 | movdqu @XMM[4], 0x30($out) | ||
| 2675 | lea 0x40($out), $out | ||
| 2676 | |||
| 2677 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
| 2678 | jmp .Lxts_dec_done | ||
| 2679 | .align 16 | ||
| 2680 | .Lxts_dec_3: | ||
| 2681 | pxor @XMM[8+1], @XMM[1] | ||
| 2682 | lea 0x30($inp), $inp | ||
| 2683 | pxor @XMM[8+2], @XMM[2] | ||
| 2684 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2685 | mov %edx, %r10d # pass rounds | ||
| 2686 | |||
| 2687 | call _bsaes_decrypt8 | ||
| 2688 | |||
| 2689 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2690 | pxor 0x10(%rsp), @XMM[1] | ||
| 2691 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2692 | pxor 0x20(%rsp), @XMM[6] | ||
| 2693 | movdqu @XMM[1], 0x10($out) | ||
| 2694 | movdqu @XMM[6], 0x20($out) | ||
| 2695 | lea 0x30($out), $out | ||
| 2696 | |||
| 2697 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
| 2698 | jmp .Lxts_dec_done | ||
| 2699 | .align 16 | ||
| 2700 | .Lxts_dec_2: | ||
| 2701 | pxor @XMM[8+0], @XMM[0] | ||
| 2702 | lea 0x20($inp), $inp | ||
| 2703 | pxor @XMM[8+1], @XMM[1] | ||
| 2704 | lea 0x80(%rsp), %rax # pass key schedule | ||
| 2705 | mov %edx, %r10d # pass rounds | ||
| 2706 | |||
| 2707 | call _bsaes_decrypt8 | ||
| 2708 | |||
| 2709 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2710 | pxor 0x10(%rsp), @XMM[1] | ||
| 2711 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2712 | movdqu @XMM[1], 0x10($out) | ||
| 2713 | lea 0x20($out), $out | ||
| 2714 | |||
| 2715 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
| 2716 | jmp .Lxts_dec_done | ||
| 2717 | .align 16 | ||
| 2718 | .Lxts_dec_1: | ||
| 2719 | pxor @XMM[0], @XMM[8] | ||
| 2720 | lea 0x10($inp), $inp | ||
| 2721 | movdqa @XMM[8], 0x20(%rbp) | ||
| 2722 | lea 0x20(%rbp), $arg1 | ||
| 2723 | lea 0x20(%rbp), $arg2 | ||
| 2724 | lea ($key), $arg3 | ||
| 2725 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 2726 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
| 2727 | #pxor @XMM[8], @XMM[0] | ||
| 2728 | #lea 0x80(%rsp), %rax # pass key schedule | ||
| 2729 | #mov %edx, %r10d # pass rounds | ||
| 2730 | #call _bsaes_decrypt8 | ||
| 2731 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
| 2732 | movdqu @XMM[0], 0x00($out) # write output | ||
| 2733 | lea 0x10($out), $out | ||
| 2734 | |||
| 2735 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
| 2736 | |||
| 2737 | .Lxts_dec_done: | ||
| 2738 | and \$15, %ebx | ||
| 2739 | jz .Lxts_dec_ret | ||
| 2740 | |||
| 2741 | pxor $twtmp, $twtmp | ||
| 2742 | movdqa .Lxts_magic(%rip), $twmask | ||
| 2743 | pcmpgtd @XMM[7], $twtmp | ||
| 2744 | pshufd \$0x13, $twtmp, $twres | ||
| 2745 | movdqa @XMM[7], @XMM[6] | ||
| 2746 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
| 2747 | pand $twmask, $twres # isolate carry and residue | ||
| 2748 | movdqu ($inp), @XMM[0] | ||
| 2749 | pxor $twres, @XMM[7] | ||
| 2750 | |||
| 2751 | lea 0x20(%rbp), $arg1 | ||
| 2752 | pxor @XMM[7], @XMM[0] | ||
| 2753 | lea 0x20(%rbp), $arg2 | ||
| 2754 | movdqa @XMM[0], 0x20(%rbp) | ||
| 2755 | lea ($key), $arg3 | ||
| 2756 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 2757 | pxor 0x20(%rbp), @XMM[7] | ||
| 2758 | mov $out, %rdx | ||
| 2759 | movdqu @XMM[7], ($out) | ||
| 2760 | |||
| 2761 | .Lxts_dec_steal: | ||
| 2762 | movzb 16($inp), %eax | ||
| 2763 | movzb (%rdx), %ecx | ||
| 2764 | lea 1($inp), $inp | ||
| 2765 | mov %al, (%rdx) | ||
| 2766 | mov %cl, 16(%rdx) | ||
| 2767 | lea 1(%rdx), %rdx | ||
| 2768 | sub \$1,%ebx | ||
| 2769 | jnz .Lxts_dec_steal | ||
| 2770 | |||
| 2771 | movdqu ($out), @XMM[0] | ||
| 2772 | lea 0x20(%rbp), $arg1 | ||
| 2773 | pxor @XMM[6], @XMM[0] | ||
| 2774 | lea 0x20(%rbp), $arg2 | ||
| 2775 | movdqa @XMM[0], 0x20(%rbp) | ||
| 2776 | lea ($key), $arg3 | ||
| 2777 | call asm_AES_decrypt # doesn't touch %xmm | ||
| 2778 | pxor 0x20(%rbp), @XMM[6] | ||
| 2779 | movdqu @XMM[6], ($out) | ||
| 2780 | |||
| 2781 | .Lxts_dec_ret: | ||
| 2782 | lea (%rsp), %rax | ||
| 2783 | pxor %xmm0, %xmm0 | ||
| 2784 | .Lxts_dec_bzero: # wipe key schedule [if any] | ||
| 2785 | movdqa %xmm0, 0x00(%rax) | ||
| 2786 | movdqa %xmm0, 0x10(%rax) | ||
| 2787 | lea 0x20(%rax), %rax | ||
| 2788 | cmp %rax, %rbp | ||
| 2789 | ja .Lxts_dec_bzero | ||
| 2790 | |||
| 2791 | lea (%rbp),%rsp # restore %rsp | ||
| 2792 | ___ | ||
| 2793 | $code.=<<___ if ($win64); | ||
| 2794 | movaps 0x40(%rbp), %xmm6 | ||
| 2795 | movaps 0x50(%rbp), %xmm7 | ||
| 2796 | movaps 0x60(%rbp), %xmm8 | ||
| 2797 | movaps 0x70(%rbp), %xmm9 | ||
| 2798 | movaps 0x80(%rbp), %xmm10 | ||
| 2799 | movaps 0x90(%rbp), %xmm11 | ||
| 2800 | movaps 0xa0(%rbp), %xmm12 | ||
| 2801 | movaps 0xb0(%rbp), %xmm13 | ||
| 2802 | movaps 0xc0(%rbp), %xmm14 | ||
| 2803 | movaps 0xd0(%rbp), %xmm15 | ||
| 2804 | lea 0xa0(%rbp), %rsp | ||
| 2805 | ___ | ||
| 2806 | $code.=<<___; | ||
| 2807 | mov 0x48(%rsp), %r15 | ||
| 2808 | mov 0x50(%rsp), %r14 | ||
| 2809 | mov 0x58(%rsp), %r13 | ||
| 2810 | mov 0x60(%rsp), %r12 | ||
| 2811 | mov 0x68(%rsp), %rbx | ||
| 2812 | mov 0x70(%rsp), %rax | ||
| 2813 | lea 0x78(%rsp), %rsp | ||
| 2814 | mov %rax, %rbp | ||
| 2815 | .Lxts_dec_epilogue: | ||
| 2816 | ret | ||
| 2817 | .size bsaes_xts_decrypt,.-bsaes_xts_decrypt | ||
| 2818 | ___ | ||
| 2819 | } | ||
| 2820 | $code.=<<___; | ||
| 2821 | .type _bsaes_const,\@object | ||
| 2822 | .align 64 | ||
| 2823 | _bsaes_const: | ||
| 2824 | .LM0ISR: # InvShiftRows constants | ||
| 2825 | .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 | ||
| 2826 | .LISRM0: | ||
| 2827 | .quad 0x01040b0e0205080f, 0x0306090c00070a0d | ||
| 2828 | .LISR: | ||
| 2829 | .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 | ||
| 2830 | .LBS0: # bit-slice constants | ||
| 2831 | .quad 0x5555555555555555, 0x5555555555555555 | ||
| 2832 | .LBS1: | ||
| 2833 | .quad 0x3333333333333333, 0x3333333333333333 | ||
| 2834 | .LBS2: | ||
| 2835 | .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f | ||
| 2836 | .LSR: # shiftrows constants | ||
| 2837 | .quad 0x0504070600030201, 0x0f0e0d0c0a09080b | ||
| 2838 | .LSRM0: | ||
| 2839 | .quad 0x0304090e00050a0f, 0x01060b0c0207080d | ||
| 2840 | .LM0SR: | ||
| 2841 | .quad 0x0a0e02060f03070b, 0x0004080c05090d01 | ||
| 2842 | .LSWPUP: # byte-swap upper dword | ||
| 2843 | .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 | ||
| 2844 | .LSWPUPM0SR: | ||
| 2845 | .quad 0x0a0d02060c03070b, 0x0004080f05090e01 | ||
| 2846 | .LADD1: # counter increment constants | ||
| 2847 | .quad 0x0000000000000000, 0x0000000100000000 | ||
| 2848 | .LADD2: | ||
| 2849 | .quad 0x0000000000000000, 0x0000000200000000 | ||
| 2850 | .LADD3: | ||
| 2851 | .quad 0x0000000000000000, 0x0000000300000000 | ||
| 2852 | .LADD4: | ||
| 2853 | .quad 0x0000000000000000, 0x0000000400000000 | ||
| 2854 | .LADD5: | ||
| 2855 | .quad 0x0000000000000000, 0x0000000500000000 | ||
| 2856 | .LADD6: | ||
| 2857 | .quad 0x0000000000000000, 0x0000000600000000 | ||
| 2858 | .LADD7: | ||
| 2859 | .quad 0x0000000000000000, 0x0000000700000000 | ||
| 2860 | .LADD8: | ||
| 2861 | .quad 0x0000000000000000, 0x0000000800000000 | ||
| 2862 | .Lxts_magic: | ||
| 2863 | .long 0x87,0,1,0 | ||
| 2864 | .Lmasks: | ||
| 2865 | .quad 0x0101010101010101, 0x0101010101010101 | ||
| 2866 | .quad 0x0202020202020202, 0x0202020202020202 | ||
| 2867 | .quad 0x0404040404040404, 0x0404040404040404 | ||
| 2868 | .quad 0x0808080808080808, 0x0808080808080808 | ||
| 2869 | .LM0: | ||
| 2870 | .quad 0x02060a0e03070b0f, 0x0004080c0105090d | ||
| 2871 | .L63: | ||
| 2872 | .quad 0x6363636363636363, 0x6363636363636363 | ||
| 2873 | .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" | ||
| 2874 | .align 64 | ||
| 2875 | .size _bsaes_const,.-_bsaes_const | ||
| 2876 | ___ | ||
| 2877 | |||
| 2878 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 2879 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 2880 | if ($win64) { | ||
| 2881 | $rec="%rcx"; | ||
| 2882 | $frame="%rdx"; | ||
| 2883 | $context="%r8"; | ||
| 2884 | $disp="%r9"; | ||
| 2885 | |||
| 2886 | $code.=<<___; | ||
| 2887 | .extern __imp_RtlVirtualUnwind | ||
| 2888 | .type se_handler,\@abi-omnipotent | ||
| 2889 | .align 16 | ||
| 2890 | se_handler: | ||
| 2891 | push %rsi | ||
| 2892 | push %rdi | ||
| 2893 | push %rbx | ||
| 2894 | push %rbp | ||
| 2895 | push %r12 | ||
| 2896 | push %r13 | ||
| 2897 | push %r14 | ||
| 2898 | push %r15 | ||
| 2899 | pushfq | ||
| 2900 | sub \$64,%rsp | ||
| 2901 | |||
| 2902 | mov 120($context),%rax # pull context->Rax | ||
| 2903 | mov 248($context),%rbx # pull context->Rip | ||
| 2904 | |||
| 2905 | mov 8($disp),%rsi # disp->ImageBase | ||
| 2906 | mov 56($disp),%r11 # disp->HandlerData | ||
| 2907 | |||
| 2908 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 2909 | lea (%rsi,%r10),%r10 # prologue label | ||
| 2910 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 2911 | jb .Lin_prologue | ||
| 2912 | |||
| 2913 | mov 152($context),%rax # pull context->Rsp | ||
| 2914 | |||
| 2915 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 2916 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 2917 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 2918 | jae .Lin_prologue | ||
| 2919 | |||
| 2920 | mov 160($context),%rax # pull context->Rbp | ||
| 2921 | |||
| 2922 | lea 0x40(%rax),%rsi # %xmm save area | ||
| 2923 | lea 512($context),%rdi # &context.Xmm6 | ||
| 2924 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
| 2925 | .long 0xa548f3fc # cld; rep movsq | ||
| 2926 | lea 0xa0(%rax),%rax # adjust stack pointer | ||
| 2927 | |||
| 2928 | mov 0x70(%rax),%rbp | ||
| 2929 | mov 0x68(%rax),%rbx | ||
| 2930 | mov 0x60(%rax),%r12 | ||
| 2931 | mov 0x58(%rax),%r13 | ||
| 2932 | mov 0x50(%rax),%r14 | ||
| 2933 | mov 0x48(%rax),%r15 | ||
| 2934 | lea 0x78(%rax),%rax # adjust stack pointer | ||
| 2935 | mov %rbx,144($context) # restore context->Rbx | ||
| 2936 | mov %rbp,160($context) # restore context->Rbp | ||
| 2937 | mov %r12,216($context) # restore context->R12 | ||
| 2938 | mov %r13,224($context) # restore context->R13 | ||
| 2939 | mov %r14,232($context) # restore context->R14 | ||
| 2940 | mov %r15,240($context) # restore context->R15 | ||
| 2941 | |||
| 2942 | .Lin_prologue: | ||
| 2943 | mov %rax,152($context) # restore context->Rsp | ||
| 2944 | |||
| 2945 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 2946 | mov $context,%rsi # context | ||
| 2947 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
| 2948 | .long 0xa548f3fc # cld; rep movsq | ||
| 2949 | |||
| 2950 | mov $disp,%rsi | ||
| 2951 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 2952 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 2953 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 2954 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 2955 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 2956 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 2957 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 2958 | mov %r10,32(%rsp) # arg5 | ||
| 2959 | mov %r11,40(%rsp) # arg6 | ||
| 2960 | mov %r12,48(%rsp) # arg7 | ||
| 2961 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 2962 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 2963 | |||
| 2964 | mov \$1,%eax # ExceptionContinueSearch | ||
| 2965 | add \$64,%rsp | ||
| 2966 | popfq | ||
| 2967 | pop %r15 | ||
| 2968 | pop %r14 | ||
| 2969 | pop %r13 | ||
| 2970 | pop %r12 | ||
| 2971 | pop %rbp | ||
| 2972 | pop %rbx | ||
| 2973 | pop %rdi | ||
| 2974 | pop %rsi | ||
| 2975 | ret | ||
| 2976 | .size se_handler,.-se_handler | ||
| 2977 | |||
| 2978 | .section .pdata | ||
| 2979 | .align 4 | ||
| 2980 | ___ | ||
| 2981 | $code.=<<___ if ($ecb); | ||
| 2982 | .rva .Lecb_enc_prologue | ||
| 2983 | .rva .Lecb_enc_epilogue | ||
| 2984 | .rva .Lecb_enc_info | ||
| 2985 | |||
| 2986 | .rva .Lecb_dec_prologue | ||
| 2987 | .rva .Lecb_dec_epilogue | ||
| 2988 | .rva .Lecb_dec_info | ||
| 2989 | ___ | ||
| 2990 | $code.=<<___; | ||
| 2991 | .rva .Lcbc_dec_prologue | ||
| 2992 | .rva .Lcbc_dec_epilogue | ||
| 2993 | .rva .Lcbc_dec_info | ||
| 2994 | |||
| 2995 | .rva .Lctr_enc_prologue | ||
| 2996 | .rva .Lctr_enc_epilogue | ||
| 2997 | .rva .Lctr_enc_info | ||
| 2998 | |||
| 2999 | .rva .Lxts_enc_prologue | ||
| 3000 | .rva .Lxts_enc_epilogue | ||
| 3001 | .rva .Lxts_enc_info | ||
| 3002 | |||
| 3003 | .rva .Lxts_dec_prologue | ||
| 3004 | .rva .Lxts_dec_epilogue | ||
| 3005 | .rva .Lxts_dec_info | ||
| 3006 | |||
| 3007 | .section .xdata | ||
| 3008 | .align 8 | ||
| 3009 | ___ | ||
| 3010 | $code.=<<___ if ($ecb); | ||
| 3011 | .Lecb_enc_info: | ||
| 3012 | .byte 9,0,0,0 | ||
| 3013 | .rva se_handler | ||
| 3014 | .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] | ||
| 3015 | .Lecb_dec_info: | ||
| 3016 | .byte 9,0,0,0 | ||
| 3017 | .rva se_handler | ||
| 3018 | .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] | ||
| 3019 | ___ | ||
| 3020 | $code.=<<___; | ||
| 3021 | .Lcbc_dec_info: | ||
| 3022 | .byte 9,0,0,0 | ||
| 3023 | .rva se_handler | ||
| 3024 | .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] | ||
| 3025 | .Lctr_enc_info: | ||
| 3026 | .byte 9,0,0,0 | ||
| 3027 | .rva se_handler | ||
| 3028 | .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] | ||
| 3029 | .Lxts_enc_info: | ||
| 3030 | .byte 9,0,0,0 | ||
| 3031 | .rva se_handler | ||
| 3032 | .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] | ||
| 3033 | .Lxts_dec_info: | ||
| 3034 | .byte 9,0,0,0 | ||
| 3035 | .rva se_handler | ||
| 3036 | .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] | ||
| 3037 | ___ | ||
| 3038 | } | ||
| 3039 | |||
| 3040 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 3041 | |||
| 3042 | print $code; | ||
| 3043 | |||
| 3044 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl new file mode 100644 index 0000000000..1533e2c304 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/vpaes-x86.pl | |||
| @@ -0,0 +1,903 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | ###################################################################### | ||
| 4 | ## Constant-time SSSE3 AES core implementation. | ||
| 5 | ## version 0.1 | ||
| 6 | ## | ||
| 7 | ## By Mike Hamburg (Stanford University), 2009 | ||
| 8 | ## Public domain. | ||
| 9 | ## | ||
| 10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
| 11 | ## http://crypto.stanford.edu/vpaes/. | ||
| 12 | |||
| 13 | ###################################################################### | ||
| 14 | # September 2011. | ||
| 15 | # | ||
| 16 | # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for | ||
| 17 | # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
| 18 | # doesn't handle partial vectors (doesn't have to if called from | ||
| 19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
| 20 | # schedule structure with the original nor does it make assumption | ||
| 21 | # about its alignment... | ||
| 22 | # | ||
| 23 | # Performance summary. aes-586.pl column lists large-block CBC | ||
| 24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
| 25 | # byte processed with 128-bit key, and vpaes-x86.pl column - [also | ||
| 26 | # large-block CBC] encrypt/decrypt. | ||
| 27 | # | ||
| 28 | # aes-586.pl vpaes-x86.pl | ||
| 29 | # | ||
| 30 | # Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) | ||
| 31 | # Nehalem 27.9/40.4/18.1 10.3/12.0 | ||
| 32 | # Atom 102./119./60.1 64.5/85.3(***) | ||
| 33 | # | ||
| 34 | # (*) "Hyper-threading" in the context refers rather to cache shared | ||
| 35 | # among multiple cores, than to specifically Intel HTT. As vast | ||
| 36 | # majority of contemporary cores share cache, slower code path | ||
| 37 | # is common place. In other words "with-hyper-threading-off" | ||
| 38 | # results are presented mostly for reference purposes. | ||
| 39 | # | ||
| 40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
| 41 | # | ||
| 42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
| 43 | # pshufb, yet it's respectable +32%/65% improvement on Core 2 | ||
| 44 | # and +58%/40% on Atom (as implied, over "hyper-threading-safe" | ||
| 45 | # code path). | ||
| 46 | # | ||
| 47 | # <appro@openssl.org> | ||
| 48 | |||
| 49 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 50 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 51 | require "x86asm.pl"; | ||
| 52 | |||
| 53 | &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); | ||
| 54 | |||
| 55 | $PREFIX="vpaes"; | ||
| 56 | |||
| 57 | my ($round, $base, $magic, $key, $const, $inp, $out)= | ||
| 58 | ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); | ||
| 59 | |||
| 60 | &static_label("_vpaes_consts"); | ||
| 61 | &static_label("_vpaes_schedule_low_round"); | ||
| 62 | |||
| 63 | &set_label("_vpaes_consts",64); | ||
| 64 | $k_inv=-0x30; # inv, inva | ||
| 65 | &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); | ||
| 66 | &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); | ||
| 67 | |||
| 68 | $k_s0F=-0x10; # s0F | ||
| 69 | &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); | ||
| 70 | |||
| 71 | $k_ipt=0x00; # input transform (lo, hi) | ||
| 72 | &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); | ||
| 73 | &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); | ||
| 74 | |||
| 75 | $k_sb1=0x20; # sb1u, sb1t | ||
| 76 | &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); | ||
| 77 | &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); | ||
| 78 | $k_sb2=0x40; # sb2u, sb2t | ||
| 79 | &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); | ||
| 80 | &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); | ||
| 81 | $k_sbo=0x60; # sbou, sbot | ||
| 82 | &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); | ||
| 83 | &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); | ||
| 84 | |||
| 85 | $k_mc_forward=0x80; # mc_forward | ||
| 86 | &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); | ||
| 87 | &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); | ||
| 88 | &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); | ||
| 89 | &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); | ||
| 90 | |||
| 91 | $k_mc_backward=0xc0; # mc_backward | ||
| 92 | &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); | ||
| 93 | &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); | ||
| 94 | &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); | ||
| 95 | &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); | ||
| 96 | |||
| 97 | $k_sr=0x100; # sr | ||
| 98 | &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); | ||
| 99 | &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); | ||
| 100 | &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); | ||
| 101 | &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); | ||
| 102 | |||
| 103 | $k_rcon=0x140; # rcon | ||
| 104 | &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); | ||
| 105 | |||
| 106 | $k_s63=0x150; # s63: all equal to 0x63 transformed | ||
| 107 | &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); | ||
| 108 | |||
| 109 | $k_opt=0x160; # output transform | ||
| 110 | &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); | ||
| 111 | &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); | ||
| 112 | |||
| 113 | $k_deskew=0x180; # deskew tables: inverts the sbox's "skew" | ||
| 114 | &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); | ||
| 115 | &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); | ||
| 116 | ## | ||
| 117 | ## Decryption stuff | ||
| 118 | ## Key schedule constants | ||
| 119 | ## | ||
| 120 | $k_dksd=0x1a0; # decryption key schedule: invskew x*D | ||
| 121 | &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4); | ||
| 122 | &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA); | ||
| 123 | $k_dksb=0x1c0; # decryption key schedule: invskew x*B | ||
| 124 | &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386); | ||
| 125 | &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F); | ||
| 126 | $k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63 | ||
| 127 | &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C); | ||
| 128 | &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A); | ||
| 129 | $k_dks9=0x200; # decryption key schedule: invskew x*9 | ||
| 130 | &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334); | ||
| 131 | &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC); | ||
| 132 | |||
| 133 | ## | ||
| 134 | ## Decryption stuff | ||
| 135 | ## Round function constants | ||
| 136 | ## | ||
| 137 | $k_dipt=0x220; # decryption input transform | ||
| 138 | &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E); | ||
| 139 | &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772); | ||
| 140 | |||
| 141 | $k_dsb9=0x240; # decryption sbox output *9*u, *9*t | ||
| 142 | &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50); | ||
| 143 | &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E); | ||
| 144 | $k_dsbd=0x260; # decryption sbox output *D*u, *D*t | ||
| 145 | &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13); | ||
| 146 | &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D); | ||
| 147 | $k_dsbb=0x280; # decryption sbox output *B*u, *B*t | ||
| 148 | &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6); | ||
| 149 | &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E); | ||
| 150 | $k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t | ||
| 151 | &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004); | ||
| 152 | &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B); | ||
| 153 | $k_dsbo=0x2c0; # decryption sbox final output | ||
| 154 | &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9); | ||
| 155 | &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159); | ||
| 156 | &asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); | ||
| 157 | &align (64); | ||
| 158 | |||
| 159 | &function_begin_B("_vpaes_preheat"); | ||
| 160 | &add ($const,&DWP(0,"esp")); | ||
| 161 | &movdqa ("xmm7",&QWP($k_inv,$const)); | ||
| 162 | &movdqa ("xmm6",&QWP($k_s0F,$const)); | ||
| 163 | &ret (); | ||
| 164 | &function_end_B("_vpaes_preheat"); | ||
| 165 | |||
| 166 | ## | ||
| 167 | ## _aes_encrypt_core | ||
| 168 | ## | ||
| 169 | ## AES-encrypt %xmm0. | ||
| 170 | ## | ||
| 171 | ## Inputs: | ||
| 172 | ## %xmm0 = input | ||
| 173 | ## %xmm6-%xmm7 as in _vpaes_preheat | ||
| 174 | ## (%edx) = scheduled keys | ||
| 175 | ## | ||
| 176 | ## Output in %xmm0 | ||
| 177 | ## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx | ||
| 178 | ## | ||
| 179 | ## | ||
| 180 | &function_begin_B("_vpaes_encrypt_core"); | ||
| 181 | &mov ($magic,16); | ||
| 182 | &mov ($round,&DWP(240,$key)); | ||
| 183 | &movdqa ("xmm1","xmm6") | ||
| 184 | &movdqa ("xmm2",&QWP($k_ipt,$const)); | ||
| 185 | &pandn ("xmm1","xmm0"); | ||
| 186 | &movdqu ("xmm5",&QWP(0,$key)); | ||
| 187 | &psrld ("xmm1",4); | ||
| 188 | &pand ("xmm0","xmm6"); | ||
| 189 | &pshufb ("xmm2","xmm0"); | ||
| 190 | &movdqa ("xmm0",&QWP($k_ipt+16,$const)); | ||
| 191 | &pshufb ("xmm0","xmm1"); | ||
| 192 | &pxor ("xmm2","xmm5"); | ||
| 193 | &pxor ("xmm0","xmm2"); | ||
| 194 | &add ($key,16); | ||
| 195 | &lea ($base,&DWP($k_mc_backward,$const)); | ||
| 196 | &jmp (&label("enc_entry")); | ||
| 197 | |||
| 198 | |||
| 199 | &set_label("enc_loop",16); | ||
| 200 | # middle of middle round | ||
| 201 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u | ||
| 202 | &pshufb ("xmm4","xmm2"); # 4 = sb1u | ||
| 203 | &pxor ("xmm4","xmm5"); # 4 = sb1u + k | ||
| 204 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t | ||
| 205 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 206 | &pxor ("xmm0","xmm4"); # 0 = A | ||
| 207 | &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u | ||
| 208 | &pshufb ("xmm5","xmm2"); # 4 = sb2u | ||
| 209 | &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] | ||
| 210 | &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t | ||
| 211 | &pshufb ("xmm2","xmm3"); # 2 = sb2t | ||
| 212 | &pxor ("xmm2","xmm5"); # 2 = 2A | ||
| 213 | &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] | ||
| 214 | &movdqa ("xmm3","xmm0"); # 3 = A | ||
| 215 | &pshufb ("xmm0","xmm1"); # 0 = B | ||
| 216 | &add ($key,16); # next key | ||
| 217 | &pxor ("xmm0","xmm2"); # 0 = 2A+B | ||
| 218 | &pshufb ("xmm3","xmm4"); # 3 = D | ||
| 219 | &add ($magic,16); # next mc | ||
| 220 | &pxor ("xmm3","xmm0"); # 3 = 2A+B+D | ||
| 221 | &pshufb ("xmm0","xmm1"); # 0 = 2B+C | ||
| 222 | &and ($magic,0x30); # ... mod 4 | ||
| 223 | &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D | ||
| 224 | &sub ($round,1); # nr-- | ||
| 225 | |||
| 226 | &set_label("enc_entry"); | ||
| 227 | # top of round | ||
| 228 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
| 229 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
| 230 | &psrld ("xmm1",4); # 1 = i | ||
| 231 | &pand ("xmm0","xmm6"); # 0 = k | ||
| 232 | &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k | ||
| 233 | &pshufb ("xmm5","xmm0"); # 2 = a/k | ||
| 234 | &pxor ("xmm0","xmm1"); # 0 = j | ||
| 235 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
| 236 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
| 237 | &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k | ||
| 238 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
| 239 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
| 240 | &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k | ||
| 241 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
| 242 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
| 243 | &pxor ("xmm2","xmm0"); # 2 = io | ||
| 244 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
| 245 | &movdqu ("xmm5",&QWP(0,$key)); | ||
| 246 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
| 247 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
| 248 | &jnz (&label("enc_loop")); | ||
| 249 | |||
| 250 | # middle of last round | ||
| 251 | &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo | ||
| 252 | &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 | ||
| 253 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
| 254 | &pxor ("xmm4","xmm5"); # 4 = sb1u + k | ||
| 255 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 256 | &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] | ||
| 257 | &pxor ("xmm0","xmm4"); # 0 = A | ||
| 258 | &pshufb ("xmm0","xmm1"); | ||
| 259 | &ret (); | ||
| 260 | &function_end_B("_vpaes_encrypt_core"); | ||
| 261 | |||
| 262 | ## | ||
| 263 | ## Decryption core | ||
| 264 | ## | ||
| 265 | ## Same API as encryption core. | ||
| 266 | ## | ||
| 267 | &function_begin_B("_vpaes_decrypt_core"); | ||
| 268 | &mov ($round,&DWP(240,$key)); | ||
| 269 | &lea ($base,&DWP($k_dsbd,$const)); | ||
| 270 | &movdqa ("xmm1","xmm6"); | ||
| 271 | &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); | ||
| 272 | &pandn ("xmm1","xmm0"); | ||
| 273 | &mov ($magic,$round); | ||
| 274 | &psrld ("xmm1",4) | ||
| 275 | &movdqu ("xmm5",&QWP(0,$key)); | ||
| 276 | &shl ($magic,4); | ||
| 277 | &pand ("xmm0","xmm6"); | ||
| 278 | &pshufb ("xmm2","xmm0"); | ||
| 279 | &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); | ||
| 280 | &xor ($magic,0x30); | ||
| 281 | &pshufb ("xmm0","xmm1"); | ||
| 282 | &and ($magic,0x30); | ||
| 283 | &pxor ("xmm2","xmm5"); | ||
| 284 | &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); | ||
| 285 | &pxor ("xmm0","xmm2"); | ||
| 286 | &add ($key,16); | ||
| 287 | &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); | ||
| 288 | &jmp (&label("dec_entry")); | ||
| 289 | |||
| 290 | &set_label("dec_loop",16); | ||
| 291 | ## | ||
| 292 | ## Inverse mix columns | ||
| 293 | ## | ||
| 294 | &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u | ||
| 295 | &pshufb ("xmm4","xmm2"); # 4 = sb9u | ||
| 296 | &pxor ("xmm4","xmm0"); | ||
| 297 | &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t | ||
| 298 | &pshufb ("xmm0","xmm3"); # 0 = sb9t | ||
| 299 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 300 | &add ($key,16); # next round key | ||
| 301 | |||
| 302 | &pshufb ("xmm0","xmm5"); # MC ch | ||
| 303 | &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu | ||
| 304 | &pshufb ("xmm4","xmm2"); # 4 = sbdu | ||
| 305 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
| 306 | &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt | ||
| 307 | &pshufb ("xmm0","xmm3"); # 0 = sbdt | ||
| 308 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 309 | &sub ($round,1); # nr-- | ||
| 310 | |||
| 311 | &pshufb ("xmm0","xmm5"); # MC ch | ||
| 312 | &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu | ||
| 313 | &pshufb ("xmm4","xmm2"); # 4 = sbbu | ||
| 314 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
| 315 | &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt | ||
| 316 | &pshufb ("xmm0","xmm3"); # 0 = sbbt | ||
| 317 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 318 | |||
| 319 | &pshufb ("xmm0","xmm5"); # MC ch | ||
| 320 | &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu | ||
| 321 | &pshufb ("xmm4","xmm2"); # 4 = sbeu | ||
| 322 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
| 323 | &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet | ||
| 324 | &pshufb ("xmm0","xmm3"); # 0 = sbet | ||
| 325 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
| 326 | |||
| 327 | &palignr("xmm5","xmm5",12); | ||
| 328 | |||
| 329 | &set_label("dec_entry"); | ||
| 330 | # top of round | ||
| 331 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
| 332 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
| 333 | &psrld ("xmm1",4); # 1 = i | ||
| 334 | &pand ("xmm0","xmm6"); # 0 = k | ||
| 335 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
| 336 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
| 337 | &pxor ("xmm0","xmm1"); # 0 = j | ||
| 338 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
| 339 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
| 340 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
| 341 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
| 342 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
| 343 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
| 344 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
| 345 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
| 346 | &pxor ("xmm2","xmm0"); # 2 = io | ||
| 347 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
| 348 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
| 349 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
| 350 | &movdqu ("xmm0",&QWP(0,$key)); | ||
| 351 | &jnz (&label("dec_loop")); | ||
| 352 | |||
| 353 | # middle of last round | ||
| 354 | &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou | ||
| 355 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
| 356 | &pxor ("xmm4","xmm0"); # 4 = sb1u + k | ||
| 357 | &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot | ||
| 358 | &movdqa ("xmm2",&QWP(0,$magic)); | ||
| 359 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 360 | &pxor ("xmm0","xmm4"); # 0 = A | ||
| 361 | &pshufb ("xmm0","xmm2"); | ||
| 362 | &ret (); | ||
| 363 | &function_end_B("_vpaes_decrypt_core"); | ||
| 364 | |||
| 365 | ######################################################## | ||
| 366 | ## ## | ||
| 367 | ## AES key schedule ## | ||
| 368 | ## ## | ||
| 369 | ######################################################## | ||
| 370 | &function_begin_B("_vpaes_schedule_core"); | ||
| 371 | &add ($const,&DWP(0,"esp")); | ||
| 372 | &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) | ||
| 373 | &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon | ||
| 374 | |||
| 375 | # input transform | ||
| 376 | &movdqa ("xmm3","xmm0"); | ||
| 377 | &lea ($base,&DWP($k_ipt,$const)); | ||
| 378 | &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 | ||
| 379 | &call ("_vpaes_schedule_transform"); | ||
| 380 | &movdqa ("xmm7","xmm0"); | ||
| 381 | |||
| 382 | &test ($out,$out); | ||
| 383 | &jnz (&label("schedule_am_decrypting")); | ||
| 384 | |||
| 385 | # encrypting, output zeroth round key after transform | ||
| 386 | &movdqu (&QWP(0,$key),"xmm0"); | ||
| 387 | &jmp (&label("schedule_go")); | ||
| 388 | |||
| 389 | &set_label("schedule_am_decrypting"); | ||
| 390 | # decrypting, output zeroth round key after shiftrows | ||
| 391 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
| 392 | &pshufb ("xmm3","xmm1"); | ||
| 393 | &movdqu (&QWP(0,$key),"xmm3"); | ||
| 394 | &xor ($magic,0x30); | ||
| 395 | |||
| 396 | &set_label("schedule_go"); | ||
| 397 | &cmp ($round,192); | ||
| 398 | &ja (&label("schedule_256")); | ||
| 399 | &je (&label("schedule_192")); | ||
| 400 | # 128: fall though | ||
| 401 | |||
| 402 | ## | ||
| 403 | ## .schedule_128 | ||
| 404 | ## | ||
| 405 | ## 128-bit specific part of key schedule. | ||
| 406 | ## | ||
| 407 | ## This schedule is really simple, because all its parts | ||
| 408 | ## are accomplished by the subroutines. | ||
| 409 | ## | ||
| 410 | &set_label("schedule_128"); | ||
| 411 | &mov ($round,10); | ||
| 412 | |||
| 413 | &set_label("loop_schedule_128"); | ||
| 414 | &call ("_vpaes_schedule_round"); | ||
| 415 | &dec ($round); | ||
| 416 | &jz (&label("schedule_mangle_last")); | ||
| 417 | &call ("_vpaes_schedule_mangle"); # write output | ||
| 418 | &jmp (&label("loop_schedule_128")); | ||
| 419 | |||
| 420 | ## | ||
| 421 | ## .aes_schedule_192 | ||
| 422 | ## | ||
| 423 | ## 192-bit specific part of key schedule. | ||
| 424 | ## | ||
| 425 | ## The main body of this schedule is the same as the 128-bit | ||
| 426 | ## schedule, but with more smearing. The long, high side is | ||
| 427 | ## stored in %xmm7 as before, and the short, low side is in | ||
| 428 | ## the high bits of %xmm6. | ||
| 429 | ## | ||
| 430 | ## This schedule is somewhat nastier, however, because each | ||
| 431 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
| 432 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
| 433 | ## keys. | ||
| 434 | ## | ||
| 435 | &set_label("schedule_192",16); | ||
| 436 | &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) | ||
| 437 | &call ("_vpaes_schedule_transform"); # input transform | ||
| 438 | &movdqa ("xmm6","xmm0"); # save short part | ||
| 439 | &pxor ("xmm4","xmm4"); # clear 4 | ||
| 440 | &movhlps("xmm6","xmm4"); # clobber low side with zeros | ||
| 441 | &mov ($round,4); | ||
| 442 | |||
| 443 | &set_label("loop_schedule_192"); | ||
| 444 | &call ("_vpaes_schedule_round"); | ||
| 445 | &palignr("xmm0","xmm6",8); | ||
| 446 | &call ("_vpaes_schedule_mangle"); # save key n | ||
| 447 | &call ("_vpaes_schedule_192_smear"); | ||
| 448 | &call ("_vpaes_schedule_mangle"); # save key n+1 | ||
| 449 | &call ("_vpaes_schedule_round"); | ||
| 450 | &dec ($round); | ||
| 451 | &jz (&label("schedule_mangle_last")); | ||
| 452 | &call ("_vpaes_schedule_mangle"); # save key n+2 | ||
| 453 | &call ("_vpaes_schedule_192_smear"); | ||
| 454 | &jmp (&label("loop_schedule_192")); | ||
| 455 | |||
| 456 | ## | ||
| 457 | ## .aes_schedule_256 | ||
| 458 | ## | ||
| 459 | ## 256-bit specific part of key schedule. | ||
| 460 | ## | ||
| 461 | ## The structure here is very similar to the 128-bit | ||
| 462 | ## schedule, but with an additional "low side" in | ||
| 463 | ## %xmm6. The low side's rounds are the same as the | ||
| 464 | ## high side's, except no rcon and no rotation. | ||
| 465 | ## | ||
| 466 | &set_label("schedule_256",16); | ||
| 467 | &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) | ||
| 468 | &call ("_vpaes_schedule_transform"); # input transform | ||
| 469 | &mov ($round,7); | ||
| 470 | |||
| 471 | &set_label("loop_schedule_256"); | ||
| 472 | &call ("_vpaes_schedule_mangle"); # output low result | ||
| 473 | &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 | ||
| 474 | |||
| 475 | # high round | ||
| 476 | &call ("_vpaes_schedule_round"); | ||
| 477 | &dec ($round); | ||
| 478 | &jz (&label("schedule_mangle_last")); | ||
| 479 | &call ("_vpaes_schedule_mangle"); | ||
| 480 | |||
| 481 | # low round. swap xmm7 and xmm6 | ||
| 482 | &pshufd ("xmm0","xmm0",0xFF); | ||
| 483 | &movdqa (&QWP(20,"esp"),"xmm7"); | ||
| 484 | &movdqa ("xmm7","xmm6"); | ||
| 485 | &call ("_vpaes_schedule_low_round"); | ||
| 486 | &movdqa ("xmm7",&QWP(20,"esp")); | ||
| 487 | |||
| 488 | &jmp (&label("loop_schedule_256")); | ||
| 489 | |||
| 490 | ## | ||
| 491 | ## .aes_schedule_mangle_last | ||
| 492 | ## | ||
| 493 | ## Mangler for last round of key schedule | ||
| 494 | ## Mangles %xmm0 | ||
| 495 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
| 496 | ## when decrypting, outputs unskew(%xmm0) | ||
| 497 | ## | ||
| 498 | ## Always called right before return... jumps to cleanup and exits | ||
| 499 | ## | ||
| 500 | &set_label("schedule_mangle_last",16); | ||
| 501 | # schedule last round key from xmm0 | ||
| 502 | &lea ($base,&DWP($k_deskew,$const)); | ||
| 503 | &test ($out,$out); | ||
| 504 | &jnz (&label("schedule_mangle_last_dec")); | ||
| 505 | |||
| 506 | # encrypting | ||
| 507 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
| 508 | &pshufb ("xmm0","xmm1"); # output permute | ||
| 509 | &lea ($base,&DWP($k_opt,$const)); # prepare to output transform | ||
| 510 | &add ($key,32); | ||
| 511 | |||
| 512 | &set_label("schedule_mangle_last_dec"); | ||
| 513 | &add ($key,-16); | ||
| 514 | &pxor ("xmm0",&QWP($k_s63,$const)); | ||
| 515 | &call ("_vpaes_schedule_transform"); # output transform | ||
| 516 | &movdqu (&QWP(0,$key),"xmm0"); # save last key | ||
| 517 | |||
| 518 | # cleanup | ||
| 519 | &pxor ("xmm0","xmm0"); | ||
| 520 | &pxor ("xmm1","xmm1"); | ||
| 521 | &pxor ("xmm2","xmm2"); | ||
| 522 | &pxor ("xmm3","xmm3"); | ||
| 523 | &pxor ("xmm4","xmm4"); | ||
| 524 | &pxor ("xmm5","xmm5"); | ||
| 525 | &pxor ("xmm6","xmm6"); | ||
| 526 | &pxor ("xmm7","xmm7"); | ||
| 527 | &ret (); | ||
| 528 | &function_end_B("_vpaes_schedule_core"); | ||
| 529 | |||
| 530 | ## | ||
| 531 | ## .aes_schedule_192_smear | ||
| 532 | ## | ||
| 533 | ## Smear the short, low side in the 192-bit key schedule. | ||
| 534 | ## | ||
| 535 | ## Inputs: | ||
| 536 | ## %xmm7: high side, b a x y | ||
| 537 | ## %xmm6: low side, d c 0 0 | ||
| 538 | ## %xmm13: 0 | ||
| 539 | ## | ||
| 540 | ## Outputs: | ||
| 541 | ## %xmm6: b+c+d b+c 0 0 | ||
| 542 | ## %xmm0: b+c+d b+c b a | ||
| 543 | ## | ||
| 544 | &function_begin_B("_vpaes_schedule_192_smear"); | ||
| 545 | &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 | ||
| 546 | &pxor ("xmm6","xmm0"); # -> c+d c 0 0 | ||
| 547 | &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a | ||
| 548 | &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a | ||
| 549 | &movdqa ("xmm0","xmm6"); | ||
| 550 | &pxor ("xmm1","xmm1"); | ||
| 551 | &movhlps("xmm6","xmm1"); # clobber low side with zeros | ||
| 552 | &ret (); | ||
| 553 | &function_end_B("_vpaes_schedule_192_smear"); | ||
| 554 | |||
| 555 | ## | ||
| 556 | ## .aes_schedule_round | ||
| 557 | ## | ||
| 558 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
| 559 | ## | ||
| 560 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
| 561 | ## then rotates it by one byte and xors into the low dword of | ||
| 562 | ## %xmm7. | ||
| 563 | ## | ||
| 564 | ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for | ||
| 565 | ## next rcon. | ||
| 566 | ## | ||
| 567 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
| 568 | ## second low, result into third, result into highest. | ||
| 569 | ## | ||
| 570 | ## Returns results in %xmm7 = %xmm0. | ||
| 571 | ## Clobbers %xmm1-%xmm5. | ||
| 572 | ## | ||
| 573 | &function_begin_B("_vpaes_schedule_round"); | ||
| 574 | # extract rcon from xmm8 | ||
| 575 | &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 | ||
| 576 | &pxor ("xmm1","xmm1"); | ||
| 577 | &palignr("xmm1","xmm2",15); | ||
| 578 | &palignr("xmm2","xmm2",15); | ||
| 579 | &pxor ("xmm7","xmm1"); | ||
| 580 | |||
| 581 | # rotate | ||
| 582 | &pshufd ("xmm0","xmm0",0xFF); | ||
| 583 | &palignr("xmm0","xmm0",1); | ||
| 584 | |||
| 585 | # fall through... | ||
| 586 | &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 | ||
| 587 | |||
| 588 | # low round: same as high round, but no rotation and no rcon. | ||
| 589 | &set_label("_vpaes_schedule_low_round"); | ||
| 590 | # smear xmm7 | ||
| 591 | &movdqa ("xmm1","xmm7"); | ||
| 592 | &pslldq ("xmm7",4); | ||
| 593 | &pxor ("xmm7","xmm1"); | ||
| 594 | &movdqa ("xmm1","xmm7"); | ||
| 595 | &pslldq ("xmm7",8); | ||
| 596 | &pxor ("xmm7","xmm1"); | ||
| 597 | &pxor ("xmm7",&QWP($k_s63,$const)); | ||
| 598 | |||
| 599 | # subbyte | ||
| 600 | &movdqa ("xmm4",&QWP($k_s0F,$const)); | ||
| 601 | &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j | ||
| 602 | &movdqa ("xmm1","xmm4"); | ||
| 603 | &pandn ("xmm1","xmm0"); | ||
| 604 | &psrld ("xmm1",4); # 1 = i | ||
| 605 | &pand ("xmm0","xmm4"); # 0 = k | ||
| 606 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
| 607 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
| 608 | &pxor ("xmm0","xmm1"); # 0 = j | ||
| 609 | &movdqa ("xmm3","xmm5"); # 3 : 1/i | ||
| 610 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
| 611 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
| 612 | &movdqa ("xmm4","xmm5"); # 4 : 1/j | ||
| 613 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
| 614 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
| 615 | &movdqa ("xmm2","xmm5"); # 2 : 1/iak | ||
| 616 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
| 617 | &pxor ("xmm2","xmm0"); # 2 = io | ||
| 618 | &movdqa ("xmm3","xmm5"); # 3 : 1/jak | ||
| 619 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
| 620 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
| 621 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou | ||
| 622 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
| 623 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot | ||
| 624 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
| 625 | &pxor ("xmm0","xmm4"); # 0 = sbox output | ||
| 626 | |||
| 627 | # add in smeared stuff | ||
| 628 | &pxor ("xmm0","xmm7"); | ||
| 629 | &movdqa ("xmm7","xmm0"); | ||
| 630 | &ret (); | ||
| 631 | &function_end_B("_vpaes_schedule_round"); | ||
| 632 | |||
| 633 | ## | ||
| 634 | ## .aes_schedule_transform | ||
| 635 | ## | ||
| 636 | ## Linear-transform %xmm0 according to tables at (%ebx) | ||
| 637 | ## | ||
| 638 | ## Output in %xmm0 | ||
| 639 | ## Clobbers %xmm1, %xmm2 | ||
| 640 | ## | ||
| 641 | &function_begin_B("_vpaes_schedule_transform"); | ||
| 642 | &movdqa ("xmm2",&QWP($k_s0F,$const)); | ||
| 643 | &movdqa ("xmm1","xmm2"); | ||
| 644 | &pandn ("xmm1","xmm0"); | ||
| 645 | &psrld ("xmm1",4); | ||
| 646 | &pand ("xmm0","xmm2"); | ||
| 647 | &movdqa ("xmm2",&QWP(0,$base)); | ||
| 648 | &pshufb ("xmm2","xmm0"); | ||
| 649 | &movdqa ("xmm0",&QWP(16,$base)); | ||
| 650 | &pshufb ("xmm0","xmm1"); | ||
| 651 | &pxor ("xmm0","xmm2"); | ||
| 652 | &ret (); | ||
| 653 | &function_end_B("_vpaes_schedule_transform"); | ||
| 654 | |||
| 655 | ## | ||
| 656 | ## .aes_schedule_mangle | ||
| 657 | ## | ||
| 658 | ## Mangle xmm0 from (basis-transformed) standard version | ||
| 659 | ## to our version. | ||
| 660 | ## | ||
| 661 | ## On encrypt, | ||
| 662 | ## xor with 0x63 | ||
| 663 | ## multiply by circulant 0,1,1,1 | ||
| 664 | ## apply shiftrows transform | ||
| 665 | ## | ||
| 666 | ## On decrypt, | ||
| 667 | ## xor with 0x63 | ||
| 668 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
| 669 | ## deskew | ||
| 670 | ## apply shiftrows transform | ||
| 671 | ## | ||
| 672 | ## | ||
| 673 | ## Writes out to (%edx), and increments or decrements it | ||
| 674 | ## Keeps track of round number mod 4 in %ecx | ||
| 675 | ## Preserves xmm0 | ||
| 676 | ## Clobbers xmm1-xmm5 | ||
| 677 | ## | ||
| 678 | &function_begin_B("_vpaes_schedule_mangle"); | ||
| 679 | &movdqa ("xmm4","xmm0"); # save xmm0 for later | ||
| 680 | &movdqa ("xmm5",&QWP($k_mc_forward,$const)); | ||
| 681 | &test ($out,$out); | ||
| 682 | &jnz (&label("schedule_mangle_dec")); | ||
| 683 | |||
| 684 | # encrypting | ||
| 685 | &add ($key,16); | ||
| 686 | &pxor ("xmm4",&QWP($k_s63,$const)); | ||
| 687 | &pshufb ("xmm4","xmm5"); | ||
| 688 | &movdqa ("xmm3","xmm4"); | ||
| 689 | &pshufb ("xmm4","xmm5"); | ||
| 690 | &pxor ("xmm3","xmm4"); | ||
| 691 | &pshufb ("xmm4","xmm5"); | ||
| 692 | &pxor ("xmm3","xmm4"); | ||
| 693 | |||
| 694 | &jmp (&label("schedule_mangle_both")); | ||
| 695 | |||
| 696 | &set_label("schedule_mangle_dec",16); | ||
| 697 | # inverse mix columns | ||
| 698 | &movdqa ("xmm2",&QWP($k_s0F,$const)); | ||
| 699 | &lea ($inp,&DWP($k_dksd,$const)); | ||
| 700 | &movdqa ("xmm1","xmm2"); | ||
| 701 | &pandn ("xmm1","xmm4"); | ||
| 702 | &psrld ("xmm1",4); # 1 = hi | ||
| 703 | &pand ("xmm4","xmm2"); # 4 = lo | ||
| 704 | |||
| 705 | &movdqa ("xmm2",&QWP(0,$inp)); | ||
| 706 | &pshufb ("xmm2","xmm4"); | ||
| 707 | &movdqa ("xmm3",&QWP(0x10,$inp)); | ||
| 708 | &pshufb ("xmm3","xmm1"); | ||
| 709 | &pxor ("xmm3","xmm2"); | ||
| 710 | &pshufb ("xmm3","xmm5"); | ||
| 711 | |||
| 712 | &movdqa ("xmm2",&QWP(0x20,$inp)); | ||
| 713 | &pshufb ("xmm2","xmm4"); | ||
| 714 | &pxor ("xmm2","xmm3"); | ||
| 715 | &movdqa ("xmm3",&QWP(0x30,$inp)); | ||
| 716 | &pshufb ("xmm3","xmm1"); | ||
| 717 | &pxor ("xmm3","xmm2"); | ||
| 718 | &pshufb ("xmm3","xmm5"); | ||
| 719 | |||
| 720 | &movdqa ("xmm2",&QWP(0x40,$inp)); | ||
| 721 | &pshufb ("xmm2","xmm4"); | ||
| 722 | &pxor ("xmm2","xmm3"); | ||
| 723 | &movdqa ("xmm3",&QWP(0x50,$inp)); | ||
| 724 | &pshufb ("xmm3","xmm1"); | ||
| 725 | &pxor ("xmm3","xmm2"); | ||
| 726 | &pshufb ("xmm3","xmm5"); | ||
| 727 | |||
| 728 | &movdqa ("xmm2",&QWP(0x60,$inp)); | ||
| 729 | &pshufb ("xmm2","xmm4"); | ||
| 730 | &pxor ("xmm2","xmm3"); | ||
| 731 | &movdqa ("xmm3",&QWP(0x70,$inp)); | ||
| 732 | &pshufb ("xmm3","xmm1"); | ||
| 733 | &pxor ("xmm3","xmm2"); | ||
| 734 | |||
| 735 | &add ($key,-16); | ||
| 736 | |||
| 737 | &set_label("schedule_mangle_both"); | ||
| 738 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
| 739 | &pshufb ("xmm3","xmm1"); | ||
| 740 | &add ($magic,-16); | ||
| 741 | &and ($magic,0x30); | ||
| 742 | &movdqu (&QWP(0,$key),"xmm3"); | ||
| 743 | &ret (); | ||
| 744 | &function_end_B("_vpaes_schedule_mangle"); | ||
| 745 | |||
| 746 | # | ||
| 747 | # Interface to OpenSSL | ||
| 748 | # | ||
| 749 | &function_begin("${PREFIX}_set_encrypt_key"); | ||
| 750 | &mov ($inp,&wparam(0)); # inp | ||
| 751 | &lea ($base,&DWP(-56,"esp")); | ||
| 752 | &mov ($round,&wparam(1)); # bits | ||
| 753 | &and ($base,-16); | ||
| 754 | &mov ($key,&wparam(2)); # key | ||
| 755 | &xchg ($base,"esp"); # alloca | ||
| 756 | &mov (&DWP(48,"esp"),$base); | ||
| 757 | |||
| 758 | &mov ($base,$round); | ||
| 759 | &shr ($base,5); | ||
| 760 | &add ($base,5); | ||
| 761 | &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; | ||
| 762 | &mov ($magic,0x30); | ||
| 763 | &mov ($out,0); | ||
| 764 | |||
| 765 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
| 766 | &call ("_vpaes_schedule_core"); | ||
| 767 | &set_label("pic_point"); | ||
| 768 | |||
| 769 | &mov ("esp",&DWP(48,"esp")); | ||
| 770 | &xor ("eax","eax"); | ||
| 771 | &function_end("${PREFIX}_set_encrypt_key"); | ||
| 772 | |||
| 773 | &function_begin("${PREFIX}_set_decrypt_key"); | ||
| 774 | &mov ($inp,&wparam(0)); # inp | ||
| 775 | &lea ($base,&DWP(-56,"esp")); | ||
| 776 | &mov ($round,&wparam(1)); # bits | ||
| 777 | &and ($base,-16); | ||
| 778 | &mov ($key,&wparam(2)); # key | ||
| 779 | &xchg ($base,"esp"); # alloca | ||
| 780 | &mov (&DWP(48,"esp"),$base); | ||
| 781 | |||
| 782 | &mov ($base,$round); | ||
| 783 | &shr ($base,5); | ||
| 784 | &add ($base,5); | ||
| 785 | &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; | ||
| 786 | &shl ($base,4); | ||
| 787 | &lea ($key,&DWP(16,$key,$base)); | ||
| 788 | |||
| 789 | &mov ($out,1); | ||
| 790 | &mov ($magic,$round); | ||
| 791 | &shr ($magic,1); | ||
| 792 | &and ($magic,32); | ||
| 793 | &xor ($magic,32); # nbist==192?0:32; | ||
| 794 | |||
| 795 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
| 796 | &call ("_vpaes_schedule_core"); | ||
| 797 | &set_label("pic_point"); | ||
| 798 | |||
| 799 | &mov ("esp",&DWP(48,"esp")); | ||
| 800 | &xor ("eax","eax"); | ||
| 801 | &function_end("${PREFIX}_set_decrypt_key"); | ||
| 802 | |||
| 803 | &function_begin("${PREFIX}_encrypt"); | ||
| 804 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
| 805 | &call ("_vpaes_preheat"); | ||
| 806 | &set_label("pic_point"); | ||
| 807 | &mov ($inp,&wparam(0)); # inp | ||
| 808 | &lea ($base,&DWP(-56,"esp")); | ||
| 809 | &mov ($out,&wparam(1)); # out | ||
| 810 | &and ($base,-16); | ||
| 811 | &mov ($key,&wparam(2)); # key | ||
| 812 | &xchg ($base,"esp"); # alloca | ||
| 813 | &mov (&DWP(48,"esp"),$base); | ||
| 814 | |||
| 815 | &movdqu ("xmm0",&QWP(0,$inp)); | ||
| 816 | &call ("_vpaes_encrypt_core"); | ||
| 817 | &movdqu (&QWP(0,$out),"xmm0"); | ||
| 818 | |||
| 819 | &mov ("esp",&DWP(48,"esp")); | ||
| 820 | &function_end("${PREFIX}_encrypt"); | ||
| 821 | |||
| 822 | &function_begin("${PREFIX}_decrypt"); | ||
| 823 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
| 824 | &call ("_vpaes_preheat"); | ||
| 825 | &set_label("pic_point"); | ||
| 826 | &mov ($inp,&wparam(0)); # inp | ||
| 827 | &lea ($base,&DWP(-56,"esp")); | ||
| 828 | &mov ($out,&wparam(1)); # out | ||
| 829 | &and ($base,-16); | ||
| 830 | &mov ($key,&wparam(2)); # key | ||
| 831 | &xchg ($base,"esp"); # alloca | ||
| 832 | &mov (&DWP(48,"esp"),$base); | ||
| 833 | |||
| 834 | &movdqu ("xmm0",&QWP(0,$inp)); | ||
| 835 | &call ("_vpaes_decrypt_core"); | ||
| 836 | &movdqu (&QWP(0,$out),"xmm0"); | ||
| 837 | |||
| 838 | &mov ("esp",&DWP(48,"esp")); | ||
| 839 | &function_end("${PREFIX}_decrypt"); | ||
| 840 | |||
| 841 | &function_begin("${PREFIX}_cbc_encrypt"); | ||
| 842 | &mov ($inp,&wparam(0)); # inp | ||
| 843 | &mov ($out,&wparam(1)); # out | ||
| 844 | &mov ($round,&wparam(2)); # len | ||
| 845 | &mov ($key,&wparam(3)); # key | ||
| 846 | &sub ($round,16); | ||
| 847 | &jc (&label("cbc_abort")); | ||
| 848 | &lea ($base,&DWP(-56,"esp")); | ||
| 849 | &mov ($const,&wparam(4)); # ivp | ||
| 850 | &and ($base,-16); | ||
| 851 | &mov ($magic,&wparam(5)); # enc | ||
| 852 | &xchg ($base,"esp"); # alloca | ||
| 853 | &movdqu ("xmm1",&QWP(0,$const)); # load IV | ||
| 854 | &sub ($out,$inp); | ||
| 855 | &mov (&DWP(48,"esp"),$base); | ||
| 856 | |||
| 857 | &mov (&DWP(0,"esp"),$out); # save out | ||
| 858 | &mov (&DWP(4,"esp"),$key) # save key | ||
| 859 | &mov (&DWP(8,"esp"),$const); # save ivp | ||
| 860 | &mov ($out,$round); # $out works as $len | ||
| 861 | |||
| 862 | &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); | ||
| 863 | &call ("_vpaes_preheat"); | ||
| 864 | &set_label("pic_point"); | ||
| 865 | &cmp ($magic,0); | ||
| 866 | &je (&label("cbc_dec_loop")); | ||
| 867 | &jmp (&label("cbc_enc_loop")); | ||
| 868 | |||
| 869 | &set_label("cbc_enc_loop",16); | ||
| 870 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
| 871 | &pxor ("xmm0","xmm1"); # inp^=iv | ||
| 872 | &call ("_vpaes_encrypt_core"); | ||
| 873 | &mov ($base,&DWP(0,"esp")); # restore out | ||
| 874 | &mov ($key,&DWP(4,"esp")); # restore key | ||
| 875 | &movdqa ("xmm1","xmm0"); | ||
| 876 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
| 877 | &lea ($inp,&DWP(16,$inp)); | ||
| 878 | &sub ($out,16); | ||
| 879 | &jnc (&label("cbc_enc_loop")); | ||
| 880 | &jmp (&label("cbc_done")); | ||
| 881 | |||
| 882 | &set_label("cbc_dec_loop",16); | ||
| 883 | &movdqu ("xmm0",&QWP(0,$inp)); # load input | ||
| 884 | &movdqa (&QWP(16,"esp"),"xmm1"); # save IV | ||
| 885 | &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV | ||
| 886 | &call ("_vpaes_decrypt_core"); | ||
| 887 | &mov ($base,&DWP(0,"esp")); # restore out | ||
| 888 | &mov ($key,&DWP(4,"esp")); # restore key | ||
| 889 | &pxor ("xmm0",&QWP(16,"esp")); # out^=iv | ||
| 890 | &movdqa ("xmm1",&QWP(32,"esp")); # load next IV | ||
| 891 | &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output | ||
| 892 | &lea ($inp,&DWP(16,$inp)); | ||
| 893 | &sub ($out,16); | ||
| 894 | &jnc (&label("cbc_dec_loop")); | ||
| 895 | |||
| 896 | &set_label("cbc_done"); | ||
| 897 | &mov ($base,&DWP(8,"esp")); # restore ivp | ||
| 898 | &mov ("esp",&DWP(48,"esp")); | ||
| 899 | &movdqu (&QWP(0,$base),"xmm1"); # write IV | ||
| 900 | &set_label("cbc_abort"); | ||
| 901 | &function_end("${PREFIX}_cbc_encrypt"); | ||
| 902 | |||
| 903 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl new file mode 100644 index 0000000000..37998db5e1 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl | |||
| @@ -0,0 +1,1206 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | ###################################################################### | ||
| 4 | ## Constant-time SSSE3 AES core implementation. | ||
| 5 | ## version 0.1 | ||
| 6 | ## | ||
| 7 | ## By Mike Hamburg (Stanford University), 2009 | ||
| 8 | ## Public domain. | ||
| 9 | ## | ||
| 10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
| 11 | ## http://crypto.stanford.edu/vpaes/. | ||
| 12 | |||
| 13 | ###################################################################### | ||
| 14 | # September 2011. | ||
| 15 | # | ||
| 16 | # Interface to OpenSSL as "almost" drop-in replacement for | ||
| 17 | # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
| 18 | # doesn't handle partial vectors (doesn't have to if called from | ||
| 19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
| 20 | # schedule structure with the original nor does it make assumption | ||
| 21 | # about its alignment... | ||
| 22 | # | ||
| 23 | # Performance summary. aes-x86_64.pl column lists large-block CBC | ||
| 24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
| 25 | # byte processed with 128-bit key, and vpaes-x86_64.pl column - | ||
| 26 | # [also large-block CBC] encrypt/decrypt. | ||
| 27 | # | ||
| 28 | # aes-x86_64.pl vpaes-x86_64.pl | ||
| 29 | # | ||
| 30 | # Core 2(**) 30.5/43.7/14.3 21.8/25.7(***) | ||
| 31 | # Nehalem 30.5/42.2/14.6 9.8/11.8 | ||
| 32 | # Atom 63.9/79.0/32.1 64.0/84.8(***) | ||
| 33 | # | ||
| 34 | # (*) "Hyper-threading" in the context refers rather to cache shared | ||
| 35 | # among multiple cores, than to specifically Intel HTT. As vast | ||
| 36 | # majority of contemporary cores share cache, slower code path | ||
| 37 | # is common place. In other words "with-hyper-threading-off" | ||
| 38 | # results are presented mostly for reference purposes. | ||
| 39 | # | ||
| 40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
| 41 | # | ||
| 42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
| 43 | # pshufb, yet it's respectable +40%/78% improvement on Core 2 | ||
| 44 | # (as implied, over "hyper-threading-safe" code path). | ||
| 45 | # | ||
| 46 | # <appro@openssl.org> | ||
| 47 | |||
| 48 | $flavour = shift; | ||
| 49 | $output = shift; | ||
| 50 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 51 | |||
| 52 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 53 | |||
| 54 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 55 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 56 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 57 | die "can't locate x86_64-xlate.pl"; | ||
| 58 | |||
| 59 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 60 | |||
| 61 | $PREFIX="vpaes"; | ||
| 62 | |||
| 63 | $code.=<<___; | ||
| 64 | .text | ||
| 65 | |||
| 66 | ## | ||
| 67 | ## _aes_encrypt_core | ||
| 68 | ## | ||
| 69 | ## AES-encrypt %xmm0. | ||
| 70 | ## | ||
| 71 | ## Inputs: | ||
| 72 | ## %xmm0 = input | ||
| 73 | ## %xmm9-%xmm15 as in _vpaes_preheat | ||
| 74 | ## (%rdx) = scheduled keys | ||
| 75 | ## | ||
| 76 | ## Output in %xmm0 | ||
| 77 | ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax | ||
| 78 | ## Preserves %xmm6 - %xmm8 so you get some local vectors | ||
| 79 | ## | ||
| 80 | ## | ||
| 81 | .type _vpaes_encrypt_core,\@abi-omnipotent | ||
| 82 | .align 16 | ||
| 83 | _vpaes_encrypt_core: | ||
| 84 | mov %rdx, %r9 | ||
| 85 | mov \$16, %r11 | ||
| 86 | mov 240(%rdx),%eax | ||
| 87 | movdqa %xmm9, %xmm1 | ||
| 88 | movdqa .Lk_ipt(%rip), %xmm2 # iptlo | ||
| 89 | pandn %xmm0, %xmm1 | ||
| 90 | movdqu (%r9), %xmm5 # round0 key | ||
| 91 | psrld \$4, %xmm1 | ||
| 92 | pand %xmm9, %xmm0 | ||
| 93 | pshufb %xmm0, %xmm2 | ||
| 94 | movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi | ||
| 95 | pshufb %xmm1, %xmm0 | ||
| 96 | pxor %xmm5, %xmm2 | ||
| 97 | pxor %xmm2, %xmm0 | ||
| 98 | add \$16, %r9 | ||
| 99 | lea .Lk_mc_backward(%rip),%r10 | ||
| 100 | jmp .Lenc_entry | ||
| 101 | |||
| 102 | .align 16 | ||
| 103 | .Lenc_loop: | ||
| 104 | # middle of middle round | ||
| 105 | movdqa %xmm13, %xmm4 # 4 : sb1u | ||
| 106 | pshufb %xmm2, %xmm4 # 4 = sb1u | ||
| 107 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
| 108 | movdqa %xmm12, %xmm0 # 0 : sb1t | ||
| 109 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 110 | pxor %xmm4, %xmm0 # 0 = A | ||
| 111 | movdqa %xmm15, %xmm5 # 4 : sb2u | ||
| 112 | pshufb %xmm2, %xmm5 # 4 = sb2u | ||
| 113 | movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] | ||
| 114 | movdqa %xmm14, %xmm2 # 2 : sb2t | ||
| 115 | pshufb %xmm3, %xmm2 # 2 = sb2t | ||
| 116 | pxor %xmm5, %xmm2 # 2 = 2A | ||
| 117 | movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] | ||
| 118 | movdqa %xmm0, %xmm3 # 3 = A | ||
| 119 | pshufb %xmm1, %xmm0 # 0 = B | ||
| 120 | add \$16, %r9 # next key | ||
| 121 | pxor %xmm2, %xmm0 # 0 = 2A+B | ||
| 122 | pshufb %xmm4, %xmm3 # 3 = D | ||
| 123 | add \$16, %r11 # next mc | ||
| 124 | pxor %xmm0, %xmm3 # 3 = 2A+B+D | ||
| 125 | pshufb %xmm1, %xmm0 # 0 = 2B+C | ||
| 126 | and \$0x30, %r11 # ... mod 4 | ||
| 127 | pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D | ||
| 128 | sub \$1,%rax # nr-- | ||
| 129 | |||
| 130 | .Lenc_entry: | ||
| 131 | # top of round | ||
| 132 | movdqa %xmm9, %xmm1 # 1 : i | ||
| 133 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
| 134 | psrld \$4, %xmm1 # 1 = i | ||
| 135 | pand %xmm9, %xmm0 # 0 = k | ||
| 136 | movdqa %xmm11, %xmm5 # 2 : a/k | ||
| 137 | pshufb %xmm0, %xmm5 # 2 = a/k | ||
| 138 | pxor %xmm1, %xmm0 # 0 = j | ||
| 139 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 140 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 141 | pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k | ||
| 142 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 143 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 144 | pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k | ||
| 145 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 146 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 147 | pxor %xmm0, %xmm2 # 2 = io | ||
| 148 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 149 | movdqu (%r9), %xmm5 | ||
| 150 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 151 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 152 | jnz .Lenc_loop | ||
| 153 | |||
| 154 | # middle of last round | ||
| 155 | movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo | ||
| 156 | movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 | ||
| 157 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 158 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
| 159 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 160 | movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] | ||
| 161 | pxor %xmm4, %xmm0 # 0 = A | ||
| 162 | pshufb %xmm1, %xmm0 | ||
| 163 | ret | ||
| 164 | .size _vpaes_encrypt_core,.-_vpaes_encrypt_core | ||
| 165 | |||
| 166 | ## | ||
| 167 | ## Decryption core | ||
| 168 | ## | ||
| 169 | ## Same API as encryption core. | ||
| 170 | ## | ||
| 171 | .type _vpaes_decrypt_core,\@abi-omnipotent | ||
| 172 | .align 16 | ||
| 173 | _vpaes_decrypt_core: | ||
| 174 | mov %rdx, %r9 # load key | ||
| 175 | mov 240(%rdx),%eax | ||
| 176 | movdqa %xmm9, %xmm1 | ||
| 177 | movdqa .Lk_dipt(%rip), %xmm2 # iptlo | ||
| 178 | pandn %xmm0, %xmm1 | ||
| 179 | mov %rax, %r11 | ||
| 180 | psrld \$4, %xmm1 | ||
| 181 | movdqu (%r9), %xmm5 # round0 key | ||
| 182 | shl \$4, %r11 | ||
| 183 | pand %xmm9, %xmm0 | ||
| 184 | pshufb %xmm0, %xmm2 | ||
| 185 | movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi | ||
| 186 | xor \$0x30, %r11 | ||
| 187 | lea .Lk_dsbd(%rip),%r10 | ||
| 188 | pshufb %xmm1, %xmm0 | ||
| 189 | and \$0x30, %r11 | ||
| 190 | pxor %xmm5, %xmm2 | ||
| 191 | movdqa .Lk_mc_forward+48(%rip), %xmm5 | ||
| 192 | pxor %xmm2, %xmm0 | ||
| 193 | add \$16, %r9 | ||
| 194 | add %r10, %r11 | ||
| 195 | jmp .Ldec_entry | ||
| 196 | |||
| 197 | .align 16 | ||
| 198 | .Ldec_loop: | ||
| 199 | ## | ||
| 200 | ## Inverse mix columns | ||
| 201 | ## | ||
| 202 | movdqa -0x20(%r10),%xmm4 # 4 : sb9u | ||
| 203 | pshufb %xmm2, %xmm4 # 4 = sb9u | ||
| 204 | pxor %xmm0, %xmm4 | ||
| 205 | movdqa -0x10(%r10),%xmm0 # 0 : sb9t | ||
| 206 | pshufb %xmm3, %xmm0 # 0 = sb9t | ||
| 207 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 208 | add \$16, %r9 # next round key | ||
| 209 | |||
| 210 | pshufb %xmm5, %xmm0 # MC ch | ||
| 211 | movdqa 0x00(%r10),%xmm4 # 4 : sbdu | ||
| 212 | pshufb %xmm2, %xmm4 # 4 = sbdu | ||
| 213 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 214 | movdqa 0x10(%r10),%xmm0 # 0 : sbdt | ||
| 215 | pshufb %xmm3, %xmm0 # 0 = sbdt | ||
| 216 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 217 | sub \$1,%rax # nr-- | ||
| 218 | |||
| 219 | pshufb %xmm5, %xmm0 # MC ch | ||
| 220 | movdqa 0x20(%r10),%xmm4 # 4 : sbbu | ||
| 221 | pshufb %xmm2, %xmm4 # 4 = sbbu | ||
| 222 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 223 | movdqa 0x30(%r10),%xmm0 # 0 : sbbt | ||
| 224 | pshufb %xmm3, %xmm0 # 0 = sbbt | ||
| 225 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 226 | |||
| 227 | pshufb %xmm5, %xmm0 # MC ch | ||
| 228 | movdqa 0x40(%r10),%xmm4 # 4 : sbeu | ||
| 229 | pshufb %xmm2, %xmm4 # 4 = sbeu | ||
| 230 | pxor %xmm0, %xmm4 # 4 = ch | ||
| 231 | movdqa 0x50(%r10),%xmm0 # 0 : sbet | ||
| 232 | pshufb %xmm3, %xmm0 # 0 = sbet | ||
| 233 | pxor %xmm4, %xmm0 # 0 = ch | ||
| 234 | |||
| 235 | palignr \$12, %xmm5, %xmm5 | ||
| 236 | |||
| 237 | .Ldec_entry: | ||
| 238 | # top of round | ||
| 239 | movdqa %xmm9, %xmm1 # 1 : i | ||
| 240 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
| 241 | psrld \$4, %xmm1 # 1 = i | ||
| 242 | pand %xmm9, %xmm0 # 0 = k | ||
| 243 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
| 244 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
| 245 | pxor %xmm1, %xmm0 # 0 = j | ||
| 246 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 247 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 248 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
| 249 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 250 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 251 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
| 252 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 253 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 254 | pxor %xmm0, %xmm2 # 2 = io | ||
| 255 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 256 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 257 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 258 | movdqu (%r9), %xmm0 | ||
| 259 | jnz .Ldec_loop | ||
| 260 | |||
| 261 | # middle of last round | ||
| 262 | movdqa 0x60(%r10), %xmm4 # 3 : sbou | ||
| 263 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 264 | pxor %xmm0, %xmm4 # 4 = sb1u + k | ||
| 265 | movdqa 0x70(%r10), %xmm0 # 0 : sbot | ||
| 266 | movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 | ||
| 267 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 268 | pxor %xmm4, %xmm0 # 0 = A | ||
| 269 | pshufb %xmm2, %xmm0 | ||
| 270 | ret | ||
| 271 | .size _vpaes_decrypt_core,.-_vpaes_decrypt_core | ||
| 272 | |||
| 273 | ######################################################## | ||
| 274 | ## ## | ||
| 275 | ## AES key schedule ## | ||
| 276 | ## ## | ||
| 277 | ######################################################## | ||
| 278 | .type _vpaes_schedule_core,\@abi-omnipotent | ||
| 279 | .align 16 | ||
| 280 | _vpaes_schedule_core: | ||
| 281 | # rdi = key | ||
| 282 | # rsi = size in bits | ||
| 283 | # rdx = buffer | ||
| 284 | # rcx = direction. 0=encrypt, 1=decrypt | ||
| 285 | |||
| 286 | call _vpaes_preheat # load the tables | ||
| 287 | movdqa .Lk_rcon(%rip), %xmm8 # load rcon | ||
| 288 | movdqu (%rdi), %xmm0 # load key (unaligned) | ||
| 289 | |||
| 290 | # input transform | ||
| 291 | movdqa %xmm0, %xmm3 | ||
| 292 | lea .Lk_ipt(%rip), %r11 | ||
| 293 | call _vpaes_schedule_transform | ||
| 294 | movdqa %xmm0, %xmm7 | ||
| 295 | |||
| 296 | lea .Lk_sr(%rip),%r10 | ||
| 297 | test %rcx, %rcx | ||
| 298 | jnz .Lschedule_am_decrypting | ||
| 299 | |||
| 300 | # encrypting, output zeroth round key after transform | ||
| 301 | movdqu %xmm0, (%rdx) | ||
| 302 | jmp .Lschedule_go | ||
| 303 | |||
| 304 | .Lschedule_am_decrypting: | ||
| 305 | # decrypting, output zeroth round key after shiftrows | ||
| 306 | movdqa (%r8,%r10),%xmm1 | ||
| 307 | pshufb %xmm1, %xmm3 | ||
| 308 | movdqu %xmm3, (%rdx) | ||
| 309 | xor \$0x30, %r8 | ||
| 310 | |||
| 311 | .Lschedule_go: | ||
| 312 | cmp \$192, %esi | ||
| 313 | ja .Lschedule_256 | ||
| 314 | je .Lschedule_192 | ||
| 315 | # 128: fall through | ||
| 316 | |||
| 317 | ## | ||
| 318 | ## .schedule_128 | ||
| 319 | ## | ||
| 320 | ## 128-bit specific part of key schedule. | ||
| 321 | ## | ||
| 322 | ## This schedule is really simple, because all its parts | ||
| 323 | ## are accomplished by the subroutines. | ||
| 324 | ## | ||
| 325 | .Lschedule_128: | ||
| 326 | mov \$10, %esi | ||
| 327 | |||
| 328 | .Loop_schedule_128: | ||
| 329 | call _vpaes_schedule_round | ||
| 330 | dec %rsi | ||
| 331 | jz .Lschedule_mangle_last | ||
| 332 | call _vpaes_schedule_mangle # write output | ||
| 333 | jmp .Loop_schedule_128 | ||
| 334 | |||
| 335 | ## | ||
| 336 | ## .aes_schedule_192 | ||
| 337 | ## | ||
| 338 | ## 192-bit specific part of key schedule. | ||
| 339 | ## | ||
| 340 | ## The main body of this schedule is the same as the 128-bit | ||
| 341 | ## schedule, but with more smearing. The long, high side is | ||
| 342 | ## stored in %xmm7 as before, and the short, low side is in | ||
| 343 | ## the high bits of %xmm6. | ||
| 344 | ## | ||
| 345 | ## This schedule is somewhat nastier, however, because each | ||
| 346 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
| 347 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
| 348 | ## keys. | ||
| 349 | ## | ||
| 350 | .align 16 | ||
| 351 | .Lschedule_192: | ||
| 352 | movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) | ||
| 353 | call _vpaes_schedule_transform # input transform | ||
| 354 | movdqa %xmm0, %xmm6 # save short part | ||
| 355 | pxor %xmm4, %xmm4 # clear 4 | ||
| 356 | movhlps %xmm4, %xmm6 # clobber low side with zeros | ||
| 357 | mov \$4, %esi | ||
| 358 | |||
| 359 | .Loop_schedule_192: | ||
| 360 | call _vpaes_schedule_round | ||
| 361 | palignr \$8,%xmm6,%xmm0 | ||
| 362 | call _vpaes_schedule_mangle # save key n | ||
| 363 | call _vpaes_schedule_192_smear | ||
| 364 | call _vpaes_schedule_mangle # save key n+1 | ||
| 365 | call _vpaes_schedule_round | ||
| 366 | dec %rsi | ||
| 367 | jz .Lschedule_mangle_last | ||
| 368 | call _vpaes_schedule_mangle # save key n+2 | ||
| 369 | call _vpaes_schedule_192_smear | ||
| 370 | jmp .Loop_schedule_192 | ||
| 371 | |||
| 372 | ## | ||
| 373 | ## .aes_schedule_256 | ||
| 374 | ## | ||
| 375 | ## 256-bit specific part of key schedule. | ||
| 376 | ## | ||
| 377 | ## The structure here is very similar to the 128-bit | ||
| 378 | ## schedule, but with an additional "low side" in | ||
| 379 | ## %xmm6. The low side's rounds are the same as the | ||
| 380 | ## high side's, except no rcon and no rotation. | ||
| 381 | ## | ||
| 382 | .align 16 | ||
| 383 | .Lschedule_256: | ||
| 384 | movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) | ||
| 385 | call _vpaes_schedule_transform # input transform | ||
| 386 | mov \$7, %esi | ||
| 387 | |||
| 388 | .Loop_schedule_256: | ||
| 389 | call _vpaes_schedule_mangle # output low result | ||
| 390 | movdqa %xmm0, %xmm6 # save cur_lo in xmm6 | ||
| 391 | |||
| 392 | # high round | ||
| 393 | call _vpaes_schedule_round | ||
| 394 | dec %rsi | ||
| 395 | jz .Lschedule_mangle_last | ||
| 396 | call _vpaes_schedule_mangle | ||
| 397 | |||
| 398 | # low round. swap xmm7 and xmm6 | ||
| 399 | pshufd \$0xFF, %xmm0, %xmm0 | ||
| 400 | movdqa %xmm7, %xmm5 | ||
| 401 | movdqa %xmm6, %xmm7 | ||
| 402 | call _vpaes_schedule_low_round | ||
| 403 | movdqa %xmm5, %xmm7 | ||
| 404 | |||
| 405 | jmp .Loop_schedule_256 | ||
| 406 | |||
| 407 | |||
| 408 | ## | ||
| 409 | ## .aes_schedule_mangle_last | ||
| 410 | ## | ||
| 411 | ## Mangler for last round of key schedule | ||
| 412 | ## Mangles %xmm0 | ||
| 413 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
| 414 | ## when decrypting, outputs unskew(%xmm0) | ||
| 415 | ## | ||
| 416 | ## Always called right before return... jumps to cleanup and exits | ||
| 417 | ## | ||
| 418 | .align 16 | ||
| 419 | .Lschedule_mangle_last: | ||
| 420 | # schedule last round key from xmm0 | ||
| 421 | lea .Lk_deskew(%rip),%r11 # prepare to deskew | ||
| 422 | test %rcx, %rcx | ||
| 423 | jnz .Lschedule_mangle_last_dec | ||
| 424 | |||
| 425 | # encrypting | ||
| 426 | movdqa (%r8,%r10),%xmm1 | ||
| 427 | pshufb %xmm1, %xmm0 # output permute | ||
| 428 | lea .Lk_opt(%rip), %r11 # prepare to output transform | ||
| 429 | add \$32, %rdx | ||
| 430 | |||
| 431 | .Lschedule_mangle_last_dec: | ||
| 432 | add \$-16, %rdx | ||
| 433 | pxor .Lk_s63(%rip), %xmm0 | ||
| 434 | call _vpaes_schedule_transform # output transform | ||
| 435 | movdqu %xmm0, (%rdx) # save last key | ||
| 436 | |||
| 437 | # cleanup | ||
| 438 | pxor %xmm0, %xmm0 | ||
| 439 | pxor %xmm1, %xmm1 | ||
| 440 | pxor %xmm2, %xmm2 | ||
| 441 | pxor %xmm3, %xmm3 | ||
| 442 | pxor %xmm4, %xmm4 | ||
| 443 | pxor %xmm5, %xmm5 | ||
| 444 | pxor %xmm6, %xmm6 | ||
| 445 | pxor %xmm7, %xmm7 | ||
| 446 | ret | ||
| 447 | .size _vpaes_schedule_core,.-_vpaes_schedule_core | ||
| 448 | |||
| 449 | ## | ||
| 450 | ## .aes_schedule_192_smear | ||
| 451 | ## | ||
| 452 | ## Smear the short, low side in the 192-bit key schedule. | ||
| 453 | ## | ||
| 454 | ## Inputs: | ||
| 455 | ## %xmm7: high side, b a x y | ||
| 456 | ## %xmm6: low side, d c 0 0 | ||
| 457 | ## %xmm13: 0 | ||
| 458 | ## | ||
| 459 | ## Outputs: | ||
| 460 | ## %xmm6: b+c+d b+c 0 0 | ||
| 461 | ## %xmm0: b+c+d b+c b a | ||
| 462 | ## | ||
| 463 | .type _vpaes_schedule_192_smear,\@abi-omnipotent | ||
| 464 | .align 16 | ||
| 465 | _vpaes_schedule_192_smear: | ||
| 466 | pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 | ||
| 467 | pxor %xmm0, %xmm6 # -> c+d c 0 0 | ||
| 468 | pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a | ||
| 469 | pxor %xmm0, %xmm6 # -> b+c+d b+c b a | ||
| 470 | movdqa %xmm6, %xmm0 | ||
| 471 | pxor %xmm1, %xmm1 | ||
| 472 | movhlps %xmm1, %xmm6 # clobber low side with zeros | ||
| 473 | ret | ||
| 474 | .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear | ||
| 475 | |||
| 476 | ## | ||
| 477 | ## .aes_schedule_round | ||
| 478 | ## | ||
| 479 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
| 480 | ## | ||
| 481 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
| 482 | ## then rotates it by one byte and xors into the low dword of | ||
| 483 | ## %xmm7. | ||
| 484 | ## | ||
| 485 | ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for | ||
| 486 | ## next rcon. | ||
| 487 | ## | ||
| 488 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
| 489 | ## second low, result into third, result into highest. | ||
| 490 | ## | ||
| 491 | ## Returns results in %xmm7 = %xmm0. | ||
| 492 | ## Clobbers %xmm1-%xmm4, %r11. | ||
| 493 | ## | ||
| 494 | .type _vpaes_schedule_round,\@abi-omnipotent | ||
| 495 | .align 16 | ||
| 496 | _vpaes_schedule_round: | ||
| 497 | # extract rcon from xmm8 | ||
| 498 | pxor %xmm1, %xmm1 | ||
| 499 | palignr \$15, %xmm8, %xmm1 | ||
| 500 | palignr \$15, %xmm8, %xmm8 | ||
| 501 | pxor %xmm1, %xmm7 | ||
| 502 | |||
| 503 | # rotate | ||
| 504 | pshufd \$0xFF, %xmm0, %xmm0 | ||
| 505 | palignr \$1, %xmm0, %xmm0 | ||
| 506 | |||
| 507 | # fall through... | ||
| 508 | |||
| 509 | # low round: same as high round, but no rotation and no rcon. | ||
| 510 | _vpaes_schedule_low_round: | ||
| 511 | # smear xmm7 | ||
| 512 | movdqa %xmm7, %xmm1 | ||
| 513 | pslldq \$4, %xmm7 | ||
| 514 | pxor %xmm1, %xmm7 | ||
| 515 | movdqa %xmm7, %xmm1 | ||
| 516 | pslldq \$8, %xmm7 | ||
| 517 | pxor %xmm1, %xmm7 | ||
| 518 | pxor .Lk_s63(%rip), %xmm7 | ||
| 519 | |||
| 520 | # subbytes | ||
| 521 | movdqa %xmm9, %xmm1 | ||
| 522 | pandn %xmm0, %xmm1 | ||
| 523 | psrld \$4, %xmm1 # 1 = i | ||
| 524 | pand %xmm9, %xmm0 # 0 = k | ||
| 525 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
| 526 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
| 527 | pxor %xmm1, %xmm0 # 0 = j | ||
| 528 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
| 529 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
| 530 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
| 531 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
| 532 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
| 533 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
| 534 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
| 535 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
| 536 | pxor %xmm0, %xmm2 # 2 = io | ||
| 537 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
| 538 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
| 539 | pxor %xmm1, %xmm3 # 3 = jo | ||
| 540 | movdqa %xmm13, %xmm4 # 4 : sbou | ||
| 541 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
| 542 | movdqa %xmm12, %xmm0 # 0 : sbot | ||
| 543 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
| 544 | pxor %xmm4, %xmm0 # 0 = sbox output | ||
| 545 | |||
| 546 | # add in smeared stuff | ||
| 547 | pxor %xmm7, %xmm0 | ||
| 548 | movdqa %xmm0, %xmm7 | ||
| 549 | ret | ||
| 550 | .size _vpaes_schedule_round,.-_vpaes_schedule_round | ||
| 551 | |||
| 552 | ## | ||
| 553 | ## .aes_schedule_transform | ||
| 554 | ## | ||
| 555 | ## Linear-transform %xmm0 according to tables at (%r11) | ||
| 556 | ## | ||
| 557 | ## Requires that %xmm9 = 0x0F0F... as in preheat | ||
| 558 | ## Output in %xmm0 | ||
| 559 | ## Clobbers %xmm1, %xmm2 | ||
| 560 | ## | ||
| 561 | .type _vpaes_schedule_transform,\@abi-omnipotent | ||
| 562 | .align 16 | ||
| 563 | _vpaes_schedule_transform: | ||
| 564 | movdqa %xmm9, %xmm1 | ||
| 565 | pandn %xmm0, %xmm1 | ||
| 566 | psrld \$4, %xmm1 | ||
| 567 | pand %xmm9, %xmm0 | ||
| 568 | movdqa (%r11), %xmm2 # lo | ||
| 569 | pshufb %xmm0, %xmm2 | ||
| 570 | movdqa 16(%r11), %xmm0 # hi | ||
| 571 | pshufb %xmm1, %xmm0 | ||
| 572 | pxor %xmm2, %xmm0 | ||
| 573 | ret | ||
| 574 | .size _vpaes_schedule_transform,.-_vpaes_schedule_transform | ||
| 575 | |||
| 576 | ## | ||
| 577 | ## .aes_schedule_mangle | ||
| 578 | ## | ||
| 579 | ## Mangle xmm0 from (basis-transformed) standard version | ||
| 580 | ## to our version. | ||
| 581 | ## | ||
| 582 | ## On encrypt, | ||
| 583 | ## xor with 0x63 | ||
| 584 | ## multiply by circulant 0,1,1,1 | ||
| 585 | ## apply shiftrows transform | ||
| 586 | ## | ||
| 587 | ## On decrypt, | ||
| 588 | ## xor with 0x63 | ||
| 589 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
| 590 | ## deskew | ||
| 591 | ## apply shiftrows transform | ||
| 592 | ## | ||
| 593 | ## | ||
| 594 | ## Writes out to (%rdx), and increments or decrements it | ||
| 595 | ## Keeps track of round number mod 4 in %r8 | ||
| 596 | ## Preserves xmm0 | ||
| 597 | ## Clobbers xmm1-xmm5 | ||
| 598 | ## | ||
| 599 | .type _vpaes_schedule_mangle,\@abi-omnipotent | ||
| 600 | .align 16 | ||
| 601 | _vpaes_schedule_mangle: | ||
| 602 | movdqa %xmm0, %xmm4 # save xmm0 for later | ||
| 603 | movdqa .Lk_mc_forward(%rip),%xmm5 | ||
| 604 | test %rcx, %rcx | ||
| 605 | jnz .Lschedule_mangle_dec | ||
| 606 | |||
| 607 | # encrypting | ||
| 608 | add \$16, %rdx | ||
| 609 | pxor .Lk_s63(%rip),%xmm4 | ||
| 610 | pshufb %xmm5, %xmm4 | ||
| 611 | movdqa %xmm4, %xmm3 | ||
| 612 | pshufb %xmm5, %xmm4 | ||
| 613 | pxor %xmm4, %xmm3 | ||
| 614 | pshufb %xmm5, %xmm4 | ||
| 615 | pxor %xmm4, %xmm3 | ||
| 616 | |||
| 617 | jmp .Lschedule_mangle_both | ||
| 618 | .align 16 | ||
| 619 | .Lschedule_mangle_dec: | ||
| 620 | # inverse mix columns | ||
| 621 | lea .Lk_dksd(%rip),%r11 | ||
| 622 | movdqa %xmm9, %xmm1 | ||
| 623 | pandn %xmm4, %xmm1 | ||
| 624 | psrld \$4, %xmm1 # 1 = hi | ||
| 625 | pand %xmm9, %xmm4 # 4 = lo | ||
| 626 | |||
| 627 | movdqa 0x00(%r11), %xmm2 | ||
| 628 | pshufb %xmm4, %xmm2 | ||
| 629 | movdqa 0x10(%r11), %xmm3 | ||
| 630 | pshufb %xmm1, %xmm3 | ||
| 631 | pxor %xmm2, %xmm3 | ||
| 632 | pshufb %xmm5, %xmm3 | ||
| 633 | |||
| 634 | movdqa 0x20(%r11), %xmm2 | ||
| 635 | pshufb %xmm4, %xmm2 | ||
| 636 | pxor %xmm3, %xmm2 | ||
| 637 | movdqa 0x30(%r11), %xmm3 | ||
| 638 | pshufb %xmm1, %xmm3 | ||
| 639 | pxor %xmm2, %xmm3 | ||
| 640 | pshufb %xmm5, %xmm3 | ||
| 641 | |||
| 642 | movdqa 0x40(%r11), %xmm2 | ||
| 643 | pshufb %xmm4, %xmm2 | ||
| 644 | pxor %xmm3, %xmm2 | ||
| 645 | movdqa 0x50(%r11), %xmm3 | ||
| 646 | pshufb %xmm1, %xmm3 | ||
| 647 | pxor %xmm2, %xmm3 | ||
| 648 | pshufb %xmm5, %xmm3 | ||
| 649 | |||
| 650 | movdqa 0x60(%r11), %xmm2 | ||
| 651 | pshufb %xmm4, %xmm2 | ||
| 652 | pxor %xmm3, %xmm2 | ||
| 653 | movdqa 0x70(%r11), %xmm3 | ||
| 654 | pshufb %xmm1, %xmm3 | ||
| 655 | pxor %xmm2, %xmm3 | ||
| 656 | |||
| 657 | add \$-16, %rdx | ||
| 658 | |||
| 659 | .Lschedule_mangle_both: | ||
| 660 | movdqa (%r8,%r10),%xmm1 | ||
| 661 | pshufb %xmm1,%xmm3 | ||
| 662 | add \$-16, %r8 | ||
| 663 | and \$0x30, %r8 | ||
| 664 | movdqu %xmm3, (%rdx) | ||
| 665 | ret | ||
| 666 | .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle | ||
| 667 | |||
| 668 | # | ||
| 669 | # Interface to OpenSSL | ||
| 670 | # | ||
| 671 | .globl ${PREFIX}_set_encrypt_key | ||
| 672 | .type ${PREFIX}_set_encrypt_key,\@function,3 | ||
| 673 | .align 16 | ||
| 674 | ${PREFIX}_set_encrypt_key: | ||
| 675 | ___ | ||
| 676 | $code.=<<___ if ($win64); | ||
| 677 | lea -0xb8(%rsp),%rsp | ||
| 678 | movaps %xmm6,0x10(%rsp) | ||
| 679 | movaps %xmm7,0x20(%rsp) | ||
| 680 | movaps %xmm8,0x30(%rsp) | ||
| 681 | movaps %xmm9,0x40(%rsp) | ||
| 682 | movaps %xmm10,0x50(%rsp) | ||
| 683 | movaps %xmm11,0x60(%rsp) | ||
| 684 | movaps %xmm12,0x70(%rsp) | ||
| 685 | movaps %xmm13,0x80(%rsp) | ||
| 686 | movaps %xmm14,0x90(%rsp) | ||
| 687 | movaps %xmm15,0xa0(%rsp) | ||
| 688 | .Lenc_key_body: | ||
| 689 | ___ | ||
| 690 | $code.=<<___; | ||
| 691 | mov %esi,%eax | ||
| 692 | shr \$5,%eax | ||
| 693 | add \$5,%eax | ||
| 694 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; | ||
| 695 | |||
| 696 | mov \$0,%ecx | ||
| 697 | mov \$0x30,%r8d | ||
| 698 | call _vpaes_schedule_core | ||
| 699 | ___ | ||
| 700 | $code.=<<___ if ($win64); | ||
| 701 | movaps 0x10(%rsp),%xmm6 | ||
| 702 | movaps 0x20(%rsp),%xmm7 | ||
| 703 | movaps 0x30(%rsp),%xmm8 | ||
| 704 | movaps 0x40(%rsp),%xmm9 | ||
| 705 | movaps 0x50(%rsp),%xmm10 | ||
| 706 | movaps 0x60(%rsp),%xmm11 | ||
| 707 | movaps 0x70(%rsp),%xmm12 | ||
| 708 | movaps 0x80(%rsp),%xmm13 | ||
| 709 | movaps 0x90(%rsp),%xmm14 | ||
| 710 | movaps 0xa0(%rsp),%xmm15 | ||
| 711 | lea 0xb8(%rsp),%rsp | ||
| 712 | .Lenc_key_epilogue: | ||
| 713 | ___ | ||
| 714 | $code.=<<___; | ||
| 715 | xor %eax,%eax | ||
| 716 | ret | ||
| 717 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key | ||
| 718 | |||
| 719 | .globl ${PREFIX}_set_decrypt_key | ||
| 720 | .type ${PREFIX}_set_decrypt_key,\@function,3 | ||
| 721 | .align 16 | ||
| 722 | ${PREFIX}_set_decrypt_key: | ||
| 723 | ___ | ||
| 724 | $code.=<<___ if ($win64); | ||
| 725 | lea -0xb8(%rsp),%rsp | ||
| 726 | movaps %xmm6,0x10(%rsp) | ||
| 727 | movaps %xmm7,0x20(%rsp) | ||
| 728 | movaps %xmm8,0x30(%rsp) | ||
| 729 | movaps %xmm9,0x40(%rsp) | ||
| 730 | movaps %xmm10,0x50(%rsp) | ||
| 731 | movaps %xmm11,0x60(%rsp) | ||
| 732 | movaps %xmm12,0x70(%rsp) | ||
| 733 | movaps %xmm13,0x80(%rsp) | ||
| 734 | movaps %xmm14,0x90(%rsp) | ||
| 735 | movaps %xmm15,0xa0(%rsp) | ||
| 736 | .Ldec_key_body: | ||
| 737 | ___ | ||
| 738 | $code.=<<___; | ||
| 739 | mov %esi,%eax | ||
| 740 | shr \$5,%eax | ||
| 741 | add \$5,%eax | ||
| 742 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; | ||
| 743 | shl \$4,%eax | ||
| 744 | lea 16(%rdx,%rax),%rdx | ||
| 745 | |||
| 746 | mov \$1,%ecx | ||
| 747 | mov %esi,%r8d | ||
| 748 | shr \$1,%r8d | ||
| 749 | and \$32,%r8d | ||
| 750 | xor \$32,%r8d # nbits==192?0:32 | ||
| 751 | call _vpaes_schedule_core | ||
| 752 | ___ | ||
| 753 | $code.=<<___ if ($win64); | ||
| 754 | movaps 0x10(%rsp),%xmm6 | ||
| 755 | movaps 0x20(%rsp),%xmm7 | ||
| 756 | movaps 0x30(%rsp),%xmm8 | ||
| 757 | movaps 0x40(%rsp),%xmm9 | ||
| 758 | movaps 0x50(%rsp),%xmm10 | ||
| 759 | movaps 0x60(%rsp),%xmm11 | ||
| 760 | movaps 0x70(%rsp),%xmm12 | ||
| 761 | movaps 0x80(%rsp),%xmm13 | ||
| 762 | movaps 0x90(%rsp),%xmm14 | ||
| 763 | movaps 0xa0(%rsp),%xmm15 | ||
| 764 | lea 0xb8(%rsp),%rsp | ||
| 765 | .Ldec_key_epilogue: | ||
| 766 | ___ | ||
| 767 | $code.=<<___; | ||
| 768 | xor %eax,%eax | ||
| 769 | ret | ||
| 770 | .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key | ||
| 771 | |||
| 772 | .globl ${PREFIX}_encrypt | ||
| 773 | .type ${PREFIX}_encrypt,\@function,3 | ||
| 774 | .align 16 | ||
| 775 | ${PREFIX}_encrypt: | ||
| 776 | ___ | ||
| 777 | $code.=<<___ if ($win64); | ||
| 778 | lea -0xb8(%rsp),%rsp | ||
| 779 | movaps %xmm6,0x10(%rsp) | ||
| 780 | movaps %xmm7,0x20(%rsp) | ||
| 781 | movaps %xmm8,0x30(%rsp) | ||
| 782 | movaps %xmm9,0x40(%rsp) | ||
| 783 | movaps %xmm10,0x50(%rsp) | ||
| 784 | movaps %xmm11,0x60(%rsp) | ||
| 785 | movaps %xmm12,0x70(%rsp) | ||
| 786 | movaps %xmm13,0x80(%rsp) | ||
| 787 | movaps %xmm14,0x90(%rsp) | ||
| 788 | movaps %xmm15,0xa0(%rsp) | ||
| 789 | .Lenc_body: | ||
| 790 | ___ | ||
| 791 | $code.=<<___; | ||
| 792 | movdqu (%rdi),%xmm0 | ||
| 793 | call _vpaes_preheat | ||
| 794 | call _vpaes_encrypt_core | ||
| 795 | movdqu %xmm0,(%rsi) | ||
| 796 | ___ | ||
| 797 | $code.=<<___ if ($win64); | ||
| 798 | movaps 0x10(%rsp),%xmm6 | ||
| 799 | movaps 0x20(%rsp),%xmm7 | ||
| 800 | movaps 0x30(%rsp),%xmm8 | ||
| 801 | movaps 0x40(%rsp),%xmm9 | ||
| 802 | movaps 0x50(%rsp),%xmm10 | ||
| 803 | movaps 0x60(%rsp),%xmm11 | ||
| 804 | movaps 0x70(%rsp),%xmm12 | ||
| 805 | movaps 0x80(%rsp),%xmm13 | ||
| 806 | movaps 0x90(%rsp),%xmm14 | ||
| 807 | movaps 0xa0(%rsp),%xmm15 | ||
| 808 | lea 0xb8(%rsp),%rsp | ||
| 809 | .Lenc_epilogue: | ||
| 810 | ___ | ||
| 811 | $code.=<<___; | ||
| 812 | ret | ||
| 813 | .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt | ||
| 814 | |||
| 815 | .globl ${PREFIX}_decrypt | ||
| 816 | .type ${PREFIX}_decrypt,\@function,3 | ||
| 817 | .align 16 | ||
| 818 | ${PREFIX}_decrypt: | ||
| 819 | ___ | ||
| 820 | $code.=<<___ if ($win64); | ||
| 821 | lea -0xb8(%rsp),%rsp | ||
| 822 | movaps %xmm6,0x10(%rsp) | ||
| 823 | movaps %xmm7,0x20(%rsp) | ||
| 824 | movaps %xmm8,0x30(%rsp) | ||
| 825 | movaps %xmm9,0x40(%rsp) | ||
| 826 | movaps %xmm10,0x50(%rsp) | ||
| 827 | movaps %xmm11,0x60(%rsp) | ||
| 828 | movaps %xmm12,0x70(%rsp) | ||
| 829 | movaps %xmm13,0x80(%rsp) | ||
| 830 | movaps %xmm14,0x90(%rsp) | ||
| 831 | movaps %xmm15,0xa0(%rsp) | ||
| 832 | .Ldec_body: | ||
| 833 | ___ | ||
| 834 | $code.=<<___; | ||
| 835 | movdqu (%rdi),%xmm0 | ||
| 836 | call _vpaes_preheat | ||
| 837 | call _vpaes_decrypt_core | ||
| 838 | movdqu %xmm0,(%rsi) | ||
| 839 | ___ | ||
| 840 | $code.=<<___ if ($win64); | ||
| 841 | movaps 0x10(%rsp),%xmm6 | ||
| 842 | movaps 0x20(%rsp),%xmm7 | ||
| 843 | movaps 0x30(%rsp),%xmm8 | ||
| 844 | movaps 0x40(%rsp),%xmm9 | ||
| 845 | movaps 0x50(%rsp),%xmm10 | ||
| 846 | movaps 0x60(%rsp),%xmm11 | ||
| 847 | movaps 0x70(%rsp),%xmm12 | ||
| 848 | movaps 0x80(%rsp),%xmm13 | ||
| 849 | movaps 0x90(%rsp),%xmm14 | ||
| 850 | movaps 0xa0(%rsp),%xmm15 | ||
| 851 | lea 0xb8(%rsp),%rsp | ||
| 852 | .Ldec_epilogue: | ||
| 853 | ___ | ||
| 854 | $code.=<<___; | ||
| 855 | ret | ||
| 856 | .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt | ||
| 857 | ___ | ||
| 858 | { | ||
| 859 | my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||
| 860 | # void AES_cbc_encrypt (const void char *inp, unsigned char *out, | ||
| 861 | # size_t length, const AES_KEY *key, | ||
| 862 | # unsigned char *ivp,const int enc); | ||
| 863 | $code.=<<___; | ||
| 864 | .globl ${PREFIX}_cbc_encrypt | ||
| 865 | .type ${PREFIX}_cbc_encrypt,\@function,6 | ||
| 866 | .align 16 | ||
| 867 | ${PREFIX}_cbc_encrypt: | ||
| 868 | xchg $key,$len | ||
| 869 | ___ | ||
| 870 | ($len,$key)=($key,$len); | ||
| 871 | $code.=<<___; | ||
| 872 | sub \$16,$len | ||
| 873 | jc .Lcbc_abort | ||
| 874 | ___ | ||
| 875 | $code.=<<___ if ($win64); | ||
| 876 | lea -0xb8(%rsp),%rsp | ||
| 877 | movaps %xmm6,0x10(%rsp) | ||
| 878 | movaps %xmm7,0x20(%rsp) | ||
| 879 | movaps %xmm8,0x30(%rsp) | ||
| 880 | movaps %xmm9,0x40(%rsp) | ||
| 881 | movaps %xmm10,0x50(%rsp) | ||
| 882 | movaps %xmm11,0x60(%rsp) | ||
| 883 | movaps %xmm12,0x70(%rsp) | ||
| 884 | movaps %xmm13,0x80(%rsp) | ||
| 885 | movaps %xmm14,0x90(%rsp) | ||
| 886 | movaps %xmm15,0xa0(%rsp) | ||
| 887 | .Lcbc_body: | ||
| 888 | ___ | ||
| 889 | $code.=<<___; | ||
| 890 | movdqu ($ivp),%xmm6 # load IV | ||
| 891 | sub $inp,$out | ||
| 892 | call _vpaes_preheat | ||
| 893 | cmp \$0,${enc}d | ||
| 894 | je .Lcbc_dec_loop | ||
| 895 | jmp .Lcbc_enc_loop | ||
| 896 | .align 16 | ||
| 897 | .Lcbc_enc_loop: | ||
| 898 | movdqu ($inp),%xmm0 | ||
| 899 | pxor %xmm6,%xmm0 | ||
| 900 | call _vpaes_encrypt_core | ||
| 901 | movdqa %xmm0,%xmm6 | ||
| 902 | movdqu %xmm0,($out,$inp) | ||
| 903 | lea 16($inp),$inp | ||
| 904 | sub \$16,$len | ||
| 905 | jnc .Lcbc_enc_loop | ||
| 906 | jmp .Lcbc_done | ||
| 907 | .align 16 | ||
| 908 | .Lcbc_dec_loop: | ||
| 909 | movdqu ($inp),%xmm0 | ||
| 910 | movdqa %xmm0,%xmm7 | ||
| 911 | call _vpaes_decrypt_core | ||
| 912 | pxor %xmm6,%xmm0 | ||
| 913 | movdqa %xmm7,%xmm6 | ||
| 914 | movdqu %xmm0,($out,$inp) | ||
| 915 | lea 16($inp),$inp | ||
| 916 | sub \$16,$len | ||
| 917 | jnc .Lcbc_dec_loop | ||
| 918 | .Lcbc_done: | ||
| 919 | movdqu %xmm6,($ivp) # save IV | ||
| 920 | ___ | ||
| 921 | $code.=<<___ if ($win64); | ||
| 922 | movaps 0x10(%rsp),%xmm6 | ||
| 923 | movaps 0x20(%rsp),%xmm7 | ||
| 924 | movaps 0x30(%rsp),%xmm8 | ||
| 925 | movaps 0x40(%rsp),%xmm9 | ||
| 926 | movaps 0x50(%rsp),%xmm10 | ||
| 927 | movaps 0x60(%rsp),%xmm11 | ||
| 928 | movaps 0x70(%rsp),%xmm12 | ||
| 929 | movaps 0x80(%rsp),%xmm13 | ||
| 930 | movaps 0x90(%rsp),%xmm14 | ||
| 931 | movaps 0xa0(%rsp),%xmm15 | ||
| 932 | lea 0xb8(%rsp),%rsp | ||
| 933 | .Lcbc_epilogue: | ||
| 934 | ___ | ||
| 935 | $code.=<<___; | ||
| 936 | .Lcbc_abort: | ||
| 937 | ret | ||
| 938 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt | ||
| 939 | ___ | ||
| 940 | } | ||
| 941 | $code.=<<___; | ||
| 942 | ## | ||
| 943 | ## _aes_preheat | ||
| 944 | ## | ||
| 945 | ## Fills register %r10 -> .aes_consts (so you can -fPIC) | ||
| 946 | ## and %xmm9-%xmm15 as specified below. | ||
| 947 | ## | ||
| 948 | .type _vpaes_preheat,\@abi-omnipotent | ||
| 949 | .align 16 | ||
| 950 | _vpaes_preheat: | ||
| 951 | lea .Lk_s0F(%rip), %r10 | ||
| 952 | movdqa -0x20(%r10), %xmm10 # .Lk_inv | ||
| 953 | movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 | ||
| 954 | movdqa 0x00(%r10), %xmm9 # .Lk_s0F | ||
| 955 | movdqa 0x30(%r10), %xmm13 # .Lk_sb1 | ||
| 956 | movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 | ||
| 957 | movdqa 0x50(%r10), %xmm15 # .Lk_sb2 | ||
| 958 | movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 | ||
| 959 | ret | ||
| 960 | .size _vpaes_preheat,.-_vpaes_preheat | ||
| 961 | ######################################################## | ||
| 962 | ## ## | ||
| 963 | ## Constants ## | ||
| 964 | ## ## | ||
| 965 | ######################################################## | ||
| 966 | .type _vpaes_consts,\@object | ||
| 967 | .align 64 | ||
| 968 | _vpaes_consts: | ||
| 969 | .Lk_inv: # inv, inva | ||
| 970 | .quad 0x0E05060F0D080180, 0x040703090A0B0C02 | ||
| 971 | .quad 0x01040A060F0B0780, 0x030D0E0C02050809 | ||
| 972 | |||
| 973 | .Lk_s0F: # s0F | ||
| 974 | .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F | ||
| 975 | |||
| 976 | .Lk_ipt: # input transform (lo, hi) | ||
| 977 | .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 | ||
| 978 | .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 | ||
| 979 | |||
| 980 | .Lk_sb1: # sb1u, sb1t | ||
| 981 | .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 | ||
| 982 | .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF | ||
| 983 | .Lk_sb2: # sb2u, sb2t | ||
| 984 | .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD | ||
| 985 | .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A | ||
| 986 | .Lk_sbo: # sbou, sbot | ||
| 987 | .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 | ||
| 988 | .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA | ||
| 989 | |||
| 990 | .Lk_mc_forward: # mc_forward | ||
| 991 | .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 | ||
| 992 | .quad 0x080B0A0904070605, 0x000302010C0F0E0D | ||
| 993 | .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 | ||
| 994 | .quad 0x000302010C0F0E0D, 0x080B0A0904070605 | ||
| 995 | |||
| 996 | .Lk_mc_backward:# mc_backward | ||
| 997 | .quad 0x0605040702010003, 0x0E0D0C0F0A09080B | ||
| 998 | .quad 0x020100030E0D0C0F, 0x0A09080B06050407 | ||
| 999 | .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 | ||
| 1000 | .quad 0x0A09080B06050407, 0x020100030E0D0C0F | ||
| 1001 | |||
| 1002 | .Lk_sr: # sr | ||
| 1003 | .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 | ||
| 1004 | .quad 0x030E09040F0A0500, 0x0B06010C07020D08 | ||
| 1005 | .quad 0x0F060D040B020900, 0x070E050C030A0108 | ||
| 1006 | .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 | ||
| 1007 | |||
| 1008 | .Lk_rcon: # rcon | ||
| 1009 | .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 | ||
| 1010 | |||
| 1011 | .Lk_s63: # s63: all equal to 0x63 transformed | ||
| 1012 | .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B | ||
| 1013 | |||
| 1014 | .Lk_opt: # output transform | ||
| 1015 | .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 | ||
| 1016 | .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 | ||
| 1017 | |||
| 1018 | .Lk_deskew: # deskew tables: inverts the sbox's "skew" | ||
| 1019 | .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A | ||
| 1020 | .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 | ||
| 1021 | |||
| 1022 | ## | ||
| 1023 | ## Decryption stuff | ||
| 1024 | ## Key schedule constants | ||
| 1025 | ## | ||
| 1026 | .Lk_dksd: # decryption key schedule: invskew x*D | ||
| 1027 | .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 | ||
| 1028 | .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E | ||
| 1029 | .Lk_dksb: # decryption key schedule: invskew x*B | ||
| 1030 | .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 | ||
| 1031 | .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 | ||
| 1032 | .Lk_dkse: # decryption key schedule: invskew x*E + 0x63 | ||
| 1033 | .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 | ||
| 1034 | .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 | ||
| 1035 | .Lk_dks9: # decryption key schedule: invskew x*9 | ||
| 1036 | .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC | ||
| 1037 | .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE | ||
| 1038 | |||
| 1039 | ## | ||
| 1040 | ## Decryption stuff | ||
| 1041 | ## Round function constants | ||
| 1042 | ## | ||
| 1043 | .Lk_dipt: # decryption input transform | ||
| 1044 | .quad 0x0F505B040B545F00, 0x154A411E114E451A | ||
| 1045 | .quad 0x86E383E660056500, 0x12771772F491F194 | ||
| 1046 | |||
| 1047 | .Lk_dsb9: # decryption sbox output *9*u, *9*t | ||
| 1048 | .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 | ||
| 1049 | .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 | ||
| 1050 | .Lk_dsbd: # decryption sbox output *D*u, *D*t | ||
| 1051 | .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 | ||
| 1052 | .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 | ||
| 1053 | .Lk_dsbb: # decryption sbox output *B*u, *B*t | ||
| 1054 | .quad 0xD022649296B44200, 0x602646F6B0F2D404 | ||
| 1055 | .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B | ||
| 1056 | .Lk_dsbe: # decryption sbox output *E*u, *E*t | ||
| 1057 | .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 | ||
| 1058 | .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 | ||
| 1059 | .Lk_dsbo: # decryption sbox final output | ||
| 1060 | .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D | ||
| 1061 | .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C | ||
| 1062 | .asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" | ||
| 1063 | .align 64 | ||
| 1064 | .size _vpaes_consts,.-_vpaes_consts | ||
| 1065 | ___ | ||
| 1066 | |||
| 1067 | if ($win64) { | ||
| 1068 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 1069 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 1070 | $rec="%rcx"; | ||
| 1071 | $frame="%rdx"; | ||
| 1072 | $context="%r8"; | ||
| 1073 | $disp="%r9"; | ||
| 1074 | |||
| 1075 | $code.=<<___; | ||
| 1076 | .extern __imp_RtlVirtualUnwind | ||
| 1077 | .type se_handler,\@abi-omnipotent | ||
| 1078 | .align 16 | ||
| 1079 | se_handler: | ||
| 1080 | push %rsi | ||
| 1081 | push %rdi | ||
| 1082 | push %rbx | ||
| 1083 | push %rbp | ||
| 1084 | push %r12 | ||
| 1085 | push %r13 | ||
| 1086 | push %r14 | ||
| 1087 | push %r15 | ||
| 1088 | pushfq | ||
| 1089 | sub \$64,%rsp | ||
| 1090 | |||
| 1091 | mov 120($context),%rax # pull context->Rax | ||
| 1092 | mov 248($context),%rbx # pull context->Rip | ||
| 1093 | |||
| 1094 | mov 8($disp),%rsi # disp->ImageBase | ||
| 1095 | mov 56($disp),%r11 # disp->HandlerData | ||
| 1096 | |||
| 1097 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 1098 | lea (%rsi,%r10),%r10 # prologue label | ||
| 1099 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 1100 | jb .Lin_prologue | ||
| 1101 | |||
| 1102 | mov 152($context),%rax # pull context->Rsp | ||
| 1103 | |||
| 1104 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 1105 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 1106 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 1107 | jae .Lin_prologue | ||
| 1108 | |||
| 1109 | lea 16(%rax),%rsi # %xmm save area | ||
| 1110 | lea 512($context),%rdi # &context.Xmm6 | ||
| 1111 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
| 1112 | .long 0xa548f3fc # cld; rep movsq | ||
| 1113 | lea 0xb8(%rax),%rax # adjust stack pointer | ||
| 1114 | |||
| 1115 | .Lin_prologue: | ||
| 1116 | mov 8(%rax),%rdi | ||
| 1117 | mov 16(%rax),%rsi | ||
| 1118 | mov %rax,152($context) # restore context->Rsp | ||
| 1119 | mov %rsi,168($context) # restore context->Rsi | ||
| 1120 | mov %rdi,176($context) # restore context->Rdi | ||
| 1121 | |||
| 1122 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 1123 | mov $context,%rsi # context | ||
| 1124 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
| 1125 | .long 0xa548f3fc # cld; rep movsq | ||
| 1126 | |||
| 1127 | mov $disp,%rsi | ||
| 1128 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 1129 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 1130 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 1131 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 1132 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 1133 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 1134 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 1135 | mov %r10,32(%rsp) # arg5 | ||
| 1136 | mov %r11,40(%rsp) # arg6 | ||
| 1137 | mov %r12,48(%rsp) # arg7 | ||
| 1138 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 1139 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 1140 | |||
| 1141 | mov \$1,%eax # ExceptionContinueSearch | ||
| 1142 | add \$64,%rsp | ||
| 1143 | popfq | ||
| 1144 | pop %r15 | ||
| 1145 | pop %r14 | ||
| 1146 | pop %r13 | ||
| 1147 | pop %r12 | ||
| 1148 | pop %rbp | ||
| 1149 | pop %rbx | ||
| 1150 | pop %rdi | ||
| 1151 | pop %rsi | ||
| 1152 | ret | ||
| 1153 | .size se_handler,.-se_handler | ||
| 1154 | |||
| 1155 | .section .pdata | ||
| 1156 | .align 4 | ||
| 1157 | .rva .LSEH_begin_${PREFIX}_set_encrypt_key | ||
| 1158 | .rva .LSEH_end_${PREFIX}_set_encrypt_key | ||
| 1159 | .rva .LSEH_info_${PREFIX}_set_encrypt_key | ||
| 1160 | |||
| 1161 | .rva .LSEH_begin_${PREFIX}_set_decrypt_key | ||
| 1162 | .rva .LSEH_end_${PREFIX}_set_decrypt_key | ||
| 1163 | .rva .LSEH_info_${PREFIX}_set_decrypt_key | ||
| 1164 | |||
| 1165 | .rva .LSEH_begin_${PREFIX}_encrypt | ||
| 1166 | .rva .LSEH_end_${PREFIX}_encrypt | ||
| 1167 | .rva .LSEH_info_${PREFIX}_encrypt | ||
| 1168 | |||
| 1169 | .rva .LSEH_begin_${PREFIX}_decrypt | ||
| 1170 | .rva .LSEH_end_${PREFIX}_decrypt | ||
| 1171 | .rva .LSEH_info_${PREFIX}_decrypt | ||
| 1172 | |||
| 1173 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt | ||
| 1174 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | ||
| 1175 | .rva .LSEH_info_${PREFIX}_cbc_encrypt | ||
| 1176 | |||
| 1177 | .section .xdata | ||
| 1178 | .align 8 | ||
| 1179 | .LSEH_info_${PREFIX}_set_encrypt_key: | ||
| 1180 | .byte 9,0,0,0 | ||
| 1181 | .rva se_handler | ||
| 1182 | .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] | ||
| 1183 | .LSEH_info_${PREFIX}_set_decrypt_key: | ||
| 1184 | .byte 9,0,0,0 | ||
| 1185 | .rva se_handler | ||
| 1186 | .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[] | ||
| 1187 | .LSEH_info_${PREFIX}_encrypt: | ||
| 1188 | .byte 9,0,0,0 | ||
| 1189 | .rva se_handler | ||
| 1190 | .rva .Lenc_body,.Lenc_epilogue # HandlerData[] | ||
| 1191 | .LSEH_info_${PREFIX}_decrypt: | ||
| 1192 | .byte 9,0,0,0 | ||
| 1193 | .rva se_handler | ||
| 1194 | .rva .Ldec_body,.Ldec_epilogue # HandlerData[] | ||
| 1195 | .LSEH_info_${PREFIX}_cbc_encrypt: | ||
| 1196 | .byte 9,0,0,0 | ||
| 1197 | .rva se_handler | ||
| 1198 | .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] | ||
| 1199 | ___ | ||
| 1200 | } | ||
| 1201 | |||
| 1202 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 1203 | |||
| 1204 | print $code; | ||
| 1205 | |||
| 1206 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/arm_arch.h b/src/lib/libcrypto/arm_arch.h new file mode 100644 index 0000000000..5a83107680 --- /dev/null +++ b/src/lib/libcrypto/arm_arch.h | |||
| @@ -0,0 +1,51 @@ | |||
| 1 | #ifndef __ARM_ARCH_H__ | ||
| 2 | #define __ARM_ARCH_H__ | ||
| 3 | |||
| 4 | #if !defined(__ARM_ARCH__) | ||
| 5 | # if defined(__CC_ARM) | ||
| 6 | # define __ARM_ARCH__ __TARGET_ARCH_ARM | ||
| 7 | # if defined(__BIG_ENDIAN) | ||
| 8 | # define __ARMEB__ | ||
| 9 | # else | ||
| 10 | # define __ARMEL__ | ||
| 11 | # endif | ||
| 12 | # elif defined(__GNUC__) | ||
| 13 | /* | ||
| 14 | * Why doesn't gcc define __ARM_ARCH__? Instead it defines | ||
| 15 | * bunch of below macros. See all_architectires[] table in | ||
| 16 | * gcc/config/arm/arm.c. On a side note it defines | ||
| 17 | * __ARMEL__/__ARMEB__ for little-/big-endian. | ||
| 18 | */ | ||
| 19 | # if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ | ||
| 20 | defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ | ||
| 21 | defined(__ARM_ARCH_7EM__) | ||
| 22 | # define __ARM_ARCH__ 7 | ||
| 23 | # elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ | ||
| 24 | defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \ | ||
| 25 | defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \ | ||
| 26 | defined(__ARM_ARCH_6T2__) | ||
| 27 | # define __ARM_ARCH__ 6 | ||
| 28 | # elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ | ||
| 29 | defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \ | ||
| 30 | defined(__ARM_ARCH_5TEJ__) | ||
| 31 | # define __ARM_ARCH__ 5 | ||
| 32 | # elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) | ||
| 33 | # define __ARM_ARCH__ 4 | ||
| 34 | # else | ||
| 35 | # error "unsupported ARM architecture" | ||
| 36 | # endif | ||
| 37 | # endif | ||
| 38 | #endif | ||
| 39 | |||
| 40 | #ifdef OPENSSL_FIPSCANISTER | ||
| 41 | #include <openssl/fipssyms.h> | ||
| 42 | #endif | ||
| 43 | |||
| 44 | #if !__ASSEMBLER__ | ||
| 45 | extern unsigned int OPENSSL_armcap_P; | ||
| 46 | |||
| 47 | #define ARMV7_NEON (1<<0) | ||
| 48 | #define ARMV7_TICK (1<<1) | ||
| 49 | #endif | ||
| 50 | |||
| 51 | #endif | ||
diff --git a/src/lib/libcrypto/armcap.c b/src/lib/libcrypto/armcap.c new file mode 100644 index 0000000000..5258d2fbdd --- /dev/null +++ b/src/lib/libcrypto/armcap.c | |||
| @@ -0,0 +1,80 @@ | |||
| 1 | #include <stdio.h> | ||
| 2 | #include <stdlib.h> | ||
| 3 | #include <string.h> | ||
| 4 | #include <setjmp.h> | ||
| 5 | #include <signal.h> | ||
| 6 | #include <crypto.h> | ||
| 7 | |||
| 8 | #include "arm_arch.h" | ||
| 9 | |||
| 10 | unsigned int OPENSSL_armcap_P; | ||
| 11 | |||
| 12 | static sigset_t all_masked; | ||
| 13 | |||
| 14 | static sigjmp_buf ill_jmp; | ||
| 15 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } | ||
| 16 | |||
| 17 | /* | ||
| 18 | * Following subroutines could have been inlined, but it's not all | ||
| 19 | * ARM compilers support inline assembler... | ||
| 20 | */ | ||
| 21 | void _armv7_neon_probe(void); | ||
| 22 | unsigned int _armv7_tick(void); | ||
| 23 | |||
| 24 | unsigned int OPENSSL_rdtsc(void) | ||
| 25 | { | ||
| 26 | if (OPENSSL_armcap_P|ARMV7_TICK) | ||
| 27 | return _armv7_tick(); | ||
| 28 | else | ||
| 29 | return 0; | ||
| 30 | } | ||
| 31 | |||
| 32 | #if defined(__GNUC__) && __GNUC__>=2 | ||
| 33 | void OPENSSL_cpuid_setup(void) __attribute__((constructor)); | ||
| 34 | #endif | ||
| 35 | void OPENSSL_cpuid_setup(void) | ||
| 36 | { | ||
| 37 | char *e; | ||
| 38 | struct sigaction ill_oact,ill_act; | ||
| 39 | sigset_t oset; | ||
| 40 | static int trigger=0; | ||
| 41 | |||
| 42 | if (trigger) return; | ||
| 43 | trigger=1; | ||
| 44 | |||
| 45 | if ((e=getenv("OPENSSL_armcap"))) | ||
| 46 | { | ||
| 47 | OPENSSL_armcap_P=strtoul(e,NULL,0); | ||
| 48 | return; | ||
| 49 | } | ||
| 50 | |||
| 51 | sigfillset(&all_masked); | ||
| 52 | sigdelset(&all_masked,SIGILL); | ||
| 53 | sigdelset(&all_masked,SIGTRAP); | ||
| 54 | sigdelset(&all_masked,SIGFPE); | ||
| 55 | sigdelset(&all_masked,SIGBUS); | ||
| 56 | sigdelset(&all_masked,SIGSEGV); | ||
| 57 | |||
| 58 | OPENSSL_armcap_P = 0; | ||
| 59 | |||
| 60 | memset(&ill_act,0,sizeof(ill_act)); | ||
| 61 | ill_act.sa_handler = ill_handler; | ||
| 62 | ill_act.sa_mask = all_masked; | ||
| 63 | |||
| 64 | sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); | ||
| 65 | sigaction(SIGILL,&ill_act,&ill_oact); | ||
| 66 | |||
| 67 | if (sigsetjmp(ill_jmp,1) == 0) | ||
| 68 | { | ||
| 69 | _armv7_neon_probe(); | ||
| 70 | OPENSSL_armcap_P |= ARMV7_NEON; | ||
| 71 | } | ||
| 72 | if (sigsetjmp(ill_jmp,1) == 0) | ||
| 73 | { | ||
| 74 | _armv7_tick(); | ||
| 75 | OPENSSL_armcap_P |= ARMV7_TICK; | ||
| 76 | } | ||
| 77 | |||
| 78 | sigaction (SIGILL,&ill_oact,NULL); | ||
| 79 | sigprocmask(SIG_SETMASK,&oset,NULL); | ||
| 80 | } | ||
diff --git a/src/lib/libcrypto/armv4cpuid.S b/src/lib/libcrypto/armv4cpuid.S new file mode 100644 index 0000000000..2d618deaa4 --- /dev/null +++ b/src/lib/libcrypto/armv4cpuid.S | |||
| @@ -0,0 +1,154 @@ | |||
| 1 | #include "arm_arch.h" | ||
| 2 | |||
| 3 | .text | ||
| 4 | .code 32 | ||
| 5 | |||
| 6 | .align 5 | ||
| 7 | .global _armv7_neon_probe | ||
| 8 | .type _armv7_neon_probe,%function | ||
| 9 | _armv7_neon_probe: | ||
| 10 | .word 0xf26ee1fe @ vorr q15,q15,q15 | ||
| 11 | .word 0xe12fff1e @ bx lr | ||
| 12 | .size _armv7_neon_probe,.-_armv7_neon_probe | ||
| 13 | |||
| 14 | .global _armv7_tick | ||
| 15 | .type _armv7_tick,%function | ||
| 16 | _armv7_tick: | ||
| 17 | mrc p15,0,r0,c9,c13,0 | ||
| 18 | .word 0xe12fff1e @ bx lr | ||
| 19 | .size _armv7_tick,.-_armv7_tick | ||
| 20 | |||
| 21 | .global OPENSSL_atomic_add | ||
| 22 | .type OPENSSL_atomic_add,%function | ||
| 23 | OPENSSL_atomic_add: | ||
| 24 | #if __ARM_ARCH__>=6 | ||
| 25 | .Ladd: ldrex r2,[r0] | ||
| 26 | add r3,r2,r1 | ||
| 27 | strex r2,r3,[r0] | ||
| 28 | cmp r2,#0 | ||
| 29 | bne .Ladd | ||
| 30 | mov r0,r3 | ||
| 31 | .word 0xe12fff1e @ bx lr | ||
| 32 | #else | ||
| 33 | stmdb sp!,{r4-r6,lr} | ||
| 34 | ldr r2,.Lspinlock | ||
| 35 | adr r3,.Lspinlock | ||
| 36 | mov r4,r0 | ||
| 37 | mov r5,r1 | ||
| 38 | add r6,r3,r2 @ &spinlock | ||
| 39 | b .+8 | ||
| 40 | .Lspin: bl sched_yield | ||
| 41 | mov r0,#-1 | ||
| 42 | swp r0,r0,[r6] | ||
| 43 | cmp r0,#0 | ||
| 44 | bne .Lspin | ||
| 45 | |||
| 46 | ldr r2,[r4] | ||
| 47 | add r2,r2,r5 | ||
| 48 | str r2,[r4] | ||
| 49 | str r0,[r6] @ release spinlock | ||
| 50 | ldmia sp!,{r4-r6,lr} | ||
| 51 | tst lr,#1 | ||
| 52 | moveq pc,lr | ||
| 53 | .word 0xe12fff1e @ bx lr | ||
| 54 | #endif | ||
| 55 | .size OPENSSL_atomic_add,.-OPENSSL_atomic_add | ||
| 56 | |||
| 57 | .global OPENSSL_cleanse | ||
| 58 | .type OPENSSL_cleanse,%function | ||
| 59 | OPENSSL_cleanse: | ||
| 60 | eor ip,ip,ip | ||
| 61 | cmp r1,#7 | ||
| 62 | subhs r1,r1,#4 | ||
| 63 | bhs .Lot | ||
| 64 | cmp r1,#0 | ||
| 65 | beq .Lcleanse_done | ||
| 66 | .Little: | ||
| 67 | strb ip,[r0],#1 | ||
| 68 | subs r1,r1,#1 | ||
| 69 | bhi .Little | ||
| 70 | b .Lcleanse_done | ||
| 71 | |||
| 72 | .Lot: tst r0,#3 | ||
| 73 | beq .Laligned | ||
| 74 | strb ip,[r0],#1 | ||
| 75 | sub r1,r1,#1 | ||
| 76 | b .Lot | ||
| 77 | .Laligned: | ||
| 78 | str ip,[r0],#4 | ||
| 79 | subs r1,r1,#4 | ||
| 80 | bhs .Laligned | ||
| 81 | adds r1,r1,#4 | ||
| 82 | bne .Little | ||
| 83 | .Lcleanse_done: | ||
| 84 | tst lr,#1 | ||
| 85 | moveq pc,lr | ||
| 86 | .word 0xe12fff1e @ bx lr | ||
| 87 | .size OPENSSL_cleanse,.-OPENSSL_cleanse | ||
| 88 | |||
| 89 | .global OPENSSL_wipe_cpu | ||
| 90 | .type OPENSSL_wipe_cpu,%function | ||
| 91 | OPENSSL_wipe_cpu: | ||
| 92 | ldr r0,.LOPENSSL_armcap | ||
| 93 | adr r1,.LOPENSSL_armcap | ||
| 94 | ldr r0,[r1,r0] | ||
| 95 | eor r2,r2,r2 | ||
| 96 | eor r3,r3,r3 | ||
| 97 | eor ip,ip,ip | ||
| 98 | tst r0,#1 | ||
| 99 | beq .Lwipe_done | ||
| 100 | .word 0xf3000150 @ veor q0, q0, q0 | ||
| 101 | .word 0xf3022152 @ veor q1, q1, q1 | ||
| 102 | .word 0xf3044154 @ veor q2, q2, q2 | ||
| 103 | .word 0xf3066156 @ veor q3, q3, q3 | ||
| 104 | .word 0xf34001f0 @ veor q8, q8, q8 | ||
| 105 | .word 0xf34221f2 @ veor q9, q9, q9 | ||
| 106 | .word 0xf34441f4 @ veor q10, q10, q10 | ||
| 107 | .word 0xf34661f6 @ veor q11, q11, q11 | ||
| 108 | .word 0xf34881f8 @ veor q12, q12, q12 | ||
| 109 | .word 0xf34aa1fa @ veor q13, q13, q13 | ||
| 110 | .word 0xf34cc1fc @ veor q14, q14, q14 | ||
| 111 | .word 0xf34ee1fe @ veor q15, q15, q15 | ||
| 112 | .Lwipe_done: | ||
| 113 | mov r0,sp | ||
| 114 | tst lr,#1 | ||
| 115 | moveq pc,lr | ||
| 116 | .word 0xe12fff1e @ bx lr | ||
| 117 | .size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu | ||
| 118 | |||
| 119 | .global OPENSSL_instrument_bus | ||
| 120 | .type OPENSSL_instrument_bus,%function | ||
| 121 | OPENSSL_instrument_bus: | ||
| 122 | eor r0,r0,r0 | ||
| 123 | tst lr,#1 | ||
| 124 | moveq pc,lr | ||
| 125 | .word 0xe12fff1e @ bx lr | ||
| 126 | .size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus | ||
| 127 | |||
| 128 | .global OPENSSL_instrument_bus2 | ||
| 129 | .type OPENSSL_instrument_bus2,%function | ||
| 130 | OPENSSL_instrument_bus2: | ||
| 131 | eor r0,r0,r0 | ||
| 132 | tst lr,#1 | ||
| 133 | moveq pc,lr | ||
| 134 | .word 0xe12fff1e @ bx lr | ||
| 135 | .size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 | ||
| 136 | |||
| 137 | .align 5 | ||
| 138 | .LOPENSSL_armcap: | ||
| 139 | .word OPENSSL_armcap_P-.LOPENSSL_armcap | ||
| 140 | #if __ARM_ARCH__>=6 | ||
| 141 | .align 5 | ||
| 142 | #else | ||
| 143 | .Lspinlock: | ||
| 144 | .word atomic_add_spinlock-.Lspinlock | ||
| 145 | .align 5 | ||
| 146 | |||
| 147 | .data | ||
| 148 | .align 2 | ||
| 149 | atomic_add_spinlock: | ||
| 150 | .word 0 | ||
| 151 | #endif | ||
| 152 | |||
| 153 | .comm OPENSSL_armcap_P,4,4 | ||
| 154 | .hidden OPENSSL_armcap_P | ||
diff --git a/src/lib/libcrypto/asn1/ameth_lib.c b/src/lib/libcrypto/asn1/ameth_lib.c index 5a581b90ea..a19e058fca 100644 --- a/src/lib/libcrypto/asn1/ameth_lib.c +++ b/src/lib/libcrypto/asn1/ameth_lib.c | |||
| @@ -69,6 +69,7 @@ extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[]; | |||
| 69 | extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; | 69 | extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; |
| 70 | extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth; | 70 | extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth; |
| 71 | extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth; | 71 | extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth; |
| 72 | extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth; | ||
| 72 | 73 | ||
| 73 | /* Keep this sorted in type order !! */ | 74 | /* Keep this sorted in type order !! */ |
| 74 | static const EVP_PKEY_ASN1_METHOD *standard_methods[] = | 75 | static const EVP_PKEY_ASN1_METHOD *standard_methods[] = |
| @@ -90,7 +91,8 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] = | |||
| 90 | #ifndef OPENSSL_NO_EC | 91 | #ifndef OPENSSL_NO_EC |
| 91 | &eckey_asn1_meth, | 92 | &eckey_asn1_meth, |
| 92 | #endif | 93 | #endif |
| 93 | &hmac_asn1_meth | 94 | &hmac_asn1_meth, |
| 95 | &cmac_asn1_meth | ||
| 94 | }; | 96 | }; |
| 95 | 97 | ||
| 96 | typedef int sk_cmp_fn_type(const char * const *a, const char * const *b); | 98 | typedef int sk_cmp_fn_type(const char * const *a, const char * const *b); |
| @@ -291,6 +293,8 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, | |||
| 291 | if (!ameth) | 293 | if (!ameth) |
| 292 | return NULL; | 294 | return NULL; |
| 293 | 295 | ||
| 296 | memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD)); | ||
| 297 | |||
| 294 | ameth->pkey_id = id; | 298 | ameth->pkey_id = id; |
| 295 | ameth->pkey_base_id = id; | 299 | ameth->pkey_base_id = id; |
| 296 | ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC; | 300 | ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC; |
| @@ -325,6 +329,9 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, | |||
| 325 | ameth->old_priv_encode = 0; | 329 | ameth->old_priv_encode = 0; |
| 326 | ameth->old_priv_decode = 0; | 330 | ameth->old_priv_decode = 0; |
| 327 | 331 | ||
| 332 | ameth->item_verify = 0; | ||
| 333 | ameth->item_sign = 0; | ||
| 334 | |||
| 328 | ameth->pkey_size = 0; | 335 | ameth->pkey_size = 0; |
| 329 | ameth->pkey_bits = 0; | 336 | ameth->pkey_bits = 0; |
| 330 | 337 | ||
| @@ -376,6 +383,9 @@ void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst, | |||
| 376 | dst->pkey_free = src->pkey_free; | 383 | dst->pkey_free = src->pkey_free; |
| 377 | dst->pkey_ctrl = src->pkey_ctrl; | 384 | dst->pkey_ctrl = src->pkey_ctrl; |
| 378 | 385 | ||
| 386 | dst->item_sign = src->item_sign; | ||
| 387 | dst->item_verify = src->item_verify; | ||
| 388 | |||
| 379 | } | 389 | } |
| 380 | 390 | ||
| 381 | void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth) | 391 | void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth) |
diff --git a/src/lib/libcrypto/asn1/asn1_locl.h b/src/lib/libcrypto/asn1/asn1_locl.h index 5aa65e28f5..9fcf0d9530 100644 --- a/src/lib/libcrypto/asn1/asn1_locl.h +++ b/src/lib/libcrypto/asn1/asn1_locl.h | |||
| @@ -102,6 +102,10 @@ struct evp_pkey_asn1_method_st | |||
| 102 | int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); | 102 | int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); |
| 103 | int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, | 103 | int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, |
| 104 | ASN1_PCTX *pctx); | 104 | ASN1_PCTX *pctx); |
| 105 | int (*sig_print)(BIO *out, | ||
| 106 | const X509_ALGOR *sigalg, const ASN1_STRING *sig, | ||
| 107 | int indent, ASN1_PCTX *pctx); | ||
| 108 | |||
| 105 | 109 | ||
| 106 | void (*pkey_free)(EVP_PKEY *pkey); | 110 | void (*pkey_free)(EVP_PKEY *pkey); |
| 107 | int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2); | 111 | int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2); |
| @@ -111,6 +115,13 @@ struct evp_pkey_asn1_method_st | |||
| 111 | int (*old_priv_decode)(EVP_PKEY *pkey, | 115 | int (*old_priv_decode)(EVP_PKEY *pkey, |
| 112 | const unsigned char **pder, int derlen); | 116 | const unsigned char **pder, int derlen); |
| 113 | int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder); | 117 | int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder); |
| 118 | /* Custom ASN1 signature verification */ | ||
| 119 | int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
| 120 | X509_ALGOR *a, ASN1_BIT_STRING *sig, | ||
| 121 | EVP_PKEY *pkey); | ||
| 122 | int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
| 123 | X509_ALGOR *alg1, X509_ALGOR *alg2, | ||
| 124 | ASN1_BIT_STRING *sig); | ||
| 114 | 125 | ||
| 115 | } /* EVP_PKEY_ASN1_METHOD */; | 126 | } /* EVP_PKEY_ASN1_METHOD */; |
| 116 | 127 | ||
diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl new file mode 100644 index 0000000000..c52e0b75b5 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl | |||
| @@ -0,0 +1,278 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication | ||
| 13 | # used in bn_gf2m.c. It's kind of low-hanging mechanical port from | ||
| 14 | # C for the time being... Except that it has two code paths: pure | ||
| 15 | # integer code suitable for any ARMv4 and later CPU and NEON code | ||
| 16 | # suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs | ||
| 17 | # in ~45 cycles on dual-issue core such as Cortex A8, which is ~50% | ||
| 18 | # faster than compiler-generated code. For ECDH and ECDSA verify (but | ||
| 19 | # not for ECDSA sign) it means 25%-45% improvement depending on key | ||
| 20 | # length, more for longer keys. Even though NEON 1x1 multiplication | ||
| 21 | # runs in even less cycles, ~30, improvement is measurable only on | ||
| 22 | # longer keys. One has to optimize code elsewhere to get NEON glow... | ||
| 23 | |||
| 24 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 25 | open STDOUT,">$output"; | ||
| 26 | |||
| 27 | sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } | ||
| 28 | sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } | ||
| 29 | sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } | ||
| 30 | |||
| 31 | $code=<<___; | ||
| 32 | #include "arm_arch.h" | ||
| 33 | |||
| 34 | .text | ||
| 35 | .code 32 | ||
| 36 | |||
| 37 | #if __ARM_ARCH__>=7 | ||
| 38 | .fpu neon | ||
| 39 | |||
| 40 | .type mul_1x1_neon,%function | ||
| 41 | .align 5 | ||
| 42 | mul_1x1_neon: | ||
| 43 | vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a | ||
| 44 | vmull.p8 `&Q("d0")`,d16,d17 @ a·bb | ||
| 45 | vshl.u64 `&Dlo("q2")`,d16,#16 | ||
| 46 | vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb | ||
| 47 | vshl.u64 `&Dlo("q3")`,d16,#24 | ||
| 48 | vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb | ||
| 49 | vshr.u64 `&Dlo("q1")`,#8 | ||
| 50 | vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb | ||
| 51 | vshl.u64 `&Dhi("q1")`,#24 | ||
| 52 | veor d0,`&Dlo("q1")` | ||
| 53 | vshr.u64 `&Dlo("q2")`,#16 | ||
| 54 | veor d0,`&Dhi("q1")` | ||
| 55 | vshl.u64 `&Dhi("q2")`,#16 | ||
| 56 | veor d0,`&Dlo("q2")` | ||
| 57 | vshr.u64 `&Dlo("q3")`,#24 | ||
| 58 | veor d0,`&Dhi("q2")` | ||
| 59 | vshl.u64 `&Dhi("q3")`,#8 | ||
| 60 | veor d0,`&Dlo("q3")` | ||
| 61 | veor d0,`&Dhi("q3")` | ||
| 62 | bx lr | ||
| 63 | .size mul_1x1_neon,.-mul_1x1_neon | ||
| 64 | #endif | ||
| 65 | ___ | ||
| 66 | ################ | ||
| 67 | # private interface to mul_1x1_ialu | ||
| 68 | # | ||
| 69 | $a="r1"; | ||
| 70 | $b="r0"; | ||
| 71 | |||
| 72 | ($a0,$a1,$a2,$a12,$a4,$a14)= | ||
| 73 | ($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12); | ||
| 74 | |||
| 75 | $mask="r12"; | ||
| 76 | |||
| 77 | $code.=<<___; | ||
| 78 | .type mul_1x1_ialu,%function | ||
| 79 | .align 5 | ||
| 80 | mul_1x1_ialu: | ||
| 81 | mov $a0,#0 | ||
| 82 | bic $a1,$a,#3<<30 @ a1=a&0x3fffffff | ||
| 83 | str $a0,[sp,#0] @ tab[0]=0 | ||
| 84 | add $a2,$a1,$a1 @ a2=a1<<1 | ||
| 85 | str $a1,[sp,#4] @ tab[1]=a1 | ||
| 86 | eor $a12,$a1,$a2 @ a1^a2 | ||
| 87 | str $a2,[sp,#8] @ tab[2]=a2 | ||
| 88 | mov $a4,$a1,lsl#2 @ a4=a1<<2 | ||
| 89 | str $a12,[sp,#12] @ tab[3]=a1^a2 | ||
| 90 | eor $a14,$a1,$a4 @ a1^a4 | ||
| 91 | str $a4,[sp,#16] @ tab[4]=a4 | ||
| 92 | eor $a0,$a2,$a4 @ a2^a4 | ||
| 93 | str $a14,[sp,#20] @ tab[5]=a1^a4 | ||
| 94 | eor $a12,$a12,$a4 @ a1^a2^a4 | ||
| 95 | str $a0,[sp,#24] @ tab[6]=a2^a4 | ||
| 96 | and $i0,$mask,$b,lsl#2 | ||
| 97 | str $a12,[sp,#28] @ tab[7]=a1^a2^a4 | ||
| 98 | |||
| 99 | and $i1,$mask,$b,lsr#1 | ||
| 100 | ldr $lo,[sp,$i0] @ tab[b & 0x7] | ||
| 101 | and $i0,$mask,$b,lsr#4 | ||
| 102 | ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7] | ||
| 103 | and $i1,$mask,$b,lsr#7 | ||
| 104 | ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7] | ||
| 105 | eor $lo,$lo,$t1,lsl#3 @ stall | ||
| 106 | mov $hi,$t1,lsr#29 | ||
| 107 | ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7] | ||
| 108 | |||
| 109 | and $i0,$mask,$b,lsr#10 | ||
| 110 | eor $lo,$lo,$t0,lsl#6 | ||
| 111 | eor $hi,$hi,$t0,lsr#26 | ||
| 112 | ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7] | ||
| 113 | |||
| 114 | and $i1,$mask,$b,lsr#13 | ||
| 115 | eor $lo,$lo,$t1,lsl#9 | ||
| 116 | eor $hi,$hi,$t1,lsr#23 | ||
| 117 | ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7] | ||
| 118 | |||
| 119 | and $i0,$mask,$b,lsr#16 | ||
| 120 | eor $lo,$lo,$t0,lsl#12 | ||
| 121 | eor $hi,$hi,$t0,lsr#20 | ||
| 122 | ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7] | ||
| 123 | |||
| 124 | and $i1,$mask,$b,lsr#19 | ||
| 125 | eor $lo,$lo,$t1,lsl#15 | ||
| 126 | eor $hi,$hi,$t1,lsr#17 | ||
| 127 | ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7] | ||
| 128 | |||
| 129 | and $i0,$mask,$b,lsr#22 | ||
| 130 | eor $lo,$lo,$t0,lsl#18 | ||
| 131 | eor $hi,$hi,$t0,lsr#14 | ||
| 132 | ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7] | ||
| 133 | |||
| 134 | and $i1,$mask,$b,lsr#25 | ||
| 135 | eor $lo,$lo,$t1,lsl#21 | ||
| 136 | eor $hi,$hi,$t1,lsr#11 | ||
| 137 | ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7] | ||
| 138 | |||
| 139 | tst $a,#1<<30 | ||
| 140 | and $i0,$mask,$b,lsr#28 | ||
| 141 | eor $lo,$lo,$t0,lsl#24 | ||
| 142 | eor $hi,$hi,$t0,lsr#8 | ||
| 143 | ldr $t0,[sp,$i0] @ tab[b >> 30 ] | ||
| 144 | |||
| 145 | eorne $lo,$lo,$b,lsl#30 | ||
| 146 | eorne $hi,$hi,$b,lsr#2 | ||
| 147 | tst $a,#1<<31 | ||
| 148 | eor $lo,$lo,$t1,lsl#27 | ||
| 149 | eor $hi,$hi,$t1,lsr#5 | ||
| 150 | eorne $lo,$lo,$b,lsl#31 | ||
| 151 | eorne $hi,$hi,$b,lsr#1 | ||
| 152 | eor $lo,$lo,$t0,lsl#30 | ||
| 153 | eor $hi,$hi,$t0,lsr#2 | ||
| 154 | |||
| 155 | mov pc,lr | ||
| 156 | .size mul_1x1_ialu,.-mul_1x1_ialu | ||
| 157 | ___ | ||
| 158 | ################ | ||
| 159 | # void bn_GF2m_mul_2x2(BN_ULONG *r, | ||
| 160 | # BN_ULONG a1,BN_ULONG a0, | ||
| 161 | # BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 | ||
| 162 | |||
| 163 | ($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); | ||
| 164 | |||
| 165 | $code.=<<___; | ||
| 166 | .global bn_GF2m_mul_2x2 | ||
| 167 | .type bn_GF2m_mul_2x2,%function | ||
| 168 | .align 5 | ||
| 169 | bn_GF2m_mul_2x2: | ||
| 170 | #if __ARM_ARCH__>=7 | ||
| 171 | ldr r12,.LOPENSSL_armcap | ||
| 172 | .Lpic: ldr r12,[pc,r12] | ||
| 173 | tst r12,#1 | ||
| 174 | beq .Lialu | ||
| 175 | |||
| 176 | veor $A1,$A1 | ||
| 177 | vmov.32 $B1,r3,r3 @ two copies of b1 | ||
| 178 | vmov.32 ${A1}[0],r1 @ a1 | ||
| 179 | |||
| 180 | veor $A0,$A0 | ||
| 181 | vld1.32 ${B0}[],[sp,:32] @ two copies of b0 | ||
| 182 | vmov.32 ${A0}[0],r2 @ a0 | ||
| 183 | mov r12,lr | ||
| 184 | |||
| 185 | vmov d16,$A1 | ||
| 186 | vmov d17,$B1 | ||
| 187 | bl mul_1x1_neon @ a1·b1 | ||
| 188 | vmov $A1B1,d0 | ||
| 189 | |||
| 190 | vmov d16,$A0 | ||
| 191 | vmov d17,$B0 | ||
| 192 | bl mul_1x1_neon @ a0·b0 | ||
| 193 | vmov $A0B0,d0 | ||
| 194 | |||
| 195 | veor d16,$A0,$A1 | ||
| 196 | veor d17,$B0,$B1 | ||
| 197 | veor $A0,$A0B0,$A1B1 | ||
| 198 | bl mul_1x1_neon @ (a0+a1)·(b0+b1) | ||
| 199 | |||
| 200 | veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
| 201 | vshl.u64 d1,d0,#32 | ||
| 202 | vshr.u64 d0,d0,#32 | ||
| 203 | veor $A0B0,d1 | ||
| 204 | veor $A1B1,d0 | ||
| 205 | vst1.32 {${A0B0}[0]},[r0,:32]! | ||
| 206 | vst1.32 {${A0B0}[1]},[r0,:32]! | ||
| 207 | vst1.32 {${A1B1}[0]},[r0,:32]! | ||
| 208 | vst1.32 {${A1B1}[1]},[r0,:32] | ||
| 209 | bx r12 | ||
| 210 | .align 4 | ||
| 211 | .Lialu: | ||
| 212 | #endif | ||
| 213 | ___ | ||
| 214 | $ret="r10"; # reassigned 1st argument | ||
| 215 | $code.=<<___; | ||
| 216 | stmdb sp!,{r4-r10,lr} | ||
| 217 | mov $ret,r0 @ reassign 1st argument | ||
| 218 | mov $b,r3 @ $b=b1 | ||
| 219 | ldr r3,[sp,#32] @ load b0 | ||
| 220 | mov $mask,#7<<2 | ||
| 221 | sub sp,sp,#32 @ allocate tab[8] | ||
| 222 | |||
| 223 | bl mul_1x1_ialu @ a1·b1 | ||
| 224 | str $lo,[$ret,#8] | ||
| 225 | str $hi,[$ret,#12] | ||
| 226 | |||
| 227 | eor $b,$b,r3 @ flip b0 and b1 | ||
| 228 | eor $a,$a,r2 @ flip a0 and a1 | ||
| 229 | eor r3,r3,$b | ||
| 230 | eor r2,r2,$a | ||
| 231 | eor $b,$b,r3 | ||
| 232 | eor $a,$a,r2 | ||
| 233 | bl mul_1x1_ialu @ a0·b0 | ||
| 234 | str $lo,[$ret] | ||
| 235 | str $hi,[$ret,#4] | ||
| 236 | |||
| 237 | eor $a,$a,r2 | ||
| 238 | eor $b,$b,r3 | ||
| 239 | bl mul_1x1_ialu @ (a1+a0)·(b1+b0) | ||
| 240 | ___ | ||
| 241 | @r=map("r$_",(6..9)); | ||
| 242 | $code.=<<___; | ||
| 243 | ldmia $ret,{@r[0]-@r[3]} | ||
| 244 | eor $lo,$lo,$hi | ||
| 245 | eor $hi,$hi,@r[1] | ||
| 246 | eor $lo,$lo,@r[0] | ||
| 247 | eor $hi,$hi,@r[2] | ||
| 248 | eor $lo,$lo,@r[3] | ||
| 249 | eor $hi,$hi,@r[3] | ||
| 250 | str $hi,[$ret,#8] | ||
| 251 | eor $lo,$lo,$hi | ||
| 252 | add sp,sp,#32 @ destroy tab[8] | ||
| 253 | str $lo,[$ret,#4] | ||
| 254 | |||
| 255 | #if __ARM_ARCH__>=5 | ||
| 256 | ldmia sp!,{r4-r10,pc} | ||
| 257 | #else | ||
| 258 | ldmia sp!,{r4-r10,lr} | ||
| 259 | tst lr,#1 | ||
| 260 | moveq pc,lr @ be binary compatible with V4, yet | ||
| 261 | bx lr @ interoperable with Thumb ISA:-) | ||
| 262 | #endif | ||
| 263 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 264 | #if __ARM_ARCH__>=7 | ||
| 265 | .align 5 | ||
| 266 | .LOPENSSL_armcap: | ||
| 267 | .word OPENSSL_armcap_P-(.Lpic+8) | ||
| 268 | #endif | ||
| 269 | .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 270 | .align 5 | ||
| 271 | |||
| 272 | .comm OPENSSL_armcap_P,4,4 | ||
| 273 | ___ | ||
| 274 | |||
| 275 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 276 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
| 277 | print $code; | ||
| 278 | close STDOUT; # enforce flush | ||
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl index 14e0d2d1dd..f78a8b5f0f 100644 --- a/src/lib/libcrypto/bn/asm/armv4-mont.pl +++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl | |||
| @@ -23,6 +23,9 @@ | |||
| 23 | # than 1/2KB. Windows CE port would be trivial, as it's exclusively | 23 | # than 1/2KB. Windows CE port would be trivial, as it's exclusively |
| 24 | # about decorations, ABI and instruction syntax are identical. | 24 | # about decorations, ABI and instruction syntax are identical. |
| 25 | 25 | ||
| 26 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 27 | open STDOUT,">$output"; | ||
| 28 | |||
| 26 | $num="r0"; # starts as num argument, but holds &tp[num-1] | 29 | $num="r0"; # starts as num argument, but holds &tp[num-1] |
| 27 | $ap="r1"; | 30 | $ap="r1"; |
| 28 | $bp="r2"; $bi="r2"; $rp="r2"; | 31 | $bp="r2"; $bi="r2"; $rp="r2"; |
| @@ -89,9 +92,9 @@ bn_mul_mont: | |||
| 89 | .L1st: | 92 | .L1st: |
| 90 | ldr $aj,[$ap],#4 @ ap[j],ap++ | 93 | ldr $aj,[$ap],#4 @ ap[j],ap++ |
| 91 | mov $alo,$ahi | 94 | mov $alo,$ahi |
| 95 | ldr $nj,[$np],#4 @ np[j],np++ | ||
| 92 | mov $ahi,#0 | 96 | mov $ahi,#0 |
| 93 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] | 97 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] |
| 94 | ldr $nj,[$np],#4 @ np[j],np++ | ||
| 95 | mov $nhi,#0 | 98 | mov $nhi,#0 |
| 96 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | 99 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 |
| 97 | adds $nlo,$nlo,$alo | 100 | adds $nlo,$nlo,$alo |
| @@ -101,21 +104,21 @@ bn_mul_mont: | |||
| 101 | bne .L1st | 104 | bne .L1st |
| 102 | 105 | ||
| 103 | adds $nlo,$nlo,$ahi | 106 | adds $nlo,$nlo,$ahi |
| 107 | ldr $tp,[$_bp] @ restore bp | ||
| 104 | mov $nhi,#0 | 108 | mov $nhi,#0 |
| 109 | ldr $n0,[$_n0] @ restore n0 | ||
| 105 | adc $nhi,$nhi,#0 | 110 | adc $nhi,$nhi,#0 |
| 106 | ldr $tp,[$_bp] @ restore bp | ||
| 107 | str $nlo,[$num] @ tp[num-1]= | 111 | str $nlo,[$num] @ tp[num-1]= |
| 108 | ldr $n0,[$_n0] @ restore n0 | ||
| 109 | str $nhi,[$num,#4] @ tp[num]= | 112 | str $nhi,[$num,#4] @ tp[num]= |
| 110 | 113 | ||
| 111 | .Louter: | 114 | .Louter: |
| 112 | sub $tj,$num,sp @ "original" $num-1 value | 115 | sub $tj,$num,sp @ "original" $num-1 value |
| 113 | sub $ap,$ap,$tj @ "rewind" ap to &ap[1] | 116 | sub $ap,$ap,$tj @ "rewind" ap to &ap[1] |
| 114 | sub $np,$np,$tj @ "rewind" np to &np[1] | ||
| 115 | ldr $bi,[$tp,#4]! @ *(++bp) | 117 | ldr $bi,[$tp,#4]! @ *(++bp) |
| 118 | sub $np,$np,$tj @ "rewind" np to &np[1] | ||
| 116 | ldr $aj,[$ap,#-4] @ ap[0] | 119 | ldr $aj,[$ap,#-4] @ ap[0] |
| 117 | ldr $nj,[$np,#-4] @ np[0] | ||
| 118 | ldr $alo,[sp] @ tp[0] | 120 | ldr $alo,[sp] @ tp[0] |
| 121 | ldr $nj,[$np,#-4] @ np[0] | ||
| 119 | ldr $tj,[sp,#4] @ tp[1] | 122 | ldr $tj,[sp,#4] @ tp[1] |
| 120 | 123 | ||
| 121 | mov $ahi,#0 | 124 | mov $ahi,#0 |
| @@ -129,13 +132,13 @@ bn_mul_mont: | |||
| 129 | .Linner: | 132 | .Linner: |
| 130 | ldr $aj,[$ap],#4 @ ap[j],ap++ | 133 | ldr $aj,[$ap],#4 @ ap[j],ap++ |
| 131 | adds $alo,$ahi,$tj @ +=tp[j] | 134 | adds $alo,$ahi,$tj @ +=tp[j] |
| 135 | ldr $nj,[$np],#4 @ np[j],np++ | ||
| 132 | mov $ahi,#0 | 136 | mov $ahi,#0 |
| 133 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] | 137 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] |
| 134 | ldr $nj,[$np],#4 @ np[j],np++ | ||
| 135 | mov $nhi,#0 | 138 | mov $nhi,#0 |
| 136 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | 139 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 |
| 137 | ldr $tj,[$tp,#8] @ tp[j+1] | ||
| 138 | adc $ahi,$ahi,#0 | 140 | adc $ahi,$ahi,#0 |
| 141 | ldr $tj,[$tp,#8] @ tp[j+1] | ||
| 139 | adds $nlo,$nlo,$alo | 142 | adds $nlo,$nlo,$alo |
| 140 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | 143 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ |
| 141 | adc $nlo,$nhi,#0 | 144 | adc $nlo,$nhi,#0 |
| @@ -144,13 +147,13 @@ bn_mul_mont: | |||
| 144 | 147 | ||
| 145 | adds $nlo,$nlo,$ahi | 148 | adds $nlo,$nlo,$ahi |
| 146 | mov $nhi,#0 | 149 | mov $nhi,#0 |
| 150 | ldr $tp,[$_bp] @ restore bp | ||
| 147 | adc $nhi,$nhi,#0 | 151 | adc $nhi,$nhi,#0 |
| 152 | ldr $n0,[$_n0] @ restore n0 | ||
| 148 | adds $nlo,$nlo,$tj | 153 | adds $nlo,$nlo,$tj |
| 149 | adc $nhi,$nhi,#0 | ||
| 150 | ldr $tp,[$_bp] @ restore bp | ||
| 151 | ldr $tj,[$_bpend] @ restore &bp[num] | 154 | ldr $tj,[$_bpend] @ restore &bp[num] |
| 155 | adc $nhi,$nhi,#0 | ||
| 152 | str $nlo,[$num] @ tp[num-1]= | 156 | str $nlo,[$num] @ tp[num-1]= |
| 153 | ldr $n0,[$_n0] @ restore n0 | ||
| 154 | str $nhi,[$num,#4] @ tp[num]= | 157 | str $nhi,[$num,#4] @ tp[num]= |
| 155 | 158 | ||
| 156 | cmp $tp,$tj | 159 | cmp $tp,$tj |
diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl new file mode 100644 index 0000000000..e258658428 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/ia64-mont.pl | |||
| @@ -0,0 +1,851 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # January 2010 | ||
| 11 | # | ||
| 12 | # "Teaser" Montgomery multiplication module for IA-64. There are | ||
| 13 | # several possibilities for improvement: | ||
| 14 | # | ||
| 15 | # - modulo-scheduling outer loop would eliminate quite a number of | ||
| 16 | # stalls after ldf8, xma and getf.sig outside inner loop and | ||
| 17 | # improve shorter key performance; | ||
| 18 | # - shorter vector support [with input vectors being fetched only | ||
| 19 | # once] should be added; | ||
| 20 | # - 2x unroll with help of n0[1] would make the code scalable on | ||
| 21 | # "wider" IA-64, "wider" than Itanium 2 that is, which is not of | ||
| 22 | # acute interest, because upcoming Tukwila's individual cores are | ||
| 23 | # reportedly based on Itanium 2 design; | ||
| 24 | # - dedicated squaring procedure(?); | ||
| 25 | # | ||
| 26 | # January 2010 | ||
| 27 | # | ||
| 28 | # Shorter vector support is implemented by zero-padding ap and np | ||
| 29 | # vectors up to 8 elements, or 512 bits. This means that 256-bit | ||
| 30 | # inputs will be processed only 2 times faster than 512-bit inputs, | ||
| 31 | # not 4 [as one would expect, because algorithm complexity is n^2]. | ||
| 32 | # The reason for padding is that inputs shorter than 512 bits won't | ||
| 33 | # be processed faster anyway, because minimal critical path of the | ||
| 34 | # core loop happens to match 512-bit timing. Either way, it resulted | ||
| 35 | # in >100% improvement of 512-bit RSA sign benchmark and 50% - of | ||
| 36 | # 1024-bit one [in comparison to original version of *this* module]. | ||
| 37 | # | ||
| 38 | # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* | ||
| 39 | # this module is: | ||
| 40 | # sign verify sign/s verify/s | ||
| 41 | # rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4 | ||
| 42 | # rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0 | ||
| 43 | # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0 | ||
| 44 | # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6 | ||
| 45 | # dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0 | ||
| 46 | # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4 | ||
| 47 | # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4 | ||
| 48 | # | ||
| 49 | # ... and *without* (but still with ia64.S): | ||
| 50 | # | ||
| 51 | # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5 | ||
| 52 | # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3 | ||
| 53 | # rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9 | ||
| 54 | # rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9 | ||
| 55 | # dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6 | ||
| 56 | # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2 | ||
| 57 | # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9 | ||
| 58 | # | ||
| 59 | # As it can be seen, RSA sign performance improves by 130-30%, | ||
| 60 | # hereafter less for longer keys, while verify - by 74-13%. | ||
| 61 | # DSA performance improves by 115-30%. | ||
| 62 | |||
| 63 | if ($^O eq "hpux") { | ||
| 64 | $ADDP="addp4"; | ||
| 65 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } | ||
| 66 | } else { $ADDP="add"; } | ||
| 67 | |||
| 68 | $code=<<___; | ||
| 69 | .explicit | ||
| 70 | .text | ||
| 71 | |||
| 72 | // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap, | ||
| 73 | // const BN_ULONG *bp,const BN_ULONG *np, | ||
| 74 | // const BN_ULONG *n0p,int num); | ||
| 75 | .align 64 | ||
| 76 | .global bn_mul_mont# | ||
| 77 | .proc bn_mul_mont# | ||
| 78 | bn_mul_mont: | ||
| 79 | .prologue | ||
| 80 | .body | ||
| 81 | { .mmi; cmp4.le p6,p7=2,r37;; | ||
| 82 | (p6) cmp4.lt.unc p8,p9=8,r37 | ||
| 83 | mov ret0=r0 };; | ||
| 84 | { .bbb; | ||
| 85 | (p9) br.cond.dptk.many bn_mul_mont_8 | ||
| 86 | (p8) br.cond.dpnt.many bn_mul_mont_general | ||
| 87 | (p7) br.ret.spnt.many b0 };; | ||
| 88 | .endp bn_mul_mont# | ||
| 89 | |||
| 90 | prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11; | ||
| 91 | |||
| 92 | rptr=r8; aptr=r9; bptr=r14; nptr=r15; | ||
| 93 | tptr=r16; // &tp[0] | ||
| 94 | tp_1=r17; // &tp[-1] | ||
| 95 | num=r18; len=r19; lc=r20; | ||
| 96 | topbit=r21; // carry bit from tmp[num] | ||
| 97 | |||
| 98 | n0=f6; | ||
| 99 | m0=f7; | ||
| 100 | bi=f8; | ||
| 101 | |||
| 102 | .align 64 | ||
| 103 | .local bn_mul_mont_general# | ||
| 104 | .proc bn_mul_mont_general# | ||
| 105 | bn_mul_mont_general: | ||
| 106 | .prologue | ||
| 107 | { .mmi; .save ar.pfs,prevfs | ||
| 108 | alloc prevfs=ar.pfs,6,2,0,8 | ||
| 109 | $ADDP aptr=0,in1 | ||
| 110 | .save ar.lc,prevlc | ||
| 111 | mov prevlc=ar.lc } | ||
| 112 | { .mmi; .vframe prevsp | ||
| 113 | mov prevsp=sp | ||
| 114 | $ADDP bptr=0,in2 | ||
| 115 | .save pr,prevpr | ||
| 116 | mov prevpr=pr };; | ||
| 117 | |||
| 118 | .body | ||
| 119 | .rotf alo[6],nlo[4],ahi[8],nhi[6] | ||
| 120 | .rotr a[3],n[3],t[2] | ||
| 121 | |||
| 122 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | ||
| 123 | ldf8 alo[4]=[aptr],16 // ap[0] | ||
| 124 | $ADDP r30=8,in1 };; | ||
| 125 | { .mmi; ldf8 alo[3]=[r30],16 // ap[1] | ||
| 126 | ldf8 alo[2]=[aptr],16 // ap[2] | ||
| 127 | $ADDP in4=0,in4 };; | ||
| 128 | { .mmi; ldf8 alo[1]=[r30] // ap[3] | ||
| 129 | ldf8 n0=[in4] // n0 | ||
| 130 | $ADDP rptr=0,in0 } | ||
| 131 | { .mmi; $ADDP nptr=0,in3 | ||
| 132 | mov r31=16 | ||
| 133 | zxt4 num=in5 };; | ||
| 134 | { .mmi; ldf8 nlo[2]=[nptr],8 // np[0] | ||
| 135 | shladd len=num,3,r0 | ||
| 136 | shladd r31=num,3,r31 };; | ||
| 137 | { .mmi; ldf8 nlo[1]=[nptr],8 // np[1] | ||
| 138 | add lc=-5,num | ||
| 139 | sub r31=sp,r31 };; | ||
| 140 | { .mfb; and sp=-16,r31 // alloca | ||
| 141 | xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0] | ||
| 142 | nop.b 0 } | ||
| 143 | { .mfb; nop.m 0 | ||
| 144 | xmpy.lu alo[4]=alo[4],bi | ||
| 145 | brp.loop.imp .L1st_ctop,.L1st_cend-16 | ||
| 146 | };; | ||
| 147 | { .mfi; nop.m 0 | ||
| 148 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0] | ||
| 149 | add tp_1=8,sp } | ||
| 150 | { .mfi; nop.m 0 | ||
| 151 | xma.lu alo[3]=alo[3],bi,ahi[2] | ||
| 152 | mov pr.rot=0x20001f<<16 | ||
| 153 | // ------^----- (p40) at first (p23) | ||
| 154 | // ----------^^ p[16:20]=1 | ||
| 155 | };; | ||
| 156 | { .mfi; nop.m 0 | ||
| 157 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0 | ||
| 158 | mov ar.lc=lc } | ||
| 159 | { .mfi; nop.m 0 | ||
| 160 | fcvt.fxu.s1 nhi[1]=f0 | ||
| 161 | mov ar.ec=8 };; | ||
| 162 | |||
| 163 | .align 32 | ||
| 164 | .L1st_ctop: | ||
| 165 | .pred.rel "mutex",p40,p42 | ||
| 166 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | ||
| 167 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | ||
| 168 | (p40) add n[2]=n[2],a[2] } // (p23) } | ||
| 169 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16) | ||
| 170 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | ||
| 171 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | ||
| 172 | { .mfi; (p21) getf.sig a[0]=alo[5] | ||
| 173 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | ||
| 174 | (p42) cmp.leu p41,p39=n[2],a[2] } // (p23) | ||
| 175 | { .mfi; (p23) st8 [tp_1]=n[2],8 | ||
| 176 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | ||
| 177 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | ||
| 178 | { .mmb; (p21) getf.sig n[0]=nlo[3] | ||
| 179 | (p16) nop.m 0 | ||
| 180 | br.ctop.sptk .L1st_ctop };; | ||
| 181 | .L1st_cend: | ||
| 182 | |||
| 183 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | ||
| 184 | getf.sig n[0]=nhi[4] | ||
| 185 | add num=-1,num };; // num-- | ||
| 186 | { .mmi; .pred.rel "mutex",p40,p42 | ||
| 187 | (p40) add n[0]=n[0],a[0] | ||
| 188 | (p42) add n[0]=n[0],a[0],1 | ||
| 189 | sub aptr=aptr,len };; // rewind | ||
| 190 | { .mmi; .pred.rel "mutex",p40,p42 | ||
| 191 | (p40) cmp.ltu p41,p39=n[0],a[0] | ||
| 192 | (p42) cmp.leu p41,p39=n[0],a[0] | ||
| 193 | sub nptr=nptr,len };; | ||
| 194 | { .mmi; .pred.rel "mutex",p39,p41 | ||
| 195 | (p39) add topbit=r0,r0 | ||
| 196 | (p41) add topbit=r0,r0,1 | ||
| 197 | nop.i 0 } | ||
| 198 | { .mmi; st8 [tp_1]=n[0] | ||
| 199 | add tptr=16,sp | ||
| 200 | add tp_1=8,sp };; | ||
| 201 | |||
| 202 | .Louter: | ||
| 203 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | ||
| 204 | ldf8 ahi[3]=[tptr] // tp[0] | ||
| 205 | add r30=8,aptr };; | ||
| 206 | { .mmi; ldf8 alo[4]=[aptr],16 // ap[0] | ||
| 207 | ldf8 alo[3]=[r30],16 // ap[1] | ||
| 208 | add r31=8,nptr };; | ||
| 209 | { .mfb; ldf8 alo[2]=[aptr],16 // ap[2] | ||
| 210 | xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0] | ||
| 211 | brp.loop.imp .Linner_ctop,.Linner_cend-16 | ||
| 212 | } | ||
| 213 | { .mfb; ldf8 alo[1]=[r30] // ap[3] | ||
| 214 | xma.lu alo[4]=alo[4],bi,ahi[3] | ||
| 215 | clrrrb.pr };; | ||
| 216 | { .mfi; ldf8 nlo[2]=[nptr],16 // np[0] | ||
| 217 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i] | ||
| 218 | nop.i 0 } | ||
| 219 | { .mfi; ldf8 nlo[1]=[r31] // np[1] | ||
| 220 | xma.lu alo[3]=alo[3],bi,ahi[2] | ||
| 221 | mov pr.rot=0x20101f<<16 | ||
| 222 | // ------^----- (p40) at first (p23) | ||
| 223 | // --------^--- (p30) at first (p22) | ||
| 224 | // ----------^^ p[16:20]=1 | ||
| 225 | };; | ||
| 226 | { .mfi; st8 [tptr]=r0 // tp[0] is already accounted | ||
| 227 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0 | ||
| 228 | mov ar.lc=lc } | ||
| 229 | { .mfi; | ||
| 230 | fcvt.fxu.s1 nhi[1]=f0 | ||
| 231 | mov ar.ec=8 };; | ||
| 232 | |||
| 233 | // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in | ||
| 234 | // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7 | ||
| 235 | // in latter case accounts for two-tick pipeline stall, which means | ||
| 236 | // that its performance would be ~20% lower than optimal one. No | ||
| 237 | // attempt was made to address this, because original Itanium is | ||
| 238 | // hardly represented out in the wild... | ||
| 239 | .align 32 | ||
| 240 | .Linner_ctop: | ||
| 241 | .pred.rel "mutex",p40,p42 | ||
| 242 | .pred.rel "mutex",p30,p32 | ||
| 243 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | ||
| 244 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | ||
| 245 | (p40) add n[2]=n[2],a[2] } // (p23) | ||
| 246 | { .mfi; (p16) nop.m 0 | ||
| 247 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | ||
| 248 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | ||
| 249 | { .mfi; (p21) getf.sig a[0]=alo[5] | ||
| 250 | (p16) nop.f 0 | ||
| 251 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | ||
| 252 | { .mfi; (p21) ld8 t[0]=[tptr],8 | ||
| 253 | (p16) nop.f 0 | ||
| 254 | (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23) | ||
| 255 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++) | ||
| 256 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | ||
| 257 | (p30) add a[1]=a[1],t[1] } // (p22) | ||
| 258 | { .mfi; (p16) nop.m 0 | ||
| 259 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | ||
| 260 | (p32) add a[1]=a[1],t[1],1 };; // (p22) | ||
| 261 | { .mmi; (p21) getf.sig n[0]=nlo[3] | ||
| 262 | (p16) nop.m 0 | ||
| 263 | (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22) | ||
| 264 | { .mmb; (p23) st8 [tp_1]=n[2],8 | ||
| 265 | (p32) cmp.leu p31,p29=a[1],t[1] // (p22) | ||
| 266 | br.ctop.sptk .Linner_ctop };; | ||
| 267 | .Linner_cend: | ||
| 268 | |||
| 269 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | ||
| 270 | getf.sig n[0]=nhi[4] | ||
| 271 | nop.i 0 };; | ||
| 272 | |||
| 273 | { .mmi; .pred.rel "mutex",p31,p33 | ||
| 274 | (p31) add a[0]=a[0],topbit | ||
| 275 | (p33) add a[0]=a[0],topbit,1 | ||
| 276 | mov topbit=r0 };; | ||
| 277 | { .mfi; .pred.rel "mutex",p31,p33 | ||
| 278 | (p31) cmp.ltu p32,p30=a[0],topbit | ||
| 279 | (p33) cmp.leu p32,p30=a[0],topbit | ||
| 280 | } | ||
| 281 | { .mfi; .pred.rel "mutex",p40,p42 | ||
| 282 | (p40) add n[0]=n[0],a[0] | ||
| 283 | (p42) add n[0]=n[0],a[0],1 | ||
| 284 | };; | ||
| 285 | { .mmi; .pred.rel "mutex",p44,p46 | ||
| 286 | (p40) cmp.ltu p41,p39=n[0],a[0] | ||
| 287 | (p42) cmp.leu p41,p39=n[0],a[0] | ||
| 288 | (p32) add topbit=r0,r0,1 } | ||
| 289 | |||
| 290 | { .mmi; st8 [tp_1]=n[0],8 | ||
| 291 | cmp4.ne p6,p0=1,num | ||
| 292 | sub aptr=aptr,len };; // rewind | ||
| 293 | { .mmi; sub nptr=nptr,len | ||
| 294 | (p41) add topbit=r0,r0,1 | ||
| 295 | add tptr=16,sp } | ||
| 296 | { .mmb; add tp_1=8,sp | ||
| 297 | add num=-1,num // num-- | ||
| 298 | (p6) br.cond.sptk.many .Louter };; | ||
| 299 | |||
| 300 | { .mbb; add lc=4,lc | ||
| 301 | brp.loop.imp .Lsub_ctop,.Lsub_cend-16 | ||
| 302 | clrrrb.pr };; | ||
| 303 | { .mii; nop.m 0 | ||
| 304 | mov pr.rot=0x10001<<16 | ||
| 305 | // ------^---- (p33) at first (p17) | ||
| 306 | mov ar.lc=lc } | ||
| 307 | { .mii; nop.m 0 | ||
| 308 | mov ar.ec=3 | ||
| 309 | nop.i 0 };; | ||
| 310 | |||
| 311 | .Lsub_ctop: | ||
| 312 | .pred.rel "mutex",p33,p35 | ||
| 313 | { .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++) | ||
| 314 | (p16) nop.f 0 | ||
| 315 | (p33) sub n[1]=t[1],n[1] } // (p17) | ||
| 316 | { .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++) | ||
| 317 | (p16) nop.f 0 | ||
| 318 | (p35) sub n[1]=t[1],n[1],1 };; // (p17) | ||
| 319 | { .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r | ||
| 320 | (p33) cmp.gtu p34,p32=n[1],t[1] // (p17) | ||
| 321 | (p18) nop.b 0 } | ||
| 322 | { .mib; (p18) nop.m 0 | ||
| 323 | (p35) cmp.geu p34,p32=n[1],t[1] // (p17) | ||
| 324 | br.ctop.sptk .Lsub_ctop };; | ||
| 325 | .Lsub_cend: | ||
| 326 | |||
| 327 | { .mmb; .pred.rel "mutex",p34,p36 | ||
| 328 | (p34) sub topbit=topbit,r0 // (p19) | ||
| 329 | (p36) sub topbit=topbit,r0,1 | ||
| 330 | brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16 | ||
| 331 | } | ||
| 332 | { .mmb; sub rptr=rptr,len // rewind | ||
| 333 | sub tptr=tptr,len | ||
| 334 | clrrrb.pr };; | ||
| 335 | { .mmi; and aptr=tptr,topbit | ||
| 336 | andcm bptr=rptr,topbit | ||
| 337 | mov pr.rot=1<<16 };; | ||
| 338 | { .mii; or nptr=aptr,bptr | ||
| 339 | mov ar.lc=lc | ||
| 340 | mov ar.ec=3 };; | ||
| 341 | |||
| 342 | .Lcopy_ctop: | ||
| 343 | { .mmb; (p16) ld8 n[0]=[nptr],8 | ||
| 344 | (p18) st8 [tptr]=r0,8 | ||
| 345 | (p16) nop.b 0 } | ||
| 346 | { .mmb; (p16) nop.m 0 | ||
| 347 | (p18) st8 [rptr]=n[2],8 | ||
| 348 | br.ctop.sptk .Lcopy_ctop };; | ||
| 349 | .Lcopy_cend: | ||
| 350 | |||
| 351 | { .mmi; mov ret0=1 // signal "handled" | ||
| 352 | rum 1<<5 // clear um.mfh | ||
| 353 | mov ar.lc=prevlc } | ||
| 354 | { .mib; .restore sp | ||
| 355 | mov sp=prevsp | ||
| 356 | mov pr=prevpr,0x1ffff | ||
| 357 | br.ret.sptk.many b0 };; | ||
| 358 | .endp bn_mul_mont_general# | ||
| 359 | |||
| 360 | a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23; | ||
| 361 | n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31; | ||
| 362 | t0=r15; | ||
| 363 | |||
| 364 | ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15; | ||
| 365 | ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23; | ||
| 366 | |||
| 367 | .align 64 | ||
| 368 | .skip 48 // aligns loop body | ||
| 369 | .local bn_mul_mont_8# | ||
| 370 | .proc bn_mul_mont_8# | ||
| 371 | bn_mul_mont_8: | ||
| 372 | .prologue | ||
| 373 | { .mmi; .save ar.pfs,prevfs | ||
| 374 | alloc prevfs=ar.pfs,6,2,0,8 | ||
| 375 | .vframe prevsp | ||
| 376 | mov prevsp=sp | ||
| 377 | .save ar.lc,prevlc | ||
| 378 | mov prevlc=ar.lc } | ||
| 379 | { .mmi; add r17=-6*16,sp | ||
| 380 | add sp=-7*16,sp | ||
| 381 | .save pr,prevpr | ||
| 382 | mov prevpr=pr };; | ||
| 383 | |||
| 384 | { .mmi; .save.gf 0,0x10 | ||
| 385 | stf.spill [sp]=f16,-16 | ||
| 386 | .save.gf 0,0x20 | ||
| 387 | stf.spill [r17]=f17,32 | ||
| 388 | add r16=-5*16,prevsp};; | ||
| 389 | { .mmi; .save.gf 0,0x40 | ||
| 390 | stf.spill [r16]=f18,32 | ||
| 391 | .save.gf 0,0x80 | ||
| 392 | stf.spill [r17]=f19,32 | ||
| 393 | $ADDP aptr=0,in1 };; | ||
| 394 | { .mmi; .save.gf 0,0x100 | ||
| 395 | stf.spill [r16]=f20,32 | ||
| 396 | .save.gf 0,0x200 | ||
| 397 | stf.spill [r17]=f21,32 | ||
| 398 | $ADDP r29=8,in1 };; | ||
| 399 | { .mmi; .save.gf 0,0x400 | ||
| 400 | stf.spill [r16]=f22 | ||
| 401 | .save.gf 0,0x800 | ||
| 402 | stf.spill [r17]=f23 | ||
| 403 | $ADDP rptr=0,in0 };; | ||
| 404 | |||
| 405 | .body | ||
| 406 | .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10] | ||
| 407 | .rotr t[8] | ||
| 408 | |||
| 409 | // load input vectors padding them to 8 elements | ||
| 410 | { .mmi; ldf8 ai0=[aptr],16 // ap[0] | ||
| 411 | ldf8 ai1=[r29],16 // ap[1] | ||
| 412 | $ADDP bptr=0,in2 } | ||
| 413 | { .mmi; $ADDP r30=8,in2 | ||
| 414 | $ADDP nptr=0,in3 | ||
| 415 | $ADDP r31=8,in3 };; | ||
| 416 | { .mmi; ldf8 bj[7]=[bptr],16 // bp[0] | ||
| 417 | ldf8 bj[6]=[r30],16 // bp[1] | ||
| 418 | cmp4.le p4,p5=3,in5 } | ||
| 419 | { .mmi; ldf8 ni0=[nptr],16 // np[0] | ||
| 420 | ldf8 ni1=[r31],16 // np[1] | ||
| 421 | cmp4.le p6,p7=4,in5 };; | ||
| 422 | |||
| 423 | { .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2] | ||
| 424 | (p5)fcvt.fxu ai2=f0 | ||
| 425 | cmp4.le p8,p9=5,in5 } | ||
| 426 | { .mfi; (p6)ldf8 ai3=[r29],16 // ap[3] | ||
| 427 | (p7)fcvt.fxu ai3=f0 | ||
| 428 | cmp4.le p10,p11=6,in5 } | ||
| 429 | { .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2] | ||
| 430 | (p5)fcvt.fxu bj[5]=f0 | ||
| 431 | cmp4.le p12,p13=7,in5 } | ||
| 432 | { .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3] | ||
| 433 | (p7)fcvt.fxu bj[4]=f0 | ||
| 434 | cmp4.le p14,p15=8,in5 } | ||
| 435 | { .mfi; (p4)ldf8 ni2=[nptr],16 // np[2] | ||
| 436 | (p5)fcvt.fxu ni2=f0 | ||
| 437 | addp4 r28=-1,in5 } | ||
| 438 | { .mfi; (p6)ldf8 ni3=[r31],16 // np[3] | ||
| 439 | (p7)fcvt.fxu ni3=f0 | ||
| 440 | $ADDP in4=0,in4 };; | ||
| 441 | |||
| 442 | { .mfi; ldf8 n0=[in4] | ||
| 443 | fcvt.fxu tf[1]=f0 | ||
| 444 | nop.i 0 } | ||
| 445 | |||
| 446 | { .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4] | ||
| 447 | (p9)fcvt.fxu ai4=f0 | ||
| 448 | mov t[0]=r0 } | ||
| 449 | { .mfi; (p10)ldf8 ai5=[r29],16 // ap[5] | ||
| 450 | (p11)fcvt.fxu ai5=f0 | ||
| 451 | mov t[1]=r0 } | ||
| 452 | { .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4] | ||
| 453 | (p9)fcvt.fxu bj[3]=f0 | ||
| 454 | mov t[2]=r0 } | ||
| 455 | { .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5] | ||
| 456 | (p11)fcvt.fxu bj[2]=f0 | ||
| 457 | mov t[3]=r0 } | ||
| 458 | { .mfi; (p8)ldf8 ni4=[nptr],16 // np[4] | ||
| 459 | (p9)fcvt.fxu ni4=f0 | ||
| 460 | mov t[4]=r0 } | ||
| 461 | { .mfi; (p10)ldf8 ni5=[r31],16 // np[5] | ||
| 462 | (p11)fcvt.fxu ni5=f0 | ||
| 463 | mov t[5]=r0 };; | ||
| 464 | |||
| 465 | { .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6] | ||
| 466 | (p13)fcvt.fxu ai6=f0 | ||
| 467 | mov t[6]=r0 } | ||
| 468 | { .mfi; (p14)ldf8 ai7=[r29],16 // ap[7] | ||
| 469 | (p15)fcvt.fxu ai7=f0 | ||
| 470 | mov t[7]=r0 } | ||
| 471 | { .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6] | ||
| 472 | (p13)fcvt.fxu bj[1]=f0 | ||
| 473 | mov ar.lc=r28 } | ||
| 474 | { .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] | ||
| 475 | (p15)fcvt.fxu bj[0]=f0 | ||
| 476 | mov ar.ec=1 } | ||
| 477 | { .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] | ||
| 478 | (p13)fcvt.fxu ni6=f0 | ||
| 479 | mov pr.rot=1<<16 } | ||
| 480 | { .mfb; (p14)ldf8 ni7=[r31],16 // np[7] | ||
| 481 | (p15)fcvt.fxu ni7=f0 | ||
| 482 | brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 | ||
| 483 | };; | ||
| 484 | |||
| 485 | // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt | ||
| 486 | // to measure with help of Interval Time Counter indicated that the | ||
| 487 | // factor is a tad higher: 33 or 34, if not 35. Exact measurement and | ||
| 488 | // addressing the issue is problematic, because I don't have access | ||
| 489 | // to platform-specific instruction-level profiler. On Itanium it | ||
| 490 | // should run in 56*n ticks, because of higher xma latency... | ||
| 491 | .Louter_8_ctop: | ||
| 492 | .pred.rel "mutex",p40,p42 | ||
| 493 | .pred.rel "mutex",p48,p50 | ||
| 494 | { .mfi; (p16) nop.m 0 // 0: | ||
| 495 | (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0] | ||
| 496 | (p40) add a3=a3,n3 } // (p17) a3+=n3 | ||
| 497 | { .mfi; (p42) add a3=a3,n3,1 | ||
| 498 | (p16) xma.lu alo[0]=ai0,bj[7],tf[1] | ||
| 499 | (p16) nop.i 0 };; | ||
| 500 | { .mii; (p17) getf.sig a7=alo[8] // 1: | ||
| 501 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | ||
| 502 | (p50) add t[6]=t[6],a3,1 };; | ||
| 503 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | ||
| 504 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | ||
| 505 | (p40) cmp.ltu p43,p41=a3,n3 } | ||
| 506 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | ||
| 507 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | ||
| 508 | (p16) nop.i 0 };; | ||
| 509 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | ||
| 510 | (p48) cmp.ltu p51,p49=t[6],a3 | ||
| 511 | (p50) cmp.leu p51,p49=t[6],a3 };; | ||
| 512 | .pred.rel "mutex",p41,p43 | ||
| 513 | .pred.rel "mutex",p49,p51 | ||
| 514 | { .mfi; (p16) nop.m 0 // 4: | ||
| 515 | (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i] | ||
| 516 | (p41) add a4=a4,n4 } // (p17) a4+=n4 | ||
| 517 | { .mfi; (p43) add a4=a4,n4,1 | ||
| 518 | (p16) xma.lu alo[1]=ai1,bj[7],ahi[0] | ||
| 519 | (p16) nop.i 0 };; | ||
| 520 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | ||
| 521 | (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0 | ||
| 522 | (p51) add t[5]=t[5],a4,1 };; | ||
| 523 | { .mfi; (p16) nop.m 0 // 6: | ||
| 524 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | ||
| 525 | (p41) cmp.ltu p42,p40=a4,n4 } | ||
| 526 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | ||
| 527 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | ||
| 528 | (p16) nop.i 0 };; | ||
| 529 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | ||
| 530 | (p49) cmp.ltu p50,p48=t[5],a4 | ||
| 531 | (p51) cmp.leu p50,p48=t[5],a4 };; | ||
| 532 | .pred.rel "mutex",p40,p42 | ||
| 533 | .pred.rel "mutex",p48,p50 | ||
| 534 | { .mfi; (p16) nop.m 0 // 8: | ||
| 535 | (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i] | ||
| 536 | (p40) add a5=a5,n5 } // (p17) a5+=n5 | ||
| 537 | { .mfi; (p42) add a5=a5,n5,1 | ||
| 538 | (p16) xma.lu alo[2]=ai2,bj[7],ahi[1] | ||
| 539 | (p16) nop.i 0 };; | ||
| 540 | { .mii; (p16) getf.sig a1=alo[1] // 9: | ||
| 541 | (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 | ||
| 542 | (p50) add t[4]=t[4],a5,1 };; | ||
| 543 | { .mfi; (p16) nop.m 0 // 10: | ||
| 544 | (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0 | ||
| 545 | (p40) cmp.ltu p43,p41=a5,n5 } | ||
| 546 | { .mfi; (p42) cmp.leu p43,p41=a5,n5 | ||
| 547 | (p16) xma.lu nlo[0]=ni0,mj[0],alo[0] | ||
| 548 | (p16) nop.i 0 };; | ||
| 549 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | ||
| 550 | (p48) cmp.ltu p51,p49=t[4],a5 | ||
| 551 | (p50) cmp.leu p51,p49=t[4],a5 };; | ||
| 552 | .pred.rel "mutex",p41,p43 | ||
| 553 | .pred.rel "mutex",p49,p51 | ||
| 554 | { .mfi; (p17) getf.sig n8=nhi[8] // 12: | ||
| 555 | (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i] | ||
| 556 | (p41) add a6=a6,n6 } // (p17) a6+=n6 | ||
| 557 | { .mfi; (p43) add a6=a6,n6,1 | ||
| 558 | (p16) xma.lu alo[3]=ai3,bj[7],ahi[2] | ||
| 559 | (p16) nop.i 0 };; | ||
| 560 | { .mii; (p16) getf.sig a2=alo[2] // 13: | ||
| 561 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | ||
| 562 | (p51) add t[3]=t[3],a6,1 };; | ||
| 563 | { .mfi; (p16) nop.m 0 // 14: | ||
| 564 | (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0 | ||
| 565 | (p41) cmp.ltu p42,p40=a6,n6 } | ||
| 566 | { .mfi; (p43) cmp.leu p42,p40=a6,n6 | ||
| 567 | (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0] | ||
| 568 | (p16) nop.i 0 };; | ||
| 569 | { .mii; (p16) nop.m 0 // 15: | ||
| 570 | (p49) cmp.ltu p50,p48=t[3],a6 | ||
| 571 | (p51) cmp.leu p50,p48=t[3],a6 };; | ||
| 572 | .pred.rel "mutex",p40,p42 | ||
| 573 | .pred.rel "mutex",p48,p50 | ||
| 574 | { .mfi; (p16) nop.m 0 // 16: | ||
| 575 | (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i] | ||
| 576 | (p40) add a7=a7,n7 } // (p17) a7+=n7 | ||
| 577 | { .mfi; (p42) add a7=a7,n7,1 | ||
| 578 | (p16) xma.lu alo[4]=ai4,bj[7],ahi[3] | ||
| 579 | (p16) nop.i 0 };; | ||
| 580 | { .mii; (p16) getf.sig a3=alo[3] // 17: | ||
| 581 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | ||
| 582 | (p50) add t[2]=t[2],a7,1 };; | ||
| 583 | { .mfi; (p16) nop.m 0 // 18: | ||
| 584 | (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0 | ||
| 585 | (p40) cmp.ltu p43,p41=a7,n7 } | ||
| 586 | { .mfi; (p42) cmp.leu p43,p41=a7,n7 | ||
| 587 | (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1] | ||
| 588 | (p16) nop.i 0 };; | ||
| 589 | { .mii; (p16) getf.sig n1=nlo[1] // 19: | ||
| 590 | (p48) cmp.ltu p51,p49=t[2],a7 | ||
| 591 | (p50) cmp.leu p51,p49=t[2],a7 };; | ||
| 592 | .pred.rel "mutex",p41,p43 | ||
| 593 | .pred.rel "mutex",p49,p51 | ||
| 594 | { .mfi; (p16) nop.m 0 // 20: | ||
| 595 | (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i] | ||
| 596 | (p41) add a8=a8,n8 } // (p17) a8+=n8 | ||
| 597 | { .mfi; (p43) add a8=a8,n8,1 | ||
| 598 | (p16) xma.lu alo[5]=ai5,bj[7],ahi[4] | ||
| 599 | (p16) nop.i 0 };; | ||
| 600 | { .mii; (p16) getf.sig a4=alo[4] // 21: | ||
| 601 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | ||
| 602 | (p51) add t[1]=t[1],a8,1 };; | ||
| 603 | { .mfi; (p16) nop.m 0 // 22: | ||
| 604 | (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0 | ||
| 605 | (p41) cmp.ltu p42,p40=a8,n8 } | ||
| 606 | { .mfi; (p43) cmp.leu p42,p40=a8,n8 | ||
| 607 | (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2] | ||
| 608 | (p16) nop.i 0 };; | ||
| 609 | { .mii; (p16) getf.sig n2=nlo[2] // 23: | ||
| 610 | (p49) cmp.ltu p50,p48=t[1],a8 | ||
| 611 | (p51) cmp.leu p50,p48=t[1],a8 };; | ||
| 612 | { .mfi; (p16) nop.m 0 // 24: | ||
| 613 | (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i] | ||
| 614 | (p16) add a1=a1,n1 } // (p16) a1+=n1 | ||
| 615 | { .mfi; (p16) nop.m 0 | ||
| 616 | (p16) xma.lu alo[6]=ai6,bj[7],ahi[5] | ||
| 617 | (p17) mov t[0]=r0 };; | ||
| 618 | { .mii; (p16) getf.sig a5=alo[5] // 25: | ||
| 619 | (p16) add t0=t[7],a1 // (p16) t[7]+=a1 | ||
| 620 | (p42) add t[0]=t[0],r0,1 };; | ||
| 621 | { .mfi; (p16) setf.sig tf[0]=t0 // 26: | ||
| 622 | (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0 | ||
| 623 | (p50) add t[0]=t[0],r0,1 } | ||
| 624 | { .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1 | ||
| 625 | (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3] | ||
| 626 | (p16) nop.i 0 };; | ||
| 627 | { .mii; (p16) getf.sig n3=nlo[3] // 27: | ||
| 628 | (p16) cmp.ltu.unc p50,p48=t0,a1 | ||
| 629 | (p16) nop.i 0 };; | ||
| 630 | .pred.rel "mutex",p40,p42 | ||
| 631 | .pred.rel "mutex",p48,p50 | ||
| 632 | { .mfi; (p16) nop.m 0 // 28: | ||
| 633 | (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i] | ||
| 634 | (p40) add a2=a2,n2 } // (p16) a2+=n2 | ||
| 635 | { .mfi; (p42) add a2=a2,n2,1 | ||
| 636 | (p16) xma.lu alo[7]=ai7,bj[7],ahi[6] | ||
| 637 | (p16) nop.i 0 };; | ||
| 638 | { .mii; (p16) getf.sig a6=alo[6] // 29: | ||
| 639 | (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2 | ||
| 640 | (p50) add t[6]=t[6],a2,1 };; | ||
| 641 | { .mfi; (p16) nop.m 0 // 30: | ||
| 642 | (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0 | ||
| 643 | (p40) cmp.ltu p41,p39=a2,n2 } | ||
| 644 | { .mfi; (p42) cmp.leu p41,p39=a2,n2 | ||
| 645 | (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4] | ||
| 646 | (p16) nop.i 0 };; | ||
| 647 | { .mfi; (p16) getf.sig n4=nlo[4] // 31: | ||
| 648 | (p16) nop.f 0 | ||
| 649 | (p48) cmp.ltu p49,p47=t[6],a2 } | ||
| 650 | { .mfb; (p50) cmp.leu p49,p47=t[6],a2 | ||
| 651 | (p16) nop.f 0 | ||
| 652 | br.ctop.sptk.many .Louter_8_ctop };; | ||
| 653 | .Louter_8_cend: | ||
| 654 | |||
| 655 | // above loop has to execute one more time, without (p16), which is | ||
| 656 | // replaced with merged move of np[8] to GPR bank | ||
| 657 | .pred.rel "mutex",p40,p42 | ||
| 658 | .pred.rel "mutex",p48,p50 | ||
| 659 | { .mmi; (p0) getf.sig n1=ni0 // 0: | ||
| 660 | (p40) add a3=a3,n3 // (p17) a3+=n3 | ||
| 661 | (p42) add a3=a3,n3,1 };; | ||
| 662 | { .mii; (p17) getf.sig a7=alo[8] // 1: | ||
| 663 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | ||
| 664 | (p50) add t[6]=t[6],a3,1 };; | ||
| 665 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | ||
| 666 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | ||
| 667 | (p40) cmp.ltu p43,p41=a3,n3 } | ||
| 668 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | ||
| 669 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | ||
| 670 | (p0) nop.i 0 };; | ||
| 671 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | ||
| 672 | (p48) cmp.ltu p51,p49=t[6],a3 | ||
| 673 | (p50) cmp.leu p51,p49=t[6],a3 };; | ||
| 674 | .pred.rel "mutex",p41,p43 | ||
| 675 | .pred.rel "mutex",p49,p51 | ||
| 676 | { .mmi; (p0) getf.sig n2=ni1 // 4: | ||
| 677 | (p41) add a4=a4,n4 // (p17) a4+=n4 | ||
| 678 | (p43) add a4=a4,n4,1 };; | ||
| 679 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | ||
| 680 | (p0) nop.f 0 | ||
| 681 | (p51) add t[5]=t[5],a4,1 };; | ||
| 682 | { .mfi; (p0) getf.sig n3=ni2 // 6: | ||
| 683 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | ||
| 684 | (p41) cmp.ltu p42,p40=a4,n4 } | ||
| 685 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | ||
| 686 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | ||
| 687 | (p0) nop.i 0 };; | ||
| 688 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | ||
| 689 | (p49) cmp.ltu p50,p48=t[5],a4 | ||
| 690 | (p51) cmp.leu p50,p48=t[5],a4 };; | ||
| 691 | .pred.rel "mutex",p40,p42 | ||
| 692 | .pred.rel "mutex",p48,p50 | ||
| 693 | { .mii; (p0) getf.sig n4=ni3 // 8: | ||
| 694 | (p40) add a5=a5,n5 // (p17) a5+=n5 | ||
| 695 | (p42) add a5=a5,n5,1 };; | ||
| 696 | { .mii; (p0) nop.m 0 // 9: | ||
| 697 | (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 | ||
| 698 | (p50) add t[4]=t[4],a5,1 };; | ||
| 699 | { .mii; (p0) nop.m 0 // 10: | ||
| 700 | (p40) cmp.ltu p43,p41=a5,n5 | ||
| 701 | (p42) cmp.leu p43,p41=a5,n5 };; | ||
| 702 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | ||
| 703 | (p48) cmp.ltu p51,p49=t[4],a5 | ||
| 704 | (p50) cmp.leu p51,p49=t[4],a5 };; | ||
| 705 | .pred.rel "mutex",p41,p43 | ||
| 706 | .pred.rel "mutex",p49,p51 | ||
| 707 | { .mii; (p17) getf.sig n8=nhi[8] // 12: | ||
| 708 | (p41) add a6=a6,n6 // (p17) a6+=n6 | ||
| 709 | (p43) add a6=a6,n6,1 };; | ||
| 710 | { .mii; (p0) getf.sig n5=ni4 // 13: | ||
| 711 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | ||
| 712 | (p51) add t[3]=t[3],a6,1 };; | ||
| 713 | { .mii; (p0) nop.m 0 // 14: | ||
| 714 | (p41) cmp.ltu p42,p40=a6,n6 | ||
| 715 | (p43) cmp.leu p42,p40=a6,n6 };; | ||
| 716 | { .mii; (p0) getf.sig n6=ni5 // 15: | ||
| 717 | (p49) cmp.ltu p50,p48=t[3],a6 | ||
| 718 | (p51) cmp.leu p50,p48=t[3],a6 };; | ||
| 719 | .pred.rel "mutex",p40,p42 | ||
| 720 | .pred.rel "mutex",p48,p50 | ||
| 721 | { .mii; (p0) nop.m 0 // 16: | ||
| 722 | (p40) add a7=a7,n7 // (p17) a7+=n7 | ||
| 723 | (p42) add a7=a7,n7,1 };; | ||
| 724 | { .mii; (p0) nop.m 0 // 17: | ||
| 725 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | ||
| 726 | (p50) add t[2]=t[2],a7,1 };; | ||
| 727 | { .mii; (p0) nop.m 0 // 18: | ||
| 728 | (p40) cmp.ltu p43,p41=a7,n7 | ||
| 729 | (p42) cmp.leu p43,p41=a7,n7 };; | ||
| 730 | { .mii; (p0) getf.sig n7=ni6 // 19: | ||
| 731 | (p48) cmp.ltu p51,p49=t[2],a7 | ||
| 732 | (p50) cmp.leu p51,p49=t[2],a7 };; | ||
| 733 | .pred.rel "mutex",p41,p43 | ||
| 734 | .pred.rel "mutex",p49,p51 | ||
| 735 | { .mii; (p0) nop.m 0 // 20: | ||
| 736 | (p41) add a8=a8,n8 // (p17) a8+=n8 | ||
| 737 | (p43) add a8=a8,n8,1 };; | ||
| 738 | { .mmi; (p0) nop.m 0 // 21: | ||
| 739 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | ||
| 740 | (p51) add t[1]=t[1],a8,1 } | ||
| 741 | { .mmi; (p17) mov t[0]=r0 | ||
| 742 | (p41) cmp.ltu p42,p40=a8,n8 | ||
| 743 | (p43) cmp.leu p42,p40=a8,n8 };; | ||
| 744 | { .mmi; (p0) getf.sig n8=ni7 // 22: | ||
| 745 | (p49) cmp.ltu p50,p48=t[1],a8 | ||
| 746 | (p51) cmp.leu p50,p48=t[1],a8 } | ||
| 747 | { .mmi; (p42) add t[0]=t[0],r0,1 | ||
| 748 | (p0) add r16=-7*16,prevsp | ||
| 749 | (p0) add r17=-6*16,prevsp };; | ||
| 750 | |||
| 751 | // subtract np[8] from carrybit|tmp[8] | ||
| 752 | // carrybit|tmp[8] layout upon exit from above loop is: | ||
| 753 | // t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) | ||
| 754 | { .mmi; (p50)add t[0]=t[0],r0,1 | ||
| 755 | add r18=-5*16,prevsp | ||
| 756 | sub n1=t0,n1 };; | ||
| 757 | { .mmi; cmp.gtu p34,p32=n1,t0;; | ||
| 758 | .pred.rel "mutex",p32,p34 | ||
| 759 | (p32)sub n2=t[7],n2 | ||
| 760 | (p34)sub n2=t[7],n2,1 };; | ||
| 761 | { .mii; (p32)cmp.gtu p35,p33=n2,t[7] | ||
| 762 | (p34)cmp.geu p35,p33=n2,t[7];; | ||
| 763 | .pred.rel "mutex",p33,p35 | ||
| 764 | (p33)sub n3=t[6],n3 } | ||
| 765 | { .mmi; (p35)sub n3=t[6],n3,1;; | ||
| 766 | (p33)cmp.gtu p34,p32=n3,t[6] | ||
| 767 | (p35)cmp.geu p34,p32=n3,t[6] };; | ||
| 768 | .pred.rel "mutex",p32,p34 | ||
| 769 | { .mii; (p32)sub n4=t[5],n4 | ||
| 770 | (p34)sub n4=t[5],n4,1;; | ||
| 771 | (p32)cmp.gtu p35,p33=n4,t[5] } | ||
| 772 | { .mmi; (p34)cmp.geu p35,p33=n4,t[5];; | ||
| 773 | .pred.rel "mutex",p33,p35 | ||
| 774 | (p33)sub n5=t[4],n5 | ||
| 775 | (p35)sub n5=t[4],n5,1 };; | ||
| 776 | { .mii; (p33)cmp.gtu p34,p32=n5,t[4] | ||
| 777 | (p35)cmp.geu p34,p32=n5,t[4];; | ||
| 778 | .pred.rel "mutex",p32,p34 | ||
| 779 | (p32)sub n6=t[3],n6 } | ||
| 780 | { .mmi; (p34)sub n6=t[3],n6,1;; | ||
| 781 | (p32)cmp.gtu p35,p33=n6,t[3] | ||
| 782 | (p34)cmp.geu p35,p33=n6,t[3] };; | ||
| 783 | .pred.rel "mutex",p33,p35 | ||
| 784 | { .mii; (p33)sub n7=t[2],n7 | ||
| 785 | (p35)sub n7=t[2],n7,1;; | ||
| 786 | (p33)cmp.gtu p34,p32=n7,t[2] } | ||
| 787 | { .mmi; (p35)cmp.geu p34,p32=n7,t[2];; | ||
| 788 | .pred.rel "mutex",p32,p34 | ||
| 789 | (p32)sub n8=t[1],n8 | ||
| 790 | (p34)sub n8=t[1],n8,1 };; | ||
| 791 | { .mii; (p32)cmp.gtu p35,p33=n8,t[1] | ||
| 792 | (p34)cmp.geu p35,p33=n8,t[1];; | ||
| 793 | .pred.rel "mutex",p33,p35 | ||
| 794 | (p33)sub a8=t[0],r0 } | ||
| 795 | { .mmi; (p35)sub a8=t[0],r0,1;; | ||
| 796 | (p33)cmp.gtu p34,p32=a8,t[0] | ||
| 797 | (p35)cmp.geu p34,p32=a8,t[0] };; | ||
| 798 | |||
| 799 | // save the result, either tmp[num] or tmp[num]-np[num] | ||
| 800 | .pred.rel "mutex",p32,p34 | ||
| 801 | { .mmi; (p32)st8 [rptr]=n1,8 | ||
| 802 | (p34)st8 [rptr]=t0,8 | ||
| 803 | add r19=-4*16,prevsp};; | ||
| 804 | { .mmb; (p32)st8 [rptr]=n2,8 | ||
| 805 | (p34)st8 [rptr]=t[7],8 | ||
| 806 | (p5)br.cond.dpnt.few .Ldone };; | ||
| 807 | { .mmb; (p32)st8 [rptr]=n3,8 | ||
| 808 | (p34)st8 [rptr]=t[6],8 | ||
| 809 | (p7)br.cond.dpnt.few .Ldone };; | ||
| 810 | { .mmb; (p32)st8 [rptr]=n4,8 | ||
| 811 | (p34)st8 [rptr]=t[5],8 | ||
| 812 | (p9)br.cond.dpnt.few .Ldone };; | ||
| 813 | { .mmb; (p32)st8 [rptr]=n5,8 | ||
| 814 | (p34)st8 [rptr]=t[4],8 | ||
| 815 | (p11)br.cond.dpnt.few .Ldone };; | ||
| 816 | { .mmb; (p32)st8 [rptr]=n6,8 | ||
| 817 | (p34)st8 [rptr]=t[3],8 | ||
| 818 | (p13)br.cond.dpnt.few .Ldone };; | ||
| 819 | { .mmb; (p32)st8 [rptr]=n7,8 | ||
| 820 | (p34)st8 [rptr]=t[2],8 | ||
| 821 | (p15)br.cond.dpnt.few .Ldone };; | ||
| 822 | { .mmb; (p32)st8 [rptr]=n8,8 | ||
| 823 | (p34)st8 [rptr]=t[1],8 | ||
| 824 | nop.b 0 };; | ||
| 825 | .Ldone: // epilogue | ||
| 826 | { .mmi; ldf.fill f16=[r16],64 | ||
| 827 | ldf.fill f17=[r17],64 | ||
| 828 | nop.i 0 } | ||
| 829 | { .mmi; ldf.fill f18=[r18],64 | ||
| 830 | ldf.fill f19=[r19],64 | ||
| 831 | mov pr=prevpr,0x1ffff };; | ||
| 832 | { .mmi; ldf.fill f20=[r16] | ||
| 833 | ldf.fill f21=[r17] | ||
| 834 | mov ar.lc=prevlc } | ||
| 835 | { .mmi; ldf.fill f22=[r18] | ||
| 836 | ldf.fill f23=[r19] | ||
| 837 | mov ret0=1 } // signal "handled" | ||
| 838 | { .mib; rum 1<<5 | ||
| 839 | .restore sp | ||
| 840 | mov sp=prevsp | ||
| 841 | br.ret.sptk.many b0 };; | ||
| 842 | .endp bn_mul_mont_8# | ||
| 843 | |||
| 844 | .type copyright#,\@object | ||
| 845 | copyright: | ||
| 846 | stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 847 | ___ | ||
| 848 | |||
| 849 | $output=shift and open STDOUT,">$output"; | ||
| 850 | print $code; | ||
| 851 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/mips-mont.pl b/src/lib/libcrypto/bn/asm/mips-mont.pl new file mode 100644 index 0000000000..b944a12b8e --- /dev/null +++ b/src/lib/libcrypto/bn/asm/mips-mont.pl | |||
| @@ -0,0 +1,426 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # This module doesn't present direct interest for OpenSSL, because it | ||
| 11 | # doesn't provide better performance for longer keys, at least not on | ||
| 12 | # in-order-execution cores. While 512-bit RSA sign operations can be | ||
| 13 | # 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and | ||
| 14 | # 4096-bit ones are up to 15% slower. In 32-bit mode it varies from | ||
| 15 | # 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA | ||
| 16 | # verify:-( All comparisons are against bn_mul_mont-free assembler. | ||
| 17 | # The module might be of interest to embedded system developers, as | ||
| 18 | # the code is smaller than 1KB, yet offers >3x improvement on MIPS64 | ||
| 19 | # and 75-30% [less for longer keys] on MIPS32 over compiler-generated | ||
| 20 | # code. | ||
| 21 | |||
| 22 | ###################################################################### | ||
| 23 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
| 24 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
| 25 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
| 26 | # manner. Therefore let's stick to NUBI register layout: | ||
| 27 | # | ||
| 28 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
| 29 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 30 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
| 31 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
| 32 | # | ||
| 33 | # The return value is placed in $a0. Following coding rules facilitate | ||
| 34 | # interoperability: | ||
| 35 | # | ||
| 36 | # - never ever touch $tp, "thread pointer", former $gp; | ||
| 37 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
| 38 | # old code]; | ||
| 39 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
| 40 | # | ||
| 41 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
| 42 | # | ||
| 43 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
| 44 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 45 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
| 46 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
| 47 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
| 48 | # | ||
| 49 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
| 50 | |||
| 51 | if ($flavour =~ /64|n32/i) { | ||
| 52 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
| 53 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
| 54 | $REG_S="sd"; | ||
| 55 | $REG_L="ld"; | ||
| 56 | $SZREG=8; | ||
| 57 | } else { | ||
| 58 | $PTR_ADD="add"; | ||
| 59 | $PTR_SUB="sub"; | ||
| 60 | $REG_S="sw"; | ||
| 61 | $REG_L="lw"; | ||
| 62 | $SZREG=4; | ||
| 63 | } | ||
| 64 | $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000; | ||
| 65 | # | ||
| 66 | # <appro@openssl.org> | ||
| 67 | # | ||
| 68 | ###################################################################### | ||
| 69 | |||
| 70 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 71 | open STDOUT,">$output"; | ||
| 72 | |||
| 73 | if ($flavour =~ /64|n32/i) { | ||
| 74 | $LD="ld"; | ||
| 75 | $ST="sd"; | ||
| 76 | $MULTU="dmultu"; | ||
| 77 | $ADDU="daddu"; | ||
| 78 | $SUBU="dsubu"; | ||
| 79 | $BNSZ=8; | ||
| 80 | } else { | ||
| 81 | $LD="lw"; | ||
| 82 | $ST="sw"; | ||
| 83 | $MULTU="multu"; | ||
| 84 | $ADDU="addu"; | ||
| 85 | $SUBU="subu"; | ||
| 86 | $BNSZ=4; | ||
| 87 | } | ||
| 88 | |||
| 89 | # int bn_mul_mont( | ||
| 90 | $rp=$a0; # BN_ULONG *rp, | ||
| 91 | $ap=$a1; # const BN_ULONG *ap, | ||
| 92 | $bp=$a2; # const BN_ULONG *bp, | ||
| 93 | $np=$a3; # const BN_ULONG *np, | ||
| 94 | $n0=$a4; # const BN_ULONG *n0, | ||
| 95 | $num=$a5; # int num); | ||
| 96 | |||
| 97 | $lo0=$a6; | ||
| 98 | $hi0=$a7; | ||
| 99 | $lo1=$t1; | ||
| 100 | $hi1=$t2; | ||
| 101 | $aj=$s0; | ||
| 102 | $bi=$s1; | ||
| 103 | $nj=$s2; | ||
| 104 | $tp=$s3; | ||
| 105 | $alo=$s4; | ||
| 106 | $ahi=$s5; | ||
| 107 | $nlo=$s6; | ||
| 108 | $nhi=$s7; | ||
| 109 | $tj=$s8; | ||
| 110 | $i=$s9; | ||
| 111 | $j=$s10; | ||
| 112 | $m1=$s11; | ||
| 113 | |||
| 114 | $FRAMESIZE=14; | ||
| 115 | |||
| 116 | $code=<<___; | ||
| 117 | .text | ||
| 118 | |||
| 119 | .set noat | ||
| 120 | .set noreorder | ||
| 121 | |||
| 122 | .align 5 | ||
| 123 | .globl bn_mul_mont | ||
| 124 | .ent bn_mul_mont | ||
| 125 | bn_mul_mont: | ||
| 126 | ___ | ||
| 127 | $code.=<<___ if ($flavour =~ /o32/i); | ||
| 128 | lw $n0,16($sp) | ||
| 129 | lw $num,20($sp) | ||
| 130 | ___ | ||
| 131 | $code.=<<___; | ||
| 132 | slt $at,$num,4 | ||
| 133 | bnez $at,1f | ||
| 134 | li $t0,0 | ||
| 135 | slt $at,$num,17 # on in-order CPU | ||
| 136 | bnezl $at,bn_mul_mont_internal | ||
| 137 | nop | ||
| 138 | 1: jr $ra | ||
| 139 | li $a0,0 | ||
| 140 | .end bn_mul_mont | ||
| 141 | |||
| 142 | .align 5 | ||
| 143 | .ent bn_mul_mont_internal | ||
| 144 | bn_mul_mont_internal: | ||
| 145 | .frame $fp,$FRAMESIZE*$SZREG,$ra | ||
| 146 | .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG | ||
| 147 | $PTR_SUB $sp,$FRAMESIZE*$SZREG | ||
| 148 | $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp) | ||
| 149 | $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp) | ||
| 150 | $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp) | ||
| 151 | $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp) | ||
| 152 | $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp) | ||
| 153 | $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp) | ||
| 154 | $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp) | ||
| 155 | $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp) | ||
| 156 | $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp) | ||
| 157 | ___ | ||
| 158 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 159 | $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp) | ||
| 160 | $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp) | ||
| 161 | $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp) | ||
| 162 | $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp) | ||
| 163 | ___ | ||
| 164 | $code.=<<___; | ||
| 165 | move $fp,$sp | ||
| 166 | |||
| 167 | .set reorder | ||
| 168 | $LD $n0,0($n0) | ||
| 169 | $LD $bi,0($bp) # bp[0] | ||
| 170 | $LD $aj,0($ap) # ap[0] | ||
| 171 | $LD $nj,0($np) # np[0] | ||
| 172 | |||
| 173 | $PTR_SUB $sp,2*$BNSZ # place for two extra words | ||
| 174 | sll $num,`log($BNSZ)/log(2)` | ||
| 175 | li $at,-4096 | ||
| 176 | $PTR_SUB $sp,$num | ||
| 177 | and $sp,$at | ||
| 178 | |||
| 179 | $MULTU $aj,$bi | ||
| 180 | $LD $alo,$BNSZ($ap) | ||
| 181 | $LD $nlo,$BNSZ($np) | ||
| 182 | mflo $lo0 | ||
| 183 | mfhi $hi0 | ||
| 184 | $MULTU $lo0,$n0 | ||
| 185 | mflo $m1 | ||
| 186 | |||
| 187 | $MULTU $alo,$bi | ||
| 188 | mflo $alo | ||
| 189 | mfhi $ahi | ||
| 190 | |||
| 191 | $MULTU $nj,$m1 | ||
| 192 | mflo $lo1 | ||
| 193 | mfhi $hi1 | ||
| 194 | $MULTU $nlo,$m1 | ||
| 195 | $ADDU $lo1,$lo0 | ||
| 196 | sltu $at,$lo1,$lo0 | ||
| 197 | $ADDU $hi1,$at | ||
| 198 | mflo $nlo | ||
| 199 | mfhi $nhi | ||
| 200 | |||
| 201 | move $tp,$sp | ||
| 202 | li $j,2*$BNSZ | ||
| 203 | .align 4 | ||
| 204 | .L1st: | ||
| 205 | .set noreorder | ||
| 206 | $PTR_ADD $aj,$ap,$j | ||
| 207 | $PTR_ADD $nj,$np,$j | ||
| 208 | $LD $aj,($aj) | ||
| 209 | $LD $nj,($nj) | ||
| 210 | |||
| 211 | $MULTU $aj,$bi | ||
| 212 | $ADDU $lo0,$alo,$hi0 | ||
| 213 | $ADDU $lo1,$nlo,$hi1 | ||
| 214 | sltu $at,$lo0,$hi0 | ||
| 215 | sltu $t0,$lo1,$hi1 | ||
| 216 | $ADDU $hi0,$ahi,$at | ||
| 217 | $ADDU $hi1,$nhi,$t0 | ||
| 218 | mflo $alo | ||
| 219 | mfhi $ahi | ||
| 220 | |||
| 221 | $ADDU $lo1,$lo0 | ||
| 222 | sltu $at,$lo1,$lo0 | ||
| 223 | $MULTU $nj,$m1 | ||
| 224 | $ADDU $hi1,$at | ||
| 225 | addu $j,$BNSZ | ||
| 226 | $ST $lo1,($tp) | ||
| 227 | sltu $t0,$j,$num | ||
| 228 | mflo $nlo | ||
| 229 | mfhi $nhi | ||
| 230 | |||
| 231 | bnez $t0,.L1st | ||
| 232 | $PTR_ADD $tp,$BNSZ | ||
| 233 | .set reorder | ||
| 234 | |||
| 235 | $ADDU $lo0,$alo,$hi0 | ||
| 236 | sltu $at,$lo0,$hi0 | ||
| 237 | $ADDU $hi0,$ahi,$at | ||
| 238 | |||
| 239 | $ADDU $lo1,$nlo,$hi1 | ||
| 240 | sltu $t0,$lo1,$hi1 | ||
| 241 | $ADDU $hi1,$nhi,$t0 | ||
| 242 | $ADDU $lo1,$lo0 | ||
| 243 | sltu $at,$lo1,$lo0 | ||
| 244 | $ADDU $hi1,$at | ||
| 245 | |||
| 246 | $ST $lo1,($tp) | ||
| 247 | |||
| 248 | $ADDU $hi1,$hi0 | ||
| 249 | sltu $at,$hi1,$hi0 | ||
| 250 | $ST $hi1,$BNSZ($tp) | ||
| 251 | $ST $at,2*$BNSZ($tp) | ||
| 252 | |||
| 253 | li $i,$BNSZ | ||
| 254 | .align 4 | ||
| 255 | .Louter: | ||
| 256 | $PTR_ADD $bi,$bp,$i | ||
| 257 | $LD $bi,($bi) | ||
| 258 | $LD $aj,($ap) | ||
| 259 | $LD $alo,$BNSZ($ap) | ||
| 260 | $LD $tj,($sp) | ||
| 261 | |||
| 262 | $MULTU $aj,$bi | ||
| 263 | $LD $nj,($np) | ||
| 264 | $LD $nlo,$BNSZ($np) | ||
| 265 | mflo $lo0 | ||
| 266 | mfhi $hi0 | ||
| 267 | $ADDU $lo0,$tj | ||
| 268 | $MULTU $lo0,$n0 | ||
| 269 | sltu $at,$lo0,$tj | ||
| 270 | $ADDU $hi0,$at | ||
| 271 | mflo $m1 | ||
| 272 | |||
| 273 | $MULTU $alo,$bi | ||
| 274 | mflo $alo | ||
| 275 | mfhi $ahi | ||
| 276 | |||
| 277 | $MULTU $nj,$m1 | ||
| 278 | mflo $lo1 | ||
| 279 | mfhi $hi1 | ||
| 280 | |||
| 281 | $MULTU $nlo,$m1 | ||
| 282 | $ADDU $lo1,$lo0 | ||
| 283 | sltu $at,$lo1,$lo0 | ||
| 284 | $ADDU $hi1,$at | ||
| 285 | mflo $nlo | ||
| 286 | mfhi $nhi | ||
| 287 | |||
| 288 | move $tp,$sp | ||
| 289 | li $j,2*$BNSZ | ||
| 290 | $LD $tj,$BNSZ($tp) | ||
| 291 | .align 4 | ||
| 292 | .Linner: | ||
| 293 | .set noreorder | ||
| 294 | $PTR_ADD $aj,$ap,$j | ||
| 295 | $PTR_ADD $nj,$np,$j | ||
| 296 | $LD $aj,($aj) | ||
| 297 | $LD $nj,($nj) | ||
| 298 | |||
| 299 | $MULTU $aj,$bi | ||
| 300 | $ADDU $lo0,$alo,$hi0 | ||
| 301 | $ADDU $lo1,$nlo,$hi1 | ||
| 302 | sltu $at,$lo0,$hi0 | ||
| 303 | sltu $t0,$lo1,$hi1 | ||
| 304 | $ADDU $hi0,$ahi,$at | ||
| 305 | $ADDU $hi1,$nhi,$t0 | ||
| 306 | mflo $alo | ||
| 307 | mfhi $ahi | ||
| 308 | |||
| 309 | $ADDU $lo0,$tj | ||
| 310 | addu $j,$BNSZ | ||
| 311 | $MULTU $nj,$m1 | ||
| 312 | sltu $at,$lo0,$tj | ||
| 313 | $ADDU $lo1,$lo0 | ||
| 314 | $ADDU $hi0,$at | ||
| 315 | sltu $t0,$lo1,$lo0 | ||
| 316 | $LD $tj,2*$BNSZ($tp) | ||
| 317 | $ADDU $hi1,$t0 | ||
| 318 | sltu $at,$j,$num | ||
| 319 | mflo $nlo | ||
| 320 | mfhi $nhi | ||
| 321 | $ST $lo1,($tp) | ||
| 322 | bnez $at,.Linner | ||
| 323 | $PTR_ADD $tp,$BNSZ | ||
| 324 | .set reorder | ||
| 325 | |||
| 326 | $ADDU $lo0,$alo,$hi0 | ||
| 327 | sltu $at,$lo0,$hi0 | ||
| 328 | $ADDU $hi0,$ahi,$at | ||
| 329 | $ADDU $lo0,$tj | ||
| 330 | sltu $t0,$lo0,$tj | ||
| 331 | $ADDU $hi0,$t0 | ||
| 332 | |||
| 333 | $LD $tj,2*$BNSZ($tp) | ||
| 334 | $ADDU $lo1,$nlo,$hi1 | ||
| 335 | sltu $at,$lo1,$hi1 | ||
| 336 | $ADDU $hi1,$nhi,$at | ||
| 337 | $ADDU $lo1,$lo0 | ||
| 338 | sltu $t0,$lo1,$lo0 | ||
| 339 | $ADDU $hi1,$t0 | ||
| 340 | $ST $lo1,($tp) | ||
| 341 | |||
| 342 | $ADDU $lo1,$hi1,$hi0 | ||
| 343 | sltu $hi1,$lo1,$hi0 | ||
| 344 | $ADDU $lo1,$tj | ||
| 345 | sltu $at,$lo1,$tj | ||
| 346 | $ADDU $hi1,$at | ||
| 347 | $ST $lo1,$BNSZ($tp) | ||
| 348 | $ST $hi1,2*$BNSZ($tp) | ||
| 349 | |||
| 350 | addu $i,$BNSZ | ||
| 351 | sltu $t0,$i,$num | ||
| 352 | bnez $t0,.Louter | ||
| 353 | |||
| 354 | .set noreorder | ||
| 355 | $PTR_ADD $tj,$sp,$num # &tp[num] | ||
| 356 | move $tp,$sp | ||
| 357 | move $ap,$sp | ||
| 358 | li $hi0,0 # clear borrow bit | ||
| 359 | |||
| 360 | .align 4 | ||
| 361 | .Lsub: $LD $lo0,($tp) | ||
| 362 | $LD $lo1,($np) | ||
| 363 | $PTR_ADD $tp,$BNSZ | ||
| 364 | $PTR_ADD $np,$BNSZ | ||
| 365 | $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i] | ||
| 366 | sgtu $at,$lo1,$lo0 | ||
| 367 | $SUBU $lo0,$lo1,$hi0 | ||
| 368 | sgtu $hi0,$lo0,$lo1 | ||
| 369 | $ST $lo0,($rp) | ||
| 370 | or $hi0,$at | ||
| 371 | sltu $at,$tp,$tj | ||
| 372 | bnez $at,.Lsub | ||
| 373 | $PTR_ADD $rp,$BNSZ | ||
| 374 | |||
| 375 | $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit | ||
| 376 | move $tp,$sp | ||
| 377 | $PTR_SUB $rp,$num # restore rp | ||
| 378 | not $hi1,$hi0 | ||
| 379 | |||
| 380 | and $ap,$hi0,$sp | ||
| 381 | and $bp,$hi1,$rp | ||
| 382 | or $ap,$ap,$bp # ap=borrow?tp:rp | ||
| 383 | |||
| 384 | .align 4 | ||
| 385 | .Lcopy: $LD $aj,($ap) | ||
| 386 | $PTR_ADD $ap,$BNSZ | ||
| 387 | $ST $zero,($tp) | ||
| 388 | $PTR_ADD $tp,$BNSZ | ||
| 389 | sltu $at,$tp,$tj | ||
| 390 | $ST $aj,($rp) | ||
| 391 | bnez $at,.Lcopy | ||
| 392 | $PTR_ADD $rp,$BNSZ | ||
| 393 | |||
| 394 | li $a0,1 | ||
| 395 | li $t0,1 | ||
| 396 | |||
| 397 | .set noreorder | ||
| 398 | move $sp,$fp | ||
| 399 | $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp) | ||
| 400 | $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp) | ||
| 401 | $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp) | ||
| 402 | $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp) | ||
| 403 | $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp) | ||
| 404 | $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp) | ||
| 405 | $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp) | ||
| 406 | $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp) | ||
| 407 | $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp) | ||
| 408 | ___ | ||
| 409 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 410 | $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp) | ||
| 411 | $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp) | ||
| 412 | $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp) | ||
| 413 | $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp) | ||
| 414 | ___ | ||
| 415 | $code.=<<___; | ||
| 416 | jr $ra | ||
| 417 | $PTR_ADD $sp,$FRAMESIZE*$SZREG | ||
| 418 | .end bn_mul_mont_internal | ||
| 419 | .rdata | ||
| 420 | .asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 421 | ___ | ||
| 422 | |||
| 423 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 424 | |||
| 425 | print $code; | ||
| 426 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/mips.pl b/src/lib/libcrypto/bn/asm/mips.pl new file mode 100644 index 0000000000..c162a3ec23 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/mips.pl | |||
| @@ -0,0 +1,2585 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. | ||
| 6 | # | ||
| 7 | # Rights for redistribution and usage in source and binary forms are | ||
| 8 | # granted according to the OpenSSL license. Warranty of any kind is | ||
| 9 | # disclaimed. | ||
| 10 | # ==================================================================== | ||
| 11 | |||
| 12 | |||
| 13 | # July 1999 | ||
| 14 | # | ||
| 15 | # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. | ||
| 16 | # | ||
| 17 | # The module is designed to work with either of the "new" MIPS ABI(5), | ||
| 18 | # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under | ||
| 19 | # IRIX 5.x not only because it doesn't support new ABIs but also | ||
| 20 | # because 5.x kernels put R4x00 CPU into 32-bit mode and all those | ||
| 21 | # 64-bit instructions (daddu, dmultu, etc.) found below gonna only | ||
| 22 | # cause illegal instruction exception:-( | ||
| 23 | # | ||
| 24 | # In addition the code depends on preprocessor flags set up by MIPSpro | ||
| 25 | # compiler driver (either as or cc) and therefore (probably?) can't be | ||
| 26 | # compiled by the GNU assembler. GNU C driver manages fine though... | ||
| 27 | # I mean as long as -mmips-as is specified or is the default option, | ||
| 28 | # because then it simply invokes /usr/bin/as which in turn takes | ||
| 29 | # perfect care of the preprocessor definitions. Another neat feature | ||
| 30 | # offered by the MIPSpro assembler is an optimization pass. This gave | ||
| 31 | # me the opportunity to have the code looking more regular as all those | ||
| 32 | # architecture dependent instruction rescheduling details were left to | ||
| 33 | # the assembler. Cool, huh? | ||
| 34 | # | ||
| 35 | # Performance improvement is astonishing! 'apps/openssl speed rsa dsa' | ||
| 36 | # goes way over 3 times faster! | ||
| 37 | # | ||
| 38 | # <appro@fy.chalmers.se> | ||
| 39 | |||
| 40 | # October 2010 | ||
| 41 | # | ||
| 42 | # Adapt the module even for 32-bit ABIs and other OSes. The former was | ||
| 43 | # achieved by mechanical replacement of 64-bit arithmetic instructions | ||
| 44 | # such as dmultu, daddu, etc. with their 32-bit counterparts and | ||
| 45 | # adjusting offsets denoting multiples of BN_ULONG. Above mentioned | ||
| 46 | # >3x performance improvement naturally does not apply to 32-bit code | ||
| 47 | # [because there is no instruction 32-bit compiler can't use], one | ||
| 48 | # has to be content with 40-85% improvement depending on benchmark and | ||
| 49 | # key length, more for longer keys. | ||
| 50 | |||
| 51 | $flavour = shift; | ||
| 52 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 53 | open STDOUT,">$output"; | ||
| 54 | |||
| 55 | if ($flavour =~ /64|n32/i) { | ||
| 56 | $LD="ld"; | ||
| 57 | $ST="sd"; | ||
| 58 | $MULTU="dmultu"; | ||
| 59 | $DIVU="ddivu"; | ||
| 60 | $ADDU="daddu"; | ||
| 61 | $SUBU="dsubu"; | ||
| 62 | $SRL="dsrl"; | ||
| 63 | $SLL="dsll"; | ||
| 64 | $BNSZ=8; | ||
| 65 | $PTR_ADD="daddu"; | ||
| 66 | $PTR_SUB="dsubu"; | ||
| 67 | $SZREG=8; | ||
| 68 | $REG_S="sd"; | ||
| 69 | $REG_L="ld"; | ||
| 70 | } else { | ||
| 71 | $LD="lw"; | ||
| 72 | $ST="sw"; | ||
| 73 | $MULTU="multu"; | ||
| 74 | $DIVU="divu"; | ||
| 75 | $ADDU="addu"; | ||
| 76 | $SUBU="subu"; | ||
| 77 | $SRL="srl"; | ||
| 78 | $SLL="sll"; | ||
| 79 | $BNSZ=4; | ||
| 80 | $PTR_ADD="addu"; | ||
| 81 | $PTR_SUB="subu"; | ||
| 82 | $SZREG=4; | ||
| 83 | $REG_S="sw"; | ||
| 84 | $REG_L="lw"; | ||
| 85 | $code=".set mips2\n"; | ||
| 86 | } | ||
| 87 | |||
| 88 | # Below is N32/64 register layout used in the original module. | ||
| 89 | # | ||
| 90 | ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
| 91 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 92 | ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
| 93 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
| 94 | ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
| 95 | ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); | ||
| 96 | # | ||
| 97 | # No special adaptation is required for O32. NUBI on the other hand | ||
| 98 | # is treated by saving/restoring ($v1,$t0..$t3). | ||
| 99 | |||
| 100 | $gp=$v1 if ($flavour =~ /nubi/i); | ||
| 101 | |||
| 102 | $minus4=$v1; | ||
| 103 | |||
| 104 | $code.=<<___; | ||
| 105 | .rdata | ||
| 106 | .asciiz "mips3.s, Version 1.2" | ||
| 107 | .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" | ||
| 108 | |||
| 109 | .text | ||
| 110 | .set noat | ||
| 111 | |||
| 112 | .align 5 | ||
| 113 | .globl bn_mul_add_words | ||
| 114 | .ent bn_mul_add_words | ||
| 115 | bn_mul_add_words: | ||
| 116 | .set noreorder | ||
| 117 | bgtz $a2,bn_mul_add_words_internal | ||
| 118 | move $v0,$zero | ||
| 119 | jr $ra | ||
| 120 | move $a0,$v0 | ||
| 121 | .end bn_mul_add_words | ||
| 122 | |||
| 123 | .align 5 | ||
| 124 | .ent bn_mul_add_words_internal | ||
| 125 | bn_mul_add_words_internal: | ||
| 126 | ___ | ||
| 127 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 128 | .frame $sp,6*$SZREG,$ra | ||
| 129 | .mask 0x8000f008,-$SZREG | ||
| 130 | .set noreorder | ||
| 131 | $PTR_SUB $sp,6*$SZREG | ||
| 132 | $REG_S $ra,5*$SZREG($sp) | ||
| 133 | $REG_S $t3,4*$SZREG($sp) | ||
| 134 | $REG_S $t2,3*$SZREG($sp) | ||
| 135 | $REG_S $t1,2*$SZREG($sp) | ||
| 136 | $REG_S $t0,1*$SZREG($sp) | ||
| 137 | $REG_S $gp,0*$SZREG($sp) | ||
| 138 | ___ | ||
| 139 | $code.=<<___; | ||
| 140 | .set reorder | ||
| 141 | li $minus4,-4 | ||
| 142 | and $ta0,$a2,$minus4 | ||
| 143 | $LD $t0,0($a1) | ||
| 144 | beqz $ta0,.L_bn_mul_add_words_tail | ||
| 145 | |||
| 146 | .L_bn_mul_add_words_loop: | ||
| 147 | $MULTU $t0,$a3 | ||
| 148 | $LD $t1,0($a0) | ||
| 149 | $LD $t2,$BNSZ($a1) | ||
| 150 | $LD $t3,$BNSZ($a0) | ||
| 151 | $LD $ta0,2*$BNSZ($a1) | ||
| 152 | $LD $ta1,2*$BNSZ($a0) | ||
| 153 | $ADDU $t1,$v0 | ||
| 154 | sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit | ||
| 155 | # values", but it seems to work fine | ||
| 156 | # even on 64-bit registers. | ||
| 157 | mflo $at | ||
| 158 | mfhi $t0 | ||
| 159 | $ADDU $t1,$at | ||
| 160 | $ADDU $v0,$t0 | ||
| 161 | $MULTU $t2,$a3 | ||
| 162 | sltu $at,$t1,$at | ||
| 163 | $ST $t1,0($a0) | ||
| 164 | $ADDU $v0,$at | ||
| 165 | |||
| 166 | $LD $ta2,3*$BNSZ($a1) | ||
| 167 | $LD $ta3,3*$BNSZ($a0) | ||
| 168 | $ADDU $t3,$v0 | ||
| 169 | sltu $v0,$t3,$v0 | ||
| 170 | mflo $at | ||
| 171 | mfhi $t2 | ||
| 172 | $ADDU $t3,$at | ||
| 173 | $ADDU $v0,$t2 | ||
| 174 | $MULTU $ta0,$a3 | ||
| 175 | sltu $at,$t3,$at | ||
| 176 | $ST $t3,$BNSZ($a0) | ||
| 177 | $ADDU $v0,$at | ||
| 178 | |||
| 179 | subu $a2,4 | ||
| 180 | $PTR_ADD $a0,4*$BNSZ | ||
| 181 | $PTR_ADD $a1,4*$BNSZ | ||
| 182 | $ADDU $ta1,$v0 | ||
| 183 | sltu $v0,$ta1,$v0 | ||
| 184 | mflo $at | ||
| 185 | mfhi $ta0 | ||
| 186 | $ADDU $ta1,$at | ||
| 187 | $ADDU $v0,$ta0 | ||
| 188 | $MULTU $ta2,$a3 | ||
| 189 | sltu $at,$ta1,$at | ||
| 190 | $ST $ta1,-2*$BNSZ($a0) | ||
| 191 | $ADDU $v0,$at | ||
| 192 | |||
| 193 | |||
| 194 | and $ta0,$a2,$minus4 | ||
| 195 | $ADDU $ta3,$v0 | ||
| 196 | sltu $v0,$ta3,$v0 | ||
| 197 | mflo $at | ||
| 198 | mfhi $ta2 | ||
| 199 | $ADDU $ta3,$at | ||
| 200 | $ADDU $v0,$ta2 | ||
| 201 | sltu $at,$ta3,$at | ||
| 202 | $ST $ta3,-$BNSZ($a0) | ||
| 203 | $ADDU $v0,$at | ||
| 204 | .set noreorder | ||
| 205 | bgtzl $ta0,.L_bn_mul_add_words_loop | ||
| 206 | $LD $t0,0($a1) | ||
| 207 | |||
| 208 | beqz $a2,.L_bn_mul_add_words_return | ||
| 209 | nop | ||
| 210 | |||
| 211 | .L_bn_mul_add_words_tail: | ||
| 212 | .set reorder | ||
| 213 | $LD $t0,0($a1) | ||
| 214 | $MULTU $t0,$a3 | ||
| 215 | $LD $t1,0($a0) | ||
| 216 | subu $a2,1 | ||
| 217 | $ADDU $t1,$v0 | ||
| 218 | sltu $v0,$t1,$v0 | ||
| 219 | mflo $at | ||
| 220 | mfhi $t0 | ||
| 221 | $ADDU $t1,$at | ||
| 222 | $ADDU $v0,$t0 | ||
| 223 | sltu $at,$t1,$at | ||
| 224 | $ST $t1,0($a0) | ||
| 225 | $ADDU $v0,$at | ||
| 226 | beqz $a2,.L_bn_mul_add_words_return | ||
| 227 | |||
| 228 | $LD $t0,$BNSZ($a1) | ||
| 229 | $MULTU $t0,$a3 | ||
| 230 | $LD $t1,$BNSZ($a0) | ||
| 231 | subu $a2,1 | ||
| 232 | $ADDU $t1,$v0 | ||
| 233 | sltu $v0,$t1,$v0 | ||
| 234 | mflo $at | ||
| 235 | mfhi $t0 | ||
| 236 | $ADDU $t1,$at | ||
| 237 | $ADDU $v0,$t0 | ||
| 238 | sltu $at,$t1,$at | ||
| 239 | $ST $t1,$BNSZ($a0) | ||
| 240 | $ADDU $v0,$at | ||
| 241 | beqz $a2,.L_bn_mul_add_words_return | ||
| 242 | |||
| 243 | $LD $t0,2*$BNSZ($a1) | ||
| 244 | $MULTU $t0,$a3 | ||
| 245 | $LD $t1,2*$BNSZ($a0) | ||
| 246 | $ADDU $t1,$v0 | ||
| 247 | sltu $v0,$t1,$v0 | ||
| 248 | mflo $at | ||
| 249 | mfhi $t0 | ||
| 250 | $ADDU $t1,$at | ||
| 251 | $ADDU $v0,$t0 | ||
| 252 | sltu $at,$t1,$at | ||
| 253 | $ST $t1,2*$BNSZ($a0) | ||
| 254 | $ADDU $v0,$at | ||
| 255 | |||
| 256 | .L_bn_mul_add_words_return: | ||
| 257 | .set noreorder | ||
| 258 | ___ | ||
| 259 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 260 | $REG_L $t3,4*$SZREG($sp) | ||
| 261 | $REG_L $t2,3*$SZREG($sp) | ||
| 262 | $REG_L $t1,2*$SZREG($sp) | ||
| 263 | $REG_L $t0,1*$SZREG($sp) | ||
| 264 | $REG_L $gp,0*$SZREG($sp) | ||
| 265 | $PTR_ADD $sp,6*$SZREG | ||
| 266 | ___ | ||
| 267 | $code.=<<___; | ||
| 268 | jr $ra | ||
| 269 | move $a0,$v0 | ||
| 270 | .end bn_mul_add_words_internal | ||
| 271 | |||
| 272 | .align 5 | ||
| 273 | .globl bn_mul_words | ||
| 274 | .ent bn_mul_words | ||
| 275 | bn_mul_words: | ||
| 276 | .set noreorder | ||
| 277 | bgtz $a2,bn_mul_words_internal | ||
| 278 | move $v0,$zero | ||
| 279 | jr $ra | ||
| 280 | move $a0,$v0 | ||
| 281 | .end bn_mul_words | ||
| 282 | |||
| 283 | .align 5 | ||
| 284 | .ent bn_mul_words_internal | ||
| 285 | bn_mul_words_internal: | ||
| 286 | ___ | ||
| 287 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 288 | .frame $sp,6*$SZREG,$ra | ||
| 289 | .mask 0x8000f008,-$SZREG | ||
| 290 | .set noreorder | ||
| 291 | $PTR_SUB $sp,6*$SZREG | ||
| 292 | $REG_S $ra,5*$SZREG($sp) | ||
| 293 | $REG_S $t3,4*$SZREG($sp) | ||
| 294 | $REG_S $t2,3*$SZREG($sp) | ||
| 295 | $REG_S $t1,2*$SZREG($sp) | ||
| 296 | $REG_S $t0,1*$SZREG($sp) | ||
| 297 | $REG_S $gp,0*$SZREG($sp) | ||
| 298 | ___ | ||
| 299 | $code.=<<___; | ||
| 300 | .set reorder | ||
| 301 | li $minus4,-4 | ||
| 302 | and $ta0,$a2,$minus4 | ||
| 303 | $LD $t0,0($a1) | ||
| 304 | beqz $ta0,.L_bn_mul_words_tail | ||
| 305 | |||
| 306 | .L_bn_mul_words_loop: | ||
| 307 | $MULTU $t0,$a3 | ||
| 308 | $LD $t2,$BNSZ($a1) | ||
| 309 | $LD $ta0,2*$BNSZ($a1) | ||
| 310 | $LD $ta2,3*$BNSZ($a1) | ||
| 311 | mflo $at | ||
| 312 | mfhi $t0 | ||
| 313 | $ADDU $v0,$at | ||
| 314 | sltu $t1,$v0,$at | ||
| 315 | $MULTU $t2,$a3 | ||
| 316 | $ST $v0,0($a0) | ||
| 317 | $ADDU $v0,$t1,$t0 | ||
| 318 | |||
| 319 | subu $a2,4 | ||
| 320 | $PTR_ADD $a0,4*$BNSZ | ||
| 321 | $PTR_ADD $a1,4*$BNSZ | ||
| 322 | mflo $at | ||
| 323 | mfhi $t2 | ||
| 324 | $ADDU $v0,$at | ||
| 325 | sltu $t3,$v0,$at | ||
| 326 | $MULTU $ta0,$a3 | ||
| 327 | $ST $v0,-3*$BNSZ($a0) | ||
| 328 | $ADDU $v0,$t3,$t2 | ||
| 329 | |||
| 330 | mflo $at | ||
| 331 | mfhi $ta0 | ||
| 332 | $ADDU $v0,$at | ||
| 333 | sltu $ta1,$v0,$at | ||
| 334 | $MULTU $ta2,$a3 | ||
| 335 | $ST $v0,-2*$BNSZ($a0) | ||
| 336 | $ADDU $v0,$ta1,$ta0 | ||
| 337 | |||
| 338 | and $ta0,$a2,$minus4 | ||
| 339 | mflo $at | ||
| 340 | mfhi $ta2 | ||
| 341 | $ADDU $v0,$at | ||
| 342 | sltu $ta3,$v0,$at | ||
| 343 | $ST $v0,-$BNSZ($a0) | ||
| 344 | $ADDU $v0,$ta3,$ta2 | ||
| 345 | .set noreorder | ||
| 346 | bgtzl $ta0,.L_bn_mul_words_loop | ||
| 347 | $LD $t0,0($a1) | ||
| 348 | |||
| 349 | beqz $a2,.L_bn_mul_words_return | ||
| 350 | nop | ||
| 351 | |||
| 352 | .L_bn_mul_words_tail: | ||
| 353 | .set reorder | ||
| 354 | $LD $t0,0($a1) | ||
| 355 | $MULTU $t0,$a3 | ||
| 356 | subu $a2,1 | ||
| 357 | mflo $at | ||
| 358 | mfhi $t0 | ||
| 359 | $ADDU $v0,$at | ||
| 360 | sltu $t1,$v0,$at | ||
| 361 | $ST $v0,0($a0) | ||
| 362 | $ADDU $v0,$t1,$t0 | ||
| 363 | beqz $a2,.L_bn_mul_words_return | ||
| 364 | |||
| 365 | $LD $t0,$BNSZ($a1) | ||
| 366 | $MULTU $t0,$a3 | ||
| 367 | subu $a2,1 | ||
| 368 | mflo $at | ||
| 369 | mfhi $t0 | ||
| 370 | $ADDU $v0,$at | ||
| 371 | sltu $t1,$v0,$at | ||
| 372 | $ST $v0,$BNSZ($a0) | ||
| 373 | $ADDU $v0,$t1,$t0 | ||
| 374 | beqz $a2,.L_bn_mul_words_return | ||
| 375 | |||
| 376 | $LD $t0,2*$BNSZ($a1) | ||
| 377 | $MULTU $t0,$a3 | ||
| 378 | mflo $at | ||
| 379 | mfhi $t0 | ||
| 380 | $ADDU $v0,$at | ||
| 381 | sltu $t1,$v0,$at | ||
| 382 | $ST $v0,2*$BNSZ($a0) | ||
| 383 | $ADDU $v0,$t1,$t0 | ||
| 384 | |||
| 385 | .L_bn_mul_words_return: | ||
| 386 | .set noreorder | ||
| 387 | ___ | ||
| 388 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 389 | $REG_L $t3,4*$SZREG($sp) | ||
| 390 | $REG_L $t2,3*$SZREG($sp) | ||
| 391 | $REG_L $t1,2*$SZREG($sp) | ||
| 392 | $REG_L $t0,1*$SZREG($sp) | ||
| 393 | $REG_L $gp,0*$SZREG($sp) | ||
| 394 | $PTR_ADD $sp,6*$SZREG | ||
| 395 | ___ | ||
| 396 | $code.=<<___; | ||
| 397 | jr $ra | ||
| 398 | move $a0,$v0 | ||
| 399 | .end bn_mul_words_internal | ||
| 400 | |||
| 401 | .align 5 | ||
| 402 | .globl bn_sqr_words | ||
| 403 | .ent bn_sqr_words | ||
| 404 | bn_sqr_words: | ||
| 405 | .set noreorder | ||
| 406 | bgtz $a2,bn_sqr_words_internal | ||
| 407 | move $v0,$zero | ||
| 408 | jr $ra | ||
| 409 | move $a0,$v0 | ||
| 410 | .end bn_sqr_words | ||
| 411 | |||
| 412 | .align 5 | ||
| 413 | .ent bn_sqr_words_internal | ||
| 414 | bn_sqr_words_internal: | ||
| 415 | ___ | ||
| 416 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 417 | .frame $sp,6*$SZREG,$ra | ||
| 418 | .mask 0x8000f008,-$SZREG | ||
| 419 | .set noreorder | ||
| 420 | $PTR_SUB $sp,6*$SZREG | ||
| 421 | $REG_S $ra,5*$SZREG($sp) | ||
| 422 | $REG_S $t3,4*$SZREG($sp) | ||
| 423 | $REG_S $t2,3*$SZREG($sp) | ||
| 424 | $REG_S $t1,2*$SZREG($sp) | ||
| 425 | $REG_S $t0,1*$SZREG($sp) | ||
| 426 | $REG_S $gp,0*$SZREG($sp) | ||
| 427 | ___ | ||
| 428 | $code.=<<___; | ||
| 429 | .set reorder | ||
| 430 | li $minus4,-4 | ||
| 431 | and $ta0,$a2,$minus4 | ||
| 432 | $LD $t0,0($a1) | ||
| 433 | beqz $ta0,.L_bn_sqr_words_tail | ||
| 434 | |||
| 435 | .L_bn_sqr_words_loop: | ||
| 436 | $MULTU $t0,$t0 | ||
| 437 | $LD $t2,$BNSZ($a1) | ||
| 438 | $LD $ta0,2*$BNSZ($a1) | ||
| 439 | $LD $ta2,3*$BNSZ($a1) | ||
| 440 | mflo $t1 | ||
| 441 | mfhi $t0 | ||
| 442 | $ST $t1,0($a0) | ||
| 443 | $ST $t0,$BNSZ($a0) | ||
| 444 | |||
| 445 | $MULTU $t2,$t2 | ||
| 446 | subu $a2,4 | ||
| 447 | $PTR_ADD $a0,8*$BNSZ | ||
| 448 | $PTR_ADD $a1,4*$BNSZ | ||
| 449 | mflo $t3 | ||
| 450 | mfhi $t2 | ||
| 451 | $ST $t3,-6*$BNSZ($a0) | ||
| 452 | $ST $t2,-5*$BNSZ($a0) | ||
| 453 | |||
| 454 | $MULTU $ta0,$ta0 | ||
| 455 | mflo $ta1 | ||
| 456 | mfhi $ta0 | ||
| 457 | $ST $ta1,-4*$BNSZ($a0) | ||
| 458 | $ST $ta0,-3*$BNSZ($a0) | ||
| 459 | |||
| 460 | |||
| 461 | $MULTU $ta2,$ta2 | ||
| 462 | and $ta0,$a2,$minus4 | ||
| 463 | mflo $ta3 | ||
| 464 | mfhi $ta2 | ||
| 465 | $ST $ta3,-2*$BNSZ($a0) | ||
| 466 | $ST $ta2,-$BNSZ($a0) | ||
| 467 | |||
| 468 | .set noreorder | ||
| 469 | bgtzl $ta0,.L_bn_sqr_words_loop | ||
| 470 | $LD $t0,0($a1) | ||
| 471 | |||
| 472 | beqz $a2,.L_bn_sqr_words_return | ||
| 473 | nop | ||
| 474 | |||
| 475 | .L_bn_sqr_words_tail: | ||
| 476 | .set reorder | ||
| 477 | $LD $t0,0($a1) | ||
| 478 | $MULTU $t0,$t0 | ||
| 479 | subu $a2,1 | ||
| 480 | mflo $t1 | ||
| 481 | mfhi $t0 | ||
| 482 | $ST $t1,0($a0) | ||
| 483 | $ST $t0,$BNSZ($a0) | ||
| 484 | beqz $a2,.L_bn_sqr_words_return | ||
| 485 | |||
| 486 | $LD $t0,$BNSZ($a1) | ||
| 487 | $MULTU $t0,$t0 | ||
| 488 | subu $a2,1 | ||
| 489 | mflo $t1 | ||
| 490 | mfhi $t0 | ||
| 491 | $ST $t1,2*$BNSZ($a0) | ||
| 492 | $ST $t0,3*$BNSZ($a0) | ||
| 493 | beqz $a2,.L_bn_sqr_words_return | ||
| 494 | |||
| 495 | $LD $t0,2*$BNSZ($a1) | ||
| 496 | $MULTU $t0,$t0 | ||
| 497 | mflo $t1 | ||
| 498 | mfhi $t0 | ||
| 499 | $ST $t1,4*$BNSZ($a0) | ||
| 500 | $ST $t0,5*$BNSZ($a0) | ||
| 501 | |||
| 502 | .L_bn_sqr_words_return: | ||
| 503 | .set noreorder | ||
| 504 | ___ | ||
| 505 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 506 | $REG_L $t3,4*$SZREG($sp) | ||
| 507 | $REG_L $t2,3*$SZREG($sp) | ||
| 508 | $REG_L $t1,2*$SZREG($sp) | ||
| 509 | $REG_L $t0,1*$SZREG($sp) | ||
| 510 | $REG_L $gp,0*$SZREG($sp) | ||
| 511 | $PTR_ADD $sp,6*$SZREG | ||
| 512 | ___ | ||
| 513 | $code.=<<___; | ||
| 514 | jr $ra | ||
| 515 | move $a0,$v0 | ||
| 516 | |||
| 517 | .end bn_sqr_words_internal | ||
| 518 | |||
| 519 | .align 5 | ||
| 520 | .globl bn_add_words | ||
| 521 | .ent bn_add_words | ||
| 522 | bn_add_words: | ||
| 523 | .set noreorder | ||
| 524 | bgtz $a3,bn_add_words_internal | ||
| 525 | move $v0,$zero | ||
| 526 | jr $ra | ||
| 527 | move $a0,$v0 | ||
| 528 | .end bn_add_words | ||
| 529 | |||
| 530 | .align 5 | ||
| 531 | .ent bn_add_words_internal | ||
| 532 | bn_add_words_internal: | ||
| 533 | ___ | ||
| 534 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 535 | .frame $sp,6*$SZREG,$ra | ||
| 536 | .mask 0x8000f008,-$SZREG | ||
| 537 | .set noreorder | ||
| 538 | $PTR_SUB $sp,6*$SZREG | ||
| 539 | $REG_S $ra,5*$SZREG($sp) | ||
| 540 | $REG_S $t3,4*$SZREG($sp) | ||
| 541 | $REG_S $t2,3*$SZREG($sp) | ||
| 542 | $REG_S $t1,2*$SZREG($sp) | ||
| 543 | $REG_S $t0,1*$SZREG($sp) | ||
| 544 | $REG_S $gp,0*$SZREG($sp) | ||
| 545 | ___ | ||
| 546 | $code.=<<___; | ||
| 547 | .set reorder | ||
| 548 | li $minus4,-4 | ||
| 549 | and $at,$a3,$minus4 | ||
| 550 | $LD $t0,0($a1) | ||
| 551 | beqz $at,.L_bn_add_words_tail | ||
| 552 | |||
| 553 | .L_bn_add_words_loop: | ||
| 554 | $LD $ta0,0($a2) | ||
| 555 | subu $a3,4 | ||
| 556 | $LD $t1,$BNSZ($a1) | ||
| 557 | and $at,$a3,$minus4 | ||
| 558 | $LD $t2,2*$BNSZ($a1) | ||
| 559 | $PTR_ADD $a2,4*$BNSZ | ||
| 560 | $LD $t3,3*$BNSZ($a1) | ||
| 561 | $PTR_ADD $a0,4*$BNSZ | ||
| 562 | $LD $ta1,-3*$BNSZ($a2) | ||
| 563 | $PTR_ADD $a1,4*$BNSZ | ||
| 564 | $LD $ta2,-2*$BNSZ($a2) | ||
| 565 | $LD $ta3,-$BNSZ($a2) | ||
| 566 | $ADDU $ta0,$t0 | ||
| 567 | sltu $t8,$ta0,$t0 | ||
| 568 | $ADDU $t0,$ta0,$v0 | ||
| 569 | sltu $v0,$t0,$ta0 | ||
| 570 | $ST $t0,-4*$BNSZ($a0) | ||
| 571 | $ADDU $v0,$t8 | ||
| 572 | |||
| 573 | $ADDU $ta1,$t1 | ||
| 574 | sltu $t9,$ta1,$t1 | ||
| 575 | $ADDU $t1,$ta1,$v0 | ||
| 576 | sltu $v0,$t1,$ta1 | ||
| 577 | $ST $t1,-3*$BNSZ($a0) | ||
| 578 | $ADDU $v0,$t9 | ||
| 579 | |||
| 580 | $ADDU $ta2,$t2 | ||
| 581 | sltu $t8,$ta2,$t2 | ||
| 582 | $ADDU $t2,$ta2,$v0 | ||
| 583 | sltu $v0,$t2,$ta2 | ||
| 584 | $ST $t2,-2*$BNSZ($a0) | ||
| 585 | $ADDU $v0,$t8 | ||
| 586 | |||
| 587 | $ADDU $ta3,$t3 | ||
| 588 | sltu $t9,$ta3,$t3 | ||
| 589 | $ADDU $t3,$ta3,$v0 | ||
| 590 | sltu $v0,$t3,$ta3 | ||
| 591 | $ST $t3,-$BNSZ($a0) | ||
| 592 | $ADDU $v0,$t9 | ||
| 593 | |||
| 594 | .set noreorder | ||
| 595 | bgtzl $at,.L_bn_add_words_loop | ||
| 596 | $LD $t0,0($a1) | ||
| 597 | |||
| 598 | beqz $a3,.L_bn_add_words_return | ||
| 599 | nop | ||
| 600 | |||
| 601 | .L_bn_add_words_tail: | ||
| 602 | .set reorder | ||
| 603 | $LD $t0,0($a1) | ||
| 604 | $LD $ta0,0($a2) | ||
| 605 | $ADDU $ta0,$t0 | ||
| 606 | subu $a3,1 | ||
| 607 | sltu $t8,$ta0,$t0 | ||
| 608 | $ADDU $t0,$ta0,$v0 | ||
| 609 | sltu $v0,$t0,$ta0 | ||
| 610 | $ST $t0,0($a0) | ||
| 611 | $ADDU $v0,$t8 | ||
| 612 | beqz $a3,.L_bn_add_words_return | ||
| 613 | |||
| 614 | $LD $t1,$BNSZ($a1) | ||
| 615 | $LD $ta1,$BNSZ($a2) | ||
| 616 | $ADDU $ta1,$t1 | ||
| 617 | subu $a3,1 | ||
| 618 | sltu $t9,$ta1,$t1 | ||
| 619 | $ADDU $t1,$ta1,$v0 | ||
| 620 | sltu $v0,$t1,$ta1 | ||
| 621 | $ST $t1,$BNSZ($a0) | ||
| 622 | $ADDU $v0,$t9 | ||
| 623 | beqz $a3,.L_bn_add_words_return | ||
| 624 | |||
| 625 | $LD $t2,2*$BNSZ($a1) | ||
| 626 | $LD $ta2,2*$BNSZ($a2) | ||
| 627 | $ADDU $ta2,$t2 | ||
| 628 | sltu $t8,$ta2,$t2 | ||
| 629 | $ADDU $t2,$ta2,$v0 | ||
| 630 | sltu $v0,$t2,$ta2 | ||
| 631 | $ST $t2,2*$BNSZ($a0) | ||
| 632 | $ADDU $v0,$t8 | ||
| 633 | |||
| 634 | .L_bn_add_words_return: | ||
| 635 | .set noreorder | ||
| 636 | ___ | ||
| 637 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 638 | $REG_L $t3,4*$SZREG($sp) | ||
| 639 | $REG_L $t2,3*$SZREG($sp) | ||
| 640 | $REG_L $t1,2*$SZREG($sp) | ||
| 641 | $REG_L $t0,1*$SZREG($sp) | ||
| 642 | $REG_L $gp,0*$SZREG($sp) | ||
| 643 | $PTR_ADD $sp,6*$SZREG | ||
| 644 | ___ | ||
| 645 | $code.=<<___; | ||
| 646 | jr $ra | ||
| 647 | move $a0,$v0 | ||
| 648 | |||
| 649 | .end bn_add_words_internal | ||
| 650 | |||
| 651 | .align 5 | ||
| 652 | .globl bn_sub_words | ||
| 653 | .ent bn_sub_words | ||
| 654 | bn_sub_words: | ||
| 655 | .set noreorder | ||
| 656 | bgtz $a3,bn_sub_words_internal | ||
| 657 | move $v0,$zero | ||
| 658 | jr $ra | ||
| 659 | move $a0,$zero | ||
| 660 | .end bn_sub_words | ||
| 661 | |||
| 662 | .align 5 | ||
| 663 | .ent bn_sub_words_internal | ||
| 664 | bn_sub_words_internal: | ||
| 665 | ___ | ||
| 666 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 667 | .frame $sp,6*$SZREG,$ra | ||
| 668 | .mask 0x8000f008,-$SZREG | ||
| 669 | .set noreorder | ||
| 670 | $PTR_SUB $sp,6*$SZREG | ||
| 671 | $REG_S $ra,5*$SZREG($sp) | ||
| 672 | $REG_S $t3,4*$SZREG($sp) | ||
| 673 | $REG_S $t2,3*$SZREG($sp) | ||
| 674 | $REG_S $t1,2*$SZREG($sp) | ||
| 675 | $REG_S $t0,1*$SZREG($sp) | ||
| 676 | $REG_S $gp,0*$SZREG($sp) | ||
| 677 | ___ | ||
| 678 | $code.=<<___; | ||
| 679 | .set reorder | ||
| 680 | li $minus4,-4 | ||
| 681 | and $at,$a3,$minus4 | ||
| 682 | $LD $t0,0($a1) | ||
| 683 | beqz $at,.L_bn_sub_words_tail | ||
| 684 | |||
| 685 | .L_bn_sub_words_loop: | ||
| 686 | $LD $ta0,0($a2) | ||
| 687 | subu $a3,4 | ||
| 688 | $LD $t1,$BNSZ($a1) | ||
| 689 | and $at,$a3,$minus4 | ||
| 690 | $LD $t2,2*$BNSZ($a1) | ||
| 691 | $PTR_ADD $a2,4*$BNSZ | ||
| 692 | $LD $t3,3*$BNSZ($a1) | ||
| 693 | $PTR_ADD $a0,4*$BNSZ | ||
| 694 | $LD $ta1,-3*$BNSZ($a2) | ||
| 695 | $PTR_ADD $a1,4*$BNSZ | ||
| 696 | $LD $ta2,-2*$BNSZ($a2) | ||
| 697 | $LD $ta3,-$BNSZ($a2) | ||
| 698 | sltu $t8,$t0,$ta0 | ||
| 699 | $SUBU $ta0,$t0,$ta0 | ||
| 700 | $SUBU $t0,$ta0,$v0 | ||
| 701 | sgtu $v0,$t0,$ta0 | ||
| 702 | $ST $t0,-4*$BNSZ($a0) | ||
| 703 | $ADDU $v0,$t8 | ||
| 704 | |||
| 705 | sltu $t9,$t1,$ta1 | ||
| 706 | $SUBU $ta1,$t1,$ta1 | ||
| 707 | $SUBU $t1,$ta1,$v0 | ||
| 708 | sgtu $v0,$t1,$ta1 | ||
| 709 | $ST $t1,-3*$BNSZ($a0) | ||
| 710 | $ADDU $v0,$t9 | ||
| 711 | |||
| 712 | |||
| 713 | sltu $t8,$t2,$ta2 | ||
| 714 | $SUBU $ta2,$t2,$ta2 | ||
| 715 | $SUBU $t2,$ta2,$v0 | ||
| 716 | sgtu $v0,$t2,$ta2 | ||
| 717 | $ST $t2,-2*$BNSZ($a0) | ||
| 718 | $ADDU $v0,$t8 | ||
| 719 | |||
| 720 | sltu $t9,$t3,$ta3 | ||
| 721 | $SUBU $ta3,$t3,$ta3 | ||
| 722 | $SUBU $t3,$ta3,$v0 | ||
| 723 | sgtu $v0,$t3,$ta3 | ||
| 724 | $ST $t3,-$BNSZ($a0) | ||
| 725 | $ADDU $v0,$t9 | ||
| 726 | |||
| 727 | .set noreorder | ||
| 728 | bgtzl $at,.L_bn_sub_words_loop | ||
| 729 | $LD $t0,0($a1) | ||
| 730 | |||
| 731 | beqz $a3,.L_bn_sub_words_return | ||
| 732 | nop | ||
| 733 | |||
| 734 | .L_bn_sub_words_tail: | ||
| 735 | .set reorder | ||
| 736 | $LD $t0,0($a1) | ||
| 737 | $LD $ta0,0($a2) | ||
| 738 | subu $a3,1 | ||
| 739 | sltu $t8,$t0,$ta0 | ||
| 740 | $SUBU $ta0,$t0,$ta0 | ||
| 741 | $SUBU $t0,$ta0,$v0 | ||
| 742 | sgtu $v0,$t0,$ta0 | ||
| 743 | $ST $t0,0($a0) | ||
| 744 | $ADDU $v0,$t8 | ||
| 745 | beqz $a3,.L_bn_sub_words_return | ||
| 746 | |||
| 747 | $LD $t1,$BNSZ($a1) | ||
| 748 | subu $a3,1 | ||
| 749 | $LD $ta1,$BNSZ($a2) | ||
| 750 | sltu $t9,$t1,$ta1 | ||
| 751 | $SUBU $ta1,$t1,$ta1 | ||
| 752 | $SUBU $t1,$ta1,$v0 | ||
| 753 | sgtu $v0,$t1,$ta1 | ||
| 754 | $ST $t1,$BNSZ($a0) | ||
| 755 | $ADDU $v0,$t9 | ||
| 756 | beqz $a3,.L_bn_sub_words_return | ||
| 757 | |||
| 758 | $LD $t2,2*$BNSZ($a1) | ||
| 759 | $LD $ta2,2*$BNSZ($a2) | ||
| 760 | sltu $t8,$t2,$ta2 | ||
| 761 | $SUBU $ta2,$t2,$ta2 | ||
| 762 | $SUBU $t2,$ta2,$v0 | ||
| 763 | sgtu $v0,$t2,$ta2 | ||
| 764 | $ST $t2,2*$BNSZ($a0) | ||
| 765 | $ADDU $v0,$t8 | ||
| 766 | |||
| 767 | .L_bn_sub_words_return: | ||
| 768 | .set noreorder | ||
| 769 | ___ | ||
| 770 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 771 | $REG_L $t3,4*$SZREG($sp) | ||
| 772 | $REG_L $t2,3*$SZREG($sp) | ||
| 773 | $REG_L $t1,2*$SZREG($sp) | ||
| 774 | $REG_L $t0,1*$SZREG($sp) | ||
| 775 | $REG_L $gp,0*$SZREG($sp) | ||
| 776 | $PTR_ADD $sp,6*$SZREG | ||
| 777 | ___ | ||
| 778 | $code.=<<___; | ||
| 779 | jr $ra | ||
| 780 | move $a0,$v0 | ||
| 781 | .end bn_sub_words_internal | ||
| 782 | |||
| 783 | .align 5 | ||
| 784 | .globl bn_div_3_words | ||
| 785 | .ent bn_div_3_words | ||
| 786 | bn_div_3_words: | ||
| 787 | .set noreorder | ||
| 788 | move $a3,$a0 # we know that bn_div_words does not | ||
| 789 | # touch $a3, $ta2, $ta3 and preserves $a2 | ||
| 790 | # so that we can save two arguments | ||
| 791 | # and return address in registers | ||
| 792 | # instead of stack:-) | ||
| 793 | |||
| 794 | $LD $a0,($a3) | ||
| 795 | move $ta2,$a1 | ||
| 796 | bne $a0,$a2,bn_div_3_words_internal | ||
| 797 | $LD $a1,-$BNSZ($a3) | ||
| 798 | li $v0,-1 | ||
| 799 | jr $ra | ||
| 800 | move $a0,$v0 | ||
| 801 | .end bn_div_3_words | ||
| 802 | |||
| 803 | .align 5 | ||
| 804 | .ent bn_div_3_words_internal | ||
| 805 | bn_div_3_words_internal: | ||
| 806 | ___ | ||
| 807 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 808 | .frame $sp,6*$SZREG,$ra | ||
| 809 | .mask 0x8000f008,-$SZREG | ||
| 810 | .set noreorder | ||
| 811 | $PTR_SUB $sp,6*$SZREG | ||
| 812 | $REG_S $ra,5*$SZREG($sp) | ||
| 813 | $REG_S $t3,4*$SZREG($sp) | ||
| 814 | $REG_S $t2,3*$SZREG($sp) | ||
| 815 | $REG_S $t1,2*$SZREG($sp) | ||
| 816 | $REG_S $t0,1*$SZREG($sp) | ||
| 817 | $REG_S $gp,0*$SZREG($sp) | ||
| 818 | ___ | ||
| 819 | $code.=<<___; | ||
| 820 | .set reorder | ||
| 821 | move $ta3,$ra | ||
| 822 | bal bn_div_words | ||
| 823 | move $ra,$ta3 | ||
| 824 | $MULTU $ta2,$v0 | ||
| 825 | $LD $t2,-2*$BNSZ($a3) | ||
| 826 | move $ta0,$zero | ||
| 827 | mfhi $t1 | ||
| 828 | mflo $t0 | ||
| 829 | sltu $t8,$t1,$a1 | ||
| 830 | .L_bn_div_3_words_inner_loop: | ||
| 831 | bnez $t8,.L_bn_div_3_words_inner_loop_done | ||
| 832 | sgeu $at,$t2,$t0 | ||
| 833 | seq $t9,$t1,$a1 | ||
| 834 | and $at,$t9 | ||
| 835 | sltu $t3,$t0,$ta2 | ||
| 836 | $ADDU $a1,$a2 | ||
| 837 | $SUBU $t1,$t3 | ||
| 838 | $SUBU $t0,$ta2 | ||
| 839 | sltu $t8,$t1,$a1 | ||
| 840 | sltu $ta0,$a1,$a2 | ||
| 841 | or $t8,$ta0 | ||
| 842 | .set noreorder | ||
| 843 | beqzl $at,.L_bn_div_3_words_inner_loop | ||
| 844 | $SUBU $v0,1 | ||
| 845 | .set reorder | ||
| 846 | .L_bn_div_3_words_inner_loop_done: | ||
| 847 | .set noreorder | ||
| 848 | ___ | ||
| 849 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 850 | $REG_L $t3,4*$SZREG($sp) | ||
| 851 | $REG_L $t2,3*$SZREG($sp) | ||
| 852 | $REG_L $t1,2*$SZREG($sp) | ||
| 853 | $REG_L $t0,1*$SZREG($sp) | ||
| 854 | $REG_L $gp,0*$SZREG($sp) | ||
| 855 | $PTR_ADD $sp,6*$SZREG | ||
| 856 | ___ | ||
| 857 | $code.=<<___; | ||
| 858 | jr $ra | ||
| 859 | move $a0,$v0 | ||
| 860 | .end bn_div_3_words_internal | ||
| 861 | |||
| 862 | .align 5 | ||
| 863 | .globl bn_div_words | ||
| 864 | .ent bn_div_words | ||
| 865 | bn_div_words: | ||
| 866 | .set noreorder | ||
| 867 | bnez $a2,bn_div_words_internal | ||
| 868 | li $v0,-1 # I would rather signal div-by-zero | ||
| 869 | # which can be done with 'break 7' | ||
| 870 | jr $ra | ||
| 871 | move $a0,$v0 | ||
| 872 | .end bn_div_words | ||
| 873 | |||
| 874 | .align 5 | ||
| 875 | .ent bn_div_words_internal | ||
| 876 | bn_div_words_internal: | ||
| 877 | ___ | ||
| 878 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 879 | .frame $sp,6*$SZREG,$ra | ||
| 880 | .mask 0x8000f008,-$SZREG | ||
| 881 | .set noreorder | ||
| 882 | $PTR_SUB $sp,6*$SZREG | ||
| 883 | $REG_S $ra,5*$SZREG($sp) | ||
| 884 | $REG_S $t3,4*$SZREG($sp) | ||
| 885 | $REG_S $t2,3*$SZREG($sp) | ||
| 886 | $REG_S $t1,2*$SZREG($sp) | ||
| 887 | $REG_S $t0,1*$SZREG($sp) | ||
| 888 | $REG_S $gp,0*$SZREG($sp) | ||
| 889 | ___ | ||
| 890 | $code.=<<___; | ||
| 891 | move $v1,$zero | ||
| 892 | bltz $a2,.L_bn_div_words_body | ||
| 893 | move $t9,$v1 | ||
| 894 | $SLL $a2,1 | ||
| 895 | bgtz $a2,.-4 | ||
| 896 | addu $t9,1 | ||
| 897 | |||
| 898 | .set reorder | ||
| 899 | negu $t1,$t9 | ||
| 900 | li $t2,-1 | ||
| 901 | $SLL $t2,$t1 | ||
| 902 | and $t2,$a0 | ||
| 903 | $SRL $at,$a1,$t1 | ||
| 904 | .set noreorder | ||
| 905 | bnezl $t2,.+8 | ||
| 906 | break 6 # signal overflow | ||
| 907 | .set reorder | ||
| 908 | $SLL $a0,$t9 | ||
| 909 | $SLL $a1,$t9 | ||
| 910 | or $a0,$at | ||
| 911 | ___ | ||
| 912 | $QT=$ta0; | ||
| 913 | $HH=$ta1; | ||
| 914 | $DH=$v1; | ||
| 915 | $code.=<<___; | ||
| 916 | .L_bn_div_words_body: | ||
| 917 | $SRL $DH,$a2,4*$BNSZ # bits | ||
| 918 | sgeu $at,$a0,$a2 | ||
| 919 | .set noreorder | ||
| 920 | bnezl $at,.+8 | ||
| 921 | $SUBU $a0,$a2 | ||
| 922 | .set reorder | ||
| 923 | |||
| 924 | li $QT,-1 | ||
| 925 | $SRL $HH,$a0,4*$BNSZ # bits | ||
| 926 | $SRL $QT,4*$BNSZ # q=0xffffffff | ||
| 927 | beq $DH,$HH,.L_bn_div_words_skip_div1 | ||
| 928 | $DIVU $zero,$a0,$DH | ||
| 929 | mflo $QT | ||
| 930 | .L_bn_div_words_skip_div1: | ||
| 931 | $MULTU $a2,$QT | ||
| 932 | $SLL $t3,$a0,4*$BNSZ # bits | ||
| 933 | $SRL $at,$a1,4*$BNSZ # bits | ||
| 934 | or $t3,$at | ||
| 935 | mflo $t0 | ||
| 936 | mfhi $t1 | ||
| 937 | .L_bn_div_words_inner_loop1: | ||
| 938 | sltu $t2,$t3,$t0 | ||
| 939 | seq $t8,$HH,$t1 | ||
| 940 | sltu $at,$HH,$t1 | ||
| 941 | and $t2,$t8 | ||
| 942 | sltu $v0,$t0,$a2 | ||
| 943 | or $at,$t2 | ||
| 944 | .set noreorder | ||
| 945 | beqz $at,.L_bn_div_words_inner_loop1_done | ||
| 946 | $SUBU $t1,$v0 | ||
| 947 | $SUBU $t0,$a2 | ||
| 948 | b .L_bn_div_words_inner_loop1 | ||
| 949 | $SUBU $QT,1 | ||
| 950 | .set reorder | ||
| 951 | .L_bn_div_words_inner_loop1_done: | ||
| 952 | |||
| 953 | $SLL $a1,4*$BNSZ # bits | ||
| 954 | $SUBU $a0,$t3,$t0 | ||
| 955 | $SLL $v0,$QT,4*$BNSZ # bits | ||
| 956 | |||
| 957 | li $QT,-1 | ||
| 958 | $SRL $HH,$a0,4*$BNSZ # bits | ||
| 959 | $SRL $QT,4*$BNSZ # q=0xffffffff | ||
| 960 | beq $DH,$HH,.L_bn_div_words_skip_div2 | ||
| 961 | $DIVU $zero,$a0,$DH | ||
| 962 | mflo $QT | ||
| 963 | .L_bn_div_words_skip_div2: | ||
| 964 | $MULTU $a2,$QT | ||
| 965 | $SLL $t3,$a0,4*$BNSZ # bits | ||
| 966 | $SRL $at,$a1,4*$BNSZ # bits | ||
| 967 | or $t3,$at | ||
| 968 | mflo $t0 | ||
| 969 | mfhi $t1 | ||
| 970 | .L_bn_div_words_inner_loop2: | ||
| 971 | sltu $t2,$t3,$t0 | ||
| 972 | seq $t8,$HH,$t1 | ||
| 973 | sltu $at,$HH,$t1 | ||
| 974 | and $t2,$t8 | ||
| 975 | sltu $v1,$t0,$a2 | ||
| 976 | or $at,$t2 | ||
| 977 | .set noreorder | ||
| 978 | beqz $at,.L_bn_div_words_inner_loop2_done | ||
| 979 | $SUBU $t1,$v1 | ||
| 980 | $SUBU $t0,$a2 | ||
| 981 | b .L_bn_div_words_inner_loop2 | ||
| 982 | $SUBU $QT,1 | ||
| 983 | .set reorder | ||
| 984 | .L_bn_div_words_inner_loop2_done: | ||
| 985 | |||
| 986 | $SUBU $a0,$t3,$t0 | ||
| 987 | or $v0,$QT | ||
| 988 | $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it | ||
| 989 | $SRL $a2,$t9 # restore $a2 | ||
| 990 | |||
| 991 | .set noreorder | ||
| 992 | move $a1,$v1 | ||
| 993 | ___ | ||
| 994 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 995 | $REG_L $t3,4*$SZREG($sp) | ||
| 996 | $REG_L $t2,3*$SZREG($sp) | ||
| 997 | $REG_L $t1,2*$SZREG($sp) | ||
| 998 | $REG_L $t0,1*$SZREG($sp) | ||
| 999 | $REG_L $gp,0*$SZREG($sp) | ||
| 1000 | $PTR_ADD $sp,6*$SZREG | ||
| 1001 | ___ | ||
| 1002 | $code.=<<___; | ||
| 1003 | jr $ra | ||
| 1004 | move $a0,$v0 | ||
| 1005 | .end bn_div_words_internal | ||
| 1006 | ___ | ||
| 1007 | undef $HH; undef $QT; undef $DH; | ||
| 1008 | |||
# Register map for the comba multiplication/squaring routines below.
# Operand words a[0..7]/b[0..7] and the rotating three-word accumulator
# (c1,c2,c3) are pinned to MIPS registers; $t_1/$t_2 receive the lo/hi
# halves of each product via mflo/mfhi.
$a_0 = $t0;  $a_1 = $t1;  $a_2 = $t2;  $a_3 = $t3;
$b_0 = $ta0; $b_1 = $ta1; $b_2 = $ta2; $b_3 = $ta3;

$a_4 = $s0; $a_5 = $s2; $a_6 = $s4; $a_7 = $a1;	# once we load a[7], no use for $a1
$b_4 = $s1; $b_5 = $s3; $b_6 = $s5; $b_7 = $a2;	# once we load b[7], no use for $a2

$t_1 = $t8; $t_2 = $t9;			# lo/hi product words
$c_1 = $v0; $c_2 = $v1; $c_3 = $a3;	# accumulator columns
# bn_mul_comba8(r,a,b): r[0..15] = a[0..7] * b[0..7], fully unrolled
# column-wise (comba) multiplication.  Each partial product is produced
# by MULTU and folded into the rotating three-word accumulator
# (c1,c2,c3) with explicit sltu-based carry propagation; the next MULTU
# is issued as early as possible to hide multiplier latency.
$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
# "nubi" ABI treats $t0-$t3 as callee-saved, hence the larger frame and
# extra register saves on that flavour.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Standard o32/n32/n64 flavours only need $s0-$s5 saved.
$code.=<<___ if ($flavour !~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x003f0000,-$SZREG
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$s5,5*$SZREG($sp)
	$REG_S	$s4,4*$SZREG($sp)
	$REG_S	$s3,3*$SZREG($sp)
	$REG_S	$s2,2*$SZREG($sp)
	$REG_S	$s1,1*$SZREG($sp)
	$REG_S	$s0,0*$SZREG($sp)
___
# Main body: loads of a[]/b[] are interleaved with the first multiplies.
$code.=<<___;

	.set	reorder
	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
				# R5000 box assembler barks on this
				# line with "should not have mult/div
				# as last instruction in bb (R10K
				# bug)" warning. If anybody out there
				# has a clue about how to circumvent
				# this do send me a note.
				# <appro\@fy.chalmers.se>

	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2

	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	$LD	$b_4,4*$BNSZ($a2)
	$LD	$b_5,5*$BNSZ($a2)
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	$LD	$b_6,6*$BNSZ($a2)
	$LD	$b_7,7*$BNSZ($a2)
	$ST	$c_1,0($a0)	# r[0]=c1;
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
# Epilogue, nubi flavour: restore $s*/$t*/$gp and pop the 12-slot frame
# (frame pop rides in the jr delay slot).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
# Epilogue, standard flavours: only $s0-$s5 were saved.
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# Close bn_mul_comba8, then bn_mul_comba4(r,a,b): r[0..7] = a[0..3] *
# b[0..3], the 4-word comba multiply.  Same sltu-based carry chain as
# bn_mul_comba8 but small enough to need no saved registers on the
# standard ABIs ($a_0..$b_3 live in caller-saved registers).
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
# nubi flavour must still preserve $t0-$t3 (callee-saved there).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# nubi epilogue: restore $t0-$t3/$gp and pop the frame ($ra was saved
# but never clobbered by this leaf routine).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___
| 1874 | |||
# For the squaring routines there is only one input vector, so the
# registers that held b[0..3] are recycled as a[4..7].
$a_4 = $b_0;
$a_5 = $b_1;
$a_6 = $b_2;
$a_7 = $b_3;
| 1876 | |||
| 1877 | $code.=<<___; | ||
| 1878 | |||
| 1879 | .align 5 | ||
| 1880 | .globl bn_sqr_comba8 | ||
| 1881 | .ent bn_sqr_comba8 | ||
| 1882 | bn_sqr_comba8: | ||
| 1883 | ___ | ||
| 1884 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 1885 | .frame $sp,6*$SZREG,$ra | ||
| 1886 | .mask 0x8000f008,-$SZREG | ||
| 1887 | .set noreorder | ||
| 1888 | $PTR_SUB $sp,6*$SZREG | ||
| 1889 | $REG_S $ra,5*$SZREG($sp) | ||
| 1890 | $REG_S $t3,4*$SZREG($sp) | ||
| 1891 | $REG_S $t2,3*$SZREG($sp) | ||
| 1892 | $REG_S $t1,2*$SZREG($sp) | ||
| 1893 | $REG_S $t0,1*$SZREG($sp) | ||
| 1894 | $REG_S $gp,0*$SZREG($sp) | ||
| 1895 | ___ | ||
| 1896 | $code.=<<___; | ||
| 1897 | .set reorder | ||
| 1898 | $LD $a_0,0($a1) | ||
| 1899 | $LD $a_1,$BNSZ($a1) | ||
| 1900 | $LD $a_2,2*$BNSZ($a1) | ||
| 1901 | $LD $a_3,3*$BNSZ($a1) | ||
| 1902 | |||
| 1903 | $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); | ||
| 1904 | $LD $a_4,4*$BNSZ($a1) | ||
| 1905 | $LD $a_5,5*$BNSZ($a1) | ||
| 1906 | $LD $a_6,6*$BNSZ($a1) | ||
| 1907 | $LD $a_7,7*$BNSZ($a1) | ||
| 1908 | mflo $c_1 | ||
| 1909 | mfhi $c_2 | ||
| 1910 | $ST $c_1,0($a0) | ||
| 1911 | |||
| 1912 | $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); | ||
| 1913 | mflo $t_1 | ||
| 1914 | mfhi $t_2 | ||
| 1915 | slt $c_1,$t_2,$zero | ||
| 1916 | $SLL $t_2,1 | ||
| 1917 | $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); | ||
| 1918 | slt $a2,$t_1,$zero | ||
| 1919 | $ADDU $t_2,$a2 | ||
| 1920 | $SLL $t_1,1 | ||
| 1921 | $ADDU $c_2,$t_1 | ||
| 1922 | sltu $at,$c_2,$t_1 | ||
| 1923 | $ADDU $c_3,$t_2,$at | ||
| 1924 | $ST $c_2,$BNSZ($a0) | ||
| 1925 | |||
| 1926 | mflo $t_1 | ||
| 1927 | mfhi $t_2 | ||
| 1928 | slt $c_2,$t_2,$zero | ||
| 1929 | $SLL $t_2,1 | ||
| 1930 | $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); | ||
| 1931 | slt $a2,$t_1,$zero | ||
| 1932 | $ADDU $t_2,$a2 | ||
| 1933 | $SLL $t_1,1 | ||
| 1934 | $ADDU $c_3,$t_1 | ||
| 1935 | sltu $at,$c_3,$t_1 | ||
| 1936 | $ADDU $t_2,$at | ||
| 1937 | $ADDU $c_1,$t_2 | ||
| 1938 | sltu $at,$c_1,$t_2 | ||
| 1939 | $ADDU $c_2,$at | ||
| 1940 | mflo $t_1 | ||
| 1941 | mfhi $t_2 | ||
| 1942 | $ADDU $c_3,$t_1 | ||
| 1943 | sltu $at,$c_3,$t_1 | ||
| 1944 | $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); | ||
| 1945 | $ADDU $t_2,$at | ||
| 1946 | $ADDU $c_1,$t_2 | ||
| 1947 | sltu $at,$c_1,$t_2 | ||
| 1948 | $ADDU $c_2,$at | ||
| 1949 | $ST $c_3,2*$BNSZ($a0) | ||
| 1950 | |||
| 1951 | mflo $t_1 | ||
| 1952 | mfhi $t_2 | ||
| 1953 | slt $c_3,$t_2,$zero | ||
| 1954 | $SLL $t_2,1 | ||
| 1955 | $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); | ||
| 1956 | slt $a2,$t_1,$zero | ||
| 1957 | $ADDU $t_2,$a2 | ||
| 1958 | $SLL $t_1,1 | ||
| 1959 | $ADDU $c_1,$t_1 | ||
| 1960 | sltu $at,$c_1,$t_1 | ||
| 1961 | $ADDU $t_2,$at | ||
| 1962 | $ADDU $c_2,$t_2 | ||
| 1963 | sltu $at,$c_2,$t_2 | ||
| 1964 | $ADDU $c_3,$at | ||
| 1965 | mflo $t_1 | ||
| 1966 | mfhi $t_2 | ||
| 1967 | slt $at,$t_2,$zero | ||
| 1968 | $ADDU $c_3,$at | ||
| 1969 | $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); | ||
| 1970 | $SLL $t_2,1 | ||
| 1971 | slt $a2,$t_1,$zero | ||
| 1972 | $ADDU $t_2,$a2 | ||
| 1973 | $SLL $t_1,1 | ||
| 1974 | $ADDU $c_1,$t_1 | ||
| 1975 | sltu $at,$c_1,$t_1 | ||
| 1976 | $ADDU $t_2,$at | ||
| 1977 | $ADDU $c_2,$t_2 | ||
| 1978 | sltu $at,$c_2,$t_2 | ||
| 1979 | $ADDU $c_3,$at | ||
| 1980 | $ST $c_1,3*$BNSZ($a0) | ||
| 1981 | |||
| 1982 | mflo $t_1 | ||
| 1983 | mfhi $t_2 | ||
| 1984 | slt $c_1,$t_2,$zero | ||
| 1985 | $SLL $t_2,1 | ||
| 1986 | $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); | ||
| 1987 | slt $a2,$t_1,$zero | ||
| 1988 | $ADDU $t_2,$a2 | ||
| 1989 | $SLL $t_1,1 | ||
| 1990 | $ADDU $c_2,$t_1 | ||
| 1991 | sltu $at,$c_2,$t_1 | ||
| 1992 | $ADDU $t_2,$at | ||
| 1993 | $ADDU $c_3,$t_2 | ||
| 1994 | sltu $at,$c_3,$t_2 | ||
| 1995 | $ADDU $c_1,$at | ||
| 1996 | mflo $t_1 | ||
| 1997 | mfhi $t_2 | ||
| 1998 | slt $at,$t_2,$zero | ||
| 1999 | $ADDU $c_1,$at | ||
| 2000 | $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); | ||
| 2001 | $SLL $t_2,1 | ||
| 2002 | slt $a2,$t_1,$zero | ||
| 2003 | $ADDU $t_2,$a2 | ||
| 2004 | $SLL $t_1,1 | ||
| 2005 | $ADDU $c_2,$t_1 | ||
| 2006 | sltu $at,$c_2,$t_1 | ||
| 2007 | $ADDU $t_2,$at | ||
| 2008 | $ADDU $c_3,$t_2 | ||
| 2009 | sltu $at,$c_3,$t_2 | ||
| 2010 | $ADDU $c_1,$at | ||
| 2011 | mflo $t_1 | ||
| 2012 | mfhi $t_2 | ||
| 2013 | $ADDU $c_2,$t_1 | ||
| 2014 | sltu $at,$c_2,$t_1 | ||
| 2015 | $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); | ||
| 2016 | $ADDU $t_2,$at | ||
| 2017 | $ADDU $c_3,$t_2 | ||
| 2018 | sltu $at,$c_3,$t_2 | ||
| 2019 | $ADDU $c_1,$at | ||
| 2020 | $ST $c_2,4*$BNSZ($a0) | ||
| 2021 | |||
| 2022 | mflo $t_1 | ||
| 2023 | mfhi $t_2 | ||
| 2024 | slt $c_2,$t_2,$zero | ||
| 2025 | $SLL $t_2,1 | ||
| 2026 | $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); | ||
| 2027 | slt $a2,$t_1,$zero | ||
| 2028 | $ADDU $t_2,$a2 | ||
| 2029 | $SLL $t_1,1 | ||
| 2030 | $ADDU $c_3,$t_1 | ||
| 2031 | sltu $at,$c_3,$t_1 | ||
| 2032 | $ADDU $t_2,$at | ||
| 2033 | $ADDU $c_1,$t_2 | ||
| 2034 | sltu $at,$c_1,$t_2 | ||
| 2035 | $ADDU $c_2,$at | ||
| 2036 | mflo $t_1 | ||
| 2037 | mfhi $t_2 | ||
| 2038 | slt $at,$t_2,$zero | ||
| 2039 | $ADDU $c_2,$at | ||
| 2040 | $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); | ||
| 2041 | $SLL $t_2,1 | ||
| 2042 | slt $a2,$t_1,$zero | ||
| 2043 | $ADDU $t_2,$a2 | ||
| 2044 | $SLL $t_1,1 | ||
| 2045 | $ADDU $c_3,$t_1 | ||
| 2046 | sltu $at,$c_3,$t_1 | ||
| 2047 | $ADDU $t_2,$at | ||
| 2048 | $ADDU $c_1,$t_2 | ||
| 2049 | sltu $at,$c_1,$t_2 | ||
| 2050 | $ADDU $c_2,$at | ||
| 2051 | mflo $t_1 | ||
| 2052 | mfhi $t_2 | ||
| 2053 | slt $at,$t_2,$zero | ||
| 2054 | $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); | ||
| 2055 | $ADDU $c_2,$at | ||
| 2056 | $SLL $t_2,1 | ||
| 2057 | slt $a2,$t_1,$zero | ||
| 2058 | $ADDU $t_2,$a2 | ||
| 2059 | $SLL $t_1,1 | ||
| 2060 | $ADDU $c_3,$t_1 | ||
| 2061 | sltu $at,$c_3,$t_1 | ||
| 2062 | $ADDU $t_2,$at | ||
| 2063 | $ADDU $c_1,$t_2 | ||
| 2064 | sltu $at,$c_1,$t_2 | ||
| 2065 | $ADDU $c_2,$at | ||
| 2066 | $ST $c_3,5*$BNSZ($a0) | ||
| 2067 | |||
| 2068 | mflo $t_1 | ||
| 2069 | mfhi $t_2 | ||
| 2070 | slt $c_3,$t_2,$zero | ||
| 2071 | $SLL $t_2,1 | ||
| 2072 | $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); | ||
| 2073 | slt $a2,$t_1,$zero | ||
| 2074 | $ADDU $t_2,$a2 | ||
| 2075 | $SLL $t_1,1 | ||
| 2076 | $ADDU $c_1,$t_1 | ||
| 2077 | sltu $at,$c_1,$t_1 | ||
| 2078 | $ADDU $t_2,$at | ||
| 2079 | $ADDU $c_2,$t_2 | ||
| 2080 | sltu $at,$c_2,$t_2 | ||
| 2081 | $ADDU $c_3,$at | ||
| 2082 | mflo $t_1 | ||
| 2083 | mfhi $t_2 | ||
| 2084 | slt $at,$t_2,$zero | ||
| 2085 | $ADDU $c_3,$at | ||
| 2086 | $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); | ||
| 2087 | $SLL $t_2,1 | ||
| 2088 | slt $a2,$t_1,$zero | ||
| 2089 | $ADDU $t_2,$a2 | ||
| 2090 | $SLL $t_1,1 | ||
| 2091 | $ADDU $c_1,$t_1 | ||
| 2092 | sltu $at,$c_1,$t_1 | ||
| 2093 | $ADDU $t_2,$at | ||
| 2094 | $ADDU $c_2,$t_2 | ||
| 2095 | sltu $at,$c_2,$t_2 | ||
| 2096 | $ADDU $c_3,$at | ||
| 2097 | mflo $t_1 | ||
| 2098 | mfhi $t_2 | ||
| 2099 | slt $at,$t_2,$zero | ||
| 2100 | $ADDU $c_3,$at | ||
| 2101 | $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); | ||
| 2102 | $SLL $t_2,1 | ||
| 2103 | slt $a2,$t_1,$zero | ||
| 2104 | $ADDU $t_2,$a2 | ||
| 2105 | $SLL $t_1,1 | ||
| 2106 | $ADDU $c_1,$t_1 | ||
| 2107 | sltu $at,$c_1,$t_1 | ||
| 2108 | $ADDU $t_2,$at | ||
| 2109 | $ADDU $c_2,$t_2 | ||
| 2110 | sltu $at,$c_2,$t_2 | ||
| 2111 | $ADDU $c_3,$at | ||
| 2112 | mflo $t_1 | ||
| 2113 | mfhi $t_2 | ||
| 2114 | $ADDU $c_1,$t_1 | ||
| 2115 | sltu $at,$c_1,$t_1 | ||
| 2116 | $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); | ||
| 2117 | $ADDU $t_2,$at | ||
| 2118 | $ADDU $c_2,$t_2 | ||
| 2119 | sltu $at,$c_2,$t_2 | ||
| 2120 | $ADDU $c_3,$at | ||
| 2121 | $ST $c_1,6*$BNSZ($a0) | ||
| 2122 | |||
| 2123 | mflo $t_1 | ||
| 2124 | mfhi $t_2 | ||
| 2125 | slt $c_1,$t_2,$zero | ||
| 2126 | $SLL $t_2,1 | ||
| 2127 | $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); | ||
| 2128 | slt $a2,$t_1,$zero | ||
| 2129 | $ADDU $t_2,$a2 | ||
| 2130 | $SLL $t_1,1 | ||
| 2131 | $ADDU $c_2,$t_1 | ||
| 2132 | sltu $at,$c_2,$t_1 | ||
| 2133 | $ADDU $t_2,$at | ||
| 2134 | $ADDU $c_3,$t_2 | ||
| 2135 | sltu $at,$c_3,$t_2 | ||
| 2136 | $ADDU $c_1,$at | ||
| 2137 | mflo $t_1 | ||
| 2138 | mfhi $t_2 | ||
| 2139 | slt $at,$t_2,$zero | ||
| 2140 | $ADDU $c_1,$at | ||
| 2141 | $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); | ||
| 2142 | $SLL $t_2,1 | ||
| 2143 | slt $a2,$t_1,$zero | ||
| 2144 | $ADDU $t_2,$a2 | ||
| 2145 | $SLL $t_1,1 | ||
| 2146 | $ADDU $c_2,$t_1 | ||
| 2147 | sltu $at,$c_2,$t_1 | ||
| 2148 | $ADDU $t_2,$at | ||
| 2149 | $ADDU $c_3,$t_2 | ||
| 2150 | sltu $at,$c_3,$t_2 | ||
| 2151 | $ADDU $c_1,$at | ||
| 2152 | mflo $t_1 | ||
| 2153 | mfhi $t_2 | ||
| 2154 | slt $at,$t_2,$zero | ||
| 2155 | $ADDU $c_1,$at | ||
| 2156 | $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); | ||
| 2157 | $SLL $t_2,1 | ||
| 2158 | slt $a2,$t_1,$zero | ||
| 2159 | $ADDU $t_2,$a2 | ||
| 2160 | $SLL $t_1,1 | ||
| 2161 | $ADDU $c_2,$t_1 | ||
| 2162 | sltu $at,$c_2,$t_1 | ||
| 2163 | $ADDU $t_2,$at | ||
| 2164 | $ADDU $c_3,$t_2 | ||
| 2165 | sltu $at,$c_3,$t_2 | ||
| 2166 | $ADDU $c_1,$at | ||
| 2167 | mflo $t_1 | ||
| 2168 | mfhi $t_2 | ||
| 2169 | slt $at,$t_2,$zero | ||
| 2170 | $ADDU $c_1,$at | ||
| 2171 | $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); | ||
| 2172 | $SLL $t_2,1 | ||
| 2173 | slt $a2,$t_1,$zero | ||
| 2174 | $ADDU $t_2,$a2 | ||
| 2175 | $SLL $t_1,1 | ||
| 2176 | $ADDU $c_2,$t_1 | ||
| 2177 | sltu $at,$c_2,$t_1 | ||
| 2178 | $ADDU $t_2,$at | ||
| 2179 | $ADDU $c_3,$t_2 | ||
| 2180 | sltu $at,$c_3,$t_2 | ||
| 2181 | $ADDU $c_1,$at | ||
| 2182 | $ST $c_2,7*$BNSZ($a0) | ||
| 2183 | |||
| 2184 | mflo $t_1 | ||
| 2185 | mfhi $t_2 | ||
| 2186 | slt $c_2,$t_2,$zero | ||
| 2187 | $SLL $t_2,1 | ||
| 2188 | $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); | ||
| 2189 | slt $a2,$t_1,$zero | ||
| 2190 | $ADDU $t_2,$a2 | ||
| 2191 | $SLL $t_1,1 | ||
| 2192 | $ADDU $c_3,$t_1 | ||
| 2193 | sltu $at,$c_3,$t_1 | ||
| 2194 | $ADDU $t_2,$at | ||
| 2195 | $ADDU $c_1,$t_2 | ||
| 2196 | sltu $at,$c_1,$t_2 | ||
| 2197 | $ADDU $c_2,$at | ||
| 2198 | mflo $t_1 | ||
| 2199 | mfhi $t_2 | ||
| 2200 | slt $at,$t_2,$zero | ||
| 2201 | $ADDU $c_2,$at | ||
| 2202 | $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); | ||
| 2203 | $SLL $t_2,1 | ||
| 2204 | slt $a2,$t_1,$zero | ||
| 2205 | $ADDU $t_2,$a2 | ||
| 2206 | $SLL $t_1,1 | ||
| 2207 | $ADDU $c_3,$t_1 | ||
| 2208 | sltu $at,$c_3,$t_1 | ||
| 2209 | $ADDU $t_2,$at | ||
| 2210 | $ADDU $c_1,$t_2 | ||
| 2211 | sltu $at,$c_1,$t_2 | ||
| 2212 | $ADDU $c_2,$at | ||
| 2213 | mflo $t_1 | ||
| 2214 | mfhi $t_2 | ||
| 2215 | slt $at,$t_2,$zero | ||
| 2216 | $ADDU $c_2,$at | ||
| 2217 | $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); | ||
| 2218 | $SLL $t_2,1 | ||
| 2219 | slt $a2,$t_1,$zero | ||
| 2220 | $ADDU $t_2,$a2 | ||
| 2221 | $SLL $t_1,1 | ||
| 2222 | $ADDU $c_3,$t_1 | ||
| 2223 | sltu $at,$c_3,$t_1 | ||
| 2224 | $ADDU $t_2,$at | ||
| 2225 | $ADDU $c_1,$t_2 | ||
| 2226 | sltu $at,$c_1,$t_2 | ||
| 2227 | $ADDU $c_2,$at | ||
| 2228 | mflo $t_1 | ||
| 2229 | mfhi $t_2 | ||
| 2230 | $ADDU $c_3,$t_1 | ||
| 2231 | sltu $at,$c_3,$t_1 | ||
| 2232 | $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); | ||
| 2233 | $ADDU $t_2,$at | ||
| 2234 | $ADDU $c_1,$t_2 | ||
| 2235 | sltu $at,$c_1,$t_2 | ||
| 2236 | $ADDU $c_2,$at | ||
| 2237 | $ST $c_3,8*$BNSZ($a0) | ||
| 2238 | |||
| 2239 | mflo $t_1 | ||
| 2240 | mfhi $t_2 | ||
| 2241 | slt $c_3,$t_2,$zero | ||
| 2242 | $SLL $t_2,1 | ||
| 2243 | $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); | ||
| 2244 | slt $a2,$t_1,$zero | ||
| 2245 | $ADDU $t_2,$a2 | ||
| 2246 | $SLL $t_1,1 | ||
| 2247 | $ADDU $c_1,$t_1 | ||
| 2248 | sltu $at,$c_1,$t_1 | ||
| 2249 | $ADDU $t_2,$at | ||
| 2250 | $ADDU $c_2,$t_2 | ||
| 2251 | sltu $at,$c_2,$t_2 | ||
| 2252 | $ADDU $c_3,$at | ||
| 2253 | mflo $t_1 | ||
| 2254 | mfhi $t_2 | ||
| 2255 | slt $at,$t_2,$zero | ||
| 2256 | $ADDU $c_3,$at | ||
| 2257 | $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); | ||
| 2258 | $SLL $t_2,1 | ||
| 2259 | slt $a2,$t_1,$zero | ||
| 2260 | $ADDU $t_2,$a2 | ||
| 2261 | $SLL $t_1,1 | ||
| 2262 | $ADDU $c_1,$t_1 | ||
| 2263 | sltu $at,$c_1,$t_1 | ||
| 2264 | $ADDU $t_2,$at | ||
| 2265 | $ADDU $c_2,$t_2 | ||
| 2266 | sltu $at,$c_2,$t_2 | ||
| 2267 | $ADDU $c_3,$at | ||
| 2268 | mflo $t_1 | ||
| 2269 | mfhi $t_2 | ||
| 2270 | slt $at,$t_2,$zero | ||
| 2271 | $ADDU $c_3,$at | ||
| 2272 | $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); | ||
| 2273 | $SLL $t_2,1 | ||
| 2274 | slt $a2,$t_1,$zero | ||
| 2275 | $ADDU $t_2,$a2 | ||
| 2276 | $SLL $t_1,1 | ||
| 2277 | $ADDU $c_1,$t_1 | ||
| 2278 | sltu $at,$c_1,$t_1 | ||
| 2279 | $ADDU $t_2,$at | ||
| 2280 | $ADDU $c_2,$t_2 | ||
| 2281 | sltu $at,$c_2,$t_2 | ||
| 2282 | $ADDU $c_3,$at | ||
| 2283 | $ST $c_1,9*$BNSZ($a0) | ||
| 2284 | |||
| 2285 | mflo $t_1 | ||
| 2286 | mfhi $t_2 | ||
| 2287 | slt $c_1,$t_2,$zero | ||
| 2288 | $SLL $t_2,1 | ||
| 2289 | $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); | ||
| 2290 | slt $a2,$t_1,$zero | ||
| 2291 | $ADDU $t_2,$a2 | ||
| 2292 | $SLL $t_1,1 | ||
| 2293 | $ADDU $c_2,$t_1 | ||
| 2294 | sltu $at,$c_2,$t_1 | ||
| 2295 | $ADDU $t_2,$at | ||
| 2296 | $ADDU $c_3,$t_2 | ||
| 2297 | sltu $at,$c_3,$t_2 | ||
| 2298 | $ADDU $c_1,$at | ||
| 2299 | mflo $t_1 | ||
| 2300 | mfhi $t_2 | ||
| 2301 | slt $at,$t_2,$zero | ||
| 2302 | $ADDU $c_1,$at | ||
| 2303 | $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); | ||
| 2304 | $SLL $t_2,1 | ||
| 2305 | slt $a2,$t_1,$zero | ||
| 2306 | $ADDU $t_2,$a2 | ||
| 2307 | $SLL $t_1,1 | ||
| 2308 | $ADDU $c_2,$t_1 | ||
| 2309 | sltu $at,$c_2,$t_1 | ||
| 2310 | $ADDU $t_2,$at | ||
| 2311 | $ADDU $c_3,$t_2 | ||
| 2312 | sltu $at,$c_3,$t_2 | ||
| 2313 | $ADDU $c_1,$at | ||
| 2314 | mflo $t_1 | ||
| 2315 | mfhi $t_2 | ||
| 2316 | $ADDU $c_2,$t_1 | ||
| 2317 | sltu $at,$c_2,$t_1 | ||
| 2318 | $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); | ||
| 2319 | $ADDU $t_2,$at | ||
| 2320 | $ADDU $c_3,$t_2 | ||
| 2321 | sltu $at,$c_3,$t_2 | ||
| 2322 | $ADDU $c_1,$at | ||
| 2323 | $ST $c_2,10*$BNSZ($a0) | ||
| 2324 | |||
| 2325 | mflo $t_1 | ||
| 2326 | mfhi $t_2 | ||
| 2327 | slt $c_2,$t_2,$zero | ||
| 2328 | $SLL $t_2,1 | ||
| 2329 | $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); | ||
| 2330 | slt $a2,$t_1,$zero | ||
| 2331 | $ADDU $t_2,$a2 | ||
| 2332 | $SLL $t_1,1 | ||
| 2333 | $ADDU $c_3,$t_1 | ||
| 2334 | sltu $at,$c_3,$t_1 | ||
| 2335 | $ADDU $t_2,$at | ||
| 2336 | $ADDU $c_1,$t_2 | ||
| 2337 | sltu $at,$c_1,$t_2 | ||
| 2338 | $ADDU $c_2,$at | ||
| 2339 | mflo $t_1 | ||
| 2340 | mfhi $t_2 | ||
| 2341 | slt $at,$t_2,$zero | ||
| 2342 | $ADDU $c_2,$at | ||
| 2343 | $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); | ||
| 2344 | $SLL $t_2,1 | ||
| 2345 | slt $a2,$t_1,$zero | ||
| 2346 | $ADDU $t_2,$a2 | ||
| 2347 | $SLL $t_1,1 | ||
| 2348 | $ADDU $c_3,$t_1 | ||
| 2349 | sltu $at,$c_3,$t_1 | ||
| 2350 | $ADDU $t_2,$at | ||
| 2351 | $ADDU $c_1,$t_2 | ||
| 2352 | sltu $at,$c_1,$t_2 | ||
| 2353 | $ADDU $c_2,$at | ||
| 2354 | $ST $c_3,11*$BNSZ($a0) | ||
| 2355 | |||
| 2356 | mflo $t_1 | ||
| 2357 | mfhi $t_2 | ||
| 2358 | slt $c_3,$t_2,$zero | ||
| 2359 | $SLL $t_2,1 | ||
| 2360 | $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); | ||
| 2361 | slt $a2,$t_1,$zero | ||
| 2362 | $ADDU $t_2,$a2 | ||
| 2363 | $SLL $t_1,1 | ||
| 2364 | $ADDU $c_1,$t_1 | ||
| 2365 | sltu $at,$c_1,$t_1 | ||
| 2366 | $ADDU $t_2,$at | ||
| 2367 | $ADDU $c_2,$t_2 | ||
| 2368 | sltu $at,$c_2,$t_2 | ||
| 2369 | $ADDU $c_3,$at | ||
| 2370 | mflo $t_1 | ||
| 2371 | mfhi $t_2 | ||
| 2372 | $ADDU $c_1,$t_1 | ||
| 2373 | sltu $at,$c_1,$t_1 | ||
| 2374 | $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); | ||
| 2375 | $ADDU $t_2,$at | ||
| 2376 | $ADDU $c_2,$t_2 | ||
| 2377 | sltu $at,$c_2,$t_2 | ||
| 2378 | $ADDU $c_3,$at | ||
| 2379 | $ST $c_1,12*$BNSZ($a0) | ||
| 2380 | |||
| 2381 | mflo $t_1 | ||
| 2382 | mfhi $t_2 | ||
| 2383 | slt $c_1,$t_2,$zero | ||
| 2384 | $SLL $t_2,1 | ||
| 2385 | $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); | ||
| 2386 | slt $a2,$t_1,$zero | ||
| 2387 | $ADDU $t_2,$a2 | ||
| 2388 | $SLL $t_1,1 | ||
| 2389 | $ADDU $c_2,$t_1 | ||
| 2390 | sltu $at,$c_2,$t_1 | ||
| 2391 | $ADDU $t_2,$at | ||
| 2392 | $ADDU $c_3,$t_2 | ||
| 2393 | sltu $at,$c_3,$t_2 | ||
| 2394 | $ADDU $c_1,$at | ||
| 2395 | $ST $c_2,13*$BNSZ($a0) | ||
| 2396 | |||
| 2397 | mflo $t_1 | ||
| 2398 | mfhi $t_2 | ||
| 2399 | $ADDU $c_3,$t_1 | ||
| 2400 | sltu $at,$c_3,$t_1 | ||
| 2401 | $ADDU $t_2,$at | ||
| 2402 | $ADDU $c_1,$t_2 | ||
| 2403 | $ST $c_3,14*$BNSZ($a0) | ||
| 2404 | $ST $c_1,15*$BNSZ($a0) | ||
| 2405 | |||
| 2406 | .set noreorder | ||
| 2407 | ___ | ||
# Epilogue of bn_sqr_comba8 for "nubi" flavours: restore the registers
# ($t0-$t3, $gp) saved by the matching prologue and release the save area.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return from bn_sqr_comba8, then open the bn_sqr_comba4 entry point
# (comba-style squaring of a 4-word input).
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
# Prologue for "nubi" flavours: reserve a 6-slot save area and preserve
# $ra, $t0-$t3 and $gp, per that calling convention's .mask above.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_sqr_comba4 body: 4x4 comba squaring, r[0..7] = a[0..3]^2.
# Diagonal terms a[i]*a[i] are accumulated once (mul_add_c); off-diagonal
# terms a[i]*a[j], i!=j, are doubled (mul_add_c2): each "slt reg,x,$zero"
# captures the sign bit that the following "$SLL x,1" shifts out, so the
# exact 2*(hi:lo) value is folded into the rotating three-word
# accumulator (c1,c2,c3).  The next $MULTU is issued early, before the
# previous product is fully accumulated, to hide multiplier latency.
# Fix: the annotation for the a[1]*a[2] step previously read
# "mul_add_c(a2[1],b[2],...)"; the code doubles the product, so it is
# mul_add_c2(a[1],b[2],...).
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_3,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	$t_1
	mfhi	$t_2
	slt	$at,$t_2,$zero
	$ADDU	$c_3,$at
	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
	$SLL	$t_2,1
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	slt	$c_2,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	$t_1
	mfhi	$t_2
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# Epilogue for bn_sqr_comba4: restore the "nubi" callee-saved registers
# if that calling convention is in use (saved by the matching prologue).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
# Emit the accumulated assembly on STDOUT.  Check close(): STDOUT is
# buffered (and possibly a pipe), so write errors only surface here;
# the previous unchecked close silently dropped them.
print $code;
close STDOUT or die "error closing STDOUT: $!";
diff --git a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl new file mode 100644 index 0000000000..54aeb01921 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl | |||
| @@ -0,0 +1,1496 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # Copyright (c) 2010-2011 Intel Corp. | ||
| 4 | # Author: Vinodh.Gopal@intel.com | ||
| 5 | # Jim Guilford | ||
| 6 | # Erdinc.Ozturk@intel.com | ||
| 7 | # Maxim.Perminov@intel.com | ||
| 8 | # | ||
| 9 | # More information about algorithm used can be found at: | ||
| 10 | # http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf | ||
| 11 | # | ||
| 12 | # ==================================================================== | ||
| 13 | # Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
| 14 | # | ||
| 15 | # Redistribution and use in source and binary forms, with or without | ||
| 16 | # modification, are permitted provided that the following conditions | ||
| 17 | # are met: | ||
| 18 | # | ||
| 19 | # 1. Redistributions of source code must retain the above copyright | ||
| 20 | # notice, this list of conditions and the following disclaimer. | ||
| 21 | # | ||
| 22 | # 2. Redistributions in binary form must reproduce the above copyright | ||
| 23 | # notice, this list of conditions and the following disclaimer in | ||
| 24 | # the documentation and/or other materials provided with the | ||
| 25 | # distribution. | ||
| 26 | # | ||
| 27 | # 3. All advertising materials mentioning features or use of this | ||
| 28 | # software must display the following acknowledgment: | ||
| 29 | # "This product includes software developed by the OpenSSL Project | ||
| 30 | # for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 31 | # | ||
| 32 | # 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 33 | # endorse or promote products derived from this software without | ||
| 34 | # prior written permission. For written permission, please contact | ||
| 35 | # licensing@OpenSSL.org. | ||
| 36 | # | ||
| 37 | # 5. Products derived from this software may not be called "OpenSSL" | ||
| 38 | # nor may "OpenSSL" appear in their names without prior written | ||
| 39 | # permission of the OpenSSL Project. | ||
| 40 | # | ||
| 41 | # 6. Redistributions of any form whatsoever must retain the following | ||
| 42 | # acknowledgment: | ||
| 43 | # "This product includes software developed by the OpenSSL Project | ||
| 44 | # for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 45 | # | ||
| 46 | # THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 47 | # EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 48 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 49 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 50 | # ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 51 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 52 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 53 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 54 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 55 | # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 56 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 57 | # OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 58 | # ==================================================================== | ||
| 59 | |||
# Command-line handling: the script accepts "flavour output" or just
# "output" — an argument containing a dot is taken to be the output file.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 needs different calling-convention handling in the translator;
# detect it from either the flavour name or the output suffix.
my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script's own path.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all generated code through the translator.  Check the open: the
# previous unchecked open silently discarded all output if the child
# interpreter could not be started.
open STDOUT,"| $^X $xlate $flavour $output"
	or die "can't call $xlate: $!";

use strict;
my $code=".text\n\n";
my $m=0;	# unique-label counter for the swizzle/unswizzle loop macros
| 77 | # | ||
| 78 | # Define x512 macros | ||
| 79 | # | ||
| 80 | |||
| 81 | #MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2 | ||
| 82 | # | ||
| 83 | # uses rax, rdx, and args | ||
# Emit one 64x512-bit multiply-accumulate step with an extra 512-bit
# addend: computes OP * SRC2[0..7] + ASRC[0..7] into the eight running
# registers named in @$x, storing the lowest result qword to $DST and
# leaving the outgoing high word in X[0].  @$x is copied so the caller's
# own rotation of the register list is not disturbed.  The emitted code
# clobbers rax and rdx (mul) and uses $TMP to carry rdx between limbs.
sub MULSTEP_512_ADD
{
 my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
 my @X=@$x;	# make a copy
$code.=<<___;
	mov	(+8*0)($SRC2), %rax
	mul	$OP			# rdx:rax = %OP * [0]
	mov	($ASRC), $X[0]
	add	%rax, $X[0]
	adc	\$0, %rdx
	mov	$X[0], $DST
___
 # Limbs 1..7: product + addend limb + carried-in rdx from previous limb.
 for(my $i=1;$i<8;$i++) {
$code.=<<___;
	mov	%rdx, $TMP

	mov	(+8*$i)($SRC2), %rax
	mul	$OP			# rdx:rax = %OP * [$i]
	mov	(+8*$i)($ASRC), $X[$i]
	add	%rax, $X[$i]
	adc	\$0, %rdx
	add	$TMP, $X[$i]
	adc	\$0, %rdx
___
 }
 # Final carry-out becomes the new X[0] (the 9th, high word).
$code.=<<___;
	mov	%rdx, $X[0]
___
}
| 113 | |||
| 114 | #MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp | ||
| 115 | # | ||
| 116 | # uses rax, rdx, and args | ||
# Emit one 64x512-bit multiply-accumulate step: accumulates
# OP * SRC2[0..7] into the eight running registers named in @$x,
# storing the lowest result qword to $DST and leaving the outgoing high
# word in X[0].  Same as MULSTEP_512_ADD but with no extra memory
# addend.  @$x is copied so the caller's rotation of the register list
# is not disturbed.  The emitted code clobbers rax and rdx.
sub MULSTEP_512
{
 my ($x, $DST, $SRC2, $OP, $TMP)=@_;
 my @X=@$x;	# make a copy
$code.=<<___;
	mov	(+8*0)($SRC2), %rax
	mul	$OP			# rdx:rax = %OP * [0]
	add	%rax, $X[0]
	adc	\$0, %rdx
	mov	$X[0], $DST
___
 # Limbs 1..7: product + carried-in rdx ($TMP) from the previous limb.
 for(my $i=1;$i<8;$i++) {
$code.=<<___;
	mov	%rdx, $TMP

	mov	(+8*$i)($SRC2), %rax
	mul	$OP			# rdx:rax = %OP * [$i]
	add	%rax, $X[$i]
	adc	\$0, %rdx
	add	$TMP, $X[$i]
	adc	\$0, %rdx
___
 }
 # Final carry-out becomes the new X[0] (the 9th, high word).
$code.=<<___;
	mov	%rdx, $X[0]
___
}
| 144 | |||
| 145 | # | ||
| 146 | # Swizzle Macros | ||
| 147 | # | ||
| 148 | |||
| 149 | # macro to copy data from flat space to swizzled table | ||
| 150 | #MACRO swizzle pDst, pSrc, tmp1, tmp2 | ||
| 151 | # pDst and pSrc are modified | ||
# Emit a loop copying 8 qwords from flat space ($pSrc) into the swizzled
# table ($pDst): each qword is split into four 16-bit words stored 64
# bytes apart (columns of the 64-byte-stride table).  Both pointer
# registers are advanced (clobbered).  $m numbers the loop label so each
# macro expansion gets a unique "loop_N".
sub swizzle
{
 my ($pDst, $pSrc, $cnt, $d0)=@_;
$code.=<<___;
	mov	\$8, $cnt
loop_$m:
	mov	($pSrc), $d0
	mov	$d0#w, ($pDst)
	shr	\$16, $d0
	mov	$d0#w, (+64*1)($pDst)
	shr	\$16, $d0
	mov	$d0#w, (+64*2)($pDst)
	shr	\$16, $d0
	mov	$d0#w, (+64*3)($pDst)
	lea	8($pSrc), $pSrc
	lea	64*4($pDst), $pDst
	dec	$cnt
	jnz	loop_$m
___

 $m++;	# next expansion gets a fresh label
}
| 174 | |||
| 175 | # macro to copy data from swizzled table to flat space | ||
| 176 | #MACRO unswizzle pDst, pSrc, tmp*3 | ||
# Emit a loop copying data back from the swizzled table ($pSrc) to flat
# space ($pDst): inverse of swizzle().  Each iteration reassembles two
# qwords ($d0, $d1) from four 16-bit words spaced 64 bytes apart,
# building each qword high-word-first via shl-by-16 then inserting the
# next 16-bit word.  Both pointer registers are advanced (clobbered).
# $m numbers the loop label so each expansion gets a unique "loop_N".
sub unswizzle
{
 my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
$code.=<<___;
	mov	\$4, $cnt
loop_$m:
	movzxw	(+64*3+256*0)($pSrc), $d0
	movzxw	(+64*3+256*1)($pSrc), $d1
	shl	\$16, $d0
	shl	\$16, $d1
	mov	(+64*2+256*0)($pSrc), $d0#w
	mov	(+64*2+256*1)($pSrc), $d1#w
	shl	\$16, $d0
	shl	\$16, $d1
	mov	(+64*1+256*0)($pSrc), $d0#w
	mov	(+64*1+256*1)($pSrc), $d1#w
	shl	\$16, $d0
	shl	\$16, $d1
	mov	(+64*0+256*0)($pSrc), $d0#w
	mov	(+64*0+256*1)($pSrc), $d1#w
	mov	$d0, (+8*0)($pDst)
	mov	$d1, (+8*1)($pDst)
	lea	256*2($pSrc), $pSrc
	lea	8*2($pDst), $pDst
	sub	\$1, $cnt
	jnz	loop_$m
___

 $m++;	# next expansion gets a fresh label
}
| 207 | |||
| 208 | # | ||
| 209 | # Data Structures | ||
| 210 | # | ||
| 211 | |||
| 212 | # Reduce Data | ||
| 213 | # | ||
| 214 | # | ||
| 215 | # Offset Value | ||
| 216 | # 0C0 Carries | ||
| 217 | # 0B8 X2[10] | ||
| 218 | # 0B0 X2[9] | ||
| 219 | # 0A8 X2[8] | ||
| 220 | # 0A0 X2[7] | ||
| 221 | # 098 X2[6] | ||
| 222 | # 090 X2[5] | ||
| 223 | # 088 X2[4] | ||
| 224 | # 080 X2[3] | ||
| 225 | # 078 X2[2] | ||
| 226 | # 070 X2[1] | ||
| 227 | # 068 X2[0] | ||
| 228 | # 060 X1[12] P[10] | ||
| 229 | # 058 X1[11] P[9] Z[8] | ||
| 230 | # 050 X1[10] P[8] Z[7] | ||
| 231 | # 048 X1[9] P[7] Z[6] | ||
| 232 | # 040 X1[8] P[6] Z[5] | ||
| 233 | # 038 X1[7] P[5] Z[4] | ||
| 234 | # 030 X1[6] P[4] Z[3] | ||
| 235 | # 028 X1[5] P[3] Z[2] | ||
| 236 | # 020 X1[4] P[2] Z[1] | ||
| 237 | # 018 X1[3] P[1] Z[0] | ||
| 238 | # 010 X1[2] P[0] Y[2] | ||
| 239 | # 008 X1[1] Q[1] Y[1] | ||
| 240 | # 000 X1[0] Q[0] Y[0] | ||
| 241 | |||
# Byte offsets of the named regions inside the on-stack "Reduce Data"
# area (layout diagram above).  Note the X1/X2 layout and the Q/P and
# Y/Z layouts are overlays of the same memory, so several offsets
# deliberately restart at 0.
my $X1_offset		=  0;				# 13 qwords
my $X2_offset		=  $X1_offset + 13*8;		# 11 qwords
my $Carries_offset	=  $X2_offset + 11*8;		# 1 qword
my $Q_offset		=  0;				# 2 qwords
my $P_offset		=  $Q_offset + 2*8;		# 11 qwords
my $Y_offset		=  0;				# 3 qwords
my $Z_offset		=  $Y_offset + 3*8;		# 9 qwords

# Total size of the area: X1 + X2 + Carries.
my $Red_Data_Size	=  $Carries_offset + 1*8;	# (25 qwords)
| 261 | # 278 tmp16[15] | ||
| 262 | # ... ... | ||
| 263 | # 200 tmp16[0] | ||
| 264 | |||
| 265 | # 1F8 tmp[7] | ||
| 266 | # ... ... | ||
| 267 | # 1C0 tmp[0] | ||
| 268 | |||
| 269 | # 1B8 GT[7] | ||
| 270 | # ... ... | ||
| 271 | # 180 GT[0] | ||
| 272 | |||
| 273 | # 178 Reduce Data | ||
| 274 | # ... ... | ||
| 275 | # 0B8 Reduce Data | ||
| 276 | # 0B0 reserved | ||
| 277 | # 0A8 reserved | ||
| 278 | # 0A0 reserved | ||
| 279 | # 098 reserved | ||
| 280 | # 090 reserved | ||
| 281 | # 088 reduce result addr | ||
| 282 | # 080 exp[8] | ||
| 283 | |||
| 284 | # ... | ||
| 285 | # 048 exp[1] | ||
| 286 | # 040 exp[0] | ||
| 287 | |||
| 288 | # 038 reserved | ||
| 289 | # 030 loop_idx | ||
| 290 | # 028 pg | ||
| 291 | # 020 i | ||
| 292 | # 018 pData ; arg 4 | ||
| 293 | # 010 pG ; arg 2 | ||
| 294 | # 008 pResult ; arg 1 | ||
| 295 | # 000 rsp ; stack pointer before subtract | ||
| 296 | |||
# Byte offsets of the named slots inside the stack frame (layout diagram
# above); each offset is derived by adding the previous slot's size, so
# inserting a slot only requires editing one line.
my $rsp_offset		= 0;
my $pResult_offset	= 8*1 + $rsp_offset;
my $pG_offset		= 8*1 + $pResult_offset;
my $pData_offset	= 8*1 + $pG_offset;
my $i_offset		= 8*1 + $pData_offset;
my $pg_offset		= 8*1 + $i_offset;
my $loop_idx_offset	= 8*1 + $pg_offset;
my $reserved1_offset	= 8*1 + $loop_idx_offset;
my $exp_offset		= 8*1 + $reserved1_offset;
my $red_result_addr_offset= 8*9 + $exp_offset;
my $reserved2_offset	= 8*1 + $red_result_addr_offset;
my $Reduce_Data_offset	= 8*5 + $reserved2_offset;
my $GT_offset		= $Red_Data_Size + $Reduce_Data_offset;
my $tmp_offset		= 8*8 + $GT_offset;
my $tmp16_offset	= 8*8 + $tmp_offset;
my $garray_offset	= 8*16 + $tmp16_offset;
my $mem_size		= 8*8*32 + $garray_offset;

#
# Offsets within Reduce Data
#
#
# struct MODF_2FOLD_MONT_512_C1_DATA {
#	UINT64 t[8][8];
#	UINT64 m[8];
#	UINT64 m1[8]; /* 2^768 % m */
#	UINT64 m2[8]; /* 2^640 % m */
#	UINT64 k1[2]; /* (- 1/m) % 2^128 */
# };

# Byte offsets of the struct members above (each member is 8*8 = 64 bytes).
my $T	= 0;
my $M	= 512;	# = 8 * 8 * 8
my $M1	= 576;	# = 8 * 8 * 9  /* += 8 * 8 */
my $M2	= 640;	# = 8 * 8 * 10 /* += 8 * 8 */
my $K1	= 704;	# = 8 * 8 * 11 /* += 8 * 8 */
| 332 | |||
| 333 | # | ||
| 334 | # FUNCTIONS | ||
| 335 | # | ||
| 336 | |||
{{{
#
# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
#                  and add 512-bits (8 qwords)
#                  to get 640 bits (10 qwords)
# Input: 128-bit mul source: [rdi+8*1], rbp
#        512-bit mul source: [rsi+8*n]
#        512-bit add source: r15, r14, ..., r9, r8
# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
# Clobbers all regs except: rcx, rsi, rdi
$code.=<<___;
.type	MULADD_128x512,\@abi-omnipotent
.align	16
MULADD_128x512:
___
# First 64-bit limb (already in rbp): accumulate into r8..r15 with r8 low.
&MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
$code.=<<___;
	mov	(+8*1)(%rdi), %rbp
___
# Second limb: register list rotated by one (9..15,8) so the carry word
# produced by the first step becomes the low accumulator of this one.
&MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
$code.=<<___;
	ret
.size	MULADD_128x512,.-MULADD_128x512
___
}}}
| 362 | |||
| 363 | {{{ | ||
| 364 | #MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0 | ||
| 365 | # | ||
| 366 | # Inputs: pDst: Destination (768 bits, 12 qwords) | ||
| 367 | # pA: Multiplicand (1024 bits, 16 qwords) | ||
| 368 | # pB: Multiplicand (512 bits, 8 qwords) | ||
| 369 | # Dst = Ah * B + Al | ||
| 370 | # where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits) | ||
| 371 | # Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0] | ||
| 372 | # Uses registers: arguments, RAX, RDX | ||
| 373 | sub MULADD_256x512 | ||
| 374 | { | ||
| 375 | my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_; | ||
| 376 | $code.=<<___; | ||
| 377 | mov (+8*12)($pA), $OP | ||
| 378 | ___ | ||
| 379 | &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP); | ||
| 380 | push(@$X,shift(@$X)); | ||
| 381 | |||
| 382 | $code.=<<___; | ||
| 383 | mov (+8*13)($pA), $OP | ||
| 384 | ___ | ||
| 385 | &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP); | ||
| 386 | push(@$X,shift(@$X)); | ||
| 387 | |||
| 388 | $code.=<<___; | ||
| 389 | mov (+8*14)($pA), $OP | ||
| 390 | ___ | ||
| 391 | &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP); | ||
| 392 | push(@$X,shift(@$X)); | ||
| 393 | |||
| 394 | $code.=<<___; | ||
| 395 | mov (+8*15)($pA), $OP | ||
| 396 | ___ | ||
| 397 | &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP); | ||
| 398 | push(@$X,shift(@$X)); | ||
| 399 | } | ||
| 400 | |||
| 401 | # | ||
| 402 | # mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */ | ||
| 403 | # UINT64 *m, /* 512 bits, 8 qwords */ | ||
| 404 | # MODF_2FOLD_MONT_512_C1_DATA *data, | ||
| 405 | # UINT64 *r) /* 512 bits, 8 qwords */ | ||
| 406 | # Input: x (number to be reduced): tmp16 (Implicit) | ||
| 407 | # m (modulus): [pM] (Implicit) | ||
| 408 | # data (reduce data): [pData] (Implicit) | ||
| 409 | # Output: r (result): Address in [red_res_addr] | ||
| 410 | # result also in: r9, r8, r15, r14, r13, r12, r11, r10 | ||
| 411 | |||
| 412 | my @X=map("%r$_",(8..15)); | ||
| 413 | |||
| 414 | $code.=<<___; | ||
| 415 | .type mont_reduce,\@abi-omnipotent | ||
| 416 | .align 16 | ||
| 417 | mont_reduce: | ||
| 418 | ___ | ||
| 419 | |||
| 420 | my $STACK_DEPTH = 8; | ||
| 421 | # | ||
| 422 | # X1 = Xh * M1 + Xl | ||
| 423 | $code.=<<___; | ||
| 424 | lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords | ||
| 425 | mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords | ||
| 426 | add \$$M1, %rsi | ||
| 427 | lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords | ||
| 428 | |||
| 429 | ___ | ||
| 430 | |||
| 431 | &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times | ||
| 432 | # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0] | ||
| 433 | |||
| 434 | $code.=<<___; | ||
| 435 | xor %rax, %rax | ||
| 436 | # X1 += xl | ||
| 437 | add (+8*8)(%rcx), $X[4] | ||
| 438 | adc (+8*9)(%rcx), $X[5] | ||
| 439 | adc (+8*10)(%rcx), $X[6] | ||
| 440 | adc (+8*11)(%rcx), $X[7] | ||
| 441 | adc \$0, %rax | ||
| 442 | # X1 is now rax, r11-r8, r15-r12, tmp16[3:0] | ||
| 443 | |||
| 444 | # | ||
| 445 | # check for carry ;; carry stored in rax | ||
| 446 | mov $X[4], (+8*8)(%rdi) # rdi points to X1 | ||
| 447 | mov $X[5], (+8*9)(%rdi) | ||
| 448 | mov $X[6], %rbp | ||
| 449 | mov $X[7], (+8*11)(%rdi) | ||
| 450 | |||
| 451 | mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp) | ||
| 452 | |||
| 453 | mov (+8*0)(%rdi), $X[4] | ||
| 454 | mov (+8*1)(%rdi), $X[5] | ||
| 455 | mov (+8*2)(%rdi), $X[6] | ||
| 456 | mov (+8*3)(%rdi), $X[7] | ||
| 457 | |||
| 458 | # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8 | ||
| 459 | # rdi -> X1 | ||
| 460 | # rsi -> M1 | ||
| 461 | |||
| 462 | # | ||
| 463 | # X2 = Xh * M2 + Xl | ||
| 464 | # do first part (X2 = Xh * M2) | ||
| 465 | add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords | ||
| 466 | # Xh is actually { [rdi+8*1], rbp } | ||
| 467 | add \$`$M2-$M1`, %rsi # rsi -> M2 | ||
| 468 | lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords | ||
| 469 | ___ | ||
| 470 | unshift(@X,pop(@X)); unshift(@X,pop(@X)); | ||
| 471 | $code.=<<___; | ||
| 472 | |||
| 473 | call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8 | ||
| 474 | # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0] | ||
| 475 | mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax | ||
| 476 | |||
| 477 | # X2 += Xl | ||
| 478 | add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl | ||
| 479 | adc (+8*9-8*10)(%rdi), $X[7] | ||
| 480 | mov $X[6], (+8*8)(%rcx) | ||
| 481 | mov $X[7], (+8*9)(%rcx) | ||
| 482 | |||
| 483 | adc %rax, %rax | ||
| 484 | mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp) | ||
| 485 | |||
| 486 | lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords | ||
| 487 | add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords | ||
| 488 | |||
| 489 | # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half) | ||
| 490 | # B1:B0 = rsi[1:0] = K1[1:0] | ||
| 491 | # A1:A0 = rcx[1:0] = X2[1:0] | ||
| 492 | # Result = rdi[1],rbp = Q[1],rbp | ||
| 493 | mov (%rsi), %r8 # B0 | ||
| 494 | mov (+8*1)(%rsi), %rbx # B1 | ||
| 495 | |||
| 496 | mov (%rcx), %rax # A0 | ||
| 497 | mul %r8 # B0 | ||
| 498 | mov %rax, %rbp | ||
| 499 | mov %rdx, %r9 | ||
| 500 | |||
| 501 | mov (+8*1)(%rcx), %rax # A1 | ||
| 502 | mul %r8 # B0 | ||
| 503 | add %rax, %r9 | ||
| 504 | |||
| 505 | mov (%rcx), %rax # A0 | ||
| 506 | mul %rbx # B1 | ||
| 507 | add %rax, %r9 | ||
| 508 | |||
| 509 | mov %r9, (+8*1)(%rdi) | ||
| 510 | # end MUL_128x128t128 | ||
| 511 | |||
| 512 | sub \$`$K1-$M`, %rsi | ||
| 513 | |||
| 514 | mov (%rcx), $X[6] | ||
| 515 | mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0] | ||
| 516 | |||
| 517 | call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8 | ||
| 518 | # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0] | ||
| 519 | |||
| 520 | # load first half of m to rdx, rdi, rbx, rax | ||
| 521 | # moved this here for efficiency | ||
| 522 | mov (+8*0)(%rsi), %rax | ||
| 523 | mov (+8*1)(%rsi), %rbx | ||
| 524 | mov (+8*2)(%rsi), %rdi | ||
| 525 | mov (+8*3)(%rsi), %rdx | ||
| 526 | |||
| 527 | # continue with reduction | ||
| 528 | mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp | ||
| 529 | |||
| 530 | add (+8*8)(%rcx), $X[6] | ||
| 531 | adc (+8*9)(%rcx), $X[7] | ||
| 532 | |||
| 533 | #accumulate the final carry to rbp | ||
| 534 | adc %rbp, %rbp | ||
| 535 | |||
| 536 | # Add in overflow corrections: R = (X2>>128) += T[overflow] | ||
| 537 | # R = {r9, r8, r15, r14, ..., r10} | ||
| 538 | shl \$3, %rbp | ||
| 539 | mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rsi -> Data (and points to T) | ||
| 540 | add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out | ||
| 541 | |||
| 542 | # rsi will be used to generate a mask after the addition | ||
| 543 | xor %rsi, %rsi | ||
| 544 | |||
| 545 | add (+8*8*0)(%rbp), $X[0] | ||
| 546 | adc (+8*8*1)(%rbp), $X[1] | ||
| 547 | adc (+8*8*2)(%rbp), $X[2] | ||
| 548 | adc (+8*8*3)(%rbp), $X[3] | ||
| 549 | adc (+8*8*4)(%rbp), $X[4] | ||
| 550 | adc (+8*8*5)(%rbp), $X[5] | ||
| 551 | adc (+8*8*6)(%rbp), $X[6] | ||
| 552 | adc (+8*8*7)(%rbp), $X[7] | ||
| 553 | |||
| 554 | # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF | ||
| 555 | # if carry is clear: rsi = 0x0000000000000000 | ||
| 556 | sbb \$0, %rsi | ||
| 557 | |||
| 558 | # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m | ||
| 559 | and %rsi, %rax | ||
| 560 | and %rsi, %rbx | ||
| 561 | and %rsi, %rdi | ||
| 562 | and %rsi, %rdx | ||
| 563 | |||
| 564 | mov \$1, %rbp | ||
| 565 | sub %rax, $X[0] | ||
| 566 | sbb %rbx, $X[1] | ||
| 567 | sbb %rdi, $X[2] | ||
| 568 | sbb %rdx, $X[3] | ||
| 569 | |||
| 570 | # if there is a borrow: rbp = 0 | ||
| 571 | # if there is no borrow: rbp = 1 | ||
| 572 | # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m | ||
| 573 | sbb \$0, %rbp | ||
| 574 | |||
| 575 | #load second half of m to rdx, rdi, rbx, rax | ||
| 576 | |||
| 577 | add \$$M, %rcx | ||
| 578 | mov (+8*4)(%rcx), %rax | ||
| 579 | mov (+8*5)(%rcx), %rbx | ||
| 580 | mov (+8*6)(%rcx), %rdi | ||
| 581 | mov (+8*7)(%rcx), %rdx | ||
| 582 | |||
| 583 | # use the rsi mask as before | ||
| 584 | # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m | ||
| 585 | and %rsi, %rax | ||
| 586 | and %rsi, %rbx | ||
| 587 | and %rsi, %rdi | ||
| 588 | and %rsi, %rdx | ||
| 589 | |||
| 590 | # if rbp = 0, there was a borrow before, it is moved to the carry flag | ||
| 591 | # if rbp = 1, there was not a borrow before, carry flag is cleared | ||
| 592 | sub \$1, %rbp | ||
| 593 | |||
| 594 | sbb %rax, $X[4] | ||
| 595 | sbb %rbx, $X[5] | ||
| 596 | sbb %rdi, $X[6] | ||
| 597 | sbb %rdx, $X[7] | ||
| 598 | |||
| 599 | # write R back to memory | ||
| 600 | |||
| 601 | mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi | ||
| 602 | mov $X[0], (+8*0)(%rsi) | ||
| 603 | mov $X[1], (+8*1)(%rsi) | ||
| 604 | mov $X[2], (+8*2)(%rsi) | ||
| 605 | mov $X[3], (+8*3)(%rsi) | ||
| 606 | mov $X[4], (+8*4)(%rsi) | ||
| 607 | mov $X[5], (+8*5)(%rsi) | ||
| 608 | mov $X[6], (+8*6)(%rsi) | ||
| 609 | mov $X[7], (+8*7)(%rsi) | ||
| 610 | |||
| 611 | ret | ||
| 612 | .size mont_reduce,.-mont_reduce | ||
| 613 | ___ | ||
| 614 | }}} | ||
| 615 | |||
| 616 | {{{ | ||
| 617 | #MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2 | ||
| 618 | # | ||
| 619 | # Inputs: pDst: Destination (1024 bits, 16 qwords) | ||
| 620 | # pA: Multiplicand (512 bits, 8 qwords) | ||
| 621 | # pB: Multiplicand (512 bits, 8 qwords) | ||
| 622 | # Uses registers rax, rdx, args | ||
| 623 | # B operand in [pB] and also in x7...x0 | ||
| 624 | sub MUL_512x512 | ||
| 625 | { | ||
| 626 | my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_; | ||
| 627 | my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/); | ||
| 628 | my @X=@$x; # make a copy | ||
| 629 | |||
| 630 | $code.=<<___; | ||
| 631 | mov (+8*0)($pA), $OP | ||
| 632 | |||
| 633 | mov $X[0], %rax | ||
| 634 | mul $OP # rdx:rax = %OP * [0] | ||
| 635 | mov %rax, (+$pDst_o+8*0)($pDst) | ||
| 636 | mov %rdx, $X[0] | ||
| 637 | ___ | ||
| 638 | for(my $i=1;$i<8;$i++) { | ||
| 639 | $code.=<<___; | ||
| 640 | mov $X[$i], %rax | ||
| 641 | mul $OP # rdx:rax = %OP * [$i] | ||
| 642 | add %rax, $X[$i-1] | ||
| 643 | adc \$0, %rdx | ||
| 644 | mov %rdx, $X[$i] | ||
| 645 | ___ | ||
| 646 | } | ||
| 647 | |||
| 648 | for(my $i=1;$i<8;$i++) { | ||
| 649 | $code.=<<___; | ||
| 650 | mov (+8*$i)($pA), $OP | ||
| 651 | ___ | ||
| 652 | |||
| 653 | &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP); | ||
| 654 | push(@X,shift(@X)); | ||
| 655 | } | ||
| 656 | |||
| 657 | $code.=<<___; | ||
| 658 | mov $X[0], (+$pDst_o+8*8)($pDst) | ||
| 659 | mov $X[1], (+$pDst_o+8*9)($pDst) | ||
| 660 | mov $X[2], (+$pDst_o+8*10)($pDst) | ||
| 661 | mov $X[3], (+$pDst_o+8*11)($pDst) | ||
| 662 | mov $X[4], (+$pDst_o+8*12)($pDst) | ||
| 663 | mov $X[5], (+$pDst_o+8*13)($pDst) | ||
| 664 | mov $X[6], (+$pDst_o+8*14)($pDst) | ||
| 665 | mov $X[7], (+$pDst_o+8*15)($pDst) | ||
| 666 | ___ | ||
| 667 | } | ||
| 668 | |||
| 669 | # | ||
| 670 | # mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits) | ||
| 671 | # Input: src1: Address of source 1: rdi | ||
| 672 | # src2: Address of source 2: rsi | ||
| 673 | # Output: dst: Address of destination: [red_res_addr] | ||
| 674 | # src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10 | ||
| 675 | # Temp: Clobbers [tmp16], all registers | ||
| 676 | $code.=<<___; | ||
| 677 | .type mont_mul_a3b,\@abi-omnipotent | ||
| 678 | .align 16 | ||
| 679 | mont_mul_a3b: | ||
| 680 | # | ||
| 681 | # multiply tmp = src1 * src2 | ||
| 682 | # For multiply: dst = rcx, src1 = rdi, src2 = rsi | ||
| 683 | # stack depth is extra 8 from call | ||
| 684 | ___ | ||
| 685 | &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx"); | ||
| 686 | $code.=<<___; | ||
| 687 | # | ||
| 688 | # Dst = tmp % m | ||
| 689 | # Call reduce(tmp, m, data, dst) | ||
| 690 | |||
| 691 | # tail recursion optimization: jmp to mont_reduce and return from there | ||
| 692 | jmp mont_reduce | ||
| 693 | # call mont_reduce | ||
| 694 | # ret | ||
| 695 | .size mont_mul_a3b,.-mont_mul_a3b | ||
| 696 | ___ | ||
| 697 | }}} | ||
| 698 | |||
| 699 | {{{ | ||
| 700 | #SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4 | ||
| 701 | # | ||
| 702 | # Input in memory [pA] and also in x7...x0 | ||
| 703 | # Uses all argument registers plus rax and rdx | ||
| 704 | # | ||
| 705 | # This version computes all of the off-diagonal terms into memory, | ||
| 706 | # and then it adds in the diagonal terms | ||
| 707 | |||
| 708 | sub SQR_512 | ||
| 709 | { | ||
| 710 | my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_; | ||
| 711 | my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/); | ||
| 712 | my @X=@$x; # make a copy | ||
| 713 | $code.=<<___; | ||
| 714 | # ------------------ | ||
| 715 | # first pass 01...07 | ||
| 716 | # ------------------ | ||
| 717 | mov $X[0], $A | ||
| 718 | |||
| 719 | mov $X[1],%rax | ||
| 720 | mul $A | ||
| 721 | mov %rax, (+$pDst_o+8*1)($pDst) | ||
| 722 | ___ | ||
| 723 | for(my $i=2;$i<8;$i++) { | ||
| 724 | $code.=<<___; | ||
| 725 | mov %rdx, $X[$i-2] | ||
| 726 | mov $X[$i],%rax | ||
| 727 | mul $A | ||
| 728 | add %rax, $X[$i-2] | ||
| 729 | adc \$0, %rdx | ||
| 730 | ___ | ||
| 731 | } | ||
| 732 | $code.=<<___; | ||
| 733 | mov %rdx, $x7 | ||
| 734 | |||
| 735 | mov $X[0], (+$pDst_o+8*2)($pDst) | ||
| 736 | |||
| 737 | # ------------------ | ||
| 738 | # second pass 12...17 | ||
| 739 | # ------------------ | ||
| 740 | |||
| 741 | mov (+8*1)($pA), $A | ||
| 742 | |||
| 743 | mov (+8*2)($pA),%rax | ||
| 744 | mul $A | ||
| 745 | add %rax, $X[1] | ||
| 746 | adc \$0, %rdx | ||
| 747 | mov $X[1], (+$pDst_o+8*3)($pDst) | ||
| 748 | |||
| 749 | mov %rdx, $X[0] | ||
| 750 | mov (+8*3)($pA),%rax | ||
| 751 | mul $A | ||
| 752 | add %rax, $X[2] | ||
| 753 | adc \$0, %rdx | ||
| 754 | add $X[0], $X[2] | ||
| 755 | adc \$0, %rdx | ||
| 756 | mov $X[2], (+$pDst_o+8*4)($pDst) | ||
| 757 | |||
| 758 | mov %rdx, $X[0] | ||
| 759 | mov (+8*4)($pA),%rax | ||
| 760 | mul $A | ||
| 761 | add %rax, $X[3] | ||
| 762 | adc \$0, %rdx | ||
| 763 | add $X[0], $X[3] | ||
| 764 | adc \$0, %rdx | ||
| 765 | |||
| 766 | mov %rdx, $X[0] | ||
| 767 | mov (+8*5)($pA),%rax | ||
| 768 | mul $A | ||
| 769 | add %rax, $X[4] | ||
| 770 | adc \$0, %rdx | ||
| 771 | add $X[0], $X[4] | ||
| 772 | adc \$0, %rdx | ||
| 773 | |||
| 774 | mov %rdx, $X[0] | ||
| 775 | mov $X[6],%rax | ||
| 776 | mul $A | ||
| 777 | add %rax, $X[5] | ||
| 778 | adc \$0, %rdx | ||
| 779 | add $X[0], $X[5] | ||
| 780 | adc \$0, %rdx | ||
| 781 | |||
| 782 | mov %rdx, $X[0] | ||
| 783 | mov $X[7],%rax | ||
| 784 | mul $A | ||
| 785 | add %rax, $x7 | ||
| 786 | adc \$0, %rdx | ||
| 787 | add $X[0], $x7 | ||
| 788 | adc \$0, %rdx | ||
| 789 | |||
| 790 | mov %rdx, $X[1] | ||
| 791 | |||
| 792 | # ------------------ | ||
| 793 | # third pass 23...27 | ||
| 794 | # ------------------ | ||
| 795 | mov (+8*2)($pA), $A | ||
| 796 | |||
| 797 | mov (+8*3)($pA),%rax | ||
| 798 | mul $A | ||
| 799 | add %rax, $X[3] | ||
| 800 | adc \$0, %rdx | ||
| 801 | mov $X[3], (+$pDst_o+8*5)($pDst) | ||
| 802 | |||
| 803 | mov %rdx, $X[0] | ||
| 804 | mov (+8*4)($pA),%rax | ||
| 805 | mul $A | ||
| 806 | add %rax, $X[4] | ||
| 807 | adc \$0, %rdx | ||
| 808 | add $X[0], $X[4] | ||
| 809 | adc \$0, %rdx | ||
| 810 | mov $X[4], (+$pDst_o+8*6)($pDst) | ||
| 811 | |||
| 812 | mov %rdx, $X[0] | ||
| 813 | mov (+8*5)($pA),%rax | ||
| 814 | mul $A | ||
| 815 | add %rax, $X[5] | ||
| 816 | adc \$0, %rdx | ||
| 817 | add $X[0], $X[5] | ||
| 818 | adc \$0, %rdx | ||
| 819 | |||
| 820 | mov %rdx, $X[0] | ||
| 821 | mov $X[6],%rax | ||
| 822 | mul $A | ||
| 823 | add %rax, $x7 | ||
| 824 | adc \$0, %rdx | ||
| 825 | add $X[0], $x7 | ||
| 826 | adc \$0, %rdx | ||
| 827 | |||
| 828 | mov %rdx, $X[0] | ||
| 829 | mov $X[7],%rax | ||
| 830 | mul $A | ||
| 831 | add %rax, $X[1] | ||
| 832 | adc \$0, %rdx | ||
| 833 | add $X[0], $X[1] | ||
| 834 | adc \$0, %rdx | ||
| 835 | |||
| 836 | mov %rdx, $X[2] | ||
| 837 | |||
| 838 | # ------------------ | ||
| 839 | # fourth pass 34...37 | ||
| 840 | # ------------------ | ||
| 841 | |||
| 842 | mov (+8*3)($pA), $A | ||
| 843 | |||
| 844 | mov (+8*4)($pA),%rax | ||
| 845 | mul $A | ||
| 846 | add %rax, $X[5] | ||
| 847 | adc \$0, %rdx | ||
| 848 | mov $X[5], (+$pDst_o+8*7)($pDst) | ||
| 849 | |||
| 850 | mov %rdx, $X[0] | ||
| 851 | mov (+8*5)($pA),%rax | ||
| 852 | mul $A | ||
| 853 | add %rax, $x7 | ||
| 854 | adc \$0, %rdx | ||
| 855 | add $X[0], $x7 | ||
| 856 | adc \$0, %rdx | ||
| 857 | mov $x7, (+$pDst_o+8*8)($pDst) | ||
| 858 | |||
| 859 | mov %rdx, $X[0] | ||
| 860 | mov $X[6],%rax | ||
| 861 | mul $A | ||
| 862 | add %rax, $X[1] | ||
| 863 | adc \$0, %rdx | ||
| 864 | add $X[0], $X[1] | ||
| 865 | adc \$0, %rdx | ||
| 866 | |||
| 867 | mov %rdx, $X[0] | ||
| 868 | mov $X[7],%rax | ||
| 869 | mul $A | ||
| 870 | add %rax, $X[2] | ||
| 871 | adc \$0, %rdx | ||
| 872 | add $X[0], $X[2] | ||
| 873 | adc \$0, %rdx | ||
| 874 | |||
| 875 | mov %rdx, $X[5] | ||
| 876 | |||
| 877 | # ------------------ | ||
| 878 | # fifth pass 45...47 | ||
| 879 | # ------------------ | ||
| 880 | mov (+8*4)($pA), $A | ||
| 881 | |||
| 882 | mov (+8*5)($pA),%rax | ||
| 883 | mul $A | ||
| 884 | add %rax, $X[1] | ||
| 885 | adc \$0, %rdx | ||
| 886 | mov $X[1], (+$pDst_o+8*9)($pDst) | ||
| 887 | |||
| 888 | mov %rdx, $X[0] | ||
| 889 | mov $X[6],%rax | ||
| 890 | mul $A | ||
| 891 | add %rax, $X[2] | ||
| 892 | adc \$0, %rdx | ||
| 893 | add $X[0], $X[2] | ||
| 894 | adc \$0, %rdx | ||
| 895 | mov $X[2], (+$pDst_o+8*10)($pDst) | ||
| 896 | |||
| 897 | mov %rdx, $X[0] | ||
| 898 | mov $X[7],%rax | ||
| 899 | mul $A | ||
| 900 | add %rax, $X[5] | ||
| 901 | adc \$0, %rdx | ||
| 902 | add $X[0], $X[5] | ||
| 903 | adc \$0, %rdx | ||
| 904 | |||
| 905 | mov %rdx, $X[1] | ||
| 906 | |||
| 907 | # ------------------ | ||
| 908 | # sixth pass 56...57 | ||
| 909 | # ------------------ | ||
| 910 | mov (+8*5)($pA), $A | ||
| 911 | |||
| 912 | mov $X[6],%rax | ||
| 913 | mul $A | ||
| 914 | add %rax, $X[5] | ||
| 915 | adc \$0, %rdx | ||
| 916 | mov $X[5], (+$pDst_o+8*11)($pDst) | ||
| 917 | |||
| 918 | mov %rdx, $X[0] | ||
| 919 | mov $X[7],%rax | ||
| 920 | mul $A | ||
| 921 | add %rax, $X[1] | ||
| 922 | adc \$0, %rdx | ||
| 923 | add $X[0], $X[1] | ||
| 924 | adc \$0, %rdx | ||
| 925 | mov $X[1], (+$pDst_o+8*12)($pDst) | ||
| 926 | |||
| 927 | mov %rdx, $X[2] | ||
| 928 | |||
| 929 | # ------------------ | ||
| 930 | # seventh pass 67 | ||
| 931 | # ------------------ | ||
| 932 | mov $X[6], $A | ||
| 933 | |||
| 934 | mov $X[7],%rax | ||
| 935 | mul $A | ||
| 936 | add %rax, $X[2] | ||
| 937 | adc \$0, %rdx | ||
| 938 | mov $X[2], (+$pDst_o+8*13)($pDst) | ||
| 939 | |||
| 940 | mov %rdx, (+$pDst_o+8*14)($pDst) | ||
| 941 | |||
| 942 | # start finalize (add in squares, and double off-terms) | ||
| 943 | mov (+$pDst_o+8*1)($pDst), $X[0] | ||
| 944 | mov (+$pDst_o+8*2)($pDst), $X[1] | ||
| 945 | mov (+$pDst_o+8*3)($pDst), $X[2] | ||
| 946 | mov (+$pDst_o+8*4)($pDst), $X[3] | ||
| 947 | mov (+$pDst_o+8*5)($pDst), $X[4] | ||
| 948 | mov (+$pDst_o+8*6)($pDst), $X[5] | ||
| 949 | |||
| 950 | mov (+8*3)($pA), %rax | ||
| 951 | mul %rax | ||
| 952 | mov %rax, $x6 | ||
| 953 | mov %rdx, $X[6] | ||
| 954 | |||
| 955 | add $X[0], $X[0] | ||
| 956 | adc $X[1], $X[1] | ||
| 957 | adc $X[2], $X[2] | ||
| 958 | adc $X[3], $X[3] | ||
| 959 | adc $X[4], $X[4] | ||
| 960 | adc $X[5], $X[5] | ||
| 961 | adc \$0, $X[6] | ||
| 962 | |||
| 963 | mov (+8*0)($pA), %rax | ||
| 964 | mul %rax | ||
| 965 | mov %rax, (+$pDst_o+8*0)($pDst) | ||
| 966 | mov %rdx, $A | ||
| 967 | |||
| 968 | mov (+8*1)($pA), %rax | ||
| 969 | mul %rax | ||
| 970 | |||
| 971 | add $A, $X[0] | ||
| 972 | adc %rax, $X[1] | ||
| 973 | adc \$0, %rdx | ||
| 974 | |||
| 975 | mov %rdx, $A | ||
| 976 | mov $X[0], (+$pDst_o+8*1)($pDst) | ||
| 977 | mov $X[1], (+$pDst_o+8*2)($pDst) | ||
| 978 | |||
| 979 | mov (+8*2)($pA), %rax | ||
| 980 | mul %rax | ||
| 981 | |||
| 982 | add $A, $X[2] | ||
| 983 | adc %rax, $X[3] | ||
| 984 | adc \$0, %rdx | ||
| 985 | |||
| 986 | mov %rdx, $A | ||
| 987 | |||
| 988 | mov $X[2], (+$pDst_o+8*3)($pDst) | ||
| 989 | mov $X[3], (+$pDst_o+8*4)($pDst) | ||
| 990 | |||
| 991 | xor $tmp, $tmp | ||
| 992 | add $A, $X[4] | ||
| 993 | adc $x6, $X[5] | ||
| 994 | adc \$0, $tmp | ||
| 995 | |||
| 996 | mov $X[4], (+$pDst_o+8*5)($pDst) | ||
| 997 | mov $X[5], (+$pDst_o+8*6)($pDst) | ||
| 998 | |||
| 999 | # %%tmp has 0/1 in column 7 | ||
| 1000 | # %%A6 has a full value in column 7 | ||
| 1001 | |||
| 1002 | mov (+$pDst_o+8*7)($pDst), $X[0] | ||
| 1003 | mov (+$pDst_o+8*8)($pDst), $X[1] | ||
| 1004 | mov (+$pDst_o+8*9)($pDst), $X[2] | ||
| 1005 | mov (+$pDst_o+8*10)($pDst), $X[3] | ||
| 1006 | mov (+$pDst_o+8*11)($pDst), $X[4] | ||
| 1007 | mov (+$pDst_o+8*12)($pDst), $X[5] | ||
| 1008 | mov (+$pDst_o+8*13)($pDst), $x6 | ||
| 1009 | mov (+$pDst_o+8*14)($pDst), $x7 | ||
| 1010 | |||
| 1011 | mov $X[7], %rax | ||
| 1012 | mul %rax | ||
| 1013 | mov %rax, $X[7] | ||
| 1014 | mov %rdx, $A | ||
| 1015 | |||
| 1016 | add $X[0], $X[0] | ||
| 1017 | adc $X[1], $X[1] | ||
| 1018 | adc $X[2], $X[2] | ||
| 1019 | adc $X[3], $X[3] | ||
| 1020 | adc $X[4], $X[4] | ||
| 1021 | adc $X[5], $X[5] | ||
| 1022 | adc $x6, $x6 | ||
| 1023 | adc $x7, $x7 | ||
| 1024 | adc \$0, $A | ||
| 1025 | |||
| 1026 | add $tmp, $X[0] | ||
| 1027 | |||
| 1028 | mov (+8*4)($pA), %rax | ||
| 1029 | mul %rax | ||
| 1030 | |||
| 1031 | add $X[6], $X[0] | ||
| 1032 | adc %rax, $X[1] | ||
| 1033 | adc \$0, %rdx | ||
| 1034 | |||
| 1035 | mov %rdx, $tmp | ||
| 1036 | |||
| 1037 | mov $X[0], (+$pDst_o+8*7)($pDst) | ||
| 1038 | mov $X[1], (+$pDst_o+8*8)($pDst) | ||
| 1039 | |||
| 1040 | mov (+8*5)($pA), %rax | ||
| 1041 | mul %rax | ||
| 1042 | |||
| 1043 | add $tmp, $X[2] | ||
| 1044 | adc %rax, $X[3] | ||
| 1045 | adc \$0, %rdx | ||
| 1046 | |||
| 1047 | mov %rdx, $tmp | ||
| 1048 | |||
| 1049 | mov $X[2], (+$pDst_o+8*9)($pDst) | ||
| 1050 | mov $X[3], (+$pDst_o+8*10)($pDst) | ||
| 1051 | |||
| 1052 | mov (+8*6)($pA), %rax | ||
| 1053 | mul %rax | ||
| 1054 | |||
| 1055 | add $tmp, $X[4] | ||
| 1056 | adc %rax, $X[5] | ||
| 1057 | adc \$0, %rdx | ||
| 1058 | |||
| 1059 | mov $X[4], (+$pDst_o+8*11)($pDst) | ||
| 1060 | mov $X[5], (+$pDst_o+8*12)($pDst) | ||
| 1061 | |||
| 1062 | add %rdx, $x6 | ||
| 1063 | adc $X[7], $x7 | ||
| 1064 | adc \$0, $A | ||
| 1065 | |||
| 1066 | mov $x6, (+$pDst_o+8*13)($pDst) | ||
| 1067 | mov $x7, (+$pDst_o+8*14)($pDst) | ||
| 1068 | mov $A, (+$pDst_o+8*15)($pDst) | ||
| 1069 | ___ | ||
| 1070 | } | ||
| 1071 | |||
| 1072 | # | ||
| 1073 | # sqr_reduce: subroutine to compute Result = reduce(Result * Result) | ||
| 1074 | # | ||
| 1075 | # input and result also in: r9, r8, r15, r14, r13, r12, r11, r10 | ||
| 1076 | # | ||
| 1077 | $code.=<<___; | ||
| 1078 | .type sqr_reduce,\@abi-omnipotent | ||
| 1079 | .align 16 | ||
| 1080 | sqr_reduce: | ||
| 1081 | mov (+$pResult_offset+8)(%rsp), %rcx | ||
| 1082 | ___ | ||
| 1083 | &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi"); | ||
| 1084 | $code.=<<___; | ||
| 1085 | # tail recursion optimization: jmp to mont_reduce and return from there | ||
| 1086 | jmp mont_reduce | ||
| 1087 | # call mont_reduce | ||
| 1088 | # ret | ||
| 1089 | .size sqr_reduce,.-sqr_reduce | ||
| 1090 | ___ | ||
| 1091 | }}} | ||
| 1092 | |||
| 1093 | # | ||
| 1094 | # MAIN FUNCTION | ||
| 1095 | # | ||
| 1096 | |||
| 1097 | #mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */ | ||
| 1098 | # UINT64 *g, /* 512 bits, 8 qwords */ | ||
| 1099 | # UINT64 *exp, /* 512 bits, 8 qwords */ | ||
| 1100 | # struct mod_ctx_512 *data) | ||
| 1101 | |||
| 1102 | # window size = 5 | ||
| 1103 | # table size = 2^5 = 32 | ||
| 1104 | #table_entries equ 32 | ||
| 1105 | #table_size equ table_entries * 8 | ||
| 1106 | $code.=<<___; | ||
| 1107 | .globl mod_exp_512 | ||
| 1108 | .type mod_exp_512,\@function,4 | ||
| 1109 | mod_exp_512: | ||
| 1110 | push %rbp | ||
| 1111 | push %rbx | ||
| 1112 | push %r12 | ||
| 1113 | push %r13 | ||
| 1114 | push %r14 | ||
| 1115 | push %r15 | ||
| 1116 | |||
| 1117 | # adjust stack down and then align it with cache boundary | ||
| 1118 | mov %rsp, %r8 | ||
| 1119 | sub \$$mem_size, %rsp | ||
| 1120 | and \$-64, %rsp | ||
| 1121 | |||
| 1122 | # store previous stack pointer and arguments | ||
| 1123 | mov %r8, (+$rsp_offset)(%rsp) | ||
| 1124 | mov %rdi, (+$pResult_offset)(%rsp) | ||
| 1125 | mov %rsi, (+$pG_offset)(%rsp) | ||
| 1126 | mov %rcx, (+$pData_offset)(%rsp) | ||
| 1127 | .Lbody: | ||
| 1128 | # transform g into montgomery space | ||
| 1129 | # GT = reduce(g * C2) = reduce(g * (2^256)) | ||
| 1130 | # reduce expects to have the input in [tmp16] | ||
| 1131 | pxor %xmm4, %xmm4 | ||
| 1132 | movdqu (+16*0)(%rsi), %xmm0 | ||
| 1133 | movdqu (+16*1)(%rsi), %xmm1 | ||
| 1134 | movdqu (+16*2)(%rsi), %xmm2 | ||
| 1135 | movdqu (+16*3)(%rsi), %xmm3 | ||
| 1136 | movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp) | ||
| 1137 | movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp) | ||
| 1138 | movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp) | ||
| 1139 | movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp) | ||
| 1140 | movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp) | ||
| 1141 | movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp) | ||
| 1142 | movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp) | ||
| 1143 | movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp) | ||
| 1144 | |||
| 1145 | # load pExp before rdx gets blown away | ||
| 1146 | movdqu (+16*0)(%rdx), %xmm0 | ||
| 1147 | movdqu (+16*1)(%rdx), %xmm1 | ||
| 1148 | movdqu (+16*2)(%rdx), %xmm2 | ||
| 1149 | movdqu (+16*3)(%rdx), %xmm3 | ||
| 1150 | |||
| 1151 | lea (+$GT_offset)(%rsp), %rbx | ||
| 1152 | mov %rbx, (+$red_result_addr_offset)(%rsp) | ||
| 1153 | call mont_reduce | ||
| 1154 | |||
| 1155 | # Initialize tmp = C | ||
| 1156 | lea (+$tmp_offset)(%rsp), %rcx | ||
| 1157 | xor %rax, %rax | ||
| 1158 | mov %rax, (+8*0)(%rcx) | ||
| 1159 | mov %rax, (+8*1)(%rcx) | ||
| 1160 | mov %rax, (+8*3)(%rcx) | ||
| 1161 | mov %rax, (+8*4)(%rcx) | ||
| 1162 | mov %rax, (+8*5)(%rcx) | ||
| 1163 | mov %rax, (+8*6)(%rcx) | ||
| 1164 | mov %rax, (+8*7)(%rcx) | ||
| 1165 | mov %rax, (+$exp_offset+8*8)(%rsp) | ||
| 1166 | movq \$1, (+8*2)(%rcx) | ||
| 1167 | |||
| 1168 | lea (+$garray_offset)(%rsp), %rbp | ||
| 1169 | mov %rcx, %rsi # pTmp | ||
| 1170 | mov %rbp, %rdi # Garray[][0] | ||
| 1171 | ___ | ||
| 1172 | |||
| 1173 | &swizzle("%rdi", "%rcx", "%rax", "%rbx"); | ||
| 1174 | |||
| 1175 | # for (rax = 31; rax != 0; rax--) { | ||
| 1176 | # tmp = reduce(tmp * G) | ||
| 1177 | # swizzle(pg, tmp); | ||
| 1178 | # pg += 2; } | ||
| 1179 | $code.=<<___; | ||
| 1180 | mov \$31, %rax | ||
| 1181 | mov %rax, (+$i_offset)(%rsp) | ||
| 1182 | mov %rbp, (+$pg_offset)(%rsp) | ||
| 1183 | # rsi -> pTmp | ||
| 1184 | mov %rsi, (+$red_result_addr_offset)(%rsp) | ||
| 1185 | mov (+8*0)(%rsi), %r10 | ||
| 1186 | mov (+8*1)(%rsi), %r11 | ||
| 1187 | mov (+8*2)(%rsi), %r12 | ||
| 1188 | mov (+8*3)(%rsi), %r13 | ||
| 1189 | mov (+8*4)(%rsi), %r14 | ||
| 1190 | mov (+8*5)(%rsi), %r15 | ||
| 1191 | mov (+8*6)(%rsi), %r8 | ||
| 1192 | mov (+8*7)(%rsi), %r9 | ||
| 1193 | init_loop: | ||
| 1194 | lea (+$GT_offset)(%rsp), %rdi | ||
| 1195 | call mont_mul_a3b | ||
| 1196 | lea (+$tmp_offset)(%rsp), %rsi | ||
| 1197 | mov (+$pg_offset)(%rsp), %rbp | ||
| 1198 | add \$2, %rbp | ||
| 1199 | mov %rbp, (+$pg_offset)(%rsp) | ||
| 1200 | mov %rsi, %rcx # rcx = rsi = addr of tmp | ||
| 1201 | ___ | ||
| 1202 | |||
| 1203 | &swizzle("%rbp", "%rcx", "%rax", "%rbx"); | ||
| 1204 | $code.=<<___; | ||
| 1205 | mov (+$i_offset)(%rsp), %rax | ||
| 1206 | sub \$1, %rax | ||
| 1207 | mov %rax, (+$i_offset)(%rsp) | ||
| 1208 | jne init_loop | ||
| 1209 | |||
| 1210 | # | ||
| 1211 | # Copy exponent onto stack | ||
| 1212 | movdqa %xmm0, (+$exp_offset+16*0)(%rsp) | ||
| 1213 | movdqa %xmm1, (+$exp_offset+16*1)(%rsp) | ||
| 1214 | movdqa %xmm2, (+$exp_offset+16*2)(%rsp) | ||
| 1215 | movdqa %xmm3, (+$exp_offset+16*3)(%rsp) | ||
| 1216 | |||
| 1217 | |||
| 1218 | # | ||
| 1219 | # Do exponentiation | ||
| 1220 | # Initialize result to G[exp{511:507}] | ||
| 1221 | mov (+$exp_offset+62)(%rsp), %eax | ||
| 1222 | mov %rax, %rdx | ||
| 1223 | shr \$11, %rax | ||
| 1224 | and \$0x07FF, %edx | ||
| 1225 | mov %edx, (+$exp_offset+62)(%rsp) | ||
| 1226 | lea (+$garray_offset)(%rsp,%rax,2), %rsi | ||
| 1227 | mov (+$pResult_offset)(%rsp), %rdx | ||
| 1228 | ___ | ||
| 1229 | |||
| 1230 | &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax"); | ||
| 1231 | |||
| 1232 | # | ||
| 1233 | # Loop variables | ||
| 1234 | # rcx = [loop_idx] = index: 510-5 to 0 by 5 | ||
| 1235 | $code.=<<___; | ||
| 1236 | movq \$505, (+$loop_idx_offset)(%rsp) | ||
| 1237 | |||
| 1238 | mov (+$pResult_offset)(%rsp), %rcx | ||
| 1239 | mov %rcx, (+$red_result_addr_offset)(%rsp) | ||
| 1240 | mov (+8*0)(%rcx), %r10 | ||
| 1241 | mov (+8*1)(%rcx), %r11 | ||
| 1242 | mov (+8*2)(%rcx), %r12 | ||
| 1243 | mov (+8*3)(%rcx), %r13 | ||
| 1244 | mov (+8*4)(%rcx), %r14 | ||
| 1245 | mov (+8*5)(%rcx), %r15 | ||
| 1246 | mov (+8*6)(%rcx), %r8 | ||
| 1247 | mov (+8*7)(%rcx), %r9 | ||
| 1248 | jmp sqr_2 | ||
| 1249 | |||
| 1250 | main_loop_a3b: | ||
| 1251 | call sqr_reduce | ||
| 1252 | call sqr_reduce | ||
| 1253 | call sqr_reduce | ||
| 1254 | sqr_2: | ||
| 1255 | call sqr_reduce | ||
| 1256 | call sqr_reduce | ||
| 1257 | |||
| 1258 | # | ||
| 1259 | # Do multiply, first look up proper value in Garray | ||
| 1260 | mov (+$loop_idx_offset)(%rsp), %rcx # bit index | ||
| 1261 | mov %rcx, %rax | ||
| 1262 | shr \$4, %rax # rax is word pointer | ||
| 1263 | mov (+$exp_offset)(%rsp,%rax,2), %edx | ||
| 1264 | and \$15, %rcx | ||
| 1265 | shrq %cl, %rdx | ||
| 1266 | and \$0x1F, %rdx | ||
| 1267 | |||
| 1268 | lea (+$garray_offset)(%rsp,%rdx,2), %rsi | ||
| 1269 | lea (+$tmp_offset)(%rsp), %rdx | ||
| 1270 | mov %rdx, %rdi | ||
| 1271 | ___ | ||
| 1272 | |||
| 1273 | &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax"); | ||
| 1274 | # rdi = tmp = pG | ||
| 1275 | |||
| 1276 | # | ||
| 1277 | # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData) | ||
| 1278 | # result result pG M Data | ||
# Emit: multiply running result by the selected power of G, close the
# exponent-scanning loop, convert out of Montgomery form, do the final
# conditional subtraction of the modulus, and restore callee-saved
# registers / return.  (NOTE(review): heredoc text is the generated
# assembly and is reproduced verbatim.)
$code.=<<___;
	mov	(+$pResult_offset)(%rsp), %rsi
	call	mont_mul_a3b

	#
	# finish loop
	mov	(+$loop_idx_offset)(%rsp), %rcx
	sub	\$5, %rcx
	mov	%rcx, (+$loop_idx_offset)(%rsp)
	jge	main_loop_a3b

	#

end_main_loop_a3b:
	# transform result out of Montgomery space
	# result = reduce(result)
	mov	(+$pResult_offset)(%rsp), %rdx
	pxor	%xmm4, %xmm4
	movdqu	(+16*0)(%rdx), %xmm0
	movdqu	(+16*1)(%rdx), %xmm1
	movdqu	(+16*2)(%rdx), %xmm2
	movdqu	(+16*3)(%rdx), %xmm3
	movdqa	%xmm4, (+$tmp16_offset+16*4)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*5)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*6)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*7)(%rsp)
	movdqa	%xmm0, (+$tmp16_offset+16*0)(%rsp)
	movdqa	%xmm1, (+$tmp16_offset+16*1)(%rsp)
	movdqa	%xmm2, (+$tmp16_offset+16*2)(%rsp)
	movdqa	%xmm3, (+$tmp16_offset+16*3)(%rsp)
	call	mont_reduce

	# If result > m, subract m
	# load result into r15:r8
	mov	(+$pResult_offset)(%rsp), %rax
	mov	(+8*0)(%rax), %r8
	mov	(+8*1)(%rax), %r9
	mov	(+8*2)(%rax), %r10
	mov	(+8*3)(%rax), %r11
	mov	(+8*4)(%rax), %r12
	mov	(+8*5)(%rax), %r13
	mov	(+8*6)(%rax), %r14
	mov	(+8*7)(%rax), %r15

	# subtract m
	mov	(+$pData_offset)(%rsp), %rbx
	add	\$$M, %rbx

	sub	(+8*0)(%rbx), %r8
	sbb	(+8*1)(%rbx), %r9
	sbb	(+8*2)(%rbx), %r10
	sbb	(+8*3)(%rbx), %r11
	sbb	(+8*4)(%rbx), %r12
	sbb	(+8*5)(%rbx), %r13
	sbb	(+8*6)(%rbx), %r14
	sbb	(+8*7)(%rbx), %r15

	# if Carry is clear, replace result with difference
	mov	(+8*0)(%rax), %rsi
	mov	(+8*1)(%rax), %rdi
	mov	(+8*2)(%rax), %rcx
	mov	(+8*3)(%rax), %rdx
	cmovnc	%r8, %rsi
	cmovnc	%r9, %rdi
	cmovnc	%r10, %rcx
	cmovnc	%r11, %rdx
	mov	%rsi, (+8*0)(%rax)
	mov	%rdi, (+8*1)(%rax)
	mov	%rcx, (+8*2)(%rax)
	mov	%rdx, (+8*3)(%rax)

	mov	(+8*4)(%rax), %rsi
	mov	(+8*5)(%rax), %rdi
	mov	(+8*6)(%rax), %rcx
	mov	(+8*7)(%rax), %rdx
	cmovnc	%r12, %rsi
	cmovnc	%r13, %rdi
	cmovnc	%r14, %rcx
	cmovnc	%r15, %rdx
	mov	%rsi, (+8*4)(%rax)
	mov	%rdi, (+8*5)(%rax)
	mov	%rcx, (+8*6)(%rax)
	mov	%rdx, (+8*7)(%rax)

	mov	(+$rsp_offset)(%rsp), %rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbx
	mov	40(%rsi),%rbp
	lea	48(%rsi),%rsp
.Lepilogue:
	ret
.size mod_exp_512, . - mod_exp_512
___
| 1375 | |||
if ($win64) {
	# On Win64, emit a structured-exception handler plus .pdata/.xdata
	# entries so the OS can unwind through mod_exp_512's custom frame.
	# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
	#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
	my $rec="%rcx";
	my $frame="%rdx";
	my $context="%r8";
	my $disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mod_exp_512_se_handler,\@abi-omnipotent
.align	16
mod_exp_512_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lbody(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	$rsp_offset(%rax),%rax	# pull saved Rsp

	mov	32(%rax),%rbx
	mov	40(%rax),%rbp
	mov	24(%rax),%r12
	mov	16(%rax),%r13
	mov	8(%rax),%r14
	mov	0(%rax),%r15
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size mod_exp_512_se_handler,.-mod_exp_512_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_mod_exp_512
	.rva	.LSEH_end_mod_exp_512
	.rva	.LSEH_info_mod_exp_512

.section	.xdata
.align	8
.LSEH_info_mod_exp_512:
	.byte	9,0,0,0
	.rva	mod_exp_512_se_handler
___
}
| 1482 | |||
# Map a 64-bit register name to its byte ("b"), word ("w") or dword
# ("d") sub-register: numbered registers (%r8..%r15) just take the
# suffix, legacy registers (%rax, %rsi, ...) are rewritten by pattern.
# Unknown suffixes leave the name unchanged.
sub reg_part {
    my ($name, $size) = @_;

    # %r8..%r15: append the size letter directly (%r10 + "d" -> %r10d).
    return $name . $size if $name =~ /%r[0-9]+/;

    if ($size eq "b") {
        $name =~ s/%[er]([^x]+)x?/%$1l/;   # %rax -> %al, %rsi -> %sil
    }
    elsif ($size eq "w") {
        $name =~ s/%[er](.+)/%$1/;         # %rax -> %ax
    }
    elsif ($size eq "d") {
        $name =~ s/%[er](.+)/%e$1/;        # %rax -> %eax
    }
    return $name;
}
| 1491 | |||
# Post-process the accumulated assembly text and write it out:
#  - expand "%reg#b/w/d" shorthands into properly sized registers,
#  - evaluate `...` back-tick arithmetic escapes,
#  - fold "(+expr)" displacement expressions into constants.
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/(\(\+[^)]+\))/eval $1/gem;
print $code;
# STDOUT is a buffered write handle (redirected to the .s file earlier);
# an unchecked close would silently discard buffered-write errors.
close STDOUT or die "error closing STDOUT: $!";
diff --git a/src/lib/libcrypto/bn/asm/parisc-mont.pl b/src/lib/libcrypto/bn/asm/parisc-mont.pl new file mode 100644 index 0000000000..4a766a87fb --- /dev/null +++ b/src/lib/libcrypto/bn/asm/parisc-mont.pl | |||
| @@ -0,0 +1,993 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # On PA-7100LC this module performs ~90-50% better, less for longer | ||
| 11 | # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means | ||
| 12 | # that compiler utilized xmpyu instruction to perform 32x32=64-bit | ||
| 13 | # multiplication, which in turn means that "baseline" performance was | ||
| 14 | # optimal in respect to instruction set capabilities. Fair comparison | ||
| 15 | # with vendor compiler is problematic, because OpenSSL doesn't define | ||
| 16 | # BN_LLONG [presumably] for historical reasons, which drives compiler | ||
# toward 4 times 16x16=32-bit multiplications [plus complementary
| 18 | # shifts and additions] instead. This means that you should observe | ||
| 19 | # several times improvement over code generated by vendor compiler | ||
| 20 | # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual | ||
| 21 | # improvement coefficient was never collected on PA-7100LC, or any | ||
| 22 | # other 1.1 CPU, because I don't have access to such machine with | ||
| 23 | # vendor compiler. But to give you a taste, PA-RISC 1.1 code path | ||
| 24 | # reportedly outperformed code generated by cc +DA1.1 +O3 by factor | ||
| 25 | # of ~5x on PA-8600. | ||
| 26 | # | ||
| 27 | # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is | ||
| 28 | # reportedly ~2x faster than vendor compiler generated code [according | ||
| 29 | # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of | ||
| 30 | # this implementation is actually 32-bit one, in the sense that it | ||
| 31 | # operates on 32-bit values. But pa-risc2[W].s operates on arrays of | ||
| 32 | # 64-bit BN_LONGs... How do they interoperate then? No problem. This | ||
| 33 | # module picks halves of 64-bit values in reverse order and pretends | ||
| 34 | # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure" | ||
| 35 | # 64-bit code such as pa-risc2[W].s then? Well, the thing is that | ||
| 36 | # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do, | ||
| 37 | # i.e. there is no "wider" multiplication like on most other 64-bit | ||
| 38 | # platforms. This means that even being effectively 32-bit, this | ||
| 39 | # implementation performs "64-bit" computational task in same amount | ||
| 40 | # of arithmetic operations, most notably multiplications. It requires | ||
| 41 | # more memory references, most notably to tp[num], but this doesn't | ||
| 42 | # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC | ||
| 43 | # 2.0 code path, provides virtually same performance as pa-risc2[W].s: | ||
| 44 | # it's ~10% better for shortest key length and ~10% worse for longest | ||
| 45 | # one. | ||
| 46 | # | ||
| 47 | # In case it wasn't clear. The module has two distinct code paths: | ||
| 48 | # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit | ||
| 49 | # additions and 64-bit integer loads, not to mention specific | ||
| 50 | # instruction scheduling. In 64-bit build naturally only 2.0 code path | ||
| 51 | # is assembled. In 32-bit application context both code paths are | ||
| 52 | # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path | ||
| 53 | # is taken automatically. Also, in 32-bit build the module imposes | ||
# a couple of limitations: vector lengths have to be even and vector
# addresses have to be 64-bit aligned. Normally neither is a problem:
| 56 | # most common key lengths are even and vectors are commonly malloc-ed, | ||
| 57 | # which ensures alignment. | ||
| 58 | # | ||
| 59 | # Special thanks to polarhome.com for providing HP-UX account on | ||
| 60 | # PA-RISC 1.1 machine, and to correspondent who chose to remain | ||
| 61 | # anonymous for testing the code on PA-RISC 2.0 machine. | ||
| 62 | |||
# Locate the directory this script resides in (used below to find
# ../../opensslconf.h); $dir stays undef if $0 has no path component.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

$flavour = shift;	# "...64..." selects the 64-bit (2.0W) ABI
$output  = shift;	# path of the assembly file to generate

# Redirect all emitted code to the output file.  Use checked 3-arg open:
# the historical 2-arg `open STDOUT,">$output"` is mode-injection-prone
# and fails silently, leaving an empty/partial .s file.
open STDOUT, '>', $output or die "can't open $output: $!";

if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$BN_SZ		=$SIZE_T;
} else {
	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$BN_SZ		=$SIZE_T;
	# 32-bit build with 64-bit BN_ULONG: probe opensslconf.h for
	# SIXTY_FOUR_BIT and switch to the 2.0 code model if found.
	# (lexical 3-arg open instead of the bareword 2-arg original)
	if (open my $conf, '<', "${dir}../../opensslconf.h") {
		while (<$conf>) {
			if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
				$BN_SZ=8;
				$LEVEL="2.0";
				last;
			}
		}
		close $conf;
	}
}
| 101 | |||
# Stack-frame layout.
$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
				# [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;	# offset of local area within the frame
$FRAME+=32;			# local variables

# Caller-saved temporaries / loop bookkeeping.
$tp="%r31";			# pointer into tp[] scratch vector
$ti1="%r29";
$ti0="%r28";

# Incoming arguments: bn_mul_mont(rp, ap, bp, np, n0, num).
$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22";			# passed through stack in 32-bit
$num="%r21";			# passed through stack in 32-bit
$idx="%r20";			# byte index j, counts up toward 0
$arrsz="%r19";			# vector size in bytes

# Callee-saved accumulators (saved in the prologue).
$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";

$fp="%r3";			# frame pointer
$hi1="%r2";
$hi0="%r1";

$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s

# FP registers used for 32x32=64-bit xmpyu multiplies.
$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
| 136 | |||
# Emit the bn_mul_mont entry: assembler directives, standard prologue
# (save rp and callee-saved %r3-%r10, establish $fp), stack-argument
# pickup, input sanity checks, tp[] allocation and the first
# ap[0,1]*bp[0] / np[0,1]*m multiplies that prime the pipeline.
# Heredoc text (the generated assembly) is reproduced verbatim.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
bn_mul_mont
	.PROC
	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	ldo	-$FRAME(%sp),$fp
___
# 32-bit ABI: 5th/6th arguments (n0, num) arrive on the caller's stack.
$code.=<<___ if ($SIZE_T==4);
	ldw	`-$FRAME_MARKER-4`($fp),$n0
	ldw	`-$FRAME_MARKER-8`($fp),$num
	nop
	nop					; alignment
___
# 32-bit BN_ULONG: reject short vectors, odd num or unaligned ap/np —
# returning 0 in %r28 signals "unhandled" to the C fallback.
$code.=<<___ if ($BN_SZ==4);
	comiclr,<=	6,$num,%r0		; are vectors long enough?
	b		L\$abort
	ldi		0,%r28			; signal "unhandled"
	add,ev		%r0,$num,$num		; is $num even?
	b		L\$abort
	nop
	or		$ap,$np,$ti1
	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
	b		L\$abort
	nop
	nop					; alignment
	nop

	fldws		0($n0),${fn0}
	fldws,ma	4($bp),${fbi}		; bp[0]
___
# 64-bit BN_ULONG: this core still works on 32-bit halves (in flipped
# word order), so num is doubled here.
$code.=<<___ if ($BN_SZ==8);
	comib,>		3,$num,L\$abort		; are vectors long enough?
	ldi		0,%r28			; signal "unhandled"
	addl		$num,$num,$num		; I operate on 32-bit values

	fldws		4($n0),${fn0}		; only low part of n0
	fldws		4($bp),${fbi}		; bp[0] in flipped word order
___
$code.=<<___;
	fldds		0($ap),${fai}		; ap[0,1]
	fldds		0($np),${fni}		; np[0,1]

	sh2addl		$num,%r0,$arrsz
	ldi		31,$hi0
	ldo		36($arrsz),$hi1		; space for tp[num+1]
	andcm		$hi1,$hi0,$hi1		; align
	addl		$hi1,%sp,%sp
	$PUSH		$fp,-$SIZE_T(%sp)

	ldo		`$LOCALS+16`($fp),$xfer
	ldo		`$LOCALS+32+4`($fp),$tp

	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
	xmpyu		${fn0},${fab0}R,${fm0}

	addl		$arrsz,$ap,$ap		; point at the end
	addl		$arrsz,$np,$np
	subi		0,$arrsz,$idx		; j=0
	ldo		8($idx),$idx		; j++++

	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)
	fstds		${fab1},0($xfer)
	fstds		${fnm1},8($xfer)
	flddx		$idx($ap),${fai}	; ap[2,3]
	flddx		$idx($np),${fni}	; np[2,3]
___
# 32-bit build only: run-time CPU dispatch.  extrd,u is a 2.0-only
# instruction that 1.x CPUs trap on as a nullifying no-op — presumably
# how 2.0 is detected here; on pre-2.0 parts control branches to the
# L\$parisc11 path emitted later.  (NOTE(review): dispatch trick
# inferred from the inline comments — confirm against PA-RISC manuals.)
$code.=<<___ if ($BN_SZ==4);
	mtctl		$hi0,%cr11		; $hi0 still holds 31
	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
	b		L\$parisc11
	nop
___
# PA-RISC 2.0 first pass (i=0): compute tp[] = ap[]*bp[0] + np[]*m,
# software-pipelined two words per iteration through $xfer.
$code.=<<___;		# PA-RISC 2.0 code-path
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)

	extrd,u		$ab0,31,32,$hi0
	extrd,u		$ab0,63,32,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	ldo		8($idx),$idx		; j++++
	addl		$ab0,$nm0,$nm0		; low part is discarded
	extrd,u		$nm0,31,32,$hi1

L\$1st
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	stw		$nm1,-4($tp)		; tp[j-1]
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)		; tp[j-1]
	addib,<>	8,$idx,L\$1st		; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	addl		$ab1,$nm1,$nm1
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	stw		$nm1,-4($tp)		; tp[j-1]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)		; tp[j-1]

	ldo		-1($num),$num		; i--
	subi		0,$arrsz,$idx		; j=0
___
# Load bp[1]; 64-bit BN_ULONG builds read the halves in flipped word
# order, so the second half sits at offset 0 (no post-increment here).
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}		; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
	fldws		0($bp),${fbi}		; bp[1] in flipped word order
___
# Finish the first pass (store tp[num-1] and the top two carry words),
# then enter the i-loop: each L\$outer iteration accumulates
# ap[]*bp[i] + np[]*m into tp[], with the inner j-loop (L\$inner)
# again pipelined two words per turn through $xfer.
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0,1]
	flddx		$idx($np),${fni}	; np[0,1]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldo		8($idx),$idx		; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nm1,-4($tp)		; tp[j-1]

	fcpy,sgl	%fr0,${fti}L		; zero high part
	fcpy,sgl	%fr0,${fab0}L
	addl		$hi1,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	flddx		$idx($ap),${fai}	; ap[2]
	flddx		$idx($np),${fni}	; np[2]
	ldo		8($idx),$idx		; j++++
	ldd		-16($xfer),$ab0		; 33-bit value
	ldd		-8($xfer),$nm0
	ldw		0($xfer),$hi0		; high part

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	extrd,u		$ab0,31,32,$ti0		; carry bit
	extrd,u		$ab0,63,32,$ab0
	fstds		${fab1},0($xfer)
	addl		$ti0,$hi0,$hi0		; account carry bit
	fstds		${fnm1},8($xfer)
	addl		$ab0,$nm0,$nm0		; low part is discarded
	ldw		0($tp),$ti1		; tp[1]
	extrd,u		$nm0,31,32,$hi1
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)

L\$inner
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldw		4($tp),$ti0		; tp[j]
	stw		$nm1,-4($tp)		; tp[j-1]

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ti0,$ti0
	addl		$ti0,$ab0,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,31,32,$hi0
	extrd,u		$nm1,31,32,$hi1
	ldw		8($tp),$ti1		; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)		; tp[j-1]
	addib,<>	8,$idx,L\$inner		; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldw		4($tp),$ti0		; tp[j]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	addl		$ti0,$ab0,$ab0
	stw		$nm1,-4($tp)		; tp[j-1]
	extrd,u		$ab0,31,32,$hi0
	ldw		8($tp),$ti1		; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)		; tp[j-1]

	addib,=		-1,$num,L\$outerdone	; i--
	subi		0,$arrsz,$idx		; j=0
___
# Advance to bp[i+1]; the 64-bit-BN_ULONG variant walks the halves of
# each bp word in flipped order, alternating offsets +12/-4.
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}		; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
	ldi		12,$ti0			; bp[i] in flipped word order
	addl,ev		%r0,$num,$num
	ldi		-4,$ti0
	addl		$ti0,$bp,$bp
	fldws		0($bp),${fbi}
___
# Outer-loop tail: flush the last two tp words plus carries, compute
# the next m = tp[0]*n0 via double-precision add (the 33-bit trick),
# and loop back; after the last i, fall into L\$outerdone.
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0]
	addl		$hi0,$ab1,$ab1
	flddx		$idx($np),${fni}	; np[0]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldo		8($idx),$idx		; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0		; tp[j]

	addl		$hi1,$nm1,$nm1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fcpy,sgl	%fr0,${fti}L		; zero high part
	fcpy,sgl	%fr0,${fab0}L
	stw		$nm1,-4($tp)		; tp[j-1]

	fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	addl		$hi1,$hi0,$hi0
	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer
	ldo		`$LOCALS+32+4`($fp),$tp

L\$outerdone
	addl		$hi0,$ab1,$ab1
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldw		4($tp),$ti0		; tp[j]

	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	stw		$nm1,-4($tp)		; tp[j-1]

	addl		$hi1,$hi0,$hi0
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32`($fp),$tp
	sub		%r0,%r0,%r0		; clear borrow
___
# Final reduction: compute tp - np with borrow propagation.
$code.=<<___ if ($BN_SZ==4);
	ldws,ma		4($tp),$ti0
	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
	b		L\$sub_pa11
	addl		$tp,$arrsz,$tp
L\$sub
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1
	ldo		-4($tp),$tp
___
$code.=<<___ if ($BN_SZ==8);
	ldd,ma		8($tp),$ti0
L\$sub
	ldd		$idx($np),$hi0
	shrpd		$ti0,$ti0,32,$ti0	; flip word order
	std		$ti0,-8($tp)		; save flipped value
	sub,db		$ti0,$hi0,$hi1
	ldd,ma		8($tp),$ti0
	addib,<>	8,$idx,L\$sub
	std,ma		$hi1,8($rp)

	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
	sub,db		$ti0,%r0,$hi1
	ldo		-8($tp),$tp
___
# Branch-free select: final borrow ($hi1 is all-ones/all-zeros mask)
# picks tp or rp as the source, then copy out and wipe tp[].
$code.=<<___;
	and		$tp,$hi1,$ap
	andcm		$rp,$hi1,$bp
	or		$ap,$bp,$np

	sub		$rp,$arrsz,$rp		; rewind rp
	subi		0,$arrsz,$idx
	ldo		`$LOCALS+32`($fp),$tp
L\$copy
	ldd		$idx($np),$hi0
	std,ma		%r0,8($tp)
	addib,<>	8,$idx,.-8		; L\$copy
	std,ma		$hi0,8($rp)
___
| 544 | |||
| 545 | if ($BN_SZ==4) { # PA-RISC 1.1 code-path | ||
| 546 | $ablo=$ab0; | ||
| 547 | $abhi=$ab1; | ||
| 548 | $nmlo0=$nm0; | ||
| 549 | $nmhi0=$nm1; | ||
| 550 | $nmlo1="%r9"; | ||
| 551 | $nmhi1="%r8"; | ||
| 552 | |||
| 553 | $code.=<<___; | ||
| 554 | b L\$done | ||
| 555 | nop | ||
| 556 | |||
| 557 | .ALIGN 8 | ||
| 558 | L\$parisc11 | ||
| 559 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
| 560 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 561 | ldw -12($xfer),$ablo | ||
| 562 | ldw -16($xfer),$hi0 | ||
| 563 | ldw -4($xfer),$nmlo0 | ||
| 564 | ldw -8($xfer),$nmhi0 | ||
| 565 | fstds ${fab0},-16($xfer) | ||
| 566 | fstds ${fnm0},-8($xfer) | ||
| 567 | |||
| 568 | ldo 8($idx),$idx ; j++++ | ||
| 569 | add $ablo,$nmlo0,$nmlo0 ; discarded | ||
| 570 | addc %r0,$nmhi0,$hi1 | ||
| 571 | ldw 4($xfer),$ablo | ||
| 572 | ldw 0($xfer),$abhi | ||
| 573 | nop | ||
| 574 | |||
| 575 | L\$1st_pa11 | ||
| 576 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] | ||
| 577 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
| 578 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
| 579 | flddx $idx($np),${fni} ; np[j,j+1] | ||
| 580 | add $hi0,$ablo,$ablo | ||
| 581 | ldw 12($xfer),$nmlo1 | ||
| 582 | addc %r0,$abhi,$hi0 | ||
| 583 | ldw 8($xfer),$nmhi1 | ||
| 584 | add $ablo,$nmlo1,$nmlo1 | ||
| 585 | fstds ${fab1},0($xfer) | ||
| 586 | addc %r0,$nmhi1,$nmhi1 | ||
| 587 | fstds ${fnm1},8($xfer) | ||
| 588 | add $hi1,$nmlo1,$nmlo1 | ||
| 589 | ldw -12($xfer),$ablo | ||
| 590 | addc %r0,$nmhi1,$hi1 | ||
| 591 | ldw -16($xfer),$abhi | ||
| 592 | |||
| 593 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
| 594 | ldw -4($xfer),$nmlo0 | ||
| 595 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 596 | ldw -8($xfer),$nmhi0 | ||
| 597 | add $hi0,$ablo,$ablo | ||
| 598 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 599 | addc %r0,$abhi,$hi0 | ||
| 600 | fstds ${fab0},-16($xfer) | ||
| 601 | add $ablo,$nmlo0,$nmlo0 | ||
| 602 | fstds ${fnm0},-8($xfer) | ||
| 603 | addc %r0,$nmhi0,$nmhi0 | ||
| 604 | ldw 0($xfer),$abhi | ||
| 605 | add $hi1,$nmlo0,$nmlo0 | ||
| 606 | ldw 4($xfer),$ablo | ||
| 607 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 608 | addib,<> 8,$idx,L\$1st_pa11 ; j++++ | ||
| 609 | addc %r0,$nmhi0,$hi1 | ||
| 610 | |||
| 611 | ldw 8($xfer),$nmhi1 | ||
| 612 | ldw 12($xfer),$nmlo1 | ||
| 613 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] | ||
| 614 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
| 615 | add $hi0,$ablo,$ablo | ||
| 616 | fstds ${fab1},0($xfer) | ||
| 617 | addc %r0,$abhi,$hi0 | ||
| 618 | fstds ${fnm1},8($xfer) | ||
| 619 | add $ablo,$nmlo1,$nmlo1 | ||
| 620 | ldw -16($xfer),$abhi | ||
| 621 | addc %r0,$nmhi1,$nmhi1 | ||
| 622 | ldw -12($xfer),$ablo | ||
| 623 | add $hi1,$nmlo1,$nmlo1 | ||
| 624 | ldw -8($xfer),$nmhi0 | ||
| 625 | addc %r0,$nmhi1,$hi1 | ||
| 626 | ldw -4($xfer),$nmlo0 | ||
| 627 | |||
| 628 | add $hi0,$ablo,$ablo | ||
| 629 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 630 | addc %r0,$abhi,$hi0 | ||
| 631 | ldw 0($xfer),$abhi | ||
| 632 | add $ablo,$nmlo0,$nmlo0 | ||
| 633 | ldw 4($xfer),$ablo | ||
| 634 | addc %r0,$nmhi0,$nmhi0 | ||
| 635 | ldws,mb 8($xfer),$nmhi1 | ||
| 636 | add $hi1,$nmlo0,$nmlo0 | ||
| 637 | ldw 4($xfer),$nmlo1 | ||
| 638 | addc %r0,$nmhi0,$hi1 | ||
| 639 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 640 | |||
| 641 | ldo -1($num),$num ; i-- | ||
| 642 | subi 0,$arrsz,$idx ; j=0 | ||
| 643 | |||
| 644 | fldws,ma 4($bp),${fbi} ; bp[1] | ||
| 645 | flddx $idx($ap),${fai} ; ap[0,1] | ||
| 646 | flddx $idx($np),${fni} ; np[0,1] | ||
| 647 | fldws 8($xfer),${fti}R ; tp[0] | ||
| 648 | add $hi0,$ablo,$ablo | ||
| 649 | addc %r0,$abhi,$hi0 | ||
| 650 | ldo 8($idx),$idx ; j++++ | ||
| 651 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] | ||
| 652 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] | ||
| 653 | add $hi1,$nmlo1,$nmlo1 | ||
| 654 | addc %r0,$nmhi1,$nmhi1 | ||
| 655 | add $ablo,$nmlo1,$nmlo1 | ||
| 656 | addc %r0,$nmhi1,$hi1 | ||
| 657 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
| 658 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 659 | |||
| 660 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
| 661 | fcpy,sgl %fr0,${fab0}L | ||
| 662 | add $hi1,$hi0,$hi0 | ||
| 663 | addc %r0,%r0,$hi1 | ||
| 664 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
| 665 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
| 666 | stw $hi0,0($tp) | ||
| 667 | stw $hi1,4($tp) | ||
| 668 | |||
| 669 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
| 670 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
| 671 | xmpyu ${fn0},${fab0}R,${fm0} | ||
| 672 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 673 | L\$outer_pa11 | ||
| 674 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
| 675 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
| 676 | fstds ${fab0},-16($xfer) ; 33-bit value | ||
| 677 | fstds ${fnm0},-8($xfer) | ||
| 678 | flddx $idx($ap),${fai} ; ap[2,3] | ||
| 679 | flddx $idx($np),${fni} ; np[2,3] | ||
| 680 | ldw -16($xfer),$abhi ; carry bit actually | ||
| 681 | ldo 8($idx),$idx ; j++++ | ||
| 682 | ldw -12($xfer),$ablo | ||
| 683 | ldw -8($xfer),$nmhi0 | ||
| 684 | ldw -4($xfer),$nmlo0 | ||
| 685 | ldw 0($xfer),$hi0 ; high part | ||
| 686 | |||
| 687 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
| 688 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 689 | fstds ${fab1},0($xfer) | ||
| 690 | addl $abhi,$hi0,$hi0 ; account carry bit | ||
| 691 | fstds ${fnm1},8($xfer) | ||
| 692 | add $ablo,$nmlo0,$nmlo0 ; discarded | ||
| 693 | ldw 0($tp),$ti1 ; tp[1] | ||
| 694 | addc %r0,$nmhi0,$hi1 | ||
| 695 | fstds ${fab0},-16($xfer) | ||
| 696 | fstds ${fnm0},-8($xfer) | ||
| 697 | ldw 4($xfer),$ablo | ||
| 698 | ldw 0($xfer),$abhi | ||
| 699 | |||
| 700 | L\$inner_pa11 | ||
| 701 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] | ||
| 702 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
| 703 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
| 704 | flddx $idx($np),${fni} ; np[j,j+1] | ||
| 705 | add $hi0,$ablo,$ablo | ||
| 706 | ldw 4($tp),$ti0 ; tp[j] | ||
| 707 | addc %r0,$abhi,$abhi | ||
| 708 | ldw 12($xfer),$nmlo1 | ||
| 709 | add $ti1,$ablo,$ablo | ||
| 710 | ldw 8($xfer),$nmhi1 | ||
| 711 | addc %r0,$abhi,$hi0 | ||
| 712 | fstds ${fab1},0($xfer) | ||
| 713 | add $ablo,$nmlo1,$nmlo1 | ||
| 714 | fstds ${fnm1},8($xfer) | ||
| 715 | addc %r0,$nmhi1,$nmhi1 | ||
| 716 | ldw -12($xfer),$ablo | ||
| 717 | add $hi1,$nmlo1,$nmlo1 | ||
| 718 | ldw -16($xfer),$abhi | ||
| 719 | addc %r0,$nmhi1,$hi1 | ||
| 720 | |||
| 721 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
| 722 | ldw 8($tp),$ti1 ; tp[j] | ||
| 723 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
| 724 | ldw -4($xfer),$nmlo0 | ||
| 725 | add $hi0,$ablo,$ablo | ||
| 726 | ldw -8($xfer),$nmhi0 | ||
| 727 | addc %r0,$abhi,$abhi | ||
| 728 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 729 | add $ti0,$ablo,$ablo | ||
| 730 | fstds ${fab0},-16($xfer) | ||
| 731 | addc %r0,$abhi,$hi0 | ||
| 732 | fstds ${fnm0},-8($xfer) | ||
| 733 | add $ablo,$nmlo0,$nmlo0 | ||
| 734 | ldw 4($xfer),$ablo | ||
| 735 | addc %r0,$nmhi0,$nmhi0 | ||
| 736 | ldw 0($xfer),$abhi | ||
| 737 | add $hi1,$nmlo0,$nmlo0 | ||
| 738 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 739 | addib,<> 8,$idx,L\$inner_pa11 ; j++++ | ||
| 740 | addc %r0,$nmhi0,$hi1 | ||
| 741 | |||
| 742 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] | ||
| 743 | ldw 12($xfer),$nmlo1 | ||
| 744 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
| 745 | ldw 8($xfer),$nmhi1 | ||
| 746 | add $hi0,$ablo,$ablo | ||
| 747 | ldw 4($tp),$ti0 ; tp[j] | ||
| 748 | addc %r0,$abhi,$abhi | ||
| 749 | fstds ${fab1},0($xfer) | ||
| 750 | add $ti1,$ablo,$ablo | ||
| 751 | fstds ${fnm1},8($xfer) | ||
| 752 | addc %r0,$abhi,$hi0 | ||
| 753 | ldw -16($xfer),$abhi | ||
| 754 | add $ablo,$nmlo1,$nmlo1 | ||
| 755 | ldw -12($xfer),$ablo | ||
| 756 | addc %r0,$nmhi1,$nmhi1 | ||
| 757 | ldw -8($xfer),$nmhi0 | ||
| 758 | add $hi1,$nmlo1,$nmlo1 | ||
| 759 | ldw -4($xfer),$nmlo0 | ||
| 760 | addc %r0,$nmhi1,$hi1 | ||
| 761 | |||
| 762 | add $hi0,$ablo,$ablo | ||
| 763 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 764 | addc %r0,$abhi,$abhi | ||
| 765 | add $ti0,$ablo,$ablo | ||
| 766 | ldw 8($tp),$ti1 ; tp[j] | ||
| 767 | addc %r0,$abhi,$hi0 | ||
| 768 | ldw 0($xfer),$abhi | ||
| 769 | add $ablo,$nmlo0,$nmlo0 | ||
| 770 | ldw 4($xfer),$ablo | ||
| 771 | addc %r0,$nmhi0,$nmhi0 | ||
| 772 | ldws,mb 8($xfer),$nmhi1 | ||
| 773 | add $hi1,$nmlo0,$nmlo0 | ||
| 774 | ldw 4($xfer),$nmlo1 | ||
| 775 | addc %r0,$nmhi0,$hi1 | ||
| 776 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
| 777 | |||
| 778 | addib,= -1,$num,L\$outerdone_pa11; i-- | ||
| 779 | subi 0,$arrsz,$idx ; j=0 | ||
| 780 | |||
| 781 | fldws,ma 4($bp),${fbi} ; bp[i] | ||
| 782 | flddx $idx($ap),${fai} ; ap[0] | ||
| 783 | add $hi0,$ablo,$ablo | ||
| 784 | addc %r0,$abhi,$abhi | ||
| 785 | flddx $idx($np),${fni} ; np[0] | ||
| 786 | fldws 8($xfer),${fti}R ; tp[0] | ||
| 787 | add $ti1,$ablo,$ablo | ||
| 788 | addc %r0,$abhi,$hi0 | ||
| 789 | |||
| 790 | ldo 8($idx),$idx ; j++++ | ||
| 791 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] | ||
| 792 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] | ||
| 793 | ldw 4($tp),$ti0 ; tp[j] | ||
| 794 | |||
| 795 | add $hi1,$nmlo1,$nmlo1 | ||
| 796 | addc %r0,$nmhi1,$nmhi1 | ||
| 797 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
| 798 | add $ablo,$nmlo1,$nmlo1 | ||
| 799 | addc %r0,$nmhi1,$hi1 | ||
| 800 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
| 801 | fcpy,sgl %fr0,${fab0}L | ||
| 802 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 803 | |||
| 804 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
| 805 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
| 806 | add $hi1,$hi0,$hi0 | ||
| 807 | addc %r0,%r0,$hi1 | ||
| 808 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
| 809 | add $ti0,$hi0,$hi0 | ||
| 810 | addc %r0,$hi1,$hi1 | ||
| 811 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
| 812 | stw $hi0,0($tp) | ||
| 813 | stw $hi1,4($tp) | ||
| 814 | xmpyu ${fn0},${fab0}R,${fm0} | ||
| 815 | |||
| 816 | b L\$outer_pa11 | ||
| 817 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 818 | |||
| 819 | L\$outerdone_pa11 | ||
| 820 | add $hi0,$ablo,$ablo | ||
| 821 | addc %r0,$abhi,$abhi | ||
| 822 | add $ti1,$ablo,$ablo | ||
| 823 | addc %r0,$abhi,$hi0 | ||
| 824 | |||
| 825 | ldw 4($tp),$ti0 ; tp[j] | ||
| 826 | |||
| 827 | add $hi1,$nmlo1,$nmlo1 | ||
| 828 | addc %r0,$nmhi1,$nmhi1 | ||
| 829 | add $ablo,$nmlo1,$nmlo1 | ||
| 830 | addc %r0,$nmhi1,$hi1 | ||
| 831 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
| 832 | |||
| 833 | add $hi1,$hi0,$hi0 | ||
| 834 | addc %r0,%r0,$hi1 | ||
| 835 | add $ti0,$hi0,$hi0 | ||
| 836 | addc %r0,$hi1,$hi1 | ||
| 837 | stw $hi0,0($tp) | ||
| 838 | stw $hi1,4($tp) | ||
| 839 | |||
| 840 | ldo `$LOCALS+32+4`($fp),$tp | ||
| 841 | sub %r0,%r0,%r0 ; clear borrow | ||
| 842 | ldw -4($tp),$ti0 | ||
| 843 | addl $tp,$arrsz,$tp | ||
| 844 | L\$sub_pa11 | ||
| 845 | ldwx $idx($np),$hi0 | ||
| 846 | subb $ti0,$hi0,$hi1 | ||
| 847 | ldwx $idx($tp),$ti0 | ||
| 848 | addib,<> 4,$idx,L\$sub_pa11 | ||
| 849 | stws,ma $hi1,4($rp) | ||
| 850 | |||
| 851 | subb $ti0,%r0,$hi1 | ||
| 852 | ldo -4($tp),$tp | ||
| 853 | and $tp,$hi1,$ap | ||
| 854 | andcm $rp,$hi1,$bp | ||
| 855 | or $ap,$bp,$np | ||
| 856 | |||
| 857 | sub $rp,$arrsz,$rp ; rewind rp | ||
| 858 | subi 0,$arrsz,$idx | ||
| 859 | ldo `$LOCALS+32`($fp),$tp | ||
| 860 | L\$copy_pa11 | ||
| 861 | ldwx $idx($np),$hi0 | ||
| 862 | stws,ma %r0,4($tp) | ||
| 863 | addib,<> 4,$idx,L\$copy_pa11 | ||
| 864 | stws,ma $hi0,4($rp) | ||
| 865 | |||
| 866 | nop ; alignment | ||
| 867 | L\$done | ||
| 868 | ___ | ||
| 869 | } | ||
| 870 | |||
| 871 | $code.=<<___; | ||
| 872 | ldi 1,%r28 ; signal "handled" | ||
| 873 | ldo $FRAME($fp),%sp ; destroy tp[num+1] | ||
| 874 | |||
| 875 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 876 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 877 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 878 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 879 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 880 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 881 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 882 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 883 | L\$abort | ||
| 884 | bv (%r2) | ||
| 885 | .EXIT | ||
| 886 | $POPMB -$FRAME(%sp),%r3 | ||
| 887 | .PROCEND | ||
| 888 | .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 889 | ___ | ||
| 890 | |||
| 891 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
| 892 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
| 893 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
| 894 | # directive... | ||
| 895 | |||
| 896 | my $ldd = sub { | ||
| 897 | my ($mod,$args) = @_; | ||
| 898 | my $orig = "ldd$mod\t$args"; | ||
| 899 | |||
| 900 | if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 | ||
| 901 | { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; | ||
| 902 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 903 | } | ||
| 904 | elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 | ||
| 905 | { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; | ||
| 906 | $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset | ||
| 907 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
| 908 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
| 909 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 910 | } | ||
| 911 | else { "\t".$orig; } | ||
| 912 | }; | ||
| 913 | |||
| 914 | my $std = sub { | ||
| 915 | my ($mod,$args) = @_; | ||
| 916 | my $orig = "std$mod\t$args"; | ||
| 917 | |||
| 918 | if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 | ||
| 919 | { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); | ||
| 920 | $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset | ||
| 921 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
| 922 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
| 923 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 924 | } | ||
| 925 | else { "\t".$orig; } | ||
| 926 | }; | ||
| 927 | |||
| 928 | my $extrd = sub { | ||
| 929 | my ($mod,$args) = @_; | ||
| 930 | my $orig = "extrd$mod\t$args"; | ||
| 931 | |||
| 932 | # I only have ",u" completer, it's implicitly encoded... | ||
| 933 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 | ||
| 934 | { my $opcode=(0x36<<26)|($1<<21)|($4<<16); | ||
| 935 | my $len=32-$3; | ||
| 936 | $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos | ||
| 937 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
| 938 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 939 | } | ||
| 940 | elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 | ||
| 941 | { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); | ||
| 942 | my $len=32-$2; | ||
| 943 | $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len | ||
| 944 | $opcode |= (1<<13) if ($mod =~ /,\**=/); | ||
| 945 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 946 | } | ||
| 947 | else { "\t".$orig; } | ||
| 948 | }; | ||
| 949 | |||
| 950 | my $shrpd = sub { | ||
| 951 | my ($mod,$args) = @_; | ||
| 952 | my $orig = "shrpd$mod\t$args"; | ||
| 953 | |||
| 954 | if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 | ||
| 955 | { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; | ||
| 956 | my $cpos=63-$3; | ||
| 957 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa | ||
| 958 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 959 | } | ||
| 960 | else { "\t".$orig; } | ||
| 961 | }; | ||
| 962 | |||
| 963 | my $sub = sub { | ||
| 964 | my ($mod,$args) = @_; | ||
| 965 | my $orig = "sub$mod\t$args"; | ||
| 966 | |||
| 967 | if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { | ||
| 968 | my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; | ||
| 969 | $opcode|=(1<<10); # e1 | ||
| 970 | $opcode|=(1<<8); # e2 | ||
| 971 | $opcode|=(1<<5); # d | ||
| 972 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig | ||
| 973 | } | ||
| 974 | else { "\t".$orig; } | ||
| 975 | }; | ||
| 976 | |||
| 977 | sub assemble { | ||
| 978 | my ($mnemonic,$mod,$args)=@_; | ||
| 979 | my $opcode = eval("\$$mnemonic"); | ||
| 980 | |||
| 981 | ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; | ||
| 982 | } | ||
| 983 | |||
| 984 | foreach (split("\n",$code)) { | ||
| 985 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 986 | # flip word order in 64-bit mode... | ||
| 987 | s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); | ||
| 988 | # assemble 2.0 instructions in 32-bit mode... | ||
| 989 | s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); | ||
| 990 | |||
| 991 | print $_,"\n"; | ||
| 992 | } | ||
| 993 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl index 7849eae959..f9b6992ccc 100644 --- a/src/lib/libcrypto/bn/asm/ppc-mont.pl +++ b/src/lib/libcrypto/bn/asm/ppc-mont.pl | |||
| @@ -31,7 +31,6 @@ if ($flavour =~ /32/) { | |||
| 31 | $BNSZ= $BITS/8; | 31 | $BNSZ= $BITS/8; |
| 32 | $SIZE_T=4; | 32 | $SIZE_T=4; |
| 33 | $RZONE= 224; | 33 | $RZONE= 224; |
| 34 | $FRAME= $SIZE_T*16; | ||
| 35 | 34 | ||
| 36 | $LD= "lwz"; # load | 35 | $LD= "lwz"; # load |
| 37 | $LDU= "lwzu"; # load and update | 36 | $LDU= "lwzu"; # load and update |
| @@ -51,7 +50,6 @@ if ($flavour =~ /32/) { | |||
| 51 | $BNSZ= $BITS/8; | 50 | $BNSZ= $BITS/8; |
| 52 | $SIZE_T=8; | 51 | $SIZE_T=8; |
| 53 | $RZONE= 288; | 52 | $RZONE= 288; |
| 54 | $FRAME= $SIZE_T*16; | ||
| 55 | 53 | ||
| 56 | # same as above, but 64-bit mnemonics... | 54 | # same as above, but 64-bit mnemonics... |
| 57 | $LD= "ld"; # load | 55 | $LD= "ld"; # load |
| @@ -69,6 +67,9 @@ if ($flavour =~ /32/) { | |||
| 69 | $POP= $LD; | 67 | $POP= $LD; |
| 70 | } else { die "nonsense $flavour"; } | 68 | } else { die "nonsense $flavour"; } |
| 71 | 69 | ||
| 70 | $FRAME=8*$SIZE_T+$RZONE; | ||
| 71 | $LOCALS=8*$SIZE_T; | ||
| 72 | |||
| 72 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | 73 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 73 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | 74 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or |
| 74 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | 75 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or |
| @@ -89,18 +90,18 @@ $aj="r10"; | |||
| 89 | $nj="r11"; | 90 | $nj="r11"; |
| 90 | $tj="r12"; | 91 | $tj="r12"; |
| 91 | # non-volatile registers | 92 | # non-volatile registers |
| 92 | $i="r14"; | 93 | $i="r20"; |
| 93 | $j="r15"; | 94 | $j="r21"; |
| 94 | $tp="r16"; | 95 | $tp="r22"; |
| 95 | $m0="r17"; | 96 | $m0="r23"; |
| 96 | $m1="r18"; | 97 | $m1="r24"; |
| 97 | $lo0="r19"; | 98 | $lo0="r25"; |
| 98 | $hi0="r20"; | 99 | $hi0="r26"; |
| 99 | $lo1="r21"; | 100 | $lo1="r27"; |
| 100 | $hi1="r22"; | 101 | $hi1="r28"; |
| 101 | $alo="r23"; | 102 | $alo="r29"; |
| 102 | $ahi="r24"; | 103 | $ahi="r30"; |
| 103 | $nlo="r25"; | 104 | $nlo="r31"; |
| 104 | # | 105 | # |
| 105 | $nhi="r0"; | 106 | $nhi="r0"; |
| 106 | 107 | ||
| @@ -108,42 +109,48 @@ $code=<<___; | |||
| 108 | .machine "any" | 109 | .machine "any" |
| 109 | .text | 110 | .text |
| 110 | 111 | ||
| 111 | .globl .bn_mul_mont | 112 | .globl .bn_mul_mont_int |
| 112 | .align 4 | 113 | .align 4 |
| 113 | .bn_mul_mont: | 114 | .bn_mul_mont_int: |
| 114 | cmpwi $num,4 | 115 | cmpwi $num,4 |
| 115 | mr $rp,r3 ; $rp is reassigned | 116 | mr $rp,r3 ; $rp is reassigned |
| 116 | li r3,0 | 117 | li r3,0 |
| 117 | bltlr | 118 | bltlr |
| 118 | 119 | ___ | |
| 120 | $code.=<<___ if ($BNSZ==4); | ||
| 121 | cmpwi $num,32 ; longer key performance is not better | ||
| 122 | bgelr | ||
| 123 | ___ | ||
| 124 | $code.=<<___; | ||
| 119 | slwi $num,$num,`log($BNSZ)/log(2)` | 125 | slwi $num,$num,`log($BNSZ)/log(2)` |
| 120 | li $tj,-4096 | 126 | li $tj,-4096 |
| 121 | addi $ovf,$num,`$FRAME+$RZONE` | 127 | addi $ovf,$num,$FRAME |
| 122 | subf $ovf,$ovf,$sp ; $sp-$ovf | 128 | subf $ovf,$ovf,$sp ; $sp-$ovf |
| 123 | and $ovf,$ovf,$tj ; minimize TLB usage | 129 | and $ovf,$ovf,$tj ; minimize TLB usage |
| 124 | subf $ovf,$sp,$ovf ; $ovf-$sp | 130 | subf $ovf,$sp,$ovf ; $ovf-$sp |
| 131 | mr $tj,$sp | ||
| 125 | srwi $num,$num,`log($BNSZ)/log(2)` | 132 | srwi $num,$num,`log($BNSZ)/log(2)` |
| 126 | $STUX $sp,$sp,$ovf | 133 | $STUX $sp,$sp,$ovf |
| 127 | 134 | ||
| 128 | $PUSH r14,`4*$SIZE_T`($sp) | 135 | $PUSH r20,`-12*$SIZE_T`($tj) |
| 129 | $PUSH r15,`5*$SIZE_T`($sp) | 136 | $PUSH r21,`-11*$SIZE_T`($tj) |
| 130 | $PUSH r16,`6*$SIZE_T`($sp) | 137 | $PUSH r22,`-10*$SIZE_T`($tj) |
| 131 | $PUSH r17,`7*$SIZE_T`($sp) | 138 | $PUSH r23,`-9*$SIZE_T`($tj) |
| 132 | $PUSH r18,`8*$SIZE_T`($sp) | 139 | $PUSH r24,`-8*$SIZE_T`($tj) |
| 133 | $PUSH r19,`9*$SIZE_T`($sp) | 140 | $PUSH r25,`-7*$SIZE_T`($tj) |
| 134 | $PUSH r20,`10*$SIZE_T`($sp) | 141 | $PUSH r26,`-6*$SIZE_T`($tj) |
| 135 | $PUSH r21,`11*$SIZE_T`($sp) | 142 | $PUSH r27,`-5*$SIZE_T`($tj) |
| 136 | $PUSH r22,`12*$SIZE_T`($sp) | 143 | $PUSH r28,`-4*$SIZE_T`($tj) |
| 137 | $PUSH r23,`13*$SIZE_T`($sp) | 144 | $PUSH r29,`-3*$SIZE_T`($tj) |
| 138 | $PUSH r24,`14*$SIZE_T`($sp) | 145 | $PUSH r30,`-2*$SIZE_T`($tj) |
| 139 | $PUSH r25,`15*$SIZE_T`($sp) | 146 | $PUSH r31,`-1*$SIZE_T`($tj) |
| 140 | 147 | ||
| 141 | $LD $n0,0($n0) ; pull n0[0] value | 148 | $LD $n0,0($n0) ; pull n0[0] value |
| 142 | addi $num,$num,-2 ; adjust $num for counter register | 149 | addi $num,$num,-2 ; adjust $num for counter register |
| 143 | 150 | ||
| 144 | $LD $m0,0($bp) ; m0=bp[0] | 151 | $LD $m0,0($bp) ; m0=bp[0] |
| 145 | $LD $aj,0($ap) ; ap[0] | 152 | $LD $aj,0($ap) ; ap[0] |
| 146 | addi $tp,$sp,$FRAME | 153 | addi $tp,$sp,$LOCALS |
| 147 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] | 154 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] |
| 148 | $UMULH $hi0,$aj,$m0 | 155 | $UMULH $hi0,$aj,$m0 |
| 149 | 156 | ||
| @@ -205,8 +212,8 @@ L1st: | |||
| 205 | Louter: | 212 | Louter: |
| 206 | $LDX $m0,$bp,$i ; m0=bp[i] | 213 | $LDX $m0,$bp,$i ; m0=bp[i] |
| 207 | $LD $aj,0($ap) ; ap[0] | 214 | $LD $aj,0($ap) ; ap[0] |
| 208 | addi $tp,$sp,$FRAME | 215 | addi $tp,$sp,$LOCALS |
| 209 | $LD $tj,$FRAME($sp) ; tp[0] | 216 | $LD $tj,$LOCALS($sp); tp[0] |
| 210 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] | 217 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] |
| 211 | $UMULH $hi0,$aj,$m0 | 218 | $UMULH $hi0,$aj,$m0 |
| 212 | $LD $aj,$BNSZ($ap) ; ap[1] | 219 | $LD $aj,$BNSZ($ap) ; ap[1] |
| @@ -273,7 +280,7 @@ Linner: | |||
| 273 | 280 | ||
| 274 | addi $num,$num,2 ; restore $num | 281 | addi $num,$num,2 ; restore $num |
| 275 | subfc $j,$j,$j ; j=0 and "clear" XER[CA] | 282 | subfc $j,$j,$j ; j=0 and "clear" XER[CA] |
| 276 | addi $tp,$sp,$FRAME | 283 | addi $tp,$sp,$LOCALS |
| 277 | mtctr $num | 284 | mtctr $num |
| 278 | 285 | ||
| 279 | .align 4 | 286 | .align 4 |
| @@ -299,23 +306,27 @@ Lcopy: ; copy or in-place refresh | |||
| 299 | addi $j,$j,$BNSZ | 306 | addi $j,$j,$BNSZ |
| 300 | bdnz- Lcopy | 307 | bdnz- Lcopy |
| 301 | 308 | ||
| 302 | $POP r14,`4*$SIZE_T`($sp) | 309 | $POP $tj,0($sp) |
| 303 | $POP r15,`5*$SIZE_T`($sp) | ||
| 304 | $POP r16,`6*$SIZE_T`($sp) | ||
| 305 | $POP r17,`7*$SIZE_T`($sp) | ||
| 306 | $POP r18,`8*$SIZE_T`($sp) | ||
| 307 | $POP r19,`9*$SIZE_T`($sp) | ||
| 308 | $POP r20,`10*$SIZE_T`($sp) | ||
| 309 | $POP r21,`11*$SIZE_T`($sp) | ||
| 310 | $POP r22,`12*$SIZE_T`($sp) | ||
| 311 | $POP r23,`13*$SIZE_T`($sp) | ||
| 312 | $POP r24,`14*$SIZE_T`($sp) | ||
| 313 | $POP r25,`15*$SIZE_T`($sp) | ||
| 314 | $POP $sp,0($sp) | ||
| 315 | li r3,1 | 310 | li r3,1 |
| 311 | $POP r20,`-12*$SIZE_T`($tj) | ||
| 312 | $POP r21,`-11*$SIZE_T`($tj) | ||
| 313 | $POP r22,`-10*$SIZE_T`($tj) | ||
| 314 | $POP r23,`-9*$SIZE_T`($tj) | ||
| 315 | $POP r24,`-8*$SIZE_T`($tj) | ||
| 316 | $POP r25,`-7*$SIZE_T`($tj) | ||
| 317 | $POP r26,`-6*$SIZE_T`($tj) | ||
| 318 | $POP r27,`-5*$SIZE_T`($tj) | ||
| 319 | $POP r28,`-4*$SIZE_T`($tj) | ||
| 320 | $POP r29,`-3*$SIZE_T`($tj) | ||
| 321 | $POP r30,`-2*$SIZE_T`($tj) | ||
| 322 | $POP r31,`-1*$SIZE_T`($tj) | ||
| 323 | mr $sp,$tj | ||
| 316 | blr | 324 | blr |
| 317 | .long 0 | 325 | .long 0 |
| 318 | .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" | 326 | .byte 0,12,4,0,0x80,12,6,0 |
| 327 | .long 0 | ||
| 328 | |||
| 329 | .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 319 | ___ | 330 | ___ |
| 320 | 331 | ||
| 321 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 332 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl index f4093177e6..1249ce2299 100644 --- a/src/lib/libcrypto/bn/asm/ppc.pl +++ b/src/lib/libcrypto/bn/asm/ppc.pl | |||
| @@ -389,7 +389,9 @@ $data=<<EOF; | |||
| 389 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1 | 389 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1 |
| 390 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2 | 390 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2 |
| 391 | blr | 391 | blr |
| 392 | .long 0x00000000 | 392 | .long 0 |
| 393 | .byte 0,12,0x14,0,0,0,2,0 | ||
| 394 | .long 0 | ||
| 393 | 395 | ||
| 394 | # | 396 | # |
| 395 | # NOTE: The following label name should be changed to | 397 | # NOTE: The following label name should be changed to |
| @@ -814,8 +816,9 @@ $data=<<EOF; | |||
| 814 | 816 | ||
| 815 | 817 | ||
| 816 | blr | 818 | blr |
| 817 | 819 | .long 0 | |
| 818 | .long 0x00000000 | 820 | .byte 0,12,0x14,0,0,0,2,0 |
| 821 | .long 0 | ||
| 819 | 822 | ||
| 820 | # | 823 | # |
| 821 | # NOTE: The following label name should be changed to | 824 | # NOTE: The following label name should be changed to |
| @@ -966,7 +969,9 @@ $data=<<EOF; | |||
| 966 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1 | 969 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1 |
| 967 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2 | 970 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2 |
| 968 | blr | 971 | blr |
| 969 | .long 0x00000000 | 972 | .long 0 |
| 973 | .byte 0,12,0x14,0,0,0,3,0 | ||
| 974 | .long 0 | ||
| 970 | 975 | ||
| 971 | # | 976 | # |
| 972 | # NOTE: The following label name should be changed to | 977 | # NOTE: The following label name should be changed to |
| @@ -1502,7 +1507,9 @@ $data=<<EOF; | |||
| 1502 | $ST r12,`14*$BNSZ`(r3) #r[14]=c3; | 1507 | $ST r12,`14*$BNSZ`(r3) #r[14]=c3; |
| 1503 | $ST r10,`15*$BNSZ`(r3) #r[15]=c1; | 1508 | $ST r10,`15*$BNSZ`(r3) #r[15]=c1; |
| 1504 | blr | 1509 | blr |
| 1505 | .long 0x00000000 | 1510 | .long 0 |
| 1511 | .byte 0,12,0x14,0,0,0,3,0 | ||
| 1512 | .long 0 | ||
| 1506 | 1513 | ||
| 1507 | # | 1514 | # |
| 1508 | # NOTE: The following label name should be changed to | 1515 | # NOTE: The following label name should be changed to |
| @@ -1550,8 +1557,9 @@ Lppcasm_sub_adios: | |||
| 1550 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 | 1557 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 |
| 1551 | andi. r3,r3,1 # keep only last bit. | 1558 | andi. r3,r3,1 # keep only last bit. |
| 1552 | blr | 1559 | blr |
| 1553 | .long 0x00000000 | 1560 | .long 0 |
| 1554 | 1561 | .byte 0,12,0x14,0,0,0,4,0 | |
| 1562 | .long 0 | ||
| 1555 | 1563 | ||
| 1556 | # | 1564 | # |
| 1557 | # NOTE: The following label name should be changed to | 1565 | # NOTE: The following label name should be changed to |
| @@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop: | |||
| 1594 | Lppcasm_add_adios: | 1602 | Lppcasm_add_adios: |
| 1595 | addze r3,r0 #return carry bit. | 1603 | addze r3,r0 #return carry bit. |
| 1596 | blr | 1604 | blr |
| 1597 | .long 0x00000000 | 1605 | .long 0 |
| 1606 | .byte 0,12,0x14,0,0,0,4,0 | ||
| 1607 | .long 0 | ||
| 1598 | 1608 | ||
| 1599 | # | 1609 | # |
| 1600 | # NOTE: The following label name should be changed to | 1610 | # NOTE: The following label name should be changed to |
| @@ -1707,7 +1717,9 @@ Lppcasm_div8: | |||
| 1707 | Lppcasm_div9: | 1717 | Lppcasm_div9: |
| 1708 | or r3,r8,r0 | 1718 | or r3,r8,r0 |
| 1709 | blr | 1719 | blr |
| 1710 | .long 0x00000000 | 1720 | .long 0 |
| 1721 | .byte 0,12,0x14,0,0,0,3,0 | ||
| 1722 | .long 0 | ||
| 1711 | 1723 | ||
| 1712 | # | 1724 | # |
| 1713 | # NOTE: The following label name should be changed to | 1725 | # NOTE: The following label name should be changed to |
| @@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop: | |||
| 1746 | bdnz- Lppcasm_sqr_mainloop | 1758 | bdnz- Lppcasm_sqr_mainloop |
| 1747 | Lppcasm_sqr_adios: | 1759 | Lppcasm_sqr_adios: |
| 1748 | blr | 1760 | blr |
| 1749 | .long 0x00000000 | 1761 | .long 0 |
| 1750 | 1762 | .byte 0,12,0x14,0,0,0,3,0 | |
| 1763 | .long 0 | ||
| 1751 | 1764 | ||
| 1752 | # | 1765 | # |
| 1753 | # NOTE: The following label name should be changed to | 1766 | # NOTE: The following label name should be changed to |
| @@ -1850,7 +1863,9 @@ Lppcasm_mw_REM: | |||
| 1850 | Lppcasm_mw_OVER: | 1863 | Lppcasm_mw_OVER: |
| 1851 | addi r3,r12,0 | 1864 | addi r3,r12,0 |
| 1852 | blr | 1865 | blr |
| 1853 | .long 0x00000000 | 1866 | .long 0 |
| 1867 | .byte 0,12,0x14,0,0,0,4,0 | ||
| 1868 | .long 0 | ||
| 1854 | 1869 | ||
| 1855 | # | 1870 | # |
| 1856 | # NOTE: The following label name should be changed to | 1871 | # NOTE: The following label name should be changed to |
| @@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover: | |||
| 1973 | Lppcasm_maw_adios: | 1988 | Lppcasm_maw_adios: |
| 1974 | addi r3,r12,0 | 1989 | addi r3,r12,0 |
| 1975 | blr | 1990 | blr |
| 1976 | .long 0x00000000 | 1991 | .long 0 |
| 1992 | .byte 0,12,0x14,0,0,0,4,0 | ||
| 1993 | .long 0 | ||
| 1977 | .align 4 | 1994 | .align 4 |
| 1978 | EOF | 1995 | EOF |
| 1979 | $data =~ s/\`([^\`]*)\`/eval $1/gem; | 1996 | $data =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl index 3449b35855..a14e769ad0 100644 --- a/src/lib/libcrypto/bn/asm/ppc64-mont.pl +++ b/src/lib/libcrypto/bn/asm/ppc64-mont.pl | |||
| @@ -45,23 +45,40 @@ | |||
| 45 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive | 45 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive |
| 46 | # in absolute terms, but it's apparently the way Power 6 is... | 46 | # in absolute terms, but it's apparently the way Power 6 is... |
| 47 | 47 | ||
| 48 | # December 2009 | ||
| 49 | |||
| 50 | # Adapted for 32-bit build this module delivers 25-120%, yes, more | ||
| 51 | # than *twice* for longer keys, performance improvement over 32-bit | ||
| 52 | # ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes | ||
| 53 | # even 64-bit integer operations and the trouble is that most PPC | ||
| 54 | # operating systems don't preserve upper halves of general purpose | ||
| 55 | # registers upon 32-bit signal delivery. They do preserve them upon | ||
| 56 | # context switch, but not signalling:-( This means that asynchronous | ||
| 57 | # signals have to be blocked upon entry to this subroutine. Signal | ||
| 58 | # masking (and of course complementary unmasking) has quite an impact | ||
| 59 | # on performance, naturally larger for shorter keys. It's so severe | ||
| 60 | # that 512-bit key performance can be as low as 1/3 of expected one. | ||
| 61 | # This is why this routine can be engaged for longer key operations | ||
| 62 | # only on these OSes, see crypto/ppccap.c for further details. MacOS X | ||
| 63 | # is an exception from this and doesn't require signal masking, and | ||
| 64 | # that's where above improvement coefficients were collected. For | ||
| 65 | # others alternative would be to break dependence on upper halves of | ||
| 66 | # GPRs by sticking to 32-bit integer operations... | ||
| 67 | |||
| 48 | $flavour = shift; | 68 | $flavour = shift; |
| 49 | 69 | ||
| 50 | if ($flavour =~ /32/) { | 70 | if ($flavour =~ /32/) { |
| 51 | $SIZE_T=4; | 71 | $SIZE_T=4; |
| 52 | $RZONE= 224; | 72 | $RZONE= 224; |
| 53 | $FRAME= $SIZE_T*12+8*12; | 73 | $fname= "bn_mul_mont_fpu64"; |
| 54 | $fname= "bn_mul_mont_ppc64"; | ||
| 55 | 74 | ||
| 56 | $STUX= "stwux"; # store indexed and update | 75 | $STUX= "stwux"; # store indexed and update |
| 57 | $PUSH= "stw"; | 76 | $PUSH= "stw"; |
| 58 | $POP= "lwz"; | 77 | $POP= "lwz"; |
| 59 | die "not implemented yet"; | ||
| 60 | } elsif ($flavour =~ /64/) { | 78 | } elsif ($flavour =~ /64/) { |
| 61 | $SIZE_T=8; | 79 | $SIZE_T=8; |
| 62 | $RZONE= 288; | 80 | $RZONE= 288; |
| 63 | $FRAME= $SIZE_T*12+8*12; | 81 | $fname= "bn_mul_mont_fpu64"; |
| 64 | $fname= "bn_mul_mont"; | ||
| 65 | 82 | ||
| 66 | # same as above, but 64-bit mnemonics... | 83 | # same as above, but 64-bit mnemonics... |
| 67 | $STUX= "stdux"; # store indexed and update | 84 | $STUX= "stdux"; # store indexed and update |
| @@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl"; | |||
| 76 | 93 | ||
| 77 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | 94 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
| 78 | 95 | ||
| 79 | $FRAME=($FRAME+63)&~63; | 96 | $FRAME=64; # padded frame header |
| 80 | $TRANSFER=16*8; | 97 | $TRANSFER=16*8; |
| 81 | 98 | ||
| 82 | $carry="r0"; | 99 | $carry="r0"; |
| @@ -93,16 +110,16 @@ $tp="r10"; | |||
| 93 | $j="r11"; | 110 | $j="r11"; |
| 94 | $i="r12"; | 111 | $i="r12"; |
| 95 | # non-volatile registers | 112 | # non-volatile registers |
| 96 | $nap_d="r14"; # interleaved ap and np in double format | 113 | $nap_d="r22"; # interleaved ap and np in double format |
| 97 | $a0="r15"; # ap[0] | 114 | $a0="r23"; # ap[0] |
| 98 | $t0="r16"; # temporary registers | 115 | $t0="r24"; # temporary registers |
| 99 | $t1="r17"; | 116 | $t1="r25"; |
| 100 | $t2="r18"; | 117 | $t2="r26"; |
| 101 | $t3="r19"; | 118 | $t3="r27"; |
| 102 | $t4="r20"; | 119 | $t4="r28"; |
| 103 | $t5="r21"; | 120 | $t5="r29"; |
| 104 | $t6="r22"; | 121 | $t6="r30"; |
| 105 | $t7="r23"; | 122 | $t7="r31"; |
| 106 | 123 | ||
| 107 | # PPC offers enough register bank capacity to unroll inner loops twice | 124 | # PPC offers enough register bank capacity to unroll inner loops twice |
| 108 | # | 125 | # |
| @@ -132,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; | |||
| 132 | $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; | 149 | $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; |
| 133 | $dota="f8"; $dotb="f9"; | 150 | $dota="f8"; $dotb="f9"; |
| 134 | $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; | 151 | $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; |
| 135 | $N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; | 152 | $N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; |
| 136 | $T0a="f18"; $T0b="f19"; | 153 | $T0a="f24"; $T0b="f25"; |
| 137 | $T1a="f20"; $T1b="f21"; | 154 | $T1a="f26"; $T1b="f27"; |
| 138 | $T2a="f22"; $T2b="f23"; | 155 | $T2a="f28"; $T2b="f29"; |
| 139 | $T3a="f24"; $T3b="f25"; | 156 | $T3a="f30"; $T3b="f31"; |
| 140 | 157 | ||
| 141 | # sp----------->+-------------------------------+ | 158 | # sp----------->+-------------------------------+ |
| 142 | # | saved sp | | 159 | # | saved sp | |
| 143 | # +-------------------------------+ | 160 | # +-------------------------------+ |
| 144 | # | | | ||
| 145 | # +-------------------------------+ | ||
| 146 | # | 10 saved gpr, r14-r23 | | ||
| 147 | # . . | ||
| 148 | # . . | ||
| 149 | # +12*size_t +-------------------------------+ | ||
| 150 | # | 12 saved fpr, f14-f25 | | ||
| 151 | # . . | 161 | # . . |
| 152 | # . . | 162 | # +64 +-------------------------------+ |
| 153 | # +12*8 +-------------------------------+ | ||
| 154 | # | padding to 64 byte boundary | | ||
| 155 | # . . | ||
| 156 | # +X +-------------------------------+ | ||
| 157 | # | 16 gpr<->fpr transfer zone | | 163 | # | 16 gpr<->fpr transfer zone | |
| 158 | # . . | 164 | # . . |
| 159 | # . . | 165 | # . . |
| @@ -173,6 +179,16 @@ $T3a="f24"; $T3b="f25"; | |||
| 173 | # . . | 179 | # . . |
| 174 | # . . | 180 | # . . |
| 175 | # +-------------------------------+ | 181 | # +-------------------------------+ |
| 182 | # . . | ||
| 183 | # -12*size_t +-------------------------------+ | ||
| 184 | # | 10 saved gpr, r22-r31 | | ||
| 185 | # . . | ||
| 186 | # . . | ||
| 187 | # -12*8 +-------------------------------+ | ||
| 188 | # | 12 saved fpr, f20-f31 | | ||
| 189 | # . . | ||
| 190 | # . . | ||
| 191 | # +-------------------------------+ | ||
| 176 | 192 | ||
| 177 | $code=<<___; | 193 | $code=<<___; |
| 178 | .machine "any" | 194 | .machine "any" |
| @@ -181,14 +197,14 @@ $code=<<___; | |||
| 181 | .globl .$fname | 197 | .globl .$fname |
| 182 | .align 5 | 198 | .align 5 |
| 183 | .$fname: | 199 | .$fname: |
| 184 | cmpwi $num,4 | 200 | cmpwi $num,`3*8/$SIZE_T` |
| 185 | mr $rp,r3 ; $rp is reassigned | 201 | mr $rp,r3 ; $rp is reassigned |
| 186 | li r3,0 ; possible "not handled" return code | 202 | li r3,0 ; possible "not handled" return code |
| 187 | bltlr- | 203 | bltlr- |
| 188 | andi. r0,$num,1 ; $num has to be even | 204 | andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" |
| 189 | bnelr- | 205 | bnelr- |
| 190 | 206 | ||
| 191 | slwi $num,$num,3 ; num*=8 | 207 | slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) |
| 192 | li $i,-4096 | 208 | li $i,-4096 |
| 193 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num | 209 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num |
| 194 | add $tp,$tp,$num ; place for tp[num+1] | 210 | add $tp,$tp,$num ; place for tp[num+1] |
| @@ -196,35 +212,50 @@ $code=<<___; | |||
| 196 | subf $tp,$tp,$sp ; $sp-$tp | 212 | subf $tp,$tp,$sp ; $sp-$tp |
| 197 | and $tp,$tp,$i ; minimize TLB usage | 213 | and $tp,$tp,$i ; minimize TLB usage |
| 198 | subf $tp,$sp,$tp ; $tp-$sp | 214 | subf $tp,$sp,$tp ; $tp-$sp |
| 215 | mr $i,$sp | ||
| 199 | $STUX $sp,$sp,$tp ; alloca | 216 | $STUX $sp,$sp,$tp ; alloca |
| 200 | 217 | ||
| 201 | $PUSH r14,`2*$SIZE_T`($sp) | 218 | $PUSH r22,`-12*8-10*$SIZE_T`($i) |
| 202 | $PUSH r15,`3*$SIZE_T`($sp) | 219 | $PUSH r23,`-12*8-9*$SIZE_T`($i) |
| 203 | $PUSH r16,`4*$SIZE_T`($sp) | 220 | $PUSH r24,`-12*8-8*$SIZE_T`($i) |
| 204 | $PUSH r17,`5*$SIZE_T`($sp) | 221 | $PUSH r25,`-12*8-7*$SIZE_T`($i) |
| 205 | $PUSH r18,`6*$SIZE_T`($sp) | 222 | $PUSH r26,`-12*8-6*$SIZE_T`($i) |
| 206 | $PUSH r19,`7*$SIZE_T`($sp) | 223 | $PUSH r27,`-12*8-5*$SIZE_T`($i) |
| 207 | $PUSH r20,`8*$SIZE_T`($sp) | 224 | $PUSH r28,`-12*8-4*$SIZE_T`($i) |
| 208 | $PUSH r21,`9*$SIZE_T`($sp) | 225 | $PUSH r29,`-12*8-3*$SIZE_T`($i) |
| 209 | $PUSH r22,`10*$SIZE_T`($sp) | 226 | $PUSH r30,`-12*8-2*$SIZE_T`($i) |
| 210 | $PUSH r23,`11*$SIZE_T`($sp) | 227 | $PUSH r31,`-12*8-1*$SIZE_T`($i) |
| 211 | stfd f14,`12*$SIZE_T+0`($sp) | 228 | stfd f20,`-12*8`($i) |
| 212 | stfd f15,`12*$SIZE_T+8`($sp) | 229 | stfd f21,`-11*8`($i) |
| 213 | stfd f16,`12*$SIZE_T+16`($sp) | 230 | stfd f22,`-10*8`($i) |
| 214 | stfd f17,`12*$SIZE_T+24`($sp) | 231 | stfd f23,`-9*8`($i) |
| 215 | stfd f18,`12*$SIZE_T+32`($sp) | 232 | stfd f24,`-8*8`($i) |
| 216 | stfd f19,`12*$SIZE_T+40`($sp) | 233 | stfd f25,`-7*8`($i) |
| 217 | stfd f20,`12*$SIZE_T+48`($sp) | 234 | stfd f26,`-6*8`($i) |
| 218 | stfd f21,`12*$SIZE_T+56`($sp) | 235 | stfd f27,`-5*8`($i) |
| 219 | stfd f22,`12*$SIZE_T+64`($sp) | 236 | stfd f28,`-4*8`($i) |
| 220 | stfd f23,`12*$SIZE_T+72`($sp) | 237 | stfd f29,`-3*8`($i) |
| 221 | stfd f24,`12*$SIZE_T+80`($sp) | 238 | stfd f30,`-2*8`($i) |
| 222 | stfd f25,`12*$SIZE_T+88`($sp) | 239 | stfd f31,`-1*8`($i) |
| 223 | 240 | ___ | |
| 241 | $code.=<<___ if ($SIZE_T==8); | ||
| 224 | ld $a0,0($ap) ; pull ap[0] value | 242 | ld $a0,0($ap) ; pull ap[0] value |
| 225 | ld $n0,0($n0) ; pull n0[0] value | 243 | ld $n0,0($n0) ; pull n0[0] value |
| 226 | ld $t3,0($bp) ; bp[0] | 244 | ld $t3,0($bp) ; bp[0] |
| 227 | 245 | ___ | |
| 246 | $code.=<<___ if ($SIZE_T==4); | ||
| 247 | mr $t1,$n0 | ||
| 248 | lwz $a0,0($ap) ; pull ap[0,1] value | ||
| 249 | lwz $t0,4($ap) | ||
| 250 | lwz $n0,0($t1) ; pull n0[0,1] value | ||
| 251 | lwz $t1,4($t1) | ||
| 252 | lwz $t3,0($bp) ; bp[0,1] | ||
| 253 | lwz $t2,4($bp) | ||
| 254 | insrdi $a0,$t0,32,0 | ||
| 255 | insrdi $n0,$t1,32,0 | ||
| 256 | insrdi $t3,$t2,32,0 | ||
| 257 | ___ | ||
| 258 | $code.=<<___; | ||
| 228 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64` | 259 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64` |
| 229 | li $i,-64 | 260 | li $i,-64 |
| 230 | add $nap_d,$tp,$num | 261 | add $nap_d,$tp,$num |
| @@ -258,6 +289,8 @@ $code=<<___; | |||
| 258 | std $t5,`$FRAME+40`($sp) | 289 | std $t5,`$FRAME+40`($sp) |
| 259 | std $t6,`$FRAME+48`($sp) | 290 | std $t6,`$FRAME+48`($sp) |
| 260 | std $t7,`$FRAME+56`($sp) | 291 | std $t7,`$FRAME+56`($sp) |
| 292 | ___ | ||
| 293 | $code.=<<___ if ($SIZE_T==8); | ||
| 261 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | 294 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair |
| 262 | lwz $t1,0($ap) | 295 | lwz $t1,0($ap) |
| 263 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | 296 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair |
| @@ -266,6 +299,18 @@ $code=<<___; | |||
| 266 | lwz $t5,0($np) | 299 | lwz $t5,0($np) |
| 267 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | 300 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair |
| 268 | lwz $t7,8($np) | 301 | lwz $t7,8($np) |
| 302 | ___ | ||
| 303 | $code.=<<___ if ($SIZE_T==4); | ||
| 304 | lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs | ||
| 305 | lwz $t1,4($ap) | ||
| 306 | lwz $t2,8($ap) | ||
| 307 | lwz $t3,12($ap) | ||
| 308 | lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs | ||
| 309 | lwz $t5,4($np) | ||
| 310 | lwz $t6,8($np) | ||
| 311 | lwz $t7,12($np) | ||
| 312 | ___ | ||
| 313 | $code.=<<___; | ||
| 269 | lfd $ba,`$FRAME+0`($sp) | 314 | lfd $ba,`$FRAME+0`($sp) |
| 270 | lfd $bb,`$FRAME+8`($sp) | 315 | lfd $bb,`$FRAME+8`($sp) |
| 271 | lfd $bc,`$FRAME+16`($sp) | 316 | lfd $bc,`$FRAME+16`($sp) |
| @@ -374,6 +419,8 @@ $code=<<___; | |||
| 374 | 419 | ||
| 375 | .align 5 | 420 | .align 5 |
| 376 | L1st: | 421 | L1st: |
| 422 | ___ | ||
| 423 | $code.=<<___ if ($SIZE_T==8); | ||
| 377 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | 424 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair |
| 378 | lwz $t1,0($ap) | 425 | lwz $t1,0($ap) |
| 379 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | 426 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair |
| @@ -382,6 +429,18 @@ L1st: | |||
| 382 | lwz $t5,0($np) | 429 | lwz $t5,0($np) |
| 383 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | 430 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair |
| 384 | lwz $t7,8($np) | 431 | lwz $t7,8($np) |
| 432 | ___ | ||
| 433 | $code.=<<___ if ($SIZE_T==4); | ||
| 434 | lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs | ||
| 435 | lwz $t1,4($ap) | ||
| 436 | lwz $t2,8($ap) | ||
| 437 | lwz $t3,12($ap) | ||
| 438 | lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs | ||
| 439 | lwz $t5,4($np) | ||
| 440 | lwz $t6,8($np) | ||
| 441 | lwz $t7,12($np) | ||
| 442 | ___ | ||
| 443 | $code.=<<___; | ||
| 385 | std $t0,`$FRAME+64`($sp) | 444 | std $t0,`$FRAME+64`($sp) |
| 386 | std $t1,`$FRAME+72`($sp) | 445 | std $t1,`$FRAME+72`($sp) |
| 387 | std $t2,`$FRAME+80`($sp) | 446 | std $t2,`$FRAME+80`($sp) |
| @@ -559,7 +618,17 @@ L1st: | |||
| 559 | li $i,8 ; i=1 | 618 | li $i,8 ; i=1 |
| 560 | .align 5 | 619 | .align 5 |
| 561 | Louter: | 620 | Louter: |
| 621 | ___ | ||
| 622 | $code.=<<___ if ($SIZE_T==8); | ||
| 562 | ldx $t3,$bp,$i ; bp[i] | 623 | ldx $t3,$bp,$i ; bp[i] |
| 624 | ___ | ||
| 625 | $code.=<<___ if ($SIZE_T==4); | ||
| 626 | add $t0,$bp,$i | ||
| 627 | lwz $t3,0($t0) ; bp[i,i+1] | ||
| 628 | lwz $t0,4($t0) | ||
| 629 | insrdi $t3,$t0,32,0 | ||
| 630 | ___ | ||
| 631 | $code.=<<___; | ||
| 563 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] | 632 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] |
| 564 | mulld $t7,$a0,$t3 ; ap[0]*bp[i] | 633 | mulld $t7,$a0,$t3 ; ap[0]*bp[i] |
| 565 | 634 | ||
| @@ -761,6 +830,13 @@ Linner: | |||
| 761 | stfd $T0b,`$FRAME+8`($sp) | 830 | stfd $T0b,`$FRAME+8`($sp) |
| 762 | add $t7,$t7,$carry | 831 | add $t7,$t7,$carry |
| 763 | addc $t3,$t0,$t1 | 832 | addc $t3,$t0,$t1 |
| 833 | ___ | ||
| 834 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
| 835 | extrdi $t0,$t0,32,0 | ||
| 836 | extrdi $t1,$t1,32,0 | ||
| 837 | adde $t0,$t0,$t1 | ||
| 838 | ___ | ||
| 839 | $code.=<<___; | ||
| 764 | stfd $T1a,`$FRAME+16`($sp) | 840 | stfd $T1a,`$FRAME+16`($sp) |
| 765 | stfd $T1b,`$FRAME+24`($sp) | 841 | stfd $T1b,`$FRAME+24`($sp) |
| 766 | insrdi $t4,$t7,16,0 ; 64..127 bits | 842 | insrdi $t4,$t7,16,0 ; 64..127 bits |
| @@ -768,6 +844,13 @@ Linner: | |||
| 768 | stfd $T2a,`$FRAME+32`($sp) | 844 | stfd $T2a,`$FRAME+32`($sp) |
| 769 | stfd $T2b,`$FRAME+40`($sp) | 845 | stfd $T2b,`$FRAME+40`($sp) |
| 770 | adde $t5,$t4,$t2 | 846 | adde $t5,$t4,$t2 |
| 847 | ___ | ||
| 848 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
| 849 | extrdi $t4,$t4,32,0 | ||
| 850 | extrdi $t2,$t2,32,0 | ||
| 851 | adde $t4,$t4,$t2 | ||
| 852 | ___ | ||
| 853 | $code.=<<___; | ||
| 771 | stfd $T3a,`$FRAME+48`($sp) | 854 | stfd $T3a,`$FRAME+48`($sp) |
| 772 | stfd $T3b,`$FRAME+56`($sp) | 855 | stfd $T3b,`$FRAME+56`($sp) |
| 773 | addze $carry,$carry | 856 | addze $carry,$carry |
| @@ -816,7 +899,21 @@ Linner: | |||
| 816 | ld $t7,`$FRAME+72`($sp) | 899 | ld $t7,`$FRAME+72`($sp) |
| 817 | 900 | ||
| 818 | addc $t3,$t0,$t1 | 901 | addc $t3,$t0,$t1 |
| 902 | ___ | ||
| 903 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
| 904 | extrdi $t0,$t0,32,0 | ||
| 905 | extrdi $t1,$t1,32,0 | ||
| 906 | adde $t0,$t0,$t1 | ||
| 907 | ___ | ||
| 908 | $code.=<<___; | ||
| 819 | adde $t5,$t4,$t2 | 909 | adde $t5,$t4,$t2 |
| 910 | ___ | ||
| 911 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
| 912 | extrdi $t4,$t4,32,0 | ||
| 913 | extrdi $t2,$t2,32,0 | ||
| 914 | adde $t4,$t4,$t2 | ||
| 915 | ___ | ||
| 916 | $code.=<<___; | ||
| 820 | addze $carry,$carry | 917 | addze $carry,$carry |
| 821 | 918 | ||
| 822 | std $t3,-16($tp) ; tp[j-1] | 919 | std $t3,-16($tp) ; tp[j-1] |
| @@ -835,7 +932,9 @@ Linner: | |||
| 835 | subf $nap_d,$t7,$nap_d ; rewind pointer | 932 | subf $nap_d,$t7,$nap_d ; rewind pointer |
| 836 | cmpw $i,$num | 933 | cmpw $i,$num |
| 837 | blt- Louter | 934 | blt- Louter |
| 935 | ___ | ||
| 838 | 936 | ||
| 937 | $code.=<<___ if ($SIZE_T==8); | ||
| 839 | subf $np,$num,$np ; rewind np | 938 | subf $np,$num,$np ; rewind np |
| 840 | addi $j,$j,1 ; restore counter | 939 | addi $j,$j,1 ; restore counter |
| 841 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] | 940 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] |
| @@ -883,34 +982,105 @@ Lcopy: ; copy or in-place refresh | |||
| 883 | stdx $i,$t4,$i | 982 | stdx $i,$t4,$i |
| 884 | addi $i,$i,16 | 983 | addi $i,$i,16 |
| 885 | bdnz- Lcopy | 984 | bdnz- Lcopy |
| 985 | ___ | ||
| 986 | $code.=<<___ if ($SIZE_T==4); | ||
| 987 | subf $np,$num,$np ; rewind np | ||
| 988 | addi $j,$j,1 ; restore counter | ||
| 989 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] | ||
| 990 | addi $tp,$sp,`$FRAME+$TRANSFER` | ||
| 991 | addi $np,$np,-4 | ||
| 992 | addi $rp,$rp,-4 | ||
| 993 | addi $ap,$sp,`$FRAME+$TRANSFER+4` | ||
| 994 | mtctr $j | ||
| 995 | |||
| 996 | .align 4 | ||
| 997 | Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order | ||
| 998 | ldu $t2,16($tp) | ||
| 999 | lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order | ||
| 1000 | lwz $t5,8($np) | ||
| 1001 | lwz $t6,12($np) | ||
| 1002 | lwzu $t7,16($np) | ||
| 1003 | extrdi $t1,$t0,32,0 | ||
| 1004 | extrdi $t3,$t2,32,0 | ||
| 1005 | subfe $t4,$t4,$t0 ; tp[j]-np[j] | ||
| 1006 | stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order | ||
| 1007 | subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] | ||
| 1008 | stw $t1,8($ap) | ||
| 1009 | subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] | ||
| 1010 | stw $t2,12($ap) | ||
| 1011 | subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] | ||
| 1012 | stwu $t3,16($ap) | ||
| 1013 | stw $t4,4($rp) | ||
| 1014 | stw $t5,8($rp) | ||
| 1015 | stw $t6,12($rp) | ||
| 1016 | stwu $t7,16($rp) | ||
| 1017 | bdnz- Lsub | ||
| 1018 | |||
| 1019 | li $i,0 | ||
| 1020 | subfe $ovf,$i,$ovf ; handle upmost overflow bit | ||
| 1021 | addi $tp,$sp,`$FRAME+$TRANSFER+4` | ||
| 1022 | subf $rp,$num,$rp ; rewind rp | ||
| 1023 | and $ap,$tp,$ovf | ||
| 1024 | andc $np,$rp,$ovf | ||
| 1025 | or $ap,$ap,$np ; ap=borrow?tp:rp | ||
| 1026 | addi $tp,$sp,`$FRAME+$TRANSFER` | ||
| 1027 | mtctr $j | ||
| 1028 | |||
| 1029 | .align 4 | ||
| 1030 | Lcopy: ; copy or in-place refresh | ||
| 1031 | lwz $t0,4($ap) | ||
| 1032 | lwz $t1,8($ap) | ||
| 1033 | lwz $t2,12($ap) | ||
| 1034 | lwzu $t3,16($ap) | ||
| 1035 | std $i,8($nap_d) ; zap nap_d | ||
| 1036 | std $i,16($nap_d) | ||
| 1037 | std $i,24($nap_d) | ||
| 1038 | std $i,32($nap_d) | ||
| 1039 | std $i,40($nap_d) | ||
| 1040 | std $i,48($nap_d) | ||
| 1041 | std $i,56($nap_d) | ||
| 1042 | stdu $i,64($nap_d) | ||
| 1043 | stw $t0,4($rp) | ||
| 1044 | stw $t1,8($rp) | ||
| 1045 | stw $t2,12($rp) | ||
| 1046 | stwu $t3,16($rp) | ||
| 1047 | std $i,8($tp) ; zap tp at once | ||
| 1048 | stdu $i,16($tp) | ||
| 1049 | bdnz- Lcopy | ||
| 1050 | ___ | ||
| 886 | 1051 | ||
| 887 | $POP r14,`2*$SIZE_T`($sp) | 1052 | $code.=<<___; |
| 888 | $POP r15,`3*$SIZE_T`($sp) | 1053 | $POP $i,0($sp) |
| 889 | $POP r16,`4*$SIZE_T`($sp) | ||
| 890 | $POP r17,`5*$SIZE_T`($sp) | ||
| 891 | $POP r18,`6*$SIZE_T`($sp) | ||
| 892 | $POP r19,`7*$SIZE_T`($sp) | ||
| 893 | $POP r20,`8*$SIZE_T`($sp) | ||
| 894 | $POP r21,`9*$SIZE_T`($sp) | ||
| 895 | $POP r22,`10*$SIZE_T`($sp) | ||
| 896 | $POP r23,`11*$SIZE_T`($sp) | ||
| 897 | lfd f14,`12*$SIZE_T+0`($sp) | ||
| 898 | lfd f15,`12*$SIZE_T+8`($sp) | ||
| 899 | lfd f16,`12*$SIZE_T+16`($sp) | ||
| 900 | lfd f17,`12*$SIZE_T+24`($sp) | ||
| 901 | lfd f18,`12*$SIZE_T+32`($sp) | ||
| 902 | lfd f19,`12*$SIZE_T+40`($sp) | ||
| 903 | lfd f20,`12*$SIZE_T+48`($sp) | ||
| 904 | lfd f21,`12*$SIZE_T+56`($sp) | ||
| 905 | lfd f22,`12*$SIZE_T+64`($sp) | ||
| 906 | lfd f23,`12*$SIZE_T+72`($sp) | ||
| 907 | lfd f24,`12*$SIZE_T+80`($sp) | ||
| 908 | lfd f25,`12*$SIZE_T+88`($sp) | ||
| 909 | $POP $sp,0($sp) | ||
| 910 | li r3,1 ; signal "handled" | 1054 | li r3,1 ; signal "handled" |
| 1055 | $POP r22,`-12*8-10*$SIZE_T`($i) | ||
| 1056 | $POP r23,`-12*8-9*$SIZE_T`($i) | ||
| 1057 | $POP r24,`-12*8-8*$SIZE_T`($i) | ||
| 1058 | $POP r25,`-12*8-7*$SIZE_T`($i) | ||
| 1059 | $POP r26,`-12*8-6*$SIZE_T`($i) | ||
| 1060 | $POP r27,`-12*8-5*$SIZE_T`($i) | ||
| 1061 | $POP r28,`-12*8-4*$SIZE_T`($i) | ||
| 1062 | $POP r29,`-12*8-3*$SIZE_T`($i) | ||
| 1063 | $POP r30,`-12*8-2*$SIZE_T`($i) | ||
| 1064 | $POP r31,`-12*8-1*$SIZE_T`($i) | ||
| 1065 | lfd f20,`-12*8`($i) | ||
| 1066 | lfd f21,`-11*8`($i) | ||
| 1067 | lfd f22,`-10*8`($i) | ||
| 1068 | lfd f23,`-9*8`($i) | ||
| 1069 | lfd f24,`-8*8`($i) | ||
| 1070 | lfd f25,`-7*8`($i) | ||
| 1071 | lfd f26,`-6*8`($i) | ||
| 1072 | lfd f27,`-5*8`($i) | ||
| 1073 | lfd f28,`-4*8`($i) | ||
| 1074 | lfd f29,`-3*8`($i) | ||
| 1075 | lfd f30,`-2*8`($i) | ||
| 1076 | lfd f31,`-1*8`($i) | ||
| 1077 | mr $sp,$i | ||
| 911 | blr | 1078 | blr |
| 912 | .long 0 | 1079 | .long 0 |
| 913 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" | 1080 | .byte 0,12,4,0,0x8c,10,6,0 |
| 1081 | .long 0 | ||
| 1082 | |||
| 1083 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 914 | ___ | 1084 | ___ |
| 915 | 1085 | ||
| 916 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 1086 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl new file mode 100644 index 0000000000..cd9f13eca2 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl | |||
| @@ -0,0 +1,221 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
| 14 | # the time being... gcc 4.3 appeared to generate poor code, therefore | ||
| 15 | # the effort. And indeed, the module delivers 55%-90%(*) improvement | ||
| 16 | # on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit | ||
| 17 | # key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196. | ||
| 18 | # This is for 64-bit build. In 32-bit "highgprs" case improvement is | ||
| 19 | # even higher, for example on z990 it was measured 80%-150%. ECDSA | ||
| 20 | # sign is modest 9%-12% faster. Keep in mind that these coefficients | ||
| 21 | # are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is | ||
| 22 | # burnt in it... | ||
| 23 | # | ||
| 24 | # (*) gcc 4.1 was observed to deliver better results than gcc 4.3, | ||
| 25 | # so that improvement coefficients can vary from one specific | ||
| 26 | # setup to another. | ||
| 27 | |||
| 28 | $flavour = shift; | ||
| 29 | |||
| 30 | if ($flavour =~ /3[12]/) { | ||
| 31 | $SIZE_T=4; | ||
| 32 | $g=""; | ||
| 33 | } else { | ||
| 34 | $SIZE_T=8; | ||
| 35 | $g="g"; | ||
| 36 | } | ||
| 37 | |||
| 38 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 39 | open STDOUT,">$output"; | ||
| 40 | |||
| 41 | $stdframe=16*$SIZE_T+4*8; | ||
| 42 | |||
| 43 | $rp="%r2"; | ||
| 44 | $a1="%r3"; | ||
| 45 | $a0="%r4"; | ||
| 46 | $b1="%r5"; | ||
| 47 | $b0="%r6"; | ||
| 48 | |||
| 49 | $ra="%r14"; | ||
| 50 | $sp="%r15"; | ||
| 51 | |||
| 52 | @T=("%r0","%r1"); | ||
| 53 | @i=("%r12","%r13"); | ||
| 54 | |||
| 55 | ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11)); | ||
| 56 | ($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8; | ||
| 57 | |||
| 58 | $code.=<<___; | ||
| 59 | .text | ||
| 60 | |||
| 61 | .type _mul_1x1,\@function | ||
| 62 | .align 16 | ||
| 63 | _mul_1x1: | ||
| 64 | lgr $a1,$a | ||
| 65 | sllg $a2,$a,1 | ||
| 66 | sllg $a4,$a,2 | ||
| 67 | sllg $a8,$a,3 | ||
| 68 | |||
| 69 | srag $lo,$a1,63 # broadcast 63rd bit | ||
| 70 | nihh $a1,0x1fff | ||
| 71 | srag @i[0],$a2,63 # broadcast 62nd bit | ||
| 72 | nihh $a2,0x3fff | ||
| 73 | srag @i[1],$a4,63 # broadcast 61st bit | ||
| 74 | nihh $a4,0x7fff | ||
| 75 | ngr $lo,$b | ||
| 76 | ngr @i[0],$b | ||
| 77 | ngr @i[1],$b | ||
| 78 | |||
| 79 | lghi @T[0],0 | ||
| 80 | lgr $a12,$a1 | ||
| 81 | stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0 | ||
| 82 | xgr $a12,$a2 | ||
| 83 | stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1 | ||
| 84 | lgr $a48,$a4 | ||
| 85 | stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2 | ||
| 86 | xgr $a48,$a8 | ||
| 87 | stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2 | ||
| 88 | xgr $a1,$a4 | ||
| 89 | |||
| 90 | stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4 | ||
| 91 | xgr $a2,$a4 | ||
| 92 | stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4 | ||
| 93 | xgr $a12,$a4 | ||
| 94 | stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4 | ||
| 95 | xgr $a1,$a48 | ||
| 96 | stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4 | ||
| 97 | xgr $a2,$a48 | ||
| 98 | |||
| 99 | stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8 | ||
| 100 | xgr $a12,$a48 | ||
| 101 | stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8 | ||
| 102 | xgr $a1,$a4 | ||
| 103 | stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8 | ||
| 104 | xgr $a2,$a4 | ||
| 105 | stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8 | ||
| 106 | |||
| 107 | xgr $a12,$a4 | ||
| 108 | stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8 | ||
| 109 | srlg $hi,$lo,1 | ||
| 110 | stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8 | ||
| 111 | sllg $lo,$lo,63 | ||
| 112 | stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8 | ||
| 113 | srlg @T[0],@i[0],2 | ||
| 114 | stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8 | ||
| 115 | |||
| 116 | lghi $mask,`0xf<<3` | ||
| 117 | sllg $a1,@i[0],62 | ||
| 118 | sllg @i[0],$b,3 | ||
| 119 | srlg @T[1],@i[1],3 | ||
| 120 | ngr @i[0],$mask | ||
| 121 | sllg $a2,@i[1],61 | ||
| 122 | srlg @i[1],$b,4-3 | ||
| 123 | xgr $hi,@T[0] | ||
| 124 | ngr @i[1],$mask | ||
| 125 | xgr $lo,$a1 | ||
| 126 | xgr $hi,@T[1] | ||
| 127 | xgr $lo,$a2 | ||
| 128 | |||
| 129 | xg $lo,$stdframe(@i[0],$sp) | ||
| 130 | srlg @i[0],$b,8-3 | ||
| 131 | ngr @i[0],$mask | ||
| 132 | ___ | ||
| 133 | for($n=1;$n<14;$n++) { | ||
| 134 | $code.=<<___; | ||
| 135 | lg @T[1],$stdframe(@i[1],$sp) | ||
| 136 | srlg @i[1],$b,`($n+2)*4`-3 | ||
| 137 | sllg @T[0],@T[1],`$n*4` | ||
| 138 | ngr @i[1],$mask | ||
| 139 | srlg @T[1],@T[1],`64-$n*4` | ||
| 140 | xgr $lo,@T[0] | ||
| 141 | xgr $hi,@T[1] | ||
| 142 | ___ | ||
| 143 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
| 144 | } | ||
| 145 | $code.=<<___; | ||
| 146 | lg @T[1],$stdframe(@i[1],$sp) | ||
| 147 | sllg @T[0],@T[1],`$n*4` | ||
| 148 | srlg @T[1],@T[1],`64-$n*4` | ||
| 149 | xgr $lo,@T[0] | ||
| 150 | xgr $hi,@T[1] | ||
| 151 | |||
| 152 | lg @T[0],$stdframe(@i[0],$sp) | ||
| 153 | sllg @T[1],@T[0],`($n+1)*4` | ||
| 154 | srlg @T[0],@T[0],`64-($n+1)*4` | ||
| 155 | xgr $lo,@T[1] | ||
| 156 | xgr $hi,@T[0] | ||
| 157 | |||
| 158 | br $ra | ||
| 159 | .size _mul_1x1,.-_mul_1x1 | ||
| 160 | |||
| 161 | .globl bn_GF2m_mul_2x2 | ||
| 162 | .type bn_GF2m_mul_2x2,\@function | ||
| 163 | .align 16 | ||
| 164 | bn_GF2m_mul_2x2: | ||
| 165 | stm${g} %r3,%r15,3*$SIZE_T($sp) | ||
| 166 | |||
| 167 | lghi %r1,-$stdframe-128 | ||
| 168 | la %r0,0($sp) | ||
| 169 | la $sp,0(%r1,$sp) # alloca | ||
| 170 | st${g} %r0,0($sp) # back chain | ||
| 171 | ___ | ||
| 172 | if ($SIZE_T==8) { | ||
| 173 | my @r=map("%r$_",(6..9)); | ||
| 174 | $code.=<<___; | ||
| 175 | bras $ra,_mul_1x1 # a1·b1 | ||
| 176 | stmg $lo,$hi,16($rp) | ||
| 177 | |||
| 178 | lg $a,`$stdframe+128+4*$SIZE_T`($sp) | ||
| 179 | lg $b,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 180 | bras $ra,_mul_1x1 # a0·b0 | ||
| 181 | stmg $lo,$hi,0($rp) | ||
| 182 | |||
| 183 | lg $a,`$stdframe+128+3*$SIZE_T`($sp) | ||
| 184 | lg $b,`$stdframe+128+5*$SIZE_T`($sp) | ||
| 185 | xg $a,`$stdframe+128+4*$SIZE_T`($sp) | ||
| 186 | xg $b,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 187 | bras $ra,_mul_1x1 # (a0+a1)·(b0+b1) | ||
| 188 | lmg @r[0],@r[3],0($rp) | ||
| 189 | |||
| 190 | xgr $lo,$hi | ||
| 191 | xgr $hi,@r[1] | ||
| 192 | xgr $lo,@r[0] | ||
| 193 | xgr $hi,@r[2] | ||
| 194 | xgr $lo,@r[3] | ||
| 195 | xgr $hi,@r[3] | ||
| 196 | xgr $lo,$hi | ||
| 197 | stg $hi,16($rp) | ||
| 198 | stg $lo,8($rp) | ||
| 199 | ___ | ||
| 200 | } else { | ||
| 201 | $code.=<<___; | ||
| 202 | sllg %r3,%r3,32 | ||
| 203 | sllg %r5,%r5,32 | ||
| 204 | or %r3,%r4 | ||
| 205 | or %r5,%r6 | ||
| 206 | bras $ra,_mul_1x1 | ||
| 207 | rllg $lo,$lo,32 | ||
| 208 | rllg $hi,$hi,32 | ||
| 209 | stmg $lo,$hi,0($rp) | ||
| 210 | ___ | ||
| 211 | } | ||
| 212 | $code.=<<___; | ||
| 213 | lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp) | ||
| 214 | br $ra | ||
| 215 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 216 | .string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 217 | ___ | ||
| 218 | |||
| 219 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 220 | print $code; | ||
| 221 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl index f61246f5b6..9fd64e81ee 100644 --- a/src/lib/libcrypto/bn/asm/s390x-mont.pl +++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl | |||
| @@ -32,6 +32,33 @@ | |||
| 32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, | 32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, |
| 33 | # make inner loops counter-based. | 33 | # make inner loops counter-based. |
| 34 | 34 | ||
| 35 | # November 2010. | ||
| 36 | # | ||
| 37 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 38 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 39 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 40 | # application context. The feature is not specific to any particular | ||
| 41 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 42 | # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG | ||
| 43 | # is achieved by swapping words after 64-bit loads, follow _dswap-s. | ||
| 44 | # On z990 it was measured to perform 2.6-2.2 times better than | ||
| 45 | # compiler-generated code, less for longer keys... | ||
| 46 | |||
| 47 | $flavour = shift; | ||
| 48 | |||
| 49 | if ($flavour =~ /3[12]/) { | ||
| 50 | $SIZE_T=4; | ||
| 51 | $g=""; | ||
| 52 | } else { | ||
| 53 | $SIZE_T=8; | ||
| 54 | $g="g"; | ||
| 55 | } | ||
| 56 | |||
| 57 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 58 | open STDOUT,">$output"; | ||
| 59 | |||
| 60 | $stdframe=16*$SIZE_T+4*8; | ||
| 61 | |||
| 35 | $mn0="%r0"; | 62 | $mn0="%r0"; |
| 36 | $num="%r1"; | 63 | $num="%r1"; |
| 37 | 64 | ||
| @@ -60,34 +87,44 @@ $code.=<<___; | |||
| 60 | .globl bn_mul_mont | 87 | .globl bn_mul_mont |
| 61 | .type bn_mul_mont,\@function | 88 | .type bn_mul_mont,\@function |
| 62 | bn_mul_mont: | 89 | bn_mul_mont: |
| 63 | lgf $num,164($sp) # pull $num | 90 | lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num |
| 64 | sla $num,3 # $num to enumerate bytes | 91 | sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes |
| 65 | la $bp,0($num,$bp) | 92 | la $bp,0($num,$bp) |
| 66 | 93 | ||
| 67 | stg %r2,16($sp) | 94 | st${g} %r2,2*$SIZE_T($sp) |
| 68 | 95 | ||
| 69 | cghi $num,16 # | 96 | cghi $num,16 # |
| 70 | lghi %r2,0 # | 97 | lghi %r2,0 # |
| 71 | blr %r14 # if($num<16) return 0; | 98 | blr %r14 # if($num<16) return 0; |
| 99 | ___ | ||
| 100 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
| 101 | tmll $num,4 | ||
| 102 | bnzr %r14 # if ($num&1) return 0; | ||
| 103 | ___ | ||
| 104 | $code.=<<___ if ($flavour !~ /3[12]/); | ||
| 72 | cghi $num,96 # | 105 | cghi $num,96 # |
| 73 | bhr %r14 # if($num>96) return 0; | 106 | bhr %r14 # if($num>96) return 0; |
| 107 | ___ | ||
| 108 | $code.=<<___; | ||
| 109 | stm${g} %r3,%r15,3*$SIZE_T($sp) | ||
| 74 | 110 | ||
| 75 | stmg %r3,%r15,24($sp) | 111 | lghi $rp,-$stdframe-8 # leave room for carry bit |
| 76 | |||
| 77 | lghi $rp,-160-8 # leave room for carry bit | ||
| 78 | lcgr $j,$num # -$num | 112 | lcgr $j,$num # -$num |
| 79 | lgr %r0,$sp | 113 | lgr %r0,$sp |
| 80 | la $rp,0($rp,$sp) | 114 | la $rp,0($rp,$sp) |
| 81 | la $sp,0($j,$rp) # alloca | 115 | la $sp,0($j,$rp) # alloca |
| 82 | stg %r0,0($sp) # back chain | 116 | st${g} %r0,0($sp) # back chain |
| 83 | 117 | ||
| 84 | sra $num,3 # restore $num | 118 | sra $num,3 # restore $num |
| 85 | la $bp,0($j,$bp) # restore $bp | 119 | la $bp,0($j,$bp) # restore $bp |
| 86 | ahi $num,-1 # adjust $num for inner loop | 120 | ahi $num,-1 # adjust $num for inner loop |
| 87 | lg $n0,0($n0) # pull n0 | 121 | lg $n0,0($n0) # pull n0 |
| 122 | _dswap $n0 | ||
| 88 | 123 | ||
| 89 | lg $bi,0($bp) | 124 | lg $bi,0($bp) |
| 125 | _dswap $bi | ||
| 90 | lg $alo,0($ap) | 126 | lg $alo,0($ap) |
| 127 | _dswap $alo | ||
| 91 | mlgr $ahi,$bi # ap[0]*bp[0] | 128 | mlgr $ahi,$bi # ap[0]*bp[0] |
| 92 | lgr $AHI,$ahi | 129 | lgr $AHI,$ahi |
| 93 | 130 | ||
| @@ -95,6 +132,7 @@ bn_mul_mont: | |||
| 95 | msgr $mn0,$n0 | 132 | msgr $mn0,$n0 |
| 96 | 133 | ||
| 97 | lg $nlo,0($np) # | 134 | lg $nlo,0($np) # |
| 135 | _dswap $nlo | ||
| 98 | mlgr $nhi,$mn0 # np[0]*m1 | 136 | mlgr $nhi,$mn0 # np[0]*m1 |
| 99 | algr $nlo,$alo # +="tp[0]" | 137 | algr $nlo,$alo # +="tp[0]" |
| 100 | lghi $NHI,0 | 138 | lghi $NHI,0 |
| @@ -106,12 +144,14 @@ bn_mul_mont: | |||
| 106 | .align 16 | 144 | .align 16 |
| 107 | .L1st: | 145 | .L1st: |
| 108 | lg $alo,0($j,$ap) | 146 | lg $alo,0($j,$ap) |
| 147 | _dswap $alo | ||
| 109 | mlgr $ahi,$bi # ap[j]*bp[0] | 148 | mlgr $ahi,$bi # ap[j]*bp[0] |
| 110 | algr $alo,$AHI | 149 | algr $alo,$AHI |
| 111 | lghi $AHI,0 | 150 | lghi $AHI,0 |
| 112 | alcgr $AHI,$ahi | 151 | alcgr $AHI,$ahi |
| 113 | 152 | ||
| 114 | lg $nlo,0($j,$np) | 153 | lg $nlo,0($j,$np) |
| 154 | _dswap $nlo | ||
| 115 | mlgr $nhi,$mn0 # np[j]*m1 | 155 | mlgr $nhi,$mn0 # np[j]*m1 |
| 116 | algr $nlo,$NHI | 156 | algr $nlo,$NHI |
| 117 | lghi $NHI,0 | 157 | lghi $NHI,0 |
| @@ -119,22 +159,24 @@ bn_mul_mont: | |||
| 119 | algr $nlo,$alo | 159 | algr $nlo,$alo |
| 120 | alcgr $NHI,$nhi | 160 | alcgr $NHI,$nhi |
| 121 | 161 | ||
| 122 | stg $nlo,160-8($j,$sp) # tp[j-1]= | 162 | stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= |
| 123 | la $j,8($j) # j++ | 163 | la $j,8($j) # j++ |
| 124 | brct $count,.L1st | 164 | brct $count,.L1st |
| 125 | 165 | ||
| 126 | algr $NHI,$AHI | 166 | algr $NHI,$AHI |
| 127 | lghi $AHI,0 | 167 | lghi $AHI,0 |
| 128 | alcgr $AHI,$AHI # upmost overflow bit | 168 | alcgr $AHI,$AHI # upmost overflow bit |
| 129 | stg $NHI,160-8($j,$sp) | 169 | stg $NHI,$stdframe-8($j,$sp) |
| 130 | stg $AHI,160($j,$sp) | 170 | stg $AHI,$stdframe($j,$sp) |
| 131 | la $bp,8($bp) # bp++ | 171 | la $bp,8($bp) # bp++ |
| 132 | 172 | ||
| 133 | .Louter: | 173 | .Louter: |
| 134 | lg $bi,0($bp) # bp[i] | 174 | lg $bi,0($bp) # bp[i] |
| 175 | _dswap $bi | ||
| 135 | lg $alo,0($ap) | 176 | lg $alo,0($ap) |
| 177 | _dswap $alo | ||
| 136 | mlgr $ahi,$bi # ap[0]*bp[i] | 178 | mlgr $ahi,$bi # ap[0]*bp[i] |
| 137 | alg $alo,160($sp) # +=tp[0] | 179 | alg $alo,$stdframe($sp) # +=tp[0] |
| 138 | lghi $AHI,0 | 180 | lghi $AHI,0 |
| 139 | alcgr $AHI,$ahi | 181 | alcgr $AHI,$ahi |
| 140 | 182 | ||
| @@ -142,6 +184,7 @@ bn_mul_mont: | |||
| 142 | msgr $mn0,$n0 # tp[0]*n0 | 184 | msgr $mn0,$n0 # tp[0]*n0 |
| 143 | 185 | ||
| 144 | lg $nlo,0($np) # np[0] | 186 | lg $nlo,0($np) # np[0] |
| 187 | _dswap $nlo | ||
| 145 | mlgr $nhi,$mn0 # np[0]*m1 | 188 | mlgr $nhi,$mn0 # np[0]*m1 |
| 146 | algr $nlo,$alo # +="tp[0]" | 189 | algr $nlo,$alo # +="tp[0]" |
| 147 | lghi $NHI,0 | 190 | lghi $NHI,0 |
| @@ -153,14 +196,16 @@ bn_mul_mont: | |||
| 153 | .align 16 | 196 | .align 16 |
| 154 | .Linner: | 197 | .Linner: |
| 155 | lg $alo,0($j,$ap) | 198 | lg $alo,0($j,$ap) |
| 199 | _dswap $alo | ||
| 156 | mlgr $ahi,$bi # ap[j]*bp[i] | 200 | mlgr $ahi,$bi # ap[j]*bp[i] |
| 157 | algr $alo,$AHI | 201 | algr $alo,$AHI |
| 158 | lghi $AHI,0 | 202 | lghi $AHI,0 |
| 159 | alcgr $ahi,$AHI | 203 | alcgr $ahi,$AHI |
| 160 | alg $alo,160($j,$sp)# +=tp[j] | 204 | alg $alo,$stdframe($j,$sp)# +=tp[j] |
| 161 | alcgr $AHI,$ahi | 205 | alcgr $AHI,$ahi |
| 162 | 206 | ||
| 163 | lg $nlo,0($j,$np) | 207 | lg $nlo,0($j,$np) |
| 208 | _dswap $nlo | ||
| 164 | mlgr $nhi,$mn0 # np[j]*m1 | 209 | mlgr $nhi,$mn0 # np[j]*m1 |
| 165 | algr $nlo,$NHI | 210 | algr $nlo,$NHI |
| 166 | lghi $NHI,0 | 211 | lghi $NHI,0 |
| @@ -168,31 +213,33 @@ bn_mul_mont: | |||
| 168 | algr $nlo,$alo # +="tp[j]" | 213 | algr $nlo,$alo # +="tp[j]" |
| 169 | alcgr $NHI,$nhi | 214 | alcgr $NHI,$nhi |
| 170 | 215 | ||
| 171 | stg $nlo,160-8($j,$sp) # tp[j-1]= | 216 | stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= |
| 172 | la $j,8($j) # j++ | 217 | la $j,8($j) # j++ |
| 173 | brct $count,.Linner | 218 | brct $count,.Linner |
| 174 | 219 | ||
| 175 | algr $NHI,$AHI | 220 | algr $NHI,$AHI |
| 176 | lghi $AHI,0 | 221 | lghi $AHI,0 |
| 177 | alcgr $AHI,$AHI | 222 | alcgr $AHI,$AHI |
| 178 | alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit | 223 | alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit |
| 179 | lghi $ahi,0 | 224 | lghi $ahi,0 |
| 180 | alcgr $AHI,$ahi # new upmost overflow bit | 225 | alcgr $AHI,$ahi # new upmost overflow bit |
| 181 | stg $NHI,160-8($j,$sp) | 226 | stg $NHI,$stdframe-8($j,$sp) |
| 182 | stg $AHI,160($j,$sp) | 227 | stg $AHI,$stdframe($j,$sp) |
| 183 | 228 | ||
| 184 | la $bp,8($bp) # bp++ | 229 | la $bp,8($bp) # bp++ |
| 185 | clg $bp,160+8+32($j,$sp) # compare to &bp[num] | 230 | cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num] |
| 186 | jne .Louter | 231 | jne .Louter |
| 187 | 232 | ||
| 188 | lg $rp,160+8+16($j,$sp) # reincarnate rp | 233 | l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp |
| 189 | la $ap,160($sp) | 234 | la $ap,$stdframe($sp) |
| 190 | ahi $num,1 # restore $num, incidentally clears "borrow" | 235 | ahi $num,1 # restore $num, incidentally clears "borrow" |
| 191 | 236 | ||
| 192 | la $j,0(%r0) | 237 | la $j,0(%r0) |
| 193 | lr $count,$num | 238 | lr $count,$num |
| 194 | .Lsub: lg $alo,0($j,$ap) | 239 | .Lsub: lg $alo,0($j,$ap) |
| 195 | slbg $alo,0($j,$np) | 240 | lg $nlo,0($j,$np) |
| 241 | _dswap $nlo | ||
| 242 | slbgr $alo,$nlo | ||
| 196 | stg $alo,0($j,$rp) | 243 | stg $alo,0($j,$rp) |
| 197 | la $j,8($j) | 244 | la $j,8($j) |
| 198 | brct $count,.Lsub | 245 | brct $count,.Lsub |
| @@ -207,19 +254,24 @@ bn_mul_mont: | |||
| 207 | 254 | ||
| 208 | la $j,0(%r0) | 255 | la $j,0(%r0) |
| 209 | lgr $count,$num | 256 | lgr $count,$num |
| 210 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh | 257 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh |
| 211 | stg $j,160($j,$sp) # zap tp | 258 | _dswap $alo |
| 259 | stg $j,$stdframe($j,$sp) # zap tp | ||
| 212 | stg $alo,0($j,$rp) | 260 | stg $alo,0($j,$rp) |
| 213 | la $j,8($j) | 261 | la $j,8($j) |
| 214 | brct $count,.Lcopy | 262 | brct $count,.Lcopy |
| 215 | 263 | ||
| 216 | la %r1,160+8+48($j,$sp) | 264 | la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp) |
| 217 | lmg %r6,%r15,0(%r1) | 265 | lm${g} %r6,%r15,0(%r1) |
| 218 | lghi %r2,1 # signal "processed" | 266 | lghi %r2,1 # signal "processed" |
| 219 | br %r14 | 267 | br %r14 |
| 220 | .size bn_mul_mont,.-bn_mul_mont | 268 | .size bn_mul_mont,.-bn_mul_mont |
| 221 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 269 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| 222 | ___ | 270 | ___ |
| 223 | 271 | ||
| 224 | print $code; | 272 | foreach (split("\n",$code)) { |
| 273 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 274 | s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; | ||
| 275 | print $_,"\n"; | ||
| 276 | } | ||
| 225 | close STDOUT; | 277 | close STDOUT; |
diff --git a/src/lib/libcrypto/bn/asm/x86-gf2m.pl b/src/lib/libcrypto/bn/asm/x86-gf2m.pl new file mode 100644 index 0000000000..808a1e5969 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86-gf2m.pl | |||
| @@ -0,0 +1,313 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
| 14 | # the time being... Except that it has three code paths: pure integer | ||
| 15 | # code suitable for any x86 CPU, MMX code suitable for PIII and later | ||
| 16 | # and PCLMULQDQ suitable for Westmere and later. Improvement varies | ||
| 17 | # from one benchmark and µ-arch to another. Below are interval values | ||
| 18 | # for 163- and 571-bit ECDH benchmarks relative to compiler-generated | ||
| 19 | # code: | ||
| 20 | # | ||
| 21 | # PIII 16%-30% | ||
| 22 | # P4 12%-12% | ||
| 23 | # Opteron 18%-40% | ||
| 24 | # Core2 19%-44% | ||
| 25 | # Atom 38%-64% | ||
| 26 | # Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) | ||
| 27 | # Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) | ||
| 28 | # | ||
| 29 | # Note that above improvement coefficients are not coefficients for | ||
| 30 | # bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result | ||
| 31 | # of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark | ||
| 32 | # is more and more dominated by other subroutines, most notably by | ||
| 33 | # BN_GF2m_mod[_mul]_arr... | ||
| 34 | |||
| 35 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 36 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 37 | require "x86asm.pl"; | ||
| 38 | |||
| 39 | &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); | ||
| 40 | |||
| 41 | $sse2=0; | ||
| 42 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
| 43 | |||
| 44 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
| 45 | |||
| 46 | $a="eax"; | ||
| 47 | $b="ebx"; | ||
| 48 | ($a1,$a2,$a4)=("ecx","edx","ebp"); | ||
| 49 | |||
| 50 | $R="mm0"; | ||
| 51 | @T=("mm1","mm2"); | ||
| 52 | ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); | ||
| 53 | @i=("esi","edi"); | ||
| 54 | |||
| 55 | if (!$x86only) { | ||
| 56 | &function_begin_B("_mul_1x1_mmx"); | ||
| 57 | &sub ("esp",32+4); | ||
| 58 | &mov ($a1,$a); | ||
| 59 | &lea ($a2,&DWP(0,$a,$a)); | ||
| 60 | &and ($a1,0x3fffffff); | ||
| 61 | &lea ($a4,&DWP(0,$a2,$a2)); | ||
| 62 | &mov (&DWP(0*4,"esp"),0); | ||
| 63 | &and ($a2,0x7fffffff); | ||
| 64 | &movd ($A,$a); | ||
| 65 | &movd ($B,$b); | ||
| 66 | &mov (&DWP(1*4,"esp"),$a1); # a1 | ||
| 67 | &xor ($a1,$a2); # a1^a2 | ||
| 68 | &pxor ($B31,$B31); | ||
| 69 | &pxor ($B30,$B30); | ||
| 70 | &mov (&DWP(2*4,"esp"),$a2); # a2 | ||
| 71 | &xor ($a2,$a4); # a2^a4 | ||
| 72 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | ||
| 73 | &pcmpgtd($B31,$A); # broadcast 31st bit | ||
| 74 | &paddd ($A,$A); # $A<<=1 | ||
| 75 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | ||
| 76 | &mov (&DWP(4*4,"esp"),$a4); # a4 | ||
| 77 | &xor ($a4,$a2); # a2=a4^a2^a4 | ||
| 78 | &pand ($B31,$B); | ||
| 79 | &pcmpgtd($B30,$A); # broadcast 30th bit | ||
| 80 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | ||
| 81 | &xor ($a4,$a1); # a1^a2^a4 | ||
| 82 | &psllq ($B31,31); | ||
| 83 | &pand ($B30,$B); | ||
| 84 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | ||
| 85 | &mov (@i[0],0x7); | ||
| 86 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | ||
| 87 | &mov ($a4,@i[0]); | ||
| 88 | &and (@i[0],$b); | ||
| 89 | &shr ($b,3); | ||
| 90 | &mov (@i[1],$a4); | ||
| 91 | &psllq ($B30,30); | ||
| 92 | &and (@i[1],$b); | ||
| 93 | &shr ($b,3); | ||
| 94 | &movd ($R,&DWP(0,"esp",@i[0],4)); | ||
| 95 | &mov (@i[0],$a4); | ||
| 96 | &and (@i[0],$b); | ||
| 97 | &shr ($b,3); | ||
| 98 | for($n=1;$n<9;$n++) { | ||
| 99 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 100 | &mov (@i[1],$a4); | ||
| 101 | &psllq (@T[1],3*$n); | ||
| 102 | &and (@i[1],$b); | ||
| 103 | &shr ($b,3); | ||
| 104 | &pxor ($R,@T[1]); | ||
| 105 | |||
| 106 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
| 107 | } | ||
| 108 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 109 | &pxor ($R,$B30); | ||
| 110 | &psllq (@T[1],3*$n++); | ||
| 111 | &pxor ($R,@T[1]); | ||
| 112 | |||
| 113 | &movd (@T[0],&DWP(0,"esp",@i[0],4)); | ||
| 114 | &pxor ($R,$B31); | ||
| 115 | &psllq (@T[0],3*$n); | ||
| 116 | &add ("esp",32+4); | ||
| 117 | &pxor ($R,@T[0]); | ||
| 118 | &ret (); | ||
| 119 | &function_end_B("_mul_1x1_mmx"); | ||
| 120 | } | ||
| 121 | |||
| 122 | ($lo,$hi)=("eax","edx"); | ||
| 123 | @T=("ecx","ebp"); | ||
| 124 | |||
| 125 | &function_begin_B("_mul_1x1_ialu"); | ||
| 126 | &sub ("esp",32+4); | ||
| 127 | &mov ($a1,$a); | ||
| 128 | &lea ($a2,&DWP(0,$a,$a)); | ||
| 129 | &lea ($a4,&DWP(0,"",$a,4)); | ||
| 130 | &and ($a1,0x3fffffff); | ||
| 131 | &lea (@i[1],&DWP(0,$lo,$lo)); | ||
| 132 | &sar ($lo,31); # broadcast 31st bit | ||
| 133 | &mov (&DWP(0*4,"esp"),0); | ||
| 134 | &and ($a2,0x7fffffff); | ||
| 135 | &mov (&DWP(1*4,"esp"),$a1); # a1 | ||
| 136 | &xor ($a1,$a2); # a1^a2 | ||
| 137 | &mov (&DWP(2*4,"esp"),$a2); # a2 | ||
| 138 | &xor ($a2,$a4); # a2^a4 | ||
| 139 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | ||
| 140 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | ||
| 141 | &mov (&DWP(4*4,"esp"),$a4); # a4 | ||
| 142 | &xor ($a4,$a2); # a2=a4^a2^a4 | ||
| 143 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | ||
| 144 | &xor ($a4,$a1); # a1^a2^a4 | ||
| 145 | &sar (@i[1],31); # broardcast 30th bit | ||
| 146 | &and ($lo,$b); | ||
| 147 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | ||
| 148 | &and (@i[1],$b); | ||
| 149 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | ||
| 150 | &mov ($hi,$lo); | ||
| 151 | &shl ($lo,31); | ||
| 152 | &mov (@T[0],@i[1]); | ||
| 153 | &shr ($hi,1); | ||
| 154 | |||
| 155 | &mov (@i[0],0x7); | ||
| 156 | &shl (@i[1],30); | ||
| 157 | &and (@i[0],$b); | ||
| 158 | &shr (@T[0],2); | ||
| 159 | &xor ($lo,@i[1]); | ||
| 160 | |||
| 161 | &shr ($b,3); | ||
| 162 | &mov (@i[1],0x7); # 5-byte instruction!? | ||
| 163 | &and (@i[1],$b); | ||
| 164 | &shr ($b,3); | ||
| 165 | &xor ($hi,@T[0]); | ||
| 166 | &xor ($lo,&DWP(0,"esp",@i[0],4)); | ||
| 167 | &mov (@i[0],0x7); | ||
| 168 | &and (@i[0],$b); | ||
| 169 | &shr ($b,3); | ||
| 170 | for($n=1;$n<9;$n++) { | ||
| 171 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 172 | &mov (@i[1],0x7); | ||
| 173 | &mov (@T[0],@T[1]); | ||
| 174 | &shl (@T[1],3*$n); | ||
| 175 | &and (@i[1],$b); | ||
| 176 | &shr (@T[0],32-3*$n); | ||
| 177 | &xor ($lo,@T[1]); | ||
| 178 | &shr ($b,3); | ||
| 179 | &xor ($hi,@T[0]); | ||
| 180 | |||
| 181 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
| 182 | } | ||
| 183 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | ||
| 184 | &mov (@T[0],@T[1]); | ||
| 185 | &shl (@T[1],3*$n); | ||
| 186 | &mov (@i[1],&DWP(0,"esp",@i[0],4)); | ||
| 187 | &shr (@T[0],32-3*$n); $n++; | ||
| 188 | &mov (@i[0],@i[1]); | ||
| 189 | &xor ($lo,@T[1]); | ||
| 190 | &shl (@i[1],3*$n); | ||
| 191 | &xor ($hi,@T[0]); | ||
| 192 | &shr (@i[0],32-3*$n); | ||
| 193 | &xor ($lo,@i[1]); | ||
| 194 | &xor ($hi,@i[0]); | ||
| 195 | |||
| 196 | &add ("esp",32+4); | ||
| 197 | &ret (); | ||
| 198 | &function_end_B("_mul_1x1_ialu"); | ||
| 199 | |||
| 200 | # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); | ||
| 201 | &function_begin_B("bn_GF2m_mul_2x2"); | ||
| 202 | if (!$x86only) { | ||
| 203 | &picmeup("edx","OPENSSL_ia32cap_P"); | ||
| 204 | &mov ("eax",&DWP(0,"edx")); | ||
| 205 | &mov ("edx",&DWP(4,"edx")); | ||
| 206 | &test ("eax",1<<23); # check MMX bit | ||
| 207 | &jz (&label("ialu")); | ||
| 208 | if ($sse2) { | ||
| 209 | &test ("eax",1<<24); # check FXSR bit | ||
| 210 | &jz (&label("mmx")); | ||
| 211 | &test ("edx",1<<1); # check PCLMULQDQ bit | ||
| 212 | &jz (&label("mmx")); | ||
| 213 | |||
| 214 | &movups ("xmm0",&QWP(8,"esp")); | ||
| 215 | &shufps ("xmm0","xmm0",0b10110001); | ||
| 216 | &pclmulqdq ("xmm0","xmm0",1); | ||
| 217 | &mov ("eax",&DWP(4,"esp")); | ||
| 218 | &movups (&QWP(0,"eax"),"xmm0"); | ||
| 219 | &ret (); | ||
| 220 | |||
| 221 | &set_label("mmx",16); | ||
| 222 | } | ||
| 223 | &push ("ebp"); | ||
| 224 | &push ("ebx"); | ||
| 225 | &push ("esi"); | ||
| 226 | &push ("edi"); | ||
| 227 | &mov ($a,&wparam(1)); | ||
| 228 | &mov ($b,&wparam(3)); | ||
| 229 | &call ("_mul_1x1_mmx"); # a1·b1 | ||
| 230 | &movq ("mm7",$R); | ||
| 231 | |||
| 232 | &mov ($a,&wparam(2)); | ||
| 233 | &mov ($b,&wparam(4)); | ||
| 234 | &call ("_mul_1x1_mmx"); # a0·b0 | ||
| 235 | &movq ("mm6",$R); | ||
| 236 | |||
| 237 | &mov ($a,&wparam(1)); | ||
| 238 | &mov ($b,&wparam(3)); | ||
| 239 | &xor ($a,&wparam(2)); | ||
| 240 | &xor ($b,&wparam(4)); | ||
| 241 | &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) | ||
| 242 | &pxor ($R,"mm7"); | ||
| 243 | &mov ($a,&wparam(0)); | ||
| 244 | &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 | ||
| 245 | |||
| 246 | &movq ($A,$R); | ||
| 247 | &psllq ($R,32); | ||
| 248 | &pop ("edi"); | ||
| 249 | &psrlq ($A,32); | ||
| 250 | &pop ("esi"); | ||
| 251 | &pxor ($R,"mm6"); | ||
| 252 | &pop ("ebx"); | ||
| 253 | &pxor ($A,"mm7"); | ||
| 254 | &movq (&QWP(0,$a),$R); | ||
| 255 | &pop ("ebp"); | ||
| 256 | &movq (&QWP(8,$a),$A); | ||
| 257 | &emms (); | ||
| 258 | &ret (); | ||
| 259 | &set_label("ialu",16); | ||
| 260 | } | ||
| 261 | &push ("ebp"); | ||
| 262 | &push ("ebx"); | ||
| 263 | &push ("esi"); | ||
| 264 | &push ("edi"); | ||
| 265 | &stack_push(4+1); | ||
| 266 | |||
| 267 | &mov ($a,&wparam(1)); | ||
| 268 | &mov ($b,&wparam(3)); | ||
| 269 | &call ("_mul_1x1_ialu"); # a1·b1 | ||
| 270 | &mov (&DWP(8,"esp"),$lo); | ||
| 271 | &mov (&DWP(12,"esp"),$hi); | ||
| 272 | |||
| 273 | &mov ($a,&wparam(2)); | ||
| 274 | &mov ($b,&wparam(4)); | ||
| 275 | &call ("_mul_1x1_ialu"); # a0·b0 | ||
| 276 | &mov (&DWP(0,"esp"),$lo); | ||
| 277 | &mov (&DWP(4,"esp"),$hi); | ||
| 278 | |||
| 279 | &mov ($a,&wparam(1)); | ||
| 280 | &mov ($b,&wparam(3)); | ||
| 281 | &xor ($a,&wparam(2)); | ||
| 282 | &xor ($b,&wparam(4)); | ||
| 283 | &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) | ||
| 284 | |||
| 285 | &mov ("ebp",&wparam(0)); | ||
| 286 | @r=("ebx","ecx","edi","esi"); | ||
| 287 | &mov (@r[0],&DWP(0,"esp")); | ||
| 288 | &mov (@r[1],&DWP(4,"esp")); | ||
| 289 | &mov (@r[2],&DWP(8,"esp")); | ||
| 290 | &mov (@r[3],&DWP(12,"esp")); | ||
| 291 | |||
| 292 | &xor ($lo,$hi); | ||
| 293 | &xor ($hi,@r[1]); | ||
| 294 | &xor ($lo,@r[0]); | ||
| 295 | &mov (&DWP(0,"ebp"),@r[0]); | ||
| 296 | &xor ($hi,@r[2]); | ||
| 297 | &mov (&DWP(12,"ebp"),@r[3]); | ||
| 298 | &xor ($lo,@r[3]); | ||
| 299 | &stack_pop(4+1); | ||
| 300 | &xor ($hi,@r[3]); | ||
| 301 | &pop ("edi"); | ||
| 302 | &xor ($lo,$hi); | ||
| 303 | &pop ("esi"); | ||
| 304 | &mov (&DWP(8,"ebp"),$hi); | ||
| 305 | &pop ("ebx"); | ||
| 306 | &mov (&DWP(4,"ebp"),$lo); | ||
| 307 | &pop ("ebp"); | ||
| 308 | &ret (); | ||
| 309 | &function_end_B("bn_GF2m_mul_2x2"); | ||
| 310 | |||
| 311 | &asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 312 | |||
| 313 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl new file mode 100644 index 0000000000..1658acbbdd --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl | |||
| @@ -0,0 +1,389 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
| 13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
| 14 | # the time being... Except that it has two code paths: code suitable | ||
| 15 | # for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and | ||
| 16 | # later. Improvement varies from one benchmark and µ-arch to another. | ||
| 17 | # Vanilla code path is at most 20% faster than compiler-generated code | ||
| 18 | # [not very impressive], while PCLMULQDQ - whole 85%-160% better on | ||
| 19 | # 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that | ||
| 20 | # these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not | ||
| 21 | # all CPU time is burnt in it... | ||
| 22 | |||
| 23 | $flavour = shift; | ||
| 24 | $output = shift; | ||
| 25 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 26 | |||
| 27 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 28 | |||
| 29 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 30 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 31 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 32 | die "can't locate x86_64-xlate.pl"; | ||
| 33 | |||
| 34 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 35 | |||
| 36 | ($lo,$hi)=("%rax","%rdx"); $a=$lo; | ||
| 37 | ($i0,$i1)=("%rsi","%rdi"); | ||
| 38 | ($t0,$t1)=("%rbx","%rcx"); | ||
| 39 | ($b,$mask)=("%rbp","%r8"); | ||
| 40 | ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15)); | ||
| 41 | ($R,$Tx)=("%xmm0","%xmm1"); | ||
| 42 | |||
| 43 | $code.=<<___; | ||
| 44 | .text | ||
| 45 | |||
| 46 | .type _mul_1x1,\@abi-omnipotent | ||
| 47 | .align 16 | ||
| 48 | _mul_1x1: | ||
| 49 | sub \$128+8,%rsp | ||
| 50 | mov \$-1,$a1 | ||
| 51 | lea ($a,$a),$i0 | ||
| 52 | shr \$3,$a1 | ||
| 53 | lea (,$a,4),$i1 | ||
| 54 | and $a,$a1 # a1=a&0x1fffffffffffffff | ||
| 55 | lea (,$a,8),$a8 | ||
| 56 | sar \$63,$a # broadcast 63rd bit | ||
| 57 | lea ($a1,$a1),$a2 | ||
| 58 | sar \$63,$i0 # broadcast 62nd bit | ||
| 59 | lea (,$a1,4),$a4 | ||
| 60 | and $b,$a | ||
| 61 | sar \$63,$i1 # boardcast 61st bit | ||
| 62 | mov $a,$hi # $a is $lo | ||
| 63 | shl \$63,$lo | ||
| 64 | and $b,$i0 | ||
| 65 | shr \$1,$hi | ||
| 66 | mov $i0,$t1 | ||
| 67 | shl \$62,$i0 | ||
| 68 | and $b,$i1 | ||
| 69 | shr \$2,$t1 | ||
| 70 | xor $i0,$lo | ||
| 71 | mov $i1,$t0 | ||
| 72 | shl \$61,$i1 | ||
| 73 | xor $t1,$hi | ||
| 74 | shr \$3,$t0 | ||
| 75 | xor $i1,$lo | ||
| 76 | xor $t0,$hi | ||
| 77 | |||
| 78 | mov $a1,$a12 | ||
| 79 | movq \$0,0(%rsp) # tab[0]=0 | ||
| 80 | xor $a2,$a12 # a1^a2 | ||
| 81 | mov $a1,8(%rsp) # tab[1]=a1 | ||
| 82 | mov $a4,$a48 | ||
| 83 | mov $a2,16(%rsp) # tab[2]=a2 | ||
| 84 | xor $a8,$a48 # a4^a8 | ||
| 85 | mov $a12,24(%rsp) # tab[3]=a1^a2 | ||
| 86 | |||
| 87 | xor $a4,$a1 | ||
| 88 | mov $a4,32(%rsp) # tab[4]=a4 | ||
| 89 | xor $a4,$a2 | ||
| 90 | mov $a1,40(%rsp) # tab[5]=a1^a4 | ||
| 91 | xor $a4,$a12 | ||
| 92 | mov $a2,48(%rsp) # tab[6]=a2^a4 | ||
| 93 | xor $a48,$a1 # a1^a4^a4^a8=a1^a8 | ||
| 94 | mov $a12,56(%rsp) # tab[7]=a1^a2^a4 | ||
| 95 | xor $a48,$a2 # a2^a4^a4^a8=a1^a8 | ||
| 96 | |||
| 97 | mov $a8,64(%rsp) # tab[8]=a8 | ||
| 98 | xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8 | ||
| 99 | mov $a1,72(%rsp) # tab[9]=a1^a8 | ||
| 100 | xor $a4,$a1 # a1^a8^a4 | ||
| 101 | mov $a2,80(%rsp) # tab[10]=a2^a8 | ||
| 102 | xor $a4,$a2 # a2^a8^a4 | ||
| 103 | mov $a12,88(%rsp) # tab[11]=a1^a2^a8 | ||
| 104 | |||
| 105 | xor $a4,$a12 # a1^a2^a8^a4 | ||
| 106 | mov $a48,96(%rsp) # tab[12]=a4^a8 | ||
| 107 | mov $mask,$i0 | ||
| 108 | mov $a1,104(%rsp) # tab[13]=a1^a4^a8 | ||
| 109 | and $b,$i0 | ||
| 110 | mov $a2,112(%rsp) # tab[14]=a2^a4^a8 | ||
| 111 | shr \$4,$b | ||
| 112 | mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8 | ||
| 113 | mov $mask,$i1 | ||
| 114 | and $b,$i1 | ||
| 115 | shr \$4,$b | ||
| 116 | |||
| 117 | movq (%rsp,$i0,8),$R # half of calculations is done in SSE2 | ||
| 118 | mov $mask,$i0 | ||
| 119 | and $b,$i0 | ||
| 120 | shr \$4,$b | ||
| 121 | ___ | ||
| 122 | for ($n=1;$n<8;$n++) { | ||
| 123 | $code.=<<___; | ||
| 124 | mov (%rsp,$i1,8),$t1 | ||
| 125 | mov $mask,$i1 | ||
| 126 | mov $t1,$t0 | ||
| 127 | shl \$`8*$n-4`,$t1 | ||
| 128 | and $b,$i1 | ||
| 129 | movq (%rsp,$i0,8),$Tx | ||
| 130 | shr \$`64-(8*$n-4)`,$t0 | ||
| 131 | xor $t1,$lo | ||
| 132 | pslldq \$$n,$Tx | ||
| 133 | mov $mask,$i0 | ||
| 134 | shr \$4,$b | ||
| 135 | xor $t0,$hi | ||
| 136 | and $b,$i0 | ||
| 137 | shr \$4,$b | ||
| 138 | pxor $Tx,$R | ||
| 139 | ___ | ||
| 140 | } | ||
| 141 | $code.=<<___; | ||
| 142 | mov (%rsp,$i1,8),$t1 | ||
| 143 | mov $t1,$t0 | ||
| 144 | shl \$`8*$n-4`,$t1 | ||
| 145 | movq $R,$i0 | ||
| 146 | shr \$`64-(8*$n-4)`,$t0 | ||
| 147 | xor $t1,$lo | ||
| 148 | psrldq \$8,$R | ||
| 149 | xor $t0,$hi | ||
| 150 | movq $R,$i1 | ||
| 151 | xor $i0,$lo | ||
| 152 | xor $i1,$hi | ||
| 153 | |||
| 154 | add \$128+8,%rsp | ||
| 155 | ret | ||
| 156 | .Lend_mul_1x1: | ||
| 157 | .size _mul_1x1,.-_mul_1x1 | ||
| 158 | ___ | ||
| 159 | |||
| 160 | ($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order | ||
| 161 | ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order | ||
| 162 | |||
| 163 | $code.=<<___; | ||
| 164 | .extern OPENSSL_ia32cap_P | ||
| 165 | .globl bn_GF2m_mul_2x2 | ||
| 166 | .type bn_GF2m_mul_2x2,\@abi-omnipotent | ||
| 167 | .align 16 | ||
| 168 | bn_GF2m_mul_2x2: | ||
| 169 | mov OPENSSL_ia32cap_P(%rip),%rax | ||
| 170 | bt \$33,%rax | ||
| 171 | jnc .Lvanilla_mul_2x2 | ||
| 172 | |||
| 173 | movq $a1,%xmm0 | ||
| 174 | movq $b1,%xmm1 | ||
| 175 | movq $a0,%xmm2 | ||
| 176 | ___ | ||
| 177 | $code.=<<___ if ($win64); | ||
| 178 | movq 40(%rsp),%xmm3 | ||
| 179 | ___ | ||
| 180 | $code.=<<___ if (!$win64); | ||
| 181 | movq $b0,%xmm3 | ||
| 182 | ___ | ||
| 183 | $code.=<<___; | ||
| 184 | movdqa %xmm0,%xmm4 | ||
| 185 | movdqa %xmm1,%xmm5 | ||
| 186 | pclmulqdq \$0,%xmm1,%xmm0 # a1·b1 | ||
| 187 | pxor %xmm2,%xmm4 | ||
| 188 | pxor %xmm3,%xmm5 | ||
| 189 | pclmulqdq \$0,%xmm3,%xmm2 # a0·b0 | ||
| 190 | pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1) | ||
| 191 | xorps %xmm0,%xmm4 | ||
| 192 | xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
| 193 | movdqa %xmm4,%xmm5 | ||
| 194 | pslldq \$8,%xmm4 | ||
| 195 | psrldq \$8,%xmm5 | ||
| 196 | pxor %xmm4,%xmm2 | ||
| 197 | pxor %xmm5,%xmm0 | ||
| 198 | movdqu %xmm2,0($rp) | ||
| 199 | movdqu %xmm0,16($rp) | ||
| 200 | ret | ||
| 201 | |||
| 202 | .align 16 | ||
| 203 | .Lvanilla_mul_2x2: | ||
| 204 | lea -8*17(%rsp),%rsp | ||
| 205 | ___ | ||
| 206 | $code.=<<___ if ($win64); | ||
| 207 | mov `8*17+40`(%rsp),$b0 | ||
| 208 | mov %rdi,8*15(%rsp) | ||
| 209 | mov %rsi,8*16(%rsp) | ||
| 210 | ___ | ||
| 211 | $code.=<<___; | ||
| 212 | mov %r14,8*10(%rsp) | ||
| 213 | mov %r13,8*11(%rsp) | ||
| 214 | mov %r12,8*12(%rsp) | ||
| 215 | mov %rbp,8*13(%rsp) | ||
| 216 | mov %rbx,8*14(%rsp) | ||
| 217 | .Lbody_mul_2x2: | ||
| 218 | mov $rp,32(%rsp) # save the arguments | ||
| 219 | mov $a1,40(%rsp) | ||
| 220 | mov $a0,48(%rsp) | ||
| 221 | mov $b1,56(%rsp) | ||
| 222 | mov $b0,64(%rsp) | ||
| 223 | |||
| 224 | mov \$0xf,$mask | ||
| 225 | mov $a1,$a | ||
| 226 | mov $b1,$b | ||
| 227 | call _mul_1x1 # a1·b1 | ||
| 228 | mov $lo,16(%rsp) | ||
| 229 | mov $hi,24(%rsp) | ||
| 230 | |||
| 231 | mov 48(%rsp),$a | ||
| 232 | mov 64(%rsp),$b | ||
| 233 | call _mul_1x1 # a0·b0 | ||
| 234 | mov $lo,0(%rsp) | ||
| 235 | mov $hi,8(%rsp) | ||
| 236 | |||
| 237 | mov 40(%rsp),$a | ||
| 238 | mov 56(%rsp),$b | ||
| 239 | xor 48(%rsp),$a | ||
| 240 | xor 64(%rsp),$b | ||
| 241 | call _mul_1x1 # (a0+a1)·(b0+b1) | ||
| 242 | ___ | ||
| 243 | @r=("%rbx","%rcx","%rdi","%rsi"); | ||
| 244 | $code.=<<___; | ||
| 245 | mov 0(%rsp),@r[0] | ||
| 246 | mov 8(%rsp),@r[1] | ||
| 247 | mov 16(%rsp),@r[2] | ||
| 248 | mov 24(%rsp),@r[3] | ||
| 249 | mov 32(%rsp),%rbp | ||
| 250 | |||
| 251 | xor $hi,$lo | ||
| 252 | xor @r[1],$hi | ||
| 253 | xor @r[0],$lo | ||
| 254 | mov @r[0],0(%rbp) | ||
| 255 | xor @r[2],$hi | ||
| 256 | mov @r[3],24(%rbp) | ||
| 257 | xor @r[3],$lo | ||
| 258 | xor @r[3],$hi | ||
| 259 | xor $hi,$lo | ||
| 260 | mov $hi,16(%rbp) | ||
| 261 | mov $lo,8(%rbp) | ||
| 262 | |||
| 263 | mov 8*10(%rsp),%r14 | ||
| 264 | mov 8*11(%rsp),%r13 | ||
| 265 | mov 8*12(%rsp),%r12 | ||
| 266 | mov 8*13(%rsp),%rbp | ||
| 267 | mov 8*14(%rsp),%rbx | ||
| 268 | ___ | ||
| 269 | $code.=<<___ if ($win64); | ||
| 270 | mov 8*15(%rsp),%rdi | ||
| 271 | mov 8*16(%rsp),%rsi | ||
| 272 | ___ | ||
| 273 | $code.=<<___; | ||
| 274 | lea 8*17(%rsp),%rsp | ||
| 275 | ret | ||
| 276 | .Lend_mul_2x2: | ||
| 277 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 278 | .asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 279 | .align 16 | ||
| 280 | ___ | ||
| 281 | |||
| 282 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 283 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 284 | if ($win64) { | ||
| 285 | $rec="%rcx"; | ||
| 286 | $frame="%rdx"; | ||
| 287 | $context="%r8"; | ||
| 288 | $disp="%r9"; | ||
| 289 | |||
| 290 | $code.=<<___; | ||
| 291 | .extern __imp_RtlVirtualUnwind | ||
| 292 | |||
| 293 | .type se_handler,\@abi-omnipotent | ||
| 294 | .align 16 | ||
| 295 | se_handler: | ||
| 296 | push %rsi | ||
| 297 | push %rdi | ||
| 298 | push %rbx | ||
| 299 | push %rbp | ||
| 300 | push %r12 | ||
| 301 | push %r13 | ||
| 302 | push %r14 | ||
| 303 | push %r15 | ||
| 304 | pushfq | ||
| 305 | sub \$64,%rsp | ||
| 306 | |||
| 307 | mov 152($context),%rax # pull context->Rsp | ||
| 308 | mov 248($context),%rbx # pull context->Rip | ||
| 309 | |||
| 310 | lea .Lbody_mul_2x2(%rip),%r10 | ||
| 311 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
| 312 | jb .Lin_prologue | ||
| 313 | |||
| 314 | mov 8*10(%rax),%r14 # mimic epilogue | ||
| 315 | mov 8*11(%rax),%r13 | ||
| 316 | mov 8*12(%rax),%r12 | ||
| 317 | mov 8*13(%rax),%rbp | ||
| 318 | mov 8*14(%rax),%rbx | ||
| 319 | mov 8*15(%rax),%rdi | ||
| 320 | mov 8*16(%rax),%rsi | ||
| 321 | |||
| 322 | mov %rbx,144($context) # restore context->Rbx | ||
| 323 | mov %rbp,160($context) # restore context->Rbp | ||
| 324 | mov %rsi,168($context) # restore context->Rsi | ||
| 325 | mov %rdi,176($context) # restore context->Rdi | ||
| 326 | mov %r12,216($context) # restore context->R12 | ||
| 327 | mov %r13,224($context) # restore context->R13 | ||
| 328 | mov %r14,232($context) # restore context->R14 | ||
| 329 | |||
| 330 | .Lin_prologue: | ||
| 331 | lea 8*17(%rax),%rax | ||
| 332 | mov %rax,152($context) # restore context->Rsp | ||
| 333 | |||
| 334 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 335 | mov $context,%rsi # context | ||
| 336 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 337 | .long 0xa548f3fc # cld; rep movsq | ||
| 338 | |||
| 339 | mov $disp,%rsi | ||
| 340 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 341 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 342 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 343 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 344 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 345 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 346 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 347 | mov %r10,32(%rsp) # arg5 | ||
| 348 | mov %r11,40(%rsp) # arg6 | ||
| 349 | mov %r12,48(%rsp) # arg7 | ||
| 350 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 351 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 352 | |||
| 353 | mov \$1,%eax # ExceptionContinueSearch | ||
| 354 | add \$64,%rsp | ||
| 355 | popfq | ||
| 356 | pop %r15 | ||
| 357 | pop %r14 | ||
| 358 | pop %r13 | ||
| 359 | pop %r12 | ||
| 360 | pop %rbp | ||
| 361 | pop %rbx | ||
| 362 | pop %rdi | ||
| 363 | pop %rsi | ||
| 364 | ret | ||
| 365 | .size se_handler,.-se_handler | ||
| 366 | |||
| 367 | .section .pdata | ||
| 368 | .align 4 | ||
| 369 | .rva _mul_1x1 | ||
| 370 | .rva .Lend_mul_1x1 | ||
| 371 | .rva .LSEH_info_1x1 | ||
| 372 | |||
| 373 | .rva .Lvanilla_mul_2x2 | ||
| 374 | .rva .Lend_mul_2x2 | ||
| 375 | .rva .LSEH_info_2x2 | ||
| 376 | .section .xdata | ||
| 377 | .align 8 | ||
| 378 | .LSEH_info_1x1: | ||
| 379 | .byte 0x01,0x07,0x02,0x00 | ||
| 380 | .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8 | ||
| 381 | .LSEH_info_2x2: | ||
| 382 | .byte 9,0,0,0 | ||
| 383 | .rva se_handler | ||
| 384 | ___ | ||
| 385 | } | ||
| 386 | |||
| 387 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 388 | print $code; | ||
| 389 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl index 3b7a6f243f..5d79b35e1c 100755 --- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl +++ b/src/lib/libcrypto/bn/asm/x86_64-mont.pl | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | #!/usr/bin/env perl | 1 | #!/usr/bin/env perl |
| 2 | 2 | ||
| 3 | # ==================================================================== | 3 | # ==================================================================== |
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| 5 | # project. The module is, however, dual licensed under OpenSSL and | 5 | # project. The module is, however, dual licensed under OpenSSL and |
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | 7 | # details see http://www.openssl.org/~appro/cryptogams/. |
| @@ -15,6 +15,20 @@ | |||
| 15 | # respectful 50%. It remains to be seen if loop unrolling and | 15 | # respectful 50%. It remains to be seen if loop unrolling and |
| 16 | # dedicated squaring routine can provide further improvement... | 16 | # dedicated squaring routine can provide further improvement... |
| 17 | 17 | ||
| 18 | # July 2011. | ||
| 19 | # | ||
| 20 | # Add dedicated squaring procedure. Performance improvement varies | ||
| 21 | # from platform to platform, but in average it's ~5%/15%/25%/33% | ||
| 22 | # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. | ||
| 23 | |||
| 24 | # August 2011. | ||
| 25 | # | ||
| 26 | # Unroll and modulo-schedule inner loops in such manner that they | ||
| 27 | # are "fallen through" for input lengths of 8, which is critical for | ||
| 28 | # 1024-bit RSA *sign*. Average performance improvement in comparison | ||
| 29 | # to *initial* version of this module from 2005 is ~0%/30%/40%/45% | ||
| 30 | # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. | ||
| 31 | |||
| 18 | $flavour = shift; | 32 | $flavour = shift; |
| 19 | $output = shift; | 33 | $output = shift; |
| 20 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | 34 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| @@ -37,7 +51,6 @@ $n0="%r8"; # const BN_ULONG *n0, | |||
| 37 | $num="%r9"; # int num); | 51 | $num="%r9"; # int num); |
| 38 | $lo0="%r10"; | 52 | $lo0="%r10"; |
| 39 | $hi0="%r11"; | 53 | $hi0="%r11"; |
| 40 | $bp="%r12"; # reassign $bp | ||
| 41 | $hi1="%r13"; | 54 | $hi1="%r13"; |
| 42 | $i="%r14"; | 55 | $i="%r14"; |
| 43 | $j="%r15"; | 56 | $j="%r15"; |
| @@ -51,6 +64,16 @@ $code=<<___; | |||
| 51 | .type bn_mul_mont,\@function,6 | 64 | .type bn_mul_mont,\@function,6 |
| 52 | .align 16 | 65 | .align 16 |
| 53 | bn_mul_mont: | 66 | bn_mul_mont: |
| 67 | test \$3,${num}d | ||
| 68 | jnz .Lmul_enter | ||
| 69 | cmp \$8,${num}d | ||
| 70 | jb .Lmul_enter | ||
| 71 | cmp $ap,$bp | ||
| 72 | jne .Lmul4x_enter | ||
| 73 | jmp .Lsqr4x_enter | ||
| 74 | |||
| 75 | .align 16 | ||
| 76 | .Lmul_enter: | ||
| 54 | push %rbx | 77 | push %rbx |
| 55 | push %rbp | 78 | push %rbp |
| 56 | push %r12 | 79 | push %r12 |
| @@ -66,48 +89,66 @@ bn_mul_mont: | |||
| 66 | and \$-1024,%rsp # minimize TLB usage | 89 | and \$-1024,%rsp # minimize TLB usage |
| 67 | 90 | ||
| 68 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp | 91 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp |
| 69 | .Lprologue: | 92 | .Lmul_body: |
| 70 | mov %rdx,$bp # $bp reassigned, remember? | 93 | mov $bp,%r12 # reassign $bp |
| 71 | 94 | ___ | |
| 95 | $bp="%r12"; | ||
| 96 | $code.=<<___; | ||
| 72 | mov ($n0),$n0 # pull n0[0] value | 97 | mov ($n0),$n0 # pull n0[0] value |
| 98 | mov ($bp),$m0 # m0=bp[0] | ||
| 99 | mov ($ap),%rax | ||
| 73 | 100 | ||
| 74 | xor $i,$i # i=0 | 101 | xor $i,$i # i=0 |
| 75 | xor $j,$j # j=0 | 102 | xor $j,$j # j=0 |
| 76 | 103 | ||
| 77 | mov ($bp),$m0 # m0=bp[0] | 104 | mov $n0,$m1 |
| 78 | mov ($ap),%rax | ||
| 79 | mulq $m0 # ap[0]*bp[0] | 105 | mulq $m0 # ap[0]*bp[0] |
| 80 | mov %rax,$lo0 | 106 | mov %rax,$lo0 |
| 81 | mov %rdx,$hi0 | 107 | mov ($np),%rax |
| 82 | 108 | ||
| 83 | imulq $n0,%rax # "tp[0]"*n0 | 109 | imulq $lo0,$m1 # "tp[0]"*n0 |
| 84 | mov %rax,$m1 | 110 | mov %rdx,$hi0 |
| 85 | 111 | ||
| 86 | mulq ($np) # np[0]*m1 | 112 | mulq $m1 # np[0]*m1 |
| 87 | add $lo0,%rax # discarded | 113 | add %rax,$lo0 # discarded |
| 114 | mov 8($ap),%rax | ||
| 88 | adc \$0,%rdx | 115 | adc \$0,%rdx |
| 89 | mov %rdx,$hi1 | 116 | mov %rdx,$hi1 |
| 90 | 117 | ||
| 91 | lea 1($j),$j # j++ | 118 | lea 1($j),$j # j++ |
| 119 | jmp .L1st_enter | ||
| 120 | |||
| 121 | .align 16 | ||
| 92 | .L1st: | 122 | .L1st: |
| 123 | add %rax,$hi1 | ||
| 93 | mov ($ap,$j,8),%rax | 124 | mov ($ap,$j,8),%rax |
| 94 | mulq $m0 # ap[j]*bp[0] | ||
| 95 | add $hi0,%rax | ||
| 96 | adc \$0,%rdx | 125 | adc \$0,%rdx |
| 97 | mov %rax,$lo0 | 126 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] |
| 127 | mov $lo0,$hi0 | ||
| 128 | adc \$0,%rdx | ||
| 129 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 130 | mov %rdx,$hi1 | ||
| 131 | |||
| 132 | .L1st_enter: | ||
| 133 | mulq $m0 # ap[j]*bp[0] | ||
| 134 | add %rax,$hi0 | ||
| 98 | mov ($np,$j,8),%rax | 135 | mov ($np,$j,8),%rax |
| 99 | mov %rdx,$hi0 | 136 | adc \$0,%rdx |
| 137 | lea 1($j),$j # j++ | ||
| 138 | mov %rdx,$lo0 | ||
| 100 | 139 | ||
| 101 | mulq $m1 # np[j]*m1 | 140 | mulq $m1 # np[j]*m1 |
| 102 | add $hi1,%rax | 141 | cmp $num,$j |
| 103 | lea 1($j),$j # j++ | 142 | jne .L1st |
| 143 | |||
| 144 | add %rax,$hi1 | ||
| 145 | mov ($ap),%rax # ap[0] | ||
| 104 | adc \$0,%rdx | 146 | adc \$0,%rdx |
| 105 | add $lo0,%rax # np[j]*m1+ap[j]*bp[0] | 147 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] |
| 106 | adc \$0,%rdx | 148 | adc \$0,%rdx |
| 107 | mov %rax,-16(%rsp,$j,8) # tp[j-1] | 149 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] |
| 108 | cmp $num,$j | ||
| 109 | mov %rdx,$hi1 | 150 | mov %rdx,$hi1 |
| 110 | jl .L1st | 151 | mov $lo0,$hi0 |
| 111 | 152 | ||
| 112 | xor %rdx,%rdx | 153 | xor %rdx,%rdx |
| 113 | add $hi0,$hi1 | 154 | add $hi0,$hi1 |
| @@ -116,50 +157,64 @@ bn_mul_mont: | |||
| 116 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | 157 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit |
| 117 | 158 | ||
| 118 | lea 1($i),$i # i++ | 159 | lea 1($i),$i # i++ |
| 119 | .align 4 | 160 | jmp .Louter |
| 161 | .align 16 | ||
| 120 | .Louter: | 162 | .Louter: |
| 121 | xor $j,$j # j=0 | ||
| 122 | |||
| 123 | mov ($bp,$i,8),$m0 # m0=bp[i] | 163 | mov ($bp,$i,8),$m0 # m0=bp[i] |
| 124 | mov ($ap),%rax # ap[0] | 164 | xor $j,$j # j=0 |
| 165 | mov $n0,$m1 | ||
| 166 | mov (%rsp),$lo0 | ||
| 125 | mulq $m0 # ap[0]*bp[i] | 167 | mulq $m0 # ap[0]*bp[i] |
| 126 | add (%rsp),%rax # ap[0]*bp[i]+tp[0] | 168 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] |
| 169 | mov ($np),%rax | ||
| 127 | adc \$0,%rdx | 170 | adc \$0,%rdx |
| 128 | mov %rax,$lo0 | ||
| 129 | mov %rdx,$hi0 | ||
| 130 | 171 | ||
| 131 | imulq $n0,%rax # tp[0]*n0 | 172 | imulq $lo0,$m1 # tp[0]*n0 |
| 132 | mov %rax,$m1 | 173 | mov %rdx,$hi0 |
| 133 | 174 | ||
| 134 | mulq ($np,$j,8) # np[0]*m1 | 175 | mulq $m1 # np[0]*m1 |
| 135 | add $lo0,%rax # discarded | 176 | add %rax,$lo0 # discarded |
| 136 | mov 8(%rsp),$lo0 # tp[1] | 177 | mov 8($ap),%rax |
| 137 | adc \$0,%rdx | 178 | adc \$0,%rdx |
| 179 | mov 8(%rsp),$lo0 # tp[1] | ||
| 138 | mov %rdx,$hi1 | 180 | mov %rdx,$hi1 |
| 139 | 181 | ||
| 140 | lea 1($j),$j # j++ | 182 | lea 1($j),$j # j++ |
| 141 | .align 4 | 183 | jmp .Linner_enter |
| 184 | |||
| 185 | .align 16 | ||
| 142 | .Linner: | 186 | .Linner: |
| 187 | add %rax,$hi1 | ||
| 143 | mov ($ap,$j,8),%rax | 188 | mov ($ap,$j,8),%rax |
| 144 | mulq $m0 # ap[j]*bp[i] | ||
| 145 | add $hi0,%rax | ||
| 146 | adc \$0,%rdx | 189 | adc \$0,%rdx |
| 147 | add %rax,$lo0 # ap[j]*bp[i]+tp[j] | 190 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] |
| 191 | mov (%rsp,$j,8),$lo0 | ||
| 192 | adc \$0,%rdx | ||
| 193 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 194 | mov %rdx,$hi1 | ||
| 195 | |||
| 196 | .Linner_enter: | ||
| 197 | mulq $m0 # ap[j]*bp[i] | ||
| 198 | add %rax,$hi0 | ||
| 148 | mov ($np,$j,8),%rax | 199 | mov ($np,$j,8),%rax |
| 149 | adc \$0,%rdx | 200 | adc \$0,%rdx |
| 201 | add $hi0,$lo0 # ap[j]*bp[i]+tp[j] | ||
| 150 | mov %rdx,$hi0 | 202 | mov %rdx,$hi0 |
| 203 | adc \$0,$hi0 | ||
| 204 | lea 1($j),$j # j++ | ||
| 151 | 205 | ||
| 152 | mulq $m1 # np[j]*m1 | 206 | mulq $m1 # np[j]*m1 |
| 153 | add $hi1,%rax | 207 | cmp $num,$j |
| 154 | lea 1($j),$j # j++ | 208 | jne .Linner |
| 155 | adc \$0,%rdx | 209 | |
| 156 | add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j] | 210 | add %rax,$hi1 |
| 211 | mov ($ap),%rax # ap[0] | ||
| 157 | adc \$0,%rdx | 212 | adc \$0,%rdx |
| 213 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 158 | mov (%rsp,$j,8),$lo0 | 214 | mov (%rsp,$j,8),$lo0 |
| 159 | cmp $num,$j | 215 | adc \$0,%rdx |
| 160 | mov %rax,-16(%rsp,$j,8) # tp[j-1] | 216 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] |
| 161 | mov %rdx,$hi1 | 217 | mov %rdx,$hi1 |
| 162 | jl .Linner | ||
| 163 | 218 | ||
| 164 | xor %rdx,%rdx | 219 | xor %rdx,%rdx |
| 165 | add $hi0,$hi1 | 220 | add $hi0,$hi1 |
| @@ -173,35 +228,449 @@ bn_mul_mont: | |||
| 173 | cmp $num,$i | 228 | cmp $num,$i |
| 174 | jl .Louter | 229 | jl .Louter |
| 175 | 230 | ||
| 176 | lea (%rsp),$ap # borrow ap for tp | ||
| 177 | lea -1($num),$j # j=num-1 | ||
| 178 | |||
| 179 | mov ($ap),%rax # tp[0] | ||
| 180 | xor $i,$i # i=0 and clear CF! | 231 | xor $i,$i # i=0 and clear CF! |
| 232 | mov (%rsp),%rax # tp[0] | ||
| 233 | lea (%rsp),$ap # borrow ap for tp | ||
| 234 | mov $num,$j # j=num | ||
| 181 | jmp .Lsub | 235 | jmp .Lsub |
| 182 | .align 16 | 236 | .align 16 |
| 183 | .Lsub: sbb ($np,$i,8),%rax | 237 | .Lsub: sbb ($np,$i,8),%rax |
| 184 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] | 238 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] |
| 185 | dec $j # doesn't affect CF! | ||
| 186 | mov 8($ap,$i,8),%rax # tp[i+1] | 239 | mov 8($ap,$i,8),%rax # tp[i+1] |
| 187 | lea 1($i),$i # i++ | 240 | lea 1($i),$i # i++ |
| 188 | jge .Lsub | 241 | dec $j # doesnn't affect CF! |
| 242 | jnz .Lsub | ||
| 189 | 243 | ||
| 190 | sbb \$0,%rax # handle upmost overflow bit | 244 | sbb \$0,%rax # handle upmost overflow bit |
| 245 | xor $i,$i | ||
| 191 | and %rax,$ap | 246 | and %rax,$ap |
| 192 | not %rax | 247 | not %rax |
| 193 | mov $rp,$np | 248 | mov $rp,$np |
| 194 | and %rax,$np | 249 | and %rax,$np |
| 195 | lea -1($num),$j | 250 | mov $num,$j # j=num |
| 196 | or $np,$ap # ap=borrow?tp:rp | 251 | or $np,$ap # ap=borrow?tp:rp |
| 197 | .align 16 | 252 | .align 16 |
| 198 | .Lcopy: # copy or in-place refresh | 253 | .Lcopy: # copy or in-place refresh |
| 254 | mov ($ap,$i,8),%rax | ||
| 255 | mov $i,(%rsp,$i,8) # zap temporary vector | ||
| 256 | mov %rax,($rp,$i,8) # rp[i]=tp[i] | ||
| 257 | lea 1($i),$i | ||
| 258 | sub \$1,$j | ||
| 259 | jnz .Lcopy | ||
| 260 | |||
| 261 | mov 8(%rsp,$num,8),%rsi # restore %rsp | ||
| 262 | mov \$1,%rax | ||
| 263 | mov (%rsi),%r15 | ||
| 264 | mov 8(%rsi),%r14 | ||
| 265 | mov 16(%rsi),%r13 | ||
| 266 | mov 24(%rsi),%r12 | ||
| 267 | mov 32(%rsi),%rbp | ||
| 268 | mov 40(%rsi),%rbx | ||
| 269 | lea 48(%rsi),%rsp | ||
| 270 | .Lmul_epilogue: | ||
| 271 | ret | ||
| 272 | .size bn_mul_mont,.-bn_mul_mont | ||
| 273 | ___ | ||
| 274 | {{{ | ||
| 275 | my @A=("%r10","%r11"); | ||
| 276 | my @N=("%r13","%rdi"); | ||
| 277 | $code.=<<___; | ||
| 278 | .type bn_mul4x_mont,\@function,6 | ||
| 279 | .align 16 | ||
| 280 | bn_mul4x_mont: | ||
| 281 | .Lmul4x_enter: | ||
| 282 | push %rbx | ||
| 283 | push %rbp | ||
| 284 | push %r12 | ||
| 285 | push %r13 | ||
| 286 | push %r14 | ||
| 287 | push %r15 | ||
| 288 | |||
| 289 | mov ${num}d,${num}d | ||
| 290 | lea 4($num),%r10 | ||
| 291 | mov %rsp,%r11 | ||
| 292 | neg %r10 | ||
| 293 | lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) | ||
| 294 | and \$-1024,%rsp # minimize TLB usage | ||
| 295 | |||
| 296 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp | ||
| 297 | .Lmul4x_body: | ||
| 298 | mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp | ||
| 299 | mov %rdx,%r12 # reassign $bp | ||
| 300 | ___ | ||
| 301 | $bp="%r12"; | ||
| 302 | $code.=<<___; | ||
| 303 | mov ($n0),$n0 # pull n0[0] value | ||
| 304 | mov ($bp),$m0 # m0=bp[0] | ||
| 305 | mov ($ap),%rax | ||
| 306 | |||
| 307 | xor $i,$i # i=0 | ||
| 308 | xor $j,$j # j=0 | ||
| 309 | |||
| 310 | mov $n0,$m1 | ||
| 311 | mulq $m0 # ap[0]*bp[0] | ||
| 312 | mov %rax,$A[0] | ||
| 313 | mov ($np),%rax | ||
| 314 | |||
| 315 | imulq $A[0],$m1 # "tp[0]"*n0 | ||
| 316 | mov %rdx,$A[1] | ||
| 317 | |||
| 318 | mulq $m1 # np[0]*m1 | ||
| 319 | add %rax,$A[0] # discarded | ||
| 320 | mov 8($ap),%rax | ||
| 321 | adc \$0,%rdx | ||
| 322 | mov %rdx,$N[1] | ||
| 323 | |||
| 324 | mulq $m0 | ||
| 325 | add %rax,$A[1] | ||
| 326 | mov 8($np),%rax | ||
| 327 | adc \$0,%rdx | ||
| 328 | mov %rdx,$A[0] | ||
| 329 | |||
| 330 | mulq $m1 | ||
| 331 | add %rax,$N[1] | ||
| 332 | mov 16($ap),%rax | ||
| 333 | adc \$0,%rdx | ||
| 334 | add $A[1],$N[1] | ||
| 335 | lea 4($j),$j # j++ | ||
| 336 | adc \$0,%rdx | ||
| 337 | mov $N[1],(%rsp) | ||
| 338 | mov %rdx,$N[0] | ||
| 339 | jmp .L1st4x | ||
| 340 | .align 16 | ||
| 341 | .L1st4x: | ||
| 342 | mulq $m0 # ap[j]*bp[0] | ||
| 343 | add %rax,$A[0] | ||
| 344 | mov -16($np,$j,8),%rax | ||
| 345 | adc \$0,%rdx | ||
| 346 | mov %rdx,$A[1] | ||
| 347 | |||
| 348 | mulq $m1 # np[j]*m1 | ||
| 349 | add %rax,$N[0] | ||
| 350 | mov -8($ap,$j,8),%rax | ||
| 351 | adc \$0,%rdx | ||
| 352 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 353 | adc \$0,%rdx | ||
| 354 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 355 | mov %rdx,$N[1] | ||
| 356 | |||
| 357 | mulq $m0 # ap[j]*bp[0] | ||
| 358 | add %rax,$A[1] | ||
| 359 | mov -8($np,$j,8),%rax | ||
| 360 | adc \$0,%rdx | ||
| 361 | mov %rdx,$A[0] | ||
| 362 | |||
| 363 | mulq $m1 # np[j]*m1 | ||
| 364 | add %rax,$N[1] | ||
| 199 | mov ($ap,$j,8),%rax | 365 | mov ($ap,$j,8),%rax |
| 200 | mov %rax,($rp,$j,8) # rp[i]=tp[i] | 366 | adc \$0,%rdx |
| 201 | mov $i,(%rsp,$j,8) # zap temporary vector | 367 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] |
| 368 | adc \$0,%rdx | ||
| 369 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 370 | mov %rdx,$N[0] | ||
| 371 | |||
| 372 | mulq $m0 # ap[j]*bp[0] | ||
| 373 | add %rax,$A[0] | ||
| 374 | mov ($np,$j,8),%rax | ||
| 375 | adc \$0,%rdx | ||
| 376 | mov %rdx,$A[1] | ||
| 377 | |||
| 378 | mulq $m1 # np[j]*m1 | ||
| 379 | add %rax,$N[0] | ||
| 380 | mov 8($ap,$j,8),%rax | ||
| 381 | adc \$0,%rdx | ||
| 382 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 383 | adc \$0,%rdx | ||
| 384 | mov $N[0],-8(%rsp,$j,8) # tp[j-1] | ||
| 385 | mov %rdx,$N[1] | ||
| 386 | |||
| 387 | mulq $m0 # ap[j]*bp[0] | ||
| 388 | add %rax,$A[1] | ||
| 389 | mov 8($np,$j,8),%rax | ||
| 390 | adc \$0,%rdx | ||
| 391 | lea 4($j),$j # j++ | ||
| 392 | mov %rdx,$A[0] | ||
| 393 | |||
| 394 | mulq $m1 # np[j]*m1 | ||
| 395 | add %rax,$N[1] | ||
| 396 | mov -16($ap,$j,8),%rax | ||
| 397 | adc \$0,%rdx | ||
| 398 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
| 399 | adc \$0,%rdx | ||
| 400 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
| 401 | mov %rdx,$N[0] | ||
| 402 | cmp $num,$j | ||
| 403 | jl .L1st4x | ||
| 404 | |||
| 405 | mulq $m0 # ap[j]*bp[0] | ||
| 406 | add %rax,$A[0] | ||
| 407 | mov -16($np,$j,8),%rax | ||
| 408 | adc \$0,%rdx | ||
| 409 | mov %rdx,$A[1] | ||
| 410 | |||
| 411 | mulq $m1 # np[j]*m1 | ||
| 412 | add %rax,$N[0] | ||
| 413 | mov -8($ap,$j,8),%rax | ||
| 414 | adc \$0,%rdx | ||
| 415 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 416 | adc \$0,%rdx | ||
| 417 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 418 | mov %rdx,$N[1] | ||
| 419 | |||
| 420 | mulq $m0 # ap[j]*bp[0] | ||
| 421 | add %rax,$A[1] | ||
| 422 | mov -8($np,$j,8),%rax | ||
| 423 | adc \$0,%rdx | ||
| 424 | mov %rdx,$A[0] | ||
| 425 | |||
| 426 | mulq $m1 # np[j]*m1 | ||
| 427 | add %rax,$N[1] | ||
| 428 | mov ($ap),%rax # ap[0] | ||
| 429 | adc \$0,%rdx | ||
| 430 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
| 431 | adc \$0,%rdx | ||
| 432 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 433 | mov %rdx,$N[0] | ||
| 434 | |||
| 435 | xor $N[1],$N[1] | ||
| 436 | add $A[0],$N[0] | ||
| 437 | adc \$0,$N[1] | ||
| 438 | mov $N[0],-8(%rsp,$j,8) | ||
| 439 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
| 440 | |||
| 441 | lea 1($i),$i # i++ | ||
| 442 | .align 4 | ||
| 443 | .Louter4x: | ||
| 444 | mov ($bp,$i,8),$m0 # m0=bp[i] | ||
| 445 | xor $j,$j # j=0 | ||
| 446 | mov (%rsp),$A[0] | ||
| 447 | mov $n0,$m1 | ||
| 448 | mulq $m0 # ap[0]*bp[i] | ||
| 449 | add %rax,$A[0] # ap[0]*bp[i]+tp[0] | ||
| 450 | mov ($np),%rax | ||
| 451 | adc \$0,%rdx | ||
| 452 | |||
| 453 | imulq $A[0],$m1 # tp[0]*n0 | ||
| 454 | mov %rdx,$A[1] | ||
| 455 | |||
| 456 | mulq $m1 # np[0]*m1 | ||
| 457 | add %rax,$A[0] # "$N[0]", discarded | ||
| 458 | mov 8($ap),%rax | ||
| 459 | adc \$0,%rdx | ||
| 460 | mov %rdx,$N[1] | ||
| 461 | |||
| 462 | mulq $m0 # ap[j]*bp[i] | ||
| 463 | add %rax,$A[1] | ||
| 464 | mov 8($np),%rax | ||
| 465 | adc \$0,%rdx | ||
| 466 | add 8(%rsp),$A[1] # +tp[1] | ||
| 467 | adc \$0,%rdx | ||
| 468 | mov %rdx,$A[0] | ||
| 469 | |||
| 470 | mulq $m1 # np[j]*m1 | ||
| 471 | add %rax,$N[1] | ||
| 472 | mov 16($ap),%rax | ||
| 473 | adc \$0,%rdx | ||
| 474 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 475 | lea 4($j),$j # j+=2 | ||
| 476 | adc \$0,%rdx | ||
| 477 | mov $N[1],(%rsp) # tp[j-1] | ||
| 478 | mov %rdx,$N[0] | ||
| 479 | jmp .Linner4x | ||
| 480 | .align 16 | ||
| 481 | .Linner4x: | ||
| 482 | mulq $m0 # ap[j]*bp[i] | ||
| 483 | add %rax,$A[0] | ||
| 484 | mov -16($np,$j,8),%rax | ||
| 485 | adc \$0,%rdx | ||
| 486 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 487 | adc \$0,%rdx | ||
| 488 | mov %rdx,$A[1] | ||
| 489 | |||
| 490 | mulq $m1 # np[j]*m1 | ||
| 491 | add %rax,$N[0] | ||
| 492 | mov -8($ap,$j,8),%rax | ||
| 493 | adc \$0,%rdx | ||
| 494 | add $A[0],$N[0] | ||
| 495 | adc \$0,%rdx | ||
| 496 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 497 | mov %rdx,$N[1] | ||
| 498 | |||
| 499 | mulq $m0 # ap[j]*bp[i] | ||
| 500 | add %rax,$A[1] | ||
| 501 | mov -8($np,$j,8),%rax | ||
| 502 | adc \$0,%rdx | ||
| 503 | add -8(%rsp,$j,8),$A[1] | ||
| 504 | adc \$0,%rdx | ||
| 505 | mov %rdx,$A[0] | ||
| 506 | |||
| 507 | mulq $m1 # np[j]*m1 | ||
| 508 | add %rax,$N[1] | ||
| 509 | mov ($ap,$j,8),%rax | ||
| 510 | adc \$0,%rdx | ||
| 511 | add $A[1],$N[1] | ||
| 512 | adc \$0,%rdx | ||
| 513 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 514 | mov %rdx,$N[0] | ||
| 515 | |||
| 516 | mulq $m0 # ap[j]*bp[i] | ||
| 517 | add %rax,$A[0] | ||
| 518 | mov ($np,$j,8),%rax | ||
| 519 | adc \$0,%rdx | ||
| 520 | add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 521 | adc \$0,%rdx | ||
| 522 | mov %rdx,$A[1] | ||
| 523 | |||
| 524 | mulq $m1 # np[j]*m1 | ||
| 525 | add %rax,$N[0] | ||
| 526 | mov 8($ap,$j,8),%rax | ||
| 527 | adc \$0,%rdx | ||
| 528 | add $A[0],$N[0] | ||
| 529 | adc \$0,%rdx | ||
| 530 | mov $N[0],-8(%rsp,$j,8) # tp[j-1] | ||
| 531 | mov %rdx,$N[1] | ||
| 532 | |||
| 533 | mulq $m0 # ap[j]*bp[i] | ||
| 534 | add %rax,$A[1] | ||
| 535 | mov 8($np,$j,8),%rax | ||
| 536 | adc \$0,%rdx | ||
| 537 | add 8(%rsp,$j,8),$A[1] | ||
| 538 | adc \$0,%rdx | ||
| 539 | lea 4($j),$j # j++ | ||
| 540 | mov %rdx,$A[0] | ||
| 541 | |||
| 542 | mulq $m1 # np[j]*m1 | ||
| 543 | add %rax,$N[1] | ||
| 544 | mov -16($ap,$j,8),%rax | ||
| 545 | adc \$0,%rdx | ||
| 546 | add $A[1],$N[1] | ||
| 547 | adc \$0,%rdx | ||
| 548 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
| 549 | mov %rdx,$N[0] | ||
| 550 | cmp $num,$j | ||
| 551 | jl .Linner4x | ||
| 552 | |||
| 553 | mulq $m0 # ap[j]*bp[i] | ||
| 554 | add %rax,$A[0] | ||
| 555 | mov -16($np,$j,8),%rax | ||
| 556 | adc \$0,%rdx | ||
| 557 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 558 | adc \$0,%rdx | ||
| 559 | mov %rdx,$A[1] | ||
| 560 | |||
| 561 | mulq $m1 # np[j]*m1 | ||
| 562 | add %rax,$N[0] | ||
| 563 | mov -8($ap,$j,8),%rax | ||
| 564 | adc \$0,%rdx | ||
| 565 | add $A[0],$N[0] | ||
| 566 | adc \$0,%rdx | ||
| 567 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 568 | mov %rdx,$N[1] | ||
| 569 | |||
| 570 | mulq $m0 # ap[j]*bp[i] | ||
| 571 | add %rax,$A[1] | ||
| 572 | mov -8($np,$j,8),%rax | ||
| 573 | adc \$0,%rdx | ||
| 574 | add -8(%rsp,$j,8),$A[1] | ||
| 575 | adc \$0,%rdx | ||
| 576 | lea 1($i),$i # i++ | ||
| 577 | mov %rdx,$A[0] | ||
| 578 | |||
| 579 | mulq $m1 # np[j]*m1 | ||
| 580 | add %rax,$N[1] | ||
| 581 | mov ($ap),%rax # ap[0] | ||
| 582 | adc \$0,%rdx | ||
| 583 | add $A[1],$N[1] | ||
| 584 | adc \$0,%rdx | ||
| 585 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 586 | mov %rdx,$N[0] | ||
| 587 | |||
| 588 | xor $N[1],$N[1] | ||
| 589 | add $A[0],$N[0] | ||
| 590 | adc \$0,$N[1] | ||
| 591 | add (%rsp,$num,8),$N[0] # pull upmost overflow bit | ||
| 592 | adc \$0,$N[1] | ||
| 593 | mov $N[0],-8(%rsp,$j,8) | ||
| 594 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
| 595 | |||
| 596 | cmp $num,$i | ||
| 597 | jl .Louter4x | ||
| 598 | ___ | ||
| 599 | { | ||
| 600 | my @ri=("%rax","%rdx",$m0,$m1); | ||
| 601 | $code.=<<___; | ||
| 602 | mov 16(%rsp,$num,8),$rp # restore $rp | ||
| 603 | mov 0(%rsp),@ri[0] # tp[0] | ||
| 604 | pxor %xmm0,%xmm0 | ||
| 605 | mov 8(%rsp),@ri[1] # tp[1] | ||
| 606 | shr \$2,$num # num/=4 | ||
| 607 | lea (%rsp),$ap # borrow ap for tp | ||
| 608 | xor $i,$i # i=0 and clear CF! | ||
| 609 | |||
| 610 | sub 0($np),@ri[0] | ||
| 611 | mov 16($ap),@ri[2] # tp[2] | ||
| 612 | mov 24($ap),@ri[3] # tp[3] | ||
| 613 | sbb 8($np),@ri[1] | ||
| 614 | lea -1($num),$j # j=num/4-1 | ||
| 615 | jmp .Lsub4x | ||
| 616 | .align 16 | ||
| 617 | .Lsub4x: | ||
| 618 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 619 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 620 | sbb 16($np,$i,8),@ri[2] | ||
| 621 | mov 32($ap,$i,8),@ri[0] # tp[i+1] | ||
| 622 | mov 40($ap,$i,8),@ri[1] | ||
| 623 | sbb 24($np,$i,8),@ri[3] | ||
| 624 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 625 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 626 | sbb 32($np,$i,8),@ri[0] | ||
| 627 | mov 48($ap,$i,8),@ri[2] | ||
| 628 | mov 56($ap,$i,8),@ri[3] | ||
| 629 | sbb 40($np,$i,8),@ri[1] | ||
| 630 | lea 4($i),$i # i++ | ||
| 631 | dec $j # doesnn't affect CF! | ||
| 632 | jnz .Lsub4x | ||
| 633 | |||
| 634 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 635 | mov 32($ap,$i,8),@ri[0] # load overflow bit | ||
| 636 | sbb 16($np,$i,8),@ri[2] | ||
| 637 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 638 | sbb 24($np,$i,8),@ri[3] | ||
| 639 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 640 | |||
| 641 | sbb \$0,@ri[0] # handle upmost overflow bit | ||
| 642 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 643 | xor $i,$i # i=0 | ||
| 644 | and @ri[0],$ap | ||
| 645 | not @ri[0] | ||
| 646 | mov $rp,$np | ||
| 647 | and @ri[0],$np | ||
| 648 | lea -1($num),$j | ||
| 649 | or $np,$ap # ap=borrow?tp:rp | ||
| 650 | |||
| 651 | movdqu ($ap),%xmm1 | ||
| 652 | movdqa %xmm0,(%rsp) | ||
| 653 | movdqu %xmm1,($rp) | ||
| 654 | jmp .Lcopy4x | ||
| 655 | .align 16 | ||
| 656 | .Lcopy4x: # copy or in-place refresh | ||
| 657 | movdqu 16($ap,$i),%xmm2 | ||
| 658 | movdqu 32($ap,$i),%xmm1 | ||
| 659 | movdqa %xmm0,16(%rsp,$i) | ||
| 660 | movdqu %xmm2,16($rp,$i) | ||
| 661 | movdqa %xmm0,32(%rsp,$i) | ||
| 662 | movdqu %xmm1,32($rp,$i) | ||
| 663 | lea 32($i),$i | ||
| 202 | dec $j | 664 | dec $j |
| 203 | jge .Lcopy | 665 | jnz .Lcopy4x |
| 204 | 666 | ||
| 667 | shl \$2,$num | ||
| 668 | movdqu 16($ap,$i),%xmm2 | ||
| 669 | movdqa %xmm0,16(%rsp,$i) | ||
| 670 | movdqu %xmm2,16($rp,$i) | ||
| 671 | ___ | ||
| 672 | } | ||
| 673 | $code.=<<___; | ||
| 205 | mov 8(%rsp,$num,8),%rsi # restore %rsp | 674 | mov 8(%rsp,$num,8),%rsi # restore %rsp |
| 206 | mov \$1,%rax | 675 | mov \$1,%rax |
| 207 | mov (%rsi),%r15 | 676 | mov (%rsi),%r15 |
| @@ -211,9 +680,823 @@ bn_mul_mont: | |||
| 211 | mov 32(%rsi),%rbp | 680 | mov 32(%rsi),%rbp |
| 212 | mov 40(%rsi),%rbx | 681 | mov 40(%rsi),%rbx |
| 213 | lea 48(%rsi),%rsp | 682 | lea 48(%rsi),%rsp |
| 214 | .Lepilogue: | 683 | .Lmul4x_epilogue: |
| 215 | ret | 684 | ret |
| 216 | .size bn_mul_mont,.-bn_mul_mont | 685 | .size bn_mul4x_mont,.-bn_mul4x_mont |
| 686 | ___ | ||
| 687 | }}} | ||
| 688 | {{{ | ||
| 689 | ###################################################################### | ||
| 690 | # void bn_sqr4x_mont( | ||
| 691 | my $rptr="%rdi"; # const BN_ULONG *rptr, | ||
| 692 | my $aptr="%rsi"; # const BN_ULONG *aptr, | ||
| 693 | my $bptr="%rdx"; # not used | ||
| 694 | my $nptr="%rcx"; # const BN_ULONG *nptr, | ||
| 695 | my $n0 ="%r8"; # const BN_ULONG *n0); | ||
| 696 | my $num ="%r9"; # int num, has to be divisible by 4 and | ||
| 697 | # not less than 8 | ||
| 698 | |||
| 699 | my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); | ||
| 700 | my @A0=("%r10","%r11"); | ||
| 701 | my @A1=("%r12","%r13"); | ||
| 702 | my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); | ||
| 703 | |||
| 704 | $code.=<<___; | ||
| 705 | .type bn_sqr4x_mont,\@function,6 | ||
| 706 | .align 16 | ||
| 707 | bn_sqr4x_mont: | ||
| 708 | .Lsqr4x_enter: | ||
| 709 | push %rbx | ||
| 710 | push %rbp | ||
| 711 | push %r12 | ||
| 712 | push %r13 | ||
| 713 | push %r14 | ||
| 714 | push %r15 | ||
| 715 | |||
| 716 | shl \$3,${num}d # convert $num to bytes | ||
| 717 | xor %r10,%r10 | ||
| 718 | mov %rsp,%r11 # put aside %rsp | ||
| 719 | sub $num,%r10 # -$num | ||
| 720 | mov ($n0),$n0 # *n0 | ||
| 721 | lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num) | ||
| 722 | and \$-1024,%rsp # minimize TLB usage | ||
| 723 | ############################################################## | ||
| 724 | # Stack layout | ||
| 725 | # | ||
| 726 | # +0 saved $num, used in reduction section | ||
| 727 | # +8 &t[2*$num], used in reduction section | ||
| 728 | # +32 saved $rptr | ||
| 729 | # +40 saved $nptr | ||
| 730 | # +48 saved *n0 | ||
| 731 | # +56 saved %rsp | ||
| 732 | # +64 t[2*$num] | ||
| 733 | # | ||
| 734 | mov $rptr,32(%rsp) # save $rptr | ||
| 735 | mov $nptr,40(%rsp) | ||
| 736 | mov $n0, 48(%rsp) | ||
| 737 | mov %r11, 56(%rsp) # save original %rsp | ||
| 738 | .Lsqr4x_body: | ||
| 739 | ############################################################## | ||
| 740 | # Squaring part: | ||
| 741 | # | ||
| 742 | # a) multiply-n-add everything but a[i]*a[i]; | ||
| 743 | # b) shift result of a) by 1 to the left and accumulate | ||
| 744 | # a[i]*a[i] products; | ||
| 745 | # | ||
| 746 | lea 32(%r10),$i # $i=-($num-32) | ||
| 747 | lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] | ||
| 748 | |||
| 749 | mov $num,$j # $j=$num | ||
| 750 | |||
| 751 | # comments apply to $num==8 case | ||
| 752 | mov -32($aptr,$i),$a0 # a[0] | ||
| 753 | lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | ||
| 754 | mov -24($aptr,$i),%rax # a[1] | ||
| 755 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | ||
| 756 | mov -16($aptr,$i),$ai # a[2] | ||
| 757 | mov %rax,$a1 | ||
| 758 | |||
| 759 | mul $a0 # a[1]*a[0] | ||
| 760 | mov %rax,$A0[0] # a[1]*a[0] | ||
| 761 | mov $ai,%rax # a[2] | ||
| 762 | mov %rdx,$A0[1] | ||
| 763 | mov $A0[0],-24($tptr,$i) # t[1] | ||
| 764 | |||
| 765 | xor $A0[0],$A0[0] | ||
| 766 | mul $a0 # a[2]*a[0] | ||
| 767 | add %rax,$A0[1] | ||
| 768 | mov $ai,%rax | ||
| 769 | adc %rdx,$A0[0] | ||
| 770 | mov $A0[1],-16($tptr,$i) # t[2] | ||
| 771 | |||
| 772 | lea -16($i),$j # j=-16 | ||
| 773 | |||
| 774 | |||
| 775 | mov 8($aptr,$j),$ai # a[3] | ||
| 776 | mul $a1 # a[2]*a[1] | ||
| 777 | mov %rax,$A1[0] # a[2]*a[1]+t[3] | ||
| 778 | mov $ai,%rax | ||
| 779 | mov %rdx,$A1[1] | ||
| 780 | |||
| 781 | xor $A0[1],$A0[1] | ||
| 782 | add $A1[0],$A0[0] | ||
| 783 | lea 16($j),$j | ||
| 784 | adc \$0,$A0[1] | ||
| 785 | mul $a0 # a[3]*a[0] | ||
| 786 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | ||
| 787 | mov $ai,%rax | ||
| 788 | adc %rdx,$A0[1] | ||
| 789 | mov $A0[0],-8($tptr,$j) # t[3] | ||
| 790 | jmp .Lsqr4x_1st | ||
| 791 | |||
| 792 | .align 16 | ||
| 793 | .Lsqr4x_1st: | ||
| 794 | mov ($aptr,$j),$ai # a[4] | ||
| 795 | xor $A1[0],$A1[0] | ||
| 796 | mul $a1 # a[3]*a[1] | ||
| 797 | add %rax,$A1[1] # a[3]*a[1]+t[4] | ||
| 798 | mov $ai,%rax | ||
| 799 | adc %rdx,$A1[0] | ||
| 800 | |||
| 801 | xor $A0[0],$A0[0] | ||
| 802 | add $A1[1],$A0[1] | ||
| 803 | adc \$0,$A0[0] | ||
| 804 | mul $a0 # a[4]*a[0] | ||
| 805 | add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] | ||
| 806 | mov $ai,%rax # a[3] | ||
| 807 | adc %rdx,$A0[0] | ||
| 808 | mov $A0[1],($tptr,$j) # t[4] | ||
| 809 | |||
| 810 | |||
| 811 | mov 8($aptr,$j),$ai # a[5] | ||
| 812 | xor $A1[1],$A1[1] | ||
| 813 | mul $a1 # a[4]*a[3] | ||
| 814 | add %rax,$A1[0] # a[4]*a[3]+t[5] | ||
| 815 | mov $ai,%rax | ||
| 816 | adc %rdx,$A1[1] | ||
| 817 | |||
| 818 | xor $A0[1],$A0[1] | ||
| 819 | add $A1[0],$A0[0] | ||
| 820 | adc \$0,$A0[1] | ||
| 821 | mul $a0 # a[5]*a[2] | ||
| 822 | add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] | ||
| 823 | mov $ai,%rax | ||
| 824 | adc %rdx,$A0[1] | ||
| 825 | mov $A0[0],8($tptr,$j) # t[5] | ||
| 826 | |||
| 827 | mov 16($aptr,$j),$ai # a[6] | ||
| 828 | xor $A1[0],$A1[0] | ||
| 829 | mul $a1 # a[5]*a[3] | ||
| 830 | add %rax,$A1[1] # a[5]*a[3]+t[6] | ||
| 831 | mov $ai,%rax | ||
| 832 | adc %rdx,$A1[0] | ||
| 833 | |||
| 834 | xor $A0[0],$A0[0] | ||
| 835 | add $A1[1],$A0[1] | ||
| 836 | adc \$0,$A0[0] | ||
| 837 | mul $a0 # a[6]*a[2] | ||
| 838 | add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] | ||
| 839 | mov $ai,%rax # a[3] | ||
| 840 | adc %rdx,$A0[0] | ||
| 841 | mov $A0[1],16($tptr,$j) # t[6] | ||
| 842 | |||
| 843 | |||
| 844 | mov 24($aptr,$j),$ai # a[7] | ||
| 845 | xor $A1[1],$A1[1] | ||
| 846 | mul $a1 # a[6]*a[5] | ||
| 847 | add %rax,$A1[0] # a[6]*a[5]+t[7] | ||
| 848 | mov $ai,%rax | ||
| 849 | adc %rdx,$A1[1] | ||
| 850 | |||
| 851 | xor $A0[1],$A0[1] | ||
| 852 | add $A1[0],$A0[0] | ||
| 853 | lea 32($j),$j | ||
| 854 | adc \$0,$A0[1] | ||
| 855 | mul $a0 # a[7]*a[4] | ||
| 856 | add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] | ||
| 857 | mov $ai,%rax | ||
| 858 | adc %rdx,$A0[1] | ||
| 859 | mov $A0[0],-8($tptr,$j) # t[7] | ||
| 860 | |||
| 861 | cmp \$0,$j | ||
| 862 | jne .Lsqr4x_1st | ||
| 863 | |||
| 864 | xor $A1[0],$A1[0] | ||
| 865 | add $A0[1],$A1[1] | ||
| 866 | adc \$0,$A1[0] | ||
| 867 | mul $a1 # a[7]*a[5] | ||
| 868 | add %rax,$A1[1] | ||
| 869 | adc %rdx,$A1[0] | ||
| 870 | |||
| 871 | mov $A1[1],($tptr) # t[8] | ||
| 872 | lea 16($i),$i | ||
| 873 | mov $A1[0],8($tptr) # t[9] | ||
| 874 | jmp .Lsqr4x_outer | ||
| 875 | |||
| 876 | .align 16 | ||
| 877 | .Lsqr4x_outer: # comments apply to $num==6 case | ||
| 878 | mov -32($aptr,$i),$a0 # a[0] | ||
| 879 | lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | ||
| 880 | mov -24($aptr,$i),%rax # a[1] | ||
| 881 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | ||
| 882 | mov -16($aptr,$i),$ai # a[2] | ||
| 883 | mov %rax,$a1 | ||
| 884 | |||
| 885 | mov -24($tptr,$i),$A0[0] # t[1] | ||
| 886 | xor $A0[1],$A0[1] | ||
| 887 | mul $a0 # a[1]*a[0] | ||
| 888 | add %rax,$A0[0] # a[1]*a[0]+t[1] | ||
| 889 | mov $ai,%rax # a[2] | ||
| 890 | adc %rdx,$A0[1] | ||
| 891 | mov $A0[0],-24($tptr,$i) # t[1] | ||
| 892 | |||
| 893 | xor $A0[0],$A0[0] | ||
| 894 | add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] | ||
| 895 | adc \$0,$A0[0] | ||
| 896 | mul $a0 # a[2]*a[0] | ||
| 897 | add %rax,$A0[1] | ||
| 898 | mov $ai,%rax | ||
| 899 | adc %rdx,$A0[0] | ||
| 900 | mov $A0[1],-16($tptr,$i) # t[2] | ||
| 901 | |||
| 902 | lea -16($i),$j # j=-16 | ||
| 903 | xor $A1[0],$A1[0] | ||
| 904 | |||
| 905 | |||
| 906 | mov 8($aptr,$j),$ai # a[3] | ||
| 907 | xor $A1[1],$A1[1] | ||
| 908 | add 8($tptr,$j),$A1[0] | ||
| 909 | adc \$0,$A1[1] | ||
| 910 | mul $a1 # a[2]*a[1] | ||
| 911 | add %rax,$A1[0] # a[2]*a[1]+t[3] | ||
| 912 | mov $ai,%rax | ||
| 913 | adc %rdx,$A1[1] | ||
| 914 | |||
| 915 | xor $A0[1],$A0[1] | ||
| 916 | add $A1[0],$A0[0] | ||
| 917 | adc \$0,$A0[1] | ||
| 918 | mul $a0 # a[3]*a[0] | ||
| 919 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | ||
| 920 | mov $ai,%rax | ||
| 921 | adc %rdx,$A0[1] | ||
| 922 | mov $A0[0],8($tptr,$j) # t[3] | ||
| 923 | |||
| 924 | lea 16($j),$j | ||
| 925 | jmp .Lsqr4x_inner | ||
| 926 | |||
| 927 | .align 16 | ||
| 928 | .Lsqr4x_inner: | ||
| 929 | mov ($aptr,$j),$ai # a[4] | ||
| 930 | xor $A1[0],$A1[0] | ||
| 931 | add ($tptr,$j),$A1[1] | ||
| 932 | adc \$0,$A1[0] | ||
| 933 | mul $a1 # a[3]*a[1] | ||
| 934 | add %rax,$A1[1] # a[3]*a[1]+t[4] | ||
| 935 | mov $ai,%rax | ||
| 936 | adc %rdx,$A1[0] | ||
| 937 | |||
| 938 | xor $A0[0],$A0[0] | ||
| 939 | add $A1[1],$A0[1] | ||
| 940 | adc \$0,$A0[0] | ||
| 941 | mul $a0 # a[4]*a[0] | ||
| 942 | add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] | ||
| 943 | mov $ai,%rax # a[3] | ||
| 944 | adc %rdx,$A0[0] | ||
| 945 | mov $A0[1],($tptr,$j) # t[4] | ||
| 946 | |||
| 947 | mov 8($aptr,$j),$ai # a[5] | ||
| 948 | xor $A1[1],$A1[1] | ||
| 949 | add 8($tptr,$j),$A1[0] | ||
| 950 | adc \$0,$A1[1] | ||
| 951 | mul $a1 # a[4]*a[3] | ||
| 952 | add %rax,$A1[0] # a[4]*a[3]+t[5] | ||
| 953 | mov $ai,%rax | ||
| 954 | adc %rdx,$A1[1] | ||
| 955 | |||
| 956 | xor $A0[1],$A0[1] | ||
| 957 | add $A1[0],$A0[0] | ||
| 958 | lea 16($j),$j # j++ | ||
| 959 | adc \$0,$A0[1] | ||
| 960 | mul $a0 # a[5]*a[2] | ||
| 961 | add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] | ||
| 962 | mov $ai,%rax | ||
| 963 | adc %rdx,$A0[1] | ||
| 964 | mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below | ||
| 965 | |||
| 966 | cmp \$0,$j | ||
| 967 | jne .Lsqr4x_inner | ||
| 968 | |||
| 969 | xor $A1[0],$A1[0] | ||
| 970 | add $A0[1],$A1[1] | ||
| 971 | adc \$0,$A1[0] | ||
| 972 | mul $a1 # a[5]*a[3] | ||
| 973 | add %rax,$A1[1] | ||
| 974 | adc %rdx,$A1[0] | ||
| 975 | |||
| 976 | mov $A1[1],($tptr) # t[6], "preloaded t[2]" below | ||
| 977 | mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below | ||
| 978 | |||
| 979 | add \$16,$i | ||
| 980 | jnz .Lsqr4x_outer | ||
| 981 | |||
| 982 | # comments apply to $num==4 case | ||
| 983 | mov -32($aptr),$a0 # a[0] | ||
| 984 | lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | ||
| 985 | mov -24($aptr),%rax # a[1] | ||
| 986 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | ||
| 987 | mov -16($aptr),$ai # a[2] | ||
| 988 | mov %rax,$a1 | ||
| 989 | |||
| 990 | xor $A0[1],$A0[1] | ||
| 991 | mul $a0 # a[1]*a[0] | ||
| 992 | add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] | ||
| 993 | mov $ai,%rax # a[2] | ||
| 994 | adc %rdx,$A0[1] | ||
| 995 | mov $A0[0],-24($tptr) # t[1] | ||
| 996 | |||
| 997 | xor $A0[0],$A0[0] | ||
| 998 | add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] | ||
| 999 | adc \$0,$A0[0] | ||
| 1000 | mul $a0 # a[2]*a[0] | ||
| 1001 | add %rax,$A0[1] | ||
| 1002 | mov $ai,%rax | ||
| 1003 | adc %rdx,$A0[0] | ||
| 1004 | mov $A0[1],-16($tptr) # t[2] | ||
| 1005 | |||
| 1006 | mov -8($aptr),$ai # a[3] | ||
| 1007 | mul $a1 # a[2]*a[1] | ||
| 1008 | add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] | ||
| 1009 | mov $ai,%rax | ||
| 1010 | adc \$0,%rdx | ||
| 1011 | |||
| 1012 | xor $A0[1],$A0[1] | ||
| 1013 | add $A1[0],$A0[0] | ||
| 1014 | mov %rdx,$A1[1] | ||
| 1015 | adc \$0,$A0[1] | ||
| 1016 | mul $a0 # a[3]*a[0] | ||
| 1017 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | ||
| 1018 | mov $ai,%rax | ||
| 1019 | adc %rdx,$A0[1] | ||
| 1020 | mov $A0[0],-8($tptr) # t[3] | ||
| 1021 | |||
| 1022 | xor $A1[0],$A1[0] | ||
| 1023 | add $A0[1],$A1[1] | ||
| 1024 | adc \$0,$A1[0] | ||
| 1025 | mul $a1 # a[3]*a[1] | ||
| 1026 | add %rax,$A1[1] | ||
| 1027 | mov -16($aptr),%rax # a[2] | ||
| 1028 | adc %rdx,$A1[0] | ||
| 1029 | |||
| 1030 | mov $A1[1],($tptr) # t[4] | ||
| 1031 | mov $A1[0],8($tptr) # t[5] | ||
| 1032 | |||
| 1033 | mul $ai # a[2]*a[3] | ||
| 1034 | ___ | ||
| 1035 | { | ||
| 1036 | my ($shift,$carry)=($a0,$a1); | ||
| 1037 | my @S=(@A1,$ai,$n0); | ||
| 1038 | $code.=<<___; | ||
| 1039 | add \$16,$i | ||
| 1040 | xor $shift,$shift | ||
| 1041 | sub $num,$i # $i=16-$num | ||
| 1042 | xor $carry,$carry | ||
| 1043 | |||
| 1044 | add $A1[0],%rax # t[5] | ||
| 1045 | adc \$0,%rdx | ||
| 1046 | mov %rax,8($tptr) # t[5] | ||
| 1047 | mov %rdx,16($tptr) # t[6] | ||
| 1048 | mov $carry,24($tptr) # t[7] | ||
| 1049 | |||
| 1050 | mov -16($aptr,$i),%rax # a[0] | ||
| 1051 | lea 64(%rsp,$num,2),$tptr | ||
| 1052 | xor $A0[0],$A0[0] # t[0] | ||
| 1053 | mov -24($tptr,$i,2),$A0[1] # t[1] | ||
| 1054 | |||
| 1055 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
| 1056 | shr \$63,$A0[0] | ||
| 1057 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
| 1058 | shr \$63,$A0[1] | ||
| 1059 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
| 1060 | mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1061 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1062 | mul %rax # a[i]*a[i] | ||
| 1063 | neg $carry # mov $carry,cf | ||
| 1064 | mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1065 | adc %rax,$S[0] | ||
| 1066 | mov -8($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1067 | mov $S[0],-32($tptr,$i,2) | ||
| 1068 | adc %rdx,$S[1] | ||
| 1069 | |||
| 1070 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | ||
| 1071 | mov $S[1],-24($tptr,$i,2) | ||
| 1072 | sbb $carry,$carry # mov cf,$carry | ||
| 1073 | shr \$63,$A0[0] | ||
| 1074 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
| 1075 | shr \$63,$A0[1] | ||
| 1076 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
| 1077 | mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1078 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1079 | mul %rax # a[i]*a[i] | ||
| 1080 | neg $carry # mov $carry,cf | ||
| 1081 | mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1082 | adc %rax,$S[2] | ||
| 1083 | mov 0($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1084 | mov $S[2],-16($tptr,$i,2) | ||
| 1085 | adc %rdx,$S[3] | ||
| 1086 | lea 16($i),$i | ||
| 1087 | mov $S[3],-40($tptr,$i,2) | ||
| 1088 | sbb $carry,$carry # mov cf,$carry | ||
| 1089 | jmp .Lsqr4x_shift_n_add | ||
| 1090 | |||
| 1091 | .align 16 | ||
| 1092 | .Lsqr4x_shift_n_add: | ||
| 1093 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
| 1094 | shr \$63,$A0[0] | ||
| 1095 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
| 1096 | shr \$63,$A0[1] | ||
| 1097 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
| 1098 | mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1099 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1100 | mul %rax # a[i]*a[i] | ||
| 1101 | neg $carry # mov $carry,cf | ||
| 1102 | mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1103 | adc %rax,$S[0] | ||
| 1104 | mov -8($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1105 | mov $S[0],-32($tptr,$i,2) | ||
| 1106 | adc %rdx,$S[1] | ||
| 1107 | |||
| 1108 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | ||
| 1109 | mov $S[1],-24($tptr,$i,2) | ||
| 1110 | sbb $carry,$carry # mov cf,$carry | ||
| 1111 | shr \$63,$A0[0] | ||
| 1112 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
| 1113 | shr \$63,$A0[1] | ||
| 1114 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
| 1115 | mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1116 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1117 | mul %rax # a[i]*a[i] | ||
| 1118 | neg $carry # mov $carry,cf | ||
| 1119 | mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1120 | adc %rax,$S[2] | ||
| 1121 | mov 0($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1122 | mov $S[2],-16($tptr,$i,2) | ||
| 1123 | adc %rdx,$S[3] | ||
| 1124 | |||
| 1125 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
| 1126 | mov $S[3],-8($tptr,$i,2) | ||
| 1127 | sbb $carry,$carry # mov cf,$carry | ||
| 1128 | shr \$63,$A0[0] | ||
| 1129 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
| 1130 | shr \$63,$A0[1] | ||
| 1131 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
| 1132 | mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1133 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1134 | mul %rax # a[i]*a[i] | ||
| 1135 | neg $carry # mov $carry,cf | ||
| 1136 | mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1137 | adc %rax,$S[0] | ||
| 1138 | mov 8($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1139 | mov $S[0],0($tptr,$i,2) | ||
| 1140 | adc %rdx,$S[1] | ||
| 1141 | |||
| 1142 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | ||
| 1143 | mov $S[1],8($tptr,$i,2) | ||
| 1144 | sbb $carry,$carry # mov cf,$carry | ||
| 1145 | shr \$63,$A0[0] | ||
| 1146 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
| 1147 | shr \$63,$A0[1] | ||
| 1148 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
| 1149 | mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
| 1150 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1151 | mul %rax # a[i]*a[i] | ||
| 1152 | neg $carry # mov $carry,cf | ||
| 1153 | mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1154 | adc %rax,$S[2] | ||
| 1155 | mov 16($aptr,$i),%rax # a[i+1] # prefetch | ||
| 1156 | mov $S[2],16($tptr,$i,2) | ||
| 1157 | adc %rdx,$S[3] | ||
| 1158 | mov $S[3],24($tptr,$i,2) | ||
| 1159 | sbb $carry,$carry # mov cf,$carry | ||
| 1160 | add \$32,$i | ||
| 1161 | jnz .Lsqr4x_shift_n_add | ||
| 1162 | |||
| 1163 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
| 1164 | shr \$63,$A0[0] | ||
| 1165 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
| 1166 | shr \$63,$A0[1] | ||
| 1167 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
| 1168 | mov -16($tptr),$A0[0] # t[2*i+2] # prefetch | ||
| 1169 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
| 1170 | mul %rax # a[i]*a[i] | ||
| 1171 | neg $carry # mov $carry,cf | ||
| 1172 | mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch | ||
| 1173 | adc %rax,$S[0] | ||
| 1174 | mov -8($aptr),%rax # a[i+1] # prefetch | ||
| 1175 | mov $S[0],-32($tptr) | ||
| 1176 | adc %rdx,$S[1] | ||
| 1177 | |||
| 1178 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift | ||
| 1179 | mov $S[1],-24($tptr) | ||
| 1180 | sbb $carry,$carry # mov cf,$carry | ||
| 1181 | shr \$63,$A0[0] | ||
| 1182 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
| 1183 | shr \$63,$A0[1] | ||
| 1184 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
| 1185 | mul %rax # a[i]*a[i] | ||
| 1186 | neg $carry # mov $carry,cf | ||
| 1187 | adc %rax,$S[2] | ||
| 1188 | adc %rdx,$S[3] | ||
| 1189 | mov $S[2],-16($tptr) | ||
| 1190 | mov $S[3],-8($tptr) | ||
| 1191 | ___ | ||
| 1192 | } | ||
| 1193 | ############################################################## | ||
| 1194 | # Montgomery reduction part, "word-by-word" algorithm. | ||
| 1195 | # | ||
| 1196 | { | ||
| 1197 | my ($topbit,$nptr)=("%rbp",$aptr); | ||
| 1198 | my ($m0,$m1)=($a0,$a1); | ||
| 1199 | my @Ni=("%rbx","%r9"); | ||
| 1200 | $code.=<<___; | ||
| 1201 | mov 40(%rsp),$nptr # restore $nptr | ||
| 1202 | mov 48(%rsp),$n0 # restore *n0 | ||
| 1203 | xor $j,$j | ||
| 1204 | mov $num,0(%rsp) # save $num | ||
| 1205 | sub $num,$j # $j=-$num | ||
| 1206 | mov 64(%rsp),$A0[0] # t[0] # modsched # | ||
| 1207 | mov $n0,$m0 # # modsched # | ||
| 1208 | lea 64(%rsp,$num,2),%rax # end of t[] buffer | ||
| 1209 | lea 64(%rsp,$num),$tptr # end of t[] window | ||
| 1210 | mov %rax,8(%rsp) # save end of t[] buffer | ||
| 1211 | lea ($nptr,$num),$nptr # end of n[] buffer | ||
| 1212 | xor $topbit,$topbit # $topbit=0 | ||
| 1213 | |||
| 1214 | mov 0($nptr,$j),%rax # n[0] # modsched # | ||
| 1215 | mov 8($nptr,$j),$Ni[1] # n[1] # modsched # | ||
| 1216 | imulq $A0[0],$m0 # m0=t[0]*n0 # modsched # | ||
| 1217 | mov %rax,$Ni[0] # # modsched # | ||
| 1218 | jmp .Lsqr4x_mont_outer | ||
| 1219 | |||
| 1220 | .align 16 | ||
| 1221 | .Lsqr4x_mont_outer: | ||
| 1222 | xor $A0[1],$A0[1] | ||
| 1223 | mul $m0 # n[0]*m0 | ||
| 1224 | add %rax,$A0[0] # n[0]*m0+t[0] | ||
| 1225 | mov $Ni[1],%rax | ||
| 1226 | adc %rdx,$A0[1] | ||
| 1227 | mov $n0,$m1 | ||
| 1228 | |||
| 1229 | xor $A0[0],$A0[0] | ||
| 1230 | add 8($tptr,$j),$A0[1] | ||
| 1231 | adc \$0,$A0[0] | ||
| 1232 | mul $m0 # n[1]*m0 | ||
| 1233 | add %rax,$A0[1] # n[1]*m0+t[1] | ||
| 1234 | mov $Ni[0],%rax | ||
| 1235 | adc %rdx,$A0[0] | ||
| 1236 | |||
| 1237 | imulq $A0[1],$m1 | ||
| 1238 | |||
| 1239 | mov 16($nptr,$j),$Ni[0] # n[2] | ||
| 1240 | xor $A1[1],$A1[1] | ||
| 1241 | add $A0[1],$A1[0] | ||
| 1242 | adc \$0,$A1[1] | ||
| 1243 | mul $m1 # n[0]*m1 | ||
| 1244 | add %rax,$A1[0] # n[0]*m1+"t[1]" | ||
| 1245 | mov $Ni[0],%rax | ||
| 1246 | adc %rdx,$A1[1] | ||
| 1247 | mov $A1[0],8($tptr,$j) # "t[1]" | ||
| 1248 | |||
| 1249 | xor $A0[1],$A0[1] | ||
| 1250 | add 16($tptr,$j),$A0[0] | ||
| 1251 | adc \$0,$A0[1] | ||
| 1252 | mul $m0 # n[2]*m0 | ||
| 1253 | add %rax,$A0[0] # n[2]*m0+t[2] | ||
| 1254 | mov $Ni[1],%rax | ||
| 1255 | adc %rdx,$A0[1] | ||
| 1256 | |||
| 1257 | mov 24($nptr,$j),$Ni[1] # n[3] | ||
| 1258 | xor $A1[0],$A1[0] | ||
| 1259 | add $A0[0],$A1[1] | ||
| 1260 | adc \$0,$A1[0] | ||
| 1261 | mul $m1 # n[1]*m1 | ||
| 1262 | add %rax,$A1[1] # n[1]*m1+"t[2]" | ||
| 1263 | mov $Ni[1],%rax | ||
| 1264 | adc %rdx,$A1[0] | ||
| 1265 | mov $A1[1],16($tptr,$j) # "t[2]" | ||
| 1266 | |||
| 1267 | xor $A0[0],$A0[0] | ||
| 1268 | add 24($tptr,$j),$A0[1] | ||
| 1269 | lea 32($j),$j | ||
| 1270 | adc \$0,$A0[0] | ||
| 1271 | mul $m0 # n[3]*m0 | ||
| 1272 | add %rax,$A0[1] # n[3]*m0+t[3] | ||
| 1273 | mov $Ni[0],%rax | ||
| 1274 | adc %rdx,$A0[0] | ||
| 1275 | jmp .Lsqr4x_mont_inner | ||
| 1276 | |||
| 1277 | .align 16 | ||
| 1278 | .Lsqr4x_mont_inner: | ||
| 1279 | mov ($nptr,$j),$Ni[0] # n[4] | ||
| 1280 | xor $A1[1],$A1[1] | ||
| 1281 | add $A0[1],$A1[0] | ||
| 1282 | adc \$0,$A1[1] | ||
| 1283 | mul $m1 # n[2]*m1 | ||
| 1284 | add %rax,$A1[0] # n[2]*m1+"t[3]" | ||
| 1285 | mov $Ni[0],%rax | ||
| 1286 | adc %rdx,$A1[1] | ||
| 1287 | mov $A1[0],-8($tptr,$j) # "t[3]" | ||
| 1288 | |||
| 1289 | xor $A0[1],$A0[1] | ||
| 1290 | add ($tptr,$j),$A0[0] | ||
| 1291 | adc \$0,$A0[1] | ||
| 1292 | mul $m0 # n[4]*m0 | ||
| 1293 | add %rax,$A0[0] # n[4]*m0+t[4] | ||
| 1294 | mov $Ni[1],%rax | ||
| 1295 | adc %rdx,$A0[1] | ||
| 1296 | |||
| 1297 | mov 8($nptr,$j),$Ni[1] # n[5] | ||
| 1298 | xor $A1[0],$A1[0] | ||
| 1299 | add $A0[0],$A1[1] | ||
| 1300 | adc \$0,$A1[0] | ||
| 1301 | mul $m1 # n[3]*m1 | ||
| 1302 | add %rax,$A1[1] # n[3]*m1+"t[4]" | ||
| 1303 | mov $Ni[1],%rax | ||
| 1304 | adc %rdx,$A1[0] | ||
| 1305 | mov $A1[1],($tptr,$j) # "t[4]" | ||
| 1306 | |||
| 1307 | xor $A0[0],$A0[0] | ||
| 1308 | add 8($tptr,$j),$A0[1] | ||
| 1309 | adc \$0,$A0[0] | ||
| 1310 | mul $m0 # n[5]*m0 | ||
| 1311 | add %rax,$A0[1] # n[5]*m0+t[5] | ||
| 1312 | mov $Ni[0],%rax | ||
| 1313 | adc %rdx,$A0[0] | ||
| 1314 | |||
| 1315 | |||
| 1316 | mov 16($nptr,$j),$Ni[0] # n[6] | ||
| 1317 | xor $A1[1],$A1[1] | ||
| 1318 | add $A0[1],$A1[0] | ||
| 1319 | adc \$0,$A1[1] | ||
| 1320 | mul $m1 # n[4]*m1 | ||
| 1321 | add %rax,$A1[0] # n[4]*m1+"t[5]" | ||
| 1322 | mov $Ni[0],%rax | ||
| 1323 | adc %rdx,$A1[1] | ||
| 1324 | mov $A1[0],8($tptr,$j) # "t[5]" | ||
| 1325 | |||
| 1326 | xor $A0[1],$A0[1] | ||
| 1327 | add 16($tptr,$j),$A0[0] | ||
| 1328 | adc \$0,$A0[1] | ||
| 1329 | mul $m0 # n[6]*m0 | ||
| 1330 | add %rax,$A0[0] # n[6]*m0+t[6] | ||
| 1331 | mov $Ni[1],%rax | ||
| 1332 | adc %rdx,$A0[1] | ||
| 1333 | |||
| 1334 | mov 24($nptr,$j),$Ni[1] # n[7] | ||
| 1335 | xor $A1[0],$A1[0] | ||
| 1336 | add $A0[0],$A1[1] | ||
| 1337 | adc \$0,$A1[0] | ||
| 1338 | mul $m1 # n[5]*m1 | ||
| 1339 | add %rax,$A1[1] # n[5]*m1+"t[6]" | ||
| 1340 | mov $Ni[1],%rax | ||
| 1341 | adc %rdx,$A1[0] | ||
| 1342 | mov $A1[1],16($tptr,$j) # "t[6]" | ||
| 1343 | |||
| 1344 | xor $A0[0],$A0[0] | ||
| 1345 | add 24($tptr,$j),$A0[1] | ||
| 1346 | lea 32($j),$j | ||
| 1347 | adc \$0,$A0[0] | ||
| 1348 | mul $m0 # n[7]*m0 | ||
| 1349 | add %rax,$A0[1] # n[7]*m0+t[7] | ||
| 1350 | mov $Ni[0],%rax | ||
| 1351 | adc %rdx,$A0[0] | ||
| 1352 | cmp \$0,$j | ||
| 1353 | jne .Lsqr4x_mont_inner | ||
| 1354 | |||
| 1355 | sub 0(%rsp),$j # $j=-$num # modsched # | ||
| 1356 | mov $n0,$m0 # # modsched # | ||
| 1357 | |||
| 1358 | xor $A1[1],$A1[1] | ||
| 1359 | add $A0[1],$A1[0] | ||
| 1360 | adc \$0,$A1[1] | ||
| 1361 | mul $m1 # n[6]*m1 | ||
| 1362 | add %rax,$A1[0] # n[6]*m1+"t[7]" | ||
| 1363 | mov $Ni[1],%rax | ||
| 1364 | adc %rdx,$A1[1] | ||
| 1365 | mov $A1[0],-8($tptr) # "t[7]" | ||
| 1366 | |||
| 1367 | xor $A0[1],$A0[1] | ||
| 1368 | add ($tptr),$A0[0] # +t[8] | ||
| 1369 | adc \$0,$A0[1] | ||
| 1370 | mov 0($nptr,$j),$Ni[0] # n[0] # modsched # | ||
| 1371 | add $topbit,$A0[0] | ||
| 1372 | adc \$0,$A0[1] | ||
| 1373 | |||
| 1374 | imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched # | ||
| 1375 | xor $A1[0],$A1[0] | ||
| 1376 | mov 8($nptr,$j),$Ni[1] # n[1] # modsched # | ||
| 1377 | add $A0[0],$A1[1] | ||
| 1378 | mov 16($tptr,$j),$A0[0] # t[0] # modsched # | ||
| 1379 | adc \$0,$A1[0] | ||
| 1380 | mul $m1 # n[7]*m1 | ||
| 1381 | add %rax,$A1[1] # n[7]*m1+"t[8]" | ||
| 1382 | mov $Ni[0],%rax # # modsched # | ||
| 1383 | adc %rdx,$A1[0] | ||
| 1384 | mov $A1[1],($tptr) # "t[8]" | ||
| 1385 | |||
| 1386 | xor $topbit,$topbit | ||
| 1387 | add 8($tptr),$A1[0] # +t[9] | ||
| 1388 | adc $topbit,$topbit | ||
| 1389 | add $A0[1],$A1[0] | ||
| 1390 | lea 16($tptr),$tptr # "t[$num]>>128" | ||
| 1391 | adc \$0,$topbit | ||
| 1392 | mov $A1[0],-8($tptr) # "t[9]" | ||
| 1393 | cmp 8(%rsp),$tptr # are we done? | ||
| 1394 | jb .Lsqr4x_mont_outer | ||
| 1395 | |||
| 1396 | mov 0(%rsp),$num # restore $num | ||
| 1397 | mov $topbit,($tptr) # save $topbit | ||
| 1398 | ___ | ||
| 1399 | } | ||
| 1400 | ############################################################## | ||
| 1401 | # Post-condition, 4x unrolled copy from bn_mul_mont | ||
| 1402 | # | ||
| 1403 | { | ||
| 1404 | my ($tptr,$nptr)=("%rbx",$aptr); | ||
| 1405 | my @ri=("%rax","%rdx","%r10","%r11"); | ||
| 1406 | $code.=<<___; | ||
| 1407 | mov 64(%rsp,$num),@ri[0] # tp[0] | ||
| 1408 | lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result | ||
| 1409 | mov 40(%rsp),$nptr # restore $nptr | ||
| 1410 | shr \$5,$num # num/4 | ||
| 1411 | mov 8($tptr),@ri[1] # t[1] | ||
| 1412 | xor $i,$i # i=0 and clear CF! | ||
| 1413 | |||
| 1414 | mov 32(%rsp),$rptr # restore $rptr | ||
| 1415 | sub 0($nptr),@ri[0] | ||
| 1416 | mov 16($tptr),@ri[2] # t[2] | ||
| 1417 | mov 24($tptr),@ri[3] # t[3] | ||
| 1418 | sbb 8($nptr),@ri[1] | ||
| 1419 | lea -1($num),$j # j=num/4-1 | ||
| 1420 | jmp .Lsqr4x_sub | ||
| 1421 | .align 16 | ||
| 1422 | .Lsqr4x_sub: | ||
| 1423 | mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1424 | mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1425 | sbb 16($nptr,$i,8),@ri[2] | ||
| 1426 | mov 32($tptr,$i,8),@ri[0] # tp[i+1] | ||
| 1427 | mov 40($tptr,$i,8),@ri[1] | ||
| 1428 | sbb 24($nptr,$i,8),@ri[3] | ||
| 1429 | mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1430 | mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1431 | sbb 32($nptr,$i,8),@ri[0] | ||
| 1432 | mov 48($tptr,$i,8),@ri[2] | ||
| 1433 | mov 56($tptr,$i,8),@ri[3] | ||
| 1434 | sbb 40($nptr,$i,8),@ri[1] | ||
| 1435 | lea 4($i),$i # i++ | ||
| 1436 | dec $j # doesn't affect CF! | ||
| 1437 | jnz .Lsqr4x_sub | ||
| 1438 | |||
| 1439 | mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1440 | mov 32($tptr,$i,8),@ri[0] # load overflow bit | ||
| 1441 | sbb 16($nptr,$i,8),@ri[2] | ||
| 1442 | mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1443 | sbb 24($nptr,$i,8),@ri[3] | ||
| 1444 | mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1445 | |||
| 1446 | sbb \$0,@ri[0] # handle upmost overflow bit | ||
| 1447 | mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
| 1448 | xor $i,$i # i=0 | ||
| 1449 | and @ri[0],$tptr | ||
| 1450 | not @ri[0] | ||
| 1451 | mov $rptr,$nptr | ||
| 1452 | and @ri[0],$nptr | ||
| 1453 | lea -1($num),$j | ||
| 1454 | or $nptr,$tptr # tp=borrow?tp:rp | ||
| 1455 | |||
| 1456 | pxor %xmm0,%xmm0 | ||
| 1457 | lea 64(%rsp,$num,8),$nptr | ||
| 1458 | movdqu ($tptr),%xmm1 | ||
| 1459 | lea ($nptr,$num,8),$nptr | ||
| 1460 | movdqa %xmm0,64(%rsp) # zap lower half of temporary vector | ||
| 1461 | movdqa %xmm0,($nptr) # zap upper half of temporary vector | ||
| 1462 | movdqu %xmm1,($rptr) | ||
| 1463 | jmp .Lsqr4x_copy | ||
| 1464 | .align 16 | ||
| 1465 | .Lsqr4x_copy: # copy or in-place refresh | ||
| 1466 | movdqu 16($tptr,$i),%xmm2 | ||
| 1467 | movdqu 32($tptr,$i),%xmm1 | ||
| 1468 | movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector | ||
| 1469 | movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector | ||
| 1470 | movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector | ||
| 1471 | movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector | ||
| 1472 | movdqu %xmm2,16($rptr,$i) | ||
| 1473 | movdqu %xmm1,32($rptr,$i) | ||
| 1474 | lea 32($i),$i | ||
| 1475 | dec $j | ||
| 1476 | jnz .Lsqr4x_copy | ||
| 1477 | |||
| 1478 | movdqu 16($tptr,$i),%xmm2 | ||
| 1479 | movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector | ||
| 1480 | movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector | ||
| 1481 | movdqu %xmm2,16($rptr,$i) | ||
| 1482 | ___ | ||
| 1483 | } | ||
| 1484 | $code.=<<___; | ||
| 1485 | mov 56(%rsp),%rsi # restore %rsp | ||
| 1486 | mov \$1,%rax | ||
| 1487 | mov 0(%rsi),%r15 | ||
| 1488 | mov 8(%rsi),%r14 | ||
| 1489 | mov 16(%rsi),%r13 | ||
| 1490 | mov 24(%rsi),%r12 | ||
| 1491 | mov 32(%rsi),%rbp | ||
| 1492 | mov 40(%rsi),%rbx | ||
| 1493 | lea 48(%rsi),%rsp | ||
| 1494 | .Lsqr4x_epilogue: | ||
| 1495 | ret | ||
| 1496 | .size bn_sqr4x_mont,.-bn_sqr4x_mont | ||
| 1497 | ___ | ||
| 1498 | }}} | ||
| 1499 | $code.=<<___; | ||
| 217 | .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | 1500 | .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 218 | .align 16 | 1501 | .align 16 |
| 219 | ___ | 1502 | ___ |
| @@ -228,9 +1511,9 @@ $disp="%r9"; | |||
| 228 | 1511 | ||
| 229 | $code.=<<___; | 1512 | $code.=<<___; |
| 230 | .extern __imp_RtlVirtualUnwind | 1513 | .extern __imp_RtlVirtualUnwind |
| 231 | .type se_handler,\@abi-omnipotent | 1514 | .type mul_handler,\@abi-omnipotent |
| 232 | .align 16 | 1515 | .align 16 |
| 233 | se_handler: | 1516 | mul_handler: |
| 234 | push %rsi | 1517 | push %rsi |
| 235 | push %rdi | 1518 | push %rdi |
| 236 | push %rbx | 1519 | push %rbx |
| @@ -245,15 +1528,20 @@ se_handler: | |||
| 245 | mov 120($context),%rax # pull context->Rax | 1528 | mov 120($context),%rax # pull context->Rax |
| 246 | mov 248($context),%rbx # pull context->Rip | 1529 | mov 248($context),%rbx # pull context->Rip |
| 247 | 1530 | ||
| 248 | lea .Lprologue(%rip),%r10 | 1531 | mov 8($disp),%rsi # disp->ImageBase |
| 249 | cmp %r10,%rbx # context->Rip<.Lprologue | 1532 | mov 56($disp),%r11 # disp->HandlerData |
| 250 | jb .Lin_prologue | 1533 | |
| 1534 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 1535 | lea (%rsi,%r10),%r10 # end of prologue label | ||
| 1536 | cmp %r10,%rbx # context->Rip<end of prologue label | ||
| 1537 | jb .Lcommon_seh_tail | ||
| 251 | 1538 | ||
| 252 | mov 152($context),%rax # pull context->Rsp | 1539 | mov 152($context),%rax # pull context->Rsp |
| 253 | 1540 | ||
| 254 | lea .Lepilogue(%rip),%r10 | 1541 | mov 4(%r11),%r10d # HandlerData[1] |
| 255 | cmp %r10,%rbx # context->Rip>=.Lepilogue | 1542 | lea (%rsi,%r10),%r10 # epilogue label |
| 256 | jae .Lin_prologue | 1543 | cmp %r10,%rbx # context->Rip>=epilogue label |
| 1544 | jae .Lcommon_seh_tail | ||
| 257 | 1545 | ||
| 258 | mov 192($context),%r10 # pull $num | 1546 | mov 192($context),%r10 # pull $num |
| 259 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | 1547 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer |
| @@ -272,7 +1560,53 @@ se_handler: | |||
| 272 | mov %r14,232($context) # restore context->R14 | 1560 | mov %r14,232($context) # restore context->R14 |
| 273 | mov %r15,240($context) # restore context->R15 | 1561 | mov %r15,240($context) # restore context->R15 |
| 274 | 1562 | ||
| 275 | .Lin_prologue: | 1563 | jmp .Lcommon_seh_tail |
| 1564 | .size mul_handler,.-mul_handler | ||
| 1565 | |||
| 1566 | .type sqr_handler,\@abi-omnipotent | ||
| 1567 | .align 16 | ||
| 1568 | sqr_handler: | ||
| 1569 | push %rsi | ||
| 1570 | push %rdi | ||
| 1571 | push %rbx | ||
| 1572 | push %rbp | ||
| 1573 | push %r12 | ||
| 1574 | push %r13 | ||
| 1575 | push %r14 | ||
| 1576 | push %r15 | ||
| 1577 | pushfq | ||
| 1578 | sub \$64,%rsp | ||
| 1579 | |||
| 1580 | mov 120($context),%rax # pull context->Rax | ||
| 1581 | mov 248($context),%rbx # pull context->Rip | ||
| 1582 | |||
| 1583 | lea .Lsqr4x_body(%rip),%r10 | ||
| 1584 | cmp %r10,%rbx # context->Rip<.Lsqr_body | ||
| 1585 | jb .Lcommon_seh_tail | ||
| 1586 | |||
| 1587 | mov 152($context),%rax # pull context->Rsp | ||
| 1588 | |||
| 1589 | lea .Lsqr4x_epilogue(%rip),%r10 | ||
| 1590 | cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue | ||
| 1591 | jae .Lcommon_seh_tail | ||
| 1592 | |||
| 1593 | mov 56(%rax),%rax # pull saved stack pointer | ||
| 1594 | lea 48(%rax),%rax | ||
| 1595 | |||
| 1596 | mov -8(%rax),%rbx | ||
| 1597 | mov -16(%rax),%rbp | ||
| 1598 | mov -24(%rax),%r12 | ||
| 1599 | mov -32(%rax),%r13 | ||
| 1600 | mov -40(%rax),%r14 | ||
| 1601 | mov -48(%rax),%r15 | ||
| 1602 | mov %rbx,144($context) # restore context->Rbx | ||
| 1603 | mov %rbp,160($context) # restore context->Rbp | ||
| 1604 | mov %r12,216($context) # restore context->R12 | ||
| 1605 | mov %r13,224($context) # restore context->R13 | ||
| 1606 | mov %r14,232($context) # restore context->R14 | ||
| 1607 | mov %r15,240($context) # restore context->R15 | ||
| 1608 | |||
| 1609 | .Lcommon_seh_tail: | ||
| 276 | mov 8(%rax),%rdi | 1610 | mov 8(%rax),%rdi |
| 277 | mov 16(%rax),%rsi | 1611 | mov 16(%rax),%rsi |
| 278 | mov %rax,152($context) # restore context->Rsp | 1612 | mov %rax,152($context) # restore context->Rsp |
| @@ -310,7 +1644,7 @@ se_handler: | |||
| 310 | pop %rdi | 1644 | pop %rdi |
| 311 | pop %rsi | 1645 | pop %rsi |
| 312 | ret | 1646 | ret |
| 313 | .size se_handler,.-se_handler | 1647 | .size sqr_handler,.-sqr_handler |
| 314 | 1648 | ||
| 315 | .section .pdata | 1649 | .section .pdata |
| 316 | .align 4 | 1650 | .align 4 |
| @@ -318,11 +1652,27 @@ se_handler: | |||
| 318 | .rva .LSEH_end_bn_mul_mont | 1652 | .rva .LSEH_end_bn_mul_mont |
| 319 | .rva .LSEH_info_bn_mul_mont | 1653 | .rva .LSEH_info_bn_mul_mont |
| 320 | 1654 | ||
| 1655 | .rva .LSEH_begin_bn_mul4x_mont | ||
| 1656 | .rva .LSEH_end_bn_mul4x_mont | ||
| 1657 | .rva .LSEH_info_bn_mul4x_mont | ||
| 1658 | |||
| 1659 | .rva .LSEH_begin_bn_sqr4x_mont | ||
| 1660 | .rva .LSEH_end_bn_sqr4x_mont | ||
| 1661 | .rva .LSEH_info_bn_sqr4x_mont | ||
| 1662 | |||
| 321 | .section .xdata | 1663 | .section .xdata |
| 322 | .align 8 | 1664 | .align 8 |
| 323 | .LSEH_info_bn_mul_mont: | 1665 | .LSEH_info_bn_mul_mont: |
| 324 | .byte 9,0,0,0 | 1666 | .byte 9,0,0,0 |
| 325 | .rva se_handler | 1667 | .rva mul_handler |
| 1668 | .rva .Lmul_body,.Lmul_epilogue # HandlerData[] | ||
| 1669 | .LSEH_info_bn_mul4x_mont: | ||
| 1670 | .byte 9,0,0,0 | ||
| 1671 | .rva mul_handler | ||
| 1672 | .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] | ||
| 1673 | .LSEH_info_bn_sqr4x_mont: | ||
| 1674 | .byte 9,0,0,0 | ||
| 1675 | .rva sqr_handler | ||
| 326 | ___ | 1676 | ___ |
| 327 | } | 1677 | } |
| 328 | 1678 | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl new file mode 100755 index 0000000000..057cda28aa --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl | |||
| @@ -0,0 +1,1070 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # August 2011. | ||
| 11 | # | ||
| 12 | # Companion to x86_64-mont.pl that optimizes cache-timing attack | ||
| 13 | # countermeasures. The subroutines are produced by replacing bp[i] | ||
| 14 | # references in their x86_64-mont.pl counterparts with cache-neutral | ||
| 15 | # references to powers table computed in BN_mod_exp_mont_consttime. | ||
| 16 | # In addition subroutine that scatters elements of the powers table | ||
| 17 | # is implemented, so that scatter-/gathering can be tuned without | ||
| 18 | # bn_exp.c modifications. | ||
| 19 | |||
| 20 | $flavour = shift; | ||
| 21 | $output = shift; | ||
| 22 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 23 | |||
| 24 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 25 | |||
| 26 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 27 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 28 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 29 | die "can't locate x86_64-xlate.pl"; | ||
| 30 | |||
| 31 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 32 | |||
| 33 | # int bn_mul_mont_gather5( | ||
| 34 | $rp="%rdi"; # BN_ULONG *rp, | ||
| 35 | $ap="%rsi"; # const BN_ULONG *ap, | ||
| 36 | $bp="%rdx"; # const BN_ULONG *bp, | ||
| 37 | $np="%rcx"; # const BN_ULONG *np, | ||
| 38 | $n0="%r8"; # const BN_ULONG *n0, | ||
| 39 | $num="%r9"; # int num, | ||
| 40 | # int idx); # 0 to 2^5-1, "index" in $bp holding | ||
| 41 | # pre-computed powers of a', interlaced | ||
| 42 | # in such manner that b[0] is $bp[idx], | ||
| 43 | # b[1] is [2^5+idx], etc. | ||
| 44 | $lo0="%r10"; | ||
| 45 | $hi0="%r11"; | ||
| 46 | $hi1="%r13"; | ||
| 47 | $i="%r14"; | ||
| 48 | $j="%r15"; | ||
| 49 | $m0="%rbx"; | ||
| 50 | $m1="%rbp"; | ||
| 51 | |||
| 52 | $code=<<___; | ||
| 53 | .text | ||
| 54 | |||
| 55 | .globl bn_mul_mont_gather5 | ||
| 56 | .type bn_mul_mont_gather5,\@function,6 | ||
| 57 | .align 64 | ||
| 58 | bn_mul_mont_gather5: | ||
| 59 | test \$3,${num}d | ||
| 60 | jnz .Lmul_enter | ||
| 61 | cmp \$8,${num}d | ||
| 62 | jb .Lmul_enter | ||
| 63 | jmp .Lmul4x_enter | ||
| 64 | |||
| 65 | .align 16 | ||
| 66 | .Lmul_enter: | ||
| 67 | mov ${num}d,${num}d | ||
| 68 | mov `($win64?56:8)`(%rsp),%r10d # load 7th argument | ||
| 69 | push %rbx | ||
| 70 | push %rbp | ||
| 71 | push %r12 | ||
| 72 | push %r13 | ||
| 73 | push %r14 | ||
| 74 | push %r15 | ||
| 75 | ___ | ||
| 76 | $code.=<<___ if ($win64); | ||
| 77 | lea -0x28(%rsp),%rsp | ||
| 78 | movaps %xmm6,(%rsp) | ||
| 79 | movaps %xmm7,0x10(%rsp) | ||
| 80 | .Lmul_alloca: | ||
| 81 | ___ | ||
| 82 | $code.=<<___; | ||
| 83 | mov %rsp,%rax | ||
| 84 | lea 2($num),%r11 | ||
| 85 | neg %r11 | ||
| 86 | lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) | ||
| 87 | and \$-1024,%rsp # minimize TLB usage | ||
| 88 | |||
| 89 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | ||
| 90 | .Lmul_body: | ||
| 91 | mov $bp,%r12 # reassign $bp | ||
| 92 | ___ | ||
| 93 | $bp="%r12"; | ||
| 94 | $STRIDE=2**5*8; # 5 is "window size" | ||
| 95 | $N=$STRIDE/4; # should match cache line size | ||
| 96 | $code.=<<___; | ||
| 97 | mov %r10,%r11 | ||
| 98 | shr \$`log($N/8)/log(2)`,%r10 | ||
| 99 | and \$`$N/8-1`,%r11 | ||
| 100 | not %r10 | ||
| 101 | lea .Lmagic_masks(%rip),%rax | ||
| 102 | and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" | ||
| 103 | lea 96($bp,%r11,8),$bp # pointer within 1st cache line | ||
| 104 | movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which | ||
| 105 | movq 8(%rax,%r10,8),%xmm5 # cache line contains element | ||
| 106 | movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument | ||
| 107 | movq 24(%rax,%r10,8),%xmm7 | ||
| 108 | |||
| 109 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 110 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 111 | pand %xmm4,%xmm0 | ||
| 112 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 113 | pand %xmm5,%xmm1 | ||
| 114 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 115 | pand %xmm6,%xmm2 | ||
| 116 | por %xmm1,%xmm0 | ||
| 117 | pand %xmm7,%xmm3 | ||
| 118 | por %xmm2,%xmm0 | ||
| 119 | lea $STRIDE($bp),$bp | ||
| 120 | por %xmm3,%xmm0 | ||
| 121 | |||
| 122 | movq %xmm0,$m0 # m0=bp[0] | ||
| 123 | |||
| 124 | mov ($n0),$n0 # pull n0[0] value | ||
| 125 | mov ($ap),%rax | ||
| 126 | |||
| 127 | xor $i,$i # i=0 | ||
| 128 | xor $j,$j # j=0 | ||
| 129 | |||
| 130 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 131 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 132 | pand %xmm4,%xmm0 | ||
| 133 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 134 | pand %xmm5,%xmm1 | ||
| 135 | |||
| 136 | mov $n0,$m1 | ||
| 137 | mulq $m0 # ap[0]*bp[0] | ||
| 138 | mov %rax,$lo0 | ||
| 139 | mov ($np),%rax | ||
| 140 | |||
| 141 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 142 | pand %xmm6,%xmm2 | ||
| 143 | por %xmm1,%xmm0 | ||
| 144 | pand %xmm7,%xmm3 | ||
| 145 | |||
| 146 | imulq $lo0,$m1 # "tp[0]"*n0 | ||
| 147 | mov %rdx,$hi0 | ||
| 148 | |||
| 149 | por %xmm2,%xmm0 | ||
| 150 | lea $STRIDE($bp),$bp | ||
| 151 | por %xmm3,%xmm0 | ||
| 152 | |||
| 153 | mulq $m1 # np[0]*m1 | ||
| 154 | add %rax,$lo0 # discarded | ||
| 155 | mov 8($ap),%rax | ||
| 156 | adc \$0,%rdx | ||
| 157 | mov %rdx,$hi1 | ||
| 158 | |||
| 159 | lea 1($j),$j # j++ | ||
| 160 | jmp .L1st_enter | ||
| 161 | |||
| 162 | .align 16 | ||
| 163 | .L1st: | ||
| 164 | add %rax,$hi1 | ||
| 165 | mov ($ap,$j,8),%rax | ||
| 166 | adc \$0,%rdx | ||
| 167 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] | ||
| 168 | mov $lo0,$hi0 | ||
| 169 | adc \$0,%rdx | ||
| 170 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 171 | mov %rdx,$hi1 | ||
| 172 | |||
| 173 | .L1st_enter: | ||
| 174 | mulq $m0 # ap[j]*bp[0] | ||
| 175 | add %rax,$hi0 | ||
| 176 | mov ($np,$j,8),%rax | ||
| 177 | adc \$0,%rdx | ||
| 178 | lea 1($j),$j # j++ | ||
| 179 | mov %rdx,$lo0 | ||
| 180 | |||
| 181 | mulq $m1 # np[j]*m1 | ||
| 182 | cmp $num,$j | ||
| 183 | jne .L1st | ||
| 184 | |||
| 185 | movq %xmm0,$m0 # bp[1] | ||
| 186 | |||
| 187 | add %rax,$hi1 | ||
| 188 | mov ($ap),%rax # ap[0] | ||
| 189 | adc \$0,%rdx | ||
| 190 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] | ||
| 191 | adc \$0,%rdx | ||
| 192 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 193 | mov %rdx,$hi1 | ||
| 194 | mov $lo0,$hi0 | ||
| 195 | |||
| 196 | xor %rdx,%rdx | ||
| 197 | add $hi0,$hi1 | ||
| 198 | adc \$0,%rdx | ||
| 199 | mov $hi1,-8(%rsp,$num,8) | ||
| 200 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | ||
| 201 | |||
| 202 | lea 1($i),$i # i++ | ||
| 203 | jmp .Louter | ||
| 204 | .align 16 | ||
| 205 | .Louter: | ||
| 206 | xor $j,$j # j=0 | ||
| 207 | mov $n0,$m1 | ||
| 208 | mov (%rsp),$lo0 | ||
| 209 | |||
| 210 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 211 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 212 | pand %xmm4,%xmm0 | ||
| 213 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 214 | pand %xmm5,%xmm1 | ||
| 215 | |||
| 216 | mulq $m0 # ap[0]*bp[i] | ||
| 217 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] | ||
| 218 | mov ($np),%rax | ||
| 219 | adc \$0,%rdx | ||
| 220 | |||
| 221 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 222 | pand %xmm6,%xmm2 | ||
| 223 | por %xmm1,%xmm0 | ||
| 224 | pand %xmm7,%xmm3 | ||
| 225 | |||
| 226 | imulq $lo0,$m1 # tp[0]*n0 | ||
| 227 | mov %rdx,$hi0 | ||
| 228 | |||
| 229 | por %xmm2,%xmm0 | ||
| 230 | lea $STRIDE($bp),$bp | ||
| 231 | por %xmm3,%xmm0 | ||
| 232 | |||
| 233 | mulq $m1 # np[0]*m1 | ||
| 234 | add %rax,$lo0 # discarded | ||
| 235 | mov 8($ap),%rax | ||
| 236 | adc \$0,%rdx | ||
| 237 | mov 8(%rsp),$lo0 # tp[1] | ||
| 238 | mov %rdx,$hi1 | ||
| 239 | |||
| 240 | lea 1($j),$j # j++ | ||
| 241 | jmp .Linner_enter | ||
| 242 | |||
| 243 | .align 16 | ||
| 244 | .Linner: | ||
| 245 | add %rax,$hi1 | ||
| 246 | mov ($ap,$j,8),%rax | ||
| 247 | adc \$0,%rdx | ||
| 248 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 249 | mov (%rsp,$j,8),$lo0 | ||
| 250 | adc \$0,%rdx | ||
| 251 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 252 | mov %rdx,$hi1 | ||
| 253 | |||
| 254 | .Linner_enter: | ||
| 255 | mulq $m0 # ap[j]*bp[i] | ||
| 256 | add %rax,$hi0 | ||
| 257 | mov ($np,$j,8),%rax | ||
| 258 | adc \$0,%rdx | ||
| 259 | add $hi0,$lo0 # ap[j]*bp[i]+tp[j] | ||
| 260 | mov %rdx,$hi0 | ||
| 261 | adc \$0,$hi0 | ||
| 262 | lea 1($j),$j # j++ | ||
| 263 | |||
| 264 | mulq $m1 # np[j]*m1 | ||
| 265 | cmp $num,$j | ||
| 266 | jne .Linner | ||
| 267 | |||
| 268 | movq %xmm0,$m0 # bp[i+1] | ||
| 269 | |||
| 270 | add %rax,$hi1 | ||
| 271 | mov ($ap),%rax # ap[0] | ||
| 272 | adc \$0,%rdx | ||
| 273 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 274 | mov (%rsp,$j,8),$lo0 | ||
| 275 | adc \$0,%rdx | ||
| 276 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
| 277 | mov %rdx,$hi1 | ||
| 278 | |||
| 279 | xor %rdx,%rdx | ||
| 280 | add $hi0,$hi1 | ||
| 281 | adc \$0,%rdx | ||
| 282 | add $lo0,$hi1 # pull upmost overflow bit | ||
| 283 | adc \$0,%rdx | ||
| 284 | mov $hi1,-8(%rsp,$num,8) | ||
| 285 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | ||
| 286 | |||
| 287 | lea 1($i),$i # i++ | ||
| 288 | cmp $num,$i | ||
| 289 | jl .Louter | ||
| 290 | |||
| 291 | xor $i,$i # i=0 and clear CF! | ||
| 292 | mov (%rsp),%rax # tp[0] | ||
| 293 | lea (%rsp),$ap # borrow ap for tp | ||
| 294 | mov $num,$j # j=num | ||
| 295 | jmp .Lsub | ||
| 296 | .align 16 | ||
| 297 | .Lsub: sbb ($np,$i,8),%rax | ||
| 298 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 299 | mov 8($ap,$i,8),%rax # tp[i+1] | ||
| 300 | lea 1($i),$i # i++ | ||
| 301 | dec $j # doesnn't affect CF! | ||
| 302 | jnz .Lsub | ||
| 303 | |||
| 304 | sbb \$0,%rax # handle upmost overflow bit | ||
| 305 | xor $i,$i | ||
| 306 | and %rax,$ap | ||
| 307 | not %rax | ||
| 308 | mov $rp,$np | ||
| 309 | and %rax,$np | ||
| 310 | mov $num,$j # j=num | ||
| 311 | or $np,$ap # ap=borrow?tp:rp | ||
| 312 | .align 16 | ||
| 313 | .Lcopy: # copy or in-place refresh | ||
| 314 | mov ($ap,$i,8),%rax | ||
| 315 | mov $i,(%rsp,$i,8) # zap temporary vector | ||
| 316 | mov %rax,($rp,$i,8) # rp[i]=tp[i] | ||
| 317 | lea 1($i),$i | ||
| 318 | sub \$1,$j | ||
| 319 | jnz .Lcopy | ||
| 320 | |||
| 321 | mov 8(%rsp,$num,8),%rsi # restore %rsp | ||
| 322 | mov \$1,%rax | ||
| 323 | ___ | ||
| 324 | $code.=<<___ if ($win64); | ||
| 325 | movaps (%rsi),%xmm6 | ||
| 326 | movaps 0x10(%rsi),%xmm7 | ||
| 327 | lea 0x28(%rsi),%rsi | ||
| 328 | ___ | ||
| 329 | $code.=<<___; | ||
| 330 | mov (%rsi),%r15 | ||
| 331 | mov 8(%rsi),%r14 | ||
| 332 | mov 16(%rsi),%r13 | ||
| 333 | mov 24(%rsi),%r12 | ||
| 334 | mov 32(%rsi),%rbp | ||
| 335 | mov 40(%rsi),%rbx | ||
| 336 | lea 48(%rsi),%rsp | ||
| 337 | .Lmul_epilogue: | ||
| 338 | ret | ||
| 339 | .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 | ||
| 340 | ___ | ||
| 341 | {{{ | ||
| 342 | my @A=("%r10","%r11"); | ||
| 343 | my @N=("%r13","%rdi"); | ||
| 344 | $code.=<<___; | ||
| 345 | .type bn_mul4x_mont_gather5,\@function,6 | ||
| 346 | .align 16 | ||
| 347 | bn_mul4x_mont_gather5: | ||
| 348 | .Lmul4x_enter: | ||
| 349 | mov ${num}d,${num}d | ||
| 350 | mov `($win64?56:8)`(%rsp),%r10d # load 7th argument | ||
| 351 | push %rbx | ||
| 352 | push %rbp | ||
| 353 | push %r12 | ||
| 354 | push %r13 | ||
| 355 | push %r14 | ||
| 356 | push %r15 | ||
| 357 | ___ | ||
| 358 | $code.=<<___ if ($win64); | ||
| 359 | lea -0x28(%rsp),%rsp | ||
| 360 | movaps %xmm6,(%rsp) | ||
| 361 | movaps %xmm7,0x10(%rsp) | ||
| 362 | .Lmul4x_alloca: | ||
| 363 | ___ | ||
| 364 | $code.=<<___; | ||
| 365 | mov %rsp,%rax | ||
| 366 | lea 4($num),%r11 | ||
| 367 | neg %r11 | ||
| 368 | lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) | ||
| 369 | and \$-1024,%rsp # minimize TLB usage | ||
| 370 | |||
| 371 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | ||
| 372 | .Lmul4x_body: | ||
| 373 | mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp | ||
| 374 | mov %rdx,%r12 # reassign $bp | ||
| 375 | ___ | ||
| 376 | $bp="%r12"; | ||
| 377 | $STRIDE=2**5*8; # 5 is "window size" | ||
| 378 | $N=$STRIDE/4; # should match cache line size | ||
| 379 | $code.=<<___; | ||
| 380 | mov %r10,%r11 | ||
| 381 | shr \$`log($N/8)/log(2)`,%r10 | ||
| 382 | and \$`$N/8-1`,%r11 | ||
| 383 | not %r10 | ||
| 384 | lea .Lmagic_masks(%rip),%rax | ||
| 385 | and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" | ||
| 386 | lea 96($bp,%r11,8),$bp # pointer within 1st cache line | ||
| 387 | movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which | ||
| 388 | movq 8(%rax,%r10,8),%xmm5 # cache line contains element | ||
| 389 | movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument | ||
| 390 | movq 24(%rax,%r10,8),%xmm7 | ||
| 391 | |||
| 392 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 393 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 394 | pand %xmm4,%xmm0 | ||
| 395 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 396 | pand %xmm5,%xmm1 | ||
| 397 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 398 | pand %xmm6,%xmm2 | ||
| 399 | por %xmm1,%xmm0 | ||
| 400 | pand %xmm7,%xmm3 | ||
| 401 | por %xmm2,%xmm0 | ||
| 402 | lea $STRIDE($bp),$bp | ||
| 403 | por %xmm3,%xmm0 | ||
| 404 | |||
| 405 | movq %xmm0,$m0 # m0=bp[0] | ||
| 406 | mov ($n0),$n0 # pull n0[0] value | ||
| 407 | mov ($ap),%rax | ||
| 408 | |||
| 409 | xor $i,$i # i=0 | ||
| 410 | xor $j,$j # j=0 | ||
| 411 | |||
| 412 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 413 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 414 | pand %xmm4,%xmm0 | ||
| 415 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 416 | pand %xmm5,%xmm1 | ||
| 417 | |||
| 418 | mov $n0,$m1 | ||
| 419 | mulq $m0 # ap[0]*bp[0] | ||
| 420 | mov %rax,$A[0] | ||
| 421 | mov ($np),%rax | ||
| 422 | |||
| 423 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 424 | pand %xmm6,%xmm2 | ||
| 425 | por %xmm1,%xmm0 | ||
| 426 | pand %xmm7,%xmm3 | ||
| 427 | |||
| 428 | imulq $A[0],$m1 # "tp[0]"*n0 | ||
| 429 | mov %rdx,$A[1] | ||
| 430 | |||
| 431 | por %xmm2,%xmm0 | ||
| 432 | lea $STRIDE($bp),$bp | ||
| 433 | por %xmm3,%xmm0 | ||
| 434 | |||
| 435 | mulq $m1 # np[0]*m1 | ||
| 436 | add %rax,$A[0] # discarded | ||
| 437 | mov 8($ap),%rax | ||
| 438 | adc \$0,%rdx | ||
| 439 | mov %rdx,$N[1] | ||
| 440 | |||
| 441 | mulq $m0 | ||
| 442 | add %rax,$A[1] | ||
| 443 | mov 8($np),%rax | ||
| 444 | adc \$0,%rdx | ||
| 445 | mov %rdx,$A[0] | ||
| 446 | |||
| 447 | mulq $m1 | ||
| 448 | add %rax,$N[1] | ||
| 449 | mov 16($ap),%rax | ||
| 450 | adc \$0,%rdx | ||
| 451 | add $A[1],$N[1] | ||
| 452 | lea 4($j),$j # j++ | ||
| 453 | adc \$0,%rdx | ||
| 454 | mov $N[1],(%rsp) | ||
| 455 | mov %rdx,$N[0] | ||
| 456 | jmp .L1st4x | ||
| 457 | .align 16 | ||
| 458 | .L1st4x: | ||
| 459 | mulq $m0 # ap[j]*bp[0] | ||
| 460 | add %rax,$A[0] | ||
| 461 | mov -16($np,$j,8),%rax | ||
| 462 | adc \$0,%rdx | ||
| 463 | mov %rdx,$A[1] | ||
| 464 | |||
| 465 | mulq $m1 # np[j]*m1 | ||
| 466 | add %rax,$N[0] | ||
| 467 | mov -8($ap,$j,8),%rax | ||
| 468 | adc \$0,%rdx | ||
| 469 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 470 | adc \$0,%rdx | ||
| 471 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 472 | mov %rdx,$N[1] | ||
| 473 | |||
| 474 | mulq $m0 # ap[j]*bp[0] | ||
| 475 | add %rax,$A[1] | ||
| 476 | mov -8($np,$j,8),%rax | ||
| 477 | adc \$0,%rdx | ||
| 478 | mov %rdx,$A[0] | ||
| 479 | |||
| 480 | mulq $m1 # np[j]*m1 | ||
| 481 | add %rax,$N[1] | ||
| 482 | mov ($ap,$j,8),%rax | ||
| 483 | adc \$0,%rdx | ||
| 484 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
| 485 | adc \$0,%rdx | ||
| 486 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 487 | mov %rdx,$N[0] | ||
| 488 | |||
| 489 | mulq $m0 # ap[j]*bp[0] | ||
| 490 | add %rax,$A[0] | ||
| 491 | mov ($np,$j,8),%rax | ||
| 492 | adc \$0,%rdx | ||
| 493 | mov %rdx,$A[1] | ||
| 494 | |||
| 495 | mulq $m1 # np[j]*m1 | ||
| 496 | add %rax,$N[0] | ||
| 497 | mov 8($ap,$j,8),%rax | ||
| 498 | adc \$0,%rdx | ||
| 499 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 500 | adc \$0,%rdx | ||
| 501 | mov $N[0],-8(%rsp,$j,8) # tp[j-1] | ||
| 502 | mov %rdx,$N[1] | ||
| 503 | |||
| 504 | mulq $m0 # ap[j]*bp[0] | ||
| 505 | add %rax,$A[1] | ||
| 506 | mov 8($np,$j,8),%rax | ||
| 507 | adc \$0,%rdx | ||
| 508 | lea 4($j),$j # j++ | ||
| 509 | mov %rdx,$A[0] | ||
| 510 | |||
| 511 | mulq $m1 # np[j]*m1 | ||
| 512 | add %rax,$N[1] | ||
| 513 | mov -16($ap,$j,8),%rax | ||
| 514 | adc \$0,%rdx | ||
| 515 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
| 516 | adc \$0,%rdx | ||
| 517 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
| 518 | mov %rdx,$N[0] | ||
| 519 | cmp $num,$j | ||
| 520 | jl .L1st4x | ||
| 521 | |||
| 522 | mulq $m0 # ap[j]*bp[0] | ||
| 523 | add %rax,$A[0] | ||
| 524 | mov -16($np,$j,8),%rax | ||
| 525 | adc \$0,%rdx | ||
| 526 | mov %rdx,$A[1] | ||
| 527 | |||
| 528 | mulq $m1 # np[j]*m1 | ||
| 529 | add %rax,$N[0] | ||
| 530 | mov -8($ap,$j,8),%rax | ||
| 531 | adc \$0,%rdx | ||
| 532 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
| 533 | adc \$0,%rdx | ||
| 534 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 535 | mov %rdx,$N[1] | ||
| 536 | |||
| 537 | mulq $m0 # ap[j]*bp[0] | ||
| 538 | add %rax,$A[1] | ||
| 539 | mov -8($np,$j,8),%rax | ||
| 540 | adc \$0,%rdx | ||
| 541 | mov %rdx,$A[0] | ||
| 542 | |||
| 543 | mulq $m1 # np[j]*m1 | ||
| 544 | add %rax,$N[1] | ||
| 545 | mov ($ap),%rax # ap[0] | ||
| 546 | adc \$0,%rdx | ||
| 547 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
| 548 | adc \$0,%rdx | ||
| 549 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 550 | mov %rdx,$N[0] | ||
| 551 | |||
| 552 | movq %xmm0,$m0 # bp[1] | ||
| 553 | |||
| 554 | xor $N[1],$N[1] | ||
| 555 | add $A[0],$N[0] | ||
| 556 | adc \$0,$N[1] | ||
| 557 | mov $N[0],-8(%rsp,$j,8) | ||
| 558 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
| 559 | |||
| 560 | lea 1($i),$i # i++ | ||
| 561 | .align 4 | ||
| 562 | .Louter4x: | ||
| 563 | xor $j,$j # j=0 | ||
| 564 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
| 565 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
| 566 | pand %xmm4,%xmm0 | ||
| 567 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
| 568 | pand %xmm5,%xmm1 | ||
| 569 | |||
| 570 | mov (%rsp),$A[0] | ||
| 571 | mov $n0,$m1 | ||
| 572 | mulq $m0 # ap[0]*bp[i] | ||
| 573 | add %rax,$A[0] # ap[0]*bp[i]+tp[0] | ||
| 574 | mov ($np),%rax | ||
| 575 | adc \$0,%rdx | ||
| 576 | |||
| 577 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
| 578 | pand %xmm6,%xmm2 | ||
| 579 | por %xmm1,%xmm0 | ||
| 580 | pand %xmm7,%xmm3 | ||
| 581 | |||
| 582 | imulq $A[0],$m1 # tp[0]*n0 | ||
| 583 | mov %rdx,$A[1] | ||
| 584 | |||
| 585 | por %xmm2,%xmm0 | ||
| 586 | lea $STRIDE($bp),$bp | ||
| 587 | por %xmm3,%xmm0 | ||
| 588 | |||
| 589 | mulq $m1 # np[0]*m1 | ||
| 590 | add %rax,$A[0] # "$N[0]", discarded | ||
| 591 | mov 8($ap),%rax | ||
| 592 | adc \$0,%rdx | ||
| 593 | mov %rdx,$N[1] | ||
| 594 | |||
| 595 | mulq $m0 # ap[j]*bp[i] | ||
| 596 | add %rax,$A[1] | ||
| 597 | mov 8($np),%rax | ||
| 598 | adc \$0,%rdx | ||
| 599 | add 8(%rsp),$A[1] # +tp[1] | ||
| 600 | adc \$0,%rdx | ||
| 601 | mov %rdx,$A[0] | ||
| 602 | |||
| 603 | mulq $m1 # np[j]*m1 | ||
| 604 | add %rax,$N[1] | ||
| 605 | mov 16($ap),%rax | ||
| 606 | adc \$0,%rdx | ||
| 607 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 608 | lea 4($j),$j # j+=2 | ||
| 609 | adc \$0,%rdx | ||
| 610 | mov %rdx,$N[0] | ||
| 611 | jmp .Linner4x | ||
| 612 | .align 16 | ||
| 613 | .Linner4x: | ||
| 614 | mulq $m0 # ap[j]*bp[i] | ||
| 615 | add %rax,$A[0] | ||
| 616 | mov -16($np,$j,8),%rax | ||
| 617 | adc \$0,%rdx | ||
| 618 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 619 | adc \$0,%rdx | ||
| 620 | mov %rdx,$A[1] | ||
| 621 | |||
| 622 | mulq $m1 # np[j]*m1 | ||
| 623 | add %rax,$N[0] | ||
| 624 | mov -8($ap,$j,8),%rax | ||
| 625 | adc \$0,%rdx | ||
| 626 | add $A[0],$N[0] | ||
| 627 | adc \$0,%rdx | ||
| 628 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
| 629 | mov %rdx,$N[1] | ||
| 630 | |||
| 631 | mulq $m0 # ap[j]*bp[i] | ||
| 632 | add %rax,$A[1] | ||
| 633 | mov -8($np,$j,8),%rax | ||
| 634 | adc \$0,%rdx | ||
| 635 | add -8(%rsp,$j,8),$A[1] | ||
| 636 | adc \$0,%rdx | ||
| 637 | mov %rdx,$A[0] | ||
| 638 | |||
| 639 | mulq $m1 # np[j]*m1 | ||
| 640 | add %rax,$N[1] | ||
| 641 | mov ($ap,$j,8),%rax | ||
| 642 | adc \$0,%rdx | ||
| 643 | add $A[1],$N[1] | ||
| 644 | adc \$0,%rdx | ||
| 645 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 646 | mov %rdx,$N[0] | ||
| 647 | |||
| 648 | mulq $m0 # ap[j]*bp[i] | ||
| 649 | add %rax,$A[0] | ||
| 650 | mov ($np,$j,8),%rax | ||
| 651 | adc \$0,%rdx | ||
| 652 | add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 653 | adc \$0,%rdx | ||
| 654 | mov %rdx,$A[1] | ||
| 655 | |||
| 656 | mulq $m1 # np[j]*m1 | ||
| 657 | add %rax,$N[0] | ||
| 658 | mov 8($ap,$j,8),%rax | ||
| 659 | adc \$0,%rdx | ||
| 660 | add $A[0],$N[0] | ||
| 661 | adc \$0,%rdx | ||
| 662 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 663 | mov %rdx,$N[1] | ||
| 664 | |||
| 665 | mulq $m0 # ap[j]*bp[i] | ||
| 666 | add %rax,$A[1] | ||
| 667 | mov 8($np,$j,8),%rax | ||
| 668 | adc \$0,%rdx | ||
| 669 | add 8(%rsp,$j,8),$A[1] | ||
| 670 | adc \$0,%rdx | ||
| 671 | lea 4($j),$j # j++ | ||
| 672 | mov %rdx,$A[0] | ||
| 673 | |||
| 674 | mulq $m1 # np[j]*m1 | ||
| 675 | add %rax,$N[1] | ||
| 676 | mov -16($ap,$j,8),%rax | ||
| 677 | adc \$0,%rdx | ||
| 678 | add $A[1],$N[1] | ||
| 679 | adc \$0,%rdx | ||
| 680 | mov $N[0],-40(%rsp,$j,8) # tp[j-1] | ||
| 681 | mov %rdx,$N[0] | ||
| 682 | cmp $num,$j | ||
| 683 | jl .Linner4x | ||
| 684 | |||
| 685 | mulq $m0 # ap[j]*bp[i] | ||
| 686 | add %rax,$A[0] | ||
| 687 | mov -16($np,$j,8),%rax | ||
| 688 | adc \$0,%rdx | ||
| 689 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
| 690 | adc \$0,%rdx | ||
| 691 | mov %rdx,$A[1] | ||
| 692 | |||
| 693 | mulq $m1 # np[j]*m1 | ||
| 694 | add %rax,$N[0] | ||
| 695 | mov -8($ap,$j,8),%rax | ||
| 696 | adc \$0,%rdx | ||
| 697 | add $A[0],$N[0] | ||
| 698 | adc \$0,%rdx | ||
| 699 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
| 700 | mov %rdx,$N[1] | ||
| 701 | |||
| 702 | mulq $m0 # ap[j]*bp[i] | ||
| 703 | add %rax,$A[1] | ||
| 704 | mov -8($np,$j,8),%rax | ||
| 705 | adc \$0,%rdx | ||
| 706 | add -8(%rsp,$j,8),$A[1] | ||
| 707 | adc \$0,%rdx | ||
| 708 | lea 1($i),$i # i++ | ||
| 709 | mov %rdx,$A[0] | ||
| 710 | |||
| 711 | mulq $m1 # np[j]*m1 | ||
| 712 | add %rax,$N[1] | ||
| 713 | mov ($ap),%rax # ap[0] | ||
| 714 | adc \$0,%rdx | ||
| 715 | add $A[1],$N[1] | ||
| 716 | adc \$0,%rdx | ||
| 717 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
| 718 | mov %rdx,$N[0] | ||
| 719 | |||
| 720 | movq %xmm0,$m0 # bp[i+1] | ||
| 721 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
| 722 | |||
| 723 | xor $N[1],$N[1] | ||
| 724 | add $A[0],$N[0] | ||
| 725 | adc \$0,$N[1] | ||
| 726 | add (%rsp,$num,8),$N[0] # pull upmost overflow bit | ||
| 727 | adc \$0,$N[1] | ||
| 728 | mov $N[0],-8(%rsp,$j,8) | ||
| 729 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
| 730 | |||
| 731 | cmp $num,$i | ||
| 732 | jl .Louter4x | ||
| 733 | ___ | ||
| 734 | { | ||
| 735 | my @ri=("%rax","%rdx",$m0,$m1); | ||
| 736 | $code.=<<___; | ||
| 737 | mov 16(%rsp,$num,8),$rp # restore $rp | ||
| 738 | mov 0(%rsp),@ri[0] # tp[0] | ||
| 739 | pxor %xmm0,%xmm0 | ||
| 740 | mov 8(%rsp),@ri[1] # tp[1] | ||
| 741 | shr \$2,$num # num/=4 | ||
| 742 | lea (%rsp),$ap # borrow ap for tp | ||
| 743 | xor $i,$i # i=0 and clear CF! | ||
| 744 | |||
| 745 | sub 0($np),@ri[0] | ||
| 746 | mov 16($ap),@ri[2] # tp[2] | ||
| 747 | mov 24($ap),@ri[3] # tp[3] | ||
| 748 | sbb 8($np),@ri[1] | ||
| 749 | lea -1($num),$j # j=num/4-1 | ||
| 750 | jmp .Lsub4x | ||
| 751 | .align 16 | ||
| 752 | .Lsub4x: | ||
| 753 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 754 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 755 | sbb 16($np,$i,8),@ri[2] | ||
| 756 | mov 32($ap,$i,8),@ri[0] # tp[i+1] | ||
| 757 | mov 40($ap,$i,8),@ri[1] | ||
| 758 | sbb 24($np,$i,8),@ri[3] | ||
| 759 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 760 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 761 | sbb 32($np,$i,8),@ri[0] | ||
| 762 | mov 48($ap,$i,8),@ri[2] | ||
| 763 | mov 56($ap,$i,8),@ri[3] | ||
| 764 | sbb 40($np,$i,8),@ri[1] | ||
| 765 | lea 4($i),$i # i++ | ||
| 766 | dec $j # doesnn't affect CF! | ||
| 767 | jnz .Lsub4x | ||
| 768 | |||
| 769 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 770 | mov 32($ap,$i,8),@ri[0] # load overflow bit | ||
| 771 | sbb 16($np,$i,8),@ri[2] | ||
| 772 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 773 | sbb 24($np,$i,8),@ri[3] | ||
| 774 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 775 | |||
| 776 | sbb \$0,@ri[0] # handle upmost overflow bit | ||
| 777 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 778 | xor $i,$i # i=0 | ||
| 779 | and @ri[0],$ap | ||
| 780 | not @ri[0] | ||
| 781 | mov $rp,$np | ||
| 782 | and @ri[0],$np | ||
| 783 | lea -1($num),$j | ||
| 784 | or $np,$ap # ap=borrow?tp:rp | ||
| 785 | |||
| 786 | movdqu ($ap),%xmm1 | ||
| 787 | movdqa %xmm0,(%rsp) | ||
| 788 | movdqu %xmm1,($rp) | ||
| 789 | jmp .Lcopy4x | ||
| 790 | .align 16 | ||
| 791 | .Lcopy4x: # copy or in-place refresh | ||
| 792 | movdqu 16($ap,$i),%xmm2 | ||
| 793 | movdqu 32($ap,$i),%xmm1 | ||
| 794 | movdqa %xmm0,16(%rsp,$i) | ||
| 795 | movdqu %xmm2,16($rp,$i) | ||
| 796 | movdqa %xmm0,32(%rsp,$i) | ||
| 797 | movdqu %xmm1,32($rp,$i) | ||
| 798 | lea 32($i),$i | ||
| 799 | dec $j | ||
| 800 | jnz .Lcopy4x | ||
| 801 | |||
| 802 | shl \$2,$num | ||
| 803 | movdqu 16($ap,$i),%xmm2 | ||
| 804 | movdqa %xmm0,16(%rsp,$i) | ||
| 805 | movdqu %xmm2,16($rp,$i) | ||
| 806 | ___ | ||
| 807 | } | ||
| 808 | $code.=<<___; | ||
| 809 | mov 8(%rsp,$num,8),%rsi # restore %rsp | ||
| 810 | mov \$1,%rax | ||
| 811 | ___ | ||
| 812 | $code.=<<___ if ($win64); | ||
| 813 | movaps (%rsi),%xmm6 | ||
| 814 | movaps 0x10(%rsi),%xmm7 | ||
| 815 | lea 0x28(%rsi),%rsi | ||
| 816 | ___ | ||
| 817 | $code.=<<___; | ||
| 818 | mov (%rsi),%r15 | ||
| 819 | mov 8(%rsi),%r14 | ||
| 820 | mov 16(%rsi),%r13 | ||
| 821 | mov 24(%rsi),%r12 | ||
| 822 | mov 32(%rsi),%rbp | ||
| 823 | mov 40(%rsi),%rbx | ||
| 824 | lea 48(%rsi),%rsp | ||
| 825 | .Lmul4x_epilogue: | ||
| 826 | ret | ||
| 827 | .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 | ||
| 828 | ___ | ||
| 829 | }}} | ||
| 830 | |||
| 831 | { | ||
| 832 | my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order | ||
| 833 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | ||
| 834 | my $out=$inp; | ||
| 835 | my $STRIDE=2**5*8; | ||
| 836 | my $N=$STRIDE/4; | ||
| 837 | |||
| 838 | $code.=<<___; | ||
| 839 | .globl bn_scatter5 | ||
| 840 | .type bn_scatter5,\@abi-omnipotent | ||
| 841 | .align 16 | ||
| 842 | bn_scatter5: | ||
| 843 | cmp \$0, $num | ||
| 844 | jz .Lscatter_epilogue | ||
| 845 | lea ($tbl,$idx,8),$tbl | ||
| 846 | .Lscatter: | ||
| 847 | mov ($inp),%rax | ||
| 848 | lea 8($inp),$inp | ||
| 849 | mov %rax,($tbl) | ||
| 850 | lea 32*8($tbl),$tbl | ||
| 851 | sub \$1,$num | ||
| 852 | jnz .Lscatter | ||
| 853 | .Lscatter_epilogue: | ||
| 854 | ret | ||
| 855 | .size bn_scatter5,.-bn_scatter5 | ||
| 856 | |||
| 857 | .globl bn_gather5 | ||
| 858 | .type bn_gather5,\@abi-omnipotent | ||
| 859 | .align 16 | ||
| 860 | bn_gather5: | ||
| 861 | ___ | ||
| 862 | $code.=<<___ if ($win64); | ||
| 863 | .LSEH_begin_bn_gather5: | ||
| 864 | # I can't trust assembler to use specific encoding:-( | ||
| 865 | .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp | ||
| 866 | .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) | ||
| 867 | .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) | ||
| 868 | ___ | ||
| 869 | $code.=<<___; | ||
| 870 | mov $idx,%r11 | ||
| 871 | shr \$`log($N/8)/log(2)`,$idx | ||
| 872 | and \$`$N/8-1`,%r11 | ||
| 873 | not $idx | ||
| 874 | lea .Lmagic_masks(%rip),%rax | ||
| 875 | and \$`2**5/($N/8)-1`,$idx # 5 is "window size" | ||
| 876 | lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line | ||
| 877 | movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which | ||
| 878 | movq 8(%rax,$idx,8),%xmm5 # cache line contains element | ||
| 879 | movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument | ||
| 880 | movq 24(%rax,$idx,8),%xmm7 | ||
| 881 | jmp .Lgather | ||
| 882 | .align 16 | ||
| 883 | .Lgather: | ||
| 884 | movq `0*$STRIDE/4-96`($tbl),%xmm0 | ||
| 885 | movq `1*$STRIDE/4-96`($tbl),%xmm1 | ||
| 886 | pand %xmm4,%xmm0 | ||
| 887 | movq `2*$STRIDE/4-96`($tbl),%xmm2 | ||
| 888 | pand %xmm5,%xmm1 | ||
| 889 | movq `3*$STRIDE/4-96`($tbl),%xmm3 | ||
| 890 | pand %xmm6,%xmm2 | ||
| 891 | por %xmm1,%xmm0 | ||
| 892 | pand %xmm7,%xmm3 | ||
| 893 | por %xmm2,%xmm0 | ||
| 894 | lea $STRIDE($tbl),$tbl | ||
| 895 | por %xmm3,%xmm0 | ||
| 896 | |||
| 897 | movq %xmm0,($out) # m0=bp[0] | ||
| 898 | lea 8($out),$out | ||
| 899 | sub \$1,$num | ||
| 900 | jnz .Lgather | ||
| 901 | ___ | ||
| 902 | $code.=<<___ if ($win64); | ||
| 903 | movaps %xmm6,(%rsp) | ||
| 904 | movaps %xmm7,0x10(%rsp) | ||
| 905 | lea 0x28(%rsp),%rsp | ||
| 906 | ___ | ||
| 907 | $code.=<<___; | ||
| 908 | ret | ||
| 909 | .LSEH_end_bn_gather5: | ||
| 910 | .size bn_gather5,.-bn_gather5 | ||
| 911 | ___ | ||
| 912 | } | ||
| 913 | $code.=<<___; | ||
| 914 | .align 64 | ||
| 915 | .Lmagic_masks: | ||
| 916 | .long 0,0, 0,0, 0,0, -1,-1 | ||
| 917 | .long 0,0, 0,0, 0,0, 0,0 | ||
| 918 | .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 919 | ___ | ||
| 920 | |||
| 921 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 922 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 923 | if ($win64) { | ||
| 924 | $rec="%rcx"; | ||
| 925 | $frame="%rdx"; | ||
| 926 | $context="%r8"; | ||
| 927 | $disp="%r9"; | ||
| 928 | |||
| 929 | $code.=<<___; | ||
| 930 | .extern __imp_RtlVirtualUnwind | ||
| 931 | .type mul_handler,\@abi-omnipotent | ||
| 932 | .align 16 | ||
| 933 | mul_handler: | ||
| 934 | push %rsi | ||
| 935 | push %rdi | ||
| 936 | push %rbx | ||
| 937 | push %rbp | ||
| 938 | push %r12 | ||
| 939 | push %r13 | ||
| 940 | push %r14 | ||
| 941 | push %r15 | ||
| 942 | pushfq | ||
| 943 | sub \$64,%rsp | ||
| 944 | |||
| 945 | mov 120($context),%rax # pull context->Rax | ||
| 946 | mov 248($context),%rbx # pull context->Rip | ||
| 947 | |||
| 948 | mov 8($disp),%rsi # disp->ImageBase | ||
| 949 | mov 56($disp),%r11 # disp->HandlerData | ||
| 950 | |||
| 951 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 952 | lea (%rsi,%r10),%r10 # end of prologue label | ||
| 953 | cmp %r10,%rbx # context->Rip<end of prologue label | ||
| 954 | jb .Lcommon_seh_tail | ||
| 955 | |||
| 956 | lea `40+48`(%rax),%rax | ||
| 957 | |||
| 958 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 959 | lea (%rsi,%r10),%r10 # end of alloca label | ||
| 960 | cmp %r10,%rbx # context->Rip<end of alloca label | ||
| 961 | jb .Lcommon_seh_tail | ||
| 962 | |||
| 963 | mov 152($context),%rax # pull context->Rsp | ||
| 964 | |||
| 965 | mov 8(%r11),%r10d # HandlerData[2] | ||
| 966 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 967 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 968 | jae .Lcommon_seh_tail | ||
| 969 | |||
| 970 | mov 192($context),%r10 # pull $num | ||
| 971 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | ||
| 972 | |||
| 973 | movaps (%rax),%xmm0 | ||
| 974 | movaps 16(%rax),%xmm1 | ||
| 975 | lea `40+48`(%rax),%rax | ||
| 976 | |||
| 977 | mov -8(%rax),%rbx | ||
| 978 | mov -16(%rax),%rbp | ||
| 979 | mov -24(%rax),%r12 | ||
| 980 | mov -32(%rax),%r13 | ||
| 981 | mov -40(%rax),%r14 | ||
| 982 | mov -48(%rax),%r15 | ||
| 983 | mov %rbx,144($context) # restore context->Rbx | ||
| 984 | mov %rbp,160($context) # restore context->Rbp | ||
| 985 | mov %r12,216($context) # restore context->R12 | ||
| 986 | mov %r13,224($context) # restore context->R13 | ||
| 987 | mov %r14,232($context) # restore context->R14 | ||
| 988 | mov %r15,240($context) # restore context->R15 | ||
| 989 | movups %xmm0,512($context) # restore context->Xmm6 | ||
| 990 | movups %xmm1,528($context) # restore context->Xmm7 | ||
| 991 | |||
| 992 | .Lcommon_seh_tail: | ||
| 993 | mov 8(%rax),%rdi | ||
| 994 | mov 16(%rax),%rsi | ||
| 995 | mov %rax,152($context) # restore context->Rsp | ||
| 996 | mov %rsi,168($context) # restore context->Rsi | ||
| 997 | mov %rdi,176($context) # restore context->Rdi | ||
| 998 | |||
| 999 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 1000 | mov $context,%rsi # context | ||
| 1001 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 1002 | .long 0xa548f3fc # cld; rep movsq | ||
| 1003 | |||
| 1004 | mov $disp,%rsi | ||
| 1005 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 1006 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 1007 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 1008 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 1009 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 1010 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 1011 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 1012 | mov %r10,32(%rsp) # arg5 | ||
| 1013 | mov %r11,40(%rsp) # arg6 | ||
| 1014 | mov %r12,48(%rsp) # arg7 | ||
| 1015 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 1016 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 1017 | |||
| 1018 | mov \$1,%eax # ExceptionContinueSearch | ||
| 1019 | add \$64,%rsp | ||
| 1020 | popfq | ||
| 1021 | pop %r15 | ||
| 1022 | pop %r14 | ||
| 1023 | pop %r13 | ||
| 1024 | pop %r12 | ||
| 1025 | pop %rbp | ||
| 1026 | pop %rbx | ||
| 1027 | pop %rdi | ||
| 1028 | pop %rsi | ||
| 1029 | ret | ||
| 1030 | .size mul_handler,.-mul_handler | ||
| 1031 | |||
| 1032 | .section .pdata | ||
| 1033 | .align 4 | ||
| 1034 | .rva .LSEH_begin_bn_mul_mont_gather5 | ||
| 1035 | .rva .LSEH_end_bn_mul_mont_gather5 | ||
| 1036 | .rva .LSEH_info_bn_mul_mont_gather5 | ||
| 1037 | |||
| 1038 | .rva .LSEH_begin_bn_mul4x_mont_gather5 | ||
| 1039 | .rva .LSEH_end_bn_mul4x_mont_gather5 | ||
| 1040 | .rva .LSEH_info_bn_mul4x_mont_gather5 | ||
| 1041 | |||
| 1042 | .rva .LSEH_begin_bn_gather5 | ||
| 1043 | .rva .LSEH_end_bn_gather5 | ||
| 1044 | .rva .LSEH_info_bn_gather5 | ||
| 1045 | |||
| 1046 | .section .xdata | ||
| 1047 | .align 8 | ||
| 1048 | .LSEH_info_bn_mul_mont_gather5: | ||
| 1049 | .byte 9,0,0,0 | ||
| 1050 | .rva mul_handler | ||
| 1051 | .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[] | ||
| 1052 | .align 8 | ||
| 1053 | .LSEH_info_bn_mul4x_mont_gather5: | ||
| 1054 | .byte 9,0,0,0 | ||
| 1055 | .rva mul_handler | ||
| 1056 | .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] | ||
| 1057 | .align 8 | ||
| 1058 | .LSEH_info_bn_gather5: | ||
| 1059 | .byte 0x01,0x0d,0x05,0x00 | ||
| 1060 | .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 | ||
| 1061 | .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 | ||
| 1062 | .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 | ||
| 1063 | .align 8 | ||
| 1064 | ___ | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 1068 | |||
| 1069 | print $code; | ||
| 1070 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86.pl b/src/lib/libcrypto/camellia/asm/cmll-x86.pl index 027302ac86..c314d62312 100644 --- a/src/lib/libcrypto/camellia/asm/cmll-x86.pl +++ b/src/lib/libcrypto/camellia/asm/cmll-x86.pl | |||
| @@ -723,11 +723,11 @@ my $bias=int(@T[0])?shift(@T):0; | |||
| 723 | &function_end("Camellia_Ekeygen"); | 723 | &function_end("Camellia_Ekeygen"); |
| 724 | 724 | ||
| 725 | if ($OPENSSL) { | 725 | if ($OPENSSL) { |
| 726 | # int Camellia_set_key ( | 726 | # int private_Camellia_set_key ( |
| 727 | # const unsigned char *userKey, | 727 | # const unsigned char *userKey, |
| 728 | # int bits, | 728 | # int bits, |
| 729 | # CAMELLIA_KEY *key) | 729 | # CAMELLIA_KEY *key) |
| 730 | &function_begin_B("Camellia_set_key"); | 730 | &function_begin_B("private_Camellia_set_key"); |
| 731 | &push ("ebx"); | 731 | &push ("ebx"); |
| 732 | &mov ("ecx",&wparam(0)); # pull arguments | 732 | &mov ("ecx",&wparam(0)); # pull arguments |
| 733 | &mov ("ebx",&wparam(1)); | 733 | &mov ("ebx",&wparam(1)); |
| @@ -760,7 +760,7 @@ if ($OPENSSL) { | |||
| 760 | &set_label("done",4); | 760 | &set_label("done",4); |
| 761 | &pop ("ebx"); | 761 | &pop ("ebx"); |
| 762 | &ret (); | 762 | &ret (); |
| 763 | &function_end_B("Camellia_set_key"); | 763 | &function_end_B("private_Camellia_set_key"); |
| 764 | } | 764 | } |
| 765 | 765 | ||
| 766 | @SBOX=( | 766 | @SBOX=( |
diff --git a/src/lib/libcrypto/camellia/camellia.h b/src/lib/libcrypto/camellia/camellia.h index cf0457dd97..67911e0adf 100644 --- a/src/lib/libcrypto/camellia/camellia.h +++ b/src/lib/libcrypto/camellia/camellia.h | |||
| @@ -88,6 +88,10 @@ struct camellia_key_st | |||
| 88 | }; | 88 | }; |
| 89 | typedef struct camellia_key_st CAMELLIA_KEY; | 89 | typedef struct camellia_key_st CAMELLIA_KEY; |
| 90 | 90 | ||
| 91 | #ifdef OPENSSL_FIPS | ||
| 92 | int private_Camellia_set_key(const unsigned char *userKey, const int bits, | ||
| 93 | CAMELLIA_KEY *key); | ||
| 94 | #endif | ||
| 91 | int Camellia_set_key(const unsigned char *userKey, const int bits, | 95 | int Camellia_set_key(const unsigned char *userKey, const int bits, |
| 92 | CAMELLIA_KEY *key); | 96 | CAMELLIA_KEY *key); |
| 93 | 97 | ||
diff --git a/src/lib/libcrypto/camellia/cmll_locl.h b/src/lib/libcrypto/camellia/cmll_locl.h index 4a4d880d16..246b6ce1d8 100644 --- a/src/lib/libcrypto/camellia/cmll_locl.h +++ b/src/lib/libcrypto/camellia/cmll_locl.h | |||
| @@ -71,7 +71,8 @@ | |||
| 71 | typedef unsigned int u32; | 71 | typedef unsigned int u32; |
| 72 | typedef unsigned char u8; | 72 | typedef unsigned char u8; |
| 73 | 73 | ||
| 74 | int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, KEY_TABLE_TYPE keyTable); | 74 | int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, |
| 75 | KEY_TABLE_TYPE keyTable); | ||
| 75 | void Camellia_EncryptBlock_Rounds(int grandRounds, const u8 plaintext[], | 76 | void Camellia_EncryptBlock_Rounds(int grandRounds, const u8 plaintext[], |
| 76 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); | 77 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); |
| 77 | void Camellia_DecryptBlock_Rounds(int grandRounds, const u8 ciphertext[], | 78 | void Camellia_DecryptBlock_Rounds(int grandRounds, const u8 ciphertext[], |
| @@ -80,4 +81,6 @@ void Camellia_EncryptBlock(int keyBitLength, const u8 plaintext[], | |||
| 80 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); | 81 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); |
| 81 | void Camellia_DecryptBlock(int keyBitLength, const u8 ciphertext[], | 82 | void Camellia_DecryptBlock(int keyBitLength, const u8 ciphertext[], |
| 82 | const KEY_TABLE_TYPE keyTable, u8 plaintext[]); | 83 | const KEY_TABLE_TYPE keyTable, u8 plaintext[]); |
| 84 | int private_Camellia_set_key(const unsigned char *userKey, const int bits, | ||
| 85 | CAMELLIA_KEY *key); | ||
| 83 | #endif /* #ifndef HEADER_CAMELLIA_LOCL_H */ | 86 | #endif /* #ifndef HEADER_CAMELLIA_LOCL_H */ |
diff --git a/src/lib/libcrypto/camellia/cmll_misc.c b/src/lib/libcrypto/camellia/cmll_misc.c index f44689124b..f44d48564c 100644 --- a/src/lib/libcrypto/camellia/cmll_misc.c +++ b/src/lib/libcrypto/camellia/cmll_misc.c | |||
| @@ -50,12 +50,13 @@ | |||
| 50 | */ | 50 | */ |
| 51 | 51 | ||
| 52 | #include <openssl/opensslv.h> | 52 | #include <openssl/opensslv.h> |
| 53 | #include <openssl/crypto.h> | ||
| 53 | #include <openssl/camellia.h> | 54 | #include <openssl/camellia.h> |
| 54 | #include "cmll_locl.h" | 55 | #include "cmll_locl.h" |
| 55 | 56 | ||
| 56 | const char CAMELLIA_version[]="CAMELLIA" OPENSSL_VERSION_PTEXT; | 57 | const char CAMELLIA_version[]="CAMELLIA" OPENSSL_VERSION_PTEXT; |
| 57 | 58 | ||
| 58 | int Camellia_set_key(const unsigned char *userKey, const int bits, | 59 | int private_Camellia_set_key(const unsigned char *userKey, const int bits, |
| 59 | CAMELLIA_KEY *key) | 60 | CAMELLIA_KEY *key) |
| 60 | { | 61 | { |
| 61 | if(!userKey || !key) | 62 | if(!userKey || !key) |
diff --git a/src/lib/libcrypto/cmac/cm_ameth.c b/src/lib/libcrypto/cmac/cm_ameth.c new file mode 100644 index 0000000000..0b8e5670b0 --- /dev/null +++ b/src/lib/libcrypto/cmac/cm_ameth.c | |||
| @@ -0,0 +1,97 @@ | |||
| 1 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
| 2 | * project 2010. | ||
| 3 | */ | ||
| 4 | /* ==================================================================== | ||
| 5 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 6 | * | ||
| 7 | * Redistribution and use in source and binary forms, with or without | ||
| 8 | * modification, are permitted provided that the following conditions | ||
| 9 | * are met: | ||
| 10 | * | ||
| 11 | * 1. Redistributions of source code must retain the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer. | ||
| 13 | * | ||
| 14 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 15 | * notice, this list of conditions and the following disclaimer in | ||
| 16 | * the documentation and/or other materials provided with the | ||
| 17 | * distribution. | ||
| 18 | * | ||
| 19 | * 3. All advertising materials mentioning features or use of this | ||
| 20 | * software must display the following acknowledgment: | ||
| 21 | * "This product includes software developed by the OpenSSL Project | ||
| 22 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 23 | * | ||
| 24 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 25 | * endorse or promote products derived from this software without | ||
| 26 | * prior written permission. For written permission, please contact | ||
| 27 | * licensing@OpenSSL.org. | ||
| 28 | * | ||
| 29 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 30 | * nor may "OpenSSL" appear in their names without prior written | ||
| 31 | * permission of the OpenSSL Project. | ||
| 32 | * | ||
| 33 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 34 | * acknowledgment: | ||
| 35 | * "This product includes software developed by the OpenSSL Project | ||
| 36 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 37 | * | ||
| 38 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 39 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 40 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 41 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 42 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 43 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 44 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 45 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 46 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 47 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 48 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 49 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 50 | * ==================================================================== | ||
| 51 | */ | ||
| 52 | |||
| 53 | #include <stdio.h> | ||
| 54 | #include "cryptlib.h" | ||
| 55 | #include <openssl/evp.h> | ||
| 56 | #include <openssl/cmac.h> | ||
| 57 | #include "asn1_locl.h" | ||
| 58 | |||
| 59 | /* CMAC "ASN1" method. This is just here to indicate the | ||
| 60 | * maximum CMAC output length and to free up a CMAC | ||
| 61 | * key. | ||
| 62 | */ | ||
| 63 | |||
/* pkey_size callback: report the maximum CMAC output length,
 * which is at most one cipher block (see file comment above). */
static int cmac_size(const EVP_PKEY *pkey)
	{
	return EVP_MAX_BLOCK_LENGTH;
	}
| 68 | |||
| 69 | static void cmac_key_free(EVP_PKEY *pkey) | ||
| 70 | { | ||
| 71 | CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr; | ||
| 72 | if (cmctx) | ||
| 73 | CMAC_CTX_free(cmctx); | ||
| 74 | } | ||
| 75 | |||
/* CMAC "ASN1" method table. CMAC keys have no encoded form, so nearly
 * every slot is unused: only the size callback (maximum MAC length)
 * and the key-free callback are provided.
 * NOTE(review): slot positions follow EVP_PKEY_ASN1_METHOD in
 * asn1_locl.h — confirm the zeroed slots against that header. */
const EVP_PKEY_ASN1_METHOD cmac_asn1_meth = 
	{
	EVP_PKEY_CMAC,
	EVP_PKEY_CMAC,
	0,

	"CMAC",
	"OpenSSL CMAC method",

	0,0,0,0,

	0,0,0,

	cmac_size,	/* maximum MAC output length */
	0,
	0,0,0,0,0,0,0,

	cmac_key_free,	/* frees the stored CMAC_CTX */
	0,
	0,0
	};
| 97 | |||
diff --git a/src/lib/libcrypto/cmac/cm_pmeth.c b/src/lib/libcrypto/cmac/cm_pmeth.c new file mode 100644 index 0000000000..072228ec7f --- /dev/null +++ b/src/lib/libcrypto/cmac/cm_pmeth.c | |||
| @@ -0,0 +1,224 @@ | |||
| 1 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
| 2 | * project 2010. | ||
| 3 | */ | ||
| 4 | /* ==================================================================== | ||
| 5 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 6 | * | ||
| 7 | * Redistribution and use in source and binary forms, with or without | ||
| 8 | * modification, are permitted provided that the following conditions | ||
| 9 | * are met: | ||
| 10 | * | ||
| 11 | * 1. Redistributions of source code must retain the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer. | ||
| 13 | * | ||
| 14 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 15 | * notice, this list of conditions and the following disclaimer in | ||
| 16 | * the documentation and/or other materials provided with the | ||
| 17 | * distribution. | ||
| 18 | * | ||
| 19 | * 3. All advertising materials mentioning features or use of this | ||
| 20 | * software must display the following acknowledgment: | ||
| 21 | * "This product includes software developed by the OpenSSL Project | ||
| 22 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 23 | * | ||
| 24 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 25 | * endorse or promote products derived from this software without | ||
| 26 | * prior written permission. For written permission, please contact | ||
| 27 | * licensing@OpenSSL.org. | ||
| 28 | * | ||
| 29 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 30 | * nor may "OpenSSL" appear in their names without prior written | ||
| 31 | * permission of the OpenSSL Project. | ||
| 32 | * | ||
| 33 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 34 | * acknowledgment: | ||
| 35 | * "This product includes software developed by the OpenSSL Project | ||
| 36 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 37 | * | ||
| 38 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 39 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 40 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 41 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 42 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 43 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 44 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 45 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 46 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 47 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 48 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 49 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 50 | * ==================================================================== | ||
| 51 | */ | ||
| 52 | |||
| 53 | #include <stdio.h> | ||
| 54 | #include "cryptlib.h" | ||
| 55 | #include <openssl/x509.h> | ||
| 56 | #include <openssl/x509v3.h> | ||
| 57 | #include <openssl/evp.h> | ||
| 58 | #include <openssl/cmac.h> | ||
| 59 | #include "evp_locl.h" | ||
| 60 | |||
| 61 | /* The context structure and "key" is simply a CMAC_CTX */ | ||
| 62 | |||
| 63 | static int pkey_cmac_init(EVP_PKEY_CTX *ctx) | ||
| 64 | { | ||
| 65 | ctx->data = CMAC_CTX_new(); | ||
| 66 | if (!ctx->data) | ||
| 67 | return 0; | ||
| 68 | ctx->keygen_info_count = 0; | ||
| 69 | return 1; | ||
| 70 | } | ||
| 71 | |||
| 72 | static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) | ||
| 73 | { | ||
| 74 | if (!pkey_cmac_init(dst)) | ||
| 75 | return 0; | ||
| 76 | if (!CMAC_CTX_copy(dst->data, src->data)) | ||
| 77 | return 0; | ||
| 78 | return 1; | ||
| 79 | } | ||
| 80 | |||
/* Free the CMAC_CTX attached to this EVP_PKEY_CTX by pkey_cmac_init(). */
static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx)
	{
	CMAC_CTX_free(ctx->data);
	}
| 85 | |||
| 86 | static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) | ||
| 87 | { | ||
| 88 | CMAC_CTX *cmkey = CMAC_CTX_new(); | ||
| 89 | CMAC_CTX *cmctx = ctx->data; | ||
| 90 | if (!cmkey) | ||
| 91 | return 0; | ||
| 92 | if (!CMAC_CTX_copy(cmkey, cmctx)) | ||
| 93 | { | ||
| 94 | CMAC_CTX_free(cmkey); | ||
| 95 | return 0; | ||
| 96 | } | ||
| 97 | EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey); | ||
| 98 | |||
| 99 | return 1; | ||
| 100 | } | ||
| 101 | |||
| 102 | static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) | ||
| 103 | { | ||
| 104 | if (!CMAC_Update(ctx->pctx->data, data, count)) | ||
| 105 | return 0; | ||
| 106 | return 1; | ||
| 107 | } | ||
| 108 | |||
| 109 | static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx) | ||
| 110 | { | ||
| 111 | EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT); | ||
| 112 | mctx->update = int_update; | ||
| 113 | return 1; | ||
| 114 | } | ||
| 115 | |||
| 116 | static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | ||
| 117 | EVP_MD_CTX *mctx) | ||
| 118 | { | ||
| 119 | return CMAC_Final(ctx->data, sig, siglen); | ||
| 120 | } | ||
| 121 | |||
/* Ctrl dispatcher for the CMAC pkey method.
 * Returns 1 on success, 0 on error, -2 for an unsupported ctrl. */
static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
	{
	CMAC_CTX *cmctx = ctx->data;
	switch (type)
		{

		/* p2 is the raw key, p1 its length in bytes */
		case EVP_PKEY_CTRL_SET_MAC_KEY:
		if (!p2 || p1 < 0)
			return 0;
		if (!CMAC_Init(cmctx, p2, p1, NULL, NULL))
			return 0;
		break;

		/* p2 is the EVP_CIPHER to MAC with; the key comes separately */
		case EVP_PKEY_CTRL_CIPHER:
		if (!CMAC_Init(cmctx, NULL, 0, p2, ctx->engine))
			return 0;
		break;

		/* Pull keyed state from the EVP_PKEY (if set), then restart
		 * the MAC ready for new data (all-NULL CMAC_Init = restart) */
		case EVP_PKEY_CTRL_MD:
		if (ctx->pkey && !CMAC_CTX_copy(ctx->data,
					(CMAC_CTX *)ctx->pkey->pkey.ptr))
			return 0;
		if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL))
			return 0;
		break;

		default:
		return -2;

		}
	return 1;
	}
| 154 | |||
| 155 | static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx, | ||
| 156 | const char *type, const char *value) | ||
| 157 | { | ||
| 158 | if (!value) | ||
| 159 | { | ||
| 160 | return 0; | ||
| 161 | } | ||
| 162 | if (!strcmp(type, "key")) | ||
| 163 | { | ||
| 164 | void *p = (void *)value; | ||
| 165 | return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, | ||
| 166 | strlen(p), p); | ||
| 167 | } | ||
| 168 | if (!strcmp(type, "cipher")) | ||
| 169 | { | ||
| 170 | const EVP_CIPHER *c; | ||
| 171 | c = EVP_get_cipherbyname(value); | ||
| 172 | if (!c) | ||
| 173 | return 0; | ||
| 174 | return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c); | ||
| 175 | } | ||
| 176 | if (!strcmp(type, "hexkey")) | ||
| 177 | { | ||
| 178 | unsigned char *key; | ||
| 179 | int r; | ||
| 180 | long keylen; | ||
| 181 | key = string_to_hex(value, &keylen); | ||
| 182 | if (!key) | ||
| 183 | return 0; | ||
| 184 | r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key); | ||
| 185 | OPENSSL_free(key); | ||
| 186 | return r; | ||
| 187 | } | ||
| 188 | return -2; | ||
| 189 | } | ||
| 190 | |||
/* CMAC EVP_PKEY method table. EVP_PKEY_FLAG_SIGCTX_CUSTOM routes all
 * signing data through the signctx callbacks below instead of a
 * conventional digest. Zeroed slots are operations CMAC does not
 * support (sign/verify/encrypt/decrypt/derive and their inits).
 * NOTE(review): slot order follows EVP_PKEY_METHOD in evp_locl.h —
 * confirm positions against that header. */
const EVP_PKEY_METHOD cmac_pkey_meth = 
	{
	EVP_PKEY_CMAC,
	EVP_PKEY_FLAG_SIGCTX_CUSTOM,
	pkey_cmac_init,
	pkey_cmac_copy,
	pkey_cmac_cleanup,

	0, 0,

	0,
	pkey_cmac_keygen,

	0, 0,

	0, 0,

	0,0,

	cmac_signctx_init,
	cmac_signctx,

	0,0,

	0,0,

	0,0,

	0,0,

	pkey_cmac_ctrl,
	pkey_cmac_ctrl_str

	};
diff --git a/src/lib/libcrypto/cmac/cmac.c b/src/lib/libcrypto/cmac/cmac.c new file mode 100644 index 0000000000..8b72b09681 --- /dev/null +++ b/src/lib/libcrypto/cmac/cmac.c | |||
| @@ -0,0 +1,308 @@ | |||
| 1 | /* crypto/cmac/cmac.c */ | ||
| 2 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
| 3 | * project. | ||
| 4 | */ | ||
| 5 | /* ==================================================================== | ||
| 6 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 7 | * | ||
| 8 | * Redistribution and use in source and binary forms, with or without | ||
| 9 | * modification, are permitted provided that the following conditions | ||
| 10 | * are met: | ||
| 11 | * | ||
| 12 | * 1. Redistributions of source code must retain the above copyright | ||
| 13 | * notice, this list of conditions and the following disclaimer. | ||
| 14 | * | ||
| 15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 16 | * notice, this list of conditions and the following disclaimer in | ||
| 17 | * the documentation and/or other materials provided with the | ||
| 18 | * distribution. | ||
| 19 | * | ||
| 20 | * 3. All advertising materials mentioning features or use of this | ||
| 21 | * software must display the following acknowledgment: | ||
| 22 | * "This product includes software developed by the OpenSSL Project | ||
| 23 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 24 | * | ||
| 25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 26 | * endorse or promote products derived from this software without | ||
| 27 | * prior written permission. For written permission, please contact | ||
| 28 | * licensing@OpenSSL.org. | ||
| 29 | * | ||
| 30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 31 | * nor may "OpenSSL" appear in their names without prior written | ||
| 32 | * permission of the OpenSSL Project. | ||
| 33 | * | ||
| 34 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 35 | * acknowledgment: | ||
| 36 | * "This product includes software developed by the OpenSSL Project | ||
| 37 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 38 | * | ||
| 39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 51 | * ==================================================================== | ||
| 52 | */ | ||
| 53 | |||
| 54 | #include <stdio.h> | ||
| 55 | #include <stdlib.h> | ||
| 56 | #include <string.h> | ||
| 57 | #include "cryptlib.h" | ||
| 58 | #include <openssl/cmac.h> | ||
| 59 | |||
| 60 | #ifdef OPENSSL_FIPS | ||
| 61 | #include <openssl/fips.h> | ||
| 62 | #endif | ||
| 63 | |||
struct CMAC_CTX_st
	{
	/* Cipher context used for the CBC-MAC chain */
	EVP_CIPHER_CTX cctx;
	/* Subkeys: k1 masks a complete final block, k2 a padded one
	 * (see CMAC_Final) */
	unsigned char k1[EVP_MAX_BLOCK_LENGTH];
	unsigned char k2[EVP_MAX_BLOCK_LENGTH];
	/* Temporary block: holds the last fully encrypted block, which
	 * also serves as the resume IV (see CMAC_resume) */
	unsigned char tbl[EVP_MAX_BLOCK_LENGTH];
	/* Last (possibly partial) input block, held back because the
	 * final block gets special treatment in CMAC_Final */
	unsigned char last_block[EVP_MAX_BLOCK_LENGTH];
	/* Number of bytes in last block: -1 means context not initialised */
	int nlast_block;
	};
| 78 | |||
| 79 | |||
/* Make temporary keys K1 and K2 */

/*
 * Derive a CMAC subkey per NIST SP 800-38B: shift the bl-byte input
 * block l left by one bit; if the input's most significant bit was
 * set, XOR the last byte with the field constant Rb (0x87 for 128-bit
 * blocks, 0x1b for 64-bit blocks).
 *
 * Rewritten to run in constant time: the original branched on
 * key-derived bits (l[i+1] & 0x80 and l[0] & 0x80), leaking subkey
 * material through timing. Here the carry and the Rb fix-up are
 * folded into data-independent mask arithmetic.
 */
static void make_kn(unsigned char *k1, unsigned char *l, int bl)
	{
	int i;
	unsigned char c = l[0], carry = c >> 7, cnext;

	/* Shift block left, pulling each byte's carry from its successor */
	for (i = 0; i < bl - 1; i++, c = cnext)
		k1[i] = (c << 1) | ((cnext = l[i + 1]) >> 7);

	/* (0 - carry) is 0x00 or 0xFF..: applies Rb only when MSB was set */
	k1[i] = (c << 1) ^ ((0 - carry) & (bl == 16 ? 0x87 : 0x1b));
	}
| 96 | |||
| 97 | CMAC_CTX *CMAC_CTX_new(void) | ||
| 98 | { | ||
| 99 | CMAC_CTX *ctx; | ||
| 100 | ctx = OPENSSL_malloc(sizeof(CMAC_CTX)); | ||
| 101 | if (!ctx) | ||
| 102 | return NULL; | ||
| 103 | EVP_CIPHER_CTX_init(&ctx->cctx); | ||
| 104 | ctx->nlast_block = -1; | ||
| 105 | return ctx; | ||
| 106 | } | ||
| 107 | |||
| 108 | void CMAC_CTX_cleanup(CMAC_CTX *ctx) | ||
| 109 | { | ||
| 110 | #ifdef OPENSSL_FIPS | ||
| 111 | if (FIPS_mode() && !ctx->cctx.engine) | ||
| 112 | { | ||
| 113 | FIPS_cmac_ctx_cleanup(ctx); | ||
| 114 | return; | ||
| 115 | } | ||
| 116 | #endif | ||
| 117 | EVP_CIPHER_CTX_cleanup(&ctx->cctx); | ||
| 118 | OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH); | ||
| 119 | OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH); | ||
| 120 | OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH); | ||
| 121 | OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH); | ||
| 122 | ctx->nlast_block = -1; | ||
| 123 | } | ||
| 124 | |||
/* Return the embedded cipher context. The pointer refers to a member
 * of ctx, so the caller must not free it and it is only valid for the
 * lifetime of ctx. */
EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx)
	{
	return &ctx->cctx;
	}
| 129 | |||
| 130 | void CMAC_CTX_free(CMAC_CTX *ctx) | ||
| 131 | { | ||
| 132 | CMAC_CTX_cleanup(ctx); | ||
| 133 | OPENSSL_free(ctx); | ||
| 134 | } | ||
| 135 | |||
/* Duplicate a keyed CMAC context: cipher state, subkeys and any
 * buffered partial block. Fails (returns 0) if "in" was never keyed. */
int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in)
	{
	int bl;
	if (in->nlast_block == -1)
		return 0;
	if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx))
		return 0;
	bl = EVP_CIPHER_CTX_block_size(&in->cctx);
	/* Only the first block-size bytes of each buffer are live */
	memcpy(out->k1, in->k1, bl);
	memcpy(out->k2, in->k2, bl);
	memcpy(out->tbl, in->tbl, bl);
	memcpy(out->last_block, in->last_block, bl);
	out->nlast_block = in->nlast_block;
	return 1;
	}
| 151 | |||
/* Initialise, restart or partially configure a CMAC context:
 * - key and cipher set: full initialisation, derives subkeys K1/K2;
 * - everything NULL/0: restart the MAC with the existing key;
 * - cipher only: select the cipher now, key in a later call.
 * Returns 1 on success, 0 on error. */
int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
			const EVP_CIPHER *cipher, ENGINE *impl)
	{
	static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH];
#ifdef OPENSSL_FIPS
	if (FIPS_mode())
		{
		/* If we have an ENGINE need to allow non FIPS */
		if ((impl || ctx->cctx.engine)
			&& !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))

			{
			EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
			return 0;
			}
		/* Other algorithm blocking will be done in FIPS_cmac_init,
		 * via FIPS_cipherinit().
		 */
		if (!impl && !ctx->cctx.engine)
			return FIPS_cmac_init(ctx, key, keylen, cipher, NULL);
		}
#endif
	/* All zeros means restart */
	if (!key && !cipher && !impl && keylen == 0)
		{
		/* Not initialised */
		if (ctx->nlast_block == -1)
			return 0;
		/* Reset CBC chain to the zero IV and clear the resume block */
		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
			return 0;
		memset(ctx->tbl, 0, EVP_CIPHER_CTX_block_size(&ctx->cctx));
		ctx->nlast_block = 0;
		return 1;
		}
	/* Initialise context */
	if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL))
		return 0;
	/* Non-NULL key means initialisation complete */
	if (key)
		{
		int bl;
		if (!EVP_CIPHER_CTX_cipher(&ctx->cctx))
			return 0;
		if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen))
			return 0;
		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv))
			return 0;
		bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
		/* tbl = E(K, 0^bl): the basis block for subkey derivation */
		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl))
			return 0;
		make_kn(ctx->k1, ctx->tbl, bl);
		make_kn(ctx->k2, ctx->k1, bl);
		OPENSSL_cleanse(ctx->tbl, bl);
		/* Reset context again ready for first data block */
		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
			return 0;
		/* Zero tbl so resume works */
		memset(ctx->tbl, 0, bl);
		ctx->nlast_block = 0;
		}
	return 1;
	}
| 214 | |||
/* Absorb dlen bytes of message data. The final (possibly partial)
 * block is always withheld in last_block, since CMAC_Final must XOR
 * it with K1 or K2 before the last encryption. Returns 1 on success,
 * 0 if the context is unkeyed or a cipher call fails. */
int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen)
	{
	const unsigned char *data = in;
	size_t bl;
#ifdef OPENSSL_FIPS
	if (FIPS_mode() && !ctx->cctx.engine)
		return FIPS_cmac_update(ctx, in, dlen);
#endif
	if (ctx->nlast_block == -1)
		return 0;
	if (dlen == 0)
		return 1;
	bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
	/* Copy into partial block if we need to */
	if (ctx->nlast_block > 0)
		{
		size_t nleft;
		nleft = bl - ctx->nlast_block;
		if (dlen < nleft)
			nleft = dlen;
		memcpy(ctx->last_block + ctx->nlast_block, data, nleft);
		dlen -= nleft;
		ctx->nlast_block += nleft;
		/* If no more to process return */
		if (dlen == 0)
			return 1;
		data += nleft;
		/* Else not final block so encrypt it */
		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl))
			return 0;
		}
	/* Encrypt all but one of the complete blocks left */
	while(dlen > bl)
		{
		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl))
			return 0;
		dlen -= bl;
		data += bl;
		}
	/* Copy any data left to last block buffer (1..bl bytes remain,
	 * because the loop above stops while dlen > bl) */
	memcpy(ctx->last_block, data, dlen);
	ctx->nlast_block = dlen;
	return 1;

	}
| 260 | |||
/* Produce the MAC. A complete final block is XORed with K1; an
 * incomplete one is padded with 0x80 00... and XORed with K2, then
 * the result is encrypted in place. *poutlen is always set to the
 * block size; passing out == NULL just queries the length.
 * Returns 1 on success. */
int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen)
	{
	int i, bl, lb;
#ifdef OPENSSL_FIPS
	if (FIPS_mode() && !ctx->cctx.engine)
		return FIPS_cmac_final(ctx, out, poutlen);
#endif
	if (ctx->nlast_block == -1)
		return 0;
	bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
	*poutlen = (size_t)bl;
	if (!out)
		return 1;
	lb = ctx->nlast_block;
	/* Is last block complete? */
	if (lb == bl)
		{
		for (i = 0; i < bl; i++)
			out[i] = ctx->last_block[i] ^ ctx->k1[i];
		}
	else
		{
		/* 10...0 padding, then mask with K2 */
		ctx->last_block[lb] = 0x80;
		if (bl - lb > 1)
			memset(ctx->last_block + lb + 1, 0, bl - lb - 1);
		for (i = 0; i < bl; i++)
			out[i] = ctx->last_block[i] ^ ctx->k2[i];
		}
	if (!EVP_Cipher(&ctx->cctx, out, out, bl))
		{
		/* Don't leak a partially computed MAC on failure */
		OPENSSL_cleanse(out, bl);
		return 0;
		}
	return 1;
	}
| 296 | |||
/* Allow the MAC computation to continue after CMAC_Final() has been
 * called. Returns 1 on success, 0 if the context was never keyed. */
int CMAC_resume(CMAC_CTX *ctx)
	{
	if (ctx->nlast_block == -1)
		return 0;
	/* The buffer "tbl" contains the last fully encrypted block
	 * which is the last IV (or all zeroes if no last encrypted block).
	 * The last block has not been modified since CMAC_Final().
	 * So reinitialising using the last encrypted block will allow
	 * CMAC to continue after calling CMAC_Final().
	 */
	return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl);
	}
diff --git a/src/lib/libcrypto/cmac/cmac.h b/src/lib/libcrypto/cmac/cmac.h new file mode 100644 index 0000000000..712e92dced --- /dev/null +++ b/src/lib/libcrypto/cmac/cmac.h | |||
| @@ -0,0 +1,82 @@ | |||
| 1 | /* crypto/cmac/cmac.h */ | ||
| 2 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
| 3 | * project. | ||
| 4 | */ | ||
| 5 | /* ==================================================================== | ||
| 6 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 7 | * | ||
| 8 | * Redistribution and use in source and binary forms, with or without | ||
| 9 | * modification, are permitted provided that the following conditions | ||
| 10 | * are met: | ||
| 11 | * | ||
| 12 | * 1. Redistributions of source code must retain the above copyright | ||
| 13 | * notice, this list of conditions and the following disclaimer. | ||
| 14 | * | ||
| 15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 16 | * notice, this list of conditions and the following disclaimer in | ||
| 17 | * the documentation and/or other materials provided with the | ||
| 18 | * distribution. | ||
| 19 | * | ||
| 20 | * 3. All advertising materials mentioning features or use of this | ||
| 21 | * software must display the following acknowledgment: | ||
| 22 | * "This product includes software developed by the OpenSSL Project | ||
| 23 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 24 | * | ||
| 25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 26 | * endorse or promote products derived from this software without | ||
| 27 | * prior written permission. For written permission, please contact | ||
| 28 | * licensing@OpenSSL.org. | ||
| 29 | * | ||
| 30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 31 | * nor may "OpenSSL" appear in their names without prior written | ||
| 32 | * permission of the OpenSSL Project. | ||
| 33 | * | ||
| 34 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 35 | * acknowledgment: | ||
| 36 | * "This product includes software developed by the OpenSSL Project | ||
| 37 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 38 | * | ||
| 39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 51 | * ==================================================================== | ||
| 52 | */ | ||
| 53 | |||
| 54 | |||
#ifndef HEADER_CMAC_H
#define HEADER_CMAC_H

#ifdef __cplusplus
extern "C" {
#endif

#include <openssl/evp.h>

/* Opaque */
typedef struct CMAC_CTX_st CMAC_CTX;

/* Allocate a new, unkeyed CMAC context (NULL on malloc failure). */
CMAC_CTX *CMAC_CTX_new(void);
/* Erase all key material and return the context to the unkeyed state. */
void CMAC_CTX_cleanup(CMAC_CTX *ctx);
/* Cleanse and release a context allocated by CMAC_CTX_new(). */
void CMAC_CTX_free(CMAC_CTX *ctx);
/* Return the embedded cipher context; ownership stays with ctx. */
EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx);
/* Duplicate an initialised context; fails if "in" was never keyed. */
int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in);

/* Key the context (key + cipher), or restart it (all args NULL/0). */
int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
			const EVP_CIPHER *cipher, ENGINE *impl);
/* Absorb dlen bytes of message data. */
int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen);
/* Write the MAC (block-size bytes) to out; NULL out queries length. */
int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen);
/* Continue MAC computation after CMAC_Final(). */
int CMAC_resume(CMAC_CTX *ctx);

#ifdef __cplusplus
}
#endif
#endif
diff --git a/src/lib/libcrypto/cms/cms.h b/src/lib/libcrypto/cms/cms.h index 09c45d0412..36994fa6a2 100644 --- a/src/lib/libcrypto/cms/cms.h +++ b/src/lib/libcrypto/cms/cms.h | |||
| @@ -111,6 +111,7 @@ DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo) | |||
| 111 | #define CMS_PARTIAL 0x4000 | 111 | #define CMS_PARTIAL 0x4000 |
| 112 | #define CMS_REUSE_DIGEST 0x8000 | 112 | #define CMS_REUSE_DIGEST 0x8000 |
| 113 | #define CMS_USE_KEYID 0x10000 | 113 | #define CMS_USE_KEYID 0x10000 |
| 114 | #define CMS_DEBUG_DECRYPT 0x20000 | ||
| 114 | 115 | ||
| 115 | const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms); | 116 | const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms); |
| 116 | 117 | ||
| @@ -184,6 +185,8 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert); | |||
| 184 | int CMS_decrypt_set1_key(CMS_ContentInfo *cms, | 185 | int CMS_decrypt_set1_key(CMS_ContentInfo *cms, |
| 185 | unsigned char *key, size_t keylen, | 186 | unsigned char *key, size_t keylen, |
| 186 | unsigned char *id, size_t idlen); | 187 | unsigned char *id, size_t idlen); |
| 188 | int CMS_decrypt_set1_password(CMS_ContentInfo *cms, | ||
| 189 | unsigned char *pass, ossl_ssize_t passlen); | ||
| 187 | 190 | ||
| 188 | STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms); | 191 | STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms); |
| 189 | int CMS_RecipientInfo_type(CMS_RecipientInfo *ri); | 192 | int CMS_RecipientInfo_type(CMS_RecipientInfo *ri); |
| @@ -219,6 +222,16 @@ int CMS_RecipientInfo_set0_key(CMS_RecipientInfo *ri, | |||
| 219 | int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri, | 222 | int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri, |
| 220 | const unsigned char *id, size_t idlen); | 223 | const unsigned char *id, size_t idlen); |
| 221 | 224 | ||
| 225 | int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, | ||
| 226 | unsigned char *pass, | ||
| 227 | ossl_ssize_t passlen); | ||
| 228 | |||
| 229 | CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms, | ||
| 230 | int iter, int wrap_nid, int pbe_nid, | ||
| 231 | unsigned char *pass, | ||
| 232 | ossl_ssize_t passlen, | ||
| 233 | const EVP_CIPHER *kekciph); | ||
| 234 | |||
| 222 | int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri); | 235 | int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri); |
| 223 | 236 | ||
| 224 | int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out, | 237 | int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out, |
| @@ -330,6 +343,7 @@ void ERR_load_CMS_strings(void); | |||
| 330 | #define CMS_F_CHECK_CONTENT 99 | 343 | #define CMS_F_CHECK_CONTENT 99 |
| 331 | #define CMS_F_CMS_ADD0_CERT 164 | 344 | #define CMS_F_CMS_ADD0_CERT 164 |
| 332 | #define CMS_F_CMS_ADD0_RECIPIENT_KEY 100 | 345 | #define CMS_F_CMS_ADD0_RECIPIENT_KEY 100 |
| 346 | #define CMS_F_CMS_ADD0_RECIPIENT_PASSWORD 165 | ||
| 333 | #define CMS_F_CMS_ADD1_RECEIPTREQUEST 158 | 347 | #define CMS_F_CMS_ADD1_RECEIPTREQUEST 158 |
| 334 | #define CMS_F_CMS_ADD1_RECIPIENT_CERT 101 | 348 | #define CMS_F_CMS_ADD1_RECIPIENT_CERT 101 |
| 335 | #define CMS_F_CMS_ADD1_SIGNER 102 | 349 | #define CMS_F_CMS_ADD1_SIGNER 102 |
| @@ -344,6 +358,7 @@ void ERR_load_CMS_strings(void); | |||
| 344 | #define CMS_F_CMS_DATAINIT 111 | 358 | #define CMS_F_CMS_DATAINIT 111 |
| 345 | #define CMS_F_CMS_DECRYPT 112 | 359 | #define CMS_F_CMS_DECRYPT 112 |
| 346 | #define CMS_F_CMS_DECRYPT_SET1_KEY 113 | 360 | #define CMS_F_CMS_DECRYPT_SET1_KEY 113 |
| 361 | #define CMS_F_CMS_DECRYPT_SET1_PASSWORD 166 | ||
| 347 | #define CMS_F_CMS_DECRYPT_SET1_PKEY 114 | 362 | #define CMS_F_CMS_DECRYPT_SET1_PKEY 114 |
| 348 | #define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX 115 | 363 | #define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX 115 |
| 349 | #define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO 116 | 364 | #define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO 116 |
| @@ -378,7 +393,9 @@ void ERR_load_CMS_strings(void); | |||
| 378 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT 141 | 393 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT 141 |
| 379 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS 142 | 394 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS 142 |
| 380 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID 143 | 395 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID 143 |
| 396 | #define CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT 167 | ||
| 381 | #define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144 | 397 | #define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144 |
| 398 | #define CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD 168 | ||
| 382 | #define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145 | 399 | #define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145 |
| 383 | #define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146 | 400 | #define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146 |
| 384 | #define CMS_F_CMS_SET_DETACHED 147 | 401 | #define CMS_F_CMS_SET_DETACHED 147 |
| @@ -419,6 +436,7 @@ void ERR_load_CMS_strings(void); | |||
| 419 | #define CMS_R_ERROR_SETTING_KEY 115 | 436 | #define CMS_R_ERROR_SETTING_KEY 115 |
| 420 | #define CMS_R_ERROR_SETTING_RECIPIENTINFO 116 | 437 | #define CMS_R_ERROR_SETTING_RECIPIENTINFO 116 |
| 421 | #define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH 117 | 438 | #define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH 117 |
| 439 | #define CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER 176 | ||
| 422 | #define CMS_R_INVALID_KEY_LENGTH 118 | 440 | #define CMS_R_INVALID_KEY_LENGTH 118 |
| 423 | #define CMS_R_MD_BIO_INIT_ERROR 119 | 441 | #define CMS_R_MD_BIO_INIT_ERROR 119 |
| 424 | #define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH 120 | 442 | #define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH 120 |
| @@ -431,6 +449,7 @@ void ERR_load_CMS_strings(void); | |||
| 431 | #define CMS_R_NOT_ENCRYPTED_DATA 122 | 449 | #define CMS_R_NOT_ENCRYPTED_DATA 122 |
| 432 | #define CMS_R_NOT_KEK 123 | 450 | #define CMS_R_NOT_KEK 123 |
| 433 | #define CMS_R_NOT_KEY_TRANSPORT 124 | 451 | #define CMS_R_NOT_KEY_TRANSPORT 124 |
| 452 | #define CMS_R_NOT_PWRI 177 | ||
| 434 | #define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125 | 453 | #define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125 |
| 435 | #define CMS_R_NO_CIPHER 126 | 454 | #define CMS_R_NO_CIPHER 126 |
| 436 | #define CMS_R_NO_CONTENT 127 | 455 | #define CMS_R_NO_CONTENT 127 |
| @@ -443,6 +462,7 @@ void ERR_load_CMS_strings(void); | |||
| 443 | #define CMS_R_NO_MATCHING_RECIPIENT 132 | 462 | #define CMS_R_NO_MATCHING_RECIPIENT 132 |
| 444 | #define CMS_R_NO_MATCHING_SIGNATURE 166 | 463 | #define CMS_R_NO_MATCHING_SIGNATURE 166 |
| 445 | #define CMS_R_NO_MSGSIGDIGEST 167 | 464 | #define CMS_R_NO_MSGSIGDIGEST 167 |
| 465 | #define CMS_R_NO_PASSWORD 178 | ||
| 446 | #define CMS_R_NO_PRIVATE_KEY 133 | 466 | #define CMS_R_NO_PRIVATE_KEY 133 |
| 447 | #define CMS_R_NO_PUBLIC_KEY 134 | 467 | #define CMS_R_NO_PUBLIC_KEY 134 |
| 448 | #define CMS_R_NO_RECEIPT_REQUEST 168 | 468 | #define CMS_R_NO_RECEIPT_REQUEST 168 |
| @@ -466,10 +486,12 @@ void ERR_load_CMS_strings(void); | |||
| 466 | #define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM 151 | 486 | #define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM 151 |
| 467 | #define CMS_R_UNSUPPORTED_CONTENT_TYPE 152 | 487 | #define CMS_R_UNSUPPORTED_CONTENT_TYPE 152 |
| 468 | #define CMS_R_UNSUPPORTED_KEK_ALGORITHM 153 | 488 | #define CMS_R_UNSUPPORTED_KEK_ALGORITHM 153 |
| 489 | #define CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM 179 | ||
| 469 | #define CMS_R_UNSUPPORTED_RECIPIENT_TYPE 154 | 490 | #define CMS_R_UNSUPPORTED_RECIPIENT_TYPE 154 |
| 470 | #define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE 155 | 491 | #define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE 155 |
| 471 | #define CMS_R_UNSUPPORTED_TYPE 156 | 492 | #define CMS_R_UNSUPPORTED_TYPE 156 |
| 472 | #define CMS_R_UNWRAP_ERROR 157 | 493 | #define CMS_R_UNWRAP_ERROR 157 |
| 494 | #define CMS_R_UNWRAP_FAILURE 180 | ||
| 473 | #define CMS_R_VERIFICATION_FAILURE 158 | 495 | #define CMS_R_VERIFICATION_FAILURE 158 |
| 474 | #define CMS_R_WRAP_ERROR 159 | 496 | #define CMS_R_WRAP_ERROR 159 |
| 475 | 497 | ||
diff --git a/src/lib/libcrypto/cms/cms_asn1.c b/src/lib/libcrypto/cms/cms_asn1.c index fcba4dcbcc..cfe67fb6c1 100644 --- a/src/lib/libcrypto/cms/cms_asn1.c +++ b/src/lib/libcrypto/cms/cms_asn1.c | |||
| @@ -237,6 +237,15 @@ static int cms_ri_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it, | |||
| 237 | OPENSSL_free(kekri->key); | 237 | OPENSSL_free(kekri->key); |
| 238 | } | 238 | } |
| 239 | } | 239 | } |
| 240 | else if (ri->type == CMS_RECIPINFO_PASS) | ||
| 241 | { | ||
| 242 | CMS_PasswordRecipientInfo *pwri = ri->d.pwri; | ||
| 243 | if (pwri->pass) | ||
| 244 | { | ||
| 245 | OPENSSL_cleanse(pwri->pass, pwri->passlen); | ||
| 246 | OPENSSL_free(pwri->pass); | ||
| 247 | } | ||
| 248 | } | ||
| 240 | } | 249 | } |
| 241 | return 1; | 250 | return 1; |
| 242 | } | 251 | } |
diff --git a/src/lib/libcrypto/cms/cms_enc.c b/src/lib/libcrypto/cms/cms_enc.c index bab26235bd..f873ce3794 100644 --- a/src/lib/libcrypto/cms/cms_enc.c +++ b/src/lib/libcrypto/cms/cms_enc.c | |||
| @@ -73,6 +73,8 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) | |||
| 73 | const EVP_CIPHER *ciph; | 73 | const EVP_CIPHER *ciph; |
| 74 | X509_ALGOR *calg = ec->contentEncryptionAlgorithm; | 74 | X509_ALGOR *calg = ec->contentEncryptionAlgorithm; |
| 75 | unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL; | 75 | unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL; |
| 76 | unsigned char *tkey = NULL; | ||
| 77 | size_t tkeylen; | ||
| 76 | 78 | ||
| 77 | int ok = 0; | 79 | int ok = 0; |
| 78 | 80 | ||
| @@ -137,32 +139,57 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) | |||
| 137 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); | 139 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); |
| 138 | goto err; | 140 | goto err; |
| 139 | } | 141 | } |
| 140 | 142 | tkeylen = EVP_CIPHER_CTX_key_length(ctx); | |
| 141 | 143 | /* Generate random session key */ | |
| 142 | if (enc && !ec->key) | 144 | if (!enc || !ec->key) |
| 143 | { | 145 | { |
| 144 | /* Generate random key */ | 146 | tkey = OPENSSL_malloc(tkeylen); |
| 145 | if (!ec->keylen) | 147 | if (!tkey) |
| 146 | ec->keylen = EVP_CIPHER_CTX_key_length(ctx); | ||
| 147 | ec->key = OPENSSL_malloc(ec->keylen); | ||
| 148 | if (!ec->key) | ||
| 149 | { | 148 | { |
| 150 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, | 149 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, |
| 151 | ERR_R_MALLOC_FAILURE); | 150 | ERR_R_MALLOC_FAILURE); |
| 152 | goto err; | 151 | goto err; |
| 153 | } | 152 | } |
| 154 | if (EVP_CIPHER_CTX_rand_key(ctx, ec->key) <= 0) | 153 | if (EVP_CIPHER_CTX_rand_key(ctx, tkey) <= 0) |
| 155 | goto err; | 154 | goto err; |
| 156 | keep_key = 1; | ||
| 157 | } | 155 | } |
| 158 | else if (ec->keylen != (unsigned int)EVP_CIPHER_CTX_key_length(ctx)) | 156 | |
| 157 | if (!ec->key) | ||
| 158 | { | ||
| 159 | ec->key = tkey; | ||
| 160 | ec->keylen = tkeylen; | ||
| 161 | tkey = NULL; | ||
| 162 | if (enc) | ||
| 163 | keep_key = 1; | ||
| 164 | else | ||
| 165 | ERR_clear_error(); | ||
| 166 | |||
| 167 | } | ||
| 168 | |||
| 169 | if (ec->keylen != tkeylen) | ||
| 159 | { | 170 | { |
| 160 | /* If necessary set key length */ | 171 | /* If necessary set key length */ |
| 161 | if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0) | 172 | if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0) |
| 162 | { | 173 | { |
| 163 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, | 174 | /* Only reveal failure if debugging so we don't |
| 164 | CMS_R_INVALID_KEY_LENGTH); | 175 | * leak information which may be useful in MMA. |
| 165 | goto err; | 176 | */ |
| 177 | if (enc || ec->debug) | ||
| 178 | { | ||
| 179 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, | ||
| 180 | CMS_R_INVALID_KEY_LENGTH); | ||
| 181 | goto err; | ||
| 182 | } | ||
| 183 | else | ||
| 184 | { | ||
| 185 | /* Use random key */ | ||
| 186 | OPENSSL_cleanse(ec->key, ec->keylen); | ||
| 187 | OPENSSL_free(ec->key); | ||
| 188 | ec->key = tkey; | ||
| 189 | ec->keylen = tkeylen; | ||
| 190 | tkey = NULL; | ||
| 191 | ERR_clear_error(); | ||
| 192 | } | ||
| 166 | } | 193 | } |
| 167 | } | 194 | } |
| 168 | 195 | ||
| @@ -198,6 +225,11 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) | |||
| 198 | OPENSSL_free(ec->key); | 225 | OPENSSL_free(ec->key); |
| 199 | ec->key = NULL; | 226 | ec->key = NULL; |
| 200 | } | 227 | } |
| 228 | if (tkey) | ||
| 229 | { | ||
| 230 | OPENSSL_cleanse(tkey, tkeylen); | ||
| 231 | OPENSSL_free(tkey); | ||
| 232 | } | ||
| 201 | if (ok) | 233 | if (ok) |
| 202 | return b; | 234 | return b; |
| 203 | BIO_free(b); | 235 | BIO_free(b); |
diff --git a/src/lib/libcrypto/cms/cms_env.c b/src/lib/libcrypto/cms/cms_env.c index b3237d4b94..be20b1c024 100644 --- a/src/lib/libcrypto/cms/cms_env.c +++ b/src/lib/libcrypto/cms/cms_env.c | |||
| @@ -65,14 +65,13 @@ | |||
| 65 | /* CMS EnvelopedData Utilities */ | 65 | /* CMS EnvelopedData Utilities */ |
| 66 | 66 | ||
| 67 | DECLARE_ASN1_ITEM(CMS_EnvelopedData) | 67 | DECLARE_ASN1_ITEM(CMS_EnvelopedData) |
| 68 | DECLARE_ASN1_ITEM(CMS_RecipientInfo) | ||
| 69 | DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo) | 68 | DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo) |
| 70 | DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo) | 69 | DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo) |
| 71 | DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute) | 70 | DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute) |
| 72 | 71 | ||
| 73 | DECLARE_STACK_OF(CMS_RecipientInfo) | 72 | DECLARE_STACK_OF(CMS_RecipientInfo) |
| 74 | 73 | ||
| 75 | static CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms) | 74 | CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms) |
| 76 | { | 75 | { |
| 77 | if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped) | 76 | if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped) |
| 78 | { | 77 | { |
| @@ -371,6 +370,8 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms, | |||
| 371 | unsigned char *ek = NULL; | 370 | unsigned char *ek = NULL; |
| 372 | size_t eklen; | 371 | size_t eklen; |
| 373 | int ret = 0; | 372 | int ret = 0; |
| 373 | CMS_EncryptedContentInfo *ec; | ||
| 374 | ec = cms->d.envelopedData->encryptedContentInfo; | ||
| 374 | 375 | ||
| 375 | if (ktri->pkey == NULL) | 376 | if (ktri->pkey == NULL) |
| 376 | { | 377 | { |
| @@ -417,8 +418,14 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms, | |||
| 417 | 418 | ||
| 418 | ret = 1; | 419 | ret = 1; |
| 419 | 420 | ||
| 420 | cms->d.envelopedData->encryptedContentInfo->key = ek; | 421 | if (ec->key) |
| 421 | cms->d.envelopedData->encryptedContentInfo->keylen = eklen; | 422 | { |
| 423 | OPENSSL_cleanse(ec->key, ec->keylen); | ||
| 424 | OPENSSL_free(ec->key); | ||
| 425 | } | ||
| 426 | |||
| 427 | ec->key = ek; | ||
| 428 | ec->keylen = eklen; | ||
| 422 | 429 | ||
| 423 | err: | 430 | err: |
| 424 | if (pctx) | 431 | if (pctx) |
| @@ -786,6 +793,9 @@ int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri) | |||
| 786 | case CMS_RECIPINFO_KEK: | 793 | case CMS_RECIPINFO_KEK: |
| 787 | return cms_RecipientInfo_kekri_decrypt(cms, ri); | 794 | return cms_RecipientInfo_kekri_decrypt(cms, ri); |
| 788 | 795 | ||
| 796 | case CMS_RECIPINFO_PASS: | ||
| 797 | return cms_RecipientInfo_pwri_crypt(cms, ri, 0); | ||
| 798 | |||
| 789 | default: | 799 | default: |
| 790 | CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT, | 800 | CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT, |
| 791 | CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE); | 801 | CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE); |
| @@ -829,6 +839,10 @@ BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms) | |||
| 829 | r = cms_RecipientInfo_kekri_encrypt(cms, ri); | 839 | r = cms_RecipientInfo_kekri_encrypt(cms, ri); |
| 830 | break; | 840 | break; |
| 831 | 841 | ||
| 842 | case CMS_RECIPINFO_PASS: | ||
| 843 | r = cms_RecipientInfo_pwri_crypt(cms, ri, 1); | ||
| 844 | break; | ||
| 845 | |||
| 832 | default: | 846 | default: |
| 833 | CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO, | 847 | CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO, |
| 834 | CMS_R_UNSUPPORTED_RECIPIENT_TYPE); | 848 | CMS_R_UNSUPPORTED_RECIPIENT_TYPE); |
diff --git a/src/lib/libcrypto/cms/cms_err.c b/src/lib/libcrypto/cms/cms_err.c index ff7b0309e5..8330ead7ed 100644 --- a/src/lib/libcrypto/cms/cms_err.c +++ b/src/lib/libcrypto/cms/cms_err.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* crypto/cms/cms_err.c */ | 1 | /* crypto/cms/cms_err.c */ |
| 2 | /* ==================================================================== | 2 | /* ==================================================================== |
| 3 | * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
| @@ -73,6 +73,7 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
| 73 | {ERR_FUNC(CMS_F_CHECK_CONTENT), "CHECK_CONTENT"}, | 73 | {ERR_FUNC(CMS_F_CHECK_CONTENT), "CHECK_CONTENT"}, |
| 74 | {ERR_FUNC(CMS_F_CMS_ADD0_CERT), "CMS_add0_cert"}, | 74 | {ERR_FUNC(CMS_F_CMS_ADD0_CERT), "CMS_add0_cert"}, |
| 75 | {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY), "CMS_add0_recipient_key"}, | 75 | {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY), "CMS_add0_recipient_key"}, |
| 76 | {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD), "CMS_add0_recipient_password"}, | ||
| 76 | {ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST), "CMS_add1_ReceiptRequest"}, | 77 | {ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST), "CMS_add1_ReceiptRequest"}, |
| 77 | {ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT), "CMS_add1_recipient_cert"}, | 78 | {ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT), "CMS_add1_recipient_cert"}, |
| 78 | {ERR_FUNC(CMS_F_CMS_ADD1_SIGNER), "CMS_add1_signer"}, | 79 | {ERR_FUNC(CMS_F_CMS_ADD1_SIGNER), "CMS_add1_signer"}, |
| @@ -87,6 +88,7 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
| 87 | {ERR_FUNC(CMS_F_CMS_DATAINIT), "CMS_dataInit"}, | 88 | {ERR_FUNC(CMS_F_CMS_DATAINIT), "CMS_dataInit"}, |
| 88 | {ERR_FUNC(CMS_F_CMS_DECRYPT), "CMS_decrypt"}, | 89 | {ERR_FUNC(CMS_F_CMS_DECRYPT), "CMS_decrypt"}, |
| 89 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY), "CMS_decrypt_set1_key"}, | 90 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY), "CMS_decrypt_set1_key"}, |
| 91 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PASSWORD), "CMS_decrypt_set1_password"}, | ||
| 90 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY), "CMS_decrypt_set1_pkey"}, | 92 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY), "CMS_decrypt_set1_pkey"}, |
| 91 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX), "cms_DigestAlgorithm_find_ctx"}, | 93 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX), "cms_DigestAlgorithm_find_ctx"}, |
| 92 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO), "cms_DigestAlgorithm_init_bio"}, | 94 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO), "cms_DigestAlgorithm_init_bio"}, |
| @@ -105,7 +107,7 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
| 105 | {ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES), "CMS_GET0_CERTIFICATE_CHOICES"}, | 107 | {ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES), "CMS_GET0_CERTIFICATE_CHOICES"}, |
| 106 | {ERR_FUNC(CMS_F_CMS_GET0_CONTENT), "CMS_get0_content"}, | 108 | {ERR_FUNC(CMS_F_CMS_GET0_CONTENT), "CMS_get0_content"}, |
| 107 | {ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE), "CMS_GET0_ECONTENT_TYPE"}, | 109 | {ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE), "CMS_GET0_ECONTENT_TYPE"}, |
| 108 | {ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "CMS_GET0_ENVELOPED"}, | 110 | {ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "cms_get0_enveloped"}, |
| 109 | {ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES), "CMS_GET0_REVOCATION_CHOICES"}, | 111 | {ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES), "CMS_GET0_REVOCATION_CHOICES"}, |
| 110 | {ERR_FUNC(CMS_F_CMS_GET0_SIGNED), "CMS_GET0_SIGNED"}, | 112 | {ERR_FUNC(CMS_F_CMS_GET0_SIGNED), "CMS_GET0_SIGNED"}, |
| 111 | {ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1), "cms_msgSigDigest_add1"}, | 113 | {ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1), "cms_msgSigDigest_add1"}, |
| @@ -121,7 +123,9 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
| 121 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT), "CMS_RECIPIENTINFO_KTRI_ENCRYPT"}, | 123 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT), "CMS_RECIPIENTINFO_KTRI_ENCRYPT"}, |
| 122 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS), "CMS_RecipientInfo_ktri_get0_algs"}, | 124 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS), "CMS_RecipientInfo_ktri_get0_algs"}, |
| 123 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID), "CMS_RecipientInfo_ktri_get0_signer_id"}, | 125 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID), "CMS_RecipientInfo_ktri_get0_signer_id"}, |
| 126 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT), "cms_RecipientInfo_pwri_crypt"}, | ||
| 124 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY), "CMS_RecipientInfo_set0_key"}, | 127 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY), "CMS_RecipientInfo_set0_key"}, |
| 128 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD), "CMS_RecipientInfo_set0_password"}, | ||
| 125 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY), "CMS_RecipientInfo_set0_pkey"}, | 129 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY), "CMS_RecipientInfo_set0_pkey"}, |
| 126 | {ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"}, | 130 | {ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"}, |
| 127 | {ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"}, | 131 | {ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"}, |
| @@ -165,6 +169,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
| 165 | {ERR_REASON(CMS_R_ERROR_SETTING_KEY) ,"error setting key"}, | 169 | {ERR_REASON(CMS_R_ERROR_SETTING_KEY) ,"error setting key"}, |
| 166 | {ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"}, | 170 | {ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"}, |
| 167 | {ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"}, | 171 | {ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"}, |
| 172 | {ERR_REASON(CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER),"invalid key encryption parameter"}, | ||
| 168 | {ERR_REASON(CMS_R_INVALID_KEY_LENGTH) ,"invalid key length"}, | 173 | {ERR_REASON(CMS_R_INVALID_KEY_LENGTH) ,"invalid key length"}, |
| 169 | {ERR_REASON(CMS_R_MD_BIO_INIT_ERROR) ,"md bio init error"}, | 174 | {ERR_REASON(CMS_R_MD_BIO_INIT_ERROR) ,"md bio init error"}, |
| 170 | {ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"}, | 175 | {ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"}, |
| @@ -177,6 +182,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
| 177 | {ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA) ,"not encrypted data"}, | 182 | {ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA) ,"not encrypted data"}, |
| 178 | {ERR_REASON(CMS_R_NOT_KEK) ,"not kek"}, | 183 | {ERR_REASON(CMS_R_NOT_KEK) ,"not kek"}, |
| 179 | {ERR_REASON(CMS_R_NOT_KEY_TRANSPORT) ,"not key transport"}, | 184 | {ERR_REASON(CMS_R_NOT_KEY_TRANSPORT) ,"not key transport"}, |
| 185 | {ERR_REASON(CMS_R_NOT_PWRI) ,"not pwri"}, | ||
| 180 | {ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"}, | 186 | {ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"}, |
| 181 | {ERR_REASON(CMS_R_NO_CIPHER) ,"no cipher"}, | 187 | {ERR_REASON(CMS_R_NO_CIPHER) ,"no cipher"}, |
| 182 | {ERR_REASON(CMS_R_NO_CONTENT) ,"no content"}, | 188 | {ERR_REASON(CMS_R_NO_CONTENT) ,"no content"}, |
| @@ -189,6 +195,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
| 189 | {ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"}, | 195 | {ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"}, |
| 190 | {ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"}, | 196 | {ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"}, |
| 191 | {ERR_REASON(CMS_R_NO_MSGSIGDIGEST) ,"no msgsigdigest"}, | 197 | {ERR_REASON(CMS_R_NO_MSGSIGDIGEST) ,"no msgsigdigest"}, |
| 198 | {ERR_REASON(CMS_R_NO_PASSWORD) ,"no password"}, | ||
| 192 | {ERR_REASON(CMS_R_NO_PRIVATE_KEY) ,"no private key"}, | 199 | {ERR_REASON(CMS_R_NO_PRIVATE_KEY) ,"no private key"}, |
| 193 | {ERR_REASON(CMS_R_NO_PUBLIC_KEY) ,"no public key"}, | 200 | {ERR_REASON(CMS_R_NO_PUBLIC_KEY) ,"no public key"}, |
| 194 | {ERR_REASON(CMS_R_NO_RECEIPT_REQUEST) ,"no receipt request"}, | 201 | {ERR_REASON(CMS_R_NO_RECEIPT_REQUEST) ,"no receipt request"}, |
| @@ -212,10 +219,12 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
| 212 | {ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"}, | 219 | {ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"}, |
| 213 | {ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"}, | 220 | {ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"}, |
| 214 | {ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"}, | 221 | {ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"}, |
| 222 | {ERR_REASON(CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM),"unsupported key encryption algorithm"}, | ||
| 215 | {ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"}, | 223 | {ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"}, |
| 216 | {ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"}, | 224 | {ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"}, |
| 217 | {ERR_REASON(CMS_R_UNSUPPORTED_TYPE) ,"unsupported type"}, | 225 | {ERR_REASON(CMS_R_UNSUPPORTED_TYPE) ,"unsupported type"}, |
| 218 | {ERR_REASON(CMS_R_UNWRAP_ERROR) ,"unwrap error"}, | 226 | {ERR_REASON(CMS_R_UNWRAP_ERROR) ,"unwrap error"}, |
| 227 | {ERR_REASON(CMS_R_UNWRAP_FAILURE) ,"unwrap failure"}, | ||
| 219 | {ERR_REASON(CMS_R_VERIFICATION_FAILURE) ,"verification failure"}, | 228 | {ERR_REASON(CMS_R_VERIFICATION_FAILURE) ,"verification failure"}, |
| 220 | {ERR_REASON(CMS_R_WRAP_ERROR) ,"wrap error"}, | 229 | {ERR_REASON(CMS_R_WRAP_ERROR) ,"wrap error"}, |
| 221 | {0,NULL} | 230 | {0,NULL} |
diff --git a/src/lib/libcrypto/cms/cms_lcl.h b/src/lib/libcrypto/cms/cms_lcl.h index c8ecfa724a..a9f9730157 100644 --- a/src/lib/libcrypto/cms/cms_lcl.h +++ b/src/lib/libcrypto/cms/cms_lcl.h | |||
| @@ -175,6 +175,8 @@ struct CMS_EncryptedContentInfo_st | |||
| 175 | const EVP_CIPHER *cipher; | 175 | const EVP_CIPHER *cipher; |
| 176 | unsigned char *key; | 176 | unsigned char *key; |
| 177 | size_t keylen; | 177 | size_t keylen; |
| 178 | /* Set to 1 if we are debugging decrypt and don't fake keys for MMA */ | ||
| 179 | int debug; | ||
| 178 | }; | 180 | }; |
| 179 | 181 | ||
| 180 | struct CMS_RecipientInfo_st | 182 | struct CMS_RecipientInfo_st |
| @@ -273,6 +275,9 @@ struct CMS_PasswordRecipientInfo_st | |||
| 273 | X509_ALGOR *keyDerivationAlgorithm; | 275 | X509_ALGOR *keyDerivationAlgorithm; |
| 274 | X509_ALGOR *keyEncryptionAlgorithm; | 276 | X509_ALGOR *keyEncryptionAlgorithm; |
| 275 | ASN1_OCTET_STRING *encryptedKey; | 277 | ASN1_OCTET_STRING *encryptedKey; |
| 278 | /* Extra info: password to use */ | ||
| 279 | unsigned char *pass; | ||
| 280 | size_t passlen; | ||
| 276 | }; | 281 | }; |
| 277 | 282 | ||
| 278 | struct CMS_OtherRecipientInfo_st | 283 | struct CMS_OtherRecipientInfo_st |
| @@ -411,6 +416,8 @@ DECLARE_ASN1_ITEM(CMS_SignerInfo) | |||
| 411 | DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber) | 416 | DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber) |
| 412 | DECLARE_ASN1_ITEM(CMS_Attributes_Sign) | 417 | DECLARE_ASN1_ITEM(CMS_Attributes_Sign) |
| 413 | DECLARE_ASN1_ITEM(CMS_Attributes_Verify) | 418 | DECLARE_ASN1_ITEM(CMS_Attributes_Verify) |
| 419 | DECLARE_ASN1_ITEM(CMS_RecipientInfo) | ||
| 420 | DECLARE_ASN1_ITEM(CMS_PasswordRecipientInfo) | ||
| 414 | DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber) | 421 | DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber) |
| 415 | 422 | ||
| 416 | #define CMS_SIGNERINFO_ISSUER_SERIAL 0 | 423 | #define CMS_SIGNERINFO_ISSUER_SERIAL 0 |
| @@ -454,6 +461,11 @@ int cms_msgSigDigest_add1(CMS_SignerInfo *dest, CMS_SignerInfo *src); | |||
| 454 | ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si); | 461 | ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si); |
| 455 | 462 | ||
| 456 | BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms); | 463 | BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms); |
| 464 | CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms); | ||
| 465 | |||
| 466 | /* PWRI routines */ | ||
| 467 | int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri, | ||
| 468 | int en_de); | ||
| 457 | 469 | ||
| 458 | #ifdef __cplusplus | 470 | #ifdef __cplusplus |
| 459 | } | 471 | } |
diff --git a/src/lib/libcrypto/cms/cms_lib.c b/src/lib/libcrypto/cms/cms_lib.c index d00fe0f87b..f88e8f3b52 100644 --- a/src/lib/libcrypto/cms/cms_lib.c +++ b/src/lib/libcrypto/cms/cms_lib.c | |||
| @@ -412,8 +412,7 @@ int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain, | |||
| 412 | */ | 412 | */ |
| 413 | || EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid) | 413 | || EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid) |
| 414 | { | 414 | { |
| 415 | EVP_MD_CTX_copy_ex(mctx, mtmp); | 415 | return EVP_MD_CTX_copy_ex(mctx, mtmp); |
| 416 | return 1; | ||
| 417 | } | 416 | } |
| 418 | chain = BIO_next(chain); | 417 | chain = BIO_next(chain); |
| 419 | } | 418 | } |
diff --git a/src/lib/libcrypto/cms/cms_pwri.c b/src/lib/libcrypto/cms/cms_pwri.c new file mode 100644 index 0000000000..b79612a12d --- /dev/null +++ b/src/lib/libcrypto/cms/cms_pwri.c | |||
| @@ -0,0 +1,454 @@ | |||
| 1 | /* crypto/cms/cms_pwri.c */ | ||
| 2 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
| 3 | * project. | ||
| 4 | */ | ||
| 5 | /* ==================================================================== | ||
| 6 | * Copyright (c) 2009 The OpenSSL Project. All rights reserved. | ||
| 7 | * | ||
| 8 | * Redistribution and use in source and binary forms, with or without | ||
| 9 | * modification, are permitted provided that the following conditions | ||
| 10 | * are met: | ||
| 11 | * | ||
| 12 | * 1. Redistributions of source code must retain the above copyright | ||
| 13 | * notice, this list of conditions and the following disclaimer. | ||
| 14 | * | ||
| 15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 16 | * notice, this list of conditions and the following disclaimer in | ||
| 17 | * the documentation and/or other materials provided with the | ||
| 18 | * distribution. | ||
| 19 | * | ||
| 20 | * 3. All advertising materials mentioning features or use of this | ||
| 21 | * software must display the following acknowledgment: | ||
| 22 | * "This product includes software developed by the OpenSSL Project | ||
| 23 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 24 | * | ||
| 25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 26 | * endorse or promote products derived from this software without | ||
| 27 | * prior written permission. For written permission, please contact | ||
| 28 | * licensing@OpenSSL.org. | ||
| 29 | * | ||
| 30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 31 | * nor may "OpenSSL" appear in their names without prior written | ||
| 32 | * permission of the OpenSSL Project. | ||
| 33 | * | ||
| 34 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 35 | * acknowledgment: | ||
| 36 | * "This product includes software developed by the OpenSSL Project | ||
| 37 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 38 | * | ||
| 39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 51 | * ==================================================================== | ||
| 52 | */ | ||
| 53 | |||
| 54 | #include "cryptlib.h" | ||
| 55 | #include <openssl/asn1t.h> | ||
| 56 | #include <openssl/pem.h> | ||
| 57 | #include <openssl/x509v3.h> | ||
| 58 | #include <openssl/err.h> | ||
| 59 | #include <openssl/cms.h> | ||
| 60 | #include <openssl/rand.h> | ||
| 61 | #include <openssl/aes.h> | ||
| 62 | #include "cms_lcl.h" | ||
| 63 | #include "asn1_locl.h" | ||
| 64 | |||
| 65 | int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, | ||
| 66 | unsigned char *pass, ossl_ssize_t passlen) | ||
| 67 | { | ||
| 68 | CMS_PasswordRecipientInfo *pwri; | ||
| 69 | if (ri->type != CMS_RECIPINFO_PASS) | ||
| 70 | { | ||
| 71 | CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD, CMS_R_NOT_PWRI); | ||
| 72 | return 0; | ||
| 73 | } | ||
| 74 | |||
| 75 | pwri = ri->d.pwri; | ||
| 76 | pwri->pass = pass; | ||
| 77 | if (pass && passlen < 0) | ||
| 78 | passlen = strlen((char *)pass); | ||
| 79 | pwri->passlen = passlen; | ||
| 80 | return 1; | ||
| 81 | } | ||
| 82 | |||
| 83 | CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms, | ||
| 84 | int iter, int wrap_nid, int pbe_nid, | ||
| 85 | unsigned char *pass, | ||
| 86 | ossl_ssize_t passlen, | ||
| 87 | const EVP_CIPHER *kekciph) | ||
| 88 | { | ||
| 89 | CMS_RecipientInfo *ri = NULL; | ||
| 90 | CMS_EnvelopedData *env; | ||
| 91 | CMS_PasswordRecipientInfo *pwri; | ||
| 92 | EVP_CIPHER_CTX ctx; | ||
| 93 | X509_ALGOR *encalg = NULL; | ||
| 94 | unsigned char iv[EVP_MAX_IV_LENGTH]; | ||
| 95 | int ivlen; | ||
| 96 | env = cms_get0_enveloped(cms); | ||
| 97 | if (!env) | ||
| 98 | goto err; | ||
| 99 | |||
| 100 | if (wrap_nid <= 0) | ||
| 101 | wrap_nid = NID_id_alg_PWRI_KEK; | ||
| 102 | |||
| 103 | if (pbe_nid <= 0) | ||
| 104 | pbe_nid = NID_id_pbkdf2; | ||
| 105 | |||
| 106 | /* Get from enveloped data */ | ||
| 107 | if (kekciph == NULL) | ||
| 108 | kekciph = env->encryptedContentInfo->cipher; | ||
| 109 | |||
| 110 | if (kekciph == NULL) | ||
| 111 | { | ||
| 112 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, CMS_R_NO_CIPHER); | ||
| 113 | return NULL; | ||
| 114 | } | ||
| 115 | if (wrap_nid != NID_id_alg_PWRI_KEK) | ||
| 116 | { | ||
| 117 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
| 118 | CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM); | ||
| 119 | return NULL; | ||
| 120 | } | ||
| 121 | |||
| 122 | /* Setup algorithm identifier for cipher */ | ||
| 123 | encalg = X509_ALGOR_new(); | ||
| 124 | EVP_CIPHER_CTX_init(&ctx); | ||
| 125 | |||
| 126 | if (EVP_EncryptInit_ex(&ctx, kekciph, NULL, NULL, NULL) <= 0) | ||
| 127 | { | ||
| 128 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_EVP_LIB); | ||
| 129 | goto err; | ||
| 130 | } | ||
| 131 | |||
| 132 | ivlen = EVP_CIPHER_CTX_iv_length(&ctx); | ||
| 133 | |||
| 134 | if (ivlen > 0) | ||
| 135 | { | ||
| 136 | if (RAND_pseudo_bytes(iv, ivlen) <= 0) | ||
| 137 | goto err; | ||
| 138 | if (EVP_EncryptInit_ex(&ctx, NULL, NULL, NULL, iv) <= 0) | ||
| 139 | { | ||
| 140 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
| 141 | ERR_R_EVP_LIB); | ||
| 142 | goto err; | ||
| 143 | } | ||
| 144 | encalg->parameter = ASN1_TYPE_new(); | ||
| 145 | if (!encalg->parameter) | ||
| 146 | { | ||
| 147 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
| 148 | ERR_R_MALLOC_FAILURE); | ||
| 149 | goto err; | ||
| 150 | } | ||
| 151 | if (EVP_CIPHER_param_to_asn1(&ctx, encalg->parameter) <= 0) | ||
| 152 | { | ||
| 153 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
| 154 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); | ||
| 155 | goto err; | ||
| 156 | } | ||
| 157 | } | ||
| 158 | |||
| 159 | |||
| 160 | encalg->algorithm = OBJ_nid2obj(EVP_CIPHER_CTX_type(&ctx)); | ||
| 161 | |||
| 162 | EVP_CIPHER_CTX_cleanup(&ctx); | ||
| 163 | |||
| 164 | /* Initialize recipient info */ | ||
| 165 | ri = M_ASN1_new_of(CMS_RecipientInfo); | ||
| 166 | if (!ri) | ||
| 167 | goto merr; | ||
| 168 | |||
| 169 | ri->d.pwri = M_ASN1_new_of(CMS_PasswordRecipientInfo); | ||
| 170 | if (!ri->d.pwri) | ||
| 171 | goto merr; | ||
| 172 | ri->type = CMS_RECIPINFO_PASS; | ||
| 173 | |||
| 174 | pwri = ri->d.pwri; | ||
| 175 | /* Since this is overwritten, free up empty structure already there */ | ||
| 176 | X509_ALGOR_free(pwri->keyEncryptionAlgorithm); | ||
| 177 | pwri->keyEncryptionAlgorithm = X509_ALGOR_new(); | ||
| 178 | if (!pwri->keyEncryptionAlgorithm) | ||
| 179 | goto merr; | ||
| 180 | pwri->keyEncryptionAlgorithm->algorithm = OBJ_nid2obj(wrap_nid); | ||
| 181 | pwri->keyEncryptionAlgorithm->parameter = ASN1_TYPE_new(); | ||
| 182 | if (!pwri->keyEncryptionAlgorithm->parameter) | ||
| 183 | goto merr; | ||
| 184 | |||
| 185 | if(!ASN1_item_pack(encalg, ASN1_ITEM_rptr(X509_ALGOR), | ||
| 186 | &pwri->keyEncryptionAlgorithm->parameter->value.sequence)) | ||
| 187 | goto merr; | ||
| 188 | pwri->keyEncryptionAlgorithm->parameter->type = V_ASN1_SEQUENCE; | ||
| 189 | |||
| 190 | X509_ALGOR_free(encalg); | ||
| 191 | encalg = NULL; | ||
| 192 | |||
| 193 | /* Setup PBE algorithm */ | ||
| 194 | |||
| 195 | pwri->keyDerivationAlgorithm = PKCS5_pbkdf2_set(iter, NULL, 0, -1, -1); | ||
| 196 | |||
| 197 | if (!pwri->keyDerivationAlgorithm) | ||
| 198 | goto err; | ||
| 199 | |||
| 200 | CMS_RecipientInfo_set0_password(ri, pass, passlen); | ||
| 201 | pwri->version = 0; | ||
| 202 | |||
| 203 | if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri)) | ||
| 204 | goto merr; | ||
| 205 | |||
| 206 | return ri; | ||
| 207 | |||
| 208 | merr: | ||
| 209 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_MALLOC_FAILURE); | ||
| 210 | err: | ||
| 211 | EVP_CIPHER_CTX_cleanup(&ctx); | ||
| 212 | if (ri) | ||
| 213 | M_ASN1_free_of(ri, CMS_RecipientInfo); | ||
| 214 | if (encalg) | ||
| 215 | X509_ALGOR_free(encalg); | ||
| 216 | return NULL; | ||
| 217 | |||
| 218 | } | ||
| 219 | |||
| 220 | /* This is an implementation of the key wrapping mechanism in RFC3211, | ||
| 221 | * at some point this should go into EVP. | ||
| 222 | */ | ||
| 223 | |||
| 224 | static int kek_unwrap_key(unsigned char *out, size_t *outlen, | ||
| 225 | const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx) | ||
| 226 | { | ||
| 227 | size_t blocklen = EVP_CIPHER_CTX_block_size(ctx); | ||
| 228 | unsigned char *tmp; | ||
| 229 | int outl, rv = 0; | ||
| 230 | if (inlen < 2 * blocklen) | ||
| 231 | { | ||
| 232 | /* too small */ | ||
| 233 | return 0; | ||
| 234 | } | ||
| 235 | if (inlen % blocklen) | ||
| 236 | { | ||
| 237 | /* Invalid size */ | ||
| 238 | return 0; | ||
| 239 | } | ||
| 240 | tmp = OPENSSL_malloc(inlen); | ||
| 241 | /* setup IV by decrypting last two blocks */ | ||
| 242 | EVP_DecryptUpdate(ctx, tmp + inlen - 2 * blocklen, &outl, | ||
| 243 | in + inlen - 2 * blocklen, blocklen * 2); | ||
| 244 | /* Do a decrypt of last decrypted block to set IV to correct value | ||
| 245 | * output it to start of buffer so we don't corrupt decrypted block | ||
| 246 | * this works because buffer is at least two block lengths long. | ||
| 247 | */ | ||
| 248 | EVP_DecryptUpdate(ctx, tmp, &outl, | ||
| 249 | tmp + inlen - blocklen, blocklen); | ||
| 250 | /* Can now decrypt first n - 1 blocks */ | ||
| 251 | EVP_DecryptUpdate(ctx, tmp, &outl, in, inlen - blocklen); | ||
| 252 | |||
| 253 | /* Reset IV to original value */ | ||
| 254 | EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, NULL); | ||
| 255 | /* Decrypt again */ | ||
| 256 | EVP_DecryptUpdate(ctx, tmp, &outl, tmp, inlen); | ||
| 257 | /* Check check bytes */ | ||
| 258 | if (((tmp[1] ^ tmp[4]) & (tmp[2] ^ tmp[5]) & (tmp[3] ^ tmp[6])) != 0xff) | ||
| 259 | { | ||
| 260 | /* Check byte failure */ | ||
| 261 | goto err; | ||
| 262 | } | ||
| 263 | if (inlen < (size_t)(tmp[0] - 4 )) | ||
| 264 | { | ||
| 265 | /* Invalid length value */ | ||
| 266 | goto err; | ||
| 267 | } | ||
| 268 | *outlen = (size_t)tmp[0]; | ||
| 269 | memcpy(out, tmp + 4, *outlen); | ||
| 270 | rv = 1; | ||
| 271 | err: | ||
| 272 | OPENSSL_cleanse(tmp, inlen); | ||
| 273 | OPENSSL_free(tmp); | ||
| 274 | return rv; | ||
| 275 | |||
| 276 | } | ||
| 277 | |||
| 278 | static int kek_wrap_key(unsigned char *out, size_t *outlen, | ||
| 279 | const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx) | ||
| 280 | { | ||
| 281 | size_t blocklen = EVP_CIPHER_CTX_block_size(ctx); | ||
| 282 | size_t olen; | ||
| 283 | int dummy; | ||
| 284 | /* First decide length of output buffer: need header and round up to | ||
| 285 | * multiple of block length. | ||
| 286 | */ | ||
| 287 | olen = (inlen + 4 + blocklen - 1)/blocklen; | ||
| 288 | olen *= blocklen; | ||
| 289 | if (olen < 2 * blocklen) | ||
| 290 | { | ||
| 291 | /* Key too small */ | ||
| 292 | return 0; | ||
| 293 | } | ||
| 294 | if (inlen > 0xFF) | ||
| 295 | { | ||
| 296 | /* Key too large */ | ||
| 297 | return 0; | ||
| 298 | } | ||
| 299 | if (out) | ||
| 300 | { | ||
| 301 | /* Set header */ | ||
| 302 | out[0] = (unsigned char)inlen; | ||
| 303 | out[1] = in[0] ^ 0xFF; | ||
| 304 | out[2] = in[1] ^ 0xFF; | ||
| 305 | out[3] = in[2] ^ 0xFF; | ||
| 306 | memcpy(out + 4, in, inlen); | ||
| 307 | /* Add random padding to end */ | ||
| 308 | if (olen > inlen + 4) | ||
| 309 | RAND_pseudo_bytes(out + 4 + inlen, olen - 4 - inlen); | ||
| 310 | /* Encrypt twice */ | ||
| 311 | EVP_EncryptUpdate(ctx, out, &dummy, out, olen); | ||
| 312 | EVP_EncryptUpdate(ctx, out, &dummy, out, olen); | ||
| 313 | } | ||
| 314 | |||
| 315 | *outlen = olen; | ||
| 316 | |||
| 317 | return 1; | ||
| 318 | } | ||
| 319 | |||
| 320 | /* Encrypt/Decrypt content key in PWRI recipient info */ | ||
| 321 | |||
| 322 | int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri, | ||
| 323 | int en_de) | ||
| 324 | { | ||
| 325 | CMS_EncryptedContentInfo *ec; | ||
| 326 | CMS_PasswordRecipientInfo *pwri; | ||
| 327 | const unsigned char *p = NULL; | ||
| 328 | int plen; | ||
| 329 | int r = 0; | ||
| 330 | X509_ALGOR *algtmp, *kekalg = NULL; | ||
| 331 | EVP_CIPHER_CTX kekctx; | ||
| 332 | const EVP_CIPHER *kekcipher; | ||
| 333 | unsigned char *key = NULL; | ||
| 334 | size_t keylen; | ||
| 335 | |||
| 336 | ec = cms->d.envelopedData->encryptedContentInfo; | ||
| 337 | |||
| 338 | pwri = ri->d.pwri; | ||
| 339 | EVP_CIPHER_CTX_init(&kekctx); | ||
| 340 | |||
| 341 | if (!pwri->pass) | ||
| 342 | { | ||
| 343 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, CMS_R_NO_PASSWORD); | ||
| 344 | return 0; | ||
| 345 | } | ||
| 346 | algtmp = pwri->keyEncryptionAlgorithm; | ||
| 347 | |||
| 348 | if (!algtmp || OBJ_obj2nid(algtmp->algorithm) != NID_id_alg_PWRI_KEK) | ||
| 349 | { | ||
| 350 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 351 | CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM); | ||
| 352 | return 0; | ||
| 353 | } | ||
| 354 | |||
| 355 | if (algtmp->parameter->type == V_ASN1_SEQUENCE) | ||
| 356 | { | ||
| 357 | p = algtmp->parameter->value.sequence->data; | ||
| 358 | plen = algtmp->parameter->value.sequence->length; | ||
| 359 | kekalg = d2i_X509_ALGOR(NULL, &p, plen); | ||
| 360 | } | ||
| 361 | if (kekalg == NULL) | ||
| 362 | { | ||
| 363 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 364 | CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER); | ||
| 365 | return 0; | ||
| 366 | } | ||
| 367 | |||
| 368 | kekcipher = EVP_get_cipherbyobj(kekalg->algorithm); | ||
| 369 | |||
| 370 | if(!kekcipher) | ||
| 371 | { | ||
| 372 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 373 | CMS_R_UNKNOWN_CIPHER); | ||
| 374 | goto err; | ||
| 375 | } | ||
| 376 | |||
| 377 | /* Fixup cipher based on AlgorithmIdentifier to set IV etc */ | ||
| 378 | if (!EVP_CipherInit_ex(&kekctx, kekcipher, NULL, NULL, NULL, en_de)) | ||
| 379 | goto err; | ||
| 380 | EVP_CIPHER_CTX_set_padding(&kekctx, 0); | ||
| 381 | if(EVP_CIPHER_asn1_to_param(&kekctx, kekalg->parameter) < 0) | ||
| 382 | { | ||
| 383 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 384 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); | ||
| 385 | goto err; | ||
| 386 | } | ||
| 387 | |||
| 388 | algtmp = pwri->keyDerivationAlgorithm; | ||
| 389 | |||
| 390 | /* Finish password based key derivation to setup key in "ctx" */ | ||
| 391 | |||
| 392 | if (EVP_PBE_CipherInit(algtmp->algorithm, | ||
| 393 | (char *)pwri->pass, pwri->passlen, | ||
| 394 | algtmp->parameter, &kekctx, en_de) < 0) | ||
| 395 | { | ||
| 396 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, ERR_R_EVP_LIB); | ||
| 397 | goto err; | ||
| 398 | } | ||
| 399 | |||
| 400 | /* Finally wrap/unwrap the key */ | ||
| 401 | |||
| 402 | if (en_de) | ||
| 403 | { | ||
| 404 | |||
| 405 | if (!kek_wrap_key(NULL, &keylen, ec->key, ec->keylen, &kekctx)) | ||
| 406 | goto err; | ||
| 407 | |||
| 408 | key = OPENSSL_malloc(keylen); | ||
| 409 | |||
| 410 | if (!key) | ||
| 411 | goto err; | ||
| 412 | |||
| 413 | if (!kek_wrap_key(key, &keylen, ec->key, ec->keylen, &kekctx)) | ||
| 414 | goto err; | ||
| 415 | pwri->encryptedKey->data = key; | ||
| 416 | pwri->encryptedKey->length = keylen; | ||
| 417 | } | ||
| 418 | else | ||
| 419 | { | ||
| 420 | key = OPENSSL_malloc(pwri->encryptedKey->length); | ||
| 421 | |||
| 422 | if (!key) | ||
| 423 | { | ||
| 424 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 425 | ERR_R_MALLOC_FAILURE); | ||
| 426 | goto err; | ||
| 427 | } | ||
| 428 | if (!kek_unwrap_key(key, &keylen, | ||
| 429 | pwri->encryptedKey->data, | ||
| 430 | pwri->encryptedKey->length, &kekctx)) | ||
| 431 | { | ||
| 432 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
| 433 | CMS_R_UNWRAP_FAILURE); | ||
| 434 | goto err; | ||
| 435 | } | ||
| 436 | |||
| 437 | ec->key = key; | ||
| 438 | ec->keylen = keylen; | ||
| 439 | |||
| 440 | } | ||
| 441 | |||
| 442 | r = 1; | ||
| 443 | |||
| 444 | err: | ||
| 445 | |||
| 446 | EVP_CIPHER_CTX_cleanup(&kekctx); | ||
| 447 | |||
| 448 | if (!r && key) | ||
| 449 | OPENSSL_free(key); | ||
| 450 | X509_ALGOR_free(kekalg); | ||
| 451 | |||
| 452 | return r; | ||
| 453 | |||
| 454 | } | ||
diff --git a/src/lib/libcrypto/cms/cms_sd.c b/src/lib/libcrypto/cms/cms_sd.c index e3192b9c57..77fbd13596 100644 --- a/src/lib/libcrypto/cms/cms_sd.c +++ b/src/lib/libcrypto/cms/cms_sd.c | |||
| @@ -641,7 +641,8 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms, | |||
| 641 | cms->d.signedData->encapContentInfo->eContentType; | 641 | cms->d.signedData->encapContentInfo->eContentType; |
| 642 | unsigned char md[EVP_MAX_MD_SIZE]; | 642 | unsigned char md[EVP_MAX_MD_SIZE]; |
| 643 | unsigned int mdlen; | 643 | unsigned int mdlen; |
| 644 | EVP_DigestFinal_ex(&mctx, md, &mdlen); | 644 | if (!EVP_DigestFinal_ex(&mctx, md, &mdlen)) |
| 645 | goto err; | ||
| 645 | if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest, | 646 | if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest, |
| 646 | V_ASN1_OCTET_STRING, | 647 | V_ASN1_OCTET_STRING, |
| 647 | md, mdlen)) | 648 | md, mdlen)) |
diff --git a/src/lib/libcrypto/dh/dh_ameth.c b/src/lib/libcrypto/dh/dh_ameth.c index 377caf96c9..02ec2d47b4 100644 --- a/src/lib/libcrypto/dh/dh_ameth.c +++ b/src/lib/libcrypto/dh/dh_ameth.c | |||
| @@ -493,6 +493,7 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth = | |||
| 493 | dh_copy_parameters, | 493 | dh_copy_parameters, |
| 494 | dh_cmp_parameters, | 494 | dh_cmp_parameters, |
| 495 | dh_param_print, | 495 | dh_param_print, |
| 496 | 0, | ||
| 496 | 497 | ||
| 497 | int_dh_free, | 498 | int_dh_free, |
| 498 | 0 | 499 | 0 |
diff --git a/src/lib/libcrypto/dsa/dsa_ameth.c b/src/lib/libcrypto/dsa/dsa_ameth.c index 6413aae46e..376156ec5e 100644 --- a/src/lib/libcrypto/dsa/dsa_ameth.c +++ b/src/lib/libcrypto/dsa/dsa_ameth.c | |||
| @@ -542,6 +542,52 @@ static int old_dsa_priv_encode(const EVP_PKEY *pkey, unsigned char **pder) | |||
| 542 | return i2d_DSAPrivateKey(pkey->pkey.dsa, pder); | 542 | return i2d_DSAPrivateKey(pkey->pkey.dsa, pder); |
| 543 | } | 543 | } |
| 544 | 544 | ||
| 545 | static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg, | ||
| 546 | const ASN1_STRING *sig, | ||
| 547 | int indent, ASN1_PCTX *pctx) | ||
| 548 | { | ||
| 549 | DSA_SIG *dsa_sig; | ||
| 550 | const unsigned char *p; | ||
| 551 | if (!sig) | ||
| 552 | { | ||
| 553 | if (BIO_puts(bp, "\n") <= 0) | ||
| 554 | return 0; | ||
| 555 | else | ||
| 556 | return 1; | ||
| 557 | } | ||
| 558 | p = sig->data; | ||
| 559 | dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length); | ||
| 560 | if (dsa_sig) | ||
| 561 | { | ||
| 562 | int rv = 0; | ||
| 563 | size_t buf_len = 0; | ||
| 564 | unsigned char *m=NULL; | ||
| 565 | update_buflen(dsa_sig->r, &buf_len); | ||
| 566 | update_buflen(dsa_sig->s, &buf_len); | ||
| 567 | m = OPENSSL_malloc(buf_len+10); | ||
| 568 | if (m == NULL) | ||
| 569 | { | ||
| 570 | DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE); | ||
| 571 | goto err; | ||
| 572 | } | ||
| 573 | |||
| 574 | if (BIO_write(bp, "\n", 1) != 1) | ||
| 575 | goto err; | ||
| 576 | |||
| 577 | if (!ASN1_bn_print(bp,"r: ",dsa_sig->r,m,indent)) | ||
| 578 | goto err; | ||
| 579 | if (!ASN1_bn_print(bp,"s: ",dsa_sig->s,m,indent)) | ||
| 580 | goto err; | ||
| 581 | rv = 1; | ||
| 582 | err: | ||
| 583 | if (m) | ||
| 584 | OPENSSL_free(m); | ||
| 585 | DSA_SIG_free(dsa_sig); | ||
| 586 | return rv; | ||
| 587 | } | ||
| 588 | return X509_signature_dump(bp, sig, indent); | ||
| 589 | } | ||
| 590 | |||
| 545 | static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) | 591 | static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) |
| 546 | { | 592 | { |
| 547 | switch (op) | 593 | switch (op) |
| @@ -647,6 +693,7 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[] = | |||
| 647 | dsa_copy_parameters, | 693 | dsa_copy_parameters, |
| 648 | dsa_cmp_parameters, | 694 | dsa_cmp_parameters, |
| 649 | dsa_param_print, | 695 | dsa_param_print, |
| 696 | dsa_sig_print, | ||
| 650 | 697 | ||
| 651 | int_dsa_free, | 698 | int_dsa_free, |
| 652 | dsa_pkey_ctrl, | 699 | dsa_pkey_ctrl, |
diff --git a/src/lib/libcrypto/dsa/dsa_locl.h b/src/lib/libcrypto/dsa/dsa_locl.h index 2b8cfee3db..21e2e45242 100644 --- a/src/lib/libcrypto/dsa/dsa_locl.h +++ b/src/lib/libcrypto/dsa/dsa_locl.h | |||
| @@ -56,4 +56,5 @@ | |||
| 56 | 56 | ||
| 57 | int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, | 57 | int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, |
| 58 | const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len, | 58 | const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len, |
| 59 | unsigned char *seed_out, | ||
| 59 | int *counter_ret, unsigned long *h_ret, BN_GENCB *cb); | 60 | int *counter_ret, unsigned long *h_ret, BN_GENCB *cb); |
diff --git a/src/lib/libcrypto/dsa/dsa_pmeth.c b/src/lib/libcrypto/dsa/dsa_pmeth.c index e2df54fec6..715d8d675b 100644 --- a/src/lib/libcrypto/dsa/dsa_pmeth.c +++ b/src/lib/libcrypto/dsa/dsa_pmeth.c | |||
| @@ -189,7 +189,9 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
| 189 | EVP_MD_type((const EVP_MD *)p2) != NID_dsa && | 189 | EVP_MD_type((const EVP_MD *)p2) != NID_dsa && |
| 190 | EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA && | 190 | EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA && |
| 191 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && | 191 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && |
| 192 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256) | 192 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && |
| 193 | EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && | ||
| 194 | EVP_MD_type((const EVP_MD *)p2) != NID_sha512) | ||
| 193 | { | 195 | { |
| 194 | DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE); | 196 | DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE); |
| 195 | return 0; | 197 | return 0; |
| @@ -253,7 +255,7 @@ static int pkey_dsa_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) | |||
| 253 | if (!dsa) | 255 | if (!dsa) |
| 254 | return 0; | 256 | return 0; |
| 255 | ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd, | 257 | ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd, |
| 256 | NULL, 0, NULL, NULL, pcb); | 258 | NULL, 0, NULL, NULL, NULL, pcb); |
| 257 | if (ret) | 259 | if (ret) |
| 258 | EVP_PKEY_assign_DSA(pkey, dsa); | 260 | EVP_PKEY_assign_DSA(pkey, dsa); |
| 259 | else | 261 | else |
diff --git a/src/lib/libcrypto/ec/ec2_mult.c b/src/lib/libcrypto/ec/ec2_mult.c index e12b9b284a..26f4a783fc 100644 --- a/src/lib/libcrypto/ec/ec2_mult.c +++ b/src/lib/libcrypto/ec/ec2_mult.c | |||
| @@ -71,6 +71,8 @@ | |||
| 71 | 71 | ||
| 72 | #include "ec_lcl.h" | 72 | #include "ec_lcl.h" |
| 73 | 73 | ||
| 74 | #ifndef OPENSSL_NO_EC2M | ||
| 75 | |||
| 74 | 76 | ||
| 75 | /* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective | 77 | /* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective |
| 76 | * coordinates. | 78 | * coordinates. |
| @@ -384,3 +386,5 @@ int ec_GF2m_have_precompute_mult(const EC_GROUP *group) | |||
| 384 | { | 386 | { |
| 385 | return ec_wNAF_have_precompute_mult(group); | 387 | return ec_wNAF_have_precompute_mult(group); |
| 386 | } | 388 | } |
| 389 | |||
| 390 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ec2_oct.c b/src/lib/libcrypto/ec/ec2_oct.c new file mode 100644 index 0000000000..f1d75e5ddf --- /dev/null +++ b/src/lib/libcrypto/ec/ec2_oct.c | |||
| @@ -0,0 +1,407 @@ | |||
| 1 | /* crypto/ec/ec2_oct.c */ | ||
| 2 | /* ==================================================================== | ||
| 3 | * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. | ||
| 4 | * | ||
| 5 | * The Elliptic Curve Public-Key Crypto Library (ECC Code) included | ||
| 6 | * herein is developed by SUN MICROSYSTEMS, INC., and is contributed | ||
| 7 | * to the OpenSSL project. | ||
| 8 | * | ||
| 9 | * The ECC Code is licensed pursuant to the OpenSSL open source | ||
| 10 | * license provided below. | ||
| 11 | * | ||
| 12 | * The software is originally written by Sheueling Chang Shantz and | ||
| 13 | * Douglas Stebila of Sun Microsystems Laboratories. | ||
| 14 | * | ||
| 15 | */ | ||
| 16 | /* ==================================================================== | ||
| 17 | * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. | ||
| 18 | * | ||
| 19 | * Redistribution and use in source and binary forms, with or without | ||
| 20 | * modification, are permitted provided that the following conditions | ||
| 21 | * are met: | ||
| 22 | * | ||
| 23 | * 1. Redistributions of source code must retain the above copyright | ||
| 24 | * notice, this list of conditions and the following disclaimer. | ||
| 25 | * | ||
| 26 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer in | ||
| 28 | * the documentation and/or other materials provided with the | ||
| 29 | * distribution. | ||
| 30 | * | ||
| 31 | * 3. All advertising materials mentioning features or use of this | ||
| 32 | * software must display the following acknowledgment: | ||
| 33 | * "This product includes software developed by the OpenSSL Project | ||
| 34 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 35 | * | ||
| 36 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 37 | * endorse or promote products derived from this software without | ||
| 38 | * prior written permission. For written permission, please contact | ||
| 39 | * openssl-core@openssl.org. | ||
| 40 | * | ||
| 41 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 42 | * nor may "OpenSSL" appear in their names without prior written | ||
| 43 | * permission of the OpenSSL Project. | ||
| 44 | * | ||
| 45 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 46 | * acknowledgment: | ||
| 47 | * "This product includes software developed by the OpenSSL Project | ||
| 48 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 49 | * | ||
| 50 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 51 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 52 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 53 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 54 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 55 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 56 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 57 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 58 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 59 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 60 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 61 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 62 | * ==================================================================== | ||
| 63 | * | ||
| 64 | * This product includes cryptographic software written by Eric Young | ||
| 65 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
| 66 | * Hudson (tjh@cryptsoft.com). | ||
| 67 | * | ||
| 68 | */ | ||
| 69 | |||
| 70 | #include <openssl/err.h> | ||
| 71 | |||
| 72 | #include "ec_lcl.h" | ||
| 73 | |||
| 74 | #ifndef OPENSSL_NO_EC2M | ||
| 75 | |||
| 76 | /* Calculates and sets the affine coordinates of an EC_POINT from the given | ||
| 77 | * compressed coordinates. Uses algorithm 2.3.4 of SEC 1. | ||
| 78 | * Note that the simple implementation only uses affine coordinates. | ||
| 79 | * | ||
| 80 | * The method is from the following publication: | ||
| 81 | * | ||
| 82 | * Harper, Menezes, Vanstone: | ||
| 83 | * "Public-Key Cryptosystems with Very Small Key Lengths", | ||
| 84 | * EUROCRYPT '92, Springer-Verlag LNCS 658, | ||
| 85 | * published February 1993 | ||
| 86 | * | ||
| 87 | * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe | ||
| 88 | * the same method, but claim no priority date earlier than July 29, 1994 | ||
| 89 | * (and additionally fail to cite the EUROCRYPT '92 publication as prior art). | ||
| 90 | */ | ||
| 91 | int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point, | ||
| 92 | const BIGNUM *x_, int y_bit, BN_CTX *ctx) | ||
| 93 | { | ||
| 94 | BN_CTX *new_ctx = NULL; | ||
| 95 | BIGNUM *tmp, *x, *y, *z; | ||
| 96 | int ret = 0, z0; | ||
| 97 | |||
| 98 | /* clear error queue */ | ||
| 99 | ERR_clear_error(); | ||
| 100 | |||
| 101 | if (ctx == NULL) | ||
| 102 | { | ||
| 103 | ctx = new_ctx = BN_CTX_new(); | ||
| 104 | if (ctx == NULL) | ||
| 105 | return 0; | ||
| 106 | } | ||
| 107 | |||
| 108 | y_bit = (y_bit != 0) ? 1 : 0; | ||
| 109 | |||
| 110 | BN_CTX_start(ctx); | ||
| 111 | tmp = BN_CTX_get(ctx); | ||
| 112 | x = BN_CTX_get(ctx); | ||
| 113 | y = BN_CTX_get(ctx); | ||
| 114 | z = BN_CTX_get(ctx); | ||
| 115 | if (z == NULL) goto err; | ||
| 116 | |||
| 117 | if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err; | ||
| 118 | if (BN_is_zero(x)) | ||
| 119 | { | ||
| 120 | if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err; | ||
| 121 | } | ||
| 122 | else | ||
| 123 | { | ||
| 124 | if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err; | ||
| 125 | if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err; | ||
| 126 | if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err; | ||
| 127 | if (!BN_GF2m_add(tmp, x, tmp)) goto err; | ||
| 128 | if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx)) | ||
| 129 | { | ||
| 130 | unsigned long err = ERR_peek_last_error(); | ||
| 131 | |||
| 132 | if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION) | ||
| 133 | { | ||
| 134 | ERR_clear_error(); | ||
| 135 | ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); | ||
| 136 | } | ||
| 137 | else | ||
| 138 | ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB); | ||
| 139 | goto err; | ||
| 140 | } | ||
| 141 | z0 = (BN_is_odd(z)) ? 1 : 0; | ||
| 142 | if (!group->meth->field_mul(group, y, x, z, ctx)) goto err; | ||
| 143 | if (z0 != y_bit) | ||
| 144 | { | ||
| 145 | if (!BN_GF2m_add(y, y, x)) goto err; | ||
| 146 | } | ||
| 147 | } | ||
| 148 | |||
| 149 | if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; | ||
| 150 | |||
| 151 | ret = 1; | ||
| 152 | |||
| 153 | err: | ||
| 154 | BN_CTX_end(ctx); | ||
| 155 | if (new_ctx != NULL) | ||
| 156 | BN_CTX_free(new_ctx); | ||
| 157 | return ret; | ||
| 158 | } | ||
| 159 | |||
| 160 | |||
| 161 | /* Converts an EC_POINT to an octet string. | ||
| 162 | * If buf is NULL, the encoded length will be returned. | ||
| 163 | * If the length len of buf is smaller than required an error will be returned. | ||
| 164 | */ | ||
| 165 | size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, | ||
| 166 | unsigned char *buf, size_t len, BN_CTX *ctx) | ||
| 167 | { | ||
| 168 | size_t ret; | ||
| 169 | BN_CTX *new_ctx = NULL; | ||
| 170 | int used_ctx = 0; | ||
| 171 | BIGNUM *x, *y, *yxi; | ||
| 172 | size_t field_len, i, skip; | ||
| 173 | |||
| 174 | if ((form != POINT_CONVERSION_COMPRESSED) | ||
| 175 | && (form != POINT_CONVERSION_UNCOMPRESSED) | ||
| 176 | && (form != POINT_CONVERSION_HYBRID)) | ||
| 177 | { | ||
| 178 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM); | ||
| 179 | goto err; | ||
| 180 | } | ||
| 181 | |||
| 182 | if (EC_POINT_is_at_infinity(group, point)) | ||
| 183 | { | ||
| 184 | /* encodes to a single 0 octet */ | ||
| 185 | if (buf != NULL) | ||
| 186 | { | ||
| 187 | if (len < 1) | ||
| 188 | { | ||
| 189 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); | ||
| 190 | return 0; | ||
| 191 | } | ||
| 192 | buf[0] = 0; | ||
| 193 | } | ||
| 194 | return 1; | ||
| 195 | } | ||
| 196 | |||
| 197 | |||
| 198 | /* ret := required output buffer length */ | ||
| 199 | field_len = (EC_GROUP_get_degree(group) + 7) / 8; | ||
| 200 | ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; | ||
| 201 | |||
| 202 | /* if 'buf' is NULL, just return required length */ | ||
| 203 | if (buf != NULL) | ||
| 204 | { | ||
| 205 | if (len < ret) | ||
| 206 | { | ||
| 207 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); | ||
| 208 | goto err; | ||
| 209 | } | ||
| 210 | |||
| 211 | if (ctx == NULL) | ||
| 212 | { | ||
| 213 | ctx = new_ctx = BN_CTX_new(); | ||
| 214 | if (ctx == NULL) | ||
| 215 | return 0; | ||
| 216 | } | ||
| 217 | |||
| 218 | BN_CTX_start(ctx); | ||
| 219 | used_ctx = 1; | ||
| 220 | x = BN_CTX_get(ctx); | ||
| 221 | y = BN_CTX_get(ctx); | ||
| 222 | yxi = BN_CTX_get(ctx); | ||
| 223 | if (yxi == NULL) goto err; | ||
| 224 | |||
| 225 | if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; | ||
| 226 | |||
| 227 | buf[0] = form; | ||
| 228 | if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x)) | ||
| 229 | { | ||
| 230 | if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; | ||
| 231 | if (BN_is_odd(yxi)) buf[0]++; | ||
| 232 | } | ||
| 233 | |||
| 234 | i = 1; | ||
| 235 | |||
| 236 | skip = field_len - BN_num_bytes(x); | ||
| 237 | if (skip > field_len) | ||
| 238 | { | ||
| 239 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
| 240 | goto err; | ||
| 241 | } | ||
| 242 | while (skip > 0) | ||
| 243 | { | ||
| 244 | buf[i++] = 0; | ||
| 245 | skip--; | ||
| 246 | } | ||
| 247 | skip = BN_bn2bin(x, buf + i); | ||
| 248 | i += skip; | ||
| 249 | if (i != 1 + field_len) | ||
| 250 | { | ||
| 251 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
| 252 | goto err; | ||
| 253 | } | ||
| 254 | |||
| 255 | if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID) | ||
| 256 | { | ||
| 257 | skip = field_len - BN_num_bytes(y); | ||
| 258 | if (skip > field_len) | ||
| 259 | { | ||
| 260 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
| 261 | goto err; | ||
| 262 | } | ||
| 263 | while (skip > 0) | ||
| 264 | { | ||
| 265 | buf[i++] = 0; | ||
| 266 | skip--; | ||
| 267 | } | ||
| 268 | skip = BN_bn2bin(y, buf + i); | ||
| 269 | i += skip; | ||
| 270 | } | ||
| 271 | |||
| 272 | if (i != ret) | ||
| 273 | { | ||
| 274 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
| 275 | goto err; | ||
| 276 | } | ||
| 277 | } | ||
| 278 | |||
| 279 | if (used_ctx) | ||
| 280 | BN_CTX_end(ctx); | ||
| 281 | if (new_ctx != NULL) | ||
| 282 | BN_CTX_free(new_ctx); | ||
| 283 | return ret; | ||
| 284 | |||
| 285 | err: | ||
| 286 | if (used_ctx) | ||
| 287 | BN_CTX_end(ctx); | ||
| 288 | if (new_ctx != NULL) | ||
| 289 | BN_CTX_free(new_ctx); | ||
| 290 | return 0; | ||
| 291 | } | ||
| 292 | |||
| 293 | |||
| 294 | /* Converts an octet string representation to an EC_POINT. | ||
| 295 | * Note that the simple implementation only uses affine coordinates. | ||
| 296 | */ | ||
| 297 | int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point, | ||
| 298 | const unsigned char *buf, size_t len, BN_CTX *ctx) | ||
| 299 | { | ||
| 300 | point_conversion_form_t form; | ||
| 301 | int y_bit; | ||
| 302 | BN_CTX *new_ctx = NULL; | ||
| 303 | BIGNUM *x, *y, *yxi; | ||
| 304 | size_t field_len, enc_len; | ||
| 305 | int ret = 0; | ||
| 306 | |||
| 307 | if (len == 0) | ||
| 308 | { | ||
| 309 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); | ||
| 310 | return 0; | ||
| 311 | } | ||
| 312 | form = buf[0]; | ||
| 313 | y_bit = form & 1; | ||
| 314 | form = form & ~1U; | ||
| 315 | if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) | ||
| 316 | && (form != POINT_CONVERSION_UNCOMPRESSED) | ||
| 317 | && (form != POINT_CONVERSION_HYBRID)) | ||
| 318 | { | ||
| 319 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 320 | return 0; | ||
| 321 | } | ||
| 322 | if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) | ||
| 323 | { | ||
| 324 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 325 | return 0; | ||
| 326 | } | ||
| 327 | |||
| 328 | if (form == 0) | ||
| 329 | { | ||
| 330 | if (len != 1) | ||
| 331 | { | ||
| 332 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 333 | return 0; | ||
| 334 | } | ||
| 335 | |||
| 336 | return EC_POINT_set_to_infinity(group, point); | ||
| 337 | } | ||
| 338 | |||
| 339 | field_len = (EC_GROUP_get_degree(group) + 7) / 8; | ||
| 340 | enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; | ||
| 341 | |||
| 342 | if (len != enc_len) | ||
| 343 | { | ||
| 344 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 345 | return 0; | ||
| 346 | } | ||
| 347 | |||
| 348 | if (ctx == NULL) | ||
| 349 | { | ||
| 350 | ctx = new_ctx = BN_CTX_new(); | ||
| 351 | if (ctx == NULL) | ||
| 352 | return 0; | ||
| 353 | } | ||
| 354 | |||
| 355 | BN_CTX_start(ctx); | ||
| 356 | x = BN_CTX_get(ctx); | ||
| 357 | y = BN_CTX_get(ctx); | ||
| 358 | yxi = BN_CTX_get(ctx); | ||
| 359 | if (yxi == NULL) goto err; | ||
| 360 | |||
| 361 | if (!BN_bin2bn(buf + 1, field_len, x)) goto err; | ||
| 362 | if (BN_ucmp(x, &group->field) >= 0) | ||
| 363 | { | ||
| 364 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 365 | goto err; | ||
| 366 | } | ||
| 367 | |||
| 368 | if (form == POINT_CONVERSION_COMPRESSED) | ||
| 369 | { | ||
| 370 | if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err; | ||
| 371 | } | ||
| 372 | else | ||
| 373 | { | ||
| 374 | if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; | ||
| 375 | if (BN_ucmp(y, &group->field) >= 0) | ||
| 376 | { | ||
| 377 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 378 | goto err; | ||
| 379 | } | ||
| 380 | if (form == POINT_CONVERSION_HYBRID) | ||
| 381 | { | ||
| 382 | if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; | ||
| 383 | if (y_bit != BN_is_odd(yxi)) | ||
| 384 | { | ||
| 385 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 386 | goto err; | ||
| 387 | } | ||
| 388 | } | ||
| 389 | |||
| 390 | if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; | ||
| 391 | } | ||
| 392 | |||
| 393 | if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ | ||
| 394 | { | ||
| 395 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); | ||
| 396 | goto err; | ||
| 397 | } | ||
| 398 | |||
| 399 | ret = 1; | ||
| 400 | |||
| 401 | err: | ||
| 402 | BN_CTX_end(ctx); | ||
| 403 | if (new_ctx != NULL) | ||
| 404 | BN_CTX_free(new_ctx); | ||
| 405 | return ret; | ||
| 406 | } | ||
| 407 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ec_ameth.c b/src/lib/libcrypto/ec/ec_ameth.c index c00f7d746c..83909c1853 100644 --- a/src/lib/libcrypto/ec/ec_ameth.c +++ b/src/lib/libcrypto/ec/ec_ameth.c | |||
| @@ -651,6 +651,7 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth = | |||
| 651 | ec_copy_parameters, | 651 | ec_copy_parameters, |
| 652 | ec_cmp_parameters, | 652 | ec_cmp_parameters, |
| 653 | eckey_param_print, | 653 | eckey_param_print, |
| 654 | 0, | ||
| 654 | 655 | ||
| 655 | int_ec_free, | 656 | int_ec_free, |
| 656 | ec_pkey_ctrl, | 657 | ec_pkey_ctrl, |
diff --git a/src/lib/libcrypto/ec/ec_asn1.c b/src/lib/libcrypto/ec/ec_asn1.c index ae55539859..175eec5342 100644 --- a/src/lib/libcrypto/ec/ec_asn1.c +++ b/src/lib/libcrypto/ec/ec_asn1.c | |||
| @@ -83,7 +83,7 @@ int EC_GROUP_get_basis_type(const EC_GROUP *group) | |||
| 83 | /* everything else is currently not supported */ | 83 | /* everything else is currently not supported */ |
| 84 | return 0; | 84 | return 0; |
| 85 | } | 85 | } |
| 86 | 86 | #ifndef OPENSSL_NO_EC2M | |
| 87 | int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) | 87 | int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) |
| 88 | { | 88 | { |
| 89 | if (group == NULL) | 89 | if (group == NULL) |
| @@ -101,7 +101,6 @@ int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) | |||
| 101 | 101 | ||
| 102 | return 1; | 102 | return 1; |
| 103 | } | 103 | } |
| 104 | |||
| 105 | int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, | 104 | int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, |
| 106 | unsigned int *k2, unsigned int *k3) | 105 | unsigned int *k2, unsigned int *k3) |
| 107 | { | 106 | { |
| @@ -124,7 +123,7 @@ int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, | |||
| 124 | 123 | ||
| 125 | return 1; | 124 | return 1; |
| 126 | } | 125 | } |
| 127 | 126 | #endif | |
| 128 | 127 | ||
| 129 | 128 | ||
| 130 | /* some structures needed for the asn1 encoding */ | 129 | /* some structures needed for the asn1 encoding */ |
| @@ -340,6 +339,12 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field) | |||
| 340 | } | 339 | } |
| 341 | } | 340 | } |
| 342 | else /* nid == NID_X9_62_characteristic_two_field */ | 341 | else /* nid == NID_X9_62_characteristic_two_field */ |
| 342 | #ifdef OPENSSL_NO_EC2M | ||
| 343 | { | ||
| 344 | ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED); | ||
| 345 | goto err; | ||
| 346 | } | ||
| 347 | #else | ||
| 343 | { | 348 | { |
| 344 | int field_type; | 349 | int field_type; |
| 345 | X9_62_CHARACTERISTIC_TWO *char_two; | 350 | X9_62_CHARACTERISTIC_TWO *char_two; |
| @@ -419,6 +424,7 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field) | |||
| 419 | } | 424 | } |
| 420 | } | 425 | } |
| 421 | } | 426 | } |
| 427 | #endif | ||
| 422 | 428 | ||
| 423 | ok = 1; | 429 | ok = 1; |
| 424 | 430 | ||
| @@ -456,6 +462,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve) | |||
| 456 | goto err; | 462 | goto err; |
| 457 | } | 463 | } |
| 458 | } | 464 | } |
| 465 | #ifndef OPENSSL_NO_EC2M | ||
| 459 | else /* nid == NID_X9_62_characteristic_two_field */ | 466 | else /* nid == NID_X9_62_characteristic_two_field */ |
| 460 | { | 467 | { |
| 461 | if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL)) | 468 | if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL)) |
| @@ -464,7 +471,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve) | |||
| 464 | goto err; | 471 | goto err; |
| 465 | } | 472 | } |
| 466 | } | 473 | } |
| 467 | 474 | #endif | |
| 468 | len_1 = (size_t)BN_num_bytes(tmp_1); | 475 | len_1 = (size_t)BN_num_bytes(tmp_1); |
| 469 | len_2 = (size_t)BN_num_bytes(tmp_2); | 476 | len_2 = (size_t)BN_num_bytes(tmp_2); |
| 470 | 477 | ||
| @@ -775,8 +782,13 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params) | |||
| 775 | 782 | ||
| 776 | /* get the field parameters */ | 783 | /* get the field parameters */ |
| 777 | tmp = OBJ_obj2nid(params->fieldID->fieldType); | 784 | tmp = OBJ_obj2nid(params->fieldID->fieldType); |
| 778 | |||
| 779 | if (tmp == NID_X9_62_characteristic_two_field) | 785 | if (tmp == NID_X9_62_characteristic_two_field) |
| 786 | #ifdef OPENSSL_NO_EC2M | ||
| 787 | { | ||
| 788 | ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED); | ||
| 789 | goto err; | ||
| 790 | } | ||
| 791 | #else | ||
| 780 | { | 792 | { |
| 781 | X9_62_CHARACTERISTIC_TWO *char_two; | 793 | X9_62_CHARACTERISTIC_TWO *char_two; |
| 782 | 794 | ||
| @@ -862,6 +874,7 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params) | |||
| 862 | /* create the EC_GROUP structure */ | 874 | /* create the EC_GROUP structure */ |
| 863 | ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL); | 875 | ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL); |
| 864 | } | 876 | } |
| 877 | #endif | ||
| 865 | else if (tmp == NID_X9_62_prime_field) | 878 | else if (tmp == NID_X9_62_prime_field) |
| 866 | { | 879 | { |
| 867 | /* we have a curve over a prime field */ | 880 | /* we have a curve over a prime field */ |
| @@ -1065,6 +1078,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len) | |||
| 1065 | if ((group = ec_asn1_pkparameters2group(params)) == NULL) | 1078 | if ((group = ec_asn1_pkparameters2group(params)) == NULL) |
| 1066 | { | 1079 | { |
| 1067 | ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE); | 1080 | ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE); |
| 1081 | ECPKPARAMETERS_free(params); | ||
| 1068 | return NULL; | 1082 | return NULL; |
| 1069 | } | 1083 | } |
| 1070 | 1084 | ||
diff --git a/src/lib/libcrypto/ec/ec_curve.c b/src/lib/libcrypto/ec/ec_curve.c index 23274e4031..c72fb2697c 100644 --- a/src/lib/libcrypto/ec/ec_curve.c +++ b/src/lib/libcrypto/ec/ec_curve.c | |||
| @@ -3,7 +3,7 @@ | |||
| 3 | * Written by Nils Larsch for the OpenSSL project. | 3 | * Written by Nils Larsch for the OpenSSL project. |
| 4 | */ | 4 | */ |
| 5 | /* ==================================================================== | 5 | /* ==================================================================== |
| 6 | * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved. | 6 | * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved. |
| 7 | * | 7 | * |
| 8 | * Redistribution and use in source and binary forms, with or without | 8 | * Redistribution and use in source and binary forms, with or without |
| 9 | * modification, are permitted provided that the following conditions | 9 | * modification, are permitted provided that the following conditions |
| @@ -72,6 +72,7 @@ | |||
| 72 | #include "ec_lcl.h" | 72 | #include "ec_lcl.h" |
| 73 | #include <openssl/err.h> | 73 | #include <openssl/err.h> |
| 74 | #include <openssl/obj_mac.h> | 74 | #include <openssl/obj_mac.h> |
| 75 | #include <openssl/opensslconf.h> | ||
| 75 | 76 | ||
| 76 | typedef struct { | 77 | typedef struct { |
| 77 | int field_type, /* either NID_X9_62_prime_field or | 78 | int field_type, /* either NID_X9_62_prime_field or |
| @@ -703,6 +704,8 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+28*6]; } | |||
| 703 | 0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D } | 704 | 0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D } |
| 704 | }; | 705 | }; |
| 705 | 706 | ||
| 707 | #ifndef OPENSSL_NO_EC2M | ||
| 708 | |||
| 706 | /* characteristic two curves */ | 709 | /* characteristic two curves */ |
| 707 | static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; } | 710 | static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; } |
| 708 | _EC_SECG_CHAR2_113R1 = { | 711 | _EC_SECG_CHAR2_113R1 = { |
| @@ -1300,7 +1303,7 @@ static const struct { EC_CURVE_DATA h; unsigned char data[20+21*6]; } | |||
| 1300 | { 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */ | 1303 | { 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */ |
| 1301 | 0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD, | 1304 | 0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD, |
| 1302 | 1305 | ||
| 1303 | 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */ | 1306 | 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */ |
| 1304 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, | 1307 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, |
| 1305 | 0x07, | 1308 | 0x07, |
| 1306 | 0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */ | 1309 | 0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */ |
| @@ -1817,103 +1820,128 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+24*6]; } | |||
| 1817 | 0xBA,0xFC,0xA7,0x5E } | 1820 | 0xBA,0xFC,0xA7,0x5E } |
| 1818 | }; | 1821 | }; |
| 1819 | 1822 | ||
| 1823 | #endif | ||
| 1824 | |||
| 1820 | typedef struct _ec_list_element_st { | 1825 | typedef struct _ec_list_element_st { |
| 1821 | int nid; | 1826 | int nid; |
| 1822 | const EC_CURVE_DATA *data; | 1827 | const EC_CURVE_DATA *data; |
| 1828 | const EC_METHOD *(*meth)(void); | ||
| 1823 | const char *comment; | 1829 | const char *comment; |
| 1824 | } ec_list_element; | 1830 | } ec_list_element; |
| 1825 | 1831 | ||
| 1826 | static const ec_list_element curve_list[] = { | 1832 | static const ec_list_element curve_list[] = { |
| 1827 | /* prime field curves */ | 1833 | /* prime field curves */ |
| 1828 | /* secg curves */ | 1834 | /* secg curves */ |
| 1829 | { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"}, | 1835 | { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" }, |
| 1830 | { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"}, | 1836 | { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" }, |
| 1831 | { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"}, | 1837 | { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" }, |
| 1832 | { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"}, | 1838 | { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" }, |
| 1833 | { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"}, | 1839 | { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" }, |
| 1834 | { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"}, | 1840 | { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" }, |
| 1835 | { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"}, | 1841 | { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" }, |
| 1836 | /* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */ | 1842 | /* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */ |
| 1837 | { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"}, | 1843 | { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" }, |
| 1838 | { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"}, | 1844 | { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" }, |
| 1839 | { NID_secp224r1, &_EC_NIST_PRIME_224.h, "NIST/SECG curve over a 224 bit prime field"}, | 1845 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 |
| 1840 | { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"}, | 1846 | { NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" }, |
| 1847 | #else | ||
| 1848 | { NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" }, | ||
| 1849 | #endif | ||
| 1850 | { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" }, | ||
| 1841 | /* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */ | 1851 | /* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */ |
| 1842 | { NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"}, | 1852 | { NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" }, |
| 1843 | { NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"}, | 1853 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 |
| 1854 | { NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" }, | ||
| 1855 | #else | ||
| 1856 | { NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" }, | ||
| 1857 | #endif | ||
| 1844 | /* X9.62 curves */ | 1858 | /* X9.62 curves */ |
| 1845 | { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"}, | 1859 | { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" }, |
| 1846 | { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"}, | 1860 | { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" }, |
| 1847 | { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"}, | 1861 | { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" }, |
| 1848 | { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"}, | 1862 | { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" }, |
| 1849 | { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"}, | 1863 | { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" }, |
| 1850 | { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"}, | 1864 | { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" }, |
| 1851 | { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"}, | 1865 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 |
| 1866 | { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" }, | ||
| 1867 | #else | ||
| 1868 | { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" }, | ||
| 1869 | #endif | ||
| 1870 | #ifndef OPENSSL_NO_EC2M | ||
| 1852 | /* characteristic two field curves */ | 1871 | /* characteristic two field curves */ |
| 1853 | /* NIST/SECG curves */ | 1872 | /* NIST/SECG curves */ |
| 1854 | { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"}, | 1873 | { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" }, |
| 1855 | { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"}, | 1874 | { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" }, |
| 1856 | { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"}, | 1875 | { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" }, |
| 1857 | { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"}, | 1876 | { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" }, |
| 1858 | { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field" }, | 1877 | { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" }, |
| 1859 | { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"}, | 1878 | { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" }, |
| 1860 | { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, "NIST/SECG curve over a 163 bit binary field" }, | 1879 | { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" }, |
| 1861 | { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"}, | 1880 | { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" }, |
| 1862 | { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"}, | 1881 | { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" }, |
| 1863 | { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field" }, | 1882 | { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, |
| 1864 | { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field" }, | 1883 | { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, |
| 1865 | { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"}, | 1884 | { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" }, |
| 1866 | { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, "NIST/SECG curve over a 283 bit binary field" }, | 1885 | { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" }, |
| 1867 | { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, "NIST/SECG curve over a 283 bit binary field" }, | 1886 | { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" }, |
| 1868 | { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, "NIST/SECG curve over a 409 bit binary field" }, | 1887 | { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" }, |
| 1869 | { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, "NIST/SECG curve over a 409 bit binary field" }, | 1888 | { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" }, |
| 1870 | { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, "NIST/SECG curve over a 571 bit binary field" }, | 1889 | { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" }, |
| 1871 | { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, "NIST/SECG curve over a 571 bit binary field" }, | 1890 | { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary field" }, |
| 1872 | /* X9.62 curves */ | 1891 | /* X9.62 curves */ |
| 1873 | { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"}, | 1892 | { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" }, |
| 1874 | { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"}, | 1893 | { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" }, |
| 1875 | { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"}, | 1894 | { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" }, |
| 1876 | { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"}, | 1895 | { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" }, |
| 1877 | { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"}, | 1896 | { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" }, |
| 1878 | { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"}, | 1897 | { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" }, |
| 1879 | { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"}, | 1898 | { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" }, |
| 1880 | { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"}, | 1899 | { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" }, |
| 1881 | { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"}, | 1900 | { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" }, |
| 1882 | { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"}, | 1901 | { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" }, |
| 1883 | { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"}, | 1902 | { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" }, |
| 1884 | { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"}, | 1903 | { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" }, |
| 1885 | { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"}, | 1904 | { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" }, |
| 1886 | { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"}, | 1905 | { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" }, |
| 1887 | { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"}, | 1906 | { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" }, |
| 1888 | { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"}, | 1907 | { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" }, |
| 1889 | /* the WAP/WTLS curves | 1908 | /* the WAP/WTLS curves |
| 1890 | * [unlike SECG, spec has its own OIDs for curves from X9.62] */ | 1909 | * [unlike SECG, spec has its own OIDs for curves from X9.62] */ |
| 1891 | { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"}, | 1910 | { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" }, |
| 1892 | { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field"}, | 1911 | { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" }, |
| 1893 | { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"}, | 1912 | { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" }, |
| 1894 | { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"}, | 1913 | { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" }, |
| 1895 | { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"}, | 1914 | #endif |
| 1896 | { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"}, | 1915 | { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" }, |
| 1897 | { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"}, | 1916 | { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" }, |
| 1898 | { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" }, | 1917 | { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" }, |
| 1899 | { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"}, | 1918 | { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" }, |
| 1900 | { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"}, | 1919 | #ifndef OPENSSL_NO_EC2M |
| 1901 | { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"}, | 1920 | { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, |
| 1921 | { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, | ||
| 1922 | #endif | ||
| 1923 | { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curvs over a 224 bit prime field" }, | ||
| 1924 | #ifndef OPENSSL_NO_EC2M | ||
| 1902 | /* IPSec curves */ | 1925 | /* IPSec curves */ |
| 1903 | { NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"}, | 1926 | { NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n" |
| 1904 | { NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"}, | 1927 | "\tNot suitable for ECDSA.\n\tQuestionable extension field!" }, |
| 1928 | { NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n" | ||
| 1929 | "\tNot suitable for ECDSA.\n\tQuestionable extension field!" }, | ||
| 1930 | #endif | ||
| 1905 | }; | 1931 | }; |
| 1906 | 1932 | ||
| 1907 | #define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element)) | 1933 | #define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element)) |
| 1908 | 1934 | ||
| 1909 | static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | 1935 | static EC_GROUP *ec_group_new_from_data(const ec_list_element curve) |
| 1910 | { | 1936 | { |
| 1911 | EC_GROUP *group=NULL; | 1937 | EC_GROUP *group=NULL; |
| 1912 | EC_POINT *P=NULL; | 1938 | EC_POINT *P=NULL; |
| 1913 | BN_CTX *ctx=NULL; | 1939 | BN_CTX *ctx=NULL; |
| 1914 | BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL; | 1940 | BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL; |
| 1915 | int ok=0; | 1941 | int ok=0; |
| 1916 | int seed_len,param_len; | 1942 | int seed_len,param_len; |
| 1943 | const EC_METHOD *meth; | ||
| 1944 | const EC_CURVE_DATA *data; | ||
| 1917 | const unsigned char *params; | 1945 | const unsigned char *params; |
| 1918 | 1946 | ||
| 1919 | if ((ctx = BN_CTX_new()) == NULL) | 1947 | if ((ctx = BN_CTX_new()) == NULL) |
| @@ -1922,10 +1950,11 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
| 1922 | goto err; | 1950 | goto err; |
| 1923 | } | 1951 | } |
| 1924 | 1952 | ||
| 1953 | data = curve.data; | ||
| 1925 | seed_len = data->seed_len; | 1954 | seed_len = data->seed_len; |
| 1926 | param_len = data->param_len; | 1955 | param_len = data->param_len; |
| 1927 | params = (const unsigned char *)(data+1); /* skip header */ | 1956 | params = (const unsigned char *)(data+1); /* skip header */ |
| 1928 | params += seed_len; /* skip seed */ | 1957 | params += seed_len; /* skip seed */ |
| 1929 | 1958 | ||
| 1930 | if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL)) | 1959 | if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL)) |
| 1931 | || !(a = BN_bin2bn(params+1*param_len, param_len, NULL)) | 1960 | || !(a = BN_bin2bn(params+1*param_len, param_len, NULL)) |
| @@ -1935,7 +1964,17 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
| 1935 | goto err; | 1964 | goto err; |
| 1936 | } | 1965 | } |
| 1937 | 1966 | ||
| 1938 | if (data->field_type == NID_X9_62_prime_field) | 1967 | if (curve.meth != 0) |
| 1968 | { | ||
| 1969 | meth = curve.meth(); | ||
| 1970 | if (((group = EC_GROUP_new(meth)) == NULL) || | ||
| 1971 | (!(group->meth->group_set_curve(group, p, a, b, ctx)))) | ||
| 1972 | { | ||
| 1973 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); | ||
| 1974 | goto err; | ||
| 1975 | } | ||
| 1976 | } | ||
| 1977 | else if (data->field_type == NID_X9_62_prime_field) | ||
| 1939 | { | 1978 | { |
| 1940 | if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL) | 1979 | if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL) |
| 1941 | { | 1980 | { |
| @@ -1943,6 +1982,7 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
| 1943 | goto err; | 1982 | goto err; |
| 1944 | } | 1983 | } |
| 1945 | } | 1984 | } |
| 1985 | #ifndef OPENSSL_NO_EC2M | ||
| 1946 | else /* field_type == NID_X9_62_characteristic_two_field */ | 1986 | else /* field_type == NID_X9_62_characteristic_two_field */ |
| 1947 | { | 1987 | { |
| 1948 | if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL) | 1988 | if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL) |
| @@ -1951,20 +1991,21 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
| 1951 | goto err; | 1991 | goto err; |
| 1952 | } | 1992 | } |
| 1953 | } | 1993 | } |
| 1994 | #endif | ||
| 1954 | 1995 | ||
| 1955 | if ((P = EC_POINT_new(group)) == NULL) | 1996 | if ((P = EC_POINT_new(group)) == NULL) |
| 1956 | { | 1997 | { |
| 1957 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); | 1998 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); |
| 1958 | goto err; | 1999 | goto err; |
| 1959 | } | 2000 | } |
| 1960 | 2001 | ||
| 1961 | if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL)) | 2002 | if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL)) |
| 1962 | || !(y = BN_bin2bn(params+4*param_len, param_len, NULL))) | 2003 | || !(y = BN_bin2bn(params+4*param_len, param_len, NULL))) |
| 1963 | { | 2004 | { |
| 1964 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB); | 2005 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB); |
| 1965 | goto err; | 2006 | goto err; |
| 1966 | } | 2007 | } |
| 1967 | if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx)) | 2008 | if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx)) |
| 1968 | { | 2009 | { |
| 1969 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); | 2010 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); |
| 1970 | goto err; | 2011 | goto err; |
| @@ -2025,7 +2066,7 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid) | |||
| 2025 | for (i=0; i<curve_list_length; i++) | 2066 | for (i=0; i<curve_list_length; i++) |
| 2026 | if (curve_list[i].nid == nid) | 2067 | if (curve_list[i].nid == nid) |
| 2027 | { | 2068 | { |
| 2028 | ret = ec_group_new_from_data(curve_list[i].data); | 2069 | ret = ec_group_new_from_data(curve_list[i]); |
| 2029 | break; | 2070 | break; |
| 2030 | } | 2071 | } |
| 2031 | 2072 | ||
diff --git a/src/lib/libcrypto/ec/ec_key.c b/src/lib/libcrypto/ec/ec_key.c index 522802c07a..bf9fd2dc2c 100644 --- a/src/lib/libcrypto/ec/ec_key.c +++ b/src/lib/libcrypto/ec/ec_key.c | |||
| @@ -64,7 +64,9 @@ | |||
| 64 | #include <string.h> | 64 | #include <string.h> |
| 65 | #include "ec_lcl.h" | 65 | #include "ec_lcl.h" |
| 66 | #include <openssl/err.h> | 66 | #include <openssl/err.h> |
| 67 | #include <string.h> | 67 | #ifdef OPENSSL_FIPS |
| 68 | #include <openssl/fips.h> | ||
| 69 | #endif | ||
| 68 | 70 | ||
| 69 | EC_KEY *EC_KEY_new(void) | 71 | EC_KEY *EC_KEY_new(void) |
| 70 | { | 72 | { |
| @@ -78,6 +80,7 @@ EC_KEY *EC_KEY_new(void) | |||
| 78 | } | 80 | } |
| 79 | 81 | ||
| 80 | ret->version = 1; | 82 | ret->version = 1; |
| 83 | ret->flags = 0; | ||
| 81 | ret->group = NULL; | 84 | ret->group = NULL; |
| 82 | ret->pub_key = NULL; | 85 | ret->pub_key = NULL; |
| 83 | ret->priv_key= NULL; | 86 | ret->priv_key= NULL; |
| @@ -197,6 +200,7 @@ EC_KEY *EC_KEY_copy(EC_KEY *dest, const EC_KEY *src) | |||
| 197 | dest->enc_flag = src->enc_flag; | 200 | dest->enc_flag = src->enc_flag; |
| 198 | dest->conv_form = src->conv_form; | 201 | dest->conv_form = src->conv_form; |
| 199 | dest->version = src->version; | 202 | dest->version = src->version; |
| 203 | dest->flags = src->flags; | ||
| 200 | 204 | ||
| 201 | return dest; | 205 | return dest; |
| 202 | } | 206 | } |
| @@ -237,6 +241,11 @@ int EC_KEY_generate_key(EC_KEY *eckey) | |||
| 237 | BIGNUM *priv_key = NULL, *order = NULL; | 241 | BIGNUM *priv_key = NULL, *order = NULL; |
| 238 | EC_POINT *pub_key = NULL; | 242 | EC_POINT *pub_key = NULL; |
| 239 | 243 | ||
| 244 | #ifdef OPENSSL_FIPS | ||
| 245 | if (FIPS_mode()) | ||
| 246 | return FIPS_ec_key_generate_key(eckey); | ||
| 247 | #endif | ||
| 248 | |||
| 240 | if (!eckey || !eckey->group) | 249 | if (!eckey || !eckey->group) |
| 241 | { | 250 | { |
| 242 | ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER); | 251 | ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER); |
| @@ -371,6 +380,82 @@ err: | |||
| 371 | return(ok); | 380 | return(ok); |
| 372 | } | 381 | } |
| 373 | 382 | ||
| 383 | int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y) | ||
| 384 | { | ||
| 385 | BN_CTX *ctx = NULL; | ||
| 386 | BIGNUM *tx, *ty; | ||
| 387 | EC_POINT *point = NULL; | ||
| 388 | int ok = 0, tmp_nid, is_char_two = 0; | ||
| 389 | |||
| 390 | if (!key || !key->group || !x || !y) | ||
| 391 | { | ||
| 392 | ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, | ||
| 393 | ERR_R_PASSED_NULL_PARAMETER); | ||
| 394 | return 0; | ||
| 395 | } | ||
| 396 | ctx = BN_CTX_new(); | ||
| 397 | if (!ctx) | ||
| 398 | goto err; | ||
| 399 | |||
| 400 | point = EC_POINT_new(key->group); | ||
| 401 | |||
| 402 | if (!point) | ||
| 403 | goto err; | ||
| 404 | |||
| 405 | tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group)); | ||
| 406 | |||
| 407 | if (tmp_nid == NID_X9_62_characteristic_two_field) | ||
| 408 | is_char_two = 1; | ||
| 409 | |||
| 410 | tx = BN_CTX_get(ctx); | ||
| 411 | ty = BN_CTX_get(ctx); | ||
| 412 | #ifndef OPENSSL_NO_EC2M | ||
| 413 | if (is_char_two) | ||
| 414 | { | ||
| 415 | if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point, | ||
| 416 | x, y, ctx)) | ||
| 417 | goto err; | ||
| 418 | if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point, | ||
| 419 | tx, ty, ctx)) | ||
| 420 | goto err; | ||
| 421 | } | ||
| 422 | else | ||
| 423 | #endif | ||
| 424 | { | ||
| 425 | if (!EC_POINT_set_affine_coordinates_GFp(key->group, point, | ||
| 426 | x, y, ctx)) | ||
| 427 | goto err; | ||
| 428 | if (!EC_POINT_get_affine_coordinates_GFp(key->group, point, | ||
| 429 | tx, ty, ctx)) | ||
| 430 | goto err; | ||
| 431 | } | ||
| 432 | /* Check if retrieved coordinates match originals: if not values | ||
| 433 | * are out of range. | ||
| 434 | */ | ||
| 435 | if (BN_cmp(x, tx) || BN_cmp(y, ty)) | ||
| 436 | { | ||
| 437 | ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, | ||
| 438 | EC_R_COORDINATES_OUT_OF_RANGE); | ||
| 439 | goto err; | ||
| 440 | } | ||
| 441 | |||
| 442 | if (!EC_KEY_set_public_key(key, point)) | ||
| 443 | goto err; | ||
| 444 | |||
| 445 | if (EC_KEY_check_key(key) == 0) | ||
| 446 | goto err; | ||
| 447 | |||
| 448 | ok = 1; | ||
| 449 | |||
| 450 | err: | ||
| 451 | if (ctx) | ||
| 452 | BN_CTX_free(ctx); | ||
| 453 | if (point) | ||
| 454 | EC_POINT_free(point); | ||
| 455 | return ok; | ||
| 456 | |||
| 457 | } | ||
| 458 | |||
| 374 | const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) | 459 | const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) |
| 375 | { | 460 | { |
| 376 | return key->group; | 461 | return key->group; |
| @@ -461,3 +546,18 @@ int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx) | |||
| 461 | return 0; | 546 | return 0; |
| 462 | return EC_GROUP_precompute_mult(key->group, ctx); | 547 | return EC_GROUP_precompute_mult(key->group, ctx); |
| 463 | } | 548 | } |
| 549 | |||
| 550 | int EC_KEY_get_flags(const EC_KEY *key) | ||
| 551 | { | ||
| 552 | return key->flags; | ||
| 553 | } | ||
| 554 | |||
| 555 | void EC_KEY_set_flags(EC_KEY *key, int flags) | ||
| 556 | { | ||
| 557 | key->flags |= flags; | ||
| 558 | } | ||
| 559 | |||
| 560 | void EC_KEY_clear_flags(EC_KEY *key, int flags) | ||
| 561 | { | ||
| 562 | key->flags &= ~flags; | ||
| 563 | } | ||
diff --git a/src/lib/libcrypto/ec/ec_oct.c b/src/lib/libcrypto/ec/ec_oct.c new file mode 100644 index 0000000000..fd9db0798d --- /dev/null +++ b/src/lib/libcrypto/ec/ec_oct.c | |||
| @@ -0,0 +1,199 @@ | |||
| 1 | /* crypto/ec/ec_lib.c */ | ||
| 2 | /* | ||
| 3 | * Originally written by Bodo Moeller for the OpenSSL project. | ||
| 4 | */ | ||
| 5 | /* ==================================================================== | ||
| 6 | * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved. | ||
| 7 | * | ||
| 8 | * Redistribution and use in source and binary forms, with or without | ||
| 9 | * modification, are permitted provided that the following conditions | ||
| 10 | * are met: | ||
| 11 | * | ||
| 12 | * 1. Redistributions of source code must retain the above copyright | ||
| 13 | * notice, this list of conditions and the following disclaimer. | ||
| 14 | * | ||
| 15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 16 | * notice, this list of conditions and the following disclaimer in | ||
| 17 | * the documentation and/or other materials provided with the | ||
| 18 | * distribution. | ||
| 19 | * | ||
| 20 | * 3. All advertising materials mentioning features or use of this | ||
| 21 | * software must display the following acknowledgment: | ||
| 22 | * "This product includes software developed by the OpenSSL Project | ||
| 23 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 24 | * | ||
| 25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 26 | * endorse or promote products derived from this software without | ||
| 27 | * prior written permission. For written permission, please contact | ||
| 28 | * openssl-core@openssl.org. | ||
| 29 | * | ||
| 30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 31 | * nor may "OpenSSL" appear in their names without prior written | ||
| 32 | * permission of the OpenSSL Project. | ||
| 33 | * | ||
| 34 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 35 | * acknowledgment: | ||
| 36 | * "This product includes software developed by the OpenSSL Project | ||
| 37 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 38 | * | ||
| 39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 51 | * ==================================================================== | ||
| 52 | * | ||
| 53 | * This product includes cryptographic software written by Eric Young | ||
| 54 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
| 55 | * Hudson (tjh@cryptsoft.com). | ||
| 56 | * | ||
| 57 | */ | ||
| 58 | /* ==================================================================== | ||
| 59 | * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. | ||
| 60 | * Binary polynomial ECC support in OpenSSL originally developed by | ||
| 61 | * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project. | ||
| 62 | */ | ||
| 63 | |||
| 64 | #include <string.h> | ||
| 65 | |||
| 66 | #include <openssl/err.h> | ||
| 67 | #include <openssl/opensslv.h> | ||
| 68 | |||
| 69 | #include "ec_lcl.h" | ||
| 70 | |||
| 71 | int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point, | ||
| 72 | const BIGNUM *x, int y_bit, BN_CTX *ctx) | ||
| 73 | { | ||
| 74 | if (group->meth->point_set_compressed_coordinates == 0 | ||
| 75 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
| 76 | { | ||
| 77 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
| 78 | return 0; | ||
| 79 | } | ||
| 80 | if (group->meth != point->meth) | ||
| 81 | { | ||
| 82 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS); | ||
| 83 | return 0; | ||
| 84 | } | ||
| 85 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
| 86 | { | ||
| 87 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
| 88 | return ec_GFp_simple_set_compressed_coordinates( | ||
| 89 | group, point, x, y_bit, ctx); | ||
| 90 | else | ||
| 91 | #ifdef OPENSSL_NO_EC2M | ||
| 92 | { | ||
| 93 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED); | ||
| 94 | return 0; | ||
| 95 | } | ||
| 96 | #else | ||
| 97 | return ec_GF2m_simple_set_compressed_coordinates( | ||
| 98 | group, point, x, y_bit, ctx); | ||
| 99 | #endif | ||
| 100 | } | ||
| 101 | return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); | ||
| 102 | } | ||
| 103 | |||
| 104 | #ifndef OPENSSL_NO_EC2M | ||
| 105 | int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point, | ||
| 106 | const BIGNUM *x, int y_bit, BN_CTX *ctx) | ||
| 107 | { | ||
| 108 | if (group->meth->point_set_compressed_coordinates == 0 | ||
| 109 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
| 110 | { | ||
| 111 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
| 112 | return 0; | ||
| 113 | } | ||
| 114 | if (group->meth != point->meth) | ||
| 115 | { | ||
| 116 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS); | ||
| 117 | return 0; | ||
| 118 | } | ||
| 119 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
| 120 | { | ||
| 121 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
| 122 | return ec_GFp_simple_set_compressed_coordinates( | ||
| 123 | group, point, x, y_bit, ctx); | ||
| 124 | else | ||
| 125 | return ec_GF2m_simple_set_compressed_coordinates( | ||
| 126 | group, point, x, y_bit, ctx); | ||
| 127 | } | ||
| 128 | return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); | ||
| 129 | } | ||
| 130 | #endif | ||
| 131 | |||
| 132 | size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, | ||
| 133 | unsigned char *buf, size_t len, BN_CTX *ctx) | ||
| 134 | { | ||
| 135 | if (group->meth->point2oct == 0 | ||
| 136 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
| 137 | { | ||
| 138 | ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
| 139 | return 0; | ||
| 140 | } | ||
| 141 | if (group->meth != point->meth) | ||
| 142 | { | ||
| 143 | ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS); | ||
| 144 | return 0; | ||
| 145 | } | ||
| 146 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
| 147 | { | ||
| 148 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
| 149 | return ec_GFp_simple_point2oct(group, point, | ||
| 150 | form, buf, len, ctx); | ||
| 151 | else | ||
| 152 | #ifdef OPENSSL_NO_EC2M | ||
| 153 | { | ||
| 154 | ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED); | ||
| 155 | return 0; | ||
| 156 | } | ||
| 157 | #else | ||
| 158 | return ec_GF2m_simple_point2oct(group, point, | ||
| 159 | form, buf, len, ctx); | ||
| 160 | #endif | ||
| 161 | } | ||
| 162 | |||
| 163 | return group->meth->point2oct(group, point, form, buf, len, ctx); | ||
| 164 | } | ||
| 165 | |||
| 166 | |||
| 167 | int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point, | ||
| 168 | const unsigned char *buf, size_t len, BN_CTX *ctx) | ||
| 169 | { | ||
| 170 | if (group->meth->oct2point == 0 | ||
| 171 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
| 172 | { | ||
| 173 | ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
| 174 | return 0; | ||
| 175 | } | ||
| 176 | if (group->meth != point->meth) | ||
| 177 | { | ||
| 178 | ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS); | ||
| 179 | return 0; | ||
| 180 | } | ||
| 181 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
| 182 | { | ||
| 183 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
| 184 | return ec_GFp_simple_oct2point(group, point, | ||
| 185 | buf, len, ctx); | ||
| 186 | else | ||
| 187 | #ifdef OPENSSL_NO_EC2M | ||
| 188 | { | ||
| 189 | ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED); | ||
| 190 | return 0; | ||
| 191 | } | ||
| 192 | #else | ||
| 193 | return ec_GF2m_simple_oct2point(group, point, | ||
| 194 | buf, len, ctx); | ||
| 195 | #endif | ||
| 196 | } | ||
| 197 | return group->meth->oct2point(group, point, buf, len, ctx); | ||
| 198 | } | ||
| 199 | |||
diff --git a/src/lib/libcrypto/ec/ec_pmeth.c b/src/lib/libcrypto/ec/ec_pmeth.c index f433076ca1..d1ed66c37e 100644 --- a/src/lib/libcrypto/ec/ec_pmeth.c +++ b/src/lib/libcrypto/ec/ec_pmeth.c | |||
| @@ -221,6 +221,7 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
| 221 | 221 | ||
| 222 | case EVP_PKEY_CTRL_MD: | 222 | case EVP_PKEY_CTRL_MD: |
| 223 | if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 && | 223 | if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 && |
| 224 | EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 && | ||
| 224 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && | 225 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && |
| 225 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && | 226 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && |
| 226 | EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && | 227 | EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && |
diff --git a/src/lib/libcrypto/ec/eck_prn.c b/src/lib/libcrypto/ec/eck_prn.c index 7d3e175ae7..06de8f3959 100644 --- a/src/lib/libcrypto/ec/eck_prn.c +++ b/src/lib/libcrypto/ec/eck_prn.c | |||
| @@ -207,7 +207,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off) | |||
| 207 | reason = ERR_R_MALLOC_FAILURE; | 207 | reason = ERR_R_MALLOC_FAILURE; |
| 208 | goto err; | 208 | goto err; |
| 209 | } | 209 | } |
| 210 | 210 | #ifndef OPENSSL_NO_EC2M | |
| 211 | if (is_char_two) | 211 | if (is_char_two) |
| 212 | { | 212 | { |
| 213 | if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx)) | 213 | if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx)) |
| @@ -217,6 +217,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off) | |||
| 217 | } | 217 | } |
| 218 | } | 218 | } |
| 219 | else /* prime field */ | 219 | else /* prime field */ |
| 220 | #endif | ||
| 220 | { | 221 | { |
| 221 | if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx)) | 222 | if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx)) |
| 222 | { | 223 | { |
diff --git a/src/lib/libcrypto/ec/ecp_nistp224.c b/src/lib/libcrypto/ec/ecp_nistp224.c new file mode 100644 index 0000000000..b5ff56c252 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp224.c | |||
| @@ -0,0 +1,1658 @@ | |||
| 1 | /* crypto/ec/ecp_nistp224.c */ | ||
| 2 | /* | ||
| 3 | * Written by Emilia Kasper (Google) for the OpenSSL project. | ||
| 4 | */ | ||
| 5 | /* Copyright 2011 Google Inc. | ||
| 6 | * | ||
| 7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 8 | * | ||
| 9 | * you may not use this file except in compliance with the License. | ||
| 10 | * You may obtain a copy of the License at | ||
| 11 | * | ||
| 12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
| 13 | * | ||
| 14 | * Unless required by applicable law or agreed to in writing, software | ||
| 15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
| 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 17 | * See the License for the specific language governing permissions and | ||
| 18 | * limitations under the License. | ||
| 19 | */ | ||
| 20 | |||
| 21 | /* | ||
| 22 | * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication | ||
| 23 | * | ||
| 24 | * Inspired by Daniel J. Bernstein's public domain nistp224 implementation | ||
| 25 | * and Adam Langley's public domain 64-bit C implementation of curve25519 | ||
| 26 | */ | ||
| 27 | |||
| 28 | #include <openssl/opensslconf.h> | ||
| 29 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
| 30 | |||
| 31 | #ifndef OPENSSL_SYS_VMS | ||
| 32 | #include <stdint.h> | ||
| 33 | #else | ||
| 34 | #include <inttypes.h> | ||
| 35 | #endif | ||
| 36 | |||
| 37 | #include <string.h> | ||
| 38 | #include <openssl/err.h> | ||
| 39 | #include "ec_lcl.h" | ||
| 40 | |||
| 41 | #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) | ||
| 42 | /* even with gcc, the typedef won't work for 32-bit platforms */ | ||
| 43 | typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ | ||
| 44 | #else | ||
| 45 | #error "Need GCC 3.1 or later to define type uint128_t" | ||
| 46 | #endif | ||
| 47 | |||
| 48 | typedef uint8_t u8; | ||
| 49 | typedef uint64_t u64; | ||
| 50 | typedef int64_t s64; | ||
| 51 | |||
| 52 | |||
| 53 | /******************************************************************************/ | ||
| 54 | /* INTERNAL REPRESENTATION OF FIELD ELEMENTS | ||
| 55 | * | ||
| 56 | * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3 | ||
| 57 | * using 64-bit coefficients called 'limbs', | ||
| 58 | * and sometimes (for multiplication results) as | ||
| 59 | * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6 | ||
| 60 | * using 128-bit coefficients called 'widelimbs'. | ||
| 61 | * A 4-limb representation is an 'felem'; | ||
| 62 | * a 7-widelimb representation is a 'widefelem'. | ||
| 63 | * Even within felems, bits of adjacent limbs overlap, and we don't always | ||
| 64 | * reduce the representations: we ensure that inputs to each felem | ||
| 65 | * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60, | ||
| 66 | * and fit into a 128-bit word without overflow. The coefficients are then | ||
| 67 | * again partially reduced to obtain an felem satisfying a_i < 2^57. | ||
| 68 | * We only reduce to the unique minimal representation at the end of the | ||
| 69 | * computation. | ||
| 70 | */ | ||
| 71 | |||
| 72 | typedef uint64_t limb; | ||
| 73 | typedef uint128_t widelimb; | ||
| 74 | |||
| 75 | typedef limb felem[4]; | ||
| 76 | typedef widelimb widefelem[7]; | ||
| 77 | |||
| 78 | /* Field element represented as a byte arrary. | ||
| 79 | * 28*8 = 224 bits is also the group order size for the elliptic curve, | ||
| 80 | * and we also use this type for scalars for point multiplication. | ||
| 81 | */ | ||
| 82 | typedef u8 felem_bytearray[28]; | ||
| 83 | |||
| 84 | static const felem_bytearray nistp224_curve_params[5] = { | ||
| 85 | {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, /* p */ | ||
| 86 | 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00, | ||
| 87 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01}, | ||
| 88 | {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, /* a */ | ||
| 89 | 0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFF,0xFF, | ||
| 90 | 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE}, | ||
| 91 | {0xB4,0x05,0x0A,0x85,0x0C,0x04,0xB3,0xAB,0xF5,0x41, /* b */ | ||
| 92 | 0x32,0x56,0x50,0x44,0xB0,0xB7,0xD7,0xBF,0xD8,0xBA, | ||
| 93 | 0x27,0x0B,0x39,0x43,0x23,0x55,0xFF,0xB4}, | ||
| 94 | {0xB7,0x0E,0x0C,0xBD,0x6B,0xB4,0xBF,0x7F,0x32,0x13, /* x */ | ||
| 95 | 0x90,0xB9,0x4A,0x03,0xC1,0xD3,0x56,0xC2,0x11,0x22, | ||
| 96 | 0x34,0x32,0x80,0xD6,0x11,0x5C,0x1D,0x21}, | ||
| 97 | {0xbd,0x37,0x63,0x88,0xb5,0xf7,0x23,0xfb,0x4c,0x22, /* y */ | ||
| 98 | 0xdf,0xe6,0xcd,0x43,0x75,0xa0,0x5a,0x07,0x47,0x64, | ||
| 99 | 0x44,0xd5,0x81,0x99,0x85,0x00,0x7e,0x34} | ||
| 100 | }; | ||
| 101 | |||
| 102 | /* Precomputed multiples of the standard generator | ||
| 103 | * Points are given in coordinates (X, Y, Z) where Z normally is 1 | ||
| 104 | * (0 for the point at infinity). | ||
| 105 | * For each field element, slice a_0 is word 0, etc. | ||
| 106 | * | ||
| 107 | * The table has 2 * 16 elements, starting with the following: | ||
| 108 | * index | bits | point | ||
| 109 | * ------+---------+------------------------------ | ||
| 110 | * 0 | 0 0 0 0 | 0G | ||
| 111 | * 1 | 0 0 0 1 | 1G | ||
| 112 | * 2 | 0 0 1 0 | 2^56G | ||
| 113 | * 3 | 0 0 1 1 | (2^56 + 1)G | ||
| 114 | * 4 | 0 1 0 0 | 2^112G | ||
| 115 | * 5 | 0 1 0 1 | (2^112 + 1)G | ||
| 116 | * 6 | 0 1 1 0 | (2^112 + 2^56)G | ||
| 117 | * 7 | 0 1 1 1 | (2^112 + 2^56 + 1)G | ||
| 118 | * 8 | 1 0 0 0 | 2^168G | ||
| 119 | * 9 | 1 0 0 1 | (2^168 + 1)G | ||
| 120 | * 10 | 1 0 1 0 | (2^168 + 2^56)G | ||
| 121 | * 11 | 1 0 1 1 | (2^168 + 2^56 + 1)G | ||
| 122 | * 12 | 1 1 0 0 | (2^168 + 2^112)G | ||
| 123 | * 13 | 1 1 0 1 | (2^168 + 2^112 + 1)G | ||
| 124 | * 14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G | ||
| 125 | * 15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G | ||
| 126 | * followed by a copy of this with each element multiplied by 2^28. | ||
| 127 | * | ||
| 128 | * The reason for this is so that we can clock bits into four different | ||
| 129 | * locations when doing simple scalar multiplies against the base point, | ||
| 130 | * and then another four locations using the second 16 elements. | ||
| 131 | */ | ||
| 132 | static const felem gmul[2][16][3] = | ||
| 133 | {{{{0, 0, 0, 0}, | ||
| 134 | {0, 0, 0, 0}, | ||
| 135 | {0, 0, 0, 0}}, | ||
| 136 | {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf}, | ||
| 137 | {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723}, | ||
| 138 | {1, 0, 0, 0}}, | ||
| 139 | {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5}, | ||
| 140 | {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321}, | ||
| 141 | {1, 0, 0, 0}}, | ||
| 142 | {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748}, | ||
| 143 | {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17}, | ||
| 144 | {1, 0, 0, 0}}, | ||
| 145 | {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe}, | ||
| 146 | {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b}, | ||
| 147 | {1, 0, 0, 0}}, | ||
| 148 | {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3}, | ||
| 149 | {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a}, | ||
| 150 | {1, 0, 0, 0}}, | ||
| 151 | {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c}, | ||
| 152 | {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244}, | ||
| 153 | {1, 0, 0, 0}}, | ||
| 154 | {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849}, | ||
| 155 | {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112}, | ||
| 156 | {1, 0, 0, 0}}, | ||
| 157 | {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47}, | ||
| 158 | {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394}, | ||
| 159 | {1, 0, 0, 0}}, | ||
| 160 | {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d}, | ||
| 161 | {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7}, | ||
| 162 | {1, 0, 0, 0}}, | ||
| 163 | {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24}, | ||
| 164 | {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881}, | ||
| 165 | {1, 0, 0, 0}}, | ||
| 166 | {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984}, | ||
| 167 | {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369}, | ||
| 168 | {1, 0, 0, 0}}, | ||
| 169 | {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3}, | ||
| 170 | {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60}, | ||
| 171 | {1, 0, 0, 0}}, | ||
| 172 | {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057}, | ||
| 173 | {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9}, | ||
| 174 | {1, 0, 0, 0}}, | ||
| 175 | {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9}, | ||
| 176 | {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc}, | ||
| 177 | {1, 0, 0, 0}}, | ||
| 178 | {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58}, | ||
| 179 | {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558}, | ||
| 180 | {1, 0, 0, 0}}}, | ||
| 181 | {{{0, 0, 0, 0}, | ||
| 182 | {0, 0, 0, 0}, | ||
| 183 | {0, 0, 0, 0}}, | ||
| 184 | {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31}, | ||
| 185 | {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d}, | ||
| 186 | {1, 0, 0, 0}}, | ||
| 187 | {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3}, | ||
| 188 | {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a}, | ||
| 189 | {1, 0, 0, 0}}, | ||
| 190 | {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33}, | ||
| 191 | {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100}, | ||
| 192 | {1, 0, 0, 0}}, | ||
| 193 | {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5}, | ||
| 194 | {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea}, | ||
| 195 | {1, 0, 0, 0}}, | ||
| 196 | {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be}, | ||
| 197 | {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51}, | ||
| 198 | {1, 0, 0, 0}}, | ||
| 199 | {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1}, | ||
| 200 | {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb}, | ||
| 201 | {1, 0, 0, 0}}, | ||
| 202 | {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233}, | ||
| 203 | {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def}, | ||
| 204 | {1, 0, 0, 0}}, | ||
| 205 | {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae}, | ||
| 206 | {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45}, | ||
| 207 | {1, 0, 0, 0}}, | ||
| 208 | {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e}, | ||
| 209 | {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb}, | ||
| 210 | {1, 0, 0, 0}}, | ||
| 211 | {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de}, | ||
| 212 | {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3}, | ||
| 213 | {1, 0, 0, 0}}, | ||
| 214 | {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05}, | ||
| 215 | {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58}, | ||
| 216 | {1, 0, 0, 0}}, | ||
| 217 | {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb}, | ||
| 218 | {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0}, | ||
| 219 | {1, 0, 0, 0}}, | ||
| 220 | {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9}, | ||
| 221 | {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea}, | ||
| 222 | {1, 0, 0, 0}}, | ||
| 223 | {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba}, | ||
| 224 | {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405}, | ||
| 225 | {1, 0, 0, 0}}, | ||
| 226 | {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e}, | ||
| 227 | {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e}, | ||
| 228 | {1, 0, 0, 0}}}}; | ||
| 229 | |||
| 230 | /* Precomputation for the group generator. */ | ||
| 231 | typedef struct { | ||
| 232 | felem g_pre_comp[2][16][3]; | ||
| 233 | int references; | ||
| 234 | } NISTP224_PRE_COMP; | ||
| 235 | |||
| 236 | const EC_METHOD *EC_GFp_nistp224_method(void) | ||
| 237 | { | ||
| 238 | static const EC_METHOD ret = { | ||
| 239 | EC_FLAGS_DEFAULT_OCT, | ||
| 240 | NID_X9_62_prime_field, | ||
| 241 | ec_GFp_nistp224_group_init, | ||
| 242 | ec_GFp_simple_group_finish, | ||
| 243 | ec_GFp_simple_group_clear_finish, | ||
| 244 | ec_GFp_nist_group_copy, | ||
| 245 | ec_GFp_nistp224_group_set_curve, | ||
| 246 | ec_GFp_simple_group_get_curve, | ||
| 247 | ec_GFp_simple_group_get_degree, | ||
| 248 | ec_GFp_simple_group_check_discriminant, | ||
| 249 | ec_GFp_simple_point_init, | ||
| 250 | ec_GFp_simple_point_finish, | ||
| 251 | ec_GFp_simple_point_clear_finish, | ||
| 252 | ec_GFp_simple_point_copy, | ||
| 253 | ec_GFp_simple_point_set_to_infinity, | ||
| 254 | ec_GFp_simple_set_Jprojective_coordinates_GFp, | ||
| 255 | ec_GFp_simple_get_Jprojective_coordinates_GFp, | ||
| 256 | ec_GFp_simple_point_set_affine_coordinates, | ||
| 257 | ec_GFp_nistp224_point_get_affine_coordinates, | ||
| 258 | 0 /* point_set_compressed_coordinates */, | ||
| 259 | 0 /* point2oct */, | ||
| 260 | 0 /* oct2point */, | ||
| 261 | ec_GFp_simple_add, | ||
| 262 | ec_GFp_simple_dbl, | ||
| 263 | ec_GFp_simple_invert, | ||
| 264 | ec_GFp_simple_is_at_infinity, | ||
| 265 | ec_GFp_simple_is_on_curve, | ||
| 266 | ec_GFp_simple_cmp, | ||
| 267 | ec_GFp_simple_make_affine, | ||
| 268 | ec_GFp_simple_points_make_affine, | ||
| 269 | ec_GFp_nistp224_points_mul, | ||
| 270 | ec_GFp_nistp224_precompute_mult, | ||
| 271 | ec_GFp_nistp224_have_precompute_mult, | ||
| 272 | ec_GFp_nist_field_mul, | ||
| 273 | ec_GFp_nist_field_sqr, | ||
| 274 | 0 /* field_div */, | ||
| 275 | 0 /* field_encode */, | ||
| 276 | 0 /* field_decode */, | ||
| 277 | 0 /* field_set_to_one */ }; | ||
| 278 | |||
| 279 | return &ret; | ||
| 280 | } | ||
| 281 | |||
| 282 | /* Helper functions to convert field elements to/from internal representation */ | ||
| 283 | static void bin28_to_felem(felem out, const u8 in[28]) | ||
| 284 | { | ||
| 285 | out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff; | ||
| 286 | out[1] = (*((const uint64_t *)(in+7))) & 0x00ffffffffffffff; | ||
| 287 | out[2] = (*((const uint64_t *)(in+14))) & 0x00ffffffffffffff; | ||
| 288 | out[3] = (*((const uint64_t *)(in+21))) & 0x00ffffffffffffff; | ||
| 289 | } | ||
| 290 | |||
| 291 | static void felem_to_bin28(u8 out[28], const felem in) | ||
| 292 | { | ||
| 293 | unsigned i; | ||
| 294 | for (i = 0; i < 7; ++i) | ||
| 295 | { | ||
| 296 | out[i] = in[0]>>(8*i); | ||
| 297 | out[i+7] = in[1]>>(8*i); | ||
| 298 | out[i+14] = in[2]>>(8*i); | ||
| 299 | out[i+21] = in[3]>>(8*i); | ||
| 300 | } | ||
| 301 | } | ||
| 302 | |||
| 303 | /* To preserve endianness when using BN_bn2bin and BN_bin2bn */ | ||
| 304 | static void flip_endian(u8 *out, const u8 *in, unsigned len) | ||
| 305 | { | ||
| 306 | unsigned i; | ||
| 307 | for (i = 0; i < len; ++i) | ||
| 308 | out[i] = in[len-1-i]; | ||
| 309 | } | ||
| 310 | |||
/* From OpenSSL BIGNUM to internal representation */
/*
 * Convert |bn| into four 56-bit limbs in |out|.
 * |bn| must be non-negative and at most 28 bytes (i.e. < 2^224);
 * otherwise EC_R_BIGNUM_OUT_OF_RANGE is raised.
 * Returns 1 on success, 0 on failure.
 */
static int BN_to_felem(felem out, const BIGNUM *bn)
	{
	felem_bytearray b_in;
	felem_bytearray b_out;
	unsigned num_bytes;

	/* BN_bn2bin eats leading zeroes */
	memset(b_out, 0, sizeof b_out);
	num_bytes = BN_num_bytes(bn);
	if (num_bytes > sizeof b_out)
		{
		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
		return 0;
		}
	if (BN_is_negative(bn))
		{
		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
		return 0;
		}
	num_bytes = BN_bn2bin(bn, b_in);
	/* BN_bn2bin is big-endian; convert to the little-endian layout */
	flip_endian(b_out, b_in, num_bytes);
	bin28_to_felem(out, b_out);
	return 1;
	}
| 336 | |||
/* From internal representation to OpenSSL BIGNUM */
/*
 * Returns the result of BN_bin2bn: |out| on success, NULL on error.
 */
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
	{
	felem_bytearray b_in, b_out;
	felem_to_bin28(b_in, in);
	/* BN_bin2bn expects big-endian input */
	flip_endian(b_out, b_in, sizeof b_out);
	return BN_bin2bn(b_out, sizeof b_out, out);
	}
| 345 | |||
| 346 | /******************************************************************************/ | ||
| 347 | /* FIELD OPERATIONS | ||
| 348 | * | ||
| 349 | * Field operations, using the internal representation of field elements. | ||
| 350 | * NB! These operations are specific to our point multiplication and cannot be | ||
| 351 | * expected to be correct in general - e.g., multiplication with a large scalar | ||
| 352 | * will cause an overflow. | ||
| 353 | * | ||
| 354 | */ | ||
| 355 | |||
| 356 | static void felem_one(felem out) | ||
| 357 | { | ||
| 358 | out[0] = 1; | ||
| 359 | out[1] = 0; | ||
| 360 | out[2] = 0; | ||
| 361 | out[3] = 0; | ||
| 362 | } | ||
| 363 | |||
| 364 | static void felem_assign(felem out, const felem in) | ||
| 365 | { | ||
| 366 | out[0] = in[0]; | ||
| 367 | out[1] = in[1]; | ||
| 368 | out[2] = in[2]; | ||
| 369 | out[3] = in[3]; | ||
| 370 | } | ||
| 371 | |||
| 372 | /* Sum two field elements: out += in */ | ||
| 373 | static void felem_sum(felem out, const felem in) | ||
| 374 | { | ||
| 375 | out[0] += in[0]; | ||
| 376 | out[1] += in[1]; | ||
| 377 | out[2] += in[2]; | ||
| 378 | out[3] += in[3]; | ||
| 379 | } | ||
| 380 | |||
/* Get negative value: out = -in */
/* Assumes in[i] < 2^57 */
static void felem_neg(felem out, const felem in)
	{
	/* These constants are the limbs of a representation of
	 * 0 mod 2^224-2^96+1, chosen large enough that subtracting any
	 * in[i] < 2^57 never underflows. */
	static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
	static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
	static const limb two58m42m2 = (((limb) 1) << 58) -
		(((limb) 1) << 42) - (((limb) 1) << 2);

	/* Set to 0 mod 2^224-2^96+1 to ensure out > in */
	out[0] = two58p2 - in[0];
	out[1] = two58m42m2 - in[1];
	out[2] = two58m2 - in[2];
	out[3] = two58m2 - in[3];
	}
| 396 | |||
/* Subtract field elements: out -= in */
/* Assumes in[i] < 2^57 */
static void felem_diff(felem out, const felem in)
	{
	/* Limbs of a representation of 0 mod 2^224-2^96+1 (see felem_neg);
	 * added first so the per-limb subtractions cannot underflow. */
	static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
	static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
	static const limb two58m42m2 = (((limb) 1) << 58) -
		(((limb) 1) << 42) - (((limb) 1) << 2);

	/* Add 0 mod 2^224-2^96+1 to ensure out > in */
	out[0] += two58p2;
	out[1] += two58m42m2;
	out[2] += two58m2;
	out[3] += two58m2;

	out[0] -= in[0];
	out[1] -= in[1];
	out[2] -= in[2];
	out[3] -= in[3];
	}
| 417 | |||
/* Subtract in unreduced 128-bit mode: out -= in */
/* Assumes in[i] < 2^119 */
static void widefelem_diff(widefelem out, const widefelem in)
	{
	/* Limbs of a representation of 0 mod 2^224-2^96+1 in the 7-limb
	 * wide (unreduced) form, large enough that subtracting
	 * in[i] < 2^119 never underflows. */
	static const widelimb two120 = ((widelimb) 1) << 120;
	static const widelimb two120m64 = (((widelimb) 1) << 120) -
		(((widelimb) 1) << 64);
	static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
		(((widelimb) 1) << 104) - (((widelimb) 1) << 64);

	/* Add 0 mod 2^224-2^96+1 to ensure out > in */
	out[0] += two120;
	out[1] += two120m64;
	out[2] += two120m64;
	out[3] += two120;
	out[4] += two120m104m64;
	out[5] += two120m64;
	out[6] += two120m64;

	out[0] -= in[0];
	out[1] -= in[1];
	out[2] -= in[2];
	out[3] -= in[3];
	out[4] -= in[4];
	out[5] -= in[5];
	out[6] -= in[6];
	}
| 445 | |||
/* Subtract in mixed mode: out128 -= in64 */
/* in[i] < 2^63 */
static void felem_diff_128_64(widefelem out, const felem in)
	{
	/* Limbs of a representation of 0 mod 2^224-2^96+1, sized so that
	 * subtracting in[i] < 2^63 from the wide limbs never underflows. */
	static const widelimb two64p8 = (((widelimb) 1) << 64) +
		(((widelimb) 1) << 8);
	static const widelimb two64m8 = (((widelimb) 1) << 64) -
		(((widelimb) 1) << 8);
	static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
		(((widelimb) 1) << 48) - (((widelimb) 1) << 8);

	/* Add 0 mod 2^224-2^96+1 to ensure out > in */
	out[0] += two64p8;
	out[1] += two64m48m8;
	out[2] += two64m8;
	out[3] += two64m8;

	out[0] -= in[0];
	out[1] -= in[1];
	out[2] -= in[2];
	out[3] -= in[3];
	}
| 468 | |||
| 469 | /* Multiply a field element by a scalar: out = out * scalar | ||
| 470 | * The scalars we actually use are small, so results fit without overflow */ | ||
| 471 | static void felem_scalar(felem out, const limb scalar) | ||
| 472 | { | ||
| 473 | out[0] *= scalar; | ||
| 474 | out[1] *= scalar; | ||
| 475 | out[2] *= scalar; | ||
| 476 | out[3] *= scalar; | ||
| 477 | } | ||
| 478 | |||
| 479 | /* Multiply an unreduced field element by a scalar: out = out * scalar | ||
| 480 | * The scalars we actually use are small, so results fit without overflow */ | ||
| 481 | static void widefelem_scalar(widefelem out, const widelimb scalar) | ||
| 482 | { | ||
| 483 | out[0] *= scalar; | ||
| 484 | out[1] *= scalar; | ||
| 485 | out[2] *= scalar; | ||
| 486 | out[3] *= scalar; | ||
| 487 | out[4] *= scalar; | ||
| 488 | out[5] *= scalar; | ||
| 489 | out[6] *= scalar; | ||
| 490 | } | ||
| 491 | |||
/* Square a field element: out = in^2 */
/* Schoolbook squaring into seven unreduced 128-bit coefficients;
 * tmp0..tmp2 pre-double the limbs so each symmetric cross product
 * in[i]*in[j] (i != j) is counted twice with a single multiply. */
static void felem_square(widefelem out, const felem in)
	{
	limb tmp0, tmp1, tmp2;
	tmp0 = 2 * in[0]; tmp1 = 2 * in[1]; tmp2 = 2 * in[2];
	out[0] = ((widelimb) in[0]) * in[0];
	out[1] = ((widelimb) in[0]) * tmp1;
	out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
	out[3] = ((widelimb) in[3]) * tmp0 +
		((widelimb) in[1]) * tmp2;
	out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
	out[5] = ((widelimb) in[3]) * tmp2;
	out[6] = ((widelimb) in[3]) * in[3];
	}
| 506 | |||
/* Multiply two field elements: out = in1 * in2 */
/* Schoolbook 4x4 multiplication into seven unreduced 128-bit
 * coefficients; out[k] collects every product in1[i]*in2[j] with
 * i + j == k. Caller is expected to felem_reduce the result. */
static void felem_mul(widefelem out, const felem in1, const felem in2)
	{
	out[0] = ((widelimb) in1[0]) * in2[0];
	out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
	out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
		((widelimb) in1[2]) * in2[0];
	out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
		((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
	out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
		((widelimb) in1[3]) * in2[1];
	out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
	out[6] = ((widelimb) in1[3]) * in2[3];
	}
| 521 | |||
/* Reduce seven 128-bit coefficients to four 64-bit coefficients.
 * Requires in[i] < 2^126,
 * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
/* The high limbs are folded down using 2^224 = 2^96 - 1 (mod p) for
 * p = 2^224 - 2^96 + 1: with 56-bit limbs that identity turns a
 * coefficient at position k+4 into "+ at position k+1 (shifted by 40
 * bits, i.e. 2^96) and - at position k". */
static void felem_reduce(felem out, const widefelem in)
	{
	static const widelimb two127p15 = (((widelimb) 1) << 127) +
		(((widelimb) 1) << 15);
	static const widelimb two127m71 = (((widelimb) 1) << 127) -
		(((widelimb) 1) << 71);
	static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
		(((widelimb) 1) << 71) - (((widelimb) 1) << 55);
	widelimb output[5];

	/* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
	output[0] = in[0] + two127p15;
	output[1] = in[1] + two127m71m55;
	output[2] = in[2] + two127m71;
	output[3] = in[3];
	output[4] = in[4];

	/* Eliminate in[4], in[5], in[6] */
	output[4] += in[6] >> 16;
	output[3] += (in[6] & 0xffff) << 40;
	output[2] -= in[6];

	output[3] += in[5] >> 16;
	output[2] += (in[5] & 0xffff) << 40;
	output[1] -= in[5];

	output[2] += output[4] >> 16;
	output[1] += (output[4] & 0xffff) << 40;
	output[0] -= output[4];

	/* Carry 2 -> 3 -> 4 */
	output[3] += output[2] >> 56;
	output[2] &= 0x00ffffffffffffff;

	output[4] = output[3] >> 56;
	output[3] &= 0x00ffffffffffffff;

	/* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */

	/* Eliminate output[4] */
	output[2] += output[4] >> 16;
	/* output[2] < 2^56 + 2^56 = 2^57 */
	output[1] += (output[4] & 0xffff) << 40;
	output[0] -= output[4];

	/* Carry 0 -> 1 -> 2 -> 3 */
	output[1] += output[0] >> 56;
	out[0] = output[0] & 0x00ffffffffffffff;

	output[2] += output[1] >> 56;
	/* output[2] < 2^57 + 2^72 */
	out[1] = output[1] & 0x00ffffffffffffff;
	output[3] += output[2] >> 56;
	/* output[3] <= 2^56 + 2^16 */
	out[2] = output[2] & 0x00ffffffffffffff;

	/* out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
	 * out[3] <= 2^56 + 2^16 (due to final carry),
	 * so out < 2*p */
	out[3] = output[3];
	}
| 586 | |||
| 587 | static void felem_square_reduce(felem out, const felem in) | ||
| 588 | { | ||
| 589 | widefelem tmp; | ||
| 590 | felem_square(tmp, in); | ||
| 591 | felem_reduce(out, tmp); | ||
| 592 | } | ||
| 593 | |||
| 594 | static void felem_mul_reduce(felem out, const felem in1, const felem in2) | ||
| 595 | { | ||
| 596 | widefelem tmp; | ||
| 597 | felem_mul(tmp, in1, in2); | ||
| 598 | felem_reduce(out, tmp); | ||
| 599 | } | ||
| 600 | |||
/* Reduce to unique minimal representation.
 * Requires 0 <= in < 2*p (always call felem_reduce first) */
/* Runs in constant time: the conditional subtraction of p is done
 * with arithmetic masks rather than branches. */
static void felem_contract(felem out, const felem in)
	{
	static const int64_t two56 = ((limb) 1) << 56;
	/* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
	/* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
	int64_t tmp[4], a;
	tmp[0] = in[0];
	tmp[1] = in[1];
	tmp[2] = in[2];
	tmp[3] = in[3];
	/* Case 1: a = 1 iff in >= 2^224 */
	a = (in[3] >> 56);
	tmp[0] -= a;
	tmp[1] += a << 40;
	tmp[3] &= 0x00ffffffffffffff;
	/* Case 2: a = 0 iff p <= in < 2^224, i.e.,
	 * the high 128 bits are all 1 and the lower part is non-zero */
	a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
		(((int64_t)(in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
	a &= 0x00ffffffffffffff;
	/* turn a into an all-one mask (if a = 0) or an all-zero mask */
	a = (a - 1) >> 63;
	/* subtract 2^224 - 2^96 + 1 if a is all-one*/
	tmp[3] &= a ^ 0xffffffffffffffff;
	tmp[2] &= a ^ 0xffffffffffffffff;
	tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
	tmp[0] -= 1 & a;

	/* eliminate negative coefficients: if tmp[0] is negative, tmp[1] must
	 * be non-zero, so we only need one step */
	a = tmp[0] >> 63;
	tmp[0] += two56 & a;
	tmp[1] -= 1 & a;

	/* carry 1 -> 2 -> 3 */
	tmp[2] += tmp[1] >> 56;
	tmp[1] &= 0x00ffffffffffffff;

	tmp[3] += tmp[2] >> 56;
	tmp[2] &= 0x00ffffffffffffff;

	/* Now 0 <= out < p */
	out[0] = tmp[0];
	out[1] = tmp[1];
	out[2] = tmp[2];
	out[3] = tmp[3];
	}
| 650 | |||
/* Zero-check: returns 1 if input is 0, and 0 otherwise.
 * We know that field elements are reduced to in < 2^225,
 * so we only need to check three cases: 0, 2^224 - 2^96 + 1,
 * and 2^225 - 2^97 + 2 */
/* Constant time: each candidate is compared by XOR-ing its limbs and
 * collapsing "all limbs equal" into a single bit via (x - 1) >> 63. */
static limb felem_is_zero(const felem in)
	{
	limb zero, two224m96p1, two225m97p2;

	zero = in[0] | in[1] | in[2] | in[3];
	zero = (((int64_t)(zero) - 1) >> 63) & 1;
	/* limbs of p = 2^224 - 2^96 + 1 */
	two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
		| (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
	two224m96p1 = (((int64_t)(two224m96p1) - 1) >> 63) & 1;
	/* limbs of 2*p = 2^225 - 2^97 + 2 */
	two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
		| (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
	two225m97p2 = (((int64_t)(two225m97p2) - 1) >> 63) & 1;
	return (zero | two224m96p1 | two225m97p2);
	}
| 669 | |||
| 670 | static limb felem_is_zero_int(const felem in) | ||
| 671 | { | ||
| 672 | return (int) (felem_is_zero(in) & ((limb)1)); | ||
| 673 | } | ||
| 674 | |||
/* Invert a field element */
/* Computation chain copied from djb's code */
/* Computes out = in^(p-2) mod p, with p-2 = 2^224 - 2^96 - 1 (see the
 * final step's annotation); by Fermat's little theorem this is the
 * inverse for non-zero in. The trailing comments track the exponent
 * accumulated so far. Fixed sequence of operations: constant time. */
static void felem_inv(felem out, const felem in)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4;
	widefelem tmp;
	unsigned i;

	felem_square(tmp, in); felem_reduce(ftmp, tmp);		/* 2 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^2 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp);	/* 2^4 - 2 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^5 - 4 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^6 - 8 */
	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp);	/* 2^6 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp);	/* 2^7 - 2 */
	for (i = 0; i < 5; ++i)	/* 2^12 - 2^6 */
		{
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
		}
	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp2, tmp);	/* 2^12 - 1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^13 - 2 */
	for (i = 0; i < 11; ++i)	/* 2^24 - 2^12 */
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^24 - 1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^25 - 2 */
	for (i = 0; i < 23; ++i)	/* 2^48 - 2^24 */
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^48 - 1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp);	/* 2^49 - 2 */
	for (i = 0; i < 47; ++i)	/* 2^96 - 2^48 */
		{
		felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);
		}
	felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);	/* 2^96 - 1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp);	/* 2^97 - 2 */
	for (i = 0; i < 23; ++i)	/* 2^120 - 2^24 */
		{
		felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);
		}
	felem_mul(tmp, ftmp2, ftmp4); felem_reduce(ftmp2, tmp);	/* 2^120 - 1 */
	for (i = 0; i < 6; ++i)	/* 2^126 - 2^6 */
		{
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
		}
	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp);	/* 2^126 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^127 - 2 */
	felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp);	/* 2^127 - 1 */
	for (i = 0; i < 97; ++i)	/* 2^224 - 2^97 */
		{
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
		}
	felem_mul(tmp, ftmp, ftmp3); felem_reduce(out, tmp);	/* 2^224 - 2^96 - 1 */
	}
| 734 | |||
| 735 | /* Copy in constant time: | ||
| 736 | * if icopy == 1, copy in to out, | ||
| 737 | * if icopy == 0, copy out to itself. */ | ||
| 738 | static void | ||
| 739 | copy_conditional(felem out, const felem in, limb icopy) | ||
| 740 | { | ||
| 741 | unsigned i; | ||
| 742 | /* icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one */ | ||
| 743 | const limb copy = -icopy; | ||
| 744 | for (i = 0; i < 4; ++i) | ||
| 745 | { | ||
| 746 | const limb tmp = copy & (in[i] ^ out[i]); | ||
| 747 | out[i] ^= tmp; | ||
| 748 | } | ||
| 749 | } | ||
| 750 | |||
| 751 | /******************************************************************************/ | ||
| 752 | /* ELLIPTIC CURVE POINT OPERATIONS | ||
| 753 | * | ||
| 754 | * Points are represented in Jacobian projective coordinates: | ||
| 755 | * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3), | ||
| 756 | * or to the point at infinity if Z == 0. | ||
| 757 | * | ||
| 758 | */ | ||
| 759 | |||
/* Double an elliptic curve point:
 * (X', Y', Z') = 2 * (X, Y, Z), where
 * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
 * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested). */
/* The interleaved bound comments track limb magnitudes to show each
 * intermediate stays within what felem_reduce accepts (< 2^126). */
static void
point_double(felem x_out, felem y_out, felem z_out,
	     const felem x_in, const felem y_in, const felem z_in)
	{
	widefelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;

	felem_assign(ftmp, x_in);
	felem_assign(ftmp2, x_in);

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);

	/* beta = x*gamma */
	felem_mul(tmp, x_in, gamma);
	felem_reduce(beta, tmp);

	/* alpha = 3*(x-delta)*(x+delta) */
	felem_diff(ftmp, delta);
	/* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
	felem_sum(ftmp2, delta);
	/* ftmp2[i] < 2^57 + 2^57 = 2^58 */
	felem_scalar(ftmp2, 3);
	/* ftmp2[i] < 3 * 2^58 < 2^60 */
	felem_mul(tmp, ftmp, ftmp2);
	/* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
	felem_reduce(alpha, tmp);

	/* x' = alpha^2 - 8*beta */
	felem_square(tmp, alpha);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
	felem_assign(ftmp, beta);
	felem_scalar(ftmp, 8);
	/* ftmp[i] < 8 * 2^57 = 2^60 */
	felem_diff_128_64(tmp, ftmp);
	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
	felem_reduce(x_out, tmp);

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum(delta, gamma);
	/* delta[i] < 2^57 + 2^57 = 2^58 */
	felem_assign(ftmp, y_in);
	felem_sum(ftmp, z_in);
	/* ftmp[i] < 2^57 + 2^57 = 2^58 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
	felem_diff_128_64(tmp, delta);
	/* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
	felem_reduce(z_out, tmp);

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar(beta, 4);
	/* beta[i] < 4 * 2^57 = 2^59 */
	felem_diff(beta, x_out);
	/* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
	felem_mul(tmp, alpha, beta);
	/* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
	felem_square(tmp2, gamma);
	/* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
	widefelem_scalar(tmp2, 8);
	/* tmp2[i] < 8 * 2^116 = 2^119 */
	widefelem_diff(tmp, tmp2);
	/* tmp[i] < 2^119 + 2^120 < 2^121 */
	felem_reduce(y_out, tmp);
	}
| 837 | |||
/* Add two elliptic curve points:
 * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 * Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 *
 * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 */

/* This function is not entirely constant-time:
 * it includes a branch for checking whether the two input points are equal,
 * (while not equal to the point at infinity).
 * This case never happens during single point multiplication,
 * so there is no timing leak for ECDH or ECDSA signing. */
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const felem x2, const felem y2, const felem z2)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
	widefelem tmp, tmp2;
	limb z1_is_zero, z2_is_zero, x_equal, y_equal;

	if (!mixed)
		{
		/* ftmp2 = z2^2 */
		felem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);

		/* ftmp4 = z2^3 */
		felem_mul(tmp, ftmp2, z2);
		felem_reduce(ftmp4, tmp);

		/* ftmp4 = z2^3*y1 */
		felem_mul(tmp2, ftmp4, y1);
		felem_reduce(ftmp4, tmp2);

		/* ftmp2 = z2^2*x1 */
		felem_mul(tmp2, ftmp2, x1);
		felem_reduce(ftmp2, tmp2);
		}
	else
		{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* ftmp4 = z2^3*y1 */
		felem_assign(ftmp4, y1);

		/* ftmp2 = z2^2*x1 */
		felem_assign(ftmp2, x1);
		}

	/* ftmp = z1^2 */
	felem_square(tmp, z1);
	felem_reduce(ftmp, tmp);

	/* ftmp3 = z1^3 */
	felem_mul(tmp, ftmp, z1);
	felem_reduce(ftmp3, tmp);

	/* tmp = z1^3*y2 */
	felem_mul(tmp, ftmp3, y2);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

	/* ftmp3 = z1^3*y2 - z2^3*y1 */
	felem_diff_128_64(tmp, ftmp4);
	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
	felem_reduce(ftmp3, tmp);

	/* tmp = z1^2*x2 */
	felem_mul(tmp, ftmp, x2);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

	/* ftmp = z1^2*x2 - z2^2*x1 */
	felem_diff_128_64(tmp, ftmp2);
	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
	felem_reduce(ftmp, tmp);

	/* the formulae are incorrect if the points are equal
	 * so we check for this and do doubling if this happens */
	x_equal = felem_is_zero(ftmp);
	y_equal = felem_is_zero(ftmp3);
	z1_is_zero = felem_is_zero(z1);
	z2_is_zero = felem_is_zero(z2);
	/* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
		}

	/* ftmp5 = z1*z2 */
	if (!mixed)
		{
		felem_mul(tmp, z1, z2);
		felem_reduce(ftmp5, tmp);
		}
	else
		{
		/* special case z2 = 0 is handled later */
		felem_assign(ftmp5, z1);
		}

	/* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
	felem_mul(tmp, ftmp, ftmp5);
	felem_reduce(z_out, tmp);

	/* ftmp = (z1^2*x2 - z2^2*x1)^2 */
	felem_assign(ftmp5, ftmp);
	felem_square(tmp, ftmp);
	felem_reduce(ftmp, tmp);

	/* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
	felem_mul(tmp, ftmp, ftmp5);
	felem_reduce(ftmp5, tmp);

	/* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
	felem_mul(tmp, ftmp2, ftmp);
	felem_reduce(ftmp2, tmp);

	/* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
	felem_mul(tmp, ftmp4, ftmp5);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

	/* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
	felem_square(tmp2, ftmp3);
	/* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */

	/* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
	felem_diff_128_64(tmp2, ftmp5);
	/* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */

	/* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
	felem_assign(ftmp5, ftmp2);
	felem_scalar(ftmp5, 2);
	/* ftmp5[i] < 2 * 2^57 = 2^58 */

	/* x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
	   2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
	felem_diff_128_64(tmp2, ftmp5);
	/* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
	felem_reduce(x_out, tmp2);

	/* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
	felem_diff(ftmp2, x_out);
	/* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */

	/* tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) */
	felem_mul(tmp2, ftmp3, ftmp2);
	/* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */

	/* y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
	   z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
	widefelem_diff(tmp2, tmp);
	/* tmp2[i] < 2^118 + 2^120 < 2^121 */
	felem_reduce(y_out, tmp2);

	/* the result (x_out, y_out, z_out) is incorrect if one of the inputs is
	 * the point at infinity, so we need to check for this separately */

	/* if point 1 is at infinity, copy point 2 to output, and vice versa */
	copy_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
	}
| 1010 | |||
/* select_point selects the |idx|th point from a precomputation table and
 * copies it to out. */
/* Constant time: every table entry is read and OR-ed in under a mask,
 * so the memory access pattern does not depend on |idx|. */
static void select_point(const u64 idx, unsigned int size, const felem pre_comp[/*size*/][3], felem out[3])
	{
	unsigned i, j;
	limb *outlimbs = &out[0][0];
	memset(outlimbs, 0, 3 * sizeof(felem));

	for (i = 0; i < size; i++)
		{
		const limb *inlimbs = &pre_comp[i][0][0];
		/* fold i ^ idx down to bit 0, then decrement: mask becomes
		 * all-ones iff i == idx and all-zeros otherwise */
		u64 mask = i ^ idx;
		mask |= mask >> 4;
		mask |= mask >> 2;
		mask |= mask >> 1;
		mask &= 1;
		mask--;
		for (j = 0; j < 4 * 3; j++)
			outlimbs[j] |= inlimbs[j] & mask;
		}
	}
| 1032 | |||
| 1033 | /* get_bit returns the |i|th bit in |in| */ | ||
| 1034 | static char get_bit(const felem_bytearray in, unsigned i) | ||
| 1035 | { | ||
| 1036 | if (i >= 224) | ||
| 1037 | return 0; | ||
| 1038 | return (in[i >> 3] >> (i & 7)) & 1; | ||
| 1039 | } | ||
| 1040 | |||
/* Interleaved point multiplication using precomputed point multiples:
 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[2][16][3])
	{
	int i, skip;
	unsigned num;
	unsigned gen_mul = (g_scalar != NULL);
	felem nq[3], tmp[4];
	u64 bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (two in each of the last 28 rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1; /* save two point operations in the first round */
	for (i = (num_points ? 220 : 27); i >= 0; --i)
		{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator */
		if (gen_mul && (i <= 27))
			{
			/* first, look 28 bits upwards: gather one bit from
			 * each 56-bit half-row of the scalar (comb method) */
			bits = get_bit(g_scalar, i + 196) << 3;
			bits |= get_bit(g_scalar, i + 140) << 2;
			bits |= get_bit(g_scalar, i + 84) << 1;
			bits |= get_bit(g_scalar, i + 28);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[1], tmp);

			if (!skip)
				{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
				}
			else
				{
				/* nq is still infinity: take the selected
				 * point directly instead of adding to it */
				memcpy(nq, tmp, 3 * sizeof(felem));
				skip = 0;
				}

			/* second, look at the current position */
			bits = get_bit(g_scalar, i + 168) << 3;
			bits |= get_bit(g_scalar, i + 112) << 2;
			bits |= get_bit(g_scalar, i + 56) << 1;
			bits |= get_bit(g_scalar, i);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[0], tmp);
			point_add(nq[0], nq[1], nq[2],
				nq[0], nq[1], nq[2],
				1 /* mixed */, tmp[0], tmp[1], tmp[2]);
			}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				{
				/* gather a signed 5-bit window; for i == 0,
				 * get_bit(..., i - 1) wraps to a huge unsigned
				 * index and safely returns 0 */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract */
				select_point(digit, 17, pre_comp[num], tmp);
				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_conditional(tmp[1], tmp[3], sign);

				if (!skip)
					{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
					}
				else
					{
					memcpy(nq, tmp, 3 * sizeof(felem));
					skip = 0;
					}
				}
			}
		}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
	}
| 1143 | |||
| 1144 | /******************************************************************************/ | ||
| 1145 | /* FUNCTIONS TO MANAGE PRECOMPUTATION | ||
| 1146 | */ | ||
| 1147 | |||
| 1148 | static NISTP224_PRE_COMP *nistp224_pre_comp_new() | ||
| 1149 | { | ||
| 1150 | NISTP224_PRE_COMP *ret = NULL; | ||
| 1151 | ret = (NISTP224_PRE_COMP *) OPENSSL_malloc(sizeof *ret); | ||
| 1152 | if (!ret) | ||
| 1153 | { | ||
| 1154 | ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); | ||
| 1155 | return ret; | ||
| 1156 | } | ||
| 1157 | memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); | ||
| 1158 | ret->references = 1; | ||
| 1159 | return ret; | ||
| 1160 | } | ||
| 1161 | |||
| 1162 | static void *nistp224_pre_comp_dup(void *src_) | ||
| 1163 | { | ||
| 1164 | NISTP224_PRE_COMP *src = src_; | ||
| 1165 | |||
| 1166 | /* no need to actually copy, these objects never change! */ | ||
| 1167 | CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1168 | |||
| 1169 | return src_; | ||
| 1170 | } | ||
| 1171 | |||
| 1172 | static void nistp224_pre_comp_free(void *pre_) | ||
| 1173 | { | ||
| 1174 | int i; | ||
| 1175 | NISTP224_PRE_COMP *pre = pre_; | ||
| 1176 | |||
| 1177 | if (!pre) | ||
| 1178 | return; | ||
| 1179 | |||
| 1180 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1181 | if (i > 0) | ||
| 1182 | return; | ||
| 1183 | |||
| 1184 | OPENSSL_free(pre); | ||
| 1185 | } | ||
| 1186 | |||
| 1187 | static void nistp224_pre_comp_clear_free(void *pre_) | ||
| 1188 | { | ||
| 1189 | int i; | ||
| 1190 | NISTP224_PRE_COMP *pre = pre_; | ||
| 1191 | |||
| 1192 | if (!pre) | ||
| 1193 | return; | ||
| 1194 | |||
| 1195 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1196 | if (i > 0) | ||
| 1197 | return; | ||
| 1198 | |||
| 1199 | OPENSSL_cleanse(pre, sizeof *pre); | ||
| 1200 | OPENSSL_free(pre); | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | /******************************************************************************/ | ||
| 1204 | /* OPENSSL EC_METHOD FUNCTIONS | ||
| 1205 | */ | ||
| 1206 | |||
| 1207 | int ec_GFp_nistp224_group_init(EC_GROUP *group) | ||
| 1208 | { | ||
| 1209 | int ret; | ||
| 1210 | ret = ec_GFp_simple_group_init(group); | ||
| 1211 | group->a_is_minus3 = 1; | ||
| 1212 | return ret; | ||
| 1213 | } | ||
| 1214 | |||
| 1215 | int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, | ||
| 1216 | const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) | ||
| 1217 | { | ||
| 1218 | int ret = 0; | ||
| 1219 | BN_CTX *new_ctx = NULL; | ||
| 1220 | BIGNUM *curve_p, *curve_a, *curve_b; | ||
| 1221 | |||
| 1222 | if (ctx == NULL) | ||
| 1223 | if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; | ||
| 1224 | BN_CTX_start(ctx); | ||
| 1225 | if (((curve_p = BN_CTX_get(ctx)) == NULL) || | ||
| 1226 | ((curve_a = BN_CTX_get(ctx)) == NULL) || | ||
| 1227 | ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err; | ||
| 1228 | BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p); | ||
| 1229 | BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a); | ||
| 1230 | BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b); | ||
| 1231 | if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || | ||
| 1232 | (BN_cmp(curve_b, b))) | ||
| 1233 | { | ||
| 1234 | ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE, | ||
| 1235 | EC_R_WRONG_CURVE_PARAMETERS); | ||
| 1236 | goto err; | ||
| 1237 | } | ||
| 1238 | group->field_mod_func = BN_nist_mod_224; | ||
| 1239 | ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); | ||
| 1240 | err: | ||
| 1241 | BN_CTX_end(ctx); | ||
| 1242 | if (new_ctx != NULL) | ||
| 1243 | BN_CTX_free(new_ctx); | ||
| 1244 | return ret; | ||
| 1245 | } | ||
| 1246 | |||
| 1247 | /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns | ||
| 1248 | * (X', Y') = (X/Z^2, Y/Z^3) */ | ||
| 1249 | int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, | ||
| 1250 | const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx) | ||
| 1251 | { | ||
| 1252 | felem z1, z2, x_in, y_in, x_out, y_out; | ||
| 1253 | widefelem tmp; | ||
| 1254 | |||
| 1255 | if (EC_POINT_is_at_infinity(group, point)) | ||
| 1256 | { | ||
| 1257 | ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, | ||
| 1258 | EC_R_POINT_AT_INFINITY); | ||
| 1259 | return 0; | ||
| 1260 | } | ||
| 1261 | if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) || | ||
| 1262 | (!BN_to_felem(z1, &point->Z))) return 0; | ||
| 1263 | felem_inv(z2, z1); | ||
| 1264 | felem_square(tmp, z2); felem_reduce(z1, tmp); | ||
| 1265 | felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp); | ||
| 1266 | felem_contract(x_out, x_in); | ||
| 1267 | if (x != NULL) | ||
| 1268 | { | ||
| 1269 | if (!felem_to_BN(x, x_out)) { | ||
| 1270 | ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, | ||
| 1271 | ERR_R_BN_LIB); | ||
| 1272 | return 0; | ||
| 1273 | } | ||
| 1274 | } | ||
| 1275 | felem_mul(tmp, z1, z2); felem_reduce(z1, tmp); | ||
| 1276 | felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp); | ||
| 1277 | felem_contract(y_out, y_in); | ||
| 1278 | if (y != NULL) | ||
| 1279 | { | ||
| 1280 | if (!felem_to_BN(y, y_out)) { | ||
| 1281 | ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, | ||
| 1282 | ERR_R_BN_LIB); | ||
| 1283 | return 0; | ||
| 1284 | } | ||
| 1285 | } | ||
| 1286 | return 1; | ||
| 1287 | } | ||
| 1288 | |||
| 1289 | static void make_points_affine(size_t num, felem points[/*num*/][3], felem tmp_felems[/*num+1*/]) | ||
| 1290 | { | ||
| 1291 | /* Runs in constant time, unless an input is the point at infinity | ||
| 1292 | * (which normally shouldn't happen). */ | ||
| 1293 | ec_GFp_nistp_points_make_affine_internal( | ||
| 1294 | num, | ||
| 1295 | points, | ||
| 1296 | sizeof(felem), | ||
| 1297 | tmp_felems, | ||
| 1298 | (void (*)(void *)) felem_one, | ||
| 1299 | (int (*)(const void *)) felem_is_zero_int, | ||
| 1300 | (void (*)(void *, const void *)) felem_assign, | ||
| 1301 | (void (*)(void *, const void *)) felem_square_reduce, | ||
| 1302 | (void (*)(void *, const void *, const void *)) felem_mul_reduce, | ||
| 1303 | (void (*)(void *, const void *)) felem_inv, | ||
| 1304 | (void (*)(void *, const void *)) felem_contract); | ||
| 1305 | } | ||
| 1306 | |||
| 1307 | /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values | ||
| 1308 | * Result is stored in r (r can equal one of the inputs). */ | ||
| 1309 | int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, | ||
| 1310 | const BIGNUM *scalar, size_t num, const EC_POINT *points[], | ||
| 1311 | const BIGNUM *scalars[], BN_CTX *ctx) | ||
| 1312 | { | ||
| 1313 | int ret = 0; | ||
| 1314 | int j; | ||
| 1315 | unsigned i; | ||
| 1316 | int mixed = 0; | ||
| 1317 | BN_CTX *new_ctx = NULL; | ||
| 1318 | BIGNUM *x, *y, *z, *tmp_scalar; | ||
| 1319 | felem_bytearray g_secret; | ||
| 1320 | felem_bytearray *secrets = NULL; | ||
| 1321 | felem (*pre_comp)[17][3] = NULL; | ||
| 1322 | felem *tmp_felems = NULL; | ||
| 1323 | felem_bytearray tmp; | ||
| 1324 | unsigned num_bytes; | ||
| 1325 | int have_pre_comp = 0; | ||
| 1326 | size_t num_points = num; | ||
| 1327 | felem x_in, y_in, z_in, x_out, y_out, z_out; | ||
| 1328 | NISTP224_PRE_COMP *pre = NULL; | ||
| 1329 | const felem (*g_pre_comp)[16][3] = NULL; | ||
| 1330 | EC_POINT *generator = NULL; | ||
| 1331 | const EC_POINT *p = NULL; | ||
| 1332 | const BIGNUM *p_scalar = NULL; | ||
| 1333 | |||
| 1334 | if (ctx == NULL) | ||
| 1335 | if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; | ||
| 1336 | BN_CTX_start(ctx); | ||
| 1337 | if (((x = BN_CTX_get(ctx)) == NULL) || | ||
| 1338 | ((y = BN_CTX_get(ctx)) == NULL) || | ||
| 1339 | ((z = BN_CTX_get(ctx)) == NULL) || | ||
| 1340 | ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) | ||
| 1341 | goto err; | ||
| 1342 | |||
| 1343 | if (scalar != NULL) | ||
| 1344 | { | ||
| 1345 | pre = EC_EX_DATA_get_data(group->extra_data, | ||
| 1346 | nistp224_pre_comp_dup, nistp224_pre_comp_free, | ||
| 1347 | nistp224_pre_comp_clear_free); | ||
| 1348 | if (pre) | ||
| 1349 | /* we have precomputation, try to use it */ | ||
| 1350 | g_pre_comp = (const felem (*)[16][3]) pre->g_pre_comp; | ||
| 1351 | else | ||
| 1352 | /* try to use the standard precomputation */ | ||
| 1353 | g_pre_comp = &gmul[0]; | ||
| 1354 | generator = EC_POINT_new(group); | ||
| 1355 | if (generator == NULL) | ||
| 1356 | goto err; | ||
| 1357 | /* get the generator from precomputation */ | ||
| 1358 | if (!felem_to_BN(x, g_pre_comp[0][1][0]) || | ||
| 1359 | !felem_to_BN(y, g_pre_comp[0][1][1]) || | ||
| 1360 | !felem_to_BN(z, g_pre_comp[0][1][2])) | ||
| 1361 | { | ||
| 1362 | ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1363 | goto err; | ||
| 1364 | } | ||
| 1365 | if (!EC_POINT_set_Jprojective_coordinates_GFp(group, | ||
| 1366 | generator, x, y, z, ctx)) | ||
| 1367 | goto err; | ||
| 1368 | if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) | ||
| 1369 | /* precomputation matches generator */ | ||
| 1370 | have_pre_comp = 1; | ||
| 1371 | else | ||
| 1372 | /* we don't have valid precomputation: | ||
| 1373 | * treat the generator as a random point */ | ||
| 1374 | num_points = num_points + 1; | ||
| 1375 | } | ||
| 1376 | |||
| 1377 | if (num_points > 0) | ||
| 1378 | { | ||
| 1379 | if (num_points >= 3) | ||
| 1380 | { | ||
| 1381 | /* unless we precompute multiples for just one or two points, | ||
| 1382 | * converting those into affine form is time well spent */ | ||
| 1383 | mixed = 1; | ||
| 1384 | } | ||
| 1385 | secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); | ||
| 1386 | pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem)); | ||
| 1387 | if (mixed) | ||
| 1388 | tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem)); | ||
| 1389 | if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL))) | ||
| 1390 | { | ||
| 1391 | ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE); | ||
| 1392 | goto err; | ||
| 1393 | } | ||
| 1394 | |||
| 1395 | /* we treat NULL scalars as 0, and NULL points as points at infinity, | ||
| 1396 | * i.e., they contribute nothing to the linear combination */ | ||
| 1397 | memset(secrets, 0, num_points * sizeof(felem_bytearray)); | ||
| 1398 | memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem)); | ||
| 1399 | for (i = 0; i < num_points; ++i) | ||
| 1400 | { | ||
| 1401 | if (i == num) | ||
| 1402 | /* the generator */ | ||
| 1403 | { | ||
| 1404 | p = EC_GROUP_get0_generator(group); | ||
| 1405 | p_scalar = scalar; | ||
| 1406 | } | ||
| 1407 | else | ||
| 1408 | /* the i^th point */ | ||
| 1409 | { | ||
| 1410 | p = points[i]; | ||
| 1411 | p_scalar = scalars[i]; | ||
| 1412 | } | ||
| 1413 | if ((p_scalar != NULL) && (p != NULL)) | ||
| 1414 | { | ||
| 1415 | /* reduce scalar to 0 <= scalar < 2^224 */ | ||
| 1416 | if ((BN_num_bits(p_scalar) > 224) || (BN_is_negative(p_scalar))) | ||
| 1417 | { | ||
| 1418 | /* this is an unusual input, and we don't guarantee | ||
| 1419 | * constant-timeness */ | ||
| 1420 | if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) | ||
| 1421 | { | ||
| 1422 | ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1423 | goto err; | ||
| 1424 | } | ||
| 1425 | num_bytes = BN_bn2bin(tmp_scalar, tmp); | ||
| 1426 | } | ||
| 1427 | else | ||
| 1428 | num_bytes = BN_bn2bin(p_scalar, tmp); | ||
| 1429 | flip_endian(secrets[i], tmp, num_bytes); | ||
| 1430 | /* precompute multiples */ | ||
| 1431 | if ((!BN_to_felem(x_out, &p->X)) || | ||
| 1432 | (!BN_to_felem(y_out, &p->Y)) || | ||
| 1433 | (!BN_to_felem(z_out, &p->Z))) goto err; | ||
| 1434 | felem_assign(pre_comp[i][1][0], x_out); | ||
| 1435 | felem_assign(pre_comp[i][1][1], y_out); | ||
| 1436 | felem_assign(pre_comp[i][1][2], z_out); | ||
| 1437 | for (j = 2; j <= 16; ++j) | ||
| 1438 | { | ||
| 1439 | if (j & 1) | ||
| 1440 | { | ||
| 1441 | point_add( | ||
| 1442 | pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], | ||
| 1443 | pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], | ||
| 1444 | 0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]); | ||
| 1445 | } | ||
| 1446 | else | ||
| 1447 | { | ||
| 1448 | point_double( | ||
| 1449 | pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], | ||
| 1450 | pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]); | ||
| 1451 | } | ||
| 1452 | } | ||
| 1453 | } | ||
| 1454 | } | ||
| 1455 | if (mixed) | ||
| 1456 | make_points_affine(num_points * 17, pre_comp[0], tmp_felems); | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | /* the scalar for the generator */ | ||
| 1460 | if ((scalar != NULL) && (have_pre_comp)) | ||
| 1461 | { | ||
| 1462 | memset(g_secret, 0, sizeof g_secret); | ||
| 1463 | /* reduce scalar to 0 <= scalar < 2^224 */ | ||
| 1464 | if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) | ||
| 1465 | { | ||
| 1466 | /* this is an unusual input, and we don't guarantee | ||
| 1467 | * constant-timeness */ | ||
| 1468 | if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) | ||
| 1469 | { | ||
| 1470 | ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1471 | goto err; | ||
| 1472 | } | ||
| 1473 | num_bytes = BN_bn2bin(tmp_scalar, tmp); | ||
| 1474 | } | ||
| 1475 | else | ||
| 1476 | num_bytes = BN_bn2bin(scalar, tmp); | ||
| 1477 | flip_endian(g_secret, tmp, num_bytes); | ||
| 1478 | /* do the multiplication with generator precomputation*/ | ||
| 1479 | batch_mul(x_out, y_out, z_out, | ||
| 1480 | (const felem_bytearray (*)) secrets, num_points, | ||
| 1481 | g_secret, | ||
| 1482 | mixed, (const felem (*)[17][3]) pre_comp, | ||
| 1483 | g_pre_comp); | ||
| 1484 | } | ||
| 1485 | else | ||
| 1486 | /* do the multiplication without generator precomputation */ | ||
| 1487 | batch_mul(x_out, y_out, z_out, | ||
| 1488 | (const felem_bytearray (*)) secrets, num_points, | ||
| 1489 | NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL); | ||
| 1490 | /* reduce the output to its unique minimal representation */ | ||
| 1491 | felem_contract(x_in, x_out); | ||
| 1492 | felem_contract(y_in, y_out); | ||
| 1493 | felem_contract(z_in, z_out); | ||
| 1494 | if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || | ||
| 1495 | (!felem_to_BN(z, z_in))) | ||
| 1496 | { | ||
| 1497 | ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1498 | goto err; | ||
| 1499 | } | ||
| 1500 | ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); | ||
| 1501 | |||
| 1502 | err: | ||
| 1503 | BN_CTX_end(ctx); | ||
| 1504 | if (generator != NULL) | ||
| 1505 | EC_POINT_free(generator); | ||
| 1506 | if (new_ctx != NULL) | ||
| 1507 | BN_CTX_free(new_ctx); | ||
| 1508 | if (secrets != NULL) | ||
| 1509 | OPENSSL_free(secrets); | ||
| 1510 | if (pre_comp != NULL) | ||
| 1511 | OPENSSL_free(pre_comp); | ||
| 1512 | if (tmp_felems != NULL) | ||
| 1513 | OPENSSL_free(tmp_felems); | ||
| 1514 | return ret; | ||
| 1515 | } | ||
| 1516 | |||
| 1517 | int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx) | ||
| 1518 | { | ||
| 1519 | int ret = 0; | ||
| 1520 | NISTP224_PRE_COMP *pre = NULL; | ||
| 1521 | int i, j; | ||
| 1522 | BN_CTX *new_ctx = NULL; | ||
| 1523 | BIGNUM *x, *y; | ||
| 1524 | EC_POINT *generator = NULL; | ||
| 1525 | felem tmp_felems[32]; | ||
| 1526 | |||
| 1527 | /* throw away old precomputation */ | ||
| 1528 | EC_EX_DATA_free_data(&group->extra_data, nistp224_pre_comp_dup, | ||
| 1529 | nistp224_pre_comp_free, nistp224_pre_comp_clear_free); | ||
| 1530 | if (ctx == NULL) | ||
| 1531 | if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; | ||
| 1532 | BN_CTX_start(ctx); | ||
| 1533 | if (((x = BN_CTX_get(ctx)) == NULL) || | ||
| 1534 | ((y = BN_CTX_get(ctx)) == NULL)) | ||
| 1535 | goto err; | ||
| 1536 | /* get the generator */ | ||
| 1537 | if (group->generator == NULL) goto err; | ||
| 1538 | generator = EC_POINT_new(group); | ||
| 1539 | if (generator == NULL) | ||
| 1540 | goto err; | ||
| 1541 | BN_bin2bn(nistp224_curve_params[3], sizeof (felem_bytearray), x); | ||
| 1542 | BN_bin2bn(nistp224_curve_params[4], sizeof (felem_bytearray), y); | ||
| 1543 | if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx)) | ||
| 1544 | goto err; | ||
| 1545 | if ((pre = nistp224_pre_comp_new()) == NULL) | ||
| 1546 | goto err; | ||
| 1547 | /* if the generator is the standard one, use built-in precomputation */ | ||
| 1548 | if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) | ||
| 1549 | { | ||
| 1550 | memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); | ||
| 1551 | ret = 1; | ||
| 1552 | goto err; | ||
| 1553 | } | ||
| 1554 | if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) || | ||
| 1555 | (!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) || | ||
| 1556 | (!BN_to_felem(pre->g_pre_comp[0][1][2], &group->generator->Z))) | ||
| 1557 | goto err; | ||
| 1558 | /* compute 2^56*G, 2^112*G, 2^168*G for the first table, | ||
| 1559 | * 2^28*G, 2^84*G, 2^140*G, 2^196*G for the second one | ||
| 1560 | */ | ||
| 1561 | for (i = 1; i <= 8; i <<= 1) | ||
| 1562 | { | ||
| 1563 | point_double( | ||
| 1564 | pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], | ||
| 1565 | pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]); | ||
| 1566 | for (j = 0; j < 27; ++j) | ||
| 1567 | { | ||
| 1568 | point_double( | ||
| 1569 | pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], | ||
| 1570 | pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); | ||
| 1571 | } | ||
| 1572 | if (i == 8) | ||
| 1573 | break; | ||
| 1574 | point_double( | ||
| 1575 | pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], | ||
| 1576 | pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); | ||
| 1577 | for (j = 0; j < 27; ++j) | ||
| 1578 | { | ||
| 1579 | point_double( | ||
| 1580 | pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], | ||
| 1581 | pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]); | ||
| 1582 | } | ||
| 1583 | } | ||
| 1584 | for (i = 0; i < 2; i++) | ||
| 1585 | { | ||
| 1586 | /* g_pre_comp[i][0] is the point at infinity */ | ||
| 1587 | memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0])); | ||
| 1588 | /* the remaining multiples */ | ||
| 1589 | /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */ | ||
| 1590 | point_add( | ||
| 1591 | pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], | ||
| 1592 | pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0], | ||
| 1593 | pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2], | ||
| 1594 | 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], | ||
| 1595 | pre->g_pre_comp[i][2][2]); | ||
| 1596 | /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */ | ||
| 1597 | point_add( | ||
| 1598 | pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], | ||
| 1599 | pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0], | ||
| 1600 | pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], | ||
| 1601 | 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], | ||
| 1602 | pre->g_pre_comp[i][2][2]); | ||
| 1603 | /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */ | ||
| 1604 | point_add( | ||
| 1605 | pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], | ||
| 1606 | pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0], | ||
| 1607 | pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], | ||
| 1608 | 0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], | ||
| 1609 | pre->g_pre_comp[i][4][2]); | ||
| 1610 | /* 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G */ | ||
| 1611 | point_add( | ||
| 1612 | pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], | ||
| 1613 | pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0], | ||
| 1614 | pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2], | ||
| 1615 | 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], | ||
| 1616 | pre->g_pre_comp[i][2][2]); | ||
| 1617 | for (j = 1; j < 8; ++j) | ||
| 1618 | { | ||
| 1619 | /* odd multiples: add G resp. 2^28*G */ | ||
| 1620 | point_add( | ||
| 1621 | pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], | ||
| 1622 | pre->g_pre_comp[i][2*j+1][2], pre->g_pre_comp[i][2*j][0], | ||
| 1623 | pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2], | ||
| 1624 | 0, pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], | ||
| 1625 | pre->g_pre_comp[i][1][2]); | ||
| 1626 | } | ||
| 1627 | } | ||
| 1628 | make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems); | ||
| 1629 | |||
| 1630 | if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup, | ||
| 1631 | nistp224_pre_comp_free, nistp224_pre_comp_clear_free)) | ||
| 1632 | goto err; | ||
| 1633 | ret = 1; | ||
| 1634 | pre = NULL; | ||
| 1635 | err: | ||
| 1636 | BN_CTX_end(ctx); | ||
| 1637 | if (generator != NULL) | ||
| 1638 | EC_POINT_free(generator); | ||
| 1639 | if (new_ctx != NULL) | ||
| 1640 | BN_CTX_free(new_ctx); | ||
| 1641 | if (pre) | ||
| 1642 | nistp224_pre_comp_free(pre); | ||
| 1643 | return ret; | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group) | ||
| 1647 | { | ||
| 1648 | if (EC_EX_DATA_get_data(group->extra_data, nistp224_pre_comp_dup, | ||
| 1649 | nistp224_pre_comp_free, nistp224_pre_comp_clear_free) | ||
| 1650 | != NULL) | ||
| 1651 | return 1; | ||
| 1652 | else | ||
| 1653 | return 0; | ||
| 1654 | } | ||
| 1655 | |||
| 1656 | #else | ||
| 1657 | static void *dummy=&dummy; | ||
| 1658 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_nistp256.c b/src/lib/libcrypto/ec/ecp_nistp256.c new file mode 100644 index 0000000000..4bc0f5dce0 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp256.c | |||
| @@ -0,0 +1,2171 @@ | |||
| 1 | /* crypto/ec/ecp_nistp256.c */ | ||
| 2 | /* | ||
| 3 | * Written by Adam Langley (Google) for the OpenSSL project | ||
| 4 | */ | ||
| 5 | /* Copyright 2011 Google Inc. | ||
| 6 | * | ||
| 7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 8 | * | ||
| 9 | * you may not use this file except in compliance with the License. | ||
| 10 | * You may obtain a copy of the License at | ||
| 11 | * | ||
| 12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
| 13 | * | ||
| 14 | * Unless required by applicable law or agreed to in writing, software | ||
| 15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
| 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 17 | * See the License for the specific language governing permissions and | ||
| 18 | * limitations under the License. | ||
| 19 | */ | ||
| 20 | |||
| 21 | /* | ||
| 22 | * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication | ||
| 23 | * | ||
| 24 | * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. | ||
| 25 | * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 | ||
| 26 | * work which got its smarts from Daniel J. Bernstein's work on the same. | ||
| 27 | */ | ||
| 28 | |||
| 29 | #include <openssl/opensslconf.h> | ||
| 30 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
| 31 | |||
| 32 | #ifndef OPENSSL_SYS_VMS | ||
| 33 | #include <stdint.h> | ||
| 34 | #else | ||
| 35 | #include <inttypes.h> | ||
| 36 | #endif | ||
| 37 | |||
| 38 | #include <string.h> | ||
| 39 | #include <openssl/err.h> | ||
| 40 | #include "ec_lcl.h" | ||
| 41 | |||
| 42 | #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) | ||
| 43 | /* even with gcc, the typedef won't work for 32-bit platforms */ | ||
| 44 | typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ | ||
| 45 | typedef __int128_t int128_t; | ||
| 46 | #else | ||
| 47 | #error "Need GCC 3.1 or later to define type uint128_t" | ||
| 48 | #endif | ||
| 49 | |||
| 50 | typedef uint8_t u8; | ||
| 51 | typedef uint32_t u32; | ||
| 52 | typedef uint64_t u64; | ||
| 53 | typedef int64_t s64; | ||
| 54 | |||
| 55 | /* The underlying field. | ||
| 56 | * | ||
| 57 | * P256 operates over GF(2^256-2^224+2^192+2^96-1). We can serialise an element | ||
| 58 | * of this field into 32 bytes. We call this an felem_bytearray. */ | ||
| 59 | |||
| 60 | typedef u8 felem_bytearray[32]; | ||
| 61 | |||
| 62 | /* These are the parameters of P256, taken from FIPS 186-3, page 86. These | ||
| 63 | * values are big-endian. */ | ||
| 64 | static const felem_bytearray nistp256_curve_params[5] = { | ||
| 65 | {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */ | ||
| 66 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
| 67 | 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, | ||
| 68 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, | ||
| 69 | {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */ | ||
| 70 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
| 71 | 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, | ||
| 72 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */ | ||
| 73 | {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, | ||
| 74 | 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc, | ||
| 75 | 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6, | ||
| 76 | 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b}, | ||
| 77 | {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */ | ||
| 78 | 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, | ||
| 79 | 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0, | ||
| 80 | 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96}, | ||
| 81 | {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */ | ||
| 82 | 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16, | ||
| 83 | 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, | ||
| 84 | 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5} | ||
| 85 | }; | ||
| 86 | |||
| 87 | /* The representation of field elements. | ||
| 88 | * ------------------------------------ | ||
| 89 | * | ||
| 90 | * We represent field elements with either four 128-bit values, eight 128-bit | ||
| 91 | * values, or four 64-bit values. The field element represented is: | ||
| 92 | * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192 (mod p) | ||
| 93 | * or: | ||
| 94 | * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512 (mod p) | ||
| 95 | * | ||
| 96 | * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits | ||
| 97 | * apart, but are 128-bits wide, the most significant bits of each limb overlap | ||
| 98 | * with the least significant bits of the next. | ||
| 99 | * | ||
| 100 | * A field element with four limbs is an 'felem'. One with eight limbs is a | ||
| 101 | * 'longfelem' | ||
| 102 | * | ||
| 103 | * A field element with four, 64-bit values is called a 'smallfelem'. Small | ||
| 104 | * values are used as intermediate values before multiplication. | ||
| 105 | */ | ||
| 106 | |||
| 107 | #define NLIMBS 4 | ||
| 108 | |||
| 109 | typedef uint128_t limb; | ||
| 110 | typedef limb felem[NLIMBS]; | ||
| 111 | typedef limb longfelem[NLIMBS * 2]; | ||
| 112 | typedef u64 smallfelem[NLIMBS]; | ||
| 113 | |||
| 114 | /* This is the value of the prime as four 64-bit words, little-endian. */ | ||
| 115 | static const u64 kPrime[4] = { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul }; | ||
| 116 | static const limb bottom32bits = 0xffffffff; | ||
| 117 | static const u64 bottom63bits = 0x7ffffffffffffffful; | ||
| 118 | |||
| 119 | /* bin32_to_felem takes a little-endian byte array and converts it into felem | ||
| 120 | * form. This assumes that the CPU is little-endian. */ | ||
| 121 | static void bin32_to_felem(felem out, const u8 in[32]) | ||
| 122 | { | ||
| 123 | out[0] = *((u64*) &in[0]); | ||
| 124 | out[1] = *((u64*) &in[8]); | ||
| 125 | out[2] = *((u64*) &in[16]); | ||
| 126 | out[3] = *((u64*) &in[24]); | ||
| 127 | } | ||
| 128 | |||
| 129 | /* smallfelem_to_bin32 takes a smallfelem and serialises into a little endian, | ||
| 130 | * 32 byte array. This assumes that the CPU is little-endian. */ | ||
| 131 | static void smallfelem_to_bin32(u8 out[32], const smallfelem in) | ||
| 132 | { | ||
| 133 | *((u64*) &out[0]) = in[0]; | ||
| 134 | *((u64*) &out[8]) = in[1]; | ||
| 135 | *((u64*) &out[16]) = in[2]; | ||
| 136 | *((u64*) &out[24]) = in[3]; | ||
| 137 | } | ||
| 138 | |||
| 139 | /* To preserve endianness when using BN_bn2bin and BN_bin2bn */ | ||
| 140 | static void flip_endian(u8 *out, const u8 *in, unsigned len) | ||
| 141 | { | ||
| 142 | unsigned i; | ||
| 143 | for (i = 0; i < len; ++i) | ||
| 144 | out[i] = in[len-1-i]; | ||
| 145 | } | ||
| 146 | |||
| 147 | /* BN_to_felem converts an OpenSSL BIGNUM into an felem */ | ||
| 148 | static int BN_to_felem(felem out, const BIGNUM *bn) | ||
| 149 | { | ||
| 150 | felem_bytearray b_in; | ||
| 151 | felem_bytearray b_out; | ||
| 152 | unsigned num_bytes; | ||
| 153 | |||
| 154 | /* BN_bn2bin eats leading zeroes */ | ||
| 155 | memset(b_out, 0, sizeof b_out); | ||
| 156 | num_bytes = BN_num_bytes(bn); | ||
| 157 | if (num_bytes > sizeof b_out) | ||
| 158 | { | ||
| 159 | ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); | ||
| 160 | return 0; | ||
| 161 | } | ||
| 162 | if (BN_is_negative(bn)) | ||
| 163 | { | ||
| 164 | ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); | ||
| 165 | return 0; | ||
| 166 | } | ||
| 167 | num_bytes = BN_bn2bin(bn, b_in); | ||
| 168 | flip_endian(b_out, b_in, num_bytes); | ||
| 169 | bin32_to_felem(out, b_out); | ||
| 170 | return 1; | ||
| 171 | } | ||
| 172 | |||
| 173 | /* felem_to_BN converts an felem into an OpenSSL BIGNUM */ | ||
| 174 | static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in) | ||
| 175 | { | ||
| 176 | felem_bytearray b_in, b_out; | ||
| 177 | smallfelem_to_bin32(b_in, in); | ||
| 178 | flip_endian(b_out, b_in, sizeof b_out); | ||
| 179 | return BN_bin2bn(b_out, sizeof b_out, out); | ||
| 180 | } | ||
| 181 | |||
| 182 | |||
| 183 | /* Field operations | ||
| 184 | * ---------------- */ | ||
| 185 | |||
| 186 | static void smallfelem_one(smallfelem out) | ||
| 187 | { | ||
| 188 | out[0] = 1; | ||
| 189 | out[1] = 0; | ||
| 190 | out[2] = 0; | ||
| 191 | out[3] = 0; | ||
| 192 | } | ||
| 193 | |||
| 194 | static void smallfelem_assign(smallfelem out, const smallfelem in) | ||
| 195 | { | ||
| 196 | out[0] = in[0]; | ||
| 197 | out[1] = in[1]; | ||
| 198 | out[2] = in[2]; | ||
| 199 | out[3] = in[3]; | ||
| 200 | } | ||
| 201 | |||
| 202 | static void felem_assign(felem out, const felem in) | ||
| 203 | { | ||
| 204 | out[0] = in[0]; | ||
| 205 | out[1] = in[1]; | ||
| 206 | out[2] = in[2]; | ||
| 207 | out[3] = in[3]; | ||
| 208 | } | ||
| 209 | |||
| 210 | /* felem_sum sets out = out + in. */ | ||
| 211 | static void felem_sum(felem out, const felem in) | ||
| 212 | { | ||
| 213 | out[0] += in[0]; | ||
| 214 | out[1] += in[1]; | ||
| 215 | out[2] += in[2]; | ||
| 216 | out[3] += in[3]; | ||
| 217 | } | ||
| 218 | |||
| 219 | /* felem_small_sum sets out = out + in. */ | ||
| 220 | static void felem_small_sum(felem out, const smallfelem in) | ||
| 221 | { | ||
| 222 | out[0] += in[0]; | ||
| 223 | out[1] += in[1]; | ||
| 224 | out[2] += in[2]; | ||
| 225 | out[3] += in[3]; | ||
| 226 | } | ||
| 227 | |||
| 228 | /* felem_scalar sets out = out * scalar */ | ||
| 229 | static void felem_scalar(felem out, const u64 scalar) | ||
| 230 | { | ||
| 231 | out[0] *= scalar; | ||
| 232 | out[1] *= scalar; | ||
| 233 | out[2] *= scalar; | ||
| 234 | out[3] *= scalar; | ||
| 235 | } | ||
| 236 | |||
| 237 | /* longfelem_scalar sets out = out * scalar */ | ||
| 238 | static void longfelem_scalar(longfelem out, const u64 scalar) | ||
| 239 | { | ||
| 240 | out[0] *= scalar; | ||
| 241 | out[1] *= scalar; | ||
| 242 | out[2] *= scalar; | ||
| 243 | out[3] *= scalar; | ||
| 244 | out[4] *= scalar; | ||
| 245 | out[5] *= scalar; | ||
| 246 | out[6] *= scalar; | ||
| 247 | out[7] *= scalar; | ||
| 248 | } | ||
| 249 | |||
| 250 | #define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9) | ||
| 251 | #define two105 (((limb)1) << 105) | ||
| 252 | #define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9) | ||
| 253 | |||
| 254 | /* zero105 is 0 mod p */ | ||
| 255 | static const felem zero105 = { two105m41m9, two105, two105m41p9, two105m41p9 }; | ||
| 256 | |||
| 257 | /* smallfelem_neg sets |out| to |-small| | ||
| 258 | * On exit: | ||
| 259 | * out[i] < out[i] + 2^105 | ||
| 260 | */ | ||
| 261 | static void smallfelem_neg(felem out, const smallfelem small) | ||
| 262 | { | ||
| 263 | /* In order to prevent underflow, we subtract from 0 mod p. */ | ||
| 264 | out[0] = zero105[0] - small[0]; | ||
| 265 | out[1] = zero105[1] - small[1]; | ||
| 266 | out[2] = zero105[2] - small[2]; | ||
| 267 | out[3] = zero105[3] - small[3]; | ||
| 268 | } | ||
| 269 | |||
| 270 | /* felem_diff subtracts |in| from |out| | ||
| 271 | * On entry: | ||
| 272 | * in[i] < 2^104 | ||
| 273 | * On exit: | ||
| 274 | * out[i] < out[i] + 2^105 | ||
| 275 | */ | ||
| 276 | static void felem_diff(felem out, const felem in) | ||
| 277 | { | ||
| 278 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
| 279 | out[0] += zero105[0]; | ||
| 280 | out[1] += zero105[1]; | ||
| 281 | out[2] += zero105[2]; | ||
| 282 | out[3] += zero105[3]; | ||
| 283 | |||
| 284 | out[0] -= in[0]; | ||
| 285 | out[1] -= in[1]; | ||
| 286 | out[2] -= in[2]; | ||
| 287 | out[3] -= in[3]; | ||
| 288 | } | ||
| 289 | |||
| 290 | #define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11) | ||
| 291 | #define two107 (((limb)1) << 107) | ||
| 292 | #define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11) | ||
| 293 | |||
| 294 | /* zero107 is 0 mod p */ | ||
| 295 | static const felem zero107 = { two107m43m11, two107, two107m43p11, two107m43p11 }; | ||
| 296 | |||
| 297 | /* An alternative felem_diff for larger inputs |in| | ||
| 298 | * felem_diff_zero107 subtracts |in| from |out| | ||
| 299 | * On entry: | ||
| 300 | * in[i] < 2^106 | ||
| 301 | * On exit: | ||
| 302 | * out[i] < out[i] + 2^107 | ||
| 303 | */ | ||
| 304 | static void felem_diff_zero107(felem out, const felem in) | ||
| 305 | { | ||
| 306 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
| 307 | out[0] += zero107[0]; | ||
| 308 | out[1] += zero107[1]; | ||
| 309 | out[2] += zero107[2]; | ||
| 310 | out[3] += zero107[3]; | ||
| 311 | |||
| 312 | out[0] -= in[0]; | ||
| 313 | out[1] -= in[1]; | ||
| 314 | out[2] -= in[2]; | ||
| 315 | out[3] -= in[3]; | ||
| 316 | } | ||
| 317 | |||
| 318 | /* longfelem_diff subtracts |in| from |out| | ||
| 319 | * On entry: | ||
| 320 | * in[i] < 7*2^67 | ||
| 321 | * On exit: | ||
| 322 | * out[i] < out[i] + 2^70 + 2^40 | ||
| 323 | */ | ||
| 324 | static void longfelem_diff(longfelem out, const longfelem in) | ||
| 325 | { | ||
| 326 | static const limb two70m8p6 = (((limb)1) << 70) - (((limb)1) << 8) + (((limb)1) << 6); | ||
| 327 | static const limb two70p40 = (((limb)1) << 70) + (((limb)1) << 40); | ||
| 328 | static const limb two70 = (((limb)1) << 70); | ||
| 329 | static const limb two70m40m38p6 = (((limb)1) << 70) - (((limb)1) << 40) - (((limb)1) << 38) + (((limb)1) << 6); | ||
| 330 | static const limb two70m6 = (((limb)1) << 70) - (((limb)1) << 6); | ||
| 331 | |||
| 332 | /* add 0 mod p to avoid underflow */ | ||
| 333 | out[0] += two70m8p6; | ||
| 334 | out[1] += two70p40; | ||
| 335 | out[2] += two70; | ||
| 336 | out[3] += two70m40m38p6; | ||
| 337 | out[4] += two70m6; | ||
| 338 | out[5] += two70m6; | ||
| 339 | out[6] += two70m6; | ||
| 340 | out[7] += two70m6; | ||
| 341 | |||
| 342 | /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */ | ||
| 343 | out[0] -= in[0]; | ||
| 344 | out[1] -= in[1]; | ||
| 345 | out[2] -= in[2]; | ||
| 346 | out[3] -= in[3]; | ||
| 347 | out[4] -= in[4]; | ||
| 348 | out[5] -= in[5]; | ||
| 349 | out[6] -= in[6]; | ||
| 350 | out[7] -= in[7]; | ||
| 351 | } | ||
| 352 | |||
| 353 | #define two64m0 (((limb)1) << 64) - 1 | ||
| 354 | #define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1 | ||
| 355 | #define two64m46 (((limb)1) << 64) - (((limb)1) << 46) | ||
| 356 | #define two64m32 (((limb)1) << 64) - (((limb)1) << 32) | ||
| 357 | |||
| 358 | /* zero110 is 0 mod p */ | ||
| 359 | static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 }; | ||
| 360 | |||
/* felem_shrink converts an felem into a smallfelem. The result isn't quite
 * minimal as the value may be greater than p.
 *
 * Constant-time: the conditional subtraction of kPrime is performed with
 * masks, never branches.
 *
 * On entry:
 *   in[i] < 2^109
 * On exit:
 *   out[i] < 2^64
 */
static void felem_shrink(smallfelem out, const felem in)
	{
	felem tmp;
	u64 a, b, mask;
	s64 high, low;
	static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */

	/* Carry 2->3 */
	tmp[3] = zero110[3] + in[3] + ((u64) (in[2] >> 64));
	/* tmp[3] < 2^110 */

	tmp[2] = zero110[2] + (u64) in[2];
	tmp[0] = zero110[0] + in[0];
	tmp[1] = zero110[1] + in[1];
	/* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */

	/* We perform two partial reductions where we eliminate the
	 * high-word of tmp[3]. We don't update the other words till the end.
	 */
	a = tmp[3] >> 64; /* a < 2^46 */
	tmp[3] = (u64) tmp[3];
	tmp[3] -= a;
	tmp[3] += ((limb)a) << 32;
	/* tmp[3] < 2^79 */

	b = a;
	a = tmp[3] >> 64; /* a < 2^15 */
	b += a; /* b < 2^46 + 2^15 < 2^47 */
	tmp[3] = (u64) tmp[3];
	tmp[3] -= a;
	tmp[3] += ((limb)a) << 32;
	/* tmp[3] < 2^64 + 2^47 */

	/* This adjusts the other two words to complete the two partial
	 * reductions. */
	tmp[0] += b;
	tmp[1] -= (((limb)b) << 32);

	/* In order to make space in tmp[3] for the carry from 2 -> 3, we
	 * conditionally subtract kPrime if tmp[3] is large enough. */
	high = tmp[3] >> 64;
	/* As tmp[3] < 2^65, high is either 1 or 0 */
	high <<= 63;
	high >>= 63;
	/* high is:
	 *   all ones   if the high word of tmp[3] is 1
	 *   all zeros  if the high word of tmp[3] is 0 */
	low = tmp[3];
	mask = low >> 63;
	/* mask is:
	 *   all ones   if the MSB of low is 1
	 *   all zeros  if the MSB of low is 0 */
	low &= bottom63bits;
	low -= kPrime3Test;
	/* if low was greater than kPrime3Test then the MSB is zero */
	low = ~low;
	low >>= 63;
	/* low is:
	 *   all ones   if low was > kPrime3Test
	 *   all zeros  if low was <= kPrime3Test */
	mask = (mask & low) | high;
	tmp[0] -= mask & kPrime[0];
	tmp[1] -= mask & kPrime[1];
	/* kPrime[2] is zero, so omitted */
	tmp[3] -= mask & kPrime[3];
	/* tmp[3] < 2**64 - 2**32 + 1 */

	/* Propagate carries so every limb fits in 64 bits. */
	tmp[1] += ((u64) (tmp[0] >> 64)); tmp[0] = (u64) tmp[0];
	tmp[2] += ((u64) (tmp[1] >> 64)); tmp[1] = (u64) tmp[1];
	tmp[3] += ((u64) (tmp[2] >> 64)); tmp[2] = (u64) tmp[2];
	/* tmp[i] < 2^64 */

	out[0] = tmp[0];
	out[1] = tmp[1];
	out[2] = tmp[2];
	out[3] = tmp[3];
	}
| 446 | |||
| 447 | /* smallfelem_expand converts a smallfelem to an felem */ | ||
| 448 | static void smallfelem_expand(felem out, const smallfelem in) | ||
| 449 | { | ||
| 450 | out[0] = in[0]; | ||
| 451 | out[1] = in[1]; | ||
| 452 | out[2] = in[2]; | ||
| 453 | out[3] = in[3]; | ||
| 454 | } | ||
| 455 | |||
| 456 | /* smallfelem_square sets |out| = |small|^2 | ||
| 457 | * On entry: | ||
| 458 | * small[i] < 2^64 | ||
| 459 | * On exit: | ||
| 460 | * out[i] < 7 * 2^64 < 2^67 | ||
| 461 | */ | ||
| 462 | static void smallfelem_square(longfelem out, const smallfelem small) | ||
| 463 | { | ||
| 464 | limb a; | ||
| 465 | u64 high, low; | ||
| 466 | |||
| 467 | a = ((uint128_t) small[0]) * small[0]; | ||
| 468 | low = a; | ||
| 469 | high = a >> 64; | ||
| 470 | out[0] = low; | ||
| 471 | out[1] = high; | ||
| 472 | |||
| 473 | a = ((uint128_t) small[0]) * small[1]; | ||
| 474 | low = a; | ||
| 475 | high = a >> 64; | ||
| 476 | out[1] += low; | ||
| 477 | out[1] += low; | ||
| 478 | out[2] = high; | ||
| 479 | |||
| 480 | a = ((uint128_t) small[0]) * small[2]; | ||
| 481 | low = a; | ||
| 482 | high = a >> 64; | ||
| 483 | out[2] += low; | ||
| 484 | out[2] *= 2; | ||
| 485 | out[3] = high; | ||
| 486 | |||
| 487 | a = ((uint128_t) small[0]) * small[3]; | ||
| 488 | low = a; | ||
| 489 | high = a >> 64; | ||
| 490 | out[3] += low; | ||
| 491 | out[4] = high; | ||
| 492 | |||
| 493 | a = ((uint128_t) small[1]) * small[2]; | ||
| 494 | low = a; | ||
| 495 | high = a >> 64; | ||
| 496 | out[3] += low; | ||
| 497 | out[3] *= 2; | ||
| 498 | out[4] += high; | ||
| 499 | |||
| 500 | a = ((uint128_t) small[1]) * small[1]; | ||
| 501 | low = a; | ||
| 502 | high = a >> 64; | ||
| 503 | out[2] += low; | ||
| 504 | out[3] += high; | ||
| 505 | |||
| 506 | a = ((uint128_t) small[1]) * small[3]; | ||
| 507 | low = a; | ||
| 508 | high = a >> 64; | ||
| 509 | out[4] += low; | ||
| 510 | out[4] *= 2; | ||
| 511 | out[5] = high; | ||
| 512 | |||
| 513 | a = ((uint128_t) small[2]) * small[3]; | ||
| 514 | low = a; | ||
| 515 | high = a >> 64; | ||
| 516 | out[5] += low; | ||
| 517 | out[5] *= 2; | ||
| 518 | out[6] = high; | ||
| 519 | out[6] += high; | ||
| 520 | |||
| 521 | a = ((uint128_t) small[2]) * small[2]; | ||
| 522 | low = a; | ||
| 523 | high = a >> 64; | ||
| 524 | out[4] += low; | ||
| 525 | out[5] += high; | ||
| 526 | |||
| 527 | a = ((uint128_t) small[3]) * small[3]; | ||
| 528 | low = a; | ||
| 529 | high = a >> 64; | ||
| 530 | out[6] += low; | ||
| 531 | out[7] = high; | ||
| 532 | } | ||
| 533 | |||
| 534 | /* felem_square sets |out| = |in|^2 | ||
| 535 | * On entry: | ||
| 536 | * in[i] < 2^109 | ||
| 537 | * On exit: | ||
| 538 | * out[i] < 7 * 2^64 < 2^67 | ||
| 539 | */ | ||
| 540 | static void felem_square(longfelem out, const felem in) | ||
| 541 | { | ||
| 542 | u64 small[4]; | ||
| 543 | felem_shrink(small, in); | ||
| 544 | smallfelem_square(out, small); | ||
| 545 | } | ||
| 546 | |||
| 547 | /* smallfelem_mul sets |out| = |small1| * |small2| | ||
| 548 | * On entry: | ||
| 549 | * small1[i] < 2^64 | ||
| 550 | * small2[i] < 2^64 | ||
| 551 | * On exit: | ||
| 552 | * out[i] < 7 * 2^64 < 2^67 | ||
| 553 | */ | ||
| 554 | static void smallfelem_mul(longfelem out, const smallfelem small1, const smallfelem small2) | ||
| 555 | { | ||
| 556 | limb a; | ||
| 557 | u64 high, low; | ||
| 558 | |||
| 559 | a = ((uint128_t) small1[0]) * small2[0]; | ||
| 560 | low = a; | ||
| 561 | high = a >> 64; | ||
| 562 | out[0] = low; | ||
| 563 | out[1] = high; | ||
| 564 | |||
| 565 | |||
| 566 | a = ((uint128_t) small1[0]) * small2[1]; | ||
| 567 | low = a; | ||
| 568 | high = a >> 64; | ||
| 569 | out[1] += low; | ||
| 570 | out[2] = high; | ||
| 571 | |||
| 572 | a = ((uint128_t) small1[1]) * small2[0]; | ||
| 573 | low = a; | ||
| 574 | high = a >> 64; | ||
| 575 | out[1] += low; | ||
| 576 | out[2] += high; | ||
| 577 | |||
| 578 | |||
| 579 | a = ((uint128_t) small1[0]) * small2[2]; | ||
| 580 | low = a; | ||
| 581 | high = a >> 64; | ||
| 582 | out[2] += low; | ||
| 583 | out[3] = high; | ||
| 584 | |||
| 585 | a = ((uint128_t) small1[1]) * small2[1]; | ||
| 586 | low = a; | ||
| 587 | high = a >> 64; | ||
| 588 | out[2] += low; | ||
| 589 | out[3] += high; | ||
| 590 | |||
| 591 | a = ((uint128_t) small1[2]) * small2[0]; | ||
| 592 | low = a; | ||
| 593 | high = a >> 64; | ||
| 594 | out[2] += low; | ||
| 595 | out[3] += high; | ||
| 596 | |||
| 597 | |||
| 598 | a = ((uint128_t) small1[0]) * small2[3]; | ||
| 599 | low = a; | ||
| 600 | high = a >> 64; | ||
| 601 | out[3] += low; | ||
| 602 | out[4] = high; | ||
| 603 | |||
| 604 | a = ((uint128_t) small1[1]) * small2[2]; | ||
| 605 | low = a; | ||
| 606 | high = a >> 64; | ||
| 607 | out[3] += low; | ||
| 608 | out[4] += high; | ||
| 609 | |||
| 610 | a = ((uint128_t) small1[2]) * small2[1]; | ||
| 611 | low = a; | ||
| 612 | high = a >> 64; | ||
| 613 | out[3] += low; | ||
| 614 | out[4] += high; | ||
| 615 | |||
| 616 | a = ((uint128_t) small1[3]) * small2[0]; | ||
| 617 | low = a; | ||
| 618 | high = a >> 64; | ||
| 619 | out[3] += low; | ||
| 620 | out[4] += high; | ||
| 621 | |||
| 622 | |||
| 623 | a = ((uint128_t) small1[1]) * small2[3]; | ||
| 624 | low = a; | ||
| 625 | high = a >> 64; | ||
| 626 | out[4] += low; | ||
| 627 | out[5] = high; | ||
| 628 | |||
| 629 | a = ((uint128_t) small1[2]) * small2[2]; | ||
| 630 | low = a; | ||
| 631 | high = a >> 64; | ||
| 632 | out[4] += low; | ||
| 633 | out[5] += high; | ||
| 634 | |||
| 635 | a = ((uint128_t) small1[3]) * small2[1]; | ||
| 636 | low = a; | ||
| 637 | high = a >> 64; | ||
| 638 | out[4] += low; | ||
| 639 | out[5] += high; | ||
| 640 | |||
| 641 | |||
| 642 | a = ((uint128_t) small1[2]) * small2[3]; | ||
| 643 | low = a; | ||
| 644 | high = a >> 64; | ||
| 645 | out[5] += low; | ||
| 646 | out[6] = high; | ||
| 647 | |||
| 648 | a = ((uint128_t) small1[3]) * small2[2]; | ||
| 649 | low = a; | ||
| 650 | high = a >> 64; | ||
| 651 | out[5] += low; | ||
| 652 | out[6] += high; | ||
| 653 | |||
| 654 | |||
| 655 | a = ((uint128_t) small1[3]) * small2[3]; | ||
| 656 | low = a; | ||
| 657 | high = a >> 64; | ||
| 658 | out[6] += low; | ||
| 659 | out[7] = high; | ||
| 660 | } | ||
| 661 | |||
| 662 | /* felem_mul sets |out| = |in1| * |in2| | ||
| 663 | * On entry: | ||
| 664 | * in1[i] < 2^109 | ||
| 665 | * in2[i] < 2^109 | ||
| 666 | * On exit: | ||
| 667 | * out[i] < 7 * 2^64 < 2^67 | ||
| 668 | */ | ||
| 669 | static void felem_mul(longfelem out, const felem in1, const felem in2) | ||
| 670 | { | ||
| 671 | smallfelem small1, small2; | ||
| 672 | felem_shrink(small1, in1); | ||
| 673 | felem_shrink(small2, in2); | ||
| 674 | smallfelem_mul(out, small1, small2); | ||
| 675 | } | ||
| 676 | |||
| 677 | /* felem_small_mul sets |out| = |small1| * |in2| | ||
| 678 | * On entry: | ||
| 679 | * small1[i] < 2^64 | ||
| 680 | * in2[i] < 2^109 | ||
| 681 | * On exit: | ||
| 682 | * out[i] < 7 * 2^64 < 2^67 | ||
| 683 | */ | ||
| 684 | static void felem_small_mul(longfelem out, const smallfelem small1, const felem in2) | ||
| 685 | { | ||
| 686 | smallfelem small2; | ||
| 687 | felem_shrink(small2, in2); | ||
| 688 | smallfelem_mul(out, small1, small2); | ||
| 689 | } | ||
| 690 | |||
| 691 | #define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4) | ||
| 692 | #define two100 (((limb)1) << 100) | ||
| 693 | #define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4) | ||
| 694 | /* zero100 is 0 mod p */ | ||
| 695 | static const felem zero100 = { two100m36m4, two100, two100m36p4, two100m36p4 }; | ||
| 696 | |||
/* Internal function for the different flavours of felem_reduce.
 * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 *
 * The bracketed lists below, e.g. [(0,1),(96,-1),(192,-1),(224,1)], record
 * how the power of two carried by each high limb rewrites mod p: each pair
 * is (bit position, coefficient) of a replacement term spread over the four
 * 64-bit output positions.  Arithmetic is signed 128-bit; callers pre-load
 * |out| with a multiple of p large enough that no limb goes negative.
 *
 * On entry:
 *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 *   out[1] >= in[7] + 2^32*in[4]
 *   out[2] >= in[5] + 2^32*in[5]
 *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 * On exit:
 *   out[0] <= out[0] + in[4] + 2^32*in[5]
 *   out[1] <= out[1] + in[5] + 2^33*in[6]
 *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 */
static void felem_reduce_(felem out, const longfelem in)
	{
	int128_t c;
	/* combine common terms from below */
	c = in[4] + (in[5] << 32);
	out[0] += c;
	out[3] -= c;

	c = in[5] - in[7];
	out[1] += c;
	out[2] -= c;

	/* the remaining terms */
	/* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
	out[1] -= (in[4] << 32);
	out[3] += (in[4] << 32);

	/* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
	out[2] -= (in[5] << 32);

	/* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
	out[0] -= in[6];
	out[0] -= (in[6] << 32);
	out[1] += (in[6] << 33);
	out[2] += (in[6] * 2);
	out[3] -= (in[6] << 32);

	/* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
	out[0] -= in[7];
	out[0] -= (in[7] << 32);
	out[2] += (in[7] << 33);
	out[3] += (in[7] * 3);
	}
| 743 | |||
/* felem_reduce converts a longfelem into an felem.
 * To be called directly after felem_square or felem_mul.
 * On entry:
 *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2^64
 * On exit:
 *   out[i] < 2^101
 */
static void felem_reduce(felem out, const longfelem in)
	{
	/* Seed each output limb with 0 mod p (zero100) so the signed
	 * adjustments in felem_reduce_ cannot drive any limb negative. */
	out[0] = zero100[0] + in[0];
	out[1] = zero100[1] + in[1];
	out[2] = zero100[2] + in[2];
	out[3] = zero100[3] + in[3];

	felem_reduce_(out, in);

	/* out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
	 * out[1] > 2^100 - 2^64 - 7*2^96 > 0
	 * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
	 * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
	 *
	 * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
	 * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
	 * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
	 * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
	 */
	}
| 772 | |||
/* felem_reduce_zero105 converts a larger longfelem into an felem.
 * On entry:
 *   in[0] < 2^71 (NOTE(review): the bound analysis below applies 2^71 to
 *   every in[i], so presumably all limbs satisfy it — confirm at call sites)
 * On exit:
 *   out[i] < 2^106
 */
static void felem_reduce_zero105(felem out, const longfelem in)
	{
	/* Seed each output limb with 0 mod p (zero105) so the signed
	 * adjustments in felem_reduce_ cannot drive any limb negative. */
	out[0] = zero105[0] + in[0];
	out[1] = zero105[1] + in[1];
	out[2] = zero105[2] + in[2];
	out[3] = zero105[3] + in[3];

	felem_reduce_(out, in);

	/* out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
	 * out[1] > 2^105 - 2^71 - 2^103 > 0
	 * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
	 * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
	 *
	 * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
	 * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
	 * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
	 * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
	 */
	}
| 799 | |||
| 800 | /* subtract_u64 sets *result = *result - v and *carry to one if the subtraction | ||
| 801 | * underflowed. */ | ||
| 802 | static void subtract_u64(u64* result, u64* carry, u64 v) | ||
| 803 | { | ||
| 804 | uint128_t r = *result; | ||
| 805 | r -= v; | ||
| 806 | *carry = (r >> 64) & 1; | ||
| 807 | *result = (u64) r; | ||
| 808 | } | ||
| 809 | |||
/* felem_contract converts |in| to its unique, minimal representation
 * (fully reduced below p), in constant time.
 * On entry:
 *   in[i] < 2^109
 */
static void felem_contract(smallfelem out, const felem in)
	{
	unsigned i;
	u64 all_equal_so_far = 0, result = 0, carry;

	felem_shrink(out, in);
	/* small is minimal except that the value might be > p */

	all_equal_so_far--;	/* wraps to all-ones: "equal so far" mask */
	/* We are doing a constant time test if out >= kPrime. We need to
	 * compare each u64, from most-significant to least significant. For
	 * each one, if all words so far have been equal (m is all ones) then a
	 * non-equal result is the answer. Otherwise we continue. */
	for (i = 3; i < 4; i--)	/* i runs 3,2,1,0; unsigned wrap ends loop */
		{
		u64 equal;
		uint128_t a = ((uint128_t) kPrime[i]) - out[i];
		/* if out[i] > kPrime[i] then a will underflow and the high
		 * 64-bits will all be set. */
		result |= all_equal_so_far & ((u64) (a >> 64));

		/* if kPrime[i] == out[i] then |equal| will be all zeros and
		 * the decrement will make it all ones. */
		equal = kPrime[i] ^ out[i];
		equal--;
		/* smear the top bit of |equal| down through every bit */
		equal &= equal << 32;
		equal &= equal << 16;
		equal &= equal << 8;
		equal &= equal << 4;
		equal &= equal << 2;
		equal &= equal << 1;
		equal = ((s64) equal) >> 63;

		all_equal_so_far &= equal;
		}

	/* if all_equal_so_far is still all ones then the two values are equal
	 * and so out >= kPrime is true. */
	result |= all_equal_so_far;

	/* if out >= kPrime then we subtract kPrime (mask-selected), rippling
	 * the borrow through the higher limbs after each masked subtract. */
	subtract_u64(&out[0], &carry, result & kPrime[0]);
	subtract_u64(&out[1], &carry, carry);
	subtract_u64(&out[2], &carry, carry);
	subtract_u64(&out[3], &carry, carry);

	subtract_u64(&out[1], &carry, result & kPrime[1]);
	subtract_u64(&out[2], &carry, carry);
	subtract_u64(&out[3], &carry, carry);

	subtract_u64(&out[2], &carry, result & kPrime[2]);
	subtract_u64(&out[3], &carry, carry);

	subtract_u64(&out[3], &carry, result & kPrime[3]);
	}
| 869 | |||
| 870 | static void smallfelem_square_contract(smallfelem out, const smallfelem in) | ||
| 871 | { | ||
| 872 | longfelem longtmp; | ||
| 873 | felem tmp; | ||
| 874 | |||
| 875 | smallfelem_square(longtmp, in); | ||
| 876 | felem_reduce(tmp, longtmp); | ||
| 877 | felem_contract(out, tmp); | ||
| 878 | } | ||
| 879 | |||
| 880 | static void smallfelem_mul_contract(smallfelem out, const smallfelem in1, const smallfelem in2) | ||
| 881 | { | ||
| 882 | longfelem longtmp; | ||
| 883 | felem tmp; | ||
| 884 | |||
| 885 | smallfelem_mul(longtmp, in1, in2); | ||
| 886 | felem_reduce(tmp, longtmp); | ||
| 887 | felem_contract(out, tmp); | ||
| 888 | } | ||
| 889 | |||
/* smallfelem_is_zero returns a limb with all bits set if |small| == 0 (mod p)
 * and 0 otherwise, in constant time.  Both representations of zero are
 * detected: the value 0 and the value p itself (felem_shrink may leave
 * either).
 * On entry:
 *   small[i] < 2^64
 */
static limb smallfelem_is_zero(const smallfelem small)
	{
	limb result;
	u64 is_p;

	u64 is_zero = small[0] | small[1] | small[2] | small[3];
	/* is_zero is 0 iff all limbs were 0; the decrement then makes it
	 * all-ones, and the smear below turns "any bit clear" into 0. */
	is_zero--;
	is_zero &= is_zero << 32;
	is_zero &= is_zero << 16;
	is_zero &= is_zero << 8;
	is_zero &= is_zero << 4;
	is_zero &= is_zero << 2;
	is_zero &= is_zero << 1;
	is_zero = ((s64) is_zero) >> 63;	/* arithmetic smear of top bit */

	/* Same trick, testing small == kPrime (p, the other zero form). */
	is_p = (small[0] ^ kPrime[0]) |
	    (small[1] ^ kPrime[1]) |
	    (small[2] ^ kPrime[2]) |
	    (small[3] ^ kPrime[3]);
	is_p--;
	is_p &= is_p << 32;
	is_p &= is_p << 16;
	is_p &= is_p << 8;
	is_p &= is_p << 4;
	is_p &= is_p << 2;
	is_p &= is_p << 1;
	is_p = ((s64) is_p) >> 63;

	is_zero |= is_p;

	/* Duplicate the 64-bit mask into both halves of the 128-bit limb. */
	result = is_zero;
	result |= ((limb) is_zero) << 64;
	return result;
	}
| 929 | |||
| 930 | static int smallfelem_is_zero_int(const smallfelem small) | ||
| 931 | { | ||
| 932 | return (int) (smallfelem_is_zero(small) & ((limb)1)); | ||
| 933 | } | ||
| 934 | |||
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The fixed square-and-multiply addition chain below raises |in| to
 * p - 2 = 2^256 - 2^224 + 2^192 + 2^96 - 3 (see the final comment).  Each
 * trailing comment records the exponent accumulated so far; the statement
 * order is load-bearing and must not change.
 */
static void felem_inv(felem out, const felem in)
	{
	felem ftmp, ftmp2;
	/* each e_I will hold |in|^{2^I - 1} */
	felem e2, e4, e8, e16, e32, e64;
	longfelem tmp;
	unsigned i;

	felem_square(tmp, in); felem_reduce(ftmp, tmp);			/* 2^1 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);		/* 2^2 - 2^0 */
	felem_assign(e2, ftmp);
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^3 - 2^1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^4 - 2^2 */
	felem_mul(tmp, ftmp, e2); felem_reduce(ftmp, tmp);		/* 2^4 - 2^0 */
	felem_assign(e4, ftmp);
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^5 - 2^1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^6 - 2^2 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^7 - 2^3 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^8 - 2^4 */
	felem_mul(tmp, ftmp, e4); felem_reduce(ftmp, tmp);		/* 2^8 - 2^0 */
	felem_assign(e8, ftmp);
	for (i = 0; i < 8; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^16 - 2^8 */
	felem_mul(tmp, ftmp, e8); felem_reduce(ftmp, tmp);		/* 2^16 - 2^0 */
	felem_assign(e16, ftmp);
	for (i = 0; i < 16; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^32 - 2^16 */
	felem_mul(tmp, ftmp, e16); felem_reduce(ftmp, tmp);		/* 2^32 - 2^0 */
	felem_assign(e32, ftmp);
	for (i = 0; i < 32; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^64 - 2^32 */
	felem_assign(e64, ftmp);
	felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp);		/* 2^64 - 2^32 + 2^0 */
	for (i = 0; i < 192; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^256 - 2^224 + 2^192 */

	/* Build the low part of the exponent (2^96 - 3) in ftmp2. */
	felem_mul(tmp, e64, e32); felem_reduce(ftmp2, tmp);		/* 2^64 - 2^0 */
	for (i = 0; i < 16; i++) {
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}								/* 2^80 - 2^16 */
	felem_mul(tmp, ftmp2, e16); felem_reduce(ftmp2, tmp);		/* 2^80 - 2^0 */
	for (i = 0; i < 8; i++) {
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}								/* 2^88 - 2^8 */
	felem_mul(tmp, ftmp2, e8); felem_reduce(ftmp2, tmp);		/* 2^88 - 2^0 */
	for (i = 0; i < 4; i++) {
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}								/* 2^92 - 2^4 */
	felem_mul(tmp, ftmp2, e4); felem_reduce(ftmp2, tmp);		/* 2^92 - 2^0 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^93 - 2^1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^94 - 2^2 */
	felem_mul(tmp, ftmp2, e2); felem_reduce(ftmp2, tmp);		/* 2^94 - 2^0 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^95 - 2^1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^96 - 2^2 */
	felem_mul(tmp, ftmp2, in); felem_reduce(ftmp2, tmp);		/* 2^96 - 3 */

	felem_mul(tmp, ftmp2, ftmp); felem_reduce(out, tmp);		/* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
	}
| 1004 | |||
| 1005 | static void smallfelem_inv_contract(smallfelem out, const smallfelem in) | ||
| 1006 | { | ||
| 1007 | felem tmp; | ||
| 1008 | |||
| 1009 | smallfelem_expand(tmp, in); | ||
| 1010 | felem_inv(tmp, tmp); | ||
| 1011 | felem_contract(out, tmp); | ||
| 1012 | } | ||
| 1013 | |||
| 1014 | /* Group operations | ||
| 1015 | * ---------------- | ||
| 1016 | * | ||
| 1017 | * Building on top of the field operations we have the operations on the | ||
| 1018 | * elliptic curve group itself. Points on the curve are represented in Jacobian | ||
| 1019 | * coordinates */ | ||
| 1020 | |||
| 1021 | /* point_double calculates 2*(x_in, y_in, z_in) | ||
| 1022 | * | ||
| 1023 | * The method is taken from: | ||
| 1024 | * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b | ||
| 1025 | * | ||
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested). */
static void
point_double(felem x_out, felem y_out, felem z_out,
	const felem x_in, const felem y_in, const felem z_in)
	{
	/* The bracketed comments below track per-limb magnitude bounds;
	 * they justify that no intermediate overflows before the next
	 * felem_reduce/felem_shrink. Do not reorder operations without
	 * re-deriving them. */
	longfelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;
	smallfelem small1, small2;

	felem_assign(ftmp, x_in);
	/* ftmp[i] < 2^106 */
	felem_assign(ftmp2, x_in);
	/* ftmp2[i] < 2^106 */

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);
	/* delta[i] < 2^101 */

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);
	/* gamma[i] < 2^101 */
	felem_shrink(small1, gamma);

	/* beta = x*gamma */
	felem_small_mul(tmp, small1, x_in);
	felem_reduce(beta, tmp);
	/* beta[i] < 2^101 */

	/* alpha = 3*(x-delta)*(x+delta)
	 * (this shortcut for 3*x^2 + a*z^4 is valid because the curve
	 * coefficient a is -3; see the dbl-2001-b reference above) */
	felem_diff(ftmp, delta);
	/* ftmp[i] < 2^105 + 2^106 < 2^107 */
	felem_sum(ftmp2, delta);
	/* ftmp2[i] < 2^105 + 2^106 < 2^107 */
	felem_scalar(ftmp2, 3);
	/* ftmp2[i] < 3 * 2^107 < 2^109 */
	felem_mul(tmp, ftmp, ftmp2);
	felem_reduce(alpha, tmp);
	/* alpha[i] < 2^101 */
	felem_shrink(small2, alpha);

	/* x' = alpha^2 - 8*beta */
	smallfelem_square(tmp, small2);
	felem_reduce(x_out, tmp);
	felem_assign(ftmp, beta);
	felem_scalar(ftmp, 8);
	/* ftmp[i] < 8 * 2^101 = 2^104 */
	felem_diff(x_out, ftmp);
	/* x_out[i] < 2^105 + 2^101 < 2^106 */

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum(delta, gamma);
	/* delta[i] < 2^101 + 2^101 = 2^102 */
	felem_assign(ftmp, y_in);
	felem_sum(ftmp, z_in);
	/* ftmp[i] < 2^106 + 2^106 = 2^107 */
	felem_square(tmp, ftmp);
	felem_reduce(z_out, tmp);
	felem_diff(z_out, delta);
	/* z_out[i] < 2^105 + 2^101 < 2^106 */

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar(beta, 4);
	/* beta[i] < 4 * 2^101 = 2^103 */
	felem_diff_zero107(beta, x_out);
	/* beta[i] < 2^107 + 2^103 < 2^108 */
	felem_small_mul(tmp, small2, beta);
	/* tmp[i] < 7 * 2^64 < 2^67 */
	smallfelem_square(tmp2, small1);
	/* tmp2[i] < 7 * 2^64 */
	longfelem_scalar(tmp2, 8);
	/* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
	longfelem_diff(tmp, tmp2);
	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
	felem_reduce_zero105(y_out, tmp);
	/* y_out[i] < 2^106 */
	}
| 1105 | |||
| 1106 | /* point_double_small is the same as point_double, except that it operates on | ||
| 1107 | * smallfelems */ | ||
| 1108 | static void | ||
| 1109 | point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out, | ||
| 1110 | const smallfelem x_in, const smallfelem y_in, const smallfelem z_in) | ||
| 1111 | { | ||
| 1112 | felem felem_x_out, felem_y_out, felem_z_out; | ||
| 1113 | felem felem_x_in, felem_y_in, felem_z_in; | ||
| 1114 | |||
| 1115 | smallfelem_expand(felem_x_in, x_in); | ||
| 1116 | smallfelem_expand(felem_y_in, y_in); | ||
| 1117 | smallfelem_expand(felem_z_in, z_in); | ||
| 1118 | point_double(felem_x_out, felem_y_out, felem_z_out, | ||
| 1119 | felem_x_in, felem_y_in, felem_z_in); | ||
| 1120 | felem_shrink(x_out, felem_x_out); | ||
| 1121 | felem_shrink(y_out, felem_y_out); | ||
| 1122 | felem_shrink(z_out, felem_z_out); | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | /* copy_conditional copies in to out iff mask is all ones. */ | ||
| 1126 | static void | ||
| 1127 | copy_conditional(felem out, const felem in, limb mask) | ||
| 1128 | { | ||
| 1129 | unsigned i; | ||
| 1130 | for (i = 0; i < NLIMBS; ++i) | ||
| 1131 | { | ||
| 1132 | const limb tmp = mask & (in[i] ^ out[i]); | ||
| 1133 | out[i] ^= tmp; | ||
| 1134 | } | ||
| 1135 | } | ||
| 1136 | |||
| 1137 | /* copy_small_conditional copies in to out iff mask is all ones. */ | ||
| 1138 | static void | ||
| 1139 | copy_small_conditional(felem out, const smallfelem in, limb mask) | ||
| 1140 | { | ||
| 1141 | unsigned i; | ||
| 1142 | const u64 mask64 = mask; | ||
| 1143 | for (i = 0; i < NLIMBS; ++i) | ||
| 1144 | { | ||
| 1145 | out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask); | ||
| 1146 | } | ||
| 1147 | } | ||
| 1148 | |||
/* point_add calculates (x1, y1, z1) + (x2, y2, z2)
| 1150 | * | ||
| 1151 | * The method is taken from: | ||
| 1152 | * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, | ||
| 1153 | * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). | ||
| 1154 | * | ||
| 1155 | * This function includes a branch for checking whether the two input points | ||
| 1156 | * are equal, (while not equal to the point at infinity). This case never | ||
| 1157 | * happens during single point multiplication, so there is no timing leak for | ||
| 1158 | * ECDH or ECDSA signing. */ | ||
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const smallfelem x2, const smallfelem y2, const smallfelem z2)
	{
	/* add-2007-bl with the second point in small form. The bracketed
	 * comments track per-limb magnitude bounds between reductions. */
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	longfelem tmp, tmp2;
	smallfelem small1, small2, small3, small4, small5;
	limb x_equal, y_equal, z1_is_zero, z2_is_zero;

	felem_shrink(small3, z1);

	/* infinity flags for the two inputs, consumed by the masked
	 * copies at the end of the function */
	z1_is_zero = smallfelem_is_zero(small3);
	z2_is_zero = smallfelem_is_zero(z2);

	/* ftmp = z1z1 = z1**2 */
	smallfelem_square(tmp, small3);
	felem_reduce(ftmp, tmp);
	/* ftmp[i] < 2^101 */
	felem_shrink(small1, ftmp);

	if(!mixed)
		{
		/* ftmp2 = z2z2 = z2**2 */
		smallfelem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);
		/* ftmp2[i] < 2^101 */
		felem_shrink(small2, ftmp2);

		felem_shrink(small5, x1);

		/* u1 = ftmp3 = x1*z2z2 */
		smallfelem_mul(tmp, small5, small2);
		felem_reduce(ftmp3, tmp);
		/* ftmp3[i] < 2^101 */

		/* ftmp5 = z1 + z2 */
		felem_assign(ftmp5, z1);
		felem_small_sum(ftmp5, z2);
		/* ftmp5[i] < 2^107 */

		/* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
		felem_square(tmp, ftmp5);
		felem_reduce(ftmp5, tmp);
		/* ftmp2 = z2z2 + z1z1 */
		felem_sum(ftmp2, ftmp);
		/* ftmp2[i] < 2^101 + 2^101 = 2^102 */
		felem_diff(ftmp5, ftmp2);
		/* ftmp5[i] < 2^105 + 2^101 < 2^106 */

		/* ftmp2 = z2 * z2z2 */
		smallfelem_mul(tmp, small2, z2);
		felem_reduce(ftmp2, tmp);

		/* s1 = ftmp2 = y1 * z2**3 */
		felem_mul(tmp, y1, ftmp2);
		felem_reduce(ftmp6, tmp);
		/* ftmp6[i] < 2^101 */
		}
	else
		{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* u1 = ftmp3 = x1*z2z2 */
		felem_assign(ftmp3, x1);
		/* ftmp3[i] < 2^106 */

		/* ftmp5 = 2z1z2 */
		felem_assign(ftmp5, z1);
		felem_scalar(ftmp5, 2);
		/* ftmp5[i] < 2*2^106 = 2^107 */

		/* s1 = ftmp2 = y1 * z2**3 */
		felem_assign(ftmp6, y1);
		/* ftmp6[i] < 2^106 */
		}

	/* u2 = x2*z1z1 */
	smallfelem_mul(tmp, x2, small1);
	felem_reduce(ftmp4, tmp);

	/* h = ftmp4 = u2 - u1 */
	felem_diff_zero107(ftmp4, ftmp3);
	/* ftmp4[i] < 2^107 + 2^101 < 2^108 */
	felem_shrink(small4, ftmp4);

	x_equal = smallfelem_is_zero(small4);

	/* z_out = ftmp5 * h */
	felem_small_mul(tmp, small4, ftmp5);
	felem_reduce(z_out, tmp);
	/* z_out[i] < 2^101 */

	/* ftmp = z1 * z1z1 */
	smallfelem_mul(tmp, small1, small3);
	felem_reduce(ftmp, tmp);

	/* s2 = tmp = y2 * z1**3 */
	felem_small_mul(tmp, y2, ftmp);
	felem_reduce(ftmp5, tmp);

	/* r = ftmp5 = (s2 - s1)*2 */
	felem_diff_zero107(ftmp5, ftmp6);
	/* ftmp5[i] < 2^107 + 2^107 = 2^108*/
	felem_scalar(ftmp5, 2);
	/* ftmp5[i] < 2^109 */
	felem_shrink(small1, ftmp5);
	y_equal = smallfelem_is_zero(small1);

	/* NOTE(review): this branch dispatches to the doubling formula
	 * when both inputs are the same finite point. Per the comment
	 * above the function, it is unreachable during single-scalar
	 * multiplication; confirm that callers performing multi-scalar
	 * multiplication cannot reach it with secret-dependent timing. */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
		}

	/* I = ftmp = (2h)**2 */
	felem_assign(ftmp, ftmp4);
	felem_scalar(ftmp, 2);
	/* ftmp[i] < 2*2^108 = 2^109 */
	felem_square(tmp, ftmp);
	felem_reduce(ftmp, tmp);

	/* J = ftmp2 = h * I */
	felem_mul(tmp, ftmp4, ftmp);
	felem_reduce(ftmp2, tmp);

	/* V = ftmp4 = U1 * I */
	felem_mul(tmp, ftmp3, ftmp);
	felem_reduce(ftmp4, tmp);

	/* x_out = r**2 - J - 2V */
	smallfelem_square(tmp, small1);
	felem_reduce(x_out, tmp);
	felem_assign(ftmp3, ftmp4);
	felem_scalar(ftmp4, 2);
	felem_sum(ftmp4, ftmp2);
	/* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
	felem_diff(x_out, ftmp4);
	/* x_out[i] < 2^105 + 2^101 */

	/* y_out = r(V-x_out) - 2 * s1 * J */
	felem_diff_zero107(ftmp3, x_out);
	/* ftmp3[i] < 2^107 + 2^101 < 2^108 */
	felem_small_mul(tmp, small1, ftmp3);
	felem_mul(tmp2, ftmp6, ftmp2);
	longfelem_scalar(tmp2, 2);
	/* tmp2[i] < 2*2^67 = 2^68 */
	longfelem_diff(tmp, tmp2);
	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
	felem_reduce_zero105(y_out, tmp);
	/* y_out[i] < 2^106 */

	/* Handle the points at infinity with constant-time masked copies:
	 * if z1 == 0 the result is (x2, y2, z2); if z2 == 0 it is
	 * (x1, y1, z1). */
	copy_small_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_small_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_small_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
	}
| 1320 | |||
| 1321 | /* point_add_small is the same as point_add, except that it operates on | ||
| 1322 | * smallfelems */ | ||
| 1323 | static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3, | ||
| 1324 | smallfelem x1, smallfelem y1, smallfelem z1, | ||
| 1325 | smallfelem x2, smallfelem y2, smallfelem z2) | ||
| 1326 | { | ||
| 1327 | felem felem_x3, felem_y3, felem_z3; | ||
| 1328 | felem felem_x1, felem_y1, felem_z1; | ||
| 1329 | smallfelem_expand(felem_x1, x1); | ||
| 1330 | smallfelem_expand(felem_y1, y1); | ||
| 1331 | smallfelem_expand(felem_z1, z1); | ||
| 1332 | point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0, x2, y2, z2); | ||
| 1333 | felem_shrink(x3, felem_x3); | ||
| 1334 | felem_shrink(y3, felem_y3); | ||
| 1335 | felem_shrink(z3, felem_z3); | ||
| 1336 | } | ||
| 1337 | |||
| 1338 | /* Base point pre computation | ||
| 1339 | * -------------------------- | ||
| 1340 | * | ||
| 1341 | * Two different sorts of precomputed tables are used in the following code. | ||
 * Each contains various points on the curve, where each point is three field
| 1343 | * elements (x, y, z). | ||
| 1344 | * | ||
| 1345 | * For the base point table, z is usually 1 (0 for the point at infinity). | ||
| 1346 | * This table has 2 * 16 elements, starting with the following: | ||
| 1347 | * index | bits | point | ||
| 1348 | * ------+---------+------------------------------ | ||
| 1349 | * 0 | 0 0 0 0 | 0G | ||
| 1350 | * 1 | 0 0 0 1 | 1G | ||
| 1351 | * 2 | 0 0 1 0 | 2^64G | ||
| 1352 | * 3 | 0 0 1 1 | (2^64 + 1)G | ||
| 1353 | * 4 | 0 1 0 0 | 2^128G | ||
| 1354 | * 5 | 0 1 0 1 | (2^128 + 1)G | ||
| 1355 | * 6 | 0 1 1 0 | (2^128 + 2^64)G | ||
| 1356 | * 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G | ||
| 1357 | * 8 | 1 0 0 0 | 2^192G | ||
| 1358 | * 9 | 1 0 0 1 | (2^192 + 1)G | ||
| 1359 | * 10 | 1 0 1 0 | (2^192 + 2^64)G | ||
| 1360 | * 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G | ||
| 1361 | * 12 | 1 1 0 0 | (2^192 + 2^128)G | ||
| 1362 | * 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G | ||
| 1363 | * 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G | ||
| 1364 | * 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G | ||
| 1365 | * followed by a copy of this with each element multiplied by 2^32. | ||
| 1366 | * | ||
| 1367 | * The reason for this is so that we can clock bits into four different | ||
| 1368 | * locations when doing simple scalar multiplies against the base point, | ||
| 1369 | * and then another four locations using the second 16 elements. | ||
| 1370 | * | ||
| 1371 | * Tables for other points have table[i] = iG for i in 0 .. 16. */ | ||
| 1372 | |||
| 1373 | /* gmul is the table of precomputed base points */ | ||
static const smallfelem gmul[2][16][3] =
/* gmul[0]: multiples of G indexed by bits at positions 0/64/128/192 of
 * the scalar (see table above); entry 0 is the point at infinity and
 * each point is (x, y, z) with z = 1 for finite points. Limbs are
 * little-endian (least significant limb first). */
{{{{0, 0, 0, 0},
   {0, 0, 0, 0},
   {0, 0, 0, 0}},
  {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2, 0x6b17d1f2e12c4247},
   {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16, 0x4fe342e2fe1a7f9b},
   {1, 0, 0, 0}},
  {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de, 0x0fa822bc2811aaa5},
   {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b, 0xbff44ae8f5dba80d},
   {1, 0, 0, 0}},
  {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789, 0x300a4bbc89d6726f},
   {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f, 0x72aac7e0d09b4644},
   {1, 0, 0, 0}},
  {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e, 0x447d739beedb5e67},
   {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7, 0x2d4825ab834131ee},
   {1, 0, 0, 0}},
  {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60, 0xef9519328a9c72ff},
   {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c, 0x611e9fc37dbb2c9b},
   {1, 0, 0, 0}},
  {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf, 0x550663797b51f5d8},
   {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5, 0x157164848aecb851},
   {1, 0, 0, 0}},
  {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391, 0xeb5d7745b21141ea},
   {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee, 0xeafd72ebdbecc17b},
   {1, 0, 0, 0}},
  {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5, 0xa6d39677a7849276},
   {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf, 0x674f84749b0b8816},
   {1, 0, 0, 0}},
  {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb, 0x4e769e7672c9ddad},
   {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281, 0x42b99082de830663},
   {1, 0, 0, 0}},
  {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478, 0x78878ef61c6ce04d},
   {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def, 0xb6cb3f5d7b72c321},
   {1, 0, 0, 0}},
  {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae, 0x0c88bc4d716b1287},
   {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa, 0xdd5ddea3f3901dc6},
   {1, 0, 0, 0}},
  {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3, 0x68f344af6b317466},
   {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3, 0x31b9c405f8540a20},
   {1, 0, 0, 0}},
  {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0, 0x4052bf4b6f461db9},
   {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8, 0xfecf4d5190b0fc61},
   {1, 0, 0, 0}},
  {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a, 0x1eddbae2c802e41a},
   {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0, 0x43104d86560ebcfc},
   {1, 0, 0, 0}},
  {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a, 0xb48e26b484f7a21c},
   {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668, 0xfac015404d4d3dab},
   {1, 0, 0, 0}}},
/* gmul[1]: the same multiples scaled by 2^32, used for bits at
 * positions 32/96/160/224 of the scalar. */
 {{{0, 0, 0, 0},
   {0, 0, 0, 0},
   {0, 0, 0, 0}},
  {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da, 0x7fe36b40af22af89},
   {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1, 0xe697d45825b63624},
   {1, 0, 0, 0}},
  {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902, 0x4a5b506612a677a6},
   {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40, 0xeb13461ceac089f1},
   {1, 0, 0, 0}},
  {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857, 0x0781b8291c6a220a},
   {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434, 0x690cde8df0151593},
   {1, 0, 0, 0}},
  {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326, 0x8a535f566ec73617},
   {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf, 0x0455c08468b08bd7},
   {1, 0, 0, 0}},
  {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279, 0x06bada7ab77f8276},
   {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70, 0x5b476dfd0e6cb18a},
   {1, 0, 0, 0}},
  {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8, 0x3e29864e8a2ec908},
   {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed, 0x239b90ea3dc31e7e},
   {1, 0, 0, 0}},
  {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4, 0x820f4dd949f72ff7},
   {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3, 0x140406ec783a05ec},
   {1, 0, 0, 0}},
  {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe, 0x68f6b8542783dfee},
   {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028, 0xcbe1feba92e40ce6},
   {1, 0, 0, 0}},
  {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927, 0xd0b2f94d2f420109},
   {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a, 0x971459828b0719e5},
   {1, 0, 0, 0}},
  {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687, 0x961610004a866aba},
   {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c, 0x7acb9fadcee75e44},
   {1, 0, 0, 0}},
  {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea, 0x24eb9acca333bf5b},
   {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d, 0x69f891c5acd079cc},
   {1, 0, 0, 0}},
  {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514, 0xe51f547c5972a107},
   {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06, 0x1c309a2b25bb1387},
   {1, 0, 0, 0}},
  {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828, 0x20b87b8aa2c4e503},
   {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044, 0xf5c6fa49919776be},
   {1, 0, 0, 0}},
  {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56, 0x1ed7d1b9332010b9},
   {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24, 0x3a2b03f03217257a},
   {1, 0, 0, 0}},
  {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b, 0x15fee545c78dd9f6},
   {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb, 0x4ab5b6b2b8753f81},
   {1, 0, 0, 0}}}};
| 1471 | |||
| 1472 | /* select_point selects the |idx|th point from a precomputation table and | ||
| 1473 | * copies it to out. */ | ||
| 1474 | static void select_point(const u64 idx, unsigned int size, const smallfelem pre_comp[16][3], smallfelem out[3]) | ||
| 1475 | { | ||
| 1476 | unsigned i, j; | ||
| 1477 | u64 *outlimbs = &out[0][0]; | ||
| 1478 | memset(outlimbs, 0, 3 * sizeof(smallfelem)); | ||
| 1479 | |||
| 1480 | for (i = 0; i < size; i++) | ||
| 1481 | { | ||
| 1482 | const u64 *inlimbs = (u64*) &pre_comp[i][0][0]; | ||
| 1483 | u64 mask = i ^ idx; | ||
| 1484 | mask |= mask >> 4; | ||
| 1485 | mask |= mask >> 2; | ||
| 1486 | mask |= mask >> 1; | ||
| 1487 | mask &= 1; | ||
| 1488 | mask--; | ||
| 1489 | for (j = 0; j < NLIMBS * 3; j++) | ||
| 1490 | outlimbs[j] |= inlimbs[j] & mask; | ||
| 1491 | } | ||
| 1492 | } | ||
| 1493 | |||
| 1494 | /* get_bit returns the |i|th bit in |in| */ | ||
| 1495 | static char get_bit(const felem_bytearray in, int i) | ||
| 1496 | { | ||
| 1497 | if ((i < 0) || (i >= 256)) | ||
| 1498 | return 0; | ||
| 1499 | return (in[i >> 3] >> (i & 7)) & 1; | ||
| 1500 | } | ||
| 1501 | |||
| 1502 | /* Interleaved point multiplication using precomputed point multiples: | ||
| 1503 | * The small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], | ||
| 1504 | * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple | ||
| 1505 | * of the generator, using certain (large) precomputed multiples in g_pre_comp. | ||
| 1506 | * Output point (X, Y, Z) is stored in x_out, y_out, z_out */ | ||
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const smallfelem pre_comp[][17][3], const smallfelem g_pre_comp[2][16][3])
	{
	int i, skip;
	unsigned num, gen_mul = (g_scalar != NULL);
	felem nq[3], ftmp;
	smallfelem tmp[3];
	u64 bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (two in each of the last 32 rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1; /* save two point operations in the first round */
	/* Only 32 iterations are needed when there is just the generator
	 * scalar: its table lookups cover 8 bit positions per round. */
	for (i = (num_points ? 255 : 31); i >= 0; --i)
		{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator */
		if (gen_mul && (i <= 31))
			{
			/* first, look 32 bits upwards: one bit from each of
			 * four 64-bit words of the scalar, matching the
			 * layout of g_pre_comp[1] (multiples scaled by 2^32) */
			bits = get_bit(g_scalar, i + 224) << 3;
			bits |= get_bit(g_scalar, i + 160) << 2;
			bits |= get_bit(g_scalar, i + 96) << 1;
			bits |= get_bit(g_scalar, i + 32);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[1], tmp);

			if (!skip)
				{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
				}
			else
				{
				/* first addition: the accumulator is still at
				 * infinity, so just load the selected point */
				smallfelem_expand(nq[0], tmp[0]);
				smallfelem_expand(nq[1], tmp[1]);
				smallfelem_expand(nq[2], tmp[2]);
				skip = 0;
				}

			/* second, look at the current position */
			bits = get_bit(g_scalar, i + 192) << 3;
			bits |= get_bit(g_scalar, i + 128) << 2;
			bits |= get_bit(g_scalar, i + 64) << 1;
			bits |= get_bit(g_scalar, i);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[0], tmp);
			point_add(nq[0], nq[1], nq[2],
				nq[0], nq[1], nq[2],
				1 /* mixed */, tmp[0], tmp[1], tmp[2]);
			}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				{
				/* 6-bit signed window: bits i-1 .. i+4 of the
				 * scalar, recoded into a sign and a digit in
				 * 0..16 by ec_GFp_nistp_recode_scalar_bits */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract, in constant time */
				select_point(digit, 17, pre_comp[num], tmp);
				smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
				felem_contract(tmp[1], ftmp);

				if (!skip)
					{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
					}
				else
					{
					smallfelem_expand(nq[0], tmp[0]);
					smallfelem_expand(nq[1], tmp[1]);
					smallfelem_expand(nq[2], tmp[2]);
					skip = 0;
					}
				}
			}
		}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
	}
| 1609 | |||
| 1610 | /* Precomputation for the group generator. */ | ||
typedef struct {
	/* two tables of 16 precomputed generator multiples; see the
	 * layout description above gmul */
	smallfelem g_pre_comp[2][16][3];
	/* reference count, managed with CRYPTO_add under
	 * CRYPTO_LOCK_EC_PRE_COMP */
	int references;
} NISTP256_PRE_COMP;
| 1615 | |||
const EC_METHOD *EC_GFp_nistp256_method(void)
	{
	/* Method table for the constant-time NIST P-256 implementation:
	 * nistp256-specific entries for group init, curve setup, affine
	 * coordinate recovery and scalar multiplication; all remaining
	 * slots fall back to the generic ec_GFp_simple_* / ec_GFp_nist_*
	 * implementations. */
	static const EC_METHOD ret = {
		EC_FLAGS_DEFAULT_OCT,
		NID_X9_62_prime_field,
		ec_GFp_nistp256_group_init,
		ec_GFp_simple_group_finish,
		ec_GFp_simple_group_clear_finish,
		ec_GFp_nist_group_copy,
		ec_GFp_nistp256_group_set_curve,
		ec_GFp_simple_group_get_curve,
		ec_GFp_simple_group_get_degree,
		ec_GFp_simple_group_check_discriminant,
		ec_GFp_simple_point_init,
		ec_GFp_simple_point_finish,
		ec_GFp_simple_point_clear_finish,
		ec_GFp_simple_point_copy,
		ec_GFp_simple_point_set_to_infinity,
		ec_GFp_simple_set_Jprojective_coordinates_GFp,
		ec_GFp_simple_get_Jprojective_coordinates_GFp,
		ec_GFp_simple_point_set_affine_coordinates,
		ec_GFp_nistp256_point_get_affine_coordinates,
		0 /* point_set_compressed_coordinates */,
		0 /* point2oct */,
		0 /* oct2point */,
		ec_GFp_simple_add,
		ec_GFp_simple_dbl,
		ec_GFp_simple_invert,
		ec_GFp_simple_is_at_infinity,
		ec_GFp_simple_is_on_curve,
		ec_GFp_simple_cmp,
		ec_GFp_simple_make_affine,
		ec_GFp_simple_points_make_affine,
		ec_GFp_nistp256_points_mul,
		ec_GFp_nistp256_precompute_mult,
		ec_GFp_nistp256_have_precompute_mult,
		ec_GFp_nist_field_mul,
		ec_GFp_nist_field_sqr,
		0 /* field_div */,
		0 /* field_encode */,
		0 /* field_decode */,
		0 /* field_set_to_one */ };

	return &ret;
	}
| 1661 | |||
| 1662 | /******************************************************************************/ | ||
| 1663 | /* FUNCTIONS TO MANAGE PRECOMPUTATION | ||
| 1664 | */ | ||
| 1665 | |||
| 1666 | static NISTP256_PRE_COMP *nistp256_pre_comp_new() | ||
| 1667 | { | ||
| 1668 | NISTP256_PRE_COMP *ret = NULL; | ||
| 1669 | ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret); | ||
| 1670 | if (!ret) | ||
| 1671 | { | ||
| 1672 | ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); | ||
| 1673 | return ret; | ||
| 1674 | } | ||
| 1675 | memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); | ||
| 1676 | ret->references = 1; | ||
| 1677 | return ret; | ||
| 1678 | } | ||
| 1679 | |||
| 1680 | static void *nistp256_pre_comp_dup(void *src_) | ||
| 1681 | { | ||
| 1682 | NISTP256_PRE_COMP *src = src_; | ||
| 1683 | |||
| 1684 | /* no need to actually copy, these objects never change! */ | ||
| 1685 | CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1686 | |||
| 1687 | return src_; | ||
| 1688 | } | ||
| 1689 | |||
| 1690 | static void nistp256_pre_comp_free(void *pre_) | ||
| 1691 | { | ||
| 1692 | int i; | ||
| 1693 | NISTP256_PRE_COMP *pre = pre_; | ||
| 1694 | |||
| 1695 | if (!pre) | ||
| 1696 | return; | ||
| 1697 | |||
| 1698 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1699 | if (i > 0) | ||
| 1700 | return; | ||
| 1701 | |||
| 1702 | OPENSSL_free(pre); | ||
| 1703 | } | ||
| 1704 | |||
| 1705 | static void nistp256_pre_comp_clear_free(void *pre_) | ||
| 1706 | { | ||
| 1707 | int i; | ||
| 1708 | NISTP256_PRE_COMP *pre = pre_; | ||
| 1709 | |||
| 1710 | if (!pre) | ||
| 1711 | return; | ||
| 1712 | |||
| 1713 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1714 | if (i > 0) | ||
| 1715 | return; | ||
| 1716 | |||
| 1717 | OPENSSL_cleanse(pre, sizeof *pre); | ||
| 1718 | OPENSSL_free(pre); | ||
| 1719 | } | ||
| 1720 | |||
| 1721 | /******************************************************************************/ | ||
| 1722 | /* OPENSSL EC_METHOD FUNCTIONS | ||
| 1723 | */ | ||
| 1724 | |||
| 1725 | int ec_GFp_nistp256_group_init(EC_GROUP *group) | ||
| 1726 | { | ||
| 1727 | int ret; | ||
| 1728 | ret = ec_GFp_simple_group_init(group); | ||
| 1729 | group->a_is_minus3 = 1; | ||
| 1730 | return ret; | ||
| 1731 | } | ||
| 1732 | |||
/* Sets the curve parameters, accepting only the exact NIST P-256 values
 * baked into nistp256_curve_params; any other (p, a, b) is rejected with
 * EC_R_WRONG_CURVE_PARAMETERS. Returns 1 on success, 0 on failure. */
int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
	{
	int ret = 0;
	BN_CTX *new_ctx = NULL;
	BIGNUM *curve_p, *curve_a, *curve_b;

	/* allocate a context of our own if the caller did not supply one */
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
		((curve_a = BN_CTX_get(ctx)) == NULL) ||
		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
	BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
	BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
	BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
		(BN_cmp(curve_b, b)))
		{
		ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
			EC_R_WRONG_CURVE_PARAMETERS);
		goto err;
		}
	/* install the specialised NIST reduction for this field */
	group->field_mod_func = BN_nist_mod_256;
	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
err:
	/* BN_CTX_end pairs with the BN_CTX_start above; the context is
	 * only freed if we allocated it ourselves */
	BN_CTX_end(ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return ret;
	}
| 1764 | |||
/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 * (X', Y') = (X/Z^2, Y/Z^3).
 * Either of |x| and |y| may be NULL if that coordinate is not wanted.
 * Returns 1 on success; 0 (with an EC error pushed) if |point| is the
 * point at infinity or a conversion fails. */
int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
	{
	felem z1, z2, x_in, y_in;
	smallfelem x_out, y_out;
	longfelem tmp;

	/* the point at infinity has no affine representation */
	if (EC_POINT_is_at_infinity(group, point))
		{
		ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
			EC_R_POINT_AT_INFINITY);
		return 0;
		}
	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
		(!BN_to_felem(z1, &point->Z))) return 0;
	/* z2 = Z^-1 */
	felem_inv(z2, z1);
	/* z1 = Z^-2; x_in = X * Z^-2 */
	felem_square(tmp, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
	/* contract to the unique minimal representation before export */
	felem_contract(x_out, x_in);
	if (x != NULL)
		{
		if (!smallfelem_to_BN(x, x_out)) {
			ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
				ERR_R_BN_LIB);
			return 0;
		}
		}
	/* z1 = Z^-3; y_in = Y * Z^-3 */
	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
	felem_contract(y_out, y_in);
	if (y != NULL)
		{
		if (!smallfelem_to_BN(y, y_out))
			{
			ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
				ERR_R_BN_LIB);
			return 0;
			}
		}
	return 1;
	}
| 1808 | |||
/* make_points_affine converts |num| Jacobian points (in place) to affine
 * form by delegating to the shared nistp batch helper, passing the
 * smallfelem primitives it needs as callbacks.  |tmp_smallfelems| must
 * provide num+1 elements of scratch space (see the parameter comment).
 * NOTE(review): presumably the helper amortises a single field inversion
 * across all points — confirm against ec_GFp_nistp_points_make_affine_internal. */
static void make_points_affine(size_t num, smallfelem points[/* num */][3], smallfelem tmp_smallfelems[/* num+1 */])
	{
	/* Runs in constant time, unless an input is the point at infinity
	 * (which normally shouldn't happen). */
	ec_GFp_nistp_points_make_affine_internal(
		num,
		points,
		sizeof(smallfelem),
		tmp_smallfelems,
		(void (*)(void *)) smallfelem_one,
		(int (*)(const void *)) smallfelem_is_zero_int,
		(void (*)(void *, const void *)) smallfelem_assign,
		(void (*)(void *, const void *)) smallfelem_square_contract,
		(void (*)(void *, const void *, const void *)) smallfelem_mul_contract,
		(void (*)(void *, const void *)) smallfelem_inv_contract,
		(void (*)(void *, const void *)) smallfelem_assign /* nothing to contract */);
	}
| 1826 | |||
/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
 * Result is stored in r (r can equal one of the inputs).
 *
 * Returns 1 on success, 0 on error.  For each non-NULL (point, scalar) pair
 * a table pre_comp[i][j] of the multiples j*P_i for j = 0..16 is built
 * (index 0, the point at infinity, is the all-zero encoding left by the
 * memset below); the combined multiplication itself happens in batch_mul.
 * Scalars outside [0, 2^256) are first reduced mod the group order, at the
 * cost of constant-timeness. */
int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
	const BIGNUM *scalars[], BN_CTX *ctx)
	{
	int ret = 0;
	int j;
	int mixed = 0;			/* nonzero => tables converted to affine */
	BN_CTX *new_ctx = NULL;		/* only set if we allocated ctx ourselves */
	BIGNUM *x, *y, *z, *tmp_scalar;
	felem_bytearray g_secret;	/* little-endian scalar for the generator */
	felem_bytearray *secrets = NULL; /* little-endian scalars, one per point */
	smallfelem (*pre_comp)[17][3] = NULL; /* per-point multiples 0..16 */
	smallfelem *tmp_smallfelems = NULL;   /* scratch for make_points_affine */
	felem_bytearray tmp;
	unsigned i, num_bytes;
	int have_pre_comp = 0;
	size_t num_points = num;
	smallfelem x_in, y_in, z_in;
	felem x_out, y_out, z_out;
	NISTP256_PRE_COMP *pre = NULL;
	const smallfelem (*g_pre_comp)[16][3] = NULL;
	EC_POINT *generator = NULL;
	const EC_POINT *p = NULL;
	const BIGNUM *p_scalar = NULL;

	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL) ||
		((z = BN_CTX_get(ctx)) == NULL) ||
		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
		goto err;

	if (scalar != NULL)
		{
		pre = EC_EX_DATA_get_data(group->extra_data,
			nistp256_pre_comp_dup, nistp256_pre_comp_free,
			nistp256_pre_comp_clear_free);
		if (pre)
			/* we have precomputation, try to use it */
			g_pre_comp = (const smallfelem (*)[16][3]) pre->g_pre_comp;
		else
			/* try to use the standard precomputation */
			g_pre_comp = &gmul[0];
		generator = EC_POINT_new(group);
		if (generator == NULL)
			goto err;
		/* get the generator from precomputation */
		if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
			!smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
			!smallfelem_to_BN(z, g_pre_comp[0][1][2]))
			{
			ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
			goto err;
			}
		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
			generator, x, y, z, ctx))
			goto err;
		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
			/* precomputation matches generator */
			have_pre_comp = 1;
		else
			/* we don't have valid precomputation:
			 * treat the generator as a random point */
			num_points++;
		}
	if (num_points > 0)
		{
		if (num_points >= 3)
			{
			/* unless we precompute multiples for just one or two points,
			 * converting those into affine form is time well spent */
			mixed = 1;
			}
		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
		if (mixed)
			tmp_smallfelems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_smallfelems == NULL)))
			{
			ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
			goto err;
			}

		/* we treat NULL scalars as 0, and NULL points as points at infinity,
		 * i.e., they contribute nothing to the linear combination */
		memset(secrets, 0, num_points * sizeof(felem_bytearray));
		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
		for (i = 0; i < num_points; ++i)
			{
			if (i == num)
				/* we didn't have a valid precomputation, so we pick
				 * the generator */
				{
				p = EC_GROUP_get0_generator(group);
				p_scalar = scalar;
				}
			else
				/* the i^th point */
				{
				p = points[i];
				p_scalar = scalars[i];
				}
			if ((p_scalar != NULL) && (p != NULL))
				{
				/* reduce scalar to 0 <= scalar < 2^256 */
				if ((BN_num_bits(p_scalar) > 256) || (BN_is_negative(p_scalar)))
					{
					/* this is an unusual input, and we don't guarantee
					 * constant-timeness */
					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
						{
						ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
						goto err;
						}
					num_bytes = BN_bn2bin(tmp_scalar, tmp);
					}
				else
					num_bytes = BN_bn2bin(p_scalar, tmp);
				flip_endian(secrets[i], tmp, num_bytes);
				/* precompute multiples: [1] = P, even j by doubling
				 * j/2, odd j by adding P to j-1 */
				if ((!BN_to_felem(x_out, &p->X)) ||
					(!BN_to_felem(y_out, &p->Y)) ||
					(!BN_to_felem(z_out, &p->Z))) goto err;
				felem_shrink(pre_comp[i][1][0], x_out);
				felem_shrink(pre_comp[i][1][1], y_out);
				felem_shrink(pre_comp[i][1][2], z_out);
				for (j = 2; j <= 16; ++j)
					{
					if (j & 1)
						{
						point_add_small(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
							pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
						}
					else
						{
						point_double_small(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
						}
					}
				}
			}
		if (mixed)
			make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
		}

	/* the scalar for the generator */
	if ((scalar != NULL) && (have_pre_comp))
		{
		memset(g_secret, 0, sizeof(g_secret));
		/* reduce scalar to 0 <= scalar < 2^256 */
		if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar)))
			{
			/* this is an unusual input, and we don't guarantee
			 * constant-timeness */
			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
				{
				ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
				goto err;
				}
			num_bytes = BN_bn2bin(tmp_scalar, tmp);
			}
		else
			num_bytes = BN_bn2bin(scalar, tmp);
		flip_endian(g_secret, tmp, num_bytes);
		/* do the multiplication with generator precomputation*/
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			g_secret,
			mixed, (const smallfelem (*)[17][3]) pre_comp,
			g_pre_comp);
		}
	else
		/* do the multiplication without generator precomputation */
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			NULL, mixed, (const smallfelem (*)[17][3]) pre_comp, NULL);
	/* reduce the output to its unique minimal representation */
	felem_contract(x_in, x_out);
	felem_contract(y_in, y_out);
	felem_contract(z_in, z_out);
	if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
		(!smallfelem_to_BN(z, z_in)))
		{
		ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
		goto err;
		}
	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (secrets != NULL)
		OPENSSL_free(secrets);
	if (pre_comp != NULL)
		OPENSSL_free(pre_comp);
	if (tmp_smallfelems != NULL)
		OPENSSL_free(tmp_smallfelems);
	return ret;
	}
| 2036 | |||
/* ec_GFp_nistp256_precompute_mult builds and attaches generator
 * precomputation for |group|.  Two tables are filled:
 *   g_pre_comp[0] holds combinations of G, 2^64*G, 2^128*G, 2^192*G,
 *   g_pre_comp[1] holds combinations of 2^32*G, 2^96*G, 2^160*G, 2^224*G
 * (see the loop comments below).  If the group uses the standard generator
 * the built-in gmul table is simply copied.  Returns 1 on success, 0 on
 * error.  Ownership of |pre| passes to the group on success. */
int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
	{
	int ret = 0;
	NISTP256_PRE_COMP *pre = NULL;
	int i, j;
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y;
	EC_POINT *generator = NULL;
	smallfelem tmp_smallfelems[32];
	felem x_tmp, y_tmp, z_tmp;

	/* throw away old precomputation */
	EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
		nistp256_pre_comp_free, nistp256_pre_comp_clear_free);
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL))
		goto err;
	/* get the generator */
	if (group->generator == NULL) goto err;
	generator = EC_POINT_new(group);
	if (generator == NULL)
		goto err;
	/* build the standard generator point from the hard-coded params */
	BN_bin2bn(nistp256_curve_params[3], sizeof (felem_bytearray), x);
	BN_bin2bn(nistp256_curve_params[4], sizeof (felem_bytearray), y);
	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
		goto err;
	if ((pre = nistp256_pre_comp_new()) == NULL)
		goto err;
	/* if the generator is the standard one, use built-in precomputation */
	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
		{
		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
		ret = 1;
		/* err: is also the success exit; pre is handed over below */
		goto err;
		}
	if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
		(!BN_to_felem(y_tmp, &group->generator->Y)) ||
		(!BN_to_felem(z_tmp, &group->generator->Z)))
		goto err;
	felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
	felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
	felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
	/* compute 2^64*G, 2^128*G, 2^192*G for the first table,
	 * 2^32*G, 2^96*G, 2^160*G, 2^224*G for the second one
	 */
	for (i = 1; i <= 8; i <<= 1)
		{
		/* table[1][i] = 2^32 * table[0][i]: one doubling here plus
		 * 31 more in the inner loop */
		point_double_small(
			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
			pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
		for (j = 0; j < 31; ++j)
			{
			point_double_small(
				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
			}
		if (i == 8)
			break;
		/* table[0][2i] = 2^32 * table[1][i] (i.e. 2^64 * table[0][i]) */
		point_double_small(
			pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
		for (j = 0; j < 31; ++j)
			{
			point_double_small(
				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]);
			}
		}
	for (i = 0; i < 2; i++)
		{
		/* g_pre_comp[i][0] is the point at infinity */
		memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
		/* the remaining multiples */
		/* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
		point_add_small(
			pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], pre->g_pre_comp[i][6][2],
			pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
		/* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
		point_add_small(
			pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], pre->g_pre_comp[i][10][2],
			pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
		/* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
		point_add_small(
			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
			pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
			pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2]);
		/* 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G */
		point_add_small(
			pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], pre->g_pre_comp[i][14][2],
			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
		for (j = 1; j < 8; ++j)
			{
			/* odd multiples: add G resp. 2^32*G */
			point_add_small(
				pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], pre->g_pre_comp[i][2*j+1][2],
				pre->g_pre_comp[i][2*j][0], pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2],
				pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], pre->g_pre_comp[i][1][2]);
			}
		}
	/* 2 tables * 16 entries, minus the two infinity entries = 31 points */
	make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);

	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
		nistp256_pre_comp_free, nistp256_pre_comp_clear_free))
		goto err;
	ret = 1;
	pre = NULL;	/* ownership transferred to group->extra_data */
 err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (pre)
		nistp256_pre_comp_free(pre);
	return ret;
	}
| 2159 | |||
| 2160 | int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group) | ||
| 2161 | { | ||
| 2162 | if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup, | ||
| 2163 | nistp256_pre_comp_free, nistp256_pre_comp_clear_free) | ||
| 2164 | != NULL) | ||
| 2165 | return 1; | ||
| 2166 | else | ||
| 2167 | return 0; | ||
| 2168 | } | ||
| 2169 | #else | ||
| 2170 | static void *dummy=&dummy; | ||
| 2171 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_nistp521.c b/src/lib/libcrypto/ec/ecp_nistp521.c new file mode 100644 index 0000000000..178b655f7f --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp521.c | |||
| @@ -0,0 +1,2025 @@ | |||
| 1 | /* crypto/ec/ecp_nistp521.c */ | ||
| 2 | /* | ||
| 3 | * Written by Adam Langley (Google) for the OpenSSL project | ||
| 4 | */ | ||
| 5 | /* Copyright 2011 Google Inc. | ||
| 6 | * | ||
| 7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 8 | * | ||
| 9 | * you may not use this file except in compliance with the License. | ||
| 10 | * You may obtain a copy of the License at | ||
| 11 | * | ||
| 12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
| 13 | * | ||
| 14 | * Unless required by applicable law or agreed to in writing, software | ||
| 15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
| 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 17 | * See the License for the specific language governing permissions and | ||
| 18 | * limitations under the License. | ||
| 19 | */ | ||
| 20 | |||
| 21 | /* | ||
| 22 | * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication | ||
| 23 | * | ||
| 24 | * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. | ||
| 25 | * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 | ||
| 26 | * work which got its smarts from Daniel J. Bernstein's work on the same. | ||
| 27 | */ | ||
| 28 | |||
| 29 | #include <openssl/opensslconf.h> | ||
| 30 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
| 31 | |||
| 32 | #ifndef OPENSSL_SYS_VMS | ||
| 33 | #include <stdint.h> | ||
| 34 | #else | ||
| 35 | #include <inttypes.h> | ||
| 36 | #endif | ||
| 37 | |||
| 38 | #include <string.h> | ||
| 39 | #include <openssl/err.h> | ||
| 40 | #include "ec_lcl.h" | ||
| 41 | |||
| 42 | #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) | ||
| 43 | /* even with gcc, the typedef won't work for 32-bit platforms */ | ||
| 44 | typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ | ||
| 45 | #else | ||
| 46 | #error "Need GCC 3.1 or later to define type uint128_t" | ||
| 47 | #endif | ||
| 48 | |||
| 49 | typedef uint8_t u8; | ||
| 50 | typedef uint64_t u64; | ||
| 51 | typedef int64_t s64; | ||
| 52 | |||
| 53 | /* The underlying field. | ||
| 54 | * | ||
| 55 | * P521 operates over GF(2^521-1). We can serialise an element of this field | ||
| 56 | * into 66 bytes where the most significant byte contains only a single bit. We | ||
| 57 | * call this an felem_bytearray. */ | ||
| 58 | |||
| 59 | typedef u8 felem_bytearray[66]; | ||
| 60 | |||
| 61 | /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5. | ||
| 62 | * These values are big-endian. */ | ||
| 63 | static const felem_bytearray nistp521_curve_params[5] = | ||
| 64 | { | ||
| 65 | {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */ | ||
| 66 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 67 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 68 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 69 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 70 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 71 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 72 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 73 | 0xff, 0xff}, | ||
| 74 | {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */ | ||
| 75 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 76 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 77 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 78 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 79 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 80 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 81 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, | ||
| 82 | 0xff, 0xfc}, | ||
| 83 | {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */ | ||
| 84 | 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85, | ||
| 85 | 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3, | ||
| 86 | 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1, | ||
| 87 | 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e, | ||
| 88 | 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1, | ||
| 89 | 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c, | ||
| 90 | 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50, | ||
| 91 | 0x3f, 0x00}, | ||
| 92 | {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */ | ||
| 93 | 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95, | ||
| 94 | 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f, | ||
| 95 | 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d, | ||
| 96 | 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7, | ||
| 97 | 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff, | ||
| 98 | 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a, | ||
| 99 | 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5, | ||
| 100 | 0xbd, 0x66}, | ||
| 101 | {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */ | ||
| 102 | 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d, | ||
| 103 | 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b, | ||
| 104 | 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e, | ||
| 105 | 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4, | ||
| 106 | 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad, | ||
| 107 | 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72, | ||
| 108 | 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1, | ||
| 109 | 0x66, 0x50} | ||
| 110 | }; | ||
| 111 | |||
| 112 | /* The representation of field elements. | ||
| 113 | * ------------------------------------ | ||
| 114 | * | ||
| 115 | * We represent field elements with nine values. These values are either 64 or | ||
| 116 | * 128 bits and the field element represented is: | ||
| 117 | * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p) | ||
| 118 | * Each of the nine values is called a 'limb'. Since the limbs are spaced only | ||
| 119 | * 58 bits apart, but are greater than 58 bits in length, the most significant | ||
| 120 | * bits of each limb overlap with the least significant bits of the next. | ||
| 121 | * | ||
| 122 | * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a | ||
| 123 | * 'largefelem' */ | ||
| 124 | |||
| 125 | #define NLIMBS 9 | ||
| 126 | |||
| 127 | typedef uint64_t limb; | ||
| 128 | typedef limb felem[NLIMBS]; | ||
| 129 | typedef uint128_t largefelem[NLIMBS]; | ||
| 130 | |||
| 131 | static const limb bottom57bits = 0x1ffffffffffffff; | ||
| 132 | static const limb bottom58bits = 0x3ffffffffffffff; | ||
| 133 | |||
| 134 | /* bin66_to_felem takes a little-endian byte array and converts it into felem | ||
| 135 | * form. This assumes that the CPU is little-endian. */ | ||
| 136 | static void bin66_to_felem(felem out, const u8 in[66]) | ||
| 137 | { | ||
| 138 | out[0] = (*((limb*) &in[0])) & bottom58bits; | ||
| 139 | out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits; | ||
| 140 | out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits; | ||
| 141 | out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits; | ||
| 142 | out[4] = (*((limb*) &in[29])) & bottom58bits; | ||
| 143 | out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits; | ||
| 144 | out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits; | ||
| 145 | out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits; | ||
| 146 | out[8] = (*((limb*) &in[58])) & bottom57bits; | ||
| 147 | } | ||
| 148 | |||
| 149 | /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte | ||
| 150 | * array. This assumes that the CPU is little-endian. */ | ||
| 151 | static void felem_to_bin66(u8 out[66], const felem in) | ||
| 152 | { | ||
| 153 | memset(out, 0, 66); | ||
| 154 | (*((limb*) &out[0])) = in[0]; | ||
| 155 | (*((limb*) &out[7])) |= in[1] << 2; | ||
| 156 | (*((limb*) &out[14])) |= in[2] << 4; | ||
| 157 | (*((limb*) &out[21])) |= in[3] << 6; | ||
| 158 | (*((limb*) &out[29])) = in[4]; | ||
| 159 | (*((limb*) &out[36])) |= in[5] << 2; | ||
| 160 | (*((limb*) &out[43])) |= in[6] << 4; | ||
| 161 | (*((limb*) &out[50])) |= in[7] << 6; | ||
| 162 | (*((limb*) &out[58])) = in[8]; | ||
| 163 | } | ||
| 164 | |||
| 165 | /* To preserve endianness when using BN_bn2bin and BN_bin2bn */ | ||
| 166 | static void flip_endian(u8 *out, const u8 *in, unsigned len) | ||
| 167 | { | ||
| 168 | unsigned i; | ||
| 169 | for (i = 0; i < len; ++i) | ||
| 170 | out[i] = in[len-1-i]; | ||
| 171 | } | ||
| 172 | |||
/* BN_to_felem converts an OpenSSL BIGNUM into an felem.
 * Returns 1 on success; 0 (with EC_R_BIGNUM_OUT_OF_RANGE pushed) if |bn|
 * is negative or needs more than 66 bytes. */
static int BN_to_felem(felem out, const BIGNUM *bn)
	{
	felem_bytearray b_in;
	felem_bytearray b_out;
	unsigned num_bytes;

	/* BN_bn2bin eats leading zeroes */
	memset(b_out, 0, sizeof b_out);
	num_bytes = BN_num_bytes(bn);
	if (num_bytes > sizeof b_out)
		{
		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
		return 0;
		}
	if (BN_is_negative(bn))
		{
		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
		return 0;
		}
	/* big-endian from BN_bn2bin -> little-endian for bin66_to_felem */
	num_bytes = BN_bn2bin(bn, b_in);
	flip_endian(b_out, b_in, num_bytes);
	bin66_to_felem(out, b_out);
	return 1;
	}
| 198 | |||
/* felem_to_BN converts an felem into an OpenSSL BIGNUM.
 * Returns |out| on success, NULL on BN_bin2bn failure. */
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
	{
	felem_bytearray b_in, b_out;
	/* serialise little-endian, then reverse for big-endian BN_bin2bn */
	felem_to_bin66(b_in, in);
	flip_endian(b_out, b_in, sizeof b_out);
	return BN_bin2bn(b_out, sizeof b_out, out);
	}
| 207 | |||
| 208 | |||
| 209 | /* Field operations | ||
| 210 | * ---------------- */ | ||
| 211 | |||
| 212 | static void felem_one(felem out) | ||
| 213 | { | ||
| 214 | out[0] = 1; | ||
| 215 | out[1] = 0; | ||
| 216 | out[2] = 0; | ||
| 217 | out[3] = 0; | ||
| 218 | out[4] = 0; | ||
| 219 | out[5] = 0; | ||
| 220 | out[6] = 0; | ||
| 221 | out[7] = 0; | ||
| 222 | out[8] = 0; | ||
| 223 | } | ||
| 224 | |||
| 225 | static void felem_assign(felem out, const felem in) | ||
| 226 | { | ||
| 227 | out[0] = in[0]; | ||
| 228 | out[1] = in[1]; | ||
| 229 | out[2] = in[2]; | ||
| 230 | out[3] = in[3]; | ||
| 231 | out[4] = in[4]; | ||
| 232 | out[5] = in[5]; | ||
| 233 | out[6] = in[6]; | ||
| 234 | out[7] = in[7]; | ||
| 235 | out[8] = in[8]; | ||
| 236 | } | ||
| 237 | |||
| 238 | /* felem_sum64 sets out = out + in. */ | ||
| 239 | static void felem_sum64(felem out, const felem in) | ||
| 240 | { | ||
| 241 | out[0] += in[0]; | ||
| 242 | out[1] += in[1]; | ||
| 243 | out[2] += in[2]; | ||
| 244 | out[3] += in[3]; | ||
| 245 | out[4] += in[4]; | ||
| 246 | out[5] += in[5]; | ||
| 247 | out[6] += in[6]; | ||
| 248 | out[7] += in[7]; | ||
| 249 | out[8] += in[8]; | ||
| 250 | } | ||
| 251 | |||
| 252 | /* felem_scalar sets out = in * scalar */ | ||
| 253 | static void felem_scalar(felem out, const felem in, limb scalar) | ||
| 254 | { | ||
| 255 | out[0] = in[0] * scalar; | ||
| 256 | out[1] = in[1] * scalar; | ||
| 257 | out[2] = in[2] * scalar; | ||
| 258 | out[3] = in[3] * scalar; | ||
| 259 | out[4] = in[4] * scalar; | ||
| 260 | out[5] = in[5] * scalar; | ||
| 261 | out[6] = in[6] * scalar; | ||
| 262 | out[7] = in[7] * scalar; | ||
| 263 | out[8] = in[8] * scalar; | ||
| 264 | } | ||
| 265 | |||
| 266 | /* felem_scalar64 sets out = out * scalar */ | ||
| 267 | static void felem_scalar64(felem out, limb scalar) | ||
| 268 | { | ||
| 269 | out[0] *= scalar; | ||
| 270 | out[1] *= scalar; | ||
| 271 | out[2] *= scalar; | ||
| 272 | out[3] *= scalar; | ||
| 273 | out[4] *= scalar; | ||
| 274 | out[5] *= scalar; | ||
| 275 | out[6] *= scalar; | ||
| 276 | out[7] *= scalar; | ||
| 277 | out[8] *= scalar; | ||
| 278 | } | ||
| 279 | |||
| 280 | /* felem_scalar128 sets out = out * scalar */ | ||
| 281 | static void felem_scalar128(largefelem out, limb scalar) | ||
| 282 | { | ||
| 283 | out[0] *= scalar; | ||
| 284 | out[1] *= scalar; | ||
| 285 | out[2] *= scalar; | ||
| 286 | out[3] *= scalar; | ||
| 287 | out[4] *= scalar; | ||
| 288 | out[5] *= scalar; | ||
| 289 | out[6] *= scalar; | ||
| 290 | out[7] *= scalar; | ||
| 291 | out[8] *= scalar; | ||
| 292 | } | ||
| 293 | |||
| 294 | /* felem_neg sets |out| to |-in| | ||
| 295 | * On entry: | ||
| 296 | * in[i] < 2^59 + 2^14 | ||
| 297 | * On exit: | ||
| 298 | * out[i] < 2^62 | ||
| 299 | */ | ||
| 300 | static void felem_neg(felem out, const felem in) | ||
| 301 | { | ||
| 302 | /* In order to prevent underflow, we subtract from 0 mod p. */ | ||
| 303 | static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5); | ||
| 304 | static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4); | ||
| 305 | |||
| 306 | out[0] = two62m3 - in[0]; | ||
| 307 | out[1] = two62m2 - in[1]; | ||
| 308 | out[2] = two62m2 - in[2]; | ||
| 309 | out[3] = two62m2 - in[3]; | ||
| 310 | out[4] = two62m2 - in[4]; | ||
| 311 | out[5] = two62m2 - in[5]; | ||
| 312 | out[6] = two62m2 - in[6]; | ||
| 313 | out[7] = two62m2 - in[7]; | ||
| 314 | out[8] = two62m2 - in[8]; | ||
| 315 | } | ||
| 316 | |||
| 317 | /* felem_diff64 subtracts |in| from |out| | ||
| 318 | * On entry: | ||
| 319 | * in[i] < 2^59 + 2^14 | ||
| 320 | * On exit: | ||
| 321 | * out[i] < out[i] + 2^62 | ||
| 322 | */ | ||
| 323 | static void felem_diff64(felem out, const felem in) | ||
| 324 | { | ||
| 325 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
| 326 | static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5); | ||
| 327 | static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4); | ||
| 328 | |||
| 329 | out[0] += two62m3 - in[0]; | ||
| 330 | out[1] += two62m2 - in[1]; | ||
| 331 | out[2] += two62m2 - in[2]; | ||
| 332 | out[3] += two62m2 - in[3]; | ||
| 333 | out[4] += two62m2 - in[4]; | ||
| 334 | out[5] += two62m2 - in[5]; | ||
| 335 | out[6] += two62m2 - in[6]; | ||
| 336 | out[7] += two62m2 - in[7]; | ||
| 337 | out[8] += two62m2 - in[8]; | ||
| 338 | } | ||
| 339 | |||
| 340 | /* felem_diff_128_64 subtracts |in| from |out| | ||
| 341 | * On entry: | ||
| 342 | * in[i] < 2^62 + 2^17 | ||
| 343 | * On exit: | ||
| 344 | * out[i] < out[i] + 2^63 | ||
| 345 | */ | ||
| 346 | static void felem_diff_128_64(largefelem out, const felem in) | ||
| 347 | { | ||
| 348 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
| 349 | static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5); | ||
| 350 | static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4); | ||
| 351 | |||
| 352 | out[0] += two63m6 - in[0]; | ||
| 353 | out[1] += two63m5 - in[1]; | ||
| 354 | out[2] += two63m5 - in[2]; | ||
| 355 | out[3] += two63m5 - in[3]; | ||
| 356 | out[4] += two63m5 - in[4]; | ||
| 357 | out[5] += two63m5 - in[5]; | ||
| 358 | out[6] += two63m5 - in[6]; | ||
| 359 | out[7] += two63m5 - in[7]; | ||
| 360 | out[8] += two63m5 - in[8]; | ||
| 361 | } | ||
| 362 | |||
/* felem_diff128 subtracts |in| from |out|
 * (comment previously misnamed this function felem_diff_128_64)
 * On entry:
 *   in[i] < 2^126
 * On exit:
 *   out[i] < out[i] + 2^127 - 2^69
 */
static void felem_diff128(largefelem out, const largefelem in)
{
	/* In order to prevent underflow, we add 0 mod p before subtracting. */
	static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
	static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);

	out[0] += (two127m70 - in[0]);
	out[1] += (two127m69 - in[1]);
	out[2] += (two127m69 - in[2]);
	out[3] += (two127m69 - in[3]);
	out[4] += (two127m69 - in[4]);
	out[5] += (two127m69 - in[5]);
	out[6] += (two127m69 - in[6]);
	out[7] += (two127m69 - in[7]);
	out[8] += (two127m69 - in[8]);
}
| 385 | |||
/* felem_square sets |out| = |in|^2
 * On entry:
 *   in[i] < 2^62
 * On exit:
 *   out[i] < 17 * max(in[i]) * max(in[i])
 */
static void felem_square(largefelem out, const felem in)
{
	felem inx2, inx4;
	felem_scalar(inx2, in, 2);
	felem_scalar(inx4, in, 4);

	/* We have many cases where we want to do
	 *   in[x] * in[y] +
	 *   in[y] * in[x]
	 * This is obviously just
	 *   2 * in[x] * in[y]
	 * However, rather than do the doubling on the 128 bit result, we
	 * double one of the inputs to the multiplication by reading from
	 * |inx2| */

	/* Limb k of the product collects the cross terms in[i]*in[j] with
	 * i + j == k (squared terms appear once, mixed terms via inx2). */
	out[0] = ((uint128_t) in[0]) * in[0];
	out[1] = ((uint128_t) in[0]) * inx2[1];
	out[2] = ((uint128_t) in[0]) * inx2[2] +
		 ((uint128_t) in[1]) * in[1];
	out[3] = ((uint128_t) in[0]) * inx2[3] +
		 ((uint128_t) in[1]) * inx2[2];
	out[4] = ((uint128_t) in[0]) * inx2[4] +
		 ((uint128_t) in[1]) * inx2[3] +
		 ((uint128_t) in[2]) * in[2];
	out[5] = ((uint128_t) in[0]) * inx2[5] +
		 ((uint128_t) in[1]) * inx2[4] +
		 ((uint128_t) in[2]) * inx2[3];
	out[6] = ((uint128_t) in[0]) * inx2[6] +
		 ((uint128_t) in[1]) * inx2[5] +
		 ((uint128_t) in[2]) * inx2[4] +
		 ((uint128_t) in[3]) * in[3];
	out[7] = ((uint128_t) in[0]) * inx2[7] +
		 ((uint128_t) in[1]) * inx2[6] +
		 ((uint128_t) in[2]) * inx2[5] +
		 ((uint128_t) in[3]) * inx2[4];
	out[8] = ((uint128_t) in[0]) * inx2[8] +
		 ((uint128_t) in[1]) * inx2[7] +
		 ((uint128_t) in[2]) * inx2[6] +
		 ((uint128_t) in[3]) * inx2[5] +
		 ((uint128_t) in[4]) * in[4];

	/* The remaining limbs fall above 2^521, with the first falling at
	 * 2^522. They correspond to locations one bit up from the limbs
	 * produced above so we would have to multiply by two to align them.
	 * Again, rather than operate on the 128-bit result, we double one of
	 * the inputs to the multiplication. If we want to double for both this
	 * reason, and the reason above, then we end up multiplying by four. */

	/* 9 */
	out[0] += ((uint128_t) in[1]) * inx4[8] +
		  ((uint128_t) in[2]) * inx4[7] +
		  ((uint128_t) in[3]) * inx4[6] +
		  ((uint128_t) in[4]) * inx4[5];

	/* 10 */
	out[1] += ((uint128_t) in[2]) * inx4[8] +
		  ((uint128_t) in[3]) * inx4[7] +
		  ((uint128_t) in[4]) * inx4[6] +
		  ((uint128_t) in[5]) * inx2[5];

	/* 11 */
	out[2] += ((uint128_t) in[3]) * inx4[8] +
		  ((uint128_t) in[4]) * inx4[7] +
		  ((uint128_t) in[5]) * inx4[6];

	/* 12 */
	out[3] += ((uint128_t) in[4]) * inx4[8] +
		  ((uint128_t) in[5]) * inx4[7] +
		  ((uint128_t) in[6]) * inx2[6];

	/* 13 */
	out[4] += ((uint128_t) in[5]) * inx4[8] +
		  ((uint128_t) in[6]) * inx4[7];

	/* 14 */
	out[5] += ((uint128_t) in[6]) * inx4[8] +
		  ((uint128_t) in[7]) * inx2[7];

	/* 15 */
	out[6] += ((uint128_t) in[7]) * inx4[8];

	/* 16 */
	out[7] += ((uint128_t) in[8]) * inx2[8];
}
| 476 | |||
/* felem_mul sets |out| = |in1| * |in2|
 * On entry:
 *   in1[i] < 2^64
 *   in2[i] < 2^63
 * On exit:
 *   out[i] < 17 * max(in1[i]) * max(in2[i])
 */
static void felem_mul(largefelem out, const felem in1, const felem in2)
{
	felem in2x2;
	felem_scalar(in2x2, in2, 2);

	/* Limb k of the product collects the terms in1[i]*in2[j] with
	 * i + j == k. */
	out[0] = ((uint128_t) in1[0]) * in2[0];

	out[1] = ((uint128_t) in1[0]) * in2[1] +
		 ((uint128_t) in1[1]) * in2[0];

	out[2] = ((uint128_t) in1[0]) * in2[2] +
		 ((uint128_t) in1[1]) * in2[1] +
		 ((uint128_t) in1[2]) * in2[0];

	out[3] = ((uint128_t) in1[0]) * in2[3] +
		 ((uint128_t) in1[1]) * in2[2] +
		 ((uint128_t) in1[2]) * in2[1] +
		 ((uint128_t) in1[3]) * in2[0];

	out[4] = ((uint128_t) in1[0]) * in2[4] +
		 ((uint128_t) in1[1]) * in2[3] +
		 ((uint128_t) in1[2]) * in2[2] +
		 ((uint128_t) in1[3]) * in2[1] +
		 ((uint128_t) in1[4]) * in2[0];

	out[5] = ((uint128_t) in1[0]) * in2[5] +
		 ((uint128_t) in1[1]) * in2[4] +
		 ((uint128_t) in1[2]) * in2[3] +
		 ((uint128_t) in1[3]) * in2[2] +
		 ((uint128_t) in1[4]) * in2[1] +
		 ((uint128_t) in1[5]) * in2[0];

	out[6] = ((uint128_t) in1[0]) * in2[6] +
		 ((uint128_t) in1[1]) * in2[5] +
		 ((uint128_t) in1[2]) * in2[4] +
		 ((uint128_t) in1[3]) * in2[3] +
		 ((uint128_t) in1[4]) * in2[2] +
		 ((uint128_t) in1[5]) * in2[1] +
		 ((uint128_t) in1[6]) * in2[0];

	out[7] = ((uint128_t) in1[0]) * in2[7] +
		 ((uint128_t) in1[1]) * in2[6] +
		 ((uint128_t) in1[2]) * in2[5] +
		 ((uint128_t) in1[3]) * in2[4] +
		 ((uint128_t) in1[4]) * in2[3] +
		 ((uint128_t) in1[5]) * in2[2] +
		 ((uint128_t) in1[6]) * in2[1] +
		 ((uint128_t) in1[7]) * in2[0];

	out[8] = ((uint128_t) in1[0]) * in2[8] +
		 ((uint128_t) in1[1]) * in2[7] +
		 ((uint128_t) in1[2]) * in2[6] +
		 ((uint128_t) in1[3]) * in2[5] +
		 ((uint128_t) in1[4]) * in2[4] +
		 ((uint128_t) in1[5]) * in2[3] +
		 ((uint128_t) in1[6]) * in2[2] +
		 ((uint128_t) in1[7]) * in2[1] +
		 ((uint128_t) in1[8]) * in2[0];

	/* See comment in felem_square about the use of in2x2 here: these
	 * terms fall one bit above 2^521 per limb, so one input is doubled
	 * to fold them back. */

	out[0] += ((uint128_t) in1[1]) * in2x2[8] +
		  ((uint128_t) in1[2]) * in2x2[7] +
		  ((uint128_t) in1[3]) * in2x2[6] +
		  ((uint128_t) in1[4]) * in2x2[5] +
		  ((uint128_t) in1[5]) * in2x2[4] +
		  ((uint128_t) in1[6]) * in2x2[3] +
		  ((uint128_t) in1[7]) * in2x2[2] +
		  ((uint128_t) in1[8]) * in2x2[1];

	out[1] += ((uint128_t) in1[2]) * in2x2[8] +
		  ((uint128_t) in1[3]) * in2x2[7] +
		  ((uint128_t) in1[4]) * in2x2[6] +
		  ((uint128_t) in1[5]) * in2x2[5] +
		  ((uint128_t) in1[6]) * in2x2[4] +
		  ((uint128_t) in1[7]) * in2x2[3] +
		  ((uint128_t) in1[8]) * in2x2[2];

	out[2] += ((uint128_t) in1[3]) * in2x2[8] +
		  ((uint128_t) in1[4]) * in2x2[7] +
		  ((uint128_t) in1[5]) * in2x2[6] +
		  ((uint128_t) in1[6]) * in2x2[5] +
		  ((uint128_t) in1[7]) * in2x2[4] +
		  ((uint128_t) in1[8]) * in2x2[3];

	out[3] += ((uint128_t) in1[4]) * in2x2[8] +
		  ((uint128_t) in1[5]) * in2x2[7] +
		  ((uint128_t) in1[6]) * in2x2[6] +
		  ((uint128_t) in1[7]) * in2x2[5] +
		  ((uint128_t) in1[8]) * in2x2[4];

	out[4] += ((uint128_t) in1[5]) * in2x2[8] +
		  ((uint128_t) in1[6]) * in2x2[7] +
		  ((uint128_t) in1[7]) * in2x2[6] +
		  ((uint128_t) in1[8]) * in2x2[5];

	out[5] += ((uint128_t) in1[6]) * in2x2[8] +
		  ((uint128_t) in1[7]) * in2x2[7] +
		  ((uint128_t) in1[8]) * in2x2[6];

	out[6] += ((uint128_t) in1[7]) * in2x2[8] +
		  ((uint128_t) in1[8]) * in2x2[7];

	out[7] += ((uint128_t) in1[8]) * in2x2[8];
}
| 589 | |||
/* bottom52bits masks the least-significant 52 bits of a limb (2^52 - 1). */
static const limb bottom52bits = 0xfffffffffffff;
| 591 | |||
/* felem_reduce converts a largefelem to an felem.
 * On entry:
 *   in[i] < 2^128
 * On exit:
 *   out[i] < 2^59 + 2^14
 */
static void felem_reduce(felem out, const largefelem in)
{
	u64 overflow1, overflow2;

	/* Keep the bottom 58 bits of each 128-bit limb ... */
	out[0] = ((limb) in[0]) & bottom58bits;
	out[1] = ((limb) in[1]) & bottom58bits;
	out[2] = ((limb) in[2]) & bottom58bits;
	out[3] = ((limb) in[3]) & bottom58bits;
	out[4] = ((limb) in[4]) & bottom58bits;
	out[5] = ((limb) in[5]) & bottom58bits;
	out[6] = ((limb) in[6]) & bottom58bits;
	out[7] = ((limb) in[7]) & bottom58bits;
	out[8] = ((limb) in[8]) & bottom58bits;

	/* out[i] < 2^58 */

	/* ... and carry the remaining bits of in[i] (bits 58..127) into the
	 * next two output limbs: bits 58..63 plus the low 52 bits of the
	 * upper word go one limb up, the top 12 bits go two limbs up. */
	out[1] += ((limb) in[0]) >> 58;
	out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
	/* out[1] < 2^58 + 2^6 + 2^58
	 *        = 2^59 + 2^6 */
	out[2] += ((limb) (in[0] >> 64)) >> 52;

	out[2] += ((limb) in[1]) >> 58;
	out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
	out[3] += ((limb) (in[1] >> 64)) >> 52;

	out[3] += ((limb) in[2]) >> 58;
	out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
	out[4] += ((limb) (in[2] >> 64)) >> 52;

	out[4] += ((limb) in[3]) >> 58;
	out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
	out[5] += ((limb) (in[3] >> 64)) >> 52;

	out[5] += ((limb) in[4]) >> 58;
	out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
	out[6] += ((limb) (in[4] >> 64)) >> 52;

	out[6] += ((limb) in[5]) >> 58;
	out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
	out[7] += ((limb) (in[5] >> 64)) >> 52;

	out[7] += ((limb) in[6]) >> 58;
	out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
	out[8] += ((limb) (in[6] >> 64)) >> 52;

	out[8] += ((limb) in[7]) >> 58;
	out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
	/* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
	 *            < 2^59 + 2^13 */
	overflow1 = ((limb) (in[7] >> 64)) >> 52;

	overflow1 += ((limb) in[8]) >> 58;
	overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
	overflow2 = ((limb) (in[8] >> 64)) >> 52;

	/* Bits at and above position 521 wrap around: 2^522 == 2 (mod p),
	 * hence the doubling when folding the overflow back into out[0]
	 * and out[1]. */
	overflow1 <<= 1;	/* overflow1 < 2^13 + 2^7 + 2^59 */
	overflow2 <<= 1;	/* overflow2 < 2^13 */

	out[0] += overflow1;	/* out[0] < 2^60 */
	out[1] += overflow2;	/* out[1] < 2^59 + 2^6 + 2^13 */

	out[1] += out[0] >> 58; out[0] &= bottom58bits;
	/* out[0] < 2^58
	 * out[1] < 2^59 + 2^6 + 2^13 + 2^2
	 *        < 2^59 + 2^14 */
}
| 665 | |||
| 666 | static void felem_square_reduce(felem out, const felem in) | ||
| 667 | { | ||
| 668 | largefelem tmp; | ||
| 669 | felem_square(tmp, in); | ||
| 670 | felem_reduce(out, tmp); | ||
| 671 | } | ||
| 672 | |||
| 673 | static void felem_mul_reduce(felem out, const felem in1, const felem in2) | ||
| 674 | { | ||
| 675 | largefelem tmp; | ||
| 676 | felem_mul(tmp, in1, in2); | ||
| 677 | felem_reduce(out, tmp); | ||
| 678 | } | ||
| 679 | |||
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The addition chain below computes in^(2^521 - 3) = in^(p - 2); each
 * comment records the exponent held after that step. */
static void felem_inv(felem out, const felem in)
{
	felem ftmp, ftmp2, ftmp3, ftmp4;
	largefelem tmp;
	unsigned i;

	felem_square(tmp, in); felem_reduce(ftmp, tmp);		/* 2^1 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^2 - 2^0 */
	felem_assign(ftmp2, ftmp);
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^1 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^0 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^4 - 2^1 */

	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^3 - 2^1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^2 */
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^0 */

	felem_assign(ftmp2, ftmp3);
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^5 - 2^1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^6 - 2^2 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^7 - 2^3 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^4 */
	/* NOTE(review): this assignment is immediately overwritten by the
	 * felem_reduce into ftmp4 on the next line; it appears redundant. */
	felem_assign(ftmp4, ftmp3);
	felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);	/* 2^8 - 2^1 */
	felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);	/* 2^9 - 2^2 */
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 8; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^8 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 16; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^16 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 32; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^32 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 64; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^64 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 128; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^128 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 256; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^256 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^0 */

	for (i = 0; i < 9; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^9 */
		}
	/* (2^521 - 2^9) + (2^9 - 2^2) = 2^521 - 2^2; the original comments
	 * said 2^512 here, which does not match the arithmetic. */
	felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^2 */
	felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);	/* 2^521 - 3 */
}
| 763 | |||
/* This is 2^521-1, expressed as an felem: eight limbs of 2^58 - 1
 * (0x03ffffffffffffff) followed by one limb of 2^57 - 1. */
static const felem kPrime =
	{
	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
	0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
	};
| 771 | |||
/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 * otherwise.  Runs without data-dependent branches (constant time).
 * On entry:
 *   in[i] < 2^59 + 2^14
 */
static limb felem_is_zero(const felem in)
{
	felem ftmp;
	limb is_zero, is_p;
	felem_assign(ftmp, in);

	/* Propagate carries so every limb fits its nominal width. */
	ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
	/* ftmp[8] < 2^57 */
	ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
	ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
	ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
	ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
	ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
	ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
	ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
	ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
	/* ftmp[8] < 2^57 + 4 */

	/* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
	 * greater than our bound for ftmp[8]. Therefore we only have to check
	 * if the value is zero or 2^521-1. */

	/* OR all limbs together: the result is 0 iff every limb is 0. */
	is_zero = 0;
	is_zero |= ftmp[0];
	is_zero |= ftmp[1];
	is_zero |= ftmp[2];
	is_zero |= ftmp[3];
	is_zero |= ftmp[4];
	is_zero |= ftmp[5];
	is_zero |= ftmp[6];
	is_zero |= ftmp[7];
	is_zero |= ftmp[8];

	is_zero--;
	/* We know that ftmp[i] < 2^63, therefore the only way that the top bit
	 * can be set is if is_zero was 0 before the decrement. */
	/* Arithmetic shift smears the top bit into an all-ones/all-zeros mask. */
	is_zero = ((s64) is_zero) >> 63;

	/* Same trick for equality with p: XOR is 0 iff limbs match exactly. */
	is_p = ftmp[0] ^ kPrime[0];
	is_p |= ftmp[1] ^ kPrime[1];
	is_p |= ftmp[2] ^ kPrime[2];
	is_p |= ftmp[3] ^ kPrime[3];
	is_p |= ftmp[4] ^ kPrime[4];
	is_p |= ftmp[5] ^ kPrime[5];
	is_p |= ftmp[6] ^ kPrime[6];
	is_p |= ftmp[7] ^ kPrime[7];
	is_p |= ftmp[8] ^ kPrime[8];

	is_p--;
	is_p = ((s64) is_p) >> 63;

	is_zero |= is_p;
	return is_zero;
}
| 831 | |||
| 832 | static int felem_is_zero_int(const felem in) | ||
| 833 | { | ||
| 834 | return (int) (felem_is_zero(in) & ((limb)1)); | ||
| 835 | } | ||
| 836 | |||
/* felem_contract converts |in| to its unique, minimal representation.
 * On entry:
 *   in[i] < 2^59 + 2^14
 */
static void felem_contract(felem out, const felem in)
{
	limb is_p, is_greater, sign;
	static const limb two58 = ((limb)1) << 58;

	felem_assign(out, in);

	/* Propagate carries so every limb fits its nominal width. */
	out[0] += out[8] >> 57; out[8] &= bottom57bits;
	/* out[8] < 2^57 */
	out[1] += out[0] >> 58; out[0] &= bottom58bits;
	out[2] += out[1] >> 58; out[1] &= bottom58bits;
	out[3] += out[2] >> 58; out[2] &= bottom58bits;
	out[4] += out[3] >> 58; out[3] &= bottom58bits;
	out[5] += out[4] >> 58; out[4] &= bottom58bits;
	out[6] += out[5] >> 58; out[5] &= bottom58bits;
	out[7] += out[6] >> 58; out[6] &= bottom58bits;
	out[8] += out[7] >> 58; out[7] &= bottom58bits;
	/* out[8] < 2^57 + 4 */

	/* If the value is greater than 2^521-1 then we have to subtract
	 * 2^521-1 out. See the comments in felem_is_zero regarding why we
	 * don't test for other multiples of the prime. */

	/* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */

	is_p = out[0] ^ kPrime[0];
	is_p |= out[1] ^ kPrime[1];
	is_p |= out[2] ^ kPrime[2];
	is_p |= out[3] ^ kPrime[3];
	is_p |= out[4] ^ kPrime[4];
	is_p |= out[5] ^ kPrime[5];
	is_p |= out[6] ^ kPrime[6];
	is_p |= out[7] ^ kPrime[7];
	is_p |= out[8] ^ kPrime[8];

	/* Constant-time: fold "any bit set" down into the top bit, then
	 * smear it into a full mask with an arithmetic shift. */
	is_p--;
	is_p &= is_p << 32;
	is_p &= is_p << 16;
	is_p &= is_p << 8;
	is_p &= is_p << 4;
	is_p &= is_p << 2;
	is_p &= is_p << 1;
	is_p = ((s64) is_p) >> 63;
	is_p = ~is_p;

	/* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */

	out[0] &= is_p;
	out[1] &= is_p;
	out[2] &= is_p;
	out[3] &= is_p;
	out[4] &= is_p;
	out[5] &= is_p;
	out[6] &= is_p;
	out[7] &= is_p;
	out[8] &= is_p;

	/* In order to test that |out| >= 2^521-1 we need only test if out[8]
	 * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
	is_greater = out[8] >> 57;
	is_greater |= is_greater << 32;
	is_greater |= is_greater << 16;
	is_greater |= is_greater << 8;
	is_greater |= is_greater << 4;
	is_greater |= is_greater << 2;
	is_greater |= is_greater << 1;
	is_greater = ((s64) is_greater) >> 63;

	out[0] -= kPrime[0] & is_greater;
	out[1] -= kPrime[1] & is_greater;
	out[2] -= kPrime[2] & is_greater;
	out[3] -= kPrime[3] & is_greater;
	out[4] -= kPrime[4] & is_greater;
	out[5] -= kPrime[5] & is_greater;
	out[6] -= kPrime[6] & is_greater;
	out[7] -= kPrime[7] & is_greater;
	out[8] -= kPrime[8] & is_greater;

	/* Eliminate negative coefficients: if a limb went negative (top bit
	 * set), borrow 2^58 into it from the next limb up. */
	sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
	sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
	sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
	sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
	sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
	/* NOTE(review): out[0] below breaks the out[i] pattern and looks like
	 * a typo for out[5]; the second pass over limbs 5-7 further down
	 * appears to compensate, but this should be confirmed against
	 * upstream before changing anything. */
	sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
	sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
	sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
	sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
	sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
	sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
}
| 932 | |||
| 933 | /* Group operations | ||
| 934 | * ---------------- | ||
| 935 | * | ||
| 936 | * Building on top of the field operations we have the operations on the | ||
| 937 | * elliptic curve group itself. Points on the curve are represented in Jacobian | ||
| 938 | * coordinates */ | ||
| 939 | |||
/* point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
 * while x_out == y_in is not (maybe this works, but it's not tested). */
static void
point_double(felem x_out, felem y_out, felem z_out,
	     const felem x_in, const felem y_in, const felem z_in)
{
	largefelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;

	felem_assign(ftmp, x_in);
	felem_assign(ftmp2, x_in);

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);	/* delta[i] < 2^59 + 2^14 */

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);	/* gamma[i] < 2^59 + 2^14 */

	/* beta = x*gamma */
	felem_mul(tmp, x_in, gamma);
	felem_reduce(beta, tmp);	/* beta[i] < 2^59 + 2^14 */

	/* alpha = 3*(x-delta)*(x+delta) */
	felem_diff64(ftmp, delta);
	/* ftmp[i] < 2^61 */
	felem_sum64(ftmp2, delta);
	/* ftmp2[i] < 2^60 + 2^15 */
	felem_scalar64(ftmp2, 3);
	/* ftmp2[i] < 3*2^60 + 3*2^15 */
	felem_mul(tmp, ftmp, ftmp2);
	/* tmp[i] < 17(3*2^121 + 3*2^76)
	 *        = 61*2^121 + 61*2^76
	 *        < 64*2^121 + 64*2^76
	 *        = 2^127 + 2^82
	 *        < 2^128 */
	felem_reduce(alpha, tmp);

	/* x' = alpha^2 - 8*beta */
	felem_square(tmp, alpha);
	/* tmp[i] < 17*2^120
	 *        < 2^125 */
	felem_assign(ftmp, beta);
	felem_scalar64(ftmp, 8);
	/* ftmp[i] < 2^62 + 2^17 */
	felem_diff_128_64(tmp, ftmp);
	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
	felem_reduce(x_out, tmp);

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum64(delta, gamma);
	/* delta[i] < 2^60 + 2^15 */
	felem_assign(ftmp, y_in);
	felem_sum64(ftmp, z_in);
	/* ftmp[i] < 2^60 + 2^15 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17(2^122)
	 *        < 2^127 */
	/* delta already holds gamma + delta, so one subtraction covers both. */
	felem_diff_128_64(tmp, delta);
	/* tmp[i] < 2^127 + 2^63 */
	felem_reduce(z_out, tmp);

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar64(beta, 4);
	/* beta[i] < 2^61 + 2^16 */
	felem_diff64(beta, x_out);
	/* beta[i] < 2^61 + 2^60 + 2^16 */
	felem_mul(tmp, alpha, beta);
	/* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
	 *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
	 *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	 *        < 2^128 */
	felem_square(tmp2, gamma);
	/* tmp2[i] < 17*(2^59 + 2^14)^2
	 *         = 17*(2^118 + 2^74 + 2^28) */
	felem_scalar128(tmp2, 8);
	/* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
	 *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
	 *         < 2^126 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	 *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
	 *          2^74 + 2^69 + 2^34 + 2^30
	 *        < 2^128 */
	felem_reduce(y_out, tmp);
}
| 1032 | |||
| 1033 | /* copy_conditional copies in to out iff mask is all ones. */ | ||
| 1034 | static void | ||
| 1035 | copy_conditional(felem out, const felem in, limb mask) | ||
| 1036 | { | ||
| 1037 | unsigned i; | ||
| 1038 | for (i = 0; i < NLIMBS; ++i) | ||
| 1039 | { | ||
| 1040 | const limb tmp = mask & (in[i] ^ out[i]); | ||
| 1041 | out[i] ^= tmp; | ||
| 1042 | } | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | /* point_add calculates (x1, y1, z1) + (x2, y2, z2) | ||
| 1046 | * | ||
| 1047 | * The method is taken from | ||
| 1048 | * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, | ||
| 1049 | * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). | ||
| 1050 | * | ||
| 1051 | * This function includes a branch for checking whether the two input points | ||
| 1052 | * are equal (while not equal to the point at infinity). This case never | ||
| 1053 | * happens during single point multiplication, so there is no timing leak for | ||
| 1054 | * ECDH or ECDSA signing. */ | ||
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const felem x2, const felem y2, const felem z2)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	largefelem tmp, tmp2;
	limb x_equal, y_equal, z1_is_zero, z2_is_zero;

	/* The point at infinity is encoded with z == 0; remember whether
	 * either input is infinity for the conditional copies at the end. */
	z1_is_zero = felem_is_zero(z1);
	z2_is_zero = felem_is_zero(z2);

	/* ftmp = z1z1 = z1**2 */
	felem_square(tmp, z1);
	felem_reduce(ftmp, tmp);

	if (!mixed)
		{
		/* ftmp2 = z2z2 = z2**2 */
		felem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);

		/* u1 = ftmp3 = x1*z2z2 */
		felem_mul(tmp, x1, ftmp2);
		felem_reduce(ftmp3, tmp);

		/* ftmp5 = z1 + z2 */
		felem_assign(ftmp5, z1);
		felem_sum64(ftmp5, z2);
		/* ftmp5[i] < 2^61 */

		/* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
		felem_square(tmp, ftmp5);
		/* tmp[i] < 17*2^122 */
		felem_diff_128_64(tmp, ftmp);
		/* tmp[i] < 17*2^122 + 2^63 */
		felem_diff_128_64(tmp, ftmp2);
		/* tmp[i] < 17*2^122 + 2^64 */
		felem_reduce(ftmp5, tmp);

		/* ftmp2 = z2 * z2z2 */
		felem_mul(tmp, ftmp2, z2);
		felem_reduce(ftmp2, tmp);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_mul(tmp, y1, ftmp2);
		felem_reduce(ftmp6, tmp);
		}
	else
		{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* u1 = ftmp3 = x1*z2z2 */
		felem_assign(ftmp3, x1);

		/* ftmp5 = 2*z1z2 */
		felem_scalar(ftmp5, z1, 2);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_assign(ftmp6, y1);
		}

	/* u2 = x2*z1z1 */
	felem_mul(tmp, x2, ftmp);
	/* tmp[i] < 17*2^120 */

	/* h = ftmp4 = u2 - u1 */
	felem_diff_128_64(tmp, ftmp3);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp4, tmp);

	x_equal = felem_is_zero(ftmp4);

	/* z_out = ftmp5 * h */
	felem_mul(tmp, ftmp5, ftmp4);
	felem_reduce(z_out, tmp);

	/* ftmp = z1 * z1z1 */
	felem_mul(tmp, ftmp, z1);
	felem_reduce(ftmp, tmp);

	/* s2 = tmp = y2 * z1**3 */
	felem_mul(tmp, y2, ftmp);
	/* tmp[i] < 17*2^120 */

	/* r = ftmp5 = (s2 - s1)*2 */
	felem_diff_128_64(tmp, ftmp6);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp5, tmp);
	y_equal = felem_is_zero(ftmp5);
	felem_scalar64(ftmp5, 2);
	/* ftmp5[i] < 2^61 */

	/* Doubling case: the addition formula below degenerates when the
	 * two (finite) inputs are equal, so branch to point_double.  As
	 * noted in the header comment, this branch is never taken during
	 * single point multiplication, so it does not leak secret data
	 * there. */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
		}

	/* I = ftmp = (2h)**2 */
	felem_assign(ftmp, ftmp4);
	felem_scalar64(ftmp, 2);
	/* ftmp[i] < 2^61 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17*2^122 */
	felem_reduce(ftmp, tmp);

	/* J = ftmp2 = h * I */
	felem_mul(tmp, ftmp4, ftmp);
	felem_reduce(ftmp2, tmp);

	/* V = ftmp4 = U1 * I */
	felem_mul(tmp, ftmp3, ftmp);
	felem_reduce(ftmp4, tmp);

	/* x_out = r**2 - J - 2V */
	felem_square(tmp, ftmp5);
	/* tmp[i] < 17*2^122 */
	felem_diff_128_64(tmp, ftmp2);
	/* tmp[i] < 17*2^122 + 2^63 */
	felem_assign(ftmp3, ftmp4);
	felem_scalar64(ftmp4, 2);
	/* ftmp4[i] < 2^61 */
	felem_diff_128_64(tmp, ftmp4);
	/* tmp[i] < 17*2^122 + 2^64 */
	felem_reduce(x_out, tmp);

	/* y_out = r(V-x_out) - 2 * s1 * J */
	felem_diff64(ftmp3, x_out);
	/* ftmp3[i] < 2^60 + 2^60
	 *          = 2^61 */
	felem_mul(tmp, ftmp5, ftmp3);
	/* tmp[i] < 17*2^122 */
	felem_mul(tmp2, ftmp6, ftmp2);
	/* tmp2[i] < 17*2^120 */
	felem_scalar128(tmp2, 2);
	/* tmp2[i] < 17*2^121 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17*2^122
	 *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
	 *        < 2^127 */
	felem_reduce(y_out, tmp);

	/* If either input was the point at infinity, the formulas above do
	 * not apply; constant-time copy the *other* input as the result. */
	copy_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
	}
| 1207 | |||
| 1208 | /* Base point pre computation | ||
| 1209 | * -------------------------- | ||
| 1210 | * | ||
| 1211 | * Two different sorts of precomputed tables are used in the following code. | ||
 * Each contains various points on the curve, where each point is three field
| 1213 | * elements (x, y, z). | ||
| 1214 | * | ||
| 1215 | * For the base point table, z is usually 1 (0 for the point at infinity). | ||
| 1216 | * This table has 16 elements: | ||
| 1217 | * index | bits | point | ||
| 1218 | * ------+---------+------------------------------ | ||
| 1219 | * 0 | 0 0 0 0 | 0G | ||
| 1220 | * 1 | 0 0 0 1 | 1G | ||
| 1221 | * 2 | 0 0 1 0 | 2^130G | ||
| 1222 | * 3 | 0 0 1 1 | (2^130 + 1)G | ||
| 1223 | * 4 | 0 1 0 0 | 2^260G | ||
| 1224 | * 5 | 0 1 0 1 | (2^260 + 1)G | ||
| 1225 | * 6 | 0 1 1 0 | (2^260 + 2^130)G | ||
| 1226 | * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G | ||
| 1227 | * 8 | 1 0 0 0 | 2^390G | ||
| 1228 | * 9 | 1 0 0 1 | (2^390 + 1)G | ||
| 1229 | * 10 | 1 0 1 0 | (2^390 + 2^130)G | ||
| 1230 | * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G | ||
| 1231 | * 12 | 1 1 0 0 | (2^390 + 2^260)G | ||
| 1232 | * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G | ||
| 1233 | * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G | ||
| 1234 | * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G | ||
| 1235 | * | ||
| 1236 | * The reason for this is so that we can clock bits into four different | ||
| 1237 | * locations when doing simple scalar multiplies against the base point. | ||
| 1238 | * | ||
| 1239 | * Tables for other points have table[i] = iG for i in 0 .. 16. */ | ||
| 1240 | |||
/* gmul is the table of precomputed base points */
/* Indices follow the 4-bit pattern documented above: bit k of the index
 * selects a 2^(130*k) multiple of the generator G.  Affine entries have
 * z = {1, 0, ...}; index 0 is the point at infinity (z all-zero). */
static const felem gmul[16][3] =
	/* 0 : point at infinity */
	{{{0, 0, 0, 0, 0, 0, 0, 0, 0},
	  {0, 0, 0, 0, 0, 0, 0, 0, 0},
	  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 1 */
	 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
	   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
	   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
	  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
	   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
	   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 2 */
	 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
	   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
	   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
	  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
	   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
	   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 3 */
	 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
	   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
	   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
	  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
	   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
	   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 4 */
	 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
	   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
	   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
	  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
	   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
	   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 5 */
	 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
	   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
	   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
	  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
	   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
	   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 6 */
	 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
	   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
	   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
	  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
	   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
	   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 7 */
	 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
	   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
	   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
	  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
	   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
	   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 8 */
	 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
	   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
	   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
	  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
	   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
	   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 9 */
	 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
	   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
	   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
	  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
	   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
	   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 10 */
	 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
	   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
	   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
	  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
	   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
	   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 11 */
	 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
	   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
	   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
	  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
	   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
	   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 12 */
	 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
	   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
	   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
	  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
	   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
	   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 13 */
	 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
	   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
	   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
	  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
	   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
	   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 14 */
	 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
	   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
	   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
	  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
	   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
	   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 15 */
	 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
	   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
	   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
	  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
	   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
	   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
| 1351 | |||
| 1352 | /* select_point selects the |idx|th point from a precomputation table and | ||
| 1353 | * copies it to out. */ | ||
| 1354 | static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3], | ||
| 1355 | felem out[3]) | ||
| 1356 | { | ||
| 1357 | unsigned i, j; | ||
| 1358 | limb *outlimbs = &out[0][0]; | ||
| 1359 | memset(outlimbs, 0, 3 * sizeof(felem)); | ||
| 1360 | |||
| 1361 | for (i = 0; i < size; i++) | ||
| 1362 | { | ||
| 1363 | const limb *inlimbs = &pre_comp[i][0][0]; | ||
| 1364 | limb mask = i ^ idx; | ||
| 1365 | mask |= mask >> 4; | ||
| 1366 | mask |= mask >> 2; | ||
| 1367 | mask |= mask >> 1; | ||
| 1368 | mask &= 1; | ||
| 1369 | mask--; | ||
| 1370 | for (j = 0; j < NLIMBS * 3; j++) | ||
| 1371 | outlimbs[j] |= inlimbs[j] & mask; | ||
| 1372 | } | ||
| 1373 | } | ||
| 1374 | |||
| 1375 | /* get_bit returns the |i|th bit in |in| */ | ||
| 1376 | static char get_bit(const felem_bytearray in, int i) | ||
| 1377 | { | ||
| 1378 | if (i < 0) | ||
| 1379 | return 0; | ||
| 1380 | return (in[i >> 3] >> (i & 7)) & 1; | ||
| 1381 | } | ||
| 1382 | |||
/* Interleaved point multiplication using precomputed point multiples:
 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
	{
	int i, skip;
	unsigned num, gen_mul = (g_scalar != NULL);
	felem nq[3], tmp[4];	/* tmp[3] holds the negated point for signed digits */
	limb bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (last quarter of rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1; /* save two point operations in the first round */
	for (i = (num_points ? 520 : 130); i >= 0; --i)
		{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator */
		if (gen_mul && (i <= 130))
			{
			/* assemble a 4-bit index from bits i, i+130, i+260,
			 * i+390 of g_scalar, matching the gmul table layout */
			bits = get_bit(g_scalar, i + 390) << 3;
			if (i < 130)
				{
				bits |= get_bit(g_scalar, i + 260) << 2;
				bits |= get_bit(g_scalar, i + 130) << 1;
				bits |= get_bit(g_scalar, i);
				}
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp, tmp);
			if (!skip)
				{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
				}
			else
				{
				/* first addition: nq is still infinity, so
				 * just take the selected point directly */
				memcpy(nq, tmp, 3 * sizeof(felem));
				skip = 0;
				}
			}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				{
				/* gather a 6-bit window (bits i-1 .. i+4) and
				 * recode it into a signed digit in 0..16 */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract, in constant time */
				select_point(digit, 17, pre_comp[num], tmp);
				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_conditional(tmp[1], tmp[3], (-(limb) sign));

				if (!skip)
					{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
					}
				else
					{
					memcpy(nq, tmp, 3 * sizeof(felem));
					skip = 0;
					}
				}
			}
		}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
	}
| 1474 | |||
| 1475 | |||
/* Precomputation for the group generator. */
typedef struct {
	/* 16 precomputed multiples of the generator, each stored as
	 * Jacobian (x, y, z) field elements */
	felem g_pre_comp[16][3];
	/* reference count, adjusted via CRYPTO_add */
	int references;
} NISTP521_PRE_COMP;
| 1481 | |||
/* EC_GFp_nistp521_method returns the method table for this 64-bit
 * constant-time NIST P-521 implementation.  Entries set to 0 are not
 * provided here (presumably handled generically by callers — confirm
 * against the EC_METHOD dispatch code). */
const EC_METHOD *EC_GFp_nistp521_method(void)
	{
	static const EC_METHOD ret = {
		EC_FLAGS_DEFAULT_OCT,
		NID_X9_62_prime_field,
		ec_GFp_nistp521_group_init,
		ec_GFp_simple_group_finish,
		ec_GFp_simple_group_clear_finish,
		ec_GFp_nist_group_copy,
		ec_GFp_nistp521_group_set_curve,
		ec_GFp_simple_group_get_curve,
		ec_GFp_simple_group_get_degree,
		ec_GFp_simple_group_check_discriminant,
		ec_GFp_simple_point_init,
		ec_GFp_simple_point_finish,
		ec_GFp_simple_point_clear_finish,
		ec_GFp_simple_point_copy,
		ec_GFp_simple_point_set_to_infinity,
		ec_GFp_simple_set_Jprojective_coordinates_GFp,
		ec_GFp_simple_get_Jprojective_coordinates_GFp,
		ec_GFp_simple_point_set_affine_coordinates,
		ec_GFp_nistp521_point_get_affine_coordinates,
		0 /* point_set_compressed_coordinates */,
		0 /* point2oct */,
		0 /* oct2point */,
		ec_GFp_simple_add,
		ec_GFp_simple_dbl,
		ec_GFp_simple_invert,
		ec_GFp_simple_is_at_infinity,
		ec_GFp_simple_is_on_curve,
		ec_GFp_simple_cmp,
		ec_GFp_simple_make_affine,
		ec_GFp_simple_points_make_affine,
		ec_GFp_nistp521_points_mul,
		ec_GFp_nistp521_precompute_mult,
		ec_GFp_nistp521_have_precompute_mult,
		ec_GFp_nist_field_mul,
		ec_GFp_nist_field_sqr,
		0 /* field_div */,
		0 /* field_encode */,
		0 /* field_decode */,
		0 /* field_set_to_one */ };

	return &ret;
	}
| 1527 | |||
| 1528 | |||
| 1529 | /******************************************************************************/ | ||
| 1530 | /* FUNCTIONS TO MANAGE PRECOMPUTATION | ||
| 1531 | */ | ||
| 1532 | |||
| 1533 | static NISTP521_PRE_COMP *nistp521_pre_comp_new() | ||
| 1534 | { | ||
| 1535 | NISTP521_PRE_COMP *ret = NULL; | ||
| 1536 | ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP)); | ||
| 1537 | if (!ret) | ||
| 1538 | { | ||
| 1539 | ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); | ||
| 1540 | return ret; | ||
| 1541 | } | ||
| 1542 | memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); | ||
| 1543 | ret->references = 1; | ||
| 1544 | return ret; | ||
| 1545 | } | ||
| 1546 | |||
| 1547 | static void *nistp521_pre_comp_dup(void *src_) | ||
| 1548 | { | ||
| 1549 | NISTP521_PRE_COMP *src = src_; | ||
| 1550 | |||
| 1551 | /* no need to actually copy, these objects never change! */ | ||
| 1552 | CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1553 | |||
| 1554 | return src_; | ||
| 1555 | } | ||
| 1556 | |||
| 1557 | static void nistp521_pre_comp_free(void *pre_) | ||
| 1558 | { | ||
| 1559 | int i; | ||
| 1560 | NISTP521_PRE_COMP *pre = pre_; | ||
| 1561 | |||
| 1562 | if (!pre) | ||
| 1563 | return; | ||
| 1564 | |||
| 1565 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1566 | if (i > 0) | ||
| 1567 | return; | ||
| 1568 | |||
| 1569 | OPENSSL_free(pre); | ||
| 1570 | } | ||
| 1571 | |||
| 1572 | static void nistp521_pre_comp_clear_free(void *pre_) | ||
| 1573 | { | ||
| 1574 | int i; | ||
| 1575 | NISTP521_PRE_COMP *pre = pre_; | ||
| 1576 | |||
| 1577 | if (!pre) | ||
| 1578 | return; | ||
| 1579 | |||
| 1580 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
| 1581 | if (i > 0) | ||
| 1582 | return; | ||
| 1583 | |||
| 1584 | OPENSSL_cleanse(pre, sizeof(*pre)); | ||
| 1585 | OPENSSL_free(pre); | ||
| 1586 | } | ||
| 1587 | |||
| 1588 | /******************************************************************************/ | ||
| 1589 | /* OPENSSL EC_METHOD FUNCTIONS | ||
| 1590 | */ | ||
| 1591 | |||
| 1592 | int ec_GFp_nistp521_group_init(EC_GROUP *group) | ||
| 1593 | { | ||
| 1594 | int ret; | ||
| 1595 | ret = ec_GFp_simple_group_init(group); | ||
| 1596 | group->a_is_minus3 = 1; | ||
| 1597 | return ret; | ||
| 1598 | } | ||
| 1599 | |||
/* ec_GFp_nistp521_group_set_curve validates that (p, a, b) are exactly the
 * NIST P-521 parameters (this implementation is hard-wired to that curve),
 * installs the fast NIST reduction, and defers to the generic setter.
 * Returns 1 on success, 0 on error (wrong parameters or BN failure). */
int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
	{
	int ret = 0;
	BN_CTX *new_ctx = NULL;
	BIGNUM *curve_p, *curve_a, *curve_b;

	/* create a scratch BN_CTX if the caller did not supply one */
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
		((curve_a = BN_CTX_get(ctx)) == NULL) ||
		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
	/* load the canonical P-521 parameters for comparison */
	BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
	BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
	BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
		(BN_cmp(curve_b, b)))
		{
		ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
			EC_R_WRONG_CURVE_PARAMETERS);
		goto err;
		}
	group->field_mod_func = BN_nist_mod_521;
	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
err:
	BN_CTX_end(ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return ret;
	}
| 1631 | |||
/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 * (X', Y') = (X/Z^2, Y/Z^3) */
/* Either of |x| and |y| may be NULL, in which case that coordinate is not
 * reported.  Returns 1 on success, 0 on error (point at infinity, or a
 * BN conversion failure). */
int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
	{
	felem z1, z2, x_in, y_in, x_out, y_out;
	largefelem tmp;

	/* infinity has no affine representation */
	if (EC_POINT_is_at_infinity(group, point))
		{
		ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
			EC_R_POINT_AT_INFINITY);
		return 0;
		}
	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
		(!BN_to_felem(z1, &point->Z))) return 0;
	/* z2 = 1/Z; then z1 = 1/Z^2 and x = X/Z^2 */
	felem_inv(z2, z1);
	felem_square(tmp, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
	felem_contract(x_out, x_in);
	if (x != NULL)
		{
		if (!felem_to_BN(x, x_out))
			{
			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
			return 0;
			}
		}
	/* z1 = 1/Z^3 and y = Y/Z^3 */
	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
	felem_contract(y_out, y_in);
	if (y != NULL)
		{
		if (!felem_to_BN(y, y_out))
			{
			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
			return 0;
			}
		}
	return 1;
	}
| 1673 | |||
/* make_points_affine converts |num| Jacobian points in |points| to affine
 * form in place, using |tmp_felems| (num+1 scratch elements) and the shared
 * batch-inversion helper. */
static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
	{
	/* Runs in constant time, unless an input is the point at infinity
	 * (which normally shouldn't happen). */
	/* NOTE(review): the function-pointer casts below are technically
	 * undefined behaviour in ISO C when the pointee types differ; this
	 * follows the established internal API usage — confirm against
	 * ec_GFp_nistp_points_make_affine_internal's contract. */
	ec_GFp_nistp_points_make_affine_internal(
		num,
		points,
		sizeof(felem),
		tmp_felems,
		(void (*)(void *)) felem_one,
		(int (*)(const void *)) felem_is_zero_int,
		(void (*)(void *, const void *)) felem_assign,
		(void (*)(void *, const void *)) felem_square_reduce,
		(void (*)(void *, const void *, const void *)) felem_mul_reduce,
		(void (*)(void *, const void *)) felem_inv,
		(void (*)(void *, const void *)) felem_contract);
	}
| 1691 | |||
| 1692 | /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values | ||
| 1693 | * Result is stored in r (r can equal one of the inputs). */ | ||
| 1694 | int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, | ||
| 1695 | const BIGNUM *scalar, size_t num, const EC_POINT *points[], | ||
| 1696 | const BIGNUM *scalars[], BN_CTX *ctx) | ||
| 1697 | { | ||
| 1698 | int ret = 0; | ||
| 1699 | int j; | ||
| 1700 | int mixed = 0; | ||
| 1701 | BN_CTX *new_ctx = NULL; | ||
| 1702 | BIGNUM *x, *y, *z, *tmp_scalar; | ||
| 1703 | felem_bytearray g_secret; | ||
| 1704 | felem_bytearray *secrets = NULL; | ||
| 1705 | felem (*pre_comp)[17][3] = NULL; | ||
| 1706 | felem *tmp_felems = NULL; | ||
| 1707 | felem_bytearray tmp; | ||
| 1708 | unsigned i, num_bytes; | ||
| 1709 | int have_pre_comp = 0; | ||
| 1710 | size_t num_points = num; | ||
| 1711 | felem x_in, y_in, z_in, x_out, y_out, z_out; | ||
| 1712 | NISTP521_PRE_COMP *pre = NULL; | ||
| 1713 | felem (*g_pre_comp)[3] = NULL; | ||
| 1714 | EC_POINT *generator = NULL; | ||
| 1715 | const EC_POINT *p = NULL; | ||
| 1716 | const BIGNUM *p_scalar = NULL; | ||
| 1717 | |||
| 1718 | if (ctx == NULL) | ||
| 1719 | if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; | ||
| 1720 | BN_CTX_start(ctx); | ||
| 1721 | if (((x = BN_CTX_get(ctx)) == NULL) || | ||
| 1722 | ((y = BN_CTX_get(ctx)) == NULL) || | ||
| 1723 | ((z = BN_CTX_get(ctx)) == NULL) || | ||
| 1724 | ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) | ||
| 1725 | goto err; | ||
| 1726 | |||
| 1727 | if (scalar != NULL) | ||
| 1728 | { | ||
| 1729 | pre = EC_EX_DATA_get_data(group->extra_data, | ||
| 1730 | nistp521_pre_comp_dup, nistp521_pre_comp_free, | ||
| 1731 | nistp521_pre_comp_clear_free); | ||
| 1732 | if (pre) | ||
| 1733 | /* we have precomputation, try to use it */ | ||
| 1734 | g_pre_comp = &pre->g_pre_comp[0]; | ||
| 1735 | else | ||
| 1736 | /* try to use the standard precomputation */ | ||
| 1737 | g_pre_comp = (felem (*)[3]) gmul; | ||
| 1738 | generator = EC_POINT_new(group); | ||
| 1739 | if (generator == NULL) | ||
| 1740 | goto err; | ||
| 1741 | /* get the generator from precomputation */ | ||
| 1742 | if (!felem_to_BN(x, g_pre_comp[1][0]) || | ||
| 1743 | !felem_to_BN(y, g_pre_comp[1][1]) || | ||
| 1744 | !felem_to_BN(z, g_pre_comp[1][2])) | ||
| 1745 | { | ||
| 1746 | ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1747 | goto err; | ||
| 1748 | } | ||
| 1749 | if (!EC_POINT_set_Jprojective_coordinates_GFp(group, | ||
| 1750 | generator, x, y, z, ctx)) | ||
| 1751 | goto err; | ||
| 1752 | if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) | ||
| 1753 | /* precomputation matches generator */ | ||
| 1754 | have_pre_comp = 1; | ||
| 1755 | else | ||
| 1756 | /* we don't have valid precomputation: | ||
| 1757 | * treat the generator as a random point */ | ||
| 1758 | num_points++; | ||
| 1759 | } | ||
| 1760 | |||
| 1761 | if (num_points > 0) | ||
| 1762 | { | ||
| 1763 | if (num_points >= 2) | ||
| 1764 | { | ||
| 1765 | /* unless we precompute multiples for just one point, | ||
| 1766 | * converting those into affine form is time well spent */ | ||
| 1767 | mixed = 1; | ||
| 1768 | } | ||
| 1769 | secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); | ||
| 1770 | pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem)); | ||
| 1771 | if (mixed) | ||
| 1772 | tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem)); | ||
| 1773 | if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL))) | ||
| 1774 | { | ||
| 1775 | ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE); | ||
| 1776 | goto err; | ||
| 1777 | } | ||
| 1778 | |||
| 1779 | /* we treat NULL scalars as 0, and NULL points as points at infinity, | ||
| 1780 | * i.e., they contribute nothing to the linear combination */ | ||
| 1781 | memset(secrets, 0, num_points * sizeof(felem_bytearray)); | ||
| 1782 | memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem)); | ||
| 1783 | for (i = 0; i < num_points; ++i) | ||
| 1784 | { | ||
| 1785 | if (i == num) | ||
| 1786 | /* we didn't have a valid precomputation, so we pick | ||
| 1787 | * the generator */ | ||
| 1788 | { | ||
| 1789 | p = EC_GROUP_get0_generator(group); | ||
| 1790 | p_scalar = scalar; | ||
| 1791 | } | ||
| 1792 | else | ||
| 1793 | /* the i^th point */ | ||
| 1794 | { | ||
| 1795 | p = points[i]; | ||
| 1796 | p_scalar = scalars[i]; | ||
| 1797 | } | ||
| 1798 | if ((p_scalar != NULL) && (p != NULL)) | ||
| 1799 | { | ||
| 1800 | /* reduce scalar to 0 <= scalar < 2^521 */ | ||
| 1801 | if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar))) | ||
| 1802 | { | ||
| 1803 | /* this is an unusual input, and we don't guarantee | ||
| 1804 | * constant-timeness */ | ||
| 1805 | if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) | ||
| 1806 | { | ||
| 1807 | ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1808 | goto err; | ||
| 1809 | } | ||
| 1810 | num_bytes = BN_bn2bin(tmp_scalar, tmp); | ||
| 1811 | } | ||
| 1812 | else | ||
| 1813 | num_bytes = BN_bn2bin(p_scalar, tmp); | ||
| 1814 | flip_endian(secrets[i], tmp, num_bytes); | ||
| 1815 | /* precompute multiples */ | ||
| 1816 | if ((!BN_to_felem(x_out, &p->X)) || | ||
| 1817 | (!BN_to_felem(y_out, &p->Y)) || | ||
| 1818 | (!BN_to_felem(z_out, &p->Z))) goto err; | ||
| 1819 | memcpy(pre_comp[i][1][0], x_out, sizeof(felem)); | ||
| 1820 | memcpy(pre_comp[i][1][1], y_out, sizeof(felem)); | ||
| 1821 | memcpy(pre_comp[i][1][2], z_out, sizeof(felem)); | ||
| 1822 | for (j = 2; j <= 16; ++j) | ||
| 1823 | { | ||
| 1824 | if (j & 1) | ||
| 1825 | { | ||
| 1826 | point_add( | ||
| 1827 | pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], | ||
| 1828 | pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], | ||
| 1829 | 0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]); | ||
| 1830 | } | ||
| 1831 | else | ||
| 1832 | { | ||
| 1833 | point_double( | ||
| 1834 | pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], | ||
| 1835 | pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]); | ||
| 1836 | } | ||
| 1837 | } | ||
| 1838 | } | ||
| 1839 | } | ||
| 1840 | if (mixed) | ||
| 1841 | make_points_affine(num_points * 17, pre_comp[0], tmp_felems); | ||
| 1842 | } | ||
| 1843 | |||
| 1844 | /* the scalar for the generator */ | ||
| 1845 | if ((scalar != NULL) && (have_pre_comp)) | ||
| 1846 | { | ||
| 1847 | memset(g_secret, 0, sizeof(g_secret)); | ||
| 1848 | /* reduce scalar to 0 <= scalar < 2^521 */ | ||
| 1849 | if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) | ||
| 1850 | { | ||
| 1851 | /* this is an unusual input, and we don't guarantee | ||
| 1852 | * constant-timeness */ | ||
| 1853 | if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) | ||
| 1854 | { | ||
| 1855 | ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1856 | goto err; | ||
| 1857 | } | ||
| 1858 | num_bytes = BN_bn2bin(tmp_scalar, tmp); | ||
| 1859 | } | ||
| 1860 | else | ||
| 1861 | num_bytes = BN_bn2bin(scalar, tmp); | ||
| 1862 | flip_endian(g_secret, tmp, num_bytes); | ||
| 1863 | /* do the multiplication with generator precomputation*/ | ||
| 1864 | batch_mul(x_out, y_out, z_out, | ||
| 1865 | (const felem_bytearray (*)) secrets, num_points, | ||
| 1866 | g_secret, | ||
| 1867 | mixed, (const felem (*)[17][3]) pre_comp, | ||
| 1868 | (const felem (*)[3]) g_pre_comp); | ||
| 1869 | } | ||
| 1870 | else | ||
| 1871 | /* do the multiplication without generator precomputation */ | ||
| 1872 | batch_mul(x_out, y_out, z_out, | ||
| 1873 | (const felem_bytearray (*)) secrets, num_points, | ||
| 1874 | NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL); | ||
| 1875 | /* reduce the output to its unique minimal representation */ | ||
| 1876 | felem_contract(x_in, x_out); | ||
| 1877 | felem_contract(y_in, y_out); | ||
| 1878 | felem_contract(z_in, z_out); | ||
| 1879 | if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || | ||
| 1880 | (!felem_to_BN(z, z_in))) | ||
| 1881 | { | ||
| 1882 | ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); | ||
| 1883 | goto err; | ||
| 1884 | } | ||
| 1885 | ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); | ||
| 1886 | |||
| 1887 | err: | ||
| 1888 | BN_CTX_end(ctx); | ||
| 1889 | if (generator != NULL) | ||
| 1890 | EC_POINT_free(generator); | ||
| 1891 | if (new_ctx != NULL) | ||
| 1892 | BN_CTX_free(new_ctx); | ||
| 1893 | if (secrets != NULL) | ||
| 1894 | OPENSSL_free(secrets); | ||
| 1895 | if (pre_comp != NULL) | ||
| 1896 | OPENSSL_free(pre_comp); | ||
| 1897 | if (tmp_felems != NULL) | ||
| 1898 | OPENSSL_free(tmp_felems); | ||
| 1899 | return ret; | ||
| 1900 | } | ||
| 1901 | |||
/* Precompute multiples of the group generator for later use by
 * ec_GFp_nistp521_points_mul().  For a non-standard generator, the 16
 * combinations  e0*G + e1*2^130*G + e2*2^260*G + e3*2^390*G  (e_i in {0,1})
 * are computed, converted to affine form, and attached to the group's
 * extra data.  For the standard NIST P-521 generator the compiled-in table
 * 'gmul' is available instead.
 * Returns 1 on success, 0 on error. */
int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
	{
	int ret = 0;
	NISTP521_PRE_COMP *pre = NULL;
	int i, j;
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y;
	EC_POINT *generator = NULL;
	felem tmp_felems[16];	/* scratch for make_points_affine(15, ...): needs num+1 elements */

	/* throw away old precomputation */
	EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
		nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL))
		goto err;
	/* get the generator */
	if (group->generator == NULL) goto err;
	generator = EC_POINT_new(group);
	if (generator == NULL)
		goto err;
	/* build a point holding the standard NIST P-521 generator coordinates,
	 * to compare against the group's actual generator below */
	BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
	BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
		goto err;
	if ((pre = nistp521_pre_comp_new()) == NULL)
		goto err;
	/* if the generator is the standard one, use built-in precomputation */
	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
		{
		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
		ret = 1;
		/* 'err' doubles as the success exit path here.
		 * NOTE(review): 'pre' is not stored in extra_data on this path and
		 * is freed below, so the memcpy appears redundant — presumably the
		 * multiplication code falls back to the static table directly;
		 * confirm against ec_GFp_nistp521_points_mul(). */
		goto err;
		}
	if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
		(!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
		(!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
		goto err;
	/* compute 2^130*G, 2^260*G, 2^390*G */
	/* i = 1, 2, 4 fills g_pre_comp[2], [4], [8]; each iteration applies
	 * 130 doublings (1 below + 129 in the inner loop) to the previous entry */
	for (i = 1; i <= 4; i <<= 1)
		{
		point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
			pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
			pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
		for (j = 0; j < 129; ++j)
			{
			point_double(pre->g_pre_comp[2*i][0],
				pre->g_pre_comp[2*i][1],
				pre->g_pre_comp[2*i][2],
				pre->g_pre_comp[2*i][0],
				pre->g_pre_comp[2*i][1],
				pre->g_pre_comp[2*i][2]);
			}
		}
	/* g_pre_comp[0] is the point at infinity */
	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
	/* the remaining multiples */
	/* 2^130*G + 2^260*G */
	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
		pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
		pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	/* 2^130*G + 2^390*G */
	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
		pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	/* 2^260*G + 2^390*G */
	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
		pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
		0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
		pre->g_pre_comp[4][2]);
	/* 2^130*G + 2^260*G + 2^390*G */
	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
		pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
		pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	for (i = 1; i < 8; ++i)
		{
		/* odd multiples: add G */
		point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
			pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
			pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
			0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
			pre->g_pre_comp[1][2]);
		}
	/* convert entries 1..15 to affine form in one batch (entry 0 is infinity) */
	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);

	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
		nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
		goto err;
	ret = 1;
	pre = NULL;	/* ownership transferred to extra_data; don't free below */
 err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (pre)
		nistp521_pre_comp_free(pre);
	return ret;
	}
| 2012 | |||
| 2013 | int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group) | ||
| 2014 | { | ||
| 2015 | if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup, | ||
| 2016 | nistp521_pre_comp_free, nistp521_pre_comp_clear_free) | ||
| 2017 | != NULL) | ||
| 2018 | return 1; | ||
| 2019 | else | ||
| 2020 | return 0; | ||
| 2021 | } | ||
| 2022 | |||
| 2023 | #else | ||
| 2024 | static void *dummy=&dummy; | ||
| 2025 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_nistputil.c b/src/lib/libcrypto/ec/ecp_nistputil.c new file mode 100644 index 0000000000..c8140c807f --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistputil.c | |||
| @@ -0,0 +1,197 @@ | |||
| 1 | /* crypto/ec/ecp_nistputil.c */ | ||
| 2 | /* | ||
| 3 | * Written by Bodo Moeller for the OpenSSL project. | ||
| 4 | */ | ||
| 5 | /* Copyright 2011 Google Inc. | ||
| 6 | * | ||
| 7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| 8 | * | ||
| 9 | * you may not use this file except in compliance with the License. | ||
| 10 | * You may obtain a copy of the License at | ||
| 11 | * | ||
| 12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
| 13 | * | ||
| 14 | * Unless required by applicable law or agreed to in writing, software | ||
| 15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
| 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| 17 | * See the License for the specific language governing permissions and | ||
| 18 | * limitations under the License. | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include <openssl/opensslconf.h> | ||
| 22 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
| 23 | |||
| 24 | /* | ||
| 25 | * Common utility functions for ecp_nistp224.c, ecp_nistp256.c, ecp_nistp521.c. | ||
| 26 | */ | ||
| 27 | |||
| 28 | #include <stddef.h> | ||
| 29 | #include "ec_lcl.h" | ||
| 30 | |||
| 31 | /* Convert an array of points into affine coordinates. | ||
| 32 | * (If the point at infinity is found (Z = 0), it remains unchanged.) | ||
| 33 | * This function is essentially an equivalent to EC_POINTs_make_affine(), but | ||
| 34 | * works with the internal representation of points as used by ecp_nistp###.c | ||
| 35 | * rather than with (BIGNUM-based) EC_POINT data structures. | ||
| 36 | * | ||
| 37 | * point_array is the input/output buffer ('num' points in projective form, | ||
| 38 | * i.e. three coordinates each), based on an internal representation of | ||
| 39 | * field elements of size 'felem_size'. | ||
| 40 | * | ||
| 41 | * tmp_felems needs to point to a temporary array of 'num'+1 field elements | ||
| 42 | * for storage of intermediate values. | ||
| 43 | */ | ||
void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
	size_t felem_size, void *tmp_felems,
	void (*felem_one)(void *out),
	int (*felem_is_zero)(const void *in),
	void (*felem_assign)(void *out, const void *in),
	void (*felem_square)(void *out, const void *in),
	void (*felem_mul)(void *out, const void *in1, const void *in2),
	void (*felem_inv)(void *out, const void *in),
	void (*felem_contract)(void *out, const void *in))
	{
	int i = 0;

	/* Byte-offset accessors into the caller's opaque arrays: tmp_felem(I) is
	 * the I-th scratch element; X/Y/Z(I) are the three consecutive
	 * coordinates of the I-th point. */
#define tmp_felem(I) (&((char *)tmp_felems)[(I) * felem_size])
#define X(I) (&((char *)point_array)[3*(I) * felem_size])
#define Y(I) (&((char *)point_array)[(3*(I) + 1) * felem_size])
#define Z(I) (&((char *)point_array)[(3*(I) + 2) * felem_size])

	/* Simultaneous-inversion trick: build running products of the Z values
	 * so that only a single (expensive) felem_inv() call is needed for all
	 * 'num' points. */
	if (!felem_is_zero(Z(0)))
		felem_assign(tmp_felem(0), Z(0));
	else
		felem_one(tmp_felem(0));
	for (i = 1; i < (int)num; i++)
		{
		if (!felem_is_zero(Z(i)))
			felem_mul(tmp_felem(i), tmp_felem(i-1), Z(i));
		else
			felem_assign(tmp_felem(i), tmp_felem(i-1));
		}
	/* Now each tmp_felem(i) is the product of Z(0) .. Z(i), skipping any zero-valued factors:
	 * if Z(i) = 0, we essentially pretend that Z(i) = 1 */

	/* invert the total product once, then peel off one factor per iteration
	 * of the backwards loop below */
	felem_inv(tmp_felem(num-1), tmp_felem(num-1));
	for (i = num - 1; i >= 0; i--)
		{
		if (i > 0)
			/* tmp_felem(i-1) is the product of Z(0) .. Z(i-1),
			 * tmp_felem(i) is the inverse of the product of Z(0) .. Z(i)
			 */
			felem_mul(tmp_felem(num), tmp_felem(i-1), tmp_felem(i)); /* 1/Z(i) */
		else
			felem_assign(tmp_felem(num), tmp_felem(0)); /* 1/Z(0) */

		if (!felem_is_zero(Z(i)))
			{
			if (i > 0)
				/* For next iteration, replace tmp_felem(i-1) by its inverse */
				felem_mul(tmp_felem(i-1), tmp_felem(i), Z(i));

			/* Convert point (X, Y, Z) into affine form (X/(Z^2), Y/(Z^3), 1) */
			felem_square(Z(i), tmp_felem(num)); /* 1/(Z^2) */
			felem_mul(X(i), X(i), Z(i)); /* X/(Z^2) */
			felem_mul(Z(i), Z(i), tmp_felem(num)); /* 1/(Z^3) */
			felem_mul(Y(i), Y(i), Z(i)); /* Y/(Z^3) */
			felem_contract(X(i), X(i));
			felem_contract(Y(i), Y(i));
			felem_one(Z(i));
			}
		else
			{
			/* point at infinity (Z = 0): leave it unchanged */
			if (i > 0)
				/* For next iteration, replace tmp_felem(i-1) by its inverse */
				felem_assign(tmp_felem(i-1), tmp_felem(i));
			}
		}
	}
| 109 | |||
| 110 | /* | ||
| 111 | * This function looks at 5+1 scalar bits (5 current, 1 adjacent less | ||
| 112 | * significant bit), and recodes them into a signed digit for use in fast point | ||
| 113 | * multiplication: the use of signed rather than unsigned digits means that | ||
| 114 | * fewer points need to be precomputed, given that point inversion is easy | ||
| 115 | * (a precomputed point dP makes -dP available as well). | ||
| 116 | * | ||
| 117 | * BACKGROUND: | ||
| 118 | * | ||
| 119 | * Signed digits for multiplication were introduced by Booth ("A signed binary | ||
| 120 | * multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, | ||
| 121 | * pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. | ||
| 122 | * Booth's original encoding did not generally improve the density of nonzero | ||
| 123 | * digits over the binary representation, and was merely meant to simplify the | ||
| 124 | * handling of signed factors given in two's complement; but it has since been | ||
| 125 | * shown to be the basis of various signed-digit representations that do have | ||
| 126 | * further advantages, including the wNAF, using the following general approach: | ||
| 127 | * | ||
| 128 | * (1) Given a binary representation | ||
| 129 | * | ||
| 130 | * b_k ... b_2 b_1 b_0, | ||
| 131 | * | ||
| 132 | * of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 | ||
| 133 | * by using bit-wise subtraction as follows: | ||
| 134 | * | ||
| 135 | * b_k b_(k-1) ... b_2 b_1 b_0 | ||
| 136 | * - b_k ... b_3 b_2 b_1 b_0 | ||
| 137 | * ------------------------------------- | ||
| 138 | * s_k b_(k-1) ... s_3 s_2 s_1 s_0 | ||
| 139 | * | ||
| 140 | * A left-shift followed by subtraction of the original value yields a new | ||
| 141 | * representation of the same value, using signed bits s_i = b_(i+1) - b_i. | ||
| 142 | * This representation from Booth's paper has since appeared in the | ||
| 143 | * literature under a variety of different names including "reversed binary | ||
| 144 | * form", "alternating greedy expansion", "mutual opposite form", and | ||
| 145 | * "sign-alternating {+-1}-representation". | ||
| 146 | * | ||
| 147 | * An interesting property is that among the nonzero bits, values 1 and -1 | ||
| 148 | * strictly alternate. | ||
| 149 | * | ||
| 150 | * (2) Various window schemes can be applied to the Booth representation of | ||
| 151 | * integers: for example, right-to-left sliding windows yield the wNAF | ||
| 152 | * (a signed-digit encoding independently discovered by various researchers | ||
| 153 | * in the 1990s), and left-to-right sliding windows yield a left-to-right | ||
| 154 | * equivalent of the wNAF (independently discovered by various researchers | ||
| 155 | * around 2004). | ||
| 156 | * | ||
| 157 | * To prevent leaking information through side channels in point multiplication, | ||
| 158 | * we need to recode the given integer into a regular pattern: sliding windows | ||
| 159 | * as in wNAFs won't do, we need their fixed-window equivalent -- which is a few | ||
| 160 | * decades older: we'll be using the so-called "modified Booth encoding" due to | ||
| 161 | * MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 | ||
| 162 | * (1961), pp. 67-91), in a radix-2^5 setting. That is, we always combine five | ||
| 163 | * signed bits into a signed digit: | ||
| 164 | * | ||
| 165 | * s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) | ||
| 166 | * | ||
| 167 | * The sign-alternating property implies that the resulting digit values are | ||
| 168 | * integers from -16 to 16. | ||
| 169 | * | ||
| 170 | * Of course, we don't actually need to compute the signed digits s_i as an | ||
| 171 | * intermediate step (that's just a nice way to see how this scheme relates | ||
| 172 | * to the wNAF): a direct computation obtains the recoded digit from the | ||
| 173 | * six bits b_(5j + 4) ... b_(5j - 1). | ||
| 174 | * | ||
| 175 | * This function takes those six bits as an integer (0 .. 63), writing the | ||
| 176 | * recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute | ||
| 177 | * value, in the range 0 .. 16). Note that this integer essentially provides the | ||
| 178 | * input bits "shifted to the left" by one position: for example, the input to | ||
| 179 | * compute the least significant recoded digit, given that there's no bit b_-1, | ||
| 180 | * has to be b_4 b_3 b_2 b_1 b_0 0. | ||
| 181 | * | ||
| 182 | */ | ||
/* Recode six scalar bits (value 0 .. 63, i.e. the window's five bits followed
 * by the adjacent less significant bit) into a signed digit, writing the sign
 * (0 = positive, 1 = negative) to *sign and the magnitude (0 .. 16) to *digit.
 * All operations are branch-free selections on bit masks. */
void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in)
	{
	unsigned char neg_mask, magnitude;

	/* neg_mask is 0xff when the top (sixth) bit of 'in' is set, else 0x00 */
	neg_mask = ~((in >> 5) - 1);
	/* magnitude of the negated value: 64 - in - 1 (i.e. ~in within 6 bits) */
	magnitude = (1 << 6) - in - 1;
	/* constant-time select: negated value when negative, 'in' otherwise */
	magnitude = (magnitude & neg_mask) | (in & ~neg_mask);
	/* halve, rounding up, to fold the duplicated adjacent bit back in */
	magnitude = (magnitude >> 1) + (magnitude & 1);

	*sign = neg_mask & 1;
	*digit = magnitude;
	}
| 195 | #else | ||
| 196 | static void *dummy=&dummy; | ||
| 197 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_oct.c b/src/lib/libcrypto/ec/ecp_oct.c new file mode 100644 index 0000000000..374a0ee731 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_oct.c | |||
| @@ -0,0 +1,433 @@ | |||
| 1 | /* crypto/ec/ecp_oct.c */ | ||
| 2 | /* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de> | ||
| 3 | * for the OpenSSL project. | ||
| 4 | * Includes code written by Bodo Moeller for the OpenSSL project. | ||
| 5 | */ | ||
| 6 | /* ==================================================================== | ||
| 7 | * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. | ||
| 8 | * | ||
| 9 | * Redistribution and use in source and binary forms, with or without | ||
| 10 | * modification, are permitted provided that the following conditions | ||
| 11 | * are met: | ||
| 12 | * | ||
| 13 | * 1. Redistributions of source code must retain the above copyright | ||
| 14 | * notice, this list of conditions and the following disclaimer. | ||
| 15 | * | ||
| 16 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 17 | * notice, this list of conditions and the following disclaimer in | ||
| 18 | * the documentation and/or other materials provided with the | ||
| 19 | * distribution. | ||
| 20 | * | ||
| 21 | * 3. All advertising materials mentioning features or use of this | ||
| 22 | * software must display the following acknowledgment: | ||
| 23 | * "This product includes software developed by the OpenSSL Project | ||
| 24 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 25 | * | ||
| 26 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 27 | * endorse or promote products derived from this software without | ||
| 28 | * prior written permission. For written permission, please contact | ||
| 29 | * openssl-core@openssl.org. | ||
| 30 | * | ||
| 31 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 32 | * nor may "OpenSSL" appear in their names without prior written | ||
| 33 | * permission of the OpenSSL Project. | ||
| 34 | * | ||
| 35 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 36 | * acknowledgment: | ||
| 37 | * "This product includes software developed by the OpenSSL Project | ||
| 38 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 39 | * | ||
| 40 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 41 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 42 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 43 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 44 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 45 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 46 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 47 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 49 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 50 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 51 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 52 | * ==================================================================== | ||
| 53 | * | ||
| 54 | * This product includes cryptographic software written by Eric Young | ||
| 55 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
| 56 | * Hudson (tjh@cryptsoft.com). | ||
| 57 | * | ||
| 58 | */ | ||
| 59 | /* ==================================================================== | ||
| 60 | * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. | ||
| 61 | * Portions of this software developed by SUN MICROSYSTEMS, INC., | ||
| 62 | * and contributed to the OpenSSL project. | ||
| 63 | */ | ||
| 64 | |||
| 65 | #include <openssl/err.h> | ||
| 66 | #include <openssl/symhacks.h> | ||
| 67 | |||
| 68 | #include "ec_lcl.h" | ||
| 69 | |||
/* Set 'point' from a compressed representation: the affine x-coordinate 'x_'
 * plus one bit ('y_bit') selecting which of the two candidate square roots
 * of x^3 + a*x + b to use as the y-coordinate (chosen by parity).
 * Returns 1 on success, 0 on error (invalid compressed point, BN failure). */
int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
	const BIGNUM *x_, int y_bit, BN_CTX *ctx)
	{
	BN_CTX *new_ctx = NULL;
	BIGNUM *tmp1, *tmp2, *x, *y;
	int ret = 0;

	/* clear error queue (we inspect the last error after BN_mod_sqrt below) */
	ERR_clear_error();

	if (ctx == NULL)
		{
		ctx = new_ctx = BN_CTX_new();
		if (ctx == NULL)
			return 0;
		}

	/* normalize to 0/1 */
	y_bit = (y_bit != 0);

	BN_CTX_start(ctx);
	tmp1 = BN_CTX_get(ctx);
	tmp2 = BN_CTX_get(ctx);
	x = BN_CTX_get(ctx);
	y = BN_CTX_get(ctx);
	/* a failed BN_CTX_get propagates NULL to later calls, so checking the
	 * last one covers all four */
	if (y == NULL) goto err;

	/* Recover y. We have a Weierstrass equation
	 * y^2 = x^3 + a*x + b,
	 * so y is one of the square roots of x^3 + a*x + b.
	 */

	/* tmp1 := x^3 */
	if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
	if (group->meth->field_decode == 0)
		{
		/* field_{sqr,mul} work on standard representation */
		if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
		if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
		}
	else
		{
		if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
		if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
		}

	/* tmp1 := tmp1 + a*x */
	if (group->a_is_minus3)
		{
		/* a = -3: compute a*x as -(2x + x) without a multiplication */
		if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
		if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
		if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
		}
	else
		{
		if (group->meth->field_decode)
			{
			/* 'a' is stored encoded; decode before the plain modular multiply */
			if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
			if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
			}
		else
			{
			/* field_mul works on standard representation */
			if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
			}

		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
		}

	/* tmp1 := tmp1 + b */
	if (group->meth->field_decode)
		{
		if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
		}
	else
		{
		if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
		}

	/* y := sqrt(tmp1) mod p; failure may mean "not a square", i.e. an
	 * invalid compressed point rather than an internal BN error */
	if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
		{
		unsigned long err = ERR_peek_last_error();

		if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
			{
			/* translate the BN error into an EC "invalid point" error */
			ERR_clear_error();
			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
			}
		else
			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
		goto err;
		}

	/* if the root's parity doesn't match y_bit, use the other root (p - y) */
	if (y_bit != BN_is_odd(y))
		{
		if (BN_is_zero(y))
			{
			/* y == 0 has no "other root"; a set y_bit is then only
			 * explainable as a bad compression bit or bad point */
			int kron;

			kron = BN_kronecker(x, &group->field, ctx);
			if (kron == -2) goto err;

			if (kron == 1)
				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
			else
				/* BN_mod_sqrt() should have caught this error (not a square) */
				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
			goto err;
			}
		if (!BN_usub(y, &group->field, y)) goto err;
		}
	/* sanity check: after the optional negation the parity must match */
	if (y_bit != BN_is_odd(y))
		{
		ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
		goto err;
		}

	if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;

	ret = 1;

 err:
	BN_CTX_end(ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return ret;
	}
| 197 | |||
| 198 | |||
/* Encode 'point' into the octet format selected by 'form' (compressed,
 * uncompressed, or hybrid): a header octet (form, with the low bit set to
 * the parity of y for compressed/hybrid), followed by x and, for
 * uncompressed/hybrid, y — each zero-padded to the field length.
 * The point at infinity encodes as a single 0 octet.
 * If 'buf' is NULL only the required length is returned.
 * Returns the encoding length, or 0 on error. */
size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
	unsigned char *buf, size_t len, BN_CTX *ctx)
	{
	size_t ret;
	BN_CTX *new_ctx = NULL;
	int used_ctx = 0;
	BIGNUM *x, *y;
	size_t field_len, i, skip;

	if ((form != POINT_CONVERSION_COMPRESSED)
		&& (form != POINT_CONVERSION_UNCOMPRESSED)
		&& (form != POINT_CONVERSION_HYBRID))
		{
		ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
		goto err;
		}

	if (EC_POINT_is_at_infinity(group, point))
		{
		/* encodes to a single 0 octet */
		if (buf != NULL)
			{
			if (len < 1)
				{
				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
				return 0;
				}
			buf[0] = 0;
			}
		return 1;
		}


	/* ret := required output buffer length */
	field_len = BN_num_bytes(&group->field);
	ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;

	/* if 'buf' is NULL, just return required length */
	if (buf != NULL)
		{
		if (len < ret)
			{
			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
			goto err;
			}

		if (ctx == NULL)
			{
			ctx = new_ctx = BN_CTX_new();
			if (ctx == NULL)
				return 0;
			}

		BN_CTX_start(ctx);
		used_ctx = 1;
		x = BN_CTX_get(ctx);
		y = BN_CTX_get(ctx);
		/* a failed BN_CTX_get propagates NULL; checking the last covers both */
		if (y == NULL) goto err;

		if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;

		/* header octet: form, with low bit = parity of y for
		 * compressed/hybrid encodings */
		if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
			buf[0] = form + 1;
		else
			buf[0] = form;

		i = 1;

		/* write x, left-padded with zeros to exactly field_len bytes */
		skip = field_len - BN_num_bytes(x);
		if (skip > field_len)
			{
			/* size_t underflow: x longer than the field — can't happen
			 * for a valid coordinate */
			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
			goto err;
			}
		while (skip > 0)
			{
			buf[i++] = 0;
			skip--;
			}
		skip = BN_bn2bin(x, buf + i);
		i += skip;
		if (i != 1 + field_len)
			{
			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
			goto err;
			}

		if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
			{
			/* write y, padded the same way */
			skip = field_len - BN_num_bytes(y);
			if (skip > field_len)
				{
				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
				goto err;
				}
			while (skip > 0)
				{
				buf[i++] = 0;
				skip--;
				}
			skip = BN_bn2bin(y, buf + i);
			i += skip;
			}

		/* final consistency check: we must have filled exactly 'ret' bytes */
		if (i != ret)
			{
			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
			goto err;
			}
		}

	if (used_ctx)
		BN_CTX_end(ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return ret;

 err:
	if (used_ctx)
		BN_CTX_end(ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return 0;
	}
| 323 | |||
| 324 | |||
| 325 | int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point, | ||
| 326 | const unsigned char *buf, size_t len, BN_CTX *ctx) | ||
| 327 | { | ||
| 328 | point_conversion_form_t form; | ||
| 329 | int y_bit; | ||
| 330 | BN_CTX *new_ctx = NULL; | ||
| 331 | BIGNUM *x, *y; | ||
| 332 | size_t field_len, enc_len; | ||
| 333 | int ret = 0; | ||
| 334 | |||
| 335 | if (len == 0) | ||
| 336 | { | ||
| 337 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); | ||
| 338 | return 0; | ||
| 339 | } | ||
| 340 | form = buf[0]; | ||
| 341 | y_bit = form & 1; | ||
| 342 | form = form & ~1U; | ||
| 343 | if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) | ||
| 344 | && (form != POINT_CONVERSION_UNCOMPRESSED) | ||
| 345 | && (form != POINT_CONVERSION_HYBRID)) | ||
| 346 | { | ||
| 347 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 348 | return 0; | ||
| 349 | } | ||
| 350 | if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) | ||
| 351 | { | ||
| 352 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 353 | return 0; | ||
| 354 | } | ||
| 355 | |||
| 356 | if (form == 0) | ||
| 357 | { | ||
| 358 | if (len != 1) | ||
| 359 | { | ||
| 360 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 361 | return 0; | ||
| 362 | } | ||
| 363 | |||
| 364 | return EC_POINT_set_to_infinity(group, point); | ||
| 365 | } | ||
| 366 | |||
| 367 | field_len = BN_num_bytes(&group->field); | ||
| 368 | enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; | ||
| 369 | |||
| 370 | if (len != enc_len) | ||
| 371 | { | ||
| 372 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 373 | return 0; | ||
| 374 | } | ||
| 375 | |||
| 376 | if (ctx == NULL) | ||
| 377 | { | ||
| 378 | ctx = new_ctx = BN_CTX_new(); | ||
| 379 | if (ctx == NULL) | ||
| 380 | return 0; | ||
| 381 | } | ||
| 382 | |||
| 383 | BN_CTX_start(ctx); | ||
| 384 | x = BN_CTX_get(ctx); | ||
| 385 | y = BN_CTX_get(ctx); | ||
| 386 | if (y == NULL) goto err; | ||
| 387 | |||
| 388 | if (!BN_bin2bn(buf + 1, field_len, x)) goto err; | ||
| 389 | if (BN_ucmp(x, &group->field) >= 0) | ||
| 390 | { | ||
| 391 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 392 | goto err; | ||
| 393 | } | ||
| 394 | |||
| 395 | if (form == POINT_CONVERSION_COMPRESSED) | ||
| 396 | { | ||
| 397 | if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err; | ||
| 398 | } | ||
| 399 | else | ||
| 400 | { | ||
| 401 | if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; | ||
| 402 | if (BN_ucmp(y, &group->field) >= 0) | ||
| 403 | { | ||
| 404 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 405 | goto err; | ||
| 406 | } | ||
| 407 | if (form == POINT_CONVERSION_HYBRID) | ||
| 408 | { | ||
| 409 | if (y_bit != BN_is_odd(y)) | ||
| 410 | { | ||
| 411 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
| 412 | goto err; | ||
| 413 | } | ||
| 414 | } | ||
| 415 | |||
| 416 | if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; | ||
| 417 | } | ||
| 418 | |||
| 419 | if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ | ||
| 420 | { | ||
| 421 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); | ||
| 422 | goto err; | ||
| 423 | } | ||
| 424 | |||
| 425 | ret = 1; | ||
| 426 | |||
| 427 | err: | ||
| 428 | BN_CTX_end(ctx); | ||
| 429 | if (new_ctx != NULL) | ||
| 430 | BN_CTX_free(new_ctx); | ||
| 431 | return ret; | ||
| 432 | } | ||
| 433 | |||
diff --git a/src/lib/libcrypto/ecdh/ecdh.h b/src/lib/libcrypto/ecdh/ecdh.h index b4b58ee65b..8887102c0b 100644 --- a/src/lib/libcrypto/ecdh/ecdh.h +++ b/src/lib/libcrypto/ecdh/ecdh.h | |||
| @@ -109,11 +109,13 @@ void ERR_load_ECDH_strings(void); | |||
| 109 | /* Error codes for the ECDH functions. */ | 109 | /* Error codes for the ECDH functions. */ |
| 110 | 110 | ||
| 111 | /* Function codes. */ | 111 | /* Function codes. */ |
| 112 | #define ECDH_F_ECDH_CHECK 102 | ||
| 112 | #define ECDH_F_ECDH_COMPUTE_KEY 100 | 113 | #define ECDH_F_ECDH_COMPUTE_KEY 100 |
| 113 | #define ECDH_F_ECDH_DATA_NEW_METHOD 101 | 114 | #define ECDH_F_ECDH_DATA_NEW_METHOD 101 |
| 114 | 115 | ||
| 115 | /* Reason codes. */ | 116 | /* Reason codes. */ |
| 116 | #define ECDH_R_KDF_FAILED 102 | 117 | #define ECDH_R_KDF_FAILED 102 |
| 118 | #define ECDH_R_NON_FIPS_METHOD 103 | ||
| 117 | #define ECDH_R_NO_PRIVATE_VALUE 100 | 119 | #define ECDH_R_NO_PRIVATE_VALUE 100 |
| 118 | #define ECDH_R_POINT_ARITHMETIC_FAILURE 101 | 120 | #define ECDH_R_POINT_ARITHMETIC_FAILURE 101 |
| 119 | 121 | ||
diff --git a/src/lib/libcrypto/ecdh/ech_err.c b/src/lib/libcrypto/ecdh/ech_err.c index 6f4b0c9953..3bd247398d 100644 --- a/src/lib/libcrypto/ecdh/ech_err.c +++ b/src/lib/libcrypto/ecdh/ech_err.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* crypto/ecdh/ech_err.c */ | 1 | /* crypto/ecdh/ech_err.c */ |
| 2 | /* ==================================================================== | 2 | /* ==================================================================== |
| 3 | * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
| @@ -70,6 +70,7 @@ | |||
| 70 | 70 | ||
| 71 | static ERR_STRING_DATA ECDH_str_functs[]= | 71 | static ERR_STRING_DATA ECDH_str_functs[]= |
| 72 | { | 72 | { |
| 73 | {ERR_FUNC(ECDH_F_ECDH_CHECK), "ECDH_CHECK"}, | ||
| 73 | {ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"}, | 74 | {ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"}, |
| 74 | {ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"}, | 75 | {ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"}, |
| 75 | {0,NULL} | 76 | {0,NULL} |
| @@ -78,6 +79,7 @@ static ERR_STRING_DATA ECDH_str_functs[]= | |||
| 78 | static ERR_STRING_DATA ECDH_str_reasons[]= | 79 | static ERR_STRING_DATA ECDH_str_reasons[]= |
| 79 | { | 80 | { |
| 80 | {ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"}, | 81 | {ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"}, |
| 82 | {ERR_REASON(ECDH_R_NON_FIPS_METHOD) ,"non fips method"}, | ||
| 81 | {ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"}, | 83 | {ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"}, |
| 82 | {ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"}, | 84 | {ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"}, |
| 83 | {0,NULL} | 85 | {0,NULL} |
diff --git a/src/lib/libcrypto/ecdh/ech_lib.c b/src/lib/libcrypto/ecdh/ech_lib.c index 4d8ea03d3d..dadbfd3c49 100644 --- a/src/lib/libcrypto/ecdh/ech_lib.c +++ b/src/lib/libcrypto/ecdh/ech_lib.c | |||
| @@ -73,6 +73,9 @@ | |||
| 73 | #include <openssl/engine.h> | 73 | #include <openssl/engine.h> |
| 74 | #endif | 74 | #endif |
| 75 | #include <openssl/err.h> | 75 | #include <openssl/err.h> |
| 76 | #ifdef OPENSSL_FIPS | ||
| 77 | #include <openssl/fips.h> | ||
| 78 | #endif | ||
| 76 | 79 | ||
| 77 | const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT; | 80 | const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT; |
| 78 | 81 | ||
| @@ -90,7 +93,16 @@ void ECDH_set_default_method(const ECDH_METHOD *meth) | |||
| 90 | const ECDH_METHOD *ECDH_get_default_method(void) | 93 | const ECDH_METHOD *ECDH_get_default_method(void) |
| 91 | { | 94 | { |
| 92 | if(!default_ECDH_method) | 95 | if(!default_ECDH_method) |
| 96 | { | ||
| 97 | #ifdef OPENSSL_FIPS | ||
| 98 | if (FIPS_mode()) | ||
| 99 | return FIPS_ecdh_openssl(); | ||
| 100 | else | ||
| 101 | return ECDH_OpenSSL(); | ||
| 102 | #else | ||
| 93 | default_ECDH_method = ECDH_OpenSSL(); | 103 | default_ECDH_method = ECDH_OpenSSL(); |
| 104 | #endif | ||
| 105 | } | ||
| 94 | return default_ECDH_method; | 106 | return default_ECDH_method; |
| 95 | } | 107 | } |
| 96 | 108 | ||
| @@ -215,6 +227,14 @@ ECDH_DATA *ecdh_check(EC_KEY *key) | |||
| 215 | } | 227 | } |
| 216 | else | 228 | else |
| 217 | ecdh_data = (ECDH_DATA *)data; | 229 | ecdh_data = (ECDH_DATA *)data; |
| 230 | #ifdef OPENSSL_FIPS | ||
| 231 | if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD) | ||
| 232 | && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW)) | ||
| 233 | { | ||
| 234 | ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD); | ||
| 235 | return NULL; | ||
| 236 | } | ||
| 237 | #endif | ||
| 218 | 238 | ||
| 219 | 239 | ||
| 220 | return ecdh_data; | 240 | return ecdh_data; |
diff --git a/src/lib/libcrypto/ecdh/ech_locl.h b/src/lib/libcrypto/ecdh/ech_locl.h index f658526a7e..f6cad6a894 100644 --- a/src/lib/libcrypto/ecdh/ech_locl.h +++ b/src/lib/libcrypto/ecdh/ech_locl.h | |||
| @@ -75,6 +75,14 @@ struct ecdh_method | |||
| 75 | char *app_data; | 75 | char *app_data; |
| 76 | }; | 76 | }; |
| 77 | 77 | ||
| 78 | /* If this flag is set the ECDH method is FIPS compliant and can be used | ||
| 79 | * in FIPS mode. This is set in the validated module method. If an | ||
| 80 | * application sets this flag in its own methods it is its responsibility | ||
| 81 | * to ensure the result is compliant. | ||
| 82 | */ | ||
| 83 | |||
| 84 | #define ECDH_FLAG_FIPS_METHOD 0x1 | ||
| 85 | |||
| 78 | typedef struct ecdh_data_st { | 86 | typedef struct ecdh_data_st { |
| 79 | /* EC_KEY_METH_DATA part */ | 87 | /* EC_KEY_METH_DATA part */ |
| 80 | int (*init)(EC_KEY *); | 88 | int (*init)(EC_KEY *); |
diff --git a/src/lib/libcrypto/ecdsa/ecdsa.h b/src/lib/libcrypto/ecdsa/ecdsa.h index e61c539812..7fb5254b62 100644 --- a/src/lib/libcrypto/ecdsa/ecdsa.h +++ b/src/lib/libcrypto/ecdsa/ecdsa.h | |||
| @@ -238,6 +238,7 @@ void ERR_load_ECDSA_strings(void); | |||
| 238 | /* Error codes for the ECDSA functions. */ | 238 | /* Error codes for the ECDSA functions. */ |
| 239 | 239 | ||
| 240 | /* Function codes. */ | 240 | /* Function codes. */ |
| 241 | #define ECDSA_F_ECDSA_CHECK 104 | ||
| 241 | #define ECDSA_F_ECDSA_DATA_NEW_METHOD 100 | 242 | #define ECDSA_F_ECDSA_DATA_NEW_METHOD 100 |
| 242 | #define ECDSA_F_ECDSA_DO_SIGN 101 | 243 | #define ECDSA_F_ECDSA_DO_SIGN 101 |
| 243 | #define ECDSA_F_ECDSA_DO_VERIFY 102 | 244 | #define ECDSA_F_ECDSA_DO_VERIFY 102 |
| @@ -249,6 +250,7 @@ void ERR_load_ECDSA_strings(void); | |||
| 249 | #define ECDSA_R_ERR_EC_LIB 102 | 250 | #define ECDSA_R_ERR_EC_LIB 102 |
| 250 | #define ECDSA_R_MISSING_PARAMETERS 103 | 251 | #define ECDSA_R_MISSING_PARAMETERS 103 |
| 251 | #define ECDSA_R_NEED_NEW_SETUP_VALUES 106 | 252 | #define ECDSA_R_NEED_NEW_SETUP_VALUES 106 |
| 253 | #define ECDSA_R_NON_FIPS_METHOD 107 | ||
| 252 | #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104 | 254 | #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104 |
| 253 | #define ECDSA_R_SIGNATURE_MALLOC_FAILED 105 | 255 | #define ECDSA_R_SIGNATURE_MALLOC_FAILED 105 |
| 254 | 256 | ||
diff --git a/src/lib/libcrypto/ecdsa/ecs_err.c b/src/lib/libcrypto/ecdsa/ecs_err.c index 98e38d537f..81542e6d15 100644 --- a/src/lib/libcrypto/ecdsa/ecs_err.c +++ b/src/lib/libcrypto/ecdsa/ecs_err.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* crypto/ecdsa/ecs_err.c */ | 1 | /* crypto/ecdsa/ecs_err.c */ |
| 2 | /* ==================================================================== | 2 | /* ==================================================================== |
| 3 | * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
| @@ -70,6 +70,7 @@ | |||
| 70 | 70 | ||
| 71 | static ERR_STRING_DATA ECDSA_str_functs[]= | 71 | static ERR_STRING_DATA ECDSA_str_functs[]= |
| 72 | { | 72 | { |
| 73 | {ERR_FUNC(ECDSA_F_ECDSA_CHECK), "ECDSA_CHECK"}, | ||
| 73 | {ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"}, | 74 | {ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"}, |
| 74 | {ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"}, | 75 | {ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"}, |
| 75 | {ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"}, | 76 | {ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"}, |
| @@ -84,6 +85,7 @@ static ERR_STRING_DATA ECDSA_str_reasons[]= | |||
| 84 | {ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"}, | 85 | {ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"}, |
| 85 | {ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"}, | 86 | {ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"}, |
| 86 | {ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"}, | 87 | {ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"}, |
| 88 | {ERR_REASON(ECDSA_R_NON_FIPS_METHOD) ,"non fips method"}, | ||
| 87 | {ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"}, | 89 | {ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"}, |
| 88 | {ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"}, | 90 | {ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"}, |
| 89 | {0,NULL} | 91 | {0,NULL} |
diff --git a/src/lib/libcrypto/ecdsa/ecs_lib.c b/src/lib/libcrypto/ecdsa/ecs_lib.c index 2ebae3aa27..e477da430b 100644 --- a/src/lib/libcrypto/ecdsa/ecs_lib.c +++ b/src/lib/libcrypto/ecdsa/ecs_lib.c | |||
| @@ -60,6 +60,9 @@ | |||
| 60 | #endif | 60 | #endif |
| 61 | #include <openssl/err.h> | 61 | #include <openssl/err.h> |
| 62 | #include <openssl/bn.h> | 62 | #include <openssl/bn.h> |
| 63 | #ifdef OPENSSL_FIPS | ||
| 64 | #include <openssl/fips.h> | ||
| 65 | #endif | ||
| 63 | 66 | ||
| 64 | const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT; | 67 | const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT; |
| 65 | 68 | ||
| @@ -77,7 +80,16 @@ void ECDSA_set_default_method(const ECDSA_METHOD *meth) | |||
| 77 | const ECDSA_METHOD *ECDSA_get_default_method(void) | 80 | const ECDSA_METHOD *ECDSA_get_default_method(void) |
| 78 | { | 81 | { |
| 79 | if(!default_ECDSA_method) | 82 | if(!default_ECDSA_method) |
| 83 | { | ||
| 84 | #ifdef OPENSSL_FIPS | ||
| 85 | if (FIPS_mode()) | ||
| 86 | return FIPS_ecdsa_openssl(); | ||
| 87 | else | ||
| 88 | return ECDSA_OpenSSL(); | ||
| 89 | #else | ||
| 80 | default_ECDSA_method = ECDSA_OpenSSL(); | 90 | default_ECDSA_method = ECDSA_OpenSSL(); |
| 91 | #endif | ||
| 92 | } | ||
| 81 | return default_ECDSA_method; | 93 | return default_ECDSA_method; |
| 82 | } | 94 | } |
| 83 | 95 | ||
| @@ -193,7 +205,14 @@ ECDSA_DATA *ecdsa_check(EC_KEY *key) | |||
| 193 | } | 205 | } |
| 194 | else | 206 | else |
| 195 | ecdsa_data = (ECDSA_DATA *)data; | 207 | ecdsa_data = (ECDSA_DATA *)data; |
| 196 | 208 | #ifdef OPENSSL_FIPS | |
| 209 | if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD) | ||
| 210 | && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW)) | ||
| 211 | { | ||
| 212 | ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD); | ||
| 213 | return NULL; | ||
| 214 | } | ||
| 215 | #endif | ||
| 197 | 216 | ||
| 198 | return ecdsa_data; | 217 | return ecdsa_data; |
| 199 | } | 218 | } |
diff --git a/src/lib/libcrypto/ecdsa/ecs_locl.h b/src/lib/libcrypto/ecdsa/ecs_locl.h index 3a69a840e2..cb3be13cfc 100644 --- a/src/lib/libcrypto/ecdsa/ecs_locl.h +++ b/src/lib/libcrypto/ecdsa/ecs_locl.h | |||
| @@ -82,6 +82,14 @@ struct ecdsa_method | |||
| 82 | char *app_data; | 82 | char *app_data; |
| 83 | }; | 83 | }; |
| 84 | 84 | ||
| 85 | /* If this flag is set the ECDSA method is FIPS compliant and can be used | ||
| 86 | * in FIPS mode. This is set in the validated module method. If an | ||
| 87 | * application sets this flag in its own methods it is its responsibility | ||
| 88 | * to ensure the result is compliant. | ||
| 89 | */ | ||
| 90 | |||
| 91 | #define ECDSA_FLAG_FIPS_METHOD 0x1 | ||
| 92 | |||
| 85 | typedef struct ecdsa_data_st { | 93 | typedef struct ecdsa_data_st { |
| 86 | /* EC_KEY_METH_DATA part */ | 94 | /* EC_KEY_METH_DATA part */ |
| 87 | int (*init)(EC_KEY *); | 95 | int (*init)(EC_KEY *); |
diff --git a/src/lib/libcrypto/ecdsa/ecs_ossl.c b/src/lib/libcrypto/ecdsa/ecs_ossl.c index 1bbf328de5..7725935610 100644 --- a/src/lib/libcrypto/ecdsa/ecs_ossl.c +++ b/src/lib/libcrypto/ecdsa/ecs_ossl.c | |||
| @@ -167,6 +167,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, | |||
| 167 | goto err; | 167 | goto err; |
| 168 | } | 168 | } |
| 169 | } | 169 | } |
| 170 | #ifndef OPENSSL_NO_EC2M | ||
| 170 | else /* NID_X9_62_characteristic_two_field */ | 171 | else /* NID_X9_62_characteristic_two_field */ |
| 171 | { | 172 | { |
| 172 | if (!EC_POINT_get_affine_coordinates_GF2m(group, | 173 | if (!EC_POINT_get_affine_coordinates_GF2m(group, |
| @@ -176,6 +177,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, | |||
| 176 | goto err; | 177 | goto err; |
| 177 | } | 178 | } |
| 178 | } | 179 | } |
| 180 | #endif | ||
| 179 | if (!BN_nnmod(r, X, order, ctx)) | 181 | if (!BN_nnmod(r, X, order, ctx)) |
| 180 | { | 182 | { |
| 181 | ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB); | 183 | ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB); |
| @@ -454,6 +456,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len, | |||
| 454 | goto err; | 456 | goto err; |
| 455 | } | 457 | } |
| 456 | } | 458 | } |
| 459 | #ifndef OPENSSL_NO_EC2M | ||
| 457 | else /* NID_X9_62_characteristic_two_field */ | 460 | else /* NID_X9_62_characteristic_two_field */ |
| 458 | { | 461 | { |
| 459 | if (!EC_POINT_get_affine_coordinates_GF2m(group, | 462 | if (!EC_POINT_get_affine_coordinates_GF2m(group, |
| @@ -463,7 +466,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len, | |||
| 463 | goto err; | 466 | goto err; |
| 464 | } | 467 | } |
| 465 | } | 468 | } |
| 466 | 469 | #endif | |
| 467 | if (!BN_nnmod(u1, X, order, ctx)) | 470 | if (!BN_nnmod(u1, X, order, ctx)) |
| 468 | { | 471 | { |
| 469 | ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB); | 472 | ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB); |
diff --git a/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c b/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c new file mode 100644 index 0000000000..710fb79baf --- /dev/null +++ b/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c | |||
| @@ -0,0 +1,406 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use in source and binary forms, with or without | ||
| 5 | * modification, are permitted provided that the following conditions | ||
| 6 | * are met: | ||
| 7 | * | ||
| 8 | * 1. Redistributions of source code must retain the above copyright | ||
| 9 | * notice, this list of conditions and the following disclaimer. | ||
| 10 | * | ||
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer in | ||
| 13 | * the documentation and/or other materials provided with the | ||
| 14 | * distribution. | ||
| 15 | * | ||
| 16 | * 3. All advertising materials mentioning features or use of this | ||
| 17 | * software must display the following acknowledgment: | ||
| 18 | * "This product includes software developed by the OpenSSL Project | ||
| 19 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 20 | * | ||
| 21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 22 | * endorse or promote products derived from this software without | ||
| 23 | * prior written permission. For written permission, please contact | ||
| 24 | * licensing@OpenSSL.org. | ||
| 25 | * | ||
| 26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 27 | * nor may "OpenSSL" appear in their names without prior written | ||
| 28 | * permission of the OpenSSL Project. | ||
| 29 | * | ||
| 30 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 31 | * acknowledgment: | ||
| 32 | * "This product includes software developed by the OpenSSL Project | ||
| 33 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 34 | * | ||
| 35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | * ==================================================================== | ||
| 48 | */ | ||
| 49 | |||
| 50 | #include <openssl/opensslconf.h> | ||
| 51 | |||
| 52 | #include <stdio.h> | ||
| 53 | #include <string.h> | ||
| 54 | |||
| 55 | #if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1) | ||
| 56 | |||
| 57 | #include <openssl/evp.h> | ||
| 58 | #include <openssl/objects.h> | ||
| 59 | #include <openssl/aes.h> | ||
| 60 | #include <openssl/sha.h> | ||
| 61 | #include "evp_locl.h" | ||
| 62 | |||
| 63 | #ifndef EVP_CIPH_FLAG_AEAD_CIPHER | ||
| 64 | #define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 | ||
| 65 | #define EVP_CTRL_AEAD_TLS1_AAD 0x16 | ||
| 66 | #define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 | ||
| 67 | #endif | ||
| 68 | |||
| 69 | #if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1) | ||
| 70 | #define EVP_CIPH_FLAG_DEFAULT_ASN1 0 | ||
| 71 | #endif | ||
| 72 | |||
| 73 | #define TLS1_1_VERSION 0x0302 | ||
| 74 | |||
| 75 | typedef struct | ||
| 76 | { | ||
| 77 | AES_KEY ks; | ||
| 78 | SHA_CTX head,tail,md; | ||
| 79 | size_t payload_length; /* AAD length in decrypt case */ | ||
| 80 | union { | ||
| 81 | unsigned int tls_ver; | ||
| 82 | unsigned char tls_aad[16]; /* 13 used */ | ||
| 83 | } aux; | ||
| 84 | } EVP_AES_HMAC_SHA1; | ||
| 85 | |||
| 86 | #define NO_PAYLOAD_LENGTH ((size_t)-1) | ||
| 87 | |||
| 88 | #if defined(AES_ASM) && ( \ | ||
| 89 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 90 | defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 91 | defined(__INTEL__) ) | ||
| 92 | |||
| 93 | extern unsigned int OPENSSL_ia32cap_P[2]; | ||
| 94 | #define AESNI_CAPABLE (1<<(57-32)) | ||
| 95 | |||
| 96 | int aesni_set_encrypt_key(const unsigned char *userKey, int bits, | ||
| 97 | AES_KEY *key); | ||
| 98 | int aesni_set_decrypt_key(const unsigned char *userKey, int bits, | ||
| 99 | AES_KEY *key); | ||
| 100 | |||
| 101 | void aesni_cbc_encrypt(const unsigned char *in, | ||
| 102 | unsigned char *out, | ||
| 103 | size_t length, | ||
| 104 | const AES_KEY *key, | ||
| 105 | unsigned char *ivec, int enc); | ||
| 106 | |||
| 107 | void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks, | ||
| 108 | const AES_KEY *key, unsigned char iv[16], | ||
| 109 | SHA_CTX *ctx,const void *in0); | ||
| 110 | |||
| 111 | #define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data) | ||
| 112 | |||
| 113 | static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx, | ||
| 114 | const unsigned char *inkey, | ||
| 115 | const unsigned char *iv, int enc) | ||
| 116 | { | ||
| 117 | EVP_AES_HMAC_SHA1 *key = data(ctx); | ||
| 118 | int ret; | ||
| 119 | |||
| 120 | if (enc) | ||
| 121 | ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks); | ||
| 122 | else | ||
| 123 | ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks); | ||
| 124 | |||
| 125 | SHA1_Init(&key->head); /* handy when benchmarking */ | ||
| 126 | key->tail = key->head; | ||
| 127 | key->md = key->head; | ||
| 128 | |||
| 129 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
| 130 | |||
| 131 | return ret<0?0:1; | ||
| 132 | } | ||
| 133 | |||
| 134 | #define STITCHED_CALL | ||
| 135 | |||
| 136 | #if !defined(STITCHED_CALL) | ||
| 137 | #define aes_off 0 | ||
| 138 | #endif | ||
| 139 | |||
| 140 | void sha1_block_data_order (void *c,const void *p,size_t len); | ||
| 141 | |||
| 142 | static void sha1_update(SHA_CTX *c,const void *data,size_t len) | ||
| 143 | { const unsigned char *ptr = data; | ||
| 144 | size_t res; | ||
| 145 | |||
| 146 | if ((res = c->num)) { | ||
| 147 | res = SHA_CBLOCK-res; | ||
| 148 | if (len<res) res=len; | ||
| 149 | SHA1_Update (c,ptr,res); | ||
| 150 | ptr += res; | ||
| 151 | len -= res; | ||
| 152 | } | ||
| 153 | |||
| 154 | res = len % SHA_CBLOCK; | ||
| 155 | len -= res; | ||
| 156 | |||
| 157 | if (len) { | ||
| 158 | sha1_block_data_order(c,ptr,len/SHA_CBLOCK); | ||
| 159 | |||
| 160 | ptr += len; | ||
| 161 | c->Nh += len>>29; | ||
| 162 | c->Nl += len<<=3; | ||
| 163 | if (c->Nl<(unsigned int)len) c->Nh++; | ||
| 164 | } | ||
| 165 | |||
| 166 | if (res) | ||
| 167 | SHA1_Update(c,ptr,res); | ||
| 168 | } | ||
| 169 | |||
| 170 | #define SHA1_Update sha1_update | ||
| 171 | |||
| 172 | static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, | ||
| 173 | const unsigned char *in, size_t len) | ||
| 174 | { | ||
| 175 | EVP_AES_HMAC_SHA1 *key = data(ctx); | ||
| 176 | unsigned int l; | ||
| 177 | size_t plen = key->payload_length, | ||
| 178 | iv = 0, /* explicit IV in TLS 1.1 and later */ | ||
| 179 | sha_off = 0; | ||
| 180 | #if defined(STITCHED_CALL) | ||
| 181 | size_t aes_off = 0, | ||
| 182 | blocks; | ||
| 183 | |||
| 184 | sha_off = SHA_CBLOCK-key->md.num; | ||
| 185 | #endif | ||
| 186 | |||
| 187 | if (len%AES_BLOCK_SIZE) return 0; | ||
| 188 | |||
| 189 | if (ctx->encrypt) { | ||
| 190 | if (plen==NO_PAYLOAD_LENGTH) | ||
| 191 | plen = len; | ||
| 192 | else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)) | ||
| 193 | return 0; | ||
| 194 | else if (key->aux.tls_ver >= TLS1_1_VERSION) | ||
| 195 | iv = AES_BLOCK_SIZE; | ||
| 196 | |||
| 197 | #if defined(STITCHED_CALL) | ||
| 198 | if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) { | ||
| 199 | SHA1_Update(&key->md,in+iv,sha_off); | ||
| 200 | |||
| 201 | aesni_cbc_sha1_enc(in,out,blocks,&key->ks, | ||
| 202 | ctx->iv,&key->md,in+iv+sha_off); | ||
| 203 | blocks *= SHA_CBLOCK; | ||
| 204 | aes_off += blocks; | ||
| 205 | sha_off += blocks; | ||
| 206 | key->md.Nh += blocks>>29; | ||
| 207 | key->md.Nl += blocks<<=3; | ||
| 208 | if (key->md.Nl<(unsigned int)blocks) key->md.Nh++; | ||
| 209 | } else { | ||
| 210 | sha_off = 0; | ||
| 211 | } | ||
| 212 | #endif | ||
| 213 | sha_off += iv; | ||
| 214 | SHA1_Update(&key->md,in+sha_off,plen-sha_off); | ||
| 215 | |||
| 216 | if (plen!=len) { /* "TLS" mode of operation */ | ||
| 217 | if (in!=out) | ||
| 218 | memcpy(out+aes_off,in+aes_off,plen-aes_off); | ||
| 219 | |||
| 220 | /* calculate HMAC and append it to payload */ | ||
| 221 | SHA1_Final(out+plen,&key->md); | ||
| 222 | key->md = key->tail; | ||
| 223 | SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH); | ||
| 224 | SHA1_Final(out+plen,&key->md); | ||
| 225 | |||
| 226 | /* pad the payload|hmac */ | ||
| 227 | plen += SHA_DIGEST_LENGTH; | ||
| 228 | for (l=len-plen-1;plen<len;plen++) out[plen]=l; | ||
| 229 | /* encrypt HMAC|padding at once */ | ||
| 230 | aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off, | ||
| 231 | &key->ks,ctx->iv,1); | ||
| 232 | } else { | ||
| 233 | aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off, | ||
| 234 | &key->ks,ctx->iv,1); | ||
| 235 | } | ||
| 236 | } else { | ||
| 237 | unsigned char mac[SHA_DIGEST_LENGTH]; | ||
| 238 | |||
| 239 | /* decrypt HMAC|padding at once */ | ||
| 240 | aesni_cbc_encrypt(in,out,len, | ||
| 241 | &key->ks,ctx->iv,0); | ||
| 242 | |||
| 243 | if (plen) { /* "TLS" mode of operation */ | ||
| 244 | /* figure out payload length */ | ||
| 245 | if (len<(size_t)(out[len-1]+1+SHA_DIGEST_LENGTH)) | ||
| 246 | return 0; | ||
| 247 | |||
| 248 | len -= (out[len-1]+1+SHA_DIGEST_LENGTH); | ||
| 249 | |||
| 250 | if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3]) | ||
| 251 | >= TLS1_1_VERSION) { | ||
| 252 | len -= AES_BLOCK_SIZE; | ||
| 253 | iv = AES_BLOCK_SIZE; | ||
| 254 | } | ||
| 255 | |||
| 256 | key->aux.tls_aad[plen-2] = len>>8; | ||
| 257 | key->aux.tls_aad[plen-1] = len; | ||
| 258 | |||
| 259 | /* calculate HMAC and verify it */ | ||
| 260 | key->md = key->head; | ||
| 261 | SHA1_Update(&key->md,key->aux.tls_aad,plen); | ||
| 262 | SHA1_Update(&key->md,out+iv,len); | ||
| 263 | SHA1_Final(mac,&key->md); | ||
| 264 | |||
| 265 | key->md = key->tail; | ||
| 266 | SHA1_Update(&key->md,mac,SHA_DIGEST_LENGTH); | ||
| 267 | SHA1_Final(mac,&key->md); | ||
| 268 | |||
| 269 | if (memcmp(out+iv+len,mac,SHA_DIGEST_LENGTH)) | ||
| 270 | return 0; | ||
| 271 | } else { | ||
| 272 | SHA1_Update(&key->md,out,len); | ||
| 273 | } | ||
| 274 | } | ||
| 275 | |||
| 276 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
| 277 | |||
| 278 | return 1; | ||
| 279 | } | ||
| 280 | |||
| 281 | static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) | ||
| 282 | { | ||
| 283 | EVP_AES_HMAC_SHA1 *key = data(ctx); | ||
| 284 | |||
| 285 | switch (type) | ||
| 286 | { | ||
| 287 | case EVP_CTRL_AEAD_SET_MAC_KEY: | ||
| 288 | { | ||
| 289 | unsigned int i; | ||
| 290 | unsigned char hmac_key[64]; | ||
| 291 | |||
| 292 | memset (hmac_key,0,sizeof(hmac_key)); | ||
| 293 | |||
| 294 | if (arg > (int)sizeof(hmac_key)) { | ||
| 295 | SHA1_Init(&key->head); | ||
| 296 | SHA1_Update(&key->head,ptr,arg); | ||
| 297 | SHA1_Final(hmac_key,&key->head); | ||
| 298 | } else { | ||
| 299 | memcpy(hmac_key,ptr,arg); | ||
| 300 | } | ||
| 301 | |||
| 302 | for (i=0;i<sizeof(hmac_key);i++) | ||
| 303 | hmac_key[i] ^= 0x36; /* ipad */ | ||
| 304 | SHA1_Init(&key->head); | ||
| 305 | SHA1_Update(&key->head,hmac_key,sizeof(hmac_key)); | ||
| 306 | |||
| 307 | for (i=0;i<sizeof(hmac_key);i++) | ||
| 308 | hmac_key[i] ^= 0x36^0x5c; /* opad */ | ||
| 309 | SHA1_Init(&key->tail); | ||
| 310 | SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key)); | ||
| 311 | |||
| 312 | return 1; | ||
| 313 | } | ||
| 314 | case EVP_CTRL_AEAD_TLS1_AAD: | ||
| 315 | { | ||
| 316 | unsigned char *p=ptr; | ||
| 317 | unsigned int len=p[arg-2]<<8|p[arg-1]; | ||
| 318 | |||
| 319 | if (ctx->encrypt) | ||
| 320 | { | ||
| 321 | key->payload_length = len; | ||
| 322 | if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) { | ||
| 323 | len -= AES_BLOCK_SIZE; | ||
| 324 | p[arg-2] = len>>8; | ||
| 325 | p[arg-1] = len; | ||
| 326 | } | ||
| 327 | key->md = key->head; | ||
| 328 | SHA1_Update(&key->md,p,arg); | ||
| 329 | |||
| 330 | return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE) | ||
| 331 | - len); | ||
| 332 | } | ||
| 333 | else | ||
| 334 | { | ||
| 335 | if (arg>13) arg = 13; | ||
| 336 | memcpy(key->aux.tls_aad,ptr,arg); | ||
| 337 | key->payload_length = arg; | ||
| 338 | |||
| 339 | return SHA_DIGEST_LENGTH; | ||
| 340 | } | ||
| 341 | } | ||
| 342 | default: | ||
| 343 | return -1; | ||
| 344 | } | ||
| 345 | } | ||
| 346 | |||
| 347 | static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher = | ||
| 348 | { | ||
| 349 | #ifdef NID_aes_128_cbc_hmac_sha1 | ||
| 350 | NID_aes_128_cbc_hmac_sha1, | ||
| 351 | #else | ||
| 352 | NID_undef, | ||
| 353 | #endif | ||
| 354 | 16,16,16, | ||
| 355 | EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER, | ||
| 356 | aesni_cbc_hmac_sha1_init_key, | ||
| 357 | aesni_cbc_hmac_sha1_cipher, | ||
| 358 | NULL, | ||
| 359 | sizeof(EVP_AES_HMAC_SHA1), | ||
| 360 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv, | ||
| 361 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv, | ||
| 362 | aesni_cbc_hmac_sha1_ctrl, | ||
| 363 | NULL | ||
| 364 | }; | ||
| 365 | |||
| 366 | static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher = | ||
| 367 | { | ||
| 368 | #ifdef NID_aes_256_cbc_hmac_sha1 | ||
| 369 | NID_aes_256_cbc_hmac_sha1, | ||
| 370 | #else | ||
| 371 | NID_undef, | ||
| 372 | #endif | ||
| 373 | 16,32,16, | ||
| 374 | EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER, | ||
| 375 | aesni_cbc_hmac_sha1_init_key, | ||
| 376 | aesni_cbc_hmac_sha1_cipher, | ||
| 377 | NULL, | ||
| 378 | sizeof(EVP_AES_HMAC_SHA1), | ||
| 379 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv, | ||
| 380 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv, | ||
| 381 | aesni_cbc_hmac_sha1_ctrl, | ||
| 382 | NULL | ||
| 383 | }; | ||
| 384 | |||
| 385 | const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void) | ||
| 386 | { | ||
| 387 | return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE? | ||
| 388 | &aesni_128_cbc_hmac_sha1_cipher:NULL); | ||
| 389 | } | ||
| 390 | |||
| 391 | const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void) | ||
| 392 | { | ||
| 393 | return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE? | ||
| 394 | &aesni_256_cbc_hmac_sha1_cipher:NULL); | ||
| 395 | } | ||
| 396 | #else | ||
| 397 | const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void) | ||
| 398 | { | ||
| 399 | return NULL; | ||
| 400 | } | ||
| 401 | const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void) | ||
| 402 | { | ||
| 403 | return NULL; | ||
| 404 | } | ||
| 405 | #endif | ||
| 406 | #endif | ||
diff --git a/src/lib/libcrypto/evp/e_rc4_hmac_md5.c b/src/lib/libcrypto/evp/e_rc4_hmac_md5.c new file mode 100644 index 0000000000..56563191ba --- /dev/null +++ b/src/lib/libcrypto/evp/e_rc4_hmac_md5.c | |||
| @@ -0,0 +1,298 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use in source and binary forms, with or without | ||
| 5 | * modification, are permitted provided that the following conditions | ||
| 6 | * are met: | ||
| 7 | * | ||
| 8 | * 1. Redistributions of source code must retain the above copyright | ||
| 9 | * notice, this list of conditions and the following disclaimer. | ||
| 10 | * | ||
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer in | ||
| 13 | * the documentation and/or other materials provided with the | ||
| 14 | * distribution. | ||
| 15 | * | ||
| 16 | * 3. All advertising materials mentioning features or use of this | ||
| 17 | * software must display the following acknowledgment: | ||
| 18 | * "This product includes software developed by the OpenSSL Project | ||
| 19 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
| 20 | * | ||
| 21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 22 | * endorse or promote products derived from this software without | ||
| 23 | * prior written permission. For written permission, please contact | ||
| 24 | * licensing@OpenSSL.org. | ||
| 25 | * | ||
| 26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 27 | * nor may "OpenSSL" appear in their names without prior written | ||
| 28 | * permission of the OpenSSL Project. | ||
| 29 | * | ||
| 30 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 31 | * acknowledgment: | ||
| 32 | * "This product includes software developed by the OpenSSL Project | ||
| 33 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
| 34 | * | ||
| 35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | * ==================================================================== | ||
| 48 | */ | ||
| 49 | |||
| 50 | #include <openssl/opensslconf.h> | ||
| 51 | |||
| 52 | #include <stdio.h> | ||
| 53 | #include <string.h> | ||
| 54 | |||
| 55 | #if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5) | ||
| 56 | |||
| 57 | #include <openssl/evp.h> | ||
| 58 | #include <openssl/objects.h> | ||
| 59 | #include <openssl/rc4.h> | ||
| 60 | #include <openssl/md5.h> | ||
| 61 | |||
| 62 | #ifndef EVP_CIPH_FLAG_AEAD_CIPHER | ||
| 63 | #define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 | ||
| 64 | #define EVP_CTRL_AEAD_TLS1_AAD 0x16 | ||
| 65 | #define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 | ||
| 66 | #endif | ||
| 67 | |||
| 68 | /* FIXME: surely this is available elsewhere? */ | ||
| 69 | #define EVP_RC4_KEY_SIZE 16 | ||
| 70 | |||
| 71 | typedef struct | ||
| 72 | { | ||
| 73 | RC4_KEY ks; | ||
| 74 | MD5_CTX head,tail,md; | ||
| 75 | size_t payload_length; | ||
| 76 | } EVP_RC4_HMAC_MD5; | ||
| 77 | |||
| 78 | #define NO_PAYLOAD_LENGTH ((size_t)-1) | ||
| 79 | |||
| 80 | void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out, | ||
| 81 | MD5_CTX *ctx,const void *inp,size_t blocks); | ||
| 82 | |||
| 83 | #define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data) | ||
| 84 | |||
| 85 | static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx, | ||
| 86 | const unsigned char *inkey, | ||
| 87 | const unsigned char *iv, int enc) | ||
| 88 | { | ||
| 89 | EVP_RC4_HMAC_MD5 *key = data(ctx); | ||
| 90 | |||
| 91 | RC4_set_key(&key->ks,EVP_CIPHER_CTX_key_length(ctx), | ||
| 92 | inkey); | ||
| 93 | |||
| 94 | MD5_Init(&key->head); /* handy when benchmarking */ | ||
| 95 | key->tail = key->head; | ||
| 96 | key->md = key->head; | ||
| 97 | |||
| 98 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
| 99 | |||
| 100 | return 1; | ||
| 101 | } | ||
| 102 | |||
| 103 | #if !defined(OPENSSL_NO_ASM) && ( \ | ||
| 104 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 105 | defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 106 | defined(__INTEL__) ) && \ | ||
| 107 | !(defined(__APPLE__) && defined(__MACH__)) | ||
| 108 | #define STITCHED_CALL | ||
| 109 | #endif | ||
| 110 | |||
| 111 | #if !defined(STITCHED_CALL) | ||
| 112 | #define rc4_off 0 | ||
| 113 | #define md5_off 0 | ||
| 114 | #endif | ||
| 115 | |||
| 116 | static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, | ||
| 117 | const unsigned char *in, size_t len) | ||
| 118 | { | ||
| 119 | EVP_RC4_HMAC_MD5 *key = data(ctx); | ||
| 120 | #if defined(STITCHED_CALL) | ||
| 121 | size_t rc4_off = 32-1-(key->ks.x&(32-1)), /* 32 is $MOD from rc4_md5-x86_64.pl */ | ||
| 122 | md5_off = MD5_CBLOCK-key->md.num, | ||
| 123 | blocks; | ||
| 124 | unsigned int l; | ||
| 125 | extern unsigned int OPENSSL_ia32cap_P[]; | ||
| 126 | #endif | ||
| 127 | size_t plen = key->payload_length; | ||
| 128 | |||
| 129 | if (plen!=NO_PAYLOAD_LENGTH && len!=(plen+MD5_DIGEST_LENGTH)) return 0; | ||
| 130 | |||
| 131 | if (ctx->encrypt) { | ||
| 132 | if (plen==NO_PAYLOAD_LENGTH) plen = len; | ||
| 133 | #if defined(STITCHED_CALL) | ||
| 134 | /* cipher has to "fall behind" */ | ||
| 135 | if (rc4_off>md5_off) md5_off+=MD5_CBLOCK; | ||
| 136 | |||
| 137 | if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK) && | ||
| 138 | (OPENSSL_ia32cap_P[0]&(1<<20))==0) { | ||
| 139 | MD5_Update(&key->md,in,md5_off); | ||
| 140 | RC4(&key->ks,rc4_off,in,out); | ||
| 141 | |||
| 142 | rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off, | ||
| 143 | &key->md,in+md5_off,blocks); | ||
| 144 | blocks *= MD5_CBLOCK; | ||
| 145 | rc4_off += blocks; | ||
| 146 | md5_off += blocks; | ||
| 147 | key->md.Nh += blocks>>29; | ||
| 148 | key->md.Nl += blocks<<=3; | ||
| 149 | if (key->md.Nl<(unsigned int)blocks) key->md.Nh++; | ||
| 150 | } else { | ||
| 151 | rc4_off = 0; | ||
| 152 | md5_off = 0; | ||
| 153 | } | ||
| 154 | #endif | ||
| 155 | MD5_Update(&key->md,in+md5_off,plen-md5_off); | ||
| 156 | |||
| 157 | if (plen!=len) { /* "TLS" mode of operation */ | ||
| 158 | if (in!=out) | ||
| 159 | memcpy(out+rc4_off,in+rc4_off,plen-rc4_off); | ||
| 160 | |||
| 161 | /* calculate HMAC and append it to payload */ | ||
| 162 | MD5_Final(out+plen,&key->md); | ||
| 163 | key->md = key->tail; | ||
| 164 | MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH); | ||
| 165 | MD5_Final(out+plen,&key->md); | ||
| 166 | /* encrypt HMAC at once */ | ||
| 167 | RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off); | ||
| 168 | } else { | ||
| 169 | RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off); | ||
| 170 | } | ||
| 171 | } else { | ||
| 172 | unsigned char mac[MD5_DIGEST_LENGTH]; | ||
| 173 | #if defined(STITCHED_CALL) | ||
| 174 | /* digest has to "fall behind" */ | ||
| 175 | if (md5_off>rc4_off) rc4_off += 2*MD5_CBLOCK; | ||
| 176 | else rc4_off += MD5_CBLOCK; | ||
| 177 | |||
| 178 | if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK) && | ||
| 179 | (OPENSSL_ia32cap_P[0]&(1<<20))==0) { | ||
| 180 | RC4(&key->ks,rc4_off,in,out); | ||
| 181 | MD5_Update(&key->md,out,md5_off); | ||
| 182 | |||
| 183 | rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off, | ||
| 184 | &key->md,out+md5_off,blocks); | ||
| 185 | blocks *= MD5_CBLOCK; | ||
| 186 | rc4_off += blocks; | ||
| 187 | md5_off += blocks; | ||
| 188 | l = (key->md.Nl+(blocks<<3))&0xffffffffU; | ||
| 189 | if (l<key->md.Nl) key->md.Nh++; | ||
| 190 | key->md.Nl = l; | ||
| 191 | key->md.Nh += blocks>>29; | ||
| 192 | } else { | ||
| 193 | md5_off=0; | ||
| 194 | rc4_off=0; | ||
| 195 | } | ||
| 196 | #endif | ||
| 197 | /* decrypt HMAC at once */ | ||
| 198 | RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off); | ||
| 199 | if (plen!=NO_PAYLOAD_LENGTH) { /* "TLS" mode of operation */ | ||
| 200 | MD5_Update(&key->md,out+md5_off,plen-md5_off); | ||
| 201 | |||
| 202 | /* calculate HMAC and verify it */ | ||
| 203 | MD5_Final(mac,&key->md); | ||
| 204 | key->md = key->tail; | ||
| 205 | MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH); | ||
| 206 | MD5_Final(mac,&key->md); | ||
| 207 | |||
| 208 | if (memcmp(out+plen,mac,MD5_DIGEST_LENGTH)) | ||
| 209 | return 0; | ||
| 210 | } else { | ||
| 211 | MD5_Update(&key->md,out+md5_off,len-md5_off); | ||
| 212 | } | ||
| 213 | } | ||
| 214 | |||
| 215 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
| 216 | |||
| 217 | return 1; | ||
| 218 | } | ||
| 219 | |||
| 220 | static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) | ||
| 221 | { | ||
| 222 | EVP_RC4_HMAC_MD5 *key = data(ctx); | ||
| 223 | |||
| 224 | switch (type) | ||
| 225 | { | ||
| 226 | case EVP_CTRL_AEAD_SET_MAC_KEY: | ||
| 227 | { | ||
| 228 | unsigned int i; | ||
| 229 | unsigned char hmac_key[64]; | ||
| 230 | |||
| 231 | memset (hmac_key,0,sizeof(hmac_key)); | ||
| 232 | |||
| 233 | if (arg > (int)sizeof(hmac_key)) { | ||
| 234 | MD5_Init(&key->head); | ||
| 235 | MD5_Update(&key->head,ptr,arg); | ||
| 236 | MD5_Final(hmac_key,&key->head); | ||
| 237 | } else { | ||
| 238 | memcpy(hmac_key,ptr,arg); | ||
| 239 | } | ||
| 240 | |||
| 241 | for (i=0;i<sizeof(hmac_key);i++) | ||
| 242 | hmac_key[i] ^= 0x36; /* ipad */ | ||
| 243 | MD5_Init(&key->head); | ||
| 244 | MD5_Update(&key->head,hmac_key,sizeof(hmac_key)); | ||
| 245 | |||
| 246 | for (i=0;i<sizeof(hmac_key);i++) | ||
| 247 | hmac_key[i] ^= 0x36^0x5c; /* opad */ | ||
| 248 | MD5_Init(&key->tail); | ||
| 249 | MD5_Update(&key->tail,hmac_key,sizeof(hmac_key)); | ||
| 250 | |||
| 251 | return 1; | ||
| 252 | } | ||
| 253 | case EVP_CTRL_AEAD_TLS1_AAD: | ||
| 254 | { | ||
| 255 | unsigned char *p=ptr; | ||
| 256 | unsigned int len=p[arg-2]<<8|p[arg-1]; | ||
| 257 | |||
| 258 | if (!ctx->encrypt) | ||
| 259 | { | ||
| 260 | len -= MD5_DIGEST_LENGTH; | ||
| 261 | p[arg-2] = len>>8; | ||
| 262 | p[arg-1] = len; | ||
| 263 | } | ||
| 264 | key->payload_length=len; | ||
| 265 | key->md = key->head; | ||
| 266 | MD5_Update(&key->md,p,arg); | ||
| 267 | |||
| 268 | return MD5_DIGEST_LENGTH; | ||
| 269 | } | ||
| 270 | default: | ||
| 271 | return -1; | ||
| 272 | } | ||
| 273 | } | ||
| 274 | |||
| 275 | static EVP_CIPHER r4_hmac_md5_cipher= | ||
| 276 | { | ||
| 277 | #ifdef NID_rc4_hmac_md5 | ||
| 278 | NID_rc4_hmac_md5, | ||
| 279 | #else | ||
| 280 | NID_undef, | ||
| 281 | #endif | ||
| 282 | 1,EVP_RC4_KEY_SIZE,0, | ||
| 283 | EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER, | ||
| 284 | rc4_hmac_md5_init_key, | ||
| 285 | rc4_hmac_md5_cipher, | ||
| 286 | NULL, | ||
| 287 | sizeof(EVP_RC4_HMAC_MD5), | ||
| 288 | NULL, | ||
| 289 | NULL, | ||
| 290 | rc4_hmac_md5_ctrl, | ||
| 291 | NULL | ||
| 292 | }; | ||
| 293 | |||
| 294 | const EVP_CIPHER *EVP_rc4_hmac_md5(void) | ||
| 295 | { | ||
| 296 | return(&r4_hmac_md5_cipher); | ||
| 297 | } | ||
| 298 | #endif | ||
diff --git a/src/lib/libcrypto/evp/m_ecdsa.c b/src/lib/libcrypto/evp/m_ecdsa.c index 8d87a49ebe..4b15fb0f6c 100644 --- a/src/lib/libcrypto/evp/m_ecdsa.c +++ b/src/lib/libcrypto/evp/m_ecdsa.c | |||
| @@ -116,6 +116,8 @@ | |||
| 116 | #include <openssl/x509.h> | 116 | #include <openssl/x509.h> |
| 117 | 117 | ||
| 118 | #ifndef OPENSSL_NO_SHA | 118 | #ifndef OPENSSL_NO_SHA |
| 119 | #ifndef OPENSSL_FIPS | ||
| 120 | |||
| 119 | static int init(EVP_MD_CTX *ctx) | 121 | static int init(EVP_MD_CTX *ctx) |
| 120 | { return SHA1_Init(ctx->md_data); } | 122 | { return SHA1_Init(ctx->md_data); } |
| 121 | 123 | ||
| @@ -146,3 +148,4 @@ const EVP_MD *EVP_ecdsa(void) | |||
| 146 | return(&ecdsa_md); | 148 | return(&ecdsa_md); |
| 147 | } | 149 | } |
| 148 | #endif | 150 | #endif |
| 151 | #endif | ||
diff --git a/src/lib/libcrypto/evp/m_wp.c b/src/lib/libcrypto/evp/m_wp.c index 1ce47c040b..c51bc2d5d1 100644 --- a/src/lib/libcrypto/evp/m_wp.c +++ b/src/lib/libcrypto/evp/m_wp.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <openssl/objects.h> | 9 | #include <openssl/objects.h> |
| 10 | #include <openssl/x509.h> | 10 | #include <openssl/x509.h> |
| 11 | #include <openssl/whrlpool.h> | 11 | #include <openssl/whrlpool.h> |
| 12 | #include "evp_locl.h" | ||
| 12 | 13 | ||
| 13 | static int init(EVP_MD_CTX *ctx) | 14 | static int init(EVP_MD_CTX *ctx) |
| 14 | { return WHIRLPOOL_Init(ctx->md_data); } | 15 | { return WHIRLPOOL_Init(ctx->md_data); } |
diff --git a/src/lib/libcrypto/evp/pmeth_gn.c b/src/lib/libcrypto/evp/pmeth_gn.c index 5d74161a09..4651c81370 100644 --- a/src/lib/libcrypto/evp/pmeth_gn.c +++ b/src/lib/libcrypto/evp/pmeth_gn.c | |||
| @@ -199,7 +199,7 @@ int EVP_PKEY_CTX_get_keygen_info(EVP_PKEY_CTX *ctx, int idx) | |||
| 199 | } | 199 | } |
| 200 | 200 | ||
| 201 | EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, | 201 | EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, |
| 202 | unsigned char *key, int keylen) | 202 | const unsigned char *key, int keylen) |
| 203 | { | 203 | { |
| 204 | EVP_PKEY_CTX *mac_ctx = NULL; | 204 | EVP_PKEY_CTX *mac_ctx = NULL; |
| 205 | EVP_PKEY *mac_key = NULL; | 205 | EVP_PKEY *mac_key = NULL; |
| @@ -209,7 +209,8 @@ EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, | |||
| 209 | if (EVP_PKEY_keygen_init(mac_ctx) <= 0) | 209 | if (EVP_PKEY_keygen_init(mac_ctx) <= 0) |
| 210 | goto merr; | 210 | goto merr; |
| 211 | if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN, | 211 | if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN, |
| 212 | EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key) <= 0) | 212 | EVP_PKEY_CTRL_SET_MAC_KEY, |
| 213 | keylen, (void *)key) <= 0) | ||
| 213 | goto merr; | 214 | goto merr; |
| 214 | if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0) | 215 | if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0) |
| 215 | goto merr; | 216 | goto merr; |
diff --git a/src/lib/libcrypto/evp/pmeth_lib.c b/src/lib/libcrypto/evp/pmeth_lib.c index 5481d4b8a5..acfa7b6f87 100644 --- a/src/lib/libcrypto/evp/pmeth_lib.c +++ b/src/lib/libcrypto/evp/pmeth_lib.c | |||
| @@ -73,7 +73,7 @@ DECLARE_STACK_OF(EVP_PKEY_METHOD) | |||
| 73 | STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL; | 73 | STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL; |
| 74 | 74 | ||
| 75 | extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth; | 75 | extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth; |
| 76 | extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth; | 76 | extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth, cmac_pkey_meth; |
| 77 | 77 | ||
| 78 | static const EVP_PKEY_METHOD *standard_methods[] = | 78 | static const EVP_PKEY_METHOD *standard_methods[] = |
| 79 | { | 79 | { |
| @@ -90,6 +90,7 @@ static const EVP_PKEY_METHOD *standard_methods[] = | |||
| 90 | &ec_pkey_meth, | 90 | &ec_pkey_meth, |
| 91 | #endif | 91 | #endif |
| 92 | &hmac_pkey_meth, | 92 | &hmac_pkey_meth, |
| 93 | &cmac_pkey_meth | ||
| 93 | }; | 94 | }; |
| 94 | 95 | ||
| 95 | DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *, | 96 | DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *, |
| @@ -203,6 +204,8 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags) | |||
| 203 | if (!pmeth) | 204 | if (!pmeth) |
| 204 | return NULL; | 205 | return NULL; |
| 205 | 206 | ||
| 207 | memset(pmeth, 0, sizeof(EVP_PKEY_METHOD)); | ||
| 208 | |||
| 206 | pmeth->pkey_id = id; | 209 | pmeth->pkey_id = id; |
| 207 | pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC; | 210 | pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC; |
| 208 | 211 | ||
| @@ -235,6 +238,56 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags) | |||
| 235 | return pmeth; | 238 | return pmeth; |
| 236 | } | 239 | } |
| 237 | 240 | ||
| 241 | void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags, | ||
| 242 | const EVP_PKEY_METHOD *meth) | ||
| 243 | { | ||
| 244 | if (ppkey_id) | ||
| 245 | *ppkey_id = meth->pkey_id; | ||
| 246 | if (pflags) | ||
| 247 | *pflags = meth->flags; | ||
| 248 | } | ||
| 249 | |||
| 250 | void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src) | ||
| 251 | { | ||
| 252 | |||
| 253 | dst->init = src->init; | ||
| 254 | dst->copy = src->copy; | ||
| 255 | dst->cleanup = src->cleanup; | ||
| 256 | |||
| 257 | dst->paramgen_init = src->paramgen_init; | ||
| 258 | dst->paramgen = src->paramgen; | ||
| 259 | |||
| 260 | dst->keygen_init = src->keygen_init; | ||
| 261 | dst->keygen = src->keygen; | ||
| 262 | |||
| 263 | dst->sign_init = src->sign_init; | ||
| 264 | dst->sign = src->sign; | ||
| 265 | |||
| 266 | dst->verify_init = src->verify_init; | ||
| 267 | dst->verify = src->verify; | ||
| 268 | |||
| 269 | dst->verify_recover_init = src->verify_recover_init; | ||
| 270 | dst->verify_recover = src->verify_recover; | ||
| 271 | |||
| 272 | dst->signctx_init = src->signctx_init; | ||
| 273 | dst->signctx = src->signctx; | ||
| 274 | |||
| 275 | dst->verifyctx_init = src->verifyctx_init; | ||
| 276 | dst->verifyctx = src->verifyctx; | ||
| 277 | |||
| 278 | dst->encrypt_init = src->encrypt_init; | ||
| 279 | dst->encrypt = src->encrypt; | ||
| 280 | |||
| 281 | dst->decrypt_init = src->decrypt_init; | ||
| 282 | dst->decrypt = src->decrypt; | ||
| 283 | |||
| 284 | dst->derive_init = src->derive_init; | ||
| 285 | dst->derive = src->derive; | ||
| 286 | |||
| 287 | dst->ctrl = src->ctrl; | ||
| 288 | dst->ctrl_str = src->ctrl_str; | ||
| 289 | } | ||
| 290 | |||
| 238 | void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth) | 291 | void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth) |
| 239 | { | 292 | { |
| 240 | if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC)) | 293 | if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC)) |
diff --git a/src/lib/libcrypto/hmac/hm_ameth.c b/src/lib/libcrypto/hmac/hm_ameth.c index 6d8a89149e..e03f24aeda 100644 --- a/src/lib/libcrypto/hmac/hm_ameth.c +++ b/src/lib/libcrypto/hmac/hm_ameth.c | |||
| @@ -153,7 +153,7 @@ const EVP_PKEY_ASN1_METHOD hmac_asn1_meth = | |||
| 153 | 153 | ||
| 154 | hmac_size, | 154 | hmac_size, |
| 155 | 0, | 155 | 0, |
| 156 | 0,0,0,0,0,0, | 156 | 0,0,0,0,0,0,0, |
| 157 | 157 | ||
| 158 | hmac_key_free, | 158 | hmac_key_free, |
| 159 | hmac_pkey_ctrl, | 159 | hmac_pkey_ctrl, |
diff --git a/src/lib/libcrypto/hmac/hm_pmeth.c b/src/lib/libcrypto/hmac/hm_pmeth.c index 71e8567a14..0daa44511d 100644 --- a/src/lib/libcrypto/hmac/hm_pmeth.c +++ b/src/lib/libcrypto/hmac/hm_pmeth.c | |||
| @@ -100,7 +100,8 @@ static int pkey_hmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) | |||
| 100 | dctx = dst->data; | 100 | dctx = dst->data; |
| 101 | dctx->md = sctx->md; | 101 | dctx->md = sctx->md; |
| 102 | HMAC_CTX_init(&dctx->ctx); | 102 | HMAC_CTX_init(&dctx->ctx); |
| 103 | HMAC_CTX_copy(&dctx->ctx, &sctx->ctx); | 103 | if (!HMAC_CTX_copy(&dctx->ctx, &sctx->ctx)) |
| 104 | return 0; | ||
| 104 | if (sctx->ktmp.data) | 105 | if (sctx->ktmp.data) |
| 105 | { | 106 | { |
| 106 | if (!ASN1_OCTET_STRING_set(&dctx->ktmp, | 107 | if (!ASN1_OCTET_STRING_set(&dctx->ktmp, |
| @@ -141,7 +142,8 @@ static int pkey_hmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) | |||
| 141 | static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) | 142 | static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) |
| 142 | { | 143 | { |
| 143 | HMAC_PKEY_CTX *hctx = ctx->pctx->data; | 144 | HMAC_PKEY_CTX *hctx = ctx->pctx->data; |
| 144 | HMAC_Update(&hctx->ctx, data, count); | 145 | if (!HMAC_Update(&hctx->ctx, data, count)) |
| 146 | return 0; | ||
| 145 | return 1; | 147 | return 1; |
| 146 | } | 148 | } |
| 147 | 149 | ||
| @@ -167,7 +169,8 @@ static int hmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
| 167 | if (!sig) | 169 | if (!sig) |
| 168 | return 1; | 170 | return 1; |
| 169 | 171 | ||
| 170 | HMAC_Final(&hctx->ctx, sig, &hlen); | 172 | if (!HMAC_Final(&hctx->ctx, sig, &hlen)) |
| 173 | return 0; | ||
| 171 | *siglen = (size_t)hlen; | 174 | *siglen = (size_t)hlen; |
| 172 | return 1; | 175 | return 1; |
| 173 | } | 176 | } |
| @@ -192,8 +195,9 @@ static int pkey_hmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
| 192 | 195 | ||
| 193 | case EVP_PKEY_CTRL_DIGESTINIT: | 196 | case EVP_PKEY_CTRL_DIGESTINIT: |
| 194 | key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr; | 197 | key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr; |
| 195 | HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md, | 198 | if (!HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md, |
| 196 | ctx->engine); | 199 | ctx->engine)) |
| 200 | return 0; | ||
| 197 | break; | 201 | break; |
| 198 | 202 | ||
| 199 | default: | 203 | default: |
diff --git a/src/lib/libcrypto/ia64cpuid.S b/src/lib/libcrypto/ia64cpuid.S index d705fff7ee..7832b9b640 100644 --- a/src/lib/libcrypto/ia64cpuid.S +++ b/src/lib/libcrypto/ia64cpuid.S | |||
| @@ -26,7 +26,7 @@ OPENSSL_atomic_add: | |||
| 26 | { .mii; mov ar.ccv=r2 | 26 | { .mii; mov ar.ccv=r2 |
| 27 | add r8=r2,r33 | 27 | add r8=r2,r33 |
| 28 | mov r3=r2 };; | 28 | mov r3=r2 };; |
| 29 | { .mmi; mf | 29 | { .mmi; mf;; |
| 30 | cmpxchg4.acq r2=[r32],r8,ar.ccv | 30 | cmpxchg4.acq r2=[r32],r8,ar.ccv |
| 31 | nop.i 0 };; | 31 | nop.i 0 };; |
| 32 | { .mib; cmp.ne p6,p0=r2,r3 | 32 | { .mib; cmp.ne p6,p0=r2,r3 |
diff --git a/src/lib/libcrypto/idea/i_cbc.c b/src/lib/libcrypto/idea/i_cbc.c new file mode 100644 index 0000000000..ecb9cb8b83 --- /dev/null +++ b/src/lib/libcrypto/idea/i_cbc.c | |||
| @@ -0,0 +1,168 @@ | |||
| 1 | /* crypto/idea/i_cbc.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <openssl/idea.h> | ||
| 60 | #include "idea_lcl.h" | ||
| 61 | |||
| 62 | void idea_cbc_encrypt(const unsigned char *in, unsigned char *out, long length, | ||
| 63 | IDEA_KEY_SCHEDULE *ks, unsigned char *iv, int encrypt) | ||
| 64 | { | ||
| 65 | register unsigned long tin0,tin1; | ||
| 66 | register unsigned long tout0,tout1,xor0,xor1; | ||
| 67 | register long l=length; | ||
| 68 | unsigned long tin[2]; | ||
| 69 | |||
| 70 | if (encrypt) | ||
| 71 | { | ||
| 72 | n2l(iv,tout0); | ||
| 73 | n2l(iv,tout1); | ||
| 74 | iv-=8; | ||
| 75 | for (l-=8; l>=0; l-=8) | ||
| 76 | { | ||
| 77 | n2l(in,tin0); | ||
| 78 | n2l(in,tin1); | ||
| 79 | tin0^=tout0; | ||
| 80 | tin1^=tout1; | ||
| 81 | tin[0]=tin0; | ||
| 82 | tin[1]=tin1; | ||
| 83 | idea_encrypt(tin,ks); | ||
| 84 | tout0=tin[0]; l2n(tout0,out); | ||
| 85 | tout1=tin[1]; l2n(tout1,out); | ||
| 86 | } | ||
| 87 | if (l != -8) | ||
| 88 | { | ||
| 89 | n2ln(in,tin0,tin1,l+8); | ||
| 90 | tin0^=tout0; | ||
| 91 | tin1^=tout1; | ||
| 92 | tin[0]=tin0; | ||
| 93 | tin[1]=tin1; | ||
| 94 | idea_encrypt(tin,ks); | ||
| 95 | tout0=tin[0]; l2n(tout0,out); | ||
| 96 | tout1=tin[1]; l2n(tout1,out); | ||
| 97 | } | ||
| 98 | l2n(tout0,iv); | ||
| 99 | l2n(tout1,iv); | ||
| 100 | } | ||
| 101 | else | ||
| 102 | { | ||
| 103 | n2l(iv,xor0); | ||
| 104 | n2l(iv,xor1); | ||
| 105 | iv-=8; | ||
| 106 | for (l-=8; l>=0; l-=8) | ||
| 107 | { | ||
| 108 | n2l(in,tin0); tin[0]=tin0; | ||
| 109 | n2l(in,tin1); tin[1]=tin1; | ||
| 110 | idea_encrypt(tin,ks); | ||
| 111 | tout0=tin[0]^xor0; | ||
| 112 | tout1=tin[1]^xor1; | ||
| 113 | l2n(tout0,out); | ||
| 114 | l2n(tout1,out); | ||
| 115 | xor0=tin0; | ||
| 116 | xor1=tin1; | ||
| 117 | } | ||
| 118 | if (l != -8) | ||
| 119 | { | ||
| 120 | n2l(in,tin0); tin[0]=tin0; | ||
| 121 | n2l(in,tin1); tin[1]=tin1; | ||
| 122 | idea_encrypt(tin,ks); | ||
| 123 | tout0=tin[0]^xor0; | ||
| 124 | tout1=tin[1]^xor1; | ||
| 125 | l2nn(tout0,tout1,out,l+8); | ||
| 126 | xor0=tin0; | ||
| 127 | xor1=tin1; | ||
| 128 | } | ||
| 129 | l2n(xor0,iv); | ||
| 130 | l2n(xor1,iv); | ||
| 131 | } | ||
| 132 | tin0=tin1=tout0=tout1=xor0=xor1=0; | ||
| 133 | tin[0]=tin[1]=0; | ||
| 134 | } | ||
| 135 | |||
| 136 | void idea_encrypt(unsigned long *d, IDEA_KEY_SCHEDULE *key) | ||
| 137 | { | ||
| 138 | register IDEA_INT *p; | ||
| 139 | register unsigned long x1,x2,x3,x4,t0,t1,ul; | ||
| 140 | |||
| 141 | x2=d[0]; | ||
| 142 | x1=(x2>>16); | ||
| 143 | x4=d[1]; | ||
| 144 | x3=(x4>>16); | ||
| 145 | |||
| 146 | p= &(key->data[0][0]); | ||
| 147 | |||
| 148 | E_IDEA(0); | ||
| 149 | E_IDEA(1); | ||
| 150 | E_IDEA(2); | ||
| 151 | E_IDEA(3); | ||
| 152 | E_IDEA(4); | ||
| 153 | E_IDEA(5); | ||
| 154 | E_IDEA(6); | ||
| 155 | E_IDEA(7); | ||
| 156 | |||
| 157 | x1&=0xffff; | ||
| 158 | idea_mul(x1,x1,*p,ul); p++; | ||
| 159 | |||
| 160 | t0= x3+ *(p++); | ||
| 161 | t1= x2+ *(p++); | ||
| 162 | |||
| 163 | x4&=0xffff; | ||
| 164 | idea_mul(x4,x4,*p,ul); | ||
| 165 | |||
| 166 | d[0]=(t0&0xffff)|((x1&0xffff)<<16); | ||
| 167 | d[1]=(x4&0xffff)|((t1&0xffff)<<16); | ||
| 168 | } | ||
diff --git a/src/lib/libcrypto/idea/i_cfb64.c b/src/lib/libcrypto/idea/i_cfb64.c new file mode 100644 index 0000000000..66d49d520e --- /dev/null +++ b/src/lib/libcrypto/idea/i_cfb64.c | |||
| @@ -0,0 +1,122 @@ | |||
| 1 | /* crypto/idea/i_cfb64.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <openssl/idea.h> | ||
| 60 | #include "idea_lcl.h" | ||
| 61 | |||
| 62 | /* The input and output encrypted as though 64bit cfb mode is being | ||
| 63 | * used. The extra state information to record how much of the | ||
| 64 | * 64bit block we have used is contained in *num; | ||
| 65 | */ | ||
| 66 | |||
| 67 | void idea_cfb64_encrypt(const unsigned char *in, unsigned char *out, | ||
| 68 | long length, IDEA_KEY_SCHEDULE *schedule, | ||
| 69 | unsigned char *ivec, int *num, int encrypt) | ||
| 70 | { | ||
| 71 | register unsigned long v0,v1,t; | ||
| 72 | register int n= *num; | ||
| 73 | register long l=length; | ||
| 74 | unsigned long ti[2]; | ||
| 75 | unsigned char *iv,c,cc; | ||
| 76 | |||
| 77 | iv=(unsigned char *)ivec; | ||
| 78 | if (encrypt) | ||
| 79 | { | ||
| 80 | while (l--) | ||
| 81 | { | ||
| 82 | if (n == 0) | ||
| 83 | { | ||
| 84 | n2l(iv,v0); ti[0]=v0; | ||
| 85 | n2l(iv,v1); ti[1]=v1; | ||
| 86 | idea_encrypt((unsigned long *)ti,schedule); | ||
| 87 | iv=(unsigned char *)ivec; | ||
| 88 | t=ti[0]; l2n(t,iv); | ||
| 89 | t=ti[1]; l2n(t,iv); | ||
| 90 | iv=(unsigned char *)ivec; | ||
| 91 | } | ||
| 92 | c= *(in++)^iv[n]; | ||
| 93 | *(out++)=c; | ||
| 94 | iv[n]=c; | ||
| 95 | n=(n+1)&0x07; | ||
| 96 | } | ||
| 97 | } | ||
| 98 | else | ||
| 99 | { | ||
| 100 | while (l--) | ||
| 101 | { | ||
| 102 | if (n == 0) | ||
| 103 | { | ||
| 104 | n2l(iv,v0); ti[0]=v0; | ||
| 105 | n2l(iv,v1); ti[1]=v1; | ||
| 106 | idea_encrypt((unsigned long *)ti,schedule); | ||
| 107 | iv=(unsigned char *)ivec; | ||
| 108 | t=ti[0]; l2n(t,iv); | ||
| 109 | t=ti[1]; l2n(t,iv); | ||
| 110 | iv=(unsigned char *)ivec; | ||
| 111 | } | ||
| 112 | cc= *(in++); | ||
| 113 | c=iv[n]; | ||
| 114 | iv[n]=cc; | ||
| 115 | *(out++)=c^cc; | ||
| 116 | n=(n+1)&0x07; | ||
| 117 | } | ||
| 118 | } | ||
| 119 | v0=v1=ti[0]=ti[1]=t=c=cc=0; | ||
| 120 | *num=n; | ||
| 121 | } | ||
| 122 | |||
diff --git a/src/lib/libcrypto/idea/i_ecb.c b/src/lib/libcrypto/idea/i_ecb.c new file mode 100644 index 0000000000..fef38230a7 --- /dev/null +++ b/src/lib/libcrypto/idea/i_ecb.c | |||
| @@ -0,0 +1,85 @@ | |||
| 1 | /* crypto/idea/i_ecb.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <openssl/idea.h> | ||
| 60 | #include "idea_lcl.h" | ||
| 61 | #include <openssl/opensslv.h> | ||
| 62 | |||
| 63 | const char IDEA_version[]="IDEA" OPENSSL_VERSION_PTEXT; | ||
| 64 | |||
| 65 | const char *idea_options(void) | ||
| 66 | { | ||
| 67 | if (sizeof(short) != sizeof(IDEA_INT)) | ||
| 68 | return("idea(int)"); | ||
| 69 | else | ||
| 70 | return("idea(short)"); | ||
| 71 | } | ||
| 72 | |||
| 73 | void idea_ecb_encrypt(const unsigned char *in, unsigned char *out, | ||
| 74 | IDEA_KEY_SCHEDULE *ks) | ||
| 75 | { | ||
| 76 | unsigned long l0,l1,d[2]; | ||
| 77 | |||
| 78 | n2l(in,l0); d[0]=l0; | ||
| 79 | n2l(in,l1); d[1]=l1; | ||
| 80 | idea_encrypt(d,ks); | ||
| 81 | l0=d[0]; l2n(l0,out); | ||
| 82 | l1=d[1]; l2n(l1,out); | ||
| 83 | l0=l1=d[0]=d[1]=0; | ||
| 84 | } | ||
| 85 | |||
diff --git a/src/lib/libcrypto/idea/i_ofb64.c b/src/lib/libcrypto/idea/i_ofb64.c new file mode 100644 index 0000000000..e749e88e34 --- /dev/null +++ b/src/lib/libcrypto/idea/i_ofb64.c | |||
| @@ -0,0 +1,111 @@ | |||
| 1 | /* crypto/idea/i_ofb64.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <openssl/idea.h> | ||
| 60 | #include "idea_lcl.h" | ||
| 61 | |||
| 62 | /* The input and output encrypted as though 64bit ofb mode is being | ||
| 63 | * used. The extra state information to record how much of the | ||
| 64 | * 64bit block we have used is contained in *num; | ||
| 65 | */ | ||
| 66 | void idea_ofb64_encrypt(const unsigned char *in, unsigned char *out, | ||
| 67 | long length, IDEA_KEY_SCHEDULE *schedule, | ||
| 68 | unsigned char *ivec, int *num) | ||
| 69 | { | ||
| 70 | register unsigned long v0,v1,t; | ||
| 71 | register int n= *num; | ||
| 72 | register long l=length; | ||
| 73 | unsigned char d[8]; | ||
| 74 | register char *dp; | ||
| 75 | unsigned long ti[2]; | ||
| 76 | unsigned char *iv; | ||
| 77 | int save=0; | ||
| 78 | |||
| 79 | iv=(unsigned char *)ivec; | ||
| 80 | n2l(iv,v0); | ||
| 81 | n2l(iv,v1); | ||
| 82 | ti[0]=v0; | ||
| 83 | ti[1]=v1; | ||
| 84 | dp=(char *)d; | ||
| 85 | l2n(v0,dp); | ||
| 86 | l2n(v1,dp); | ||
| 87 | while (l--) | ||
| 88 | { | ||
| 89 | if (n == 0) | ||
| 90 | { | ||
| 91 | idea_encrypt((unsigned long *)ti,schedule); | ||
| 92 | dp=(char *)d; | ||
| 93 | t=ti[0]; l2n(t,dp); | ||
| 94 | t=ti[1]; l2n(t,dp); | ||
| 95 | save++; | ||
| 96 | } | ||
| 97 | *(out++)= *(in++)^d[n]; | ||
| 98 | n=(n+1)&0x07; | ||
| 99 | } | ||
| 100 | if (save) | ||
| 101 | { | ||
| 102 | v0=ti[0]; | ||
| 103 | v1=ti[1]; | ||
| 104 | iv=(unsigned char *)ivec; | ||
| 105 | l2n(v0,iv); | ||
| 106 | l2n(v1,iv); | ||
| 107 | } | ||
| 108 | t=v0=v1=ti[0]=ti[1]=0; | ||
| 109 | *num=n; | ||
| 110 | } | ||
| 111 | |||
diff --git a/src/lib/libcrypto/idea/i_skey.c b/src/lib/libcrypto/idea/i_skey.c new file mode 100644 index 0000000000..afb830964d --- /dev/null +++ b/src/lib/libcrypto/idea/i_skey.c | |||
| @@ -0,0 +1,164 @@ | |||
| 1 | /* crypto/idea/i_skey.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <openssl/crypto.h> | ||
| 60 | #include <openssl/idea.h> | ||
| 61 | #include "idea_lcl.h" | ||
| 62 | |||
| 63 | static IDEA_INT inverse(unsigned int xin); | ||
| 64 | void idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks) | ||
| 65 | #ifdef OPENSSL_FIPS | ||
| 66 | { | ||
| 67 | fips_cipher_abort(IDEA); | ||
| 68 | private_idea_set_encrypt_key(key, ks); | ||
| 69 | } | ||
| 70 | void private_idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks) | ||
| 71 | #endif | ||
| 72 | { | ||
| 73 | int i; | ||
| 74 | register IDEA_INT *kt,*kf,r0,r1,r2; | ||
| 75 | |||
| 76 | kt= &(ks->data[0][0]); | ||
| 77 | n2s(key,kt[0]); n2s(key,kt[1]); n2s(key,kt[2]); n2s(key,kt[3]); | ||
| 78 | n2s(key,kt[4]); n2s(key,kt[5]); n2s(key,kt[6]); n2s(key,kt[7]); | ||
| 79 | |||
| 80 | kf=kt; | ||
| 81 | kt+=8; | ||
| 82 | for (i=0; i<6; i++) | ||
| 83 | { | ||
| 84 | r2= kf[1]; | ||
| 85 | r1= kf[2]; | ||
| 86 | *(kt++)= ((r2<<9) | (r1>>7))&0xffff; | ||
| 87 | r0= kf[3]; | ||
| 88 | *(kt++)= ((r1<<9) | (r0>>7))&0xffff; | ||
| 89 | r1= kf[4]; | ||
| 90 | *(kt++)= ((r0<<9) | (r1>>7))&0xffff; | ||
| 91 | r0= kf[5]; | ||
| 92 | *(kt++)= ((r1<<9) | (r0>>7))&0xffff; | ||
| 93 | r1= kf[6]; | ||
| 94 | *(kt++)= ((r0<<9) | (r1>>7))&0xffff; | ||
| 95 | r0= kf[7]; | ||
| 96 | *(kt++)= ((r1<<9) | (r0>>7))&0xffff; | ||
| 97 | r1= kf[0]; | ||
| 98 | if (i >= 5) break; | ||
| 99 | *(kt++)= ((r0<<9) | (r1>>7))&0xffff; | ||
| 100 | *(kt++)= ((r1<<9) | (r2>>7))&0xffff; | ||
| 101 | kf+=8; | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | void idea_set_decrypt_key(IDEA_KEY_SCHEDULE *ek, IDEA_KEY_SCHEDULE *dk) | ||
| 106 | { | ||
| 107 | int r; | ||
| 108 | register IDEA_INT *fp,*tp,t; | ||
| 109 | |||
| 110 | tp= &(dk->data[0][0]); | ||
| 111 | fp= &(ek->data[8][0]); | ||
| 112 | for (r=0; r<9; r++) | ||
| 113 | { | ||
| 114 | *(tp++)=inverse(fp[0]); | ||
| 115 | *(tp++)=((int)(0x10000L-fp[2])&0xffff); | ||
| 116 | *(tp++)=((int)(0x10000L-fp[1])&0xffff); | ||
| 117 | *(tp++)=inverse(fp[3]); | ||
| 118 | if (r == 8) break; | ||
| 119 | fp-=6; | ||
| 120 | *(tp++)=fp[4]; | ||
| 121 | *(tp++)=fp[5]; | ||
| 122 | } | ||
| 123 | |||
| 124 | tp= &(dk->data[0][0]); | ||
| 125 | t=tp[1]; | ||
| 126 | tp[1]=tp[2]; | ||
| 127 | tp[2]=t; | ||
| 128 | |||
| 129 | t=tp[49]; | ||
| 130 | tp[49]=tp[50]; | ||
| 131 | tp[50]=t; | ||
| 132 | } | ||
| 133 | |||
| 134 | /* taken directly from the 'paper' I'll have a look at it later */ | ||
| 135 | static IDEA_INT inverse(unsigned int xin) | ||
| 136 | { | ||
| 137 | long n1,n2,q,r,b1,b2,t; | ||
| 138 | |||
| 139 | if (xin == 0) | ||
| 140 | b2=0; | ||
| 141 | else | ||
| 142 | { | ||
| 143 | n1=0x10001; | ||
| 144 | n2=xin; | ||
| 145 | b2=1; | ||
| 146 | b1=0; | ||
| 147 | |||
| 148 | do { | ||
| 149 | r=(n1%n2); | ||
| 150 | q=(n1-r)/n2; | ||
| 151 | if (r == 0) | ||
| 152 | { if (b2 < 0) b2=0x10001+b2; } | ||
| 153 | else | ||
| 154 | { | ||
| 155 | n1=n2; | ||
| 156 | n2=r; | ||
| 157 | t=b2; | ||
| 158 | b2=b1-q*b2; | ||
| 159 | b1=t; | ||
| 160 | } | ||
| 161 | } while (r != 0); | ||
| 162 | } | ||
| 163 | return((IDEA_INT)b2); | ||
| 164 | } | ||
diff --git a/src/lib/libcrypto/idea/idea_lcl.h b/src/lib/libcrypto/idea/idea_lcl.h new file mode 100644 index 0000000000..f3dbfa67e9 --- /dev/null +++ b/src/lib/libcrypto/idea/idea_lcl.h | |||
| @@ -0,0 +1,215 @@ | |||
| 1 | /* crypto/idea/idea_lcl.h */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | /* The new form of this macro (check if the a*b == 0) was suggested by | ||
| 60 | * Colin Plumb <colin@nyx10.cs.du.edu> */ | ||
| 61 | /* Removal of the inner if from from Wei Dai 24/4/96 */ | ||
| 62 | #define idea_mul(r,a,b,ul) \ | ||
| 63 | ul=(unsigned long)a*b; \ | ||
| 64 | if (ul != 0) \ | ||
| 65 | { \ | ||
| 66 | r=(ul&0xffff)-(ul>>16); \ | ||
| 67 | r-=((r)>>16); \ | ||
| 68 | } \ | ||
| 69 | else \ | ||
| 70 | r=(-(int)a-b+1); /* assuming a or b is 0 and in range */ | ||
| 71 | |||
| 72 | #ifdef undef | ||
| 73 | #define idea_mul(r,a,b,ul,sl) \ | ||
| 74 | if (a == 0) r=(0x10001-b)&0xffff; \ | ||
| 75 | else if (b == 0) r=(0x10001-a)&0xffff; \ | ||
| 76 | else { \ | ||
| 77 | ul=(unsigned long)a*b; \ | ||
| 78 | sl=(ul&0xffff)-(ul>>16); \ | ||
| 79 | if (sl <= 0) sl+=0x10001; \ | ||
| 80 | r=sl; \ | ||
| 81 | } | ||
| 82 | #endif | ||
| 83 | |||
| 84 | /* 7/12/95 - Many thanks to Rhys Weatherley <rweather@us.oracle.com> | ||
| 85 | * for pointing out that I was assuming little endian | ||
| 86 | * byte order for all quantities what idea | ||
| 87 | * actually used bigendian. No where in the spec does it mention | ||
| 88 | * this, it is all in terms of 16 bit numbers and even the example | ||
| 89 | * does not use byte streams for the input example :-(. | ||
| 90 | * If you byte swap each pair of input, keys and iv, the functions | ||
| 91 | * would produce the output as the old version :-(. | ||
| 92 | */ | ||
| 93 | |||
| 94 | /* NOTE - c is not incremented as per n2l */ | ||
| 95 | #define n2ln(c,l1,l2,n) { \ | ||
| 96 | c+=n; \ | ||
| 97 | l1=l2=0; \ | ||
| 98 | switch (n) { \ | ||
| 99 | case 8: l2 =((unsigned long)(*(--(c)))) ; \ | ||
| 100 | case 7: l2|=((unsigned long)(*(--(c))))<< 8; \ | ||
| 101 | case 6: l2|=((unsigned long)(*(--(c))))<<16; \ | ||
| 102 | case 5: l2|=((unsigned long)(*(--(c))))<<24; \ | ||
| 103 | case 4: l1 =((unsigned long)(*(--(c)))) ; \ | ||
| 104 | case 3: l1|=((unsigned long)(*(--(c))))<< 8; \ | ||
| 105 | case 2: l1|=((unsigned long)(*(--(c))))<<16; \ | ||
| 106 | case 1: l1|=((unsigned long)(*(--(c))))<<24; \ | ||
| 107 | } \ | ||
| 108 | } | ||
| 109 | |||
| 110 | /* NOTE - c is not incremented as per l2n */ | ||
| 111 | #define l2nn(l1,l2,c,n) { \ | ||
| 112 | c+=n; \ | ||
| 113 | switch (n) { \ | ||
| 114 | case 8: *(--(c))=(unsigned char)(((l2) )&0xff); \ | ||
| 115 | case 7: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \ | ||
| 116 | case 6: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \ | ||
| 117 | case 5: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \ | ||
| 118 | case 4: *(--(c))=(unsigned char)(((l1) )&0xff); \ | ||
| 119 | case 3: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \ | ||
| 120 | case 2: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \ | ||
| 121 | case 1: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \ | ||
| 122 | } \ | ||
| 123 | } | ||
| 124 | |||
| 125 | #undef n2l | ||
| 126 | #define n2l(c,l) (l =((unsigned long)(*((c)++)))<<24L, \ | ||
| 127 | l|=((unsigned long)(*((c)++)))<<16L, \ | ||
| 128 | l|=((unsigned long)(*((c)++)))<< 8L, \ | ||
| 129 | l|=((unsigned long)(*((c)++)))) | ||
| 130 | |||
| 131 | #undef l2n | ||
| 132 | #define l2n(l,c) (*((c)++)=(unsigned char)(((l)>>24L)&0xff), \ | ||
| 133 | *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ | ||
| 134 | *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ | ||
| 135 | *((c)++)=(unsigned char)(((l) )&0xff)) | ||
| 136 | |||
| 137 | #undef s2n | ||
| 138 | #define s2n(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ | ||
| 139 | *((c)++)=(unsigned char)(((l)>> 8L)&0xff)) | ||
| 140 | |||
| 141 | #undef n2s | ||
| 142 | #define n2s(c,l) (l =((IDEA_INT)(*((c)++)))<< 8L, \ | ||
| 143 | l|=((IDEA_INT)(*((c)++))) ) | ||
| 144 | |||
| 145 | #ifdef undef | ||
| 146 | /* NOTE - c is not incremented as per c2l */ | ||
| 147 | #define c2ln(c,l1,l2,n) { \ | ||
| 148 | c+=n; \ | ||
| 149 | l1=l2=0; \ | ||
| 150 | switch (n) { \ | ||
| 151 | case 8: l2 =((unsigned long)(*(--(c))))<<24; \ | ||
| 152 | case 7: l2|=((unsigned long)(*(--(c))))<<16; \ | ||
| 153 | case 6: l2|=((unsigned long)(*(--(c))))<< 8; \ | ||
| 154 | case 5: l2|=((unsigned long)(*(--(c)))); \ | ||
| 155 | case 4: l1 =((unsigned long)(*(--(c))))<<24; \ | ||
| 156 | case 3: l1|=((unsigned long)(*(--(c))))<<16; \ | ||
| 157 | case 2: l1|=((unsigned long)(*(--(c))))<< 8; \ | ||
| 158 | case 1: l1|=((unsigned long)(*(--(c)))); \ | ||
| 159 | } \ | ||
| 160 | } | ||
| 161 | |||
| 162 | /* NOTE - c is not incremented as per l2c */ | ||
| 163 | #define l2cn(l1,l2,c,n) { \ | ||
| 164 | c+=n; \ | ||
| 165 | switch (n) { \ | ||
| 166 | case 8: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \ | ||
| 167 | case 7: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \ | ||
| 168 | case 6: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \ | ||
| 169 | case 5: *(--(c))=(unsigned char)(((l2) )&0xff); \ | ||
| 170 | case 4: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \ | ||
| 171 | case 3: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \ | ||
| 172 | case 2: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \ | ||
| 173 | case 1: *(--(c))=(unsigned char)(((l1) )&0xff); \ | ||
| 174 | } \ | ||
| 175 | } | ||
| 176 | |||
| 177 | #undef c2s | ||
| 178 | #define c2s(c,l) (l =((unsigned long)(*((c)++))) , \ | ||
| 179 | l|=((unsigned long)(*((c)++)))<< 8L) | ||
| 180 | |||
| 181 | #undef s2c | ||
| 182 | #define s2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ | ||
| 183 | *((c)++)=(unsigned char)(((l)>> 8L)&0xff)) | ||
| 184 | |||
| 185 | #undef c2l | ||
| 186 | #define c2l(c,l) (l =((unsigned long)(*((c)++))) , \ | ||
| 187 | l|=((unsigned long)(*((c)++)))<< 8L, \ | ||
| 188 | l|=((unsigned long)(*((c)++)))<<16L, \ | ||
| 189 | l|=((unsigned long)(*((c)++)))<<24L) | ||
| 190 | |||
| 191 | #undef l2c | ||
| 192 | #define l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ | ||
| 193 | *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ | ||
| 194 | *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ | ||
| 195 | *((c)++)=(unsigned char)(((l)>>24L)&0xff)) | ||
| 196 | #endif | ||
| 197 | |||
| 198 | #define E_IDEA(num) \ | ||
| 199 | x1&=0xffff; \ | ||
| 200 | idea_mul(x1,x1,*p,ul); p++; \ | ||
| 201 | x2+= *(p++); \ | ||
| 202 | x3+= *(p++); \ | ||
| 203 | x4&=0xffff; \ | ||
| 204 | idea_mul(x4,x4,*p,ul); p++; \ | ||
| 205 | t0=(x1^x3)&0xffff; \ | ||
| 206 | idea_mul(t0,t0,*p,ul); p++; \ | ||
| 207 | t1=(t0+(x2^x4))&0xffff; \ | ||
| 208 | idea_mul(t1,t1,*p,ul); p++; \ | ||
| 209 | t0+=t1; \ | ||
| 210 | x1^=t1; \ | ||
| 211 | x4^=t0; \ | ||
| 212 | ul=x2^t0; /* do the swap to x3 */ \ | ||
| 213 | x2=x3^t1; \ | ||
| 214 | x3=ul; | ||
| 215 | |||
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl new file mode 100644 index 0000000000..6358b2750f --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-alpha.pl | |||
| @@ -0,0 +1,451 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # March 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. Even though | ||
| 15 | # loops are aggressively modulo-scheduled in respect to references to | ||
| 16 | # Htbl and Z.hi updates for 8 cycles per byte, measured performance is | ||
| 17 | # ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic | ||
| 18 | # scheduling "glitch," because uprofile(1) indicates uniform sample | ||
| 19 | # distribution, as if all instruction bundles execute in 1.5 cycles. | ||
| 20 | # Meaning that it could have been even faster, yet 12 cycles is ~60% | ||
| 21 | # better than gcc-generated code and ~80% than code generated by vendor | ||
| 22 | # compiler. | ||
| 23 | |||
| 24 | $cnt="v0"; # $0 | ||
| 25 | $t0="t0"; | ||
| 26 | $t1="t1"; | ||
| 27 | $t2="t2"; | ||
| 28 | $Thi0="t3"; # $4 | ||
| 29 | $Tlo0="t4"; | ||
| 30 | $Thi1="t5"; | ||
| 31 | $Tlo1="t6"; | ||
| 32 | $rem="t7"; # $8 | ||
| 33 | ################# | ||
| 34 | $Xi="a0"; # $16, input argument block | ||
| 35 | $Htbl="a1"; | ||
| 36 | $inp="a2"; | ||
| 37 | $len="a3"; | ||
| 38 | $nlo="a4"; # $20 | ||
| 39 | $nhi="a5"; | ||
| 40 | $Zhi="t8"; | ||
| 41 | $Zlo="t9"; | ||
| 42 | $Xhi="t10"; # $24 | ||
| 43 | $Xlo="t11"; | ||
| 44 | $remp="t12"; | ||
| 45 | $rem_4bit="AT"; # $28 | ||
| 46 | |||
| 47 | { my $N; | ||
| 48 | sub loop() { | ||
| 49 | |||
| 50 | $N++; | ||
| 51 | $code.=<<___; | ||
| 52 | .align 4 | ||
| 53 | extbl $Xlo,7,$nlo | ||
| 54 | and $nlo,0xf0,$nhi | ||
| 55 | sll $nlo,4,$nlo | ||
| 56 | and $nlo,0xf0,$nlo | ||
| 57 | |||
| 58 | addq $nlo,$Htbl,$nlo | ||
| 59 | ldq $Zlo,8($nlo) | ||
| 60 | addq $nhi,$Htbl,$nhi | ||
| 61 | ldq $Zhi,0($nlo) | ||
| 62 | |||
| 63 | and $Zlo,0x0f,$remp | ||
| 64 | sll $Zhi,60,$t0 | ||
| 65 | lda $cnt,6(zero) | ||
| 66 | extbl $Xlo,6,$nlo | ||
| 67 | |||
| 68 | ldq $Tlo1,8($nhi) | ||
| 69 | s8addq $remp,$rem_4bit,$remp | ||
| 70 | ldq $Thi1,0($nhi) | ||
| 71 | srl $Zlo,4,$Zlo | ||
| 72 | |||
| 73 | ldq $rem,0($remp) | ||
| 74 | srl $Zhi,4,$Zhi | ||
| 75 | xor $t0,$Zlo,$Zlo | ||
| 76 | and $nlo,0xf0,$nhi | ||
| 77 | |||
| 78 | xor $Tlo1,$Zlo,$Zlo | ||
| 79 | sll $nlo,4,$nlo | ||
| 80 | xor $Thi1,$Zhi,$Zhi | ||
| 81 | and $nlo,0xf0,$nlo | ||
| 82 | |||
| 83 | addq $nlo,$Htbl,$nlo | ||
| 84 | ldq $Tlo0,8($nlo) | ||
| 85 | addq $nhi,$Htbl,$nhi | ||
| 86 | ldq $Thi0,0($nlo) | ||
| 87 | |||
| 88 | .Looplo$N: | ||
| 89 | and $Zlo,0x0f,$remp | ||
| 90 | sll $Zhi,60,$t0 | ||
| 91 | subq $cnt,1,$cnt | ||
| 92 | srl $Zlo,4,$Zlo | ||
| 93 | |||
| 94 | ldq $Tlo1,8($nhi) | ||
| 95 | xor $rem,$Zhi,$Zhi | ||
| 96 | ldq $Thi1,0($nhi) | ||
| 97 | s8addq $remp,$rem_4bit,$remp | ||
| 98 | |||
| 99 | ldq $rem,0($remp) | ||
| 100 | srl $Zhi,4,$Zhi | ||
| 101 | xor $t0,$Zlo,$Zlo | ||
| 102 | extbl $Xlo,$cnt,$nlo | ||
| 103 | |||
| 104 | and $nlo,0xf0,$nhi | ||
| 105 | xor $Thi0,$Zhi,$Zhi | ||
| 106 | xor $Tlo0,$Zlo,$Zlo | ||
| 107 | sll $nlo,4,$nlo | ||
| 108 | |||
| 109 | |||
| 110 | and $Zlo,0x0f,$remp | ||
| 111 | sll $Zhi,60,$t0 | ||
| 112 | and $nlo,0xf0,$nlo | ||
| 113 | srl $Zlo,4,$Zlo | ||
| 114 | |||
| 115 | s8addq $remp,$rem_4bit,$remp | ||
| 116 | xor $rem,$Zhi,$Zhi | ||
| 117 | addq $nlo,$Htbl,$nlo | ||
| 118 | addq $nhi,$Htbl,$nhi | ||
| 119 | |||
| 120 | ldq $rem,0($remp) | ||
| 121 | srl $Zhi,4,$Zhi | ||
| 122 | ldq $Tlo0,8($nlo) | ||
| 123 | xor $t0,$Zlo,$Zlo | ||
| 124 | |||
| 125 | xor $Tlo1,$Zlo,$Zlo | ||
| 126 | xor $Thi1,$Zhi,$Zhi | ||
| 127 | ldq $Thi0,0($nlo) | ||
| 128 | bne $cnt,.Looplo$N | ||
| 129 | |||
| 130 | |||
| 131 | and $Zlo,0x0f,$remp | ||
| 132 | sll $Zhi,60,$t0 | ||
| 133 | lda $cnt,7(zero) | ||
| 134 | srl $Zlo,4,$Zlo | ||
| 135 | |||
| 136 | ldq $Tlo1,8($nhi) | ||
| 137 | xor $rem,$Zhi,$Zhi | ||
| 138 | ldq $Thi1,0($nhi) | ||
| 139 | s8addq $remp,$rem_4bit,$remp | ||
| 140 | |||
| 141 | ldq $rem,0($remp) | ||
| 142 | srl $Zhi,4,$Zhi | ||
| 143 | xor $t0,$Zlo,$Zlo | ||
| 144 | extbl $Xhi,$cnt,$nlo | ||
| 145 | |||
| 146 | and $nlo,0xf0,$nhi | ||
| 147 | xor $Thi0,$Zhi,$Zhi | ||
| 148 | xor $Tlo0,$Zlo,$Zlo | ||
| 149 | sll $nlo,4,$nlo | ||
| 150 | |||
| 151 | and $Zlo,0x0f,$remp | ||
| 152 | sll $Zhi,60,$t0 | ||
| 153 | and $nlo,0xf0,$nlo | ||
| 154 | srl $Zlo,4,$Zlo | ||
| 155 | |||
| 156 | s8addq $remp,$rem_4bit,$remp | ||
| 157 | xor $rem,$Zhi,$Zhi | ||
| 158 | addq $nlo,$Htbl,$nlo | ||
| 159 | addq $nhi,$Htbl,$nhi | ||
| 160 | |||
| 161 | ldq $rem,0($remp) | ||
| 162 | srl $Zhi,4,$Zhi | ||
| 163 | ldq $Tlo0,8($nlo) | ||
| 164 | xor $t0,$Zlo,$Zlo | ||
| 165 | |||
| 166 | xor $Tlo1,$Zlo,$Zlo | ||
| 167 | xor $Thi1,$Zhi,$Zhi | ||
| 168 | ldq $Thi0,0($nlo) | ||
| 169 | unop | ||
| 170 | |||
| 171 | |||
| 172 | .Loophi$N: | ||
| 173 | and $Zlo,0x0f,$remp | ||
| 174 | sll $Zhi,60,$t0 | ||
| 175 | subq $cnt,1,$cnt | ||
| 176 | srl $Zlo,4,$Zlo | ||
| 177 | |||
| 178 | ldq $Tlo1,8($nhi) | ||
| 179 | xor $rem,$Zhi,$Zhi | ||
| 180 | ldq $Thi1,0($nhi) | ||
| 181 | s8addq $remp,$rem_4bit,$remp | ||
| 182 | |||
| 183 | ldq $rem,0($remp) | ||
| 184 | srl $Zhi,4,$Zhi | ||
| 185 | xor $t0,$Zlo,$Zlo | ||
| 186 | extbl $Xhi,$cnt,$nlo | ||
| 187 | |||
| 188 | and $nlo,0xf0,$nhi | ||
| 189 | xor $Thi0,$Zhi,$Zhi | ||
| 190 | xor $Tlo0,$Zlo,$Zlo | ||
| 191 | sll $nlo,4,$nlo | ||
| 192 | |||
| 193 | |||
| 194 | and $Zlo,0x0f,$remp | ||
| 195 | sll $Zhi,60,$t0 | ||
| 196 | and $nlo,0xf0,$nlo | ||
| 197 | srl $Zlo,4,$Zlo | ||
| 198 | |||
| 199 | s8addq $remp,$rem_4bit,$remp | ||
| 200 | xor $rem,$Zhi,$Zhi | ||
| 201 | addq $nlo,$Htbl,$nlo | ||
| 202 | addq $nhi,$Htbl,$nhi | ||
| 203 | |||
| 204 | ldq $rem,0($remp) | ||
| 205 | srl $Zhi,4,$Zhi | ||
| 206 | ldq $Tlo0,8($nlo) | ||
| 207 | xor $t0,$Zlo,$Zlo | ||
| 208 | |||
| 209 | xor $Tlo1,$Zlo,$Zlo | ||
| 210 | xor $Thi1,$Zhi,$Zhi | ||
| 211 | ldq $Thi0,0($nlo) | ||
| 212 | bne $cnt,.Loophi$N | ||
| 213 | |||
| 214 | |||
| 215 | and $Zlo,0x0f,$remp | ||
| 216 | sll $Zhi,60,$t0 | ||
| 217 | srl $Zlo,4,$Zlo | ||
| 218 | |||
| 219 | ldq $Tlo1,8($nhi) | ||
| 220 | xor $rem,$Zhi,$Zhi | ||
| 221 | ldq $Thi1,0($nhi) | ||
| 222 | s8addq $remp,$rem_4bit,$remp | ||
| 223 | |||
| 224 | ldq $rem,0($remp) | ||
| 225 | srl $Zhi,4,$Zhi | ||
| 226 | xor $t0,$Zlo,$Zlo | ||
| 227 | |||
| 228 | xor $Tlo0,$Zlo,$Zlo | ||
| 229 | xor $Thi0,$Zhi,$Zhi | ||
| 230 | |||
| 231 | and $Zlo,0x0f,$remp | ||
| 232 | sll $Zhi,60,$t0 | ||
| 233 | srl $Zlo,4,$Zlo | ||
| 234 | |||
| 235 | s8addq $remp,$rem_4bit,$remp | ||
| 236 | xor $rem,$Zhi,$Zhi | ||
| 237 | |||
| 238 | ldq $rem,0($remp) | ||
| 239 | srl $Zhi,4,$Zhi | ||
| 240 | xor $Tlo1,$Zlo,$Zlo | ||
| 241 | xor $Thi1,$Zhi,$Zhi | ||
| 242 | xor $t0,$Zlo,$Zlo | ||
| 243 | xor $rem,$Zhi,$Zhi | ||
| 244 | ___ | ||
| 245 | }} | ||
| 246 | |||
| 247 | $code=<<___; | ||
| 248 | #ifdef __linux__ | ||
| 249 | #include <asm/regdef.h> | ||
| 250 | #else | ||
| 251 | #include <asm.h> | ||
| 252 | #include <regdef.h> | ||
| 253 | #endif | ||
| 254 | |||
| 255 | .text | ||
| 256 | |||
| 257 | .set noat | ||
| 258 | .set noreorder | ||
| 259 | .globl gcm_gmult_4bit | ||
| 260 | .align 4 | ||
| 261 | .ent gcm_gmult_4bit | ||
| 262 | gcm_gmult_4bit: | ||
| 263 | .frame sp,0,ra | ||
| 264 | .prologue 0 | ||
| 265 | |||
| 266 | ldq $Xlo,8($Xi) | ||
| 267 | ldq $Xhi,0($Xi) | ||
| 268 | |||
| 269 | br $rem_4bit,.Lpic1 | ||
| 270 | .Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit) | ||
| 271 | ___ | ||
| 272 | |||
| 273 | &loop(); | ||
| 274 | |||
| 275 | $code.=<<___; | ||
| 276 | srl $Zlo,24,$t0 # byte swap | ||
| 277 | srl $Zlo,8,$t1 | ||
| 278 | |||
| 279 | sll $Zlo,8,$t2 | ||
| 280 | sll $Zlo,24,$Zlo | ||
| 281 | zapnot $t0,0x11,$t0 | ||
| 282 | zapnot $t1,0x22,$t1 | ||
| 283 | |||
| 284 | zapnot $Zlo,0x88,$Zlo | ||
| 285 | or $t0,$t1,$t0 | ||
| 286 | zapnot $t2,0x44,$t2 | ||
| 287 | |||
| 288 | or $Zlo,$t0,$Zlo | ||
| 289 | srl $Zhi,24,$t0 | ||
| 290 | srl $Zhi,8,$t1 | ||
| 291 | |||
| 292 | or $Zlo,$t2,$Zlo | ||
| 293 | sll $Zhi,8,$t2 | ||
| 294 | sll $Zhi,24,$Zhi | ||
| 295 | |||
| 296 | srl $Zlo,32,$Xlo | ||
| 297 | sll $Zlo,32,$Zlo | ||
| 298 | |||
| 299 | zapnot $t0,0x11,$t0 | ||
| 300 | zapnot $t1,0x22,$t1 | ||
| 301 | or $Zlo,$Xlo,$Xlo | ||
| 302 | |||
| 303 | zapnot $Zhi,0x88,$Zhi | ||
| 304 | or $t0,$t1,$t0 | ||
| 305 | zapnot $t2,0x44,$t2 | ||
| 306 | |||
| 307 | or $Zhi,$t0,$Zhi | ||
| 308 | or $Zhi,$t2,$Zhi | ||
| 309 | |||
| 310 | srl $Zhi,32,$Xhi | ||
| 311 | sll $Zhi,32,$Zhi | ||
| 312 | |||
| 313 | or $Zhi,$Xhi,$Xhi | ||
| 314 | stq $Xlo,8($Xi) | ||
| 315 | stq $Xhi,0($Xi) | ||
| 316 | |||
| 317 | ret (ra) | ||
| 318 | .end gcm_gmult_4bit | ||
| 319 | ___ | ||
| 320 | |||
| 321 | $inhi="s0"; | ||
| 322 | $inlo="s1"; | ||
| 323 | |||
| 324 | $code.=<<___; | ||
| 325 | .globl gcm_ghash_4bit | ||
| 326 | .align 4 | ||
| 327 | .ent gcm_ghash_4bit | ||
| 328 | gcm_ghash_4bit: | ||
| 329 | lda sp,-32(sp) | ||
| 330 | stq ra,0(sp) | ||
| 331 | stq s0,8(sp) | ||
| 332 | stq s1,16(sp) | ||
| 333 | .mask 0x04000600,-32 | ||
| 334 | .frame sp,32,ra | ||
| 335 | .prologue 0 | ||
| 336 | |||
| 337 | ldq_u $inhi,0($inp) | ||
| 338 | ldq_u $Thi0,7($inp) | ||
| 339 | ldq_u $inlo,8($inp) | ||
| 340 | ldq_u $Tlo0,15($inp) | ||
| 341 | ldq $Xhi,0($Xi) | ||
| 342 | ldq $Xlo,8($Xi) | ||
| 343 | |||
| 344 | br $rem_4bit,.Lpic2 | ||
| 345 | .Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit) | ||
| 346 | |||
| 347 | .Louter: | ||
| 348 | extql $inhi,$inp,$inhi | ||
| 349 | extqh $Thi0,$inp,$Thi0 | ||
| 350 | or $inhi,$Thi0,$inhi | ||
| 351 | lda $inp,16($inp) | ||
| 352 | |||
| 353 | extql $inlo,$inp,$inlo | ||
| 354 | extqh $Tlo0,$inp,$Tlo0 | ||
| 355 | or $inlo,$Tlo0,$inlo | ||
| 356 | subq $len,16,$len | ||
| 357 | |||
| 358 | xor $Xlo,$inlo,$Xlo | ||
| 359 | xor $Xhi,$inhi,$Xhi | ||
| 360 | ___ | ||
| 361 | |||
| 362 | &loop(); | ||
| 363 | |||
| 364 | $code.=<<___; | ||
| 365 | srl $Zlo,24,$t0 # byte swap | ||
| 366 | srl $Zlo,8,$t1 | ||
| 367 | |||
| 368 | sll $Zlo,8,$t2 | ||
| 369 | sll $Zlo,24,$Zlo | ||
| 370 | zapnot $t0,0x11,$t0 | ||
| 371 | zapnot $t1,0x22,$t1 | ||
| 372 | |||
| 373 | zapnot $Zlo,0x88,$Zlo | ||
| 374 | or $t0,$t1,$t0 | ||
| 375 | zapnot $t2,0x44,$t2 | ||
| 376 | |||
| 377 | or $Zlo,$t0,$Zlo | ||
| 378 | srl $Zhi,24,$t0 | ||
| 379 | srl $Zhi,8,$t1 | ||
| 380 | |||
| 381 | or $Zlo,$t2,$Zlo | ||
| 382 | sll $Zhi,8,$t2 | ||
| 383 | sll $Zhi,24,$Zhi | ||
| 384 | |||
| 385 | srl $Zlo,32,$Xlo | ||
| 386 | sll $Zlo,32,$Zlo | ||
| 387 | beq $len,.Ldone | ||
| 388 | |||
| 389 | zapnot $t0,0x11,$t0 | ||
| 390 | zapnot $t1,0x22,$t1 | ||
| 391 | or $Zlo,$Xlo,$Xlo | ||
| 392 | ldq_u $inhi,0($inp) | ||
| 393 | |||
| 394 | zapnot $Zhi,0x88,$Zhi | ||
| 395 | or $t0,$t1,$t0 | ||
| 396 | zapnot $t2,0x44,$t2 | ||
| 397 | ldq_u $Thi0,7($inp) | ||
| 398 | |||
| 399 | or $Zhi,$t0,$Zhi | ||
| 400 | or $Zhi,$t2,$Zhi | ||
| 401 | ldq_u $inlo,8($inp) | ||
| 402 | ldq_u $Tlo0,15($inp) | ||
| 403 | |||
| 404 | srl $Zhi,32,$Xhi | ||
| 405 | sll $Zhi,32,$Zhi | ||
| 406 | |||
| 407 | or $Zhi,$Xhi,$Xhi | ||
| 408 | br zero,.Louter | ||
| 409 | |||
| 410 | .Ldone: | ||
| 411 | zapnot $t0,0x11,$t0 | ||
| 412 | zapnot $t1,0x22,$t1 | ||
| 413 | or $Zlo,$Xlo,$Xlo | ||
| 414 | |||
| 415 | zapnot $Zhi,0x88,$Zhi | ||
| 416 | or $t0,$t1,$t0 | ||
| 417 | zapnot $t2,0x44,$t2 | ||
| 418 | |||
| 419 | or $Zhi,$t0,$Zhi | ||
| 420 | or $Zhi,$t2,$Zhi | ||
| 421 | |||
| 422 | srl $Zhi,32,$Xhi | ||
| 423 | sll $Zhi,32,$Zhi | ||
| 424 | |||
| 425 | or $Zhi,$Xhi,$Xhi | ||
| 426 | |||
| 427 | stq $Xlo,8($Xi) | ||
| 428 | stq $Xhi,0($Xi) | ||
| 429 | |||
| 430 | .set noreorder | ||
| 431 | /*ldq ra,0(sp)*/ | ||
| 432 | ldq s0,8(sp) | ||
| 433 | ldq s1,16(sp) | ||
| 434 | lda sp,32(sp) | ||
| 435 | ret (ra) | ||
| 436 | .end gcm_ghash_4bit | ||
| 437 | |||
| 438 | .align 4 | ||
| 439 | rem_4bit: | ||
| 440 | .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | ||
| 441 | .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | ||
| 442 | .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | ||
| 443 | .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | ||
| 444 | .ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 445 | .align 4 | ||
| 446 | |||
| 447 | ___ | ||
| 448 | $output=shift and open STDOUT,">$output"; | ||
| 449 | print $code; | ||
| 450 | close STDOUT; | ||
| 451 | |||
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl new file mode 100644 index 0000000000..d91586ee29 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-armv4.pl | |||
| @@ -0,0 +1,429 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # April 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+32 bytes shared table]. There is no | ||
| 15 | # experimental performance data available yet. The only approximation | ||
| 16 | # that can be made at this point is based on code size. Inner loop is | ||
| 17 | # 32 instructions long and on single-issue core should execute in <40 | ||
| 18 | # cycles. Having verified that gcc 3.4 didn't unroll corresponding | ||
| 19 | # loop, this assembler loop body was found to be ~3x smaller than | ||
| 20 | # compiler-generated one... | ||
| 21 | # | ||
| 22 | # July 2010 | ||
| 23 | # | ||
| 24 | # Rescheduling for dual-issue pipeline resulted in 8.5% improvement on | ||
| 25 | # Cortex A8 core and ~25 cycles per processed byte (which was observed | ||
| 26 | # to be ~3 times faster than gcc-generated code:-) | ||
| 27 | # | ||
| 28 | # February 2011 | ||
| 29 | # | ||
| 30 | # Profiler-assisted and platform-specific optimization resulted in 7% | ||
| 31 | # improvement on Cortex A8 core and ~23.5 cycles per byte. | ||
| 32 | # | ||
| 33 | # March 2011 | ||
| 34 | # | ||
| 35 | # Add NEON implementation featuring polynomial multiplication, i.e. no | ||
| 36 | # lookup tables involved. On Cortex A8 it was measured to process one | ||
| 37 | # byte in 15 cycles or 55% faster than integer-only code. | ||
| 38 | |||
| 39 | # ==================================================================== | ||
| 40 | # Note about "528B" variant. In ARM case it makes lesser sense to | ||
| 41 | # implement it for following reasons: | ||
| 42 | # | ||
| 43 | # - performance improvement won't be anywhere near 50%, because 128- | ||
| 44 | # bit shift operation is neatly fused with 128-bit xor here, and | ||
| 45 | # "538B" variant would eliminate only 4-5 instructions out of 32 | ||
| 46 | # in the inner loop (meaning that estimated improvement is ~15%); | ||
| 47 | # - ARM-based systems are often embedded ones and extra memory | ||
| 48 | # consumption might be unappreciated (for so little improvement); | ||
| 49 | # | ||
| 50 | # Byte order [in]dependence. ========================================= | ||
| 51 | # | ||
| 52 | # Caller is expected to maintain specific *dword* order in Htable, | ||
| 53 | # namely with *least* significant dword of 128-bit value at *lower* | ||
| 54 | # address. This differs completely from C code and has everything to | ||
| 55 | # do with ldm instruction and order in which dwords are "consumed" by | ||
| 56 | # algorithm. *Byte* order within these dwords in turn is whatever | ||
| 57 | # *native* byte order on current platform. See gcm128.c for working | ||
| 58 | # example... | ||
| 59 | |||
| 60 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 61 | open STDOUT,">$output"; | ||
| 62 | |||
| 63 | $Xi="r0"; # argument block | ||
| 64 | $Htbl="r1"; | ||
| 65 | $inp="r2"; | ||
| 66 | $len="r3"; | ||
| 67 | |||
| 68 | $Zll="r4"; # variables | ||
| 69 | $Zlh="r5"; | ||
| 70 | $Zhl="r6"; | ||
| 71 | $Zhh="r7"; | ||
| 72 | $Tll="r8"; | ||
| 73 | $Tlh="r9"; | ||
| 74 | $Thl="r10"; | ||
| 75 | $Thh="r11"; | ||
| 76 | $nlo="r12"; | ||
| 77 | ################# r13 is stack pointer | ||
| 78 | $nhi="r14"; | ||
| 79 | ################# r15 is program counter | ||
| 80 | |||
| 81 | $rem_4bit=$inp; # used in gcm_gmult_4bit | ||
| 82 | $cnt=$len; | ||
| 83 | |||
| 84 | sub Zsmash() { | ||
| 85 | my $i=12; | ||
| 86 | my @args=@_; | ||
| 87 | for ($Zll,$Zlh,$Zhl,$Zhh) { | ||
| 88 | $code.=<<___; | ||
| 89 | #if __ARM_ARCH__>=7 && defined(__ARMEL__) | ||
| 90 | rev $_,$_ | ||
| 91 | str $_,[$Xi,#$i] | ||
| 92 | #elif defined(__ARMEB__) | ||
| 93 | str $_,[$Xi,#$i] | ||
| 94 | #else | ||
| 95 | mov $Tlh,$_,lsr#8 | ||
| 96 | strb $_,[$Xi,#$i+3] | ||
| 97 | mov $Thl,$_,lsr#16 | ||
| 98 | strb $Tlh,[$Xi,#$i+2] | ||
| 99 | mov $Thh,$_,lsr#24 | ||
| 100 | strb $Thl,[$Xi,#$i+1] | ||
| 101 | strb $Thh,[$Xi,#$i] | ||
| 102 | #endif | ||
| 103 | ___ | ||
| 104 | $code.="\t".shift(@args)."\n"; | ||
| 105 | $i-=4; | ||
| 106 | } | ||
| 107 | } | ||
| 108 | |||
| 109 | $code=<<___; | ||
| 110 | #include "arm_arch.h" | ||
| 111 | |||
| 112 | .text | ||
| 113 | .code 32 | ||
| 114 | |||
| 115 | .type rem_4bit,%object | ||
| 116 | .align 5 | ||
| 117 | rem_4bit: | ||
| 118 | .short 0x0000,0x1C20,0x3840,0x2460 | ||
| 119 | .short 0x7080,0x6CA0,0x48C0,0x54E0 | ||
| 120 | .short 0xE100,0xFD20,0xD940,0xC560 | ||
| 121 | .short 0x9180,0x8DA0,0xA9C0,0xB5E0 | ||
| 122 | .size rem_4bit,.-rem_4bit | ||
| 123 | |||
| 124 | .type rem_4bit_get,%function | ||
| 125 | rem_4bit_get: | ||
| 126 | sub $rem_4bit,pc,#8 | ||
| 127 | sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit | ||
| 128 | b .Lrem_4bit_got | ||
| 129 | nop | ||
| 130 | .size rem_4bit_get,.-rem_4bit_get | ||
| 131 | |||
| 132 | .global gcm_ghash_4bit | ||
| 133 | .type gcm_ghash_4bit,%function | ||
| 134 | gcm_ghash_4bit: | ||
| 135 | sub r12,pc,#8 | ||
| 136 | add $len,$inp,$len @ $len to point at the end | ||
| 137 | stmdb sp!,{r3-r11,lr} @ save $len/end too | ||
| 138 | sub r12,r12,#48 @ &rem_4bit | ||
| 139 | |||
| 140 | ldmia r12,{r4-r11} @ copy rem_4bit ... | ||
| 141 | stmdb sp!,{r4-r11} @ ... to stack | ||
| 142 | |||
| 143 | ldrb $nlo,[$inp,#15] | ||
| 144 | ldrb $nhi,[$Xi,#15] | ||
| 145 | .Louter: | ||
| 146 | eor $nlo,$nlo,$nhi | ||
| 147 | and $nhi,$nlo,#0xf0 | ||
| 148 | and $nlo,$nlo,#0x0f | ||
| 149 | mov $cnt,#14 | ||
| 150 | |||
| 151 | add $Zhh,$Htbl,$nlo,lsl#4 | ||
| 152 | ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] | ||
| 153 | add $Thh,$Htbl,$nhi | ||
| 154 | ldrb $nlo,[$inp,#14] | ||
| 155 | |||
| 156 | and $nhi,$Zll,#0xf @ rem | ||
| 157 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
| 158 | add $nhi,$nhi,$nhi | ||
| 159 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 160 | ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] | ||
| 161 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 162 | ldrb $nhi,[$Xi,#14] | ||
| 163 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 164 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 165 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 166 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 167 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 168 | eor $nlo,$nlo,$nhi | ||
| 169 | and $nhi,$nlo,#0xf0 | ||
| 170 | and $nlo,$nlo,#0x0f | ||
| 171 | eor $Zhh,$Zhh,$Tll,lsl#16 | ||
| 172 | |||
| 173 | .Linner: | ||
| 174 | add $Thh,$Htbl,$nlo,lsl#4 | ||
| 175 | and $nlo,$Zll,#0xf @ rem | ||
| 176 | subs $cnt,$cnt,#1 | ||
| 177 | add $nlo,$nlo,$nlo | ||
| 178 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] | ||
| 179 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 180 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 181 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 182 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 183 | ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] | ||
| 184 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 185 | ldrplb $nlo,[$inp,$cnt] | ||
| 186 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 187 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 188 | |||
| 189 | add $Thh,$Htbl,$nhi | ||
| 190 | and $nhi,$Zll,#0xf @ rem | ||
| 191 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
| 192 | add $nhi,$nhi,$nhi | ||
| 193 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
| 194 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 195 | ldrplb $Tll,[$Xi,$cnt] | ||
| 196 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 197 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 198 | ldrh $Tlh,[sp,$nhi] | ||
| 199 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 200 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 201 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 202 | eorpl $nlo,$nlo,$Tll | ||
| 203 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 204 | andpl $nhi,$nlo,#0xf0 | ||
| 205 | andpl $nlo,$nlo,#0x0f | ||
| 206 | eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] | ||
| 207 | bpl .Linner | ||
| 208 | |||
| 209 | ldr $len,[sp,#32] @ re-load $len/end | ||
| 210 | add $inp,$inp,#16 | ||
| 211 | mov $nhi,$Zll | ||
| 212 | ___ | ||
| 213 | &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]"); | ||
| 214 | $code.=<<___; | ||
| 215 | bne .Louter | ||
| 216 | |||
| 217 | add sp,sp,#36 | ||
| 218 | #if __ARM_ARCH__>=5 | ||
| 219 | ldmia sp!,{r4-r11,pc} | ||
| 220 | #else | ||
| 221 | ldmia sp!,{r4-r11,lr} | ||
| 222 | tst lr,#1 | ||
| 223 | moveq pc,lr @ be binary compatible with V4, yet | ||
| 224 | bx lr @ interoperable with Thumb ISA:-) | ||
| 225 | #endif | ||
| 226 | .size gcm_ghash_4bit,.-gcm_ghash_4bit | ||
| 227 | |||
| 228 | .global gcm_gmult_4bit | ||
| 229 | .type gcm_gmult_4bit,%function | ||
| 230 | gcm_gmult_4bit: | ||
| 231 | stmdb sp!,{r4-r11,lr} | ||
| 232 | ldrb $nlo,[$Xi,#15] | ||
| 233 | b rem_4bit_get | ||
| 234 | .Lrem_4bit_got: | ||
| 235 | and $nhi,$nlo,#0xf0 | ||
| 236 | and $nlo,$nlo,#0x0f | ||
| 237 | mov $cnt,#14 | ||
| 238 | |||
| 239 | add $Zhh,$Htbl,$nlo,lsl#4 | ||
| 240 | ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] | ||
| 241 | ldrb $nlo,[$Xi,#14] | ||
| 242 | |||
| 243 | add $Thh,$Htbl,$nhi | ||
| 244 | and $nhi,$Zll,#0xf @ rem | ||
| 245 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
| 246 | add $nhi,$nhi,$nhi | ||
| 247 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 248 | ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] | ||
| 249 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 250 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 251 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 252 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 253 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 254 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 255 | and $nhi,$nlo,#0xf0 | ||
| 256 | eor $Zhh,$Zhh,$Tll,lsl#16 | ||
| 257 | and $nlo,$nlo,#0x0f | ||
| 258 | |||
| 259 | .Loop: | ||
| 260 | add $Thh,$Htbl,$nlo,lsl#4 | ||
| 261 | and $nlo,$Zll,#0xf @ rem | ||
| 262 | subs $cnt,$cnt,#1 | ||
| 263 | add $nlo,$nlo,$nlo | ||
| 264 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] | ||
| 265 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 266 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 267 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 268 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 269 | ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] | ||
| 270 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 271 | ldrplb $nlo,[$Xi,$cnt] | ||
| 272 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 273 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 274 | |||
| 275 | add $Thh,$Htbl,$nhi | ||
| 276 | and $nhi,$Zll,#0xf @ rem | ||
| 277 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
| 278 | add $nhi,$nhi,$nhi | ||
| 279 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
| 280 | eor $Zll,$Tll,$Zll,lsr#4 | ||
| 281 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
| 282 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
| 283 | ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] | ||
| 284 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
| 285 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
| 286 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
| 287 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
| 288 | andpl $nhi,$nlo,#0xf0 | ||
| 289 | andpl $nlo,$nlo,#0x0f | ||
| 290 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
| 291 | bpl .Loop | ||
| 292 | ___ | ||
| 293 | &Zsmash(); | ||
| 294 | $code.=<<___; | ||
| 295 | #if __ARM_ARCH__>=5 | ||
| 296 | ldmia sp!,{r4-r11,pc} | ||
| 297 | #else | ||
| 298 | ldmia sp!,{r4-r11,lr} | ||
| 299 | tst lr,#1 | ||
| 300 | moveq pc,lr @ be binary compatible with V4, yet | ||
| 301 | bx lr @ interoperable with Thumb ISA:-) | ||
| 302 | #endif | ||
| 303 | .size gcm_gmult_4bit,.-gcm_gmult_4bit | ||
| 304 | ___ | ||
| 305 | { | ||
| 306 | my $cnt=$Htbl; # $Htbl is used once in the very beginning | ||
| 307 | |||
| 308 | my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); | ||
| 309 | my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); | ||
| 310 | |||
| 311 | # Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit | ||
| 312 | # in Zo. Or should I say "top bit", because GHASH is specified in | ||
| 313 | # reverse bit order? Otherwise straightforward 128-bt H by one input | ||
| 314 | # byte multiplication and modulo-reduction, times 16. | ||
| 315 | |||
| 316 | sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } | ||
| 317 | sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } | ||
| 318 | sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } | ||
| 319 | |||
| 320 | $code.=<<___; | ||
| 321 | #if __ARM_ARCH__>=7 | ||
| 322 | .fpu neon | ||
| 323 | |||
| 324 | .global gcm_gmult_neon | ||
| 325 | .type gcm_gmult_neon,%function | ||
| 326 | .align 4 | ||
| 327 | gcm_gmult_neon: | ||
| 328 | sub $Htbl,#16 @ point at H in GCM128_CTX | ||
| 329 | vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi | ||
| 330 | vmov.i32 $mod,#0xe1 @ our irreducible polynomial | ||
| 331 | vld1.64 `&Dlo("$IN")`,[$Xi,:64]! | ||
| 332 | vshr.u64 $mod,#32 | ||
| 333 | vldmia $Htbl,{$Hhi-$Hlo} @ load H | ||
| 334 | veor $zero,$zero | ||
| 335 | #ifdef __ARMEL__ | ||
| 336 | vrev64.8 $IN,$IN | ||
| 337 | #endif | ||
| 338 | veor $Qpost,$Qpost | ||
| 339 | veor $R,$R | ||
| 340 | mov $cnt,#16 | ||
| 341 | veor $Z,$Z | ||
| 342 | mov $len,#16 | ||
| 343 | veor $Zo,$Zo | ||
| 344 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
| 345 | b .Linner_neon | ||
| 346 | .size gcm_gmult_neon,.-gcm_gmult_neon | ||
| 347 | |||
| 348 | .global gcm_ghash_neon | ||
| 349 | .type gcm_ghash_neon,%function | ||
| 350 | .align 4 | ||
| 351 | gcm_ghash_neon: | ||
| 352 | vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi | ||
| 353 | vmov.i32 $mod,#0xe1 @ our irreducible polynomial | ||
| 354 | vld1.64 `&Dlo("$Z")`,[$Xi,:64]! | ||
| 355 | vshr.u64 $mod,#32 | ||
| 356 | vldmia $Xi,{$Hhi-$Hlo} @ load H | ||
| 357 | veor $zero,$zero | ||
| 358 | nop | ||
| 359 | #ifdef __ARMEL__ | ||
| 360 | vrev64.8 $Z,$Z | ||
| 361 | #endif | ||
| 362 | .Louter_neon: | ||
| 363 | vld1.64 `&Dhi($IN)`,[$inp]! @ load inp | ||
| 364 | veor $Qpost,$Qpost | ||
| 365 | vld1.64 `&Dlo($IN)`,[$inp]! | ||
| 366 | veor $R,$R | ||
| 367 | mov $cnt,#16 | ||
| 368 | #ifdef __ARMEL__ | ||
| 369 | vrev64.8 $IN,$IN | ||
| 370 | #endif | ||
| 371 | veor $Zo,$Zo | ||
| 372 | veor $IN,$Z @ inp^=Xi | ||
| 373 | veor $Z,$Z | ||
| 374 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
| 375 | .Linner_neon: | ||
| 376 | subs $cnt,$cnt,#1 | ||
| 377 | vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i] | ||
| 378 | vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i] | ||
| 379 | vext.8 $IN,$zero,#1 @ IN>>=8 | ||
| 380 | |||
| 381 | veor $Z,$Qpost @ modulo-scheduled part | ||
| 382 | vshl.i64 `&Dlo("$R")`,#48 | ||
| 383 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
| 384 | veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")` | ||
| 385 | |||
| 386 | veor `&Dhi("$Z")`,`&Dlo("$R")` | ||
| 387 | vuzp.8 $Qlo,$Qhi | ||
| 388 | vsli.8 $Zo,$T,#1 @ compose the "carry" byte | ||
| 389 | vext.8 $Z,$zero,#1 @ Z>>=8 | ||
| 390 | |||
| 391 | vmull.p8 $R,$Zo,$mod @ "carry"·0xe1 | ||
| 392 | vshr.u8 $Zo,$T,#7 @ save Z's bottom bit | ||
| 393 | vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8 | ||
| 394 | veor $Z,$Qhi | ||
| 395 | bne .Linner_neon | ||
| 396 | |||
| 397 | veor $Z,$Qpost @ modulo-scheduled artefact | ||
| 398 | vshl.i64 `&Dlo("$R")`,#48 | ||
| 399 | veor `&Dhi("$Z")`,`&Dlo("$R")` | ||
| 400 | |||
| 401 | @ finalization, normalize Z:Zo | ||
| 402 | vand $Zo,$mod @ suffices to mask the bit | ||
| 403 | vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 | ||
| 404 | vshl.i64 $Z,#1 | ||
| 405 | subs $len,#16 | ||
| 406 | vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1 | ||
| 407 | bne .Louter_neon | ||
| 408 | |||
| 409 | #ifdef __ARMEL__ | ||
| 410 | vrev64.8 $Z,$Z | ||
| 411 | #endif | ||
| 412 | sub $Xi,#16 | ||
| 413 | vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi | ||
| 414 | vst1.64 `&Dlo("$Z")`,[$Xi,:64] | ||
| 415 | |||
| 416 | bx lr | ||
| 417 | .size gcm_ghash_neon,.-gcm_ghash_neon | ||
| 418 | #endif | ||
| 419 | ___ | ||
| 420 | } | ||
| 421 | $code.=<<___; | ||
| 422 | .asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 423 | .align 2 | ||
| 424 | ___ | ||
| 425 | |||
| 426 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 427 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
| 428 | print $code; | ||
| 429 | close STDOUT; # enforce flush | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl new file mode 100755 index 0000000000..0354c95444 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-ia64.pl | |||
| @@ -0,0 +1,463 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # March 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. Streamed | ||
| 15 | # GHASH performance was measured to be 6.67 cycles per processed byte | ||
| 16 | # on Itanium 2, which is >90% better than Microsoft compiler generated | ||
| 17 | # code. To anchor to something else sha1-ia64.pl module processes one | ||
| 18 | # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per | ||
| 19 | # byte. | ||
| 20 | |||
| 21 | # September 2010 | ||
| 22 | # | ||
| 23 | # It was originally thought that it makes lesser sense to implement | ||
| 24 | # "528B" variant on Itanium 2 for following reason. Because number of | ||
| 25 | # functional units is naturally limited, it appeared impossible to | ||
| 26 | # implement "528B" loop in 4 cycles, only in 5. This would mean that | ||
| 27 | # theoretically performance improvement couldn't be more than 20%. | ||
| 28 | # But occasionally you prove yourself wrong:-) I figured out a way to | ||
| 29 | # fold couple of instructions and having freed yet another instruction | ||
| 30 | # slot by unrolling the loop... Resulting performance is 4.45 cycles | ||
| 31 | # per processed byte and 50% better than "256B" version. On original | ||
| 32 | # Itanium performance should remain the same as the "256B" version, | ||
| 33 | # i.e. ~8.5 cycles. | ||
| 34 | |||
| 35 | $output=shift and (open STDOUT,">$output" or die "can't open $output: $!"); | ||
| 36 | |||
| 37 | if ($^O eq "hpux") { | ||
| 38 | $ADDP="addp4"; | ||
| 39 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } | ||
| 40 | } else { $ADDP="add"; } | ||
| 41 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); | ||
| 42 | $big_endian=0 if (/\-DL_ENDIAN/); } | ||
| 43 | if (!defined($big_endian)) | ||
| 44 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 45 | |||
| 46 | sub loop() { | ||
| 47 | my $label=shift; | ||
| 48 | my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp | ||
| 49 | |||
| 50 | # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e. | ||
| 51 | # in scalable manner;-) Naturally assuming data in L1 cache... | ||
| 52 | # Special note about 'dep' instruction, which is used to construct | ||
| 53 | # &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128 | ||
| 54 | # bytes boundary and lower 7 bits of its address are guaranteed to | ||
| 55 | # be zero. | ||
| 56 | $code.=<<___; | ||
| 57 | $label: | ||
| 58 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | ||
| 59 | (p19) dep rem=Zlo,rem_4bitp,3,4 } | ||
| 60 | { .mfi; (p19) xor Zhi=Zhi,Hhi | ||
| 61 | ($p17) xor xi[1]=xi[1],in[1] };; | ||
| 62 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | ||
| 63 | (p19) shrp Zlo=Zhi,Zlo,4 } | ||
| 64 | { .mfi; (p19) ld8 rem=[rem] | ||
| 65 | (p18) and Hi[1]=mask0xf0,xi[2] };; | ||
| 66 | { .mmi; ($p16) ld1 in[0]=[inp],-1 | ||
| 67 | (p18) xor Zlo=Zlo,Hlo | ||
| 68 | (p19) shr.u Zhi=Zhi,4 } | ||
| 69 | { .mib; (p19) xor Hhi=Hhi,rem | ||
| 70 | (p18) add Hi[1]=Htbl,Hi[1] };; | ||
| 71 | |||
| 72 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | ||
| 73 | (p18) dep rem=Zlo,rem_4bitp,3,4 } | ||
| 74 | { .mfi; (p17) shladd Hi[0]=xi[1],4,r0 | ||
| 75 | (p18) xor Zhi=Zhi,Hhi };; | ||
| 76 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | ||
| 77 | (p18) shrp Zlo=Zhi,Zlo,4 } | ||
| 78 | { .mfi; (p18) ld8 rem=[rem] | ||
| 79 | (p17) and Hi[0]=mask0xf0,Hi[0] };; | ||
| 80 | { .mmi; (p16) ld1 xi[0]=[Xi],-1 | ||
| 81 | (p18) xor Zlo=Zlo,Hlo | ||
| 82 | (p18) shr.u Zhi=Zhi,4 } | ||
| 83 | { .mib; (p18) xor Hhi=Hhi,rem | ||
| 84 | (p17) add Hi[0]=Htbl,Hi[0] | ||
| 85 | br.ctop.sptk $label };; | ||
| 86 | ___ | ||
| 87 | } | ||
| 88 | |||
| 89 | $code=<<___; | ||
| 90 | .explicit | ||
| 91 | .text | ||
| 92 | |||
| 93 | prevfs=r2; prevlc=r3; prevpr=r8; | ||
| 94 | mask0xf0=r21; | ||
| 95 | rem=r22; rem_4bitp=r23; | ||
| 96 | Xi=r24; Htbl=r25; | ||
| 97 | inp=r26; end=r27; | ||
| 98 | Hhi=r28; Hlo=r29; | ||
| 99 | Zhi=r30; Zlo=r31; | ||
| 100 | |||
| 101 | .align 128 | ||
| 102 | .skip 16 // aligns loop body | ||
| 103 | .global gcm_gmult_4bit# | ||
| 104 | .proc gcm_gmult_4bit# | ||
| 105 | gcm_gmult_4bit: | ||
| 106 | .prologue | ||
| 107 | { .mmi; .save ar.pfs,prevfs | ||
| 108 | alloc prevfs=ar.pfs,2,6,0,8 | ||
| 109 | $ADDP Xi=15,in0 // &Xi[15] | ||
| 110 | mov rem_4bitp=ip } | ||
| 111 | { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo | ||
| 112 | .save ar.lc,prevlc | ||
| 113 | mov prevlc=ar.lc | ||
| 114 | .save pr,prevpr | ||
| 115 | mov prevpr=pr };; | ||
| 116 | |||
| 117 | .body | ||
| 118 | .rotr in[3],xi[3],Hi[2] | ||
| 119 | |||
| 120 | { .mib; ld1 xi[2]=[Xi],-1 // Xi[15] | ||
| 121 | mov mask0xf0=0xf0 | ||
| 122 | brp.loop.imp .Loop1,.Lend1-16};; | ||
| 123 | { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] | ||
| 124 | };; | ||
| 125 | { .mii; shladd Hi[1]=xi[2],4,r0 | ||
| 126 | mov pr.rot=0x7<<16 | ||
| 127 | mov ar.lc=13 };; | ||
| 128 | { .mii; and Hi[1]=mask0xf0,Hi[1] | ||
| 129 | mov ar.ec=3 | ||
| 130 | xor Zlo=Zlo,Zlo };; | ||
| 131 | { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo | ||
| 132 | add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp | ||
| 133 | xor Zhi=Zhi,Zhi };; | ||
| 134 | ___ | ||
| 135 | &loop (".Loop1",1); | ||
| 136 | $code.=<<___; | ||
| 137 | .Lend1: | ||
| 138 | { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact | ||
| 139 | { .mib; mux1 Zlo=Zlo,\@rev };; | ||
| 140 | { .mib; mux1 Zhi=Zhi,\@rev };; | ||
| 141 | { .mmi; add Hlo=9,Xi;; // ;; is here to prevent | ||
| 142 | add Hhi=1,Xi };; // pipeline flush on Itanium | ||
| 143 | { .mib; st8 [Hlo]=Zlo | ||
| 144 | mov pr=prevpr,0x1ffff };; | ||
| 145 | { .mib; st8 [Hhi]=Zhi | ||
| 146 | mov ar.lc=prevlc | ||
| 147 | br.ret.sptk.many b0 };; | ||
| 148 | .endp gcm_gmult_4bit# | ||
| 149 | ___ | ||
| 150 | |||
| 151 | ###################################################################### | ||
| 152 | # "528B" (well, "512B" actualy) streamed GHASH | ||
| 153 | # | ||
| 154 | $Xip="in0"; | ||
| 155 | $Htbl="in1"; | ||
| 156 | $inp="in2"; | ||
| 157 | $len="in3"; | ||
| 158 | $rem_8bit="loc0"; | ||
| 159 | $mask0xff="loc1"; | ||
| 160 | ($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum"); | ||
| 161 | |||
| 162 | sub load_htable() { | ||
| 163 | for (my $i=0;$i<8;$i++) { | ||
| 164 | $code.=<<___; | ||
| 165 | { .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi | ||
| 166 | ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo | ||
| 167 | { .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi | ||
| 168 | ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo | ||
| 169 | ___ | ||
| 170 | $code.=shift if (($i+$#_)==7); | ||
| 171 | $code.="\t};;\n" | ||
| 172 | } | ||
| 173 | } | ||
| 174 | |||
| 175 | $code.=<<___; | ||
| 176 | prevsp=r3; | ||
| 177 | |||
| 178 | .align 32 | ||
| 179 | .skip 16 // aligns loop body | ||
| 180 | .global gcm_ghash_4bit# | ||
| 181 | .proc gcm_ghash_4bit# | ||
| 182 | gcm_ghash_4bit: | ||
| 183 | .prologue | ||
| 184 | { .mmi; .save ar.pfs,prevfs | ||
| 185 | alloc prevfs=ar.pfs,4,2,0,0 | ||
| 186 | .vframe prevsp | ||
| 187 | mov prevsp=sp | ||
| 188 | mov $rem_8bit=ip };; | ||
| 189 | .body | ||
| 190 | { .mfi; $ADDP r8=0+0,$Htbl | ||
| 191 | $ADDP r9=0+8,$Htbl } | ||
| 192 | { .mfi; $ADDP r10=128+0,$Htbl | ||
| 193 | $ADDP r11=128+8,$Htbl };; | ||
| 194 | ___ | ||
| 195 | &load_htable( | ||
| 196 | " $ADDP $Xip=15,$Xip", # &Xi[15] | ||
| 197 | " $ADDP $len=$len,$inp", # &inp[len] | ||
| 198 | " $ADDP $inp=15,$inp", # &inp[15] | ||
| 199 | " mov $mask0xff=0xff", | ||
| 200 | " add sp=-512,sp", | ||
| 201 | " andcm sp=sp,$mask0xff", # align stack frame | ||
| 202 | " add r14=0,sp", | ||
| 203 | " add r15=8,sp"); | ||
| 204 | $code.=<<___; | ||
| 205 | { .mmi; $sum 1<<1 // go big-endian | ||
| 206 | add r8=256+0,sp | ||
| 207 | add r9=256+8,sp } | ||
| 208 | { .mmi; add r10=256+128+0,sp | ||
| 209 | add r11=256+128+8,sp | ||
| 210 | add $len=-17,$len };; | ||
| 211 | ___ | ||
| 212 | for($i=0;$i<8;$i++) { # generate first half of Hshr4[] | ||
| 213 | my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1)); | ||
| 214 | $code.=<<___; | ||
| 215 | { .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo | ||
| 216 | st8 [r9]=$rhi,16 // Htable[$i].hi | ||
| 217 | shrp $rlo=$rhi,$rlo,4 }//;; | ||
| 218 | { .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo | ||
| 219 | stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi | ||
| 220 | shr.u $rhi=$rhi,4 };; | ||
| 221 | { .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4 | ||
| 222 | st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4 | ||
| 223 | ___ | ||
| 224 | } | ||
| 225 | $code.=<<___; | ||
| 226 | { .mmi; ld8 r16=[r8],16 // Htable[8].lo | ||
| 227 | ld8 r17=[r9],16 };; // Htable[8].hi | ||
| 228 | { .mmi; ld8 r18=[r8],16 // Htable[9].lo | ||
| 229 | ld8 r19=[r9],16 } // Htable[9].hi | ||
| 230 | { .mmi; rum 1<<5 // clear um.mfh | ||
| 231 | shrp r16=r17,r16,4 };; | ||
| 232 | ___ | ||
| 233 | for($i=0;$i<6;$i++) { # generate second half of Hshr4[] | ||
| 234 | $code.=<<___; | ||
| 235 | { .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo | ||
| 236 | ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi | ||
| 237 | shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; | ||
| 238 | { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 | ||
| 239 | st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 | ||
| 240 | shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } | ||
| 241 | ___ | ||
| 242 | } | ||
| 243 | $code.=<<___; | ||
| 244 | { .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; | ||
| 245 | { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 | ||
| 246 | st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 | ||
| 247 | shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } | ||
| 248 | { .mmi; add $Htbl=256,sp // &Htable[0] | ||
| 249 | add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit | ||
| 250 | shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };; | ||
| 251 | { .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4 | ||
| 252 | st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4 | ||
| 253 | ___ | ||
| 254 | |||
| 255 | $in="r15"; | ||
| 256 | @xi=("r16","r17"); | ||
| 257 | @rem=("r18","r19"); | ||
| 258 | ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25"); | ||
| 259 | ($Atbl,$Btbl)=("r26","r27"); | ||
| 260 | |||
| 261 | $code.=<<___; # (p16) | ||
| 262 | { .mmi; ld1 $in=[$inp],-1 //(p16) *inp-- | ||
| 263 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 264 | cmp.eq p0,p6=r0,r0 };; // clear p6 | ||
| 265 | ___ | ||
| 266 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 267 | |||
| 268 | $code.=<<___; # (p16),(p17) | ||
| 269 | { .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 270 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 271 | { .mii; ld1 $in=[$inp],-1 //(p16) *inp-- | ||
| 272 | dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo | ||
| 273 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 274 | .align 32 | ||
| 275 | .LOOP: | ||
| 276 | { .mmi; | ||
| 277 | (p6) st8 [$Xip]=$Zhi,13 | ||
| 278 | xor $Zlo=$Zlo,$Zlo | ||
| 279 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo | ||
| 280 | ___ | ||
| 281 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 282 | |||
| 283 | $code.=<<___; # (p16),(p17),(p18) | ||
| 284 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 285 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
| 286 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 287 | { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 288 | dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo | ||
| 289 | { .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
| 290 | xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo | ||
| 291 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
| 292 | ld1 $in=[$inp],-1 } //(p16) *inp-- | ||
| 293 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
| 294 | mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
| 295 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 296 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
| 297 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 298 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
| 299 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 300 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
| 301 | ___ | ||
| 302 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 303 | |||
| 304 | for ($i=1;$i<14;$i++) { | ||
| 305 | # Above and below fragments are derived from this one by removing | ||
| 306 | # unsuitable (p??) instructions. | ||
| 307 | $code.=<<___; # (p16),(p17),(p18),(p19) | ||
| 308 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 309 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
| 310 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
| 311 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 312 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 313 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 314 | { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 315 | ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 316 | dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo | ||
| 317 | { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
| 318 | xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo | ||
| 319 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 320 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
| 321 | ld1 $in=[$inp],-1 //(p16) *inp-- | ||
| 322 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
| 323 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
| 324 | xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
| 325 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 326 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
| 327 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 328 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
| 329 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 330 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 331 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
| 332 | ___ | ||
| 333 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 334 | } | ||
| 335 | |||
| 336 | $code.=<<___; # (p17),(p18),(p19) | ||
| 337 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 338 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
| 339 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
| 340 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 341 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 342 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 343 | { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 344 | ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 345 | dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo | ||
| 346 | { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
| 347 | xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo | ||
| 348 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 349 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
| 350 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
| 351 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
| 352 | xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
| 353 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 354 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
| 355 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
| 356 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 357 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 358 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
| 359 | ___ | ||
| 360 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 361 | |||
| 362 | $code.=<<___; # (p18),(p19) | ||
| 363 | { .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 364 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
| 365 | { .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 366 | xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 367 | { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 368 | xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo | ||
| 369 | { .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 370 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 371 | { .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi | ||
| 372 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
| 373 | { .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4 | ||
| 374 | xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi | ||
| 375 | { .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi | ||
| 376 | shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4) | ||
| 377 | { .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 378 | xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 379 | ___ | ||
| 380 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 381 | |||
| 382 | $code.=<<___; # (p19) | ||
| 383 | { .mmi; cmp.ltu p6,p0=$inp,$len | ||
| 384 | add $inp=32,$inp | ||
| 385 | shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4 | ||
| 386 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 387 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 388 | add $Xip=9,$Xip };; // &Xi.lo | ||
| 389 | { .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 390 | (p6) ld1 $in=[$inp],-1 //[p16] *inp-- | ||
| 391 | (p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14] | ||
| 392 | { .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 393 | (p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15] | ||
| 394 | { .mmi; st8 [$Xip]=$Zlo,-8 | ||
| 395 | (p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i] | ||
| 396 | shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48 | ||
| 397 | { .mmi; | ||
| 398 | (p6) ld1 $in=[$inp],-1 //[p16] *inp-- | ||
| 399 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 400 | (p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo | ||
| 401 | { .mib; | ||
| 402 | (p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0 | ||
| 403 | (p6) br.cond.dptk.many .LOOP };; | ||
| 404 | |||
| 405 | { .mib; st8 [$Xip]=$Zhi };; | ||
| 406 | { .mib; $rum 1<<1 // return to little-endian | ||
| 407 | .restore sp | ||
| 408 | mov sp=prevsp | ||
| 409 | br.ret.sptk.many b0 };; | ||
| 410 | .endp gcm_ghash_4bit# | ||
| 411 | ___ | ||
| 412 | $code.=<<___; | ||
| 413 | .align 128 | ||
| 414 | .type rem_4bit#,\@object | ||
| 415 | rem_4bit: | ||
| 416 | data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | ||
| 417 | data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | ||
| 418 | data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | ||
| 419 | data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | ||
| 420 | .size rem_4bit#,128 | ||
| 421 | .type rem_8bit#,\@object | ||
| 422 | rem_8bit: | ||
| 423 | data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E | ||
| 424 | data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E | ||
| 425 | data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E | ||
| 426 | data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E | ||
| 427 | data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E | ||
| 428 | data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E | ||
| 429 | data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E | ||
| 430 | data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E | ||
| 431 | data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE | ||
| 432 | data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE | ||
| 433 | data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE | ||
| 434 | data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE | ||
| 435 | data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E | ||
| 436 | data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E | ||
| 437 | data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE | ||
| 438 | data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE | ||
| 439 | data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E | ||
| 440 | data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E | ||
| 441 | data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E | ||
| 442 | data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E | ||
| 443 | data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E | ||
| 444 | data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E | ||
| 445 | data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E | ||
| 446 | data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E | ||
| 447 | data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE | ||
| 448 | data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE | ||
| 449 | data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE | ||
| 450 | data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE | ||
| 451 | data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E | ||
| 452 | data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E | ||
| 453 | data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE | ||
| 454 | data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE | ||
| 455 | .size rem_8bit#,512 | ||
| 456 | stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 457 | ___ | ||
| 458 | |||
| 459 | $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); | ||
| 460 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 461 | |||
| 462 | print $code; | ||
| 463 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl new file mode 100644 index 0000000000..8c7454ee93 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-parisc.pl | |||
| @@ -0,0 +1,730 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # April 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC | ||
| 15 | # it processes one byte in 19.6 cycles, which is more than twice as | ||
| 16 | # fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for | ||
| 17 | # 8 cycles, but measured performance on PA-8600 system is ~9 cycles per | ||
| 18 | # processed byte. This is ~2.2x faster than 64-bit code generated by | ||
| 19 | # vendor compiler (which used to be very hard to beat:-). | ||
| 20 | # | ||
| 21 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 22 | |||
| 23 | $flavour = shift; | ||
| 24 | $output = shift; | ||
| 25 | open STDOUT,">$output"; | ||
| 26 | |||
| 27 | if ($flavour =~ /64/) { | ||
| 28 | $LEVEL ="2.0W"; | ||
| 29 | $SIZE_T =8; | ||
| 30 | $FRAME_MARKER =80; | ||
| 31 | $SAVED_RP =16; | ||
| 32 | $PUSH ="std"; | ||
| 33 | $PUSHMA ="std,ma"; | ||
| 34 | $POP ="ldd"; | ||
| 35 | $POPMB ="ldd,mb"; | ||
| 36 | $NREGS =6; | ||
| 37 | } else { | ||
| 38 | $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0"; | ||
| 39 | $SIZE_T =4; | ||
| 40 | $FRAME_MARKER =48; | ||
| 41 | $SAVED_RP =20; | ||
| 42 | $PUSH ="stw"; | ||
| 43 | $PUSHMA ="stwm"; | ||
| 44 | $POP ="ldw"; | ||
| 45 | $POPMB ="ldwm"; | ||
| 46 | $NREGS =11; | ||
| 47 | } | ||
| 48 | |||
| 49 | $FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker | ||
| 50 | # [+ argument transfer] | ||
| 51 | |||
| 52 | ################# volatile registers | ||
| 53 | $Xi="%r26"; # argument block | ||
| 54 | $Htbl="%r25"; | ||
| 55 | $inp="%r24"; | ||
| 56 | $len="%r23"; | ||
| 57 | $Hhh=$Htbl; # variables | ||
| 58 | $Hll="%r22"; | ||
| 59 | $Zhh="%r21"; | ||
| 60 | $Zll="%r20"; | ||
| 61 | $cnt="%r19"; | ||
| 62 | $rem_4bit="%r28"; | ||
| 63 | $rem="%r29"; | ||
| 64 | $mask0xf0="%r31"; | ||
| 65 | |||
| 66 | ################# preserved registers | ||
| 67 | $Thh="%r1"; | ||
| 68 | $Tll="%r2"; | ||
| 69 | $nlo="%r3"; | ||
| 70 | $nhi="%r4"; | ||
| 71 | $byte="%r5"; | ||
| 72 | if ($SIZE_T==4) { | ||
| 73 | $Zhl="%r6"; | ||
| 74 | $Zlh="%r7"; | ||
| 75 | $Hhl="%r8"; | ||
| 76 | $Hlh="%r9"; | ||
| 77 | $Thl="%r10"; | ||
| 78 | $Tlh="%r11"; | ||
| 79 | } | ||
| 80 | $rem2="%r6"; # used in PA-RISC 2.0 code | ||
| 81 | |||
| 82 | $code.=<<___; | ||
| 83 | .LEVEL $LEVEL | ||
| 84 | .SPACE \$TEXT\$ | ||
| 85 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 86 | |||
| 87 | .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR | ||
| 88 | .ALIGN 64 | ||
| 89 | gcm_gmult_4bit | ||
| 90 | .PROC | ||
| 91 | .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS | ||
| 92 | .ENTRY | ||
| 93 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 94 | $PUSHMA %r3,$FRAME(%sp) | ||
| 95 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 96 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 97 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 98 | ___ | ||
| 99 | $code.=<<___ if ($SIZE_T==4); | ||
| 100 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 101 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 102 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 103 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 104 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 105 | ___ | ||
| 106 | $code.=<<___; | ||
| 107 | blr %r0,$rem_4bit | ||
| 108 | ldi 3,$rem | ||
| 109 | L\$pic_gmult | ||
| 110 | andcm $rem_4bit,$rem,$rem_4bit | ||
| 111 | addl $inp,$len,$len | ||
| 112 | ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit | ||
| 113 | ldi 0xf0,$mask0xf0 | ||
| 114 | ___ | ||
| 115 | $code.=<<___ if ($SIZE_T==4); | ||
| 116 | ldi 31,$rem | ||
| 117 | mtctl $rem,%cr11 | ||
| 118 | extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 | ||
| 119 | b L\$parisc1_gmult | ||
| 120 | nop | ||
| 121 | ___ | ||
| 122 | |||
| 123 | $code.=<<___; | ||
| 124 | ldb 15($Xi),$nlo | ||
| 125 | ldo 8($Htbl),$Hll | ||
| 126 | |||
| 127 | and $mask0xf0,$nlo,$nhi | ||
| 128 | depd,z $nlo,59,4,$nlo | ||
| 129 | |||
| 130 | ldd $nlo($Hll),$Zll | ||
| 131 | ldd $nlo($Hhh),$Zhh | ||
| 132 | |||
| 133 | depd,z $Zll,60,4,$rem | ||
| 134 | shrpd $Zhh,$Zll,4,$Zll | ||
| 135 | extrd,u $Zhh,59,60,$Zhh | ||
| 136 | ldb 14($Xi),$nlo | ||
| 137 | |||
| 138 | ldd $nhi($Hll),$Tll | ||
| 139 | ldd $nhi($Hhh),$Thh | ||
| 140 | and $mask0xf0,$nlo,$nhi | ||
| 141 | depd,z $nlo,59,4,$nlo | ||
| 142 | |||
| 143 | xor $Tll,$Zll,$Zll | ||
| 144 | xor $Thh,$Zhh,$Zhh | ||
| 145 | ldd $rem($rem_4bit),$rem | ||
| 146 | b L\$oop_gmult_pa2 | ||
| 147 | ldi 13,$cnt | ||
| 148 | |||
| 149 | .ALIGN 8 | ||
| 150 | L\$oop_gmult_pa2 | ||
| 151 | xor $rem,$Zhh,$Zhh ; moved here to work around gas bug | ||
| 152 | depd,z $Zll,60,4,$rem | ||
| 153 | |||
| 154 | shrpd $Zhh,$Zll,4,$Zll | ||
| 155 | extrd,u $Zhh,59,60,$Zhh | ||
| 156 | ldd $nlo($Hll),$Tll | ||
| 157 | ldd $nlo($Hhh),$Thh | ||
| 158 | |||
| 159 | xor $Tll,$Zll,$Zll | ||
| 160 | xor $Thh,$Zhh,$Zhh | ||
| 161 | ldd $rem($rem_4bit),$rem | ||
| 162 | |||
| 163 | xor $rem,$Zhh,$Zhh | ||
| 164 | depd,z $Zll,60,4,$rem | ||
| 165 | ldbx $cnt($Xi),$nlo | ||
| 166 | |||
| 167 | shrpd $Zhh,$Zll,4,$Zll | ||
| 168 | extrd,u $Zhh,59,60,$Zhh | ||
| 169 | ldd $nhi($Hll),$Tll | ||
| 170 | ldd $nhi($Hhh),$Thh | ||
| 171 | |||
| 172 | and $mask0xf0,$nlo,$nhi | ||
| 173 | depd,z $nlo,59,4,$nlo | ||
| 174 | ldd $rem($rem_4bit),$rem | ||
| 175 | |||
| 176 | xor $Tll,$Zll,$Zll | ||
| 177 | addib,uv -1,$cnt,L\$oop_gmult_pa2 | ||
| 178 | xor $Thh,$Zhh,$Zhh | ||
| 179 | |||
| 180 | xor $rem,$Zhh,$Zhh | ||
| 181 | depd,z $Zll,60,4,$rem | ||
| 182 | |||
| 183 | shrpd $Zhh,$Zll,4,$Zll | ||
| 184 | extrd,u $Zhh,59,60,$Zhh | ||
| 185 | ldd $nlo($Hll),$Tll | ||
| 186 | ldd $nlo($Hhh),$Thh | ||
| 187 | |||
| 188 | xor $Tll,$Zll,$Zll | ||
| 189 | xor $Thh,$Zhh,$Zhh | ||
| 190 | ldd $rem($rem_4bit),$rem | ||
| 191 | |||
| 192 | xor $rem,$Zhh,$Zhh | ||
| 193 | depd,z $Zll,60,4,$rem | ||
| 194 | |||
| 195 | shrpd $Zhh,$Zll,4,$Zll | ||
| 196 | extrd,u $Zhh,59,60,$Zhh | ||
| 197 | ldd $nhi($Hll),$Tll | ||
| 198 | ldd $nhi($Hhh),$Thh | ||
| 199 | |||
| 200 | xor $Tll,$Zll,$Zll | ||
| 201 | xor $Thh,$Zhh,$Zhh | ||
| 202 | ldd $rem($rem_4bit),$rem | ||
| 203 | |||
| 204 | xor $rem,$Zhh,$Zhh | ||
| 205 | std $Zll,8($Xi) | ||
| 206 | std $Zhh,0($Xi) | ||
| 207 | ___ | ||
| 208 | |||
| 209 | $code.=<<___ if ($SIZE_T==4); | ||
| 210 | b L\$done_gmult | ||
| 211 | nop | ||
| 212 | |||
| 213 | L\$parisc1_gmult | ||
| 214 | ldb 15($Xi),$nlo | ||
| 215 | ldo 12($Htbl),$Hll | ||
| 216 | ldo 8($Htbl),$Hlh | ||
| 217 | ldo 4($Htbl),$Hhl | ||
| 218 | |||
| 219 | and $mask0xf0,$nlo,$nhi | ||
| 220 | zdep $nlo,27,4,$nlo | ||
| 221 | |||
| 222 | ldwx $nlo($Hll),$Zll | ||
| 223 | ldwx $nlo($Hlh),$Zlh | ||
| 224 | ldwx $nlo($Hhl),$Zhl | ||
| 225 | ldwx $nlo($Hhh),$Zhh | ||
| 226 | zdep $Zll,28,4,$rem | ||
| 227 | ldb 14($Xi),$nlo | ||
| 228 | ldwx $rem($rem_4bit),$rem | ||
| 229 | shrpw $Zlh,$Zll,4,$Zll | ||
| 230 | ldwx $nhi($Hll),$Tll | ||
| 231 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 232 | ldwx $nhi($Hlh),$Tlh | ||
| 233 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 234 | ldwx $nhi($Hhl),$Thl | ||
| 235 | extru $Zhh,27,28,$Zhh | ||
| 236 | ldwx $nhi($Hhh),$Thh | ||
| 237 | xor $rem,$Zhh,$Zhh | ||
| 238 | and $mask0xf0,$nlo,$nhi | ||
| 239 | zdep $nlo,27,4,$nlo | ||
| 240 | |||
| 241 | xor $Tll,$Zll,$Zll | ||
| 242 | ldwx $nlo($Hll),$Tll | ||
| 243 | xor $Tlh,$Zlh,$Zlh | ||
| 244 | ldwx $nlo($Hlh),$Tlh | ||
| 245 | xor $Thl,$Zhl,$Zhl | ||
| 246 | b L\$oop_gmult_pa1 | ||
| 247 | ldi 13,$cnt | ||
| 248 | |||
| 249 | .ALIGN 8 | ||
| 250 | L\$oop_gmult_pa1 | ||
| 251 | zdep $Zll,28,4,$rem | ||
| 252 | ldwx $nlo($Hhl),$Thl | ||
| 253 | xor $Thh,$Zhh,$Zhh | ||
| 254 | ldwx $rem($rem_4bit),$rem | ||
| 255 | shrpw $Zlh,$Zll,4,$Zll | ||
| 256 | ldwx $nlo($Hhh),$Thh | ||
| 257 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 258 | ldbx $cnt($Xi),$nlo | ||
| 259 | xor $Tll,$Zll,$Zll | ||
| 260 | ldwx $nhi($Hll),$Tll | ||
| 261 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 262 | xor $Tlh,$Zlh,$Zlh | ||
| 263 | ldwx $nhi($Hlh),$Tlh | ||
| 264 | extru $Zhh,27,28,$Zhh | ||
| 265 | xor $Thl,$Zhl,$Zhl | ||
| 266 | ldwx $nhi($Hhl),$Thl | ||
| 267 | xor $rem,$Zhh,$Zhh | ||
| 268 | zdep $Zll,28,4,$rem | ||
| 269 | xor $Thh,$Zhh,$Zhh | ||
| 270 | ldwx $nhi($Hhh),$Thh | ||
| 271 | shrpw $Zlh,$Zll,4,$Zll | ||
| 272 | ldwx $rem($rem_4bit),$rem | ||
| 273 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 274 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 275 | and $mask0xf0,$nlo,$nhi | ||
| 276 | extru $Zhh,27,28,$Zhh | ||
| 277 | zdep $nlo,27,4,$nlo | ||
| 278 | xor $Tll,$Zll,$Zll | ||
| 279 | ldwx $nlo($Hll),$Tll | ||
| 280 | xor $Tlh,$Zlh,$Zlh | ||
| 281 | ldwx $nlo($Hlh),$Tlh | ||
| 282 | xor $rem,$Zhh,$Zhh | ||
| 283 | addib,uv -1,$cnt,L\$oop_gmult_pa1 | ||
| 284 | xor $Thl,$Zhl,$Zhl | ||
| 285 | |||
| 286 | zdep $Zll,28,4,$rem | ||
| 287 | ldwx $nlo($Hhl),$Thl | ||
| 288 | xor $Thh,$Zhh,$Zhh | ||
| 289 | ldwx $rem($rem_4bit),$rem | ||
| 290 | shrpw $Zlh,$Zll,4,$Zll | ||
| 291 | ldwx $nlo($Hhh),$Thh | ||
| 292 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 293 | xor $Tll,$Zll,$Zll | ||
| 294 | ldwx $nhi($Hll),$Tll | ||
| 295 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 296 | xor $Tlh,$Zlh,$Zlh | ||
| 297 | ldwx $nhi($Hlh),$Tlh | ||
| 298 | extru $Zhh,27,28,$Zhh | ||
| 299 | xor $rem,$Zhh,$Zhh | ||
| 300 | xor $Thl,$Zhl,$Zhl | ||
| 301 | ldwx $nhi($Hhl),$Thl | ||
| 302 | xor $Thh,$Zhh,$Zhh | ||
| 303 | ldwx $nhi($Hhh),$Thh | ||
| 304 | zdep $Zll,28,4,$rem | ||
| 305 | ldwx $rem($rem_4bit),$rem | ||
| 306 | shrpw $Zlh,$Zll,4,$Zll | ||
| 307 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 308 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 309 | extru $Zhh,27,28,$Zhh | ||
| 310 | xor $Tll,$Zll,$Zll | ||
| 311 | xor $Tlh,$Zlh,$Zlh | ||
| 312 | xor $rem,$Zhh,$Zhh | ||
| 313 | stw $Zll,12($Xi) | ||
| 314 | xor $Thl,$Zhl,$Zhl | ||
| 315 | stw $Zlh,8($Xi) | ||
| 316 | xor $Thh,$Zhh,$Zhh | ||
| 317 | stw $Zhl,4($Xi) | ||
| 318 | stw $Zhh,0($Xi) | ||
| 319 | ___ | ||
| 320 | $code.=<<___; | ||
| 321 | L\$done_gmult | ||
| 322 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 323 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 324 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 325 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 326 | ___ | ||
| 327 | $code.=<<___ if ($SIZE_T==4); | ||
| 328 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 329 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 330 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 331 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 332 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 333 | ___ | ||
| 334 | $code.=<<___; | ||
| 335 | bv (%r2) | ||
| 336 | .EXIT | ||
| 337 | $POPMB -$FRAME(%sp),%r3 | ||
| 338 | .PROCEND | ||
| 339 | |||
| 340 | .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | ||
| 341 | .ALIGN 64 | ||
| 342 | gcm_ghash_4bit | ||
| 343 | .PROC | ||
| 344 | .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11 | ||
| 345 | .ENTRY | ||
| 346 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 347 | $PUSHMA %r3,$FRAME(%sp) | ||
| 348 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 349 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 350 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 351 | ___ | ||
| 352 | $code.=<<___ if ($SIZE_T==4); | ||
| 353 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 354 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 355 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 356 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 357 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 358 | ___ | ||
| 359 | $code.=<<___; | ||
| 360 | blr %r0,$rem_4bit | ||
| 361 | ldi 3,$rem | ||
| 362 | L\$pic_ghash | ||
| 363 | andcm $rem_4bit,$rem,$rem_4bit | ||
| 364 | addl $inp,$len,$len | ||
| 365 | ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit | ||
| 366 | ldi 0xf0,$mask0xf0 | ||
| 367 | ___ | ||
| 368 | $code.=<<___ if ($SIZE_T==4); | ||
| 369 | ldi 31,$rem | ||
| 370 | mtctl $rem,%cr11 | ||
| 371 | extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 | ||
| 372 | b L\$parisc1_ghash | ||
| 373 | nop | ||
| 374 | ___ | ||
| 375 | |||
| 376 | $code.=<<___; | ||
| 377 | ldb 15($Xi),$nlo | ||
| 378 | ldo 8($Htbl),$Hll | ||
| 379 | |||
| 380 | L\$outer_ghash_pa2 | ||
| 381 | ldb 15($inp),$nhi | ||
| 382 | xor $nhi,$nlo,$nlo | ||
| 383 | and $mask0xf0,$nlo,$nhi | ||
| 384 | depd,z $nlo,59,4,$nlo | ||
| 385 | |||
| 386 | ldd $nlo($Hll),$Zll | ||
| 387 | ldd $nlo($Hhh),$Zhh | ||
| 388 | |||
| 389 | depd,z $Zll,60,4,$rem | ||
| 390 | shrpd $Zhh,$Zll,4,$Zll | ||
| 391 | extrd,u $Zhh,59,60,$Zhh | ||
| 392 | ldb 14($Xi),$nlo | ||
| 393 | ldb 14($inp),$byte | ||
| 394 | |||
| 395 | ldd $nhi($Hll),$Tll | ||
| 396 | ldd $nhi($Hhh),$Thh | ||
| 397 | xor $byte,$nlo,$nlo | ||
| 398 | and $mask0xf0,$nlo,$nhi | ||
| 399 | depd,z $nlo,59,4,$nlo | ||
| 400 | |||
| 401 | xor $Tll,$Zll,$Zll | ||
| 402 | xor $Thh,$Zhh,$Zhh | ||
| 403 | ldd $rem($rem_4bit),$rem | ||
| 404 | b L\$oop_ghash_pa2 | ||
| 405 | ldi 13,$cnt | ||
| 406 | |||
| 407 | .ALIGN 8 | ||
| 408 | L\$oop_ghash_pa2 | ||
| 409 | xor $rem,$Zhh,$Zhh ; moved here to work around gas bug | ||
| 410 | depd,z $Zll,60,4,$rem2 | ||
| 411 | |||
| 412 | shrpd $Zhh,$Zll,4,$Zll | ||
| 413 | extrd,u $Zhh,59,60,$Zhh | ||
| 414 | ldd $nlo($Hll),$Tll | ||
| 415 | ldd $nlo($Hhh),$Thh | ||
| 416 | |||
| 417 | xor $Tll,$Zll,$Zll | ||
| 418 | xor $Thh,$Zhh,$Zhh | ||
| 419 | ldbx $cnt($Xi),$nlo | ||
| 420 | ldbx $cnt($inp),$byte | ||
| 421 | |||
| 422 | depd,z $Zll,60,4,$rem | ||
| 423 | shrpd $Zhh,$Zll,4,$Zll | ||
| 424 | ldd $rem2($rem_4bit),$rem2 | ||
| 425 | |||
| 426 | xor $rem2,$Zhh,$Zhh | ||
| 427 | xor $byte,$nlo,$nlo | ||
| 428 | ldd $nhi($Hll),$Tll | ||
| 429 | ldd $nhi($Hhh),$Thh | ||
| 430 | |||
| 431 | and $mask0xf0,$nlo,$nhi | ||
| 432 | depd,z $nlo,59,4,$nlo | ||
| 433 | |||
| 434 | extrd,u $Zhh,59,60,$Zhh | ||
| 435 | xor $Tll,$Zll,$Zll | ||
| 436 | |||
| 437 | ldd $rem($rem_4bit),$rem | ||
| 438 | addib,uv -1,$cnt,L\$oop_ghash_pa2 | ||
| 439 | xor $Thh,$Zhh,$Zhh | ||
| 440 | |||
| 441 | xor $rem,$Zhh,$Zhh | ||
| 442 | depd,z $Zll,60,4,$rem2 | ||
| 443 | |||
| 444 | shrpd $Zhh,$Zll,4,$Zll | ||
| 445 | extrd,u $Zhh,59,60,$Zhh | ||
| 446 | ldd $nlo($Hll),$Tll | ||
| 447 | ldd $nlo($Hhh),$Thh | ||
| 448 | |||
| 449 | xor $Tll,$Zll,$Zll | ||
| 450 | xor $Thh,$Zhh,$Zhh | ||
| 451 | |||
| 452 | depd,z $Zll,60,4,$rem | ||
| 453 | shrpd $Zhh,$Zll,4,$Zll | ||
| 454 | ldd $rem2($rem_4bit),$rem2 | ||
| 455 | |||
| 456 | xor $rem2,$Zhh,$Zhh | ||
| 457 | ldd $nhi($Hll),$Tll | ||
| 458 | ldd $nhi($Hhh),$Thh | ||
| 459 | |||
| 460 | extrd,u $Zhh,59,60,$Zhh | ||
| 461 | xor $Tll,$Zll,$Zll | ||
| 462 | xor $Thh,$Zhh,$Zhh | ||
| 463 | ldd $rem($rem_4bit),$rem | ||
| 464 | |||
| 465 | xor $rem,$Zhh,$Zhh | ||
| 466 | std $Zll,8($Xi) | ||
| 467 | ldo 16($inp),$inp | ||
| 468 | std $Zhh,0($Xi) | ||
| 469 | cmpb,*<> $inp,$len,L\$outer_ghash_pa2 | ||
| 470 | copy $Zll,$nlo | ||
| 471 | ___ | ||
| 472 | |||
| 473 | $code.=<<___ if ($SIZE_T==4); | ||
| 474 | b L\$done_ghash | ||
| 475 | nop | ||
| 476 | |||
| 477 | L\$parisc1_ghash | ||
| 478 | ldb 15($Xi),$nlo | ||
| 479 | ldo 12($Htbl),$Hll | ||
| 480 | ldo 8($Htbl),$Hlh | ||
| 481 | ldo 4($Htbl),$Hhl | ||
| 482 | |||
| 483 | L\$outer_ghash_pa1 | ||
| 484 | ldb 15($inp),$byte | ||
| 485 | xor $byte,$nlo,$nlo | ||
| 486 | and $mask0xf0,$nlo,$nhi | ||
| 487 | zdep $nlo,27,4,$nlo | ||
| 488 | |||
| 489 | ldwx $nlo($Hll),$Zll | ||
| 490 | ldwx $nlo($Hlh),$Zlh | ||
| 491 | ldwx $nlo($Hhl),$Zhl | ||
| 492 | ldwx $nlo($Hhh),$Zhh | ||
| 493 | zdep $Zll,28,4,$rem | ||
| 494 | ldb 14($Xi),$nlo | ||
| 495 | ldb 14($inp),$byte | ||
| 496 | ldwx $rem($rem_4bit),$rem | ||
| 497 | shrpw $Zlh,$Zll,4,$Zll | ||
| 498 | ldwx $nhi($Hll),$Tll | ||
| 499 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 500 | ldwx $nhi($Hlh),$Tlh | ||
| 501 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 502 | ldwx $nhi($Hhl),$Thl | ||
| 503 | extru $Zhh,27,28,$Zhh | ||
| 504 | ldwx $nhi($Hhh),$Thh | ||
| 505 | xor $byte,$nlo,$nlo | ||
| 506 | xor $rem,$Zhh,$Zhh | ||
| 507 | and $mask0xf0,$nlo,$nhi | ||
| 508 | zdep $nlo,27,4,$nlo | ||
| 509 | |||
| 510 | xor $Tll,$Zll,$Zll | ||
| 511 | ldwx $nlo($Hll),$Tll | ||
| 512 | xor $Tlh,$Zlh,$Zlh | ||
| 513 | ldwx $nlo($Hlh),$Tlh | ||
| 514 | xor $Thl,$Zhl,$Zhl | ||
| 515 | b L\$oop_ghash_pa1 | ||
| 516 | ldi 13,$cnt | ||
| 517 | |||
| 518 | .ALIGN 8 | ||
| 519 | L\$oop_ghash_pa1 | ||
| 520 | zdep $Zll,28,4,$rem | ||
| 521 | ldwx $nlo($Hhl),$Thl | ||
| 522 | xor $Thh,$Zhh,$Zhh | ||
| 523 | ldwx $rem($rem_4bit),$rem | ||
| 524 | shrpw $Zlh,$Zll,4,$Zll | ||
| 525 | ldwx $nlo($Hhh),$Thh | ||
| 526 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 527 | ldbx $cnt($Xi),$nlo | ||
| 528 | xor $Tll,$Zll,$Zll | ||
| 529 | ldwx $nhi($Hll),$Tll | ||
| 530 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 531 | ldbx $cnt($inp),$byte | ||
| 532 | xor $Tlh,$Zlh,$Zlh | ||
| 533 | ldwx $nhi($Hlh),$Tlh | ||
| 534 | extru $Zhh,27,28,$Zhh | ||
| 535 | xor $Thl,$Zhl,$Zhl | ||
| 536 | ldwx $nhi($Hhl),$Thl | ||
| 537 | xor $rem,$Zhh,$Zhh | ||
| 538 | zdep $Zll,28,4,$rem | ||
| 539 | xor $Thh,$Zhh,$Zhh | ||
| 540 | ldwx $nhi($Hhh),$Thh | ||
| 541 | shrpw $Zlh,$Zll,4,$Zll | ||
| 542 | ldwx $rem($rem_4bit),$rem | ||
| 543 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 544 | xor $byte,$nlo,$nlo | ||
| 545 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 546 | and $mask0xf0,$nlo,$nhi | ||
| 547 | extru $Zhh,27,28,$Zhh | ||
| 548 | zdep $nlo,27,4,$nlo | ||
| 549 | xor $Tll,$Zll,$Zll | ||
| 550 | ldwx $nlo($Hll),$Tll | ||
| 551 | xor $Tlh,$Zlh,$Zlh | ||
| 552 | ldwx $nlo($Hlh),$Tlh | ||
| 553 | xor $rem,$Zhh,$Zhh | ||
| 554 | addib,uv -1,$cnt,L\$oop_ghash_pa1 | ||
| 555 | xor $Thl,$Zhl,$Zhl | ||
| 556 | |||
| 557 | zdep $Zll,28,4,$rem | ||
| 558 | ldwx $nlo($Hhl),$Thl | ||
| 559 | xor $Thh,$Zhh,$Zhh | ||
| 560 | ldwx $rem($rem_4bit),$rem | ||
| 561 | shrpw $Zlh,$Zll,4,$Zll | ||
| 562 | ldwx $nlo($Hhh),$Thh | ||
| 563 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 564 | xor $Tll,$Zll,$Zll | ||
| 565 | ldwx $nhi($Hll),$Tll | ||
| 566 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 567 | xor $Tlh,$Zlh,$Zlh | ||
| 568 | ldwx $nhi($Hlh),$Tlh | ||
| 569 | extru $Zhh,27,28,$Zhh | ||
| 570 | xor $rem,$Zhh,$Zhh | ||
| 571 | xor $Thl,$Zhl,$Zhl | ||
| 572 | ldwx $nhi($Hhl),$Thl | ||
| 573 | xor $Thh,$Zhh,$Zhh | ||
| 574 | ldwx $nhi($Hhh),$Thh | ||
| 575 | zdep $Zll,28,4,$rem | ||
| 576 | ldwx $rem($rem_4bit),$rem | ||
| 577 | shrpw $Zlh,$Zll,4,$Zll | ||
| 578 | shrpw $Zhl,$Zlh,4,$Zlh | ||
| 579 | shrpw $Zhh,$Zhl,4,$Zhl | ||
| 580 | extru $Zhh,27,28,$Zhh | ||
| 581 | xor $Tll,$Zll,$Zll | ||
| 582 | xor $Tlh,$Zlh,$Zlh | ||
| 583 | xor $rem,$Zhh,$Zhh | ||
| 584 | stw $Zll,12($Xi) | ||
| 585 | xor $Thl,$Zhl,$Zhl | ||
| 586 | stw $Zlh,8($Xi) | ||
| 587 | xor $Thh,$Zhh,$Zhh | ||
| 588 | stw $Zhl,4($Xi) | ||
| 589 | ldo 16($inp),$inp | ||
| 590 | stw $Zhh,0($Xi) | ||
| 591 | comb,<> $inp,$len,L\$outer_ghash_pa1 | ||
| 592 | copy $Zll,$nlo | ||
| 593 | ___ | ||
| 594 | $code.=<<___; | ||
| 595 | L\$done_ghash | ||
| 596 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 597 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 598 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 599 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 600 | ___ | ||
| 601 | $code.=<<___ if ($SIZE_T==4); | ||
| 602 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 603 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 604 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 605 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 606 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 607 | ___ | ||
| 608 | $code.=<<___; | ||
| 609 | bv (%r2) | ||
| 610 | .EXIT | ||
| 611 | $POPMB -$FRAME(%sp),%r3 | ||
| 612 | .PROCEND | ||
| 613 | |||
| 614 | .ALIGN 64 | ||
| 615 | L\$rem_4bit | ||
| 616 | .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 | ||
| 617 | .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 | ||
| 618 | .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 | ||
| 619 | .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 | ||
| 620 | .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>" | ||
| 621 | .ALIGN 64 | ||
| 622 | ___ | ||
| 623 | |||
| 624 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
| 625 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
| 626 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
| 627 | # directive... | ||
| 628 | |||
| 629 | my $ldd = sub { | ||
| 630 | my ($mod,$args) = @_; | ||
| 631 | my $orig = "ldd$mod\t$args"; | ||
| 632 | |||
| 633 | if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 | ||
| 634 | { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; | ||
| 635 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 636 | } | ||
| 637 | elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 | ||
| 638 | { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; | ||
| 639 | $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset | ||
| 640 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
| 641 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
| 642 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 643 | } | ||
| 644 | else { "\t".$orig; } | ||
| 645 | }; | ||
| 646 | |||
| 647 | my $std = sub { | ||
| 648 | my ($mod,$args) = @_; | ||
| 649 | my $orig = "std$mod\t$args"; | ||
| 650 | |||
| 651 | if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices | ||
| 652 | { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); | ||
| 653 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 654 | } | ||
| 655 | else { "\t".$orig; } | ||
| 656 | }; | ||
| 657 | |||
| 658 | my $extrd = sub { | ||
| 659 | my ($mod,$args) = @_; | ||
| 660 | my $orig = "extrd$mod\t$args"; | ||
| 661 | |||
| 662 | # I only have ",u" completer, it's implicitly encoded... | ||
| 663 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 | ||
| 664 | { my $opcode=(0x36<<26)|($1<<21)|($4<<16); | ||
| 665 | my $len=32-$3; | ||
| 666 | $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos | ||
| 667 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
| 668 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 669 | } | ||
| 670 | elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 | ||
| 671 | { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); | ||
| 672 | my $len=32-$2; | ||
| 673 | $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len | ||
| 674 | $opcode |= (1<<13) if ($mod =~ /,\**=/); | ||
| 675 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 676 | } | ||
| 677 | else { "\t".$orig; } | ||
| 678 | }; | ||
| 679 | |||
| 680 | my $shrpd = sub { | ||
| 681 | my ($mod,$args) = @_; | ||
| 682 | my $orig = "shrpd$mod\t$args"; | ||
| 683 | |||
| 684 | if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 | ||
| 685 | { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; | ||
| 686 | my $cpos=63-$3; | ||
| 687 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa | ||
| 688 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 689 | } | ||
| 690 | elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 | ||
| 691 | { sprintf "\t.WORD\t0x%08x\t; %s", | ||
| 692 | (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; | ||
| 693 | } | ||
| 694 | else { "\t".$orig; } | ||
| 695 | }; | ||
| 696 | |||
| 697 | my $depd = sub { | ||
| 698 | my ($mod,$args) = @_; | ||
| 699 | my $orig = "depd$mod\t$args"; | ||
| 700 | |||
| 701 | # I only have ",z" completer, it's impicitly encoded... | ||
| 702 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16 | ||
| 703 | { my $opcode=(0x3c<<26)|($4<<21)|($1<<16); | ||
| 704 | my $cpos=63-$2; | ||
| 705 | my $len=32-$3; | ||
| 706 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos | ||
| 707 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
| 708 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 709 | } | ||
| 710 | else { "\t".$orig; } | ||
| 711 | }; | ||
| 712 | |||
| 713 | sub assemble { | ||
| 714 | my ($mnemonic,$mod,$args)=@_; | ||
| 715 | my $opcode = eval("\$$mnemonic"); | ||
| 716 | |||
| 717 | ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; | ||
| 718 | } | ||
| 719 | |||
| 720 | foreach (split("\n",$code)) { | ||
| 721 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 722 | if ($SIZE_T==4) { | ||
| 723 | s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e; | ||
| 724 | s/cmpb,\*/comb,/; | ||
| 725 | s/,\*/,/; | ||
| 726 | } | ||
| 727 | print $_,"\n"; | ||
| 728 | } | ||
| 729 | |||
| 730 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl new file mode 100644 index 0000000000..6a40d5d89c --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-s390x.pl | |||
| @@ -0,0 +1,262 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # September 2010. | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. Performance | ||
| 15 | # was measured to be ~18 cycles per processed byte on z10, which is | ||
| 16 | # almost 40% better than gcc-generated code. It should be noted that | ||
| 17 | # 18 cycles is worse result than expected: loop is scheduled for 12 | ||
| 18 | # and the result should be close to 12. In the lack of instruction- | ||
| 19 | # level profiling data it's impossible to tell why... | ||
| 20 | |||
| 21 | # November 2010. | ||
| 22 | # | ||
| 23 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 24 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 25 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 26 | # application context. The feature is not specific to any particular | ||
| 27 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 28 | # remains z/Architecture specific. On z990 it was measured to perform | ||
| 29 | # 2.8x better than 32-bit code generated by gcc 4.3. | ||
| 30 | |||
| 31 | # March 2011. | ||
| 32 | # | ||
| 33 | # Support for hardware KIMD-GHASH is verified to produce correct | ||
| 34 | # result and therefore is engaged. On z196 it was measured to process | ||
| 35 | # 8KB buffer ~7 faster than software implementation. It's not as | ||
| 36 | # impressive for smaller buffer sizes and for smallest 16-bytes buffer | ||
| 37 | # it's actually almost 2 times slower. Which is the reason why | ||
| 38 | # KIMD-GHASH is not used in gcm_gmult_4bit. | ||
| 39 | |||
| 40 | $flavour = shift; | ||
| 41 | |||
| 42 | if ($flavour =~ /3[12]/) { | ||
| 43 | $SIZE_T=4; | ||
| 44 | $g=""; | ||
| 45 | } else { | ||
| 46 | $SIZE_T=8; | ||
| 47 | $g="g"; | ||
| 48 | } | ||
| 49 | |||
| 50 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 51 | open STDOUT,">$output"; | ||
| 52 | |||
| 53 | $softonly=0; | ||
| 54 | |||
| 55 | $Zhi="%r0"; | ||
| 56 | $Zlo="%r1"; | ||
| 57 | |||
| 58 | $Xi="%r2"; # argument block | ||
| 59 | $Htbl="%r3"; | ||
| 60 | $inp="%r4"; | ||
| 61 | $len="%r5"; | ||
| 62 | |||
| 63 | $rem0="%r6"; # variables | ||
| 64 | $rem1="%r7"; | ||
| 65 | $nlo="%r8"; | ||
| 66 | $nhi="%r9"; | ||
| 67 | $xi="%r10"; | ||
| 68 | $cnt="%r11"; | ||
| 69 | $tmp="%r12"; | ||
| 70 | $x78="%r13"; | ||
| 71 | $rem_4bit="%r14"; | ||
| 72 | |||
| 73 | $sp="%r15"; | ||
| 74 | |||
| 75 | $code.=<<___; | ||
| 76 | .text | ||
| 77 | |||
| 78 | .globl gcm_gmult_4bit | ||
| 79 | .align 32 | ||
| 80 | gcm_gmult_4bit: | ||
| 81 | ___ | ||
| 82 | $code.=<<___ if(!$softonly && 0); # hardware is slow for single block... | ||
| 83 | larl %r1,OPENSSL_s390xcap_P | ||
| 84 | lg %r0,0(%r1) | ||
| 85 | tmhl %r0,0x4000 # check for message-security-assist | ||
| 86 | jz .Lsoft_gmult | ||
| 87 | lghi %r0,0 | ||
| 88 | la %r1,16($sp) | ||
| 89 | .long 0xb93e0004 # kimd %r0,%r4 | ||
| 90 | lg %r1,24($sp) | ||
| 91 | tmhh %r1,0x4000 # check for function 65 | ||
| 92 | jz .Lsoft_gmult | ||
| 93 | stg %r0,16($sp) # arrange 16 bytes of zero input | ||
| 94 | stg %r0,24($sp) | ||
| 95 | lghi %r0,65 # function 65 | ||
| 96 | la %r1,0($Xi) # H lies right after Xi in gcm128_context | ||
| 97 | la $inp,16($sp) | ||
| 98 | lghi $len,16 | ||
| 99 | .long 0xb93e0004 # kimd %r0,$inp | ||
| 100 | brc 1,.-4 # pay attention to "partial completion" | ||
| 101 | br %r14 | ||
| 102 | .align 32 | ||
| 103 | .Lsoft_gmult: | ||
| 104 | ___ | ||
| 105 | $code.=<<___; | ||
| 106 | stm${g} %r6,%r14,6*$SIZE_T($sp) | ||
| 107 | |||
| 108 | aghi $Xi,-1 | ||
| 109 | lghi $len,1 | ||
| 110 | lghi $x78,`0xf<<3` | ||
| 111 | larl $rem_4bit,rem_4bit | ||
| 112 | |||
| 113 | lg $Zlo,8+1($Xi) # Xi | ||
| 114 | j .Lgmult_shortcut | ||
| 115 | .type gcm_gmult_4bit,\@function | ||
| 116 | .size gcm_gmult_4bit,(.-gcm_gmult_4bit) | ||
| 117 | |||
| 118 | .globl gcm_ghash_4bit | ||
| 119 | .align 32 | ||
| 120 | gcm_ghash_4bit: | ||
| 121 | ___ | ||
| 122 | $code.=<<___ if(!$softonly); | ||
| 123 | larl %r1,OPENSSL_s390xcap_P | ||
| 124 | lg %r0,0(%r1) | ||
| 125 | tmhl %r0,0x4000 # check for message-security-assist | ||
| 126 | jz .Lsoft_ghash | ||
| 127 | lghi %r0,0 | ||
| 128 | la %r1,16($sp) | ||
| 129 | .long 0xb93e0004 # kimd %r0,%r4 | ||
| 130 | lg %r1,24($sp) | ||
| 131 | tmhh %r1,0x4000 # check for function 65 | ||
| 132 | jz .Lsoft_ghash | ||
| 133 | lghi %r0,65 # function 65 | ||
| 134 | la %r1,0($Xi) # H lies right after Xi in gcm128_context | ||
| 135 | .long 0xb93e0004 # kimd %r0,$inp | ||
| 136 | brc 1,.-4 # pay attention to "partial completion" | ||
| 137 | br %r14 | ||
| 138 | .align 32 | ||
| 139 | .Lsoft_ghash: | ||
| 140 | ___ | ||
| 141 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
| 142 | llgfr $len,$len | ||
| 143 | ___ | ||
| 144 | $code.=<<___; | ||
| 145 | stm${g} %r6,%r14,6*$SIZE_T($sp) | ||
| 146 | |||
| 147 | aghi $Xi,-1 | ||
| 148 | srlg $len,$len,4 | ||
| 149 | lghi $x78,`0xf<<3` | ||
| 150 | larl $rem_4bit,rem_4bit | ||
| 151 | |||
| 152 | lg $Zlo,8+1($Xi) # Xi | ||
| 153 | lg $Zhi,0+1($Xi) | ||
| 154 | lghi $tmp,0 | ||
| 155 | .Louter: | ||
| 156 | xg $Zhi,0($inp) # Xi ^= inp | ||
| 157 | xg $Zlo,8($inp) | ||
| 158 | xgr $Zhi,$tmp | ||
| 159 | stg $Zlo,8+1($Xi) | ||
| 160 | stg $Zhi,0+1($Xi) | ||
| 161 | |||
| 162 | .Lgmult_shortcut: | ||
| 163 | lghi $tmp,0xf0 | ||
| 164 | sllg $nlo,$Zlo,4 | ||
| 165 | srlg $xi,$Zlo,8 # extract second byte | ||
| 166 | ngr $nlo,$tmp | ||
| 167 | lgr $nhi,$Zlo | ||
| 168 | lghi $cnt,14 | ||
| 169 | ngr $nhi,$tmp | ||
| 170 | |||
| 171 | lg $Zlo,8($nlo,$Htbl) | ||
| 172 | lg $Zhi,0($nlo,$Htbl) | ||
| 173 | |||
| 174 | sllg $nlo,$xi,4 | ||
| 175 | sllg $rem0,$Zlo,3 | ||
| 176 | ngr $nlo,$tmp | ||
| 177 | ngr $rem0,$x78 | ||
| 178 | ngr $xi,$tmp | ||
| 179 | |||
| 180 | sllg $tmp,$Zhi,60 | ||
| 181 | srlg $Zlo,$Zlo,4 | ||
| 182 | srlg $Zhi,$Zhi,4 | ||
| 183 | xg $Zlo,8($nhi,$Htbl) | ||
| 184 | xg $Zhi,0($nhi,$Htbl) | ||
| 185 | lgr $nhi,$xi | ||
| 186 | sllg $rem1,$Zlo,3 | ||
| 187 | xgr $Zlo,$tmp | ||
| 188 | ngr $rem1,$x78 | ||
| 189 | j .Lghash_inner | ||
| 190 | .align 16 | ||
| 191 | .Lghash_inner: | ||
| 192 | srlg $Zlo,$Zlo,4 | ||
| 193 | sllg $tmp,$Zhi,60 | ||
| 194 | xg $Zlo,8($nlo,$Htbl) | ||
| 195 | srlg $Zhi,$Zhi,4 | ||
| 196 | llgc $xi,0($cnt,$Xi) | ||
| 197 | xg $Zhi,0($nlo,$Htbl) | ||
| 198 | sllg $nlo,$xi,4 | ||
| 199 | xg $Zhi,0($rem0,$rem_4bit) | ||
| 200 | nill $nlo,0xf0 | ||
| 201 | sllg $rem0,$Zlo,3 | ||
| 202 | xgr $Zlo,$tmp | ||
| 203 | ngr $rem0,$x78 | ||
| 204 | nill $xi,0xf0 | ||
| 205 | |||
| 206 | sllg $tmp,$Zhi,60 | ||
| 207 | srlg $Zlo,$Zlo,4 | ||
| 208 | srlg $Zhi,$Zhi,4 | ||
| 209 | xg $Zlo,8($nhi,$Htbl) | ||
| 210 | xg $Zhi,0($nhi,$Htbl) | ||
| 211 | lgr $nhi,$xi | ||
| 212 | xg $Zhi,0($rem1,$rem_4bit) | ||
| 213 | sllg $rem1,$Zlo,3 | ||
| 214 | xgr $Zlo,$tmp | ||
| 215 | ngr $rem1,$x78 | ||
| 216 | brct $cnt,.Lghash_inner | ||
| 217 | |||
| 218 | sllg $tmp,$Zhi,60 | ||
| 219 | srlg $Zlo,$Zlo,4 | ||
| 220 | srlg $Zhi,$Zhi,4 | ||
| 221 | xg $Zlo,8($nlo,$Htbl) | ||
| 222 | xg $Zhi,0($nlo,$Htbl) | ||
| 223 | sllg $xi,$Zlo,3 | ||
| 224 | xg $Zhi,0($rem0,$rem_4bit) | ||
| 225 | xgr $Zlo,$tmp | ||
| 226 | ngr $xi,$x78 | ||
| 227 | |||
| 228 | sllg $tmp,$Zhi,60 | ||
| 229 | srlg $Zlo,$Zlo,4 | ||
| 230 | srlg $Zhi,$Zhi,4 | ||
| 231 | xg $Zlo,8($nhi,$Htbl) | ||
| 232 | xg $Zhi,0($nhi,$Htbl) | ||
| 233 | xgr $Zlo,$tmp | ||
| 234 | xg $Zhi,0($rem1,$rem_4bit) | ||
| 235 | |||
| 236 | lg $tmp,0($xi,$rem_4bit) | ||
| 237 | la $inp,16($inp) | ||
| 238 | sllg $tmp,$tmp,4 # correct last rem_4bit[rem] | ||
| 239 | brctg $len,.Louter | ||
| 240 | |||
| 241 | xgr $Zhi,$tmp | ||
| 242 | stg $Zlo,8+1($Xi) | ||
| 243 | stg $Zhi,0+1($Xi) | ||
| 244 | lm${g} %r6,%r14,6*$SIZE_T($sp) | ||
| 245 | br %r14 | ||
| 246 | .type gcm_ghash_4bit,\@function | ||
| 247 | .size gcm_ghash_4bit,(.-gcm_ghash_4bit) | ||
| 248 | |||
| 249 | .align 64 | ||
| 250 | rem_4bit: | ||
| 251 | .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0 | ||
| 252 | .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0 | ||
| 253 | .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0 | ||
| 254 | .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0 | ||
| 255 | .type rem_4bit,\@object | ||
| 256 | .size rem_4bit,(.-rem_4bit) | ||
| 257 | .string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 258 | ___ | ||
| 259 | |||
| 260 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 261 | print $code; | ||
| 262 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl new file mode 100644 index 0000000000..70e7b044a3 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl | |||
| @@ -0,0 +1,330 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # March 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. Performance | ||
| 15 | # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU | ||
| 16 | # and are expressed in cycles per processed byte, less is better: | ||
| 17 | # | ||
| 18 | # gcc 3.3.x cc 5.2 this assembler | ||
| 19 | # | ||
| 20 | # 32-bit build 81.4 43.3 12.6 (+546%/+244%) | ||
| 21 | # 64-bit build 20.2 21.2 12.6 (+60%/+68%) | ||
| 22 | # | ||
| 23 | # Here is data collected on UltraSPARC T1 system running Linux: | ||
| 24 | # | ||
| 25 | # gcc 4.4.1 this assembler | ||
| 26 | # | ||
| 27 | # 32-bit build 566 50 (+1000%) | ||
| 28 | # 64-bit build 56 50 (+12%) | ||
| 29 | # | ||
| 30 | # I don't quite understand why difference between 32-bit and 64-bit | ||
| 31 | # compiler-generated code is so big. Compilers *were* instructed to | ||
| 32 | # generate code for UltraSPARC and should have used 64-bit registers | ||
| 33 | # for Z vector (see C code) even in 32-bit build... Oh well, it only | ||
| 34 | # means more impressive improvement coefficients for this assembler | ||
| 35 | # module;-) Loops are aggressively modulo-scheduled in respect to | ||
| 36 | # references to input data and Z.hi updates to achieve 12 cycles | ||
| 37 | # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6 | ||
| 38 | # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. | ||
| 39 | |||
| 40 | $bits=32; | ||
| 41 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | ||
| 42 | if ($bits==64) { $bias=2047; $frame=192; } | ||
| 43 | else { $bias=0; $frame=112; } | ||
| 44 | |||
| 45 | $output=shift; | ||
| 46 | open STDOUT,">$output"; | ||
| 47 | |||
| 48 | $Zhi="%o0"; # 64-bit values | ||
| 49 | $Zlo="%o1"; | ||
| 50 | $Thi="%o2"; | ||
| 51 | $Tlo="%o3"; | ||
| 52 | $rem="%o4"; | ||
| 53 | $tmp="%o5"; | ||
| 54 | |||
| 55 | $nhi="%l0"; # small values and pointers | ||
| 56 | $nlo="%l1"; | ||
| 57 | $xi0="%l2"; | ||
| 58 | $xi1="%l3"; | ||
| 59 | $rem_4bit="%l4"; | ||
| 60 | $remi="%l5"; | ||
| 61 | $Htblo="%l6"; | ||
| 62 | $cnt="%l7"; | ||
| 63 | |||
| 64 | $Xi="%i0"; # input argument block | ||
| 65 | $Htbl="%i1"; | ||
| 66 | $inp="%i2"; | ||
| 67 | $len="%i3"; | ||
| 68 | |||
| 69 | $code.=<<___; | ||
| 70 | .section ".text",#alloc,#execinstr | ||
| 71 | |||
| 72 | .align 64 | ||
| 73 | rem_4bit: | ||
| 74 | .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 | ||
| 75 | .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 | ||
| 76 | .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 | ||
| 77 | .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 | ||
| 78 | .type rem_4bit,#object | ||
| 79 | .size rem_4bit,(.-rem_4bit) | ||
| 80 | |||
| 81 | .globl gcm_ghash_4bit | ||
| 82 | .align 32 | ||
| 83 | gcm_ghash_4bit: | ||
| 84 | save %sp,-$frame,%sp | ||
| 85 | ldub [$inp+15],$nlo | ||
| 86 | ldub [$Xi+15],$xi0 | ||
| 87 | ldub [$Xi+14],$xi1 | ||
| 88 | add $len,$inp,$len | ||
| 89 | add $Htbl,8,$Htblo | ||
| 90 | |||
| 91 | 1: call .+8 | ||
| 92 | add %o7,rem_4bit-1b,$rem_4bit | ||
| 93 | |||
| 94 | .Louter: | ||
| 95 | xor $xi0,$nlo,$nlo | ||
| 96 | and $nlo,0xf0,$nhi | ||
| 97 | and $nlo,0x0f,$nlo | ||
| 98 | sll $nlo,4,$nlo | ||
| 99 | ldx [$Htblo+$nlo],$Zlo | ||
| 100 | ldx [$Htbl+$nlo],$Zhi | ||
| 101 | |||
| 102 | ldub [$inp+14],$nlo | ||
| 103 | |||
| 104 | ldx [$Htblo+$nhi],$Tlo | ||
| 105 | and $Zlo,0xf,$remi | ||
| 106 | ldx [$Htbl+$nhi],$Thi | ||
| 107 | sll $remi,3,$remi | ||
| 108 | ldx [$rem_4bit+$remi],$rem | ||
| 109 | srlx $Zlo,4,$Zlo | ||
| 110 | mov 13,$cnt | ||
| 111 | sllx $Zhi,60,$tmp | ||
| 112 | xor $Tlo,$Zlo,$Zlo | ||
| 113 | srlx $Zhi,4,$Zhi | ||
| 114 | xor $Zlo,$tmp,$Zlo | ||
| 115 | |||
| 116 | xor $xi1,$nlo,$nlo | ||
| 117 | and $Zlo,0xf,$remi | ||
| 118 | and $nlo,0xf0,$nhi | ||
| 119 | and $nlo,0x0f,$nlo | ||
| 120 | ba .Lghash_inner | ||
| 121 | sll $nlo,4,$nlo | ||
| 122 | .align 32 | ||
| 123 | .Lghash_inner: | ||
| 124 | ldx [$Htblo+$nlo],$Tlo | ||
| 125 | sll $remi,3,$remi | ||
| 126 | xor $Thi,$Zhi,$Zhi | ||
| 127 | ldx [$Htbl+$nlo],$Thi | ||
| 128 | srlx $Zlo,4,$Zlo | ||
| 129 | xor $rem,$Zhi,$Zhi | ||
| 130 | ldx [$rem_4bit+$remi],$rem | ||
| 131 | sllx $Zhi,60,$tmp | ||
| 132 | xor $Tlo,$Zlo,$Zlo | ||
| 133 | ldub [$inp+$cnt],$nlo | ||
| 134 | srlx $Zhi,4,$Zhi | ||
| 135 | xor $Zlo,$tmp,$Zlo | ||
| 136 | ldub [$Xi+$cnt],$xi1 | ||
| 137 | xor $Thi,$Zhi,$Zhi | ||
| 138 | and $Zlo,0xf,$remi | ||
| 139 | |||
| 140 | ldx [$Htblo+$nhi],$Tlo | ||
| 141 | sll $remi,3,$remi | ||
| 142 | xor $rem,$Zhi,$Zhi | ||
| 143 | ldx [$Htbl+$nhi],$Thi | ||
| 144 | srlx $Zlo,4,$Zlo | ||
| 145 | ldx [$rem_4bit+$remi],$rem | ||
| 146 | sllx $Zhi,60,$tmp | ||
| 147 | xor $xi1,$nlo,$nlo | ||
| 148 | srlx $Zhi,4,$Zhi | ||
| 149 | and $nlo,0xf0,$nhi | ||
| 150 | addcc $cnt,-1,$cnt | ||
| 151 | xor $Zlo,$tmp,$Zlo | ||
| 152 | and $nlo,0x0f,$nlo | ||
| 153 | xor $Tlo,$Zlo,$Zlo | ||
| 154 | sll $nlo,4,$nlo | ||
| 155 | blu .Lghash_inner | ||
| 156 | and $Zlo,0xf,$remi | ||
| 157 | |||
| 158 | ldx [$Htblo+$nlo],$Tlo | ||
| 159 | sll $remi,3,$remi | ||
| 160 | xor $Thi,$Zhi,$Zhi | ||
| 161 | ldx [$Htbl+$nlo],$Thi | ||
| 162 | srlx $Zlo,4,$Zlo | ||
| 163 | xor $rem,$Zhi,$Zhi | ||
| 164 | ldx [$rem_4bit+$remi],$rem | ||
| 165 | sllx $Zhi,60,$tmp | ||
| 166 | xor $Tlo,$Zlo,$Zlo | ||
| 167 | srlx $Zhi,4,$Zhi | ||
| 168 | xor $Zlo,$tmp,$Zlo | ||
| 169 | xor $Thi,$Zhi,$Zhi | ||
| 170 | |||
| 171 | add $inp,16,$inp | ||
| 172 | cmp $inp,$len | ||
| 173 | be,pn `$bits==64?"%xcc":"%icc"`,.Ldone | ||
| 174 | and $Zlo,0xf,$remi | ||
| 175 | |||
| 176 | ldx [$Htblo+$nhi],$Tlo | ||
| 177 | sll $remi,3,$remi | ||
| 178 | xor $rem,$Zhi,$Zhi | ||
| 179 | ldx [$Htbl+$nhi],$Thi | ||
| 180 | srlx $Zlo,4,$Zlo | ||
| 181 | ldx [$rem_4bit+$remi],$rem | ||
| 182 | sllx $Zhi,60,$tmp | ||
| 183 | xor $Tlo,$Zlo,$Zlo | ||
| 184 | ldub [$inp+15],$nlo | ||
| 185 | srlx $Zhi,4,$Zhi | ||
| 186 | xor $Zlo,$tmp,$Zlo | ||
| 187 | xor $Thi,$Zhi,$Zhi | ||
| 188 | stx $Zlo,[$Xi+8] | ||
| 189 | xor $rem,$Zhi,$Zhi | ||
| 190 | stx $Zhi,[$Xi] | ||
| 191 | srl $Zlo,8,$xi1 | ||
| 192 | and $Zlo,0xff,$xi0 | ||
| 193 | ba .Louter | ||
| 194 | and $xi1,0xff,$xi1 | ||
| 195 | .align 32 | ||
| 196 | .Ldone: | ||
| 197 | ldx [$Htblo+$nhi],$Tlo | ||
| 198 | sll $remi,3,$remi | ||
| 199 | xor $rem,$Zhi,$Zhi | ||
| 200 | ldx [$Htbl+$nhi],$Thi | ||
| 201 | srlx $Zlo,4,$Zlo | ||
| 202 | ldx [$rem_4bit+$remi],$rem | ||
| 203 | sllx $Zhi,60,$tmp | ||
| 204 | xor $Tlo,$Zlo,$Zlo | ||
| 205 | srlx $Zhi,4,$Zhi | ||
| 206 | xor $Zlo,$tmp,$Zlo | ||
| 207 | xor $Thi,$Zhi,$Zhi | ||
| 208 | stx $Zlo,[$Xi+8] | ||
| 209 | xor $rem,$Zhi,$Zhi | ||
| 210 | stx $Zhi,[$Xi] | ||
| 211 | |||
| 212 | ret | ||
| 213 | restore | ||
| 214 | .type gcm_ghash_4bit,#function | ||
| 215 | .size gcm_ghash_4bit,(.-gcm_ghash_4bit) | ||
| 216 | ___ | ||
| 217 | |||
| 218 | undef $inp; | ||
| 219 | undef $len; | ||
| 220 | |||
| 221 | $code.=<<___; | ||
| 222 | .globl gcm_gmult_4bit | ||
| 223 | .align 32 | ||
| 224 | gcm_gmult_4bit: | ||
| 225 | save %sp,-$frame,%sp | ||
| 226 | ldub [$Xi+15],$nlo | ||
| 227 | add $Htbl,8,$Htblo | ||
| 228 | |||
| 229 | 1: call .+8 | ||
| 230 | add %o7,rem_4bit-1b,$rem_4bit | ||
| 231 | |||
| 232 | and $nlo,0xf0,$nhi | ||
| 233 | and $nlo,0x0f,$nlo | ||
| 234 | sll $nlo,4,$nlo | ||
| 235 | ldx [$Htblo+$nlo],$Zlo | ||
| 236 | ldx [$Htbl+$nlo],$Zhi | ||
| 237 | |||
| 238 | ldub [$Xi+14],$nlo | ||
| 239 | |||
| 240 | ldx [$Htblo+$nhi],$Tlo | ||
| 241 | and $Zlo,0xf,$remi | ||
| 242 | ldx [$Htbl+$nhi],$Thi | ||
| 243 | sll $remi,3,$remi | ||
| 244 | ldx [$rem_4bit+$remi],$rem | ||
| 245 | srlx $Zlo,4,$Zlo | ||
| 246 | mov 13,$cnt | ||
| 247 | sllx $Zhi,60,$tmp | ||
| 248 | xor $Tlo,$Zlo,$Zlo | ||
| 249 | srlx $Zhi,4,$Zhi | ||
| 250 | xor $Zlo,$tmp,$Zlo | ||
| 251 | |||
| 252 | and $Zlo,0xf,$remi | ||
| 253 | and $nlo,0xf0,$nhi | ||
| 254 | and $nlo,0x0f,$nlo | ||
| 255 | ba .Lgmult_inner | ||
| 256 | sll $nlo,4,$nlo | ||
| 257 | .align 32 | ||
| 258 | .Lgmult_inner: | ||
| 259 | ldx [$Htblo+$nlo],$Tlo | ||
| 260 | sll $remi,3,$remi | ||
| 261 | xor $Thi,$Zhi,$Zhi | ||
| 262 | ldx [$Htbl+$nlo],$Thi | ||
| 263 | srlx $Zlo,4,$Zlo | ||
| 264 | xor $rem,$Zhi,$Zhi | ||
| 265 | ldx [$rem_4bit+$remi],$rem | ||
| 266 | sllx $Zhi,60,$tmp | ||
| 267 | xor $Tlo,$Zlo,$Zlo | ||
| 268 | ldub [$Xi+$cnt],$nlo | ||
| 269 | srlx $Zhi,4,$Zhi | ||
| 270 | xor $Zlo,$tmp,$Zlo | ||
| 271 | xor $Thi,$Zhi,$Zhi | ||
| 272 | and $Zlo,0xf,$remi | ||
| 273 | |||
| 274 | ldx [$Htblo+$nhi],$Tlo | ||
| 275 | sll $remi,3,$remi | ||
| 276 | xor $rem,$Zhi,$Zhi | ||
| 277 | ldx [$Htbl+$nhi],$Thi | ||
| 278 | srlx $Zlo,4,$Zlo | ||
| 279 | ldx [$rem_4bit+$remi],$rem | ||
| 280 | sllx $Zhi,60,$tmp | ||
| 281 | srlx $Zhi,4,$Zhi | ||
| 282 | and $nlo,0xf0,$nhi | ||
| 283 | addcc $cnt,-1,$cnt | ||
| 284 | xor $Zlo,$tmp,$Zlo | ||
| 285 | and $nlo,0x0f,$nlo | ||
| 286 | xor $Tlo,$Zlo,$Zlo | ||
| 287 | sll $nlo,4,$nlo | ||
| 288 | blu .Lgmult_inner | ||
| 289 | and $Zlo,0xf,$remi | ||
| 290 | |||
| 291 | ldx [$Htblo+$nlo],$Tlo | ||
| 292 | sll $remi,3,$remi | ||
| 293 | xor $Thi,$Zhi,$Zhi | ||
| 294 | ldx [$Htbl+$nlo],$Thi | ||
| 295 | srlx $Zlo,4,$Zlo | ||
| 296 | xor $rem,$Zhi,$Zhi | ||
| 297 | ldx [$rem_4bit+$remi],$rem | ||
| 298 | sllx $Zhi,60,$tmp | ||
| 299 | xor $Tlo,$Zlo,$Zlo | ||
| 300 | srlx $Zhi,4,$Zhi | ||
| 301 | xor $Zlo,$tmp,$Zlo | ||
| 302 | xor $Thi,$Zhi,$Zhi | ||
| 303 | and $Zlo,0xf,$remi | ||
| 304 | |||
| 305 | ldx [$Htblo+$nhi],$Tlo | ||
| 306 | sll $remi,3,$remi | ||
| 307 | xor $rem,$Zhi,$Zhi | ||
| 308 | ldx [$Htbl+$nhi],$Thi | ||
| 309 | srlx $Zlo,4,$Zlo | ||
| 310 | ldx [$rem_4bit+$remi],$rem | ||
| 311 | sllx $Zhi,60,$tmp | ||
| 312 | xor $Tlo,$Zlo,$Zlo | ||
| 313 | srlx $Zhi,4,$Zhi | ||
| 314 | xor $Zlo,$tmp,$Zlo | ||
| 315 | xor $Thi,$Zhi,$Zhi | ||
| 316 | stx $Zlo,[$Xi+8] | ||
| 317 | xor $rem,$Zhi,$Zhi | ||
| 318 | stx $Zhi,[$Xi] | ||
| 319 | |||
| 320 | ret | ||
| 321 | restore | ||
| 322 | .type gcm_gmult_4bit,#function | ||
| 323 | .size gcm_gmult_4bit,(.-gcm_gmult_4bit) | ||
| 324 | .asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 325 | .align 4 | ||
| 326 | ___ | ||
| 327 | |||
| 328 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 329 | print $code; | ||
| 330 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl new file mode 100644 index 0000000000..6b09669d47 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-x86.pl | |||
| @@ -0,0 +1,1342 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # March, May, June 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two | ||
| 15 | # code paths: vanilla x86 and vanilla MMX. Former will be executed on | ||
| 16 | # 486 and Pentium, latter on all others. MMX GHASH features so called | ||
| 17 | # "528B" variant of "4-bit" method utilizing additional 256+16 bytes | ||
| 18 | # of per-key storage [+512 bytes shared table]. Performance results | ||
| 19 | # are for streamed GHASH subroutine and are expressed in cycles per | ||
| 20 | # processed byte, less is better: | ||
| 21 | # | ||
| 22 | # gcc 2.95.3(*) MMX assembler x86 assembler | ||
| 23 | # | ||
| 24 | # Pentium 105/111(**) - 50 | ||
| 25 | # PIII 68 /75 12.2 24 | ||
| 26 | # P4 125/125 17.8 84(***) | ||
| 27 | # Opteron 66 /70 10.1 30 | ||
| 28 | # Core2 54 /67 8.4 18 | ||
| 29 | # | ||
| 30 | # (*) gcc 3.4.x was observed to generate few percent slower code, | ||
| 31 | # which is one of reasons why 2.95.3 results were chosen, | ||
| 32 | # another reason is lack of 3.4.x results for older CPUs; | ||
| 33 | # comparison with MMX results is not completely fair, because C | ||
| 34 | # results are for vanilla "256B" implementation, while | ||
| 35 | # assembler results are for "528B";-) | ||
| 36 | # (**) second number is result for code compiled with -fPIC flag, | ||
| 37 | # which is actually more relevant, because assembler code is | ||
| 38 | # position-independent; | ||
| 39 | # (***) see comment in non-MMX routine for further details; | ||
| 40 | # | ||
| 41 | # To summarize, it's >2-5 times faster than gcc-generated code. To | ||
| 42 | # anchor it to something else SHA1 assembler processes one byte in | ||
| 43 | # 11-13 cycles on contemporary x86 cores. As for choice of MMX in | ||
| 44 | # particular, see comment at the end of the file... | ||
| 45 | |||
| 46 | # May 2010 | ||
| 47 | # | ||
| 48 | # Add PCLMULQDQ version performing at 2.10 cycles per processed byte. | ||
| 49 | # The question is how close is it to theoretical limit? The pclmulqdq | ||
| 50 | # instruction latency appears to be 14 cycles and there can't be more | ||
| 51 | # than 2 of them executing at any given time. This means that single | ||
| 52 | # Karatsuba multiplication would take 28 cycles *plus* few cycles for | ||
| 53 | # pre- and post-processing. Then multiplication has to be followed by | ||
| 54 | # modulo-reduction. Given that aggregated reduction method [see | ||
| 55 | # "Carry-less Multiplication and Its Usage for Computing the GCM Mode" | ||
| 56 | # white paper by Intel] allows you to perform reduction only once in | ||
| 57 | # a while we can assume that asymptotic performance can be estimated | ||
| 58 | # as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction | ||
| 59 | # and Naggr is the aggregation factor. | ||
| 60 | # | ||
| 61 | # Before we proceed to this implementation let's have closer look at | ||
| 62 | # the best-performing code suggested by Intel in their white paper. | ||
| 63 | # By tracing inter-register dependencies Tmod is estimated as ~19 | ||
| 64 | # cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per | ||
| 65 | # processed byte. As implied, this is quite optimistic estimate, | ||
| 66 | # because it does not account for Karatsuba pre- and post-processing, | ||
| 67 | # which for a single multiplication is ~5 cycles. Unfortunately Intel | ||
| 68 | # does not provide performance data for GHASH alone. But benchmarking | ||
| 69 | # AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt | ||
| 70 | # alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that | ||
| 71 | # the result accounts even for pre-computing of degrees of the hash | ||
| 72 | # key H, but its portion is negligible at 16KB buffer size. | ||
| 73 | # | ||
| 74 | # Moving on to the implementation in question. Tmod is estimated as | ||
| 75 | # ~13 cycles and Naggr is 2, giving asymptotic performance of ... | ||
| 76 | # 2.16. How is it possible that measured performance is better than | ||
| 77 | # optimistic theoretical estimate? There is one thing Intel failed | ||
| 78 | # to recognize. By serializing GHASH with CTR in same subroutine | ||
| 79 | # former's performance is really limited to above (Tmul + Tmod/Naggr) | ||
| 80 | # equation. But if GHASH procedure is detached, the modulo-reduction | ||
| 81 | # can be interleaved with Naggr-1 multiplications at instruction level | ||
| 82 | # and under ideal conditions even disappear from the equation. So that | ||
| 83 | # optimistic theoretical estimate for this implementation is ... | ||
| 84 | # 28/16=1.75, and not 2.16. Well, it's probably way too optimistic, | ||
| 85 | # at least for such small Naggr. I'd argue that (28+Tproc/Naggr), | ||
| 86 | # where Tproc is time required for Karatsuba pre- and post-processing, | ||
| 87 | # is more realistic estimate. In this case it gives ... 1.91 cycles. | ||
| 88 | # Or in other words, depending on how well we can interleave reduction | ||
| 89 | # and one of the two multiplications the performance should be betwen | ||
| 90 | # 1.91 and 2.16. As already mentioned, this implementation processes | ||
| 91 | # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart | ||
| 92 | # - in 2.02. x86_64 performance is better, because larger register | ||
| 93 | # bank allows to interleave reduction and multiplication better. | ||
| 94 | # | ||
| 95 | # Does it make sense to increase Naggr? To start with it's virtually | ||
| 96 | # impossible in 32-bit mode, because of limited register bank | ||
| 97 | # capacity. Otherwise improvement has to be weighed agiainst slower | ||
| 98 | # setup, as well as code size and complexity increase. As even | ||
| 99 | # optimistic estimate doesn't promise 30% performance improvement, | ||
| 100 | # there are currently no plans to increase Naggr. | ||
| 101 | # | ||
| 102 | # Special thanks to David Woodhouse <dwmw2@infradead.org> for | ||
| 103 | # providing access to a Westmere-based system on behalf of Intel | ||
| 104 | # Open Source Technology Centre. | ||
| 105 | |||
| 106 | # January 2010 | ||
| 107 | # | ||
| 108 | # Tweaked to optimize transitions between integer and FP operations | ||
| 109 | # on same XMM register, PCLMULQDQ subroutine was measured to process | ||
| 110 | # one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. | ||
| 111 | # The minor regression on Westmere is outweighed by ~15% improvement | ||
| 112 | # on Sandy Bridge. Strangely enough attempt to modify 64-bit code in | ||
| 113 | # similar manner resulted in almost 20% degradation on Sandy Bridge, | ||
| 114 | # where original 64-bit code processes one byte in 1.95 cycles. | ||
| 115 | |||
| 116 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 117 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 118 | require "x86asm.pl"; | ||
| 119 | |||
| 120 | &asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); | ||
| 121 | |||
| 122 | $sse2=0; | ||
| 123 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
| 124 | |||
| 125 | ($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx"); | ||
| 126 | $inp = "edi"; | ||
| 127 | $Htbl = "esi"; | ||
| 128 | |||
| 129 | $unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse | ||
| 130 | # than unrolled, which has to be weighted against | ||
| 131 | # 2.5x x86-specific code size reduction. | ||
| 132 | |||
| 133 | sub x86_loop { | ||
| 134 | my $off = shift; | ||
| 135 | my $rem = "eax"; | ||
| 136 | |||
| 137 | &mov ($Zhh,&DWP(4,$Htbl,$Zll)); | ||
| 138 | &mov ($Zhl,&DWP(0,$Htbl,$Zll)); | ||
| 139 | &mov ($Zlh,&DWP(12,$Htbl,$Zll)); | ||
| 140 | &mov ($Zll,&DWP(8,$Htbl,$Zll)); | ||
| 141 | &xor ($rem,$rem); # avoid partial register stalls on PIII | ||
| 142 | |||
| 143 | # shrd practically kills P4, 2.5x deterioration, but P4 has | ||
| 144 | # MMX code-path to execute. shrd runs tad faster [than twice | ||
| 145 | # the shifts, move's and or's] on pre-MMX Pentium (as well as | ||
| 146 | # PIII and Core2), *but* minimizes code size, spares register | ||
| 147 | # and thus allows to fold the loop... | ||
| 148 | if (!$unroll) { | ||
| 149 | my $cnt = $inp; | ||
| 150 | &mov ($cnt,15); | ||
| 151 | &jmp (&label("x86_loop")); | ||
| 152 | &set_label("x86_loop",16); | ||
| 153 | for($i=1;$i<=2;$i++) { | ||
| 154 | &mov (&LB($rem),&LB($Zll)); | ||
| 155 | &shrd ($Zll,$Zlh,4); | ||
| 156 | &and (&LB($rem),0xf); | ||
| 157 | &shrd ($Zlh,$Zhl,4); | ||
| 158 | &shrd ($Zhl,$Zhh,4); | ||
| 159 | &shr ($Zhh,4); | ||
| 160 | &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); | ||
| 161 | |||
| 162 | &mov (&LB($rem),&BP($off,"esp",$cnt)); | ||
| 163 | if ($i&1) { | ||
| 164 | &and (&LB($rem),0xf0); | ||
| 165 | } else { | ||
| 166 | &shl (&LB($rem),4); | ||
| 167 | } | ||
| 168 | |||
| 169 | &xor ($Zll,&DWP(8,$Htbl,$rem)); | ||
| 170 | &xor ($Zlh,&DWP(12,$Htbl,$rem)); | ||
| 171 | &xor ($Zhl,&DWP(0,$Htbl,$rem)); | ||
| 172 | &xor ($Zhh,&DWP(4,$Htbl,$rem)); | ||
| 173 | |||
| 174 | if ($i&1) { | ||
| 175 | &dec ($cnt); | ||
| 176 | &js (&label("x86_break")); | ||
| 177 | } else { | ||
| 178 | &jmp (&label("x86_loop")); | ||
| 179 | } | ||
| 180 | } | ||
| 181 | &set_label("x86_break",16); | ||
| 182 | } else { | ||
| 183 | for($i=1;$i<32;$i++) { | ||
| 184 | &comment($i); | ||
| 185 | &mov (&LB($rem),&LB($Zll)); | ||
| 186 | &shrd ($Zll,$Zlh,4); | ||
| 187 | &and (&LB($rem),0xf); | ||
| 188 | &shrd ($Zlh,$Zhl,4); | ||
| 189 | &shrd ($Zhl,$Zhh,4); | ||
| 190 | &shr ($Zhh,4); | ||
| 191 | &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); | ||
| 192 | |||
| 193 | if ($i&1) { | ||
| 194 | &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); | ||
| 195 | &and (&LB($rem),0xf0); | ||
| 196 | } else { | ||
| 197 | &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); | ||
| 198 | &shl (&LB($rem),4); | ||
| 199 | } | ||
| 200 | |||
| 201 | &xor ($Zll,&DWP(8,$Htbl,$rem)); | ||
| 202 | &xor ($Zlh,&DWP(12,$Htbl,$rem)); | ||
| 203 | &xor ($Zhl,&DWP(0,$Htbl,$rem)); | ||
| 204 | &xor ($Zhh,&DWP(4,$Htbl,$rem)); | ||
| 205 | } | ||
| 206 | } | ||
| 207 | &bswap ($Zll); | ||
| 208 | &bswap ($Zlh); | ||
| 209 | &bswap ($Zhl); | ||
| 210 | if (!$x86only) { | ||
| 211 | &bswap ($Zhh); | ||
| 212 | } else { | ||
| 213 | &mov ("eax",$Zhh); | ||
| 214 | &bswap ("eax"); | ||
| 215 | &mov ($Zhh,"eax"); | ||
| 216 | } | ||
| 217 | } | ||
| 218 | |||
| 219 | if ($unroll) { | ||
| 220 | &function_begin_B("_x86_gmult_4bit_inner"); | ||
| 221 | &x86_loop(4); | ||
| 222 | &ret (); | ||
| 223 | &function_end_B("_x86_gmult_4bit_inner"); | ||
| 224 | } | ||
| 225 | |||
| 226 | sub deposit_rem_4bit { | ||
| 227 | my $bias = shift; | ||
| 228 | |||
| 229 | &mov (&DWP($bias+0, "esp"),0x0000<<16); | ||
| 230 | &mov (&DWP($bias+4, "esp"),0x1C20<<16); | ||
| 231 | &mov (&DWP($bias+8, "esp"),0x3840<<16); | ||
| 232 | &mov (&DWP($bias+12,"esp"),0x2460<<16); | ||
| 233 | &mov (&DWP($bias+16,"esp"),0x7080<<16); | ||
| 234 | &mov (&DWP($bias+20,"esp"),0x6CA0<<16); | ||
| 235 | &mov (&DWP($bias+24,"esp"),0x48C0<<16); | ||
| 236 | &mov (&DWP($bias+28,"esp"),0x54E0<<16); | ||
| 237 | &mov (&DWP($bias+32,"esp"),0xE100<<16); | ||
| 238 | &mov (&DWP($bias+36,"esp"),0xFD20<<16); | ||
| 239 | &mov (&DWP($bias+40,"esp"),0xD940<<16); | ||
| 240 | &mov (&DWP($bias+44,"esp"),0xC560<<16); | ||
| 241 | &mov (&DWP($bias+48,"esp"),0x9180<<16); | ||
| 242 | &mov (&DWP($bias+52,"esp"),0x8DA0<<16); | ||
| 243 | &mov (&DWP($bias+56,"esp"),0xA9C0<<16); | ||
| 244 | &mov (&DWP($bias+60,"esp"),0xB5E0<<16); | ||
| 245 | } | ||
| 246 | |||
| 247 | $suffix = $x86only ? "" : "_x86"; | ||
| 248 | |||
| 249 | &function_begin("gcm_gmult_4bit".$suffix); | ||
| 250 | &stack_push(16+4+1); # +1 for stack alignment | ||
| 251 | &mov ($inp,&wparam(0)); # load Xi | ||
| 252 | &mov ($Htbl,&wparam(1)); # load Htable | ||
| 253 | |||
| 254 | &mov ($Zhh,&DWP(0,$inp)); # load Xi[16] | ||
| 255 | &mov ($Zhl,&DWP(4,$inp)); | ||
| 256 | &mov ($Zlh,&DWP(8,$inp)); | ||
| 257 | &mov ($Zll,&DWP(12,$inp)); | ||
| 258 | |||
| 259 | &deposit_rem_4bit(16); | ||
| 260 | |||
| 261 | &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack | ||
| 262 | &mov (&DWP(4,"esp"),$Zhl); | ||
| 263 | &mov (&DWP(8,"esp"),$Zlh); | ||
| 264 | &mov (&DWP(12,"esp"),$Zll); | ||
| 265 | &shr ($Zll,20); | ||
| 266 | &and ($Zll,0xf0); | ||
| 267 | |||
| 268 | if ($unroll) { | ||
| 269 | &call ("_x86_gmult_4bit_inner"); | ||
| 270 | } else { | ||
| 271 | &x86_loop(0); | ||
| 272 | &mov ($inp,&wparam(0)); | ||
| 273 | } | ||
| 274 | |||
| 275 | &mov (&DWP(12,$inp),$Zll); | ||
| 276 | &mov (&DWP(8,$inp),$Zlh); | ||
| 277 | &mov (&DWP(4,$inp),$Zhl); | ||
| 278 | &mov (&DWP(0,$inp),$Zhh); | ||
| 279 | &stack_pop(16+4+1); | ||
| 280 | &function_end("gcm_gmult_4bit".$suffix); | ||
| 281 | |||
| 282 | &function_begin("gcm_ghash_4bit".$suffix); | ||
| 283 | &stack_push(16+4+1); # +1 for 64-bit alignment | ||
| 284 | &mov ($Zll,&wparam(0)); # load Xi | ||
| 285 | &mov ($Htbl,&wparam(1)); # load Htable | ||
| 286 | &mov ($inp,&wparam(2)); # load in | ||
| 287 | &mov ("ecx",&wparam(3)); # load len | ||
| 288 | &add ("ecx",$inp); | ||
| 289 | &mov (&wparam(3),"ecx"); | ||
| 290 | |||
| 291 | &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] | ||
| 292 | &mov ($Zhl,&DWP(4,$Zll)); | ||
| 293 | &mov ($Zlh,&DWP(8,$Zll)); | ||
| 294 | &mov ($Zll,&DWP(12,$Zll)); | ||
| 295 | |||
| 296 | &deposit_rem_4bit(16); | ||
| 297 | |||
| 298 | &set_label("x86_outer_loop",16); | ||
| 299 | &xor ($Zll,&DWP(12,$inp)); # xor with input | ||
| 300 | &xor ($Zlh,&DWP(8,$inp)); | ||
| 301 | &xor ($Zhl,&DWP(4,$inp)); | ||
| 302 | &xor ($Zhh,&DWP(0,$inp)); | ||
| 303 | &mov (&DWP(12,"esp"),$Zll); # dump it on stack | ||
| 304 | &mov (&DWP(8,"esp"),$Zlh); | ||
| 305 | &mov (&DWP(4,"esp"),$Zhl); | ||
| 306 | &mov (&DWP(0,"esp"),$Zhh); | ||
| 307 | |||
| 308 | &shr ($Zll,20); | ||
| 309 | &and ($Zll,0xf0); | ||
| 310 | |||
| 311 | if ($unroll) { | ||
| 312 | &call ("_x86_gmult_4bit_inner"); | ||
| 313 | } else { | ||
| 314 | &x86_loop(0); | ||
| 315 | &mov ($inp,&wparam(2)); | ||
| 316 | } | ||
| 317 | &lea ($inp,&DWP(16,$inp)); | ||
| 318 | &cmp ($inp,&wparam(3)); | ||
| 319 | &mov (&wparam(2),$inp) if (!$unroll); | ||
| 320 | &jb (&label("x86_outer_loop")); | ||
| 321 | |||
| 322 | &mov ($inp,&wparam(0)); # load Xi | ||
| 323 | &mov (&DWP(12,$inp),$Zll); | ||
| 324 | &mov (&DWP(8,$inp),$Zlh); | ||
| 325 | &mov (&DWP(4,$inp),$Zhl); | ||
| 326 | &mov (&DWP(0,$inp),$Zhh); | ||
| 327 | &stack_pop(16+4+1); | ||
| 328 | &function_end("gcm_ghash_4bit".$suffix); | ||
| 329 | |||
| 330 | if (!$x86only) {{{ | ||
| 331 | |||
| 332 | &static_label("rem_4bit"); | ||
| 333 | |||
| 334 | if (!$sse2) {{ # pure-MMX "May" version... | ||
| 335 | |||
| 336 | $S=12; # shift factor for rem_4bit | ||
| 337 | |||
| 338 | &function_begin_B("_mmx_gmult_4bit_inner"); | ||
| 339 | # MMX version performs 3.5 times better on P4 (see comment in non-MMX | ||
| 340 | # routine for further details), 100% better on Opteron, ~70% better | ||
| 341 | # on Core2 and PIII... In other words effort is considered to be well | ||
| 342 | # spent... Since initial release the loop was unrolled in order to | ||
| 343 | # "liberate" register previously used as loop counter. Instead it's | ||
| 344 | # used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'. | ||
| 345 | # The path involves move of Z.lo from MMX to integer register, | ||
| 346 | # effective address calculation and finally merge of value to Z.hi. | ||
| 347 | # Reference to rem_4bit is scheduled so late that I had to >>4 | ||
| 348 | # rem_4bit elements. This resulted in 20-45% procent improvement | ||
| 349 | # on contemporary µ-archs. | ||
| 350 | { | ||
| 351 | my $cnt; | ||
| 352 | my $rem_4bit = "eax"; | ||
| 353 | my @rem = ($Zhh,$Zll); | ||
| 354 | my $nhi = $Zhl; | ||
| 355 | my $nlo = $Zlh; | ||
| 356 | |||
| 357 | my ($Zlo,$Zhi) = ("mm0","mm1"); | ||
| 358 | my $tmp = "mm2"; | ||
| 359 | |||
| 360 | &xor ($nlo,$nlo); # avoid partial register stalls on PIII | ||
| 361 | &mov ($nhi,$Zll); | ||
| 362 | &mov (&LB($nlo),&LB($nhi)); | ||
| 363 | &shl (&LB($nlo),4); | ||
| 364 | &and ($nhi,0xf0); | ||
| 365 | &movq ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
| 366 | &movq ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
| 367 | &movd ($rem[0],$Zlo); | ||
| 368 | |||
| 369 | for ($cnt=28;$cnt>=-2;$cnt--) { | ||
| 370 | my $odd = $cnt&1; | ||
| 371 | my $nix = $odd ? $nlo : $nhi; | ||
| 372 | |||
| 373 | &shl (&LB($nlo),4) if ($odd); | ||
| 374 | &psrlq ($Zlo,4); | ||
| 375 | &movq ($tmp,$Zhi); | ||
| 376 | &psrlq ($Zhi,4); | ||
| 377 | &pxor ($Zlo,&QWP(8,$Htbl,$nix)); | ||
| 378 | &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0); | ||
| 379 | &psllq ($tmp,60); | ||
| 380 | &and ($nhi,0xf0) if ($odd); | ||
| 381 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28); | ||
| 382 | &and ($rem[0],0xf); | ||
| 383 | &pxor ($Zhi,&QWP(0,$Htbl,$nix)); | ||
| 384 | &mov ($nhi,$nlo) if (!$odd && $cnt>=0); | ||
| 385 | &movd ($rem[1],$Zlo); | ||
| 386 | &pxor ($Zlo,$tmp); | ||
| 387 | |||
| 388 | push (@rem,shift(@rem)); # "rotate" registers | ||
| 389 | } | ||
| 390 | |||
| 391 | &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem] | ||
| 392 | |||
| 393 | &psrlq ($Zlo,32); # lower part of Zlo is already there | ||
| 394 | &movd ($Zhl,$Zhi); | ||
| 395 | &psrlq ($Zhi,32); | ||
| 396 | &movd ($Zlh,$Zlo); | ||
| 397 | &movd ($Zhh,$Zhi); | ||
| 398 | &shl ($inp,4); # compensate for rem_4bit[i] being >>4 | ||
| 399 | |||
| 400 | &bswap ($Zll); | ||
| 401 | &bswap ($Zhl); | ||
| 402 | &bswap ($Zlh); | ||
| 403 | &xor ($Zhh,$inp); | ||
| 404 | &bswap ($Zhh); | ||
| 405 | |||
| 406 | &ret (); | ||
| 407 | } | ||
| 408 | &function_end_B("_mmx_gmult_4bit_inner"); | ||
| 409 | |||
| 410 | &function_begin("gcm_gmult_4bit_mmx"); | ||
| 411 | &mov ($inp,&wparam(0)); # load Xi | ||
| 412 | &mov ($Htbl,&wparam(1)); # load Htable | ||
| 413 | |||
| 414 | &call (&label("pic_point")); | ||
| 415 | &set_label("pic_point"); | ||
| 416 | &blindpop("eax"); | ||
| 417 | &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); | ||
| 418 | |||
| 419 | &movz ($Zll,&BP(15,$inp)); | ||
| 420 | |||
| 421 | &call ("_mmx_gmult_4bit_inner"); | ||
| 422 | |||
| 423 | &mov ($inp,&wparam(0)); # load Xi | ||
| 424 | &emms (); | ||
| 425 | &mov (&DWP(12,$inp),$Zll); | ||
| 426 | &mov (&DWP(4,$inp),$Zhl); | ||
| 427 | &mov (&DWP(8,$inp),$Zlh); | ||
| 428 | &mov (&DWP(0,$inp),$Zhh); | ||
| 429 | &function_end("gcm_gmult_4bit_mmx"); | ||
| 430 | |||
| 431 | # Streamed version performs 20% better on P4, 7% on Opteron, | ||
| 432 | # 10% on Core2 and PIII... | ||
| 433 | &function_begin("gcm_ghash_4bit_mmx"); | ||
| 434 | &mov ($Zhh,&wparam(0)); # load Xi | ||
| 435 | &mov ($Htbl,&wparam(1)); # load Htable | ||
| 436 | &mov ($inp,&wparam(2)); # load in | ||
| 437 | &mov ($Zlh,&wparam(3)); # load len | ||
| 438 | |||
| 439 | &call (&label("pic_point")); | ||
| 440 | &set_label("pic_point"); | ||
| 441 | &blindpop("eax"); | ||
| 442 | &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); | ||
| 443 | |||
| 444 | &add ($Zlh,$inp); | ||
| 445 | &mov (&wparam(3),$Zlh); # len to point at the end of input | ||
| 446 | &stack_push(4+1); # +1 for stack alignment | ||
| 447 | |||
| 448 | &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] | ||
| 449 | &mov ($Zhl,&DWP(4,$Zhh)); | ||
| 450 | &mov ($Zlh,&DWP(8,$Zhh)); | ||
| 451 | &mov ($Zhh,&DWP(0,$Zhh)); | ||
| 452 | &jmp (&label("mmx_outer_loop")); | ||
| 453 | |||
| 454 | &set_label("mmx_outer_loop",16); | ||
| 455 | &xor ($Zll,&DWP(12,$inp)); | ||
| 456 | &xor ($Zhl,&DWP(4,$inp)); | ||
| 457 | &xor ($Zlh,&DWP(8,$inp)); | ||
| 458 | &xor ($Zhh,&DWP(0,$inp)); | ||
| 459 | &mov (&wparam(2),$inp); | ||
| 460 | &mov (&DWP(12,"esp"),$Zll); | ||
| 461 | &mov (&DWP(4,"esp"),$Zhl); | ||
| 462 | &mov (&DWP(8,"esp"),$Zlh); | ||
| 463 | &mov (&DWP(0,"esp"),$Zhh); | ||
| 464 | |||
| 465 | &mov ($inp,"esp"); | ||
| 466 | &shr ($Zll,24); | ||
| 467 | |||
| 468 | &call ("_mmx_gmult_4bit_inner"); | ||
| 469 | |||
| 470 | &mov ($inp,&wparam(2)); | ||
| 471 | &lea ($inp,&DWP(16,$inp)); | ||
| 472 | &cmp ($inp,&wparam(3)); | ||
| 473 | &jb (&label("mmx_outer_loop")); | ||
| 474 | |||
| 475 | &mov ($inp,&wparam(0)); # load Xi | ||
| 476 | &emms (); | ||
| 477 | &mov (&DWP(12,$inp),$Zll); | ||
| 478 | &mov (&DWP(4,$inp),$Zhl); | ||
| 479 | &mov (&DWP(8,$inp),$Zlh); | ||
| 480 | &mov (&DWP(0,$inp),$Zhh); | ||
| 481 | |||
| 482 | &stack_pop(4+1); | ||
| 483 | &function_end("gcm_ghash_4bit_mmx"); | ||
| 484 | |||
| 485 | }} else {{ # "June" MMX version... | ||
| 486 | # ... has slower "April" gcm_gmult_4bit_mmx with folded | ||
| 487 | # loop. This is done to conserve code size... | ||
| 488 | $S=16; # shift factor for rem_4bit | ||
| 489 | |||
| 490 | sub mmx_loop() { | ||
| 491 | # MMX version performs 2.8 times better on P4 (see comment in non-MMX | ||
| 492 | # routine for further details), 40% better on Opteron and Core2, 50% | ||
| 493 | # better on PIII... In other words effort is considered to be well | ||
| 494 | # spent... | ||
| 495 | my $inp = shift; | ||
| 496 | my $rem_4bit = shift; | ||
| 497 | my $cnt = $Zhh; | ||
| 498 | my $nhi = $Zhl; | ||
| 499 | my $nlo = $Zlh; | ||
| 500 | my $rem = $Zll; | ||
| 501 | |||
| 502 | my ($Zlo,$Zhi) = ("mm0","mm1"); | ||
| 503 | my $tmp = "mm2"; | ||
| 504 | |||
| 505 | &xor ($nlo,$nlo); # avoid partial register stalls on PIII | ||
| 506 | &mov ($nhi,$Zll); | ||
| 507 | &mov (&LB($nlo),&LB($nhi)); | ||
| 508 | &mov ($cnt,14); | ||
| 509 | &shl (&LB($nlo),4); | ||
| 510 | &and ($nhi,0xf0); | ||
| 511 | &movq ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
| 512 | &movq ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
| 513 | &movd ($rem,$Zlo); | ||
| 514 | &jmp (&label("mmx_loop")); | ||
| 515 | |||
| 516 | &set_label("mmx_loop",16); | ||
| 517 | &psrlq ($Zlo,4); | ||
| 518 | &and ($rem,0xf); | ||
| 519 | &movq ($tmp,$Zhi); | ||
| 520 | &psrlq ($Zhi,4); | ||
| 521 | &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); | ||
| 522 | &mov (&LB($nlo),&BP(0,$inp,$cnt)); | ||
| 523 | &psllq ($tmp,60); | ||
| 524 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
| 525 | &dec ($cnt); | ||
| 526 | &movd ($rem,$Zlo); | ||
| 527 | &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); | ||
| 528 | &mov ($nhi,$nlo); | ||
| 529 | &pxor ($Zlo,$tmp); | ||
| 530 | &js (&label("mmx_break")); | ||
| 531 | |||
| 532 | &shl (&LB($nlo),4); | ||
| 533 | &and ($rem,0xf); | ||
| 534 | &psrlq ($Zlo,4); | ||
| 535 | &and ($nhi,0xf0); | ||
| 536 | &movq ($tmp,$Zhi); | ||
| 537 | &psrlq ($Zhi,4); | ||
| 538 | &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
| 539 | &psllq ($tmp,60); | ||
| 540 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
| 541 | &movd ($rem,$Zlo); | ||
| 542 | &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
| 543 | &pxor ($Zlo,$tmp); | ||
| 544 | &jmp (&label("mmx_loop")); | ||
| 545 | |||
| 546 | &set_label("mmx_break",16); | ||
| 547 | &shl (&LB($nlo),4); | ||
| 548 | &and ($rem,0xf); | ||
| 549 | &psrlq ($Zlo,4); | ||
| 550 | &and ($nhi,0xf0); | ||
| 551 | &movq ($tmp,$Zhi); | ||
| 552 | &psrlq ($Zhi,4); | ||
| 553 | &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
| 554 | &psllq ($tmp,60); | ||
| 555 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
| 556 | &movd ($rem,$Zlo); | ||
| 557 | &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
| 558 | &pxor ($Zlo,$tmp); | ||
| 559 | |||
| 560 | &psrlq ($Zlo,4); | ||
| 561 | &and ($rem,0xf); | ||
| 562 | &movq ($tmp,$Zhi); | ||
| 563 | &psrlq ($Zhi,4); | ||
| 564 | &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); | ||
| 565 | &psllq ($tmp,60); | ||
| 566 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
| 567 | &movd ($rem,$Zlo); | ||
| 568 | &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); | ||
| 569 | &pxor ($Zlo,$tmp); | ||
| 570 | |||
| 571 | &psrlq ($Zlo,32); # lower part of Zlo is already there | ||
| 572 | &movd ($Zhl,$Zhi); | ||
| 573 | &psrlq ($Zhi,32); | ||
| 574 | &movd ($Zlh,$Zlo); | ||
| 575 | &movd ($Zhh,$Zhi); | ||
| 576 | |||
| 577 | &bswap ($Zll); | ||
| 578 | &bswap ($Zhl); | ||
| 579 | &bswap ($Zlh); | ||
| 580 | &bswap ($Zhh); | ||
| 581 | } | ||
| 582 | |||
| 583 | &function_begin("gcm_gmult_4bit_mmx"); | ||
| 584 | &mov ($inp,&wparam(0)); # load Xi | ||
| 585 | &mov ($Htbl,&wparam(1)); # load Htable | ||
| 586 | |||
| 587 | &call (&label("pic_point")); | ||
| 588 | &set_label("pic_point"); | ||
| 589 | &blindpop("eax"); | ||
| 590 | &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); | ||
| 591 | |||
| 592 | &movz ($Zll,&BP(15,$inp)); | ||
| 593 | |||
| 594 | &mmx_loop($inp,"eax"); | ||
| 595 | |||
| 596 | &emms (); | ||
| 597 | &mov (&DWP(12,$inp),$Zll); | ||
| 598 | &mov (&DWP(4,$inp),$Zhl); | ||
| 599 | &mov (&DWP(8,$inp),$Zlh); | ||
| 600 | &mov (&DWP(0,$inp),$Zhh); | ||
| 601 | &function_end("gcm_gmult_4bit_mmx"); | ||
| 602 | |||
| 603 | ###################################################################### | ||
| 604 | # Below subroutine is "528B" variant of "4-bit" GCM GHASH function | ||
| 605 | # (see gcm128.c for details). It provides further 20-40% performance | ||
| 606 | # improvement over above mentioned "May" version. | ||
| 607 | |||
| 608 | &static_label("rem_8bit"); | ||
| 609 | |||
| 610 | &function_begin("gcm_ghash_4bit_mmx"); | ||
| 611 | { my ($Zlo,$Zhi) = ("mm7","mm6"); | ||
| 612 | my $rem_8bit = "esi"; | ||
| 613 | my $Htbl = "ebx"; | ||
| 614 | |||
| 615 | # parameter block | ||
| 616 | &mov ("eax",&wparam(0)); # Xi | ||
| 617 | &mov ("ebx",&wparam(1)); # Htable | ||
| 618 | &mov ("ecx",&wparam(2)); # inp | ||
| 619 | &mov ("edx",&wparam(3)); # len | ||
| 620 | &mov ("ebp","esp"); # original %esp | ||
| 621 | &call (&label("pic_point")); | ||
| 622 | &set_label ("pic_point"); | ||
| 623 | &blindpop ($rem_8bit); | ||
| 624 | &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit)); | ||
| 625 | |||
| 626 | &sub ("esp",512+16+16); # allocate stack frame... | ||
| 627 | &and ("esp",-64); # ...and align it | ||
| 628 | &sub ("esp",16); # place for (u8)(H[]<<4) | ||
| 629 | |||
| 630 | &add ("edx","ecx"); # pointer to the end of input | ||
| 631 | &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi | ||
| 632 | &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len | ||
| 633 | &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp | ||
| 634 | |||
| 635 | { my @lo = ("mm0","mm1","mm2"); | ||
| 636 | my @hi = ("mm3","mm4","mm5"); | ||
| 637 | my @tmp = ("mm6","mm7"); | ||
| 638 | my $off1=0,$off2=0,$i; | ||
| 639 | |||
| 640 | &add ($Htbl,128); # optimize for size | ||
| 641 | &lea ("edi",&DWP(16+128,"esp")); | ||
| 642 | &lea ("ebp",&DWP(16+256+128,"esp")); | ||
| 643 | |||
| 644 | # decompose Htable (low and high parts are kept separately), | ||
| 645 | # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack... | ||
| 646 | for ($i=0;$i<18;$i++) { | ||
| 647 | |||
| 648 | &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16); | ||
| 649 | &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16); | ||
| 650 | &psllq ($tmp[1],60) if ($i>1); | ||
| 651 | &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16); | ||
| 652 | &por ($lo[2],$tmp[1]) if ($i>1); | ||
| 653 | &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17); | ||
| 654 | &psrlq ($lo[1],4) if ($i>0 && $i<17); | ||
| 655 | &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17); | ||
| 656 | &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17); | ||
| 657 | &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1); | ||
| 658 | &psrlq ($hi[1],4) if ($i>0 && $i<17); | ||
| 659 | &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1); | ||
| 660 | &shl ("edx",4) if ($i<16); | ||
| 661 | &mov (&BP($i,"esp"),&LB("edx")) if ($i<16); | ||
| 662 | |||
| 663 | unshift (@lo,pop(@lo)); # "rotate" registers | ||
| 664 | unshift (@hi,pop(@hi)); | ||
| 665 | unshift (@tmp,pop(@tmp)); | ||
| 666 | $off1 += 8 if ($i>0); | ||
| 667 | $off2 += 8 if ($i>1); | ||
| 668 | } | ||
| 669 | } | ||
| 670 | |||
| 671 | &movq ($Zhi,&QWP(0,"eax")); | ||
| 672 | &mov ("ebx",&DWP(8,"eax")); | ||
| 673 | &mov ("edx",&DWP(12,"eax")); # load Xi | ||
| 674 | |||
| 675 | &set_label("outer",16); | ||
| 676 | { my $nlo = "eax"; | ||
| 677 | my $dat = "edx"; | ||
| 678 | my @nhi = ("edi","ebp"); | ||
| 679 | my @rem = ("ebx","ecx"); | ||
| 680 | my @red = ("mm0","mm1","mm2"); | ||
| 681 | my $tmp = "mm3"; | ||
| 682 | |||
| 683 | &xor ($dat,&DWP(12,"ecx")); # merge input data | ||
| 684 | &xor ("ebx",&DWP(8,"ecx")); | ||
| 685 | &pxor ($Zhi,&QWP(0,"ecx")); | ||
| 686 | &lea ("ecx",&DWP(16,"ecx")); # inp+=16 | ||
| 687 | #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi | ||
| 688 | &mov (&DWP(528+8,"esp"),"ebx"); | ||
| 689 | &movq (&QWP(528+0,"esp"),$Zhi); | ||
| 690 | &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp | ||
| 691 | |||
| 692 | &xor ($nlo,$nlo); | ||
| 693 | &rol ($dat,8); | ||
| 694 | &mov (&LB($nlo),&LB($dat)); | ||
| 695 | &mov ($nhi[1],$nlo); | ||
| 696 | &and (&LB($nlo),0x0f); | ||
| 697 | &shr ($nhi[1],4); | ||
| 698 | &pxor ($red[0],$red[0]); | ||
| 699 | &rol ($dat,8); # next byte | ||
| 700 | &pxor ($red[1],$red[1]); | ||
| 701 | &pxor ($red[2],$red[2]); | ||
| 702 | |||
| 703 | # Just like in "May" verson modulo-schedule for critical path in | ||
| 704 | # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor' | ||
| 705 | # is scheduled so late that rem_8bit[] has to be shifted *right* | ||
| 706 | # by 16, which is why last argument to pinsrw is 2, which | ||
| 707 | # corresponds to <<32=<<48>>16... | ||
| 708 | for ($j=11,$i=0;$i<15;$i++) { | ||
| 709 | |||
| 710 | if ($i>0) { | ||
| 711 | &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] | ||
| 712 | &rol ($dat,8); # next byte | ||
| 713 | &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
| 714 | |||
| 715 | &pxor ($Zlo,$tmp); | ||
| 716 | &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); | ||
| 717 | &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) | ||
| 718 | } else { | ||
| 719 | &movq ($Zlo,&QWP(16,"esp",$nlo,8)); | ||
| 720 | &movq ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
| 721 | } | ||
| 722 | |||
| 723 | &mov (&LB($nlo),&LB($dat)); | ||
| 724 | &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0); | ||
| 725 | |||
| 726 | &movd ($rem[0],$Zlo); | ||
| 727 | &movz ($rem[1],&LB($rem[1])) if ($i>0); | ||
| 728 | &psrlq ($Zlo,8); # Z>>=8 | ||
| 729 | |||
| 730 | &movq ($tmp,$Zhi); | ||
| 731 | &mov ($nhi[0],$nlo); | ||
| 732 | &psrlq ($Zhi,8); | ||
| 733 | |||
| 734 | &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4 | ||
| 735 | &and (&LB($nlo),0x0f); | ||
| 736 | &psllq ($tmp,56); | ||
| 737 | |||
| 738 | &pxor ($Zhi,$red[1]) if ($i>1); | ||
| 739 | &shr ($nhi[0],4); | ||
| 740 | &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0); | ||
| 741 | |||
| 742 | unshift (@red,pop(@red)); # "rotate" registers | ||
| 743 | unshift (@rem,pop(@rem)); | ||
| 744 | unshift (@nhi,pop(@nhi)); | ||
| 745 | } | ||
| 746 | |||
| 747 | &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] | ||
| 748 | &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
| 749 | &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) | ||
| 750 | |||
| 751 | &pxor ($Zlo,$tmp); | ||
| 752 | &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); | ||
| 753 | &movz ($rem[1],&LB($rem[1])); | ||
| 754 | |||
| 755 | &pxor ($red[2],$red[2]); # clear 2nd word | ||
| 756 | &psllq ($red[1],4); | ||
| 757 | |||
| 758 | &movd ($rem[0],$Zlo); | ||
| 759 | &psrlq ($Zlo,4); # Z>>=4 | ||
| 760 | |||
| 761 | &movq ($tmp,$Zhi); | ||
| 762 | &psrlq ($Zhi,4); | ||
| 763 | &shl ($rem[0],4); # rem<<4 | ||
| 764 | |||
| 765 | &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi] | ||
| 766 | &psllq ($tmp,60); | ||
| 767 | &movz ($rem[0],&LB($rem[0])); | ||
| 768 | |||
| 769 | &pxor ($Zlo,$tmp); | ||
| 770 | &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8)); | ||
| 771 | |||
| 772 | &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2); | ||
| 773 | &pxor ($Zhi,$red[1]); | ||
| 774 | |||
| 775 | &movd ($dat,$Zlo); | ||
| 776 | &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48 | ||
| 777 | |||
| 778 | &psllq ($red[0],12); # correct by <<16>>4 | ||
| 779 | &pxor ($Zhi,$red[0]); | ||
| 780 | &psrlq ($Zlo,32); | ||
| 781 | &pxor ($Zhi,$red[2]); | ||
| 782 | |||
| 783 | &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp | ||
| 784 | &movd ("ebx",$Zlo); | ||
| 785 | &movq ($tmp,$Zhi); # 01234567 | ||
| 786 | &psllw ($Zhi,8); # 1.3.5.7. | ||
| 787 | &psrlw ($tmp,8); # .0.2.4.6 | ||
| 788 | &por ($Zhi,$tmp); # 10325476 | ||
| 789 | &bswap ($dat); | ||
| 790 | &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 | ||
| 791 | &bswap ("ebx"); | ||
| 792 | |||
| 793 | &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? | ||
| 794 | &jne (&label("outer")); | ||
| 795 | } | ||
| 796 | |||
| 797 | &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi | ||
| 798 | &mov (&DWP(12,"eax"),"edx"); | ||
| 799 | &mov (&DWP(8,"eax"),"ebx"); | ||
| 800 | &movq (&QWP(0,"eax"),$Zhi); | ||
| 801 | |||
| 802 | &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp | ||
| 803 | &emms (); | ||
| 804 | } | ||
| 805 | &function_end("gcm_ghash_4bit_mmx"); | ||
| 806 | }} | ||
| 807 | |||
| 808 | if ($sse2) {{ | ||
| 809 | ###################################################################### | ||
| 810 | # PCLMULQDQ version. | ||
| 811 | |||
| 812 | $Xip="eax"; | ||
| 813 | $Htbl="edx"; | ||
| 814 | $const="ecx"; | ||
| 815 | $inp="esi"; | ||
| 816 | $len="ebx"; | ||
| 817 | |||
| 818 | ($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; | ||
| 819 | ($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); | ||
| 820 | ($Xn,$Xhn)=("xmm6","xmm7"); | ||
| 821 | |||
| 822 | &static_label("bswap"); | ||
| 823 | |||
| 824 | sub clmul64x64_T2 { # minimal "register" pressure | ||
| 825 | my ($Xhi,$Xi,$Hkey)=@_; | ||
| 826 | |||
| 827 | &movdqa ($Xhi,$Xi); # | ||
| 828 | &pshufd ($T1,$Xi,0b01001110); | ||
| 829 | &pshufd ($T2,$Hkey,0b01001110); | ||
| 830 | &pxor ($T1,$Xi); # | ||
| 831 | &pxor ($T2,$Hkey); | ||
| 832 | |||
| 833 | &pclmulqdq ($Xi,$Hkey,0x00); ####### | ||
| 834 | &pclmulqdq ($Xhi,$Hkey,0x11); ####### | ||
| 835 | &pclmulqdq ($T1,$T2,0x00); ####### | ||
| 836 | &xorps ($T1,$Xi); # | ||
| 837 | &xorps ($T1,$Xhi); # | ||
| 838 | |||
| 839 | &movdqa ($T2,$T1); # | ||
| 840 | &psrldq ($T1,8); | ||
| 841 | &pslldq ($T2,8); # | ||
| 842 | &pxor ($Xhi,$T1); | ||
| 843 | &pxor ($Xi,$T2); # | ||
| 844 | } | ||
| 845 | |||
| 846 | sub clmul64x64_T3 { | ||
| 847 | # Even though this subroutine offers visually better ILP, it | ||
| 848 | # was empirically found to be a tad slower than above version. | ||
| 849 | # At least in gcm_ghash_clmul context. But it's just as well, | ||
| 850 | # because loop modulo-scheduling is possible only thanks to | ||
| 851 | # minimized "register" pressure... | ||
| 852 | my ($Xhi,$Xi,$Hkey)=@_; | ||
| 853 | |||
| 854 | &movdqa ($T1,$Xi); # | ||
| 855 | &movdqa ($Xhi,$Xi); | ||
| 856 | &pclmulqdq ($Xi,$Hkey,0x00); ####### | ||
| 857 | &pclmulqdq ($Xhi,$Hkey,0x11); ####### | ||
| 858 | &pshufd ($T2,$T1,0b01001110); # | ||
| 859 | &pshufd ($T3,$Hkey,0b01001110); | ||
| 860 | &pxor ($T2,$T1); # | ||
| 861 | &pxor ($T3,$Hkey); | ||
| 862 | &pclmulqdq ($T2,$T3,0x00); ####### | ||
| 863 | &pxor ($T2,$Xi); # | ||
| 864 | &pxor ($T2,$Xhi); # | ||
| 865 | |||
| 866 | &movdqa ($T3,$T2); # | ||
| 867 | &psrldq ($T2,8); | ||
| 868 | &pslldq ($T3,8); # | ||
| 869 | &pxor ($Xhi,$T2); | ||
| 870 | &pxor ($Xi,$T3); # | ||
| 871 | } | ||
| 872 | |||
| 873 | if (1) { # Algorithm 9 with <<1 twist. | ||
| 874 | # Reduction is shorter and uses only two | ||
| 875 | # temporary registers, which makes it better | ||
| 876 | # candidate for interleaving with 64x64 | ||
| 877 | # multiplication. Pre-modulo-scheduled loop | ||
| 878 | # was found to be ~20% faster than Algorithm 5 | ||
| 879 | # below. Algorithm 9 was therefore chosen for | ||
| 880 | # further optimization... | ||
| 881 | |||
| 882 | sub reduction_alg9 { # 17/13 times faster than Intel version | ||
| 883 | my ($Xhi,$Xi) = @_; | ||
| 884 | |||
| 885 | # 1st phase | ||
| 886 | &movdqa ($T1,$Xi) # | ||
| 887 | &psllq ($Xi,1); | ||
| 888 | &pxor ($Xi,$T1); # | ||
| 889 | &psllq ($Xi,5); # | ||
| 890 | &pxor ($Xi,$T1); # | ||
| 891 | &psllq ($Xi,57); # | ||
| 892 | &movdqa ($T2,$Xi); # | ||
| 893 | &pslldq ($Xi,8); | ||
| 894 | &psrldq ($T2,8); # | ||
| 895 | &pxor ($Xi,$T1); | ||
| 896 | &pxor ($Xhi,$T2); # | ||
| 897 | |||
| 898 | # 2nd phase | ||
| 899 | &movdqa ($T2,$Xi); | ||
| 900 | &psrlq ($Xi,5); | ||
| 901 | &pxor ($Xi,$T2); # | ||
| 902 | &psrlq ($Xi,1); # | ||
| 903 | &pxor ($Xi,$T2); # | ||
| 904 | &pxor ($T2,$Xhi); | ||
| 905 | &psrlq ($Xi,1); # | ||
| 906 | &pxor ($Xi,$T2); # | ||
| 907 | } | ||
| 908 | |||
| 909 | &function_begin_B("gcm_init_clmul"); | ||
| 910 | &mov ($Htbl,&wparam(0)); | ||
| 911 | &mov ($Xip,&wparam(1)); | ||
| 912 | |||
| 913 | &call (&label("pic")); | ||
| 914 | &set_label("pic"); | ||
| 915 | &blindpop ($const); | ||
| 916 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 917 | |||
| 918 | &movdqu ($Hkey,&QWP(0,$Xip)); | ||
| 919 | &pshufd ($Hkey,$Hkey,0b01001110);# dword swap | ||
| 920 | |||
| 921 | # <<1 twist | ||
| 922 | &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword | ||
| 923 | &movdqa ($T1,$Hkey); | ||
| 924 | &psllq ($Hkey,1); | ||
| 925 | &pxor ($T3,$T3); # | ||
| 926 | &psrlq ($T1,63); | ||
| 927 | &pcmpgtd ($T3,$T2); # broadcast carry bit | ||
| 928 | &pslldq ($T1,8); | ||
| 929 | &por ($Hkey,$T1); # H<<=1 | ||
| 930 | |||
| 931 | # magic reduction | ||
| 932 | &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial | ||
| 933 | &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial | ||
| 934 | |||
| 935 | # calculate H^2 | ||
| 936 | &movdqa ($Xi,$Hkey); | ||
| 937 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
| 938 | &reduction_alg9 ($Xhi,$Xi); | ||
| 939 | |||
| 940 | &movdqu (&QWP(0,$Htbl),$Hkey); # save H | ||
| 941 | &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 | ||
| 942 | |||
| 943 | &ret (); | ||
| 944 | &function_end_B("gcm_init_clmul"); | ||
| 945 | |||
| 946 | &function_begin_B("gcm_gmult_clmul"); | ||
| 947 | &mov ($Xip,&wparam(0)); | ||
| 948 | &mov ($Htbl,&wparam(1)); | ||
| 949 | |||
| 950 | &call (&label("pic")); | ||
| 951 | &set_label("pic"); | ||
| 952 | &blindpop ($const); | ||
| 953 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 954 | |||
| 955 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
| 956 | &movdqa ($T3,&QWP(0,$const)); | ||
| 957 | &movups ($Hkey,&QWP(0,$Htbl)); | ||
| 958 | &pshufb ($Xi,$T3); | ||
| 959 | |||
| 960 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
| 961 | &reduction_alg9 ($Xhi,$Xi); | ||
| 962 | |||
| 963 | &pshufb ($Xi,$T3); | ||
| 964 | &movdqu (&QWP(0,$Xip),$Xi); | ||
| 965 | |||
| 966 | &ret (); | ||
| 967 | &function_end_B("gcm_gmult_clmul"); | ||
| 968 | |||
| 969 | &function_begin("gcm_ghash_clmul"); | ||
| 970 | &mov ($Xip,&wparam(0)); | ||
| 971 | &mov ($Htbl,&wparam(1)); | ||
| 972 | &mov ($inp,&wparam(2)); | ||
| 973 | &mov ($len,&wparam(3)); | ||
| 974 | |||
| 975 | &call (&label("pic")); | ||
| 976 | &set_label("pic"); | ||
| 977 | &blindpop ($const); | ||
| 978 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 979 | |||
| 980 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
| 981 | &movdqa ($T3,&QWP(0,$const)); | ||
| 982 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
| 983 | &pshufb ($Xi,$T3); | ||
| 984 | |||
| 985 | &sub ($len,0x10); | ||
| 986 | &jz (&label("odd_tail")); | ||
| 987 | |||
| 988 | ####### | ||
| 989 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = | ||
| 990 | # [(H*Ii+1) + (H*Xi+1)] mod P = | ||
| 991 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P | ||
| 992 | # | ||
| 993 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 994 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
| 995 | &pshufb ($T1,$T3); | ||
| 996 | &pshufb ($Xn,$T3); | ||
| 997 | &pxor ($Xi,$T1); # Ii+Xi | ||
| 998 | |||
| 999 | &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
| 1000 | &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
| 1001 | |||
| 1002 | &lea ($inp,&DWP(32,$inp)); # i+=2 | ||
| 1003 | &sub ($len,0x20); | ||
| 1004 | &jbe (&label("even_tail")); | ||
| 1005 | |||
| 1006 | &set_label("mod_loop"); | ||
| 1007 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
| 1008 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 1009 | &movups ($Hkey,&QWP(0,$Htbl)); # load H | ||
| 1010 | |||
| 1011 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 1012 | &pxor ($Xhi,$Xhn); | ||
| 1013 | |||
| 1014 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
| 1015 | &pshufb ($T1,$T3); | ||
| 1016 | &pshufb ($Xn,$T3); | ||
| 1017 | |||
| 1018 | &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 | ||
| 1019 | &movdqa ($Xhn,$Xn); | ||
| 1020 | &pxor ($Xhi,$T1); # "Ii+Xi", consume early | ||
| 1021 | |||
| 1022 | &movdqa ($T1,$Xi) #&reduction_alg9($Xhi,$Xi); 1st phase | ||
| 1023 | &psllq ($Xi,1); | ||
| 1024 | &pxor ($Xi,$T1); # | ||
| 1025 | &psllq ($Xi,5); # | ||
| 1026 | &pxor ($Xi,$T1); # | ||
| 1027 | &pclmulqdq ($Xn,$Hkey,0x00); ####### | ||
| 1028 | &psllq ($Xi,57); # | ||
| 1029 | &movdqa ($T2,$Xi); # | ||
| 1030 | &pslldq ($Xi,8); | ||
| 1031 | &psrldq ($T2,8); # | ||
| 1032 | &pxor ($Xi,$T1); | ||
| 1033 | &pshufd ($T1,$T3,0b01001110); | ||
| 1034 | &pxor ($Xhi,$T2); # | ||
| 1035 | &pxor ($T1,$T3); | ||
| 1036 | &pshufd ($T3,$Hkey,0b01001110); | ||
| 1037 | &pxor ($T3,$Hkey); # | ||
| 1038 | |||
| 1039 | &pclmulqdq ($Xhn,$Hkey,0x11); ####### | ||
| 1040 | &movdqa ($T2,$Xi); # 2nd phase | ||
| 1041 | &psrlq ($Xi,5); | ||
| 1042 | &pxor ($Xi,$T2); # | ||
| 1043 | &psrlq ($Xi,1); # | ||
| 1044 | &pxor ($Xi,$T2); # | ||
| 1045 | &pxor ($T2,$Xhi); | ||
| 1046 | &psrlq ($Xi,1); # | ||
| 1047 | &pxor ($Xi,$T2); # | ||
| 1048 | |||
| 1049 | &pclmulqdq ($T1,$T3,0x00); ####### | ||
| 1050 | &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
| 1051 | &xorps ($T1,$Xn); # | ||
| 1052 | &xorps ($T1,$Xhn); # | ||
| 1053 | |||
| 1054 | &movdqa ($T3,$T1); # | ||
| 1055 | &psrldq ($T1,8); | ||
| 1056 | &pslldq ($T3,8); # | ||
| 1057 | &pxor ($Xhn,$T1); | ||
| 1058 | &pxor ($Xn,$T3); # | ||
| 1059 | &movdqa ($T3,&QWP(0,$const)); | ||
| 1060 | |||
| 1061 | &lea ($inp,&DWP(32,$inp)); | ||
| 1062 | &sub ($len,0x20); | ||
| 1063 | &ja (&label("mod_loop")); | ||
| 1064 | |||
| 1065 | &set_label("even_tail"); | ||
| 1066 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
| 1067 | |||
| 1068 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 1069 | &pxor ($Xhi,$Xhn); | ||
| 1070 | |||
| 1071 | &reduction_alg9 ($Xhi,$Xi); | ||
| 1072 | |||
| 1073 | &test ($len,$len); | ||
| 1074 | &jnz (&label("done")); | ||
| 1075 | |||
| 1076 | &movups ($Hkey,&QWP(0,$Htbl)); # load H | ||
| 1077 | &set_label("odd_tail"); | ||
| 1078 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 1079 | &pshufb ($T1,$T3); | ||
| 1080 | &pxor ($Xi,$T1); # Ii+Xi | ||
| 1081 | |||
| 1082 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) | ||
| 1083 | &reduction_alg9 ($Xhi,$Xi); | ||
| 1084 | |||
| 1085 | &set_label("done"); | ||
| 1086 | &pshufb ($Xi,$T3); | ||
| 1087 | &movdqu (&QWP(0,$Xip),$Xi); | ||
| 1088 | &function_end("gcm_ghash_clmul"); | ||
| 1089 | |||
| 1090 | } else { # Algorith 5. Kept for reference purposes. | ||
| 1091 | |||
| 1092 | sub reduction_alg5 { # 19/16 times faster than Intel version | ||
| 1093 | my ($Xhi,$Xi)=@_; | ||
| 1094 | |||
| 1095 | # <<1 | ||
| 1096 | &movdqa ($T1,$Xi); # | ||
| 1097 | &movdqa ($T2,$Xhi); | ||
| 1098 | &pslld ($Xi,1); | ||
| 1099 | &pslld ($Xhi,1); # | ||
| 1100 | &psrld ($T1,31); | ||
| 1101 | &psrld ($T2,31); # | ||
| 1102 | &movdqa ($T3,$T1); | ||
| 1103 | &pslldq ($T1,4); | ||
| 1104 | &psrldq ($T3,12); # | ||
| 1105 | &pslldq ($T2,4); | ||
| 1106 | &por ($Xhi,$T3); # | ||
| 1107 | &por ($Xi,$T1); | ||
| 1108 | &por ($Xhi,$T2); # | ||
| 1109 | |||
| 1110 | # 1st phase | ||
| 1111 | &movdqa ($T1,$Xi); | ||
| 1112 | &movdqa ($T2,$Xi); | ||
| 1113 | &movdqa ($T3,$Xi); # | ||
| 1114 | &pslld ($T1,31); | ||
| 1115 | &pslld ($T2,30); | ||
| 1116 | &pslld ($Xi,25); # | ||
| 1117 | &pxor ($T1,$T2); | ||
| 1118 | &pxor ($T1,$Xi); # | ||
| 1119 | &movdqa ($T2,$T1); # | ||
| 1120 | &pslldq ($T1,12); | ||
| 1121 | &psrldq ($T2,4); # | ||
| 1122 | &pxor ($T3,$T1); | ||
| 1123 | |||
| 1124 | # 2nd phase | ||
| 1125 | &pxor ($Xhi,$T3); # | ||
| 1126 | &movdqa ($Xi,$T3); | ||
| 1127 | &movdqa ($T1,$T3); | ||
| 1128 | &psrld ($Xi,1); # | ||
| 1129 | &psrld ($T1,2); | ||
| 1130 | &psrld ($T3,7); # | ||
| 1131 | &pxor ($Xi,$T1); | ||
| 1132 | &pxor ($Xhi,$T2); | ||
| 1133 | &pxor ($Xi,$T3); # | ||
| 1134 | &pxor ($Xi,$Xhi); # | ||
| 1135 | } | ||
| 1136 | |||
| 1137 | &function_begin_B("gcm_init_clmul"); | ||
| 1138 | &mov ($Htbl,&wparam(0)); | ||
| 1139 | &mov ($Xip,&wparam(1)); | ||
| 1140 | |||
| 1141 | &call (&label("pic")); | ||
| 1142 | &set_label("pic"); | ||
| 1143 | &blindpop ($const); | ||
| 1144 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 1145 | |||
| 1146 | &movdqu ($Hkey,&QWP(0,$Xip)); | ||
| 1147 | &pshufd ($Hkey,$Hkey,0b01001110);# dword swap | ||
| 1148 | |||
| 1149 | # calculate H^2 | ||
| 1150 | &movdqa ($Xi,$Hkey); | ||
| 1151 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); | ||
| 1152 | &reduction_alg5 ($Xhi,$Xi); | ||
| 1153 | |||
| 1154 | &movdqu (&QWP(0,$Htbl),$Hkey); # save H | ||
| 1155 | &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 | ||
| 1156 | |||
| 1157 | &ret (); | ||
| 1158 | &function_end_B("gcm_init_clmul"); | ||
| 1159 | |||
| 1160 | &function_begin_B("gcm_gmult_clmul"); | ||
| 1161 | &mov ($Xip,&wparam(0)); | ||
| 1162 | &mov ($Htbl,&wparam(1)); | ||
| 1163 | |||
| 1164 | &call (&label("pic")); | ||
| 1165 | &set_label("pic"); | ||
| 1166 | &blindpop ($const); | ||
| 1167 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 1168 | |||
| 1169 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
| 1170 | &movdqa ($Xn,&QWP(0,$const)); | ||
| 1171 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
| 1172 | &pshufb ($Xi,$Xn); | ||
| 1173 | |||
| 1174 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); | ||
| 1175 | &reduction_alg5 ($Xhi,$Xi); | ||
| 1176 | |||
| 1177 | &pshufb ($Xi,$Xn); | ||
| 1178 | &movdqu (&QWP(0,$Xip),$Xi); | ||
| 1179 | |||
| 1180 | &ret (); | ||
| 1181 | &function_end_B("gcm_gmult_clmul"); | ||
| 1182 | |||
| 1183 | &function_begin("gcm_ghash_clmul"); | ||
| 1184 | &mov ($Xip,&wparam(0)); | ||
| 1185 | &mov ($Htbl,&wparam(1)); | ||
| 1186 | &mov ($inp,&wparam(2)); | ||
| 1187 | &mov ($len,&wparam(3)); | ||
| 1188 | |||
| 1189 | &call (&label("pic")); | ||
| 1190 | &set_label("pic"); | ||
| 1191 | &blindpop ($const); | ||
| 1192 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
| 1193 | |||
| 1194 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
| 1195 | &movdqa ($T3,&QWP(0,$const)); | ||
| 1196 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
| 1197 | &pshufb ($Xi,$T3); | ||
| 1198 | |||
| 1199 | &sub ($len,0x10); | ||
| 1200 | &jz (&label("odd_tail")); | ||
| 1201 | |||
| 1202 | ####### | ||
| 1203 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = | ||
| 1204 | # [(H*Ii+1) + (H*Xi+1)] mod P = | ||
| 1205 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P | ||
| 1206 | # | ||
| 1207 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 1208 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
| 1209 | &pshufb ($T1,$T3); | ||
| 1210 | &pshufb ($Xn,$T3); | ||
| 1211 | &pxor ($Xi,$T1); # Ii+Xi | ||
| 1212 | |||
| 1213 | &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
| 1214 | &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
| 1215 | |||
| 1216 | &sub ($len,0x20); | ||
| 1217 | &lea ($inp,&DWP(32,$inp)); # i+=2 | ||
| 1218 | &jbe (&label("even_tail")); | ||
| 1219 | |||
| 1220 | &set_label("mod_loop"); | ||
| 1221 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
| 1222 | &movdqu ($Hkey,&QWP(0,$Htbl)); # load H | ||
| 1223 | |||
| 1224 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 1225 | &pxor ($Xhi,$Xhn); | ||
| 1226 | |||
| 1227 | &reduction_alg5 ($Xhi,$Xi); | ||
| 1228 | |||
| 1229 | ####### | ||
| 1230 | &movdqa ($T3,&QWP(0,$const)); | ||
| 1231 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 1232 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
| 1233 | &pshufb ($T1,$T3); | ||
| 1234 | &pshufb ($Xn,$T3); | ||
| 1235 | &pxor ($Xi,$T1); # Ii+Xi | ||
| 1236 | |||
| 1237 | &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
| 1238 | &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
| 1239 | |||
| 1240 | &sub ($len,0x20); | ||
| 1241 | &lea ($inp,&DWP(32,$inp)); | ||
| 1242 | &ja (&label("mod_loop")); | ||
| 1243 | |||
| 1244 | &set_label("even_tail"); | ||
| 1245 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
| 1246 | |||
| 1247 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 1248 | &pxor ($Xhi,$Xhn); | ||
| 1249 | |||
| 1250 | &reduction_alg5 ($Xhi,$Xi); | ||
| 1251 | |||
| 1252 | &movdqa ($T3,&QWP(0,$const)); | ||
| 1253 | &test ($len,$len); | ||
| 1254 | &jnz (&label("done")); | ||
| 1255 | |||
| 1256 | &movdqu ($Hkey,&QWP(0,$Htbl)); # load H | ||
| 1257 | &set_label("odd_tail"); | ||
| 1258 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
| 1259 | &pshufb ($T1,$T3); | ||
| 1260 | &pxor ($Xi,$T1); # Ii+Xi | ||
| 1261 | |||
| 1262 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) | ||
| 1263 | &reduction_alg5 ($Xhi,$Xi); | ||
| 1264 | |||
| 1265 | &movdqa ($T3,&QWP(0,$const)); | ||
| 1266 | &set_label("done"); | ||
| 1267 | &pshufb ($Xi,$T3); | ||
| 1268 | &movdqu (&QWP(0,$Xip),$Xi); | ||
| 1269 | &function_end("gcm_ghash_clmul"); | ||
| 1270 | |||
| 1271 | } | ||
| 1272 | |||
| 1273 | &set_label("bswap",64); | ||
| 1274 | &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | ||
| 1275 | &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial | ||
| 1276 | }} # $sse2 | ||
| 1277 | |||
| 1278 | &set_label("rem_4bit",64); | ||
| 1279 | &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); | ||
| 1280 | &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); | ||
| 1281 | &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); | ||
| 1282 | &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); | ||
| 1283 | &set_label("rem_8bit",64); | ||
| 1284 | &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); | ||
| 1285 | &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); | ||
| 1286 | &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E); | ||
| 1287 | &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E); | ||
| 1288 | &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E); | ||
| 1289 | &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E); | ||
| 1290 | &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E); | ||
| 1291 | &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E); | ||
| 1292 | &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE); | ||
| 1293 | &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE); | ||
| 1294 | &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE); | ||
| 1295 | &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE); | ||
| 1296 | &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E); | ||
| 1297 | &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E); | ||
| 1298 | &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE); | ||
| 1299 | &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE); | ||
| 1300 | &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E); | ||
| 1301 | &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E); | ||
| 1302 | &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E); | ||
| 1303 | &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E); | ||
| 1304 | &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E); | ||
| 1305 | &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E); | ||
| 1306 | &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E); | ||
| 1307 | &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E); | ||
| 1308 | &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE); | ||
| 1309 | &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE); | ||
| 1310 | &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE); | ||
| 1311 | &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE); | ||
| 1312 | &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E); | ||
| 1313 | &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); | ||
| 1314 | &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); | ||
| 1315 | &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); | ||
| 1316 | }}} # !$x86only | ||
| 1317 | |||
| 1318 | &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 1319 | &asm_finish(); | ||
| 1320 | |||
| 1321 | # A question was risen about choice of vanilla MMX. Or rather why wasn't | ||
| 1322 | # SSE2 chosen instead? In addition to the fact that MMX runs on legacy | ||
| 1323 | # CPUs such as PIII, "4-bit" MMX version was observed to provide better | ||
| 1324 | # performance than *corresponding* SSE2 one even on contemporary CPUs. | ||
| 1325 | # SSE2 results were provided by Peter-Michael Hager. He maintains SSE2 | ||
| 1326 | # implementation featuring full range of lookup-table sizes, but with | ||
| 1327 | # per-invocation lookup table setup. Latter means that table size is | ||
| 1328 | # chosen depending on how much data is to be hashed in every given call, | ||
| 1329 | # more data - larger table. Best reported result for Core2 is ~4 cycles | ||
| 1330 | # per processed byte out of 64KB block. This number accounts even for | ||
| 1331 | # 64KB table setup overhead. As discussed in gcm128.c we choose to be | ||
| 1332 | # more conservative in respect to lookup table sizes, but how do the | ||
| 1333 | # results compare? Minimalistic "256B" MMX version delivers ~11 cycles | ||
| 1334 | # on same platform. As also discussed in gcm128.c, next in line "8-bit | ||
| 1335 | # Shoup's" or "4KB" method should deliver twice the performance of | ||
| 1336 | # "256B" one, in other words not worse than ~6 cycles per byte. It | ||
| 1337 | # should be also be noted that in SSE2 case improvement can be "super- | ||
| 1338 | # linear," i.e. more than twice, mostly because >>8 maps to single | ||
| 1339 | # instruction on SSE2 register. This is unlike "4-bit" case when >>4 | ||
| 1340 | # maps to same amount of instructions in both MMX and SSE2 cases. | ||
| 1341 | # Bottom line is that switch to SSE2 is considered to be justifiable | ||
| 1342 | # only in case we choose to implement "8-bit" method... | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl new file mode 100644 index 0000000000..a5ae180882 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl | |||
| @@ -0,0 +1,805 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # March, June 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that | ||
| 14 | # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH | ||
| 15 | # function features so called "528B" variant utilizing additional | ||
| 16 | # 256+16 bytes of per-key storage [+512 bytes shared table]. | ||
| 17 | # Performance results are for this streamed GHASH subroutine and are | ||
| 18 | # expressed in cycles per processed byte, less is better: | ||
| 19 | # | ||
| 20 | # gcc 3.4.x(*) assembler | ||
| 21 | # | ||
| 22 | # P4 28.6 14.0 +100% | ||
| 23 | # Opteron 19.3 7.7 +150% | ||
| 24 | # Core2 17.8 8.1(**) +120% | ||
| 25 | # | ||
| 26 | # (*) comparison is not completely fair, because C results are | ||
| 27 | # for vanilla "256B" implementation, while assembler results | ||
| 28 | # are for "528B";-) | ||
| 29 | # (**) it's mystery [to me] why Core2 result is not same as for | ||
| 30 | # Opteron; | ||
| 31 | |||
| 32 | # May 2010 | ||
| 33 | # | ||
| 34 | # Add PCLMULQDQ version performing at 2.02 cycles per processed byte. | ||
| 35 | # See ghash-x86.pl for background information and details about coding | ||
| 36 | # techniques. | ||
| 37 | # | ||
| 38 | # Special thanks to David Woodhouse <dwmw2@infradead.org> for | ||
| 39 | # providing access to a Westmere-based system on behalf of Intel | ||
| 40 | # Open Source Technology Centre. | ||
| 41 | |||
| 42 | $flavour = shift; | ||
| 43 | $output = shift; | ||
| 44 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 45 | |||
| 46 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 47 | |||
| 48 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 49 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 50 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 51 | die "can't locate x86_64-xlate.pl"; | ||
| 52 | |||
| 53 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 54 | |||
| 55 | # common register layout | ||
| 56 | $nlo="%rax"; | ||
| 57 | $nhi="%rbx"; | ||
| 58 | $Zlo="%r8"; | ||
| 59 | $Zhi="%r9"; | ||
| 60 | $tmp="%r10"; | ||
| 61 | $rem_4bit = "%r11"; | ||
| 62 | |||
| 63 | $Xi="%rdi"; | ||
| 64 | $Htbl="%rsi"; | ||
| 65 | |||
| 66 | # per-function register layout | ||
| 67 | $cnt="%rcx"; | ||
| 68 | $rem="%rdx"; | ||
| 69 | |||
| 70 | sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or | ||
| 71 | $r =~ s/%[er]([sd]i)/%\1l/ or | ||
| 72 | $r =~ s/%[er](bp)/%\1l/ or | ||
| 73 | $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } | ||
| 74 | |||
| 75 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm | ||
| 76 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; | ||
| 77 | my $arg = pop; | ||
| 78 | $arg = "\$$arg" if ($arg*1 eq $arg); | ||
| 79 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; | ||
| 80 | } | ||
| 81 | |||
| 82 | { my $N; | ||
| 83 | sub loop() { | ||
| 84 | my $inp = shift; | ||
| 85 | |||
| 86 | $N++; | ||
| 87 | $code.=<<___; | ||
| 88 | xor $nlo,$nlo | ||
| 89 | xor $nhi,$nhi | ||
| 90 | mov `&LB("$Zlo")`,`&LB("$nlo")` | ||
| 91 | mov `&LB("$Zlo")`,`&LB("$nhi")` | ||
| 92 | shl \$4,`&LB("$nlo")` | ||
| 93 | mov \$14,$cnt | ||
| 94 | mov 8($Htbl,$nlo),$Zlo | ||
| 95 | mov ($Htbl,$nlo),$Zhi | ||
| 96 | and \$0xf0,`&LB("$nhi")` | ||
| 97 | mov $Zlo,$rem | ||
| 98 | jmp .Loop$N | ||
| 99 | |||
| 100 | .align 16 | ||
| 101 | .Loop$N: | ||
| 102 | shr \$4,$Zlo | ||
| 103 | and \$0xf,$rem | ||
| 104 | mov $Zhi,$tmp | ||
| 105 | mov ($inp,$cnt),`&LB("$nlo")` | ||
| 106 | shr \$4,$Zhi | ||
| 107 | xor 8($Htbl,$nhi),$Zlo | ||
| 108 | shl \$60,$tmp | ||
| 109 | xor ($Htbl,$nhi),$Zhi | ||
| 110 | mov `&LB("$nlo")`,`&LB("$nhi")` | ||
| 111 | xor ($rem_4bit,$rem,8),$Zhi | ||
| 112 | mov $Zlo,$rem | ||
| 113 | shl \$4,`&LB("$nlo")` | ||
| 114 | xor $tmp,$Zlo | ||
| 115 | dec $cnt | ||
| 116 | js .Lbreak$N | ||
| 117 | |||
| 118 | shr \$4,$Zlo | ||
| 119 | and \$0xf,$rem | ||
| 120 | mov $Zhi,$tmp | ||
| 121 | shr \$4,$Zhi | ||
| 122 | xor 8($Htbl,$nlo),$Zlo | ||
| 123 | shl \$60,$tmp | ||
| 124 | xor ($Htbl,$nlo),$Zhi | ||
| 125 | and \$0xf0,`&LB("$nhi")` | ||
| 126 | xor ($rem_4bit,$rem,8),$Zhi | ||
| 127 | mov $Zlo,$rem | ||
| 128 | xor $tmp,$Zlo | ||
| 129 | jmp .Loop$N | ||
| 130 | |||
| 131 | .align 16 | ||
| 132 | .Lbreak$N: | ||
| 133 | shr \$4,$Zlo | ||
| 134 | and \$0xf,$rem | ||
| 135 | mov $Zhi,$tmp | ||
| 136 | shr \$4,$Zhi | ||
| 137 | xor 8($Htbl,$nlo),$Zlo | ||
| 138 | shl \$60,$tmp | ||
| 139 | xor ($Htbl,$nlo),$Zhi | ||
| 140 | and \$0xf0,`&LB("$nhi")` | ||
| 141 | xor ($rem_4bit,$rem,8),$Zhi | ||
| 142 | mov $Zlo,$rem | ||
| 143 | xor $tmp,$Zlo | ||
| 144 | |||
| 145 | shr \$4,$Zlo | ||
| 146 | and \$0xf,$rem | ||
| 147 | mov $Zhi,$tmp | ||
| 148 | shr \$4,$Zhi | ||
| 149 | xor 8($Htbl,$nhi),$Zlo | ||
| 150 | shl \$60,$tmp | ||
| 151 | xor ($Htbl,$nhi),$Zhi | ||
| 152 | xor $tmp,$Zlo | ||
| 153 | xor ($rem_4bit,$rem,8),$Zhi | ||
| 154 | |||
| 155 | bswap $Zlo | ||
| 156 | bswap $Zhi | ||
| 157 | ___ | ||
| 158 | }} | ||
| 159 | |||
| 160 | $code=<<___; | ||
| 161 | .text | ||
| 162 | |||
| 163 | .globl gcm_gmult_4bit | ||
| 164 | .type gcm_gmult_4bit,\@function,2 | ||
| 165 | .align 16 | ||
| 166 | gcm_gmult_4bit: | ||
| 167 | push %rbx | ||
| 168 | push %rbp # %rbp and %r12 are pushed exclusively in | ||
| 169 | push %r12 # order to reuse Win64 exception handler... | ||
| 170 | .Lgmult_prologue: | ||
| 171 | |||
| 172 | movzb 15($Xi),$Zlo | ||
| 173 | lea .Lrem_4bit(%rip),$rem_4bit | ||
| 174 | ___ | ||
| 175 | &loop ($Xi); | ||
| 176 | $code.=<<___; | ||
| 177 | mov $Zlo,8($Xi) | ||
| 178 | mov $Zhi,($Xi) | ||
| 179 | |||
| 180 | mov 16(%rsp),%rbx | ||
| 181 | lea 24(%rsp),%rsp | ||
| 182 | .Lgmult_epilogue: | ||
| 183 | ret | ||
| 184 | .size gcm_gmult_4bit,.-gcm_gmult_4bit | ||
| 185 | ___ | ||
| 186 | |||
| 187 | # per-function register layout | ||
| 188 | $inp="%rdx"; | ||
| 189 | $len="%rcx"; | ||
| 190 | $rem_8bit=$rem_4bit; | ||
| 191 | |||
| 192 | $code.=<<___; | ||
| 193 | .globl gcm_ghash_4bit | ||
| 194 | .type gcm_ghash_4bit,\@function,4 | ||
| 195 | .align 16 | ||
| 196 | gcm_ghash_4bit: | ||
| 197 | push %rbx | ||
| 198 | push %rbp | ||
| 199 | push %r12 | ||
| 200 | push %r13 | ||
| 201 | push %r14 | ||
| 202 | push %r15 | ||
| 203 | sub \$280,%rsp | ||
| 204 | .Lghash_prologue: | ||
| 205 | mov $inp,%r14 # reassign couple of args | ||
| 206 | mov $len,%r15 | ||
| 207 | ___ | ||
| 208 | { my $inp="%r14"; | ||
| 209 | my $dat="%edx"; | ||
| 210 | my $len="%r15"; | ||
| 211 | my @nhi=("%ebx","%ecx"); | ||
| 212 | my @rem=("%r12","%r13"); | ||
| 213 | my $Hshr4="%rbp"; | ||
| 214 | |||
| 215 | &sub ($Htbl,-128); # size optimization | ||
| 216 | &lea ($Hshr4,"16+128(%rsp)"); | ||
| 217 | { my @lo =($nlo,$nhi); | ||
| 218 | my @hi =($Zlo,$Zhi); | ||
| 219 | |||
| 220 | &xor ($dat,$dat); | ||
| 221 | for ($i=0,$j=-2;$i<18;$i++,$j++) { | ||
| 222 | &mov ("$j(%rsp)",&LB($dat)) if ($i>1); | ||
| 223 | &or ($lo[0],$tmp) if ($i>1); | ||
| 224 | &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); | ||
| 225 | &shr ($lo[1],4) if ($i>0 && $i<17); | ||
| 226 | &mov ($tmp,$hi[1]) if ($i>0 && $i<17); | ||
| 227 | &shr ($hi[1],4) if ($i>0 && $i<17); | ||
| 228 | &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); | ||
| 229 | &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); | ||
| 230 | &shl (&LB($dat),4) if ($i>0 && $i<17); | ||
| 231 | &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); | ||
| 232 | &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); | ||
| 233 | &shl ($tmp,60) if ($i>0 && $i<17); | ||
| 234 | |||
| 235 | push (@lo,shift(@lo)); | ||
| 236 | push (@hi,shift(@hi)); | ||
| 237 | } | ||
| 238 | } | ||
| 239 | &add ($Htbl,-128); | ||
| 240 | &mov ($Zlo,"8($Xi)"); | ||
| 241 | &mov ($Zhi,"0($Xi)"); | ||
| 242 | &add ($len,$inp); # pointer to the end of data | ||
| 243 | &lea ($rem_8bit,".Lrem_8bit(%rip)"); | ||
| 244 | &jmp (".Louter_loop"); | ||
| 245 | |||
| 246 | $code.=".align 16\n.Louter_loop:\n"; | ||
| 247 | &xor ($Zhi,"($inp)"); | ||
| 248 | &mov ("%rdx","8($inp)"); | ||
| 249 | &lea ($inp,"16($inp)"); | ||
| 250 | &xor ("%rdx",$Zlo); | ||
| 251 | &mov ("($Xi)",$Zhi); | ||
| 252 | &mov ("8($Xi)","%rdx"); | ||
| 253 | &shr ("%rdx",32); | ||
| 254 | |||
| 255 | &xor ($nlo,$nlo); | ||
| 256 | &rol ($dat,8); | ||
| 257 | &mov (&LB($nlo),&LB($dat)); | ||
| 258 | &movz ($nhi[0],&LB($dat)); | ||
| 259 | &shl (&LB($nlo),4); | ||
| 260 | &shr ($nhi[0],4); | ||
| 261 | |||
| 262 | for ($j=11,$i=0;$i<15;$i++) { | ||
| 263 | &rol ($dat,8); | ||
| 264 | &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); | ||
| 265 | &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); | ||
| 266 | &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); | ||
| 267 | &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); | ||
| 268 | |||
| 269 | &mov (&LB($nlo),&LB($dat)); | ||
| 270 | &xor ($Zlo,$tmp) if ($i>0); | ||
| 271 | &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); | ||
| 272 | |||
| 273 | &movz ($nhi[1],&LB($dat)); | ||
| 274 | &shl (&LB($nlo),4); | ||
| 275 | &movzb ($rem[0],"(%rsp,$nhi[0])"); | ||
| 276 | |||
| 277 | &shr ($nhi[1],4) if ($i<14); | ||
| 278 | &and ($nhi[1],0xf0) if ($i==14); | ||
| 279 | &shl ($rem[1],48) if ($i>0); | ||
| 280 | &xor ($rem[0],$Zlo); | ||
| 281 | |||
| 282 | &mov ($tmp,$Zhi); | ||
| 283 | &xor ($Zhi,$rem[1]) if ($i>0); | ||
| 284 | &shr ($Zlo,8); | ||
| 285 | |||
| 286 | &movz ($rem[0],&LB($rem[0])); | ||
| 287 | &mov ($dat,"$j($Xi)") if (--$j%4==0); | ||
| 288 | &shr ($Zhi,8); | ||
| 289 | |||
| 290 | &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); | ||
| 291 | &shl ($tmp,56); | ||
| 292 | &xor ($Zhi,"($Hshr4,$nhi[0],8)"); | ||
| 293 | |||
| 294 | unshift (@nhi,pop(@nhi)); # "rotate" registers | ||
| 295 | unshift (@rem,pop(@rem)); | ||
| 296 | } | ||
| 297 | &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); | ||
| 298 | &xor ($Zlo,"8($Htbl,$nlo)"); | ||
| 299 | &xor ($Zhi,"($Htbl,$nlo)"); | ||
| 300 | |||
| 301 | &shl ($rem[1],48); | ||
| 302 | &xor ($Zlo,$tmp); | ||
| 303 | |||
| 304 | &xor ($Zhi,$rem[1]); | ||
| 305 | &movz ($rem[0],&LB($Zlo)); | ||
| 306 | &shr ($Zlo,4); | ||
| 307 | |||
| 308 | &mov ($tmp,$Zhi); | ||
| 309 | &shl (&LB($rem[0]),4); | ||
| 310 | &shr ($Zhi,4); | ||
| 311 | |||
| 312 | &xor ($Zlo,"8($Htbl,$nhi[0])"); | ||
| 313 | &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); | ||
| 314 | &shl ($tmp,60); | ||
| 315 | |||
| 316 | &xor ($Zhi,"($Htbl,$nhi[0])"); | ||
| 317 | &xor ($Zlo,$tmp); | ||
| 318 | &shl ($rem[0],48); | ||
| 319 | |||
| 320 | &bswap ($Zlo); | ||
| 321 | &xor ($Zhi,$rem[0]); | ||
| 322 | |||
| 323 | &bswap ($Zhi); | ||
| 324 | &cmp ($inp,$len); | ||
| 325 | &jb (".Louter_loop"); | ||
| 326 | } | ||
| 327 | $code.=<<___; | ||
| 328 | mov $Zlo,8($Xi) | ||
| 329 | mov $Zhi,($Xi) | ||
| 330 | |||
| 331 | lea 280(%rsp),%rsi | ||
| 332 | mov 0(%rsi),%r15 | ||
| 333 | mov 8(%rsi),%r14 | ||
| 334 | mov 16(%rsi),%r13 | ||
| 335 | mov 24(%rsi),%r12 | ||
| 336 | mov 32(%rsi),%rbp | ||
| 337 | mov 40(%rsi),%rbx | ||
| 338 | lea 48(%rsi),%rsp | ||
| 339 | .Lghash_epilogue: | ||
| 340 | ret | ||
| 341 | .size gcm_ghash_4bit,.-gcm_ghash_4bit | ||
| 342 | ___ | ||
| 343 | |||
| 344 | ###################################################################### | ||
| 345 | # PCLMULQDQ version. | ||
| 346 | |||
| 347 | @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order | ||
| 348 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | ||
| 349 | |||
| 350 | ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; | ||
| 351 | ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); | ||
| 352 | |||
| 353 | sub clmul64x64_T2 { # minimal register pressure | ||
| 354 | my ($Xhi,$Xi,$Hkey,$modulo)=@_; | ||
| 355 | |||
| 356 | $code.=<<___ if (!defined($modulo)); | ||
| 357 | movdqa $Xi,$Xhi # | ||
| 358 | pshufd \$0b01001110,$Xi,$T1 | ||
| 359 | pshufd \$0b01001110,$Hkey,$T2 | ||
| 360 | pxor $Xi,$T1 # | ||
| 361 | pxor $Hkey,$T2 | ||
| 362 | ___ | ||
| 363 | $code.=<<___; | ||
| 364 | pclmulqdq \$0x00,$Hkey,$Xi ####### | ||
| 365 | pclmulqdq \$0x11,$Hkey,$Xhi ####### | ||
| 366 | pclmulqdq \$0x00,$T2,$T1 ####### | ||
| 367 | pxor $Xi,$T1 # | ||
| 368 | pxor $Xhi,$T1 # | ||
| 369 | |||
| 370 | movdqa $T1,$T2 # | ||
| 371 | psrldq \$8,$T1 | ||
| 372 | pslldq \$8,$T2 # | ||
| 373 | pxor $T1,$Xhi | ||
| 374 | pxor $T2,$Xi # | ||
| 375 | ___ | ||
| 376 | } | ||
| 377 | |||
| 378 | sub reduction_alg9 { # 17/13 times faster than Intel version | ||
| 379 | my ($Xhi,$Xi) = @_; | ||
| 380 | |||
| 381 | $code.=<<___; | ||
| 382 | # 1st phase | ||
| 383 | movdqa $Xi,$T1 # | ||
| 384 | psllq \$1,$Xi | ||
| 385 | pxor $T1,$Xi # | ||
| 386 | psllq \$5,$Xi # | ||
| 387 | pxor $T1,$Xi # | ||
| 388 | psllq \$57,$Xi # | ||
| 389 | movdqa $Xi,$T2 # | ||
| 390 | pslldq \$8,$Xi | ||
| 391 | psrldq \$8,$T2 # | ||
| 392 | pxor $T1,$Xi | ||
| 393 | pxor $T2,$Xhi # | ||
| 394 | |||
| 395 | # 2nd phase | ||
| 396 | movdqa $Xi,$T2 | ||
| 397 | psrlq \$5,$Xi | ||
| 398 | pxor $T2,$Xi # | ||
| 399 | psrlq \$1,$Xi # | ||
| 400 | pxor $T2,$Xi # | ||
| 401 | pxor $Xhi,$T2 | ||
| 402 | psrlq \$1,$Xi # | ||
| 403 | pxor $T2,$Xi # | ||
| 404 | ___ | ||
| 405 | } | ||
| 406 | |||
| 407 | { my ($Htbl,$Xip)=@_4args; | ||
| 408 | |||
| 409 | $code.=<<___; | ||
| 410 | .globl gcm_init_clmul | ||
| 411 | .type gcm_init_clmul,\@abi-omnipotent | ||
| 412 | .align 16 | ||
| 413 | gcm_init_clmul: | ||
| 414 | movdqu ($Xip),$Hkey | ||
| 415 | pshufd \$0b01001110,$Hkey,$Hkey # dword swap | ||
| 416 | |||
| 417 | # <<1 twist | ||
| 418 | pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword | ||
| 419 | movdqa $Hkey,$T1 | ||
| 420 | psllq \$1,$Hkey | ||
| 421 | pxor $T3,$T3 # | ||
| 422 | psrlq \$63,$T1 | ||
| 423 | pcmpgtd $T2,$T3 # broadcast carry bit | ||
| 424 | pslldq \$8,$T1 | ||
| 425 | por $T1,$Hkey # H<<=1 | ||
| 426 | |||
| 427 | # magic reduction | ||
| 428 | pand .L0x1c2_polynomial(%rip),$T3 | ||
| 429 | pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial | ||
| 430 | |||
| 431 | # calculate H^2 | ||
| 432 | movdqa $Hkey,$Xi | ||
| 433 | ___ | ||
| 434 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
| 435 | &reduction_alg9 ($Xhi,$Xi); | ||
| 436 | $code.=<<___; | ||
| 437 | movdqu $Hkey,($Htbl) # save H | ||
| 438 | movdqu $Xi,16($Htbl) # save H^2 | ||
| 439 | ret | ||
| 440 | .size gcm_init_clmul,.-gcm_init_clmul | ||
| 441 | ___ | ||
| 442 | } | ||
| 443 | |||
| 444 | { my ($Xip,$Htbl)=@_4args; | ||
| 445 | |||
| 446 | $code.=<<___; | ||
| 447 | .globl gcm_gmult_clmul | ||
| 448 | .type gcm_gmult_clmul,\@abi-omnipotent | ||
| 449 | .align 16 | ||
| 450 | gcm_gmult_clmul: | ||
| 451 | movdqu ($Xip),$Xi | ||
| 452 | movdqa .Lbswap_mask(%rip),$T3 | ||
| 453 | movdqu ($Htbl),$Hkey | ||
| 454 | pshufb $T3,$Xi | ||
| 455 | ___ | ||
| 456 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
| 457 | &reduction_alg9 ($Xhi,$Xi); | ||
| 458 | $code.=<<___; | ||
| 459 | pshufb $T3,$Xi | ||
| 460 | movdqu $Xi,($Xip) | ||
| 461 | ret | ||
| 462 | .size gcm_gmult_clmul,.-gcm_gmult_clmul | ||
| 463 | ___ | ||
| 464 | } | ||
| 465 | |||
| 466 | { my ($Xip,$Htbl,$inp,$len)=@_4args; | ||
| 467 | my $Xn="%xmm6"; | ||
| 468 | my $Xhn="%xmm7"; | ||
| 469 | my $Hkey2="%xmm8"; | ||
| 470 | my $T1n="%xmm9"; | ||
| 471 | my $T2n="%xmm10"; | ||
| 472 | |||
| 473 | $code.=<<___; | ||
| 474 | .globl gcm_ghash_clmul | ||
| 475 | .type gcm_ghash_clmul,\@abi-omnipotent | ||
| 476 | .align 16 | ||
| 477 | gcm_ghash_clmul: | ||
| 478 | ___ | ||
| 479 | $code.=<<___ if ($win64); | ||
| 480 | .LSEH_begin_gcm_ghash_clmul: | ||
| 481 | # I can't trust assembler to use specific encoding:-( | ||
| 482 | .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp | ||
| 483 | .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) | ||
| 484 | .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) | ||
| 485 | .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp) | ||
| 486 | .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp) | ||
| 487 | .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp) | ||
| 488 | ___ | ||
| 489 | $code.=<<___; | ||
| 490 | movdqa .Lbswap_mask(%rip),$T3 | ||
| 491 | |||
| 492 | movdqu ($Xip),$Xi | ||
| 493 | movdqu ($Htbl),$Hkey | ||
| 494 | pshufb $T3,$Xi | ||
| 495 | |||
| 496 | sub \$0x10,$len | ||
| 497 | jz .Lodd_tail | ||
| 498 | |||
| 499 | movdqu 16($Htbl),$Hkey2 | ||
| 500 | ####### | ||
| 501 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = | ||
| 502 | # [(H*Ii+1) + (H*Xi+1)] mod P = | ||
| 503 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P | ||
| 504 | # | ||
| 505 | movdqu ($inp),$T1 # Ii | ||
| 506 | movdqu 16($inp),$Xn # Ii+1 | ||
| 507 | pshufb $T3,$T1 | ||
| 508 | pshufb $T3,$Xn | ||
| 509 | pxor $T1,$Xi # Ii+Xi | ||
| 510 | ___ | ||
| 511 | &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
| 512 | $code.=<<___; | ||
| 513 | movdqa $Xi,$Xhi # | ||
| 514 | pshufd \$0b01001110,$Xi,$T1 | ||
| 515 | pshufd \$0b01001110,$Hkey2,$T2 | ||
| 516 | pxor $Xi,$T1 # | ||
| 517 | pxor $Hkey2,$T2 | ||
| 518 | |||
| 519 | lea 32($inp),$inp # i+=2 | ||
| 520 | sub \$0x20,$len | ||
| 521 | jbe .Leven_tail | ||
| 522 | |||
| 523 | .Lmod_loop: | ||
| 524 | ___ | ||
| 525 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) | ||
| 526 | $code.=<<___; | ||
| 527 | movdqu ($inp),$T1 # Ii | ||
| 528 | pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 529 | pxor $Xhn,$Xhi | ||
| 530 | |||
| 531 | movdqu 16($inp),$Xn # Ii+1 | ||
| 532 | pshufb $T3,$T1 | ||
| 533 | pshufb $T3,$Xn | ||
| 534 | |||
| 535 | movdqa $Xn,$Xhn # | ||
| 536 | pshufd \$0b01001110,$Xn,$T1n | ||
| 537 | pshufd \$0b01001110,$Hkey,$T2n | ||
| 538 | pxor $Xn,$T1n # | ||
| 539 | pxor $Hkey,$T2n | ||
| 540 | pxor $T1,$Xhi # "Ii+Xi", consume early | ||
| 541 | |||
| 542 | movdqa $Xi,$T1 # 1st phase | ||
| 543 | psllq \$1,$Xi | ||
| 544 | pxor $T1,$Xi # | ||
| 545 | psllq \$5,$Xi # | ||
| 546 | pxor $T1,$Xi # | ||
| 547 | pclmulqdq \$0x00,$Hkey,$Xn ####### | ||
| 548 | psllq \$57,$Xi # | ||
| 549 | movdqa $Xi,$T2 # | ||
| 550 | pslldq \$8,$Xi | ||
| 551 | psrldq \$8,$T2 # | ||
| 552 | pxor $T1,$Xi | ||
| 553 | pxor $T2,$Xhi # | ||
| 554 | |||
| 555 | pclmulqdq \$0x11,$Hkey,$Xhn ####### | ||
| 556 | movdqa $Xi,$T2 # 2nd phase | ||
| 557 | psrlq \$5,$Xi | ||
| 558 | pxor $T2,$Xi # | ||
| 559 | psrlq \$1,$Xi # | ||
| 560 | pxor $T2,$Xi # | ||
| 561 | pxor $Xhi,$T2 | ||
| 562 | psrlq \$1,$Xi # | ||
| 563 | pxor $T2,$Xi # | ||
| 564 | |||
| 565 | pclmulqdq \$0x00,$T2n,$T1n ####### | ||
| 566 | movdqa $Xi,$Xhi # | ||
| 567 | pshufd \$0b01001110,$Xi,$T1 | ||
| 568 | pshufd \$0b01001110,$Hkey2,$T2 | ||
| 569 | pxor $Xi,$T1 # | ||
| 570 | pxor $Hkey2,$T2 | ||
| 571 | |||
| 572 | pxor $Xn,$T1n # | ||
| 573 | pxor $Xhn,$T1n # | ||
| 574 | movdqa $T1n,$T2n # | ||
| 575 | psrldq \$8,$T1n | ||
| 576 | pslldq \$8,$T2n # | ||
| 577 | pxor $T1n,$Xhn | ||
| 578 | pxor $T2n,$Xn # | ||
| 579 | |||
| 580 | lea 32($inp),$inp | ||
| 581 | sub \$0x20,$len | ||
| 582 | ja .Lmod_loop | ||
| 583 | |||
| 584 | .Leven_tail: | ||
| 585 | ___ | ||
| 586 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) | ||
| 587 | $code.=<<___; | ||
| 588 | pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) | ||
| 589 | pxor $Xhn,$Xhi | ||
| 590 | ___ | ||
| 591 | &reduction_alg9 ($Xhi,$Xi); | ||
| 592 | $code.=<<___; | ||
| 593 | test $len,$len | ||
| 594 | jnz .Ldone | ||
| 595 | |||
| 596 | .Lodd_tail: | ||
| 597 | movdqu ($inp),$T1 # Ii | ||
| 598 | pshufb $T3,$T1 | ||
| 599 | pxor $T1,$Xi # Ii+Xi | ||
| 600 | ___ | ||
| 601 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) | ||
| 602 | &reduction_alg9 ($Xhi,$Xi); | ||
| 603 | $code.=<<___; | ||
| 604 | .Ldone: | ||
| 605 | pshufb $T3,$Xi | ||
| 606 | movdqu $Xi,($Xip) | ||
| 607 | ___ | ||
| 608 | $code.=<<___ if ($win64); | ||
| 609 | movaps (%rsp),%xmm6 | ||
| 610 | movaps 0x10(%rsp),%xmm7 | ||
| 611 | movaps 0x20(%rsp),%xmm8 | ||
| 612 | movaps 0x30(%rsp),%xmm9 | ||
| 613 | movaps 0x40(%rsp),%xmm10 | ||
| 614 | add \$0x58,%rsp | ||
| 615 | ___ | ||
| 616 | $code.=<<___; | ||
| 617 | ret | ||
| 618 | .LSEH_end_gcm_ghash_clmul: | ||
| 619 | .size gcm_ghash_clmul,.-gcm_ghash_clmul | ||
| 620 | ___ | ||
| 621 | } | ||
| 622 | |||
| 623 | $code.=<<___; | ||
| 624 | .align 64 | ||
| 625 | .Lbswap_mask: | ||
| 626 | .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | ||
| 627 | .L0x1c2_polynomial: | ||
| 628 | .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 | ||
| 629 | .align 64 | ||
| 630 | .type .Lrem_4bit,\@object | ||
| 631 | .Lrem_4bit: | ||
| 632 | .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` | ||
| 633 | .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` | ||
| 634 | .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` | ||
| 635 | .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` | ||
| 636 | .type .Lrem_8bit,\@object | ||
| 637 | .Lrem_8bit: | ||
| 638 | .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E | ||
| 639 | .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E | ||
| 640 | .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E | ||
| 641 | .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E | ||
| 642 | .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E | ||
| 643 | .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E | ||
| 644 | .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E | ||
| 645 | .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E | ||
| 646 | .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE | ||
| 647 | .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE | ||
| 648 | .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE | ||
| 649 | .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE | ||
| 650 | .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E | ||
| 651 | .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E | ||
| 652 | .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE | ||
| 653 | .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE | ||
| 654 | .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E | ||
| 655 | .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E | ||
| 656 | .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E | ||
| 657 | .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E | ||
| 658 | .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E | ||
| 659 | .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E | ||
| 660 | .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E | ||
| 661 | .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E | ||
| 662 | .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE | ||
| 663 | .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE | ||
| 664 | .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE | ||
| 665 | .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE | ||
| 666 | .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E | ||
| 667 | .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E | ||
| 668 | .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE | ||
| 669 | .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE | ||
| 670 | |||
| 671 | .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 672 | .align 64 | ||
| 673 | ___ | ||
| 674 | |||
| 675 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 676 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 677 | if ($win64) { | ||
| 678 | $rec="%rcx"; | ||
| 679 | $frame="%rdx"; | ||
| 680 | $context="%r8"; | ||
| 681 | $disp="%r9"; | ||
| 682 | |||
| 683 | $code.=<<___; | ||
| 684 | .extern __imp_RtlVirtualUnwind | ||
| 685 | .type se_handler,\@abi-omnipotent | ||
| 686 | .align 16 | ||
| 687 | se_handler: | ||
| 688 | push %rsi | ||
| 689 | push %rdi | ||
| 690 | push %rbx | ||
| 691 | push %rbp | ||
| 692 | push %r12 | ||
| 693 | push %r13 | ||
| 694 | push %r14 | ||
| 695 | push %r15 | ||
| 696 | pushfq | ||
| 697 | sub \$64,%rsp | ||
| 698 | |||
| 699 | mov 120($context),%rax # pull context->Rax | ||
| 700 | mov 248($context),%rbx # pull context->Rip | ||
| 701 | |||
| 702 | mov 8($disp),%rsi # disp->ImageBase | ||
| 703 | mov 56($disp),%r11 # disp->HandlerData | ||
| 704 | |||
| 705 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 706 | lea (%rsi,%r10),%r10 # prologue label | ||
| 707 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 708 | jb .Lin_prologue | ||
| 709 | |||
| 710 | mov 152($context),%rax # pull context->Rsp | ||
| 711 | |||
| 712 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 713 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 714 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 715 | jae .Lin_prologue | ||
| 716 | |||
| 717 | lea 24(%rax),%rax # adjust "rsp" | ||
| 718 | |||
| 719 | mov -8(%rax),%rbx | ||
| 720 | mov -16(%rax),%rbp | ||
| 721 | mov -24(%rax),%r12 | ||
| 722 | mov %rbx,144($context) # restore context->Rbx | ||
| 723 | mov %rbp,160($context) # restore context->Rbp | ||
| 724 | mov %r12,216($context) # restore context->R12 | ||
| 725 | |||
| 726 | .Lin_prologue: | ||
| 727 | mov 8(%rax),%rdi | ||
| 728 | mov 16(%rax),%rsi | ||
| 729 | mov %rax,152($context) # restore context->Rsp | ||
| 730 | mov %rsi,168($context) # restore context->Rsi | ||
| 731 | mov %rdi,176($context) # restore context->Rdi | ||
| 732 | |||
| 733 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 734 | mov $context,%rsi # context | ||
| 735 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
| 736 | .long 0xa548f3fc # cld; rep movsq | ||
| 737 | |||
| 738 | mov $disp,%rsi | ||
| 739 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 740 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 741 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 742 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 743 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 744 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 745 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 746 | mov %r10,32(%rsp) # arg5 | ||
| 747 | mov %r11,40(%rsp) # arg6 | ||
| 748 | mov %r12,48(%rsp) # arg7 | ||
| 749 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 750 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 751 | |||
| 752 | mov \$1,%eax # ExceptionContinueSearch | ||
| 753 | add \$64,%rsp | ||
| 754 | popfq | ||
| 755 | pop %r15 | ||
| 756 | pop %r14 | ||
| 757 | pop %r13 | ||
| 758 | pop %r12 | ||
| 759 | pop %rbp | ||
| 760 | pop %rbx | ||
| 761 | pop %rdi | ||
| 762 | pop %rsi | ||
| 763 | ret | ||
| 764 | .size se_handler,.-se_handler | ||
| 765 | |||
| 766 | .section .pdata | ||
| 767 | .align 4 | ||
| 768 | .rva .LSEH_begin_gcm_gmult_4bit | ||
| 769 | .rva .LSEH_end_gcm_gmult_4bit | ||
| 770 | .rva .LSEH_info_gcm_gmult_4bit | ||
| 771 | |||
| 772 | .rva .LSEH_begin_gcm_ghash_4bit | ||
| 773 | .rva .LSEH_end_gcm_ghash_4bit | ||
| 774 | .rva .LSEH_info_gcm_ghash_4bit | ||
| 775 | |||
| 776 | .rva .LSEH_begin_gcm_ghash_clmul | ||
| 777 | .rva .LSEH_end_gcm_ghash_clmul | ||
| 778 | .rva .LSEH_info_gcm_ghash_clmul | ||
| 779 | |||
| 780 | .section .xdata | ||
| 781 | .align 8 | ||
| 782 | .LSEH_info_gcm_gmult_4bit: | ||
| 783 | .byte 9,0,0,0 | ||
| 784 | .rva se_handler | ||
| 785 | .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData | ||
| 786 | .LSEH_info_gcm_ghash_4bit: | ||
| 787 | .byte 9,0,0,0 | ||
| 788 | .rva se_handler | ||
| 789 | .rva .Lghash_prologue,.Lghash_epilogue # HandlerData | ||
| 790 | .LSEH_info_gcm_ghash_clmul: | ||
| 791 | .byte 0x01,0x1f,0x0b,0x00 | ||
| 792 | .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 | ||
| 793 | .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 | ||
| 794 | .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 | ||
| 795 | .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 | ||
| 796 | .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 | ||
| 797 | .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58 | ||
| 798 | ___ | ||
| 799 | } | ||
| 800 | |||
| 801 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 802 | |||
| 803 | print $code; | ||
| 804 | |||
| 805 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c index 8f8bd563b9..3d3782cbe1 100644 --- a/src/lib/libcrypto/modes/cbc128.c +++ b/src/lib/libcrypto/modes/cbc128.c | |||
| @@ -48,7 +48,8 @@ | |||
| 48 | * | 48 | * |
| 49 | */ | 49 | */ |
| 50 | 50 | ||
| 51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
| 52 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | 53 | #include <string.h> |
| 53 | 54 | ||
| 54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
| @@ -58,12 +59,7 @@ | |||
| 58 | #endif | 59 | #endif |
| 59 | #include <assert.h> | 60 | #include <assert.h> |
| 60 | 61 | ||
| 61 | #define STRICT_ALIGNMENT 1 | 62 | #ifndef STRICT_ALIGNMENT |
| 62 | #if defined(__i386) || defined(__i386__) || \ | ||
| 63 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 64 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 65 | defined(__s390__) || defined(__s390x__) | ||
| 66 | # undef STRICT_ALIGNMENT | ||
| 67 | # define STRICT_ALIGNMENT 0 | 63 | # define STRICT_ALIGNMENT 0 |
| 68 | #endif | 64 | #endif |
| 69 | 65 | ||
diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c new file mode 100644 index 0000000000..c9b35e5b35 --- /dev/null +++ b/src/lib/libcrypto/modes/ccm128.c | |||
| @@ -0,0 +1,441 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use in source and binary forms, with or without | ||
| 5 | * modification, are permitted provided that the following conditions | ||
| 6 | * are met: | ||
| 7 | * | ||
| 8 | * 1. Redistributions of source code must retain the above copyright | ||
| 9 | * notice, this list of conditions and the following disclaimer. | ||
| 10 | * | ||
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer in | ||
| 13 | * the documentation and/or other materials provided with the | ||
| 14 | * distribution. | ||
| 15 | * | ||
| 16 | * 3. All advertising materials mentioning features or use of this | ||
| 17 | * software must display the following acknowledgment: | ||
| 18 | * "This product includes software developed by the OpenSSL Project | ||
| 19 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 20 | * | ||
| 21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 22 | * endorse or promote products derived from this software without | ||
| 23 | * prior written permission. For written permission, please contact | ||
| 24 | * openssl-core@openssl.org. | ||
| 25 | * | ||
| 26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 27 | * nor may "OpenSSL" appear in their names without prior written | ||
| 28 | * permission of the OpenSSL Project. | ||
| 29 | * | ||
| 30 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 31 | * acknowledgment: | ||
| 32 | * "This product includes software developed by the OpenSSL Project | ||
| 33 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 34 | * | ||
| 35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | * ==================================================================== | ||
| 48 | */ | ||
| 49 | |||
| 50 | #include <openssl/crypto.h> | ||
| 51 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | ||
| 53 | |||
| 54 | #ifndef MODES_DEBUG | ||
| 55 | # ifndef NDEBUG | ||
| 56 | # define NDEBUG | ||
| 57 | # endif | ||
| 58 | #endif | ||
| 59 | #include <assert.h> | ||
| 60 | |||
| 61 | /* First you setup M and L parameters and pass the key schedule. | ||
| 62 | * This is called once per session setup... */ | ||
| 63 | void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, | ||
| 64 | unsigned int M,unsigned int L,void *key,block128_f block) | ||
| 65 | { | ||
| 66 | memset(ctx->nonce.c,0,sizeof(ctx->nonce.c)); | ||
| 67 | ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3; | ||
| 68 | ctx->blocks = 0; | ||
| 69 | ctx->block = block; | ||
| 70 | ctx->key = key; | ||
| 71 | } | ||
| 72 | |||
| 73 | /* !!! Following interfaces are to be called *once* per packet !!! */ | ||
| 74 | |||
| 75 | /* Then you setup per-message nonce and pass the length of the message */ | ||
| 76 | int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, | ||
| 77 | const unsigned char *nonce,size_t nlen,size_t mlen) | ||
| 78 | { | ||
| 79 | unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */ | ||
| 80 | |||
| 81 | if (nlen<(14-L)) return -1; /* nonce is too short */ | ||
| 82 | |||
| 83 | if (sizeof(mlen)==8 && L>=3) { | ||
| 84 | ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8))); | ||
| 85 | ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8))); | ||
| 86 | ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8))); | ||
| 87 | ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8))); | ||
| 88 | } | ||
| 89 | else | ||
| 90 | *(u32*)(&ctx->nonce.c[8]) = 0; | ||
| 91 | |||
| 92 | ctx->nonce.c[12] = (u8)(mlen>>24); | ||
| 93 | ctx->nonce.c[13] = (u8)(mlen>>16); | ||
| 94 | ctx->nonce.c[14] = (u8)(mlen>>8); | ||
| 95 | ctx->nonce.c[15] = (u8)mlen; | ||
| 96 | |||
| 97 | ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */ | ||
| 98 | memcpy(&ctx->nonce.c[1],nonce,14-L); | ||
| 99 | |||
| 100 | return 0; | ||
| 101 | } | ||
| 102 | |||
| 103 | /* Then you pass additional authentication data, this is optional */ | ||
| 104 | void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, | ||
| 105 | const unsigned char *aad,size_t alen) | ||
| 106 | { unsigned int i; | ||
| 107 | block128_f block = ctx->block; | ||
| 108 | |||
| 109 | if (alen==0) return; | ||
| 110 | |||
| 111 | ctx->nonce.c[0] |= 0x40; /* set Adata flag */ | ||
| 112 | (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key), | ||
| 113 | ctx->blocks++; | ||
| 114 | |||
| 115 | if (alen<(0x10000-0x100)) { | ||
| 116 | ctx->cmac.c[0] ^= (u8)(alen>>8); | ||
| 117 | ctx->cmac.c[1] ^= (u8)alen; | ||
| 118 | i=2; | ||
| 119 | } | ||
| 120 | else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) { | ||
| 121 | ctx->cmac.c[0] ^= 0xFF; | ||
| 122 | ctx->cmac.c[1] ^= 0xFF; | ||
| 123 | ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8))); | ||
| 124 | ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8))); | ||
| 125 | ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8))); | ||
| 126 | ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8))); | ||
| 127 | ctx->cmac.c[6] ^= (u8)(alen>>24); | ||
| 128 | ctx->cmac.c[7] ^= (u8)(alen>>16); | ||
| 129 | ctx->cmac.c[8] ^= (u8)(alen>>8); | ||
| 130 | ctx->cmac.c[9] ^= (u8)alen; | ||
| 131 | i=10; | ||
| 132 | } | ||
| 133 | else { | ||
| 134 | ctx->cmac.c[0] ^= 0xFF; | ||
| 135 | ctx->cmac.c[1] ^= 0xFE; | ||
| 136 | ctx->cmac.c[2] ^= (u8)(alen>>24); | ||
| 137 | ctx->cmac.c[3] ^= (u8)(alen>>16); | ||
| 138 | ctx->cmac.c[4] ^= (u8)(alen>>8); | ||
| 139 | ctx->cmac.c[5] ^= (u8)alen; | ||
| 140 | i=6; | ||
| 141 | } | ||
| 142 | |||
| 143 | do { | ||
| 144 | for(;i<16 && alen;++i,++aad,--alen) | ||
| 145 | ctx->cmac.c[i] ^= *aad; | ||
| 146 | (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key), | ||
| 147 | ctx->blocks++; | ||
| 148 | i=0; | ||
| 149 | } while (alen); | ||
| 150 | } | ||
| 151 | |||
| 152 | /* Finally you encrypt or decrypt the message */ | ||
| 153 | |||
| 154 | /* counter part of nonce may not be larger than L*8 bits, | ||
| 155 | * L is not larger than 8, therefore 64-bit counter... */ | ||
| 156 | static void ctr64_inc(unsigned char *counter) { | ||
| 157 | unsigned int n=8; | ||
| 158 | u8 c; | ||
| 159 | |||
| 160 | counter += 8; | ||
| 161 | do { | ||
| 162 | --n; | ||
| 163 | c = counter[n]; | ||
| 164 | ++c; | ||
| 165 | counter[n] = c; | ||
| 166 | if (c) return; | ||
| 167 | } while (n); | ||
| 168 | } | ||
| 169 | |||
| 170 | int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, | ||
| 171 | const unsigned char *inp, unsigned char *out, | ||
| 172 | size_t len) | ||
| 173 | { | ||
| 174 | size_t n; | ||
| 175 | unsigned int i,L; | ||
| 176 | unsigned char flags0 = ctx->nonce.c[0]; | ||
| 177 | block128_f block = ctx->block; | ||
| 178 | void * key = ctx->key; | ||
| 179 | union { u64 u[2]; u8 c[16]; } scratch; | ||
| 180 | |||
| 181 | if (!(flags0&0x40)) | ||
| 182 | (*block)(ctx->nonce.c,ctx->cmac.c,key), | ||
| 183 | ctx->blocks++; | ||
| 184 | |||
| 185 | ctx->nonce.c[0] = L = flags0&7; | ||
| 186 | for (n=0,i=15-L;i<15;++i) { | ||
| 187 | n |= ctx->nonce.c[i]; | ||
| 188 | ctx->nonce.c[i]=0; | ||
| 189 | n <<= 8; | ||
| 190 | } | ||
| 191 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
| 192 | ctx->nonce.c[15]=1; | ||
| 193 | |||
| 194 | if (n!=len) return -1; /* length mismatch */ | ||
| 195 | |||
| 196 | ctx->blocks += ((len+15)>>3)|1; | ||
| 197 | if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */ | ||
| 198 | |||
| 199 | while (len>=16) { | ||
| 200 | #if defined(STRICT_ALIGNMENT) | ||
| 201 | union { u64 u[2]; u8 c[16]; } temp; | ||
| 202 | |||
| 203 | memcpy (temp.c,inp,16); | ||
| 204 | ctx->cmac.u[0] ^= temp.u[0]; | ||
| 205 | ctx->cmac.u[1] ^= temp.u[1]; | ||
| 206 | #else | ||
| 207 | ctx->cmac.u[0] ^= ((u64*)inp)[0]; | ||
| 208 | ctx->cmac.u[1] ^= ((u64*)inp)[1]; | ||
| 209 | #endif | ||
| 210 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 211 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 212 | ctr64_inc(ctx->nonce.c); | ||
| 213 | #if defined(STRICT_ALIGNMENT) | ||
| 214 | temp.u[0] ^= scratch.u[0]; | ||
| 215 | temp.u[1] ^= scratch.u[1]; | ||
| 216 | memcpy(out,temp.c,16); | ||
| 217 | #else | ||
| 218 | ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]; | ||
| 219 | ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]; | ||
| 220 | #endif | ||
| 221 | inp += 16; | ||
| 222 | out += 16; | ||
| 223 | len -= 16; | ||
| 224 | } | ||
| 225 | |||
| 226 | if (len) { | ||
| 227 | for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; | ||
| 228 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 229 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 230 | for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i]; | ||
| 231 | } | ||
| 232 | |||
| 233 | for (i=15-L;i<16;++i) | ||
| 234 | ctx->nonce.c[i]=0; | ||
| 235 | |||
| 236 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 237 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
| 238 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
| 239 | |||
| 240 | ctx->nonce.c[0] = flags0; | ||
| 241 | |||
| 242 | return 0; | ||
| 243 | } | ||
| 244 | |||
| 245 | int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, | ||
| 246 | const unsigned char *inp, unsigned char *out, | ||
| 247 | size_t len) | ||
| 248 | { | ||
| 249 | size_t n; | ||
| 250 | unsigned int i,L; | ||
| 251 | unsigned char flags0 = ctx->nonce.c[0]; | ||
| 252 | block128_f block = ctx->block; | ||
| 253 | void * key = ctx->key; | ||
| 254 | union { u64 u[2]; u8 c[16]; } scratch; | ||
| 255 | |||
| 256 | if (!(flags0&0x40)) | ||
| 257 | (*block)(ctx->nonce.c,ctx->cmac.c,key); | ||
| 258 | |||
| 259 | ctx->nonce.c[0] = L = flags0&7; | ||
| 260 | for (n=0,i=15-L;i<15;++i) { | ||
| 261 | n |= ctx->nonce.c[i]; | ||
| 262 | ctx->nonce.c[i]=0; | ||
| 263 | n <<= 8; | ||
| 264 | } | ||
| 265 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
| 266 | ctx->nonce.c[15]=1; | ||
| 267 | |||
| 268 | if (n!=len) return -1; | ||
| 269 | |||
| 270 | while (len>=16) { | ||
| 271 | #if defined(STRICT_ALIGNMENT) | ||
| 272 | union { u64 u[2]; u8 c[16]; } temp; | ||
| 273 | #endif | ||
| 274 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 275 | ctr64_inc(ctx->nonce.c); | ||
| 276 | #if defined(STRICT_ALIGNMENT) | ||
| 277 | memcpy (temp.c,inp,16); | ||
| 278 | ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]); | ||
| 279 | ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]); | ||
| 280 | memcpy (out,scratch.c,16); | ||
| 281 | #else | ||
| 282 | ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]); | ||
| 283 | ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]); | ||
| 284 | #endif | ||
| 285 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 286 | |||
| 287 | inp += 16; | ||
| 288 | out += 16; | ||
| 289 | len -= 16; | ||
| 290 | } | ||
| 291 | |||
| 292 | if (len) { | ||
| 293 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 294 | for (i=0; i<len; ++i) | ||
| 295 | ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); | ||
| 296 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 297 | } | ||
| 298 | |||
| 299 | for (i=15-L;i<16;++i) | ||
| 300 | ctx->nonce.c[i]=0; | ||
| 301 | |||
| 302 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 303 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
| 304 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
| 305 | |||
| 306 | ctx->nonce.c[0] = flags0; | ||
| 307 | |||
| 308 | return 0; | ||
| 309 | } | ||
| 310 | |||
| 311 | static void ctr64_add (unsigned char *counter,size_t inc) | ||
| 312 | { size_t n=8, val=0; | ||
| 313 | |||
| 314 | counter += 8; | ||
| 315 | do { | ||
| 316 | --n; | ||
| 317 | val += counter[n] + (inc&0xff); | ||
| 318 | counter[n] = (unsigned char)val; | ||
| 319 | val >>= 8; /* carry bit */ | ||
| 320 | inc >>= 8; | ||
| 321 | } while(n && (inc || val)); | ||
| 322 | } | ||
| 323 | |||
| 324 | int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, | ||
| 325 | const unsigned char *inp, unsigned char *out, | ||
| 326 | size_t len,ccm128_f stream) | ||
| 327 | { | ||
| 328 | size_t n; | ||
| 329 | unsigned int i,L; | ||
| 330 | unsigned char flags0 = ctx->nonce.c[0]; | ||
| 331 | block128_f block = ctx->block; | ||
| 332 | void * key = ctx->key; | ||
| 333 | union { u64 u[2]; u8 c[16]; } scratch; | ||
| 334 | |||
| 335 | if (!(flags0&0x40)) | ||
| 336 | (*block)(ctx->nonce.c,ctx->cmac.c,key), | ||
| 337 | ctx->blocks++; | ||
| 338 | |||
| 339 | ctx->nonce.c[0] = L = flags0&7; | ||
| 340 | for (n=0,i=15-L;i<15;++i) { | ||
| 341 | n |= ctx->nonce.c[i]; | ||
| 342 | ctx->nonce.c[i]=0; | ||
| 343 | n <<= 8; | ||
| 344 | } | ||
| 345 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
| 346 | ctx->nonce.c[15]=1; | ||
| 347 | |||
| 348 | if (n!=len) return -1; /* length mismatch */ | ||
| 349 | |||
| 350 | ctx->blocks += ((len+15)>>3)|1; | ||
| 351 | if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */ | ||
| 352 | |||
| 353 | if ((n=len/16)) { | ||
| 354 | (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); | ||
| 355 | n *= 16; | ||
| 356 | inp += n; | ||
| 357 | out += n; | ||
| 358 | len -= n; | ||
| 359 | if (len) ctr64_add(ctx->nonce.c,n/16); | ||
| 360 | } | ||
| 361 | |||
| 362 | if (len) { | ||
| 363 | for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; | ||
| 364 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 365 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 366 | for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i]; | ||
| 367 | } | ||
| 368 | |||
| 369 | for (i=15-L;i<16;++i) | ||
| 370 | ctx->nonce.c[i]=0; | ||
| 371 | |||
| 372 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 373 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
| 374 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
| 375 | |||
| 376 | ctx->nonce.c[0] = flags0; | ||
| 377 | |||
| 378 | return 0; | ||
| 379 | } | ||
| 380 | |||
| 381 | int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, | ||
| 382 | const unsigned char *inp, unsigned char *out, | ||
| 383 | size_t len,ccm128_f stream) | ||
| 384 | { | ||
| 385 | size_t n; | ||
| 386 | unsigned int i,L; | ||
| 387 | unsigned char flags0 = ctx->nonce.c[0]; | ||
| 388 | block128_f block = ctx->block; | ||
| 389 | void * key = ctx->key; | ||
| 390 | union { u64 u[2]; u8 c[16]; } scratch; | ||
| 391 | |||
| 392 | if (!(flags0&0x40)) | ||
| 393 | (*block)(ctx->nonce.c,ctx->cmac.c,key); | ||
| 394 | |||
| 395 | ctx->nonce.c[0] = L = flags0&7; | ||
| 396 | for (n=0,i=15-L;i<15;++i) { | ||
| 397 | n |= ctx->nonce.c[i]; | ||
| 398 | ctx->nonce.c[i]=0; | ||
| 399 | n <<= 8; | ||
| 400 | } | ||
| 401 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
| 402 | ctx->nonce.c[15]=1; | ||
| 403 | |||
| 404 | if (n!=len) return -1; | ||
| 405 | |||
| 406 | if ((n=len/16)) { | ||
| 407 | (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); | ||
| 408 | n *= 16; | ||
| 409 | inp += n; | ||
| 410 | out += n; | ||
| 411 | len -= n; | ||
| 412 | if (len) ctr64_add(ctx->nonce.c,n/16); | ||
| 413 | } | ||
| 414 | |||
| 415 | if (len) { | ||
| 416 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 417 | for (i=0; i<len; ++i) | ||
| 418 | ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); | ||
| 419 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
| 420 | } | ||
| 421 | |||
| 422 | for (i=15-L;i<16;++i) | ||
| 423 | ctx->nonce.c[i]=0; | ||
| 424 | |||
| 425 | (*block)(ctx->nonce.c,scratch.c,key); | ||
| 426 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
| 427 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
| 428 | |||
| 429 | ctx->nonce.c[0] = flags0; | ||
| 430 | |||
| 431 | return 0; | ||
| 432 | } | ||
| 433 | |||
| 434 | size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len) | ||
| 435 | { unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */ | ||
| 436 | |||
| 437 | M *= 2; M += 2; | ||
| 438 | if (len<M) return 0; | ||
| 439 | memcpy(tag,ctx->cmac.c,M); | ||
| 440 | return M; | ||
| 441 | } | ||
diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c index e5938c6137..4e6f5d35e1 100644 --- a/src/lib/libcrypto/modes/cfb128.c +++ b/src/lib/libcrypto/modes/cfb128.c | |||
| @@ -48,7 +48,8 @@ | |||
| 48 | * | 48 | * |
| 49 | */ | 49 | */ |
| 50 | 50 | ||
| 51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
| 52 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | 53 | #include <string.h> |
| 53 | 54 | ||
| 54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
| @@ -58,14 +59,6 @@ | |||
| 58 | #endif | 59 | #endif |
| 59 | #include <assert.h> | 60 | #include <assert.h> |
| 60 | 61 | ||
| 61 | #define STRICT_ALIGNMENT | ||
| 62 | #if defined(__i386) || defined(__i386__) || \ | ||
| 63 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 64 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 65 | defined(__s390__) || defined(__s390x__) | ||
| 66 | # undef STRICT_ALIGNMENT | ||
| 67 | #endif | ||
| 68 | |||
| 69 | /* The input and output encrypted as though 128bit cfb mode is being | 62 | /* The input and output encrypted as though 128bit cfb mode is being |
| 70 | * used. The extra state information to record how much of the | 63 | * used. The extra state information to record how much of the |
| 71 | * 128bit block we have used is contained in *num; | 64 | * 128bit block we have used is contained in *num; |
diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c index 932037f551..ee642c5863 100644 --- a/src/lib/libcrypto/modes/ctr128.c +++ b/src/lib/libcrypto/modes/ctr128.c | |||
| @@ -48,7 +48,8 @@ | |||
| 48 | * | 48 | * |
| 49 | */ | 49 | */ |
| 50 | 50 | ||
| 51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
| 52 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | 53 | #include <string.h> |
| 53 | 54 | ||
| 54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
| @@ -58,17 +59,6 @@ | |||
| 58 | #endif | 59 | #endif |
| 59 | #include <assert.h> | 60 | #include <assert.h> |
| 60 | 61 | ||
| 61 | typedef unsigned int u32; | ||
| 62 | typedef unsigned char u8; | ||
| 63 | |||
| 64 | #define STRICT_ALIGNMENT | ||
| 65 | #if defined(__i386) || defined(__i386__) || \ | ||
| 66 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 67 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 68 | defined(__s390__) || defined(__s390x__) | ||
| 69 | # undef STRICT_ALIGNMENT | ||
| 70 | #endif | ||
| 71 | |||
| 72 | /* NOTE: the IV/counter CTR mode is big-endian. The code itself | 62 | /* NOTE: the IV/counter CTR mode is big-endian. The code itself |
| 73 | * is endian-neutral. */ | 63 | * is endian-neutral. */ |
| 74 | 64 | ||
| @@ -182,3 +172,81 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, | |||
| 182 | 172 | ||
| 183 | *num=n; | 173 | *num=n; |
| 184 | } | 174 | } |
| 175 | |||
| 176 | /* increment upper 96 bits of 128-bit counter by 1 */ | ||
| 177 | static void ctr96_inc(unsigned char *counter) { | ||
| 178 | u32 n=12; | ||
| 179 | u8 c; | ||
| 180 | |||
| 181 | do { | ||
| 182 | --n; | ||
| 183 | c = counter[n]; | ||
| 184 | ++c; | ||
| 185 | counter[n] = c; | ||
| 186 | if (c) return; | ||
| 187 | } while (n); | ||
| 188 | } | ||
| 189 | |||
| 190 | void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, | ||
| 191 | size_t len, const void *key, | ||
| 192 | unsigned char ivec[16], unsigned char ecount_buf[16], | ||
| 193 | unsigned int *num, ctr128_f func) | ||
| 194 | { | ||
| 195 | unsigned int n,ctr32; | ||
| 196 | |||
| 197 | assert(in && out && key && ecount_buf && num); | ||
| 198 | assert(*num < 16); | ||
| 199 | |||
| 200 | n = *num; | ||
| 201 | |||
| 202 | while (n && len) { | ||
| 203 | *(out++) = *(in++) ^ ecount_buf[n]; | ||
| 204 | --len; | ||
| 205 | n = (n+1) % 16; | ||
| 206 | } | ||
| 207 | |||
| 208 | ctr32 = GETU32(ivec+12); | ||
| 209 | while (len>=16) { | ||
| 210 | size_t blocks = len/16; | ||
| 211 | /* | ||
| 212 | * 1<<28 is just a not-so-small yet not-so-large number... | ||
| 213 | * Below condition is practically never met, but it has to | ||
| 214 | * be checked for code correctness. | ||
| 215 | */ | ||
| 216 | if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28)) | ||
| 217 | blocks = (1U<<28); | ||
| 218 | /* | ||
| 219 | * As (*func) operates on 32-bit counter, caller | ||
| 220 | * has to handle overflow. 'if' below detects the | ||
| 221 | * overflow, which is then handled by limiting the | ||
| 222 | * amount of blocks to the exact overflow point... | ||
| 223 | */ | ||
| 224 | ctr32 += (u32)blocks; | ||
| 225 | if (ctr32 < blocks) { | ||
| 226 | blocks -= ctr32; | ||
| 227 | ctr32 = 0; | ||
| 228 | } | ||
| 229 | (*func)(in,out,blocks,key,ivec); | ||
| 230 | /* (*ctr) does not update ivec, caller does: */ | ||
| 231 | PUTU32(ivec+12,ctr32); | ||
| 232 | /* ... overflow was detected, propogate carry. */ | ||
| 233 | if (ctr32 == 0) ctr96_inc(ivec); | ||
| 234 | blocks *= 16; | ||
| 235 | len -= blocks; | ||
| 236 | out += blocks; | ||
| 237 | in += blocks; | ||
| 238 | } | ||
| 239 | if (len) { | ||
| 240 | memset(ecount_buf,0,16); | ||
| 241 | (*func)(ecount_buf,ecount_buf,1,key,ivec); | ||
| 242 | ++ctr32; | ||
| 243 | PUTU32(ivec+12,ctr32); | ||
| 244 | if (ctr32 == 0) ctr96_inc(ivec); | ||
| 245 | while (len--) { | ||
| 246 | out[n] = in[n] ^ ecount_buf[n]; | ||
| 247 | ++n; | ||
| 248 | } | ||
| 249 | } | ||
| 250 | |||
| 251 | *num=n; | ||
| 252 | } | ||
diff --git a/src/lib/libcrypto/modes/cts128.c b/src/lib/libcrypto/modes/cts128.c index e0430f9fdc..c0e1f3696c 100644 --- a/src/lib/libcrypto/modes/cts128.c +++ b/src/lib/libcrypto/modes/cts128.c | |||
| @@ -5,7 +5,8 @@ | |||
| 5 | * forms are granted according to the OpenSSL license. | 5 | * forms are granted according to the OpenSSL license. |
| 6 | */ | 6 | */ |
| 7 | 7 | ||
| 8 | #include "modes.h" | 8 | #include <openssl/crypto.h> |
| 9 | #include "modes_lcl.h" | ||
| 9 | #include <string.h> | 10 | #include <string.h> |
| 10 | 11 | ||
| 11 | #ifndef MODES_DEBUG | 12 | #ifndef MODES_DEBUG |
| @@ -23,8 +24,9 @@ | |||
| 23 | * deviates from mentioned RFCs. Most notably it allows input to be | 24 | * deviates from mentioned RFCs. Most notably it allows input to be |
| 24 | * of block length and it doesn't flip the order of the last two | 25 | * of block length and it doesn't flip the order of the last two |
| 25 | * blocks. CTS is being discussed even in ECB context, but it's not | 26 | * blocks. CTS is being discussed even in ECB context, but it's not |
| 26 | * adopted for any known application. This implementation complies | 27 | * adopted for any known application. This implementation provides |
| 27 | * with mentioned RFCs and [as such] extends CBC mode. | 28 | * two interfaces: one compliant with above mentioned RFCs and one |
| 29 | * compliant with the NIST proposal, both extending CBC mode. | ||
| 28 | */ | 30 | */ |
| 29 | 31 | ||
| 30 | size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, | 32 | size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, |
| @@ -54,6 +56,34 @@ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, | |||
| 54 | return len+residue; | 56 | return len+residue; |
| 55 | } | 57 | } |
| 56 | 58 | ||
| 59 | size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, | ||
| 60 | size_t len, const void *key, | ||
| 61 | unsigned char ivec[16], block128_f block) | ||
| 62 | { size_t residue, n; | ||
| 63 | |||
| 64 | assert (in && out && key && ivec); | ||
| 65 | |||
| 66 | if (len < 16) return 0; | ||
| 67 | |||
| 68 | residue=len%16; | ||
| 69 | |||
| 70 | len -= residue; | ||
| 71 | |||
| 72 | CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block); | ||
| 73 | |||
| 74 | if (residue==0) return len; | ||
| 75 | |||
| 76 | in += len; | ||
| 77 | out += len; | ||
| 78 | |||
| 79 | for (n=0; n<residue; ++n) | ||
| 80 | ivec[n] ^= in[n]; | ||
| 81 | (*block)(ivec,ivec,key); | ||
| 82 | memcpy(out-16+residue,ivec,16); | ||
| 83 | |||
| 84 | return len+residue; | ||
| 85 | } | ||
| 86 | |||
| 57 | size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, | 87 | size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, |
| 58 | size_t len, const void *key, | 88 | size_t len, const void *key, |
| 59 | unsigned char ivec[16], cbc128_f cbc) | 89 | unsigned char ivec[16], cbc128_f cbc) |
| @@ -90,6 +120,41 @@ size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, | |||
| 90 | return len+residue; | 120 | return len+residue; |
| 91 | } | 121 | } |
| 92 | 122 | ||
| 123 | size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, | ||
| 124 | size_t len, const void *key, | ||
| 125 | unsigned char ivec[16], cbc128_f cbc) | ||
| 126 | { size_t residue; | ||
| 127 | union { size_t align; unsigned char c[16]; } tmp; | ||
| 128 | |||
| 129 | assert (in && out && key && ivec); | ||
| 130 | |||
| 131 | if (len < 16) return 0; | ||
| 132 | |||
| 133 | residue=len%16; | ||
| 134 | |||
| 135 | len -= residue; | ||
| 136 | |||
| 137 | (*cbc)(in,out,len,key,ivec,1); | ||
| 138 | |||
| 139 | if (residue==0) return len; | ||
| 140 | |||
| 141 | in += len; | ||
| 142 | out += len; | ||
| 143 | |||
| 144 | #if defined(CBC_HANDLES_TRUNCATED_IO) | ||
| 145 | (*cbc)(in,out-16+residue,residue,key,ivec,1); | ||
| 146 | #else | ||
| 147 | { | ||
| 148 | size_t n; | ||
| 149 | for (n=0; n<16; n+=sizeof(size_t)) | ||
| 150 | *(size_t *)(tmp.c+n) = 0; | ||
| 151 | memcpy(tmp.c,in,residue); | ||
| 152 | } | ||
| 153 | (*cbc)(tmp.c,out-16+residue,16,key,ivec,1); | ||
| 154 | #endif | ||
| 155 | return len+residue; | ||
| 156 | } | ||
| 157 | |||
| 93 | size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, | 158 | size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, |
| 94 | size_t len, const void *key, | 159 | size_t len, const void *key, |
| 95 | unsigned char ivec[16], block128_f block) | 160 | unsigned char ivec[16], block128_f block) |
| @@ -125,7 +190,51 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, | |||
| 125 | for(residue+=16; n<residue; ++n) | 190 | for(residue+=16; n<residue; ++n) |
| 126 | out[n] = tmp.c[n] ^ in[n]; | 191 | out[n] = tmp.c[n] ^ in[n]; |
| 127 | 192 | ||
| 128 | return len+residue-16; | 193 | return 16+len+residue; |
| 194 | } | ||
| 195 | |||
| 196 | size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, | ||
| 197 | size_t len, const void *key, | ||
| 198 | unsigned char ivec[16], block128_f block) | ||
| 199 | { size_t residue, n; | ||
| 200 | union { size_t align; unsigned char c[32]; } tmp; | ||
| 201 | |||
| 202 | assert (in && out && key && ivec); | ||
| 203 | |||
| 204 | if (len<16) return 0; | ||
| 205 | |||
| 206 | residue=len%16; | ||
| 207 | |||
| 208 | if (residue==0) { | ||
| 209 | CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); | ||
| 210 | return len; | ||
| 211 | } | ||
| 212 | |||
| 213 | len -= 16+residue; | ||
| 214 | |||
| 215 | if (len) { | ||
| 216 | CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); | ||
| 217 | in += len; | ||
| 218 | out += len; | ||
| 219 | } | ||
| 220 | |||
| 221 | (*block)(in+residue,tmp.c+16,key); | ||
| 222 | |||
| 223 | for (n=0; n<16; n+=sizeof(size_t)) | ||
| 224 | *(size_t *)(tmp.c+n) = *(size_t *)(tmp.c+16+n); | ||
| 225 | memcpy(tmp.c,in,residue); | ||
| 226 | (*block)(tmp.c,tmp.c,key); | ||
| 227 | |||
| 228 | for(n=0; n<16; ++n) { | ||
| 229 | unsigned char c = in[n]; | ||
| 230 | out[n] = tmp.c[n] ^ ivec[n]; | ||
| 231 | ivec[n] = in[n+residue]; | ||
| 232 | tmp.c[n] = c; | ||
| 233 | } | ||
| 234 | for(residue+=16; n<residue; ++n) | ||
| 235 | out[n] = tmp.c[n] ^ tmp.c[n-16]; | ||
| 236 | |||
| 237 | return 16+len+residue; | ||
| 129 | } | 238 | } |
| 130 | 239 | ||
| 131 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, | 240 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, |
| @@ -160,7 +269,47 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, | |||
| 160 | (*cbc)(tmp.c,tmp.c,32,key,ivec,0); | 269 | (*cbc)(tmp.c,tmp.c,32,key,ivec,0); |
| 161 | memcpy(out,tmp.c,16+residue); | 270 | memcpy(out,tmp.c,16+residue); |
| 162 | #endif | 271 | #endif |
| 163 | return len+residue; | 272 | return 16+len+residue; |
| 273 | } | ||
| 274 | |||
| 275 | size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, | ||
| 276 | size_t len, const void *key, | ||
| 277 | unsigned char ivec[16], cbc128_f cbc) | ||
| 278 | { size_t residue, n; | ||
| 279 | union { size_t align; unsigned char c[32]; } tmp; | ||
| 280 | |||
| 281 | assert (in && out && key && ivec); | ||
| 282 | |||
| 283 | if (len<16) return 0; | ||
| 284 | |||
| 285 | residue=len%16; | ||
| 286 | |||
| 287 | if (residue==0) { | ||
| 288 | (*cbc)(in,out,len,key,ivec,0); | ||
| 289 | return len; | ||
| 290 | } | ||
| 291 | |||
| 292 | len -= 16+residue; | ||
| 293 | |||
| 294 | if (len) { | ||
| 295 | (*cbc)(in,out,len,key,ivec,0); | ||
| 296 | in += len; | ||
| 297 | out += len; | ||
| 298 | } | ||
| 299 | |||
| 300 | for (n=16; n<32; n+=sizeof(size_t)) | ||
| 301 | *(size_t *)(tmp.c+n) = 0; | ||
| 302 | /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */ | ||
| 303 | (*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0); | ||
| 304 | |||
| 305 | memcpy(tmp.c,in,residue); | ||
| 306 | #if defined(CBC_HANDLES_TRUNCATED_IO) | ||
| 307 | (*cbc)(tmp.c,out,16+residue,key,ivec,0); | ||
| 308 | #else | ||
| 309 | (*cbc)(tmp.c,tmp.c,32,key,ivec,0); | ||
| 310 | memcpy(out,tmp.c,16+residue); | ||
| 311 | #endif | ||
| 312 | return 16+len+residue; | ||
| 164 | } | 313 | } |
| 165 | 314 | ||
| 166 | #if defined(SELFTEST) | 315 | #if defined(SELFTEST) |
| @@ -200,9 +349,8 @@ static const unsigned char vector_64[64] = | |||
| 200 | static AES_KEY encks, decks; | 349 | static AES_KEY encks, decks; |
| 201 | 350 | ||
| 202 | void test_vector(const unsigned char *vector,size_t len) | 351 | void test_vector(const unsigned char *vector,size_t len) |
| 203 | { unsigned char cleartext[64]; | 352 | { unsigned char iv[sizeof(test_iv)]; |
| 204 | unsigned char iv[sizeof(test_iv)]; | 353 | unsigned char cleartext[64],ciphertext[64]; |
| 205 | unsigned char ciphertext[64]; | ||
| 206 | size_t tail; | 354 | size_t tail; |
| 207 | 355 | ||
| 208 | printf("vector_%d\n",len); fflush(stdout); | 356 | printf("vector_%d\n",len); fflush(stdout); |
| @@ -243,7 +391,57 @@ void test_vector(const unsigned char *vector,size_t len) | |||
| 243 | fprintf(stderr,"iv_%d mismatch\n",len), exit(4); | 391 | fprintf(stderr,"iv_%d mismatch\n",len), exit(4); |
| 244 | } | 392 | } |
| 245 | 393 | ||
| 246 | main() | 394 | void test_nistvector(const unsigned char *vector,size_t len) |
| 395 | { unsigned char iv[sizeof(test_iv)]; | ||
| 396 | unsigned char cleartext[64],ciphertext[64],nistvector[64]; | ||
| 397 | size_t tail; | ||
| 398 | |||
| 399 | printf("nistvector_%d\n",len); fflush(stdout); | ||
| 400 | |||
| 401 | if ((tail=len%16) == 0) tail = 16; | ||
| 402 | |||
| 403 | len -= 16 + tail; | ||
| 404 | memcpy(nistvector,vector,len); | ||
| 405 | /* flip two last blocks */ | ||
| 406 | memcpy(nistvector+len,vector+len+16,tail); | ||
| 407 | memcpy(nistvector+len+tail,vector+len,16); | ||
| 408 | len += 16 + tail; | ||
| 409 | tail = 16; | ||
| 410 | |||
| 411 | /* test block-based encryption */ | ||
| 412 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
| 413 | CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt); | ||
| 414 | if (memcmp(ciphertext,nistvector,len)) | ||
| 415 | fprintf(stderr,"output_%d mismatch\n",len), exit(1); | ||
| 416 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
| 417 | fprintf(stderr,"iv_%d mismatch\n",len), exit(1); | ||
| 418 | |||
| 419 | /* test block-based decryption */ | ||
| 420 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
| 421 | CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt); | ||
| 422 | if (memcmp(cleartext,test_input,len)) | ||
| 423 | fprintf(stderr,"input_%d mismatch\n",len), exit(2); | ||
| 424 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
| 425 | fprintf(stderr,"iv_%d mismatch\n",len), exit(2); | ||
| 426 | |||
| 427 | /* test streamed encryption */ | ||
| 428 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
| 429 | CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt); | ||
| 430 | if (memcmp(ciphertext,nistvector,len)) | ||
| 431 | fprintf(stderr,"output_%d mismatch\n",len), exit(3); | ||
| 432 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
| 433 | fprintf(stderr,"iv_%d mismatch\n",len), exit(3); | ||
| 434 | |||
| 435 | /* test streamed decryption */ | ||
| 436 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
| 437 | CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt); | ||
| 438 | if (memcmp(cleartext,test_input,len)) | ||
| 439 | fprintf(stderr,"input_%d mismatch\n",len), exit(4); | ||
| 440 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
| 441 | fprintf(stderr,"iv_%d mismatch\n",len), exit(4); | ||
| 442 | } | ||
| 443 | |||
| 444 | int main() | ||
| 247 | { | 445 | { |
| 248 | AES_set_encrypt_key(test_key,128,&encks); | 446 | AES_set_encrypt_key(test_key,128,&encks); |
| 249 | AES_set_decrypt_key(test_key,128,&decks); | 447 | AES_set_decrypt_key(test_key,128,&decks); |
| @@ -254,6 +452,14 @@ main() | |||
| 254 | test_vector(vector_47,sizeof(vector_47)); | 452 | test_vector(vector_47,sizeof(vector_47)); |
| 255 | test_vector(vector_48,sizeof(vector_48)); | 453 | test_vector(vector_48,sizeof(vector_48)); |
| 256 | test_vector(vector_64,sizeof(vector_64)); | 454 | test_vector(vector_64,sizeof(vector_64)); |
| 257 | exit(0); | 455 | |
| 456 | test_nistvector(vector_17,sizeof(vector_17)); | ||
| 457 | test_nistvector(vector_31,sizeof(vector_31)); | ||
| 458 | test_nistvector(vector_32,sizeof(vector_32)); | ||
| 459 | test_nistvector(vector_47,sizeof(vector_47)); | ||
| 460 | test_nistvector(vector_48,sizeof(vector_48)); | ||
| 461 | test_nistvector(vector_64,sizeof(vector_64)); | ||
| 462 | |||
| 463 | return 0; | ||
| 258 | } | 464 | } |
| 259 | #endif | 465 | #endif |
diff --git a/src/lib/libcrypto/modes/gcm128.c b/src/lib/libcrypto/modes/gcm128.c new file mode 100644 index 0000000000..7d6d034970 --- /dev/null +++ b/src/lib/libcrypto/modes/gcm128.c | |||
| @@ -0,0 +1,1757 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use in source and binary forms, with or without | ||
| 5 | * modification, are permitted provided that the following conditions | ||
| 6 | * are met: | ||
| 7 | * | ||
| 8 | * 1. Redistributions of source code must retain the above copyright | ||
| 9 | * notice, this list of conditions and the following disclaimer. | ||
| 10 | * | ||
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer in | ||
| 13 | * the documentation and/or other materials provided with the | ||
| 14 | * distribution. | ||
| 15 | * | ||
| 16 | * 3. All advertising materials mentioning features or use of this | ||
| 17 | * software must display the following acknowledgment: | ||
| 18 | * "This product includes software developed by the OpenSSL Project | ||
| 19 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 20 | * | ||
| 21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 22 | * endorse or promote products derived from this software without | ||
| 23 | * prior written permission. For written permission, please contact | ||
| 24 | * openssl-core@openssl.org. | ||
| 25 | * | ||
| 26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 27 | * nor may "OpenSSL" appear in their names without prior written | ||
| 28 | * permission of the OpenSSL Project. | ||
| 29 | * | ||
| 30 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 31 | * acknowledgment: | ||
| 32 | * "This product includes software developed by the OpenSSL Project | ||
| 33 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 34 | * | ||
| 35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | * ==================================================================== | ||
| 48 | */ | ||
| 49 | |||
| 50 | #define OPENSSL_FIPSAPI | ||
| 51 | |||
| 52 | #include <openssl/crypto.h> | ||
| 53 | #include "modes_lcl.h" | ||
| 54 | #include <string.h> | ||
| 55 | |||
| 56 | #ifndef MODES_DEBUG | ||
| 57 | # ifndef NDEBUG | ||
| 58 | # define NDEBUG | ||
| 59 | # endif | ||
| 60 | #endif | ||
| 61 | #include <assert.h> | ||
| 62 | |||
| 63 | #if defined(BSWAP4) && defined(STRICT_ALIGNMENT) | ||
| 64 | /* redefine, because alignment is ensured */ | ||
| 65 | #undef GETU32 | ||
| 66 | #define GETU32(p) BSWAP4(*(const u32 *)(p)) | ||
| 67 | #undef PUTU32 | ||
| 68 | #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) | ||
| 69 | #endif | ||
| 70 | |||
/* Place a 16-bit reduction constant in the top 16 bits of a size_t so
 * the same reduction tables serve both 32- and 64-bit builds. */
#define PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
/* Divide V by x in GF(2^128): shift the 128-bit value right one bit
 * and, if a 1 bit fell off the low end, fold the GCM field polynomial
 * (0xE1 followed by 120 zero bits) back into the top. The branch on
 * sizeof(size_t) is resolved at compile time. */
#define REDUCE1BIT(V)	do { \
	if (sizeof(size_t)==8) { \
		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^T; \
	} \
	else { \
		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
		V.lo  = (V.hi<<63)|(V.lo>>1); \
		V.hi  = (V.hi>>1 )^((u64)T<<32); \
	} \
} while(0)
| 84 | |||
| 85 | /* | ||
| 86 | * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should | ||
| 87 | * never be set to 8. 8 is effectively reserved for testing purposes. | ||
| 88 | * TABLE_BITS>1 are lookup-table-driven implementations referred to as | ||
| 89 | * "Shoup's" in GCM specification. In other words OpenSSL does not cover | ||
| 90 | * whole spectrum of possible table driven implementations. Why? In | ||
| 91 | * non-"Shoup's" case memory access pattern is segmented in such manner, | ||
| 92 | * that it's trivial to see that cache timing information can reveal | ||
| 93 | * fair portion of intermediate hash value. Given that ciphertext is | ||
| 94 | * always available to attacker, it's possible for him to attempt to | ||
| 95 | * deduce secret parameter H and if successful, tamper with messages | ||
| 96 | * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's | ||
| 97 | * not as trivial, but there is no reason to believe that it's resistant | ||
| 98 | * to cache-timing attack. And the thing about "8-bit" implementation is | ||
| 99 | * that it consumes 16 (sixteen) times more memory, 4KB per individual | ||
 * key + 1KB shared. Well, on the pros side it should be twice as fast as
| 101 | * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version | ||
| 102 | * was observed to run ~75% faster, closer to 100% for commercial | ||
| 103 | * compilers... Yet "4-bit" procedure is preferred, because it's | ||
| 104 | * believed to provide better security-performance balance and adequate | ||
| 105 | * all-round performance. "All-round" refers to things like: | ||
| 106 | * | ||
| 107 | * - shorter setup time effectively improves overall timing for | ||
| 108 | * handling short messages; | ||
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows a large enough free
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc would immediately incur working-set expansion);
| 113 | * - larger table has larger cache footprint, which can affect | ||
| 114 | * performance of other code paths (not necessarily even from same | ||
| 115 | * thread in Hyper-Threading world); | ||
| 116 | * | ||
| 117 | * Value of 1 is not appropriate for performance reasons. | ||
| 118 | */ | ||
| 119 | #if TABLE_BITS==8 | ||
| 120 | |||
| 121 | static void gcm_init_8bit(u128 Htable[256], u64 H[2]) | ||
| 122 | { | ||
| 123 | int i, j; | ||
| 124 | u128 V; | ||
| 125 | |||
| 126 | Htable[0].hi = 0; | ||
| 127 | Htable[0].lo = 0; | ||
| 128 | V.hi = H[0]; | ||
| 129 | V.lo = H[1]; | ||
| 130 | |||
| 131 | for (Htable[128]=V, i=64; i>0; i>>=1) { | ||
| 132 | REDUCE1BIT(V); | ||
| 133 | Htable[i] = V; | ||
| 134 | } | ||
| 135 | |||
| 136 | for (i=2; i<256; i<<=1) { | ||
| 137 | u128 *Hi = Htable+i, H0 = *Hi; | ||
| 138 | for (j=1; j<i; ++j) { | ||
| 139 | Hi[j].hi = H0.hi^Htable[j].hi; | ||
| 140 | Hi[j].lo = H0.lo^Htable[j].lo; | ||
| 141 | } | ||
| 142 | } | ||
| 143 | } | ||
| 144 | |||
/*
 * "8-bit Shoup" GHASH multiplication: Xi = Xi * H in GF(2^128),
 * consuming one byte of Xi per loop iteration via the 256-entry
 * Htable built by gcm_init_8bit(). rem_8bit folds the eight bits
 * shifted out at the low end back into the top of Z (reduction by
 * the GCM polynomial); its entries are pre-shifted with PACK() so a
 * plain size_t XOR works on 64-bit builds.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;	/* walk Xi from last byte to first */
	size_t rem, n = *xi;
	const union { long one; char little; } is_endian = {1};
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		/* accumulate the contribution of the current byte */
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;	/* all 16 bytes consumed */

		n = *(--xi);

		/* shift Z right by one byte and reduce: the low byte that
		 * falls off selects the pre-computed reduction constant */
		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else	/* 32-bit size_t: PACK() left the constant 32 bits low */
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	/* write Z back to Xi in big-endian (wire) byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
| 252 | #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable) | ||
| 253 | |||
| 254 | #elif TABLE_BITS==4 | ||
| 255 | |||
/*
 * Populate the 16-entry lookup table used by the "4-bit Shoup" GHASH
 * routines: on return Htable[i] = i*H in GF(2^128) for i = 0..15.
 * H is supplied in host byte order (see CRYPTO_gcm128_init). The
 * small-footprint build derives entries in loops; the default build
 * unrolls the same computation.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int  i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	/* power-of-two entries by repeated halving: 8=H, 4=H/x, 2, 1 */
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	/* remaining entries are XOR combinations of the powers */
	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int   j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	/* unrolled variant of exactly the same table construction */
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
	int j;
	const union { long one; char little; } is_endian = {1};

	if (is_endian.little)
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
	else	/* big-endian: additionally swap 32-bit halves */
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo<<32|V.lo>>32;
			Htable[j].lo = V.hi<<32|V.hi>>32;
		}
	}
#endif
}
| 327 | |||
#ifndef GHASH_ASM
/* Reduction table for the 4-bit path: entry i is the polynomial
 * contribution of the four bits shifted out at the low end of Z,
 * pre-shifted into the top 16 bits of a size_t via PACK(). */
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
| 334 | |||
/*
 * "4-bit Shoup" GHASH multiplication: Xi = Xi * H in GF(2^128),
 * consuming one nibble of Xi per half-iteration using the 16-entry
 * Htable; rem_4bit folds the bits shifted out at the low end back
 * into the top (reduction by the GCM polynomial).
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;	/* byte index into Xi, processed last-to-first */
	size_t rem, nlo, nhi;
	const union { long one; char little; } is_endian = {1};

	/* split the last byte into low and high nibbles */
	nlo  = ((const u8 *)Xi)[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		/* shift Z right 4 bits, reduce, then fold in the high nibble */
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else	/* 32-bit size_t: constant sits 32 bits lower */
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;	/* all 16 bytes done */

		nlo  = ((const u8 *)Xi)[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		/* same shift/reduce step for the low nibble */
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	/* write Z back to Xi in big-endian (wire) byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
| 397 | |||
| 398 | #if !defined(OPENSSL_SMALL_FOOTPRINT) | ||
| 399 | /* | ||
| 400 | * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for | ||
| 401 | * details... Compiler-generated code doesn't seem to give any | ||
| 402 | * performance improvement, at least not on x86[_64]. It's here | ||
| 403 | * mostly as reference and a placeholder for possible future | ||
| 404 | * non-trivial optimization[s]... | ||
| 405 | */ | ||
/*
 * Streamed 4-bit GHASH: absorbs 'len' bytes (a multiple of 16) from
 * 'inp' into the running hash Xi, i.e. Xi = (Xi ^ block) * H per
 * 16-byte block. The input XOR is fused into the nibble extraction,
 * avoiding a separate pass over Xi. NOTE: the two #if branches share
 * the store-back epilogue and the closing "while (inp+=16, len-=16)";
 * neither branch is complete on its own.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {	/* once per 16-byte input block */
	cnt  = 15;
	/* fold the input byte into Xi before nibble extraction */
	nlo  = ((const u8 *)Xi)[15];
	nlo ^= inp[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		/* shift Z right 4 bits and reduce via rem_4bit */
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;	/* 16 bytes consumed */

		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
	Z.hi = Htable[cnt].hi;
	Z.lo = Htable[cnt].lo;
	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
	Hshr4[cnt].hi = (Z.hi>>4);
	Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {	/* once per 16-byte input block */
	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem  = (size_t)Z.lo&0xff;

		/* shift a whole byte per iteration, reduce via rem_8bit */
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);

		Z.hi ^= Hshr4[nhi].hi;
		Z.lo ^= Hshr4[nhi].lo;
		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
	}

	/* byte 0 needs only a 4-bit final shift */
	nlo  = ((const u8 *)Xi)[0];
	nlo ^= inp[0];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi ^= Htable[nlo].hi;
	Z.lo ^= Htable[nlo].lo;

	rem  = (size_t)Z.lo&0xf;

	Z.lo = (Z.hi<<60)|(Z.lo>>4);
	Z.hi = (Z.hi>>4);

	Z.hi ^= Htable[nhi].hi;
	Z.lo ^= Htable[nhi].lo;
	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

	/* shared epilogue: store Z back to Xi in big-endian order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
    } while (inp+=16, len-=16);
}
| 568 | #endif | ||
| 569 | #else | ||
| 570 | void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]); | ||
| 571 | void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); | ||
| 572 | #endif | ||
| 573 | |||
/* Multiply the running hash (ctx->Xi here) by H using the 4-bit table. */
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
/* Absorb 'len' bytes (a multiple of 16) into the hash in one call. */
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
 * trashing effect. In other words idea is to hash data while it's
 * still in L1 cache after encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif
| 582 | |||
| 583 | #else /* TABLE_BITS */ | ||
| 584 | |||
/*
 * Table-free GHASH multiplication: Xi = Xi * H in GF(2^128), one bit
 * at a time (TABLE_BITS==1 build). For each set bit of Xi, scanned
 * MSB-first, the current multiple V of H is accumulated into Z; V is
 * halved (REDUCE1BIT) after every bit. The bit mask M is derived
 * branch-free from the sign of the shifted word.
 */
static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int  i,j;
	const long *xi = (const long *)Xi;
	const union { long one; char little; } is_endian = {1};

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	/* process Xi one native word at a time, converting each word
	 * from big-endian (wire) order to host order first */
	for (j=0; j<16/sizeof(long); ++j) {
		if (is_endian.little) {
			if (sizeof(long)==8) {
#ifdef BSWAP8
				X = (long)(BSWAP8(xi[j]));
#else
				const u8 *p = (const u8 *)(xi+j);
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
			}
			else {
				const u8 *p = (const u8 *)(xi+j);
				X = (long)GETU32(p);
			}
		}
		else
			X = xi[j];

		/* MSB-first over the word; M is all-ones when the top
		 * bit of X is set, all-zeros otherwise */
		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			REDUCE1BIT(V);
		}
	}

	/* write Z back to Xi in big-endian (wire) byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
| 641 | #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u) | ||
| 642 | |||
| 643 | #endif | ||
| 644 | |||
/*
 * Assembler back-ends for the 4-bit GHASH path. When one may be
 * available at run time, GCM_FUNCREF_4BIT is defined and GCM_MUL/GHASH
 * are redirected (below) through function pointers that
 * CRYPTO_gcm128_init selects per CPU capabilities.
 */
#if	TABLE_BITS==4 && defined(GHASH_ASM)
# if	!defined(I386_ONLY) && \
	(defined(__i386)	|| defined(__i386__)	|| \
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

/* PCLMULQDQ carry-less multiplication implementation */
void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

# if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#  define GHASH_ASM_X86
/* 32-bit x86: MMX variant (preferred) and plain-x86 fallback */
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
/* NEON implementation, selected at run time via OPENSSL_armcap_P */
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
/* Route GCM_MUL/GHASH through per-context function pointers; callers
 * must declare local gcm_gmult_p/gcm_ghash_p copies (see users). */
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef  GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif
| 685 | |||
/*
 * Initialize a GCM context: zero all state, record the block cipher
 * and key, derive the hash subkey H = E(K, 0^128), convert it to host
 * byte order, and build the multiplication table / select the fastest
 * available GHASH implementation for this CPU.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	const union { long one; char little; } is_endian = {1};

	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key   = key;

	/* H = E(K, 0^128); ctx->H.c is all-zero after the memset */
	(*block)(ctx->H.c,ctx->H.c,key);

	if (is_endian.little) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	/* prefer PCLMULQDQ when the CPU supports it */
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if	defined(GHASH_ASM_X86)			/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
#   else
	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	/* NEON when available, generic 4-bit table otherwise */
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
| 755 | |||
/*
 * Start a new GCM message: reset per-message state and derive the
 * pre-counter block Y0 from the IV. A 96-bit IV is used directly as
 * Y0 = IV || 0^31 || 1 (no hashing); any other length is GHASHed
 * together with its 64-bit length, per the GCM specification. Leaves
 * EK0 = E(K, Y0) in the context (needed for the final tag) and Yi set
 * to the first counter block.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	/* reset all per-message state */
	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;		/* residual AAD bytes */
	ctx->mres = 0;		/* residual message bytes */

	if (len==12) {
		/* recommended 96-bit IV: used verbatim, counter starts at 1 */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		/* GHASH the IV in 16-byte blocks ... */
		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		/* ... a trailing partial block is implicitly zero-padded ... */
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		/* ... then fold in the IV length in bits (big-endian) */
		len0 <<= 3;
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		/* the low 32 bits of Y0 seed the counter */
		if (is_endian.little)
			ctr = GETU32(ctx->Yi.c+12);
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 = E(K, Y0), retained for the final authentication tag */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	/* Yi now holds the first counter block used for en/decryption */
	if (is_endian.little)
		PUTU32(ctx->Yi.c+12,ctr);
	else
		ctx->Yi.d[3] = ctr;
}
| 825 | |||
/*
 * Absorb additional authenticated data (AAD) into the hash. May be
 * called repeatedly, but only before any message data is processed.
 * Partial blocks are buffered via ctx->ares and completed on the next
 * call (or finalized by the first encrypt/decrypt call).
 *
 * Returns 0 on success, -2 if message processing has already started,
 * -1 if the accumulated AAD exceeds the GCM limit (2^64 bits, i.e.
 * 2^61 bytes) or the length counter would wrap.
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;	/* message data already seen */

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	/* top up a previously buffered partial block first */
	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);	/* block completed */
		else {
			ctx->ares = n;		/* still partial; wait for more */
			return 0;
		}
	}

#ifdef GHASH
	/* bulk path: hash all whole 16-byte blocks in one call */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	/* buffer any trailing partial block in Xi; n records how much */
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
| 882 | |||
/*
 * GCM encryption: CTR-mode encrypt `len` bytes from `in` to `out`
 * (in/out may alias) while folding the produced ciphertext into the
 * GHASH accumulator Xi.  Returns -1 if the total message length would
 * exceed the 2^36-32 byte GCM limit, 0 on success.  A trailing partial
 * block is carried across calls in ctx->mres; the block cipher is
 * invoked through ctx->block, the counter lives in Yi.c[12..15].
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* Local function-pointer copies consumed by GCM_MUL/GHASH macros. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	/* Enforce the per-message plaintext limit and catch wrap-around. */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* Load the 32-bit invocation counter from the last word of Yi. */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* Finish a partial keystream block left from a prior call. */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* Word-sized loads below require aligned in/out; otherwise
		 * fall through to the byte-wise loop after the do/while. */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* Bulk path: encrypt a whole chunk, then GHASH it at once. */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			/* XOR keystream into data one machine word at a time. */
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    /* Hash the ciphertext just written. */
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		/* Remaining whole blocks (< GHASH_CHUNK). */
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* No bulk GHASH available: hash block-by-block. */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			/* Encrypt and fold into Xi in the same word loop. */
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(ctx->Xi.c+i) ^=
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* Trailing partial block: generate one more keystream block
		 * and remember how much of it was consumed (n). */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* Portable byte-wise fallback (small footprint / unaligned data). */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
| 1032 | |||
/*
 * GCM decryption: fold the incoming ciphertext into GHASH (Xi), then
 * CTR-decrypt it into `out`.  Mirror image of CRYPTO_gcm128_encrypt —
 * the only difference is that GHASH is computed over `in` (ciphertext)
 * BEFORE it is overwritten, which matters when in == out.  Returns -1
 * on length overflow, 0 on success; partial block carried in ctx->mres.
 */
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* Local function-pointer copies consumed by GCM_MUL/GHASH macros. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	/* Enforce the per-message length limit and catch wrap-around. */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* Load the 32-bit invocation counter from the last word of Yi. */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* Finish a partial keystream block left from a prior call;
		 * note ciphertext byte is saved before being overwritten. */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* Word-sized loads below require aligned in/out. */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* Bulk path: hash the ciphertext chunk first, then decrypt. */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		/* Remaining whole blocks (< GHASH_CHUNK). */
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		/* No bulk GHASH: hash block-by-block, saving each ciphertext
		 * word in c before out (possibly aliasing in) is written. */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t)) {
				size_t c = *(size_t *)(in+i);
				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
				*(size_t *)(ctx->Xi.c+i) ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* Trailing partial block. */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* Portable byte-wise fallback (small footprint / unaligned data). */
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
| 1185 | |||
/*
 * GCM encryption using a caller-supplied bulk CTR routine (`stream`),
 * which encrypts N whole blocks per call with a 32-bit big-endian
 * counter in Yi.c[12..15].  GHASH is applied to the ciphertext after
 * each bulk step.  Same length limit, return values and mres carry-over
 * semantics as CRYPTO_gcm128_encrypt.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* Local function-pointer copies consumed by GCM_MUL/GHASH macros. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	/* Enforce the per-message length limit and catch wrap-around. */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* Finish a partial keystream block left from a prior call. */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* Chunked bulk path: stream-encrypt, bump counter, hash output. */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* Remaining whole blocks. */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	/* Trailing partial block: one single-block encryption of Yi. */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
| 1284 | |||
/*
 * GCM decryption using a caller-supplied bulk CTR routine (`stream`).
 * Mirror of CRYPTO_gcm128_encrypt_ctr32: the ciphertext is folded into
 * GHASH BEFORE the stream call so that in == out aliasing is safe.
 * Same length limit, return values and mres semantics as the other
 * encrypt/decrypt entry points.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* Local function-pointer copies consumed by GCM_MUL/GHASH macros. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	/* Enforce the per-message length limit and catch wrap-around. */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* Finish a partial keystream block left from a prior call. */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* Chunked bulk path: hash the ciphertext, then stream-decrypt it. */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* Remaining whole blocks. */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		/* Block-by-block GHASH over the ciphertext, then rewind
		 * in/j so the stream call sees the original range. */
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	/* Trailing partial block: one single-block encryption of Yi. */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
| 1390 | |||
| 1391 | int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, | ||
| 1392 | size_t len) | ||
| 1393 | { | ||
| 1394 | const union { long one; char little; } is_endian = {1}; | ||
| 1395 | u64 alen = ctx->len.u[0]<<3; | ||
| 1396 | u64 clen = ctx->len.u[1]<<3; | ||
| 1397 | #ifdef GCM_FUNCREF_4BIT | ||
| 1398 | void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; | ||
| 1399 | #endif | ||
| 1400 | |||
| 1401 | if (ctx->mres) | ||
| 1402 | GCM_MUL(ctx,Xi); | ||
| 1403 | |||
| 1404 | if (is_endian.little) { | ||
| 1405 | #ifdef BSWAP8 | ||
| 1406 | alen = BSWAP8(alen); | ||
| 1407 | clen = BSWAP8(clen); | ||
| 1408 | #else | ||
| 1409 | u8 *p = ctx->len.c; | ||
| 1410 | |||
| 1411 | ctx->len.u[0] = alen; | ||
| 1412 | ctx->len.u[1] = clen; | ||
| 1413 | |||
| 1414 | alen = (u64)GETU32(p) <<32|GETU32(p+4); | ||
| 1415 | clen = (u64)GETU32(p+8)<<32|GETU32(p+12); | ||
| 1416 | #endif | ||
| 1417 | } | ||
| 1418 | |||
| 1419 | ctx->Xi.u[0] ^= alen; | ||
| 1420 | ctx->Xi.u[1] ^= clen; | ||
| 1421 | GCM_MUL(ctx,Xi); | ||
| 1422 | |||
| 1423 | ctx->Xi.u[0] ^= ctx->EK0.u[0]; | ||
| 1424 | ctx->Xi.u[1] ^= ctx->EK0.u[1]; | ||
| 1425 | |||
| 1426 | if (tag && len<=sizeof(ctx->Xi)) | ||
| 1427 | return memcmp(ctx->Xi.c,tag,len); | ||
| 1428 | else | ||
| 1429 | return -1; | ||
| 1430 | } | ||
| 1431 | |||
| 1432 | void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) | ||
| 1433 | { | ||
| 1434 | CRYPTO_gcm128_finish(ctx, NULL, 0); | ||
| 1435 | memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c)); | ||
| 1436 | } | ||
| 1437 | |||
| 1438 | GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block) | ||
| 1439 | { | ||
| 1440 | GCM128_CONTEXT *ret; | ||
| 1441 | |||
| 1442 | if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT)))) | ||
| 1443 | CRYPTO_gcm128_init(ret,key,block); | ||
| 1444 | |||
| 1445 | return ret; | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx) | ||
| 1449 | { | ||
| 1450 | if (ctx) { | ||
| 1451 | OPENSSL_cleanse(ctx,sizeof(*ctx)); | ||
| 1452 | OPENSSL_free(ctx); | ||
| 1453 | } | ||
| 1454 | } | ||
| 1455 | |||
#if defined(SELFTEST)
#include <stdio.h>
#include <openssl/aes.h>

/*
 * Known-answer test vectors exercised by TEST_CASE/main below.
 * For each case n: K=key, P=plaintext, A=AAD, IV=nonce, C=expected
 * ciphertext, T=expected tag.  NULL P/A/C mean "absent"; unsized
 * arrays like K1[16] rely on C zero-initialization (all-zero data).
 * Cases 1-6 use 128-bit keys, 7-12 use 192-bit, 13-18 use 256-bit;
 * presumably the standard AES-GCM validation set — verify against
 * the GCM specification's appendix if touching these.
 */

/* Test Case 1 */
static const u8	K1[16],
		*P1=NULL,
		*A1=NULL,
		IV1[12],
		*C1=NULL,
		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8	P2[16],
		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};

/* Test Case 3 */
#define A3 A2
static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};

/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};

/* Test Case 5 */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};

/* Test Case 6 */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};

/* Test Case 7 */
static const u8 K7[24],
		*P7=NULL,
		*A7=NULL,
		IV7[12],
		*C7=NULL,
		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8	P8[16],
		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9 */
#define A9 A8
static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};

/* Test Case 10 */
#define K10 K9
#define IV10 IV9
static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11 */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};

/* Test Case 12 */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};

/* Test Case 13 */
static const u8	K13[32],
		*P13=NULL,
		*A13=NULL,
		IV13[12],
		*C13=NULL,
		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14 */
#define K14 K13
#define A14 A13
static const u8	P14[16],
		IV14[12],
		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};

/* Test Case 15 */
#define A15 A14
static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/* Test Case 16 */
#define K16 K15
#define IV16 IV15
static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/* Test Case 17 */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18 */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
| 1672 | #define TEST_CASE(n) do { \ | ||
| 1673 | u8 out[sizeof(P##n)]; \ | ||
| 1674 | AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \ | ||
| 1675 | CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \ | ||
| 1676 | CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ | ||
| 1677 | memset(out,0,sizeof(out)); \ | ||
| 1678 | if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ | ||
| 1679 | if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \ | ||
| 1680 | if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ | ||
| 1681 | (C##n && memcmp(out,C##n,sizeof(out)))) \ | ||
| 1682 | ret++, printf ("encrypt test#%d failed.\n",n); \ | ||
| 1683 | CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ | ||
| 1684 | memset(out,0,sizeof(out)); \ | ||
| 1685 | if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ | ||
| 1686 | if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \ | ||
| 1687 | if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ | ||
| 1688 | (P##n && memcmp(out,P##n,sizeof(out)))) \ | ||
| 1689 | ret++, printf ("decrypt test#%d failed.\n",n); \ | ||
| 1690 | } while(0) | ||
| 1691 | |||
| 1692 | int main() | ||
| 1693 | { | ||
| 1694 | GCM128_CONTEXT ctx; | ||
| 1695 | AES_KEY key; | ||
| 1696 | int ret=0; | ||
| 1697 | |||
| 1698 | TEST_CASE(1); | ||
| 1699 | TEST_CASE(2); | ||
| 1700 | TEST_CASE(3); | ||
| 1701 | TEST_CASE(4); | ||
| 1702 | TEST_CASE(5); | ||
| 1703 | TEST_CASE(6); | ||
| 1704 | TEST_CASE(7); | ||
| 1705 | TEST_CASE(8); | ||
| 1706 | TEST_CASE(9); | ||
| 1707 | TEST_CASE(10); | ||
| 1708 | TEST_CASE(11); | ||
| 1709 | TEST_CASE(12); | ||
| 1710 | TEST_CASE(13); | ||
| 1711 | TEST_CASE(14); | ||
| 1712 | TEST_CASE(15); | ||
| 1713 | TEST_CASE(16); | ||
| 1714 | TEST_CASE(17); | ||
| 1715 | TEST_CASE(18); | ||
| 1716 | |||
| 1717 | #ifdef OPENSSL_CPUID_OBJ | ||
| 1718 | { | ||
| 1719 | size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc(); | ||
| 1720 | union { u64 u; u8 c[1024]; } buf; | ||
| 1721 | int i; | ||
| 1722 | |||
| 1723 | AES_set_encrypt_key(K1,sizeof(K1)*8,&key); | ||
| 1724 | CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); | ||
| 1725 | CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1)); | ||
| 1726 | |||
| 1727 | CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); | ||
| 1728 | start = OPENSSL_rdtsc(); | ||
| 1729 | CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); | ||
| 1730 | gcm_t = OPENSSL_rdtsc() - start; | ||
| 1731 | |||
| 1732 | CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), | ||
| 1733 | &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, | ||
| 1734 | (block128_f)AES_encrypt); | ||
| 1735 | start = OPENSSL_rdtsc(); | ||
| 1736 | CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), | ||
| 1737 | &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, | ||
| 1738 | (block128_f)AES_encrypt); | ||
| 1739 | ctr_t = OPENSSL_rdtsc() - start; | ||
| 1740 | |||
| 1741 | printf("%.2f-%.2f=%.2f\n", | ||
| 1742 | gcm_t/(double)sizeof(buf), | ||
| 1743 | ctr_t/(double)sizeof(buf), | ||
| 1744 | (gcm_t-ctr_t)/(double)sizeof(buf)); | ||
| 1745 | #ifdef GHASH | ||
| 1746 | GHASH(&ctx,buf.c,sizeof(buf)); | ||
| 1747 | start = OPENSSL_rdtsc(); | ||
| 1748 | for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf)); | ||
| 1749 | gcm_t = OPENSSL_rdtsc() - start; | ||
| 1750 | printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i); | ||
| 1751 | #endif | ||
| 1752 | } | ||
| 1753 | #endif | ||
| 1754 | |||
| 1755 | return ret; | ||
| 1756 | } | ||
| 1757 | #endif | ||
diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h index af8d97d795..f18215bb2b 100644 --- a/src/lib/libcrypto/modes/modes.h +++ b/src/lib/libcrypto/modes/modes.h | |||
| @@ -15,6 +15,14 @@ typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out, | |||
| 15 | size_t len, const void *key, | 15 | size_t len, const void *key, |
| 16 | unsigned char ivec[16], int enc); | 16 | unsigned char ivec[16], int enc); |
| 17 | 17 | ||
| 18 | typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out, | ||
| 19 | size_t blocks, const void *key, | ||
| 20 | const unsigned char ivec[16]); | ||
| 21 | |||
| 22 | typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out, | ||
| 23 | size_t blocks, const void *key, | ||
| 24 | const unsigned char ivec[16],unsigned char cmac[16]); | ||
| 25 | |||
| 18 | void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, | 26 | void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, |
| 19 | size_t len, const void *key, | 27 | size_t len, const void *key, |
| 20 | unsigned char ivec[16], block128_f block); | 28 | unsigned char ivec[16], block128_f block); |
| @@ -27,6 +35,11 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, | |||
| 27 | unsigned char ivec[16], unsigned char ecount_buf[16], | 35 | unsigned char ivec[16], unsigned char ecount_buf[16], |
| 28 | unsigned int *num, block128_f block); | 36 | unsigned int *num, block128_f block); |
| 29 | 37 | ||
| 38 | void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, | ||
| 39 | size_t len, const void *key, | ||
| 40 | unsigned char ivec[16], unsigned char ecount_buf[16], | ||
| 41 | unsigned int *num, ctr128_f ctr); | ||
| 42 | |||
| 30 | void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, | 43 | void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, |
| 31 | size_t len, const void *key, | 44 | size_t len, const void *key, |
| 32 | unsigned char ivec[16], int *num, | 45 | unsigned char ivec[16], int *num, |
| @@ -57,3 +70,66 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, | |||
| 57 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, | 70 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, |
| 58 | size_t len, const void *key, | 71 | size_t len, const void *key, |
| 59 | unsigned char ivec[16], cbc128_f cbc); | 72 | unsigned char ivec[16], cbc128_f cbc); |
| 73 | |||
| 74 | size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, | ||
| 75 | size_t len, const void *key, | ||
| 76 | unsigned char ivec[16], block128_f block); | ||
| 77 | size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, | ||
| 78 | size_t len, const void *key, | ||
| 79 | unsigned char ivec[16], cbc128_f cbc); | ||
| 80 | size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, | ||
| 81 | size_t len, const void *key, | ||
| 82 | unsigned char ivec[16], block128_f block); | ||
| 83 | size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, | ||
| 84 | size_t len, const void *key, | ||
| 85 | unsigned char ivec[16], cbc128_f cbc); | ||
| 86 | |||
| 87 | typedef struct gcm128_context GCM128_CONTEXT; | ||
| 88 | |||
| 89 | GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block); | ||
| 90 | void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block); | ||
| 91 | void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, | ||
| 92 | size_t len); | ||
| 93 | int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, | ||
| 94 | size_t len); | ||
| 95 | int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, | ||
| 96 | const unsigned char *in, unsigned char *out, | ||
| 97 | size_t len); | ||
| 98 | int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, | ||
| 99 | const unsigned char *in, unsigned char *out, | ||
| 100 | size_t len); | ||
| 101 | int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, | ||
| 102 | const unsigned char *in, unsigned char *out, | ||
| 103 | size_t len, ctr128_f stream); | ||
| 104 | int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, | ||
| 105 | const unsigned char *in, unsigned char *out, | ||
| 106 | size_t len, ctr128_f stream); | ||
| 107 | int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, | ||
| 108 | size_t len); | ||
| 109 | void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len); | ||
| 110 | void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx); | ||
| 111 | |||
| 112 | typedef struct ccm128_context CCM128_CONTEXT; | ||
| 113 | |||
| 114 | void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, | ||
| 115 | unsigned int M, unsigned int L, void *key,block128_f block); | ||
| 116 | int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, | ||
| 117 | const unsigned char *nonce, size_t nlen, size_t mlen); | ||
| 118 | void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, | ||
| 119 | const unsigned char *aad, size_t alen); | ||
| 120 | int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, | ||
| 121 | const unsigned char *inp, unsigned char *out, size_t len); | ||
| 122 | int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, | ||
| 123 | const unsigned char *inp, unsigned char *out, size_t len); | ||
| 124 | int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, | ||
| 125 | const unsigned char *inp, unsigned char *out, size_t len, | ||
| 126 | ccm128_f stream); | ||
| 127 | int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, | ||
| 128 | const unsigned char *inp, unsigned char *out, size_t len, | ||
| 129 | ccm128_f stream); | ||
| 130 | size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len); | ||
| 131 | |||
| 132 | typedef struct xts128_context XTS128_CONTEXT; | ||
| 133 | |||
| 134 | int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], | ||
| 135 | const unsigned char *inp, unsigned char *out, size_t len, int enc); | ||
diff --git a/src/lib/libcrypto/modes/modes_lcl.h b/src/lib/libcrypto/modes/modes_lcl.h new file mode 100644 index 0000000000..b6dc3c336f --- /dev/null +++ b/src/lib/libcrypto/modes/modes_lcl.h | |||
| @@ -0,0 +1,131 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use is governed by OpenSSL license. | ||
| 5 | * ==================================================================== | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <openssl/modes.h> | ||
| 9 | |||
| 10 | |||
| 11 | #if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) | ||
| 12 | typedef __int64 i64; | ||
| 13 | typedef unsigned __int64 u64; | ||
| 14 | #define U64(C) C##UI64 | ||
| 15 | #elif defined(__arch64__) | ||
| 16 | typedef long i64; | ||
| 17 | typedef unsigned long u64; | ||
| 18 | #define U64(C) C##UL | ||
| 19 | #else | ||
| 20 | typedef long long i64; | ||
| 21 | typedef unsigned long long u64; | ||
| 22 | #define U64(C) C##ULL | ||
| 23 | #endif | ||
| 24 | |||
| 25 | typedef unsigned int u32; | ||
| 26 | typedef unsigned char u8; | ||
| 27 | |||
| 28 | #define STRICT_ALIGNMENT 1 | ||
| 29 | #if defined(__i386) || defined(__i386__) || \ | ||
| 30 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 31 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 32 | defined(__s390__) || defined(__s390x__) || \ | ||
| 33 | ( (defined(__arm__) || defined(__arm)) && \ | ||
| 34 | (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ | ||
| 35 | defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)) ) | ||
| 36 | # undef STRICT_ALIGNMENT | ||
| 37 | #endif | ||
| 38 | |||
| 39 | #if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) | ||
| 40 | #if defined(__GNUC__) && __GNUC__>=2 | ||
| 41 | # if defined(__x86_64) || defined(__x86_64__) | ||
| 42 | # define BSWAP8(x) ({ u64 ret=(x); \ | ||
| 43 | asm ("bswapq %0" \ | ||
| 44 | : "+r"(ret)); ret; }) | ||
| 45 | # define BSWAP4(x) ({ u32 ret=(x); \ | ||
| 46 | asm ("bswapl %0" \ | ||
| 47 | : "+r"(ret)); ret; }) | ||
| 48 | # elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY) | ||
| 49 | # define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ | ||
| 50 | asm ("bswapl %0; bswapl %1" \ | ||
| 51 | : "+r"(hi),"+r"(lo)); \ | ||
| 52 | (u64)hi<<32|lo; }) | ||
| 53 | # define BSWAP4(x) ({ u32 ret=(x); \ | ||
| 54 | asm ("bswapl %0" \ | ||
| 55 | : "+r"(ret)); ret; }) | ||
| 56 | # elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT) | ||
| 57 | # define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ | ||
| 58 | asm ("rev %0,%0; rev %1,%1" \ | ||
| 59 | : "+r"(hi),"+r"(lo)); \ | ||
| 60 | (u64)hi<<32|lo; }) | ||
| 61 | # define BSWAP4(x) ({ u32 ret; \ | ||
| 62 | asm ("rev %0,%1" \ | ||
| 63 | : "=r"(ret) : "r"((u32)(x))); \ | ||
| 64 | ret; }) | ||
| 65 | # endif | ||
| 66 | #elif defined(_MSC_VER) | ||
| 67 | # if _MSC_VER>=1300 | ||
| 68 | # pragma intrinsic(_byteswap_uint64,_byteswap_ulong) | ||
| 69 | # define BSWAP8(x) _byteswap_uint64((u64)(x)) | ||
| 70 | # define BSWAP4(x) _byteswap_ulong((u32)(x)) | ||
| 71 | # elif defined(_M_IX86) | ||
| 72 | __inline u32 _bswap4(u32 val) { | ||
| 73 | _asm mov eax,val | ||
| 74 | _asm bswap eax | ||
| 75 | } | ||
| 76 | # define BSWAP4(x) _bswap4(x) | ||
| 77 | # endif | ||
| 78 | #endif | ||
| 79 | #endif | ||
| 80 | |||
| 81 | #if defined(BSWAP4) && !defined(STRICT_ALIGNMENT) | ||
| 82 | #define GETU32(p) BSWAP4(*(const u32 *)(p)) | ||
| 83 | #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) | ||
| 84 | #else | ||
| 85 | #define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) | ||
| 86 | #define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) | ||
| 87 | #endif | ||
| 88 | |||
| 89 | /* GCM definitions */ | ||
| 90 | |||
| 91 | typedef struct { u64 hi,lo; } u128; | ||
| 92 | |||
| 93 | #ifdef TABLE_BITS | ||
| 94 | #undef TABLE_BITS | ||
| 95 | #endif | ||
| 96 | /* | ||
| 97 | * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should | ||
| 98 | * never be set to 8 [or 1]. For further information see gcm128.c. | ||
| 99 | */ | ||
| 100 | #define TABLE_BITS 4 | ||
| 101 | |||
| 102 | struct gcm128_context { | ||
| 103 | /* Following 6 names follow names in GCM specification */ | ||
| 104 | union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,len, | ||
| 105 | Xi,H; | ||
| 106 | /* Relative position of Xi, H and pre-computed Htable is used | ||
| 107 | * in some assembler modules, i.e. don't change the order! */ | ||
| 108 | #if TABLE_BITS==8 | ||
| 109 | u128 Htable[256]; | ||
| 110 | #else | ||
| 111 | u128 Htable[16]; | ||
| 112 | void (*gmult)(u64 Xi[2],const u128 Htable[16]); | ||
| 113 | void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); | ||
| 114 | #endif | ||
| 115 | unsigned int mres, ares; | ||
| 116 | block128_f block; | ||
| 117 | void *key; | ||
| 118 | }; | ||
| 119 | |||
| 120 | struct xts128_context { | ||
| 121 | void *key1, *key2; | ||
| 122 | block128_f block1,block2; | ||
| 123 | }; | ||
| 124 | |||
| 125 | struct ccm128_context { | ||
| 126 | union { u64 u[2]; u8 c[16]; } nonce, cmac; | ||
| 127 | u64 blocks; | ||
| 128 | block128_f block; | ||
| 129 | void *key; | ||
| 130 | }; | ||
| 131 | |||
diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c index c732e2ec58..01c01702c4 100644 --- a/src/lib/libcrypto/modes/ofb128.c +++ b/src/lib/libcrypto/modes/ofb128.c | |||
| @@ -48,7 +48,8 @@ | |||
| 48 | * | 48 | * |
| 49 | */ | 49 | */ |
| 50 | 50 | ||
| 51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
| 52 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | 53 | #include <string.h> |
| 53 | 54 | ||
| 54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
| @@ -58,14 +59,6 @@ | |||
| 58 | #endif | 59 | #endif |
| 59 | #include <assert.h> | 60 | #include <assert.h> |
| 60 | 61 | ||
| 61 | #define STRICT_ALIGNMENT | ||
| 62 | #if defined(__i386) || defined(__i386__) || \ | ||
| 63 | defined(__x86_64) || defined(__x86_64__) || \ | ||
| 64 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
| 65 | defined(__s390__) || defined(__s390x__) | ||
| 66 | # undef STRICT_ALIGNMENT | ||
| 67 | #endif | ||
| 68 | |||
| 69 | /* The input and output encrypted as though 128bit ofb mode is being | 62 | /* The input and output encrypted as though 128bit ofb mode is being |
| 70 | * used. The extra state information to record how much of the | 63 | * used. The extra state information to record how much of the |
| 71 | * 128bit block we have used is contained in *num; | 64 | * 128bit block we have used is contained in *num; |
diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c new file mode 100644 index 0000000000..9cf27a25e9 --- /dev/null +++ b/src/lib/libcrypto/modes/xts128.c | |||
| @@ -0,0 +1,187 @@ | |||
| 1 | /* ==================================================================== | ||
| 2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
| 3 | * | ||
| 4 | * Redistribution and use in source and binary forms, with or without | ||
| 5 | * modification, are permitted provided that the following conditions | ||
| 6 | * are met: | ||
| 7 | * | ||
| 8 | * 1. Redistributions of source code must retain the above copyright | ||
| 9 | * notice, this list of conditions and the following disclaimer. | ||
| 10 | * | ||
| 11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | * notice, this list of conditions and the following disclaimer in | ||
| 13 | * the documentation and/or other materials provided with the | ||
| 14 | * distribution. | ||
| 15 | * | ||
| 16 | * 3. All advertising materials mentioning features or use of this | ||
| 17 | * software must display the following acknowledgment: | ||
| 18 | * "This product includes software developed by the OpenSSL Project | ||
| 19 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 20 | * | ||
| 21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 22 | * endorse or promote products derived from this software without | ||
| 23 | * prior written permission. For written permission, please contact | ||
| 24 | * openssl-core@openssl.org. | ||
| 25 | * | ||
| 26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 27 | * nor may "OpenSSL" appear in their names without prior written | ||
| 28 | * permission of the OpenSSL Project. | ||
| 29 | * | ||
| 30 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 31 | * acknowledgment: | ||
| 32 | * "This product includes software developed by the OpenSSL Project | ||
| 33 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 34 | * | ||
| 35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | * ==================================================================== | ||
| 48 | */ | ||
| 49 | |||
| 50 | #include <openssl/crypto.h> | ||
| 51 | #include "modes_lcl.h" | ||
| 52 | #include <string.h> | ||
| 53 | |||
| 54 | #ifndef MODES_DEBUG | ||
| 55 | # ifndef NDEBUG | ||
| 56 | # define NDEBUG | ||
| 57 | # endif | ||
| 58 | #endif | ||
| 59 | #include <assert.h> | ||
| 60 | |||
| 61 | int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], | ||
| 62 | const unsigned char *inp, unsigned char *out, | ||
| 63 | size_t len, int enc) | ||
| 64 | { | ||
| 65 | const union { long one; char little; } is_endian = {1}; | ||
| 66 | union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch; | ||
| 67 | unsigned int i; | ||
| 68 | |||
| 69 | if (len<16) return -1; | ||
| 70 | |||
| 71 | memcpy(tweak.c, iv, 16); | ||
| 72 | |||
| 73 | (*ctx->block2)(tweak.c,tweak.c,ctx->key2); | ||
| 74 | |||
| 75 | if (!enc && (len%16)) len-=16; | ||
| 76 | |||
| 77 | while (len>=16) { | ||
| 78 | #if defined(STRICT_ALIGNMENT) | ||
| 79 | memcpy(scratch.c,inp,16); | ||
| 80 | scratch.u[0] ^= tweak.u[0]; | ||
| 81 | scratch.u[1] ^= tweak.u[1]; | ||
| 82 | #else | ||
| 83 | scratch.u[0] = ((u64*)inp)[0]^tweak.u[0]; | ||
| 84 | scratch.u[1] = ((u64*)inp)[1]^tweak.u[1]; | ||
| 85 | #endif | ||
| 86 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
| 87 | #if defined(STRICT_ALIGNMENT) | ||
| 88 | scratch.u[0] ^= tweak.u[0]; | ||
| 89 | scratch.u[1] ^= tweak.u[1]; | ||
| 90 | memcpy(out,scratch.c,16); | ||
| 91 | #else | ||
| 92 | ((u64*)out)[0] = scratch.u[0]^=tweak.u[0]; | ||
| 93 | ((u64*)out)[1] = scratch.u[1]^=tweak.u[1]; | ||
| 94 | #endif | ||
| 95 | inp += 16; | ||
| 96 | out += 16; | ||
| 97 | len -= 16; | ||
| 98 | |||
| 99 | if (len==0) return 0; | ||
| 100 | |||
| 101 | if (is_endian.little) { | ||
| 102 | unsigned int carry,res; | ||
| 103 | |||
| 104 | res = 0x87&(((int)tweak.d[3])>>31); | ||
| 105 | carry = (unsigned int)(tweak.u[0]>>63); | ||
| 106 | tweak.u[0] = (tweak.u[0]<<1)^res; | ||
| 107 | tweak.u[1] = (tweak.u[1]<<1)|carry; | ||
| 108 | } | ||
| 109 | else { | ||
| 110 | size_t c; | ||
| 111 | |||
| 112 | for (c=0,i=0;i<16;++i) { | ||
| 113 | /*+ substitutes for |, because c is 1 bit */ | ||
| 114 | c += ((size_t)tweak.c[i])<<1; | ||
| 115 | tweak.c[i] = (u8)c; | ||
| 116 | c = c>>8; | ||
| 117 | } | ||
| 118 | tweak.c[0] ^= (u8)(0x87&(0-c)); | ||
| 119 | } | ||
| 120 | } | ||
| 121 | if (enc) { | ||
| 122 | for (i=0;i<len;++i) { | ||
| 123 | u8 c = inp[i]; | ||
| 124 | out[i] = scratch.c[i]; | ||
| 125 | scratch.c[i] = c; | ||
| 126 | } | ||
| 127 | scratch.u[0] ^= tweak.u[0]; | ||
| 128 | scratch.u[1] ^= tweak.u[1]; | ||
| 129 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
| 130 | scratch.u[0] ^= tweak.u[0]; | ||
| 131 | scratch.u[1] ^= tweak.u[1]; | ||
| 132 | memcpy(out-16,scratch.c,16); | ||
| 133 | } | ||
| 134 | else { | ||
| 135 | union { u64 u[2]; u8 c[16]; } tweak1; | ||
| 136 | |||
| 137 | if (is_endian.little) { | ||
| 138 | unsigned int carry,res; | ||
| 139 | |||
| 140 | res = 0x87&(((int)tweak.d[3])>>31); | ||
| 141 | carry = (unsigned int)(tweak.u[0]>>63); | ||
| 142 | tweak1.u[0] = (tweak.u[0]<<1)^res; | ||
| 143 | tweak1.u[1] = (tweak.u[1]<<1)|carry; | ||
| 144 | } | ||
| 145 | else { | ||
| 146 | size_t c; | ||
| 147 | |||
| 148 | for (c=0,i=0;i<16;++i) { | ||
| 149 | /*+ substitutes for |, because c is 1 bit */ | ||
| 150 | c += ((size_t)tweak.c[i])<<1; | ||
| 151 | tweak1.c[i] = (u8)c; | ||
| 152 | c = c>>8; | ||
| 153 | } | ||
| 154 | tweak1.c[0] ^= (u8)(0x87&(0-c)); | ||
| 155 | } | ||
| 156 | #if defined(STRICT_ALIGNMENT) | ||
| 157 | memcpy(scratch.c,inp,16); | ||
| 158 | scratch.u[0] ^= tweak1.u[0]; | ||
| 159 | scratch.u[1] ^= tweak1.u[1]; | ||
| 160 | #else | ||
| 161 | scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0]; | ||
| 162 | scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1]; | ||
| 163 | #endif | ||
| 164 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
| 165 | scratch.u[0] ^= tweak1.u[0]; | ||
| 166 | scratch.u[1] ^= tweak1.u[1]; | ||
| 167 | |||
| 168 | for (i=0;i<len;++i) { | ||
| 169 | u8 c = inp[16+i]; | ||
| 170 | out[16+i] = scratch.c[i]; | ||
| 171 | scratch.c[i] = c; | ||
| 172 | } | ||
| 173 | scratch.u[0] ^= tweak.u[0]; | ||
| 174 | scratch.u[1] ^= tweak.u[1]; | ||
| 175 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
| 176 | #if defined(STRICT_ALIGNMENT) | ||
| 177 | scratch.u[0] ^= tweak.u[0]; | ||
| 178 | scratch.u[1] ^= tweak.u[1]; | ||
| 179 | memcpy (out,scratch.c,16); | ||
| 180 | #else | ||
| 181 | ((u64*)out)[0] = scratch.u[0]^tweak.u[0]; | ||
| 182 | ((u64*)out)[1] = scratch.u[1]^tweak.u[1]; | ||
| 183 | #endif | ||
| 184 | } | ||
| 185 | |||
| 186 | return 0; | ||
| 187 | } | ||
diff --git a/src/lib/libcrypto/objects/obj_xref.c b/src/lib/libcrypto/objects/obj_xref.c index 152eca5c67..9f744bcede 100644 --- a/src/lib/libcrypto/objects/obj_xref.c +++ b/src/lib/libcrypto/objects/obj_xref.c | |||
| @@ -110,8 +110,10 @@ int OBJ_find_sigid_algs(int signid, int *pdig_nid, int *ppkey_nid) | |||
| 110 | #endif | 110 | #endif |
| 111 | if (rv == NULL) | 111 | if (rv == NULL) |
| 112 | return 0; | 112 | return 0; |
| 113 | *pdig_nid = rv->hash_id; | 113 | if (pdig_nid) |
| 114 | *ppkey_nid = rv->pkey_id; | 114 | *pdig_nid = rv->hash_id; |
| 115 | if (ppkey_nid) | ||
| 116 | *ppkey_nid = rv->pkey_id; | ||
| 115 | return 1; | 117 | return 1; |
| 116 | } | 118 | } |
| 117 | 119 | ||
| @@ -144,7 +146,8 @@ int OBJ_find_sigid_by_algs(int *psignid, int dig_nid, int pkey_nid) | |||
| 144 | #endif | 146 | #endif |
| 145 | if (rv == NULL) | 147 | if (rv == NULL) |
| 146 | return 0; | 148 | return 0; |
| 147 | *psignid = (*rv)->sign_id; | 149 | if (psignid) |
| 150 | *psignid = (*rv)->sign_id; | ||
| 148 | return 1; | 151 | return 1; |
| 149 | } | 152 | } |
| 150 | 153 | ||
diff --git a/src/lib/libcrypto/objects/obj_xref.h b/src/lib/libcrypto/objects/obj_xref.h index d5b9b8e198..e23938c296 100644 --- a/src/lib/libcrypto/objects/obj_xref.h +++ b/src/lib/libcrypto/objects/obj_xref.h | |||
| @@ -38,10 +38,12 @@ static const nid_triple sigoid_srt[] = | |||
| 38 | {NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94}, | 38 | {NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94}, |
| 39 | {NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc}, | 39 | {NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc}, |
| 40 | {NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc}, | 40 | {NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc}, |
| 41 | {NID_rsassaPss, NID_undef, NID_rsaEncryption}, | ||
| 41 | }; | 42 | }; |
| 42 | 43 | ||
| 43 | static const nid_triple * const sigoid_srt_xref[] = | 44 | static const nid_triple * const sigoid_srt_xref[] = |
| 44 | { | 45 | { |
| 46 | &sigoid_srt[29], | ||
| 45 | &sigoid_srt[17], | 47 | &sigoid_srt[17], |
| 46 | &sigoid_srt[18], | 48 | &sigoid_srt[18], |
| 47 | &sigoid_srt[0], | 49 | &sigoid_srt[0], |
diff --git a/src/lib/libcrypto/objects/obj_xref.txt b/src/lib/libcrypto/objects/obj_xref.txt index e45b3d34b9..cb917182ee 100644 --- a/src/lib/libcrypto/objects/obj_xref.txt +++ b/src/lib/libcrypto/objects/obj_xref.txt | |||
| @@ -13,6 +13,10 @@ sha512WithRSAEncryption sha512 rsaEncryption | |||
| 13 | sha224WithRSAEncryption sha224 rsaEncryption | 13 | sha224WithRSAEncryption sha224 rsaEncryption |
| 14 | mdc2WithRSA mdc2 rsaEncryption | 14 | mdc2WithRSA mdc2 rsaEncryption |
| 15 | ripemd160WithRSA ripemd160 rsaEncryption | 15 | ripemd160WithRSA ripemd160 rsaEncryption |
| 16 | # For PSS the digest algorithm can vary and depends on the included | ||
| 17 | # AlgorithmIdentifier. The digest "undef" indicates the public key | ||
| 18 | # method should handle this explicitly. | ||
| 19 | rsassaPss undef rsaEncryption | ||
| 16 | 20 | ||
| 17 | # Alternative deprecated OIDs. By using the older "rsa" OID this | 21 | # Alternative deprecated OIDs. By using the older "rsa" OID this |
| 18 | # type will be recognized by not normally used. | 22 | # type will be recognized by not normally used. |
diff --git a/src/lib/libcrypto/pariscid.pl b/src/lib/libcrypto/pariscid.pl new file mode 100644 index 0000000000..477ec9b87d --- /dev/null +++ b/src/lib/libcrypto/pariscid.pl | |||
| @@ -0,0 +1,224 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | $flavour = shift; | ||
| 4 | $output = shift; | ||
| 5 | open STDOUT,">$output"; | ||
| 6 | |||
| 7 | if ($flavour =~ /64/) { | ||
| 8 | $LEVEL ="2.0W"; | ||
| 9 | $SIZE_T =8; | ||
| 10 | $ST ="std"; | ||
| 11 | } else { | ||
| 12 | $LEVEL ="1.1"; | ||
| 13 | $SIZE_T =4; | ||
| 14 | $ST ="stw"; | ||
| 15 | } | ||
| 16 | |||
| 17 | $rp="%r2"; | ||
| 18 | $sp="%r30"; | ||
| 19 | $rv="%r28"; | ||
| 20 | |||
| 21 | $code=<<___; | ||
| 22 | .LEVEL $LEVEL | ||
| 23 | .SPACE \$TEXT\$ | ||
| 24 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 25 | |||
| 26 | .EXPORT OPENSSL_cpuid_setup,ENTRY | ||
| 27 | .ALIGN 8 | ||
| 28 | OPENSSL_cpuid_setup | ||
| 29 | .PROC | ||
| 30 | .CALLINFO NO_CALLS | ||
| 31 | .ENTRY | ||
| 32 | bv ($rp) | ||
| 33 | .EXIT | ||
| 34 | nop | ||
| 35 | .PROCEND | ||
| 36 | |||
| 37 | .EXPORT OPENSSL_rdtsc,ENTRY | ||
| 38 | .ALIGN 8 | ||
| 39 | OPENSSL_rdtsc | ||
| 40 | .PROC | ||
| 41 | .CALLINFO NO_CALLS | ||
| 42 | .ENTRY | ||
| 43 | mfctl %cr16,$rv | ||
| 44 | bv ($rp) | ||
| 45 | .EXIT | ||
| 46 | nop | ||
| 47 | .PROCEND | ||
| 48 | |||
| 49 | .EXPORT OPENSSL_wipe_cpu,ENTRY | ||
| 50 | .ALIGN 8 | ||
| 51 | OPENSSL_wipe_cpu | ||
| 52 | .PROC | ||
| 53 | .CALLINFO NO_CALLS | ||
| 54 | .ENTRY | ||
| 55 | xor %r0,%r0,%r1 | ||
| 56 | fcpy,dbl %fr0,%fr4 | ||
| 57 | xor %r0,%r0,%r19 | ||
| 58 | fcpy,dbl %fr0,%fr5 | ||
| 59 | xor %r0,%r0,%r20 | ||
| 60 | fcpy,dbl %fr0,%fr6 | ||
| 61 | xor %r0,%r0,%r21 | ||
| 62 | fcpy,dbl %fr0,%fr7 | ||
| 63 | xor %r0,%r0,%r22 | ||
| 64 | fcpy,dbl %fr0,%fr8 | ||
| 65 | xor %r0,%r0,%r23 | ||
| 66 | fcpy,dbl %fr0,%fr9 | ||
| 67 | xor %r0,%r0,%r24 | ||
| 68 | fcpy,dbl %fr0,%fr10 | ||
| 69 | xor %r0,%r0,%r25 | ||
| 70 | fcpy,dbl %fr0,%fr11 | ||
| 71 | xor %r0,%r0,%r26 | ||
| 72 | fcpy,dbl %fr0,%fr22 | ||
| 73 | xor %r0,%r0,%r29 | ||
| 74 | fcpy,dbl %fr0,%fr23 | ||
| 75 | xor %r0,%r0,%r31 | ||
| 76 | fcpy,dbl %fr0,%fr24 | ||
| 77 | fcpy,dbl %fr0,%fr25 | ||
| 78 | fcpy,dbl %fr0,%fr26 | ||
| 79 | fcpy,dbl %fr0,%fr27 | ||
| 80 | fcpy,dbl %fr0,%fr28 | ||
| 81 | fcpy,dbl %fr0,%fr29 | ||
| 82 | fcpy,dbl %fr0,%fr30 | ||
| 83 | fcpy,dbl %fr0,%fr31 | ||
| 84 | bv ($rp) | ||
| 85 | .EXIT | ||
| 86 | ldo 0($sp),$rv | ||
| 87 | .PROCEND | ||
| 88 | ___ | ||
| 89 | { | ||
| 90 | my $inp="%r26"; | ||
| 91 | my $len="%r25"; | ||
| 92 | |||
| 93 | $code.=<<___; | ||
| 94 | .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR | ||
| 95 | .ALIGN 8 | ||
| 96 | OPENSSL_cleanse | ||
| 97 | .PROC | ||
| 98 | .CALLINFO NO_CALLS | ||
| 99 | .ENTRY | ||
| 100 | cmpib,*= 0,$len,Ldone | ||
| 101 | nop | ||
| 102 | cmpib,*>>= 15,$len,Little | ||
| 103 | ldi $SIZE_T-1,%r1 | ||
| 104 | |||
| 105 | Lalign | ||
| 106 | and,*<> $inp,%r1,%r28 | ||
| 107 | b,n Laligned | ||
| 108 | stb %r0,0($inp) | ||
| 109 | ldo -1($len),$len | ||
| 110 | b Lalign | ||
| 111 | ldo 1($inp),$inp | ||
| 112 | |||
| 113 | Laligned | ||
| 114 | andcm $len,%r1,%r28 | ||
| 115 | Lot | ||
| 116 | $ST %r0,0($inp) | ||
| 117 | addib,*<> -$SIZE_T,%r28,Lot | ||
| 118 | ldo $SIZE_T($inp),$inp | ||
| 119 | |||
| 120 | and,*<> $len,%r1,$len | ||
| 121 | b,n Ldone | ||
| 122 | Little | ||
| 123 | stb %r0,0($inp) | ||
| 124 | addib,*<> -1,$len,Little | ||
| 125 | ldo 1($inp),$inp | ||
| 126 | Ldone | ||
| 127 | bv ($rp) | ||
| 128 | .EXIT | ||
| 129 | nop | ||
| 130 | .PROCEND | ||
| 131 | ___ | ||
| 132 | } | ||
| 133 | { | ||
| 134 | my ($out,$cnt,$max)=("%r26","%r25","%r24"); | ||
| 135 | my ($tick,$lasttick)=("%r23","%r22"); | ||
| 136 | my ($diff,$lastdiff)=("%r21","%r20"); | ||
| 137 | |||
| 138 | $code.=<<___; | ||
| 139 | .EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR | ||
| 140 | .ALIGN 8 | ||
| 141 | OPENSSL_instrument_bus | ||
| 142 | .PROC | ||
| 143 | .CALLINFO NO_CALLS | ||
| 144 | .ENTRY | ||
| 145 | copy $cnt,$rv | ||
| 146 | mfctl %cr16,$tick | ||
| 147 | copy $tick,$lasttick | ||
| 148 | ldi 0,$diff | ||
| 149 | |||
| 150 | fdc 0($out) | ||
| 151 | ldw 0($out),$tick | ||
| 152 | add $diff,$tick,$tick | ||
| 153 | stw $tick,0($out) | ||
| 154 | Loop | ||
| 155 | mfctl %cr16,$tick | ||
| 156 | sub $tick,$lasttick,$diff | ||
| 157 | copy $tick,$lasttick | ||
| 158 | |||
| 159 | fdc 0($out) | ||
| 160 | ldw 0($out),$tick | ||
| 161 | add $diff,$tick,$tick | ||
| 162 | stw $tick,0($out) | ||
| 163 | |||
| 164 | addib,<> -1,$cnt,Loop | ||
| 165 | addi 4,$out,$out | ||
| 166 | |||
| 167 | bv ($rp) | ||
| 168 | .EXIT | ||
| 169 | sub $rv,$cnt,$rv | ||
| 170 | .PROCEND | ||
| 171 | |||
| 172 | .EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR | ||
| 173 | .ALIGN 8 | ||
| 174 | OPENSSL_instrument_bus2 | ||
| 175 | .PROC | ||
| 176 | .CALLINFO NO_CALLS | ||
| 177 | .ENTRY | ||
| 178 | copy $cnt,$rv | ||
| 179 | sub %r0,$cnt,$cnt | ||
| 180 | |||
| 181 | mfctl %cr16,$tick | ||
| 182 | copy $tick,$lasttick | ||
| 183 | ldi 0,$diff | ||
| 184 | |||
| 185 | fdc 0($out) | ||
| 186 | ldw 0($out),$tick | ||
| 187 | add $diff,$tick,$tick | ||
| 188 | stw $tick,0($out) | ||
| 189 | |||
| 190 | mfctl %cr16,$tick | ||
| 191 | sub $tick,$lasttick,$diff | ||
| 192 | copy $tick,$lasttick | ||
| 193 | Loop2 | ||
| 194 | copy $diff,$lastdiff | ||
| 195 | fdc 0($out) | ||
| 196 | ldw 0($out),$tick | ||
| 197 | add $diff,$tick,$tick | ||
| 198 | stw $tick,0($out) | ||
| 199 | |||
| 200 | addib,= -1,$max,Ldone2 | ||
| 201 | nop | ||
| 202 | |||
| 203 | mfctl %cr16,$tick | ||
| 204 | sub $tick,$lasttick,$diff | ||
| 205 | copy $tick,$lasttick | ||
| 206 | cmpclr,<> $lastdiff,$diff,$tick | ||
| 207 | ldi 1,$tick | ||
| 208 | |||
| 209 | ldi 1,%r1 | ||
| 210 | xor %r1,$tick,$tick | ||
| 211 | addb,<> $tick,$cnt,Loop2 | ||
| 212 | shladd,l $tick,2,$out,$out | ||
| 213 | Ldone2 | ||
| 214 | bv ($rp) | ||
| 215 | .EXIT | ||
| 216 | add $rv,$cnt,$rv | ||
| 217 | .PROCEND | ||
| 218 | ___ | ||
| 219 | } | ||
| 220 | $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); | ||
| 221 | $code =~ s/,\*/,/gm if ($SIZE_T==4); | ||
| 222 | print $code; | ||
| 223 | close STDOUT; | ||
| 224 | |||
diff --git a/src/lib/libcrypto/pem/pvkfmt.c b/src/lib/libcrypto/pem/pvkfmt.c index 5f130c4528..b1bf71a5da 100644 --- a/src/lib/libcrypto/pem/pvkfmt.c +++ b/src/lib/libcrypto/pem/pvkfmt.c | |||
| @@ -709,13 +709,16 @@ static int derive_pvk_key(unsigned char *key, | |||
| 709 | const unsigned char *pass, int passlen) | 709 | const unsigned char *pass, int passlen) |
| 710 | { | 710 | { |
| 711 | EVP_MD_CTX mctx; | 711 | EVP_MD_CTX mctx; |
| 712 | int rv = 1; | ||
| 712 | EVP_MD_CTX_init(&mctx); | 713 | EVP_MD_CTX_init(&mctx); |
| 713 | EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL); | 714 | if (!EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL) |
| 714 | EVP_DigestUpdate(&mctx, salt, saltlen); | 715 | || !EVP_DigestUpdate(&mctx, salt, saltlen) |
| 715 | EVP_DigestUpdate(&mctx, pass, passlen); | 716 | || !EVP_DigestUpdate(&mctx, pass, passlen) |
| 716 | EVP_DigestFinal_ex(&mctx, key, NULL); | 717 | || !EVP_DigestFinal_ex(&mctx, key, NULL)) |
| 718 | rv = 0; | ||
| 719 | |||
| 717 | EVP_MD_CTX_cleanup(&mctx); | 720 | EVP_MD_CTX_cleanup(&mctx); |
| 718 | return 1; | 721 | return rv; |
| 719 | } | 722 | } |
| 720 | 723 | ||
| 721 | 724 | ||
| @@ -727,11 +730,12 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in, | |||
| 727 | const unsigned char *p = *in; | 730 | const unsigned char *p = *in; |
| 728 | unsigned int magic; | 731 | unsigned int magic; |
| 729 | unsigned char *enctmp = NULL, *q; | 732 | unsigned char *enctmp = NULL, *q; |
| 733 | EVP_CIPHER_CTX cctx; | ||
| 734 | EVP_CIPHER_CTX_init(&cctx); | ||
| 730 | if (saltlen) | 735 | if (saltlen) |
| 731 | { | 736 | { |
| 732 | char psbuf[PEM_BUFSIZE]; | 737 | char psbuf[PEM_BUFSIZE]; |
| 733 | unsigned char keybuf[20]; | 738 | unsigned char keybuf[20]; |
| 734 | EVP_CIPHER_CTX cctx; | ||
| 735 | int enctmplen, inlen; | 739 | int enctmplen, inlen; |
| 736 | if (cb) | 740 | if (cb) |
| 737 | inlen=cb(psbuf,PEM_BUFSIZE,0,u); | 741 | inlen=cb(psbuf,PEM_BUFSIZE,0,u); |
| @@ -757,37 +761,41 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in, | |||
| 757 | p += 8; | 761 | p += 8; |
| 758 | inlen = keylen - 8; | 762 | inlen = keylen - 8; |
| 759 | q = enctmp + 8; | 763 | q = enctmp + 8; |
| 760 | EVP_CIPHER_CTX_init(&cctx); | 764 | if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL)) |
| 761 | EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL); | 765 | goto err; |
| 762 | EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen); | 766 | if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen)) |
| 763 | EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen); | 767 | goto err; |
| 768 | if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen)) | ||
| 769 | goto err; | ||
| 764 | magic = read_ledword((const unsigned char **)&q); | 770 | magic = read_ledword((const unsigned char **)&q); |
| 765 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) | 771 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) |
| 766 | { | 772 | { |
| 767 | q = enctmp + 8; | 773 | q = enctmp + 8; |
| 768 | memset(keybuf + 5, 0, 11); | 774 | memset(keybuf + 5, 0, 11); |
| 769 | EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, | 775 | if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, |
| 770 | NULL); | 776 | NULL)) |
| 777 | goto err; | ||
| 771 | OPENSSL_cleanse(keybuf, 20); | 778 | OPENSSL_cleanse(keybuf, 20); |
| 772 | EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen); | 779 | if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen)) |
| 773 | EVP_DecryptFinal_ex(&cctx, q + enctmplen, | 780 | goto err; |
| 774 | &enctmplen); | 781 | if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, |
| 782 | &enctmplen)) | ||
| 783 | goto err; | ||
| 775 | magic = read_ledword((const unsigned char **)&q); | 784 | magic = read_ledword((const unsigned char **)&q); |
| 776 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) | 785 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) |
| 777 | { | 786 | { |
| 778 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
| 779 | PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT); | 787 | PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT); |
| 780 | goto err; | 788 | goto err; |
| 781 | } | 789 | } |
| 782 | } | 790 | } |
| 783 | else | 791 | else |
| 784 | OPENSSL_cleanse(keybuf, 20); | 792 | OPENSSL_cleanse(keybuf, 20); |
| 785 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
| 786 | p = enctmp; | 793 | p = enctmp; |
| 787 | } | 794 | } |
| 788 | 795 | ||
| 789 | ret = b2i_PrivateKey(&p, keylen); | 796 | ret = b2i_PrivateKey(&p, keylen); |
| 790 | err: | 797 | err: |
| 798 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
| 791 | if (enctmp && saltlen) | 799 | if (enctmp && saltlen) |
| 792 | OPENSSL_free(enctmp); | 800 | OPENSSL_free(enctmp); |
| 793 | return ret; | 801 | return ret; |
| @@ -841,6 +849,8 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, | |||
| 841 | { | 849 | { |
| 842 | int outlen = 24, pklen; | 850 | int outlen = 24, pklen; |
| 843 | unsigned char *p, *salt = NULL; | 851 | unsigned char *p, *salt = NULL; |
| 852 | EVP_CIPHER_CTX cctx; | ||
| 853 | EVP_CIPHER_CTX_init(&cctx); | ||
| 844 | if (enclevel) | 854 | if (enclevel) |
| 845 | outlen += PVK_SALTLEN; | 855 | outlen += PVK_SALTLEN; |
| 846 | pklen = do_i2b(NULL, pk, 0); | 856 | pklen = do_i2b(NULL, pk, 0); |
| @@ -885,7 +895,6 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, | |||
| 885 | { | 895 | { |
| 886 | char psbuf[PEM_BUFSIZE]; | 896 | char psbuf[PEM_BUFSIZE]; |
| 887 | unsigned char keybuf[20]; | 897 | unsigned char keybuf[20]; |
| 888 | EVP_CIPHER_CTX cctx; | ||
| 889 | int enctmplen, inlen; | 898 | int enctmplen, inlen; |
| 890 | if (cb) | 899 | if (cb) |
| 891 | inlen=cb(psbuf,PEM_BUFSIZE,1,u); | 900 | inlen=cb(psbuf,PEM_BUFSIZE,1,u); |
| @@ -902,16 +911,19 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, | |||
| 902 | if (enclevel == 1) | 911 | if (enclevel == 1) |
| 903 | memset(keybuf + 5, 0, 11); | 912 | memset(keybuf + 5, 0, 11); |
| 904 | p = salt + PVK_SALTLEN + 8; | 913 | p = salt + PVK_SALTLEN + 8; |
| 905 | EVP_CIPHER_CTX_init(&cctx); | 914 | if (!EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL)) |
| 906 | EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL); | 915 | goto error; |
| 907 | OPENSSL_cleanse(keybuf, 20); | 916 | OPENSSL_cleanse(keybuf, 20); |
| 908 | EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8); | 917 | if (!EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8)) |
| 909 | EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen); | 918 | goto error; |
| 910 | EVP_CIPHER_CTX_cleanup(&cctx); | 919 | if (!EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen)) |
| 920 | goto error; | ||
| 911 | } | 921 | } |
| 922 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
| 912 | return outlen; | 923 | return outlen; |
| 913 | 924 | ||
| 914 | error: | 925 | error: |
| 926 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
| 915 | return -1; | 927 | return -1; |
| 916 | } | 928 | } |
| 917 | 929 | ||
diff --git a/src/lib/libcrypto/perlasm/ppc-xlate.pl b/src/lib/libcrypto/perlasm/ppc-xlate.pl index 4579671c97..a3edd982b6 100755 --- a/src/lib/libcrypto/perlasm/ppc-xlate.pl +++ b/src/lib/libcrypto/perlasm/ppc-xlate.pl | |||
| @@ -31,10 +31,9 @@ my $globl = sub { | |||
| 31 | $ret .= ".type $name,\@function"; | 31 | $ret .= ".type $name,\@function"; |
| 32 | last; | 32 | last; |
| 33 | }; | 33 | }; |
| 34 | /linux.*64/ && do { $ret .= ".globl .$name\n"; | 34 | /linux.*64/ && do { $ret .= ".globl $name\n"; |
| 35 | $ret .= ".type .$name,\@function\n"; | 35 | $ret .= ".type $name,\@function\n"; |
| 36 | $ret .= ".section \".opd\",\"aw\"\n"; | 36 | $ret .= ".section \".opd\",\"aw\"\n"; |
| 37 | $ret .= ".globl $name\n"; | ||
| 38 | $ret .= ".align 3\n"; | 37 | $ret .= ".align 3\n"; |
| 39 | $ret .= "$name:\n"; | 38 | $ret .= "$name:\n"; |
| 40 | $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; | 39 | $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; |
| @@ -62,6 +61,14 @@ my $machine = sub { | |||
| 62 | } | 61 | } |
| 63 | ".machine $arch"; | 62 | ".machine $arch"; |
| 64 | }; | 63 | }; |
| 64 | my $size = sub { | ||
| 65 | if ($flavour =~ /linux.*32/) | ||
| 66 | { shift; | ||
| 67 | ".size " . join(",",@_); | ||
| 68 | } | ||
| 69 | else | ||
| 70 | { ""; } | ||
| 71 | }; | ||
| 65 | my $asciz = sub { | 72 | my $asciz = sub { |
| 66 | shift; | 73 | shift; |
| 67 | my $line = join(",",@_); | 74 | my $line = join(",",@_); |
diff --git a/src/lib/libcrypto/ppccap.c b/src/lib/libcrypto/ppccap.c new file mode 100644 index 0000000000..ab89ccaa12 --- /dev/null +++ b/src/lib/libcrypto/ppccap.c | |||
| @@ -0,0 +1,115 @@ | |||
| 1 | #include <stdio.h> | ||
| 2 | #include <stdlib.h> | ||
| 3 | #include <string.h> | ||
| 4 | #include <setjmp.h> | ||
| 5 | #include <signal.h> | ||
| 6 | #include <crypto.h> | ||
| 7 | #include <openssl/bn.h> | ||
| 8 | |||
| 9 | #define PPC_FPU64 (1<<0) | ||
| 10 | #define PPC_ALTIVEC (1<<1) | ||
| 11 | |||
| 12 | static int OPENSSL_ppccap_P = 0; | ||
| 13 | |||
| 14 | static sigset_t all_masked; | ||
| 15 | |||
| 16 | #ifdef OPENSSL_BN_ASM_MONT | ||
| 17 | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num) | ||
| 18 | { | ||
| 19 | int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); | ||
| 20 | int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); | ||
| 21 | |||
| 22 | if (sizeof(size_t)==4) | ||
| 23 | { | ||
| 24 | #if (defined(__APPLE__) && defined(__MACH__)) | ||
| 25 | if (num>=8 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64)) | ||
| 26 | return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); | ||
| 27 | #else | ||
| 28 | /* boundary of 32 was experimentally determined on | ||
| 29 | Linux 2.6.22, might have to be adjusted on AIX... */ | ||
| 30 | if (num>=32 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64)) | ||
| 31 | { | ||
| 32 | sigset_t oset; | ||
| 33 | int ret; | ||
| 34 | |||
| 35 | sigprocmask(SIG_SETMASK,&all_masked,&oset); | ||
| 36 | ret=bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); | ||
| 37 | sigprocmask(SIG_SETMASK,&oset,NULL); | ||
| 38 | |||
| 39 | return ret; | ||
| 40 | } | ||
| 41 | #endif | ||
| 42 | } | ||
| 43 | else if ((OPENSSL_ppccap_P&PPC_FPU64)) | ||
| 44 | /* this is a "must" on POWER6, but run-time detection | ||
| 45 | * is not implemented yet... */ | ||
| 46 | return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); | ||
| 47 | |||
| 48 | return bn_mul_mont_int(rp,ap,bp,np,n0,num); | ||
| 49 | } | ||
| 50 | #endif | ||
| 51 | |||
| 52 | static sigjmp_buf ill_jmp; | ||
| 53 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } | ||
| 54 | |||
| 55 | void OPENSSL_ppc64_probe(void); | ||
| 56 | |||
| 57 | void OPENSSL_cpuid_setup(void) | ||
| 58 | { | ||
| 59 | char *e; | ||
| 60 | struct sigaction ill_oact,ill_act; | ||
| 61 | sigset_t oset; | ||
| 62 | static int trigger=0; | ||
| 63 | |||
| 64 | if (trigger) return; | ||
| 65 | trigger=1; | ||
| 66 | |||
| 67 | sigfillset(&all_masked); | ||
| 68 | sigdelset(&all_masked,SIGILL); | ||
| 69 | sigdelset(&all_masked,SIGTRAP); | ||
| 70 | #ifdef SIGEMT | ||
| 71 | sigdelset(&all_masked,SIGEMT); | ||
| 72 | #endif | ||
| 73 | sigdelset(&all_masked,SIGFPE); | ||
| 74 | sigdelset(&all_masked,SIGBUS); | ||
| 75 | sigdelset(&all_masked,SIGSEGV); | ||
| 76 | |||
| 77 | if ((e=getenv("OPENSSL_ppccap"))) | ||
| 78 | { | ||
| 79 | OPENSSL_ppccap_P=strtoul(e,NULL,0); | ||
| 80 | return; | ||
| 81 | } | ||
| 82 | |||
| 83 | OPENSSL_ppccap_P = 0; | ||
| 84 | |||
| 85 | memset(&ill_act,0,sizeof(ill_act)); | ||
| 86 | ill_act.sa_handler = ill_handler; | ||
| 87 | ill_act.sa_mask = all_masked; | ||
| 88 | |||
| 89 | sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); | ||
| 90 | sigaction(SIGILL,&ill_act,&ill_oact); | ||
| 91 | |||
| 92 | if (sizeof(size_t)==4) | ||
| 93 | { | ||
| 94 | if (sigsetjmp(ill_jmp,1) == 0) | ||
| 95 | { | ||
| 96 | OPENSSL_ppc64_probe(); | ||
| 97 | OPENSSL_ppccap_P |= PPC_FPU64; | ||
| 98 | } | ||
| 99 | } | ||
| 100 | else | ||
| 101 | { | ||
| 102 | /* | ||
| 103 | * Wanted code detecting POWER6 CPU and setting PPC_FPU64 | ||
| 104 | */ | ||
| 105 | } | ||
| 106 | |||
| 107 | if (sigsetjmp(ill_jmp,1) == 0) | ||
| 108 | { | ||
| 109 | OPENSSL_altivec_probe(); | ||
| 110 | OPENSSL_ppccap_P |= PPC_ALTIVEC; | ||
| 111 | } | ||
| 112 | |||
| 113 | sigaction (SIGILL,&ill_oact,NULL); | ||
| 114 | sigprocmask(SIG_SETMASK,&oset,NULL); | ||
| 115 | } | ||
diff --git a/src/lib/libcrypto/ppccpuid.pl b/src/lib/libcrypto/ppccpuid.pl index 369e1d0df9..4ba736a1d1 100755 --- a/src/lib/libcrypto/ppccpuid.pl +++ b/src/lib/libcrypto/ppccpuid.pl | |||
| @@ -23,36 +23,67 @@ $code=<<___; | |||
| 23 | .machine "any" | 23 | .machine "any" |
| 24 | .text | 24 | .text |
| 25 | 25 | ||
| 26 | .globl .OPENSSL_cpuid_setup | 26 | .globl .OPENSSL_ppc64_probe |
| 27 | .align 4 | 27 | .align 4 |
| 28 | .OPENSSL_cpuid_setup: | 28 | .OPENSSL_ppc64_probe: |
| 29 | fcfid f1,f1 | ||
| 30 | extrdi r0,r0,32,0 | ||
| 29 | blr | 31 | blr |
| 32 | .long 0 | ||
| 33 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 34 | |||
| 35 | .globl .OPENSSL_altivec_probe | ||
| 36 | .align 4 | ||
| 37 | .OPENSSL_altivec_probe: | ||
| 38 | .long 0x10000484 # vor v0,v0,v0 | ||
| 39 | blr | ||
| 40 | .long 0 | ||
| 41 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 30 | 42 | ||
| 31 | .globl .OPENSSL_wipe_cpu | 43 | .globl .OPENSSL_wipe_cpu |
| 32 | .align 4 | 44 | .align 4 |
| 33 | .OPENSSL_wipe_cpu: | 45 | .OPENSSL_wipe_cpu: |
| 34 | xor r0,r0,r0 | 46 | xor r0,r0,r0 |
| 47 | fmr f0,f31 | ||
| 48 | fmr f1,f31 | ||
| 49 | fmr f2,f31 | ||
| 35 | mr r3,r1 | 50 | mr r3,r1 |
| 51 | fmr f3,f31 | ||
| 36 | xor r4,r4,r4 | 52 | xor r4,r4,r4 |
| 53 | fmr f4,f31 | ||
| 37 | xor r5,r5,r5 | 54 | xor r5,r5,r5 |
| 55 | fmr f5,f31 | ||
| 38 | xor r6,r6,r6 | 56 | xor r6,r6,r6 |
| 57 | fmr f6,f31 | ||
| 39 | xor r7,r7,r7 | 58 | xor r7,r7,r7 |
| 59 | fmr f7,f31 | ||
| 40 | xor r8,r8,r8 | 60 | xor r8,r8,r8 |
| 61 | fmr f8,f31 | ||
| 41 | xor r9,r9,r9 | 62 | xor r9,r9,r9 |
| 63 | fmr f9,f31 | ||
| 42 | xor r10,r10,r10 | 64 | xor r10,r10,r10 |
| 65 | fmr f10,f31 | ||
| 43 | xor r11,r11,r11 | 66 | xor r11,r11,r11 |
| 67 | fmr f11,f31 | ||
| 44 | xor r12,r12,r12 | 68 | xor r12,r12,r12 |
| 69 | fmr f12,f31 | ||
| 70 | fmr f13,f31 | ||
| 45 | blr | 71 | blr |
| 72 | .long 0 | ||
| 73 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 46 | 74 | ||
| 47 | .globl .OPENSSL_atomic_add | 75 | .globl .OPENSSL_atomic_add |
| 48 | .align 4 | 76 | .align 4 |
| 49 | .OPENSSL_atomic_add: | 77 | .OPENSSL_atomic_add: |
| 50 | Loop: lwarx r5,0,r3 | 78 | Ladd: lwarx r5,0,r3 |
| 51 | add r0,r4,r5 | 79 | add r0,r4,r5 |
| 52 | stwcx. r0,0,r3 | 80 | stwcx. r0,0,r3 |
| 53 | bne- Loop | 81 | bne- Ladd |
| 54 | $SIGNX r3,r0 | 82 | $SIGNX r3,r0 |
| 55 | blr | 83 | blr |
| 84 | .long 0 | ||
| 85 | .byte 0,12,0x14,0,0,0,2,0 | ||
| 86 | .long 0 | ||
| 56 | 87 | ||
| 57 | .globl .OPENSSL_rdtsc | 88 | .globl .OPENSSL_rdtsc |
| 58 | .align 4 | 89 | .align 4 |
| @@ -60,6 +91,8 @@ Loop: lwarx r5,0,r3 | |||
| 60 | mftb r3 | 91 | mftb r3 |
| 61 | mftbu r4 | 92 | mftbu r4 |
| 62 | blr | 93 | blr |
| 94 | .long 0 | ||
| 95 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 63 | 96 | ||
| 64 | .globl .OPENSSL_cleanse | 97 | .globl .OPENSSL_cleanse |
| 65 | .align 4 | 98 | .align 4 |
| @@ -72,7 +105,7 @@ Loop: lwarx r5,0,r3 | |||
| 72 | Little: mtctr r4 | 105 | Little: mtctr r4 |
| 73 | stb r0,0(r3) | 106 | stb r0,0(r3) |
| 74 | addi r3,r3,1 | 107 | addi r3,r3,1 |
| 75 | bdnz- \$-8 | 108 | bdnz \$-8 |
| 76 | blr | 109 | blr |
| 77 | Lot: andi. r5,r3,3 | 110 | Lot: andi. r5,r3,3 |
| 78 | beq Laligned | 111 | beq Laligned |
| @@ -85,10 +118,13 @@ Laligned: | |||
| 85 | mtctr r5 | 118 | mtctr r5 |
| 86 | stw r0,0(r3) | 119 | stw r0,0(r3) |
| 87 | addi r3,r3,4 | 120 | addi r3,r3,4 |
| 88 | bdnz- \$-8 | 121 | bdnz \$-8 |
| 89 | andi. r4,r4,3 | 122 | andi. r4,r4,3 |
| 90 | bne Little | 123 | bne Little |
| 91 | blr | 124 | blr |
| 125 | .long 0 | ||
| 126 | .byte 0,12,0x14,0,0,0,2,0 | ||
| 127 | .long 0 | ||
| 92 | ___ | 128 | ___ |
| 93 | 129 | ||
| 94 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 130 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl new file mode 100644 index 0000000000..7f684092d4 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl | |||
| @@ -0,0 +1,631 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # June 2011 | ||
| 11 | # | ||
| 12 | # This is RC4+MD5 "stitch" implementation. The idea, as spelled in | ||
| 13 | # http://download.intel.com/design/intarch/papers/323686.pdf, is that | ||
| 14 | # since both algorithms exhibit instruction-level parallelism, ILP, | ||
| 15 | # below theoretical maximum, interleaving them would allow to utilize | ||
| 16 | # processor resources better and achieve better performance. RC4 | ||
| 17 | # instruction sequence is virtually identical to rc4-x86_64.pl, which | ||
| 18 | # is heavily based on submission by Maxim Perminov, Maxim Locktyukhin | ||
| 19 | # and Jim Guilford of Intel. MD5 is fresh implementation aiming to | ||
| 20 | # minimize register usage, which was used as "main thread" with RC4 | ||
| 21 | # weaved into it, one RC4 round per one MD5 round. In addition to the | ||
| 22 | # stiched subroutine the script can generate standalone replacement | ||
| 23 | # md5_block_asm_data_order and RC4. Below are performance numbers in | ||
| 24 | # cycles per processed byte, less is better, for these the standalone | ||
| 25 | # subroutines, sum of them, and stitched one: | ||
| 26 | # | ||
| 27 | # RC4 MD5 RC4+MD5 stitch gain | ||
| 28 | # Opteron 6.5(*) 5.4 11.9 7.0 +70%(*) | ||
| 29 | # Core2 6.5 5.8 12.3 7.7 +60% | ||
| 30 | # Westmere 4.3 5.2 9.5 7.0 +36% | ||
| 31 | # Sandy Bridge 4.2 5.5 9.7 6.8 +43% | ||
| 32 | # Atom 9.3 6.5 15.8 11.1 +42% | ||
| 33 | # | ||
| 34 | # (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement | ||
| 35 | # is +53%... | ||
| 36 | |||
| 37 | my ($rc4,$md5)=(1,1); # what to generate? | ||
| 38 | my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(), | ||
| 39 | # but its result is discarded. Idea here is | ||
| 40 | # to be able to use 'openssl speed rc4' for | ||
| 41 | # benchmarking the stitched subroutine... | ||
| 42 | |||
| 43 | my $flavour = shift; | ||
| 44 | my $output = shift; | ||
| 45 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 46 | |||
| 47 | my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 48 | |||
| 49 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; | ||
| 50 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 51 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 52 | die "can't locate x86_64-xlate.pl"; | ||
| 53 | |||
| 54 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
| 55 | |||
| 56 | my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs); | ||
| 57 | |||
| 58 | if ($rc4 && !$md5) { | ||
| 59 | ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx"); | ||
| 60 | $func="RC4"; $nargs=4; | ||
| 61 | } elsif ($md5 && !$rc4) { | ||
| 62 | ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx"); | ||
| 63 | $func="md5_block_asm_data_order"; $nargs=3; | ||
| 64 | } else { | ||
| 65 | ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||
| 66 | $func="rc4_md5_enc"; $nargs=6; | ||
| 67 | # void rc4_md5_enc( | ||
| 68 | # RC4_KEY *key, # | ||
| 69 | # const void *in0, # RC4 input | ||
| 70 | # void *out, # RC4 output | ||
| 71 | # MD5_CTX *ctx, # | ||
| 72 | # const void *inp, # MD5 input | ||
| 73 | # size_t len); # number of 64-byte blocks | ||
| 74 | } | ||
| 75 | |||
| 76 | my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee, | ||
| 77 | 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501, | ||
| 78 | 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be, | ||
| 79 | 0x6b901122,0xfd987193,0xa679438e,0x49b40821, | ||
| 80 | |||
| 81 | 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa, | ||
| 82 | 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8, | ||
| 83 | 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed, | ||
| 84 | 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a, | ||
| 85 | |||
| 86 | 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c, | ||
| 87 | 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70, | ||
| 88 | 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05, | ||
| 89 | 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665, | ||
| 90 | |||
| 91 | 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039, | ||
| 92 | 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1, | ||
| 93 | 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1, | ||
| 94 | 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 ); | ||
| 95 | |||
| 96 | my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers | ||
| 97 | my $tmp="%r12d"; | ||
| 98 | |||
| 99 | my @XX=("%rbp","%rsi"); # RC4 registers | ||
| 100 | my @TX=("%rax","%rbx"); | ||
| 101 | my $YY="%rcx"; | ||
| 102 | my $TY="%rdx"; | ||
| 103 | |||
| 104 | my $MOD=32; # 16, 32 or 64 | ||
| 105 | |||
| 106 | $code.=<<___; | ||
| 107 | .text | ||
| 108 | .align 16 | ||
| 109 | |||
| 110 | .globl $func | ||
| 111 | .type $func,\@function,$nargs | ||
| 112 | $func: | ||
| 113 | cmp \$0,$len | ||
| 114 | je .Labort | ||
| 115 | push %rbx | ||
| 116 | push %rbp | ||
| 117 | push %r12 | ||
| 118 | push %r13 | ||
| 119 | push %r14 | ||
| 120 | push %r15 | ||
| 121 | sub \$40,%rsp | ||
| 122 | .Lbody: | ||
| 123 | ___ | ||
| 124 | if ($rc4) { | ||
| 125 | $code.=<<___; | ||
| 126 | $D#md5# mov $ctx,%r11 # reassign arguments | ||
| 127 | mov $len,%r12 | ||
| 128 | mov $in0,%r13 | ||
| 129 | mov $out,%r14 | ||
| 130 | $D#md5# mov $inp,%r15 | ||
| 131 | ___ | ||
| 132 | $ctx="%r11" if ($md5); # reassign arguments | ||
| 133 | $len="%r12"; | ||
| 134 | $in0="%r13"; | ||
| 135 | $out="%r14"; | ||
| 136 | $inp="%r15" if ($md5); | ||
| 137 | $inp=$in0 if (!$md5); | ||
| 138 | $code.=<<___; | ||
| 139 | xor $XX[0],$XX[0] | ||
| 140 | xor $YY,$YY | ||
| 141 | |||
| 142 | lea 8($dat),$dat | ||
| 143 | mov -8($dat),$XX[0]#b | ||
| 144 | mov -4($dat),$YY#b | ||
| 145 | |||
| 146 | inc $XX[0]#b | ||
| 147 | sub $in0,$out | ||
| 148 | movl ($dat,$XX[0],4),$TX[0]#d | ||
| 149 | ___ | ||
| 150 | $code.=<<___ if (!$md5); | ||
| 151 | xor $TX[1],$TX[1] | ||
| 152 | test \$-128,$len | ||
| 153 | jz .Loop1 | ||
| 154 | sub $XX[0],$TX[1] | ||
| 155 | and \$`$MOD-1`,$TX[1] | ||
| 156 | jz .Loop${MOD}_is_hot | ||
| 157 | sub $TX[1],$len | ||
| 158 | .Loop${MOD}_warmup: | ||
| 159 | add $TX[0]#b,$YY#b | ||
| 160 | movl ($dat,$YY,4),$TY#d | ||
| 161 | movl $TX[0]#d,($dat,$YY,4) | ||
| 162 | movl $TY#d,($dat,$XX[0],4) | ||
| 163 | add $TY#b,$TX[0]#b | ||
| 164 | inc $XX[0]#b | ||
| 165 | movl ($dat,$TX[0],4),$TY#d | ||
| 166 | movl ($dat,$XX[0],4),$TX[0]#d | ||
| 167 | xorb ($in0),$TY#b | ||
| 168 | movb $TY#b,($out,$in0) | ||
| 169 | lea 1($in0),$in0 | ||
| 170 | dec $TX[1] | ||
| 171 | jnz .Loop${MOD}_warmup | ||
| 172 | |||
| 173 | mov $YY,$TX[1] | ||
| 174 | xor $YY,$YY | ||
| 175 | mov $TX[1]#b,$YY#b | ||
| 176 | |||
| 177 | .Loop${MOD}_is_hot: | ||
| 178 | mov $len,32(%rsp) # save original $len | ||
| 179 | shr \$6,$len # number of 64-byte blocks | ||
| 180 | ___ | ||
| 181 | if ($D && !$md5) { # stitch in dummy MD5 | ||
| 182 | $md5=1; | ||
| 183 | $ctx="%r11"; | ||
| 184 | $inp="%r15"; | ||
| 185 | $code.=<<___; | ||
| 186 | mov %rsp,$ctx | ||
| 187 | mov $in0,$inp | ||
| 188 | ___ | ||
| 189 | } | ||
| 190 | } | ||
| 191 | $code.=<<___; | ||
| 192 | #rc4# add $TX[0]#b,$YY#b | ||
| 193 | #rc4# lea ($dat,$XX[0],4),$XX[1] | ||
| 194 | shl \$6,$len | ||
| 195 | add $inp,$len # pointer to the end of input | ||
| 196 | mov $len,16(%rsp) | ||
| 197 | |||
| 198 | #md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX | ||
| 199 | #md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX | ||
| 200 | #md5# mov 1*4($ctx),$V[1] | ||
| 201 | #md5# mov 2*4($ctx),$V[2] | ||
| 202 | #md5# mov 3*4($ctx),$V[3] | ||
| 203 | jmp .Loop | ||
| 204 | |||
| 205 | .align 16 | ||
| 206 | .Loop: | ||
| 207 | #md5# mov $V[0],0*4(%rsp) # put aside current hash value | ||
| 208 | #md5# mov $V[1],1*4(%rsp) | ||
| 209 | #md5# mov $V[2],2*4(%rsp) | ||
| 210 | #md5# mov $V[3],$tmp # forward reference | ||
| 211 | #md5# mov $V[3],3*4(%rsp) | ||
| 212 | ___ | ||
| 213 | |||
| 214 | sub R0 { | ||
| 215 | my ($i,$a,$b,$c,$d)=@_; | ||
| 216 | my @rot0=(7,12,17,22); | ||
| 217 | my $j=$i%16; | ||
| 218 | my $k=$i%$MOD; | ||
| 219 | my $xmm="%xmm".($j&1); | ||
| 220 | $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15); | ||
| 221 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 222 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 223 | $code.=<<___; | ||
| 224 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 225 | #md5# xor $c,$tmp | ||
| 226 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 227 | #md5# and $b,$tmp | ||
| 228 | #md5# add 4*`$j`($inp),$a | ||
| 229 | #rc4# add $TY#b,$TX[0]#b | ||
| 230 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 231 | #md5# add \$$K[$i],$a | ||
| 232 | #md5# xor $d,$tmp | ||
| 233 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 234 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 235 | #md5# add $tmp,$a | ||
| 236 | #rc4# add $TX[1]#b,$YY#b | ||
| 237 | #md5# rol \$$rot0[$j%4],$a | ||
| 238 | #md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference | ||
| 239 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 240 | #md5# add $b,$a | ||
| 241 | ___ | ||
| 242 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 243 | mov $YY,$XX[1] | ||
| 244 | xor $YY,$YY # keyword to partial register | ||
| 245 | mov $XX[1]#b,$YY#b | ||
| 246 | lea ($dat,$XX[0],4),$XX[1] | ||
| 247 | ___ | ||
| 248 | $code.=<<___ if ($rc4 && $j==15); | ||
| 249 | psllq \$8,%xmm1 | ||
| 250 | pxor %xmm0,%xmm2 | ||
| 251 | pxor %xmm1,%xmm2 | ||
| 252 | ___ | ||
| 253 | } | ||
| 254 | sub R1 { | ||
| 255 | my ($i,$a,$b,$c,$d)=@_; | ||
| 256 | my @rot1=(5,9,14,20); | ||
| 257 | my $j=$i%16; | ||
| 258 | my $k=$i%$MOD; | ||
| 259 | my $xmm="%xmm".($j&1); | ||
| 260 | $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15); | ||
| 261 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 262 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 263 | $code.=<<___; | ||
| 264 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 265 | #md5# xor $b,$tmp | ||
| 266 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 267 | #md5# and $d,$tmp | ||
| 268 | #md5# add 4*`((1+5*$j)%16)`($inp),$a | ||
| 269 | #rc4# add $TY#b,$TX[0]#b | ||
| 270 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 271 | #md5# add \$$K[$i],$a | ||
| 272 | #md5# xor $c,$tmp | ||
| 273 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 274 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 275 | #md5# add $tmp,$a | ||
| 276 | #rc4# add $TX[1]#b,$YY#b | ||
| 277 | #md5# rol \$$rot1[$j%4],$a | ||
| 278 | #md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference | ||
| 279 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 280 | #md5# add $b,$a | ||
| 281 | ___ | ||
| 282 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 283 | mov $YY,$XX[1] | ||
| 284 | xor $YY,$YY # keyword to partial register | ||
| 285 | mov $XX[1]#b,$YY#b | ||
| 286 | lea ($dat,$XX[0],4),$XX[1] | ||
| 287 | ___ | ||
| 288 | $code.=<<___ if ($rc4 && $j==15); | ||
| 289 | psllq \$8,%xmm1 | ||
| 290 | pxor %xmm0,%xmm3 | ||
| 291 | pxor %xmm1,%xmm3 | ||
| 292 | ___ | ||
| 293 | } | ||
| 294 | sub R2 { | ||
| 295 | my ($i,$a,$b,$c,$d)=@_; | ||
| 296 | my @rot2=(4,11,16,23); | ||
| 297 | my $j=$i%16; | ||
| 298 | my $k=$i%$MOD; | ||
| 299 | my $xmm="%xmm".($j&1); | ||
| 300 | $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15); | ||
| 301 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 302 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 303 | $code.=<<___; | ||
| 304 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 305 | #md5# xor $c,$tmp | ||
| 306 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 307 | #md5# xor $b,$tmp | ||
| 308 | #md5# add 4*`((5+3*$j)%16)`($inp),$a | ||
| 309 | #rc4# add $TY#b,$TX[0]#b | ||
| 310 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 311 | #md5# add \$$K[$i],$a | ||
| 312 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 313 | #md5# add $tmp,$a | ||
| 314 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 315 | #rc4# add $TX[1]#b,$YY#b | ||
| 316 | #md5# rol \$$rot2[$j%4],$a | ||
| 317 | #md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference | ||
| 318 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 319 | #md5# add $b,$a | ||
| 320 | ___ | ||
| 321 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 322 | mov $YY,$XX[1] | ||
| 323 | xor $YY,$YY # keyword to partial register | ||
| 324 | mov $XX[1]#b,$YY#b | ||
| 325 | lea ($dat,$XX[0],4),$XX[1] | ||
| 326 | ___ | ||
| 327 | $code.=<<___ if ($rc4 && $j==15); | ||
| 328 | psllq \$8,%xmm1 | ||
| 329 | pxor %xmm0,%xmm4 | ||
| 330 | pxor %xmm1,%xmm4 | ||
| 331 | ___ | ||
| 332 | } | ||
| 333 | sub R3 { | ||
| 334 | my ($i,$a,$b,$c,$d)=@_; | ||
| 335 | my @rot3=(6,10,15,21); | ||
| 336 | my $j=$i%16; | ||
| 337 | my $k=$i%$MOD; | ||
| 338 | my $xmm="%xmm".($j&1); | ||
| 339 | $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15); | ||
| 340 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 341 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 342 | $code.=<<___; | ||
| 343 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 344 | #md5# xor $d,$tmp | ||
| 345 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 346 | #md5# or $b,$tmp | ||
| 347 | #md5# add 4*`((7*$j)%16)`($inp),$a | ||
| 348 | #rc4# add $TY#b,$TX[0]#b | ||
| 349 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 350 | #md5# add \$$K[$i],$a | ||
| 351 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 352 | #md5# xor $c,$tmp | ||
| 353 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 354 | #md5# add $tmp,$a | ||
| 355 | #rc4# add $TX[1]#b,$YY#b | ||
| 356 | #md5# rol \$$rot3[$j%4],$a | ||
| 357 | #md5# mov \$-1,$tmp # forward reference | ||
| 358 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 359 | #md5# add $b,$a | ||
| 360 | ___ | ||
| 361 | $code.=<<___ if ($rc4 && $j==15); | ||
| 362 | mov $XX[0],$XX[1] | ||
| 363 | xor $XX[0],$XX[0] # keyword to partial register | ||
| 364 | mov $XX[1]#b,$XX[0]#b | ||
| 365 | mov $YY,$XX[1] | ||
| 366 | xor $YY,$YY # keyword to partial register | ||
| 367 | mov $XX[1]#b,$YY#b | ||
| 368 | lea ($dat,$XX[0],4),$XX[1] | ||
| 369 | psllq \$8,%xmm1 | ||
| 370 | pxor %xmm0,%xmm5 | ||
| 371 | pxor %xmm1,%xmm5 | ||
| 372 | ___ | ||
| 373 | } | ||
| 374 | |||
| 375 | my $i=0; | ||
| 376 | for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 377 | for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 378 | for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 379 | for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 380 | |||
| 381 | $code.=<<___; | ||
| 382 | #md5# add 0*4(%rsp),$V[0] # accumulate hash value | ||
| 383 | #md5# add 1*4(%rsp),$V[1] | ||
| 384 | #md5# add 2*4(%rsp),$V[2] | ||
| 385 | #md5# add 3*4(%rsp),$V[3] | ||
| 386 | |||
| 387 | #rc4# movdqu %xmm2,($out,$in0) # write RC4 output | ||
| 388 | #rc4# movdqu %xmm3,16($out,$in0) | ||
| 389 | #rc4# movdqu %xmm4,32($out,$in0) | ||
| 390 | #rc4# movdqu %xmm5,48($out,$in0) | ||
| 391 | #md5# lea 64($inp),$inp | ||
| 392 | #rc4# lea 64($in0),$in0 | ||
| 393 | cmp 16(%rsp),$inp # are we done? | ||
| 394 | jb .Loop | ||
| 395 | |||
| 396 | #md5# mov 24(%rsp),$len # restore pointer to MD5_CTX | ||
| 397 | #rc4# sub $TX[0]#b,$YY#b # correct $YY | ||
| 398 | #md5# mov $V[0],0*4($len) # write MD5_CTX | ||
| 399 | #md5# mov $V[1],1*4($len) | ||
| 400 | #md5# mov $V[2],2*4($len) | ||
| 401 | #md5# mov $V[3],3*4($len) | ||
| 402 | ___ | ||
| 403 | $code.=<<___ if ($rc4 && (!$md5 || $D)); | ||
| 404 | mov 32(%rsp),$len # restore original $len | ||
| 405 | and \$63,$len # remaining bytes | ||
| 406 | jnz .Loop1 | ||
| 407 | jmp .Ldone | ||
| 408 | |||
| 409 | .align 16 | ||
| 410 | .Loop1: | ||
| 411 | add $TX[0]#b,$YY#b | ||
| 412 | movl ($dat,$YY,4),$TY#d | ||
| 413 | movl $TX[0]#d,($dat,$YY,4) | ||
| 414 | movl $TY#d,($dat,$XX[0],4) | ||
| 415 | add $TY#b,$TX[0]#b | ||
| 416 | inc $XX[0]#b | ||
| 417 | movl ($dat,$TX[0],4),$TY#d | ||
| 418 | movl ($dat,$XX[0],4),$TX[0]#d | ||
| 419 | xorb ($in0),$TY#b | ||
| 420 | movb $TY#b,($out,$in0) | ||
| 421 | lea 1($in0),$in0 | ||
| 422 | dec $len | ||
| 423 | jnz .Loop1 | ||
| 424 | |||
| 425 | .Ldone: | ||
| 426 | ___ | ||
| 427 | $code.=<<___; | ||
| 428 | #rc4# sub \$1,$XX[0]#b | ||
| 429 | #rc4# movl $XX[0]#d,-8($dat) | ||
| 430 | #rc4# movl $YY#d,-4($dat) | ||
| 431 | |||
| 432 | mov 40(%rsp),%r15 | ||
| 433 | mov 48(%rsp),%r14 | ||
| 434 | mov 56(%rsp),%r13 | ||
| 435 | mov 64(%rsp),%r12 | ||
| 436 | mov 72(%rsp),%rbp | ||
| 437 | mov 80(%rsp),%rbx | ||
| 438 | lea 88(%rsp),%rsp | ||
| 439 | .Lepilogue: | ||
| 440 | .Labort: | ||
| 441 | ret | ||
| 442 | .size $func,.-$func | ||
| 443 | ___ | ||
| 444 | |||
| 445 | if ($rc4 && $D) { # sole purpose of this section is to provide | ||
| 446 | # option to use the generated module as drop-in | ||
| 447 | # replacement for rc4-x86_64.pl for debugging | ||
| 448 | # and testing purposes... | ||
| 449 | my ($idx,$ido)=("%r8","%r9"); | ||
| 450 | my ($dat,$len,$inp)=("%rdi","%rsi","%rdx"); | ||
| 451 | |||
| 452 | $code.=<<___; | ||
| 453 | .globl RC4_set_key | ||
| 454 | .type RC4_set_key,\@function,3 | ||
| 455 | .align 16 | ||
| 456 | RC4_set_key: | ||
| 457 | lea 8($dat),$dat | ||
| 458 | lea ($inp,$len),$inp | ||
| 459 | neg $len | ||
| 460 | mov $len,%rcx | ||
| 461 | xor %eax,%eax | ||
| 462 | xor $ido,$ido | ||
| 463 | xor %r10,%r10 | ||
| 464 | xor %r11,%r11 | ||
| 465 | jmp .Lw1stloop | ||
| 466 | |||
| 467 | .align 16 | ||
| 468 | .Lw1stloop: | ||
| 469 | mov %eax,($dat,%rax,4) | ||
| 470 | add \$1,%al | ||
| 471 | jnc .Lw1stloop | ||
| 472 | |||
| 473 | xor $ido,$ido | ||
| 474 | xor $idx,$idx | ||
| 475 | .align 16 | ||
| 476 | .Lw2ndloop: | ||
| 477 | mov ($dat,$ido,4),%r10d | ||
| 478 | add ($inp,$len,1),$idx#b | ||
| 479 | add %r10b,$idx#b | ||
| 480 | add \$1,$len | ||
| 481 | mov ($dat,$idx,4),%r11d | ||
| 482 | cmovz %rcx,$len | ||
| 483 | mov %r10d,($dat,$idx,4) | ||
| 484 | mov %r11d,($dat,$ido,4) | ||
| 485 | add \$1,$ido#b | ||
| 486 | jnc .Lw2ndloop | ||
| 487 | |||
| 488 | xor %eax,%eax | ||
| 489 | mov %eax,-8($dat) | ||
| 490 | mov %eax,-4($dat) | ||
| 491 | ret | ||
| 492 | .size RC4_set_key,.-RC4_set_key | ||
| 493 | |||
| 494 | .globl RC4_options | ||
| 495 | .type RC4_options,\@abi-omnipotent | ||
| 496 | .align 16 | ||
| 497 | RC4_options: | ||
| 498 | lea .Lopts(%rip),%rax | ||
| 499 | ret | ||
| 500 | .align 64 | ||
| 501 | .Lopts: | ||
| 502 | .asciz "rc4(64x,int)" | ||
| 503 | .align 64 | ||
| 504 | .size RC4_options,.-RC4_options | ||
| 505 | ___ | ||
| 506 | } | ||
| 507 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 508 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 509 | if ($win64) { | ||
| 510 | my $rec="%rcx"; | ||
| 511 | my $frame="%rdx"; | ||
| 512 | my $context="%r8"; | ||
| 513 | my $disp="%r9"; | ||
| 514 | |||
| 515 | $code.=<<___; | ||
| 516 | .extern __imp_RtlVirtualUnwind | ||
| 517 | .type se_handler,\@abi-omnipotent | ||
| 518 | .align 16 | ||
| 519 | se_handler: | ||
| 520 | push %rsi | ||
| 521 | push %rdi | ||
| 522 | push %rbx | ||
| 523 | push %rbp | ||
| 524 | push %r12 | ||
| 525 | push %r13 | ||
| 526 | push %r14 | ||
| 527 | push %r15 | ||
| 528 | pushfq | ||
| 529 | sub \$64,%rsp | ||
| 530 | |||
| 531 | mov 120($context),%rax # pull context->Rax | ||
| 532 | mov 248($context),%rbx # pull context->Rip | ||
| 533 | |||
| 534 | lea .Lbody(%rip),%r10 | ||
| 535 | cmp %r10,%rbx # context->Rip<.Lbody | ||
| 536 | jb .Lin_prologue | ||
| 537 | |||
| 538 | mov 152($context),%rax # pull context->Rsp | ||
| 539 | |||
| 540 | lea .Lepilogue(%rip),%r10 | ||
| 541 | cmp %r10,%rbx # context->Rip>=.Lepilogue | ||
| 542 | jae .Lin_prologue | ||
| 543 | |||
| 544 | mov 40(%rax),%r15 | ||
| 545 | mov 48(%rax),%r14 | ||
| 546 | mov 56(%rax),%r13 | ||
| 547 | mov 64(%rax),%r12 | ||
| 548 | mov 72(%rax),%rbp | ||
| 549 | mov 80(%rax),%rbx | ||
| 550 | lea 88(%rax),%rax | ||
| 551 | |||
| 552 | mov %rbx,144($context) # restore context->Rbx | ||
| 553 | mov %rbp,160($context) # restore context->Rbp | ||
| 554 | mov %r12,216($context) # restore context->R12 | ||
| 555 | mov %r13,224($context) # restore context->R12 | ||
| 556 | mov %r14,232($context) # restore context->R14 | ||
| 557 | mov %r15,240($context) # restore context->R15 | ||
| 558 | |||
| 559 | .Lin_prologue: | ||
| 560 | mov 8(%rax),%rdi | ||
| 561 | mov 16(%rax),%rsi | ||
| 562 | mov %rax,152($context) # restore context->Rsp | ||
| 563 | mov %rsi,168($context) # restore context->Rsi | ||
| 564 | mov %rdi,176($context) # restore context->Rdi | ||
| 565 | |||
| 566 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 567 | mov $context,%rsi # context | ||
| 568 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 569 | .long 0xa548f3fc # cld; rep movsq | ||
| 570 | |||
| 571 | mov $disp,%rsi | ||
| 572 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 573 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 574 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 575 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 576 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 577 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 578 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 579 | mov %r10,32(%rsp) # arg5 | ||
| 580 | mov %r11,40(%rsp) # arg6 | ||
| 581 | mov %r12,48(%rsp) # arg7 | ||
| 582 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 583 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 584 | |||
| 585 | mov \$1,%eax # ExceptionContinueSearch | ||
| 586 | add \$64,%rsp | ||
| 587 | popfq | ||
| 588 | pop %r15 | ||
| 589 | pop %r14 | ||
| 590 | pop %r13 | ||
| 591 | pop %r12 | ||
| 592 | pop %rbp | ||
| 593 | pop %rbx | ||
| 594 | pop %rdi | ||
| 595 | pop %rsi | ||
| 596 | ret | ||
| 597 | .size se_handler,.-se_handler | ||
| 598 | |||
| 599 | .section .pdata | ||
| 600 | .align 4 | ||
| 601 | .rva .LSEH_begin_$func | ||
| 602 | .rva .LSEH_end_$func | ||
| 603 | .rva .LSEH_info_$func | ||
| 604 | |||
| 605 | .section .xdata | ||
| 606 | .align 8 | ||
| 607 | .LSEH_info_$func: | ||
| 608 | .byte 9,0,0,0 | ||
| 609 | .rva se_handler | ||
| 610 | ___ | ||
| 611 | } | ||
| 612 | |||
| 613 | sub reg_part { | ||
| 614 | my ($reg,$conv)=@_; | ||
| 615 | if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } | ||
| 616 | elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } | ||
| 617 | elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } | ||
| 618 | elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } | ||
| 619 | return $reg; | ||
| 620 | } | ||
| 621 | |||
| 622 | $code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; | ||
| 623 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 624 | $code =~ s/pinsrw\s+\$0,/movd /gm; | ||
| 625 | |||
| 626 | $code =~ s/#md5#//gm if ($md5); | ||
| 627 | $code =~ s/#rc4#//gm if ($rc4); | ||
| 628 | |||
| 629 | print $code; | ||
| 630 | |||
| 631 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/rc4/asm/rc4-parisc.pl b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl new file mode 100644 index 0000000000..9165067080 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl | |||
| @@ -0,0 +1,313 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # RC4 for PA-RISC. | ||
| 11 | |||
| 12 | # June 2009. | ||
| 13 | # | ||
| 14 | # Performance is 33% better than gcc 3.2 generated code on PA-7100LC. | ||
| 15 | # For reference, [4x] unrolled loop is >40% faster than folded one. | ||
| 16 | # It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement | ||
| 17 | # is believed to be not sufficient to justify the effort... | ||
| 18 | # | ||
| 19 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 20 | |||
| 21 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 22 | |||
| 23 | $flavour = shift; | ||
| 24 | $output = shift; | ||
| 25 | open STDOUT,">$output"; | ||
| 26 | |||
| 27 | if ($flavour =~ /64/) { | ||
| 28 | $LEVEL ="2.0W"; | ||
| 29 | $SIZE_T =8; | ||
| 30 | $FRAME_MARKER =80; | ||
| 31 | $SAVED_RP =16; | ||
| 32 | $PUSH ="std"; | ||
| 33 | $PUSHMA ="std,ma"; | ||
| 34 | $POP ="ldd"; | ||
| 35 | $POPMB ="ldd,mb"; | ||
| 36 | } else { | ||
| 37 | $LEVEL ="1.0"; | ||
| 38 | $SIZE_T =4; | ||
| 39 | $FRAME_MARKER =48; | ||
| 40 | $SAVED_RP =20; | ||
| 41 | $PUSH ="stw"; | ||
| 42 | $PUSHMA ="stwm"; | ||
| 43 | $POP ="ldw"; | ||
| 44 | $POPMB ="ldwm"; | ||
| 45 | } | ||
| 46 | |||
| 47 | $FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker | ||
| 48 | # [+ argument transfer] | ||
| 49 | $SZ=1; # defaults to RC4_CHAR | ||
| 50 | if (open CONF,"<${dir}../../opensslconf.h") { | ||
| 51 | while(<CONF>) { | ||
| 52 | if (m/#\s*define\s+RC4_INT\s+(.*)/) { | ||
| 53 | $SZ = ($1=~/char$/) ? 1 : 4; | ||
| 54 | last; | ||
| 55 | } | ||
| 56 | } | ||
| 57 | close CONF; | ||
| 58 | } | ||
| 59 | |||
| 60 | if ($SZ==1) { # RC4_CHAR | ||
| 61 | $LD="ldb"; | ||
| 62 | $LDX="ldbx"; | ||
| 63 | $MKX="addl"; | ||
| 64 | $ST="stb"; | ||
| 65 | } else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC) | ||
| 66 | $LD="ldw"; | ||
| 67 | $LDX="ldwx,s"; | ||
| 68 | $MKX="sh2addl"; | ||
| 69 | $ST="stw"; | ||
| 70 | } | ||
| 71 | |||
| 72 | $key="%r26"; | ||
| 73 | $len="%r25"; | ||
| 74 | $inp="%r24"; | ||
| 75 | $out="%r23"; | ||
| 76 | |||
| 77 | @XX=("%r19","%r20"); | ||
| 78 | @TX=("%r21","%r22"); | ||
| 79 | $YY="%r28"; | ||
| 80 | $TY="%r29"; | ||
| 81 | |||
| 82 | $acc="%r1"; | ||
| 83 | $ix="%r2"; | ||
| 84 | $iy="%r3"; | ||
| 85 | $dat0="%r4"; | ||
| 86 | $dat1="%r5"; | ||
| 87 | $rem="%r6"; | ||
| 88 | $mask="%r31"; | ||
| 89 | |||
| 90 | sub unrolledloopbody { | ||
| 91 | for ($i=0;$i<4;$i++) { | ||
| 92 | $code.=<<___; | ||
| 93 | ldo 1($XX[0]),$XX[1] | ||
| 94 | `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)` | ||
| 95 | and $mask,$XX[1],$XX[1] | ||
| 96 | $LDX $YY($key),$TY | ||
| 97 | $MKX $YY,$key,$ix | ||
| 98 | $LDX $XX[1]($key),$TX[1] | ||
| 99 | $MKX $XX[0],$key,$iy | ||
| 100 | $ST $TX[0],0($ix) | ||
| 101 | comclr,<> $XX[1],$YY,%r0 ; conditional | ||
| 102 | copy $TX[0],$TX[1] ; move | ||
| 103 | `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)` | ||
| 104 | $ST $TY,0($iy) | ||
| 105 | addl $TX[0],$TY,$TY | ||
| 106 | addl $TX[1],$YY,$YY | ||
| 107 | and $mask,$TY,$TY | ||
| 108 | and $mask,$YY,$YY | ||
| 109 | ___ | ||
| 110 | push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers | ||
| 111 | } } | ||
| 112 | |||
| 113 | sub foldedloop { | ||
| 114 | my ($label,$count)=@_; | ||
| 115 | $code.=<<___; | ||
| 116 | $label | ||
| 117 | $MKX $YY,$key,$iy | ||
| 118 | $LDX $YY($key),$TY | ||
| 119 | $MKX $XX[0],$key,$ix | ||
| 120 | $ST $TX[0],0($iy) | ||
| 121 | ldo 1($XX[0]),$XX[0] | ||
| 122 | $ST $TY,0($ix) | ||
| 123 | addl $TX[0],$TY,$TY | ||
| 124 | ldbx $inp($out),$dat1 | ||
| 125 | and $mask,$TY,$TY | ||
| 126 | and $mask,$XX[0],$XX[0] | ||
| 127 | $LDX $TY($key),$acc | ||
| 128 | $LDX $XX[0]($key),$TX[0] | ||
| 129 | ldo 1($out),$out | ||
| 130 | xor $dat1,$acc,$acc | ||
| 131 | addl $TX[0],$YY,$YY | ||
| 132 | stb $acc,-1($out) | ||
| 133 | addib,<> -1,$count,$label ; $count is always small | ||
| 134 | and $mask,$YY,$YY | ||
| 135 | ___ | ||
| 136 | } | ||
| 137 | |||
| 138 | $code=<<___; | ||
| 139 | .LEVEL $LEVEL | ||
| 140 | .SPACE \$TEXT\$ | ||
| 141 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 142 | |||
| 143 | .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | ||
| 144 | RC4 | ||
| 145 | .PROC | ||
| 146 | .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6 | ||
| 147 | .ENTRY | ||
| 148 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 149 | $PUSHMA %r3,$FRAME(%sp) | ||
| 150 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 151 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 152 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 153 | |||
| 154 | cmpib,*= 0,$len,L\$abort | ||
| 155 | sub $inp,$out,$inp ; distance between $inp and $out | ||
| 156 | |||
| 157 | $LD `0*$SZ`($key),$XX[0] | ||
| 158 | $LD `1*$SZ`($key),$YY | ||
| 159 | ldo `2*$SZ`($key),$key | ||
| 160 | |||
| 161 | ldi 0xff,$mask | ||
| 162 | ldi 3,$dat0 | ||
| 163 | |||
| 164 | ldo 1($XX[0]),$XX[0] ; warm up loop | ||
| 165 | and $mask,$XX[0],$XX[0] | ||
| 166 | $LDX $XX[0]($key),$TX[0] | ||
| 167 | addl $TX[0],$YY,$YY | ||
| 168 | cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother? | ||
| 169 | and $mask,$YY,$YY | ||
| 170 | |||
| 171 | and,<> $out,$dat0,$rem ; is $out aligned? | ||
| 172 | b L\$alignedout | ||
| 173 | subi 4,$rem,$rem | ||
| 174 | sub $len,$rem,$len | ||
| 175 | ___ | ||
| 176 | &foldedloop("L\$alignout",$rem); # process till $out is aligned | ||
| 177 | |||
| 178 | $code.=<<___; | ||
| 179 | L\$alignedout ; $len is at least 4 here | ||
| 180 | and,<> $inp,$dat0,$acc ; is $inp aligned? | ||
| 181 | b L\$oop4 | ||
| 182 | sub $inp,$acc,$rem ; align $inp | ||
| 183 | |||
| 184 | sh3addl $acc,%r0,$acc | ||
| 185 | subi 32,$acc,$acc | ||
| 186 | mtctl $acc,%cr11 ; load %sar with vshd align factor | ||
| 187 | ldwx $rem($out),$dat0 | ||
| 188 | ldo 4($rem),$rem | ||
| 189 | L\$oop4misalignedinp | ||
| 190 | ___ | ||
| 191 | &unrolledloopbody(); | ||
| 192 | $code.=<<___; | ||
| 193 | $LDX $TY($key),$ix | ||
| 194 | ldwx $rem($out),$dat1 | ||
| 195 | ldo -4($len),$len | ||
| 196 | or $ix,$acc,$acc ; last piece, no need to dep | ||
| 197 | vshd $dat0,$dat1,$iy ; align data | ||
| 198 | copy $dat1,$dat0 | ||
| 199 | xor $iy,$acc,$acc | ||
| 200 | stw $acc,0($out) | ||
| 201 | cmpib,*<< 3,$len,L\$oop4misalignedinp | ||
| 202 | ldo 4($out),$out | ||
| 203 | cmpib,*= 0,$len,L\$done | ||
| 204 | nop | ||
| 205 | b L\$oop1 | ||
| 206 | nop | ||
| 207 | |||
| 208 | .ALIGN 8 | ||
| 209 | L\$oop4 | ||
| 210 | ___ | ||
| 211 | &unrolledloopbody(); | ||
| 212 | $code.=<<___; | ||
| 213 | $LDX $TY($key),$ix | ||
| 214 | ldwx $inp($out),$dat0 | ||
| 215 | ldo -4($len),$len | ||
| 216 | or $ix,$acc,$acc ; last piece, no need to dep | ||
| 217 | xor $dat0,$acc,$acc | ||
| 218 | stw $acc,0($out) | ||
| 219 | cmpib,*<< 3,$len,L\$oop4 | ||
| 220 | ldo 4($out),$out | ||
| 221 | cmpib,*= 0,$len,L\$done | ||
| 222 | nop | ||
| 223 | ___ | ||
| 224 | &foldedloop("L\$oop1",$len); | ||
| 225 | $code.=<<___; | ||
| 226 | L\$done | ||
| 227 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 | ||
| 228 | ldo -1($XX[0]),$XX[0] ; chill out loop | ||
| 229 | sub $YY,$TX[0],$YY | ||
| 230 | and $mask,$XX[0],$XX[0] | ||
| 231 | and $mask,$YY,$YY | ||
| 232 | $ST $XX[0],`-2*$SZ`($key) | ||
| 233 | $ST $YY,`-1*$SZ`($key) | ||
| 234 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 235 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 236 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 237 | L\$abort | ||
| 238 | bv (%r2) | ||
| 239 | .EXIT | ||
| 240 | $POPMB -$FRAME(%sp),%r3 | ||
| 241 | .PROCEND | ||
| 242 | ___ | ||
| 243 | |||
| 244 | $code.=<<___; | ||
| 245 | |||
| 246 | .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 247 | .ALIGN 8 | ||
| 248 | private_RC4_set_key | ||
| 249 | .PROC | ||
| 250 | .CALLINFO NO_CALLS | ||
| 251 | .ENTRY | ||
| 252 | $ST %r0,`0*$SZ`($key) | ||
| 253 | $ST %r0,`1*$SZ`($key) | ||
| 254 | ldo `2*$SZ`($key),$key | ||
| 255 | copy %r0,@XX[0] | ||
| 256 | L\$1st | ||
| 257 | $ST @XX[0],0($key) | ||
| 258 | ldo 1(@XX[0]),@XX[0] | ||
| 259 | bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256 | ||
| 260 | ldo $SZ($key),$key | ||
| 261 | |||
| 262 | ldo `-256*$SZ`($key),$key ; rewind $key | ||
| 263 | addl $len,$inp,$inp ; $inp to point at the end | ||
| 264 | sub %r0,$len,%r23 ; inverse index | ||
| 265 | copy %r0,@XX[0] | ||
| 266 | copy %r0,@XX[1] | ||
| 267 | ldi 0xff,$mask | ||
| 268 | |||
| 269 | L\$2nd | ||
| 270 | $LDX @XX[0]($key),@TX[0] | ||
| 271 | ldbx %r23($inp),@TX[1] | ||
| 272 | addi,nuv 1,%r23,%r23 ; increment and conditional | ||
| 273 | sub %r0,$len,%r23 ; inverse index | ||
| 274 | addl @TX[0],@XX[1],@XX[1] | ||
| 275 | addl @TX[1],@XX[1],@XX[1] | ||
| 276 | and $mask,@XX[1],@XX[1] | ||
| 277 | $MKX @XX[0],$key,$TY | ||
| 278 | $LDX @XX[1]($key),@TX[1] | ||
| 279 | $MKX @XX[1],$key,$YY | ||
| 280 | ldo 1(@XX[0]),@XX[0] | ||
| 281 | $ST @TX[0],0($YY) | ||
| 282 | bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256 | ||
| 283 | $ST @TX[1],0($TY) | ||
| 284 | |||
| 285 | bv,n (%r2) | ||
| 286 | .EXIT | ||
| 287 | nop | ||
| 288 | .PROCEND | ||
| 289 | |||
| 290 | .EXPORT RC4_options,ENTRY | ||
| 291 | .ALIGN 8 | ||
| 292 | RC4_options | ||
| 293 | .PROC | ||
| 294 | .CALLINFO NO_CALLS | ||
| 295 | .ENTRY | ||
| 296 | blr %r0,%r28 | ||
| 297 | ldi 3,%r1 | ||
| 298 | L\$pic | ||
| 299 | andcm %r28,%r1,%r28 | ||
| 300 | bv (%r2) | ||
| 301 | .EXIT | ||
| 302 | ldo L\$opts-L\$pic(%r28),%r28 | ||
| 303 | .PROCEND | ||
| 304 | .ALIGN 8 | ||
| 305 | L\$opts | ||
| 306 | .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)" | ||
| 307 | .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 308 | ___ | ||
| 309 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 310 | $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); | ||
| 311 | |||
| 312 | print $code; | ||
| 313 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl index 96681fa05e..7528ece13c 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl | |||
| @@ -13,6 +13,29 @@ | |||
| 13 | # "cluster" Address Generation Interlocks, so that one pipeline stall | 13 | # "cluster" Address Generation Interlocks, so that one pipeline stall |
| 14 | # resolves several dependencies. | 14 | # resolves several dependencies. |
| 15 | 15 | ||
| 16 | # November 2010. | ||
| 17 | # | ||
| 18 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 19 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 20 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 21 | # application context. The feature is not specific to any particular | ||
| 22 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 23 | # remains z/Architecture specific. On z990 it was measured to perform | ||
| 24 | # 50% better than code generated by gcc 4.3. | ||
| 25 | |||
| 26 | $flavour = shift; | ||
| 27 | |||
| 28 | if ($flavour =~ /3[12]/) { | ||
| 29 | $SIZE_T=4; | ||
| 30 | $g=""; | ||
| 31 | } else { | ||
| 32 | $SIZE_T=8; | ||
| 33 | $g="g"; | ||
| 34 | } | ||
| 35 | |||
| 36 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 37 | open STDOUT,">$output"; | ||
| 38 | |||
| 16 | $rp="%r14"; | 39 | $rp="%r14"; |
| 17 | $sp="%r15"; | 40 | $sp="%r15"; |
| 18 | $code=<<___; | 41 | $code=<<___; |
| @@ -39,7 +62,12 @@ $code.=<<___; | |||
| 39 | .type RC4,\@function | 62 | .type RC4,\@function |
| 40 | .align 64 | 63 | .align 64 |
| 41 | RC4: | 64 | RC4: |
| 42 | stmg %r6,%r11,48($sp) | 65 | stm${g} %r6,%r11,6*$SIZE_T($sp) |
| 66 | ___ | ||
| 67 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
| 68 | llgfr $len,$len | ||
| 69 | ___ | ||
| 70 | $code.=<<___; | ||
| 43 | llgc $XX[0],0($key) | 71 | llgc $XX[0],0($key) |
| 44 | llgc $YY,1($key) | 72 | llgc $YY,1($key) |
| 45 | la $XX[0],1($XX[0]) | 73 | la $XX[0],1($XX[0]) |
| @@ -90,7 +118,7 @@ $code.=<<___; | |||
| 90 | xgr $acc,$TX[1] | 118 | xgr $acc,$TX[1] |
| 91 | stg $acc,0($out) | 119 | stg $acc,0($out) |
| 92 | la $out,8($out) | 120 | la $out,8($out) |
| 93 | brct $cnt,.Loop8 | 121 | brctg $cnt,.Loop8 |
| 94 | 122 | ||
| 95 | .Lshort: | 123 | .Lshort: |
| 96 | lghi $acc,7 | 124 | lghi $acc,7 |
| @@ -122,7 +150,7 @@ $code.=<<___; | |||
| 122 | ahi $XX[0],-1 | 150 | ahi $XX[0],-1 |
| 123 | stc $XX[0],0($key) | 151 | stc $XX[0],0($key) |
| 124 | stc $YY,1($key) | 152 | stc $YY,1($key) |
| 125 | lmg %r6,%r11,48($sp) | 153 | lm${g} %r6,%r11,6*$SIZE_T($sp) |
| 126 | br $rp | 154 | br $rp |
| 127 | .size RC4,.-RC4 | 155 | .size RC4,.-RC4 |
| 128 | .string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 156 | .string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| @@ -143,11 +171,11 @@ $ikey="%r7"; | |||
| 143 | $iinp="%r8"; | 171 | $iinp="%r8"; |
| 144 | 172 | ||
| 145 | $code.=<<___; | 173 | $code.=<<___; |
| 146 | .globl RC4_set_key | 174 | .globl private_RC4_set_key |
| 147 | .type RC4_set_key,\@function | 175 | .type private_RC4_set_key,\@function |
| 148 | .align 64 | 176 | .align 64 |
| 149 | RC4_set_key: | 177 | private_RC4_set_key: |
| 150 | stmg %r6,%r8,48($sp) | 178 | stm${g} %r6,%r8,6*$SIZE_T($sp) |
| 151 | lhi $cnt,256 | 179 | lhi $cnt,256 |
| 152 | la $idx,0(%r0) | 180 | la $idx,0(%r0) |
| 153 | sth $idx,0($key) | 181 | sth $idx,0($key) |
| @@ -180,9 +208,9 @@ RC4_set_key: | |||
| 180 | la $iinp,0(%r0) | 208 | la $iinp,0(%r0) |
| 181 | j .L2ndloop | 209 | j .L2ndloop |
| 182 | .Ldone: | 210 | .Ldone: |
| 183 | lmg %r6,%r8,48($sp) | 211 | lm${g} %r6,%r8,6*$SIZE_T($sp) |
| 184 | br $rp | 212 | br $rp |
| 185 | .size RC4_set_key,.-RC4_set_key | 213 | .size private_RC4_set_key,.-private_RC4_set_key |
| 186 | 214 | ||
| 187 | ___ | 215 | ___ |
| 188 | } | 216 | } |
| @@ -203,3 +231,4 @@ RC4_options: | |||
| 203 | ___ | 231 | ___ |
| 204 | 232 | ||
| 205 | print $code; | 233 | print $code; |
| 234 | close STDOUT; # force flush | ||
diff --git a/src/lib/libcrypto/rsa/rsa_ameth.c b/src/lib/libcrypto/rsa/rsa_ameth.c index 8c3209885e..2460910ab2 100644 --- a/src/lib/libcrypto/rsa/rsa_ameth.c +++ b/src/lib/libcrypto/rsa/rsa_ameth.c | |||
| @@ -265,6 +265,147 @@ static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent, | |||
| 265 | return do_rsa_print(bp, pkey->pkey.rsa, indent, 1); | 265 | return do_rsa_print(bp, pkey->pkey.rsa, indent, 1); |
| 266 | } | 266 | } |
| 267 | 267 | ||
| 268 | static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg, | ||
| 269 | X509_ALGOR **pmaskHash) | ||
| 270 | { | ||
| 271 | const unsigned char *p; | ||
| 272 | int plen; | ||
| 273 | RSA_PSS_PARAMS *pss; | ||
| 274 | |||
| 275 | *pmaskHash = NULL; | ||
| 276 | |||
| 277 | if (!alg->parameter || alg->parameter->type != V_ASN1_SEQUENCE) | ||
| 278 | return NULL; | ||
| 279 | p = alg->parameter->value.sequence->data; | ||
| 280 | plen = alg->parameter->value.sequence->length; | ||
| 281 | pss = d2i_RSA_PSS_PARAMS(NULL, &p, plen); | ||
| 282 | |||
| 283 | if (!pss) | ||
| 284 | return NULL; | ||
| 285 | |||
| 286 | if (pss->maskGenAlgorithm) | ||
| 287 | { | ||
| 288 | ASN1_TYPE *param = pss->maskGenAlgorithm->parameter; | ||
| 289 | if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) == NID_mgf1 | ||
| 290 | && param->type == V_ASN1_SEQUENCE) | ||
| 291 | { | ||
| 292 | p = param->value.sequence->data; | ||
| 293 | plen = param->value.sequence->length; | ||
| 294 | *pmaskHash = d2i_X509_ALGOR(NULL, &p, plen); | ||
| 295 | } | ||
| 296 | } | ||
| 297 | |||
| 298 | return pss; | ||
| 299 | } | ||
| 300 | |||
| 301 | static int rsa_pss_param_print(BIO *bp, RSA_PSS_PARAMS *pss, | ||
| 302 | X509_ALGOR *maskHash, int indent) | ||
| 303 | { | ||
| 304 | int rv = 0; | ||
| 305 | if (!pss) | ||
| 306 | { | ||
| 307 | if (BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") <= 0) | ||
| 308 | return 0; | ||
| 309 | return 1; | ||
| 310 | } | ||
| 311 | if (BIO_puts(bp, "\n") <= 0) | ||
| 312 | goto err; | ||
| 313 | if (!BIO_indent(bp, indent, 128)) | ||
| 314 | goto err; | ||
| 315 | if (BIO_puts(bp, "Hash Algorithm: ") <= 0) | ||
| 316 | goto err; | ||
| 317 | |||
| 318 | if (pss->hashAlgorithm) | ||
| 319 | { | ||
| 320 | if (i2a_ASN1_OBJECT(bp, pss->hashAlgorithm->algorithm) <= 0) | ||
| 321 | goto err; | ||
| 322 | } | ||
| 323 | else if (BIO_puts(bp, "sha1 (default)") <= 0) | ||
| 324 | goto err; | ||
| 325 | |||
| 326 | if (BIO_puts(bp, "\n") <= 0) | ||
| 327 | goto err; | ||
| 328 | |||
| 329 | if (!BIO_indent(bp, indent, 128)) | ||
| 330 | goto err; | ||
| 331 | |||
| 332 | if (BIO_puts(bp, "Mask Algorithm: ") <= 0) | ||
| 333 | goto err; | ||
| 334 | if (pss->maskGenAlgorithm) | ||
| 335 | { | ||
| 336 | if (i2a_ASN1_OBJECT(bp, pss->maskGenAlgorithm->algorithm) <= 0) | ||
| 337 | goto err; | ||
| 338 | if (BIO_puts(bp, " with ") <= 0) | ||
| 339 | goto err; | ||
| 340 | if (maskHash) | ||
| 341 | { | ||
| 342 | if (i2a_ASN1_OBJECT(bp, maskHash->algorithm) <= 0) | ||
| 343 | goto err; | ||
| 344 | } | ||
| 345 | else if (BIO_puts(bp, "INVALID") <= 0) | ||
| 346 | goto err; | ||
| 347 | } | ||
| 348 | else if (BIO_puts(bp, "mgf1 with sha1 (default)") <= 0) | ||
| 349 | goto err; | ||
| 350 | BIO_puts(bp, "\n"); | ||
| 351 | |||
| 352 | if (!BIO_indent(bp, indent, 128)) | ||
| 353 | goto err; | ||
| 354 | if (BIO_puts(bp, "Salt Length: ") <= 0) | ||
| 355 | goto err; | ||
| 356 | if (pss->saltLength) | ||
| 357 | { | ||
| 358 | if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0) | ||
| 359 | goto err; | ||
| 360 | } | ||
| 361 | else if (BIO_puts(bp, "20 (default)") <= 0) | ||
| 362 | goto err; | ||
| 363 | BIO_puts(bp, "\n"); | ||
| 364 | |||
| 365 | if (!BIO_indent(bp, indent, 128)) | ||
| 366 | goto err; | ||
| 367 | if (BIO_puts(bp, "Trailer Field: ") <= 0) | ||
| 368 | goto err; | ||
| 369 | if (pss->trailerField) | ||
| 370 | { | ||
| 371 | if (i2a_ASN1_INTEGER(bp, pss->trailerField) <= 0) | ||
| 372 | goto err; | ||
| 373 | } | ||
| 374 | else if (BIO_puts(bp, "0xbc (default)") <= 0) | ||
| 375 | goto err; | ||
| 376 | BIO_puts(bp, "\n"); | ||
| 377 | |||
| 378 | rv = 1; | ||
| 379 | |||
| 380 | err: | ||
| 381 | return rv; | ||
| 382 | |||
| 383 | } | ||
| 384 | |||
| 385 | static int rsa_sig_print(BIO *bp, const X509_ALGOR *sigalg, | ||
| 386 | const ASN1_STRING *sig, | ||
| 387 | int indent, ASN1_PCTX *pctx) | ||
| 388 | { | ||
| 389 | if (OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss) | ||
| 390 | { | ||
| 391 | int rv; | ||
| 392 | RSA_PSS_PARAMS *pss; | ||
| 393 | X509_ALGOR *maskHash; | ||
| 394 | pss = rsa_pss_decode(sigalg, &maskHash); | ||
| 395 | rv = rsa_pss_param_print(bp, pss, maskHash, indent); | ||
| 396 | if (pss) | ||
| 397 | RSA_PSS_PARAMS_free(pss); | ||
| 398 | if (maskHash) | ||
| 399 | X509_ALGOR_free(maskHash); | ||
| 400 | if (!rv) | ||
| 401 | return 0; | ||
| 402 | } | ||
| 403 | else if (!sig && BIO_puts(bp, "\n") <= 0) | ||
| 404 | return 0; | ||
| 405 | if (sig) | ||
| 406 | return X509_signature_dump(bp, sig, indent); | ||
| 407 | return 1; | ||
| 408 | } | ||
| 268 | 409 | ||
| 269 | static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) | 410 | static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) |
| 270 | { | 411 | { |
| @@ -310,6 +451,211 @@ static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) | |||
| 310 | 451 | ||
| 311 | } | 452 | } |
| 312 | 453 | ||
| 454 | /* Customised RSA item verification routine. This is called | ||
| 455 | * when a signature is encountered requiring special handling. We | ||
| 456 | * currently only handle PSS. | ||
| 457 | */ | ||
| 458 | |||
| 459 | |||
| 460 | static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
| 461 | X509_ALGOR *sigalg, ASN1_BIT_STRING *sig, | ||
| 462 | EVP_PKEY *pkey) | ||
| 463 | { | ||
| 464 | int rv = -1; | ||
| 465 | int saltlen; | ||
| 466 | const EVP_MD *mgf1md = NULL, *md = NULL; | ||
| 467 | RSA_PSS_PARAMS *pss; | ||
| 468 | X509_ALGOR *maskHash; | ||
| 469 | EVP_PKEY_CTX *pkctx; | ||
| 470 | /* Sanity check: make sure it is PSS */ | ||
| 471 | if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss) | ||
| 472 | { | ||
| 473 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE); | ||
| 474 | return -1; | ||
| 475 | } | ||
| 476 | /* Decode PSS parameters */ | ||
| 477 | pss = rsa_pss_decode(sigalg, &maskHash); | ||
| 478 | |||
| 479 | if (pss == NULL) | ||
| 480 | { | ||
| 481 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_PSS_PARAMETERS); | ||
| 482 | goto err; | ||
| 483 | } | ||
| 484 | /* Check mask and lookup mask hash algorithm */ | ||
| 485 | if (pss->maskGenAlgorithm) | ||
| 486 | { | ||
| 487 | if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) != NID_mgf1) | ||
| 488 | { | ||
| 489 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_ALGORITHM); | ||
| 490 | goto err; | ||
| 491 | } | ||
| 492 | if (!maskHash) | ||
| 493 | { | ||
| 494 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_PARAMETER); | ||
| 495 | goto err; | ||
| 496 | } | ||
| 497 | mgf1md = EVP_get_digestbyobj(maskHash->algorithm); | ||
| 498 | if (mgf1md == NULL) | ||
| 499 | { | ||
| 500 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_MASK_DIGEST); | ||
| 501 | goto err; | ||
| 502 | } | ||
| 503 | } | ||
| 504 | else | ||
| 505 | mgf1md = EVP_sha1(); | ||
| 506 | |||
| 507 | if (pss->hashAlgorithm) | ||
| 508 | { | ||
| 509 | md = EVP_get_digestbyobj(pss->hashAlgorithm->algorithm); | ||
| 510 | if (md == NULL) | ||
| 511 | { | ||
| 512 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_PSS_DIGEST); | ||
| 513 | goto err; | ||
| 514 | } | ||
| 515 | } | ||
| 516 | else | ||
| 517 | md = EVP_sha1(); | ||
| 518 | |||
| 519 | if (pss->saltLength) | ||
| 520 | { | ||
| 521 | saltlen = ASN1_INTEGER_get(pss->saltLength); | ||
| 522 | |||
| 523 | /* Could perform more salt length sanity checks but the main | ||
| 524 | * RSA routines will trap other invalid values anyway. | ||
| 525 | */ | ||
| 526 | if (saltlen < 0) | ||
| 527 | { | ||
| 528 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_SALT_LENGTH); | ||
| 529 | goto err; | ||
| 530 | } | ||
| 531 | } | ||
| 532 | else | ||
| 533 | saltlen = 20; | ||
| 534 | |||
| 535 | /* low-level routines support only trailer field 0xbc (value 1) | ||
| 536 | * and PKCS#1 says we should reject any other value anyway. | ||
| 537 | */ | ||
| 538 | if (pss->trailerField && ASN1_INTEGER_get(pss->trailerField) != 1) | ||
| 539 | { | ||
| 540 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_TRAILER); | ||
| 541 | goto err; | ||
| 542 | } | ||
| 543 | |||
| 544 | /* We have all parameters now set up context */ | ||
| 545 | |||
| 546 | if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey)) | ||
| 547 | goto err; | ||
| 548 | |||
| 549 | if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_PSS_PADDING) <= 0) | ||
| 550 | goto err; | ||
| 551 | |||
| 552 | if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pkctx, saltlen) <= 0) | ||
| 553 | goto err; | ||
| 554 | |||
| 555 | if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0) | ||
| 556 | goto err; | ||
| 557 | /* Carry on */ | ||
| 558 | rv = 2; | ||
| 559 | |||
| 560 | err: | ||
| 561 | RSA_PSS_PARAMS_free(pss); | ||
| 562 | if (maskHash) | ||
| 563 | X509_ALGOR_free(maskHash); | ||
| 564 | return rv; | ||
| 565 | } | ||
| 566 | |||
| 567 | static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
| 568 | X509_ALGOR *alg1, X509_ALGOR *alg2, | ||
| 569 | ASN1_BIT_STRING *sig) | ||
| 570 | { | ||
| 571 | int pad_mode; | ||
| 572 | EVP_PKEY_CTX *pkctx = ctx->pctx; | ||
| 573 | if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0) | ||
| 574 | return 0; | ||
| 575 | if (pad_mode == RSA_PKCS1_PADDING) | ||
| 576 | return 2; | ||
| 577 | if (pad_mode == RSA_PKCS1_PSS_PADDING) | ||
| 578 | { | ||
| 579 | const EVP_MD *sigmd, *mgf1md; | ||
| 580 | RSA_PSS_PARAMS *pss = NULL; | ||
| 581 | X509_ALGOR *mgf1alg = NULL; | ||
| 582 | ASN1_STRING *os1 = NULL, *os2 = NULL; | ||
| 583 | EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx); | ||
| 584 | int saltlen, rv = 0; | ||
| 585 | sigmd = EVP_MD_CTX_md(ctx); | ||
| 586 | if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0) | ||
| 587 | goto err; | ||
| 588 | if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen)) | ||
| 589 | goto err; | ||
| 590 | if (saltlen == -1) | ||
| 591 | saltlen = EVP_MD_size(sigmd); | ||
| 592 | else if (saltlen == -2) | ||
| 593 | { | ||
| 594 | saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2; | ||
| 595 | if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0) | ||
| 596 | saltlen--; | ||
| 597 | } | ||
| 598 | pss = RSA_PSS_PARAMS_new(); | ||
| 599 | if (!pss) | ||
| 600 | goto err; | ||
| 601 | if (saltlen != 20) | ||
| 602 | { | ||
| 603 | pss->saltLength = ASN1_INTEGER_new(); | ||
| 604 | if (!pss->saltLength) | ||
| 605 | goto err; | ||
| 606 | if (!ASN1_INTEGER_set(pss->saltLength, saltlen)) | ||
| 607 | goto err; | ||
| 608 | } | ||
| 609 | if (EVP_MD_type(sigmd) != NID_sha1) | ||
| 610 | { | ||
| 611 | pss->hashAlgorithm = X509_ALGOR_new(); | ||
| 612 | if (!pss->hashAlgorithm) | ||
| 613 | goto err; | ||
| 614 | X509_ALGOR_set_md(pss->hashAlgorithm, sigmd); | ||
| 615 | } | ||
| 616 | if (EVP_MD_type(mgf1md) != NID_sha1) | ||
| 617 | { | ||
| 618 | ASN1_STRING *stmp = NULL; | ||
| 619 | /* need to embed algorithm ID inside another */ | ||
| 620 | mgf1alg = X509_ALGOR_new(); | ||
| 621 | X509_ALGOR_set_md(mgf1alg, mgf1md); | ||
| 622 | if (!ASN1_item_pack(mgf1alg, ASN1_ITEM_rptr(X509_ALGOR), | ||
| 623 | &stmp)) | ||
| 624 | goto err; | ||
| 625 | pss->maskGenAlgorithm = X509_ALGOR_new(); | ||
| 626 | if (!pss->maskGenAlgorithm) | ||
| 627 | goto err; | ||
| 628 | X509_ALGOR_set0(pss->maskGenAlgorithm, | ||
| 629 | OBJ_nid2obj(NID_mgf1), | ||
| 630 | V_ASN1_SEQUENCE, stmp); | ||
| 631 | } | ||
| 632 | /* Finally create string with pss parameter encoding. */ | ||
| 633 | if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os1)) | ||
| 634 | goto err; | ||
| 635 | if (alg2) | ||
| 636 | { | ||
| 637 | os2 = ASN1_STRING_dup(os1); | ||
| 638 | if (!os2) | ||
| 639 | goto err; | ||
| 640 | X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_rsassaPss), | ||
| 641 | V_ASN1_SEQUENCE, os2); | ||
| 642 | } | ||
| 643 | X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_rsassaPss), | ||
| 644 | V_ASN1_SEQUENCE, os1); | ||
| 645 | os1 = os2 = NULL; | ||
| 646 | rv = 3; | ||
| 647 | err: | ||
| 648 | if (mgf1alg) | ||
| 649 | X509_ALGOR_free(mgf1alg); | ||
| 650 | if (pss) | ||
| 651 | RSA_PSS_PARAMS_free(pss); | ||
| 652 | if (os1) | ||
| 653 | ASN1_STRING_free(os1); | ||
| 654 | return rv; | ||
| 655 | |||
| 656 | } | ||
| 657 | return 2; | ||
| 658 | } | ||
| 313 | 659 | ||
| 314 | const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = | 660 | const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = |
| 315 | { | 661 | { |
| @@ -335,10 +681,13 @@ const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = | |||
| 335 | 681 | ||
| 336 | 0,0,0,0,0,0, | 682 | 0,0,0,0,0,0, |
| 337 | 683 | ||
| 684 | rsa_sig_print, | ||
| 338 | int_rsa_free, | 685 | int_rsa_free, |
| 339 | rsa_pkey_ctrl, | 686 | rsa_pkey_ctrl, |
| 340 | old_rsa_priv_decode, | 687 | old_rsa_priv_decode, |
| 341 | old_rsa_priv_encode | 688 | old_rsa_priv_encode, |
| 689 | rsa_item_verify, | ||
| 690 | rsa_item_sign | ||
| 342 | }, | 691 | }, |
| 343 | 692 | ||
| 344 | { | 693 | { |
diff --git a/src/lib/libcrypto/rsa/rsa_crpt.c b/src/lib/libcrypto/rsa/rsa_crpt.c new file mode 100644 index 0000000000..d3e44785dc --- /dev/null +++ b/src/lib/libcrypto/rsa/rsa_crpt.c | |||
| @@ -0,0 +1,257 @@ | |||
| 1 | /* crypto/rsa/rsa_lib.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | |||
| 59 | #include <stdio.h> | ||
| 60 | #include <openssl/crypto.h> | ||
| 61 | #include "cryptlib.h" | ||
| 62 | #include <openssl/lhash.h> | ||
| 63 | #include <openssl/bn.h> | ||
| 64 | #include <openssl/rsa.h> | ||
| 65 | #include <openssl/rand.h> | ||
| 66 | #ifndef OPENSSL_NO_ENGINE | ||
| 67 | #include <openssl/engine.h> | ||
| 68 | #endif | ||
| 69 | |||
| 70 | int RSA_size(const RSA *r) | ||
| 71 | { | ||
| 72 | return(BN_num_bytes(r->n)); | ||
| 73 | } | ||
| 74 | |||
| 75 | int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to, | ||
| 76 | RSA *rsa, int padding) | ||
| 77 | { | ||
| 78 | #ifdef OPENSSL_FIPS | ||
| 79 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
| 80 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
| 81 | { | ||
| 82 | RSAerr(RSA_F_RSA_PUBLIC_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
| 83 | return -1; | ||
| 84 | } | ||
| 85 | #endif | ||
| 86 | return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding)); | ||
| 87 | } | ||
| 88 | |||
| 89 | int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to, | ||
| 90 | RSA *rsa, int padding) | ||
| 91 | { | ||
| 92 | #ifdef OPENSSL_FIPS | ||
| 93 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
| 94 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
| 95 | { | ||
| 96 | RSAerr(RSA_F_RSA_PRIVATE_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
| 97 | return -1; | ||
| 98 | } | ||
| 99 | #endif | ||
| 100 | return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding)); | ||
| 101 | } | ||
| 102 | |||
| 103 | int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to, | ||
| 104 | RSA *rsa, int padding) | ||
| 105 | { | ||
| 106 | #ifdef OPENSSL_FIPS | ||
| 107 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
| 108 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
| 109 | { | ||
| 110 | RSAerr(RSA_F_RSA_PRIVATE_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
| 111 | return -1; | ||
| 112 | } | ||
| 113 | #endif | ||
| 114 | return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding)); | ||
| 115 | } | ||
| 116 | |||
| 117 | int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to, | ||
| 118 | RSA *rsa, int padding) | ||
| 119 | { | ||
| 120 | #ifdef OPENSSL_FIPS | ||
| 121 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
| 122 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
| 123 | { | ||
| 124 | RSAerr(RSA_F_RSA_PUBLIC_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
| 125 | return -1; | ||
| 126 | } | ||
| 127 | #endif | ||
| 128 | return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding)); | ||
| 129 | } | ||
| 130 | |||
| 131 | int RSA_flags(const RSA *r) | ||
| 132 | { | ||
| 133 | return((r == NULL)?0:r->meth->flags); | ||
| 134 | } | ||
| 135 | |||
| 136 | void RSA_blinding_off(RSA *rsa) | ||
| 137 | { | ||
| 138 | if (rsa->blinding != NULL) | ||
| 139 | { | ||
| 140 | BN_BLINDING_free(rsa->blinding); | ||
| 141 | rsa->blinding=NULL; | ||
| 142 | } | ||
| 143 | rsa->flags &= ~RSA_FLAG_BLINDING; | ||
| 144 | rsa->flags |= RSA_FLAG_NO_BLINDING; | ||
| 145 | } | ||
| 146 | |||
| 147 | int RSA_blinding_on(RSA *rsa, BN_CTX *ctx) | ||
| 148 | { | ||
| 149 | int ret=0; | ||
| 150 | |||
| 151 | if (rsa->blinding != NULL) | ||
| 152 | RSA_blinding_off(rsa); | ||
| 153 | |||
| 154 | rsa->blinding = RSA_setup_blinding(rsa, ctx); | ||
| 155 | if (rsa->blinding == NULL) | ||
| 156 | goto err; | ||
| 157 | |||
| 158 | rsa->flags |= RSA_FLAG_BLINDING; | ||
| 159 | rsa->flags &= ~RSA_FLAG_NO_BLINDING; | ||
| 160 | ret=1; | ||
| 161 | err: | ||
| 162 | return(ret); | ||
| 163 | } | ||
| 164 | |||
| 165 | static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p, | ||
| 166 | const BIGNUM *q, BN_CTX *ctx) | ||
| 167 | { | ||
| 168 | BIGNUM *ret = NULL, *r0, *r1, *r2; | ||
| 169 | |||
| 170 | if (d == NULL || p == NULL || q == NULL) | ||
| 171 | return NULL; | ||
| 172 | |||
| 173 | BN_CTX_start(ctx); | ||
| 174 | r0 = BN_CTX_get(ctx); | ||
| 175 | r1 = BN_CTX_get(ctx); | ||
| 176 | r2 = BN_CTX_get(ctx); | ||
| 177 | if (r2 == NULL) | ||
| 178 | goto err; | ||
| 179 | |||
| 180 | if (!BN_sub(r1, p, BN_value_one())) goto err; | ||
| 181 | if (!BN_sub(r2, q, BN_value_one())) goto err; | ||
| 182 | if (!BN_mul(r0, r1, r2, ctx)) goto err; | ||
| 183 | |||
| 184 | ret = BN_mod_inverse(NULL, d, r0, ctx); | ||
| 185 | err: | ||
| 186 | BN_CTX_end(ctx); | ||
| 187 | return ret; | ||
| 188 | } | ||
| 189 | |||
| 190 | BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx) | ||
| 191 | { | ||
| 192 | BIGNUM local_n; | ||
| 193 | BIGNUM *e,*n; | ||
| 194 | BN_CTX *ctx; | ||
| 195 | BN_BLINDING *ret = NULL; | ||
| 196 | |||
| 197 | if (in_ctx == NULL) | ||
| 198 | { | ||
| 199 | if ((ctx = BN_CTX_new()) == NULL) return 0; | ||
| 200 | } | ||
| 201 | else | ||
| 202 | ctx = in_ctx; | ||
| 203 | |||
| 204 | BN_CTX_start(ctx); | ||
| 205 | e = BN_CTX_get(ctx); | ||
| 206 | if (e == NULL) | ||
| 207 | { | ||
| 208 | RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE); | ||
| 209 | goto err; | ||
| 210 | } | ||
| 211 | |||
| 212 | if (rsa->e == NULL) | ||
| 213 | { | ||
| 214 | e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx); | ||
| 215 | if (e == NULL) | ||
| 216 | { | ||
| 217 | RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT); | ||
| 218 | goto err; | ||
| 219 | } | ||
| 220 | } | ||
| 221 | else | ||
| 222 | e = rsa->e; | ||
| 223 | |||
| 224 | |||
| 225 | if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL) | ||
| 226 | { | ||
| 227 | /* if PRNG is not properly seeded, resort to secret | ||
| 228 | * exponent as unpredictable seed */ | ||
| 229 | RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0); | ||
| 230 | } | ||
| 231 | |||
| 232 | if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) | ||
| 233 | { | ||
| 234 | /* Set BN_FLG_CONSTTIME flag */ | ||
| 235 | n = &local_n; | ||
| 236 | BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME); | ||
| 237 | } | ||
| 238 | else | ||
| 239 | n = rsa->n; | ||
| 240 | |||
| 241 | ret = BN_BLINDING_create_param(NULL, e, n, ctx, | ||
| 242 | rsa->meth->bn_mod_exp, rsa->_method_mod_n); | ||
| 243 | if (ret == NULL) | ||
| 244 | { | ||
| 245 | RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB); | ||
| 246 | goto err; | ||
| 247 | } | ||
| 248 | CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret)); | ||
| 249 | err: | ||
| 250 | BN_CTX_end(ctx); | ||
| 251 | if (in_ctx == NULL) | ||
| 252 | BN_CTX_free(ctx); | ||
| 253 | if(rsa->e == NULL) | ||
| 254 | BN_free(e); | ||
| 255 | |||
| 256 | return ret; | ||
| 257 | } | ||
diff --git a/src/lib/libcrypto/rsa/rsa_pmeth.c b/src/lib/libcrypto/rsa/rsa_pmeth.c index c6892ecd09..5b2ecf56ad 100644 --- a/src/lib/libcrypto/rsa/rsa_pmeth.c +++ b/src/lib/libcrypto/rsa/rsa_pmeth.c | |||
| @@ -63,6 +63,12 @@ | |||
| 63 | #include <openssl/rsa.h> | 63 | #include <openssl/rsa.h> |
| 64 | #include <openssl/bn.h> | 64 | #include <openssl/bn.h> |
| 65 | #include <openssl/evp.h> | 65 | #include <openssl/evp.h> |
| 66 | #ifndef OPENSSL_NO_CMS | ||
| 67 | #include <openssl/cms.h> | ||
| 68 | #endif | ||
| 69 | #ifdef OPENSSL_FIPS | ||
| 70 | #include <openssl/fips.h> | ||
| 71 | #endif | ||
| 66 | #include "evp_locl.h" | 72 | #include "evp_locl.h" |
| 67 | #include "rsa_locl.h" | 73 | #include "rsa_locl.h" |
| 68 | 74 | ||
| @@ -79,6 +85,8 @@ typedef struct | |||
| 79 | int pad_mode; | 85 | int pad_mode; |
| 80 | /* message digest */ | 86 | /* message digest */ |
| 81 | const EVP_MD *md; | 87 | const EVP_MD *md; |
| 88 | /* message digest for MGF1 */ | ||
| 89 | const EVP_MD *mgf1md; | ||
| 82 | /* PSS/OAEP salt length */ | 90 | /* PSS/OAEP salt length */ |
| 83 | int saltlen; | 91 | int saltlen; |
| 84 | /* Temp buffer */ | 92 | /* Temp buffer */ |
| @@ -95,6 +103,7 @@ static int pkey_rsa_init(EVP_PKEY_CTX *ctx) | |||
| 95 | rctx->pub_exp = NULL; | 103 | rctx->pub_exp = NULL; |
| 96 | rctx->pad_mode = RSA_PKCS1_PADDING; | 104 | rctx->pad_mode = RSA_PKCS1_PADDING; |
| 97 | rctx->md = NULL; | 105 | rctx->md = NULL; |
| 106 | rctx->mgf1md = NULL; | ||
| 98 | rctx->tbuf = NULL; | 107 | rctx->tbuf = NULL; |
| 99 | 108 | ||
| 100 | rctx->saltlen = -2; | 109 | rctx->saltlen = -2; |
| @@ -147,6 +156,31 @@ static void pkey_rsa_cleanup(EVP_PKEY_CTX *ctx) | |||
| 147 | OPENSSL_free(rctx); | 156 | OPENSSL_free(rctx); |
| 148 | } | 157 | } |
| 149 | } | 158 | } |
| 159 | #ifdef OPENSSL_FIPS | ||
| 160 | /* FIP checker. Return value indicates status of context parameters: | ||
| 161 | * 1 : redirect to FIPS. | ||
| 162 | * 0 : don't redirect to FIPS. | ||
| 163 | * -1 : illegal operation in FIPS mode. | ||
| 164 | */ | ||
| 165 | |||
| 166 | static int pkey_fips_check_ctx(EVP_PKEY_CTX *ctx) | ||
| 167 | { | ||
| 168 | RSA_PKEY_CTX *rctx = ctx->data; | ||
| 169 | RSA *rsa = ctx->pkey->pkey.rsa; | ||
| 170 | int rv = -1; | ||
| 171 | if (!FIPS_mode()) | ||
| 172 | return 0; | ||
| 173 | if (rsa->flags & RSA_FLAG_NON_FIPS_ALLOW) | ||
| 174 | rv = 0; | ||
| 175 | if (!(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) && rv) | ||
| 176 | return -1; | ||
| 177 | if (rctx->md && !(rctx->md->flags & EVP_MD_FLAG_FIPS)) | ||
| 178 | return rv; | ||
| 179 | if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS)) | ||
| 180 | return rv; | ||
| 181 | return 1; | ||
| 182 | } | ||
| 183 | #endif | ||
| 150 | 184 | ||
| 151 | static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | 185 | static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, |
| 152 | const unsigned char *tbs, size_t tbslen) | 186 | const unsigned char *tbs, size_t tbslen) |
| @@ -155,6 +189,15 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
| 155 | RSA_PKEY_CTX *rctx = ctx->data; | 189 | RSA_PKEY_CTX *rctx = ctx->data; |
| 156 | RSA *rsa = ctx->pkey->pkey.rsa; | 190 | RSA *rsa = ctx->pkey->pkey.rsa; |
| 157 | 191 | ||
| 192 | #ifdef OPENSSL_FIPS | ||
| 193 | ret = pkey_fips_check_ctx(ctx); | ||
| 194 | if (ret < 0) | ||
| 195 | { | ||
| 196 | RSAerr(RSA_F_PKEY_RSA_SIGN, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE); | ||
| 197 | return -1; | ||
| 198 | } | ||
| 199 | #endif | ||
| 200 | |||
| 158 | if (rctx->md) | 201 | if (rctx->md) |
| 159 | { | 202 | { |
| 160 | if (tbslen != (size_t)EVP_MD_size(rctx->md)) | 203 | if (tbslen != (size_t)EVP_MD_size(rctx->md)) |
| @@ -163,7 +206,36 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
| 163 | RSA_R_INVALID_DIGEST_LENGTH); | 206 | RSA_R_INVALID_DIGEST_LENGTH); |
| 164 | return -1; | 207 | return -1; |
| 165 | } | 208 | } |
| 166 | if (rctx->pad_mode == RSA_X931_PADDING) | 209 | #ifdef OPENSSL_FIPS |
| 210 | if (ret > 0) | ||
| 211 | { | ||
| 212 | unsigned int slen; | ||
| 213 | ret = FIPS_rsa_sign_digest(rsa, tbs, tbslen, rctx->md, | ||
| 214 | rctx->pad_mode, | ||
| 215 | rctx->saltlen, | ||
| 216 | rctx->mgf1md, | ||
| 217 | sig, &slen); | ||
| 218 | if (ret > 0) | ||
| 219 | *siglen = slen; | ||
| 220 | else | ||
| 221 | *siglen = 0; | ||
| 222 | return ret; | ||
| 223 | } | ||
| 224 | #endif | ||
| 225 | |||
| 226 | if (EVP_MD_type(rctx->md) == NID_mdc2) | ||
| 227 | { | ||
| 228 | unsigned int sltmp; | ||
| 229 | if (rctx->pad_mode != RSA_PKCS1_PADDING) | ||
| 230 | return -1; | ||
| 231 | ret = RSA_sign_ASN1_OCTET_STRING(NID_mdc2, | ||
| 232 | tbs, tbslen, sig, &sltmp, rsa); | ||
| 233 | |||
| 234 | if (ret <= 0) | ||
| 235 | return ret; | ||
| 236 | ret = sltmp; | ||
| 237 | } | ||
| 238 | else if (rctx->pad_mode == RSA_X931_PADDING) | ||
| 167 | { | 239 | { |
| 168 | if (!setup_tbuf(rctx, ctx)) | 240 | if (!setup_tbuf(rctx, ctx)) |
| 169 | return -1; | 241 | return -1; |
| @@ -186,8 +258,10 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
| 186 | { | 258 | { |
| 187 | if (!setup_tbuf(rctx, ctx)) | 259 | if (!setup_tbuf(rctx, ctx)) |
| 188 | return -1; | 260 | return -1; |
| 189 | if (!RSA_padding_add_PKCS1_PSS(rsa, rctx->tbuf, tbs, | 261 | if (!RSA_padding_add_PKCS1_PSS_mgf1(rsa, |
| 190 | rctx->md, rctx->saltlen)) | 262 | rctx->tbuf, tbs, |
| 263 | rctx->md, rctx->mgf1md, | ||
| 264 | rctx->saltlen)) | ||
| 191 | return -1; | 265 | return -1; |
| 192 | ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf, | 266 | ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf, |
| 193 | sig, rsa, RSA_NO_PADDING); | 267 | sig, rsa, RSA_NO_PADDING); |
| @@ -269,8 +343,30 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, | |||
| 269 | RSA_PKEY_CTX *rctx = ctx->data; | 343 | RSA_PKEY_CTX *rctx = ctx->data; |
| 270 | RSA *rsa = ctx->pkey->pkey.rsa; | 344 | RSA *rsa = ctx->pkey->pkey.rsa; |
| 271 | size_t rslen; | 345 | size_t rslen; |
| 346 | #ifdef OPENSSL_FIPS | ||
| 347 | int rv; | ||
| 348 | rv = pkey_fips_check_ctx(ctx); | ||
| 349 | if (rv < 0) | ||
| 350 | { | ||
| 351 | RSAerr(RSA_F_PKEY_RSA_VERIFY, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE); | ||
| 352 | return -1; | ||
| 353 | } | ||
| 354 | #endif | ||
| 272 | if (rctx->md) | 355 | if (rctx->md) |
| 273 | { | 356 | { |
| 357 | #ifdef OPENSSL_FIPS | ||
| 358 | if (rv > 0) | ||
| 359 | { | ||
| 360 | return FIPS_rsa_verify_digest(rsa, | ||
| 361 | tbs, tbslen, | ||
| 362 | rctx->md, | ||
| 363 | rctx->pad_mode, | ||
| 364 | rctx->saltlen, | ||
| 365 | rctx->mgf1md, | ||
| 366 | sig, siglen); | ||
| 367 | |||
| 368 | } | ||
| 369 | #endif | ||
| 274 | if (rctx->pad_mode == RSA_PKCS1_PADDING) | 370 | if (rctx->pad_mode == RSA_PKCS1_PADDING) |
| 275 | return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen, | 371 | return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen, |
| 276 | sig, siglen, rsa); | 372 | sig, siglen, rsa); |
| @@ -289,7 +385,8 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, | |||
| 289 | rsa, RSA_NO_PADDING); | 385 | rsa, RSA_NO_PADDING); |
| 290 | if (ret <= 0) | 386 | if (ret <= 0) |
| 291 | return 0; | 387 | return 0; |
| 292 | ret = RSA_verify_PKCS1_PSS(rsa, tbs, rctx->md, | 388 | ret = RSA_verify_PKCS1_PSS_mgf1(rsa, tbs, |
| 389 | rctx->md, rctx->mgf1md, | ||
| 293 | rctx->tbuf, rctx->saltlen); | 390 | rctx->tbuf, rctx->saltlen); |
| 294 | if (ret <= 0) | 391 | if (ret <= 0) |
| 295 | return 0; | 392 | return 0; |
| @@ -403,15 +500,25 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
| 403 | RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); | 500 | RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); |
| 404 | return -2; | 501 | return -2; |
| 405 | 502 | ||
| 503 | case EVP_PKEY_CTRL_GET_RSA_PADDING: | ||
| 504 | *(int *)p2 = rctx->pad_mode; | ||
| 505 | return 1; | ||
| 506 | |||
| 406 | case EVP_PKEY_CTRL_RSA_PSS_SALTLEN: | 507 | case EVP_PKEY_CTRL_RSA_PSS_SALTLEN: |
| 407 | if (p1 < -2) | 508 | case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN: |
| 408 | return -2; | ||
| 409 | if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) | 509 | if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) |
| 410 | { | 510 | { |
| 411 | RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN); | 511 | RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN); |
| 412 | return -2; | 512 | return -2; |
| 413 | } | 513 | } |
| 414 | rctx->saltlen = p1; | 514 | if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN) |
| 515 | *(int *)p2 = rctx->saltlen; | ||
| 516 | else | ||
| 517 | { | ||
| 518 | if (p1 < -2) | ||
| 519 | return -2; | ||
| 520 | rctx->saltlen = p1; | ||
| 521 | } | ||
| 415 | return 1; | 522 | return 1; |
| 416 | 523 | ||
| 417 | case EVP_PKEY_CTRL_RSA_KEYGEN_BITS: | 524 | case EVP_PKEY_CTRL_RSA_KEYGEN_BITS: |
| @@ -435,16 +542,45 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
| 435 | rctx->md = p2; | 542 | rctx->md = p2; |
| 436 | return 1; | 543 | return 1; |
| 437 | 544 | ||
| 545 | case EVP_PKEY_CTRL_RSA_MGF1_MD: | ||
| 546 | case EVP_PKEY_CTRL_GET_RSA_MGF1_MD: | ||
| 547 | if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) | ||
| 548 | { | ||
| 549 | RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_MGF1_MD); | ||
| 550 | return -2; | ||
| 551 | } | ||
| 552 | if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD) | ||
| 553 | { | ||
| 554 | if (rctx->mgf1md) | ||
| 555 | *(const EVP_MD **)p2 = rctx->mgf1md; | ||
| 556 | else | ||
| 557 | *(const EVP_MD **)p2 = rctx->md; | ||
| 558 | } | ||
| 559 | else | ||
| 560 | rctx->mgf1md = p2; | ||
| 561 | return 1; | ||
| 562 | |||
| 438 | case EVP_PKEY_CTRL_DIGESTINIT: | 563 | case EVP_PKEY_CTRL_DIGESTINIT: |
| 439 | case EVP_PKEY_CTRL_PKCS7_ENCRYPT: | 564 | case EVP_PKEY_CTRL_PKCS7_ENCRYPT: |
| 440 | case EVP_PKEY_CTRL_PKCS7_DECRYPT: | 565 | case EVP_PKEY_CTRL_PKCS7_DECRYPT: |
| 441 | case EVP_PKEY_CTRL_PKCS7_SIGN: | 566 | case EVP_PKEY_CTRL_PKCS7_SIGN: |
| 567 | return 1; | ||
| 442 | #ifndef OPENSSL_NO_CMS | 568 | #ifndef OPENSSL_NO_CMS |
| 443 | case EVP_PKEY_CTRL_CMS_ENCRYPT: | ||
| 444 | case EVP_PKEY_CTRL_CMS_DECRYPT: | 569 | case EVP_PKEY_CTRL_CMS_DECRYPT: |
| 570 | { | ||
| 571 | X509_ALGOR *alg = NULL; | ||
| 572 | ASN1_OBJECT *encalg = NULL; | ||
| 573 | if (p2) | ||
| 574 | CMS_RecipientInfo_ktri_get0_algs(p2, NULL, NULL, &alg); | ||
| 575 | if (alg) | ||
| 576 | X509_ALGOR_get0(&encalg, NULL, NULL, alg); | ||
| 577 | if (encalg && OBJ_obj2nid(encalg) == NID_rsaesOaep) | ||
| 578 | rctx->pad_mode = RSA_PKCS1_OAEP_PADDING; | ||
| 579 | } | ||
| 580 | case EVP_PKEY_CTRL_CMS_ENCRYPT: | ||
| 445 | case EVP_PKEY_CTRL_CMS_SIGN: | 581 | case EVP_PKEY_CTRL_CMS_SIGN: |
| 446 | #endif | ||
| 447 | return 1; | 582 | return 1; |
| 583 | #endif | ||
| 448 | case EVP_PKEY_CTRL_PEER_KEY: | 584 | case EVP_PKEY_CTRL_PEER_KEY: |
| 449 | RSAerr(RSA_F_PKEY_RSA_CTRL, | 585 | RSAerr(RSA_F_PKEY_RSA_CTRL, |
| 450 | RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); | 586 | RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); |
diff --git a/src/lib/libcrypto/rsa/rsa_pss.c b/src/lib/libcrypto/rsa/rsa_pss.c index ac211e2ffe..5f9f533d0c 100644 --- a/src/lib/libcrypto/rsa/rsa_pss.c +++ b/src/lib/libcrypto/rsa/rsa_pss.c | |||
| @@ -73,6 +73,13 @@ static const unsigned char zeroes[] = {0,0,0,0,0,0,0,0}; | |||
| 73 | int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | 73 | int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, |
| 74 | const EVP_MD *Hash, const unsigned char *EM, int sLen) | 74 | const EVP_MD *Hash, const unsigned char *EM, int sLen) |
| 75 | { | 75 | { |
| 76 | return RSA_verify_PKCS1_PSS_mgf1(rsa, mHash, Hash, NULL, EM, sLen); | ||
| 77 | } | ||
| 78 | |||
| 79 | int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash, | ||
| 80 | const EVP_MD *Hash, const EVP_MD *mgf1Hash, | ||
| 81 | const unsigned char *EM, int sLen) | ||
| 82 | { | ||
| 76 | int i; | 83 | int i; |
| 77 | int ret = 0; | 84 | int ret = 0; |
| 78 | int hLen, maskedDBLen, MSBits, emLen; | 85 | int hLen, maskedDBLen, MSBits, emLen; |
| @@ -80,6 +87,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 80 | unsigned char *DB = NULL; | 87 | unsigned char *DB = NULL; |
| 81 | EVP_MD_CTX ctx; | 88 | EVP_MD_CTX ctx; |
| 82 | unsigned char H_[EVP_MAX_MD_SIZE]; | 89 | unsigned char H_[EVP_MAX_MD_SIZE]; |
| 90 | EVP_MD_CTX_init(&ctx); | ||
| 91 | |||
| 92 | if (mgf1Hash == NULL) | ||
| 93 | mgf1Hash = Hash; | ||
| 83 | 94 | ||
| 84 | hLen = EVP_MD_size(Hash); | 95 | hLen = EVP_MD_size(Hash); |
| 85 | if (hLen < 0) | 96 | if (hLen < 0) |
| @@ -94,7 +105,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 94 | else if (sLen == -2) sLen = -2; | 105 | else if (sLen == -2) sLen = -2; |
| 95 | else if (sLen < -2) | 106 | else if (sLen < -2) |
| 96 | { | 107 | { |
| 97 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); | 108 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); |
| 98 | goto err; | 109 | goto err; |
| 99 | } | 110 | } |
| 100 | 111 | ||
| @@ -102,7 +113,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 102 | emLen = RSA_size(rsa); | 113 | emLen = RSA_size(rsa); |
| 103 | if (EM[0] & (0xFF << MSBits)) | 114 | if (EM[0] & (0xFF << MSBits)) |
| 104 | { | 115 | { |
| 105 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_FIRST_OCTET_INVALID); | 116 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_FIRST_OCTET_INVALID); |
| 106 | goto err; | 117 | goto err; |
| 107 | } | 118 | } |
| 108 | if (MSBits == 0) | 119 | if (MSBits == 0) |
| @@ -112,12 +123,12 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 112 | } | 123 | } |
| 113 | if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */ | 124 | if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */ |
| 114 | { | 125 | { |
| 115 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_DATA_TOO_LARGE); | 126 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_DATA_TOO_LARGE); |
| 116 | goto err; | 127 | goto err; |
| 117 | } | 128 | } |
| 118 | if (EM[emLen - 1] != 0xbc) | 129 | if (EM[emLen - 1] != 0xbc) |
| 119 | { | 130 | { |
| 120 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_LAST_OCTET_INVALID); | 131 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_LAST_OCTET_INVALID); |
| 121 | goto err; | 132 | goto err; |
| 122 | } | 133 | } |
| 123 | maskedDBLen = emLen - hLen - 1; | 134 | maskedDBLen = emLen - hLen - 1; |
| @@ -125,10 +136,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 125 | DB = OPENSSL_malloc(maskedDBLen); | 136 | DB = OPENSSL_malloc(maskedDBLen); |
| 126 | if (!DB) | 137 | if (!DB) |
| 127 | { | 138 | { |
| 128 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, ERR_R_MALLOC_FAILURE); | 139 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, ERR_R_MALLOC_FAILURE); |
| 129 | goto err; | 140 | goto err; |
| 130 | } | 141 | } |
| 131 | if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, Hash) < 0) | 142 | if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash) < 0) |
| 132 | goto err; | 143 | goto err; |
| 133 | for (i = 0; i < maskedDBLen; i++) | 144 | for (i = 0; i < maskedDBLen; i++) |
| 134 | DB[i] ^= EM[i]; | 145 | DB[i] ^= EM[i]; |
| @@ -137,25 +148,28 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 137 | for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ; | 148 | for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ; |
| 138 | if (DB[i++] != 0x1) | 149 | if (DB[i++] != 0x1) |
| 139 | { | 150 | { |
| 140 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_RECOVERY_FAILED); | 151 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_RECOVERY_FAILED); |
| 141 | goto err; | 152 | goto err; |
| 142 | } | 153 | } |
| 143 | if (sLen >= 0 && (maskedDBLen - i) != sLen) | 154 | if (sLen >= 0 && (maskedDBLen - i) != sLen) |
| 144 | { | 155 | { |
| 145 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); | 156 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); |
| 146 | goto err; | 157 | goto err; |
| 147 | } | 158 | } |
| 148 | EVP_MD_CTX_init(&ctx); | 159 | if (!EVP_DigestInit_ex(&ctx, Hash, NULL) |
| 149 | EVP_DigestInit_ex(&ctx, Hash, NULL); | 160 | || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes) |
| 150 | EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes); | 161 | || !EVP_DigestUpdate(&ctx, mHash, hLen)) |
| 151 | EVP_DigestUpdate(&ctx, mHash, hLen); | 162 | goto err; |
| 152 | if (maskedDBLen - i) | 163 | if (maskedDBLen - i) |
| 153 | EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i); | 164 | { |
| 154 | EVP_DigestFinal(&ctx, H_, NULL); | 165 | if (!EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i)) |
| 155 | EVP_MD_CTX_cleanup(&ctx); | 166 | goto err; |
| 167 | } | ||
| 168 | if (!EVP_DigestFinal_ex(&ctx, H_, NULL)) | ||
| 169 | goto err; | ||
| 156 | if (memcmp(H_, H, hLen)) | 170 | if (memcmp(H_, H, hLen)) |
| 157 | { | 171 | { |
| 158 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_BAD_SIGNATURE); | 172 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_BAD_SIGNATURE); |
| 159 | ret = 0; | 173 | ret = 0; |
| 160 | } | 174 | } |
| 161 | else | 175 | else |
| @@ -164,6 +178,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
| 164 | err: | 178 | err: |
| 165 | if (DB) | 179 | if (DB) |
| 166 | OPENSSL_free(DB); | 180 | OPENSSL_free(DB); |
| 181 | EVP_MD_CTX_cleanup(&ctx); | ||
| 167 | 182 | ||
| 168 | return ret; | 183 | return ret; |
| 169 | 184 | ||
| @@ -173,12 +188,22 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
| 173 | const unsigned char *mHash, | 188 | const unsigned char *mHash, |
| 174 | const EVP_MD *Hash, int sLen) | 189 | const EVP_MD *Hash, int sLen) |
| 175 | { | 190 | { |
| 191 | return RSA_padding_add_PKCS1_PSS_mgf1(rsa, EM, mHash, Hash, NULL, sLen); | ||
| 192 | } | ||
| 193 | |||
| 194 | int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM, | ||
| 195 | const unsigned char *mHash, | ||
| 196 | const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen) | ||
| 197 | { | ||
| 176 | int i; | 198 | int i; |
| 177 | int ret = 0; | 199 | int ret = 0; |
| 178 | int hLen, maskedDBLen, MSBits, emLen; | 200 | int hLen, maskedDBLen, MSBits, emLen; |
| 179 | unsigned char *H, *salt = NULL, *p; | 201 | unsigned char *H, *salt = NULL, *p; |
| 180 | EVP_MD_CTX ctx; | 202 | EVP_MD_CTX ctx; |
| 181 | 203 | ||
| 204 | if (mgf1Hash == NULL) | ||
| 205 | mgf1Hash = Hash; | ||
| 206 | |||
| 182 | hLen = EVP_MD_size(Hash); | 207 | hLen = EVP_MD_size(Hash); |
| 183 | if (hLen < 0) | 208 | if (hLen < 0) |
| 184 | goto err; | 209 | goto err; |
| @@ -192,7 +217,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
| 192 | else if (sLen == -2) sLen = -2; | 217 | else if (sLen == -2) sLen = -2; |
| 193 | else if (sLen < -2) | 218 | else if (sLen < -2) |
| 194 | { | 219 | { |
| 195 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); | 220 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); |
| 196 | goto err; | 221 | goto err; |
| 197 | } | 222 | } |
| 198 | 223 | ||
| @@ -209,8 +234,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
| 209 | } | 234 | } |
| 210 | else if (emLen < (hLen + sLen + 2)) | 235 | else if (emLen < (hLen + sLen + 2)) |
| 211 | { | 236 | { |
| 212 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, | 237 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); |
| 213 | RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); | ||
| 214 | goto err; | 238 | goto err; |
| 215 | } | 239 | } |
| 216 | if (sLen > 0) | 240 | if (sLen > 0) |
| @@ -218,8 +242,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
| 218 | salt = OPENSSL_malloc(sLen); | 242 | salt = OPENSSL_malloc(sLen); |
| 219 | if (!salt) | 243 | if (!salt) |
| 220 | { | 244 | { |
| 221 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, | 245 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,ERR_R_MALLOC_FAILURE); |
| 222 | ERR_R_MALLOC_FAILURE); | ||
| 223 | goto err; | 246 | goto err; |
| 224 | } | 247 | } |
| 225 | if (RAND_bytes(salt, sLen) <= 0) | 248 | if (RAND_bytes(salt, sLen) <= 0) |
| @@ -228,16 +251,18 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
| 228 | maskedDBLen = emLen - hLen - 1; | 251 | maskedDBLen = emLen - hLen - 1; |
| 229 | H = EM + maskedDBLen; | 252 | H = EM + maskedDBLen; |
| 230 | EVP_MD_CTX_init(&ctx); | 253 | EVP_MD_CTX_init(&ctx); |
| 231 | EVP_DigestInit_ex(&ctx, Hash, NULL); | 254 | if (!EVP_DigestInit_ex(&ctx, Hash, NULL) |
| 232 | EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes); | 255 | || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes) |
| 233 | EVP_DigestUpdate(&ctx, mHash, hLen); | 256 | || !EVP_DigestUpdate(&ctx, mHash, hLen)) |
| 234 | if (sLen) | 257 | goto err; |
| 235 | EVP_DigestUpdate(&ctx, salt, sLen); | 258 | if (sLen && !EVP_DigestUpdate(&ctx, salt, sLen)) |
| 236 | EVP_DigestFinal(&ctx, H, NULL); | 259 | goto err; |
| 260 | if (!EVP_DigestFinal_ex(&ctx, H, NULL)) | ||
| 261 | goto err; | ||
| 237 | EVP_MD_CTX_cleanup(&ctx); | 262 | EVP_MD_CTX_cleanup(&ctx); |
| 238 | 263 | ||
| 239 | /* Generate dbMask in place then perform XOR on it */ | 264 | /* Generate dbMask in place then perform XOR on it */ |
| 240 | if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, Hash)) | 265 | if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash)) |
| 241 | goto err; | 266 | goto err; |
| 242 | 267 | ||
| 243 | p = EM; | 268 | p = EM; |
diff --git a/src/lib/libcrypto/s390xcap.c b/src/lib/libcrypto/s390xcap.c index ffbe0235f9..f2e94ef47e 100644 --- a/src/lib/libcrypto/s390xcap.c +++ b/src/lib/libcrypto/s390xcap.c | |||
| @@ -4,7 +4,7 @@ | |||
| 4 | #include <setjmp.h> | 4 | #include <setjmp.h> |
| 5 | #include <signal.h> | 5 | #include <signal.h> |
| 6 | 6 | ||
| 7 | extern unsigned long OPENSSL_s390xcap_P; | 7 | extern unsigned long OPENSSL_s390xcap_P[]; |
| 8 | 8 | ||
| 9 | static sigjmp_buf ill_jmp; | 9 | static sigjmp_buf ill_jmp; |
| 10 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } | 10 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } |
| @@ -16,7 +16,9 @@ void OPENSSL_cpuid_setup(void) | |||
| 16 | sigset_t oset; | 16 | sigset_t oset; |
| 17 | struct sigaction ill_act,oact; | 17 | struct sigaction ill_act,oact; |
| 18 | 18 | ||
| 19 | if (OPENSSL_s390xcap_P) return; | 19 | if (OPENSSL_s390xcap_P[0]) return; |
| 20 | |||
| 21 | OPENSSL_s390xcap_P[0] = 1UL<<(8*sizeof(unsigned long)-1); | ||
| 20 | 22 | ||
| 21 | memset(&ill_act,0,sizeof(ill_act)); | 23 | memset(&ill_act,0,sizeof(ill_act)); |
| 22 | ill_act.sa_handler = ill_handler; | 24 | ill_act.sa_handler = ill_handler; |
| @@ -27,10 +29,8 @@ void OPENSSL_cpuid_setup(void) | |||
| 27 | sigaction (SIGILL,&ill_act,&oact); | 29 | sigaction (SIGILL,&ill_act,&oact); |
| 28 | 30 | ||
| 29 | /* protection against missing store-facility-list-extended */ | 31 | /* protection against missing store-facility-list-extended */ |
| 30 | if (sigsetjmp(ill_jmp,0) == 0) | 32 | if (sigsetjmp(ill_jmp,1) == 0) |
| 31 | OPENSSL_s390xcap_P = OPENSSL_s390x_facilities(); | 33 | OPENSSL_s390x_facilities(); |
| 32 | else | ||
| 33 | OPENSSL_s390xcap_P = 1UL<<63; | ||
| 34 | 34 | ||
| 35 | sigaction (SIGILL,&oact,NULL); | 35 | sigaction (SIGILL,&oact,NULL); |
| 36 | sigprocmask(SIG_SETMASK,&oset,NULL); | 36 | sigprocmask(SIG_SETMASK,&oset,NULL); |
diff --git a/src/lib/libcrypto/s390xcpuid.S b/src/lib/libcrypto/s390xcpuid.S index b053c6a281..06815347e6 100644 --- a/src/lib/libcrypto/s390xcpuid.S +++ b/src/lib/libcrypto/s390xcpuid.S | |||
| @@ -5,10 +5,14 @@ | |||
| 5 | .align 16 | 5 | .align 16 |
| 6 | OPENSSL_s390x_facilities: | 6 | OPENSSL_s390x_facilities: |
| 7 | lghi %r0,0 | 7 | lghi %r0,0 |
| 8 | .long 0xb2b0f010 # stfle 16(%r15) | 8 | larl %r2,OPENSSL_s390xcap_P |
| 9 | lg %r2,16(%r15) | 9 | stg %r0,8(%r2) |
| 10 | larl %r1,OPENSSL_s390xcap_P | 10 | .long 0xb2b02000 # stfle 0(%r2) |
| 11 | stg %r2,0(%r1) | 11 | brc 8,.Ldone |
| 12 | lghi %r0,1 | ||
| 13 | .long 0xb2b02000 # stfle 0(%r2) | ||
| 14 | .Ldone: | ||
| 15 | lg %r2,0(%r2) | ||
| 12 | br %r14 | 16 | br %r14 |
| 13 | .size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities | 17 | .size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities |
| 14 | 18 | ||
| @@ -58,6 +62,9 @@ OPENSSL_wipe_cpu: | |||
| 58 | .type OPENSSL_cleanse,@function | 62 | .type OPENSSL_cleanse,@function |
| 59 | .align 16 | 63 | .align 16 |
| 60 | OPENSSL_cleanse: | 64 | OPENSSL_cleanse: |
| 65 | #if !defined(__s390x__) && !defined(__s390x) | ||
| 66 | llgfr %r3,%r3 | ||
| 67 | #endif | ||
| 61 | lghi %r4,15 | 68 | lghi %r4,15 |
| 62 | lghi %r0,0 | 69 | lghi %r0,0 |
| 63 | clgr %r3,%r4 | 70 | clgr %r3,%r4 |
| @@ -89,4 +96,4 @@ OPENSSL_cleanse: | |||
| 89 | .section .init | 96 | .section .init |
| 90 | brasl %r14,OPENSSL_cpuid_setup | 97 | brasl %r14,OPENSSL_cpuid_setup |
| 91 | 98 | ||
| 92 | .comm OPENSSL_s390xcap_P,8,8 | 99 | .comm OPENSSL_s390xcap_P,16,8 |
diff --git a/src/lib/libcrypto/sha/asm/sha1-alpha.pl b/src/lib/libcrypto/sha/asm/sha1-alpha.pl new file mode 100644 index 0000000000..6c4b9251fd --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-alpha.pl | |||
| @@ -0,0 +1,322 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # SHA1 block procedure for Alpha. | ||
| 11 | |||
| 12 | # On 21264 performance is 33% better than code generated by vendor | ||
| 13 | # compiler, and 75% better than GCC [3.4], and in absolute terms is | ||
| 14 | # 8.7 cycles per processed byte. Implementation features vectorized | ||
| 15 | # byte swap, but not Xupdate. | ||
| 16 | |||
| 17 | @X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", | ||
| 18 | "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15"); | ||
| 19 | $ctx="a0"; # $16 | ||
| 20 | $inp="a1"; | ||
| 21 | $num="a2"; | ||
| 22 | $A="a3"; | ||
| 23 | $B="a4"; # 20 | ||
| 24 | $C="a5"; | ||
| 25 | $D="t8"; | ||
| 26 | $E="t9"; @V=($A,$B,$C,$D,$E); | ||
| 27 | $t0="t10"; # 24 | ||
| 28 | $t1="t11"; | ||
| 29 | $t2="ra"; | ||
| 30 | $t3="t12"; | ||
| 31 | $K="AT"; # 28 | ||
| 32 | |||
| 33 | sub BODY_00_19 { | ||
| 34 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 35 | my $j=$i+1; | ||
| 36 | $code.=<<___ if ($i==0); | ||
| 37 | ldq_u @X[0],0+0($inp) | ||
| 38 | ldq_u @X[1],0+7($inp) | ||
| 39 | ___ | ||
| 40 | $code.=<<___ if (!($i&1) && $i<14); | ||
| 41 | ldq_u @X[$i+2],($i+2)*4+0($inp) | ||
| 42 | ldq_u @X[$i+3],($i+2)*4+7($inp) | ||
| 43 | ___ | ||
| 44 | $code.=<<___ if (!($i&1) && $i<15); | ||
| 45 | extql @X[$i],$inp,@X[$i] | ||
| 46 | extqh @X[$i+1],$inp,@X[$i+1] | ||
| 47 | |||
| 48 | or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched | ||
| 49 | |||
| 50 | srl @X[$i],24,$t0 # vectorized byte swap | ||
| 51 | srl @X[$i],8,$t2 | ||
| 52 | |||
| 53 | sll @X[$i],8,$t3 | ||
| 54 | sll @X[$i],24,@X[$i] | ||
| 55 | zapnot $t0,0x11,$t0 | ||
| 56 | zapnot $t2,0x22,$t2 | ||
| 57 | |||
| 58 | zapnot @X[$i],0x88,@X[$i] | ||
| 59 | or $t0,$t2,$t0 | ||
| 60 | zapnot $t3,0x44,$t3 | ||
| 61 | sll $a,5,$t1 | ||
| 62 | |||
| 63 | or @X[$i],$t0,@X[$i] | ||
| 64 | addl $K,$e,$e | ||
| 65 | and $b,$c,$t2 | ||
| 66 | zapnot $a,0xf,$a | ||
| 67 | |||
| 68 | or @X[$i],$t3,@X[$i] | ||
| 69 | srl $a,27,$t0 | ||
| 70 | bic $d,$b,$t3 | ||
| 71 | sll $b,30,$b | ||
| 72 | |||
| 73 | extll @X[$i],4,@X[$i+1] # extract upper half | ||
| 74 | or $t2,$t3,$t2 | ||
| 75 | addl @X[$i],$e,$e | ||
| 76 | |||
| 77 | addl $t1,$e,$e | ||
| 78 | srl $b,32,$t3 | ||
| 79 | zapnot @X[$i],0xf,@X[$i] | ||
| 80 | |||
| 81 | addl $t0,$e,$e | ||
| 82 | addl $t2,$e,$e | ||
| 83 | or $t3,$b,$b | ||
| 84 | ___ | ||
| 85 | $code.=<<___ if (($i&1) && $i<15); | ||
| 86 | sll $a,5,$t1 | ||
| 87 | addl $K,$e,$e | ||
| 88 | and $b,$c,$t2 | ||
| 89 | zapnot $a,0xf,$a | ||
| 90 | |||
| 91 | srl $a,27,$t0 | ||
| 92 | addl @X[$i%16],$e,$e | ||
| 93 | bic $d,$b,$t3 | ||
| 94 | sll $b,30,$b | ||
| 95 | |||
| 96 | or $t2,$t3,$t2 | ||
| 97 | addl $t1,$e,$e | ||
| 98 | srl $b,32,$t3 | ||
| 99 | zapnot @X[$i],0xf,@X[$i] | ||
| 100 | |||
| 101 | addl $t0,$e,$e | ||
| 102 | addl $t2,$e,$e | ||
| 103 | or $t3,$b,$b | ||
| 104 | ___ | ||
| 105 | $code.=<<___ if ($i>=15); # with forward Xupdate | ||
| 106 | sll $a,5,$t1 | ||
| 107 | addl $K,$e,$e | ||
| 108 | and $b,$c,$t2 | ||
| 109 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
| 110 | |||
| 111 | zapnot $a,0xf,$a | ||
| 112 | addl @X[$i%16],$e,$e | ||
| 113 | bic $d,$b,$t3 | ||
| 114 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 115 | |||
| 116 | srl $a,27,$t0 | ||
| 117 | addl $t1,$e,$e | ||
| 118 | or $t2,$t3,$t2 | ||
| 119 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 120 | |||
| 121 | sll $b,30,$b | ||
| 122 | addl $t0,$e,$e | ||
| 123 | srl @X[$j%16],31,$t1 | ||
| 124 | |||
| 125 | addl $t2,$e,$e | ||
| 126 | srl $b,32,$t3 | ||
| 127 | addl @X[$j%16],@X[$j%16],@X[$j%16] | ||
| 128 | |||
| 129 | or $t3,$b,$b | ||
| 130 | zapnot @X[$i%16],0xf,@X[$i%16] | ||
| 131 | or $t1,@X[$j%16],@X[$j%16] | ||
| 132 | ___ | ||
| 133 | } | ||
| 134 | |||
| 135 | sub BODY_20_39 { | ||
| 136 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 137 | my $j=$i+1; | ||
| 138 | $code.=<<___ if ($i<79); # with forward Xupdate | ||
| 139 | sll $a,5,$t1 | ||
| 140 | addl $K,$e,$e | ||
| 141 | zapnot $a,0xf,$a | ||
| 142 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
| 143 | |||
| 144 | sll $b,30,$t3 | ||
| 145 | addl $t1,$e,$e | ||
| 146 | xor $b,$c,$t2 | ||
| 147 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 148 | |||
| 149 | srl $b,2,$b | ||
| 150 | addl @X[$i%16],$e,$e | ||
| 151 | xor $d,$t2,$t2 | ||
| 152 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 153 | |||
| 154 | srl @X[$j%16],31,$t1 | ||
| 155 | addl $t2,$e,$e | ||
| 156 | srl $a,27,$t0 | ||
| 157 | addl @X[$j%16],@X[$j%16],@X[$j%16] | ||
| 158 | |||
| 159 | or $t3,$b,$b | ||
| 160 | addl $t0,$e,$e | ||
| 161 | or $t1,@X[$j%16],@X[$j%16] | ||
| 162 | ___ | ||
| 163 | $code.=<<___ if ($i<77); | ||
| 164 | zapnot @X[$i%16],0xf,@X[$i%16] | ||
| 165 | ___ | ||
| 166 | $code.=<<___ if ($i==79); # with context fetch | ||
| 167 | sll $a,5,$t1 | ||
| 168 | addl $K,$e,$e | ||
| 169 | zapnot $a,0xf,$a | ||
| 170 | ldl @X[0],0($ctx) | ||
| 171 | |||
| 172 | sll $b,30,$t3 | ||
| 173 | addl $t1,$e,$e | ||
| 174 | xor $b,$c,$t2 | ||
| 175 | ldl @X[1],4($ctx) | ||
| 176 | |||
| 177 | srl $b,2,$b | ||
| 178 | addl @X[$i%16],$e,$e | ||
| 179 | xor $d,$t2,$t2 | ||
| 180 | ldl @X[2],8($ctx) | ||
| 181 | |||
| 182 | srl $a,27,$t0 | ||
| 183 | addl $t2,$e,$e | ||
| 184 | ldl @X[3],12($ctx) | ||
| 185 | |||
| 186 | or $t3,$b,$b | ||
| 187 | addl $t0,$e,$e | ||
| 188 | ldl @X[4],16($ctx) | ||
| 189 | ___ | ||
| 190 | } | ||
| 191 | |||
| 192 | sub BODY_40_59 { | ||
| 193 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 194 | my $j=$i+1; | ||
| 195 | $code.=<<___; # with forward Xupdate | ||
| 196 | sll $a,5,$t1 | ||
| 197 | addl $K,$e,$e | ||
| 198 | zapnot $a,0xf,$a | ||
| 199 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
| 200 | |||
| 201 | srl $a,27,$t0 | ||
| 202 | and $b,$c,$t2 | ||
| 203 | and $b,$d,$t3 | ||
| 204 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 205 | |||
| 206 | sll $b,30,$b | ||
| 207 | addl $t1,$e,$e | ||
| 208 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 209 | |||
| 210 | srl @X[$j%16],31,$t1 | ||
| 211 | addl $t0,$e,$e | ||
| 212 | or $t2,$t3,$t2 | ||
| 213 | and $c,$d,$t3 | ||
| 214 | |||
| 215 | or $t2,$t3,$t2 | ||
| 216 | srl $b,32,$t3 | ||
| 217 | addl @X[$i%16],$e,$e | ||
| 218 | addl @X[$j%16],@X[$j%16],@X[$j%16] | ||
| 219 | |||
| 220 | or $t3,$b,$b | ||
| 221 | addl $t2,$e,$e | ||
| 222 | or $t1,@X[$j%16],@X[$j%16] | ||
| 223 | zapnot @X[$i%16],0xf,@X[$i%16] | ||
| 224 | ___ | ||
| 225 | } | ||
| 226 | |||
| 227 | $code=<<___; | ||
| 228 | #ifdef __linux__ | ||
| 229 | #include <asm/regdef.h> | ||
| 230 | #else | ||
| 231 | #include <asm.h> | ||
| 232 | #include <regdef.h> | ||
| 233 | #endif | ||
| 234 | |||
| 235 | .text | ||
| 236 | |||
| 237 | .set noat | ||
| 238 | .set noreorder | ||
| 239 | .globl sha1_block_data_order | ||
| 240 | .align 5 | ||
| 241 | .ent sha1_block_data_order | ||
| 242 | sha1_block_data_order: | ||
| 243 | lda sp,-64(sp) | ||
| 244 | stq ra,0(sp) | ||
| 245 | stq s0,8(sp) | ||
| 246 | stq s1,16(sp) | ||
| 247 | stq s2,24(sp) | ||
| 248 | stq s3,32(sp) | ||
| 249 | stq s4,40(sp) | ||
| 250 | stq s5,48(sp) | ||
| 251 | stq fp,56(sp) | ||
| 252 | .mask 0x0400fe00,-64 | ||
| 253 | .frame sp,64,ra | ||
| 254 | .prologue 0 | ||
| 255 | |||
| 256 | ldl $A,0($ctx) | ||
| 257 | ldl $B,4($ctx) | ||
| 258 | sll $num,6,$num | ||
| 259 | ldl $C,8($ctx) | ||
| 260 | ldl $D,12($ctx) | ||
| 261 | ldl $E,16($ctx) | ||
| 262 | addq $inp,$num,$num | ||
| 263 | |||
| 264 | .Lloop: | ||
| 265 | .set noreorder | ||
| 266 | ldah $K,23170(zero) | ||
| 267 | zapnot $B,0xf,$B | ||
| 268 | lda $K,31129($K) # K_00_19 | ||
| 269 | ___ | ||
| 270 | for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | ||
| 271 | |||
| 272 | $code.=<<___; | ||
| 273 | ldah $K,28378(zero) | ||
| 274 | lda $K,-5215($K) # K_20_39 | ||
| 275 | ___ | ||
| 276 | for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 277 | |||
| 278 | $code.=<<___; | ||
| 279 | ldah $K,-28900(zero) | ||
| 280 | lda $K,-17188($K) # K_40_59 | ||
| 281 | ___ | ||
| 282 | for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | ||
| 283 | |||
| 284 | $code.=<<___; | ||
| 285 | ldah $K,-13725(zero) | ||
| 286 | lda $K,-15914($K) # K_60_79 | ||
| 287 | ___ | ||
| 288 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 289 | |||
| 290 | $code.=<<___; | ||
| 291 | addl @X[0],$A,$A | ||
| 292 | addl @X[1],$B,$B | ||
| 293 | addl @X[2],$C,$C | ||
| 294 | addl @X[3],$D,$D | ||
| 295 | addl @X[4],$E,$E | ||
| 296 | stl $A,0($ctx) | ||
| 297 | stl $B,4($ctx) | ||
| 298 | addq $inp,64,$inp | ||
| 299 | stl $C,8($ctx) | ||
| 300 | stl $D,12($ctx) | ||
| 301 | stl $E,16($ctx) | ||
| 302 | cmpult $inp,$num,$t1 | ||
| 303 | bne $t1,.Lloop | ||
| 304 | |||
| 305 | .set noreorder | ||
| 306 | ldq ra,0(sp) | ||
| 307 | ldq s0,8(sp) | ||
| 308 | ldq s1,16(sp) | ||
| 309 | ldq s2,24(sp) | ||
| 310 | ldq s3,32(sp) | ||
| 311 | ldq s4,40(sp) | ||
| 312 | ldq s5,48(sp) | ||
| 313 | ldq fp,56(sp) | ||
| 314 | lda sp,64(sp) | ||
| 315 | ret (ra) | ||
| 316 | .end sha1_block_data_order | ||
| 317 | .ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 318 | .align 2 | ||
| 319 | ___ | ||
| 320 | $output=shift and open STDOUT,">$output"; | ||
| 321 | print $code; | ||
| 322 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl index 6e65fe3e01..fe8207f77f 100644 --- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl +++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl | |||
| @@ -47,6 +47,10 @@ | |||
| 47 | # Cortex A8 core and in absolute terms ~870 cycles per input block | 47 | # Cortex A8 core and in absolute terms ~870 cycles per input block |
| 48 | # [or 13.6 cycles per byte]. | 48 | # [or 13.6 cycles per byte]. |
| 49 | 49 | ||
| 50 | # February 2011. | ||
| 51 | # | ||
| 52 | # Profiler-assisted and platform-specific optimization resulted in 10% | ||
| 53 | # improvement on Cortex A8 core and 12.2 cycles per byte. | ||
| 50 | 54 | ||
| 51 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 55 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| 52 | open STDOUT,">$output"; | 56 | open STDOUT,">$output"; |
| @@ -76,31 +80,41 @@ $code.=<<___; | |||
| 76 | add $e,$K,$e,ror#2 @ E+=K_xx_xx | 80 | add $e,$K,$e,ror#2 @ E+=K_xx_xx |
| 77 | ldr $t3,[$Xi,#2*4] | 81 | ldr $t3,[$Xi,#2*4] |
| 78 | eor $t0,$t0,$t1 | 82 | eor $t0,$t0,$t1 |
| 79 | eor $t2,$t2,$t3 | 83 | eor $t2,$t2,$t3 @ 1 cycle stall |
| 80 | eor $t1,$c,$d @ F_xx_xx | 84 | eor $t1,$c,$d @ F_xx_xx |
| 81 | mov $t0,$t0,ror#31 | 85 | mov $t0,$t0,ror#31 |
| 82 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) | 86 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) |
| 83 | eor $t0,$t0,$t2,ror#31 | 87 | eor $t0,$t0,$t2,ror#31 |
| 88 | str $t0,[$Xi,#-4]! | ||
| 84 | $opt1 @ F_xx_xx | 89 | $opt1 @ F_xx_xx |
| 85 | $opt2 @ F_xx_xx | 90 | $opt2 @ F_xx_xx |
| 86 | add $e,$e,$t0 @ E+=X[i] | 91 | add $e,$e,$t0 @ E+=X[i] |
| 87 | str $t0,[$Xi,#-4]! | ||
| 88 | ___ | 92 | ___ |
| 89 | } | 93 | } |
| 90 | 94 | ||
| 91 | sub BODY_00_15 { | 95 | sub BODY_00_15 { |
| 92 | my ($a,$b,$c,$d,$e)=@_; | 96 | my ($a,$b,$c,$d,$e)=@_; |
| 93 | $code.=<<___; | 97 | $code.=<<___; |
| 94 | ldrb $t0,[$inp],#4 | 98 | #if __ARM_ARCH__<7 |
| 95 | ldrb $t1,[$inp,#-1] | 99 | ldrb $t1,[$inp,#2] |
| 96 | ldrb $t2,[$inp,#-2] | 100 | ldrb $t0,[$inp,#3] |
| 101 | ldrb $t2,[$inp,#1] | ||
| 97 | add $e,$K,$e,ror#2 @ E+=K_00_19 | 102 | add $e,$K,$e,ror#2 @ E+=K_00_19 |
| 98 | ldrb $t3,[$inp,#-3] | 103 | ldrb $t3,[$inp],#4 |
| 104 | orr $t0,$t0,$t1,lsl#8 | ||
| 105 | eor $t1,$c,$d @ F_xx_xx | ||
| 106 | orr $t0,$t0,$t2,lsl#16 | ||
| 99 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) | 107 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) |
| 100 | orr $t0,$t1,$t0,lsl#24 | 108 | orr $t0,$t0,$t3,lsl#24 |
| 109 | #else | ||
| 110 | ldr $t0,[$inp],#4 @ handles unaligned | ||
| 111 | add $e,$K,$e,ror#2 @ E+=K_00_19 | ||
| 101 | eor $t1,$c,$d @ F_xx_xx | 112 | eor $t1,$c,$d @ F_xx_xx |
| 102 | orr $t0,$t0,$t2,lsl#8 | 113 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) |
| 103 | orr $t0,$t0,$t3,lsl#16 | 114 | #ifdef __ARMEL__ |
| 115 | rev $t0,$t0 @ byte swap | ||
| 116 | #endif | ||
| 117 | #endif | ||
| 104 | and $t1,$b,$t1,ror#2 | 118 | and $t1,$b,$t1,ror#2 |
| 105 | add $e,$e,$t0 @ E+=X[i] | 119 | add $e,$e,$t0 @ E+=X[i] |
| 106 | eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) | 120 | eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) |
| @@ -136,6 +150,8 @@ ___ | |||
| 136 | } | 150 | } |
| 137 | 151 | ||
| 138 | $code=<<___; | 152 | $code=<<___; |
| 153 | #include "arm_arch.h" | ||
| 154 | |||
| 139 | .text | 155 | .text |
| 140 | 156 | ||
| 141 | .global sha1_block_data_order | 157 | .global sha1_block_data_order |
| @@ -209,10 +225,14 @@ $code.=<<___; | |||
| 209 | teq $inp,$len | 225 | teq $inp,$len |
| 210 | bne .Lloop @ [+18], total 1307 | 226 | bne .Lloop @ [+18], total 1307 |
| 211 | 227 | ||
| 228 | #if __ARM_ARCH__>=5 | ||
| 229 | ldmia sp!,{r4-r12,pc} | ||
| 230 | #else | ||
| 212 | ldmia sp!,{r4-r12,lr} | 231 | ldmia sp!,{r4-r12,lr} |
| 213 | tst lr,#1 | 232 | tst lr,#1 |
| 214 | moveq pc,lr @ be binary compatible with V4, yet | 233 | moveq pc,lr @ be binary compatible with V4, yet |
| 215 | bx lr @ interoperable with Thumb ISA:-) | 234 | bx lr @ interoperable with Thumb ISA:-) |
| 235 | #endif | ||
| 216 | .align 2 | 236 | .align 2 |
| 217 | .LK_00_19: .word 0x5a827999 | 237 | .LK_00_19: .word 0x5a827999 |
| 218 | .LK_20_39: .word 0x6ed9eba1 | 238 | .LK_20_39: .word 0x6ed9eba1 |
diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl index 51c4f47ecb..db28f0805a 100644 --- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl +++ b/src/lib/libcrypto/sha/asm/sha1-ia64.pl | |||
| @@ -15,7 +15,7 @@ | |||
| 15 | # is >50% better than HP C and >2x better than gcc. | 15 | # is >50% better than HP C and >2x better than gcc. |
| 16 | 16 | ||
| 17 | $code=<<___; | 17 | $code=<<___; |
| 18 | .ident \"sha1-ia64.s, version 1.2\" | 18 | .ident \"sha1-ia64.s, version 1.3\" |
| 19 | .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" | 19 | .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" |
| 20 | .explicit | 20 | .explicit |
| 21 | 21 | ||
| @@ -26,14 +26,10 @@ if ($^O eq "hpux") { | |||
| 26 | $ADDP="addp4"; | 26 | $ADDP="addp4"; |
| 27 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } | 27 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } |
| 28 | } else { $ADDP="add"; } | 28 | } else { $ADDP="add"; } |
| 29 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); | ||
| 30 | $big_endian=0 if (/\-DL_ENDIAN/); } | ||
| 31 | if (!defined($big_endian)) | ||
| 32 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 33 | 29 | ||
| 34 | #$human=1; | 30 | #$human=1; |
| 35 | if ($human) { # useful for visual code auditing... | 31 | if ($human) { # useful for visual code auditing... |
| 36 | ($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T"); | 32 | ($A,$B,$C,$D,$E) = ("A","B","C","D","E"); |
| 37 | ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4"); | 33 | ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4"); |
| 38 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = | 34 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = |
| 39 | ( "K_00_19","K_20_39","K_40_59","K_60_79" ); | 35 | ( "K_00_19","K_20_39","K_40_59","K_60_79" ); |
| @@ -41,47 +37,50 @@ if ($human) { # useful for visual code auditing... | |||
| 41 | "X8", "X9","X10","X11","X12","X13","X14","X15" ); | 37 | "X8", "X9","X10","X11","X12","X13","X14","X15" ); |
| 42 | } | 38 | } |
| 43 | else { | 39 | else { |
| 44 | ($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5"); | 40 | ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4"); |
| 45 | ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10"); | 41 | ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9"); |
| 46 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = | 42 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = |
| 47 | ( "r14", "r15", "loc11", "loc12" ); | 43 | ( "r14", "r15", "loc10", "loc11" ); |
| 48 | @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", | 44 | @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", |
| 49 | "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ); | 45 | "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ); |
| 50 | } | 46 | } |
| 51 | 47 | ||
| 52 | sub BODY_00_15 { | 48 | sub BODY_00_15 { |
| 53 | local *code=shift; | 49 | local *code=shift; |
| 54 | local ($i,$a,$b,$c,$d,$e,$f)=@_; | 50 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 51 | my $j=$i+1; | ||
| 52 | my $Xn=@X[$j%16]; | ||
| 55 | 53 | ||
| 56 | $code.=<<___ if ($i==0); | 54 | $code.=<<___ if ($i==0); |
| 57 | { .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB | 55 | { .mmi; ld1 $X[$i]=[inp],2 // MSB |
| 58 | ld1 tmp2=[tmp3],2 };; | 56 | ld1 tmp2=[tmp3],2 };; |
| 59 | { .mmi; ld1 tmp0=[inp],2 | 57 | { .mmi; ld1 tmp0=[inp],2 |
| 60 | ld1 tmp4=[tmp3],2 // LSB | 58 | ld1 tmp4=[tmp3],2 // LSB |
| 61 | dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };; | 59 | dep $X[$i]=$X[$i],tmp2,8,8 };; |
| 62 | ___ | 60 | ___ |
| 63 | if ($i<15) { | 61 | if ($i<15) { |
| 64 | $code.=<<___; | 62 | $code.=<<___; |
| 65 | { .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1 | 63 | { .mmi; ld1 $Xn=[inp],2 // forward Xload |
| 64 | nop.m 0x0 | ||
| 66 | dep tmp1=tmp0,tmp4,8,8 };; | 65 | dep tmp1=tmp0,tmp4,8,8 };; |
| 67 | { .mmi; ld1 tmp2=[tmp3],2 // +1 | 66 | { .mmi; ld1 tmp2=[tmp3],2 // forward Xload |
| 68 | and tmp4=$c,$b | 67 | and tmp4=$c,$b |
| 69 | dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; | 68 | dep $X[$i]=$X[$i],tmp1,16,16} //;; |
| 70 | { .mmi; andcm tmp1=$d,$b | 69 | { .mmi; add $e=$e,$K_00_19 // e+=K_00_19 |
| 71 | add tmp0=$e,$K_00_19 | 70 | andcm tmp1=$d,$b |
| 72 | dep.z tmp5=$a,5,27 };; // a<<5 | 71 | dep.z tmp5=$a,5,27 };; // a<<5 |
| 73 | { .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | 72 | { .mmi; add $e=$e,$X[$i] // e+=Xload |
| 74 | add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 | 73 | or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) |
| 75 | extr.u tmp1=$a,27,5 };; // a>>27 | 74 | extr.u tmp1=$a,27,5 };; // a>>27 |
| 76 | { .mmi; ld1 tmp0=[inp],2 // +1 | 75 | { .mmi; ld1 tmp0=[inp],2 // forward Xload |
| 77 | add $f=$f,tmp4 // f+=F_00_19(b,c,d) | 76 | add $e=$e,tmp4 // e+=F_00_19(b,c,d) |
| 78 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 77 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
| 79 | { .mmi; ld1 tmp4=[tmp3],2 // +1 | 78 | { .mmi; ld1 tmp4=[tmp3],2 // forward Xload |
| 80 | or tmp5=tmp1,tmp5 // ROTATE(a,5) | 79 | or tmp5=tmp1,tmp5 // ROTATE(a,5) |
| 81 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 80 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
| 82 | { .mii; add $f=$f,tmp5 // f+=ROTATE(a,5) | 81 | { .mii; add $e=$e,tmp5 // e+=ROTATE(a,5) |
| 83 | dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1 | 82 | dep $Xn=$Xn,tmp2,8,8 // forward Xload |
| 84 | mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;; | 83 | mux2 $X[$i]=$X[$i],0x44 } //;; |
| 85 | 84 | ||
| 86 | ___ | 85 | ___ |
| 87 | } | 86 | } |
| @@ -89,24 +88,24 @@ else { | |||
| 89 | $code.=<<___; | 88 | $code.=<<___; |
| 90 | { .mii; and tmp3=$c,$b | 89 | { .mii; and tmp3=$c,$b |
| 91 | dep tmp1=tmp0,tmp4,8,8;; | 90 | dep tmp1=tmp0,tmp4,8,8;; |
| 92 | dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; | 91 | dep $X[$i]=$X[$i],tmp1,16,16} //;; |
| 93 | { .mmi; andcm tmp1=$d,$b | 92 | { .mmi; add $e=$e,$K_00_19 // e+=K_00_19 |
| 94 | add tmp0=$e,$K_00_19 | 93 | andcm tmp1=$d,$b |
| 95 | dep.z tmp5=$a,5,27 };; // a<<5 | 94 | dep.z tmp5=$a,5,27 };; // a<<5 |
| 96 | { .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | 95 | { .mmi; add $e=$e,$X[$i] // e+=Xupdate |
| 97 | add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 | 96 | or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) |
| 98 | extr.u tmp1=$a,27,5 } // a>>27 | 97 | extr.u tmp1=$a,27,5 } // a>>27 |
| 99 | { .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 98 | { .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate |
| 100 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 99 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate |
| 101 | nop.i 0 };; | 100 | nop.i 0 };; |
| 102 | { .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d) | 101 | { .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d) |
| 103 | xor tmp2=tmp2,tmp3 // +1 | 102 | xor $Xn=$Xn,tmp3 // forward Xupdate |
| 104 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 103 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
| 105 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 104 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) |
| 106 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 105 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
| 107 | { .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) | 106 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) |
| 108 | shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 107 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
| 109 | mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };; | 108 | mux2 $X[$i]=$X[$i],0x44 };; |
| 110 | 109 | ||
| 111 | ___ | 110 | ___ |
| 112 | } | 111 | } |
| @@ -114,27 +113,28 @@ ___ | |||
| 114 | 113 | ||
| 115 | sub BODY_16_19 { | 114 | sub BODY_16_19 { |
| 116 | local *code=shift; | 115 | local *code=shift; |
| 117 | local ($i,$a,$b,$c,$d,$e,$f)=@_; | 116 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 117 | my $j=$i+1; | ||
| 118 | my $Xn=@X[$j%16]; | ||
| 118 | 119 | ||
| 119 | $code.=<<___; | 120 | $code.=<<___; |
| 120 | { .mmi; mov $X[$i&0xf]=$f // Xupdate | 121 | { .mib; add $e=$e,$K_00_19 // e+=K_00_19 |
| 121 | and tmp0=$c,$b | ||
| 122 | dep.z tmp5=$a,5,27 } // a<<5 | 122 | dep.z tmp5=$a,5,27 } // a<<5 |
| 123 | { .mmi; andcm tmp1=$d,$b | 123 | { .mib; andcm tmp1=$d,$b |
| 124 | add tmp4=$e,$K_00_19 };; | 124 | and tmp0=$c,$b };; |
| 125 | { .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | 125 | { .mmi; add $e=$e,$X[$i%16] // e+=Xupdate |
| 126 | add $f=$f,tmp4 // f+=e+K_00_19 | 126 | or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) |
| 127 | extr.u tmp1=$a,27,5 } // a>>27 | 127 | extr.u tmp1=$a,27,5 } // a>>27 |
| 128 | { .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 128 | { .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate |
| 129 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 129 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate |
| 130 | nop.i 0 };; | 130 | nop.i 0 };; |
| 131 | { .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d) | 131 | { .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d) |
| 132 | xor tmp2=tmp2,tmp3 // +1 | 132 | xor $Xn=$Xn,tmp3 // forward Xupdate |
| 133 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 133 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
| 134 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 134 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) |
| 135 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 135 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
| 136 | { .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) | 136 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) |
| 137 | shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 137 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
| 138 | nop.i 0 };; | 138 | nop.i 0 };; |
| 139 | 139 | ||
| 140 | ___ | 140 | ___ |
| @@ -142,49 +142,47 @@ ___ | |||
| 142 | 142 | ||
| 143 | sub BODY_20_39 { | 143 | sub BODY_20_39 { |
| 144 | local *code=shift; | 144 | local *code=shift; |
| 145 | local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_; | 145 | my ($i,$a,$b,$c,$d,$e,$Konst)=@_; |
| 146 | $Konst = $K_20_39 if (!defined($Konst)); | 146 | $Konst = $K_20_39 if (!defined($Konst)); |
| 147 | my $j=$i+1; | ||
| 148 | my $Xn=@X[$j%16]; | ||
| 147 | 149 | ||
| 148 | if ($i<79) { | 150 | if ($i<79) { |
| 149 | $code.=<<___; | 151 | $code.=<<___; |
| 150 | { .mib; mov $X[$i&0xf]=$f // Xupdate | 152 | { .mib; add $e=$e,$Konst // e+=K_XX_XX |
| 151 | dep.z tmp5=$a,5,27 } // a<<5 | 153 | dep.z tmp5=$a,5,27 } // a<<5 |
| 152 | { .mib; xor tmp0=$c,$b | 154 | { .mib; xor tmp0=$c,$b |
| 153 | add tmp4=$e,$Konst };; | 155 | xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate |
| 154 | { .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d | 156 | { .mib; add $e=$e,$X[$i%16] // e+=Xupdate |
| 155 | add $f=$f,tmp4 // f+=e+K_20_39 | ||
| 156 | extr.u tmp1=$a,27,5 } // a>>27 | 157 | extr.u tmp1=$a,27,5 } // a>>27 |
| 157 | { .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 158 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d |
| 158 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 159 | xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate |
| 159 | nop.i 0 };; | 160 | { .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) |
| 160 | { .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) | 161 | xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate |
| 161 | xor tmp2=tmp2,tmp3 // +1 | ||
| 162 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 162 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
| 163 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 163 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) |
| 164 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 164 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
| 165 | { .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) | 165 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) |
| 166 | shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 166 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
| 167 | nop.i 0 };; | 167 | nop.i 0 };; |
| 168 | 168 | ||
| 169 | ___ | 169 | ___ |
| 170 | } | 170 | } |
| 171 | else { | 171 | else { |
| 172 | $code.=<<___; | 172 | $code.=<<___; |
| 173 | { .mib; mov $X[$i&0xf]=$f // Xupdate | 173 | { .mib; add $e=$e,$Konst // e+=K_60_79 |
| 174 | dep.z tmp5=$a,5,27 } // a<<5 | 174 | dep.z tmp5=$a,5,27 } // a<<5 |
| 175 | { .mib; xor tmp0=$c,$b | 175 | { .mib; xor tmp0=$c,$b |
| 176 | add tmp4=$e,$Konst };; | ||
| 177 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d | ||
| 178 | extr.u tmp1=$a,27,5 } // a>>27 | ||
| 179 | { .mib; add $f=$f,tmp4 // f+=e+K_20_39 | ||
| 180 | add $h1=$h1,$a };; // wrap up | 176 | add $h1=$h1,$a };; // wrap up |
| 181 | { .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) | 177 | { .mib; add $e=$e,$X[$i%16] // e+=Xupdate |
| 182 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;? | 178 | extr.u tmp1=$a,27,5 } // a>>27 |
| 183 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 179 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d |
| 184 | add $h3=$h3,$c };; // wrap up | 180 | add $h3=$h3,$c };; // wrap up |
| 185 | { .mib; add tmp3=1,inp // used in unaligned codepath | 181 | { .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) |
| 186 | add $f=$f,tmp1 } // f+=ROTATE(a,5) | 182 | or tmp1=tmp1,tmp5 // ROTATE(a,5) |
| 187 | { .mib; add $h2=$h2,$b // wrap up | 183 | shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;? |
| 184 | { .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5) | ||
| 185 | add tmp3=1,inp // used in unaligned codepath | ||
| 188 | add $h4=$h4,$d };; // wrap up | 186 | add $h4=$h4,$d };; // wrap up |
| 189 | 187 | ||
| 190 | ___ | 188 | ___ |
| @@ -193,29 +191,29 @@ ___ | |||
| 193 | 191 | ||
| 194 | sub BODY_40_59 { | 192 | sub BODY_40_59 { |
| 195 | local *code=shift; | 193 | local *code=shift; |
| 196 | local ($i,$a,$b,$c,$d,$e,$f)=@_; | 194 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 195 | my $j=$i+1; | ||
| 196 | my $Xn=@X[$j%16]; | ||
| 197 | 197 | ||
| 198 | $code.=<<___; | 198 | $code.=<<___; |
| 199 | { .mmi; mov $X[$i&0xf]=$f // Xupdate | 199 | { .mib; add $e=$e,$K_40_59 // e+=K_40_59 |
| 200 | and tmp0=$c,$b | ||
| 201 | dep.z tmp5=$a,5,27 } // a<<5 | 200 | dep.z tmp5=$a,5,27 } // a<<5 |
| 202 | { .mmi; and tmp1=$d,$b | 201 | { .mib; and tmp1=$c,$d |
| 203 | add tmp4=$e,$K_40_59 };; | 202 | xor tmp0=$c,$d };; |
| 204 | { .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d) | 203 | { .mmi; add $e=$e,$X[$i%16] // e+=Xupdate |
| 205 | add $f=$f,tmp4 // f+=e+K_40_59 | 204 | add tmp5=tmp5,tmp1 // a<<5+(c&d) |
| 206 | extr.u tmp1=$a,27,5 } // a>>27 | 205 | extr.u tmp1=$a,27,5 } // a>>27 |
| 207 | { .mmi; and tmp4=$c,$d | 206 | { .mmi; and tmp0=tmp0,$b |
| 208 | xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 207 | xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate |
| 209 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 208 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate |
| 210 | };; | 209 | { .mmi; add $e=$e,tmp0 // e+=b&(c^d) |
| 211 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 210 | add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d) |
| 212 | xor tmp2=tmp2,tmp3 // +1 | ||
| 213 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 211 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
| 214 | { .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d) | 212 | { .mmi; xor $Xn=$Xn,tmp3 |
| 215 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 213 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
| 216 | { .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d) | 214 | { .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d) |
| 217 | shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 215 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
| 218 | add $f=$f,tmp1 };; // f+=ROTATE(a,5) | 216 | nop.i 0x0 };; |
| 219 | 217 | ||
| 220 | ___ | 218 | ___ |
| 221 | } | 219 | } |
| @@ -237,7 +235,7 @@ inp=r33; // in1 | |||
| 237 | .align 32 | 235 | .align 32 |
| 238 | sha1_block_data_order: | 236 | sha1_block_data_order: |
| 239 | .prologue | 237 | .prologue |
| 240 | { .mmi; alloc tmp1=ar.pfs,3,15,0,0 | 238 | { .mmi; alloc tmp1=ar.pfs,3,14,0,0 |
| 241 | $ADDP tmp0=4,ctx | 239 | $ADDP tmp0=4,ctx |
| 242 | .save ar.lc,r3 | 240 | .save ar.lc,r3 |
| 243 | mov r3=ar.lc } | 241 | mov r3=ar.lc } |
| @@ -245,8 +243,8 @@ sha1_block_data_order: | |||
| 245 | $ADDP inp=0,inp | 243 | $ADDP inp=0,inp |
| 246 | mov r2=pr };; | 244 | mov r2=pr };; |
| 247 | tmp4=in2; | 245 | tmp4=in2; |
| 248 | tmp5=loc13; | 246 | tmp5=loc12; |
| 249 | tmp6=loc14; | 247 | tmp6=loc13; |
| 250 | .body | 248 | .body |
| 251 | { .mlx; ld4 $h0=[ctx],8 | 249 | { .mlx; ld4 $h0=[ctx],8 |
| 252 | movl $K_00_19=0x5a827999 } | 250 | movl $K_00_19=0x5a827999 } |
| @@ -273,7 +271,7 @@ tmp6=loc14; | |||
| 273 | 271 | ||
| 274 | ___ | 272 | ___ |
| 275 | 273 | ||
| 276 | { my $i,@V=($A,$B,$C,$D,$E,$T); | 274 | { my $i,@V=($A,$B,$C,$D,$E); |
| 277 | 275 | ||
| 278 | for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } | 276 | for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } |
| 279 | for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } | 277 | for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } |
| @@ -281,12 +279,12 @@ ___ | |||
| 281 | for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } | 279 | for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } |
| 282 | for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } | 280 | for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } |
| 283 | 281 | ||
| 284 | (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check | 282 | (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check |
| 285 | } | 283 | } |
| 286 | 284 | ||
| 287 | $code.=<<___; | 285 | $code.=<<___; |
| 288 | { .mmb; add $h0=$h0,$E | 286 | { .mmb; add $h0=$h0,$A |
| 289 | nop.m 0 | 287 | add $h2=$h2,$C |
| 290 | br.ctop.dptk.many .Ldtop };; | 288 | br.ctop.dptk.many .Ldtop };; |
| 291 | .Ldend: | 289 | .Ldend: |
| 292 | { .mmi; add tmp0=4,ctx | 290 | { .mmi; add tmp0=4,ctx |
diff --git a/src/lib/libcrypto/sha/asm/sha1-mips.pl b/src/lib/libcrypto/sha/asm/sha1-mips.pl new file mode 100644 index 0000000000..f1a702f38f --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-mips.pl | |||
| @@ -0,0 +1,354 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # SHA1 block procedure for MIPS. | ||
| 11 | |||
| 12 | # Performance improvement is 30% on unaligned input. The "secret" is | ||
| 13 | # to deploy lwl/lwr pair to load unaligned input. One could have | ||
| 14 | # vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- | ||
| 15 | # compatible subroutine. There is room for minor optimization on | ||
| 16 | # little-endian platforms... | ||
| 17 | |||
| 18 | ###################################################################### | ||
| 19 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
| 20 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
| 21 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
| 22 | # manner. Therefore let's stick to NUBI register layout: | ||
| 23 | # | ||
| 24 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
| 25 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 26 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
| 27 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
| 28 | # | ||
| 29 | # The return value is placed in $a0. Following coding rules facilitate | ||
| 30 | # interoperability: | ||
| 31 | # | ||
| 32 | # - never ever touch $tp, "thread pointer", former $gp; | ||
| 33 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
| 34 | # old code]; | ||
| 35 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
| 36 | # | ||
| 37 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
| 38 | # | ||
| 39 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
| 40 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 41 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
| 42 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
| 43 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
| 44 | # | ||
| 45 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
| 46 | |||
| 47 | if ($flavour =~ /64|n32/i) { | ||
| 48 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
| 49 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
| 50 | $REG_S="sd"; | ||
| 51 | $REG_L="ld"; | ||
| 52 | $PTR_SLL="dsll"; # incidentally works even on n32 | ||
| 53 | $SZREG=8; | ||
| 54 | } else { | ||
| 55 | $PTR_ADD="add"; | ||
| 56 | $PTR_SUB="sub"; | ||
| 57 | $REG_S="sw"; | ||
| 58 | $REG_L="lw"; | ||
| 59 | $PTR_SLL="sll"; | ||
| 60 | $SZREG=4; | ||
| 61 | } | ||
| 62 | # | ||
| 63 | # <appro@openssl.org> | ||
| 64 | # | ||
| 65 | ###################################################################### | ||
| 66 | |||
| 67 | $big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; | ||
| 68 | |||
| 69 | for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } | ||
| 70 | open STDOUT,">$output"; | ||
| 71 | |||
| 72 | if (!defined($big_endian)) | ||
| 73 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 74 | |||
| 75 | # offsets of the Most and Least Significant Bytes | ||
| 76 | $MSB=$big_endian?0:3; | ||
| 77 | $LSB=3&~$MSB; | ||
| 78 | |||
| 79 | @X=map("\$$_",(8..23)); # a4-a7,s0-s11 | ||
| 80 | |||
| 81 | $ctx=$a0; | ||
| 82 | $inp=$a1; | ||
| 83 | $num=$a2; | ||
| 84 | $A="\$1"; | ||
| 85 | $B="\$2"; | ||
| 86 | $C="\$3"; | ||
| 87 | $D="\$7"; | ||
| 88 | $E="\$24"; @V=($A,$B,$C,$D,$E); | ||
| 89 | $t0="\$25"; | ||
| 90 | $t1=$num; # $num is offloaded to stack | ||
| 91 | $t2="\$30"; # fp | ||
| 92 | $K="\$31"; # ra | ||
| 93 | |||
| 94 | sub BODY_00_14 { | ||
| 95 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 96 | my $j=$i+1; | ||
| 97 | $code.=<<___ if (!$big_endian); | ||
| 98 | srl $t0,@X[$i],24 # byte swap($i) | ||
| 99 | srl $t1,@X[$i],8 | ||
| 100 | andi $t2,@X[$i],0xFF00 | ||
| 101 | sll @X[$i],@X[$i],24 | ||
| 102 | andi $t1,0xFF00 | ||
| 103 | sll $t2,$t2,8 | ||
| 104 | or @X[$i],$t0 | ||
| 105 | or $t1,$t2 | ||
| 106 | or @X[$i],$t1 | ||
| 107 | ___ | ||
| 108 | $code.=<<___; | ||
| 109 | lwl @X[$j],$j*4+$MSB($inp) | ||
| 110 | sll $t0,$a,5 # $i | ||
| 111 | addu $e,$K | ||
| 112 | lwr @X[$j],$j*4+$LSB($inp) | ||
| 113 | srl $t1,$a,27 | ||
| 114 | addu $e,$t0 | ||
| 115 | xor $t0,$c,$d | ||
| 116 | addu $e,$t1 | ||
| 117 | sll $t2,$b,30 | ||
| 118 | and $t0,$b | ||
| 119 | srl $b,$b,2 | ||
| 120 | xor $t0,$d | ||
| 121 | addu $e,@X[$i] | ||
| 122 | or $b,$t2 | ||
| 123 | addu $e,$t0 | ||
| 124 | ___ | ||
| 125 | } | ||
| 126 | |||
| 127 | sub BODY_15_19 { | ||
| 128 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 129 | my $j=$i+1; | ||
| 130 | |||
| 131 | $code.=<<___ if (!$big_endian && $i==15); | ||
| 132 | srl $t0,@X[$i],24 # byte swap($i) | ||
| 133 | srl $t1,@X[$i],8 | ||
| 134 | andi $t2,@X[$i],0xFF00 | ||
| 135 | sll @X[$i],@X[$i],24 | ||
| 136 | andi $t1,0xFF00 | ||
| 137 | sll $t2,$t2,8 | ||
| 138 | or @X[$i],$t0 | ||
| 139 | or @X[$i],$t1 | ||
| 140 | or @X[$i],$t2 | ||
| 141 | ___ | ||
| 142 | $code.=<<___; | ||
| 143 | xor @X[$j%16],@X[($j+2)%16] | ||
| 144 | sll $t0,$a,5 # $i | ||
| 145 | addu $e,$K | ||
| 146 | srl $t1,$a,27 | ||
| 147 | addu $e,$t0 | ||
| 148 | xor @X[$j%16],@X[($j+8)%16] | ||
| 149 | xor $t0,$c,$d | ||
| 150 | addu $e,$t1 | ||
| 151 | xor @X[$j%16],@X[($j+13)%16] | ||
| 152 | sll $t2,$b,30 | ||
| 153 | and $t0,$b | ||
| 154 | srl $t1,@X[$j%16],31 | ||
| 155 | addu @X[$j%16],@X[$j%16] | ||
| 156 | srl $b,$b,2 | ||
| 157 | xor $t0,$d | ||
| 158 | or @X[$j%16],$t1 | ||
| 159 | addu $e,@X[$i%16] | ||
| 160 | or $b,$t2 | ||
| 161 | addu $e,$t0 | ||
| 162 | ___ | ||
| 163 | } | ||
| 164 | |||
| 165 | sub BODY_20_39 { | ||
| 166 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 167 | my $j=$i+1; | ||
| 168 | $code.=<<___ if ($i<79); | ||
| 169 | xor @X[$j%16],@X[($j+2)%16] | ||
| 170 | sll $t0,$a,5 # $i | ||
| 171 | addu $e,$K | ||
| 172 | srl $t1,$a,27 | ||
| 173 | addu $e,$t0 | ||
| 174 | xor @X[$j%16],@X[($j+8)%16] | ||
| 175 | xor $t0,$c,$d | ||
| 176 | addu $e,$t1 | ||
| 177 | xor @X[$j%16],@X[($j+13)%16] | ||
| 178 | sll $t2,$b,30 | ||
| 179 | xor $t0,$b | ||
| 180 | srl $t1,@X[$j%16],31 | ||
| 181 | addu @X[$j%16],@X[$j%16] | ||
| 182 | srl $b,$b,2 | ||
| 183 | addu $e,@X[$i%16] | ||
| 184 | or @X[$j%16],$t1 | ||
| 185 | or $b,$t2 | ||
| 186 | addu $e,$t0 | ||
| 187 | ___ | ||
| 188 | $code.=<<___ if ($i==79); | ||
| 189 | lw @X[0],0($ctx) | ||
| 190 | sll $t0,$a,5 # $i | ||
| 191 | addu $e,$K | ||
| 192 | lw @X[1],4($ctx) | ||
| 193 | srl $t1,$a,27 | ||
| 194 | addu $e,$t0 | ||
| 195 | lw @X[2],8($ctx) | ||
| 196 | xor $t0,$c,$d | ||
| 197 | addu $e,$t1 | ||
| 198 | lw @X[3],12($ctx) | ||
| 199 | sll $t2,$b,30 | ||
| 200 | xor $t0,$b | ||
| 201 | lw @X[4],16($ctx) | ||
| 202 | srl $b,$b,2 | ||
| 203 | addu $e,@X[$i%16] | ||
| 204 | or $b,$t2 | ||
| 205 | addu $e,$t0 | ||
| 206 | ___ | ||
| 207 | } | ||
| 208 | |||
| 209 | sub BODY_40_59 { | ||
| 210 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 211 | my $j=$i+1; | ||
| 212 | $code.=<<___ if ($i<79); | ||
| 213 | xor @X[$j%16],@X[($j+2)%16] | ||
| 214 | sll $t0,$a,5 # $i | ||
| 215 | addu $e,$K | ||
| 216 | srl $t1,$a,27 | ||
| 217 | addu $e,$t0 | ||
| 218 | xor @X[$j%16],@X[($j+8)%16] | ||
| 219 | and $t0,$c,$d | ||
| 220 | addu $e,$t1 | ||
| 221 | xor @X[$j%16],@X[($j+13)%16] | ||
| 222 | sll $t2,$b,30 | ||
| 223 | addu $e,$t0 | ||
| 224 | srl $t1,@X[$j%16],31 | ||
| 225 | xor $t0,$c,$d | ||
| 226 | addu @X[$j%16],@X[$j%16] | ||
| 227 | and $t0,$b | ||
| 228 | srl $b,$b,2 | ||
| 229 | or @X[$j%16],$t1 | ||
| 230 | addu $e,@X[$i%16] | ||
| 231 | or $b,$t2 | ||
| 232 | addu $e,$t0 | ||
| 233 | ___ | ||
| 234 | } | ||
| 235 | |||
| 236 | $FRAMESIZE=16; # large enough to accomodate NUBI saved registers | ||
| 237 | $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; | ||
| 238 | |||
| 239 | $code=<<___; | ||
| 240 | #ifdef OPENSSL_FIPSCANISTER | ||
| 241 | # include <openssl/fipssyms.h> | ||
| 242 | #endif | ||
| 243 | |||
| 244 | .text | ||
| 245 | |||
| 246 | .set noat | ||
| 247 | .set noreorder | ||
| 248 | .align 5 | ||
| 249 | .globl sha1_block_data_order | ||
| 250 | .ent sha1_block_data_order | ||
| 251 | sha1_block_data_order: | ||
| 252 | .frame $sp,$FRAMESIZE*$SZREG,$ra | ||
| 253 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 254 | .set noreorder | ||
| 255 | $PTR_SUB $sp,$FRAMESIZE*$SZREG | ||
| 256 | $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp) | ||
| 257 | $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp) | ||
| 258 | $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp) | ||
| 259 | $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp) | ||
| 260 | $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp) | ||
| 261 | $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp) | ||
| 262 | $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp) | ||
| 263 | $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp) | ||
| 264 | $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp) | ||
| 265 | $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp) | ||
| 266 | ___ | ||
| 267 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 268 | $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp) | ||
| 269 | $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp) | ||
| 270 | $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp) | ||
| 271 | $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp) | ||
| 272 | $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp) | ||
| 273 | ___ | ||
| 274 | $code.=<<___; | ||
| 275 | $PTR_SLL $num,6 | ||
| 276 | $PTR_ADD $num,$inp | ||
| 277 | $REG_S $num,0($sp) | ||
| 278 | lw $A,0($ctx) | ||
| 279 | lw $B,4($ctx) | ||
| 280 | lw $C,8($ctx) | ||
| 281 | lw $D,12($ctx) | ||
| 282 | b .Loop | ||
| 283 | lw $E,16($ctx) | ||
| 284 | .align 4 | ||
| 285 | .Loop: | ||
| 286 | .set reorder | ||
| 287 | lwl @X[0],$MSB($inp) | ||
| 288 | lui $K,0x5a82 | ||
| 289 | lwr @X[0],$LSB($inp) | ||
| 290 | ori $K,0x7999 # K_00_19 | ||
| 291 | ___ | ||
| 292 | for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); } | ||
| 293 | for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); } | ||
| 294 | $code.=<<___; | ||
| 295 | lui $K,0x6ed9 | ||
| 296 | ori $K,0xeba1 # K_20_39 | ||
| 297 | ___ | ||
| 298 | for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 299 | $code.=<<___; | ||
| 300 | lui $K,0x8f1b | ||
| 301 | ori $K,0xbcdc # K_40_59 | ||
| 302 | ___ | ||
| 303 | for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | ||
| 304 | $code.=<<___; | ||
| 305 | lui $K,0xca62 | ||
| 306 | ori $K,0xc1d6 # K_60_79 | ||
| 307 | ___ | ||
| 308 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 309 | $code.=<<___; | ||
| 310 | $PTR_ADD $inp,64 | ||
| 311 | $REG_L $num,0($sp) | ||
| 312 | |||
| 313 | addu $A,$X[0] | ||
| 314 | addu $B,$X[1] | ||
| 315 | sw $A,0($ctx) | ||
| 316 | addu $C,$X[2] | ||
| 317 | addu $D,$X[3] | ||
| 318 | sw $B,4($ctx) | ||
| 319 | addu $E,$X[4] | ||
| 320 | sw $C,8($ctx) | ||
| 321 | sw $D,12($ctx) | ||
| 322 | sw $E,16($ctx) | ||
| 323 | .set noreorder | ||
| 324 | bne $inp,$num,.Loop | ||
| 325 | nop | ||
| 326 | |||
| 327 | .set noreorder | ||
| 328 | $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp) | ||
| 329 | $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp) | ||
| 330 | $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp) | ||
| 331 | $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp) | ||
| 332 | $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp) | ||
| 333 | $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp) | ||
| 334 | $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp) | ||
| 335 | $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp) | ||
| 336 | $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp) | ||
| 337 | $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp) | ||
| 338 | ___ | ||
| 339 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 340 | $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp) | ||
| 341 | $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp) | ||
| 342 | $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp) | ||
| 343 | $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp) | ||
| 344 | $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp) | ||
| 345 | ___ | ||
| 346 | $code.=<<___; | ||
| 347 | jr $ra | ||
| 348 | $PTR_ADD $sp,$FRAMESIZE*$SZREG | ||
| 349 | .end sha1_block_data_order | ||
| 350 | .rdata | ||
| 351 | .asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 352 | ___ | ||
| 353 | print $code; | ||
| 354 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-parisc.pl b/src/lib/libcrypto/sha/asm/sha1-parisc.pl new file mode 100644 index 0000000000..6d7bf495b2 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-parisc.pl | |||
| @@ -0,0 +1,259 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # SHA1 block procedure for PA-RISC. | ||
| 11 | |||
| 12 | # June 2009. | ||
| 13 | # | ||
| 14 | # On PA-7100LC performance is >30% better than gcc 3.2 generated code | ||
| 15 | # for aligned input and >50% better for unaligned. Compared to vendor | ||
| 16 | # compiler on PA-8600 it's almost 60% faster in 64-bit build and just | ||
| 17 | # few percent faster in 32-bit one (this for aligned input, data for | ||
| 18 | # unaligned input is not available). | ||
| 19 | # | ||
| 20 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 21 | |||
| 22 | $flavour = shift; | ||
| 23 | $output = shift; | ||
| 24 | open STDOUT,">$output"; | ||
| 25 | |||
| 26 | if ($flavour =~ /64/) { | ||
| 27 | $LEVEL ="2.0W"; | ||
| 28 | $SIZE_T =8; | ||
| 29 | $FRAME_MARKER =80; | ||
| 30 | $SAVED_RP =16; | ||
| 31 | $PUSH ="std"; | ||
| 32 | $PUSHMA ="std,ma"; | ||
| 33 | $POP ="ldd"; | ||
| 34 | $POPMB ="ldd,mb"; | ||
| 35 | } else { | ||
| 36 | $LEVEL ="1.0"; | ||
| 37 | $SIZE_T =4; | ||
| 38 | $FRAME_MARKER =48; | ||
| 39 | $SAVED_RP =20; | ||
| 40 | $PUSH ="stw"; | ||
| 41 | $PUSHMA ="stwm"; | ||
| 42 | $POP ="ldw"; | ||
| 43 | $POPMB ="ldwm"; | ||
| 44 | } | ||
| 45 | |||
| 46 | $FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker | ||
| 47 | # [+ argument transfer] | ||
| 48 | $ctx="%r26"; # arg0 | ||
| 49 | $inp="%r25"; # arg1 | ||
| 50 | $num="%r24"; # arg2 | ||
| 51 | |||
| 52 | $t0="%r28"; | ||
| 53 | $t1="%r29"; | ||
| 54 | $K="%r31"; | ||
| 55 | |||
| 56 | @X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", | ||
| 57 | "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0); | ||
| 58 | |||
| 59 | @V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23"); | ||
| 60 | |||
| 61 | sub BODY_00_19 { | ||
| 62 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 63 | my $j=$i+1; | ||
| 64 | $code.=<<___ if ($i<15); | ||
| 65 | addl $K,$e,$e ; $i | ||
| 66 | shd $a,$a,27,$t1 | ||
| 67 | addl @X[$i],$e,$e | ||
| 68 | and $c,$b,$t0 | ||
| 69 | addl $t1,$e,$e | ||
| 70 | andcm $d,$b,$t1 | ||
| 71 | shd $b,$b,2,$b | ||
| 72 | or $t1,$t0,$t0 | ||
| 73 | addl $t0,$e,$e | ||
| 74 | ___ | ||
| 75 | $code.=<<___ if ($i>=15); # with forward Xupdate | ||
| 76 | addl $K,$e,$e ; $i | ||
| 77 | shd $a,$a,27,$t1 | ||
| 78 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
| 79 | addl @X[$i%16],$e,$e | ||
| 80 | and $c,$b,$t0 | ||
| 81 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 82 | addl $t1,$e,$e | ||
| 83 | andcm $d,$b,$t1 | ||
| 84 | shd $b,$b,2,$b | ||
| 85 | or $t1,$t0,$t0 | ||
| 86 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 87 | add $t0,$e,$e | ||
| 88 | shd @X[$j%16],@X[$j%16],31,@X[$j%16] | ||
| 89 | ___ | ||
| 90 | } | ||
| 91 | |||
| 92 | sub BODY_20_39 { | ||
| 93 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 94 | my $j=$i+1; | ||
| 95 | $code.=<<___ if ($i<79); | ||
| 96 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i | ||
| 97 | addl $K,$e,$e | ||
| 98 | shd $a,$a,27,$t1 | ||
| 99 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 100 | addl @X[$i%16],$e,$e | ||
| 101 | xor $b,$c,$t0 | ||
| 102 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 103 | addl $t1,$e,$e | ||
| 104 | shd $b,$b,2,$b | ||
| 105 | xor $d,$t0,$t0 | ||
| 106 | shd @X[$j%16],@X[$j%16],31,@X[$j%16] | ||
| 107 | addl $t0,$e,$e | ||
| 108 | ___ | ||
| 109 | $code.=<<___ if ($i==79); # with context load | ||
| 110 | ldw 0($ctx),@X[0] ; $i | ||
| 111 | addl $K,$e,$e | ||
| 112 | shd $a,$a,27,$t1 | ||
| 113 | ldw 4($ctx),@X[1] | ||
| 114 | addl @X[$i%16],$e,$e | ||
| 115 | xor $b,$c,$t0 | ||
| 116 | ldw 8($ctx),@X[2] | ||
| 117 | addl $t1,$e,$e | ||
| 118 | shd $b,$b,2,$b | ||
| 119 | xor $d,$t0,$t0 | ||
| 120 | ldw 12($ctx),@X[3] | ||
| 121 | addl $t0,$e,$e | ||
| 122 | ldw 16($ctx),@X[4] | ||
| 123 | ___ | ||
| 124 | } | ||
| 125 | |||
| 126 | sub BODY_40_59 { | ||
| 127 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 128 | my $j=$i+1; | ||
| 129 | $code.=<<___; | ||
| 130 | shd $a,$a,27,$t1 ; $i | ||
| 131 | addl $K,$e,$e | ||
| 132 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
| 133 | xor $d,$c,$t0 | ||
| 134 | addl @X[$i%16],$e,$e | ||
| 135 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
| 136 | and $b,$t0,$t0 | ||
| 137 | addl $t1,$e,$e | ||
| 138 | shd $b,$b,2,$b | ||
| 139 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
| 140 | addl $t0,$e,$e | ||
| 141 | and $d,$c,$t1 | ||
| 142 | shd @X[$j%16],@X[$j%16],31,@X[$j%16] | ||
| 143 | addl $t1,$e,$e | ||
| 144 | ___ | ||
| 145 | } | ||
| 146 | |||
| 147 | $code=<<___; | ||
| 148 | .LEVEL $LEVEL | ||
| 149 | .SPACE \$TEXT\$ | ||
| 150 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 151 | |||
| 152 | .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 153 | sha1_block_data_order | ||
| 154 | .PROC | ||
| 155 | .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16 | ||
| 156 | .ENTRY | ||
| 157 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 158 | $PUSHMA %r3,$FRAME(%sp) | ||
| 159 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 160 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 161 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 162 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 163 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 164 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 165 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 166 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 167 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
| 168 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
| 169 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
| 170 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
| 171 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
| 172 | |||
| 173 | ldw 0($ctx),$A | ||
| 174 | ldw 4($ctx),$B | ||
| 175 | ldw 8($ctx),$C | ||
| 176 | ldw 12($ctx),$D | ||
| 177 | ldw 16($ctx),$E | ||
| 178 | |||
| 179 | extru $inp,31,2,$t0 ; t0=inp&3; | ||
| 180 | sh3addl $t0,%r0,$t0 ; t0*=8; | ||
| 181 | subi 32,$t0,$t0 ; t0=32-t0; | ||
| 182 | mtctl $t0,%cr11 ; %sar=t0; | ||
| 183 | |||
| 184 | L\$oop | ||
| 185 | ldi 3,$t0 | ||
| 186 | andcm $inp,$t0,$t0 ; 64-bit neutral | ||
| 187 | ___ | ||
| 188 | for ($i=0;$i<15;$i++) { # load input block | ||
| 189 | $code.="\tldw `4*$i`($t0),@X[$i]\n"; } | ||
| 190 | $code.=<<___; | ||
| 191 | cmpb,*= $inp,$t0,L\$aligned | ||
| 192 | ldw 60($t0),@X[15] | ||
| 193 | ldw 64($t0),@X[16] | ||
| 194 | ___ | ||
| 195 | for ($i=0;$i<16;$i++) { # align input | ||
| 196 | $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; } | ||
| 197 | $code.=<<___; | ||
| 198 | L\$aligned | ||
| 199 | ldil L'0x5a827000,$K ; K_00_19 | ||
| 200 | ldo 0x999($K),$K | ||
| 201 | ___ | ||
| 202 | for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | ||
| 203 | $code.=<<___; | ||
| 204 | ldil L'0x6ed9e000,$K ; K_20_39 | ||
| 205 | ldo 0xba1($K),$K | ||
| 206 | ___ | ||
| 207 | |||
| 208 | for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 209 | $code.=<<___; | ||
| 210 | ldil L'0x8f1bb000,$K ; K_40_59 | ||
| 211 | ldo 0xcdc($K),$K | ||
| 212 | ___ | ||
| 213 | |||
| 214 | for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | ||
| 215 | $code.=<<___; | ||
| 216 | ldil L'0xca62c000,$K ; K_60_79 | ||
| 217 | ldo 0x1d6($K),$K | ||
| 218 | ___ | ||
| 219 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 220 | |||
| 221 | $code.=<<___; | ||
| 222 | addl @X[0],$A,$A | ||
| 223 | addl @X[1],$B,$B | ||
| 224 | addl @X[2],$C,$C | ||
| 225 | addl @X[3],$D,$D | ||
| 226 | addl @X[4],$E,$E | ||
| 227 | stw $A,0($ctx) | ||
| 228 | stw $B,4($ctx) | ||
| 229 | stw $C,8($ctx) | ||
| 230 | stw $D,12($ctx) | ||
| 231 | stw $E,16($ctx) | ||
| 232 | addib,*<> -1,$num,L\$oop | ||
| 233 | ldo 64($inp),$inp | ||
| 234 | |||
| 235 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 236 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 237 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 238 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 239 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 240 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 241 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 242 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 243 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 244 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
| 245 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
| 246 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
| 247 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
| 248 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
| 249 | bv (%r2) | ||
| 250 | .EXIT | ||
| 251 | $POPMB -$FRAME(%sp),%r3 | ||
| 252 | .PROCEND | ||
| 253 | .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 254 | ___ | ||
| 255 | |||
| 256 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 257 | $code =~ s/,\*/,/gm if ($SIZE_T==4); | ||
| 258 | print $code; | ||
| 259 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl index dcd0fcdfcf..2140dd2f8d 100755 --- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl +++ b/src/lib/libcrypto/sha/asm/sha1-ppc.pl | |||
| @@ -24,12 +24,14 @@ $flavour = shift; | |||
| 24 | 24 | ||
| 25 | if ($flavour =~ /64/) { | 25 | if ($flavour =~ /64/) { |
| 26 | $SIZE_T =8; | 26 | $SIZE_T =8; |
| 27 | $LRSAVE =2*$SIZE_T; | ||
| 27 | $UCMP ="cmpld"; | 28 | $UCMP ="cmpld"; |
| 28 | $STU ="stdu"; | 29 | $STU ="stdu"; |
| 29 | $POP ="ld"; | 30 | $POP ="ld"; |
| 30 | $PUSH ="std"; | 31 | $PUSH ="std"; |
| 31 | } elsif ($flavour =~ /32/) { | 32 | } elsif ($flavour =~ /32/) { |
| 32 | $SIZE_T =4; | 33 | $SIZE_T =4; |
| 34 | $LRSAVE =$SIZE_T; | ||
| 33 | $UCMP ="cmplw"; | 35 | $UCMP ="cmplw"; |
| 34 | $STU ="stwu"; | 36 | $STU ="stwu"; |
| 35 | $POP ="lwz"; | 37 | $POP ="lwz"; |
| @@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl"; | |||
| 43 | 45 | ||
| 44 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | 46 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
| 45 | 47 | ||
| 46 | $FRAME=24*$SIZE_T; | 48 | $FRAME=24*$SIZE_T+64; |
| 49 | $LOCALS=6*$SIZE_T; | ||
| 47 | 50 | ||
| 48 | $K ="r0"; | 51 | $K ="r0"; |
| 49 | $sp ="r1"; | 52 | $sp ="r1"; |
| @@ -162,9 +165,8 @@ $code=<<___; | |||
| 162 | .globl .sha1_block_data_order | 165 | .globl .sha1_block_data_order |
| 163 | .align 4 | 166 | .align 4 |
| 164 | .sha1_block_data_order: | 167 | .sha1_block_data_order: |
| 168 | $STU $sp,-$FRAME($sp) | ||
| 165 | mflr r0 | 169 | mflr r0 |
| 166 | $STU $sp,`-($FRAME+64)`($sp) | ||
| 167 | $PUSH r0,`$FRAME-$SIZE_T*18`($sp) | ||
| 168 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) | 170 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) |
| 169 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) | 171 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) |
| 170 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) | 172 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) |
| @@ -182,6 +184,7 @@ $code=<<___; | |||
| 182 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 184 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 183 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 185 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 184 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 186 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 187 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
| 185 | lwz $A,0($ctx) | 188 | lwz $A,0($ctx) |
| 186 | lwz $B,4($ctx) | 189 | lwz $B,4($ctx) |
| 187 | lwz $C,8($ctx) | 190 | lwz $C,8($ctx) |
| @@ -192,37 +195,14 @@ $code=<<___; | |||
| 192 | Laligned: | 195 | Laligned: |
| 193 | mtctr $num | 196 | mtctr $num |
| 194 | bl Lsha1_block_private | 197 | bl Lsha1_block_private |
| 195 | Ldone: | 198 | b Ldone |
| 196 | $POP r0,`$FRAME-$SIZE_T*18`($sp) | ||
| 197 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
| 198 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
| 199 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
| 200 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
| 201 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
| 202 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
| 203 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
| 204 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
| 205 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
| 206 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
| 207 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
| 208 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
| 209 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
| 210 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
| 211 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
| 212 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
| 213 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
| 214 | mtlr r0 | ||
| 215 | addi $sp,$sp,`$FRAME+64` | ||
| 216 | blr | ||
| 217 | ___ | ||
| 218 | 199 | ||
| 219 | # PowerPC specification allows an implementation to be ill-behaved | 200 | ; PowerPC specification allows an implementation to be ill-behaved |
| 220 | # upon unaligned access which crosses page boundary. "Better safe | 201 | ; upon unaligned access which crosses page boundary. "Better safe |
| 221 | # than sorry" principle makes me treat it specially. But I don't | 202 | ; than sorry" principle makes me treat it specially. But I don't |
| 222 | # look for particular offending word, but rather for 64-byte input | 203 | ; look for particular offending word, but rather for 64-byte input |
| 223 | # block which crosses the boundary. Once found that block is aligned | 204 | ; block which crosses the boundary. Once found that block is aligned |
| 224 | # and hashed separately... | 205 | ; and hashed separately... |
| 225 | $code.=<<___; | ||
| 226 | .align 4 | 206 | .align 4 |
| 227 | Lunaligned: | 207 | Lunaligned: |
| 228 | subfic $t1,$inp,4096 | 208 | subfic $t1,$inp,4096 |
| @@ -237,7 +217,7 @@ Lunaligned: | |||
| 237 | Lcross_page: | 217 | Lcross_page: |
| 238 | li $t1,16 | 218 | li $t1,16 |
| 239 | mtctr $t1 | 219 | mtctr $t1 |
| 240 | addi r20,$sp,$FRAME ; spot below the frame | 220 | addi r20,$sp,$LOCALS ; spot within the frame |
| 241 | Lmemcpy: | 221 | Lmemcpy: |
| 242 | lbz r16,0($inp) | 222 | lbz r16,0($inp) |
| 243 | lbz r17,1($inp) | 223 | lbz r17,1($inp) |
| @@ -251,15 +231,40 @@ Lmemcpy: | |||
| 251 | addi r20,r20,4 | 231 | addi r20,r20,4 |
| 252 | bdnz Lmemcpy | 232 | bdnz Lmemcpy |
| 253 | 233 | ||
| 254 | $PUSH $inp,`$FRAME-$SIZE_T*19`($sp) | 234 | $PUSH $inp,`$FRAME-$SIZE_T*18`($sp) |
| 255 | li $t1,1 | 235 | li $t1,1 |
| 256 | addi $inp,$sp,$FRAME | 236 | addi $inp,$sp,$LOCALS |
| 257 | mtctr $t1 | 237 | mtctr $t1 |
| 258 | bl Lsha1_block_private | 238 | bl Lsha1_block_private |
| 259 | $POP $inp,`$FRAME-$SIZE_T*19`($sp) | 239 | $POP $inp,`$FRAME-$SIZE_T*18`($sp) |
| 260 | addic. $num,$num,-1 | 240 | addic. $num,$num,-1 |
| 261 | bne- Lunaligned | 241 | bne- Lunaligned |
| 262 | b Ldone | 242 | |
| 243 | Ldone: | ||
| 244 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
| 245 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
| 246 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
| 247 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
| 248 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
| 249 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
| 250 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
| 251 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
| 252 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
| 253 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
| 254 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
| 255 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
| 256 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
| 257 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
| 258 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
| 259 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
| 260 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
| 261 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
| 262 | mtlr r0 | ||
| 263 | addi $sp,$sp,$FRAME | ||
| 264 | blr | ||
| 265 | .long 0 | ||
| 266 | .byte 0,12,4,1,0x80,18,3,0 | ||
| 267 | .long 0 | ||
| 263 | ___ | 268 | ___ |
| 264 | 269 | ||
| 265 | # This is private block function, which uses tailored calling | 270 | # This is private block function, which uses tailored calling |
| @@ -309,6 +314,8 @@ $code.=<<___; | |||
| 309 | addi $inp,$inp,`16*4` | 314 | addi $inp,$inp,`16*4` |
| 310 | bdnz- Lsha1_block_private | 315 | bdnz- Lsha1_block_private |
| 311 | blr | 316 | blr |
| 317 | .long 0 | ||
| 318 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 312 | ___ | 319 | ___ |
| 313 | $code.=<<___; | 320 | $code.=<<___; |
| 314 | .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" | 321 | .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" |
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl index 4b17848287..9193dda45e 100644 --- a/src/lib/libcrypto/sha/asm/sha1-s390x.pl +++ b/src/lib/libcrypto/sha/asm/sha1-s390x.pl | |||
| @@ -21,9 +21,28 @@ | |||
| 21 | # instructions to favour dual-issue z10 pipeline. On z10 hardware is | 21 | # instructions to favour dual-issue z10 pipeline. On z10 hardware is |
| 22 | # "only" ~2.3x faster than software. | 22 | # "only" ~2.3x faster than software. |
| 23 | 23 | ||
| 24 | # November 2010. | ||
| 25 | # | ||
| 26 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 27 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 28 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 29 | # application context. The feature is not specific to any particular | ||
| 30 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 31 | # remains z/Architecture specific. | ||
| 32 | |||
| 24 | $kimdfunc=1; # magic function code for kimd instruction | 33 | $kimdfunc=1; # magic function code for kimd instruction |
| 25 | 34 | ||
| 26 | $output=shift; | 35 | $flavour = shift; |
| 36 | |||
| 37 | if ($flavour =~ /3[12]/) { | ||
| 38 | $SIZE_T=4; | ||
| 39 | $g=""; | ||
| 40 | } else { | ||
| 41 | $SIZE_T=8; | ||
| 42 | $g="g"; | ||
| 43 | } | ||
| 44 | |||
| 45 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 27 | open STDOUT,">$output"; | 46 | open STDOUT,">$output"; |
| 28 | 47 | ||
| 29 | $K_00_39="%r0"; $K=$K_00_39; | 48 | $K_00_39="%r0"; $K=$K_00_39; |
| @@ -42,13 +61,14 @@ $t1="%r11"; | |||
| 42 | @X=("%r12","%r13","%r14"); | 61 | @X=("%r12","%r13","%r14"); |
| 43 | $sp="%r15"; | 62 | $sp="%r15"; |
| 44 | 63 | ||
| 45 | $frame=160+16*4; | 64 | $stdframe=16*$SIZE_T+4*8; |
| 65 | $frame=$stdframe+16*4; | ||
| 46 | 66 | ||
| 47 | sub Xupdate { | 67 | sub Xupdate { |
| 48 | my $i=shift; | 68 | my $i=shift; |
| 49 | 69 | ||
| 50 | $code.=<<___ if ($i==15); | 70 | $code.=<<___ if ($i==15); |
| 51 | lg $prefetch,160($sp) ### Xupdate(16) warm-up | 71 | lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up |
| 52 | lr $X[0],$X[2] | 72 | lr $X[0],$X[2] |
| 53 | ___ | 73 | ___ |
| 54 | return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle | 74 | return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle |
| @@ -58,8 +78,8 @@ $code.=<<___ if ($i<16); | |||
| 58 | ___ | 78 | ___ |
| 59 | $code.=<<___ if ($i>=16); | 79 | $code.=<<___ if ($i>=16); |
| 60 | xgr $X[0],$prefetch ### Xupdate($i) | 80 | xgr $X[0],$prefetch ### Xupdate($i) |
| 61 | lg $prefetch,`160+4*(($i+2)%16)`($sp) | 81 | lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp) |
| 62 | xg $X[0],`160+4*(($i+8)%16)`($sp) | 82 | xg $X[0],`$stdframe+4*(($i+8)%16)`($sp) |
| 63 | xgr $X[0],$prefetch | 83 | xgr $X[0],$prefetch |
| 64 | rll $X[0],$X[0],1 | 84 | rll $X[0],$X[0],1 |
| 65 | rllg $X[1],$X[0],32 | 85 | rllg $X[1],$X[0],32 |
| @@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16); | |||
| 68 | lr $X[2],$X[1] # feedback | 88 | lr $X[2],$X[1] # feedback |
| 69 | ___ | 89 | ___ |
| 70 | $code.=<<___ if ($i<=70); | 90 | $code.=<<___ if ($i<=70); |
| 71 | stg $X[0],`160+4*($i%16)`($sp) | 91 | stg $X[0],`$stdframe+4*($i%16)`($sp) |
| 72 | ___ | 92 | ___ |
| 73 | unshift(@X,pop(@X)); | 93 | unshift(@X,pop(@X)); |
| 74 | } | 94 | } |
| @@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc); | |||
| 148 | tmhl %r0,0x4000 # check for message-security assist | 168 | tmhl %r0,0x4000 # check for message-security assist |
| 149 | jz .Lsoftware | 169 | jz .Lsoftware |
| 150 | lghi %r0,0 | 170 | lghi %r0,0 |
| 151 | la %r1,16($sp) | 171 | la %r1,`2*$SIZE_T`($sp) |
| 152 | .long 0xb93e0002 # kimd %r0,%r2 | 172 | .long 0xb93e0002 # kimd %r0,%r2 |
| 153 | lg %r0,16($sp) | 173 | lg %r0,`2*$SIZE_T`($sp) |
| 154 | tmhh %r0,`0x8000>>$kimdfunc` | 174 | tmhh %r0,`0x8000>>$kimdfunc` |
| 155 | jz .Lsoftware | 175 | jz .Lsoftware |
| 156 | lghi %r0,$kimdfunc | 176 | lghi %r0,$kimdfunc |
| @@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc); | |||
| 165 | ___ | 185 | ___ |
| 166 | $code.=<<___; | 186 | $code.=<<___; |
| 167 | lghi %r1,-$frame | 187 | lghi %r1,-$frame |
| 168 | stg $ctx,16($sp) | 188 | st${g} $ctx,`2*$SIZE_T`($sp) |
| 169 | stmg %r6,%r15,48($sp) | 189 | stm${g} %r6,%r15,`6*$SIZE_T`($sp) |
| 170 | lgr %r0,$sp | 190 | lgr %r0,$sp |
| 171 | la $sp,0(%r1,$sp) | 191 | la $sp,0(%r1,$sp) |
| 172 | stg %r0,0($sp) | 192 | st${g} %r0,0($sp) |
| 173 | 193 | ||
| 174 | larl $t0,Ktable | 194 | larl $t0,Ktable |
| 175 | llgf $A,0($ctx) | 195 | llgf $A,0($ctx) |
| @@ -199,7 +219,7 @@ ___ | |||
| 199 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | 219 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
| 200 | $code.=<<___; | 220 | $code.=<<___; |
| 201 | 221 | ||
| 202 | lg $ctx,`$frame+16`($sp) | 222 | l${g} $ctx,`$frame+2*$SIZE_T`($sp) |
| 203 | la $inp,64($inp) | 223 | la $inp,64($inp) |
| 204 | al $A,0($ctx) | 224 | al $A,0($ctx) |
| 205 | al $B,4($ctx) | 225 | al $B,4($ctx) |
| @@ -211,13 +231,13 @@ $code.=<<___; | |||
| 211 | st $C,8($ctx) | 231 | st $C,8($ctx) |
| 212 | st $D,12($ctx) | 232 | st $D,12($ctx) |
| 213 | st $E,16($ctx) | 233 | st $E,16($ctx) |
| 214 | brct $len,.Lloop | 234 | brct${g} $len,.Lloop |
| 215 | 235 | ||
| 216 | lmg %r6,%r15,`$frame+48`($sp) | 236 | lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) |
| 217 | br %r14 | 237 | br %r14 |
| 218 | .size sha1_block_data_order,.-sha1_block_data_order | 238 | .size sha1_block_data_order,.-sha1_block_data_order |
| 219 | .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 239 | .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| 220 | .comm OPENSSL_s390xcap_P,8,8 | 240 | .comm OPENSSL_s390xcap_P,16,8 |
| 221 | ___ | 241 | ___ |
| 222 | 242 | ||
| 223 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 243 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl index 4edc5ea9ad..f27c1e3fb0 100755 --- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl +++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl | |||
| @@ -16,7 +16,7 @@ | |||
| 16 | # There was suggestion to mechanically translate 32-bit code, but I | 16 | # There was suggestion to mechanically translate 32-bit code, but I |
| 17 | # dismissed it, reasoning that x86_64 offers enough register bank | 17 | # dismissed it, reasoning that x86_64 offers enough register bank |
| 18 | # capacity to fully utilize SHA-1 parallelism. Therefore this fresh | 18 | # capacity to fully utilize SHA-1 parallelism. Therefore this fresh |
| 19 | # implementation:-) However! While 64-bit code does performs better | 19 | # implementation:-) However! While 64-bit code does perform better |
| 20 | # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, | 20 | # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, |
| 21 | # x86_64 does offer larger *addressable* bank, but out-of-order core | 21 | # x86_64 does offer larger *addressable* bank, but out-of-order core |
| 22 | # reaches for even more registers through dynamic aliasing, and EM64T | 22 | # reaches for even more registers through dynamic aliasing, and EM64T |
| @@ -29,6 +29,38 @@ | |||
| 29 | # Xeon P4 +65% +0% 9.9 | 29 | # Xeon P4 +65% +0% 9.9 |
| 30 | # Core2 +60% +10% 7.0 | 30 | # Core2 +60% +10% 7.0 |
| 31 | 31 | ||
| 32 | # August 2009. | ||
| 33 | # | ||
| 34 | # The code was revised to minimize code size and to maximize | ||
| 35 | # "distance" between instructions producing input to 'lea' | ||
| 36 | # instruction and the 'lea' instruction itself, which is essential | ||
| 37 | # for Intel Atom core. | ||
| 38 | |||
| 39 | # October 2010. | ||
| 40 | # | ||
| 41 | # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it | ||
| 42 | # is to offload message schedule denoted by Wt in NIST specification, | ||
| 43 | # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module | ||
| 44 | # for background and implementation details. The only difference from | ||
| 45 | # 32-bit code is that 64-bit code doesn't have to spill @X[] elements | ||
| 46 | # to free temporary registers. | ||
| 47 | |||
| 48 | # April 2011. | ||
| 49 | # | ||
| 50 | # Add AVX code path. See sha1-586.pl for further information. | ||
| 51 | |||
| 52 | ###################################################################### | ||
| 53 | # Current performance is summarized in following table. Numbers are | ||
| 54 | # CPU clock cycles spent to process single byte (less is better). | ||
| 55 | # | ||
| 56 | # x86_64 SSSE3 AVX | ||
| 57 | # P4 9.8 - | ||
| 58 | # Opteron 6.6 - | ||
| 59 | # Core2 6.7 6.1/+10% - | ||
| 60 | # Atom 11.0 9.7/+13% - | ||
| 61 | # Westmere 7.1 5.6/+27% - | ||
| 62 | # Sandy Bridge 7.9 6.3/+25% 5.2/+51% | ||
| 63 | |||
| 32 | $flavour = shift; | 64 | $flavour = shift; |
| 33 | $output = shift; | 65 | $output = shift; |
| 34 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | 66 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| @@ -40,6 +72,16 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |||
| 40 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | 72 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 41 | die "can't locate x86_64-xlate.pl"; | 73 | die "can't locate x86_64-xlate.pl"; |
| 42 | 74 | ||
| 75 | $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | ||
| 76 | =~ /GNU assembler version ([2-9]\.[0-9]+)/ && | ||
| 77 | $1>=2.19); | ||
| 78 | $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | ||
| 79 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && | ||
| 80 | $1>=2.09); | ||
| 81 | $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | ||
| 82 | `ml64 2>&1` =~ /Version ([0-9]+)\./ && | ||
| 83 | $1>=10); | ||
| 84 | |||
| 43 | open STDOUT,"| $^X $xlate $flavour $output"; | 85 | open STDOUT,"| $^X $xlate $flavour $output"; |
| 44 | 86 | ||
| 45 | $ctx="%rdi"; # 1st arg | 87 | $ctx="%rdi"; # 1st arg |
| @@ -51,196 +93,994 @@ $ctx="%r8"; | |||
| 51 | $inp="%r9"; | 93 | $inp="%r9"; |
| 52 | $num="%r10"; | 94 | $num="%r10"; |
| 53 | 95 | ||
| 54 | $xi="%eax"; | 96 | $t0="%eax"; |
| 55 | $t0="%ebx"; | 97 | $t1="%ebx"; |
| 56 | $t1="%ecx"; | 98 | $t2="%ecx"; |
| 57 | $A="%edx"; | 99 | @xi=("%edx","%ebp"); |
| 58 | $B="%esi"; | 100 | $A="%esi"; |
| 59 | $C="%edi"; | 101 | $B="%edi"; |
| 60 | $D="%ebp"; | 102 | $C="%r11d"; |
| 61 | $E="%r11d"; | 103 | $D="%r12d"; |
| 62 | $T="%r12d"; | 104 | $E="%r13d"; |
| 63 | |||
| 64 | @V=($A,$B,$C,$D,$E,$T); | ||
| 65 | 105 | ||
| 66 | sub PROLOGUE { | 106 | @V=($A,$B,$C,$D,$E); |
| 67 | my $func=shift; | ||
| 68 | $code.=<<___; | ||
| 69 | .globl $func | ||
| 70 | .type $func,\@function,3 | ||
| 71 | .align 16 | ||
| 72 | $func: | ||
| 73 | push %rbx | ||
| 74 | push %rbp | ||
| 75 | push %r12 | ||
| 76 | mov %rsp,%r11 | ||
| 77 | mov %rdi,$ctx # reassigned argument | ||
| 78 | sub \$`8+16*4`,%rsp | ||
| 79 | mov %rsi,$inp # reassigned argument | ||
| 80 | and \$-64,%rsp | ||
| 81 | mov %rdx,$num # reassigned argument | ||
| 82 | mov %r11,`16*4`(%rsp) | ||
| 83 | .Lprologue: | ||
| 84 | |||
| 85 | mov 0($ctx),$A | ||
| 86 | mov 4($ctx),$B | ||
| 87 | mov 8($ctx),$C | ||
| 88 | mov 12($ctx),$D | ||
| 89 | mov 16($ctx),$E | ||
| 90 | ___ | ||
| 91 | } | ||
| 92 | |||
| 93 | sub EPILOGUE { | ||
| 94 | my $func=shift; | ||
| 95 | $code.=<<___; | ||
| 96 | mov `16*4`(%rsp),%rsi | ||
| 97 | mov (%rsi),%r12 | ||
| 98 | mov 8(%rsi),%rbp | ||
| 99 | mov 16(%rsi),%rbx | ||
| 100 | lea 24(%rsi),%rsp | ||
| 101 | .Lepilogue: | ||
| 102 | ret | ||
| 103 | .size $func,.-$func | ||
| 104 | ___ | ||
| 105 | } | ||
| 106 | 107 | ||
| 107 | sub BODY_00_19 { | 108 | sub BODY_00_19 { |
| 108 | my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; | 109 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 109 | my $j=$i+1; | 110 | my $j=$i+1; |
| 110 | $code.=<<___ if ($i==0); | 111 | $code.=<<___ if ($i==0); |
| 111 | mov `4*$i`($inp),$xi | 112 | mov `4*$i`($inp),$xi[0] |
| 112 | `"bswap $xi" if(!defined($host))` | 113 | bswap $xi[0] |
| 113 | mov $xi,`4*$i`(%rsp) | 114 | mov $xi[0],`4*$i`(%rsp) |
| 114 | ___ | 115 | ___ |
| 115 | $code.=<<___ if ($i<15); | 116 | $code.=<<___ if ($i<15); |
| 116 | lea 0x5a827999($xi,$e),$f | ||
| 117 | mov $c,$t0 | 117 | mov $c,$t0 |
| 118 | mov `4*$j`($inp),$xi | 118 | mov `4*$j`($inp),$xi[1] |
| 119 | mov $a,$e | 119 | mov $a,$t2 |
| 120 | xor $d,$t0 | 120 | xor $d,$t0 |
| 121 | `"bswap $xi" if(!defined($host))` | 121 | bswap $xi[1] |
| 122 | rol \$5,$e | 122 | rol \$5,$t2 |
| 123 | lea 0x5a827999($xi[0],$e),$e | ||
| 123 | and $b,$t0 | 124 | and $b,$t0 |
| 124 | mov $xi,`4*$j`(%rsp) | 125 | mov $xi[1],`4*$j`(%rsp) |
| 125 | add $e,$f | 126 | add $t2,$e |
| 126 | xor $d,$t0 | 127 | xor $d,$t0 |
| 127 | rol \$30,$b | 128 | rol \$30,$b |
| 128 | add $t0,$f | 129 | add $t0,$e |
| 129 | ___ | 130 | ___ |
| 130 | $code.=<<___ if ($i>=15); | 131 | $code.=<<___ if ($i>=15); |
| 131 | lea 0x5a827999($xi,$e),$f | 132 | mov `4*($j%16)`(%rsp),$xi[1] |
| 132 | mov `4*($j%16)`(%rsp),$xi | ||
| 133 | mov $c,$t0 | 133 | mov $c,$t0 |
| 134 | mov $a,$e | 134 | mov $a,$t2 |
| 135 | xor `4*(($j+2)%16)`(%rsp),$xi | 135 | xor `4*(($j+2)%16)`(%rsp),$xi[1] |
| 136 | xor $d,$t0 | 136 | xor $d,$t0 |
| 137 | rol \$5,$e | 137 | rol \$5,$t2 |
| 138 | xor `4*(($j+8)%16)`(%rsp),$xi | 138 | xor `4*(($j+8)%16)`(%rsp),$xi[1] |
| 139 | and $b,$t0 | 139 | and $b,$t0 |
| 140 | add $e,$f | 140 | lea 0x5a827999($xi[0],$e),$e |
| 141 | xor `4*(($j+13)%16)`(%rsp),$xi | 141 | xor `4*(($j+13)%16)`(%rsp),$xi[1] |
| 142 | xor $d,$t0 | 142 | xor $d,$t0 |
| 143 | rol \$1,$xi[1] | ||
| 144 | add $t2,$e | ||
| 143 | rol \$30,$b | 145 | rol \$30,$b |
| 144 | add $t0,$f | 146 | mov $xi[1],`4*($j%16)`(%rsp) |
| 145 | rol \$1,$xi | 147 | add $t0,$e |
| 146 | mov $xi,`4*($j%16)`(%rsp) | ||
| 147 | ___ | 148 | ___ |
| 149 | unshift(@xi,pop(@xi)); | ||
| 148 | } | 150 | } |
| 149 | 151 | ||
| 150 | sub BODY_20_39 { | 152 | sub BODY_20_39 { |
| 151 | my ($i,$a,$b,$c,$d,$e,$f)=@_; | 153 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 152 | my $j=$i+1; | 154 | my $j=$i+1; |
| 153 | my $K=($i<40)?0x6ed9eba1:0xca62c1d6; | 155 | my $K=($i<40)?0x6ed9eba1:0xca62c1d6; |
| 154 | $code.=<<___ if ($i<79); | 156 | $code.=<<___ if ($i<79); |
| 155 | lea $K($xi,$e),$f | 157 | mov `4*($j%16)`(%rsp),$xi[1] |
| 156 | mov `4*($j%16)`(%rsp),$xi | ||
| 157 | mov $c,$t0 | 158 | mov $c,$t0 |
| 158 | mov $a,$e | 159 | mov $a,$t2 |
| 159 | xor `4*(($j+2)%16)`(%rsp),$xi | 160 | xor `4*(($j+2)%16)`(%rsp),$xi[1] |
| 160 | xor $b,$t0 | 161 | xor $b,$t0 |
| 161 | rol \$5,$e | 162 | rol \$5,$t2 |
| 162 | xor `4*(($j+8)%16)`(%rsp),$xi | 163 | lea $K($xi[0],$e),$e |
| 164 | xor `4*(($j+8)%16)`(%rsp),$xi[1] | ||
| 163 | xor $d,$t0 | 165 | xor $d,$t0 |
| 164 | add $e,$f | 166 | add $t2,$e |
| 165 | xor `4*(($j+13)%16)`(%rsp),$xi | 167 | xor `4*(($j+13)%16)`(%rsp),$xi[1] |
| 166 | rol \$30,$b | 168 | rol \$30,$b |
| 167 | add $t0,$f | 169 | add $t0,$e |
| 168 | rol \$1,$xi | 170 | rol \$1,$xi[1] |
| 169 | ___ | 171 | ___ |
| 170 | $code.=<<___ if ($i<76); | 172 | $code.=<<___ if ($i<76); |
| 171 | mov $xi,`4*($j%16)`(%rsp) | 173 | mov $xi[1],`4*($j%16)`(%rsp) |
| 172 | ___ | 174 | ___ |
| 173 | $code.=<<___ if ($i==79); | 175 | $code.=<<___ if ($i==79); |
| 174 | lea $K($xi,$e),$f | ||
| 175 | mov $c,$t0 | 176 | mov $c,$t0 |
| 176 | mov $a,$e | 177 | mov $a,$t2 |
| 177 | xor $b,$t0 | 178 | xor $b,$t0 |
| 178 | rol \$5,$e | 179 | lea $K($xi[0],$e),$e |
| 180 | rol \$5,$t2 | ||
| 179 | xor $d,$t0 | 181 | xor $d,$t0 |
| 180 | add $e,$f | 182 | add $t2,$e |
| 181 | rol \$30,$b | 183 | rol \$30,$b |
| 182 | add $t0,$f | 184 | add $t0,$e |
| 183 | ___ | 185 | ___ |
| 186 | unshift(@xi,pop(@xi)); | ||
| 184 | } | 187 | } |
| 185 | 188 | ||
| 186 | sub BODY_40_59 { | 189 | sub BODY_40_59 { |
| 187 | my ($i,$a,$b,$c,$d,$e,$f)=@_; | 190 | my ($i,$a,$b,$c,$d,$e)=@_; |
| 188 | my $j=$i+1; | 191 | my $j=$i+1; |
| 189 | $code.=<<___; | 192 | $code.=<<___; |
| 190 | lea 0x8f1bbcdc($xi,$e),$f | 193 | mov `4*($j%16)`(%rsp),$xi[1] |
| 191 | mov `4*($j%16)`(%rsp),$xi | 194 | mov $c,$t0 |
| 192 | mov $b,$t0 | 195 | mov $c,$t1 |
| 193 | mov $b,$t1 | 196 | xor `4*(($j+2)%16)`(%rsp),$xi[1] |
| 194 | xor `4*(($j+2)%16)`(%rsp),$xi | 197 | and $d,$t0 |
| 195 | mov $a,$e | 198 | mov $a,$t2 |
| 196 | and $c,$t0 | 199 | xor `4*(($j+8)%16)`(%rsp),$xi[1] |
| 197 | xor `4*(($j+8)%16)`(%rsp),$xi | 200 | xor $d,$t1 |
| 198 | or $c,$t1 | 201 | lea 0x8f1bbcdc($xi[0],$e),$e |
| 199 | rol \$5,$e | 202 | rol \$5,$t2 |
| 200 | xor `4*(($j+13)%16)`(%rsp),$xi | 203 | xor `4*(($j+13)%16)`(%rsp),$xi[1] |
| 201 | and $d,$t1 | 204 | add $t0,$e |
| 202 | add $e,$f | 205 | and $b,$t1 |
| 203 | rol \$1,$xi | 206 | rol \$1,$xi[1] |
| 204 | or $t1,$t0 | 207 | add $t1,$e |
| 205 | rol \$30,$b | 208 | rol \$30,$b |
| 206 | mov $xi,`4*($j%16)`(%rsp) | 209 | mov $xi[1],`4*($j%16)`(%rsp) |
| 207 | add $t0,$f | 210 | add $t2,$e |
| 208 | ___ | 211 | ___ |
| 212 | unshift(@xi,pop(@xi)); | ||
| 209 | } | 213 | } |
| 210 | 214 | ||
| 211 | $code=".text\n"; | 215 | $code.=<<___; |
| 216 | .text | ||
| 217 | .extern OPENSSL_ia32cap_P | ||
| 212 | 218 | ||
| 213 | &PROLOGUE("sha1_block_data_order"); | 219 | .globl sha1_block_data_order |
| 214 | $code.=".align 4\n.Lloop:\n"; | 220 | .type sha1_block_data_order,\@function,3 |
| 221 | .align 16 | ||
| 222 | sha1_block_data_order: | ||
| 223 | mov OPENSSL_ia32cap_P+0(%rip),%r9d | ||
| 224 | mov OPENSSL_ia32cap_P+4(%rip),%r8d | ||
| 225 | test \$`1<<9`,%r8d # check SSSE3 bit | ||
| 226 | jz .Lialu | ||
| 227 | ___ | ||
| 228 | $code.=<<___ if ($avx); | ||
| 229 | and \$`1<<28`,%r8d # mask AVX bit | ||
| 230 | and \$`1<<30`,%r9d # mask "Intel CPU" bit | ||
| 231 | or %r9d,%r8d | ||
| 232 | cmp \$`1<<28|1<<30`,%r8d | ||
| 233 | je _avx_shortcut | ||
| 234 | ___ | ||
| 235 | $code.=<<___; | ||
| 236 | jmp _ssse3_shortcut | ||
| 237 | |||
| 238 | .align 16 | ||
| 239 | .Lialu: | ||
| 240 | push %rbx | ||
| 241 | push %rbp | ||
| 242 | push %r12 | ||
| 243 | push %r13 | ||
| 244 | mov %rsp,%r11 | ||
| 245 | mov %rdi,$ctx # reassigned argument | ||
| 246 | sub \$`8+16*4`,%rsp | ||
| 247 | mov %rsi,$inp # reassigned argument | ||
| 248 | and \$-64,%rsp | ||
| 249 | mov %rdx,$num # reassigned argument | ||
| 250 | mov %r11,`16*4`(%rsp) | ||
| 251 | .Lprologue: | ||
| 252 | |||
| 253 | mov 0($ctx),$A | ||
| 254 | mov 4($ctx),$B | ||
| 255 | mov 8($ctx),$C | ||
| 256 | mov 12($ctx),$D | ||
| 257 | mov 16($ctx),$E | ||
| 258 | jmp .Lloop | ||
| 259 | |||
| 260 | .align 16 | ||
| 261 | .Lloop: | ||
| 262 | ___ | ||
| 215 | for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | 263 | for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } |
| 216 | for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | 264 | for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
| 217 | for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | 265 | for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } |
| 218 | for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | 266 | for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
| 219 | $code.=<<___; | 267 | $code.=<<___; |
| 220 | add 0($ctx),$E | 268 | add 0($ctx),$A |
| 221 | add 4($ctx),$T | 269 | add 4($ctx),$B |
| 222 | add 8($ctx),$A | 270 | add 8($ctx),$C |
| 223 | add 12($ctx),$B | 271 | add 12($ctx),$D |
| 224 | add 16($ctx),$C | 272 | add 16($ctx),$E |
| 225 | mov $E,0($ctx) | 273 | mov $A,0($ctx) |
| 226 | mov $T,4($ctx) | 274 | mov $B,4($ctx) |
| 227 | mov $A,8($ctx) | 275 | mov $C,8($ctx) |
| 228 | mov $B,12($ctx) | 276 | mov $D,12($ctx) |
| 229 | mov $C,16($ctx) | 277 | mov $E,16($ctx) |
| 230 | 278 | ||
| 231 | xchg $E,$A # mov $E,$A | ||
| 232 | xchg $T,$B # mov $T,$B | ||
| 233 | xchg $E,$C # mov $A,$C | ||
| 234 | xchg $T,$D # mov $B,$D | ||
| 235 | # mov $C,$E | ||
| 236 | lea `16*4`($inp),$inp | ||
| 237 | sub \$1,$num | 279 | sub \$1,$num |
| 280 | lea `16*4`($inp),$inp | ||
| 238 | jnz .Lloop | 281 | jnz .Lloop |
| 282 | |||
| 283 | mov `16*4`(%rsp),%rsi | ||
| 284 | mov (%rsi),%r13 | ||
| 285 | mov 8(%rsi),%r12 | ||
| 286 | mov 16(%rsi),%rbp | ||
| 287 | mov 24(%rsi),%rbx | ||
| 288 | lea 32(%rsi),%rsp | ||
| 289 | .Lepilogue: | ||
| 290 | ret | ||
| 291 | .size sha1_block_data_order,.-sha1_block_data_order | ||
| 239 | ___ | 292 | ___ |
| 240 | &EPILOGUE("sha1_block_data_order"); | 293 | {{{ |
| 294 | my $Xi=4; | ||
| 295 | my @X=map("%xmm$_",(4..7,0..3)); | ||
| 296 | my @Tx=map("%xmm$_",(8..10)); | ||
| 297 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
| 298 | my @T=("%esi","%edi"); | ||
| 299 | my $j=0; | ||
| 300 | my $K_XX_XX="%r11"; | ||
| 301 | |||
| 302 | my $_rol=sub { &rol(@_) }; | ||
| 303 | my $_ror=sub { &ror(@_) }; | ||
| 304 | |||
| 305 | $code.=<<___; | ||
| 306 | .type sha1_block_data_order_ssse3,\@function,3 | ||
| 307 | .align 16 | ||
| 308 | sha1_block_data_order_ssse3: | ||
| 309 | _ssse3_shortcut: | ||
| 310 | push %rbx | ||
| 311 | push %rbp | ||
| 312 | push %r12 | ||
| 313 | lea `-64-($win64?5*16:0)`(%rsp),%rsp | ||
| 314 | ___ | ||
| 315 | $code.=<<___ if ($win64); | ||
| 316 | movaps %xmm6,64+0(%rsp) | ||
| 317 | movaps %xmm7,64+16(%rsp) | ||
| 318 | movaps %xmm8,64+32(%rsp) | ||
| 319 | movaps %xmm9,64+48(%rsp) | ||
| 320 | movaps %xmm10,64+64(%rsp) | ||
| 321 | .Lprologue_ssse3: | ||
| 322 | ___ | ||
| 323 | $code.=<<___; | ||
| 324 | mov %rdi,$ctx # reassigned argument | ||
| 325 | mov %rsi,$inp # reassigned argument | ||
| 326 | mov %rdx,$num # reassigned argument | ||
| 327 | |||
| 328 | shl \$6,$num | ||
| 329 | add $inp,$num | ||
| 330 | lea K_XX_XX(%rip),$K_XX_XX | ||
| 331 | |||
| 332 | mov 0($ctx),$A # load context | ||
| 333 | mov 4($ctx),$B | ||
| 334 | mov 8($ctx),$C | ||
| 335 | mov 12($ctx),$D | ||
| 336 | mov $B,@T[0] # magic seed | ||
| 337 | mov 16($ctx),$E | ||
| 338 | |||
| 339 | movdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
| 340 | movdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
| 341 | movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
| 342 | movdqu 16($inp),@X[-3&7] | ||
| 343 | movdqu 32($inp),@X[-2&7] | ||
| 344 | movdqu 48($inp),@X[-1&7] | ||
| 345 | pshufb @X[2],@X[-4&7] # byte swap | ||
| 346 | add \$64,$inp | ||
| 347 | pshufb @X[2],@X[-3&7] | ||
| 348 | pshufb @X[2],@X[-2&7] | ||
| 349 | pshufb @X[2],@X[-1&7] | ||
| 350 | paddd @Tx[1],@X[-4&7] # add K_00_19 | ||
| 351 | paddd @Tx[1],@X[-3&7] | ||
| 352 | paddd @Tx[1],@X[-2&7] | ||
| 353 | movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU | ||
| 354 | psubd @Tx[1],@X[-4&7] # restore X[] | ||
| 355 | movdqa @X[-3&7],16(%rsp) | ||
| 356 | psubd @Tx[1],@X[-3&7] | ||
| 357 | movdqa @X[-2&7],32(%rsp) | ||
| 358 | psubd @Tx[1],@X[-2&7] | ||
| 359 | jmp .Loop_ssse3 | ||
| 360 | ___ | ||
| 361 | |||
| 362 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm | ||
| 363 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; | ||
| 364 | my $arg = pop; | ||
| 365 | $arg = "\$$arg" if ($arg*1 eq $arg); | ||
| 366 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; | ||
| 367 | } | ||
| 368 | |||
| 369 | sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 | ||
| 370 | { use integer; | ||
| 371 | my $body = shift; | ||
| 372 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
| 373 | my ($a,$b,$c,$d,$e); | ||
| 374 | |||
| 375 | &movdqa (@X[0],@X[-3&7]); | ||
| 376 | eval(shift(@insns)); | ||
| 377 | eval(shift(@insns)); | ||
| 378 | &movdqa (@Tx[0],@X[-1&7]); | ||
| 379 | &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
| 380 | eval(shift(@insns)); | ||
| 381 | eval(shift(@insns)); | ||
| 382 | |||
| 383 | &paddd (@Tx[1],@X[-1&7]); | ||
| 384 | eval(shift(@insns)); | ||
| 385 | eval(shift(@insns)); | ||
| 386 | &psrldq (@Tx[0],4); # "X[-3]", 3 dwords | ||
| 387 | eval(shift(@insns)); | ||
| 388 | eval(shift(@insns)); | ||
| 389 | &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
| 390 | eval(shift(@insns)); | ||
| 391 | eval(shift(@insns)); | ||
| 392 | |||
| 393 | &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
| 394 | eval(shift(@insns)); | ||
| 395 | eval(shift(@insns)); | ||
| 396 | eval(shift(@insns)); | ||
| 397 | eval(shift(@insns)); | ||
| 398 | |||
| 399 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
| 400 | eval(shift(@insns)); | ||
| 401 | eval(shift(@insns)); | ||
| 402 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 403 | eval(shift(@insns)); | ||
| 404 | eval(shift(@insns)); | ||
| 405 | |||
| 406 | &movdqa (@Tx[2],@X[0]); | ||
| 407 | &movdqa (@Tx[0],@X[0]); | ||
| 408 | eval(shift(@insns)); | ||
| 409 | eval(shift(@insns)); | ||
| 410 | eval(shift(@insns)); | ||
| 411 | eval(shift(@insns)); | ||
| 412 | |||
| 413 | &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword | ||
| 414 | &paddd (@X[0],@X[0]); | ||
| 415 | eval(shift(@insns)); | ||
| 416 | eval(shift(@insns)); | ||
| 417 | eval(shift(@insns)); | ||
| 418 | eval(shift(@insns)); | ||
| 419 | |||
| 420 | &psrld (@Tx[0],31); | ||
| 421 | eval(shift(@insns)); | ||
| 422 | eval(shift(@insns)); | ||
| 423 | &movdqa (@Tx[1],@Tx[2]); | ||
| 424 | eval(shift(@insns)); | ||
| 425 | eval(shift(@insns)); | ||
| 426 | |||
| 427 | &psrld (@Tx[2],30); | ||
| 428 | &por (@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
| 429 | eval(shift(@insns)); | ||
| 430 | eval(shift(@insns)); | ||
| 431 | eval(shift(@insns)); | ||
| 432 | eval(shift(@insns)); | ||
| 433 | |||
| 434 | &pslld (@Tx[1],2); | ||
| 435 | &pxor (@X[0],@Tx[2]); | ||
| 436 | eval(shift(@insns)); | ||
| 437 | eval(shift(@insns)); | ||
| 438 | &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
| 439 | eval(shift(@insns)); | ||
| 440 | eval(shift(@insns)); | ||
| 441 | |||
| 442 | &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 | ||
| 443 | |||
| 444 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
| 445 | |||
| 446 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 447 | push(@Tx,shift(@Tx)); | ||
| 448 | } | ||
| 449 | |||
| 450 | sub Xupdate_ssse3_32_79() | ||
| 451 | { use integer; | ||
| 452 | my $body = shift; | ||
| 453 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
| 454 | my ($a,$b,$c,$d,$e); | ||
| 455 | |||
| 456 | &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); | ||
| 457 | eval(shift(@insns)); # body_20_39 | ||
| 458 | &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
| 459 | &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" | ||
| 460 | eval(shift(@insns)); | ||
| 461 | eval(shift(@insns)); | ||
| 462 | eval(shift(@insns)); # rol | ||
| 463 | |||
| 464 | &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
| 465 | eval(shift(@insns)); | ||
| 466 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
| 467 | if ($Xi%5) { | ||
| 468 | &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
| 469 | } else { # ... or load next one | ||
| 470 | &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
| 471 | } | ||
| 472 | &paddd (@Tx[1],@X[-1&7]); | ||
| 473 | eval(shift(@insns)); # ror | ||
| 474 | eval(shift(@insns)); | ||
| 475 | |||
| 476 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
| 477 | eval(shift(@insns)); # body_20_39 | ||
| 478 | eval(shift(@insns)); | ||
| 479 | eval(shift(@insns)); | ||
| 480 | eval(shift(@insns)); # rol | ||
| 481 | |||
| 482 | &movdqa (@Tx[0],@X[0]); | ||
| 483 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 484 | eval(shift(@insns)); | ||
| 485 | eval(shift(@insns)); | ||
| 486 | eval(shift(@insns)); # ror | ||
| 487 | eval(shift(@insns)); | ||
| 488 | |||
| 489 | &pslld (@X[0],2); | ||
| 490 | eval(shift(@insns)); # body_20_39 | ||
| 491 | eval(shift(@insns)); | ||
| 492 | &psrld (@Tx[0],30); | ||
| 493 | eval(shift(@insns)); | ||
| 494 | eval(shift(@insns)); # rol | ||
| 495 | eval(shift(@insns)); | ||
| 496 | eval(shift(@insns)); | ||
| 497 | eval(shift(@insns)); # ror | ||
| 498 | eval(shift(@insns)); | ||
| 499 | |||
| 500 | &por (@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
| 501 | eval(shift(@insns)); # body_20_39 | ||
| 502 | eval(shift(@insns)); | ||
| 503 | &movdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
| 504 | eval(shift(@insns)); | ||
| 505 | eval(shift(@insns)); # rol | ||
| 506 | eval(shift(@insns)); | ||
| 507 | eval(shift(@insns)); | ||
| 508 | eval(shift(@insns)); # rol | ||
| 509 | eval(shift(@insns)); | ||
| 510 | |||
| 511 | foreach (@insns) { eval; } # remaining instructions | ||
| 512 | |||
| 513 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 514 | push(@Tx,shift(@Tx)); | ||
| 515 | } | ||
| 516 | |||
| 517 | sub Xuplast_ssse3_80() | ||
| 518 | { use integer; | ||
| 519 | my $body = shift; | ||
| 520 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 521 | my ($a,$b,$c,$d,$e); | ||
| 522 | |||
| 523 | eval(shift(@insns)); | ||
| 524 | &paddd (@Tx[1],@X[-1&7]); | ||
| 525 | eval(shift(@insns)); | ||
| 526 | eval(shift(@insns)); | ||
| 527 | eval(shift(@insns)); | ||
| 528 | eval(shift(@insns)); | ||
| 529 | |||
| 530 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
| 531 | |||
| 532 | foreach (@insns) { eval; } # remaining instructions | ||
| 533 | |||
| 534 | &cmp ($inp,$num); | ||
| 535 | &je (".Ldone_ssse3"); | ||
| 536 | |||
| 537 | unshift(@Tx,pop(@Tx)); | ||
| 538 | |||
| 539 | &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask | ||
| 540 | &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
| 541 | &movdqu (@X[-4&7],"0($inp)"); # load input | ||
| 542 | &movdqu (@X[-3&7],"16($inp)"); | ||
| 543 | &movdqu (@X[-2&7],"32($inp)"); | ||
| 544 | &movdqu (@X[-1&7],"48($inp)"); | ||
| 545 | &pshufb (@X[-4&7],@X[2]); # byte swap | ||
| 546 | &add ($inp,64); | ||
| 547 | |||
| 548 | $Xi=0; | ||
| 549 | } | ||
| 550 | |||
| 551 | sub Xloop_ssse3() | ||
| 552 | { use integer; | ||
| 553 | my $body = shift; | ||
| 554 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 555 | my ($a,$b,$c,$d,$e); | ||
| 556 | |||
| 557 | eval(shift(@insns)); | ||
| 558 | eval(shift(@insns)); | ||
| 559 | &pshufb (@X[($Xi-3)&7],@X[2]); | ||
| 560 | eval(shift(@insns)); | ||
| 561 | eval(shift(@insns)); | ||
| 562 | &paddd (@X[($Xi-4)&7],@Tx[1]); | ||
| 563 | eval(shift(@insns)); | ||
| 564 | eval(shift(@insns)); | ||
| 565 | eval(shift(@insns)); | ||
| 566 | eval(shift(@insns)); | ||
| 567 | &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU | ||
| 568 | eval(shift(@insns)); | ||
| 569 | eval(shift(@insns)); | ||
| 570 | &psubd (@X[($Xi-4)&7],@Tx[1]); | ||
| 571 | |||
| 572 | foreach (@insns) { eval; } | ||
| 573 | $Xi++; | ||
| 574 | } | ||
| 575 | |||
| 576 | sub Xtail_ssse3() | ||
| 577 | { use integer; | ||
| 578 | my $body = shift; | ||
| 579 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 580 | my ($a,$b,$c,$d,$e); | ||
| 581 | |||
| 582 | foreach (@insns) { eval; } | ||
| 583 | } | ||
| 584 | |||
| 585 | sub body_00_19 () { | ||
| 586 | ( | ||
| 587 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 588 | '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer | ||
| 589 | '&xor ($c,$d);', | ||
| 590 | '&mov (@T[1],$a);', # $b in next round | ||
| 591 | '&$_rol ($a,5);', | ||
| 592 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
| 593 | '&xor ($c,$d);', # restore $c | ||
| 594 | '&xor (@T[0],$d);', | ||
| 595 | '&add ($e,$a);', | ||
| 596 | '&$_ror ($b,$j?7:2);', # $b>>>2 | ||
| 597 | '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 598 | ); | ||
| 599 | } | ||
| 600 | |||
| 601 | sub body_20_39 () { | ||
| 602 | ( | ||
| 603 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 604 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
| 605 | '&xor (@T[0],$d);', # ($b^$d) | ||
| 606 | '&mov (@T[1],$a);', # $b in next round | ||
| 607 | '&$_rol ($a,5);', | ||
| 608 | '&xor (@T[0],$c);', # ($b^$d^$c) | ||
| 609 | '&add ($e,$a);', | ||
| 610 | '&$_ror ($b,7);', # $b>>>2 | ||
| 611 | '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 612 | ); | ||
| 613 | } | ||
| 614 | |||
| 615 | sub body_40_59 () { | ||
| 616 | ( | ||
| 617 | '($a,$b,$c,$d,$e)=@V;'. | ||
| 618 | '&mov (@T[1],$c);', | ||
| 619 | '&xor ($c,$d);', | ||
| 620 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
| 621 | '&and (@T[1],$d);', | ||
| 622 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
| 623 | '&$_ror ($b,7);', # $b>>>2 | ||
| 624 | '&add ($e,@T[1]);', | ||
| 625 | '&mov (@T[1],$a);', # $b in next round | ||
| 626 | '&$_rol ($a,5);', | ||
| 627 | '&add ($e,@T[0]);', | ||
| 628 | '&xor ($c,$d);', # restore $c | ||
| 629 | '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
| 630 | ); | ||
| 631 | } | ||
| 241 | $code.=<<___; | 632 | $code.=<<___; |
| 242 | .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 243 | .align 16 | 633 | .align 16 |
| 634 | .Loop_ssse3: | ||
| 635 | ___ | ||
| 636 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 637 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 638 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 639 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
| 640 | &Xupdate_ssse3_32_79(\&body_00_19); | ||
| 641 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 642 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 643 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 644 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 645 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 646 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 647 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 648 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 649 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 650 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
| 651 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
| 652 | &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" | ||
| 653 | |||
| 654 | $saved_j=$j; @saved_V=@V; | ||
| 655 | |||
| 656 | &Xloop_ssse3(\&body_20_39); | ||
| 657 | &Xloop_ssse3(\&body_20_39); | ||
| 658 | &Xloop_ssse3(\&body_20_39); | ||
| 659 | |||
| 660 | $code.=<<___; | ||
| 661 | add 0($ctx),$A # update context | ||
| 662 | add 4($ctx),@T[0] | ||
| 663 | add 8($ctx),$C | ||
| 664 | add 12($ctx),$D | ||
| 665 | mov $A,0($ctx) | ||
| 666 | add 16($ctx),$E | ||
| 667 | mov @T[0],4($ctx) | ||
| 668 | mov @T[0],$B # magic seed | ||
| 669 | mov $C,8($ctx) | ||
| 670 | mov $D,12($ctx) | ||
| 671 | mov $E,16($ctx) | ||
| 672 | jmp .Loop_ssse3 | ||
| 673 | |||
| 674 | .align 16 | ||
| 675 | .Ldone_ssse3: | ||
| 676 | ___ | ||
| 677 | $j=$saved_j; @V=@saved_V; | ||
| 678 | |||
| 679 | &Xtail_ssse3(\&body_20_39); | ||
| 680 | &Xtail_ssse3(\&body_20_39); | ||
| 681 | &Xtail_ssse3(\&body_20_39); | ||
| 682 | |||
| 683 | $code.=<<___; | ||
| 684 | add 0($ctx),$A # update context | ||
| 685 | add 4($ctx),@T[0] | ||
| 686 | add 8($ctx),$C | ||
| 687 | mov $A,0($ctx) | ||
| 688 | add 12($ctx),$D | ||
| 689 | mov @T[0],4($ctx) | ||
| 690 | add 16($ctx),$E | ||
| 691 | mov $C,8($ctx) | ||
| 692 | mov $D,12($ctx) | ||
| 693 | mov $E,16($ctx) | ||
| 694 | ___ | ||
| 695 | $code.=<<___ if ($win64); | ||
| 696 | movaps 64+0(%rsp),%xmm6 | ||
| 697 | movaps 64+16(%rsp),%xmm7 | ||
| 698 | movaps 64+32(%rsp),%xmm8 | ||
| 699 | movaps 64+48(%rsp),%xmm9 | ||
| 700 | movaps 64+64(%rsp),%xmm10 | ||
| 701 | ___ | ||
| 702 | $code.=<<___; | ||
| 703 | lea `64+($win64?5*16:0)`(%rsp),%rsi | ||
| 704 | mov 0(%rsi),%r12 | ||
| 705 | mov 8(%rsi),%rbp | ||
| 706 | mov 16(%rsi),%rbx | ||
| 707 | lea 24(%rsi),%rsp | ||
| 708 | .Lepilogue_ssse3: | ||
| 709 | ret | ||
| 710 | .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 | ||
| 711 | ___ | ||
| 712 | |||
| 713 | if ($avx) { | ||
| 714 | my $Xi=4; | ||
| 715 | my @X=map("%xmm$_",(4..7,0..3)); | ||
| 716 | my @Tx=map("%xmm$_",(8..10)); | ||
| 717 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
| 718 | my @T=("%esi","%edi"); | ||
| 719 | my $j=0; | ||
| 720 | my $K_XX_XX="%r11"; | ||
| 721 | |||
| 722 | my $_rol=sub { &shld(@_[0],@_) }; | ||
| 723 | my $_ror=sub { &shrd(@_[0],@_) }; | ||
| 724 | |||
| 725 | $code.=<<___; | ||
| 726 | .type sha1_block_data_order_avx,\@function,3 | ||
| 727 | .align 16 | ||
| 728 | sha1_block_data_order_avx: | ||
| 729 | _avx_shortcut: | ||
| 730 | push %rbx | ||
| 731 | push %rbp | ||
| 732 | push %r12 | ||
| 733 | lea `-64-($win64?5*16:0)`(%rsp),%rsp | ||
| 734 | ___ | ||
| 735 | $code.=<<___ if ($win64); | ||
| 736 | movaps %xmm6,64+0(%rsp) | ||
| 737 | movaps %xmm7,64+16(%rsp) | ||
| 738 | movaps %xmm8,64+32(%rsp) | ||
| 739 | movaps %xmm9,64+48(%rsp) | ||
| 740 | movaps %xmm10,64+64(%rsp) | ||
| 741 | .Lprologue_avx: | ||
| 742 | ___ | ||
| 743 | $code.=<<___; | ||
| 744 | mov %rdi,$ctx # reassigned argument | ||
| 745 | mov %rsi,$inp # reassigned argument | ||
| 746 | mov %rdx,$num # reassigned argument | ||
| 747 | vzeroall | ||
| 748 | |||
| 749 | shl \$6,$num | ||
| 750 | add $inp,$num | ||
| 751 | lea K_XX_XX(%rip),$K_XX_XX | ||
| 752 | |||
| 753 | mov 0($ctx),$A # load context | ||
| 754 | mov 4($ctx),$B | ||
| 755 | mov 8($ctx),$C | ||
| 756 | mov 12($ctx),$D | ||
| 757 | mov $B,@T[0] # magic seed | ||
| 758 | mov 16($ctx),$E | ||
| 759 | |||
| 760 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
| 761 | vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
| 762 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
| 763 | vmovdqu 16($inp),@X[-3&7] | ||
| 764 | vmovdqu 32($inp),@X[-2&7] | ||
| 765 | vmovdqu 48($inp),@X[-1&7] | ||
| 766 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap | ||
| 767 | add \$64,$inp | ||
| 768 | vpshufb @X[2],@X[-3&7],@X[-3&7] | ||
| 769 | vpshufb @X[2],@X[-2&7],@X[-2&7] | ||
| 770 | vpshufb @X[2],@X[-1&7],@X[-1&7] | ||
| 771 | vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 | ||
| 772 | vpaddd @Tx[1],@X[-3&7],@X[1] | ||
| 773 | vpaddd @Tx[1],@X[-2&7],@X[2] | ||
| 774 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU | ||
| 775 | vmovdqa @X[1],16(%rsp) | ||
| 776 | vmovdqa @X[2],32(%rsp) | ||
| 777 | jmp .Loop_avx | ||
| 778 | ___ | ||
| 779 | |||
| 780 | sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 | ||
| 781 | { use integer; | ||
| 782 | my $body = shift; | ||
| 783 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
| 784 | my ($a,$b,$c,$d,$e); | ||
| 785 | |||
| 786 | eval(shift(@insns)); | ||
| 787 | eval(shift(@insns)); | ||
| 788 | &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
| 789 | eval(shift(@insns)); | ||
| 790 | eval(shift(@insns)); | ||
| 791 | |||
| 792 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 793 | eval(shift(@insns)); | ||
| 794 | eval(shift(@insns)); | ||
| 795 | &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords | ||
| 796 | eval(shift(@insns)); | ||
| 797 | eval(shift(@insns)); | ||
| 798 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
| 799 | eval(shift(@insns)); | ||
| 800 | eval(shift(@insns)); | ||
| 801 | |||
| 802 | &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
| 803 | eval(shift(@insns)); | ||
| 804 | eval(shift(@insns)); | ||
| 805 | eval(shift(@insns)); | ||
| 806 | eval(shift(@insns)); | ||
| 807 | |||
| 808 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
| 809 | eval(shift(@insns)); | ||
| 810 | eval(shift(@insns)); | ||
| 811 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 812 | eval(shift(@insns)); | ||
| 813 | eval(shift(@insns)); | ||
| 814 | |||
| 815 | &vpsrld (@Tx[0],@X[0],31); | ||
| 816 | eval(shift(@insns)); | ||
| 817 | eval(shift(@insns)); | ||
| 818 | eval(shift(@insns)); | ||
| 819 | eval(shift(@insns)); | ||
| 820 | |||
| 821 | &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword | ||
| 822 | &vpaddd (@X[0],@X[0],@X[0]); | ||
| 823 | eval(shift(@insns)); | ||
| 824 | eval(shift(@insns)); | ||
| 825 | eval(shift(@insns)); | ||
| 826 | eval(shift(@insns)); | ||
| 827 | |||
| 828 | &vpsrld (@Tx[1],@Tx[2],30); | ||
| 829 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
| 830 | eval(shift(@insns)); | ||
| 831 | eval(shift(@insns)); | ||
| 832 | eval(shift(@insns)); | ||
| 833 | eval(shift(@insns)); | ||
| 834 | |||
| 835 | &vpslld (@Tx[2],@Tx[2],2); | ||
| 836 | &vpxor (@X[0],@X[0],@Tx[1]); | ||
| 837 | eval(shift(@insns)); | ||
| 838 | eval(shift(@insns)); | ||
| 839 | eval(shift(@insns)); | ||
| 840 | eval(shift(@insns)); | ||
| 841 | |||
| 842 | &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 | ||
| 843 | eval(shift(@insns)); | ||
| 844 | eval(shift(@insns)); | ||
| 845 | &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
| 846 | eval(shift(@insns)); | ||
| 847 | eval(shift(@insns)); | ||
| 848 | |||
| 849 | |||
| 850 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
| 851 | |||
| 852 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 853 | push(@Tx,shift(@Tx)); | ||
| 854 | } | ||
| 855 | |||
| 856 | sub Xupdate_avx_32_79() | ||
| 857 | { use integer; | ||
| 858 | my $body = shift; | ||
| 859 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
| 860 | my ($a,$b,$c,$d,$e); | ||
| 861 | |||
| 862 | &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" | ||
| 863 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
| 864 | eval(shift(@insns)); # body_20_39 | ||
| 865 | eval(shift(@insns)); | ||
| 866 | eval(shift(@insns)); | ||
| 867 | eval(shift(@insns)); # rol | ||
| 868 | |||
| 869 | &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
| 870 | eval(shift(@insns)); | ||
| 871 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
| 872 | if ($Xi%5) { | ||
| 873 | &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
| 874 | } else { # ... or load next one | ||
| 875 | &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
| 876 | } | ||
| 877 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 878 | eval(shift(@insns)); # ror | ||
| 879 | eval(shift(@insns)); | ||
| 880 | |||
| 881 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
| 882 | eval(shift(@insns)); # body_20_39 | ||
| 883 | eval(shift(@insns)); | ||
| 884 | eval(shift(@insns)); | ||
| 885 | eval(shift(@insns)); # rol | ||
| 886 | |||
| 887 | &vpsrld (@Tx[0],@X[0],30); | ||
| 888 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
| 889 | eval(shift(@insns)); | ||
| 890 | eval(shift(@insns)); | ||
| 891 | eval(shift(@insns)); # ror | ||
| 892 | eval(shift(@insns)); | ||
| 893 | |||
| 894 | &vpslld (@X[0],@X[0],2); | ||
| 895 | eval(shift(@insns)); # body_20_39 | ||
| 896 | eval(shift(@insns)); | ||
| 897 | eval(shift(@insns)); | ||
| 898 | eval(shift(@insns)); # rol | ||
| 899 | eval(shift(@insns)); | ||
| 900 | eval(shift(@insns)); | ||
| 901 | eval(shift(@insns)); # ror | ||
| 902 | eval(shift(@insns)); | ||
| 903 | |||
| 904 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
| 905 | eval(shift(@insns)); # body_20_39 | ||
| 906 | eval(shift(@insns)); | ||
| 907 | &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
| 908 | eval(shift(@insns)); | ||
| 909 | eval(shift(@insns)); # rol | ||
| 910 | eval(shift(@insns)); | ||
| 911 | eval(shift(@insns)); | ||
| 912 | eval(shift(@insns)); # rol | ||
| 913 | eval(shift(@insns)); | ||
| 914 | |||
| 915 | foreach (@insns) { eval; } # remaining instructions | ||
| 916 | |||
| 917 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
| 918 | push(@Tx,shift(@Tx)); | ||
| 919 | } | ||
| 920 | |||
| 921 | sub Xuplast_avx_80() | ||
| 922 | { use integer; | ||
| 923 | my $body = shift; | ||
| 924 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 925 | my ($a,$b,$c,$d,$e); | ||
| 926 | |||
| 927 | eval(shift(@insns)); | ||
| 928 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
| 929 | eval(shift(@insns)); | ||
| 930 | eval(shift(@insns)); | ||
| 931 | eval(shift(@insns)); | ||
| 932 | eval(shift(@insns)); | ||
| 933 | |||
| 934 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
| 935 | |||
| 936 | foreach (@insns) { eval; } # remaining instructions | ||
| 937 | |||
| 938 | &cmp ($inp,$num); | ||
| 939 | &je (".Ldone_avx"); | ||
| 940 | |||
| 941 | unshift(@Tx,pop(@Tx)); | ||
| 942 | |||
| 943 | &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask | ||
| 944 | &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
| 945 | &vmovdqu(@X[-4&7],"0($inp)"); # load input | ||
| 946 | &vmovdqu(@X[-3&7],"16($inp)"); | ||
| 947 | &vmovdqu(@X[-2&7],"32($inp)"); | ||
| 948 | &vmovdqu(@X[-1&7],"48($inp)"); | ||
| 949 | &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap | ||
| 950 | &add ($inp,64); | ||
| 951 | |||
| 952 | $Xi=0; | ||
| 953 | } | ||
| 954 | |||
| 955 | sub Xloop_avx() | ||
| 956 | { use integer; | ||
| 957 | my $body = shift; | ||
| 958 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 959 | my ($a,$b,$c,$d,$e); | ||
| 960 | |||
| 961 | eval(shift(@insns)); | ||
| 962 | eval(shift(@insns)); | ||
| 963 | &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); | ||
| 964 | eval(shift(@insns)); | ||
| 965 | eval(shift(@insns)); | ||
| 966 | &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); | ||
| 967 | eval(shift(@insns)); | ||
| 968 | eval(shift(@insns)); | ||
| 969 | eval(shift(@insns)); | ||
| 970 | eval(shift(@insns)); | ||
| 971 | &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU | ||
| 972 | eval(shift(@insns)); | ||
| 973 | eval(shift(@insns)); | ||
| 974 | |||
| 975 | foreach (@insns) { eval; } | ||
| 976 | $Xi++; | ||
| 977 | } | ||
| 978 | |||
| 979 | sub Xtail_avx() | ||
| 980 | { use integer; | ||
| 981 | my $body = shift; | ||
| 982 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
| 983 | my ($a,$b,$c,$d,$e); | ||
| 984 | |||
| 985 | foreach (@insns) { eval; } | ||
| 986 | } | ||
| 987 | |||
| 988 | $code.=<<___; | ||
| 989 | .align 16 | ||
| 990 | .Loop_avx: | ||
| 991 | ___ | ||
| 992 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 993 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 994 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 995 | &Xupdate_avx_16_31(\&body_00_19); | ||
| 996 | &Xupdate_avx_32_79(\&body_00_19); | ||
| 997 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 998 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 999 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 1000 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 1001 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 1002 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 1003 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 1004 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 1005 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 1006 | &Xupdate_avx_32_79(\&body_40_59); | ||
| 1007 | &Xupdate_avx_32_79(\&body_20_39); | ||
| 1008 | &Xuplast_avx_80(\&body_20_39); # can jump to "done" | ||
| 1009 | |||
| 1010 | $saved_j=$j; @saved_V=@V; | ||
| 1011 | |||
| 1012 | &Xloop_avx(\&body_20_39); | ||
| 1013 | &Xloop_avx(\&body_20_39); | ||
| 1014 | &Xloop_avx(\&body_20_39); | ||
| 1015 | |||
| 1016 | $code.=<<___; | ||
| 1017 | add 0($ctx),$A # update context | ||
| 1018 | add 4($ctx),@T[0] | ||
| 1019 | add 8($ctx),$C | ||
| 1020 | add 12($ctx),$D | ||
| 1021 | mov $A,0($ctx) | ||
| 1022 | add 16($ctx),$E | ||
| 1023 | mov @T[0],4($ctx) | ||
| 1024 | mov @T[0],$B # magic seed | ||
| 1025 | mov $C,8($ctx) | ||
| 1026 | mov $D,12($ctx) | ||
| 1027 | mov $E,16($ctx) | ||
| 1028 | jmp .Loop_avx | ||
| 1029 | |||
| 1030 | .align 16 | ||
| 1031 | .Ldone_avx: | ||
| 1032 | ___ | ||
| 1033 | $j=$saved_j; @V=@saved_V; | ||
| 1034 | |||
| 1035 | &Xtail_avx(\&body_20_39); | ||
| 1036 | &Xtail_avx(\&body_20_39); | ||
| 1037 | &Xtail_avx(\&body_20_39); | ||
| 1038 | |||
| 1039 | $code.=<<___; | ||
| 1040 | vzeroall | ||
| 1041 | |||
| 1042 | add 0($ctx),$A # update context | ||
| 1043 | add 4($ctx),@T[0] | ||
| 1044 | add 8($ctx),$C | ||
| 1045 | mov $A,0($ctx) | ||
| 1046 | add 12($ctx),$D | ||
| 1047 | mov @T[0],4($ctx) | ||
| 1048 | add 16($ctx),$E | ||
| 1049 | mov $C,8($ctx) | ||
| 1050 | mov $D,12($ctx) | ||
| 1051 | mov $E,16($ctx) | ||
| 1052 | ___ | ||
| 1053 | $code.=<<___ if ($win64); | ||
| 1054 | movaps 64+0(%rsp),%xmm6 | ||
| 1055 | movaps 64+16(%rsp),%xmm7 | ||
| 1056 | movaps 64+32(%rsp),%xmm8 | ||
| 1057 | movaps 64+48(%rsp),%xmm9 | ||
| 1058 | movaps 64+64(%rsp),%xmm10 | ||
| 1059 | ___ | ||
| 1060 | $code.=<<___; | ||
| 1061 | lea `64+($win64?5*16:0)`(%rsp),%rsi | ||
| 1062 | mov 0(%rsi),%r12 | ||
| 1063 | mov 8(%rsi),%rbp | ||
| 1064 | mov 16(%rsi),%rbx | ||
| 1065 | lea 24(%rsi),%rsp | ||
| 1066 | .Lepilogue_avx: | ||
| 1067 | ret | ||
| 1068 | .size sha1_block_data_order_avx,.-sha1_block_data_order_avx | ||
| 1069 | ___ | ||
| 1070 | } | ||
| 1071 | $code.=<<___; | ||
| 1072 | .align 64 | ||
| 1073 | K_XX_XX: | ||
| 1074 | .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 | ||
| 1075 | .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 | ||
| 1076 | .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 | ||
| 1077 | .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 | ||
| 1078 | .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask | ||
| 1079 | ___ | ||
| 1080 | }}} | ||
| 1081 | $code.=<<___; | ||
| 1082 | .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 1083 | .align 64 | ||
| 244 | ___ | 1084 | ___ |
| 245 | 1085 | ||
| 246 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | 1086 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
| @@ -272,25 +1112,75 @@ se_handler: | |||
| 272 | 1112 | ||
| 273 | lea .Lprologue(%rip),%r10 | 1113 | lea .Lprologue(%rip),%r10 |
| 274 | cmp %r10,%rbx # context->Rip<.Lprologue | 1114 | cmp %r10,%rbx # context->Rip<.Lprologue |
| 275 | jb .Lin_prologue | 1115 | jb .Lcommon_seh_tail |
| 276 | 1116 | ||
| 277 | mov 152($context),%rax # pull context->Rsp | 1117 | mov 152($context),%rax # pull context->Rsp |
| 278 | 1118 | ||
| 279 | lea .Lepilogue(%rip),%r10 | 1119 | lea .Lepilogue(%rip),%r10 |
| 280 | cmp %r10,%rbx # context->Rip>=.Lepilogue | 1120 | cmp %r10,%rbx # context->Rip>=.Lepilogue |
| 281 | jae .Lin_prologue | 1121 | jae .Lcommon_seh_tail |
| 282 | 1122 | ||
| 283 | mov `16*4`(%rax),%rax # pull saved stack pointer | 1123 | mov `16*4`(%rax),%rax # pull saved stack pointer |
| 284 | lea 24(%rax),%rax | 1124 | lea 32(%rax),%rax |
| 285 | 1125 | ||
| 286 | mov -8(%rax),%rbx | 1126 | mov -8(%rax),%rbx |
| 287 | mov -16(%rax),%rbp | 1127 | mov -16(%rax),%rbp |
| 288 | mov -24(%rax),%r12 | 1128 | mov -24(%rax),%r12 |
| 1129 | mov -32(%rax),%r13 | ||
| 289 | mov %rbx,144($context) # restore context->Rbx | 1130 | mov %rbx,144($context) # restore context->Rbx |
| 290 | mov %rbp,160($context) # restore context->Rbp | 1131 | mov %rbp,160($context) # restore context->Rbp |
| 291 | mov %r12,216($context) # restore context->R12 | 1132 | mov %r12,216($context) # restore context->R12 |
| 1133 | mov %r13,224($context) # restore context->R13 | ||
| 1134 | |||
| 1135 | jmp .Lcommon_seh_tail | ||
| 1136 | .size se_handler,.-se_handler | ||
| 292 | 1137 | ||
| 293 | .Lin_prologue: | 1138 | .type ssse3_handler,\@abi-omnipotent |
| 1139 | .align 16 | ||
| 1140 | ssse3_handler: | ||
| 1141 | push %rsi | ||
| 1142 | push %rdi | ||
| 1143 | push %rbx | ||
| 1144 | push %rbp | ||
| 1145 | push %r12 | ||
| 1146 | push %r13 | ||
| 1147 | push %r14 | ||
| 1148 | push %r15 | ||
| 1149 | pushfq | ||
| 1150 | sub \$64,%rsp | ||
| 1151 | |||
| 1152 | mov 120($context),%rax # pull context->Rax | ||
| 1153 | mov 248($context),%rbx # pull context->Rip | ||
| 1154 | |||
| 1155 | mov 8($disp),%rsi # disp->ImageBase | ||
| 1156 | mov 56($disp),%r11 # disp->HandlerData | ||
| 1157 | |||
| 1158 | mov 0(%r11),%r10d # HandlerData[0] | ||
| 1159 | lea (%rsi,%r10),%r10 # prologue label | ||
| 1160 | cmp %r10,%rbx # context->Rip<prologue label | ||
| 1161 | jb .Lcommon_seh_tail | ||
| 1162 | |||
| 1163 | mov 152($context),%rax # pull context->Rsp | ||
| 1164 | |||
| 1165 | mov 4(%r11),%r10d # HandlerData[1] | ||
| 1166 | lea (%rsi,%r10),%r10 # epilogue label | ||
| 1167 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
| 1168 | jae .Lcommon_seh_tail | ||
| 1169 | |||
| 1170 | lea 64(%rax),%rsi | ||
| 1171 | lea 512($context),%rdi # &context.Xmm6 | ||
| 1172 | mov \$10,%ecx | ||
| 1173 | .long 0xa548f3fc # cld; rep movsq | ||
| 1174 | lea `24+64+5*16`(%rax),%rax # adjust stack pointer | ||
| 1175 | |||
| 1176 | mov -8(%rax),%rbx | ||
| 1177 | mov -16(%rax),%rbp | ||
| 1178 | mov -24(%rax),%r12 | ||
| 1179 | mov %rbx,144($context) # restore context->Rbx | ||
| 1180 | mov %rbp,160($context) # restore context->Rbp | ||
| 1181 | mov %r12,216($context) # restore cotnext->R12 | ||
| 1182 | |||
| 1183 | .Lcommon_seh_tail: | ||
| 294 | mov 8(%rax),%rdi | 1184 | mov 8(%rax),%rdi |
| 295 | mov 16(%rax),%rsi | 1185 | mov 16(%rax),%rsi |
| 296 | mov %rax,152($context) # restore context->Rsp | 1186 | mov %rax,152($context) # restore context->Rsp |
| @@ -328,19 +1218,38 @@ se_handler: | |||
| 328 | pop %rdi | 1218 | pop %rdi |
| 329 | pop %rsi | 1219 | pop %rsi |
| 330 | ret | 1220 | ret |
| 331 | .size se_handler,.-se_handler | 1221 | .size ssse3_handler,.-ssse3_handler |
| 332 | 1222 | ||
| 333 | .section .pdata | 1223 | .section .pdata |
| 334 | .align 4 | 1224 | .align 4 |
| 335 | .rva .LSEH_begin_sha1_block_data_order | 1225 | .rva .LSEH_begin_sha1_block_data_order |
| 336 | .rva .LSEH_end_sha1_block_data_order | 1226 | .rva .LSEH_end_sha1_block_data_order |
| 337 | .rva .LSEH_info_sha1_block_data_order | 1227 | .rva .LSEH_info_sha1_block_data_order |
| 338 | 1228 | .rva .LSEH_begin_sha1_block_data_order_ssse3 | |
| 1229 | .rva .LSEH_end_sha1_block_data_order_ssse3 | ||
| 1230 | .rva .LSEH_info_sha1_block_data_order_ssse3 | ||
| 1231 | ___ | ||
| 1232 | $code.=<<___ if ($avx); | ||
| 1233 | .rva .LSEH_begin_sha1_block_data_order_avx | ||
| 1234 | .rva .LSEH_end_sha1_block_data_order_avx | ||
| 1235 | .rva .LSEH_info_sha1_block_data_order_avx | ||
| 1236 | ___ | ||
| 1237 | $code.=<<___; | ||
| 339 | .section .xdata | 1238 | .section .xdata |
| 340 | .align 8 | 1239 | .align 8 |
| 341 | .LSEH_info_sha1_block_data_order: | 1240 | .LSEH_info_sha1_block_data_order: |
| 342 | .byte 9,0,0,0 | 1241 | .byte 9,0,0,0 |
| 343 | .rva se_handler | 1242 | .rva se_handler |
| 1243 | .LSEH_info_sha1_block_data_order_ssse3: | ||
| 1244 | .byte 9,0,0,0 | ||
| 1245 | .rva ssse3_handler | ||
| 1246 | .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] | ||
| 1247 | ___ | ||
| 1248 | $code.=<<___ if ($avx); | ||
| 1249 | .LSEH_info_sha1_block_data_order_avx: | ||
| 1250 | .byte 9,0,0,0 | ||
| 1251 | .rva ssse3_handler | ||
| 1252 | .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] | ||
| 344 | ___ | 1253 | ___ |
| 345 | } | 1254 | } |
| 346 | 1255 | ||
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl index ecc8b69c75..928ec53123 100644 --- a/src/lib/libcrypto/sha/asm/sha256-586.pl +++ b/src/lib/libcrypto/sha/asm/sha256-586.pl | |||
| @@ -14,8 +14,8 @@ | |||
| 14 | # Pentium PIII P4 AMD K8 Core2 | 14 | # Pentium PIII P4 AMD K8 Core2 |
| 15 | # gcc 46 36 41 27 26 | 15 | # gcc 46 36 41 27 26 |
| 16 | # icc 57 33 38 25 23 | 16 | # icc 57 33 38 25 23 |
| 17 | # x86 asm 40 30 35 20 20 | 17 | # x86 asm 40 30 33 20 18 |
| 18 | # x86_64 asm(*) - - 21 15.8 16.5 | 18 | # x86_64 asm(*) - - 21 16 16 |
| 19 | # | 19 | # |
| 20 | # (*) x86_64 assembler performance is presented for reference | 20 | # (*) x86_64 assembler performance is presented for reference |
| 21 | # purposes. | 21 | # purposes. |
| @@ -48,20 +48,19 @@ sub BODY_00_15() { | |||
| 48 | my $in_16_63=shift; | 48 | my $in_16_63=shift; |
| 49 | 49 | ||
| 50 | &mov ("ecx",$E); | 50 | &mov ("ecx",$E); |
| 51 | &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7] | 51 | &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) |
| 52 | &ror ("ecx",6); | 52 | &ror ("ecx",25-11); |
| 53 | &mov ("edi",$E); | ||
| 54 | &ror ("edi",11); | ||
| 55 | &mov ("esi",$Foff); | 53 | &mov ("esi",$Foff); |
| 56 | &xor ("ecx","edi"); | 54 | &xor ("ecx",$E); |
| 57 | &ror ("edi",25-11); | 55 | &ror ("ecx",11-6); |
| 58 | &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0] | 56 | &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0] |
| 59 | &xor ("ecx","edi"); # Sigma1(e) | 57 | &xor ("ecx",$E); |
| 58 | &ror ("ecx",6); # Sigma1(e) | ||
| 60 | &mov ("edi",$Goff); | 59 | &mov ("edi",$Goff); |
| 61 | &add ($T,"ecx"); # T += Sigma1(e) | 60 | &add ($T,"ecx"); # T += Sigma1(e) |
| 62 | &mov ($Eoff,$E); # modulo-scheduled | ||
| 63 | 61 | ||
| 64 | &xor ("esi","edi"); | 62 | &xor ("esi","edi"); |
| 63 | &mov ($Eoff,$E); # modulo-scheduled | ||
| 65 | &mov ("ecx",$A); | 64 | &mov ("ecx",$A); |
| 66 | &and ("esi",$E); | 65 | &and ("esi",$E); |
| 67 | &mov ($E,$Doff); # e becomes d, which is e in next iteration | 66 | &mov ($E,$Doff); # e becomes d, which is e in next iteration |
| @@ -69,14 +68,14 @@ sub BODY_00_15() { | |||
| 69 | &mov ("edi",$A); | 68 | &mov ("edi",$A); |
| 70 | &add ($T,"esi"); # T += Ch(e,f,g) | 69 | &add ($T,"esi"); # T += Ch(e,f,g) |
| 71 | 70 | ||
| 72 | &ror ("ecx",2); | 71 | &ror ("ecx",22-13); |
| 73 | &add ($T,$Hoff); # T += h | 72 | &add ($T,$Hoff); # T += h |
| 74 | &ror ("edi",13); | 73 | &xor ("ecx",$A); |
| 74 | &ror ("ecx",13-2); | ||
| 75 | &mov ("esi",$Boff); | 75 | &mov ("esi",$Boff); |
| 76 | &xor ("ecx","edi"); | 76 | &xor ("ecx",$A); |
| 77 | &ror ("edi",22-13); | 77 | &ror ("ecx",2); # Sigma0(a) |
| 78 | &add ($E,$T); # d += T | 78 | &add ($E,$T); # d += T |
| 79 | &xor ("ecx","edi"); # Sigma0(a) | ||
| 80 | &mov ("edi",$Coff); | 79 | &mov ("edi",$Coff); |
| 81 | 80 | ||
| 82 | &add ($T,"ecx"); # T += Sigma0(a) | 81 | &add ($T,"ecx"); # T += Sigma0(a) |
| @@ -168,23 +167,22 @@ sub BODY_00_15() { | |||
| 168 | &set_label("16_63",16); | 167 | &set_label("16_63",16); |
| 169 | &mov ("esi",$T); | 168 | &mov ("esi",$T); |
| 170 | &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); | 169 | &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); |
| 171 | &shr ($T,3); | ||
| 172 | &ror ("esi",7); | ||
| 173 | &xor ($T,"esi"); | ||
| 174 | &ror ("esi",18-7); | 170 | &ror ("esi",18-7); |
| 175 | &mov ("edi","ecx"); | 171 | &mov ("edi","ecx"); |
| 176 | &xor ($T,"esi"); # T = sigma0(X[-15]) | 172 | &xor ("esi",$T); |
| 173 | &ror ("esi",7); | ||
| 174 | &shr ($T,3); | ||
| 177 | 175 | ||
| 178 | &shr ("ecx",10); | ||
| 179 | &mov ("esi",&DWP(4*(8+15+16),"esp")); | ||
| 180 | &ror ("edi",17); | ||
| 181 | &xor ("ecx","edi"); | ||
| 182 | &ror ("edi",19-17); | 176 | &ror ("edi",19-17); |
| 183 | &add ($T,"esi"); # T += X[-16] | 177 | &xor ($T,"esi"); # T = sigma0(X[-15]) |
| 184 | &xor ("edi","ecx") # sigma1(X[-2]) | 178 | &xor ("edi","ecx"); |
| 179 | &ror ("edi",17); | ||
| 180 | &shr ("ecx",10); | ||
| 181 | &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16] | ||
| 182 | &xor ("edi","ecx"); # sigma1(X[-2]) | ||
| 185 | 183 | ||
| 186 | &add ($T,"edi"); # T += sigma1(X[-2]) | 184 | &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7] |
| 187 | # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1) | 185 | # &add ($T,"edi"); # T += sigma1(X[-2]) |
| 188 | # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] | 186 | # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] |
| 189 | 187 | ||
| 190 | &BODY_00_15(1); | 188 | &BODY_00_15(1); |
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl index 492cb62bc0..9c84e8d93c 100644 --- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl +++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl | |||
| @@ -18,11 +18,16 @@ | |||
| 18 | # Rescheduling for dual-issue pipeline resulted in 22% improvement on | 18 | # Rescheduling for dual-issue pipeline resulted in 22% improvement on |
| 19 | # Cortex A8 core and ~20 cycles per processed byte. | 19 | # Cortex A8 core and ~20 cycles per processed byte. |
| 20 | 20 | ||
| 21 | # February 2011. | ||
| 22 | # | ||
| 23 | # Profiler-assisted and platform-specific optimization resulted in 16% | ||
| 24 | # improvement on Cortex A8 core and ~17 cycles per processed byte. | ||
| 25 | |||
| 21 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 26 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| 22 | open STDOUT,">$output"; | 27 | open STDOUT,">$output"; |
| 23 | 28 | ||
| 24 | $ctx="r0"; $t0="r0"; | 29 | $ctx="r0"; $t0="r0"; |
| 25 | $inp="r1"; | 30 | $inp="r1"; $t3="r1"; |
| 26 | $len="r2"; $t1="r2"; | 31 | $len="r2"; $t1="r2"; |
| 27 | $T1="r3"; | 32 | $T1="r3"; |
| 28 | $A="r4"; | 33 | $A="r4"; |
| @@ -46,6 +51,9 @@ sub BODY_00_15 { | |||
| 46 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 51 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| 47 | 52 | ||
| 48 | $code.=<<___ if ($i<16); | 53 | $code.=<<___ if ($i<16); |
| 54 | #if __ARM_ARCH__>=7 | ||
| 55 | ldr $T1,[$inp],#4 | ||
| 56 | #else | ||
| 49 | ldrb $T1,[$inp,#3] @ $i | 57 | ldrb $T1,[$inp,#3] @ $i |
| 50 | ldrb $t2,[$inp,#2] | 58 | ldrb $t2,[$inp,#2] |
| 51 | ldrb $t1,[$inp,#1] | 59 | ldrb $t1,[$inp,#1] |
| @@ -53,16 +61,24 @@ $code.=<<___ if ($i<16); | |||
| 53 | orr $T1,$T1,$t2,lsl#8 | 61 | orr $T1,$T1,$t2,lsl#8 |
| 54 | orr $T1,$T1,$t1,lsl#16 | 62 | orr $T1,$T1,$t1,lsl#16 |
| 55 | orr $T1,$T1,$t0,lsl#24 | 63 | orr $T1,$T1,$t0,lsl#24 |
| 56 | `"str $inp,[sp,#17*4]" if ($i==15)` | 64 | #endif |
| 57 | ___ | 65 | ___ |
| 58 | $code.=<<___; | 66 | $code.=<<___; |
| 59 | ldr $t2,[$Ktbl],#4 @ *K256++ | ||
| 60 | mov $t0,$e,ror#$Sigma1[0] | 67 | mov $t0,$e,ror#$Sigma1[0] |
| 61 | str $T1,[sp,#`$i%16`*4] | 68 | ldr $t2,[$Ktbl],#4 @ *K256++ |
| 62 | eor $t0,$t0,$e,ror#$Sigma1[1] | 69 | eor $t0,$t0,$e,ror#$Sigma1[1] |
| 63 | eor $t1,$f,$g | 70 | eor $t1,$f,$g |
| 71 | #if $i>=16 | ||
| 72 | add $T1,$T1,$t3 @ from BODY_16_xx | ||
| 73 | #elif __ARM_ARCH__>=7 && defined(__ARMEL__) | ||
| 74 | rev $T1,$T1 | ||
| 75 | #endif | ||
| 76 | #if $i==15 | ||
| 77 | str $inp,[sp,#17*4] @ leave room for $t3 | ||
| 78 | #endif | ||
| 64 | eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) | 79 | eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) |
| 65 | and $t1,$t1,$e | 80 | and $t1,$t1,$e |
| 81 | str $T1,[sp,#`$i%16`*4] | ||
| 66 | add $T1,$T1,$t0 | 82 | add $T1,$T1,$t0 |
| 67 | eor $t1,$t1,$g @ Ch(e,f,g) | 83 | eor $t1,$t1,$g @ Ch(e,f,g) |
| 68 | add $T1,$T1,$h | 84 | add $T1,$T1,$h |
| @@ -71,6 +87,9 @@ $code.=<<___; | |||
| 71 | eor $h,$h,$a,ror#$Sigma0[1] | 87 | eor $h,$h,$a,ror#$Sigma0[1] |
| 72 | add $T1,$T1,$t2 | 88 | add $T1,$T1,$t2 |
| 73 | eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) | 89 | eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) |
| 90 | #if $i>=15 | ||
| 91 | ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx | ||
| 92 | #endif | ||
| 74 | orr $t0,$a,$b | 93 | orr $t0,$a,$b |
| 75 | and $t1,$a,$b | 94 | and $t1,$a,$b |
| 76 | and $t0,$t0,$c | 95 | and $t0,$t0,$c |
| @@ -85,24 +104,26 @@ sub BODY_16_XX { | |||
| 85 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 104 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| 86 | 105 | ||
| 87 | $code.=<<___; | 106 | $code.=<<___; |
| 88 | ldr $t1,[sp,#`($i+1)%16`*4] @ $i | 107 | @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i |
| 89 | ldr $t2,[sp,#`($i+14)%16`*4] | 108 | ldr $t2,[sp,#`($i+14)%16`*4] |
| 109 | mov $t0,$t3,ror#$sigma0[0] | ||
| 90 | ldr $T1,[sp,#`($i+0)%16`*4] | 110 | ldr $T1,[sp,#`($i+0)%16`*4] |
| 91 | mov $t0,$t1,ror#$sigma0[0] | 111 | eor $t0,$t0,$t3,ror#$sigma0[1] |
| 92 | ldr $inp,[sp,#`($i+9)%16`*4] | 112 | ldr $t1,[sp,#`($i+9)%16`*4] |
| 93 | eor $t0,$t0,$t1,ror#$sigma0[1] | 113 | eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1]) |
| 94 | eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) | 114 | mov $t3,$t2,ror#$sigma1[0] |
| 95 | mov $t1,$t2,ror#$sigma1[0] | ||
| 96 | add $T1,$T1,$t0 | 115 | add $T1,$T1,$t0 |
| 97 | eor $t1,$t1,$t2,ror#$sigma1[1] | 116 | eor $t3,$t3,$t2,ror#$sigma1[1] |
| 98 | add $T1,$T1,$inp | ||
| 99 | eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) | ||
| 100 | add $T1,$T1,$t1 | 117 | add $T1,$T1,$t1 |
| 118 | eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) | ||
| 119 | @ add $T1,$T1,$t3 | ||
| 101 | ___ | 120 | ___ |
| 102 | &BODY_00_15(@_); | 121 | &BODY_00_15(@_); |
| 103 | } | 122 | } |
| 104 | 123 | ||
| 105 | $code=<<___; | 124 | $code=<<___; |
| 125 | #include "arm_arch.h" | ||
| 126 | |||
| 106 | .text | 127 | .text |
| 107 | .code 32 | 128 | .code 32 |
| 108 | 129 | ||
| @@ -132,7 +153,7 @@ K256: | |||
| 132 | sha256_block_data_order: | 153 | sha256_block_data_order: |
| 133 | sub r3,pc,#8 @ sha256_block_data_order | 154 | sub r3,pc,#8 @ sha256_block_data_order |
| 134 | add $len,$inp,$len,lsl#6 @ len to point at the end of inp | 155 | add $len,$inp,$len,lsl#6 @ len to point at the end of inp |
| 135 | stmdb sp!,{$ctx,$inp,$len,r4-r12,lr} | 156 | stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} |
| 136 | ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} | 157 | ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} |
| 137 | sub $Ktbl,r3,#256 @ K256 | 158 | sub $Ktbl,r3,#256 @ K256 |
| 138 | sub sp,sp,#16*4 @ alloca(X[16]) | 159 | sub sp,sp,#16*4 @ alloca(X[16]) |
| @@ -171,10 +192,14 @@ $code.=<<___; | |||
| 171 | bne .Loop | 192 | bne .Loop |
| 172 | 193 | ||
| 173 | add sp,sp,#`16+3`*4 @ destroy frame | 194 | add sp,sp,#`16+3`*4 @ destroy frame |
| 174 | ldmia sp!,{r4-r12,lr} | 195 | #if __ARM_ARCH__>=5 |
| 196 | ldmia sp!,{r4-r11,pc} | ||
| 197 | #else | ||
| 198 | ldmia sp!,{r4-r11,lr} | ||
| 175 | tst lr,#1 | 199 | tst lr,#1 |
| 176 | moveq pc,lr @ be binary compatible with V4, yet | 200 | moveq pc,lr @ be binary compatible with V4, yet |
| 177 | bx lr @ interoperable with Thumb ISA:-) | 201 | bx lr @ interoperable with Thumb ISA:-) |
| 202 | #endif | ||
| 178 | .size sha256_block_data_order,.-sha256_block_data_order | 203 | .size sha256_block_data_order,.-sha256_block_data_order |
| 179 | .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | 204 | .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" |
| 180 | .align 2 | 205 | .align 2 |
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl index 3a35861ac6..7faf37b147 100644 --- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl +++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl | |||
| @@ -18,22 +18,33 @@ | |||
| 18 | # Rescheduling for dual-issue pipeline resulted in 6% improvement on | 18 | # Rescheduling for dual-issue pipeline resulted in 6% improvement on |
| 19 | # Cortex A8 core and ~40 cycles per processed byte. | 19 | # Cortex A8 core and ~40 cycles per processed byte. |
| 20 | 20 | ||
| 21 | # February 2011. | ||
| 22 | # | ||
| 23 | # Profiler-assisted and platform-specific optimization resulted in 7% | ||
| 24 | # improvement on Coxtex A8 core and ~38 cycles per byte. | ||
| 25 | |||
| 26 | # March 2011. | ||
| 27 | # | ||
| 28 | # Add NEON implementation. On Cortex A8 it was measured to process | ||
| 29 | # one byte in 25.5 cycles or 47% faster than integer-only code. | ||
| 30 | |||
| 21 | # Byte order [in]dependence. ========================================= | 31 | # Byte order [in]dependence. ========================================= |
| 22 | # | 32 | # |
| 23 | # Caller is expected to maintain specific *dword* order in h[0-7], | 33 | # Originally caller was expected to maintain specific *dword* order in |
| 24 | # namely with most significant dword at *lower* address, which is | 34 | # h[0-7], namely with most significant dword at *lower* address, which |
| 25 | # reflected in below two parameters. *Byte* order within these dwords | 35 | # was reflected in below two parameters as 0 and 4. Now caller is |
| 26 | # in turn is whatever *native* byte order on current platform. | 36 | # expected to maintain native byte order for whole 64-bit values. |
| 27 | $hi=0; | 37 | $hi="HI"; |
| 28 | $lo=4; | 38 | $lo="LO"; |
| 29 | # ==================================================================== | 39 | # ==================================================================== |
| 30 | 40 | ||
| 31 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 41 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| 32 | open STDOUT,">$output"; | 42 | open STDOUT,">$output"; |
| 33 | 43 | ||
| 34 | $ctx="r0"; | 44 | $ctx="r0"; # parameter block |
| 35 | $inp="r1"; | 45 | $inp="r1"; |
| 36 | $len="r2"; | 46 | $len="r2"; |
| 47 | |||
| 37 | $Tlo="r3"; | 48 | $Tlo="r3"; |
| 38 | $Thi="r4"; | 49 | $Thi="r4"; |
| 39 | $Alo="r5"; | 50 | $Alo="r5"; |
| @@ -61,15 +72,17 @@ $Xoff=8*8; | |||
| 61 | sub BODY_00_15() { | 72 | sub BODY_00_15() { |
| 62 | my $magic = shift; | 73 | my $magic = shift; |
| 63 | $code.=<<___; | 74 | $code.=<<___; |
| 64 | ldr $t2,[sp,#$Hoff+0] @ h.lo | ||
| 65 | ldr $t3,[sp,#$Hoff+4] @ h.hi | ||
| 66 | @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) | 75 | @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) |
| 67 | @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 | 76 | @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 |
| 68 | @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 | 77 | @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 |
| 69 | mov $t0,$Elo,lsr#14 | 78 | mov $t0,$Elo,lsr#14 |
| 79 | str $Tlo,[sp,#$Xoff+0] | ||
| 70 | mov $t1,$Ehi,lsr#14 | 80 | mov $t1,$Ehi,lsr#14 |
| 81 | str $Thi,[sp,#$Xoff+4] | ||
| 71 | eor $t0,$t0,$Ehi,lsl#18 | 82 | eor $t0,$t0,$Ehi,lsl#18 |
| 83 | ldr $t2,[sp,#$Hoff+0] @ h.lo | ||
| 72 | eor $t1,$t1,$Elo,lsl#18 | 84 | eor $t1,$t1,$Elo,lsl#18 |
| 85 | ldr $t3,[sp,#$Hoff+4] @ h.hi | ||
| 73 | eor $t0,$t0,$Elo,lsr#18 | 86 | eor $t0,$t0,$Elo,lsr#18 |
| 74 | eor $t1,$t1,$Ehi,lsr#18 | 87 | eor $t1,$t1,$Ehi,lsr#18 |
| 75 | eor $t0,$t0,$Ehi,lsl#14 | 88 | eor $t0,$t0,$Ehi,lsl#14 |
| @@ -96,25 +109,24 @@ $code.=<<___; | |||
| 96 | and $t1,$t1,$Ehi | 109 | and $t1,$t1,$Ehi |
| 97 | str $Ahi,[sp,#$Aoff+4] | 110 | str $Ahi,[sp,#$Aoff+4] |
| 98 | eor $t0,$t0,$t2 | 111 | eor $t0,$t0,$t2 |
| 99 | ldr $t2,[$Ktbl,#4] @ K[i].lo | 112 | ldr $t2,[$Ktbl,#$lo] @ K[i].lo |
| 100 | eor $t1,$t1,$t3 @ Ch(e,f,g) | 113 | eor $t1,$t1,$t3 @ Ch(e,f,g) |
| 101 | ldr $t3,[$Ktbl,#0] @ K[i].hi | 114 | ldr $t3,[$Ktbl,#$hi] @ K[i].hi |
| 102 | 115 | ||
| 103 | adds $Tlo,$Tlo,$t0 | 116 | adds $Tlo,$Tlo,$t0 |
| 104 | ldr $Elo,[sp,#$Doff+0] @ d.lo | 117 | ldr $Elo,[sp,#$Doff+0] @ d.lo |
| 105 | adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) | 118 | adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) |
| 106 | ldr $Ehi,[sp,#$Doff+4] @ d.hi | 119 | ldr $Ehi,[sp,#$Doff+4] @ d.hi |
| 107 | adds $Tlo,$Tlo,$t2 | 120 | adds $Tlo,$Tlo,$t2 |
| 121 | and $t0,$t2,#0xff | ||
| 108 | adc $Thi,$Thi,$t3 @ T += K[i] | 122 | adc $Thi,$Thi,$t3 @ T += K[i] |
| 109 | adds $Elo,$Elo,$Tlo | 123 | adds $Elo,$Elo,$Tlo |
| 124 | ldr $t2,[sp,#$Boff+0] @ b.lo | ||
| 110 | adc $Ehi,$Ehi,$Thi @ d += T | 125 | adc $Ehi,$Ehi,$Thi @ d += T |
| 111 | |||
| 112 | and $t0,$t2,#0xff | ||
| 113 | teq $t0,#$magic | 126 | teq $t0,#$magic |
| 114 | orreq $Ktbl,$Ktbl,#1 | ||
| 115 | 127 | ||
| 116 | ldr $t2,[sp,#$Boff+0] @ b.lo | ||
| 117 | ldr $t3,[sp,#$Coff+0] @ c.lo | 128 | ldr $t3,[sp,#$Coff+0] @ c.lo |
| 129 | orreq $Ktbl,$Ktbl,#1 | ||
| 118 | @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) | 130 | @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) |
| 119 | @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 | 131 | @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 |
| 120 | @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 | 132 | @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 |
| @@ -131,80 +143,100 @@ $code.=<<___; | |||
| 131 | eor $t0,$t0,$Alo,lsl#25 | 143 | eor $t0,$t0,$Alo,lsl#25 |
| 132 | eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) | 144 | eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) |
| 133 | adds $Tlo,$Tlo,$t0 | 145 | adds $Tlo,$Tlo,$t0 |
| 146 | and $t0,$Alo,$t2 | ||
| 134 | adc $Thi,$Thi,$t1 @ T += Sigma0(a) | 147 | adc $Thi,$Thi,$t1 @ T += Sigma0(a) |
| 135 | 148 | ||
| 136 | and $t0,$Alo,$t2 | ||
| 137 | orr $Alo,$Alo,$t2 | ||
| 138 | ldr $t1,[sp,#$Boff+4] @ b.hi | 149 | ldr $t1,[sp,#$Boff+4] @ b.hi |
| 150 | orr $Alo,$Alo,$t2 | ||
| 139 | ldr $t2,[sp,#$Coff+4] @ c.hi | 151 | ldr $t2,[sp,#$Coff+4] @ c.hi |
| 140 | and $Alo,$Alo,$t3 | 152 | and $Alo,$Alo,$t3 |
| 141 | orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo | ||
| 142 | and $t3,$Ahi,$t1 | 153 | and $t3,$Ahi,$t1 |
| 143 | orr $Ahi,$Ahi,$t1 | 154 | orr $Ahi,$Ahi,$t1 |
| 155 | orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo | ||
| 144 | and $Ahi,$Ahi,$t2 | 156 | and $Ahi,$Ahi,$t2 |
| 145 | orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi | ||
| 146 | adds $Alo,$Alo,$Tlo | 157 | adds $Alo,$Alo,$Tlo |
| 147 | adc $Ahi,$Ahi,$Thi @ h += T | 158 | orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi |
| 148 | |||
| 149 | sub sp,sp,#8 | 159 | sub sp,sp,#8 |
| 160 | adc $Ahi,$Ahi,$Thi @ h += T | ||
| 161 | tst $Ktbl,#1 | ||
| 150 | add $Ktbl,$Ktbl,#8 | 162 | add $Ktbl,$Ktbl,#8 |
| 151 | ___ | 163 | ___ |
| 152 | } | 164 | } |
| 153 | $code=<<___; | 165 | $code=<<___; |
| 166 | #include "arm_arch.h" | ||
| 167 | #ifdef __ARMEL__ | ||
| 168 | # define LO 0 | ||
| 169 | # define HI 4 | ||
| 170 | # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 | ||
| 171 | #else | ||
| 172 | # define HI 0 | ||
| 173 | # define LO 4 | ||
| 174 | # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 | ||
| 175 | #endif | ||
| 176 | |||
| 154 | .text | 177 | .text |
| 155 | .code 32 | 178 | .code 32 |
| 156 | .type K512,%object | 179 | .type K512,%object |
| 157 | .align 5 | 180 | .align 5 |
| 158 | K512: | 181 | K512: |
| 159 | .word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd | 182 | WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) |
| 160 | .word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc | 183 | WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) |
| 161 | .word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 | 184 | WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) |
| 162 | .word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 | 185 | WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) |
| 163 | .word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe | 186 | WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) |
| 164 | .word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 | 187 | WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) |
| 165 | .word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 | 188 | WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) |
| 166 | .word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 | 189 | WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) |
| 167 | .word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 | 190 | WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) |
| 168 | .word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 | 191 | WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) |
| 169 | .word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 | 192 | WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) |
| 170 | .word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 | 193 | WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) |
| 171 | .word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 | 194 | WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) |
| 172 | .word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 | 195 | WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) |
| 173 | .word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 | 196 | WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) |
| 174 | .word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 | 197 | WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) |
| 175 | .word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 | 198 | WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) |
| 176 | .word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df | 199 | WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) |
| 177 | .word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 | 200 | WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) |
| 178 | .word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b | 201 | WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) |
| 179 | .word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 | 202 | WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) |
| 180 | .word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 | 203 | WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) |
| 181 | .word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 | 204 | WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) |
| 182 | .word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 | 205 | WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) |
| 183 | .word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 | 206 | WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) |
| 184 | .word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 | 207 | WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) |
| 185 | .word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb | 208 | WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) |
| 186 | .word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 | 209 | WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) |
| 187 | .word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 | 210 | WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) |
| 188 | .word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec | 211 | WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) |
| 189 | .word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 | 212 | WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) |
| 190 | .word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b | 213 | WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) |
| 191 | .word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 | 214 | WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) |
| 192 | .word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 | 215 | WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) |
| 193 | .word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 | 216 | WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) |
| 194 | .word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b | 217 | WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) |
| 195 | .word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 | 218 | WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) |
| 196 | .word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c | 219 | WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) |
| 197 | .word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a | 220 | WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) |
| 198 | .word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 | 221 | WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) |
| 199 | .size K512,.-K512 | 222 | .size K512,.-K512 |
| 223 | .LOPENSSL_armcap: | ||
| 224 | .word OPENSSL_armcap_P-sha512_block_data_order | ||
| 225 | .skip 32-4 | ||
| 200 | 226 | ||
| 201 | .global sha512_block_data_order | 227 | .global sha512_block_data_order |
| 202 | .type sha512_block_data_order,%function | 228 | .type sha512_block_data_order,%function |
| 203 | sha512_block_data_order: | 229 | sha512_block_data_order: |
| 204 | sub r3,pc,#8 @ sha512_block_data_order | 230 | sub r3,pc,#8 @ sha512_block_data_order |
| 205 | add $len,$inp,$len,lsl#7 @ len to point at the end of inp | 231 | add $len,$inp,$len,lsl#7 @ len to point at the end of inp |
| 232 | #if __ARM_ARCH__>=7 | ||
| 233 | ldr r12,.LOPENSSL_armcap | ||
| 234 | ldr r12,[r3,r12] @ OPENSSL_armcap_P | ||
| 235 | tst r12,#1 | ||
| 236 | bne .LNEON | ||
| 237 | #endif | ||
| 206 | stmdb sp!,{r4-r12,lr} | 238 | stmdb sp!,{r4-r12,lr} |
| 207 | sub $Ktbl,r3,#640 @ K512 | 239 | sub $Ktbl,r3,#672 @ K512 |
| 208 | sub sp,sp,#9*8 | 240 | sub sp,sp,#9*8 |
| 209 | 241 | ||
| 210 | ldr $Elo,[$ctx,#$Eoff+$lo] | 242 | ldr $Elo,[$ctx,#$Eoff+$lo] |
| @@ -238,6 +270,7 @@ sha512_block_data_order: | |||
| 238 | str $Thi,[sp,#$Foff+4] | 270 | str $Thi,[sp,#$Foff+4] |
| 239 | 271 | ||
| 240 | .L00_15: | 272 | .L00_15: |
| 273 | #if __ARM_ARCH__<7 | ||
| 241 | ldrb $Tlo,[$inp,#7] | 274 | ldrb $Tlo,[$inp,#7] |
| 242 | ldrb $t0, [$inp,#6] | 275 | ldrb $t0, [$inp,#6] |
| 243 | ldrb $t1, [$inp,#5] | 276 | ldrb $t1, [$inp,#5] |
| @@ -252,26 +285,30 @@ sha512_block_data_order: | |||
| 252 | orr $Thi,$Thi,$t3,lsl#8 | 285 | orr $Thi,$Thi,$t3,lsl#8 |
| 253 | orr $Thi,$Thi,$t0,lsl#16 | 286 | orr $Thi,$Thi,$t0,lsl#16 |
| 254 | orr $Thi,$Thi,$t1,lsl#24 | 287 | orr $Thi,$Thi,$t1,lsl#24 |
| 255 | str $Tlo,[sp,#$Xoff+0] | 288 | #else |
| 256 | str $Thi,[sp,#$Xoff+4] | 289 | ldr $Tlo,[$inp,#4] |
| 290 | ldr $Thi,[$inp],#8 | ||
| 291 | #ifdef __ARMEL__ | ||
| 292 | rev $Tlo,$Tlo | ||
| 293 | rev $Thi,$Thi | ||
| 294 | #endif | ||
| 295 | #endif | ||
| 257 | ___ | 296 | ___ |
| 258 | &BODY_00_15(0x94); | 297 | &BODY_00_15(0x94); |
| 259 | $code.=<<___; | 298 | $code.=<<___; |
| 260 | tst $Ktbl,#1 | 299 | tst $Ktbl,#1 |
| 261 | beq .L00_15 | 300 | beq .L00_15 |
| 262 | bic $Ktbl,$Ktbl,#1 | ||
| 263 | |||
| 264 | .L16_79: | ||
| 265 | ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] | 301 | ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] |
| 266 | ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] | 302 | ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] |
| 267 | ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] | 303 | bic $Ktbl,$Ktbl,#1 |
| 268 | ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] | 304 | .L16_79: |
| 269 | |||
| 270 | @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) | 305 | @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) |
| 271 | @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 | 306 | @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 |
| 272 | @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 | 307 | @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 |
| 273 | mov $Tlo,$t0,lsr#1 | 308 | mov $Tlo,$t0,lsr#1 |
| 309 | ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] | ||
| 274 | mov $Thi,$t1,lsr#1 | 310 | mov $Thi,$t1,lsr#1 |
| 311 | ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] | ||
| 275 | eor $Tlo,$Tlo,$t1,lsl#31 | 312 | eor $Tlo,$Tlo,$t1,lsl#31 |
| 276 | eor $Thi,$Thi,$t0,lsl#31 | 313 | eor $Thi,$Thi,$t0,lsl#31 |
| 277 | eor $Tlo,$Tlo,$t0,lsr#8 | 314 | eor $Tlo,$Tlo,$t0,lsr#8 |
| @@ -295,25 +332,24 @@ $code.=<<___; | |||
| 295 | eor $t1,$t1,$t3,lsl#3 | 332 | eor $t1,$t1,$t3,lsl#3 |
| 296 | eor $t0,$t0,$t2,lsr#6 | 333 | eor $t0,$t0,$t2,lsr#6 |
| 297 | eor $t1,$t1,$t3,lsr#6 | 334 | eor $t1,$t1,$t3,lsr#6 |
| 335 | ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] | ||
| 298 | eor $t0,$t0,$t3,lsl#26 | 336 | eor $t0,$t0,$t3,lsl#26 |
| 299 | 337 | ||
| 300 | ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] | ||
| 301 | ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] | 338 | ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] |
| 302 | adds $Tlo,$Tlo,$t0 | 339 | adds $Tlo,$Tlo,$t0 |
| 340 | ldr $t0,[sp,#`$Xoff+8*16`+0] | ||
| 303 | adc $Thi,$Thi,$t1 | 341 | adc $Thi,$Thi,$t1 |
| 304 | 342 | ||
| 305 | ldr $t0,[sp,#`$Xoff+8*16`+0] | ||
| 306 | ldr $t1,[sp,#`$Xoff+8*16`+4] | 343 | ldr $t1,[sp,#`$Xoff+8*16`+4] |
| 307 | adds $Tlo,$Tlo,$t2 | 344 | adds $Tlo,$Tlo,$t2 |
| 308 | adc $Thi,$Thi,$t3 | 345 | adc $Thi,$Thi,$t3 |
| 309 | adds $Tlo,$Tlo,$t0 | 346 | adds $Tlo,$Tlo,$t0 |
| 310 | adc $Thi,$Thi,$t1 | 347 | adc $Thi,$Thi,$t1 |
| 311 | str $Tlo,[sp,#$Xoff+0] | ||
| 312 | str $Thi,[sp,#$Xoff+4] | ||
| 313 | ___ | 348 | ___ |
| 314 | &BODY_00_15(0x17); | 349 | &BODY_00_15(0x17); |
| 315 | $code.=<<___; | 350 | $code.=<<___; |
| 316 | tst $Ktbl,#1 | 351 | ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] |
| 352 | ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] | ||
| 317 | beq .L16_79 | 353 | beq .L16_79 |
| 318 | bic $Ktbl,$Ktbl,#1 | 354 | bic $Ktbl,$Ktbl,#1 |
| 319 | 355 | ||
| @@ -324,12 +360,12 @@ $code.=<<___; | |||
| 324 | ldr $t2, [$ctx,#$Boff+$lo] | 360 | ldr $t2, [$ctx,#$Boff+$lo] |
| 325 | ldr $t3, [$ctx,#$Boff+$hi] | 361 | ldr $t3, [$ctx,#$Boff+$hi] |
| 326 | adds $t0,$Alo,$t0 | 362 | adds $t0,$Alo,$t0 |
| 327 | adc $t1,$Ahi,$t1 | ||
| 328 | adds $t2,$Tlo,$t2 | ||
| 329 | adc $t3,$Thi,$t3 | ||
| 330 | str $t0, [$ctx,#$Aoff+$lo] | 363 | str $t0, [$ctx,#$Aoff+$lo] |
| 364 | adc $t1,$Ahi,$t1 | ||
| 331 | str $t1, [$ctx,#$Aoff+$hi] | 365 | str $t1, [$ctx,#$Aoff+$hi] |
| 366 | adds $t2,$Tlo,$t2 | ||
| 332 | str $t2, [$ctx,#$Boff+$lo] | 367 | str $t2, [$ctx,#$Boff+$lo] |
| 368 | adc $t3,$Thi,$t3 | ||
| 333 | str $t3, [$ctx,#$Boff+$hi] | 369 | str $t3, [$ctx,#$Boff+$hi] |
| 334 | 370 | ||
| 335 | ldr $Alo,[sp,#$Coff+0] | 371 | ldr $Alo,[sp,#$Coff+0] |
| @@ -341,12 +377,12 @@ $code.=<<___; | |||
| 341 | ldr $t2, [$ctx,#$Doff+$lo] | 377 | ldr $t2, [$ctx,#$Doff+$lo] |
| 342 | ldr $t3, [$ctx,#$Doff+$hi] | 378 | ldr $t3, [$ctx,#$Doff+$hi] |
| 343 | adds $t0,$Alo,$t0 | 379 | adds $t0,$Alo,$t0 |
| 344 | adc $t1,$Ahi,$t1 | ||
| 345 | adds $t2,$Tlo,$t2 | ||
| 346 | adc $t3,$Thi,$t3 | ||
| 347 | str $t0, [$ctx,#$Coff+$lo] | 380 | str $t0, [$ctx,#$Coff+$lo] |
| 381 | adc $t1,$Ahi,$t1 | ||
| 348 | str $t1, [$ctx,#$Coff+$hi] | 382 | str $t1, [$ctx,#$Coff+$hi] |
| 383 | adds $t2,$Tlo,$t2 | ||
| 349 | str $t2, [$ctx,#$Doff+$lo] | 384 | str $t2, [$ctx,#$Doff+$lo] |
| 385 | adc $t3,$Thi,$t3 | ||
| 350 | str $t3, [$ctx,#$Doff+$hi] | 386 | str $t3, [$ctx,#$Doff+$hi] |
| 351 | 387 | ||
| 352 | ldr $Tlo,[sp,#$Foff+0] | 388 | ldr $Tlo,[sp,#$Foff+0] |
| @@ -356,12 +392,12 @@ $code.=<<___; | |||
| 356 | ldr $t2, [$ctx,#$Foff+$lo] | 392 | ldr $t2, [$ctx,#$Foff+$lo] |
| 357 | ldr $t3, [$ctx,#$Foff+$hi] | 393 | ldr $t3, [$ctx,#$Foff+$hi] |
| 358 | adds $Elo,$Elo,$t0 | 394 | adds $Elo,$Elo,$t0 |
| 359 | adc $Ehi,$Ehi,$t1 | ||
| 360 | adds $t2,$Tlo,$t2 | ||
| 361 | adc $t3,$Thi,$t3 | ||
| 362 | str $Elo,[$ctx,#$Eoff+$lo] | 395 | str $Elo,[$ctx,#$Eoff+$lo] |
| 396 | adc $Ehi,$Ehi,$t1 | ||
| 363 | str $Ehi,[$ctx,#$Eoff+$hi] | 397 | str $Ehi,[$ctx,#$Eoff+$hi] |
| 398 | adds $t2,$Tlo,$t2 | ||
| 364 | str $t2, [$ctx,#$Foff+$lo] | 399 | str $t2, [$ctx,#$Foff+$lo] |
| 400 | adc $t3,$Thi,$t3 | ||
| 365 | str $t3, [$ctx,#$Foff+$hi] | 401 | str $t3, [$ctx,#$Foff+$hi] |
| 366 | 402 | ||
| 367 | ldr $Alo,[sp,#$Goff+0] | 403 | ldr $Alo,[sp,#$Goff+0] |
| @@ -373,12 +409,12 @@ $code.=<<___; | |||
| 373 | ldr $t2, [$ctx,#$Hoff+$lo] | 409 | ldr $t2, [$ctx,#$Hoff+$lo] |
| 374 | ldr $t3, [$ctx,#$Hoff+$hi] | 410 | ldr $t3, [$ctx,#$Hoff+$hi] |
| 375 | adds $t0,$Alo,$t0 | 411 | adds $t0,$Alo,$t0 |
| 376 | adc $t1,$Ahi,$t1 | ||
| 377 | adds $t2,$Tlo,$t2 | ||
| 378 | adc $t3,$Thi,$t3 | ||
| 379 | str $t0, [$ctx,#$Goff+$lo] | 412 | str $t0, [$ctx,#$Goff+$lo] |
| 413 | adc $t1,$Ahi,$t1 | ||
| 380 | str $t1, [$ctx,#$Goff+$hi] | 414 | str $t1, [$ctx,#$Goff+$hi] |
| 415 | adds $t2,$Tlo,$t2 | ||
| 381 | str $t2, [$ctx,#$Hoff+$lo] | 416 | str $t2, [$ctx,#$Hoff+$lo] |
| 417 | adc $t3,$Thi,$t3 | ||
| 382 | str $t3, [$ctx,#$Hoff+$hi] | 418 | str $t3, [$ctx,#$Hoff+$hi] |
| 383 | 419 | ||
| 384 | add sp,sp,#640 | 420 | add sp,sp,#640 |
| @@ -388,13 +424,156 @@ $code.=<<___; | |||
| 388 | bne .Loop | 424 | bne .Loop |
| 389 | 425 | ||
| 390 | add sp,sp,#8*9 @ destroy frame | 426 | add sp,sp,#8*9 @ destroy frame |
| 427 | #if __ARM_ARCH__>=5 | ||
| 428 | ldmia sp!,{r4-r12,pc} | ||
| 429 | #else | ||
| 391 | ldmia sp!,{r4-r12,lr} | 430 | ldmia sp!,{r4-r12,lr} |
| 392 | tst lr,#1 | 431 | tst lr,#1 |
| 393 | moveq pc,lr @ be binary compatible with V4, yet | 432 | moveq pc,lr @ be binary compatible with V4, yet |
| 394 | bx lr @ interoperable with Thumb ISA:-) | 433 | bx lr @ interoperable with Thumb ISA:-) |
| 395 | .size sha512_block_data_order,.-sha512_block_data_order | 434 | #endif |
| 396 | .asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | 435 | ___ |
| 436 | |||
| 437 | { | ||
| 438 | my @Sigma0=(28,34,39); | ||
| 439 | my @Sigma1=(14,18,41); | ||
| 440 | my @sigma0=(1, 8, 7); | ||
| 441 | my @sigma1=(19,61,6); | ||
| 442 | |||
| 443 | my $Ktbl="r3"; | ||
| 444 | my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch | ||
| 445 | |||
| 446 | my @X=map("d$_",(0..15)); | ||
| 447 | my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); | ||
| 448 | |||
| 449 | sub NEON_00_15() { | ||
| 450 | my $i=shift; | ||
| 451 | my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
| 452 | my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps | ||
| 453 | |||
| 454 | $code.=<<___ if ($i<16 || $i&1); | ||
| 455 | vshr.u64 $t0,$e,#@Sigma1[0] @ $i | ||
| 456 | #if $i<16 | ||
| 457 | vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned | ||
| 458 | #endif | ||
| 459 | vshr.u64 $t1,$e,#@Sigma1[1] | ||
| 460 | vshr.u64 $t2,$e,#@Sigma1[2] | ||
| 461 | ___ | ||
| 462 | $code.=<<___; | ||
| 463 | vld1.64 {$K},[$Ktbl,:64]! @ K[i++] | ||
| 464 | vsli.64 $t0,$e,#`64-@Sigma1[0]` | ||
| 465 | vsli.64 $t1,$e,#`64-@Sigma1[1]` | ||
| 466 | vsli.64 $t2,$e,#`64-@Sigma1[2]` | ||
| 467 | #if $i<16 && defined(__ARMEL__) | ||
| 468 | vrev64.8 @X[$i],@X[$i] | ||
| 469 | #endif | ||
| 470 | vadd.i64 $T1,$K,$h | ||
| 471 | veor $Ch,$f,$g | ||
| 472 | veor $t0,$t1 | ||
| 473 | vand $Ch,$e | ||
| 474 | veor $t0,$t2 @ Sigma1(e) | ||
| 475 | veor $Ch,$g @ Ch(e,f,g) | ||
| 476 | vadd.i64 $T1,$t0 | ||
| 477 | vshr.u64 $t0,$a,#@Sigma0[0] | ||
| 478 | vadd.i64 $T1,$Ch | ||
| 479 | vshr.u64 $t1,$a,#@Sigma0[1] | ||
| 480 | vshr.u64 $t2,$a,#@Sigma0[2] | ||
| 481 | vsli.64 $t0,$a,#`64-@Sigma0[0]` | ||
| 482 | vsli.64 $t1,$a,#`64-@Sigma0[1]` | ||
| 483 | vsli.64 $t2,$a,#`64-@Sigma0[2]` | ||
| 484 | vadd.i64 $T1,@X[$i%16] | ||
| 485 | vorr $Maj,$a,$c | ||
| 486 | vand $Ch,$a,$c | ||
| 487 | veor $h,$t0,$t1 | ||
| 488 | vand $Maj,$b | ||
| 489 | veor $h,$t2 @ Sigma0(a) | ||
| 490 | vorr $Maj,$Ch @ Maj(a,b,c) | ||
| 491 | vadd.i64 $h,$T1 | ||
| 492 | vadd.i64 $d,$T1 | ||
| 493 | vadd.i64 $h,$Maj | ||
| 494 | ___ | ||
| 495 | } | ||
| 496 | |||
| 497 | sub NEON_16_79() { | ||
| 498 | my $i=shift; | ||
| 499 | |||
| 500 | if ($i&1) { &NEON_00_15($i,@_); return; } | ||
| 501 | |||
| 502 | # 2x-vectorized, therefore runs every 2nd round | ||
| 503 | my @X=map("q$_",(0..7)); # view @X as 128-bit vector | ||
| 504 | my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps | ||
| 505 | my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 | ||
| 506 | my $e=@_[4]; # $e from NEON_00_15 | ||
| 507 | $i /= 2; | ||
| 508 | $code.=<<___; | ||
| 509 | vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] | ||
| 510 | vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] | ||
| 511 | vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] | ||
| 512 | vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` | ||
| 513 | vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] | ||
| 514 | vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` | ||
| 515 | veor $s1,$t0 | ||
| 516 | vshr.u64 $t0,$s0,#@sigma0[0] | ||
| 517 | veor $s1,$t1 @ sigma1(X[i+14]) | ||
| 518 | vshr.u64 $t1,$s0,#@sigma0[1] | ||
| 519 | vadd.i64 @X[$i%8],$s1 | ||
| 520 | vshr.u64 $s1,$s0,#@sigma0[2] | ||
| 521 | vsli.64 $t0,$s0,#`64-@sigma0[0]` | ||
| 522 | vsli.64 $t1,$s0,#`64-@sigma0[1]` | ||
| 523 | vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] | ||
| 524 | veor $s1,$t0 | ||
| 525 | vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 | ||
| 526 | vadd.i64 @X[$i%8],$s0 | ||
| 527 | vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 | ||
| 528 | veor $s1,$t1 @ sigma0(X[i+1]) | ||
| 529 | vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 | ||
| 530 | vadd.i64 @X[$i%8],$s1 | ||
| 531 | ___ | ||
| 532 | &NEON_00_15(2*$i,@_); | ||
| 533 | } | ||
| 534 | |||
| 535 | $code.=<<___; | ||
| 536 | #if __ARM_ARCH__>=7 | ||
| 537 | .fpu neon | ||
| 538 | |||
| 539 | .align 4 | ||
| 540 | .LNEON: | ||
| 541 | dmb @ errata #451034 on early Cortex A8 | ||
| 542 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
| 543 | sub $Ktbl,r3,#672 @ K512 | ||
| 544 | vldmia $ctx,{$A-$H} @ load context | ||
| 545 | .Loop_neon: | ||
| 546 | ___ | ||
| 547 | for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } | ||
| 548 | $code.=<<___; | ||
| 549 | mov $cnt,#4 | ||
| 550 | .L16_79_neon: | ||
| 551 | subs $cnt,#1 | ||
| 552 | ___ | ||
| 553 | for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } | ||
| 554 | $code.=<<___; | ||
| 555 | bne .L16_79_neon | ||
| 556 | |||
| 557 | vldmia $ctx,{d24-d31} @ load context to temp | ||
| 558 | vadd.i64 q8,q12 @ vectorized accumulate | ||
| 559 | vadd.i64 q9,q13 | ||
| 560 | vadd.i64 q10,q14 | ||
| 561 | vadd.i64 q11,q15 | ||
| 562 | vstmia $ctx,{$A-$H} @ save context | ||
| 563 | teq $inp,$len | ||
| 564 | sub $Ktbl,#640 @ rewind K512 | ||
| 565 | bne .Loop_neon | ||
| 566 | |||
| 567 | vldmia sp!,{d8-d15} @ epilogue | ||
| 568 | bx lr | ||
| 569 | #endif | ||
| 570 | ___ | ||
| 571 | } | ||
| 572 | $code.=<<___; | ||
| 573 | .size sha512_block_data_order,.-sha512_block_data_order | ||
| 574 | .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 397 | .align 2 | 575 | .align 2 |
| 576 | .comm OPENSSL_armcap_P,4,4 | ||
| 398 | ___ | 577 | ___ |
| 399 | 578 | ||
| 400 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 579 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/sha/asm/sha512-mips.pl b/src/lib/libcrypto/sha/asm/sha512-mips.pl new file mode 100644 index 0000000000..ba5b250890 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha512-mips.pl | |||
| @@ -0,0 +1,455 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # SHA2 block procedures for MIPS. | ||
| 11 | |||
| 12 | # October 2010. | ||
| 13 | # | ||
| 14 | # SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc- | ||
| 15 | # generated code in o32 build and ~55% in n32/64 build. SHA512 [which | ||
| 16 | # for now can only be compiled for MIPS64 ISA] improvement is modest | ||
| 17 | # ~17%, but it comes for free, because it's same instruction sequence. | ||
| 18 | # Improvement coefficients are for aligned input. | ||
| 19 | |||
| 20 | ###################################################################### | ||
| 21 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
| 22 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
| 23 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
| 24 | # manner. Therefore let's stick to NUBI register layout: | ||
| 25 | # | ||
| 26 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
| 27 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 28 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
| 29 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
| 30 | # | ||
| 31 | # The return value is placed in $a0. Following coding rules facilitate | ||
| 32 | # interoperability: | ||
| 33 | # | ||
| 34 | # - never ever touch $tp, "thread pointer", former $gp [o32 can be | ||
| 35 | # excluded from the rule, because it's specified volatile]; | ||
| 36 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
| 37 | # old code]; | ||
| 38 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
| 39 | # | ||
| 40 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
| 41 | # | ||
| 42 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
| 43 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
| 44 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
| 45 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
| 46 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
| 47 | # | ||
| 48 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
| 49 | |||
| 50 | if ($flavour =~ /64|n32/i) { | ||
| 51 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
| 52 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
| 53 | $REG_S="sd"; | ||
| 54 | $REG_L="ld"; | ||
| 55 | $PTR_SLL="dsll"; # incidentally works even on n32 | ||
| 56 | $SZREG=8; | ||
| 57 | } else { | ||
| 58 | $PTR_ADD="add"; | ||
| 59 | $PTR_SUB="sub"; | ||
| 60 | $REG_S="sw"; | ||
| 61 | $REG_L="lw"; | ||
| 62 | $PTR_SLL="sll"; | ||
| 63 | $SZREG=4; | ||
| 64 | } | ||
| 65 | $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; | ||
| 66 | # | ||
| 67 | # <appro@openssl.org> | ||
| 68 | # | ||
| 69 | ###################################################################### | ||
| 70 | |||
| 71 | $big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; | ||
| 72 | |||
| 73 | for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } | ||
| 74 | open STDOUT,">$output"; | ||
| 75 | |||
| 76 | if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 77 | |||
| 78 | if ($output =~ /512/) { | ||
| 79 | $label="512"; | ||
| 80 | $SZ=8; | ||
| 81 | $LD="ld"; # load from memory | ||
| 82 | $ST="sd"; # store to memory | ||
| 83 | $SLL="dsll"; # shift left logical | ||
| 84 | $SRL="dsrl"; # shift right logical | ||
| 85 | $ADDU="daddu"; | ||
| 86 | @Sigma0=(28,34,39); | ||
| 87 | @Sigma1=(14,18,41); | ||
| 88 | @sigma0=( 7, 1, 8); # right shift first | ||
| 89 | @sigma1=( 6,19,61); # right shift first | ||
| 90 | $lastK=0x817; | ||
| 91 | $rounds=80; | ||
| 92 | } else { | ||
| 93 | $label="256"; | ||
| 94 | $SZ=4; | ||
| 95 | $LD="lw"; # load from memory | ||
| 96 | $ST="sw"; # store to memory | ||
| 97 | $SLL="sll"; # shift left logical | ||
| 98 | $SRL="srl"; # shift right logical | ||
| 99 | $ADDU="addu"; | ||
| 100 | @Sigma0=( 2,13,22); | ||
| 101 | @Sigma1=( 6,11,25); | ||
| 102 | @sigma0=( 3, 7,18); # right shift first | ||
| 103 | @sigma1=(10,17,19); # right shift first | ||
| 104 | $lastK=0x8f2; | ||
| 105 | $rounds=64; | ||
| 106 | } | ||
| 107 | |||
| 108 | $MSB = $big_endian ? 0 : ($SZ-1); | ||
| 109 | $LSB = ($SZ-1)&~$MSB; | ||
| 110 | |||
| 111 | @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31)); | ||
| 112 | @X=map("\$$_",(8..23)); | ||
| 113 | |||
| 114 | $ctx=$a0; | ||
| 115 | $inp=$a1; | ||
| 116 | $len=$a2; $Ktbl=$len; | ||
| 117 | |||
| 118 | sub BODY_00_15 { | ||
| 119 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
| 120 | my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]); | ||
| 121 | |||
| 122 | $code.=<<___ if ($i<15); | ||
| 123 | ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp) | ||
| 124 | ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp) | ||
| 125 | ___ | ||
| 126 | $code.=<<___ if (!$big_endian && $i<16 && $SZ==4); | ||
| 127 | srl $tmp0,@X[0],24 # byte swap($i) | ||
| 128 | srl $tmp1,@X[0],8 | ||
| 129 | andi $tmp2,@X[0],0xFF00 | ||
| 130 | sll @X[0],@X[0],24 | ||
| 131 | andi $tmp1,0xFF00 | ||
| 132 | sll $tmp2,$tmp2,8 | ||
| 133 | or @X[0],$tmp0 | ||
| 134 | or $tmp1,$tmp2 | ||
| 135 | or @X[0],$tmp1 | ||
| 136 | ___ | ||
| 137 | $code.=<<___ if (!$big_endian && $i<16 && $SZ==8); | ||
| 138 | ori $tmp0,$zero,0xFF | ||
| 139 | dsll $tmp2,$tmp0,32 | ||
| 140 | or $tmp0,$tmp2 # 0x000000FF000000FF | ||
| 141 | and $tmp1,@X[0],$tmp0 # byte swap($i) | ||
| 142 | dsrl $tmp2,@X[0],24 | ||
| 143 | dsll $tmp1,24 | ||
| 144 | and $tmp2,$tmp0 | ||
| 145 | dsll $tmp0,8 # 0x0000FF000000FF00 | ||
| 146 | or $tmp1,$tmp2 | ||
| 147 | and $tmp2,@X[0],$tmp0 | ||
| 148 | dsrl @X[0],8 | ||
| 149 | dsll $tmp2,8 | ||
| 150 | and @X[0],$tmp0 | ||
| 151 | or $tmp1,$tmp2 | ||
| 152 | or @X[0],$tmp1 | ||
| 153 | dsrl $tmp1,@X[0],32 | ||
| 154 | dsll @X[0],32 | ||
| 155 | or @X[0],$tmp1 | ||
| 156 | ___ | ||
| 157 | $code.=<<___; | ||
| 158 | $ADDU $T1,$X[0],$h # $i | ||
| 159 | $SRL $h,$e,@Sigma1[0] | ||
| 160 | xor $tmp2,$f,$g | ||
| 161 | $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]` | ||
| 162 | and $tmp2,$e | ||
| 163 | $SRL $tmp0,$e,@Sigma1[1] | ||
| 164 | xor $h,$tmp1 | ||
| 165 | $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]` | ||
| 166 | xor $h,$tmp0 | ||
| 167 | $SRL $tmp0,$e,@Sigma1[2] | ||
| 168 | xor $h,$tmp1 | ||
| 169 | $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]` | ||
| 170 | xor $h,$tmp0 | ||
| 171 | xor $tmp2,$g # Ch(e,f,g) | ||
| 172 | xor $tmp0,$tmp1,$h # Sigma1(e) | ||
| 173 | |||
| 174 | $SRL $h,$a,@Sigma0[0] | ||
| 175 | $ADDU $T1,$tmp2 | ||
| 176 | $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i] | ||
| 177 | $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]` | ||
| 178 | $ADDU $T1,$tmp0 | ||
| 179 | $SRL $tmp0,$a,@Sigma0[1] | ||
| 180 | xor $h,$tmp1 | ||
| 181 | $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]` | ||
| 182 | xor $h,$tmp0 | ||
| 183 | $SRL $tmp0,$a,@Sigma0[2] | ||
| 184 | xor $h,$tmp1 | ||
| 185 | $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]` | ||
| 186 | xor $h,$tmp0 | ||
| 187 | $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer | ||
| 188 | xor $h,$tmp1 # Sigma0(a) | ||
| 189 | |||
| 190 | or $tmp0,$a,$b | ||
| 191 | and $tmp1,$a,$b | ||
| 192 | and $tmp0,$c | ||
| 193 | or $tmp1,$tmp0 # Maj(a,b,c) | ||
| 194 | $ADDU $T1,$tmp2 # +=K[$i] | ||
| 195 | $ADDU $h,$tmp1 | ||
| 196 | |||
| 197 | $ADDU $d,$T1 | ||
| 198 | $ADDU $h,$T1 | ||
| 199 | ___ | ||
| 200 | $code.=<<___ if ($i>=13); | ||
| 201 | $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer | ||
| 202 | ___ | ||
| 203 | } | ||
| 204 | |||
| 205 | sub BODY_16_XX { | ||
| 206 | my $i=@_[0]; | ||
| 207 | my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]); | ||
| 208 | |||
| 209 | $code.=<<___; | ||
| 210 | $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i) | ||
| 211 | $ADDU @X[0],@X[9] # +=X[i+9] | ||
| 212 | $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]` | ||
| 213 | $SRL $tmp0,@X[1],@sigma0[1] | ||
| 214 | xor $tmp2,$tmp1 | ||
| 215 | $SLL $tmp1,`@sigma0[2]-@sigma0[1]` | ||
| 216 | xor $tmp2,$tmp0 | ||
| 217 | $SRL $tmp0,@X[1],@sigma0[2] | ||
| 218 | xor $tmp2,$tmp1 | ||
| 219 | |||
| 220 | $SRL $tmp3,@X[14],@sigma1[0] | ||
| 221 | xor $tmp2,$tmp0 # sigma0(X[i+1]) | ||
| 222 | $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]` | ||
| 223 | $ADDU @X[0],$tmp2 | ||
| 224 | $SRL $tmp0,@X[14],@sigma1[1] | ||
| 225 | xor $tmp3,$tmp1 | ||
| 226 | $SLL $tmp1,`@sigma1[2]-@sigma1[1]` | ||
| 227 | xor $tmp3,$tmp0 | ||
| 228 | $SRL $tmp0,@X[14],@sigma1[2] | ||
| 229 | xor $tmp3,$tmp1 | ||
| 230 | |||
| 231 | xor $tmp3,$tmp0 # sigma1(X[i+14]) | ||
| 232 | $ADDU @X[0],$tmp3 | ||
| 233 | ___ | ||
| 234 | &BODY_00_15(@_); | ||
| 235 | } | ||
| 236 | |||
| 237 | $FRAMESIZE=16*$SZ+16*$SZREG; | ||
| 238 | $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; | ||
| 239 | |||
| 240 | $code.=<<___; | ||
| 241 | #ifdef OPENSSL_FIPSCANISTER | ||
| 242 | # include <openssl/fipssyms.h> | ||
| 243 | #endif | ||
| 244 | |||
| 245 | .text | ||
| 246 | .set noat | ||
| 247 | #if !defined(__vxworks) || defined(__pic__) | ||
| 248 | .option pic2 | ||
| 249 | #endif | ||
| 250 | |||
| 251 | .align 5 | ||
| 252 | .globl sha${label}_block_data_order | ||
| 253 | .ent sha${label}_block_data_order | ||
| 254 | sha${label}_block_data_order: | ||
| 255 | .frame $sp,$FRAMESIZE,$ra | ||
| 256 | .mask $SAVED_REGS_MASK,-$SZREG | ||
| 257 | .set noreorder | ||
| 258 | ___ | ||
| 259 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
| 260 | .cpload $pf | ||
| 261 | ___ | ||
| 262 | $code.=<<___; | ||
| 263 | $PTR_SUB $sp,$FRAMESIZE | ||
| 264 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 265 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 266 | $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 267 | $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 268 | $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 269 | $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 270 | $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 271 | $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 272 | $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 273 | $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 274 | ___ | ||
| 275 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
| 276 | $REG_S $s3,$FRAMESIZE-11*$SZREG($sp) | ||
| 277 | $REG_S $s2,$FRAMESIZE-12*$SZREG($sp) | ||
| 278 | $REG_S $s1,$FRAMESIZE-13*$SZREG($sp) | ||
| 279 | $REG_S $s0,$FRAMESIZE-14*$SZREG($sp) | ||
| 280 | $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 281 | ___ | ||
| 282 | $code.=<<___; | ||
| 283 | $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)` | ||
| 284 | ___ | ||
| 285 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
| 286 | .cplocal $Ktbl | ||
| 287 | .cpsetup $pf,$zero,sha${label}_block_data_order | ||
| 288 | ___ | ||
| 289 | $code.=<<___; | ||
| 290 | .set reorder | ||
| 291 | la $Ktbl,K${label} # PIC-ified 'load address' | ||
| 292 | |||
| 293 | $LD $A,0*$SZ($ctx) # load context | ||
| 294 | $LD $B,1*$SZ($ctx) | ||
| 295 | $LD $C,2*$SZ($ctx) | ||
| 296 | $LD $D,3*$SZ($ctx) | ||
| 297 | $LD $E,4*$SZ($ctx) | ||
| 298 | $LD $F,5*$SZ($ctx) | ||
| 299 | $LD $G,6*$SZ($ctx) | ||
| 300 | $LD $H,7*$SZ($ctx) | ||
| 301 | |||
| 302 | $PTR_ADD @X[15],$inp # pointer to the end of input | ||
| 303 | $REG_S @X[15],16*$SZ($sp) | ||
| 304 | b .Loop | ||
| 305 | |||
| 306 | .align 5 | ||
| 307 | .Loop: | ||
| 308 | ${LD}l @X[0],$MSB($inp) | ||
| 309 | ${LD}r @X[0],$LSB($inp) | ||
| 310 | ___ | ||
| 311 | for ($i=0;$i<16;$i++) | ||
| 312 | { &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } | ||
| 313 | $code.=<<___; | ||
| 314 | b .L16_xx | ||
| 315 | .align 4 | ||
| 316 | .L16_xx: | ||
| 317 | ___ | ||
| 318 | for (;$i<32;$i++) | ||
| 319 | { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } | ||
| 320 | $code.=<<___; | ||
| 321 | and @X[6],0xfff | ||
| 322 | li @X[7],$lastK | ||
| 323 | .set noreorder | ||
| 324 | bne @X[6],@X[7],.L16_xx | ||
| 325 | $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16 | ||
| 326 | |||
| 327 | $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input | ||
| 328 | $LD @X[0],0*$SZ($ctx) | ||
| 329 | $LD @X[1],1*$SZ($ctx) | ||
| 330 | $LD @X[2],2*$SZ($ctx) | ||
| 331 | $PTR_ADD $inp,16*$SZ | ||
| 332 | $LD @X[3],3*$SZ($ctx) | ||
| 333 | $ADDU $A,@X[0] | ||
| 334 | $LD @X[4],4*$SZ($ctx) | ||
| 335 | $ADDU $B,@X[1] | ||
| 336 | $LD @X[5],5*$SZ($ctx) | ||
| 337 | $ADDU $C,@X[2] | ||
| 338 | $LD @X[6],6*$SZ($ctx) | ||
| 339 | $ADDU $D,@X[3] | ||
| 340 | $LD @X[7],7*$SZ($ctx) | ||
| 341 | $ADDU $E,@X[4] | ||
| 342 | $ST $A,0*$SZ($ctx) | ||
| 343 | $ADDU $F,@X[5] | ||
| 344 | $ST $B,1*$SZ($ctx) | ||
| 345 | $ADDU $G,@X[6] | ||
| 346 | $ST $C,2*$SZ($ctx) | ||
| 347 | $ADDU $H,@X[7] | ||
| 348 | $ST $D,3*$SZ($ctx) | ||
| 349 | $ST $E,4*$SZ($ctx) | ||
| 350 | $ST $F,5*$SZ($ctx) | ||
| 351 | $ST $G,6*$SZ($ctx) | ||
| 352 | $ST $H,7*$SZ($ctx) | ||
| 353 | |||
| 354 | bnel $inp,@X[15],.Loop | ||
| 355 | $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl | ||
| 356 | |||
| 357 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
| 358 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
| 359 | $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) | ||
| 360 | $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) | ||
| 361 | $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) | ||
| 362 | $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) | ||
| 363 | $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) | ||
| 364 | $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) | ||
| 365 | $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) | ||
| 366 | $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) | ||
| 367 | ___ | ||
| 368 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
| 369 | $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) | ||
| 370 | $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) | ||
| 371 | $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) | ||
| 372 | $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) | ||
| 373 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
| 374 | ___ | ||
| 375 | $code.=<<___; | ||
| 376 | jr $ra | ||
| 377 | $PTR_ADD $sp,$FRAMESIZE | ||
| 378 | .end sha${label}_block_data_order | ||
| 379 | |||
| 380 | .rdata | ||
| 381 | .align 5 | ||
| 382 | K${label}: | ||
| 383 | ___ | ||
| 384 | if ($SZ==4) { | ||
| 385 | $code.=<<___; | ||
| 386 | .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 | ||
| 387 | .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 | ||
| 388 | .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 | ||
| 389 | .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 | ||
| 390 | .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc | ||
| 391 | .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da | ||
| 392 | .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 | ||
| 393 | .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 | ||
| 394 | .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 | ||
| 395 | .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 | ||
| 396 | .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 | ||
| 397 | .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 | ||
| 398 | .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 | ||
| 399 | .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 | ||
| 400 | .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 | ||
| 401 | .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | ||
| 402 | ___ | ||
| 403 | } else { | ||
| 404 | $code.=<<___; | ||
| 405 | .dword 0x428a2f98d728ae22, 0x7137449123ef65cd | ||
| 406 | .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc | ||
| 407 | .dword 0x3956c25bf348b538, 0x59f111f1b605d019 | ||
| 408 | .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 | ||
| 409 | .dword 0xd807aa98a3030242, 0x12835b0145706fbe | ||
| 410 | .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 | ||
| 411 | .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 | ||
| 412 | .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 | ||
| 413 | .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 | ||
| 414 | .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 | ||
| 415 | .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 | ||
| 416 | .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 | ||
| 417 | .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 | ||
| 418 | .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 | ||
| 419 | .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 | ||
| 420 | .dword 0x06ca6351e003826f, 0x142929670a0e6e70 | ||
| 421 | .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 | ||
| 422 | .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df | ||
| 423 | .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 | ||
| 424 | .dword 0x81c2c92e47edaee6, 0x92722c851482353b | ||
| 425 | .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 | ||
| 426 | .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 | ||
| 427 | .dword 0xd192e819d6ef5218, 0xd69906245565a910 | ||
| 428 | .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 | ||
| 429 | .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 | ||
| 430 | .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 | ||
| 431 | .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb | ||
| 432 | .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 | ||
| 433 | .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 | ||
| 434 | .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec | ||
| 435 | .dword 0x90befffa23631e28, 0xa4506cebde82bde9 | ||
| 436 | .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b | ||
| 437 | .dword 0xca273eceea26619c, 0xd186b8c721c0c207 | ||
| 438 | .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 | ||
| 439 | .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 | ||
| 440 | .dword 0x113f9804bef90dae, 0x1b710b35131c471b | ||
| 441 | .dword 0x28db77f523047d84, 0x32caab7b40c72493 | ||
| 442 | .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c | ||
| 443 | .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a | ||
| 444 | .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 | ||
| 445 | ___ | ||
| 446 | } | ||
| 447 | $code.=<<___; | ||
| 448 | .asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 449 | .align 5 | ||
| 450 | |||
| 451 | ___ | ||
| 452 | |||
| 453 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 454 | print $code; | ||
| 455 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha512-parisc.pl b/src/lib/libcrypto/sha/asm/sha512-parisc.pl new file mode 100755 index 0000000000..e24ee58ae9 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha512-parisc.pl | |||
| @@ -0,0 +1,791 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # SHA256/512 block procedure for PA-RISC. | ||
| 11 | |||
| 12 | # June 2009. | ||
| 13 | # | ||
| 14 | # SHA256 performance is >75% better than gcc 3.2 generated code on | ||
| 15 | # PA-7100LC. Compared to code generated by vendor compiler this | ||
| 16 | # implementation is almost 70% faster in 64-bit build, but delivers | ||
| 17 | # virtually same performance in 32-bit build on PA-8600. | ||
| 18 | # | ||
| 19 | # SHA512 performance is >2.9x better than gcc 3.2 generated code on | ||
| 20 | # PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the | ||
| 21 | # code is executed on PA-RISC 2.0 processor and switches to 64-bit | ||
| 22 | # code path delivering adequate peformance even in "blended" 32-bit | ||
| 23 | # build. Though 64-bit code is not any faster than code generated by | ||
| 24 | # vendor compiler on PA-8600... | ||
| 25 | # | ||
| 26 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 27 | |||
| 28 | $flavour = shift; | ||
| 29 | $output = shift; | ||
| 30 | open STDOUT,">$output"; | ||
| 31 | |||
| 32 | if ($flavour =~ /64/) { | ||
| 33 | $LEVEL ="2.0W"; | ||
| 34 | $SIZE_T =8; | ||
| 35 | $FRAME_MARKER =80; | ||
| 36 | $SAVED_RP =16; | ||
| 37 | $PUSH ="std"; | ||
| 38 | $PUSHMA ="std,ma"; | ||
| 39 | $POP ="ldd"; | ||
| 40 | $POPMB ="ldd,mb"; | ||
| 41 | } else { | ||
| 42 | $LEVEL ="1.0"; | ||
| 43 | $SIZE_T =4; | ||
| 44 | $FRAME_MARKER =48; | ||
| 45 | $SAVED_RP =20; | ||
| 46 | $PUSH ="stw"; | ||
| 47 | $PUSHMA ="stwm"; | ||
| 48 | $POP ="ldw"; | ||
| 49 | $POPMB ="ldwm"; | ||
| 50 | } | ||
| 51 | |||
| 52 | if ($output =~ /512/) { | ||
| 53 | $func="sha512_block_data_order"; | ||
| 54 | $SZ=8; | ||
| 55 | @Sigma0=(28,34,39); | ||
| 56 | @Sigma1=(14,18,41); | ||
| 57 | @sigma0=(1, 8, 7); | ||
| 58 | @sigma1=(19,61, 6); | ||
| 59 | $rounds=80; | ||
| 60 | $LAST10BITS=0x017; | ||
| 61 | $LD="ldd"; | ||
| 62 | $LDM="ldd,ma"; | ||
| 63 | $ST="std"; | ||
| 64 | } else { | ||
| 65 | $func="sha256_block_data_order"; | ||
| 66 | $SZ=4; | ||
| 67 | @Sigma0=( 2,13,22); | ||
| 68 | @Sigma1=( 6,11,25); | ||
| 69 | @sigma0=( 7,18, 3); | ||
| 70 | @sigma1=(17,19,10); | ||
| 71 | $rounds=64; | ||
| 72 | $LAST10BITS=0x0f2; | ||
| 73 | $LD="ldw"; | ||
| 74 | $LDM="ldwm"; | ||
| 75 | $ST="stw"; | ||
| 76 | } | ||
| 77 | |||
| 78 | $FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker | ||
| 79 | # [+ argument transfer] | ||
| 80 | $XOFF=16*$SZ+32; # local variables | ||
| 81 | $FRAME+=$XOFF; | ||
| 82 | $XOFF+=$FRAME_MARKER; # distance between %sp and local variables | ||
| 83 | |||
| 84 | $ctx="%r26"; # zapped by $a0 | ||
| 85 | $inp="%r25"; # zapped by $a1 | ||
| 86 | $num="%r24"; # zapped by $t0 | ||
| 87 | |||
| 88 | $a0 ="%r26"; | ||
| 89 | $a1 ="%r25"; | ||
| 90 | $t0 ="%r24"; | ||
| 91 | $t1 ="%r29"; | ||
| 92 | $Tbl="%r31"; | ||
| 93 | |||
| 94 | @V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28"); | ||
| 95 | |||
| 96 | @X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", | ||
| 97 | "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp); | ||
| 98 | |||
| 99 | sub ROUND_00_15 { | ||
| 100 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
| 101 | $code.=<<___; | ||
| 102 | _ror $e,$Sigma1[0],$a0 | ||
| 103 | and $f,$e,$t0 | ||
| 104 | _ror $e,$Sigma1[1],$a1 | ||
| 105 | addl $t1,$h,$h | ||
| 106 | andcm $g,$e,$t1 | ||
| 107 | xor $a1,$a0,$a0 | ||
| 108 | _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1 | ||
| 109 | or $t0,$t1,$t1 ; Ch(e,f,g) | ||
| 110 | addl @X[$i%16],$h,$h | ||
| 111 | xor $a0,$a1,$a1 ; Sigma1(e) | ||
| 112 | addl $t1,$h,$h | ||
| 113 | _ror $a,$Sigma0[0],$a0 | ||
| 114 | addl $a1,$h,$h | ||
| 115 | |||
| 116 | _ror $a,$Sigma0[1],$a1 | ||
| 117 | and $a,$b,$t0 | ||
| 118 | and $a,$c,$t1 | ||
| 119 | xor $a1,$a0,$a0 | ||
| 120 | _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1 | ||
| 121 | xor $t1,$t0,$t0 | ||
| 122 | and $b,$c,$t1 | ||
| 123 | xor $a0,$a1,$a1 ; Sigma0(a) | ||
| 124 | addl $h,$d,$d | ||
| 125 | xor $t1,$t0,$t0 ; Maj(a,b,c) | ||
| 126 | `"$LDM $SZ($Tbl),$t1" if ($i<15)` | ||
| 127 | addl $a1,$h,$h | ||
| 128 | addl $t0,$h,$h | ||
| 129 | |||
| 130 | ___ | ||
| 131 | } | ||
| 132 | |||
| 133 | sub ROUND_16_xx { | ||
| 134 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
| 135 | $i-=16; | ||
| 136 | $code.=<<___; | ||
| 137 | _ror @X[($i+1)%16],$sigma0[0],$a0 | ||
| 138 | _ror @X[($i+1)%16],$sigma0[1],$a1 | ||
| 139 | addl @X[($i+9)%16],@X[$i],@X[$i] | ||
| 140 | _ror @X[($i+14)%16],$sigma1[0],$t0 | ||
| 141 | _ror @X[($i+14)%16],$sigma1[1],$t1 | ||
| 142 | xor $a1,$a0,$a0 | ||
| 143 | _shr @X[($i+1)%16],$sigma0[2],$a1 | ||
| 144 | xor $t1,$t0,$t0 | ||
| 145 | _shr @X[($i+14)%16],$sigma1[2],$t1 | ||
| 146 | xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f]) | ||
| 147 | xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f]) | ||
| 148 | $LDM $SZ($Tbl),$t1 | ||
| 149 | addl $a0,@X[$i],@X[$i] | ||
| 150 | addl $t0,@X[$i],@X[$i] | ||
| 151 | ___ | ||
| 152 | $code.=<<___ if ($i==15); | ||
| 153 | extru $t1,31,10,$a1 | ||
| 154 | comiclr,<> $LAST10BITS,$a1,%r0 | ||
| 155 | ldo 1($Tbl),$Tbl ; signal end of $Tbl | ||
| 156 | ___ | ||
| 157 | &ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h); | ||
| 158 | } | ||
| 159 | |||
| 160 | $code=<<___; | ||
| 161 | .LEVEL $LEVEL | ||
| 162 | .SPACE \$TEXT\$ | ||
| 163 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 164 | |||
| 165 | .ALIGN 64 | ||
| 166 | L\$table | ||
| 167 | ___ | ||
| 168 | $code.=<<___ if ($SZ==8); | ||
| 169 | .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd | ||
| 170 | .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc | ||
| 171 | .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019 | ||
| 172 | .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118 | ||
| 173 | .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe | ||
| 174 | .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2 | ||
| 175 | .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1 | ||
| 176 | .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694 | ||
| 177 | .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3 | ||
| 178 | .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65 | ||
| 179 | .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483 | ||
| 180 | .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5 | ||
| 181 | .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210 | ||
| 182 | .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4 | ||
| 183 | .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725 | ||
| 184 | .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70 | ||
| 185 | .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926 | ||
| 186 | .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df | ||
| 187 | .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8 | ||
| 188 | .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b | ||
| 189 | .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001 | ||
| 190 | .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30 | ||
| 191 | .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910 | ||
| 192 | .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8 | ||
| 193 | .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53 | ||
| 194 | .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8 | ||
| 195 | .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb | ||
| 196 | .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3 | ||
| 197 | .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60 | ||
| 198 | .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec | ||
| 199 | .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9 | ||
| 200 | .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b | ||
| 201 | .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207 | ||
| 202 | .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178 | ||
| 203 | .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6 | ||
| 204 | .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b | ||
| 205 | .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493 | ||
| 206 | .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c | ||
| 207 | .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a | ||
| 208 | .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817 | ||
| 209 | ___ | ||
| 210 | $code.=<<___ if ($SZ==4); | ||
| 211 | .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
| 212 | .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
| 213 | .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
| 214 | .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
| 215 | .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
| 216 | .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
| 217 | .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
| 218 | .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
| 219 | .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
| 220 | .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
| 221 | .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
| 222 | .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
| 223 | .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
| 224 | .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
| 225 | .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
| 226 | .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
| 227 | ___ | ||
| 228 | $code.=<<___; | ||
| 229 | |||
| 230 | .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 231 | .ALIGN 64 | ||
| 232 | $func | ||
| 233 | .PROC | ||
| 234 | .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 | ||
| 235 | .ENTRY | ||
| 236 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 237 | $PUSHMA %r3,$FRAME(%sp) | ||
| 238 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 239 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 240 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 241 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
| 242 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
| 243 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
| 244 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
| 245 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
| 246 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
| 247 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
| 248 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
| 249 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
| 250 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
| 251 | $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) | ||
| 252 | $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) | ||
| 253 | |||
| 254 | _shl $num,`log(16*$SZ)/log(2)`,$num | ||
| 255 | addl $inp,$num,$num ; $num to point at the end of $inp | ||
| 256 | |||
| 257 | $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments | ||
| 258 | $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) | ||
| 259 | $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp) | ||
| 260 | |||
| 261 | blr %r0,$Tbl | ||
| 262 | ldi 3,$t1 | ||
| 263 | L\$pic | ||
| 264 | andcm $Tbl,$t1,$Tbl ; wipe privilege level | ||
| 265 | ldo L\$table-L\$pic($Tbl),$Tbl | ||
| 266 | ___ | ||
| 267 | $code.=<<___ if ($SZ==8 && $SIZE_T==4); | ||
| 268 | ldi 31,$t1 | ||
| 269 | mtctl $t1,%cr11 | ||
| 270 | extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0 | ||
| 271 | b L\$parisc1 | ||
| 272 | nop | ||
| 273 | ___ | ||
| 274 | $code.=<<___; | ||
| 275 | $LD `0*$SZ`($ctx),$A ; load context | ||
| 276 | $LD `1*$SZ`($ctx),$B | ||
| 277 | $LD `2*$SZ`($ctx),$C | ||
| 278 | $LD `3*$SZ`($ctx),$D | ||
| 279 | $LD `4*$SZ`($ctx),$E | ||
| 280 | $LD `5*$SZ`($ctx),$F | ||
| 281 | $LD `6*$SZ`($ctx),$G | ||
| 282 | $LD `7*$SZ`($ctx),$H | ||
| 283 | |||
| 284 | extru $inp,31,`log($SZ)/log(2)`,$t0 | ||
| 285 | sh3addl $t0,%r0,$t0 | ||
| 286 | subi `8*$SZ`,$t0,$t0 | ||
| 287 | mtctl $t0,%cr11 ; load %sar with align factor | ||
| 288 | |||
| 289 | L\$oop | ||
| 290 | ldi `$SZ-1`,$t0 | ||
| 291 | $LDM $SZ($Tbl),$t1 | ||
| 292 | andcm $inp,$t0,$t0 ; align $inp | ||
| 293 | ___ | ||
| 294 | for ($i=0;$i<15;$i++) { # load input block | ||
| 295 | $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; } | ||
| 296 | $code.=<<___; | ||
| 297 | cmpb,*= $inp,$t0,L\$aligned | ||
| 298 | $LD `$SZ*15`($t0),@X[15] | ||
| 299 | $LD `$SZ*16`($t0),@X[16] | ||
| 300 | ___ | ||
| 301 | for ($i=0;$i<16;$i++) { # align data | ||
| 302 | $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; } | ||
| 303 | $code.=<<___; | ||
| 304 | L\$aligned | ||
| 305 | nop ; otherwise /usr/ccs/bin/as is confused by below .WORD | ||
| 306 | ___ | ||
| 307 | |||
| 308 | for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } | ||
| 309 | $code.=<<___; | ||
| 310 | L\$rounds | ||
| 311 | nop ; otherwise /usr/ccs/bin/as is confused by below .WORD | ||
| 312 | ___ | ||
| 313 | for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); } | ||
| 314 | $code.=<<___; | ||
| 315 | bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled? | ||
| 316 | nop | ||
| 317 | |||
| 318 | $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments | ||
| 319 | $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp | ||
| 320 | $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num | ||
| 321 | ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl | ||
| 322 | |||
| 323 | $LD `0*$SZ`($ctx),@X[0] ; load context | ||
| 324 | $LD `1*$SZ`($ctx),@X[1] | ||
| 325 | $LD `2*$SZ`($ctx),@X[2] | ||
| 326 | $LD `3*$SZ`($ctx),@X[3] | ||
| 327 | $LD `4*$SZ`($ctx),@X[4] | ||
| 328 | $LD `5*$SZ`($ctx),@X[5] | ||
| 329 | addl @X[0],$A,$A | ||
| 330 | $LD `6*$SZ`($ctx),@X[6] | ||
| 331 | addl @X[1],$B,$B | ||
| 332 | $LD `7*$SZ`($ctx),@X[7] | ||
| 333 | ldo `16*$SZ`($inp),$inp ; advance $inp | ||
| 334 | |||
| 335 | $ST $A,`0*$SZ`($ctx) ; save context | ||
| 336 | addl @X[2],$C,$C | ||
| 337 | $ST $B,`1*$SZ`($ctx) | ||
| 338 | addl @X[3],$D,$D | ||
| 339 | $ST $C,`2*$SZ`($ctx) | ||
| 340 | addl @X[4],$E,$E | ||
| 341 | $ST $D,`3*$SZ`($ctx) | ||
| 342 | addl @X[5],$F,$F | ||
| 343 | $ST $E,`4*$SZ`($ctx) | ||
| 344 | addl @X[6],$G,$G | ||
| 345 | $ST $F,`5*$SZ`($ctx) | ||
| 346 | addl @X[7],$H,$H | ||
| 347 | $ST $G,`6*$SZ`($ctx) | ||
| 348 | $ST $H,`7*$SZ`($ctx) | ||
| 349 | |||
| 350 | cmpb,*<>,n $inp,$num,L\$oop | ||
| 351 | $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp | ||
| 352 | ___ | ||
| 353 | if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0 | ||
| 354 | {{ | ||
| 355 | $code.=<<___; | ||
| 356 | b L\$done | ||
| 357 | nop | ||
| 358 | |||
| 359 | .ALIGN 64 | ||
| 360 | L\$parisc1 | ||
| 361 | ___ | ||
| 362 | |||
| 363 | @V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo, | ||
| 364 | $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) = | ||
| 365 | ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", | ||
| 366 | "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16"); | ||
| 367 | $a0 ="%r17"; | ||
| 368 | $a1 ="%r18"; | ||
| 369 | $a2 ="%r19"; | ||
| 370 | $a3 ="%r20"; | ||
| 371 | $t0 ="%r21"; | ||
| 372 | $t1 ="%r22"; | ||
| 373 | $t2 ="%r28"; | ||
| 374 | $t3 ="%r29"; | ||
| 375 | $Tbl="%r31"; | ||
| 376 | |||
| 377 | @X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx | ||
| 378 | |||
| 379 | sub ROUND_00_15_pa1 { | ||
| 380 | my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo, | ||
| 381 | $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_; | ||
| 382 | my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; | ||
| 383 | |||
| 384 | $code.=<<___ if (!$flag); | ||
| 385 | ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi | ||
| 386 | ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] | ||
| 387 | ___ | ||
| 388 | $code.=<<___; | ||
| 389 | shd $ehi,$elo,$Sigma1[0],$t0 | ||
| 390 | add $Xlo,$hlo,$hlo | ||
| 391 | shd $elo,$ehi,$Sigma1[0],$t1 | ||
| 392 | addc $Xhi,$hhi,$hhi ; h += X[i] | ||
| 393 | shd $ehi,$elo,$Sigma1[1],$t2 | ||
| 394 | ldwm 8($Tbl),$Xhi | ||
| 395 | shd $elo,$ehi,$Sigma1[1],$t3 | ||
| 396 | ldw -4($Tbl),$Xlo ; load K[i] | ||
| 397 | xor $t2,$t0,$t0 | ||
| 398 | xor $t3,$t1,$t1 | ||
| 399 | and $flo,$elo,$a0 | ||
| 400 | and $fhi,$ehi,$a1 | ||
| 401 | shd $ehi,$elo,$Sigma1[2],$t2 | ||
| 402 | andcm $glo,$elo,$a2 | ||
| 403 | shd $elo,$ehi,$Sigma1[2],$t3 | ||
| 404 | andcm $ghi,$ehi,$a3 | ||
| 405 | xor $t2,$t0,$t0 | ||
| 406 | xor $t3,$t1,$t1 ; Sigma1(e) | ||
| 407 | add $Xlo,$hlo,$hlo | ||
| 408 | xor $a2,$a0,$a0 | ||
| 409 | addc $Xhi,$hhi,$hhi ; h += K[i] | ||
| 410 | xor $a3,$a1,$a1 ; Ch(e,f,g) | ||
| 411 | |||
| 412 | add $t0,$hlo,$hlo | ||
| 413 | shd $ahi,$alo,$Sigma0[0],$t0 | ||
| 414 | addc $t1,$hhi,$hhi ; h += Sigma1(e) | ||
| 415 | shd $alo,$ahi,$Sigma0[0],$t1 | ||
| 416 | add $a0,$hlo,$hlo | ||
| 417 | shd $ahi,$alo,$Sigma0[1],$t2 | ||
| 418 | addc $a1,$hhi,$hhi ; h += Ch(e,f,g) | ||
| 419 | shd $alo,$ahi,$Sigma0[1],$t3 | ||
| 420 | |||
| 421 | xor $t2,$t0,$t0 | ||
| 422 | xor $t3,$t1,$t1 | ||
| 423 | shd $ahi,$alo,$Sigma0[2],$t2 | ||
| 424 | and $alo,$blo,$a0 | ||
| 425 | shd $alo,$ahi,$Sigma0[2],$t3 | ||
| 426 | and $ahi,$bhi,$a1 | ||
| 427 | xor $t2,$t0,$t0 | ||
| 428 | xor $t3,$t1,$t1 ; Sigma0(a) | ||
| 429 | |||
| 430 | and $alo,$clo,$a2 | ||
| 431 | and $ahi,$chi,$a3 | ||
| 432 | xor $a2,$a0,$a0 | ||
| 433 | add $hlo,$dlo,$dlo | ||
| 434 | xor $a3,$a1,$a1 | ||
| 435 | addc $hhi,$dhi,$dhi ; d += h | ||
| 436 | and $blo,$clo,$a2 | ||
| 437 | add $t0,$hlo,$hlo | ||
| 438 | and $bhi,$chi,$a3 | ||
| 439 | addc $t1,$hhi,$hhi ; h += Sigma0(a) | ||
| 440 | xor $a2,$a0,$a0 | ||
| 441 | add $a0,$hlo,$hlo | ||
| 442 | xor $a3,$a1,$a1 ; Maj(a,b,c) | ||
| 443 | addc $a1,$hhi,$hhi ; h += Maj(a,b,c) | ||
| 444 | |||
| 445 | ___ | ||
| 446 | $code.=<<___ if ($i==15 && $flag); | ||
| 447 | extru $Xlo,31,10,$Xlo | ||
| 448 | comiclr,= $LAST10BITS,$Xlo,%r0 | ||
| 449 | b L\$rounds_pa1 | ||
| 450 | nop | ||
| 451 | ___ | ||
| 452 | push(@X,shift(@X)); push(@X,shift(@X)); | ||
| 453 | } | ||
| 454 | |||
| 455 | sub ROUND_16_xx_pa1 { | ||
| 456 | my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; | ||
| 457 | my ($i)=shift; | ||
| 458 | $i-=16; | ||
| 459 | $code.=<<___; | ||
| 460 | ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi | ||
| 461 | ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] | ||
| 462 | ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1 | ||
| 463 | ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9] | ||
| 464 | ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3 | ||
| 465 | ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14] | ||
| 466 | shd $Xnhi,$Xnlo,$sigma0[0],$t0 | ||
| 467 | shd $Xnlo,$Xnhi,$sigma0[0],$t1 | ||
| 468 | add $a0,$Xlo,$Xlo | ||
| 469 | shd $Xnhi,$Xnlo,$sigma0[1],$t2 | ||
| 470 | addc $a1,$Xhi,$Xhi | ||
| 471 | shd $Xnlo,$Xnhi,$sigma0[1],$t3 | ||
| 472 | xor $t2,$t0,$t0 | ||
| 473 | shd $Xnhi,$Xnlo,$sigma0[2],$t2 | ||
| 474 | xor $t3,$t1,$t1 | ||
| 475 | extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3 | ||
| 476 | xor $t2,$t0,$t0 | ||
| 477 | shd $a3,$a2,$sigma1[0],$a0 | ||
| 478 | xor $t3,$t1,$t1 ; sigma0(X[i+1)&0x0f]) | ||
| 479 | shd $a2,$a3,$sigma1[0],$a1 | ||
| 480 | add $t0,$Xlo,$Xlo | ||
| 481 | shd $a3,$a2,$sigma1[1],$t2 | ||
| 482 | addc $t1,$Xhi,$Xhi | ||
| 483 | shd $a2,$a3,$sigma1[1],$t3 | ||
| 484 | xor $t2,$a0,$a0 | ||
| 485 | shd $a3,$a2,$sigma1[2],$t2 | ||
| 486 | xor $t3,$a1,$a1 | ||
| 487 | extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3 | ||
| 488 | xor $t2,$a0,$a0 | ||
| 489 | xor $t3,$a1,$a1 ; sigma0(X[i+14)&0x0f]) | ||
| 490 | add $a0,$Xlo,$Xlo | ||
| 491 | addc $a1,$Xhi,$Xhi | ||
| 492 | |||
| 493 | stw $Xhi,`-$XOFF+8*($i%16)`(%sp) | ||
| 494 | stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp) | ||
| 495 | ___ | ||
| 496 | &ROUND_00_15_pa1($i,@_,1); | ||
| 497 | } | ||
| 498 | $code.=<<___; | ||
| 499 | ldw `0*4`($ctx),$Ahi ; load context | ||
| 500 | ldw `1*4`($ctx),$Alo | ||
| 501 | ldw `2*4`($ctx),$Bhi | ||
| 502 | ldw `3*4`($ctx),$Blo | ||
| 503 | ldw `4*4`($ctx),$Chi | ||
| 504 | ldw `5*4`($ctx),$Clo | ||
| 505 | ldw `6*4`($ctx),$Dhi | ||
| 506 | ldw `7*4`($ctx),$Dlo | ||
| 507 | ldw `8*4`($ctx),$Ehi | ||
| 508 | ldw `9*4`($ctx),$Elo | ||
| 509 | ldw `10*4`($ctx),$Fhi | ||
| 510 | ldw `11*4`($ctx),$Flo | ||
| 511 | ldw `12*4`($ctx),$Ghi | ||
| 512 | ldw `13*4`($ctx),$Glo | ||
| 513 | ldw `14*4`($ctx),$Hhi | ||
| 514 | ldw `15*4`($ctx),$Hlo | ||
| 515 | |||
| 516 | extru $inp,31,2,$t0 | ||
| 517 | sh3addl $t0,%r0,$t0 | ||
| 518 | subi 32,$t0,$t0 | ||
| 519 | mtctl $t0,%cr11 ; load %sar with align factor | ||
| 520 | |||
| 521 | L\$oop_pa1 | ||
| 522 | extru $inp,31,2,$a3 | ||
| 523 | comib,= 0,$a3,L\$aligned_pa1 | ||
| 524 | sub $inp,$a3,$inp | ||
| 525 | |||
| 526 | ldw `0*4`($inp),$X[0] | ||
| 527 | ldw `1*4`($inp),$X[1] | ||
| 528 | ldw `2*4`($inp),$t2 | ||
| 529 | ldw `3*4`($inp),$t3 | ||
| 530 | ldw `4*4`($inp),$a0 | ||
| 531 | ldw `5*4`($inp),$a1 | ||
| 532 | ldw `6*4`($inp),$a2 | ||
| 533 | ldw `7*4`($inp),$a3 | ||
| 534 | vshd $X[0],$X[1],$X[0] | ||
| 535 | vshd $X[1],$t2,$X[1] | ||
| 536 | stw $X[0],`-$XOFF+0*4`(%sp) | ||
| 537 | ldw `8*4`($inp),$t0 | ||
| 538 | vshd $t2,$t3,$t2 | ||
| 539 | stw $X[1],`-$XOFF+1*4`(%sp) | ||
| 540 | ldw `9*4`($inp),$t1 | ||
| 541 | vshd $t3,$a0,$t3 | ||
| 542 | ___ | ||
| 543 | { | ||
| 544 | my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); | ||
| 545 | for ($i=2;$i<=(128/4-8);$i++) { | ||
| 546 | $code.=<<___; | ||
| 547 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
| 548 | ldw `(8+$i)*4`($inp),$t[0] | ||
| 549 | vshd $t[1],$t[2],$t[1] | ||
| 550 | ___ | ||
| 551 | push(@t,shift(@t)); | ||
| 552 | } | ||
| 553 | for (;$i<(128/4-1);$i++) { | ||
| 554 | $code.=<<___; | ||
| 555 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
| 556 | vshd $t[1],$t[2],$t[1] | ||
| 557 | ___ | ||
| 558 | push(@t,shift(@t)); | ||
| 559 | } | ||
| 560 | $code.=<<___; | ||
| 561 | b L\$collected_pa1 | ||
| 562 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
| 563 | |||
| 564 | ___ | ||
| 565 | } | ||
| 566 | $code.=<<___; | ||
| 567 | L\$aligned_pa1 | ||
| 568 | ldw `0*4`($inp),$X[0] | ||
| 569 | ldw `1*4`($inp),$X[1] | ||
| 570 | ldw `2*4`($inp),$t2 | ||
| 571 | ldw `3*4`($inp),$t3 | ||
| 572 | ldw `4*4`($inp),$a0 | ||
| 573 | ldw `5*4`($inp),$a1 | ||
| 574 | ldw `6*4`($inp),$a2 | ||
| 575 | ldw `7*4`($inp),$a3 | ||
| 576 | stw $X[0],`-$XOFF+0*4`(%sp) | ||
| 577 | ldw `8*4`($inp),$t0 | ||
| 578 | stw $X[1],`-$XOFF+1*4`(%sp) | ||
| 579 | ldw `9*4`($inp),$t1 | ||
| 580 | ___ | ||
| 581 | { | ||
| 582 | my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); | ||
| 583 | for ($i=2;$i<(128/4-8);$i++) { | ||
| 584 | $code.=<<___; | ||
| 585 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
| 586 | ldw `(8+$i)*4`($inp),$t[0] | ||
| 587 | ___ | ||
| 588 | push(@t,shift(@t)); | ||
| 589 | } | ||
| 590 | for (;$i<128/4;$i++) { | ||
| 591 | $code.=<<___; | ||
| 592 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
| 593 | ___ | ||
| 594 | push(@t,shift(@t)); | ||
| 595 | } | ||
| 596 | $code.="L\$collected_pa1\n"; | ||
| 597 | } | ||
| 598 | |||
| 599 | for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } | ||
| 600 | $code.="L\$rounds_pa1\n"; | ||
| 601 | for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } | ||
| 602 | |||
| 603 | $code.=<<___; | ||
| 604 | $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments | ||
| 605 | $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp | ||
| 606 | $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num | ||
| 607 | ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl | ||
| 608 | |||
| 609 | ldw `0*4`($ctx),$t1 ; update context | ||
| 610 | ldw `1*4`($ctx),$t0 | ||
| 611 | ldw `2*4`($ctx),$t3 | ||
| 612 | ldw `3*4`($ctx),$t2 | ||
| 613 | ldw `4*4`($ctx),$a1 | ||
| 614 | ldw `5*4`($ctx),$a0 | ||
| 615 | ldw `6*4`($ctx),$a3 | ||
| 616 | add $t0,$Alo,$Alo | ||
| 617 | ldw `7*4`($ctx),$a2 | ||
| 618 | addc $t1,$Ahi,$Ahi | ||
| 619 | ldw `8*4`($ctx),$t1 | ||
| 620 | add $t2,$Blo,$Blo | ||
| 621 | ldw `9*4`($ctx),$t0 | ||
| 622 | addc $t3,$Bhi,$Bhi | ||
| 623 | ldw `10*4`($ctx),$t3 | ||
| 624 | add $a0,$Clo,$Clo | ||
| 625 | ldw `11*4`($ctx),$t2 | ||
| 626 | addc $a1,$Chi,$Chi | ||
| 627 | ldw `12*4`($ctx),$a1 | ||
| 628 | add $a2,$Dlo,$Dlo | ||
| 629 | ldw `13*4`($ctx),$a0 | ||
| 630 | addc $a3,$Dhi,$Dhi | ||
| 631 | ldw `14*4`($ctx),$a3 | ||
| 632 | add $t0,$Elo,$Elo | ||
| 633 | ldw `15*4`($ctx),$a2 | ||
| 634 | addc $t1,$Ehi,$Ehi | ||
| 635 | stw $Ahi,`0*4`($ctx) | ||
| 636 | add $t2,$Flo,$Flo | ||
| 637 | stw $Alo,`1*4`($ctx) | ||
| 638 | addc $t3,$Fhi,$Fhi | ||
| 639 | stw $Bhi,`2*4`($ctx) | ||
| 640 | add $a0,$Glo,$Glo | ||
| 641 | stw $Blo,`3*4`($ctx) | ||
| 642 | addc $a1,$Ghi,$Ghi | ||
| 643 | stw $Chi,`4*4`($ctx) | ||
| 644 | add $a2,$Hlo,$Hlo | ||
| 645 | stw $Clo,`5*4`($ctx) | ||
| 646 | addc $a3,$Hhi,$Hhi | ||
| 647 | stw $Dhi,`6*4`($ctx) | ||
| 648 | ldo `16*$SZ`($inp),$inp ; advance $inp | ||
| 649 | stw $Dlo,`7*4`($ctx) | ||
| 650 | stw $Ehi,`8*4`($ctx) | ||
| 651 | stw $Elo,`9*4`($ctx) | ||
| 652 | stw $Fhi,`10*4`($ctx) | ||
| 653 | stw $Flo,`11*4`($ctx) | ||
| 654 | stw $Ghi,`12*4`($ctx) | ||
| 655 | stw $Glo,`13*4`($ctx) | ||
| 656 | stw $Hhi,`14*4`($ctx) | ||
| 657 | comb,= $inp,$num,L\$done | ||
| 658 | stw $Hlo,`15*4`($ctx) | ||
| 659 | b L\$oop_pa1 | ||
| 660 | $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp | ||
| 661 | L\$done | ||
| 662 | ___ | ||
| 663 | }} | ||
| 664 | $code.=<<___; | ||
| 665 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
| 666 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 667 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 668 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 669 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
| 670 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
| 671 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
| 672 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
| 673 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
| 674 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
| 675 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
| 676 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
| 677 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
| 678 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
| 679 | $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 | ||
| 680 | $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 | ||
| 681 | bv (%r2) | ||
| 682 | .EXIT | ||
| 683 | $POPMB -$FRAME(%sp),%r3 | ||
| 684 | .PROCEND | ||
| 685 | .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 686 | ___ | ||
| 687 | |||
| 688 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
| 689 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
| 690 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
| 691 | # directive... | ||
| 692 | |||
| 693 | my $ldd = sub { | ||
| 694 | my ($mod,$args) = @_; | ||
| 695 | my $orig = "ldd$mod\t$args"; | ||
| 696 | |||
| 697 | if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices | ||
| 698 | { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1); | ||
| 699 | $opcode|=(1<<3) if ($mod =~ /^,m/); | ||
| 700 | $opcode|=(1<<2) if ($mod =~ /^,mb/); | ||
| 701 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 702 | } | ||
| 703 | else { "\t".$orig; } | ||
| 704 | }; | ||
| 705 | |||
| 706 | my $std = sub { | ||
| 707 | my ($mod,$args) = @_; | ||
| 708 | my $orig = "std$mod\t$args"; | ||
| 709 | |||
| 710 | if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices | ||
| 711 | { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); | ||
| 712 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 713 | } | ||
| 714 | else { "\t".$orig; } | ||
| 715 | }; | ||
| 716 | |||
| 717 | my $extrd = sub { | ||
| 718 | my ($mod,$args) = @_; | ||
| 719 | my $orig = "extrd$mod\t$args"; | ||
| 720 | |||
| 721 | # I only have ",u" completer, it's implicitly encoded... | ||
| 722 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 | ||
| 723 | { my $opcode=(0x36<<26)|($1<<21)|($4<<16); | ||
| 724 | my $len=32-$3; | ||
| 725 | $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos | ||
| 726 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
| 727 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 728 | } | ||
| 729 | elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 | ||
| 730 | { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); | ||
| 731 | my $len=32-$2; | ||
| 732 | $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len | ||
| 733 | $opcode |= (1<<13) if ($mod =~ /,\**=/); | ||
| 734 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 735 | } | ||
| 736 | else { "\t".$orig; } | ||
| 737 | }; | ||
| 738 | |||
| 739 | my $shrpd = sub { | ||
| 740 | my ($mod,$args) = @_; | ||
| 741 | my $orig = "shrpd$mod\t$args"; | ||
| 742 | |||
| 743 | if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 | ||
| 744 | { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; | ||
| 745 | my $cpos=63-$3; | ||
| 746 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa | ||
| 747 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
| 748 | } | ||
| 749 | elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 | ||
| 750 | { sprintf "\t.WORD\t0x%08x\t; %s", | ||
| 751 | (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; | ||
| 752 | } | ||
| 753 | else { "\t".$orig; } | ||
| 754 | }; | ||
| 755 | |||
| 756 | sub assemble { | ||
| 757 | my ($mnemonic,$mod,$args)=@_; | ||
| 758 | my $opcode = eval("\$$mnemonic"); | ||
| 759 | |||
| 760 | ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; | ||
| 761 | } | ||
| 762 | |||
| 763 | foreach (split("\n",$code)) { | ||
| 764 | s/\`([^\`]*)\`/eval $1/ge; | ||
| 765 | |||
| 766 | s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/ | ||
| 767 | $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32 | ||
| 768 | : sprintf("shd\t%$1,%$2,%d",$3)/e or | ||
| 769 | # translate made up instructons: _ror, _shr, _align, _shl | ||
| 770 | s/_ror(\s+)(%r[0-9]+),/ | ||
| 771 | ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or | ||
| 772 | |||
| 773 | s/_shr(\s+%r[0-9]+),([0-9]+),/ | ||
| 774 | $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2) | ||
| 775 | : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or | ||
| 776 | |||
| 777 | s/_align(\s+%r[0-9]+,%r[0-9]+),/ | ||
| 778 | ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or | ||
| 779 | |||
| 780 | s/_shl(\s+%r[0-9]+),([0-9]+),/ | ||
| 781 | $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2) | ||
| 782 | : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e; | ||
| 783 | |||
| 784 | s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4); | ||
| 785 | |||
| 786 | s/cmpb,\*/comb,/ if ($SIZE_T==4); | ||
| 787 | |||
| 788 | print $_,"\n"; | ||
| 789 | } | ||
| 790 | |||
| 791 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl index 768a6a6fad..6b44a68e59 100755 --- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl +++ b/src/lib/libcrypto/sha/asm/sha512-ppc.pl | |||
| @@ -40,6 +40,7 @@ $output =shift; | |||
| 40 | 40 | ||
| 41 | if ($flavour =~ /64/) { | 41 | if ($flavour =~ /64/) { |
| 42 | $SIZE_T=8; | 42 | $SIZE_T=8; |
| 43 | $LRSAVE=2*$SIZE_T; | ||
| 43 | $STU="stdu"; | 44 | $STU="stdu"; |
| 44 | $UCMP="cmpld"; | 45 | $UCMP="cmpld"; |
| 45 | $SHL="sldi"; | 46 | $SHL="sldi"; |
| @@ -47,6 +48,7 @@ if ($flavour =~ /64/) { | |||
| 47 | $PUSH="std"; | 48 | $PUSH="std"; |
| 48 | } elsif ($flavour =~ /32/) { | 49 | } elsif ($flavour =~ /32/) { |
| 49 | $SIZE_T=4; | 50 | $SIZE_T=4; |
| 51 | $LRSAVE=$SIZE_T; | ||
| 50 | $STU="stwu"; | 52 | $STU="stwu"; |
| 51 | $UCMP="cmplw"; | 53 | $UCMP="cmplw"; |
| 52 | $SHL="slwi"; | 54 | $SHL="slwi"; |
| @@ -87,7 +89,8 @@ if ($output =~ /512/) { | |||
| 87 | $SHR="srwi"; | 89 | $SHR="srwi"; |
| 88 | } | 90 | } |
| 89 | 91 | ||
| 90 | $FRAME=32*$SIZE_T; | 92 | $FRAME=32*$SIZE_T+16*$SZ; |
| 93 | $LOCALS=6*$SIZE_T; | ||
| 91 | 94 | ||
| 92 | $sp ="r1"; | 95 | $sp ="r1"; |
| 93 | $toc="r2"; | 96 | $toc="r2"; |
| @@ -179,13 +182,12 @@ $code=<<___; | |||
| 179 | .globl $func | 182 | .globl $func |
| 180 | .align 6 | 183 | .align 6 |
| 181 | $func: | 184 | $func: |
| 185 | $STU $sp,-$FRAME($sp) | ||
| 182 | mflr r0 | 186 | mflr r0 |
| 183 | $STU $sp,`-($FRAME+16*$SZ)`($sp) | ||
| 184 | $SHL $num,$num,`log(16*$SZ)/log(2)` | 187 | $SHL $num,$num,`log(16*$SZ)/log(2)` |
| 185 | 188 | ||
| 186 | $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) | 189 | $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) |
| 187 | 190 | ||
| 188 | $PUSH r0,`$FRAME-$SIZE_T*21`($sp) | ||
| 189 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | 191 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) |
| 190 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | 192 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) |
| 191 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | 193 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
| @@ -206,6 +208,7 @@ $func: | |||
| 206 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 208 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
| 207 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 209 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
| 208 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 210 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
| 211 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
| 209 | 212 | ||
| 210 | $LD $A,`0*$SZ`($ctx) | 213 | $LD $A,`0*$SZ`($ctx) |
| 211 | mr $inp,r4 ; incarnate $inp | 214 | mr $inp,r4 ; incarnate $inp |
| @@ -217,7 +220,7 @@ $func: | |||
| 217 | $LD $G,`6*$SZ`($ctx) | 220 | $LD $G,`6*$SZ`($ctx) |
| 218 | $LD $H,`7*$SZ`($ctx) | 221 | $LD $H,`7*$SZ`($ctx) |
| 219 | 222 | ||
| 220 | b LPICmeup | 223 | bl LPICmeup |
| 221 | LPICedup: | 224 | LPICedup: |
| 222 | andi. r0,$inp,3 | 225 | andi. r0,$inp,3 |
| 223 | bne Lunaligned | 226 | bne Lunaligned |
| @@ -226,40 +229,14 @@ Laligned: | |||
| 226 | $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer | 229 | $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer |
| 227 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer | 230 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer |
| 228 | bl Lsha2_block_private | 231 | bl Lsha2_block_private |
| 229 | Ldone: | 232 | b Ldone |
| 230 | $POP r0,`$FRAME-$SIZE_T*21`($sp) | ||
| 231 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | ||
| 232 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | ||
| 233 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | ||
| 234 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
| 235 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
| 236 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
| 237 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
| 238 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
| 239 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
| 240 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
| 241 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
| 242 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
| 243 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
| 244 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
| 245 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
| 246 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
| 247 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
| 248 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
| 249 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
| 250 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
| 251 | mtlr r0 | ||
| 252 | addi $sp,$sp,`$FRAME+16*$SZ` | ||
| 253 | blr | ||
| 254 | ___ | ||
| 255 | 233 | ||
| 256 | # PowerPC specification allows an implementation to be ill-behaved | 234 | ; PowerPC specification allows an implementation to be ill-behaved |
| 257 | # upon unaligned access which crosses page boundary. "Better safe | 235 | ; upon unaligned access which crosses page boundary. "Better safe |
| 258 | # than sorry" principle makes me treat it specially. But I don't | 236 | ; than sorry" principle makes me treat it specially. But I don't |
| 259 | # look for particular offending word, but rather for the input | 237 | ; look for particular offending word, but rather for the input |
| 260 | # block which crosses the boundary. Once found that block is aligned | 238 | ; block which crosses the boundary. Once found that block is aligned |
| 261 | # and hashed separately... | 239 | ; and hashed separately... |
| 262 | $code.=<<___; | ||
| 263 | .align 4 | 240 | .align 4 |
| 264 | Lunaligned: | 241 | Lunaligned: |
| 265 | subfic $t1,$inp,4096 | 242 | subfic $t1,$inp,4096 |
| @@ -278,7 +255,7 @@ Lunaligned: | |||
| 278 | Lcross_page: | 255 | Lcross_page: |
| 279 | li $t1,`16*$SZ/4` | 256 | li $t1,`16*$SZ/4` |
| 280 | mtctr $t1 | 257 | mtctr $t1 |
| 281 | addi r20,$sp,$FRAME ; aligned spot below the frame | 258 | addi r20,$sp,$LOCALS ; aligned spot below the frame |
| 282 | Lmemcpy: | 259 | Lmemcpy: |
| 283 | lbz r16,0($inp) | 260 | lbz r16,0($inp) |
| 284 | lbz r17,1($inp) | 261 | lbz r17,1($inp) |
| @@ -293,8 +270,8 @@ Lmemcpy: | |||
| 293 | bdnz Lmemcpy | 270 | bdnz Lmemcpy |
| 294 | 271 | ||
| 295 | $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp | 272 | $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp |
| 296 | addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer | 273 | addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer |
| 297 | addi $inp,$sp,$FRAME ; fictitious inp pointer | 274 | addi $inp,$sp,$LOCALS ; fictitious inp pointer |
| 298 | $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num | 275 | $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num |
| 299 | $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer | 276 | $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer |
| 300 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer | 277 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer |
| @@ -303,10 +280,36 @@ Lmemcpy: | |||
| 303 | $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num | 280 | $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num |
| 304 | addic. $num,$num,`-16*$SZ` ; num-- | 281 | addic. $num,$num,`-16*$SZ` ; num-- |
| 305 | bne- Lunaligned | 282 | bne- Lunaligned |
| 306 | b Ldone | ||
| 307 | ___ | ||
| 308 | 283 | ||
| 309 | $code.=<<___; | 284 | Ldone: |
| 285 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
| 286 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | ||
| 287 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | ||
| 288 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | ||
| 289 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
| 290 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
| 291 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
| 292 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
| 293 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
| 294 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
| 295 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
| 296 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
| 297 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
| 298 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
| 299 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
| 300 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
| 301 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
| 302 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
| 303 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
| 304 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
| 305 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
| 306 | mtlr r0 | ||
| 307 | addi $sp,$sp,$FRAME | ||
| 308 | blr | ||
| 309 | .long 0 | ||
| 310 | .byte 0,12,4,1,0x80,18,3,0 | ||
| 311 | .long 0 | ||
| 312 | |||
| 310 | .align 4 | 313 | .align 4 |
| 311 | Lsha2_block_private: | 314 | Lsha2_block_private: |
| 312 | ___ | 315 | ___ |
| @@ -372,6 +375,8 @@ $code.=<<___; | |||
| 372 | $ST $H,`7*$SZ`($ctx) | 375 | $ST $H,`7*$SZ`($ctx) |
| 373 | bne Lsha2_block_private | 376 | bne Lsha2_block_private |
| 374 | blr | 377 | blr |
| 378 | .long 0 | ||
| 379 | .byte 0,12,0x14,0,0,0,0,0 | ||
| 375 | ___ | 380 | ___ |
| 376 | 381 | ||
| 377 | # Ugly hack here, because PPC assembler syntax seem to vary too | 382 | # Ugly hack here, because PPC assembler syntax seem to vary too |
| @@ -379,22 +384,15 @@ ___ | |||
| 379 | $code.=<<___; | 384 | $code.=<<___; |
| 380 | .align 6 | 385 | .align 6 |
| 381 | LPICmeup: | 386 | LPICmeup: |
| 382 | bl LPIC | 387 | mflr r0 |
| 383 | addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop | 388 | bcl 20,31,\$+4 |
| 384 | b LPICedup | 389 | mflr $Tbl ; vvvvvv "distance" between . and 1st data entry |
| 385 | nop | 390 | addi $Tbl,$Tbl,`64-8` |
| 386 | nop | 391 | mtlr r0 |
| 387 | nop | ||
| 388 | nop | ||
| 389 | nop | ||
| 390 | LPIC: mflr $Tbl | ||
| 391 | blr | 392 | blr |
| 392 | nop | 393 | .long 0 |
| 393 | nop | 394 | .byte 0,12,0x14,0,0,0,0,0 |
| 394 | nop | 395 | .space `64-9*4` |
| 395 | nop | ||
| 396 | nop | ||
| 397 | nop | ||
| 398 | ___ | 396 | ___ |
| 399 | $code.=<<___ if ($SZ==8); | 397 | $code.=<<___ if ($SZ==8); |
| 400 | .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd | 398 | .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd |
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl index e7ef2d5a9f..079a3fc78a 100644 --- a/src/lib/libcrypto/sha/asm/sha512-s390x.pl +++ b/src/lib/libcrypto/sha/asm/sha512-s390x.pl | |||
| @@ -26,6 +26,26 @@ | |||
| 26 | # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster | 26 | # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster |
| 27 | # than software. | 27 | # than software. |
| 28 | 28 | ||
| 29 | # November 2010. | ||
| 30 | # | ||
| 31 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
| 32 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 33 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
| 34 | # application context. The feature is not specific to any particular | ||
| 35 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
| 36 | # remains z/Architecture specific. On z900 SHA256 was measured to | ||
| 37 | # perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3. | ||
| 38 | |||
| 39 | $flavour = shift; | ||
| 40 | |||
| 41 | if ($flavour =~ /3[12]/) { | ||
| 42 | $SIZE_T=4; | ||
| 43 | $g=""; | ||
| 44 | } else { | ||
| 45 | $SIZE_T=8; | ||
| 46 | $g="g"; | ||
| 47 | } | ||
| 48 | |||
| 29 | $t0="%r0"; | 49 | $t0="%r0"; |
| 30 | $t1="%r1"; | 50 | $t1="%r1"; |
| 31 | $ctx="%r2"; $t2="%r2"; | 51 | $ctx="%r2"; $t2="%r2"; |
| @@ -44,7 +64,7 @@ $tbl="%r13"; | |||
| 44 | $T1="%r14"; | 64 | $T1="%r14"; |
| 45 | $sp="%r15"; | 65 | $sp="%r15"; |
| 46 | 66 | ||
| 47 | $output=shift; | 67 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| 48 | open STDOUT,">$output"; | 68 | open STDOUT,">$output"; |
| 49 | 69 | ||
| 50 | if ($output =~ /512/) { | 70 | if ($output =~ /512/) { |
| @@ -78,7 +98,8 @@ if ($output =~ /512/) { | |||
| 78 | } | 98 | } |
| 79 | $Func="sha${label}_block_data_order"; | 99 | $Func="sha${label}_block_data_order"; |
| 80 | $Table="K${label}"; | 100 | $Table="K${label}"; |
| 81 | $frame=160+16*$SZ; | 101 | $stdframe=16*$SIZE_T+4*8; |
| 102 | $frame=$stdframe+16*$SZ; | ||
| 82 | 103 | ||
| 83 | sub BODY_00_15 { | 104 | sub BODY_00_15 { |
| 84 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 105 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| @@ -93,9 +114,9 @@ $code.=<<___; | |||
| 93 | xgr $t0,$t1 | 114 | xgr $t0,$t1 |
| 94 | $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` | 115 | $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` |
| 95 | xgr $t2,$g | 116 | xgr $t2,$g |
| 96 | $ST $T1,`160+$SZ*($i%16)`($sp) | 117 | $ST $T1,`$stdframe+$SZ*($i%16)`($sp) |
| 97 | xgr $t0,$t1 # Sigma1(e) | 118 | xgr $t0,$t1 # Sigma1(e) |
| 98 | la $T1,0($T1,$h) # T1+=h | 119 | algr $T1,$h # T1+=h |
| 99 | ngr $t2,$e | 120 | ngr $t2,$e |
| 100 | lgr $t1,$a | 121 | lgr $t1,$a |
| 101 | algr $T1,$t0 # T1+=Sigma1(e) | 122 | algr $T1,$t0 # T1+=Sigma1(e) |
| @@ -113,7 +134,7 @@ $code.=<<___; | |||
| 113 | ngr $t2,$b | 134 | ngr $t2,$b |
| 114 | algr $h,$T1 # h+=T1 | 135 | algr $h,$T1 # h+=T1 |
| 115 | ogr $t2,$t1 # Maj(a,b,c) | 136 | ogr $t2,$t1 # Maj(a,b,c) |
| 116 | la $d,0($d,$T1) # d+=T1 | 137 | algr $d,$T1 # d+=T1 |
| 117 | algr $h,$t2 # h+=Maj(a,b,c) | 138 | algr $h,$t2 # h+=Maj(a,b,c) |
| 118 | ___ | 139 | ___ |
| 119 | } | 140 | } |
| @@ -122,19 +143,19 @@ sub BODY_16_XX { | |||
| 122 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 143 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| 123 | 144 | ||
| 124 | $code.=<<___; | 145 | $code.=<<___; |
| 125 | $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i | 146 | $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i |
| 126 | $LD $t1,`160+$SZ*(($i+14)%16)`($sp) | 147 | $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp) |
| 127 | $ROT $t0,$T1,$sigma0[0] | 148 | $ROT $t0,$T1,$sigma0[0] |
| 128 | $SHR $T1,$sigma0[2] | 149 | $SHR $T1,$sigma0[2] |
| 129 | $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` | 150 | $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` |
| 130 | xgr $T1,$t0 | 151 | xgr $T1,$t0 |
| 131 | $ROT $t0,$t1,$sigma1[0] | 152 | $ROT $t0,$t1,$sigma1[0] |
| 132 | xgr $T1,$t2 # sigma0(X[i+1]) | 153 | xgr $T1,$t2 # sigma0(X[i+1]) |
| 133 | $SHR $t1,$sigma1[2] | 154 | $SHR $t1,$sigma1[2] |
| 134 | $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i] | 155 | $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i] |
| 135 | xgr $t1,$t0 | 156 | xgr $t1,$t0 |
| 136 | $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` | 157 | $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` |
| 137 | $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9] | 158 | $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9] |
| 138 | xgr $t1,$t0 # sigma1(X[i+14]) | 159 | xgr $t1,$t0 # sigma1(X[i+14]) |
| 139 | algr $T1,$t1 # +=sigma1(X[i+14]) | 160 | algr $T1,$t1 # +=sigma1(X[i+14]) |
| 140 | ___ | 161 | ___ |
| @@ -212,6 +233,7 @@ $code.=<<___; | |||
| 212 | .globl $Func | 233 | .globl $Func |
| 213 | .type $Func,\@function | 234 | .type $Func,\@function |
| 214 | $Func: | 235 | $Func: |
| 236 | sllg $len,$len,`log(16*$SZ)/log(2)` | ||
| 215 | ___ | 237 | ___ |
| 216 | $code.=<<___ if ($kimdfunc); | 238 | $code.=<<___ if ($kimdfunc); |
| 217 | larl %r1,OPENSSL_s390xcap_P | 239 | larl %r1,OPENSSL_s390xcap_P |
| @@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc); | |||
| 219 | tmhl %r0,0x4000 # check for message-security assist | 241 | tmhl %r0,0x4000 # check for message-security assist |
| 220 | jz .Lsoftware | 242 | jz .Lsoftware |
| 221 | lghi %r0,0 | 243 | lghi %r0,0 |
| 222 | la %r1,16($sp) | 244 | la %r1,`2*$SIZE_T`($sp) |
| 223 | .long 0xb93e0002 # kimd %r0,%r2 | 245 | .long 0xb93e0002 # kimd %r0,%r2 |
| 224 | lg %r0,16($sp) | 246 | lg %r0,`2*$SIZE_T`($sp) |
| 225 | tmhh %r0,`0x8000>>$kimdfunc` | 247 | tmhh %r0,`0x8000>>$kimdfunc` |
| 226 | jz .Lsoftware | 248 | jz .Lsoftware |
| 227 | lghi %r0,$kimdfunc | 249 | lghi %r0,$kimdfunc |
| 228 | lgr %r1,$ctx | 250 | lgr %r1,$ctx |
| 229 | lgr %r2,$inp | 251 | lgr %r2,$inp |
| 230 | sllg %r3,$len,`log(16*$SZ)/log(2)` | 252 | lgr %r3,$len |
| 231 | .long 0xb93e0002 # kimd %r0,%r2 | 253 | .long 0xb93e0002 # kimd %r0,%r2 |
| 232 | brc 1,.-4 # pay attention to "partial completion" | 254 | brc 1,.-4 # pay attention to "partial completion" |
| 233 | br %r14 | 255 | br %r14 |
| @@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc); | |||
| 235 | .Lsoftware: | 257 | .Lsoftware: |
| 236 | ___ | 258 | ___ |
| 237 | $code.=<<___; | 259 | $code.=<<___; |
| 238 | sllg $len,$len,`log(16*$SZ)/log(2)` | ||
| 239 | lghi %r1,-$frame | 260 | lghi %r1,-$frame |
| 240 | agr $len,$inp | 261 | la $len,0($len,$inp) |
| 241 | stmg $ctx,%r15,16($sp) | 262 | stm${g} $ctx,%r15,`2*$SIZE_T`($sp) |
| 242 | lgr %r0,$sp | 263 | lgr %r0,$sp |
| 243 | la $sp,0(%r1,$sp) | 264 | la $sp,0(%r1,$sp) |
| 244 | stg %r0,0($sp) | 265 | st${g} %r0,0($sp) |
| 245 | 266 | ||
| 246 | larl $tbl,$Table | 267 | larl $tbl,$Table |
| 247 | $LD $A,`0*$SZ`($ctx) | 268 | $LD $A,`0*$SZ`($ctx) |
| @@ -265,7 +286,7 @@ $code.=<<___; | |||
| 265 | clgr $len,$t0 | 286 | clgr $len,$t0 |
| 266 | jne .Lrounds_16_xx | 287 | jne .Lrounds_16_xx |
| 267 | 288 | ||
| 268 | lg $ctx,`$frame+16`($sp) | 289 | l${g} $ctx,`$frame+2*$SIZE_T`($sp) |
| 269 | la $inp,`16*$SZ`($inp) | 290 | la $inp,`16*$SZ`($inp) |
| 270 | $ADD $A,`0*$SZ`($ctx) | 291 | $ADD $A,`0*$SZ`($ctx) |
| 271 | $ADD $B,`1*$SZ`($ctx) | 292 | $ADD $B,`1*$SZ`($ctx) |
| @@ -283,14 +304,14 @@ $code.=<<___; | |||
| 283 | $ST $F,`5*$SZ`($ctx) | 304 | $ST $F,`5*$SZ`($ctx) |
| 284 | $ST $G,`6*$SZ`($ctx) | 305 | $ST $G,`6*$SZ`($ctx) |
| 285 | $ST $H,`7*$SZ`($ctx) | 306 | $ST $H,`7*$SZ`($ctx) |
| 286 | clg $inp,`$frame+32`($sp) | 307 | cl${g} $inp,`$frame+4*$SIZE_T`($sp) |
| 287 | jne .Lloop | 308 | jne .Lloop |
| 288 | 309 | ||
| 289 | lmg %r6,%r15,`$frame+48`($sp) | 310 | lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) |
| 290 | br %r14 | 311 | br %r14 |
| 291 | .size $Func,.-$Func | 312 | .size $Func,.-$Func |
| 292 | .string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 313 | .string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| 293 | .comm OPENSSL_s390xcap_P,8,8 | 314 | .comm OPENSSL_s390xcap_P,16,8 |
| 294 | ___ | 315 | ___ |
| 295 | 316 | ||
| 296 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 317 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl index ec5d78135e..585740789e 100644 --- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl +++ b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl | |||
| @@ -305,9 +305,9 @@ $code.=<<___; | |||
| 305 | srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] | 305 | srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] |
| 306 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) | 306 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) |
| 307 | srl @X[($i/2)%8],0,$tmp0 | 307 | srl @X[($i/2)%8],0,$tmp0 |
| 308 | add $tmp2,$tmp1,$tmp1 | ||
| 308 | add $xi,$T1,$T1 ! +=X[i] | 309 | add $xi,$T1,$T1 ! +=X[i] |
| 309 | xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] | 310 | xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] |
| 310 | add $tmp2,$T1,$T1 | ||
| 311 | add $tmp1,$T1,$T1 | 311 | add $tmp1,$T1,$T1 |
| 312 | 312 | ||
| 313 | srl $T1,0,$T1 | 313 | srl $T1,0,$T1 |
| @@ -318,9 +318,9 @@ ___ | |||
| 318 | $code.=<<___; | 318 | $code.=<<___; |
| 319 | srlx @X[($i/2)%8],32,$tmp1 ! X[i] | 319 | srlx @X[($i/2)%8],32,$tmp1 ! X[i] |
| 320 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) | 320 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) |
| 321 | srl @X[($i/2)%8],0,@X[($i/2)%8] | ||
| 322 | add $xi,$T1,$T1 ! +=X[i+9] | 321 | add $xi,$T1,$T1 ! +=X[i+9] |
| 323 | add $tmp2,$T1,$T1 | 322 | add $tmp2,$tmp1,$tmp1 |
| 323 | srl @X[($i/2)%8],0,@X[($i/2)%8] | ||
| 324 | add $tmp1,$T1,$T1 | 324 | add $tmp1,$T1,$T1 |
| 325 | 325 | ||
| 326 | sllx $T1,32,$tmp0 | 326 | sllx $T1,32,$tmp0 |
diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl index e6643f8cf6..f611a2d898 100755 --- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl +++ b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl | |||
| @@ -95,50 +95,44 @@ sub ROUND_00_15() | |||
| 95 | { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 95 | { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| 96 | 96 | ||
| 97 | $code.=<<___; | 97 | $code.=<<___; |
| 98 | mov $e,$a0 | 98 | ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 |
| 99 | mov $e,$a1 | ||
| 100 | mov $f,$a2 | 99 | mov $f,$a2 |
| 100 | mov $T1,`$SZ*($i&0xf)`(%rsp) | ||
| 101 | 101 | ||
| 102 | ror \$$Sigma1[0],$a0 | 102 | ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 |
| 103 | ror \$$Sigma1[1],$a1 | 103 | xor $e,$a0 |
| 104 | xor $g,$a2 # f^g | 104 | xor $g,$a2 # f^g |
| 105 | 105 | ||
| 106 | xor $a1,$a0 | 106 | ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 |
| 107 | ror \$`$Sigma1[2]-$Sigma1[1]`,$a1 | 107 | add $h,$T1 # T1+=h |
| 108 | xor $a,$a1 | ||
| 109 | |||
| 110 | add ($Tbl,$round,$SZ),$T1 # T1+=K[round] | ||
| 108 | and $e,$a2 # (f^g)&e | 111 | and $e,$a2 # (f^g)&e |
| 109 | mov $T1,`$SZ*($i&0xf)`(%rsp) | 112 | mov $b,$h |
| 110 | 113 | ||
| 111 | xor $a1,$a0 # Sigma1(e) | 114 | ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 |
| 115 | xor $e,$a0 | ||
| 112 | xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g | 116 | xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g |
| 113 | add $h,$T1 # T1+=h | ||
| 114 | |||
| 115 | mov $a,$h | ||
| 116 | add $a0,$T1 # T1+=Sigma1(e) | ||
| 117 | 117 | ||
| 118 | xor $c,$h # b^c | ||
| 119 | xor $a,$a1 | ||
| 118 | add $a2,$T1 # T1+=Ch(e,f,g) | 120 | add $a2,$T1 # T1+=Ch(e,f,g) |
| 119 | mov $a,$a0 | 121 | mov $b,$a2 |
| 120 | mov $a,$a1 | ||
| 121 | 122 | ||
| 122 | ror \$$Sigma0[0],$h | 123 | ror \$$Sigma1[0],$a0 # Sigma1(e) |
| 123 | ror \$$Sigma0[1],$a0 | 124 | and $a,$h # h=(b^c)&a |
| 124 | mov $a,$a2 | 125 | and $c,$a2 # b&c |
| 125 | add ($Tbl,$round,$SZ),$T1 # T1+=K[round] | ||
| 126 | 126 | ||
| 127 | xor $a0,$h | 127 | ror \$$Sigma0[0],$a1 # Sigma0(a) |
| 128 | ror \$`$Sigma0[2]-$Sigma0[1]`,$a0 | 128 | add $a0,$T1 # T1+=Sigma1(e) |
| 129 | or $c,$a1 # a|c | 129 | add $a2,$h # h+=b&c (completes +=Maj(a,b,c) |
| 130 | 130 | ||
| 131 | xor $a0,$h # h=Sigma0(a) | ||
| 132 | and $c,$a2 # a&c | ||
| 133 | add $T1,$d # d+=T1 | 131 | add $T1,$d # d+=T1 |
| 134 | |||
| 135 | and $b,$a1 # (a|c)&b | ||
| 136 | add $T1,$h # h+=T1 | 132 | add $T1,$h # h+=T1 |
| 137 | |||
| 138 | or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c) | ||
| 139 | lea 1($round),$round # round++ | 133 | lea 1($round),$round # round++ |
| 134 | add $a1,$h # h+=Sigma0(a) | ||
| 140 | 135 | ||
| 141 | add $a1,$h # h+=Maj(a,b,c) | ||
| 142 | ___ | 136 | ___ |
| 143 | } | 137 | } |
| 144 | 138 | ||
| @@ -147,32 +141,30 @@ sub ROUND_16_XX() | |||
| 147 | 141 | ||
| 148 | $code.=<<___; | 142 | $code.=<<___; |
| 149 | mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 | 143 | mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 |
| 150 | mov `$SZ*(($i+14)&0xf)`(%rsp),$T1 | 144 | mov `$SZ*(($i+14)&0xf)`(%rsp),$a1 |
| 151 | 145 | mov $a0,$T1 | |
| 152 | mov $a0,$a2 | 146 | mov $a1,$a2 |
| 153 | 147 | ||
| 148 | ror \$`$sigma0[1]-$sigma0[0]`,$T1 | ||
| 149 | xor $a0,$T1 | ||
| 154 | shr \$$sigma0[2],$a0 | 150 | shr \$$sigma0[2],$a0 |
| 155 | ror \$$sigma0[0],$a2 | ||
| 156 | |||
| 157 | xor $a2,$a0 | ||
| 158 | ror \$`$sigma0[1]-$sigma0[0]`,$a2 | ||
| 159 | 151 | ||
| 160 | xor $a2,$a0 # sigma0(X[(i+1)&0xf]) | 152 | ror \$$sigma0[0],$T1 |
| 161 | mov $T1,$a1 | 153 | xor $T1,$a0 # sigma0(X[(i+1)&0xf]) |
| 154 | mov `$SZ*(($i+9)&0xf)`(%rsp),$T1 | ||
| 162 | 155 | ||
| 163 | shr \$$sigma1[2],$T1 | 156 | ror \$`$sigma1[1]-$sigma1[0]`,$a2 |
| 164 | ror \$$sigma1[0],$a1 | 157 | xor $a1,$a2 |
| 165 | 158 | shr \$$sigma1[2],$a1 | |
| 166 | xor $a1,$T1 | ||
| 167 | ror \$`$sigma1[1]-$sigma1[0]`,$a1 | ||
| 168 | |||
| 169 | xor $a1,$T1 # sigma1(X[(i+14)&0xf]) | ||
| 170 | 159 | ||
| 160 | ror \$$sigma1[0],$a2 | ||
| 171 | add $a0,$T1 | 161 | add $a0,$T1 |
| 172 | 162 | xor $a2,$a1 # sigma1(X[(i+14)&0xf]) | |
| 173 | add `$SZ*(($i+9)&0xf)`(%rsp),$T1 | ||
| 174 | 163 | ||
| 175 | add `$SZ*($i&0xf)`(%rsp),$T1 | 164 | add `$SZ*($i&0xf)`(%rsp),$T1 |
| 165 | mov $e,$a0 | ||
| 166 | add $a1,$T1 | ||
| 167 | mov $a,$a1 | ||
| 176 | ___ | 168 | ___ |
| 177 | &ROUND_00_15(@_); | 169 | &ROUND_00_15(@_); |
| 178 | } | 170 | } |
| @@ -219,6 +211,8 @@ $func: | |||
| 219 | ___ | 211 | ___ |
| 220 | for($i=0;$i<16;$i++) { | 212 | for($i=0;$i<16;$i++) { |
| 221 | $code.=" mov $SZ*$i($inp),$T1\n"; | 213 | $code.=" mov $SZ*$i($inp),$T1\n"; |
| 214 | $code.=" mov @ROT[4],$a0\n"; | ||
| 215 | $code.=" mov @ROT[0],$a1\n"; | ||
| 222 | $code.=" bswap $T1\n"; | 216 | $code.=" bswap $T1\n"; |
| 223 | &ROUND_00_15($i,@ROT); | 217 | &ROUND_00_15($i,@ROT); |
| 224 | unshift(@ROT,pop(@ROT)); | 218 | unshift(@ROT,pop(@ROT)); |
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c index 8952d87673..f88d3d6dad 100644 --- a/src/lib/libcrypto/sha/sha256.c +++ b/src/lib/libcrypto/sha/sha256.c | |||
| @@ -16,7 +16,7 @@ | |||
| 16 | 16 | ||
| 17 | const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; | 17 | const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; |
| 18 | 18 | ||
| 19 | int SHA224_Init (SHA256_CTX *c) | 19 | fips_md_init_ctx(SHA224, SHA256) |
| 20 | { | 20 | { |
| 21 | memset (c,0,sizeof(*c)); | 21 | memset (c,0,sizeof(*c)); |
| 22 | c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; | 22 | c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; |
| @@ -27,7 +27,7 @@ int SHA224_Init (SHA256_CTX *c) | |||
| 27 | return 1; | 27 | return 1; |
| 28 | } | 28 | } |
| 29 | 29 | ||
| 30 | int SHA256_Init (SHA256_CTX *c) | 30 | fips_md_init(SHA256) |
| 31 | { | 31 | { |
| 32 | memset (c,0,sizeof(*c)); | 32 | memset (c,0,sizeof(*c)); |
| 33 | c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; | 33 | c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; |
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c index cbc0e58c48..50dd7dc744 100644 --- a/src/lib/libcrypto/sha/sha512.c +++ b/src/lib/libcrypto/sha/sha512.c | |||
| @@ -59,21 +59,8 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT; | |||
| 59 | #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA | 59 | #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA |
| 60 | #endif | 60 | #endif |
| 61 | 61 | ||
| 62 | int SHA384_Init (SHA512_CTX *c) | 62 | fips_md_init_ctx(SHA384, SHA512) |
| 63 | { | 63 | { |
| 64 | #if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) | ||
| 65 | /* maintain dword order required by assembler module */ | ||
| 66 | unsigned int *h = (unsigned int *)c->h; | ||
| 67 | |||
| 68 | h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8; | ||
| 69 | h[2] = 0x629a292a; h[3] = 0x367cd507; | ||
| 70 | h[4] = 0x9159015a; h[5] = 0x3070dd17; | ||
| 71 | h[6] = 0x152fecd8; h[7] = 0xf70e5939; | ||
| 72 | h[8] = 0x67332667; h[9] = 0xffc00b31; | ||
| 73 | h[10] = 0x8eb44a87; h[11] = 0x68581511; | ||
| 74 | h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7; | ||
| 75 | h[14] = 0x47b5481d; h[15] = 0xbefa4fa4; | ||
| 76 | #else | ||
| 77 | c->h[0]=U64(0xcbbb9d5dc1059ed8); | 64 | c->h[0]=U64(0xcbbb9d5dc1059ed8); |
| 78 | c->h[1]=U64(0x629a292a367cd507); | 65 | c->h[1]=U64(0x629a292a367cd507); |
| 79 | c->h[2]=U64(0x9159015a3070dd17); | 66 | c->h[2]=U64(0x9159015a3070dd17); |
| @@ -82,27 +69,14 @@ int SHA384_Init (SHA512_CTX *c) | |||
| 82 | c->h[5]=U64(0x8eb44a8768581511); | 69 | c->h[5]=U64(0x8eb44a8768581511); |
| 83 | c->h[6]=U64(0xdb0c2e0d64f98fa7); | 70 | c->h[6]=U64(0xdb0c2e0d64f98fa7); |
| 84 | c->h[7]=U64(0x47b5481dbefa4fa4); | 71 | c->h[7]=U64(0x47b5481dbefa4fa4); |
| 85 | #endif | 72 | |
| 86 | c->Nl=0; c->Nh=0; | 73 | c->Nl=0; c->Nh=0; |
| 87 | c->num=0; c->md_len=SHA384_DIGEST_LENGTH; | 74 | c->num=0; c->md_len=SHA384_DIGEST_LENGTH; |
| 88 | return 1; | 75 | return 1; |
| 89 | } | 76 | } |
| 90 | 77 | ||
| 91 | int SHA512_Init (SHA512_CTX *c) | 78 | fips_md_init(SHA512) |
| 92 | { | 79 | { |
| 93 | #if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) | ||
| 94 | /* maintain dword order required by assembler module */ | ||
| 95 | unsigned int *h = (unsigned int *)c->h; | ||
| 96 | |||
| 97 | h[0] = 0x6a09e667; h[1] = 0xf3bcc908; | ||
| 98 | h[2] = 0xbb67ae85; h[3] = 0x84caa73b; | ||
| 99 | h[4] = 0x3c6ef372; h[5] = 0xfe94f82b; | ||
| 100 | h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1; | ||
| 101 | h[8] = 0x510e527f; h[9] = 0xade682d1; | ||
| 102 | h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f; | ||
| 103 | h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b; | ||
| 104 | h[14] = 0x5be0cd19; h[15] = 0x137e2179; | ||
| 105 | #else | ||
| 106 | c->h[0]=U64(0x6a09e667f3bcc908); | 80 | c->h[0]=U64(0x6a09e667f3bcc908); |
| 107 | c->h[1]=U64(0xbb67ae8584caa73b); | 81 | c->h[1]=U64(0xbb67ae8584caa73b); |
| 108 | c->h[2]=U64(0x3c6ef372fe94f82b); | 82 | c->h[2]=U64(0x3c6ef372fe94f82b); |
| @@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c) | |||
| 111 | c->h[5]=U64(0x9b05688c2b3e6c1f); | 85 | c->h[5]=U64(0x9b05688c2b3e6c1f); |
| 112 | c->h[6]=U64(0x1f83d9abfb41bd6b); | 86 | c->h[6]=U64(0x1f83d9abfb41bd6b); |
| 113 | c->h[7]=U64(0x5be0cd19137e2179); | 87 | c->h[7]=U64(0x5be0cd19137e2179); |
| 114 | #endif | 88 | |
| 115 | c->Nl=0; c->Nh=0; | 89 | c->Nl=0; c->Nh=0; |
| 116 | c->num=0; c->md_len=SHA512_DIGEST_LENGTH; | 90 | c->num=0; c->md_len=SHA512_DIGEST_LENGTH; |
| 117 | return 1; | 91 | return 1; |
| @@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) | |||
| 160 | 134 | ||
| 161 | if (md==0) return 0; | 135 | if (md==0) return 0; |
| 162 | 136 | ||
| 163 | #if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) | ||
| 164 | /* recall assembler dword order... */ | ||
| 165 | n = c->md_len; | ||
| 166 | if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH) | ||
| 167 | { | ||
| 168 | unsigned int *h = (unsigned int *)c->h, t; | ||
| 169 | |||
| 170 | for (n/=4;n;n--) | ||
| 171 | { | ||
| 172 | t = *(h++); | ||
| 173 | *(md++) = (unsigned char)(t>>24); | ||
| 174 | *(md++) = (unsigned char)(t>>16); | ||
| 175 | *(md++) = (unsigned char)(t>>8); | ||
| 176 | *(md++) = (unsigned char)(t); | ||
| 177 | } | ||
| 178 | } | ||
| 179 | else return 0; | ||
| 180 | #else | ||
| 181 | switch (c->md_len) | 137 | switch (c->md_len) |
| 182 | { | 138 | { |
| 183 | /* Let compiler decide if it's appropriate to unroll... */ | 139 | /* Let compiler decide if it's appropriate to unroll... */ |
| @@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) | |||
| 214 | /* ... as well as make sure md_len is not abused. */ | 170 | /* ... as well as make sure md_len is not abused. */ |
| 215 | default: return 0; | 171 | default: return 0; |
| 216 | } | 172 | } |
| 217 | #endif | 173 | |
| 218 | return 1; | 174 | return 1; |
| 219 | } | 175 | } |
| 220 | 176 | ||
diff --git a/src/lib/libcrypto/sparcv9cap.c b/src/lib/libcrypto/sparcv9cap.c index ed195ab402..43b3ac6f81 100644 --- a/src/lib/libcrypto/sparcv9cap.c +++ b/src/lib/libcrypto/sparcv9cap.c | |||
| @@ -19,7 +19,8 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U | |||
| 19 | int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | 19 | int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); |
| 20 | int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | 20 | int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); |
| 21 | 21 | ||
| 22 | if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == | 22 | if (num>=8 && !(num&1) && |
| 23 | (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == | ||
| 23 | (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) | 24 | (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) |
| 24 | return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); | 25 | return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); |
| 25 | else | 26 | else |
| @@ -169,7 +170,6 @@ void OPENSSL_cpuid_setup(void) | |||
| 169 | char *e; | 170 | char *e; |
| 170 | struct sigaction common_act,ill_oact,bus_oact; | 171 | struct sigaction common_act,ill_oact,bus_oact; |
| 171 | sigset_t all_masked,oset; | 172 | sigset_t all_masked,oset; |
| 172 | int sig; | ||
| 173 | static int trigger=0; | 173 | static int trigger=0; |
| 174 | 174 | ||
| 175 | if (trigger) return; | 175 | if (trigger) return; |
diff --git a/src/lib/libcrypto/ts/ts.h b/src/lib/libcrypto/ts/ts.h index 190e8a1bf2..c2448e3c3b 100644 --- a/src/lib/libcrypto/ts/ts.h +++ b/src/lib/libcrypto/ts/ts.h | |||
| @@ -86,9 +86,6 @@ | |||
| 86 | #include <openssl/dh.h> | 86 | #include <openssl/dh.h> |
| 87 | #endif | 87 | #endif |
| 88 | 88 | ||
| 89 | #include <openssl/evp.h> | ||
| 90 | |||
| 91 | |||
| 92 | #ifdef __cplusplus | 89 | #ifdef __cplusplus |
| 93 | extern "C" { | 90 | extern "C" { |
| 94 | #endif | 91 | #endif |
diff --git a/src/lib/libcrypto/whrlpool/whrlpool.h b/src/lib/libcrypto/whrlpool/whrlpool.h index 03c91da115..9e01f5b076 100644 --- a/src/lib/libcrypto/whrlpool/whrlpool.h +++ b/src/lib/libcrypto/whrlpool/whrlpool.h | |||
| @@ -24,6 +24,9 @@ typedef struct { | |||
| 24 | } WHIRLPOOL_CTX; | 24 | } WHIRLPOOL_CTX; |
| 25 | 25 | ||
| 26 | #ifndef OPENSSL_NO_WHIRLPOOL | 26 | #ifndef OPENSSL_NO_WHIRLPOOL |
| 27 | #ifdef OPENSSL_FIPS | ||
| 28 | int private_WHIRLPOOL_Init(WHIRLPOOL_CTX *c); | ||
| 29 | #endif | ||
| 27 | int WHIRLPOOL_Init (WHIRLPOOL_CTX *c); | 30 | int WHIRLPOOL_Init (WHIRLPOOL_CTX *c); |
| 28 | int WHIRLPOOL_Update (WHIRLPOOL_CTX *c,const void *inp,size_t bytes); | 31 | int WHIRLPOOL_Update (WHIRLPOOL_CTX *c,const void *inp,size_t bytes); |
| 29 | void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits); | 32 | void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits); |
diff --git a/src/lib/libcrypto/whrlpool/wp_block.c b/src/lib/libcrypto/whrlpool/wp_block.c index 221f6cc59f..824ed1827c 100644 --- a/src/lib/libcrypto/whrlpool/wp_block.c +++ b/src/lib/libcrypto/whrlpool/wp_block.c | |||
| @@ -68,9 +68,9 @@ typedef unsigned long long u64; | |||
| 68 | CPUs this is actually faster! */ | 68 | CPUs this is actually faster! */ |
| 69 | # endif | 69 | # endif |
| 70 | # define GO_FOR_MMX(ctx,inp,num) do { \ | 70 | # define GO_FOR_MMX(ctx,inp,num) do { \ |
| 71 | extern unsigned long OPENSSL_ia32cap_P; \ | 71 | extern unsigned int OPENSSL_ia32cap_P[]; \ |
| 72 | void whirlpool_block_mmx(void *,const void *,size_t); \ | 72 | void whirlpool_block_mmx(void *,const void *,size_t); \ |
| 73 | if (!(OPENSSL_ia32cap_P & (1<<23))) break; \ | 73 | if (!(OPENSSL_ia32cap_P[0] & (1<<23))) break; \ |
| 74 | whirlpool_block_mmx(ctx->H.c,inp,num); return; \ | 74 | whirlpool_block_mmx(ctx->H.c,inp,num); return; \ |
| 75 | } while (0) | 75 | } while (0) |
| 76 | # endif | 76 | # endif |
diff --git a/src/lib/libcrypto/whrlpool/wp_dgst.c b/src/lib/libcrypto/whrlpool/wp_dgst.c index ee5c5c1bf3..7e28bef51d 100644 --- a/src/lib/libcrypto/whrlpool/wp_dgst.c +++ b/src/lib/libcrypto/whrlpool/wp_dgst.c | |||
| @@ -52,9 +52,10 @@ | |||
| 52 | */ | 52 | */ |
| 53 | 53 | ||
| 54 | #include "wp_locl.h" | 54 | #include "wp_locl.h" |
| 55 | #include <openssl/crypto.h> | ||
| 55 | #include <string.h> | 56 | #include <string.h> |
| 56 | 57 | ||
| 57 | int WHIRLPOOL_Init (WHIRLPOOL_CTX *c) | 58 | fips_md_init(WHIRLPOOL) |
| 58 | { | 59 | { |
| 59 | memset (c,0,sizeof(*c)); | 60 | memset (c,0,sizeof(*c)); |
| 60 | return(1); | 61 | return(1); |
diff --git a/src/lib/libcrypto/x86cpuid.pl b/src/lib/libcrypto/x86cpuid.pl index a7464af19b..39fd8f2293 100644 --- a/src/lib/libcrypto/x86cpuid.pl +++ b/src/lib/libcrypto/x86cpuid.pl | |||
| @@ -19,9 +19,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 19 | &pushf (); | 19 | &pushf (); |
| 20 | &pop ("eax"); | 20 | &pop ("eax"); |
| 21 | &xor ("ecx","eax"); | 21 | &xor ("ecx","eax"); |
| 22 | &bt ("ecx",21); | ||
| 23 | &jnc (&label("done")); | ||
| 24 | &xor ("eax","eax"); | 22 | &xor ("eax","eax"); |
| 23 | &bt ("ecx",21); | ||
| 24 | &jnc (&label("nocpuid")); | ||
| 25 | &cpuid (); | 25 | &cpuid (); |
| 26 | &mov ("edi","eax"); # max value for standard query level | 26 | &mov ("edi","eax"); # max value for standard query level |
| 27 | 27 | ||
| @@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 51 | # AMD specific | 51 | # AMD specific |
| 52 | &mov ("eax",0x80000000); | 52 | &mov ("eax",0x80000000); |
| 53 | &cpuid (); | 53 | &cpuid (); |
| 54 | &cmp ("eax",0x80000008); | 54 | &cmp ("eax",0x80000001); |
| 55 | &jb (&label("intel")); | ||
| 56 | &mov ("esi","eax"); | ||
| 57 | &mov ("eax",0x80000001); | ||
| 58 | &cpuid (); | ||
| 59 | &or ("ebp","ecx"); | ||
| 60 | &and ("ebp",1<<11|1); # isolate XOP bit | ||
| 61 | &cmp ("esi",0x80000008); | ||
| 55 | &jb (&label("intel")); | 62 | &jb (&label("intel")); |
| 56 | 63 | ||
| 57 | &mov ("eax",0x80000008); | 64 | &mov ("eax",0x80000008); |
| @@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 62 | &mov ("eax",1); | 69 | &mov ("eax",1); |
| 63 | &cpuid (); | 70 | &cpuid (); |
| 64 | &bt ("edx",28); | 71 | &bt ("edx",28); |
| 65 | &jnc (&label("done")); | 72 | &jnc (&label("generic")); |
| 66 | &shr ("ebx",16); | 73 | &shr ("ebx",16); |
| 67 | &and ("ebx",0xff); | 74 | &and ("ebx",0xff); |
| 68 | &cmp ("ebx","esi"); | 75 | &cmp ("ebx","esi"); |
| 69 | &ja (&label("done")); | 76 | &ja (&label("generic")); |
| 70 | &and ("edx",0xefffffff); # clear hyper-threading bit | 77 | &and ("edx",0xefffffff); # clear hyper-threading bit |
| 71 | &jmp (&label("done")); | 78 | &jmp (&label("generic")); |
| 72 | 79 | ||
| 73 | &set_label("intel"); | 80 | &set_label("intel"); |
| 74 | &cmp ("edi",4); | 81 | &cmp ("edi",4); |
| @@ -85,27 +92,51 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 85 | &set_label("nocacheinfo"); | 92 | &set_label("nocacheinfo"); |
| 86 | &mov ("eax",1); | 93 | &mov ("eax",1); |
| 87 | &cpuid (); | 94 | &cpuid (); |
| 95 | &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0 | ||
| 88 | &cmp ("ebp",0); | 96 | &cmp ("ebp",0); |
| 89 | &jne (&label("notP4")); | 97 | &jne (&label("notintel")); |
| 98 | &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs | ||
| 90 | &and (&HB("eax"),15); # familiy ID | 99 | &and (&HB("eax"),15); # familiy ID |
| 91 | &cmp (&HB("eax"),15); # P4? | 100 | &cmp (&HB("eax"),15); # P4? |
| 92 | &jne (&label("notP4")); | 101 | &jne (&label("notintel")); |
| 93 | &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR | 102 | &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR |
| 94 | &set_label("notP4"); | 103 | &set_label("notintel"); |
| 95 | &bt ("edx",28); # test hyper-threading bit | 104 | &bt ("edx",28); # test hyper-threading bit |
| 96 | &jnc (&label("done")); | 105 | &jnc (&label("generic")); |
| 97 | &and ("edx",0xefffffff); | 106 | &and ("edx",0xefffffff); |
| 98 | &cmp ("edi",0); | 107 | &cmp ("edi",0); |
| 99 | &je (&label("done")); | 108 | &je (&label("generic")); |
| 100 | 109 | ||
| 101 | &or ("edx",0x10000000); | 110 | &or ("edx",0x10000000); |
| 102 | &shr ("ebx",16); | 111 | &shr ("ebx",16); |
| 103 | &cmp (&LB("ebx"),1); | 112 | &cmp (&LB("ebx"),1); |
| 104 | &ja (&label("done")); | 113 | &ja (&label("generic")); |
| 105 | &and ("edx",0xefffffff); # clear hyper-threading bit if not | 114 | &and ("edx",0xefffffff); # clear hyper-threading bit if not |
| 115 | |||
| 116 | &set_label("generic"); | ||
| 117 | &and ("ebp",1<<11); # isolate AMD XOP flag | ||
| 118 | &and ("ecx",0xfffff7ff); # force 11th bit to 0 | ||
| 119 | &mov ("esi","edx"); | ||
| 120 | &or ("ebp","ecx"); # merge AMD XOP flag | ||
| 121 | |||
| 122 | &bt ("ecx",27); # check OSXSAVE bit | ||
| 123 | &jnc (&label("clear_avx")); | ||
| 124 | &xor ("ecx","ecx"); | ||
| 125 | &data_byte(0x0f,0x01,0xd0); # xgetbv | ||
| 126 | &and ("eax",6); | ||
| 127 | &cmp ("eax",6); | ||
| 128 | &je (&label("done")); | ||
| 129 | &cmp ("eax",2); | ||
| 130 | &je (&label("clear_avx")); | ||
| 131 | &set_label("clear_xmm"); | ||
| 132 | &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits | ||
| 133 | &and ("esi",0xfeffffff); # clear FXSR | ||
| 134 | &set_label("clear_avx"); | ||
| 135 | &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits | ||
| 106 | &set_label("done"); | 136 | &set_label("done"); |
| 107 | &mov ("eax","edx"); | 137 | &mov ("eax","esi"); |
| 108 | &mov ("edx","ecx"); | 138 | &mov ("edx","ebp"); |
| 139 | &set_label("nocpuid"); | ||
| 109 | &function_end("OPENSSL_ia32_cpuid"); | 140 | &function_end("OPENSSL_ia32_cpuid"); |
| 110 | 141 | ||
| 111 | &external_label("OPENSSL_ia32cap_P"); | 142 | &external_label("OPENSSL_ia32cap_P"); |
| @@ -199,8 +230,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 199 | &bt (&DWP(0,"ecx"),1); | 230 | &bt (&DWP(0,"ecx"),1); |
| 200 | &jnc (&label("no_x87")); | 231 | &jnc (&label("no_x87")); |
| 201 | if ($sse2) { | 232 | if ($sse2) { |
| 202 | &bt (&DWP(0,"ecx"),26); | 233 | &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits |
| 203 | &jnc (&label("no_sse2")); | 234 | &cmp ("ecx",1<<26|1<<24); |
| 235 | &jne (&label("no_sse2")); | ||
| 204 | &pxor ("xmm0","xmm0"); | 236 | &pxor ("xmm0","xmm0"); |
| 205 | &pxor ("xmm1","xmm1"); | 237 | &pxor ("xmm1","xmm1"); |
| 206 | &pxor ("xmm2","xmm2"); | 238 | &pxor ("xmm2","xmm2"); |
| @@ -307,6 +339,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
| 307 | &ret (); | 339 | &ret (); |
| 308 | &function_end_B("OPENSSL_cleanse"); | 340 | &function_end_B("OPENSSL_cleanse"); |
| 309 | 341 | ||
| 342 | &function_begin_B("OPENSSL_ia32_rdrand"); | ||
| 343 | &mov ("ecx",8); | ||
| 344 | &set_label("loop"); | ||
| 345 | &rdrand ("eax"); | ||
| 346 | &jc (&label("break")); | ||
| 347 | &loop (&label("loop")); | ||
| 348 | &set_label("break"); | ||
| 349 | &cmp ("eax",0); | ||
| 350 | &cmove ("eax","ecx"); | ||
| 351 | &ret (); | ||
| 352 | &function_end_B("OPENSSL_ia32_rdrand"); | ||
| 353 | |||
| 310 | &initseg("OPENSSL_cpuid_setup"); | 354 | &initseg("OPENSSL_cpuid_setup"); |
| 311 | 355 | ||
| 312 | &asm_finish(); | 356 | &asm_finish(); |
diff --git a/src/lib/libssl/d1_both.c b/src/lib/libssl/d1_both.c index 9f898d6997..de8bab873f 100644 --- a/src/lib/libssl/d1_both.c +++ b/src/lib/libssl/d1_both.c | |||
| @@ -227,14 +227,14 @@ int dtls1_do_write(SSL *s, int type) | |||
| 227 | unsigned int len, frag_off, mac_size, blocksize; | 227 | unsigned int len, frag_off, mac_size, blocksize; |
| 228 | 228 | ||
| 229 | /* AHA! Figure out the MTU, and stick to the right size */ | 229 | /* AHA! Figure out the MTU, and stick to the right size */ |
| 230 | if ( ! (SSL_get_options(s) & SSL_OP_NO_QUERY_MTU)) | 230 | if (s->d1->mtu < dtls1_min_mtu() && !(SSL_get_options(s) & SSL_OP_NO_QUERY_MTU)) |
| 231 | { | 231 | { |
| 232 | s->d1->mtu = | 232 | s->d1->mtu = |
| 233 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_QUERY_MTU, 0, NULL); | 233 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_QUERY_MTU, 0, NULL); |
| 234 | 234 | ||
| 235 | /* I've seen the kernel return bogus numbers when it doesn't know | 235 | /* I've seen the kernel return bogus numbers when it doesn't know |
| 236 | * (initial write), so just make sure we have a reasonable number */ | 236 | * (initial write), so just make sure we have a reasonable number */ |
| 237 | if ( s->d1->mtu < dtls1_min_mtu()) | 237 | if (s->d1->mtu < dtls1_min_mtu()) |
| 238 | { | 238 | { |
| 239 | s->d1->mtu = 0; | 239 | s->d1->mtu = 0; |
| 240 | s->d1->mtu = dtls1_guess_mtu(s->d1->mtu); | 240 | s->d1->mtu = dtls1_guess_mtu(s->d1->mtu); |
| @@ -1084,7 +1084,11 @@ int dtls1_read_failed(SSL *s, int code) | |||
| 1084 | return code; | 1084 | return code; |
| 1085 | } | 1085 | } |
| 1086 | 1086 | ||
| 1087 | if ( ! SSL_in_init(s)) /* done, no need to send a retransmit */ | 1087 | #ifndef OPENSSL_NO_HEARTBEATS |
| 1088 | if (!SSL_in_init(s) && !s->tlsext_hb_pending) /* done, no need to send a retransmit */ | ||
| 1089 | #else | ||
| 1090 | if (!SSL_in_init(s)) /* done, no need to send a retransmit */ | ||
| 1091 | #endif | ||
| 1088 | { | 1092 | { |
| 1089 | BIO_set_flags(SSL_get_rbio(s), BIO_FLAGS_READ); | 1093 | BIO_set_flags(SSL_get_rbio(s), BIO_FLAGS_READ); |
| 1090 | return code; | 1094 | return code; |
| @@ -1417,3 +1421,171 @@ dtls1_get_ccs_header(unsigned char *data, struct ccs_header_st *ccs_hdr) | |||
| 1417 | 1421 | ||
| 1418 | ccs_hdr->type = *(data++); | 1422 | ccs_hdr->type = *(data++); |
| 1419 | } | 1423 | } |
| 1424 | |||
| 1425 | int dtls1_shutdown(SSL *s) | ||
| 1426 | { | ||
| 1427 | int ret; | ||
| 1428 | #ifndef OPENSSL_NO_SCTP | ||
| 1429 | if (BIO_dgram_is_sctp(SSL_get_wbio(s)) && | ||
| 1430 | !(s->shutdown & SSL_SENT_SHUTDOWN)) | ||
| 1431 | { | ||
| 1432 | ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s)); | ||
| 1433 | if (ret < 0) return -1; | ||
| 1434 | |||
| 1435 | if (ret == 0) | ||
| 1436 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 1, NULL); | ||
| 1437 | } | ||
| 1438 | #endif | ||
| 1439 | ret = ssl3_shutdown(s); | ||
| 1440 | #ifndef OPENSSL_NO_SCTP | ||
| 1441 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 0, NULL); | ||
| 1442 | #endif | ||
| 1443 | return ret; | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | #ifndef OPENSSL_NO_HEARTBEATS | ||
| 1447 | int | ||
| 1448 | dtls1_process_heartbeat(SSL *s) | ||
| 1449 | { | ||
| 1450 | unsigned char *p = &s->s3->rrec.data[0], *pl; | ||
| 1451 | unsigned short hbtype; | ||
| 1452 | unsigned int payload; | ||
| 1453 | unsigned int padding = 16; /* Use minimum padding */ | ||
| 1454 | |||
| 1455 | /* Read type and payload length first */ | ||
| 1456 | hbtype = *p++; | ||
| 1457 | n2s(p, payload); | ||
| 1458 | pl = p; | ||
| 1459 | |||
| 1460 | if (s->msg_callback) | ||
| 1461 | s->msg_callback(0, s->version, TLS1_RT_HEARTBEAT, | ||
| 1462 | &s->s3->rrec.data[0], s->s3->rrec.length, | ||
| 1463 | s, s->msg_callback_arg); | ||
| 1464 | |||
| 1465 | if (hbtype == TLS1_HB_REQUEST) | ||
| 1466 | { | ||
| 1467 | unsigned char *buffer, *bp; | ||
| 1468 | int r; | ||
| 1469 | |||
| 1470 | /* Allocate memory for the response, size is 1 byte | ||
| 1471 | * message type, plus 2 bytes payload length, plus | ||
| 1472 | * payload, plus padding | ||
| 1473 | */ | ||
| 1474 | buffer = OPENSSL_malloc(1 + 2 + payload + padding); | ||
| 1475 | bp = buffer; | ||
| 1476 | |||
| 1477 | /* Enter response type, length and copy payload */ | ||
| 1478 | *bp++ = TLS1_HB_RESPONSE; | ||
| 1479 | s2n(payload, bp); | ||
| 1480 | memcpy(bp, pl, payload); | ||
| 1481 | bp += payload; | ||
| 1482 | /* Random padding */ | ||
| 1483 | RAND_pseudo_bytes(bp, padding); | ||
| 1484 | |||
| 1485 | r = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buffer, 3 + payload + padding); | ||
| 1486 | |||
| 1487 | if (r >= 0 && s->msg_callback) | ||
| 1488 | s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, | ||
| 1489 | buffer, 3 + payload + padding, | ||
| 1490 | s, s->msg_callback_arg); | ||
| 1491 | |||
| 1492 | OPENSSL_free(buffer); | ||
| 1493 | |||
| 1494 | if (r < 0) | ||
| 1495 | return r; | ||
| 1496 | } | ||
| 1497 | else if (hbtype == TLS1_HB_RESPONSE) | ||
| 1498 | { | ||
| 1499 | unsigned int seq; | ||
| 1500 | |||
| 1501 | /* We only send sequence numbers (2 bytes unsigned int), | ||
| 1502 | * and 16 random bytes, so we just try to read the | ||
| 1503 | * sequence number */ | ||
| 1504 | n2s(pl, seq); | ||
| 1505 | |||
| 1506 | if (payload == 18 && seq == s->tlsext_hb_seq) | ||
| 1507 | { | ||
| 1508 | dtls1_stop_timer(s); | ||
| 1509 | s->tlsext_hb_seq++; | ||
| 1510 | s->tlsext_hb_pending = 0; | ||
| 1511 | } | ||
| 1512 | } | ||
| 1513 | |||
| 1514 | return 0; | ||
| 1515 | } | ||
| 1516 | |||
| 1517 | int | ||
| 1518 | dtls1_heartbeat(SSL *s) | ||
| 1519 | { | ||
| 1520 | unsigned char *buf, *p; | ||
| 1521 | int ret; | ||
| 1522 | unsigned int payload = 18; /* Sequence number + random bytes */ | ||
| 1523 | unsigned int padding = 16; /* Use minimum padding */ | ||
| 1524 | |||
| 1525 | /* Only send if peer supports and accepts HB requests... */ | ||
| 1526 | if (!(s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) || | ||
| 1527 | s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_SEND_REQUESTS) | ||
| 1528 | { | ||
| 1529 | SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT); | ||
| 1530 | return -1; | ||
| 1531 | } | ||
| 1532 | |||
| 1533 | /* ...and there is none in flight yet... */ | ||
| 1534 | if (s->tlsext_hb_pending) | ||
| 1535 | { | ||
| 1536 | SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PENDING); | ||
| 1537 | return -1; | ||
| 1538 | } | ||
| 1539 | |||
| 1540 | /* ...and no handshake in progress. */ | ||
| 1541 | if (SSL_in_init(s) || s->in_handshake) | ||
| 1542 | { | ||
| 1543 | SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_UNEXPECTED_MESSAGE); | ||
| 1544 | return -1; | ||
| 1545 | } | ||
| 1546 | |||
| 1547 | /* Check if padding is too long, payload and padding | ||
| 1548 | * must not exceed 2^14 - 3 = 16381 bytes in total. | ||
| 1549 | */ | ||
| 1550 | OPENSSL_assert(payload + padding <= 16381); | ||
| 1551 | |||
| 1552 | /* Create HeartBeat message, we just use a sequence number | ||
| 1553 | * as payload to distuingish different messages and add | ||
| 1554 | * some random stuff. | ||
| 1555 | * - Message Type, 1 byte | ||
| 1556 | * - Payload Length, 2 bytes (unsigned int) | ||
| 1557 | * - Payload, the sequence number (2 bytes uint) | ||
| 1558 | * - Payload, random bytes (16 bytes uint) | ||
| 1559 | * - Padding | ||
| 1560 | */ | ||
| 1561 | buf = OPENSSL_malloc(1 + 2 + payload + padding); | ||
| 1562 | p = buf; | ||
| 1563 | /* Message Type */ | ||
| 1564 | *p++ = TLS1_HB_REQUEST; | ||
| 1565 | /* Payload length (18 bytes here) */ | ||
| 1566 | s2n(payload, p); | ||
| 1567 | /* Sequence number */ | ||
| 1568 | s2n(s->tlsext_hb_seq, p); | ||
| 1569 | /* 16 random bytes */ | ||
| 1570 | RAND_pseudo_bytes(p, 16); | ||
| 1571 | p += 16; | ||
| 1572 | /* Random padding */ | ||
| 1573 | RAND_pseudo_bytes(p, padding); | ||
| 1574 | |||
| 1575 | ret = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buf, 3 + payload + padding); | ||
| 1576 | if (ret >= 0) | ||
| 1577 | { | ||
| 1578 | if (s->msg_callback) | ||
| 1579 | s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, | ||
| 1580 | buf, 3 + payload + padding, | ||
| 1581 | s, s->msg_callback_arg); | ||
| 1582 | |||
| 1583 | dtls1_start_timer(s); | ||
| 1584 | s->tlsext_hb_pending = 1; | ||
| 1585 | } | ||
| 1586 | |||
| 1587 | OPENSSL_free(buf); | ||
| 1588 | |||
| 1589 | return ret; | ||
| 1590 | } | ||
| 1591 | #endif | ||
diff --git a/src/lib/libssl/d1_enc.c b/src/lib/libssl/d1_enc.c index becbab91c2..07a5e97ce5 100644 --- a/src/lib/libssl/d1_enc.c +++ b/src/lib/libssl/d1_enc.c | |||
| @@ -260,7 +260,7 @@ int dtls1_enc(SSL *s, int send) | |||
| 260 | } | 260 | } |
| 261 | /* TLS 1.0 does not bound the number of padding bytes by the block size. | 261 | /* TLS 1.0 does not bound the number of padding bytes by the block size. |
| 262 | * All of them must have value 'padding_length'. */ | 262 | * All of them must have value 'padding_length'. */ |
| 263 | if (i > (int)rec->length) | 263 | if (i + bs > (int)rec->length) |
| 264 | { | 264 | { |
| 265 | /* Incorrect padding. SSLerr() and ssl3_alert are done | 265 | /* Incorrect padding. SSLerr() and ssl3_alert are done |
| 266 | * by caller: we don't want to reveal whether this is | 266 | * by caller: we don't want to reveal whether this is |
diff --git a/src/lib/libssl/d1_lib.c b/src/lib/libssl/d1_lib.c index c3b77c889b..f61f718183 100644 --- a/src/lib/libssl/d1_lib.c +++ b/src/lib/libssl/d1_lib.c | |||
| @@ -82,6 +82,7 @@ SSL3_ENC_METHOD DTLSv1_enc_data={ | |||
| 82 | TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE, | 82 | TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE, |
| 83 | TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE, | 83 | TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE, |
| 84 | tls1_alert_code, | 84 | tls1_alert_code, |
| 85 | tls1_export_keying_material, | ||
| 85 | }; | 86 | }; |
| 86 | 87 | ||
| 87 | long dtls1_default_timeout(void) | 88 | long dtls1_default_timeout(void) |
| @@ -291,6 +292,15 @@ const SSL_CIPHER *dtls1_get_cipher(unsigned int u) | |||
| 291 | 292 | ||
| 292 | void dtls1_start_timer(SSL *s) | 293 | void dtls1_start_timer(SSL *s) |
| 293 | { | 294 | { |
| 295 | #ifndef OPENSSL_NO_SCTP | ||
| 296 | /* Disable timer for SCTP */ | ||
| 297 | if (BIO_dgram_is_sctp(SSL_get_wbio(s))) | ||
| 298 | { | ||
| 299 | memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); | ||
| 300 | return; | ||
| 301 | } | ||
| 302 | #endif | ||
| 303 | |||
| 294 | /* If timer is not set, initialize duration with 1 second */ | 304 | /* If timer is not set, initialize duration with 1 second */ |
| 295 | if (s->d1->next_timeout.tv_sec == 0 && s->d1->next_timeout.tv_usec == 0) | 305 | if (s->d1->next_timeout.tv_sec == 0 && s->d1->next_timeout.tv_usec == 0) |
| 296 | { | 306 | { |
| @@ -381,6 +391,7 @@ void dtls1_double_timeout(SSL *s) | |||
| 381 | void dtls1_stop_timer(SSL *s) | 391 | void dtls1_stop_timer(SSL *s) |
| 382 | { | 392 | { |
| 383 | /* Reset everything */ | 393 | /* Reset everything */ |
| 394 | memset(&(s->d1->timeout), 0, sizeof(struct dtls1_timeout_st)); | ||
| 384 | memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); | 395 | memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); |
| 385 | s->d1->timeout_duration = 1; | 396 | s->d1->timeout_duration = 1; |
| 386 | BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT, 0, &(s->d1->next_timeout)); | 397 | BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT, 0, &(s->d1->next_timeout)); |
| @@ -388,10 +399,28 @@ void dtls1_stop_timer(SSL *s) | |||
| 388 | dtls1_clear_record_buffer(s); | 399 | dtls1_clear_record_buffer(s); |
| 389 | } | 400 | } |
| 390 | 401 | ||
| 391 | int dtls1_handle_timeout(SSL *s) | 402 | int dtls1_check_timeout_num(SSL *s) |
| 392 | { | 403 | { |
| 393 | DTLS1_STATE *state; | 404 | s->d1->timeout.num_alerts++; |
| 405 | |||
| 406 | /* Reduce MTU after 2 unsuccessful retransmissions */ | ||
| 407 | if (s->d1->timeout.num_alerts > 2) | ||
| 408 | { | ||
| 409 | s->d1->mtu = BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_GET_FALLBACK_MTU, 0, NULL); | ||
| 410 | } | ||
| 394 | 411 | ||
| 412 | if (s->d1->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT) | ||
| 413 | { | ||
| 414 | /* fail the connection, enough alerts have been sent */ | ||
| 415 | SSLerr(SSL_F_DTLS1_CHECK_TIMEOUT_NUM,SSL_R_READ_TIMEOUT_EXPIRED); | ||
| 416 | return -1; | ||
| 417 | } | ||
| 418 | |||
| 419 | return 0; | ||
| 420 | } | ||
| 421 | |||
| 422 | int dtls1_handle_timeout(SSL *s) | ||
| 423 | { | ||
| 395 | /* if no timer is expired, don't do anything */ | 424 | /* if no timer is expired, don't do anything */ |
| 396 | if (!dtls1_is_timer_expired(s)) | 425 | if (!dtls1_is_timer_expired(s)) |
| 397 | { | 426 | { |
| @@ -399,20 +428,23 @@ int dtls1_handle_timeout(SSL *s) | |||
| 399 | } | 428 | } |
| 400 | 429 | ||
| 401 | dtls1_double_timeout(s); | 430 | dtls1_double_timeout(s); |
| 402 | state = s->d1; | 431 | |
| 403 | state->timeout.num_alerts++; | 432 | if (dtls1_check_timeout_num(s) < 0) |
| 404 | if ( state->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT) | ||
| 405 | { | ||
| 406 | /* fail the connection, enough alerts have been sent */ | ||
| 407 | SSLerr(SSL_F_DTLS1_HANDLE_TIMEOUT,SSL_R_READ_TIMEOUT_EXPIRED); | ||
| 408 | return -1; | 433 | return -1; |
| 434 | |||
| 435 | s->d1->timeout.read_timeouts++; | ||
| 436 | if (s->d1->timeout.read_timeouts > DTLS1_TMO_READ_COUNT) | ||
| 437 | { | ||
| 438 | s->d1->timeout.read_timeouts = 1; | ||
| 409 | } | 439 | } |
| 410 | 440 | ||
| 411 | state->timeout.read_timeouts++; | 441 | #ifndef OPENSSL_NO_HEARTBEATS |
| 412 | if ( state->timeout.read_timeouts > DTLS1_TMO_READ_COUNT) | 442 | if (s->tlsext_hb_pending) |
| 413 | { | 443 | { |
| 414 | state->timeout.read_timeouts = 1; | 444 | s->tlsext_hb_pending = 0; |
| 445 | return dtls1_heartbeat(s); | ||
| 415 | } | 446 | } |
| 447 | #endif | ||
| 416 | 448 | ||
| 417 | dtls1_start_timer(s); | 449 | dtls1_start_timer(s); |
| 418 | return dtls1_retransmit_buffered_messages(s); | 450 | return dtls1_retransmit_buffered_messages(s); |
diff --git a/src/lib/libssl/d1_srtp.c b/src/lib/libssl/d1_srtp.c new file mode 100644 index 0000000000..928935bd8b --- /dev/null +++ b/src/lib/libssl/d1_srtp.c | |||
| @@ -0,0 +1,493 @@ | |||
| 1 | /* ssl/t1_lib.c */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | /* ==================================================================== | ||
| 59 | * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. | ||
| 60 | * | ||
| 61 | * Redistribution and use in source and binary forms, with or without | ||
| 62 | * modification, are permitted provided that the following conditions | ||
| 63 | * are met: | ||
| 64 | * | ||
| 65 | * 1. Redistributions of source code must retain the above copyright | ||
| 66 | * notice, this list of conditions and the following disclaimer. | ||
| 67 | * | ||
| 68 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 69 | * notice, this list of conditions and the following disclaimer in | ||
| 70 | * the documentation and/or other materials provided with the | ||
| 71 | * distribution. | ||
| 72 | * | ||
| 73 | * 3. All advertising materials mentioning features or use of this | ||
| 74 | * software must display the following acknowledgment: | ||
| 75 | * "This product includes software developed by the OpenSSL Project | ||
| 76 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 77 | * | ||
| 78 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 79 | * endorse or promote products derived from this software without | ||
| 80 | * prior written permission. For written permission, please contact | ||
| 81 | * openssl-core@openssl.org. | ||
| 82 | * | ||
| 83 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 84 | * nor may "OpenSSL" appear in their names without prior written | ||
| 85 | * permission of the OpenSSL Project. | ||
| 86 | * | ||
| 87 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 88 | * acknowledgment: | ||
| 89 | * "This product includes software developed by the OpenSSL Project | ||
| 90 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 91 | * | ||
| 92 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 93 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 94 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 95 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 96 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 97 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 98 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 99 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 100 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 101 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 102 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 103 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 104 | * ==================================================================== | ||
| 105 | * | ||
| 106 | * This product includes cryptographic software written by Eric Young | ||
| 107 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
| 108 | * Hudson (tjh@cryptsoft.com). | ||
| 109 | * | ||
| 110 | */ | ||
| 111 | /* | ||
| 112 | DTLS code by Eric Rescorla <ekr@rtfm.com> | ||
| 113 | |||
| 114 | Copyright (C) 2006, Network Resonance, Inc. | ||
| 115 | Copyright (C) 2011, RTFM, Inc. | ||
| 116 | */ | ||
| 117 | |||
| 118 | #ifndef OPENSSL_NO_SRTP | ||
| 119 | |||
| 120 | #include <stdio.h> | ||
| 121 | #include <openssl/objects.h> | ||
| 122 | #include "ssl_locl.h" | ||
| 123 | #include "srtp.h" | ||
| 124 | |||
| 125 | |||
| 126 | static SRTP_PROTECTION_PROFILE srtp_known_profiles[]= | ||
| 127 | { | ||
| 128 | { | ||
| 129 | "SRTP_AES128_CM_SHA1_80", | ||
| 130 | SRTP_AES128_CM_SHA1_80, | ||
| 131 | }, | ||
| 132 | { | ||
| 133 | "SRTP_AES128_CM_SHA1_32", | ||
| 134 | SRTP_AES128_CM_SHA1_32, | ||
| 135 | }, | ||
| 136 | #if 0 | ||
| 137 | { | ||
| 138 | "SRTP_NULL_SHA1_80", | ||
| 139 | SRTP_NULL_SHA1_80, | ||
| 140 | }, | ||
| 141 | { | ||
| 142 | "SRTP_NULL_SHA1_32", | ||
| 143 | SRTP_NULL_SHA1_32, | ||
| 144 | }, | ||
| 145 | #endif | ||
| 146 | {0} | ||
| 147 | }; | ||
| 148 | |||
| 149 | static int find_profile_by_name(char *profile_name, | ||
| 150 | SRTP_PROTECTION_PROFILE **pptr,unsigned len) | ||
| 151 | { | ||
| 152 | SRTP_PROTECTION_PROFILE *p; | ||
| 153 | |||
| 154 | p=srtp_known_profiles; | ||
| 155 | while(p->name) | ||
| 156 | { | ||
| 157 | if((len == strlen(p->name)) && !strncmp(p->name,profile_name, | ||
| 158 | len)) | ||
| 159 | { | ||
| 160 | *pptr=p; | ||
| 161 | return 0; | ||
| 162 | } | ||
| 163 | |||
| 164 | p++; | ||
| 165 | } | ||
| 166 | |||
| 167 | return 1; | ||
| 168 | } | ||
| 169 | |||
| 170 | static int find_profile_by_num(unsigned profile_num, | ||
| 171 | SRTP_PROTECTION_PROFILE **pptr) | ||
| 172 | { | ||
| 173 | SRTP_PROTECTION_PROFILE *p; | ||
| 174 | |||
| 175 | p=srtp_known_profiles; | ||
| 176 | while(p->name) | ||
| 177 | { | ||
| 178 | if(p->id == profile_num) | ||
| 179 | { | ||
| 180 | *pptr=p; | ||
| 181 | return 0; | ||
| 182 | } | ||
| 183 | p++; | ||
| 184 | } | ||
| 185 | |||
| 186 | return 1; | ||
| 187 | } | ||
| 188 | |||
| 189 | static int ssl_ctx_make_profiles(const char *profiles_string,STACK_OF(SRTP_PROTECTION_PROFILE) **out) | ||
| 190 | { | ||
| 191 | STACK_OF(SRTP_PROTECTION_PROFILE) *profiles; | ||
| 192 | |||
| 193 | char *col; | ||
| 194 | char *ptr=(char *)profiles_string; | ||
| 195 | |||
| 196 | SRTP_PROTECTION_PROFILE *p; | ||
| 197 | |||
| 198 | if(!(profiles=sk_SRTP_PROTECTION_PROFILE_new_null())) | ||
| 199 | { | ||
| 200 | SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES, SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES); | ||
| 201 | return 1; | ||
| 202 | } | ||
| 203 | |||
| 204 | do | ||
| 205 | { | ||
| 206 | col=strchr(ptr,':'); | ||
| 207 | |||
| 208 | if(!find_profile_by_name(ptr,&p, | ||
| 209 | col ? col-ptr : (int)strlen(ptr))) | ||
| 210 | { | ||
| 211 | sk_SRTP_PROTECTION_PROFILE_push(profiles,p); | ||
| 212 | } | ||
| 213 | else | ||
| 214 | { | ||
| 215 | SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES,SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE); | ||
| 216 | return 1; | ||
| 217 | } | ||
| 218 | |||
| 219 | if(col) ptr=col+1; | ||
| 220 | } while (col); | ||
| 221 | |||
| 222 | *out=profiles; | ||
| 223 | |||
| 224 | return 0; | ||
| 225 | } | ||
| 226 | |||
| 227 | int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx,const char *profiles) | ||
| 228 | { | ||
| 229 | return ssl_ctx_make_profiles(profiles,&ctx->srtp_profiles); | ||
| 230 | } | ||
| 231 | |||
| 232 | int SSL_set_tlsext_use_srtp(SSL *s,const char *profiles) | ||
| 233 | { | ||
| 234 | return ssl_ctx_make_profiles(profiles,&s->srtp_profiles); | ||
| 235 | } | ||
| 236 | |||
| 237 | |||
| 238 | STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *s) | ||
| 239 | { | ||
| 240 | if(s != NULL) | ||
| 241 | { | ||
| 242 | if(s->srtp_profiles != NULL) | ||
| 243 | { | ||
| 244 | return s->srtp_profiles; | ||
| 245 | } | ||
| 246 | else if((s->ctx != NULL) && | ||
| 247 | (s->ctx->srtp_profiles != NULL)) | ||
| 248 | { | ||
| 249 | return s->ctx->srtp_profiles; | ||
| 250 | } | ||
| 251 | } | ||
| 252 | |||
| 253 | return NULL; | ||
| 254 | } | ||
| 255 | |||
| 256 | SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s) | ||
| 257 | { | ||
| 258 | return s->srtp_profile; | ||
| 259 | } | ||
| 260 | |||
| 261 | /* Note: this function returns 0 length if there are no | ||
| 262 | profiles specified */ | ||
| 263 | int ssl_add_clienthello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen) | ||
| 264 | { | ||
| 265 | int ct=0; | ||
| 266 | int i; | ||
| 267 | STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0; | ||
| 268 | SRTP_PROTECTION_PROFILE *prof; | ||
| 269 | |||
| 270 | clnt=SSL_get_srtp_profiles(s); | ||
| 271 | ct=sk_SRTP_PROTECTION_PROFILE_num(clnt); /* -1 if clnt == 0 */ | ||
| 272 | |||
| 273 | if(p) | ||
| 274 | { | ||
| 275 | if(ct==0) | ||
| 276 | { | ||
| 277 | SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST); | ||
| 278 | return 1; | ||
| 279 | } | ||
| 280 | |||
| 281 | if((2 + ct*2 + 1) > maxlen) | ||
| 282 | { | ||
| 283 | SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG); | ||
| 284 | return 1; | ||
| 285 | } | ||
| 286 | |||
| 287 | /* Add the length */ | ||
| 288 | s2n(ct * 2, p); | ||
| 289 | for(i=0;i<ct;i++) | ||
| 290 | { | ||
| 291 | prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i); | ||
| 292 | s2n(prof->id,p); | ||
| 293 | } | ||
| 294 | |||
| 295 | /* Add an empty use_mki value */ | ||
| 296 | *p++ = 0; | ||
| 297 | } | ||
| 298 | |||
| 299 | *len=2 + ct*2 + 1; | ||
| 300 | |||
| 301 | return 0; | ||
| 302 | } | ||
| 303 | |||
| 304 | |||
| 305 | int ssl_parse_clienthello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al) | ||
| 306 | { | ||
| 307 | SRTP_PROTECTION_PROFILE *cprof,*sprof; | ||
| 308 | STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0,*srvr; | ||
| 309 | int ct; | ||
| 310 | int mki_len; | ||
| 311 | int i,j; | ||
| 312 | int id; | ||
| 313 | int ret; | ||
| 314 | |||
| 315 | /* Length value + the MKI length */ | ||
| 316 | if(len < 3) | ||
| 317 | { | ||
| 318 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 319 | *al=SSL_AD_DECODE_ERROR; | ||
| 320 | return 1; | ||
| 321 | } | ||
| 322 | |||
| 323 | /* Pull off the length of the cipher suite list */ | ||
| 324 | n2s(d, ct); | ||
| 325 | len -= 2; | ||
| 326 | |||
| 327 | /* Check that it is even */ | ||
| 328 | if(ct%2) | ||
| 329 | { | ||
| 330 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 331 | *al=SSL_AD_DECODE_ERROR; | ||
| 332 | return 1; | ||
| 333 | } | ||
| 334 | |||
| 335 | /* Check that lengths are consistent */ | ||
| 336 | if(len < (ct + 1)) | ||
| 337 | { | ||
| 338 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 339 | *al=SSL_AD_DECODE_ERROR; | ||
| 340 | return 1; | ||
| 341 | } | ||
| 342 | |||
| 343 | |||
| 344 | clnt=sk_SRTP_PROTECTION_PROFILE_new_null(); | ||
| 345 | |||
| 346 | while(ct) | ||
| 347 | { | ||
| 348 | n2s(d,id); | ||
| 349 | ct-=2; | ||
| 350 | len-=2; | ||
| 351 | |||
| 352 | if(!find_profile_by_num(id,&cprof)) | ||
| 353 | { | ||
| 354 | sk_SRTP_PROTECTION_PROFILE_push(clnt,cprof); | ||
| 355 | } | ||
| 356 | else | ||
| 357 | { | ||
| 358 | ; /* Ignore */ | ||
| 359 | } | ||
| 360 | } | ||
| 361 | |||
| 362 | /* Now extract the MKI value as a sanity check, but discard it for now */ | ||
| 363 | mki_len = *d; | ||
| 364 | d++; len--; | ||
| 365 | |||
| 366 | if (mki_len != len) | ||
| 367 | { | ||
| 368 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE); | ||
| 369 | *al=SSL_AD_DECODE_ERROR; | ||
| 370 | return 1; | ||
| 371 | } | ||
| 372 | |||
| 373 | srvr=SSL_get_srtp_profiles(s); | ||
| 374 | |||
| 375 | /* Pick our most preferred profile. If no profiles have been | ||
| 376 | configured then the outer loop doesn't run | ||
| 377 | (sk_SRTP_PROTECTION_PROFILE_num() = -1) | ||
| 378 | and so we just return without doing anything */ | ||
| 379 | for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(srvr);i++) | ||
| 380 | { | ||
| 381 | sprof=sk_SRTP_PROTECTION_PROFILE_value(srvr,i); | ||
| 382 | |||
| 383 | for(j=0;j<sk_SRTP_PROTECTION_PROFILE_num(clnt);j++) | ||
| 384 | { | ||
| 385 | cprof=sk_SRTP_PROTECTION_PROFILE_value(clnt,j); | ||
| 386 | |||
| 387 | if(cprof->id==sprof->id) | ||
| 388 | { | ||
| 389 | s->srtp_profile=sprof; | ||
| 390 | *al=0; | ||
| 391 | ret=0; | ||
| 392 | goto done; | ||
| 393 | } | ||
| 394 | } | ||
| 395 | } | ||
| 396 | |||
| 397 | ret=0; | ||
| 398 | |||
| 399 | done: | ||
| 400 | if(clnt) sk_SRTP_PROTECTION_PROFILE_free(clnt); | ||
| 401 | |||
| 402 | return ret; | ||
| 403 | } | ||
| 404 | |||
| 405 | int ssl_add_serverhello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen) | ||
| 406 | { | ||
| 407 | if(p) | ||
| 408 | { | ||
| 409 | if(maxlen < 5) | ||
| 410 | { | ||
| 411 | SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG); | ||
| 412 | return 1; | ||
| 413 | } | ||
| 414 | |||
| 415 | if(s->srtp_profile==0) | ||
| 416 | { | ||
| 417 | SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_USE_SRTP_NOT_NEGOTIATED); | ||
| 418 | return 1; | ||
| 419 | } | ||
| 420 | s2n(2, p); | ||
| 421 | s2n(s->srtp_profile->id,p); | ||
| 422 | *p++ = 0; | ||
| 423 | } | ||
| 424 | *len=5; | ||
| 425 | |||
| 426 | return 0; | ||
| 427 | } | ||
| 428 | |||
| 429 | |||
| 430 | int ssl_parse_serverhello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al) | ||
| 431 | { | ||
| 432 | unsigned id; | ||
| 433 | int i; | ||
| 434 | int ct; | ||
| 435 | |||
| 436 | STACK_OF(SRTP_PROTECTION_PROFILE) *clnt; | ||
| 437 | SRTP_PROTECTION_PROFILE *prof; | ||
| 438 | |||
| 439 | if(len!=5) | ||
| 440 | { | ||
| 441 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 442 | *al=SSL_AD_DECODE_ERROR; | ||
| 443 | return 1; | ||
| 444 | } | ||
| 445 | |||
| 446 | n2s(d, ct); | ||
| 447 | if(ct!=2) | ||
| 448 | { | ||
| 449 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 450 | *al=SSL_AD_DECODE_ERROR; | ||
| 451 | return 1; | ||
| 452 | } | ||
| 453 | |||
| 454 | n2s(d,id); | ||
| 455 | if (*d) /* Must be no MKI, since we never offer one */ | ||
| 456 | { | ||
| 457 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE); | ||
| 458 | *al=SSL_AD_ILLEGAL_PARAMETER; | ||
| 459 | return 1; | ||
| 460 | } | ||
| 461 | |||
| 462 | clnt=SSL_get_srtp_profiles(s); | ||
| 463 | |||
| 464 | /* Throw an error if the server gave us an unsolicited extension */ | ||
| 465 | if (clnt == NULL) | ||
| 466 | { | ||
| 467 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_NO_SRTP_PROFILES); | ||
| 468 | *al=SSL_AD_DECODE_ERROR; | ||
| 469 | return 1; | ||
| 470 | } | ||
| 471 | |||
| 472 | /* Check to see if the server gave us something we support | ||
| 473 | (and presumably offered) | ||
| 474 | */ | ||
| 475 | for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(clnt);i++) | ||
| 476 | { | ||
| 477 | prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i); | ||
| 478 | |||
| 479 | if(prof->id == id) | ||
| 480 | { | ||
| 481 | s->srtp_profile=prof; | ||
| 482 | *al=0; | ||
| 483 | return 0; | ||
| 484 | } | ||
| 485 | } | ||
| 486 | |||
| 487 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
| 488 | *al=SSL_AD_DECODE_ERROR; | ||
| 489 | return 1; | ||
| 490 | } | ||
| 491 | |||
| 492 | |||
| 493 | #endif | ||
diff --git a/src/lib/libssl/srtp.h b/src/lib/libssl/srtp.h new file mode 100644 index 0000000000..c0cf33ef28 --- /dev/null +++ b/src/lib/libssl/srtp.h | |||
| @@ -0,0 +1,145 @@ | |||
| 1 | /* ssl/tls1.h */ | ||
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
| 3 | * All rights reserved. | ||
| 4 | * | ||
| 5 | * This package is an SSL implementation written | ||
| 6 | * by Eric Young (eay@cryptsoft.com). | ||
| 7 | * The implementation was written so as to conform with Netscapes SSL. | ||
| 8 | * | ||
| 9 | * This library is free for commercial and non-commercial use as long as | ||
| 10 | * the following conditions are aheared to. The following conditions | ||
| 11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
| 12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
| 13 | * included with this distribution is covered by the same copyright terms | ||
| 14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
| 15 | * | ||
| 16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
| 17 | * the code are not to be removed. | ||
| 18 | * If this package is used in a product, Eric Young should be given attribution | ||
| 19 | * as the author of the parts of the library used. | ||
| 20 | * This can be in the form of a textual message at program startup or | ||
| 21 | * in documentation (online or textual) provided with the package. | ||
| 22 | * | ||
| 23 | * Redistribution and use in source and binary forms, with or without | ||
| 24 | * modification, are permitted provided that the following conditions | ||
| 25 | * are met: | ||
| 26 | * 1. Redistributions of source code must retain the copyright | ||
| 27 | * notice, this list of conditions and the following disclaimer. | ||
| 28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 29 | * notice, this list of conditions and the following disclaimer in the | ||
| 30 | * documentation and/or other materials provided with the distribution. | ||
| 31 | * 3. All advertising materials mentioning features or use of this software | ||
| 32 | * must display the following acknowledgement: | ||
| 33 | * "This product includes cryptographic software written by | ||
| 34 | * Eric Young (eay@cryptsoft.com)" | ||
| 35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
| 36 | * being used are not cryptographic related :-). | ||
| 37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
| 38 | * the apps directory (application code) you must include an acknowledgement: | ||
| 39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
| 40 | * | ||
| 41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
| 51 | * SUCH DAMAGE. | ||
| 52 | * | ||
| 53 | * The licence and distribution terms for any publically available version or | ||
| 54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
| 55 | * copied and put under another distribution licence | ||
| 56 | * [including the GNU Public Licence.] | ||
| 57 | */ | ||
| 58 | /* ==================================================================== | ||
| 59 | * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. | ||
| 60 | * | ||
| 61 | * Redistribution and use in source and binary forms, with or without | ||
| 62 | * modification, are permitted provided that the following conditions | ||
| 63 | * are met: | ||
| 64 | * | ||
| 65 | * 1. Redistributions of source code must retain the above copyright | ||
| 66 | * notice, this list of conditions and the following disclaimer. | ||
| 67 | * | ||
| 68 | * 2. Redistributions in binary form must reproduce the above copyright | ||
| 69 | * notice, this list of conditions and the following disclaimer in | ||
| 70 | * the documentation and/or other materials provided with the | ||
| 71 | * distribution. | ||
| 72 | * | ||
| 73 | * 3. All advertising materials mentioning features or use of this | ||
| 74 | * software must display the following acknowledgment: | ||
| 75 | * "This product includes software developed by the OpenSSL Project | ||
| 76 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
| 77 | * | ||
| 78 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
| 79 | * endorse or promote products derived from this software without | ||
| 80 | * prior written permission. For written permission, please contact | ||
| 81 | * openssl-core@openssl.org. | ||
| 82 | * | ||
| 83 | * 5. Products derived from this software may not be called "OpenSSL" | ||
| 84 | * nor may "OpenSSL" appear in their names without prior written | ||
| 85 | * permission of the OpenSSL Project. | ||
| 86 | * | ||
| 87 | * 6. Redistributions of any form whatsoever must retain the following | ||
| 88 | * acknowledgment: | ||
| 89 | * "This product includes software developed by the OpenSSL Project | ||
| 90 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
| 91 | * | ||
| 92 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
| 93 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 94 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
| 95 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
| 96 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
| 97 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 98 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
| 99 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
| 100 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
| 101 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 102 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
| 103 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 104 | * ==================================================================== | ||
| 105 | * | ||
| 106 | * This product includes cryptographic software written by Eric Young | ||
| 107 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
| 108 | * Hudson (tjh@cryptsoft.com). | ||
| 109 | * | ||
| 110 | */ | ||
| 111 | /* | ||
| 112 | DTLS code by Eric Rescorla <ekr@rtfm.com> | ||
| 113 | |||
| 114 | Copyright (C) 2006, Network Resonance, Inc. | ||
| 115 | Copyright (C) 2011, RTFM, Inc. | ||
| 116 | */ | ||
| 117 | |||
| 118 | #ifndef HEADER_D1_SRTP_H | ||
| 119 | #define HEADER_D1_SRTP_H | ||
| 120 | |||
| 121 | #ifdef __cplusplus | ||
| 122 | extern "C" { | ||
| 123 | #endif | ||
| 124 | |||
| 125 | |||
| 126 | #define SRTP_AES128_CM_SHA1_80 0x0001 | ||
| 127 | #define SRTP_AES128_CM_SHA1_32 0x0002 | ||
| 128 | #define SRTP_AES128_F8_SHA1_80 0x0003 | ||
| 129 | #define SRTP_AES128_F8_SHA1_32 0x0004 | ||
| 130 | #define SRTP_NULL_SHA1_80 0x0005 | ||
| 131 | #define SRTP_NULL_SHA1_32 0x0006 | ||
| 132 | |||
| 133 | int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx, const char *profiles); | ||
| 134 | int SSL_set_tlsext_use_srtp(SSL *ctx, const char *profiles); | ||
| 135 | SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s); | ||
| 136 | |||
| 137 | STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *ssl); | ||
| 138 | SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s); | ||
| 139 | |||
| 140 | #ifdef __cplusplus | ||
| 141 | } | ||
| 142 | #endif | ||
| 143 | |||
| 144 | #endif | ||
| 145 | |||
diff --git a/src/lib/libssl/test/P1ss.cnf b/src/lib/libssl/test/P1ss.cnf index 876a0d35f8..326cce2ba8 100644 --- a/src/lib/libssl/test/P1ss.cnf +++ b/src/lib/libssl/test/P1ss.cnf | |||
| @@ -7,7 +7,7 @@ RANDFILE = ./.rnd | |||
| 7 | 7 | ||
| 8 | #################################################################### | 8 | #################################################################### |
| 9 | [ req ] | 9 | [ req ] |
| 10 | default_bits = 512 | 10 | default_bits = 1024 |
| 11 | default_keyfile = keySS.pem | 11 | default_keyfile = keySS.pem |
| 12 | distinguished_name = req_distinguished_name | 12 | distinguished_name = req_distinguished_name |
| 13 | encrypt_rsa_key = no | 13 | encrypt_rsa_key = no |
diff --git a/src/lib/libssl/test/P2ss.cnf b/src/lib/libssl/test/P2ss.cnf index 373a87e7c2..8b502321b8 100644 --- a/src/lib/libssl/test/P2ss.cnf +++ b/src/lib/libssl/test/P2ss.cnf | |||
| @@ -7,7 +7,7 @@ RANDFILE = ./.rnd | |||
| 7 | 7 | ||
| 8 | #################################################################### | 8 | #################################################################### |
| 9 | [ req ] | 9 | [ req ] |
| 10 | default_bits = 512 | 10 | default_bits = 1024 |
| 11 | default_keyfile = keySS.pem | 11 | default_keyfile = keySS.pem |
| 12 | distinguished_name = req_distinguished_name | 12 | distinguished_name = req_distinguished_name |
| 13 | encrypt_rsa_key = no | 13 | encrypt_rsa_key = no |
diff --git a/src/lib/libssl/test/pkits-test.pl b/src/lib/libssl/test/pkits-test.pl index 69dffa16f9..5c6b89fcdb 100644 --- a/src/lib/libssl/test/pkits-test.pl +++ b/src/lib/libssl/test/pkits-test.pl | |||
| @@ -784,6 +784,15 @@ my $ossl = "ossl/apps/openssl"; | |||
| 784 | 784 | ||
| 785 | my $ossl_cmd = "$ossl_path cms -verify -verify_retcode "; | 785 | my $ossl_cmd = "$ossl_path cms -verify -verify_retcode "; |
| 786 | $ossl_cmd .= "-CAfile pkitsta.pem -crl_check_all -x509_strict "; | 786 | $ossl_cmd .= "-CAfile pkitsta.pem -crl_check_all -x509_strict "; |
| 787 | |||
| 788 | # Check for expiry of trust anchor | ||
| 789 | system "$ossl_path x509 -inform DER -in $pkitsta -checkend 0"; | ||
| 790 | if ($? == 256) | ||
| 791 | { | ||
| 792 | print STDERR "WARNING: using older expired data\n"; | ||
| 793 | $ossl_cmd .= "-attime 1291940972 "; | ||
| 794 | } | ||
| 795 | |||
| 787 | $ossl_cmd .= "-policy_check -extended_crl -use_deltas -out /dev/null 2>&1 "; | 796 | $ossl_cmd .= "-policy_check -extended_crl -use_deltas -out /dev/null 2>&1 "; |
| 788 | 797 | ||
| 789 | system "$ossl_path x509 -inform DER -in $pkitsta -out pkitsta.pem"; | 798 | system "$ossl_path x509 -inform DER -in $pkitsta -out pkitsta.pem"; |
diff --git a/src/lib/libssl/test/test.cnf b/src/lib/libssl/test/test.cnf index faad3914a8..10834442a1 100644 --- a/src/lib/libssl/test/test.cnf +++ b/src/lib/libssl/test/test.cnf | |||
| @@ -56,7 +56,7 @@ emailAddress = optional | |||
| 56 | 56 | ||
| 57 | #################################################################### | 57 | #################################################################### |
| 58 | [ req ] | 58 | [ req ] |
| 59 | default_bits = 512 | 59 | default_bits = 1024 |
| 60 | default_keyfile = testkey.pem | 60 | default_keyfile = testkey.pem |
| 61 | distinguished_name = req_distinguished_name | 61 | distinguished_name = req_distinguished_name |
| 62 | encrypt_rsa_key = no | 62 | encrypt_rsa_key = no |
