diff options
author | djm <> | 2012-10-13 21:23:50 +0000 |
---|---|---|
committer | djm <> | 2012-10-13 21:23:50 +0000 |
commit | e9d65189905c6e99c1062d65e26bf83eebb0a26a (patch) | |
tree | 10ebe51c3542099b0ab8325d8f322372375dc3b4 | |
parent | 59625e84c89bf82e1c6d20c55785b618eb56ea72 (diff) | |
parent | 228cae30b117c2493f69ad3c195341cd6ec8d430 (diff) | |
download | openbsd-e9d65189905c6e99c1062d65e26bf83eebb0a26a.tar.gz openbsd-e9d65189905c6e99c1062d65e26bf83eebb0a26a.tar.bz2 openbsd-e9d65189905c6e99c1062d65e26bf83eebb0a26a.zip |
This commit was generated by cvs2git to track changes on a CVS vendor
branch.
Diffstat (limited to '')
160 files changed, 48653 insertions, 1420 deletions
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl index c51ee1fbf6..86b86c4a0f 100644 --- a/src/lib/libcrypto/aes/asm/aes-armv4.pl +++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl | |||
@@ -27,6 +27,11 @@ | |||
27 | # Rescheduling for dual-issue pipeline resulted in 12% improvement on | 27 | # Rescheduling for dual-issue pipeline resulted in 12% improvement on |
28 | # Cortex A8 core and ~25 cycles per byte processed with 128-bit key. | 28 | # Cortex A8 core and ~25 cycles per byte processed with 128-bit key. |
29 | 29 | ||
30 | # February 2011. | ||
31 | # | ||
32 | # Profiler-assisted and platform-specific optimization resulted in 16% | ||
33 | # improvement on Cortex A8 core and ~21.5 cycles per byte. | ||
34 | |||
30 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 35 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
31 | open STDOUT,">$output"; | 36 | open STDOUT,">$output"; |
32 | 37 | ||
@@ -46,6 +51,7 @@ $key="r11"; | |||
46 | $rounds="r12"; | 51 | $rounds="r12"; |
47 | 52 | ||
48 | $code=<<___; | 53 | $code=<<___; |
54 | #include "arm_arch.h" | ||
49 | .text | 55 | .text |
50 | .code 32 | 56 | .code 32 |
51 | 57 | ||
@@ -166,7 +172,7 @@ AES_encrypt: | |||
166 | mov $rounds,r0 @ inp | 172 | mov $rounds,r0 @ inp |
167 | mov $key,r2 | 173 | mov $key,r2 |
168 | sub $tbl,r3,#AES_encrypt-AES_Te @ Te | 174 | sub $tbl,r3,#AES_encrypt-AES_Te @ Te |
169 | 175 | #if __ARM_ARCH__<7 | |
170 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | 176 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral |
171 | ldrb $t1,[$rounds,#2] @ manner... | 177 | ldrb $t1,[$rounds,#2] @ manner... |
172 | ldrb $t2,[$rounds,#1] | 178 | ldrb $t2,[$rounds,#1] |
@@ -195,10 +201,33 @@ AES_encrypt: | |||
195 | orr $s3,$s3,$t1,lsl#8 | 201 | orr $s3,$s3,$t1,lsl#8 |
196 | orr $s3,$s3,$t2,lsl#16 | 202 | orr $s3,$s3,$t2,lsl#16 |
197 | orr $s3,$s3,$t3,lsl#24 | 203 | orr $s3,$s3,$t3,lsl#24 |
198 | 204 | #else | |
205 | ldr $s0,[$rounds,#0] | ||
206 | ldr $s1,[$rounds,#4] | ||
207 | ldr $s2,[$rounds,#8] | ||
208 | ldr $s3,[$rounds,#12] | ||
209 | #ifdef __ARMEL__ | ||
210 | rev $s0,$s0 | ||
211 | rev $s1,$s1 | ||
212 | rev $s2,$s2 | ||
213 | rev $s3,$s3 | ||
214 | #endif | ||
215 | #endif | ||
199 | bl _armv4_AES_encrypt | 216 | bl _armv4_AES_encrypt |
200 | 217 | ||
201 | ldr $rounds,[sp],#4 @ pop out | 218 | ldr $rounds,[sp],#4 @ pop out |
219 | #if __ARM_ARCH__>=7 | ||
220 | #ifdef __ARMEL__ | ||
221 | rev $s0,$s0 | ||
222 | rev $s1,$s1 | ||
223 | rev $s2,$s2 | ||
224 | rev $s3,$s3 | ||
225 | #endif | ||
226 | str $s0,[$rounds,#0] | ||
227 | str $s1,[$rounds,#4] | ||
228 | str $s2,[$rounds,#8] | ||
229 | str $s3,[$rounds,#12] | ||
230 | #else | ||
202 | mov $t1,$s0,lsr#24 @ write output in endian-neutral | 231 | mov $t1,$s0,lsr#24 @ write output in endian-neutral |
203 | mov $t2,$s0,lsr#16 @ manner... | 232 | mov $t2,$s0,lsr#16 @ manner... |
204 | mov $t3,$s0,lsr#8 | 233 | mov $t3,$s0,lsr#8 |
@@ -227,11 +256,15 @@ AES_encrypt: | |||
227 | strb $t2,[$rounds,#13] | 256 | strb $t2,[$rounds,#13] |
228 | strb $t3,[$rounds,#14] | 257 | strb $t3,[$rounds,#14] |
229 | strb $s3,[$rounds,#15] | 258 | strb $s3,[$rounds,#15] |
230 | 259 | #endif | |
260 | #if __ARM_ARCH__>=5 | ||
261 | ldmia sp!,{r4-r12,pc} | ||
262 | #else | ||
231 | ldmia sp!,{r4-r12,lr} | 263 | ldmia sp!,{r4-r12,lr} |
232 | tst lr,#1 | 264 | tst lr,#1 |
233 | moveq pc,lr @ be binary compatible with V4, yet | 265 | moveq pc,lr @ be binary compatible with V4, yet |
234 | bx lr @ interoperable with Thumb ISA:-) | 266 | bx lr @ interoperable with Thumb ISA:-) |
267 | #endif | ||
235 | .size AES_encrypt,.-AES_encrypt | 268 | .size AES_encrypt,.-AES_encrypt |
236 | 269 | ||
237 | .type _armv4_AES_encrypt,%function | 270 | .type _armv4_AES_encrypt,%function |
@@ -271,11 +304,11 @@ _armv4_AES_encrypt: | |||
271 | and $i2,lr,$s2,lsr#16 @ i1 | 304 | and $i2,lr,$s2,lsr#16 @ i1 |
272 | eor $t3,$t3,$i3,ror#8 | 305 | eor $t3,$t3,$i3,ror#8 |
273 | and $i3,lr,$s2 | 306 | and $i3,lr,$s2 |
274 | eor $s1,$s1,$t1,ror#24 | ||
275 | ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] | 307 | ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] |
308 | eor $s1,$s1,$t1,ror#24 | ||
309 | ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] | ||
276 | mov $s2,$s2,lsr#24 | 310 | mov $s2,$s2,lsr#24 |
277 | 311 | ||
278 | ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] | ||
279 | ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] | 312 | ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] |
280 | eor $s0,$s0,$i1,ror#16 | 313 | eor $s0,$s0,$i1,ror#16 |
281 | ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] | 314 | ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] |
@@ -284,16 +317,16 @@ _armv4_AES_encrypt: | |||
284 | and $i2,lr,$s3,lsr#8 @ i1 | 317 | and $i2,lr,$s3,lsr#8 @ i1 |
285 | eor $t3,$t3,$i3,ror#16 | 318 | eor $t3,$t3,$i3,ror#16 |
286 | and $i3,lr,$s3,lsr#16 @ i2 | 319 | and $i3,lr,$s3,lsr#16 @ i2 |
287 | eor $s2,$s2,$t2,ror#16 | ||
288 | ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] | 320 | ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] |
321 | eor $s2,$s2,$t2,ror#16 | ||
322 | ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] | ||
289 | mov $s3,$s3,lsr#24 | 323 | mov $s3,$s3,lsr#24 |
290 | 324 | ||
291 | ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] | ||
292 | ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] | 325 | ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] |
293 | eor $s0,$s0,$i1,ror#24 | 326 | eor $s0,$s0,$i1,ror#24 |
294 | ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] | ||
295 | eor $s1,$s1,$i2,ror#16 | ||
296 | ldr $i1,[$key],#16 | 327 | ldr $i1,[$key],#16 |
328 | eor $s1,$s1,$i2,ror#16 | ||
329 | ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] | ||
297 | eor $s2,$s2,$i3,ror#8 | 330 | eor $s2,$s2,$i3,ror#8 |
298 | ldr $t1,[$key,#-12] | 331 | ldr $t1,[$key,#-12] |
299 | eor $s3,$s3,$t3,ror#8 | 332 | eor $s3,$s3,$t3,ror#8 |
@@ -333,11 +366,11 @@ _armv4_AES_encrypt: | |||
333 | and $i2,lr,$s2,lsr#16 @ i1 | 366 | and $i2,lr,$s2,lsr#16 @ i1 |
334 | eor $t3,$i3,$t3,lsl#8 | 367 | eor $t3,$i3,$t3,lsl#8 |
335 | and $i3,lr,$s2 | 368 | and $i3,lr,$s2 |
336 | eor $s1,$t1,$s1,lsl#24 | ||
337 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] | 369 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] |
370 | eor $s1,$t1,$s1,lsl#24 | ||
371 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] | ||
338 | mov $s2,$s2,lsr#24 | 372 | mov $s2,$s2,lsr#24 |
339 | 373 | ||
340 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] | ||
341 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] | 374 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] |
342 | eor $s0,$i1,$s0,lsl#8 | 375 | eor $s0,$i1,$s0,lsl#8 |
343 | ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] | 376 | ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] |
@@ -346,15 +379,15 @@ _armv4_AES_encrypt: | |||
346 | and $i2,lr,$s3,lsr#8 @ i1 | 379 | and $i2,lr,$s3,lsr#8 @ i1 |
347 | eor $t3,$i3,$t3,lsl#8 | 380 | eor $t3,$i3,$t3,lsl#8 |
348 | and $i3,lr,$s3,lsr#16 @ i2 | 381 | and $i3,lr,$s3,lsr#16 @ i2 |
349 | eor $s2,$t2,$s2,lsl#24 | ||
350 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] | 382 | ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] |
383 | eor $s2,$t2,$s2,lsl#24 | ||
384 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] | ||
351 | mov $s3,$s3,lsr#24 | 385 | mov $s3,$s3,lsr#24 |
352 | 386 | ||
353 | ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] | ||
354 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] | 387 | ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] |
355 | eor $s0,$i1,$s0,lsl#8 | 388 | eor $s0,$i1,$s0,lsl#8 |
356 | ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] | ||
357 | ldr $i1,[$key,#0] | 389 | ldr $i1,[$key,#0] |
390 | ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] | ||
358 | eor $s1,$s1,$i2,lsl#8 | 391 | eor $s1,$s1,$i2,lsl#8 |
359 | ldr $t1,[$key,#4] | 392 | ldr $t1,[$key,#4] |
360 | eor $s2,$s2,$i3,lsl#16 | 393 | eor $s2,$s2,$i3,lsl#16 |
@@ -371,10 +404,11 @@ _armv4_AES_encrypt: | |||
371 | ldr pc,[sp],#4 @ pop and return | 404 | ldr pc,[sp],#4 @ pop and return |
372 | .size _armv4_AES_encrypt,.-_armv4_AES_encrypt | 405 | .size _armv4_AES_encrypt,.-_armv4_AES_encrypt |
373 | 406 | ||
374 | .global AES_set_encrypt_key | 407 | .global private_AES_set_encrypt_key |
375 | .type AES_set_encrypt_key,%function | 408 | .type private_AES_set_encrypt_key,%function |
376 | .align 5 | 409 | .align 5 |
377 | AES_set_encrypt_key: | 410 | private_AES_set_encrypt_key: |
411 | _armv4_AES_set_encrypt_key: | ||
378 | sub r3,pc,#8 @ AES_set_encrypt_key | 412 | sub r3,pc,#8 @ AES_set_encrypt_key |
379 | teq r0,#0 | 413 | teq r0,#0 |
380 | moveq r0,#-1 | 414 | moveq r0,#-1 |
@@ -392,12 +426,13 @@ AES_set_encrypt_key: | |||
392 | bne .Labrt | 426 | bne .Labrt |
393 | 427 | ||
394 | .Lok: stmdb sp!,{r4-r12,lr} | 428 | .Lok: stmdb sp!,{r4-r12,lr} |
395 | sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4 | 429 | sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 |
396 | 430 | ||
397 | mov $rounds,r0 @ inp | 431 | mov $rounds,r0 @ inp |
398 | mov lr,r1 @ bits | 432 | mov lr,r1 @ bits |
399 | mov $key,r2 @ key | 433 | mov $key,r2 @ key |
400 | 434 | ||
435 | #if __ARM_ARCH__<7 | ||
401 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | 436 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral |
402 | ldrb $t1,[$rounds,#2] @ manner... | 437 | ldrb $t1,[$rounds,#2] @ manner... |
403 | ldrb $t2,[$rounds,#1] | 438 | ldrb $t2,[$rounds,#1] |
@@ -430,6 +465,22 @@ AES_set_encrypt_key: | |||
430 | orr $s3,$s3,$t3,lsl#24 | 465 | orr $s3,$s3,$t3,lsl#24 |
431 | str $s2,[$key,#-8] | 466 | str $s2,[$key,#-8] |
432 | str $s3,[$key,#-4] | 467 | str $s3,[$key,#-4] |
468 | #else | ||
469 | ldr $s0,[$rounds,#0] | ||
470 | ldr $s1,[$rounds,#4] | ||
471 | ldr $s2,[$rounds,#8] | ||
472 | ldr $s3,[$rounds,#12] | ||
473 | #ifdef __ARMEL__ | ||
474 | rev $s0,$s0 | ||
475 | rev $s1,$s1 | ||
476 | rev $s2,$s2 | ||
477 | rev $s3,$s3 | ||
478 | #endif | ||
479 | str $s0,[$key],#16 | ||
480 | str $s1,[$key,#-12] | ||
481 | str $s2,[$key,#-8] | ||
482 | str $s3,[$key,#-4] | ||
483 | #endif | ||
433 | 484 | ||
434 | teq lr,#128 | 485 | teq lr,#128 |
435 | bne .Lnot128 | 486 | bne .Lnot128 |
@@ -466,6 +517,7 @@ AES_set_encrypt_key: | |||
466 | b .Ldone | 517 | b .Ldone |
467 | 518 | ||
468 | .Lnot128: | 519 | .Lnot128: |
520 | #if __ARM_ARCH__<7 | ||
469 | ldrb $i2,[$rounds,#19] | 521 | ldrb $i2,[$rounds,#19] |
470 | ldrb $t1,[$rounds,#18] | 522 | ldrb $t1,[$rounds,#18] |
471 | ldrb $t2,[$rounds,#17] | 523 | ldrb $t2,[$rounds,#17] |
@@ -482,6 +534,16 @@ AES_set_encrypt_key: | |||
482 | str $i2,[$key],#8 | 534 | str $i2,[$key],#8 |
483 | orr $i3,$i3,$t3,lsl#24 | 535 | orr $i3,$i3,$t3,lsl#24 |
484 | str $i3,[$key,#-4] | 536 | str $i3,[$key,#-4] |
537 | #else | ||
538 | ldr $i2,[$rounds,#16] | ||
539 | ldr $i3,[$rounds,#20] | ||
540 | #ifdef __ARMEL__ | ||
541 | rev $i2,$i2 | ||
542 | rev $i3,$i3 | ||
543 | #endif | ||
544 | str $i2,[$key],#8 | ||
545 | str $i3,[$key,#-4] | ||
546 | #endif | ||
485 | 547 | ||
486 | teq lr,#192 | 548 | teq lr,#192 |
487 | bne .Lnot192 | 549 | bne .Lnot192 |
@@ -526,6 +588,7 @@ AES_set_encrypt_key: | |||
526 | b .L192_loop | 588 | b .L192_loop |
527 | 589 | ||
528 | .Lnot192: | 590 | .Lnot192: |
591 | #if __ARM_ARCH__<7 | ||
529 | ldrb $i2,[$rounds,#27] | 592 | ldrb $i2,[$rounds,#27] |
530 | ldrb $t1,[$rounds,#26] | 593 | ldrb $t1,[$rounds,#26] |
531 | ldrb $t2,[$rounds,#25] | 594 | ldrb $t2,[$rounds,#25] |
@@ -542,6 +605,16 @@ AES_set_encrypt_key: | |||
542 | str $i2,[$key],#8 | 605 | str $i2,[$key],#8 |
543 | orr $i3,$i3,$t3,lsl#24 | 606 | orr $i3,$i3,$t3,lsl#24 |
544 | str $i3,[$key,#-4] | 607 | str $i3,[$key,#-4] |
608 | #else | ||
609 | ldr $i2,[$rounds,#24] | ||
610 | ldr $i3,[$rounds,#28] | ||
611 | #ifdef __ARMEL__ | ||
612 | rev $i2,$i2 | ||
613 | rev $i3,$i3 | ||
614 | #endif | ||
615 | str $i2,[$key],#8 | ||
616 | str $i3,[$key,#-4] | ||
617 | #endif | ||
545 | 618 | ||
546 | mov $rounds,#14 | 619 | mov $rounds,#14 |
547 | str $rounds,[$key,#240-32] | 620 | str $rounds,[$key,#240-32] |
@@ -606,14 +679,14 @@ AES_set_encrypt_key: | |||
606 | .Labrt: tst lr,#1 | 679 | .Labrt: tst lr,#1 |
607 | moveq pc,lr @ be binary compatible with V4, yet | 680 | moveq pc,lr @ be binary compatible with V4, yet |
608 | bx lr @ interoperable with Thumb ISA:-) | 681 | bx lr @ interoperable with Thumb ISA:-) |
609 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | 682 | .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key |
610 | 683 | ||
611 | .global AES_set_decrypt_key | 684 | .global private_AES_set_decrypt_key |
612 | .type AES_set_decrypt_key,%function | 685 | .type private_AES_set_decrypt_key,%function |
613 | .align 5 | 686 | .align 5 |
614 | AES_set_decrypt_key: | 687 | private_AES_set_decrypt_key: |
615 | str lr,[sp,#-4]! @ push lr | 688 | str lr,[sp,#-4]! @ push lr |
616 | bl AES_set_encrypt_key | 689 | bl _armv4_AES_set_encrypt_key |
617 | teq r0,#0 | 690 | teq r0,#0 |
618 | ldrne lr,[sp],#4 @ pop lr | 691 | ldrne lr,[sp],#4 @ pop lr |
619 | bne .Labrt | 692 | bne .Labrt |
@@ -692,11 +765,15 @@ $code.=<<___; | |||
692 | bne .Lmix | 765 | bne .Lmix |
693 | 766 | ||
694 | mov r0,#0 | 767 | mov r0,#0 |
768 | #if __ARM_ARCH__>=5 | ||
769 | ldmia sp!,{r4-r12,pc} | ||
770 | #else | ||
695 | ldmia sp!,{r4-r12,lr} | 771 | ldmia sp!,{r4-r12,lr} |
696 | tst lr,#1 | 772 | tst lr,#1 |
697 | moveq pc,lr @ be binary compatible with V4, yet | 773 | moveq pc,lr @ be binary compatible with V4, yet |
698 | bx lr @ interoperable with Thumb ISA:-) | 774 | bx lr @ interoperable with Thumb ISA:-) |
699 | .size AES_set_decrypt_key,.-AES_set_decrypt_key | 775 | #endif |
776 | .size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key | ||
700 | 777 | ||
701 | .type AES_Td,%object | 778 | .type AES_Td,%object |
702 | .align 5 | 779 | .align 5 |
@@ -811,7 +888,7 @@ AES_decrypt: | |||
811 | mov $rounds,r0 @ inp | 888 | mov $rounds,r0 @ inp |
812 | mov $key,r2 | 889 | mov $key,r2 |
813 | sub $tbl,r3,#AES_decrypt-AES_Td @ Td | 890 | sub $tbl,r3,#AES_decrypt-AES_Td @ Td |
814 | 891 | #if __ARM_ARCH__<7 | |
815 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | 892 | ldrb $s0,[$rounds,#3] @ load input data in endian-neutral |
816 | ldrb $t1,[$rounds,#2] @ manner... | 893 | ldrb $t1,[$rounds,#2] @ manner... |
817 | ldrb $t2,[$rounds,#1] | 894 | ldrb $t2,[$rounds,#1] |
@@ -840,10 +917,33 @@ AES_decrypt: | |||
840 | orr $s3,$s3,$t1,lsl#8 | 917 | orr $s3,$s3,$t1,lsl#8 |
841 | orr $s3,$s3,$t2,lsl#16 | 918 | orr $s3,$s3,$t2,lsl#16 |
842 | orr $s3,$s3,$t3,lsl#24 | 919 | orr $s3,$s3,$t3,lsl#24 |
843 | 920 | #else | |
921 | ldr $s0,[$rounds,#0] | ||
922 | ldr $s1,[$rounds,#4] | ||
923 | ldr $s2,[$rounds,#8] | ||
924 | ldr $s3,[$rounds,#12] | ||
925 | #ifdef __ARMEL__ | ||
926 | rev $s0,$s0 | ||
927 | rev $s1,$s1 | ||
928 | rev $s2,$s2 | ||
929 | rev $s3,$s3 | ||
930 | #endif | ||
931 | #endif | ||
844 | bl _armv4_AES_decrypt | 932 | bl _armv4_AES_decrypt |
845 | 933 | ||
846 | ldr $rounds,[sp],#4 @ pop out | 934 | ldr $rounds,[sp],#4 @ pop out |
935 | #if __ARM_ARCH__>=7 | ||
936 | #ifdef __ARMEL__ | ||
937 | rev $s0,$s0 | ||
938 | rev $s1,$s1 | ||
939 | rev $s2,$s2 | ||
940 | rev $s3,$s3 | ||
941 | #endif | ||
942 | str $s0,[$rounds,#0] | ||
943 | str $s1,[$rounds,#4] | ||
944 | str $s2,[$rounds,#8] | ||
945 | str $s3,[$rounds,#12] | ||
946 | #else | ||
847 | mov $t1,$s0,lsr#24 @ write output in endian-neutral | 947 | mov $t1,$s0,lsr#24 @ write output in endian-neutral |
848 | mov $t2,$s0,lsr#16 @ manner... | 948 | mov $t2,$s0,lsr#16 @ manner... |
849 | mov $t3,$s0,lsr#8 | 949 | mov $t3,$s0,lsr#8 |
@@ -872,11 +972,15 @@ AES_decrypt: | |||
872 | strb $t2,[$rounds,#13] | 972 | strb $t2,[$rounds,#13] |
873 | strb $t3,[$rounds,#14] | 973 | strb $t3,[$rounds,#14] |
874 | strb $s3,[$rounds,#15] | 974 | strb $s3,[$rounds,#15] |
875 | 975 | #endif | |
976 | #if __ARM_ARCH__>=5 | ||
977 | ldmia sp!,{r4-r12,pc} | ||
978 | #else | ||
876 | ldmia sp!,{r4-r12,lr} | 979 | ldmia sp!,{r4-r12,lr} |
877 | tst lr,#1 | 980 | tst lr,#1 |
878 | moveq pc,lr @ be binary compatible with V4, yet | 981 | moveq pc,lr @ be binary compatible with V4, yet |
879 | bx lr @ interoperable with Thumb ISA:-) | 982 | bx lr @ interoperable with Thumb ISA:-) |
983 | #endif | ||
880 | .size AES_decrypt,.-AES_decrypt | 984 | .size AES_decrypt,.-AES_decrypt |
881 | 985 | ||
882 | .type _armv4_AES_decrypt,%function | 986 | .type _armv4_AES_decrypt,%function |
@@ -916,11 +1020,11 @@ _armv4_AES_decrypt: | |||
916 | and $i2,lr,$s2 @ i1 | 1020 | and $i2,lr,$s2 @ i1 |
917 | eor $t3,$i3,$t3,ror#8 | 1021 | eor $t3,$i3,$t3,ror#8 |
918 | and $i3,lr,$s2,lsr#16 | 1022 | and $i3,lr,$s2,lsr#16 |
919 | eor $s1,$s1,$t1,ror#8 | ||
920 | ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] | 1023 | ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] |
1024 | eor $s1,$s1,$t1,ror#8 | ||
1025 | ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] | ||
921 | mov $s2,$s2,lsr#24 | 1026 | mov $s2,$s2,lsr#24 |
922 | 1027 | ||
923 | ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] | ||
924 | ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] | 1028 | ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] |
925 | eor $s0,$s0,$i1,ror#16 | 1029 | eor $s0,$s0,$i1,ror#16 |
926 | ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] | 1030 | ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] |
@@ -929,22 +1033,22 @@ _armv4_AES_decrypt: | |||
929 | and $i2,lr,$s3,lsr#8 @ i1 | 1033 | and $i2,lr,$s3,lsr#8 @ i1 |
930 | eor $t3,$i3,$t3,ror#8 | 1034 | eor $t3,$i3,$t3,ror#8 |
931 | and $i3,lr,$s3 @ i2 | 1035 | and $i3,lr,$s3 @ i2 |
932 | eor $s2,$s2,$t2,ror#8 | ||
933 | ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] | 1036 | ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] |
1037 | eor $s2,$s2,$t2,ror#8 | ||
1038 | ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] | ||
934 | mov $s3,$s3,lsr#24 | 1039 | mov $s3,$s3,lsr#24 |
935 | 1040 | ||
936 | ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] | ||
937 | ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] | 1041 | ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] |
938 | eor $s0,$s0,$i1,ror#8 | 1042 | eor $s0,$s0,$i1,ror#8 |
939 | ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] | 1043 | ldr $i1,[$key],#16 |
940 | eor $s1,$s1,$i2,ror#16 | 1044 | eor $s1,$s1,$i2,ror#16 |
1045 | ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] | ||
941 | eor $s2,$s2,$i3,ror#24 | 1046 | eor $s2,$s2,$i3,ror#24 |
942 | ldr $i1,[$key],#16 | ||
943 | eor $s3,$s3,$t3,ror#8 | ||
944 | 1047 | ||
945 | ldr $t1,[$key,#-12] | 1048 | ldr $t1,[$key,#-12] |
946 | ldr $t2,[$key,#-8] | ||
947 | eor $s0,$s0,$i1 | 1049 | eor $s0,$s0,$i1 |
1050 | ldr $t2,[$key,#-8] | ||
1051 | eor $s3,$s3,$t3,ror#8 | ||
948 | ldr $t3,[$key,#-4] | 1052 | ldr $t3,[$key,#-4] |
949 | and $i1,lr,$s0,lsr#16 | 1053 | and $i1,lr,$s0,lsr#16 |
950 | eor $s1,$s1,$t1 | 1054 | eor $s1,$s1,$t1 |
@@ -985,11 +1089,11 @@ _armv4_AES_decrypt: | |||
985 | and $i1,lr,$s2,lsr#8 @ i0 | 1089 | and $i1,lr,$s2,lsr#8 @ i0 |
986 | eor $t2,$t2,$i2,lsl#8 | 1090 | eor $t2,$t2,$i2,lsl#8 |
987 | and $i2,lr,$s2 @ i1 | 1091 | and $i2,lr,$s2 @ i1 |
988 | eor $t3,$t3,$i3,lsl#8 | ||
989 | ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] | 1092 | ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] |
1093 | eor $t3,$t3,$i3,lsl#8 | ||
1094 | ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] | ||
990 | and $i3,lr,$s2,lsr#16 | 1095 | and $i3,lr,$s2,lsr#16 |
991 | 1096 | ||
992 | ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] | ||
993 | ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] | 1097 | ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] |
994 | eor $s0,$s0,$i1,lsl#8 | 1098 | eor $s0,$s0,$i1,lsl#8 |
995 | ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] | 1099 | ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] |
@@ -997,11 +1101,11 @@ _armv4_AES_decrypt: | |||
997 | and $i1,lr,$s3,lsr#16 @ i0 | 1101 | and $i1,lr,$s3,lsr#16 @ i0 |
998 | eor $s2,$t2,$s2,lsl#16 | 1102 | eor $s2,$t2,$s2,lsl#16 |
999 | and $i2,lr,$s3,lsr#8 @ i1 | 1103 | and $i2,lr,$s3,lsr#8 @ i1 |
1000 | eor $t3,$t3,$i3,lsl#16 | ||
1001 | ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] | 1104 | ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] |
1105 | eor $t3,$t3,$i3,lsl#16 | ||
1106 | ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] | ||
1002 | and $i3,lr,$s3 @ i2 | 1107 | and $i3,lr,$s3 @ i2 |
1003 | 1108 | ||
1004 | ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] | ||
1005 | ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] | 1109 | ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] |
1006 | ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] | 1110 | ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] |
1007 | eor $s0,$s0,$i1,lsl#16 | 1111 | eor $s0,$s0,$i1,lsl#16 |
diff --git a/src/lib/libcrypto/aes/asm/aes-mips.pl b/src/lib/libcrypto/aes/asm/aes-mips.pl new file mode 100644 index 0000000000..2ce6deffc8 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aes-mips.pl | |||
@@ -0,0 +1,1611 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # AES for MIPS | ||
11 | |||
12 | # October 2010 | ||
13 | # | ||
14 | # Code uses 1K[+256B] S-box and on single-issue core [such as R5000] | ||
15 | # spends ~68 cycles per byte processed with 128-bit key. This is ~16% | ||
16 | # faster than gcc-generated code, which is not very impressive. But | ||
17 | # recall that compressed S-box requires extra processing, namely | ||
18 | # additional rotations. Rotations are implemented with lwl/lwr pairs, | ||
19 | # which is normally used for loading unaligned data. Another cool | ||
20 | # thing about this module is its endian neutrality, which means that | ||
21 | # it processes data without ever changing byte order... | ||
22 | |||
23 | ###################################################################### | ||
24 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
25 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
26 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
27 | # manner. Therefore let's stick to NUBI register layout: | ||
28 | # | ||
29 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
30 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
31 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
32 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
33 | # | ||
34 | # The return value is placed in $a0. Following coding rules facilitate | ||
35 | # interoperability: | ||
36 | # | ||
37 | # - never ever touch $tp, "thread pointer", former $gp; | ||
38 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
39 | # old code]; | ||
40 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
41 | # | ||
42 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
43 | # | ||
44 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
45 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
46 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
47 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
48 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
49 | # | ||
50 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
51 | |||
52 | if ($flavour =~ /64|n32/i) { | ||
53 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
54 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
55 | $REG_S="sd"; | ||
56 | $REG_L="ld"; | ||
57 | $PTR_SLL="dsll"; # incidentally works even on n32 | ||
58 | $SZREG=8; | ||
59 | } else { | ||
60 | $PTR_ADD="add"; | ||
61 | $PTR_SUB="sub"; | ||
62 | $REG_S="sw"; | ||
63 | $REG_L="lw"; | ||
64 | $PTR_SLL="sll"; | ||
65 | $SZREG=4; | ||
66 | } | ||
67 | $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; | ||
68 | # | ||
69 | # <appro@openssl.org> | ||
70 | # | ||
71 | ###################################################################### | ||
72 | |||
73 | $big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; | ||
74 | |||
75 | for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } | ||
76 | open STDOUT,">$output"; | ||
77 | |||
78 | if (!defined($big_endian)) | ||
79 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
80 | |||
81 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
82 | open STDOUT,">$output"; | ||
83 | |||
84 | my ($MSB,$LSB)=(0,3); # automatically converted to little-endian | ||
85 | |||
86 | $code.=<<___; | ||
87 | .text | ||
88 | #ifdef OPENSSL_FIPSCANISTER | ||
89 | # include <openssl/fipssyms.h> | ||
90 | #endif | ||
91 | |||
92 | #if !defined(__vxworks) || defined(__pic__) | ||
93 | .option pic2 | ||
94 | #endif | ||
95 | .set noat | ||
96 | ___ | ||
97 | |||
98 | {{{ | ||
99 | my $FRAMESIZE=16*$SZREG; | ||
100 | my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; | ||
101 | |||
102 | my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7); | ||
103 | my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); | ||
104 | my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23)); | ||
105 | my ($key0,$cnt)=($gp,$fp); | ||
106 | |||
107 | # instuction ordering is "stolen" from output from MIPSpro assembler | ||
108 | # invoked with -mips3 -O3 arguments... | ||
109 | $code.=<<___; | ||
110 | .align 5 | ||
111 | .ent _mips_AES_encrypt | ||
112 | _mips_AES_encrypt: | ||
113 | .frame $sp,0,$ra | ||
114 | .set reorder | ||
115 | lw $t0,0($key) | ||
116 | lw $t1,4($key) | ||
117 | lw $t2,8($key) | ||
118 | lw $t3,12($key) | ||
119 | lw $cnt,240($key) | ||
120 | $PTR_ADD $key0,$key,16 | ||
121 | |||
122 | xor $s0,$t0 | ||
123 | xor $s1,$t1 | ||
124 | xor $s2,$t2 | ||
125 | xor $s3,$t3 | ||
126 | |||
127 | sub $cnt,1 | ||
128 | _xtr $i0,$s1,16-2 | ||
129 | .Loop_enc: | ||
130 | _xtr $i1,$s2,16-2 | ||
131 | _xtr $i2,$s3,16-2 | ||
132 | _xtr $i3,$s0,16-2 | ||
133 | and $i0,0x3fc | ||
134 | and $i1,0x3fc | ||
135 | and $i2,0x3fc | ||
136 | and $i3,0x3fc | ||
137 | $PTR_ADD $i0,$Tbl | ||
138 | $PTR_ADD $i1,$Tbl | ||
139 | $PTR_ADD $i2,$Tbl | ||
140 | $PTR_ADD $i3,$Tbl | ||
141 | lwl $t0,3($i0) # Te1[s1>>16] | ||
142 | lwl $t1,3($i1) # Te1[s2>>16] | ||
143 | lwl $t2,3($i2) # Te1[s3>>16] | ||
144 | lwl $t3,3($i3) # Te1[s0>>16] | ||
145 | lwr $t0,2($i0) # Te1[s1>>16] | ||
146 | lwr $t1,2($i1) # Te1[s2>>16] | ||
147 | lwr $t2,2($i2) # Te1[s3>>16] | ||
148 | lwr $t3,2($i3) # Te1[s0>>16] | ||
149 | |||
150 | _xtr $i0,$s2,8-2 | ||
151 | _xtr $i1,$s3,8-2 | ||
152 | _xtr $i2,$s0,8-2 | ||
153 | _xtr $i3,$s1,8-2 | ||
154 | and $i0,0x3fc | ||
155 | and $i1,0x3fc | ||
156 | and $i2,0x3fc | ||
157 | and $i3,0x3fc | ||
158 | $PTR_ADD $i0,$Tbl | ||
159 | $PTR_ADD $i1,$Tbl | ||
160 | $PTR_ADD $i2,$Tbl | ||
161 | $PTR_ADD $i3,$Tbl | ||
162 | lwl $t4,2($i0) # Te2[s2>>8] | ||
163 | lwl $t5,2($i1) # Te2[s3>>8] | ||
164 | lwl $t6,2($i2) # Te2[s0>>8] | ||
165 | lwl $t7,2($i3) # Te2[s1>>8] | ||
166 | lwr $t4,1($i0) # Te2[s2>>8] | ||
167 | lwr $t5,1($i1) # Te2[s3>>8] | ||
168 | lwr $t6,1($i2) # Te2[s0>>8] | ||
169 | lwr $t7,1($i3) # Te2[s1>>8] | ||
170 | |||
171 | _xtr $i0,$s3,0-2 | ||
172 | _xtr $i1,$s0,0-2 | ||
173 | _xtr $i2,$s1,0-2 | ||
174 | _xtr $i3,$s2,0-2 | ||
175 | and $i0,0x3fc | ||
176 | and $i1,0x3fc | ||
177 | and $i2,0x3fc | ||
178 | and $i3,0x3fc | ||
179 | $PTR_ADD $i0,$Tbl | ||
180 | $PTR_ADD $i1,$Tbl | ||
181 | $PTR_ADD $i2,$Tbl | ||
182 | $PTR_ADD $i3,$Tbl | ||
183 | lwl $t8,1($i0) # Te3[s3] | ||
184 | lwl $t9,1($i1) # Te3[s0] | ||
185 | lwl $t10,1($i2) # Te3[s1] | ||
186 | lwl $t11,1($i3) # Te3[s2] | ||
187 | lwr $t8,0($i0) # Te3[s3] | ||
188 | lwr $t9,0($i1) # Te3[s0] | ||
189 | lwr $t10,0($i2) # Te3[s1] | ||
190 | lwr $t11,0($i3) # Te3[s2] | ||
191 | |||
192 | _xtr $i0,$s0,24-2 | ||
193 | _xtr $i1,$s1,24-2 | ||
194 | _xtr $i2,$s2,24-2 | ||
195 | _xtr $i3,$s3,24-2 | ||
196 | and $i0,0x3fc | ||
197 | and $i1,0x3fc | ||
198 | and $i2,0x3fc | ||
199 | and $i3,0x3fc | ||
200 | $PTR_ADD $i0,$Tbl | ||
201 | $PTR_ADD $i1,$Tbl | ||
202 | $PTR_ADD $i2,$Tbl | ||
203 | $PTR_ADD $i3,$Tbl | ||
204 | xor $t0,$t4 | ||
205 | xor $t1,$t5 | ||
206 | xor $t2,$t6 | ||
207 | xor $t3,$t7 | ||
208 | lw $t4,0($i0) # Te0[s0>>24] | ||
209 | lw $t5,0($i1) # Te0[s1>>24] | ||
210 | lw $t6,0($i2) # Te0[s2>>24] | ||
211 | lw $t7,0($i3) # Te0[s3>>24] | ||
212 | |||
213 | lw $s0,0($key0) | ||
214 | lw $s1,4($key0) | ||
215 | lw $s2,8($key0) | ||
216 | lw $s3,12($key0) | ||
217 | |||
218 | xor $t0,$t8 | ||
219 | xor $t1,$t9 | ||
220 | xor $t2,$t10 | ||
221 | xor $t3,$t11 | ||
222 | |||
223 | xor $t0,$t4 | ||
224 | xor $t1,$t5 | ||
225 | xor $t2,$t6 | ||
226 | xor $t3,$t7 | ||
227 | |||
228 | sub $cnt,1 | ||
229 | $PTR_ADD $key0,16 | ||
230 | xor $s0,$t0 | ||
231 | xor $s1,$t1 | ||
232 | xor $s2,$t2 | ||
233 | xor $s3,$t3 | ||
234 | .set noreorder | ||
235 | bnez $cnt,.Loop_enc | ||
236 | _xtr $i0,$s1,16-2 | ||
237 | |||
238 | .set reorder | ||
239 | _xtr $i1,$s2,16-2 | ||
240 | _xtr $i2,$s3,16-2 | ||
241 | _xtr $i3,$s0,16-2 | ||
242 | and $i0,0x3fc | ||
243 | and $i1,0x3fc | ||
244 | and $i2,0x3fc | ||
245 | and $i3,0x3fc | ||
246 | $PTR_ADD $i0,$Tbl | ||
247 | $PTR_ADD $i1,$Tbl | ||
248 | $PTR_ADD $i2,$Tbl | ||
249 | $PTR_ADD $i3,$Tbl | ||
250 | lbu $t0,2($i0) # Te4[s1>>16] | ||
251 | lbu $t1,2($i1) # Te4[s2>>16] | ||
252 | lbu $t2,2($i2) # Te4[s3>>16] | ||
253 | lbu $t3,2($i3) # Te4[s0>>16] | ||
254 | |||
255 | _xtr $i0,$s2,8-2 | ||
256 | _xtr $i1,$s3,8-2 | ||
257 | _xtr $i2,$s0,8-2 | ||
258 | _xtr $i3,$s1,8-2 | ||
259 | and $i0,0x3fc | ||
260 | and $i1,0x3fc | ||
261 | and $i2,0x3fc | ||
262 | and $i3,0x3fc | ||
263 | $PTR_ADD $i0,$Tbl | ||
264 | $PTR_ADD $i1,$Tbl | ||
265 | $PTR_ADD $i2,$Tbl | ||
266 | $PTR_ADD $i3,$Tbl | ||
267 | lbu $t4,2($i0) # Te4[s2>>8] | ||
268 | lbu $t5,2($i1) # Te4[s3>>8] | ||
269 | lbu $t6,2($i2) # Te4[s0>>8] | ||
270 | lbu $t7,2($i3) # Te4[s1>>8] | ||
271 | |||
272 | _xtr $i0,$s0,24-2 | ||
273 | _xtr $i1,$s1,24-2 | ||
274 | _xtr $i2,$s2,24-2 | ||
275 | _xtr $i3,$s3,24-2 | ||
276 | and $i0,0x3fc | ||
277 | and $i1,0x3fc | ||
278 | and $i2,0x3fc | ||
279 | and $i3,0x3fc | ||
280 | $PTR_ADD $i0,$Tbl | ||
281 | $PTR_ADD $i1,$Tbl | ||
282 | $PTR_ADD $i2,$Tbl | ||
283 | $PTR_ADD $i3,$Tbl | ||
284 | lbu $t8,2($i0) # Te4[s0>>24] | ||
285 | lbu $t9,2($i1) # Te4[s1>>24] | ||
286 | lbu $t10,2($i2) # Te4[s2>>24] | ||
287 | lbu $t11,2($i3) # Te4[s3>>24] | ||
288 | |||
289 | _xtr $i0,$s3,0-2 | ||
290 | _xtr $i1,$s0,0-2 | ||
291 | _xtr $i2,$s1,0-2 | ||
292 | _xtr $i3,$s2,0-2 | ||
293 | and $i0,0x3fc | ||
294 | and $i1,0x3fc | ||
295 | and $i2,0x3fc | ||
296 | and $i3,0x3fc | ||
297 | |||
298 | _ins $t0,16 | ||
299 | _ins $t1,16 | ||
300 | _ins $t2,16 | ||
301 | _ins $t3,16 | ||
302 | |||
303 | _ins $t4,8 | ||
304 | _ins $t5,8 | ||
305 | _ins $t6,8 | ||
306 | _ins $t7,8 | ||
307 | |||
308 | xor $t0,$t4 | ||
309 | xor $t1,$t5 | ||
310 | xor $t2,$t6 | ||
311 | xor $t3,$t7 | ||
312 | |||
313 | $PTR_ADD $i0,$Tbl | ||
314 | $PTR_ADD $i1,$Tbl | ||
315 | $PTR_ADD $i2,$Tbl | ||
316 | $PTR_ADD $i3,$Tbl | ||
317 | lbu $t4,2($i0) # Te4[s3] | ||
318 | lbu $t5,2($i1) # Te4[s0] | ||
319 | lbu $t6,2($i2) # Te4[s1] | ||
320 | lbu $t7,2($i3) # Te4[s2] | ||
321 | |||
322 | _ins $t8,24 | ||
323 | _ins $t9,24 | ||
324 | _ins $t10,24 | ||
325 | _ins $t11,24 | ||
326 | |||
327 | lw $s0,0($key0) | ||
328 | lw $s1,4($key0) | ||
329 | lw $s2,8($key0) | ||
330 | lw $s3,12($key0) | ||
331 | |||
332 | xor $t0,$t8 | ||
333 | xor $t1,$t9 | ||
334 | xor $t2,$t10 | ||
335 | xor $t3,$t11 | ||
336 | |||
337 | _ins $t4,0 | ||
338 | _ins $t5,0 | ||
339 | _ins $t6,0 | ||
340 | _ins $t7,0 | ||
341 | |||
342 | xor $t0,$t4 | ||
343 | xor $t1,$t5 | ||
344 | xor $t2,$t6 | ||
345 | xor $t3,$t7 | ||
346 | |||
347 | xor $s0,$t0 | ||
348 | xor $s1,$t1 | ||
349 | xor $s2,$t2 | ||
350 | xor $s3,$t3 | ||
351 | |||
352 | jr $ra | ||
353 | .end _mips_AES_encrypt | ||
354 | |||
355 | .align 5 | ||
356 | .globl AES_encrypt | ||
357 | .ent AES_encrypt | ||
358 | AES_encrypt: | ||
359 | .frame $sp,$FRAMESIZE,$ra | ||
360 | .mask $SAVED_REGS_MASK,-$SZREG | ||
361 | .set noreorder | ||
362 | ___ | ||
# Finish emitting the public AES_encrypt(in, out, key) wrapper begun above:
# ABI-dependent PIC setup, prologue spills, unaligned 16-byte block load,
# call into the leaf routine, unaligned store of the result, and epilogue.
# NOTE(review): register/frame symbols ($FRAMESIZE, $SZREG, $REG_S, $MSB,
# $LSB, ...) are declared earlier in the file, outside this chunk.
$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
	.cpload	$pf
___
# Spill $ra, $fp and the callee-saved registers the leaf routine uses.
$code.=<<___;
	$PTR_SUB $sp,$FRAMESIZE
	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
	$REG_S	$s11,$FRAMESIZE-3*$SZREG($sp)
	$REG_S	$s10,$FRAMESIZE-4*$SZREG($sp)
	$REG_S	$s9,$FRAMESIZE-5*$SZREG($sp)
	$REG_S	$s8,$FRAMESIZE-6*$SZREG($sp)
	$REG_S	$s7,$FRAMESIZE-7*$SZREG($sp)
	$REG_S	$s6,$FRAMESIZE-8*$SZREG($sp)
	$REG_S	$s5,$FRAMESIZE-9*$SZREG($sp)
	$REG_S	$s4,$FRAMESIZE-10*$SZREG($sp)
___
# Nubi calling convention additionally treats $12..$15 and $gp as
# callee-saved, so spill them too (slots -11..-15 of the frame).
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	$REG_S	\$15,$FRAMESIZE-11*$SZREG($sp)
	$REG_S	\$14,$FRAMESIZE-12*$SZREG($sp)
	$REG_S	\$13,$FRAMESIZE-13*$SZREG($sp)
	$REG_S	\$12,$FRAMESIZE-14*$SZREG($sp)
	$REG_S	$gp,$FRAMESIZE-15*$SZREG($sp)
___
$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
	.cplocal	$Tbl
	.cpsetup	$pf,$zero,AES_encrypt
___
# Body: lwl/lwr pairs load the (possibly unaligned) 16-byte input block,
# the leaf routine transforms $s0..$s3 in place, swr/swl pairs store it.
$code.=<<___;
	.set	reorder
	la	$Tbl,AES_Te		# PIC-ified 'load address'

	lwl	$s0,0+$MSB($inp)
	lwl	$s1,4+$MSB($inp)
	lwl	$s2,8+$MSB($inp)
	lwl	$s3,12+$MSB($inp)
	lwr	$s0,0+$LSB($inp)
	lwr	$s1,4+$LSB($inp)
	lwr	$s2,8+$LSB($inp)
	lwr	$s3,12+$LSB($inp)

	bal	_mips_AES_encrypt

	swr	$s0,0+$LSB($out)
	swr	$s1,4+$LSB($out)
	swr	$s2,8+$LSB($out)
	swr	$s3,12+$LSB($out)
	swl	$s0,0+$MSB($out)
	swl	$s1,4+$MSB($out)
	swl	$s2,8+$MSB($out)
	swl	$s3,12+$MSB($out)

	.set	noreorder
	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
	$REG_L	$s11,$FRAMESIZE-3*$SZREG($sp)
	$REG_L	$s10,$FRAMESIZE-4*$SZREG($sp)
	$REG_L	$s9,$FRAMESIZE-5*$SZREG($sp)
	$REG_L	$s8,$FRAMESIZE-6*$SZREG($sp)
	$REG_L	$s7,$FRAMESIZE-7*$SZREG($sp)
	$REG_L	$s6,$FRAMESIZE-8*$SZREG($sp)
	$REG_L	$s5,$FRAMESIZE-9*$SZREG($sp)
	$REG_L	$s4,$FRAMESIZE-10*$SZREG($sp)
___
# Restore offsets mirror the nubi prologue saves above (-11..-15).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	\$15,$FRAMESIZE-11*$SZREG($sp)
	$REG_L	\$14,$FRAMESIZE-12*$SZREG($sp)
	$REG_L	\$13,$FRAMESIZE-13*$SZREG($sp)
	$REG_L	\$12,$FRAMESIZE-14*$SZREG($sp)
	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
___
# Return; the stack-pointer restore sits in the jr delay slot.
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE
.end	AES_encrypt
___
438 | |||
# _mips_AES_decrypt: leaf (frame-less) inner routine.  On entry $s0..$s3
# hold the ciphertext block and $key the expanded decryption schedule;
# on exit $s0..$s3 hold the plaintext.  Each main-loop iteration gathers
# Td0..Td3 table words with unaligned lwl/lwr pairs at byte offsets
# 3/2, 2/1, 1/0 into the endian-neutral table (see the .rdata section),
# avoiding explicit rotates; the final round substitutes through the
# 256-byte Td4 at $Tbl+1024.  $cnt is loaded from 240($key), the rounds
# field of the AES_KEY structure.  _xtr/_ins are Perl macros defined
# earlier in the file (extract/insert byte fields; "-2" pre-scales the
# index by the 4-byte table stride).
$code.=<<___;
.align	5
.ent	_mips_AES_decrypt
_mips_AES_decrypt:
	.frame	$sp,0,$ra
	.set	reorder
	lw	$t0,0($key)
	lw	$t1,4($key)
	lw	$t2,8($key)
	lw	$t3,12($key)
	lw	$cnt,240($key)
	$PTR_ADD $key0,$key,16

	xor	$s0,$t0
	xor	$s1,$t1
	xor	$s2,$t2
	xor	$s3,$t3

	sub	$cnt,1
	_xtr	$i0,$s3,16-2
.Loop_dec:
	_xtr	$i1,$s0,16-2
	_xtr	$i2,$s1,16-2
	_xtr	$i3,$s2,16-2
	and	$i0,0x3fc
	and	$i1,0x3fc
	and	$i2,0x3fc
	and	$i3,0x3fc
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lwl	$t0,3($i0)		# Td1[s3>>16]
	lwl	$t1,3($i1)		# Td1[s0>>16]
	lwl	$t2,3($i2)		# Td1[s1>>16]
	lwl	$t3,3($i3)		# Td1[s2>>16]
	lwr	$t0,2($i0)		# Td1[s3>>16]
	lwr	$t1,2($i1)		# Td1[s0>>16]
	lwr	$t2,2($i2)		# Td1[s1>>16]
	lwr	$t3,2($i3)		# Td1[s2>>16]

	_xtr	$i0,$s2,8-2
	_xtr	$i1,$s3,8-2
	_xtr	$i2,$s0,8-2
	_xtr	$i3,$s1,8-2
	and	$i0,0x3fc
	and	$i1,0x3fc
	and	$i2,0x3fc
	and	$i3,0x3fc
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lwl	$t4,2($i0)		# Td2[s2>>8]
	lwl	$t5,2($i1)		# Td2[s3>>8]
	lwl	$t6,2($i2)		# Td2[s0>>8]
	lwl	$t7,2($i3)		# Td2[s1>>8]
	lwr	$t4,1($i0)		# Td2[s2>>8]
	lwr	$t5,1($i1)		# Td2[s3>>8]
	lwr	$t6,1($i2)		# Td2[s0>>8]
	lwr	$t7,1($i3)		# Td2[s1>>8]

	_xtr	$i0,$s1,0-2
	_xtr	$i1,$s2,0-2
	_xtr	$i2,$s3,0-2
	_xtr	$i3,$s0,0-2
	and	$i0,0x3fc
	and	$i1,0x3fc
	and	$i2,0x3fc
	and	$i3,0x3fc
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lwl	$t8,1($i0)		# Td3[s1]
	lwl	$t9,1($i1)		# Td3[s2]
	lwl	$t10,1($i2)		# Td3[s3]
	lwl	$t11,1($i3)		# Td3[s0]
	lwr	$t8,0($i0)		# Td3[s1]
	lwr	$t9,0($i1)		# Td3[s2]
	lwr	$t10,0($i2)		# Td3[s3]
	lwr	$t11,0($i3)		# Td3[s0]

	_xtr	$i0,$s0,24-2
	_xtr	$i1,$s1,24-2
	_xtr	$i2,$s2,24-2
	_xtr	$i3,$s3,24-2
	and	$i0,0x3fc
	and	$i1,0x3fc
	and	$i2,0x3fc
	and	$i3,0x3fc
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl

	xor	$t0,$t4
	xor	$t1,$t5
	xor	$t2,$t6
	xor	$t3,$t7


	lw	$t4,0($i0)		# Td0[s0>>24]
	lw	$t5,0($i1)		# Td0[s1>>24]
	lw	$t6,0($i2)		# Td0[s2>>24]
	lw	$t7,0($i3)		# Td0[s3>>24]

	lw	$s0,0($key0)
	lw	$s1,4($key0)
	lw	$s2,8($key0)
	lw	$s3,12($key0)

	xor	$t0,$t8
	xor	$t1,$t9
	xor	$t2,$t10
	xor	$t3,$t11

	xor	$t0,$t4
	xor	$t1,$t5
	xor	$t2,$t6
	xor	$t3,$t7

	sub	$cnt,1
	$PTR_ADD $key0,16
	xor	$s0,$t0
	xor	$s1,$t1
	xor	$s2,$t2
	xor	$s3,$t3
	.set	noreorder
	bnez	$cnt,.Loop_dec
	_xtr	$i0,$s3,16-2

	.set	reorder
	lw	$t4,1024($Tbl)		# prefetch Td4
	lw	$t5,1024+32($Tbl)
	lw	$t6,1024+64($Tbl)
	lw	$t7,1024+96($Tbl)
	lw	$t8,1024+128($Tbl)
	lw	$t9,1024+160($Tbl)
	lw	$t10,1024+192($Tbl)
	lw	$t11,1024+224($Tbl)

	_xtr	$i0,$s3,16
	_xtr	$i1,$s0,16
	_xtr	$i2,$s1,16
	_xtr	$i3,$s2,16
	and	$i0,0xff
	and	$i1,0xff
	and	$i2,0xff
	and	$i3,0xff
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lbu	$t0,1024($i0)		# Td4[s3>>16]
	lbu	$t1,1024($i1)		# Td4[s0>>16]
	lbu	$t2,1024($i2)		# Td4[s1>>16]
	lbu	$t3,1024($i3)		# Td4[s2>>16]

	_xtr	$i0,$s2,8
	_xtr	$i1,$s3,8
	_xtr	$i2,$s0,8
	_xtr	$i3,$s1,8
	and	$i0,0xff
	and	$i1,0xff
	and	$i2,0xff
	and	$i3,0xff
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lbu	$t4,1024($i0)		# Td4[s2>>8]
	lbu	$t5,1024($i1)		# Td4[s3>>8]
	lbu	$t6,1024($i2)		# Td4[s0>>8]
	lbu	$t7,1024($i3)		# Td4[s1>>8]

	_xtr	$i0,$s0,24
	_xtr	$i1,$s1,24
	_xtr	$i2,$s2,24
	_xtr	$i3,$s3,24
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lbu	$t8,1024($i0)		# Td4[s0>>24]
	lbu	$t9,1024($i1)		# Td4[s1>>24]
	lbu	$t10,1024($i2)		# Td4[s2>>24]
	lbu	$t11,1024($i3)		# Td4[s3>>24]

	_xtr	$i0,$s1,0
	_xtr	$i1,$s2,0
	_xtr	$i2,$s3,0
	_xtr	$i3,$s0,0

	_ins	$t0,16
	_ins	$t1,16
	_ins	$t2,16
	_ins	$t3,16

	_ins	$t4,8
	_ins	$t5,8
	_ins	$t6,8
	_ins	$t7,8

	xor	$t0,$t4
	xor	$t1,$t5
	xor	$t2,$t6
	xor	$t3,$t7

	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lbu	$t4,1024($i0)		# Td4[s1]
	lbu	$t5,1024($i1)		# Td4[s2]
	lbu	$t6,1024($i2)		# Td4[s3]
	lbu	$t7,1024($i3)		# Td4[s0]

	_ins	$t8,24
	_ins	$t9,24
	_ins	$t10,24
	_ins	$t11,24

	lw	$s0,0($key0)
	lw	$s1,4($key0)
	lw	$s2,8($key0)
	lw	$s3,12($key0)

	_ins	$t4,0
	_ins	$t5,0
	_ins	$t6,0
	_ins	$t7,0


	xor	$t0,$t8
	xor	$t1,$t9
	xor	$t2,$t10
	xor	$t3,$t11

	xor	$t0,$t4
	xor	$t1,$t5
	xor	$t2,$t6
	xor	$t3,$t7

	xor	$s0,$t0
	xor	$s1,$t1
	xor	$s2,$t2
	xor	$s3,$t3

	jr	$ra
.end	_mips_AES_decrypt

.align	5
.globl	AES_decrypt
.ent	AES_decrypt
AES_decrypt:
	.frame	$sp,$FRAMESIZE,$ra
	.mask	$SAVED_REGS_MASK,-$SZREG
	.set	noreorder
___
# Public AES_decrypt(in, out, key) wrapper: identical structure to
# AES_encrypt above, but loads AES_Td and calls _mips_AES_decrypt.
$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
	.cpload	$pf
___
$code.=<<___;
	$PTR_SUB $sp,$FRAMESIZE
	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
	$REG_S	$s11,$FRAMESIZE-3*$SZREG($sp)
	$REG_S	$s10,$FRAMESIZE-4*$SZREG($sp)
	$REG_S	$s9,$FRAMESIZE-5*$SZREG($sp)
	$REG_S	$s8,$FRAMESIZE-6*$SZREG($sp)
	$REG_S	$s7,$FRAMESIZE-7*$SZREG($sp)
	$REG_S	$s6,$FRAMESIZE-8*$SZREG($sp)
	$REG_S	$s5,$FRAMESIZE-9*$SZREG($sp)
	$REG_S	$s4,$FRAMESIZE-10*$SZREG($sp)
___
# Extra callee-saved registers under the nubi convention.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	$REG_S	\$15,$FRAMESIZE-11*$SZREG($sp)
	$REG_S	\$14,$FRAMESIZE-12*$SZREG($sp)
	$REG_S	\$13,$FRAMESIZE-13*$SZREG($sp)
	$REG_S	\$12,$FRAMESIZE-14*$SZREG($sp)
	$REG_S	$gp,$FRAMESIZE-15*$SZREG($sp)
___
$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
	.cplocal	$Tbl
	.cpsetup	$pf,$zero,AES_decrypt
___
$code.=<<___;
	.set	reorder
	la	$Tbl,AES_Td		# PIC-ified 'load address'

	lwl	$s0,0+$MSB($inp)
	lwl	$s1,4+$MSB($inp)
	lwl	$s2,8+$MSB($inp)
	lwl	$s3,12+$MSB($inp)
	lwr	$s0,0+$LSB($inp)
	lwr	$s1,4+$LSB($inp)
	lwr	$s2,8+$LSB($inp)
	lwr	$s3,12+$LSB($inp)

	bal	_mips_AES_decrypt

	swr	$s0,0+$LSB($out)
	swr	$s1,4+$LSB($out)
	swr	$s2,8+$LSB($out)
	swr	$s3,12+$LSB($out)
	swl	$s0,0+$MSB($out)
	swl	$s1,4+$MSB($out)
	swl	$s2,8+$MSB($out)
	swl	$s3,12+$MSB($out)

	.set	noreorder
	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
	$REG_L	$s11,$FRAMESIZE-3*$SZREG($sp)
	$REG_L	$s10,$FRAMESIZE-4*$SZREG($sp)
	$REG_L	$s9,$FRAMESIZE-5*$SZREG($sp)
	$REG_L	$s8,$FRAMESIZE-6*$SZREG($sp)
	$REG_L	$s7,$FRAMESIZE-7*$SZREG($sp)
	$REG_L	$s6,$FRAMESIZE-8*$SZREG($sp)
	$REG_L	$s5,$FRAMESIZE-9*$SZREG($sp)
	$REG_L	$s4,$FRAMESIZE-10*$SZREG($sp)
___
# Restore offsets mirror the nubi prologue saves above (-11..-15).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	\$15,$FRAMESIZE-11*$SZREG($sp)
	$REG_L	\$14,$FRAMESIZE-12*$SZREG($sp)
	$REG_L	\$13,$FRAMESIZE-13*$SZREG($sp)
	$REG_L	\$12,$FRAMESIZE-14*$SZREG($sp)
	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
___
# Return; the stack-pointer restore sits in the jr delay slot.
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE
.end	AES_decrypt
___
774 | }}} | ||
775 | |||
# ---------------------------------------------------------------------
# Key-schedule section: _mips_AES_set_encrypt_key (internal leaf),
# AES_set_encrypt_key and AES_set_decrypt_key (public wrappers).
# This section uses a smaller 8-slot frame than the en/decrypt wrappers.
{{{
my $FRAMESIZE=8*$SZREG;
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;

# Register roles: $inp/$bits/$key are the C arguments; $rk0..$rk7 hold
# up to eight round-key words; $i0..$i3 are scratch byte indices; $rcon
# walks the round-constant words located at AES_Te+1024+256 (past Te4);
# $cnt counts remaining expansion iterations.
my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
my ($rcon,$cnt)=($gp,$fp);

# Leaf expansion routine.  Status is returned in $t0: 0 on success,
# -1 when inp or key is NULL, -2 when bits is not 128/192/256.  Key
# material is loaded with lwl/lwr pairs so the caller's buffer need not
# be word-aligned.  Each size-specific loop stores round-key words
# through $key (advancing it), substitutes the last word's bytes via
# Te4 at $Tbl+1024, mixes in the next round constant, and finally
# writes the round count (10/12/14) at offset 240 from the original
# key pointer (the offsets 80/48/48 below are relative to the advanced
# pointer).  _bias is a Perl macro defined earlier in the file.
$code.=<<___;
.align	5
.ent	_mips_AES_set_encrypt_key
_mips_AES_set_encrypt_key:
	.frame	$sp,0,$ra
	.set	noreorder
	beqz	$inp,.Lekey_done
	li	$t0,-1
	beqz	$key,.Lekey_done
	$PTR_ADD $rcon,$Tbl,1024+256

	.set	reorder
	lwl	$rk0,0+$MSB($inp)	# load 128 bits
	lwl	$rk1,4+$MSB($inp)
	lwl	$rk2,8+$MSB($inp)
	lwl	$rk3,12+$MSB($inp)
	li	$at,128
	lwr	$rk0,0+$LSB($inp)
	lwr	$rk1,4+$LSB($inp)
	lwr	$rk2,8+$LSB($inp)
	lwr	$rk3,12+$LSB($inp)
	.set	noreorder
	beq	$bits,$at,.L128bits
	li	$cnt,10

	.set	reorder
	lwl	$rk4,16+$MSB($inp)	# load 192 bits
	lwl	$rk5,20+$MSB($inp)
	li	$at,192
	lwr	$rk4,16+$LSB($inp)
	lwr	$rk5,20+$LSB($inp)
	.set	noreorder
	beq	$bits,$at,.L192bits
	li	$cnt,8

	.set	reorder
	lwl	$rk6,24+$MSB($inp)	# load 256 bits
	lwl	$rk7,28+$MSB($inp)
	li	$at,256
	lwr	$rk6,24+$LSB($inp)
	lwr	$rk7,28+$LSB($inp)
	.set	noreorder
	beq	$bits,$at,.L256bits
	li	$cnt,7

	b	.Lekey_done
	li	$t0,-2

.align	4
.L128bits:
	.set	reorder
	srl	$i0,$rk3,16
	srl	$i1,$rk3,8
	and	$i0,0xff
	and	$i1,0xff
	and	$i2,$rk3,0xff
	srl	$i3,$rk3,24
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lbu	$i0,1024($i0)
	lbu	$i1,1024($i1)
	lbu	$i2,1024($i2)
	lbu	$i3,1024($i3)

	sw	$rk0,0($key)
	sw	$rk1,4($key)
	sw	$rk2,8($key)
	sw	$rk3,12($key)
	sub	$cnt,1
	$PTR_ADD $key,16

	_bias	$i0,24
	_bias	$i1,16
	_bias	$i2,8
	_bias	$i3,0

	xor	$rk0,$i0
	lw	$i0,0($rcon)
	xor	$rk0,$i1
	xor	$rk0,$i2
	xor	$rk0,$i3
	xor	$rk0,$i0

	xor	$rk1,$rk0
	xor	$rk2,$rk1
	xor	$rk3,$rk2

	.set	noreorder
	bnez	$cnt,.L128bits
	$PTR_ADD $rcon,4

	sw	$rk0,0($key)
	sw	$rk1,4($key)
	sw	$rk2,8($key)
	li	$cnt,10
	sw	$rk3,12($key)
	li	$t0,0
	sw	$cnt,80($key)
	b	.Lekey_done
	$PTR_SUB $key,10*16

.align	4
.L192bits:
	.set	reorder
	srl	$i0,$rk5,16
	srl	$i1,$rk5,8
	and	$i0,0xff
	and	$i1,0xff
	and	$i2,$rk5,0xff
	srl	$i3,$rk5,24
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lbu	$i0,1024($i0)
	lbu	$i1,1024($i1)
	lbu	$i2,1024($i2)
	lbu	$i3,1024($i3)

	sw	$rk0,0($key)
	sw	$rk1,4($key)
	sw	$rk2,8($key)
	sw	$rk3,12($key)
	sw	$rk4,16($key)
	sw	$rk5,20($key)
	sub	$cnt,1
	$PTR_ADD $key,24

	_bias	$i0,24
	_bias	$i1,16
	_bias	$i2,8
	_bias	$i3,0

	xor	$rk0,$i0
	lw	$i0,0($rcon)
	xor	$rk0,$i1
	xor	$rk0,$i2
	xor	$rk0,$i3
	xor	$rk0,$i0

	xor	$rk1,$rk0
	xor	$rk2,$rk1
	xor	$rk3,$rk2
	xor	$rk4,$rk3
	xor	$rk5,$rk4

	.set	noreorder
	bnez	$cnt,.L192bits
	$PTR_ADD $rcon,4

	sw	$rk0,0($key)
	sw	$rk1,4($key)
	sw	$rk2,8($key)
	li	$cnt,12
	sw	$rk3,12($key)
	li	$t0,0
	sw	$cnt,48($key)
	b	.Lekey_done
	$PTR_SUB $key,12*16

.align	4
.L256bits:
	.set	reorder
	srl	$i0,$rk7,16
	srl	$i1,$rk7,8
	and	$i0,0xff
	and	$i1,0xff
	and	$i2,$rk7,0xff
	srl	$i3,$rk7,24
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lbu	$i0,1024($i0)
	lbu	$i1,1024($i1)
	lbu	$i2,1024($i2)
	lbu	$i3,1024($i3)

	sw	$rk0,0($key)
	sw	$rk1,4($key)
	sw	$rk2,8($key)
	sw	$rk3,12($key)
	sw	$rk4,16($key)
	sw	$rk5,20($key)
	sw	$rk6,24($key)
	sw	$rk7,28($key)
	sub	$cnt,1

	_bias	$i0,24
	_bias	$i1,16
	_bias	$i2,8
	_bias	$i3,0

	xor	$rk0,$i0
	lw	$i0,0($rcon)
	xor	$rk0,$i1
	xor	$rk0,$i2
	xor	$rk0,$i3
	xor	$rk0,$i0

	xor	$rk1,$rk0
	xor	$rk2,$rk1
	xor	$rk3,$rk2
	beqz	$cnt,.L256bits_done

	srl	$i0,$rk3,24
	srl	$i1,$rk3,16
	srl	$i2,$rk3,8
	and	$i3,$rk3,0xff
	and	$i1,0xff
	and	$i2,0xff
	$PTR_ADD $i0,$Tbl
	$PTR_ADD $i1,$Tbl
	$PTR_ADD $i2,$Tbl
	$PTR_ADD $i3,$Tbl
	lbu	$i0,1024($i0)
	lbu	$i1,1024($i1)
	lbu	$i2,1024($i2)
	lbu	$i3,1024($i3)
	sll	$i0,24
	sll	$i1,16
	sll	$i2,8

	xor	$rk4,$i0
	xor	$rk4,$i1
	xor	$rk4,$i2
	xor	$rk4,$i3

	xor	$rk5,$rk4
	xor	$rk6,$rk5
	xor	$rk7,$rk6

	$PTR_ADD $key,32
	.set	noreorder
	b	.L256bits
	$PTR_ADD $rcon,4

.L256bits_done:
	sw	$rk0,32($key)
	sw	$rk1,36($key)
	sw	$rk2,40($key)
	li	$cnt,14
	sw	$rk3,44($key)
	li	$t0,0
	sw	$cnt,48($key)
	$PTR_SUB $key,12*16

.Lekey_done:
	jr	$ra
	nop
.end	_mips_AES_set_encrypt_key

.globl	AES_set_encrypt_key
.ent	AES_set_encrypt_key
AES_set_encrypt_key:
	.frame	$sp,$FRAMESIZE,$ra
	.mask	$SAVED_REGS_MASK,-$SZREG
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
	.cpload	$pf
___
$code.=<<___;
	$PTR_SUB $sp,$FRAMESIZE
	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
___
# Nubi convention: $s0..$s3 and $gp are clobbered by the leaf routine,
# so spill them in slots -3..-7 of the 8-slot frame.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	$REG_S	$s3,$FRAMESIZE-3*$SZREG($sp)
	$REG_S	$s2,$FRAMESIZE-4*$SZREG($sp)
	$REG_S	$s1,$FRAMESIZE-5*$SZREG($sp)
	$REG_S	$s0,$FRAMESIZE-6*$SZREG($sp)
	$REG_S	$gp,$FRAMESIZE-7*$SZREG($sp)
___
$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
	.cplocal	$Tbl
	.cpsetup	$pf,$zero,AES_set_encrypt_key
___
$code.=<<___;
	.set	reorder
	la	$Tbl,AES_Te		# PIC-ified 'load address'

	bal	_mips_AES_set_encrypt_key

	.set	noreorder
	move	$a0,$t0
	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
___
# FIX: restore from the same slots the prologue saved to (-3..-7).
# The original restored from $FRAMESIZE-11..-15*$SZREG which, with
# $FRAMESIZE=8*$SZREG, lies below the frame and reloads $s3..$s0/$gp
# with garbage on nubi builds.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)
	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE
.end	AES_set_encrypt_key
___

# AES_set_decrypt_key: build the encryption schedule, then (a) reverse
# the order of the round-key quadruples in place (.Lswap) and (b) run
# every inner round-key word through InvMixColumns (.Lmix).  The 0x80808080
# mask / 0x1b1b1b1b reduction implements xtime (GF(2^8) doubling); the
# tp2/tp4/tp8 doubling chain and the xor/rotate combination assemble the
# 9/11/13/14 multiples required by InvMixColumns.
my ($head,$tail)=($inp,$bits);
my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
$code.=<<___;
.align	5
.globl	AES_set_decrypt_key
.ent	AES_set_decrypt_key
AES_set_decrypt_key:
	.frame	$sp,$FRAMESIZE,$ra
	.mask	$SAVED_REGS_MASK,-$SZREG
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
	.cpload	$pf
___
$code.=<<___;
	$PTR_SUB $sp,$FRAMESIZE
	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
___
# Same nubi spill slots as AES_set_encrypt_key above (-3..-7).
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	$REG_S	$s3,$FRAMESIZE-3*$SZREG($sp)
	$REG_S	$s2,$FRAMESIZE-4*$SZREG($sp)
	$REG_S	$s1,$FRAMESIZE-5*$SZREG($sp)
	$REG_S	$s0,$FRAMESIZE-6*$SZREG($sp)
	$REG_S	$gp,$FRAMESIZE-7*$SZREG($sp)
___
$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
	.cplocal	$Tbl
	.cpsetup	$pf,$zero,AES_set_decrypt_key
___
$code.=<<___;
	.set	reorder
	la	$Tbl,AES_Te		# PIC-ified 'load address'

	bal	_mips_AES_set_encrypt_key

	bltz	$t0,.Ldkey_done

	sll	$at,$cnt,4
	$PTR_ADD $head,$key,0
	$PTR_ADD $tail,$key,$at
.align	4
.Lswap:
	lw	$rk0,0($head)
	lw	$rk1,4($head)
	lw	$rk2,8($head)
	lw	$rk3,12($head)
	lw	$rk4,0($tail)
	lw	$rk5,4($tail)
	lw	$rk6,8($tail)
	lw	$rk7,12($tail)
	sw	$rk0,0($tail)
	sw	$rk1,4($tail)
	sw	$rk2,8($tail)
	sw	$rk3,12($tail)
	$PTR_ADD $head,16
	$PTR_SUB $tail,16
	sw	$rk4,-16($head)
	sw	$rk5,-12($head)
	sw	$rk6,-8($head)
	sw	$rk7,-4($head)
	bne	$head,$tail,.Lswap

	lw	$tp1,16($key)		# modulo-scheduled
	lui	$x80808080,0x8080
	sub	$cnt,1
	or	$x80808080,0x8080
	sll	$cnt,2
	$PTR_ADD $key,16
	lui	$x1b1b1b1b,0x1b1b
	nor	$x7f7f7f7f,$zero,$x80808080
	or	$x1b1b1b1b,0x1b1b
.align	4
.Lmix:
	and	$m,$tp1,$x80808080
	and	$tp2,$tp1,$x7f7f7f7f
	srl	$tp4,$m,7
	addu	$tp2,$tp2		# tp2<<1
	subu	$m,$tp4
	and	$m,$x1b1b1b1b
	xor	$tp2,$m

	and	$m,$tp2,$x80808080
	and	$tp4,$tp2,$x7f7f7f7f
	srl	$tp8,$m,7
	addu	$tp4,$tp4		# tp4<<1
	subu	$m,$tp8
	and	$m,$x1b1b1b1b
	xor	$tp4,$m

	and	$m,$tp4,$x80808080
	and	$tp8,$tp4,$x7f7f7f7f
	srl	$tp9,$m,7
	addu	$tp8,$tp8		# tp8<<1
	subu	$m,$tp9
	and	$m,$x1b1b1b1b
	xor	$tp8,$m

	xor	$tp9,$tp8,$tp1
	xor	$tpe,$tp8,$tp4
	xor	$tpb,$tp9,$tp2
	xor	$tpd,$tp9,$tp4

	_ror	$tp1,$tpd,16
	xor	$tpe,$tp2
	_ror	$tp2,$tpd,-16
	xor	$tpe,$tp1
	_ror	$tp1,$tp9,8
	xor	$tpe,$tp2
	_ror	$tp2,$tp9,-24
	xor	$tpe,$tp1
	_ror	$tp1,$tpb,24
	xor	$tpe,$tp2
	_ror	$tp2,$tpb,-8
	xor	$tpe,$tp1
	lw	$tp1,4($key)		# modulo-scheduled
	xor	$tpe,$tp2
	sub	$cnt,1
	sw	$tpe,0($key)
	$PTR_ADD $key,4
	bnez	$cnt,.Lmix

	li	$t0,0
.Ldkey_done:
	.set	noreorder
	move	$a0,$t0
	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
___
# FIX: same off-frame restore as in AES_set_encrypt_key — the original
# read from $FRAMESIZE-11..-15*$SZREG (negative offsets for this 8-slot
# frame); restore from the -3..-7 slots the prologue actually used.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)
	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE
.end	AES_set_decrypt_key
___
}}}
1232 | |||
1233 | ###################################################################### | ||
1234 | # Tables are kept in endian-neutral manner | ||
1235 | $code.=<<___; | ||
1236 | .rdata | ||
1237 | .align 6 | ||
1238 | AES_Te: | ||
1239 | .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 | ||
1240 | .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d | ||
1241 | .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd | ||
1242 | .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 | ||
1243 | .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 | ||
1244 | .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d | ||
1245 | .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 | ||
1246 | .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a | ||
1247 | .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d | ||
1248 | .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 | ||
1249 | .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb | ||
1250 | .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b | ||
1251 | .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 | ||
1252 | .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea | ||
1253 | .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 | ||
1254 | .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b | ||
1255 | .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c | ||
1256 | .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a | ||
1257 | .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 | ||
1258 | .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f | ||
1259 | .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 | ||
1260 | .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 | ||
1261 | .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 | ||
1262 | .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f | ||
1263 | .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 | ||
1264 | .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e | ||
1265 | .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 | ||
1266 | .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 | ||
1267 | .byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 | ||
1268 | .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d | ||
1269 | .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 | ||
1270 | .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f | ||
1271 | .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e | ||
1272 | .byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e | ||
1273 | .byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 | ||
1274 | .byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb | ||
1275 | .byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d | ||
1276 | .byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce | ||
1277 | .byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e | ||
1278 | .byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 | ||
1279 | .byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 | ||
1280 | .byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c | ||
1281 | .byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f | ||
1282 | .byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed | ||
1283 | .byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 | ||
1284 | .byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b | ||
1285 | .byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 | ||
1286 | .byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a | ||
1287 | .byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a | ||
1288 | .byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 | ||
1289 | .byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 | ||
1290 | .byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 | ||
1291 | .byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 | ||
1292 | .byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 | ||
1293 | .byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 | ||
1294 | .byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 | ||
1295 | .byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe | ||
1296 | .byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a | ||
1297 | .byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc | ||
1298 | .byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 | ||
1299 | .byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 | ||
1300 | .byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 | ||
1301 | .byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a | ||
1302 | .byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d | ||
1303 | .byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 | ||
1304 | .byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f | ||
1305 | .byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 | ||
1306 | .byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 | ||
1307 | .byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 | ||
1308 | .byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 | ||
1309 | .byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 | ||
1310 | .byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 | ||
1311 | .byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 | ||
1312 | .byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f | ||
1313 | .byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e | ||
1314 | .byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 | ||
1315 | .byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 | ||
1316 | .byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c | ||
1317 | .byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 | ||
1318 | .byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 | ||
1319 | .byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 | ||
1320 | .byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e | ||
1321 | .byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a | ||
1322 | .byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 | ||
1323 | .byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e | ||
1324 | .byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 | ||
1325 | .byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 | ||
1326 | .byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b | ||
1327 | .byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 | ||
1328 | .byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 | ||
1329 | .byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 | ||
1330 | .byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 | ||
1331 | .byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa | ||
1332 | .byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 | ||
1333 | .byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e | ||
1334 | .byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 | ||
1335 | .byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 | ||
1336 | .byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 | ||
1337 | .byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 | ||
1338 | .byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 | ||
1339 | .byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c | ||
1340 | .byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 | ||
1341 | .byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc | ||
1342 | .byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 | ||
1343 | .byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 | ||
1344 | .byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa | ||
1345 | .byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 | ||
1346 | .byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 | ||
1347 | .byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f | ||
1348 | .byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 | ||
1349 | .byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 | ||
1350 | .byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 | ||
1351 | .byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 | ||
1352 | .byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 | ||
1353 | .byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 | ||
1354 | .byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 | ||
1355 | .byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 | ||
1356 | .byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 | ||
1357 | .byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff | ||
1358 | .byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a | ||
1359 | .byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 | ||
1360 | .byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 | ||
1361 | .byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 | ||
1362 | .byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 | ||
1363 | .byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 | ||
1364 | .byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 | ||
1365 | .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc | ||
1366 | .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a | ||
1367 | |||
1368 | .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 | ||
1369 | .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
1370 | .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
1371 | .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
1372 | .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
1373 | .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
1374 | .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
1375 | .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
1376 | .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
1377 | .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
1378 | .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
1379 | .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
1380 | .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
1381 | .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
1382 | .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
1383 | .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
1384 | .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
1385 | .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
1386 | .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
1387 | .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
1388 | .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
1389 | .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
1390 | .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
1391 | .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
1392 | .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
1393 | .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
1394 | .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
1395 | .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
1396 | .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
1397 | .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
1398 | .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
1399 | .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
1400 | |||
1401 | .byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon | ||
1402 | .byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 | ||
1403 | .byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 | ||
1404 | .byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 | ||
1405 | .byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 | ||
1406 | |||
1407 | .align 6 | ||
1408 | AES_Td: | ||
1409 | .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 | ||
1410 | .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 | ||
1411 | .byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 | ||
1412 | .byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 | ||
1413 | .byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 | ||
1414 | .byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 | ||
1415 | .byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 | ||
1416 | .byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f | ||
1417 | .byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 | ||
1418 | .byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 | ||
1419 | .byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 | ||
1420 | .byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 | ||
1421 | .byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 | ||
1422 | .byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda | ||
1423 | .byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 | ||
1424 | .byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 | ||
1425 | .byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 | ||
1426 | .byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd | ||
1427 | .byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 | ||
1428 | .byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 | ||
1429 | .byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 | ||
1430 | .byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 | ||
1431 | .byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 | ||
1432 | .byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 | ||
1433 | .byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 | ||
1434 | .byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 | ||
1435 | .byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 | ||
1436 | .byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a | ||
1437 | .byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 | ||
1438 | .byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 | ||
1439 | .byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 | ||
1440 | .byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c | ||
1441 | .byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 | ||
1442 | .byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 | ||
1443 | .byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 | ||
1444 | .byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a | ||
1445 | .byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 | ||
1446 | .byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 | ||
1447 | .byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa | ||
1448 | .byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 | ||
1449 | .byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d | ||
1450 | .byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 | ||
1451 | .byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 | ||
1452 | .byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff | ||
1453 | .byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 | ||
1454 | .byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 | ||
1455 | .byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 | ||
1456 | .byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb | ||
1457 | .byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 | ||
1458 | .byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 | ||
1459 | .byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 | ||
1460 | .byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e | ||
1461 | .byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 | ||
1462 | .byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 | ||
1463 | .byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 | ||
1464 | .byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a | ||
1465 | .byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f | ||
1466 | .byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e | ||
1467 | .byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 | ||
1468 | .byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 | ||
1469 | .byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 | ||
1470 | .byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d | ||
1471 | .byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad | ||
1472 | .byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 | ||
1473 | .byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c | ||
1474 | .byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd | ||
1475 | .byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc | ||
1476 | .byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 | ||
1477 | .byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc | ||
1478 | .byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 | ||
1479 | .byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 | ||
1480 | .byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 | ||
1481 | .byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 | ||
1482 | .byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d | ||
1483 | .byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 | ||
1484 | .byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 | ||
1485 | .byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 | ||
1486 | .byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 | ||
1487 | .byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a | ||
1488 | .byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef | ||
1489 | .byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 | ||
1490 | .byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 | ||
1491 | .byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 | ||
1492 | .byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 | ||
1493 | .byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d | ||
1494 | .byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 | ||
1495 | .byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 | ||
1496 | .byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 | ||
1497 | .byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c | ||
1498 | .byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 | ||
1499 | .byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 | ||
1500 | .byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b | ||
1501 | .byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 | ||
1502 | .byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 | ||
1503 | .byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e | ||
1504 | .byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 | ||
1505 | .byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce | ||
1506 | .byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 | ||
1507 | .byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 | ||
1508 | .byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 | ||
1509 | .byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 | ||
1510 | .byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 | ||
1511 | .byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 | ||
1512 | .byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f | ||
1513 | .byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d | ||
1514 | .byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf | ||
1515 | .byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b | ||
1516 | .byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f | ||
1517 | .byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d | ||
1518 | .byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e | ||
1519 | .byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 | ||
1520 | .byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 | ||
1521 | .byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a | ||
1522 | .byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 | ||
1523 | .byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 | ||
1524 | .byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c | ||
1525 | .byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f | ||
1526 | .byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf | ||
1527 | .byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b | ||
1528 | .byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 | ||
1529 | .byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e | ||
1530 | .byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f | ||
1531 | .byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c | ||
1532 | .byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 | ||
1533 | .byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde | ||
1534 | .byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 | ||
1535 | .byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 | ||
1536 | .byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 | ||
1537 | |||
1538 | .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4 | ||
1539 | .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
1540 | .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
1541 | .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
1542 | .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
1543 | .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
1544 | .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
1545 | .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
1546 | .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
1547 | .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
1548 | .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
1549 | .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
1550 | .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
1551 | .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
1552 | .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
1553 | .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
1554 | .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
1555 | .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
1556 | .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
1557 | .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
1558 | .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
1559 | .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
1560 | .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
1561 | .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
1562 | .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
1563 | .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
1564 | .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
1565 | .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
1566 | .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
1567 | .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
1568 | .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
1569 | .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
1570 | ___ | ||
1571 | |||
1572 | foreach (split("\n",$code)) { | ||
1573 | s/\`([^\`]*)\`/eval $1/ge; | ||
1574 | |||
1575 | # made-up _instructions, _xtr, _ins, _ror and _bias, cope | ||
1576 | # with byte order dependencies... | ||
1577 | if (/^\s+_/) { | ||
1578 | s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/; | ||
1579 | |||
1580 | s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/ | ||
1581 | sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) | ||
1582 | : eval("24-$3"))/e or | ||
1583 | s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ | ||
1584 | sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) | ||
1585 | : eval("24-$3"))/e or | ||
1586 | s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/ | ||
1587 | sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) | ||
1588 | : eval("$3*-1"))/e or | ||
1589 | s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ | ||
1590 | sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) | ||
1591 | : eval("($3-16)&31"))/e; | ||
1592 | |||
1593 | s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/ | ||
1594 | sprintf("sll\t$1,$2,$3")/e or | ||
1595 | s/srl\s+(\$[0-9]+),(\$[0-9]+),0/ | ||
1596 | sprintf("and\t$1,$2,0xff")/e or | ||
1597 | s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/; | ||
1598 | } | ||
1599 | |||
1600 | # convert lwl/lwr and swr/swl to little-endian order | ||
1601 | if (!$big_endian && /^\s+[sl]w[lr]\s+/) { | ||
1602 | s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/ | ||
1603 | sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or | ||
1604 | s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/ | ||
1605 | sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e; | ||
1606 | } | ||
1607 | |||
1608 | print $_,"\n"; | ||
1609 | } | ||
1610 | |||
1611 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/aes-parisc.pl b/src/lib/libcrypto/aes/asm/aes-parisc.pl new file mode 100644 index 0000000000..c36b6a2270 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aes-parisc.pl | |||
@@ -0,0 +1,1021 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # AES for PA-RISC. | ||
11 | # | ||
12 | # June 2009. | ||
13 | # | ||
14 | # The module is mechanical transliteration of aes-sparcv9.pl, but with | ||
15 | # a twist: S-boxes are compressed even further down to 1K+256B. On | ||
16 | # PA-7100LC performance is ~40% better than gcc 3.2 generated code and | ||
17 | # is about 33 cycles per byte processed with 128-bit key. Newer CPUs | ||
18 | # perform at 16 cycles per byte. It's not faster than code generated | ||
19 | # by vendor compiler, but recall that it has compressed S-boxes, which | ||
20 | # requires extra processing. | ||
21 | # | ||
22 | # Special thanks to polarhome.com for providing HP-UX account. | ||
23 | |||
24 | $flavour = shift; | ||
25 | $output = shift; | ||
26 | open STDOUT,">$output"; | ||
27 | |||
28 | if ($flavour =~ /64/) { | ||
29 | $LEVEL ="2.0W"; | ||
30 | $SIZE_T =8; | ||
31 | $FRAME_MARKER =80; | ||
32 | $SAVED_RP =16; | ||
33 | $PUSH ="std"; | ||
34 | $PUSHMA ="std,ma"; | ||
35 | $POP ="ldd"; | ||
36 | $POPMB ="ldd,mb"; | ||
37 | } else { | ||
38 | $LEVEL ="1.0"; | ||
39 | $SIZE_T =4; | ||
40 | $FRAME_MARKER =48; | ||
41 | $SAVED_RP =20; | ||
42 | $PUSH ="stw"; | ||
43 | $PUSHMA ="stwm"; | ||
44 | $POP ="ldw"; | ||
45 | $POPMB ="ldwm"; | ||
46 | } | ||
47 | |||
48 | $FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker | ||
49 | # [+ argument transfer] | ||
50 | $inp="%r26"; # arg0 | ||
51 | $out="%r25"; # arg1 | ||
52 | $key="%r24"; # arg2 | ||
53 | |||
54 | ($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4"); | ||
55 | ($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8"); | ||
56 | |||
57 | ($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7, | ||
58 | $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) = | ||
59 | ("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16", | ||
60 | "%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26"); | ||
61 | |||
62 | $tbl="%r28"; | ||
63 | $rounds="%r29"; | ||
64 | |||
65 | $code=<<___; | ||
66 | .LEVEL $LEVEL | ||
67 | .SPACE \$TEXT\$ | ||
68 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
69 | |||
70 | .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
71 | .ALIGN 64 | ||
72 | AES_encrypt | ||
73 | .PROC | ||
74 | .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 | ||
75 | .ENTRY | ||
76 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
77 | $PUSHMA %r3,$FRAME(%sp) | ||
78 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
79 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
80 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
81 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
82 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
83 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
84 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
85 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
86 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
87 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
88 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
89 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
90 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
91 | $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) | ||
92 | $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) | ||
93 | |||
94 | blr %r0,$tbl | ||
95 | ldi 3,$t0 | ||
96 | L\$enc_pic | ||
97 | andcm $tbl,$t0,$tbl | ||
98 | ldo L\$AES_Te-L\$enc_pic($tbl),$tbl | ||
99 | |||
100 | and $inp,$t0,$t0 | ||
101 | sub $inp,$t0,$inp | ||
102 | ldw 0($inp),$s0 | ||
103 | ldw 4($inp),$s1 | ||
104 | ldw 8($inp),$s2 | ||
105 | comib,= 0,$t0,L\$enc_inp_aligned | ||
106 | ldw 12($inp),$s3 | ||
107 | |||
108 | sh3addl $t0,%r0,$t0 | ||
109 | subi 32,$t0,$t0 | ||
110 | mtctl $t0,%cr11 | ||
111 | ldw 16($inp),$t1 | ||
112 | vshd $s0,$s1,$s0 | ||
113 | vshd $s1,$s2,$s1 | ||
114 | vshd $s2,$s3,$s2 | ||
115 | vshd $s3,$t1,$s3 | ||
116 | |||
117 | L\$enc_inp_aligned | ||
118 | bl _parisc_AES_encrypt,%r31 | ||
119 | nop | ||
120 | |||
121 | extru,<> $out,31,2,%r0 | ||
122 | b L\$enc_out_aligned | ||
123 | nop | ||
124 | |||
125 | _srm $s0,24,$acc0 | ||
126 | _srm $s0,16,$acc1 | ||
127 | stb $acc0,0($out) | ||
128 | _srm $s0,8,$acc2 | ||
129 | stb $acc1,1($out) | ||
130 | _srm $s1,24,$acc4 | ||
131 | stb $acc2,2($out) | ||
132 | _srm $s1,16,$acc5 | ||
133 | stb $s0,3($out) | ||
134 | _srm $s1,8,$acc6 | ||
135 | stb $acc4,4($out) | ||
136 | _srm $s2,24,$acc0 | ||
137 | stb $acc5,5($out) | ||
138 | _srm $s2,16,$acc1 | ||
139 | stb $acc6,6($out) | ||
140 | _srm $s2,8,$acc2 | ||
141 | stb $s1,7($out) | ||
142 | _srm $s3,24,$acc4 | ||
143 | stb $acc0,8($out) | ||
144 | _srm $s3,16,$acc5 | ||
145 | stb $acc1,9($out) | ||
146 | _srm $s3,8,$acc6 | ||
147 | stb $acc2,10($out) | ||
148 | stb $s2,11($out) | ||
149 | stb $acc4,12($out) | ||
150 | stb $acc5,13($out) | ||
151 | stb $acc6,14($out) | ||
152 | b L\$enc_done | ||
153 | stb $s3,15($out) | ||
154 | |||
155 | L\$enc_out_aligned | ||
156 | stw $s0,0($out) | ||
157 | stw $s1,4($out) | ||
158 | stw $s2,8($out) | ||
159 | stw $s3,12($out) | ||
160 | |||
161 | L\$enc_done | ||
162 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
163 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
164 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
165 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
166 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
167 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
168 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
169 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
170 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
171 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
172 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
173 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
174 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
175 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
176 | $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 | ||
177 | $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 | ||
178 | bv (%r2) | ||
179 | .EXIT | ||
180 | $POPMB -$FRAME(%sp),%r3 | ||
181 | .PROCEND | ||
182 | |||
183 | .ALIGN 16 | ||
184 | _parisc_AES_encrypt | ||
185 | .PROC | ||
186 | .CALLINFO MILLICODE | ||
187 | .ENTRY | ||
188 | ldw 240($key),$rounds | ||
189 | ldw 0($key),$t0 | ||
190 | ldw 4($key),$t1 | ||
191 | ldw 8($key),$t2 | ||
192 | _srm $rounds,1,$rounds | ||
193 | xor $t0,$s0,$s0 | ||
194 | ldw 12($key),$t3 | ||
195 | _srm $s0,24,$acc0 | ||
196 | xor $t1,$s1,$s1 | ||
197 | ldw 16($key),$t0 | ||
198 | _srm $s1,16,$acc1 | ||
199 | xor $t2,$s2,$s2 | ||
200 | ldw 20($key),$t1 | ||
201 | xor $t3,$s3,$s3 | ||
202 | ldw 24($key),$t2 | ||
203 | ldw 28($key),$t3 | ||
204 | L\$enc_loop | ||
205 | _srm $s2,8,$acc2 | ||
206 | ldwx,s $acc0($tbl),$acc0 | ||
207 | _srm $s3,0,$acc3 | ||
208 | ldwx,s $acc1($tbl),$acc1 | ||
209 | _srm $s1,24,$acc4 | ||
210 | ldwx,s $acc2($tbl),$acc2 | ||
211 | _srm $s2,16,$acc5 | ||
212 | ldwx,s $acc3($tbl),$acc3 | ||
213 | _srm $s3,8,$acc6 | ||
214 | ldwx,s $acc4($tbl),$acc4 | ||
215 | _srm $s0,0,$acc7 | ||
216 | ldwx,s $acc5($tbl),$acc5 | ||
217 | _srm $s2,24,$acc8 | ||
218 | ldwx,s $acc6($tbl),$acc6 | ||
219 | _srm $s3,16,$acc9 | ||
220 | ldwx,s $acc7($tbl),$acc7 | ||
221 | _srm $s0,8,$acc10 | ||
222 | ldwx,s $acc8($tbl),$acc8 | ||
223 | _srm $s1,0,$acc11 | ||
224 | ldwx,s $acc9($tbl),$acc9 | ||
225 | _srm $s3,24,$acc12 | ||
226 | ldwx,s $acc10($tbl),$acc10 | ||
227 | _srm $s0,16,$acc13 | ||
228 | ldwx,s $acc11($tbl),$acc11 | ||
229 | _srm $s1,8,$acc14 | ||
230 | ldwx,s $acc12($tbl),$acc12 | ||
231 | _srm $s2,0,$acc15 | ||
232 | ldwx,s $acc13($tbl),$acc13 | ||
233 | ldwx,s $acc14($tbl),$acc14 | ||
234 | ldwx,s $acc15($tbl),$acc15 | ||
235 | addib,= -1,$rounds,L\$enc_last | ||
236 | ldo 32($key),$key | ||
237 | |||
238 | _ror $acc1,8,$acc1 | ||
239 | xor $acc0,$t0,$t0 | ||
240 | ldw 0($key),$s0 | ||
241 | _ror $acc2,16,$acc2 | ||
242 | xor $acc1,$t0,$t0 | ||
243 | ldw 4($key),$s1 | ||
244 | _ror $acc3,24,$acc3 | ||
245 | xor $acc2,$t0,$t0 | ||
246 | ldw 8($key),$s2 | ||
247 | _ror $acc5,8,$acc5 | ||
248 | xor $acc3,$t0,$t0 | ||
249 | ldw 12($key),$s3 | ||
250 | _ror $acc6,16,$acc6 | ||
251 | xor $acc4,$t1,$t1 | ||
252 | _ror $acc7,24,$acc7 | ||
253 | xor $acc5,$t1,$t1 | ||
254 | _ror $acc9,8,$acc9 | ||
255 | xor $acc6,$t1,$t1 | ||
256 | _ror $acc10,16,$acc10 | ||
257 | xor $acc7,$t1,$t1 | ||
258 | _ror $acc11,24,$acc11 | ||
259 | xor $acc8,$t2,$t2 | ||
260 | _ror $acc13,8,$acc13 | ||
261 | xor $acc9,$t2,$t2 | ||
262 | _ror $acc14,16,$acc14 | ||
263 | xor $acc10,$t2,$t2 | ||
264 | _ror $acc15,24,$acc15 | ||
265 | xor $acc11,$t2,$t2 | ||
266 | xor $acc12,$acc14,$acc14 | ||
267 | xor $acc13,$t3,$t3 | ||
268 | _srm $t0,24,$acc0 | ||
269 | xor $acc14,$t3,$t3 | ||
270 | _srm $t1,16,$acc1 | ||
271 | xor $acc15,$t3,$t3 | ||
272 | |||
273 | _srm $t2,8,$acc2 | ||
274 | ldwx,s $acc0($tbl),$acc0 | ||
275 | _srm $t3,0,$acc3 | ||
276 | ldwx,s $acc1($tbl),$acc1 | ||
277 | _srm $t1,24,$acc4 | ||
278 | ldwx,s $acc2($tbl),$acc2 | ||
279 | _srm $t2,16,$acc5 | ||
280 | ldwx,s $acc3($tbl),$acc3 | ||
281 | _srm $t3,8,$acc6 | ||
282 | ldwx,s $acc4($tbl),$acc4 | ||
283 | _srm $t0,0,$acc7 | ||
284 | ldwx,s $acc5($tbl),$acc5 | ||
285 | _srm $t2,24,$acc8 | ||
286 | ldwx,s $acc6($tbl),$acc6 | ||
287 | _srm $t3,16,$acc9 | ||
288 | ldwx,s $acc7($tbl),$acc7 | ||
289 | _srm $t0,8,$acc10 | ||
290 | ldwx,s $acc8($tbl),$acc8 | ||
291 | _srm $t1,0,$acc11 | ||
292 | ldwx,s $acc9($tbl),$acc9 | ||
293 | _srm $t3,24,$acc12 | ||
294 | ldwx,s $acc10($tbl),$acc10 | ||
295 | _srm $t0,16,$acc13 | ||
296 | ldwx,s $acc11($tbl),$acc11 | ||
297 | _srm $t1,8,$acc14 | ||
298 | ldwx,s $acc12($tbl),$acc12 | ||
299 | _srm $t2,0,$acc15 | ||
300 | ldwx,s $acc13($tbl),$acc13 | ||
301 | _ror $acc1,8,$acc1 | ||
302 | ldwx,s $acc14($tbl),$acc14 | ||
303 | |||
304 | _ror $acc2,16,$acc2 | ||
305 | xor $acc0,$s0,$s0 | ||
306 | ldwx,s $acc15($tbl),$acc15 | ||
307 | _ror $acc3,24,$acc3 | ||
308 | xor $acc1,$s0,$s0 | ||
309 | ldw 16($key),$t0 | ||
310 | _ror $acc5,8,$acc5 | ||
311 | xor $acc2,$s0,$s0 | ||
312 | ldw 20($key),$t1 | ||
313 | _ror $acc6,16,$acc6 | ||
314 | xor $acc3,$s0,$s0 | ||
315 | ldw 24($key),$t2 | ||
316 | _ror $acc7,24,$acc7 | ||
317 | xor $acc4,$s1,$s1 | ||
318 | ldw 28($key),$t3 | ||
319 | _ror $acc9,8,$acc9 | ||
320 | xor $acc5,$s1,$s1 | ||
321 | ldw 1024+0($tbl),%r0 ; prefetch te4 | ||
322 | _ror $acc10,16,$acc10 | ||
323 | xor $acc6,$s1,$s1 | ||
324 | ldw 1024+32($tbl),%r0 ; prefetch te4 | ||
325 | _ror $acc11,24,$acc11 | ||
326 | xor $acc7,$s1,$s1 | ||
327 | ldw 1024+64($tbl),%r0 ; prefetch te4 | ||
328 | _ror $acc13,8,$acc13 | ||
329 | xor $acc8,$s2,$s2 | ||
330 | ldw 1024+96($tbl),%r0 ; prefetch te4 | ||
331 | _ror $acc14,16,$acc14 | ||
332 | xor $acc9,$s2,$s2 | ||
333 | ldw 1024+128($tbl),%r0 ; prefetch te4 | ||
334 | _ror $acc15,24,$acc15 | ||
335 | xor $acc10,$s2,$s2 | ||
336 | ldw 1024+160($tbl),%r0 ; prefetch te4 | ||
337 | _srm $s0,24,$acc0 | ||
338 | xor $acc11,$s2,$s2 | ||
339 | ldw 1024+192($tbl),%r0 ; prefetch te4 | ||
340 | xor $acc12,$acc14,$acc14 | ||
341 | xor $acc13,$s3,$s3 | ||
342 | ldw 1024+224($tbl),%r0 ; prefetch te4 | ||
343 | _srm $s1,16,$acc1 | ||
344 | xor $acc14,$s3,$s3 | ||
345 | b L\$enc_loop | ||
346 | xor $acc15,$s3,$s3 | ||
347 | |||
348 | .ALIGN 16 | ||
349 | L\$enc_last | ||
350 | ldo 1024($tbl),$rounds | ||
351 | _ror $acc1,8,$acc1 | ||
352 | xor $acc0,$t0,$t0 | ||
353 | ldw 0($key),$s0 | ||
354 | _ror $acc2,16,$acc2 | ||
355 | xor $acc1,$t0,$t0 | ||
356 | ldw 4($key),$s1 | ||
357 | _ror $acc3,24,$acc3 | ||
358 | xor $acc2,$t0,$t0 | ||
359 | ldw 8($key),$s2 | ||
360 | _ror $acc5,8,$acc5 | ||
361 | xor $acc3,$t0,$t0 | ||
362 | ldw 12($key),$s3 | ||
363 | _ror $acc6,16,$acc6 | ||
364 | xor $acc4,$t1,$t1 | ||
365 | _ror $acc7,24,$acc7 | ||
366 | xor $acc5,$t1,$t1 | ||
367 | _ror $acc9,8,$acc9 | ||
368 | xor $acc6,$t1,$t1 | ||
369 | _ror $acc10,16,$acc10 | ||
370 | xor $acc7,$t1,$t1 | ||
371 | _ror $acc11,24,$acc11 | ||
372 | xor $acc8,$t2,$t2 | ||
373 | _ror $acc13,8,$acc13 | ||
374 | xor $acc9,$t2,$t2 | ||
375 | _ror $acc14,16,$acc14 | ||
376 | xor $acc10,$t2,$t2 | ||
377 | _ror $acc15,24,$acc15 | ||
378 | xor $acc11,$t2,$t2 | ||
379 | xor $acc12,$acc14,$acc14 | ||
380 | xor $acc13,$t3,$t3 | ||
381 | _srm $t0,24,$acc0 | ||
382 | xor $acc14,$t3,$t3 | ||
383 | _srm $t1,16,$acc1 | ||
384 | xor $acc15,$t3,$t3 | ||
385 | |||
386 | _srm $t2,8,$acc2 | ||
387 | ldbx $acc0($rounds),$acc0 | ||
388 | _srm $t1,24,$acc4 | ||
389 | ldbx $acc1($rounds),$acc1 | ||
390 | _srm $t2,16,$acc5 | ||
391 | _srm $t3,0,$acc3 | ||
392 | ldbx $acc2($rounds),$acc2 | ||
393 | ldbx $acc3($rounds),$acc3 | ||
394 | _srm $t3,8,$acc6 | ||
395 | ldbx $acc4($rounds),$acc4 | ||
396 | _srm $t2,24,$acc8 | ||
397 | ldbx $acc5($rounds),$acc5 | ||
398 | _srm $t3,16,$acc9 | ||
399 | _srm $t0,0,$acc7 | ||
400 | ldbx $acc6($rounds),$acc6 | ||
401 | ldbx $acc7($rounds),$acc7 | ||
402 | _srm $t0,8,$acc10 | ||
403 | ldbx $acc8($rounds),$acc8 | ||
404 | _srm $t3,24,$acc12 | ||
405 | ldbx $acc9($rounds),$acc9 | ||
406 | _srm $t0,16,$acc13 | ||
407 | _srm $t1,0,$acc11 | ||
408 | ldbx $acc10($rounds),$acc10 | ||
409 | _srm $t1,8,$acc14 | ||
410 | ldbx $acc11($rounds),$acc11 | ||
411 | ldbx $acc12($rounds),$acc12 | ||
412 | ldbx $acc13($rounds),$acc13 | ||
413 | _srm $t2,0,$acc15 | ||
414 | ldbx $acc14($rounds),$acc14 | ||
415 | |||
416 | dep $acc0,7,8,$acc3 | ||
417 | ldbx $acc15($rounds),$acc15 | ||
418 | dep $acc4,7,8,$acc7 | ||
419 | dep $acc1,15,8,$acc3 | ||
420 | dep $acc5,15,8,$acc7 | ||
421 | dep $acc2,23,8,$acc3 | ||
422 | dep $acc6,23,8,$acc7 | ||
423 | xor $acc3,$s0,$s0 | ||
424 | xor $acc7,$s1,$s1 | ||
425 | dep $acc8,7,8,$acc11 | ||
426 | dep $acc12,7,8,$acc15 | ||
427 | dep $acc9,15,8,$acc11 | ||
428 | dep $acc13,15,8,$acc15 | ||
429 | dep $acc10,23,8,$acc11 | ||
430 | dep $acc14,23,8,$acc15 | ||
431 | xor $acc11,$s2,$s2 | ||
432 | |||
433 | bv (%r31) | ||
434 | .EXIT | ||
435 | xor $acc15,$s3,$s3 | ||
436 | .PROCEND | ||
437 | |||
438 | .ALIGN 64 | ||
439 | L\$AES_Te | ||
440 | .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d | ||
441 | .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 | ||
442 | .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d | ||
443 | .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a | ||
444 | .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 | ||
445 | .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b | ||
446 | .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea | ||
447 | .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b | ||
448 | .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a | ||
449 | .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f | ||
450 | .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 | ||
451 | .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f | ||
452 | .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e | ||
453 | .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 | ||
454 | .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d | ||
455 | .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f | ||
456 | .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e | ||
457 | .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb | ||
458 | .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce | ||
459 | .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 | ||
460 | .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c | ||
461 | .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed | ||
462 | .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b | ||
463 | .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a | ||
464 | .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 | ||
465 | .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 | ||
466 | .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 | ||
467 | .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 | ||
468 | .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a | ||
469 | .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 | ||
470 | .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 | ||
471 | .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d | ||
472 | .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f | ||
473 | .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 | ||
474 | .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 | ||
475 | .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 | ||
476 | .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f | ||
477 | .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 | ||
478 | .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c | ||
479 | .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 | ||
480 | .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e | ||
481 | .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 | ||
482 | .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 | ||
483 | .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b | ||
484 | .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 | ||
485 | .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 | ||
486 | .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 | ||
487 | .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 | ||
488 | .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 | ||
489 | .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 | ||
490 | .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 | ||
491 | .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 | ||
492 | .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa | ||
493 | .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 | ||
494 | .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 | ||
495 | .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 | ||
496 | .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 | ||
497 | .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 | ||
498 | .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 | ||
499 | .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a | ||
500 | .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 | ||
501 | .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 | ||
502 | .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 | ||
503 | .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a | ||
504 | .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | ||
505 | .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
506 | .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
507 | .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
508 | .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
509 | .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
510 | .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
511 | .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
512 | .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
513 | .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
514 | .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
515 | .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
516 | .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
517 | .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
518 | .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
519 | .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
520 | .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
521 | .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
522 | .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
523 | .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
524 | .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
525 | .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
526 | .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
527 | .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
528 | .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
529 | .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
530 | .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
531 | .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
532 | .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
533 | .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
534 | .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
535 | .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
536 | ___ | ||
537 | |||
538 | $code.=<<___; | ||
539 | .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
540 | .ALIGN 16 | ||
541 | AES_decrypt | ||
542 | .PROC | ||
543 | .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 | ||
544 | .ENTRY | ||
545 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
546 | $PUSHMA %r3,$FRAME(%sp) | ||
547 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
548 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
549 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
550 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
551 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
552 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
553 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
554 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
555 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
556 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
557 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
558 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
559 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
560 | $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) | ||
561 | $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) | ||
562 | |||
563 | blr %r0,$tbl | ||
564 | ldi 3,$t0 | ||
565 | L\$dec_pic | ||
566 | andcm $tbl,$t0,$tbl | ||
567 | ldo L\$AES_Td-L\$dec_pic($tbl),$tbl | ||
568 | |||
569 | and $inp,$t0,$t0 | ||
570 | sub $inp,$t0,$inp | ||
571 | ldw 0($inp),$s0 | ||
572 | ldw 4($inp),$s1 | ||
573 | ldw 8($inp),$s2 | ||
574 | comib,= 0,$t0,L\$dec_inp_aligned | ||
575 | ldw 12($inp),$s3 | ||
576 | |||
577 | sh3addl $t0,%r0,$t0 | ||
578 | subi 32,$t0,$t0 | ||
579 | mtctl $t0,%cr11 | ||
580 | ldw 16($inp),$t1 | ||
581 | vshd $s0,$s1,$s0 | ||
582 | vshd $s1,$s2,$s1 | ||
583 | vshd $s2,$s3,$s2 | ||
584 | vshd $s3,$t1,$s3 | ||
585 | |||
586 | L\$dec_inp_aligned | ||
587 | bl _parisc_AES_decrypt,%r31 | ||
588 | nop | ||
589 | |||
590 | extru,<> $out,31,2,%r0 | ||
591 | b L\$dec_out_aligned | ||
592 | nop | ||
593 | |||
594 | _srm $s0,24,$acc0 | ||
595 | _srm $s0,16,$acc1 | ||
596 | stb $acc0,0($out) | ||
597 | _srm $s0,8,$acc2 | ||
598 | stb $acc1,1($out) | ||
599 | _srm $s1,24,$acc4 | ||
600 | stb $acc2,2($out) | ||
601 | _srm $s1,16,$acc5 | ||
602 | stb $s0,3($out) | ||
603 | _srm $s1,8,$acc6 | ||
604 | stb $acc4,4($out) | ||
605 | _srm $s2,24,$acc0 | ||
606 | stb $acc5,5($out) | ||
607 | _srm $s2,16,$acc1 | ||
608 | stb $acc6,6($out) | ||
609 | _srm $s2,8,$acc2 | ||
610 | stb $s1,7($out) | ||
611 | _srm $s3,24,$acc4 | ||
612 | stb $acc0,8($out) | ||
613 | _srm $s3,16,$acc5 | ||
614 | stb $acc1,9($out) | ||
615 | _srm $s3,8,$acc6 | ||
616 | stb $acc2,10($out) | ||
617 | stb $s2,11($out) | ||
618 | stb $acc4,12($out) | ||
619 | stb $acc5,13($out) | ||
620 | stb $acc6,14($out) | ||
621 | b L\$dec_done | ||
622 | stb $s3,15($out) | ||
623 | |||
624 | L\$dec_out_aligned | ||
625 | stw $s0,0($out) | ||
626 | stw $s1,4($out) | ||
627 | stw $s2,8($out) | ||
628 | stw $s3,12($out) | ||
629 | |||
630 | L\$dec_done | ||
631 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
632 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
633 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
634 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
635 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
636 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
637 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
638 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
639 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
640 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
641 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
642 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
643 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
644 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
645 | $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 | ||
646 | $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 | ||
647 | bv (%r2) | ||
648 | .EXIT | ||
649 | $POPMB -$FRAME(%sp),%r3 | ||
650 | .PROCEND | ||
651 | |||
652 | .ALIGN 16 | ||
653 | _parisc_AES_decrypt | ||
654 | .PROC | ||
655 | .CALLINFO MILLICODE | ||
656 | .ENTRY | ||
657 | ldw 240($key),$rounds | ||
658 | ldw 0($key),$t0 | ||
659 | ldw 4($key),$t1 | ||
660 | ldw 8($key),$t2 | ||
661 | ldw 12($key),$t3 | ||
662 | _srm $rounds,1,$rounds | ||
663 | xor $t0,$s0,$s0 | ||
664 | ldw 16($key),$t0 | ||
665 | xor $t1,$s1,$s1 | ||
666 | ldw 20($key),$t1 | ||
667 | _srm $s0,24,$acc0 | ||
668 | xor $t2,$s2,$s2 | ||
669 | ldw 24($key),$t2 | ||
670 | xor $t3,$s3,$s3 | ||
671 | ldw 28($key),$t3 | ||
672 | _srm $s3,16,$acc1 | ||
673 | L\$dec_loop | ||
674 | _srm $s2,8,$acc2 | ||
675 | ldwx,s $acc0($tbl),$acc0 | ||
676 | _srm $s1,0,$acc3 | ||
677 | ldwx,s $acc1($tbl),$acc1 | ||
678 | _srm $s1,24,$acc4 | ||
679 | ldwx,s $acc2($tbl),$acc2 | ||
680 | _srm $s0,16,$acc5 | ||
681 | ldwx,s $acc3($tbl),$acc3 | ||
682 | _srm $s3,8,$acc6 | ||
683 | ldwx,s $acc4($tbl),$acc4 | ||
684 | _srm $s2,0,$acc7 | ||
685 | ldwx,s $acc5($tbl),$acc5 | ||
686 | _srm $s2,24,$acc8 | ||
687 | ldwx,s $acc6($tbl),$acc6 | ||
688 | _srm $s1,16,$acc9 | ||
689 | ldwx,s $acc7($tbl),$acc7 | ||
690 | _srm $s0,8,$acc10 | ||
691 | ldwx,s $acc8($tbl),$acc8 | ||
692 | _srm $s3,0,$acc11 | ||
693 | ldwx,s $acc9($tbl),$acc9 | ||
694 | _srm $s3,24,$acc12 | ||
695 | ldwx,s $acc10($tbl),$acc10 | ||
696 | _srm $s2,16,$acc13 | ||
697 | ldwx,s $acc11($tbl),$acc11 | ||
698 | _srm $s1,8,$acc14 | ||
699 | ldwx,s $acc12($tbl),$acc12 | ||
700 | _srm $s0,0,$acc15 | ||
701 | ldwx,s $acc13($tbl),$acc13 | ||
702 | ldwx,s $acc14($tbl),$acc14 | ||
703 | ldwx,s $acc15($tbl),$acc15 | ||
704 | addib,= -1,$rounds,L\$dec_last | ||
705 | ldo 32($key),$key | ||
706 | |||
707 | _ror $acc1,8,$acc1 | ||
708 | xor $acc0,$t0,$t0 | ||
709 | ldw 0($key),$s0 | ||
710 | _ror $acc2,16,$acc2 | ||
711 | xor $acc1,$t0,$t0 | ||
712 | ldw 4($key),$s1 | ||
713 | _ror $acc3,24,$acc3 | ||
714 | xor $acc2,$t0,$t0 | ||
715 | ldw 8($key),$s2 | ||
716 | _ror $acc5,8,$acc5 | ||
717 | xor $acc3,$t0,$t0 | ||
718 | ldw 12($key),$s3 | ||
719 | _ror $acc6,16,$acc6 | ||
720 | xor $acc4,$t1,$t1 | ||
721 | _ror $acc7,24,$acc7 | ||
722 | xor $acc5,$t1,$t1 | ||
723 | _ror $acc9,8,$acc9 | ||
724 | xor $acc6,$t1,$t1 | ||
725 | _ror $acc10,16,$acc10 | ||
726 | xor $acc7,$t1,$t1 | ||
727 | _ror $acc11,24,$acc11 | ||
728 | xor $acc8,$t2,$t2 | ||
729 | _ror $acc13,8,$acc13 | ||
730 | xor $acc9,$t2,$t2 | ||
731 | _ror $acc14,16,$acc14 | ||
732 | xor $acc10,$t2,$t2 | ||
733 | _ror $acc15,24,$acc15 | ||
734 | xor $acc11,$t2,$t2 | ||
735 | xor $acc12,$acc14,$acc14 | ||
736 | xor $acc13,$t3,$t3 | ||
737 | _srm $t0,24,$acc0 | ||
738 | xor $acc14,$t3,$t3 | ||
739 | xor $acc15,$t3,$t3 | ||
740 | _srm $t3,16,$acc1 | ||
741 | |||
742 | _srm $t2,8,$acc2 | ||
743 | ldwx,s $acc0($tbl),$acc0 | ||
744 | _srm $t1,0,$acc3 | ||
745 | ldwx,s $acc1($tbl),$acc1 | ||
746 | _srm $t1,24,$acc4 | ||
747 | ldwx,s $acc2($tbl),$acc2 | ||
748 | _srm $t0,16,$acc5 | ||
749 | ldwx,s $acc3($tbl),$acc3 | ||
750 | _srm $t3,8,$acc6 | ||
751 | ldwx,s $acc4($tbl),$acc4 | ||
752 | _srm $t2,0,$acc7 | ||
753 | ldwx,s $acc5($tbl),$acc5 | ||
754 | _srm $t2,24,$acc8 | ||
755 | ldwx,s $acc6($tbl),$acc6 | ||
756 | _srm $t1,16,$acc9 | ||
757 | ldwx,s $acc7($tbl),$acc7 | ||
758 | _srm $t0,8,$acc10 | ||
759 | ldwx,s $acc8($tbl),$acc8 | ||
760 | _srm $t3,0,$acc11 | ||
761 | ldwx,s $acc9($tbl),$acc9 | ||
762 | _srm $t3,24,$acc12 | ||
763 | ldwx,s $acc10($tbl),$acc10 | ||
764 | _srm $t2,16,$acc13 | ||
765 | ldwx,s $acc11($tbl),$acc11 | ||
766 | _srm $t1,8,$acc14 | ||
767 | ldwx,s $acc12($tbl),$acc12 | ||
768 | _srm $t0,0,$acc15 | ||
769 | ldwx,s $acc13($tbl),$acc13 | ||
770 | _ror $acc1,8,$acc1 | ||
771 | ldwx,s $acc14($tbl),$acc14 | ||
772 | |||
773 | _ror $acc2,16,$acc2 | ||
774 | xor $acc0,$s0,$s0 | ||
775 | ldwx,s $acc15($tbl),$acc15 | ||
776 | _ror $acc3,24,$acc3 | ||
777 | xor $acc1,$s0,$s0 | ||
778 | ldw 16($key),$t0 | ||
779 | _ror $acc5,8,$acc5 | ||
780 | xor $acc2,$s0,$s0 | ||
781 | ldw 20($key),$t1 | ||
782 | _ror $acc6,16,$acc6 | ||
783 | xor $acc3,$s0,$s0 | ||
784 | ldw 24($key),$t2 | ||
785 | _ror $acc7,24,$acc7 | ||
786 | xor $acc4,$s1,$s1 | ||
787 | ldw 28($key),$t3 | ||
788 | _ror $acc9,8,$acc9 | ||
789 | xor $acc5,$s1,$s1 | ||
790 | ldw 1024+0($tbl),%r0 ; prefetch td4 | ||
791 | _ror $acc10,16,$acc10 | ||
792 | xor $acc6,$s1,$s1 | ||
793 | ldw 1024+32($tbl),%r0 ; prefetch td4 | ||
794 | _ror $acc11,24,$acc11 | ||
795 | xor $acc7,$s1,$s1 | ||
796 | ldw 1024+64($tbl),%r0 ; prefetch td4 | ||
797 | _ror $acc13,8,$acc13 | ||
798 | xor $acc8,$s2,$s2 | ||
799 | ldw 1024+96($tbl),%r0 ; prefetch td4 | ||
800 | _ror $acc14,16,$acc14 | ||
801 | xor $acc9,$s2,$s2 | ||
802 | ldw 1024+128($tbl),%r0 ; prefetch td4 | ||
803 | _ror $acc15,24,$acc15 | ||
804 | xor $acc10,$s2,$s2 | ||
805 | ldw 1024+160($tbl),%r0 ; prefetch td4 | ||
806 | _srm $s0,24,$acc0 | ||
807 | xor $acc11,$s2,$s2 | ||
808 | ldw 1024+192($tbl),%r0 ; prefetch td4 | ||
809 | xor $acc12,$acc14,$acc14 | ||
810 | xor $acc13,$s3,$s3 | ||
811 | ldw 1024+224($tbl),%r0 ; prefetch td4 | ||
812 | xor $acc14,$s3,$s3 | ||
813 | xor $acc15,$s3,$s3 | ||
814 | b L\$dec_loop | ||
815 | _srm $s3,16,$acc1 | ||
816 | |||
817 | .ALIGN 16 | ||
818 | L\$dec_last | ||
819 | ldo 1024($tbl),$rounds | ||
820 | _ror $acc1,8,$acc1 | ||
821 | xor $acc0,$t0,$t0 | ||
822 | ldw 0($key),$s0 | ||
823 | _ror $acc2,16,$acc2 | ||
824 | xor $acc1,$t0,$t0 | ||
825 | ldw 4($key),$s1 | ||
826 | _ror $acc3,24,$acc3 | ||
827 | xor $acc2,$t0,$t0 | ||
828 | ldw 8($key),$s2 | ||
829 | _ror $acc5,8,$acc5 | ||
830 | xor $acc3,$t0,$t0 | ||
831 | ldw 12($key),$s3 | ||
832 | _ror $acc6,16,$acc6 | ||
833 | xor $acc4,$t1,$t1 | ||
834 | _ror $acc7,24,$acc7 | ||
835 | xor $acc5,$t1,$t1 | ||
836 | _ror $acc9,8,$acc9 | ||
837 | xor $acc6,$t1,$t1 | ||
838 | _ror $acc10,16,$acc10 | ||
839 | xor $acc7,$t1,$t1 | ||
840 | _ror $acc11,24,$acc11 | ||
841 | xor $acc8,$t2,$t2 | ||
842 | _ror $acc13,8,$acc13 | ||
843 | xor $acc9,$t2,$t2 | ||
844 | _ror $acc14,16,$acc14 | ||
845 | xor $acc10,$t2,$t2 | ||
846 | _ror $acc15,24,$acc15 | ||
847 | xor $acc11,$t2,$t2 | ||
848 | xor $acc12,$acc14,$acc14 | ||
849 | xor $acc13,$t3,$t3 | ||
850 | _srm $t0,24,$acc0 | ||
851 | xor $acc14,$t3,$t3 | ||
852 | xor $acc15,$t3,$t3 | ||
853 | _srm $t3,16,$acc1 | ||
854 | |||
855 | _srm $t2,8,$acc2 | ||
856 | ldbx $acc0($rounds),$acc0 | ||
857 | _srm $t1,24,$acc4 | ||
858 | ldbx $acc1($rounds),$acc1 | ||
859 | _srm $t0,16,$acc5 | ||
860 | _srm $t1,0,$acc3 | ||
861 | ldbx $acc2($rounds),$acc2 | ||
862 | ldbx $acc3($rounds),$acc3 | ||
863 | _srm $t3,8,$acc6 | ||
864 | ldbx $acc4($rounds),$acc4 | ||
865 | _srm $t2,24,$acc8 | ||
866 | ldbx $acc5($rounds),$acc5 | ||
867 | _srm $t1,16,$acc9 | ||
868 | _srm $t2,0,$acc7 | ||
869 | ldbx $acc6($rounds),$acc6 | ||
870 | ldbx $acc7($rounds),$acc7 | ||
871 | _srm $t0,8,$acc10 | ||
872 | ldbx $acc8($rounds),$acc8 | ||
873 | _srm $t3,24,$acc12 | ||
874 | ldbx $acc9($rounds),$acc9 | ||
875 | _srm $t2,16,$acc13 | ||
876 | _srm $t3,0,$acc11 | ||
877 | ldbx $acc10($rounds),$acc10 | ||
878 | _srm $t1,8,$acc14 | ||
879 | ldbx $acc11($rounds),$acc11 | ||
880 | ldbx $acc12($rounds),$acc12 | ||
881 | ldbx $acc13($rounds),$acc13 | ||
882 | _srm $t0,0,$acc15 | ||
883 | ldbx $acc14($rounds),$acc14 | ||
884 | |||
885 | dep $acc0,7,8,$acc3 | ||
886 | ldbx $acc15($rounds),$acc15 | ||
887 | dep $acc4,7,8,$acc7 | ||
888 | dep $acc1,15,8,$acc3 | ||
889 | dep $acc5,15,8,$acc7 | ||
890 | dep $acc2,23,8,$acc3 | ||
891 | dep $acc6,23,8,$acc7 | ||
892 | xor $acc3,$s0,$s0 | ||
893 | xor $acc7,$s1,$s1 | ||
894 | dep $acc8,7,8,$acc11 | ||
895 | dep $acc12,7,8,$acc15 | ||
896 | dep $acc9,15,8,$acc11 | ||
897 | dep $acc13,15,8,$acc15 | ||
898 | dep $acc10,23,8,$acc11 | ||
899 | dep $acc14,23,8,$acc15 | ||
900 | xor $acc11,$s2,$s2 | ||
901 | |||
902 | bv (%r31) | ||
903 | .EXIT | ||
904 | xor $acc15,$s3,$s3 | ||
905 | .PROCEND | ||
906 | |||
907 | .ALIGN 64 | ||
908 | L\$AES_Td | ||
909 | .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 | ||
910 | .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 | ||
911 | .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 | ||
912 | .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f | ||
913 | .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 | ||
914 | .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 | ||
915 | .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da | ||
916 | .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 | ||
917 | .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd | ||
918 | .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 | ||
919 | .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 | ||
920 | .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 | ||
921 | .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 | ||
922 | .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a | ||
923 | .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 | ||
924 | .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c | ||
925 | .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 | ||
926 | .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a | ||
927 | .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 | ||
928 | .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 | ||
929 | .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 | ||
930 | .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff | ||
931 | .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 | ||
932 | .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb | ||
933 | .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 | ||
934 | .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e | ||
935 | .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 | ||
936 | .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a | ||
937 | .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e | ||
938 | .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 | ||
939 | .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d | ||
940 | .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 | ||
941 | .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd | ||
942 | .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 | ||
943 | .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 | ||
944 | .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 | ||
945 | .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d | ||
946 | .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 | ||
947 | .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 | ||
948 | .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef | ||
949 | .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 | ||
950 | .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 | ||
951 | .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 | ||
952 | .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 | ||
953 | .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 | ||
954 | .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b | ||
955 | .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 | ||
956 | .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 | ||
957 | .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 | ||
958 | .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 | ||
959 | .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 | ||
960 | .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f | ||
961 | .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df | ||
962 | .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f | ||
963 | .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e | ||
964 | .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 | ||
965 | .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 | ||
966 | .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c | ||
967 | .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf | ||
968 | .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 | ||
969 | .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f | ||
970 | .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 | ||
971 | .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 | ||
972 | .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 | ||
973 | .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | ||
974 | .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
975 | .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
976 | .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
977 | .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
978 | .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
979 | .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
980 | .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
981 | .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
982 | .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
983 | .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
984 | .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
985 | .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
986 | .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
987 | .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
988 | .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
989 | .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
990 | .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
991 | .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
992 | .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
993 | .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
994 | .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
995 | .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
996 | .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
997 | .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
998 | .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
999 | .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
1000 | .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
1001 | .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
1002 | .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
1003 | .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
1004 | .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
1005 | .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
1006 | ___ | ||
1007 | |||
1008 | foreach (split("\n",$code)) { | ||
1009 | s/\`([^\`]*)\`/eval $1/ge; | ||
1010 | |||
1011 | # translate made up instructons: _ror, _srm | ||
1012 | s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or | ||
1013 | |||
1014 | s/_srm(\s+%r[0-9]+),([0-9]+),/ | ||
1015 | $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2) | ||
1016 | : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e; | ||
1017 | |||
1018 | s/,\*/,/ if ($SIZE_T==4); | ||
1019 | print $_,"\n"; | ||
1020 | } | ||
1021 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl index f82c5e1814..7c52cbe5f9 100644 --- a/src/lib/libcrypto/aes/asm/aes-ppc.pl +++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl | |||
@@ -7,7 +7,7 @@ | |||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | 7 | # details see http://www.openssl.org/~appro/cryptogams/. |
8 | # ==================================================================== | 8 | # ==================================================================== |
9 | 9 | ||
10 | # Needs more work: key setup, page boundaries, CBC routine... | 10 | # Needs more work: key setup, CBC routine... |
11 | # | 11 | # |
12 | # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with | 12 | # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with |
13 | # 128-bit key, which is ~40% better than 64-bit code generated by gcc | 13 | # 128-bit key, which is ~40% better than 64-bit code generated by gcc |
@@ -18,7 +18,7 @@ | |||
18 | 18 | ||
19 | # February 2010 | 19 | # February 2010 |
20 | # | 20 | # |
21 | # Rescheduling instructions to favour Power6 pipeline gives 10% | 21 | # Rescheduling instructions to favour Power6 pipeline gave 10% |
22 | # performance improvement on the platfrom in question (and marginal | 22 | # performance improvement on the platfrom in question (and marginal |
23 | # improvement even on others). It should be noted that Power6 fails | 23 | # improvement even on others). It should be noted that Power6 fails |
24 | # to process byte in 18 cycles, only in 23, because it fails to issue | 24 | # to process byte in 18 cycles, only in 23, because it fails to issue |
@@ -33,11 +33,13 @@ $flavour = shift; | |||
33 | 33 | ||
34 | if ($flavour =~ /64/) { | 34 | if ($flavour =~ /64/) { |
35 | $SIZE_T =8; | 35 | $SIZE_T =8; |
36 | $LRSAVE =2*$SIZE_T; | ||
36 | $STU ="stdu"; | 37 | $STU ="stdu"; |
37 | $POP ="ld"; | 38 | $POP ="ld"; |
38 | $PUSH ="std"; | 39 | $PUSH ="std"; |
39 | } elsif ($flavour =~ /32/) { | 40 | } elsif ($flavour =~ /32/) { |
40 | $SIZE_T =4; | 41 | $SIZE_T =4; |
42 | $LRSAVE =$SIZE_T; | ||
41 | $STU ="stwu"; | 43 | $STU ="stwu"; |
42 | $POP ="lwz"; | 44 | $POP ="lwz"; |
43 | $PUSH ="stw"; | 45 | $PUSH ="stw"; |
@@ -116,15 +118,19 @@ LAES_Te: | |||
116 | addi $Tbl0,$Tbl0,`128-8` | 118 | addi $Tbl0,$Tbl0,`128-8` |
117 | mtlr r0 | 119 | mtlr r0 |
118 | blr | 120 | blr |
119 | .space `32-24` | 121 | .long 0 |
122 | .byte 0,12,0x14,0,0,0,0,0 | ||
123 | .space `64-9*4` | ||
120 | LAES_Td: | 124 | LAES_Td: |
121 | mflr r0 | 125 | mflr r0 |
122 | bcl 20,31,\$+4 | 126 | bcl 20,31,\$+4 |
123 | mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry | 127 | mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry |
124 | addi $Tbl0,$Tbl0,`128-8-32+2048+256` | 128 | addi $Tbl0,$Tbl0,`128-64-8+2048+256` |
125 | mtlr r0 | 129 | mtlr r0 |
126 | blr | 130 | blr |
127 | .space `128-32-24` | 131 | .long 0 |
132 | .byte 0,12,0x14,0,0,0,0,0 | ||
133 | .space `128-64-9*4` | ||
128 | ___ | 134 | ___ |
129 | &_data_word( | 135 | &_data_word( |
130 | 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, | 136 | 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, |
@@ -328,10 +334,9 @@ $code.=<<___; | |||
328 | .globl .AES_encrypt | 334 | .globl .AES_encrypt |
329 | .align 7 | 335 | .align 7 |
330 | .AES_encrypt: | 336 | .AES_encrypt: |
331 | mflr r0 | ||
332 | $STU $sp,-$FRAME($sp) | 337 | $STU $sp,-$FRAME($sp) |
338 | mflr r0 | ||
333 | 339 | ||
334 | $PUSH r0,`$FRAME-$SIZE_T*21`($sp) | ||
335 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | 340 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) |
336 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | 341 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) |
337 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | 342 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
@@ -352,7 +357,14 @@ $code.=<<___; | |||
352 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 357 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
353 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 358 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
354 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 359 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
360 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
361 | |||
362 | andi. $t0,$inp,3 | ||
363 | andi. $t1,$out,3 | ||
364 | or. $t0,$t0,$t1 | ||
365 | bne Lenc_unaligned | ||
355 | 366 | ||
367 | Lenc_unaligned_ok: | ||
356 | lwz $s0,0($inp) | 368 | lwz $s0,0($inp) |
357 | lwz $s1,4($inp) | 369 | lwz $s1,4($inp) |
358 | lwz $s2,8($inp) | 370 | lwz $s2,8($inp) |
@@ -363,8 +375,80 @@ $code.=<<___; | |||
363 | stw $s1,4($out) | 375 | stw $s1,4($out) |
364 | stw $s2,8($out) | 376 | stw $s2,8($out) |
365 | stw $s3,12($out) | 377 | stw $s3,12($out) |
378 | b Lenc_done | ||
379 | |||
380 | Lenc_unaligned: | ||
381 | subfic $t0,$inp,4096 | ||
382 | subfic $t1,$out,4096 | ||
383 | andi. $t0,$t0,4096-16 | ||
384 | beq Lenc_xpage | ||
385 | andi. $t1,$t1,4096-16 | ||
386 | bne Lenc_unaligned_ok | ||
387 | |||
388 | Lenc_xpage: | ||
389 | lbz $acc00,0($inp) | ||
390 | lbz $acc01,1($inp) | ||
391 | lbz $acc02,2($inp) | ||
392 | lbz $s0,3($inp) | ||
393 | lbz $acc04,4($inp) | ||
394 | lbz $acc05,5($inp) | ||
395 | lbz $acc06,6($inp) | ||
396 | lbz $s1,7($inp) | ||
397 | lbz $acc08,8($inp) | ||
398 | lbz $acc09,9($inp) | ||
399 | lbz $acc10,10($inp) | ||
400 | insrwi $s0,$acc00,8,0 | ||
401 | lbz $s2,11($inp) | ||
402 | insrwi $s1,$acc04,8,0 | ||
403 | lbz $acc12,12($inp) | ||
404 | insrwi $s0,$acc01,8,8 | ||
405 | lbz $acc13,13($inp) | ||
406 | insrwi $s1,$acc05,8,8 | ||
407 | lbz $acc14,14($inp) | ||
408 | insrwi $s0,$acc02,8,16 | ||
409 | lbz $s3,15($inp) | ||
410 | insrwi $s1,$acc06,8,16 | ||
411 | insrwi $s2,$acc08,8,0 | ||
412 | insrwi $s3,$acc12,8,0 | ||
413 | insrwi $s2,$acc09,8,8 | ||
414 | insrwi $s3,$acc13,8,8 | ||
415 | insrwi $s2,$acc10,8,16 | ||
416 | insrwi $s3,$acc14,8,16 | ||
417 | |||
418 | bl LAES_Te | ||
419 | bl Lppc_AES_encrypt_compact | ||
420 | |||
421 | extrwi $acc00,$s0,8,0 | ||
422 | extrwi $acc01,$s0,8,8 | ||
423 | stb $acc00,0($out) | ||
424 | extrwi $acc02,$s0,8,16 | ||
425 | stb $acc01,1($out) | ||
426 | stb $acc02,2($out) | ||
427 | extrwi $acc04,$s1,8,0 | ||
428 | stb $s0,3($out) | ||
429 | extrwi $acc05,$s1,8,8 | ||
430 | stb $acc04,4($out) | ||
431 | extrwi $acc06,$s1,8,16 | ||
432 | stb $acc05,5($out) | ||
433 | stb $acc06,6($out) | ||
434 | extrwi $acc08,$s2,8,0 | ||
435 | stb $s1,7($out) | ||
436 | extrwi $acc09,$s2,8,8 | ||
437 | stb $acc08,8($out) | ||
438 | extrwi $acc10,$s2,8,16 | ||
439 | stb $acc09,9($out) | ||
440 | stb $acc10,10($out) | ||
441 | extrwi $acc12,$s3,8,0 | ||
442 | stb $s2,11($out) | ||
443 | extrwi $acc13,$s3,8,8 | ||
444 | stb $acc12,12($out) | ||
445 | extrwi $acc14,$s3,8,16 | ||
446 | stb $acc13,13($out) | ||
447 | stb $acc14,14($out) | ||
448 | stb $s3,15($out) | ||
366 | 449 | ||
367 | $POP r0,`$FRAME-$SIZE_T*21`($sp) | 450 | Lenc_done: |
451 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
368 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | 452 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) |
369 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | 453 | $POP r13,`$FRAME-$SIZE_T*19`($sp) |
370 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | 454 | $POP r14,`$FRAME-$SIZE_T*18`($sp) |
@@ -388,18 +472,21 @@ $code.=<<___; | |||
388 | mtlr r0 | 472 | mtlr r0 |
389 | addi $sp,$sp,$FRAME | 473 | addi $sp,$sp,$FRAME |
390 | blr | 474 | blr |
475 | .long 0 | ||
476 | .byte 0,12,4,1,0x80,18,3,0 | ||
477 | .long 0 | ||
391 | 478 | ||
392 | .align 5 | 479 | .align 5 |
393 | Lppc_AES_encrypt: | 480 | Lppc_AES_encrypt: |
394 | lwz $acc00,240($key) | 481 | lwz $acc00,240($key) |
395 | lwz $t0,0($key) | ||
396 | lwz $t1,4($key) | ||
397 | lwz $t2,8($key) | ||
398 | lwz $t3,12($key) | ||
399 | addi $Tbl1,$Tbl0,3 | 482 | addi $Tbl1,$Tbl0,3 |
483 | lwz $t0,0($key) | ||
400 | addi $Tbl2,$Tbl0,2 | 484 | addi $Tbl2,$Tbl0,2 |
485 | lwz $t1,4($key) | ||
401 | addi $Tbl3,$Tbl0,1 | 486 | addi $Tbl3,$Tbl0,1 |
487 | lwz $t2,8($key) | ||
402 | addi $acc00,$acc00,-1 | 488 | addi $acc00,$acc00,-1 |
489 | lwz $t3,12($key) | ||
403 | addi $key,$key,16 | 490 | addi $key,$key,16 |
404 | xor $s0,$s0,$t0 | 491 | xor $s0,$s0,$t0 |
405 | xor $s1,$s1,$t1 | 492 | xor $s1,$s1,$t1 |
@@ -413,44 +500,44 @@ Lenc_loop: | |||
413 | rlwinm $acc02,$s2,`32-24+3`,21,28 | 500 | rlwinm $acc02,$s2,`32-24+3`,21,28 |
414 | rlwinm $acc03,$s3,`32-24+3`,21,28 | 501 | rlwinm $acc03,$s3,`32-24+3`,21,28 |
415 | lwz $t0,0($key) | 502 | lwz $t0,0($key) |
416 | lwz $t1,4($key) | ||
417 | rlwinm $acc04,$s1,`32-16+3`,21,28 | 503 | rlwinm $acc04,$s1,`32-16+3`,21,28 |
504 | lwz $t1,4($key) | ||
418 | rlwinm $acc05,$s2,`32-16+3`,21,28 | 505 | rlwinm $acc05,$s2,`32-16+3`,21,28 |
419 | lwz $t2,8($key) | 506 | lwz $t2,8($key) |
420 | lwz $t3,12($key) | ||
421 | rlwinm $acc06,$s3,`32-16+3`,21,28 | 507 | rlwinm $acc06,$s3,`32-16+3`,21,28 |
508 | lwz $t3,12($key) | ||
422 | rlwinm $acc07,$s0,`32-16+3`,21,28 | 509 | rlwinm $acc07,$s0,`32-16+3`,21,28 |
423 | lwzx $acc00,$Tbl0,$acc00 | 510 | lwzx $acc00,$Tbl0,$acc00 |
424 | lwzx $acc01,$Tbl0,$acc01 | ||
425 | rlwinm $acc08,$s2,`32-8+3`,21,28 | 511 | rlwinm $acc08,$s2,`32-8+3`,21,28 |
512 | lwzx $acc01,$Tbl0,$acc01 | ||
426 | rlwinm $acc09,$s3,`32-8+3`,21,28 | 513 | rlwinm $acc09,$s3,`32-8+3`,21,28 |
427 | lwzx $acc02,$Tbl0,$acc02 | 514 | lwzx $acc02,$Tbl0,$acc02 |
428 | lwzx $acc03,$Tbl0,$acc03 | ||
429 | rlwinm $acc10,$s0,`32-8+3`,21,28 | 515 | rlwinm $acc10,$s0,`32-8+3`,21,28 |
516 | lwzx $acc03,$Tbl0,$acc03 | ||
430 | rlwinm $acc11,$s1,`32-8+3`,21,28 | 517 | rlwinm $acc11,$s1,`32-8+3`,21,28 |
431 | lwzx $acc04,$Tbl1,$acc04 | 518 | lwzx $acc04,$Tbl1,$acc04 |
432 | lwzx $acc05,$Tbl1,$acc05 | ||
433 | rlwinm $acc12,$s3,`0+3`,21,28 | 519 | rlwinm $acc12,$s3,`0+3`,21,28 |
520 | lwzx $acc05,$Tbl1,$acc05 | ||
434 | rlwinm $acc13,$s0,`0+3`,21,28 | 521 | rlwinm $acc13,$s0,`0+3`,21,28 |
435 | lwzx $acc06,$Tbl1,$acc06 | 522 | lwzx $acc06,$Tbl1,$acc06 |
436 | lwzx $acc07,$Tbl1,$acc07 | ||
437 | rlwinm $acc14,$s1,`0+3`,21,28 | 523 | rlwinm $acc14,$s1,`0+3`,21,28 |
524 | lwzx $acc07,$Tbl1,$acc07 | ||
438 | rlwinm $acc15,$s2,`0+3`,21,28 | 525 | rlwinm $acc15,$s2,`0+3`,21,28 |
439 | lwzx $acc08,$Tbl2,$acc08 | 526 | lwzx $acc08,$Tbl2,$acc08 |
440 | lwzx $acc09,$Tbl2,$acc09 | ||
441 | xor $t0,$t0,$acc00 | 527 | xor $t0,$t0,$acc00 |
528 | lwzx $acc09,$Tbl2,$acc09 | ||
442 | xor $t1,$t1,$acc01 | 529 | xor $t1,$t1,$acc01 |
443 | lwzx $acc10,$Tbl2,$acc10 | 530 | lwzx $acc10,$Tbl2,$acc10 |
444 | lwzx $acc11,$Tbl2,$acc11 | ||
445 | xor $t2,$t2,$acc02 | 531 | xor $t2,$t2,$acc02 |
532 | lwzx $acc11,$Tbl2,$acc11 | ||
446 | xor $t3,$t3,$acc03 | 533 | xor $t3,$t3,$acc03 |
447 | lwzx $acc12,$Tbl3,$acc12 | 534 | lwzx $acc12,$Tbl3,$acc12 |
448 | lwzx $acc13,$Tbl3,$acc13 | ||
449 | xor $t0,$t0,$acc04 | 535 | xor $t0,$t0,$acc04 |
536 | lwzx $acc13,$Tbl3,$acc13 | ||
450 | xor $t1,$t1,$acc05 | 537 | xor $t1,$t1,$acc05 |
451 | lwzx $acc14,$Tbl3,$acc14 | 538 | lwzx $acc14,$Tbl3,$acc14 |
452 | lwzx $acc15,$Tbl3,$acc15 | ||
453 | xor $t2,$t2,$acc06 | 539 | xor $t2,$t2,$acc06 |
540 | lwzx $acc15,$Tbl3,$acc15 | ||
454 | xor $t3,$t3,$acc07 | 541 | xor $t3,$t3,$acc07 |
455 | xor $t0,$t0,$acc08 | 542 | xor $t0,$t0,$acc08 |
456 | xor $t1,$t1,$acc09 | 543 | xor $t1,$t1,$acc09 |
@@ -466,60 +553,60 @@ Lenc_loop: | |||
466 | addi $Tbl2,$Tbl0,2048 | 553 | addi $Tbl2,$Tbl0,2048 |
467 | nop | 554 | nop |
468 | lwz $t0,0($key) | 555 | lwz $t0,0($key) |
469 | lwz $t1,4($key) | ||
470 | rlwinm $acc00,$s0,`32-24`,24,31 | 556 | rlwinm $acc00,$s0,`32-24`,24,31 |
557 | lwz $t1,4($key) | ||
471 | rlwinm $acc01,$s1,`32-24`,24,31 | 558 | rlwinm $acc01,$s1,`32-24`,24,31 |
472 | lwz $t2,8($key) | 559 | lwz $t2,8($key) |
473 | lwz $t3,12($key) | ||
474 | rlwinm $acc02,$s2,`32-24`,24,31 | 560 | rlwinm $acc02,$s2,`32-24`,24,31 |
561 | lwz $t3,12($key) | ||
475 | rlwinm $acc03,$s3,`32-24`,24,31 | 562 | rlwinm $acc03,$s3,`32-24`,24,31 |
476 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 | 563 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 |
477 | lwz $acc09,`2048+32`($Tbl0) | ||
478 | rlwinm $acc04,$s1,`32-16`,24,31 | 564 | rlwinm $acc04,$s1,`32-16`,24,31 |
565 | lwz $acc09,`2048+32`($Tbl0) | ||
479 | rlwinm $acc05,$s2,`32-16`,24,31 | 566 | rlwinm $acc05,$s2,`32-16`,24,31 |
480 | lwz $acc10,`2048+64`($Tbl0) | 567 | lwz $acc10,`2048+64`($Tbl0) |
481 | lwz $acc11,`2048+96`($Tbl0) | ||
482 | rlwinm $acc06,$s3,`32-16`,24,31 | 568 | rlwinm $acc06,$s3,`32-16`,24,31 |
569 | lwz $acc11,`2048+96`($Tbl0) | ||
483 | rlwinm $acc07,$s0,`32-16`,24,31 | 570 | rlwinm $acc07,$s0,`32-16`,24,31 |
484 | lwz $acc12,`2048+128`($Tbl0) | 571 | lwz $acc12,`2048+128`($Tbl0) |
485 | lwz $acc13,`2048+160`($Tbl0) | ||
486 | rlwinm $acc08,$s2,`32-8`,24,31 | 572 | rlwinm $acc08,$s2,`32-8`,24,31 |
573 | lwz $acc13,`2048+160`($Tbl0) | ||
487 | rlwinm $acc09,$s3,`32-8`,24,31 | 574 | rlwinm $acc09,$s3,`32-8`,24,31 |
488 | lwz $acc14,`2048+192`($Tbl0) | 575 | lwz $acc14,`2048+192`($Tbl0) |
489 | lwz $acc15,`2048+224`($Tbl0) | ||
490 | rlwinm $acc10,$s0,`32-8`,24,31 | 576 | rlwinm $acc10,$s0,`32-8`,24,31 |
577 | lwz $acc15,`2048+224`($Tbl0) | ||
491 | rlwinm $acc11,$s1,`32-8`,24,31 | 578 | rlwinm $acc11,$s1,`32-8`,24,31 |
492 | lbzx $acc00,$Tbl2,$acc00 | 579 | lbzx $acc00,$Tbl2,$acc00 |
493 | lbzx $acc01,$Tbl2,$acc01 | ||
494 | rlwinm $acc12,$s3,`0`,24,31 | 580 | rlwinm $acc12,$s3,`0`,24,31 |
581 | lbzx $acc01,$Tbl2,$acc01 | ||
495 | rlwinm $acc13,$s0,`0`,24,31 | 582 | rlwinm $acc13,$s0,`0`,24,31 |
496 | lbzx $acc02,$Tbl2,$acc02 | 583 | lbzx $acc02,$Tbl2,$acc02 |
497 | lbzx $acc03,$Tbl2,$acc03 | ||
498 | rlwinm $acc14,$s1,`0`,24,31 | 584 | rlwinm $acc14,$s1,`0`,24,31 |
585 | lbzx $acc03,$Tbl2,$acc03 | ||
499 | rlwinm $acc15,$s2,`0`,24,31 | 586 | rlwinm $acc15,$s2,`0`,24,31 |
500 | lbzx $acc04,$Tbl2,$acc04 | 587 | lbzx $acc04,$Tbl2,$acc04 |
501 | lbzx $acc05,$Tbl2,$acc05 | ||
502 | rlwinm $s0,$acc00,24,0,7 | 588 | rlwinm $s0,$acc00,24,0,7 |
589 | lbzx $acc05,$Tbl2,$acc05 | ||
503 | rlwinm $s1,$acc01,24,0,7 | 590 | rlwinm $s1,$acc01,24,0,7 |
504 | lbzx $acc06,$Tbl2,$acc06 | 591 | lbzx $acc06,$Tbl2,$acc06 |
505 | lbzx $acc07,$Tbl2,$acc07 | ||
506 | rlwinm $s2,$acc02,24,0,7 | 592 | rlwinm $s2,$acc02,24,0,7 |
593 | lbzx $acc07,$Tbl2,$acc07 | ||
507 | rlwinm $s3,$acc03,24,0,7 | 594 | rlwinm $s3,$acc03,24,0,7 |
508 | lbzx $acc08,$Tbl2,$acc08 | 595 | lbzx $acc08,$Tbl2,$acc08 |
509 | lbzx $acc09,$Tbl2,$acc09 | ||
510 | rlwimi $s0,$acc04,16,8,15 | 596 | rlwimi $s0,$acc04,16,8,15 |
597 | lbzx $acc09,$Tbl2,$acc09 | ||
511 | rlwimi $s1,$acc05,16,8,15 | 598 | rlwimi $s1,$acc05,16,8,15 |
512 | lbzx $acc10,$Tbl2,$acc10 | 599 | lbzx $acc10,$Tbl2,$acc10 |
513 | lbzx $acc11,$Tbl2,$acc11 | ||
514 | rlwimi $s2,$acc06,16,8,15 | 600 | rlwimi $s2,$acc06,16,8,15 |
601 | lbzx $acc11,$Tbl2,$acc11 | ||
515 | rlwimi $s3,$acc07,16,8,15 | 602 | rlwimi $s3,$acc07,16,8,15 |
516 | lbzx $acc12,$Tbl2,$acc12 | 603 | lbzx $acc12,$Tbl2,$acc12 |
517 | lbzx $acc13,$Tbl2,$acc13 | ||
518 | rlwimi $s0,$acc08,8,16,23 | 604 | rlwimi $s0,$acc08,8,16,23 |
605 | lbzx $acc13,$Tbl2,$acc13 | ||
519 | rlwimi $s1,$acc09,8,16,23 | 606 | rlwimi $s1,$acc09,8,16,23 |
520 | lbzx $acc14,$Tbl2,$acc14 | 607 | lbzx $acc14,$Tbl2,$acc14 |
521 | lbzx $acc15,$Tbl2,$acc15 | ||
522 | rlwimi $s2,$acc10,8,16,23 | 608 | rlwimi $s2,$acc10,8,16,23 |
609 | lbzx $acc15,$Tbl2,$acc15 | ||
523 | rlwimi $s3,$acc11,8,16,23 | 610 | rlwimi $s3,$acc11,8,16,23 |
524 | or $s0,$s0,$acc12 | 611 | or $s0,$s0,$acc12 |
525 | or $s1,$s1,$acc13 | 612 | or $s1,$s1,$acc13 |
@@ -530,29 +617,31 @@ Lenc_loop: | |||
530 | xor $s2,$s2,$t2 | 617 | xor $s2,$s2,$t2 |
531 | xor $s3,$s3,$t3 | 618 | xor $s3,$s3,$t3 |
532 | blr | 619 | blr |
620 | .long 0 | ||
621 | .byte 0,12,0x14,0,0,0,0,0 | ||
533 | 622 | ||
534 | .align 4 | 623 | .align 4 |
535 | Lppc_AES_encrypt_compact: | 624 | Lppc_AES_encrypt_compact: |
536 | lwz $acc00,240($key) | 625 | lwz $acc00,240($key) |
537 | lwz $t0,0($key) | ||
538 | lwz $t1,4($key) | ||
539 | lwz $t2,8($key) | ||
540 | lwz $t3,12($key) | ||
541 | addi $Tbl1,$Tbl0,2048 | 626 | addi $Tbl1,$Tbl0,2048 |
627 | lwz $t0,0($key) | ||
542 | lis $mask80,0x8080 | 628 | lis $mask80,0x8080 |
629 | lwz $t1,4($key) | ||
543 | lis $mask1b,0x1b1b | 630 | lis $mask1b,0x1b1b |
544 | addi $key,$key,16 | 631 | lwz $t2,8($key) |
545 | ori $mask80,$mask80,0x8080 | 632 | ori $mask80,$mask80,0x8080 |
633 | lwz $t3,12($key) | ||
546 | ori $mask1b,$mask1b,0x1b1b | 634 | ori $mask1b,$mask1b,0x1b1b |
635 | addi $key,$key,16 | ||
547 | mtctr $acc00 | 636 | mtctr $acc00 |
548 | .align 4 | 637 | .align 4 |
549 | Lenc_compact_loop: | 638 | Lenc_compact_loop: |
550 | xor $s0,$s0,$t0 | 639 | xor $s0,$s0,$t0 |
551 | xor $s1,$s1,$t1 | 640 | xor $s1,$s1,$t1 |
552 | xor $s2,$s2,$t2 | ||
553 | xor $s3,$s3,$t3 | ||
554 | rlwinm $acc00,$s0,`32-24`,24,31 | 641 | rlwinm $acc00,$s0,`32-24`,24,31 |
642 | xor $s2,$s2,$t2 | ||
555 | rlwinm $acc01,$s1,`32-24`,24,31 | 643 | rlwinm $acc01,$s1,`32-24`,24,31 |
644 | xor $s3,$s3,$t3 | ||
556 | rlwinm $acc02,$s2,`32-24`,24,31 | 645 | rlwinm $acc02,$s2,`32-24`,24,31 |
557 | rlwinm $acc03,$s3,`32-24`,24,31 | 646 | rlwinm $acc03,$s3,`32-24`,24,31 |
558 | rlwinm $acc04,$s1,`32-16`,24,31 | 647 | rlwinm $acc04,$s1,`32-16`,24,31 |
@@ -560,48 +649,48 @@ Lenc_compact_loop: | |||
560 | rlwinm $acc06,$s3,`32-16`,24,31 | 649 | rlwinm $acc06,$s3,`32-16`,24,31 |
561 | rlwinm $acc07,$s0,`32-16`,24,31 | 650 | rlwinm $acc07,$s0,`32-16`,24,31 |
562 | lbzx $acc00,$Tbl1,$acc00 | 651 | lbzx $acc00,$Tbl1,$acc00 |
563 | lbzx $acc01,$Tbl1,$acc01 | ||
564 | rlwinm $acc08,$s2,`32-8`,24,31 | 652 | rlwinm $acc08,$s2,`32-8`,24,31 |
653 | lbzx $acc01,$Tbl1,$acc01 | ||
565 | rlwinm $acc09,$s3,`32-8`,24,31 | 654 | rlwinm $acc09,$s3,`32-8`,24,31 |
566 | lbzx $acc02,$Tbl1,$acc02 | 655 | lbzx $acc02,$Tbl1,$acc02 |
567 | lbzx $acc03,$Tbl1,$acc03 | ||
568 | rlwinm $acc10,$s0,`32-8`,24,31 | 656 | rlwinm $acc10,$s0,`32-8`,24,31 |
657 | lbzx $acc03,$Tbl1,$acc03 | ||
569 | rlwinm $acc11,$s1,`32-8`,24,31 | 658 | rlwinm $acc11,$s1,`32-8`,24,31 |
570 | lbzx $acc04,$Tbl1,$acc04 | 659 | lbzx $acc04,$Tbl1,$acc04 |
571 | lbzx $acc05,$Tbl1,$acc05 | ||
572 | rlwinm $acc12,$s3,`0`,24,31 | 660 | rlwinm $acc12,$s3,`0`,24,31 |
661 | lbzx $acc05,$Tbl1,$acc05 | ||
573 | rlwinm $acc13,$s0,`0`,24,31 | 662 | rlwinm $acc13,$s0,`0`,24,31 |
574 | lbzx $acc06,$Tbl1,$acc06 | 663 | lbzx $acc06,$Tbl1,$acc06 |
575 | lbzx $acc07,$Tbl1,$acc07 | ||
576 | rlwinm $acc14,$s1,`0`,24,31 | 664 | rlwinm $acc14,$s1,`0`,24,31 |
665 | lbzx $acc07,$Tbl1,$acc07 | ||
577 | rlwinm $acc15,$s2,`0`,24,31 | 666 | rlwinm $acc15,$s2,`0`,24,31 |
578 | lbzx $acc08,$Tbl1,$acc08 | 667 | lbzx $acc08,$Tbl1,$acc08 |
579 | lbzx $acc09,$Tbl1,$acc09 | ||
580 | rlwinm $s0,$acc00,24,0,7 | 668 | rlwinm $s0,$acc00,24,0,7 |
669 | lbzx $acc09,$Tbl1,$acc09 | ||
581 | rlwinm $s1,$acc01,24,0,7 | 670 | rlwinm $s1,$acc01,24,0,7 |
582 | lbzx $acc10,$Tbl1,$acc10 | 671 | lbzx $acc10,$Tbl1,$acc10 |
583 | lbzx $acc11,$Tbl1,$acc11 | ||
584 | rlwinm $s2,$acc02,24,0,7 | 672 | rlwinm $s2,$acc02,24,0,7 |
673 | lbzx $acc11,$Tbl1,$acc11 | ||
585 | rlwinm $s3,$acc03,24,0,7 | 674 | rlwinm $s3,$acc03,24,0,7 |
586 | lbzx $acc12,$Tbl1,$acc12 | 675 | lbzx $acc12,$Tbl1,$acc12 |
587 | lbzx $acc13,$Tbl1,$acc13 | ||
588 | rlwimi $s0,$acc04,16,8,15 | 676 | rlwimi $s0,$acc04,16,8,15 |
677 | lbzx $acc13,$Tbl1,$acc13 | ||
589 | rlwimi $s1,$acc05,16,8,15 | 678 | rlwimi $s1,$acc05,16,8,15 |
590 | lbzx $acc14,$Tbl1,$acc14 | 679 | lbzx $acc14,$Tbl1,$acc14 |
591 | lbzx $acc15,$Tbl1,$acc15 | ||
592 | rlwimi $s2,$acc06,16,8,15 | 680 | rlwimi $s2,$acc06,16,8,15 |
681 | lbzx $acc15,$Tbl1,$acc15 | ||
593 | rlwimi $s3,$acc07,16,8,15 | 682 | rlwimi $s3,$acc07,16,8,15 |
594 | rlwimi $s0,$acc08,8,16,23 | 683 | rlwimi $s0,$acc08,8,16,23 |
595 | rlwimi $s1,$acc09,8,16,23 | 684 | rlwimi $s1,$acc09,8,16,23 |
596 | rlwimi $s2,$acc10,8,16,23 | 685 | rlwimi $s2,$acc10,8,16,23 |
597 | rlwimi $s3,$acc11,8,16,23 | 686 | rlwimi $s3,$acc11,8,16,23 |
598 | lwz $t0,0($key) | 687 | lwz $t0,0($key) |
599 | lwz $t1,4($key) | ||
600 | or $s0,$s0,$acc12 | 688 | or $s0,$s0,$acc12 |
689 | lwz $t1,4($key) | ||
601 | or $s1,$s1,$acc13 | 690 | or $s1,$s1,$acc13 |
602 | lwz $t2,8($key) | 691 | lwz $t2,8($key) |
603 | lwz $t3,12($key) | ||
604 | or $s2,$s2,$acc14 | 692 | or $s2,$s2,$acc14 |
693 | lwz $t3,12($key) | ||
605 | or $s3,$s3,$acc15 | 694 | or $s3,$s3,$acc15 |
606 | 695 | ||
607 | addi $key,$key,16 | 696 | addi $key,$key,16 |
@@ -612,12 +701,12 @@ Lenc_compact_loop: | |||
612 | and $acc02,$s2,$mask80 | 701 | and $acc02,$s2,$mask80 |
613 | and $acc03,$s3,$mask80 | 702 | and $acc03,$s3,$mask80 |
614 | srwi $acc04,$acc00,7 # r1>>7 | 703 | srwi $acc04,$acc00,7 # r1>>7 |
615 | srwi $acc05,$acc01,7 | ||
616 | srwi $acc06,$acc02,7 | ||
617 | srwi $acc07,$acc03,7 | ||
618 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f | 704 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f |
705 | srwi $acc05,$acc01,7 | ||
619 | andc $acc09,$s1,$mask80 | 706 | andc $acc09,$s1,$mask80 |
707 | srwi $acc06,$acc02,7 | ||
620 | andc $acc10,$s2,$mask80 | 708 | andc $acc10,$s2,$mask80 |
709 | srwi $acc07,$acc03,7 | ||
621 | andc $acc11,$s3,$mask80 | 710 | andc $acc11,$s3,$mask80 |
622 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) | 711 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) |
623 | sub $acc01,$acc01,$acc05 | 712 | sub $acc01,$acc01,$acc05 |
@@ -633,32 +722,32 @@ Lenc_compact_loop: | |||
633 | and $acc03,$acc03,$mask1b | 722 | and $acc03,$acc03,$mask1b |
634 | xor $acc00,$acc00,$acc08 # r2 | 723 | xor $acc00,$acc00,$acc08 # r2 |
635 | xor $acc01,$acc01,$acc09 | 724 | xor $acc01,$acc01,$acc09 |
725 | rotlwi $acc12,$s0,16 # ROTATE(r0,16) | ||
636 | xor $acc02,$acc02,$acc10 | 726 | xor $acc02,$acc02,$acc10 |
727 | rotlwi $acc13,$s1,16 | ||
637 | xor $acc03,$acc03,$acc11 | 728 | xor $acc03,$acc03,$acc11 |
729 | rotlwi $acc14,$s2,16 | ||
638 | 730 | ||
639 | rotlwi $acc12,$s0,16 # ROTATE(r0,16) | ||
640 | rotlwi $acc13,$s1,16 | ||
641 | rotlwi $acc14,$s2,16 | ||
642 | rotlwi $acc15,$s3,16 | ||
643 | xor $s0,$s0,$acc00 # r0^r2 | 731 | xor $s0,$s0,$acc00 # r0^r2 |
732 | rotlwi $acc15,$s3,16 | ||
644 | xor $s1,$s1,$acc01 | 733 | xor $s1,$s1,$acc01 |
645 | xor $s2,$s2,$acc02 | ||
646 | xor $s3,$s3,$acc03 | ||
647 | rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) | 734 | rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) |
735 | xor $s2,$s2,$acc02 | ||
648 | rotrwi $s1,$s1,24 | 736 | rotrwi $s1,$s1,24 |
737 | xor $s3,$s3,$acc03 | ||
649 | rotrwi $s2,$s2,24 | 738 | rotrwi $s2,$s2,24 |
650 | rotrwi $s3,$s3,24 | ||
651 | xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 | 739 | xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 |
740 | rotrwi $s3,$s3,24 | ||
652 | xor $s1,$s1,$acc01 | 741 | xor $s1,$s1,$acc01 |
653 | xor $s2,$s2,$acc02 | 742 | xor $s2,$s2,$acc02 |
654 | xor $s3,$s3,$acc03 | 743 | xor $s3,$s3,$acc03 |
655 | rotlwi $acc08,$acc12,8 # ROTATE(r0,24) | 744 | rotlwi $acc08,$acc12,8 # ROTATE(r0,24) |
656 | rotlwi $acc09,$acc13,8 | ||
657 | rotlwi $acc10,$acc14,8 | ||
658 | rotlwi $acc11,$acc15,8 | ||
659 | xor $s0,$s0,$acc12 # | 745 | xor $s0,$s0,$acc12 # |
746 | rotlwi $acc09,$acc13,8 | ||
660 | xor $s1,$s1,$acc13 | 747 | xor $s1,$s1,$acc13 |
748 | rotlwi $acc10,$acc14,8 | ||
661 | xor $s2,$s2,$acc14 | 749 | xor $s2,$s2,$acc14 |
750 | rotlwi $acc11,$acc15,8 | ||
662 | xor $s3,$s3,$acc15 | 751 | xor $s3,$s3,$acc15 |
663 | xor $s0,$s0,$acc08 # | 752 | xor $s0,$s0,$acc08 # |
664 | xor $s1,$s1,$acc09 | 753 | xor $s1,$s1,$acc09 |
@@ -673,14 +762,15 @@ Lenc_compact_done: | |||
673 | xor $s2,$s2,$t2 | 762 | xor $s2,$s2,$t2 |
674 | xor $s3,$s3,$t3 | 763 | xor $s3,$s3,$t3 |
675 | blr | 764 | blr |
765 | .long 0 | ||
766 | .byte 0,12,0x14,0,0,0,0,0 | ||
676 | 767 | ||
677 | .globl .AES_decrypt | 768 | .globl .AES_decrypt |
678 | .align 7 | 769 | .align 7 |
679 | .AES_decrypt: | 770 | .AES_decrypt: |
680 | mflr r0 | ||
681 | $STU $sp,-$FRAME($sp) | 771 | $STU $sp,-$FRAME($sp) |
772 | mflr r0 | ||
682 | 773 | ||
683 | $PUSH r0,`$FRAME-$SIZE_T*21`($sp) | ||
684 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | 774 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) |
685 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | 775 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) |
686 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | 776 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
@@ -701,7 +791,14 @@ Lenc_compact_done: | |||
701 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 791 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
702 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 792 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
703 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 793 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
794 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
704 | 795 | ||
796 | andi. $t0,$inp,3 | ||
797 | andi. $t1,$out,3 | ||
798 | or. $t0,$t0,$t1 | ||
799 | bne Ldec_unaligned | ||
800 | |||
801 | Ldec_unaligned_ok: | ||
705 | lwz $s0,0($inp) | 802 | lwz $s0,0($inp) |
706 | lwz $s1,4($inp) | 803 | lwz $s1,4($inp) |
707 | lwz $s2,8($inp) | 804 | lwz $s2,8($inp) |
@@ -712,8 +809,80 @@ Lenc_compact_done: | |||
712 | stw $s1,4($out) | 809 | stw $s1,4($out) |
713 | stw $s2,8($out) | 810 | stw $s2,8($out) |
714 | stw $s3,12($out) | 811 | stw $s3,12($out) |
812 | b Ldec_done | ||
813 | |||
814 | Ldec_unaligned: | ||
815 | subfic $t0,$inp,4096 | ||
816 | subfic $t1,$out,4096 | ||
817 | andi. $t0,$t0,4096-16 | ||
818 | beq Ldec_xpage | ||
819 | andi. $t1,$t1,4096-16 | ||
820 | bne Ldec_unaligned_ok | ||
821 | |||
822 | Ldec_xpage: | ||
823 | lbz $acc00,0($inp) | ||
824 | lbz $acc01,1($inp) | ||
825 | lbz $acc02,2($inp) | ||
826 | lbz $s0,3($inp) | ||
827 | lbz $acc04,4($inp) | ||
828 | lbz $acc05,5($inp) | ||
829 | lbz $acc06,6($inp) | ||
830 | lbz $s1,7($inp) | ||
831 | lbz $acc08,8($inp) | ||
832 | lbz $acc09,9($inp) | ||
833 | lbz $acc10,10($inp) | ||
834 | insrwi $s0,$acc00,8,0 | ||
835 | lbz $s2,11($inp) | ||
836 | insrwi $s1,$acc04,8,0 | ||
837 | lbz $acc12,12($inp) | ||
838 | insrwi $s0,$acc01,8,8 | ||
839 | lbz $acc13,13($inp) | ||
840 | insrwi $s1,$acc05,8,8 | ||
841 | lbz $acc14,14($inp) | ||
842 | insrwi $s0,$acc02,8,16 | ||
843 | lbz $s3,15($inp) | ||
844 | insrwi $s1,$acc06,8,16 | ||
845 | insrwi $s2,$acc08,8,0 | ||
846 | insrwi $s3,$acc12,8,0 | ||
847 | insrwi $s2,$acc09,8,8 | ||
848 | insrwi $s3,$acc13,8,8 | ||
849 | insrwi $s2,$acc10,8,16 | ||
850 | insrwi $s3,$acc14,8,16 | ||
851 | |||
852 | bl LAES_Td | ||
853 | bl Lppc_AES_decrypt_compact | ||
715 | 854 | ||
716 | $POP r0,`$FRAME-$SIZE_T*21`($sp) | 855 | extrwi $acc00,$s0,8,0 |
856 | extrwi $acc01,$s0,8,8 | ||
857 | stb $acc00,0($out) | ||
858 | extrwi $acc02,$s0,8,16 | ||
859 | stb $acc01,1($out) | ||
860 | stb $acc02,2($out) | ||
861 | extrwi $acc04,$s1,8,0 | ||
862 | stb $s0,3($out) | ||
863 | extrwi $acc05,$s1,8,8 | ||
864 | stb $acc04,4($out) | ||
865 | extrwi $acc06,$s1,8,16 | ||
866 | stb $acc05,5($out) | ||
867 | stb $acc06,6($out) | ||
868 | extrwi $acc08,$s2,8,0 | ||
869 | stb $s1,7($out) | ||
870 | extrwi $acc09,$s2,8,8 | ||
871 | stb $acc08,8($out) | ||
872 | extrwi $acc10,$s2,8,16 | ||
873 | stb $acc09,9($out) | ||
874 | stb $acc10,10($out) | ||
875 | extrwi $acc12,$s3,8,0 | ||
876 | stb $s2,11($out) | ||
877 | extrwi $acc13,$s3,8,8 | ||
878 | stb $acc12,12($out) | ||
879 | extrwi $acc14,$s3,8,16 | ||
880 | stb $acc13,13($out) | ||
881 | stb $acc14,14($out) | ||
882 | stb $s3,15($out) | ||
883 | |||
884 | Ldec_done: | ||
885 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
717 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | 886 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) |
718 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | 887 | $POP r13,`$FRAME-$SIZE_T*19`($sp) |
719 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | 888 | $POP r14,`$FRAME-$SIZE_T*18`($sp) |
@@ -737,18 +906,21 @@ Lenc_compact_done: | |||
737 | mtlr r0 | 906 | mtlr r0 |
738 | addi $sp,$sp,$FRAME | 907 | addi $sp,$sp,$FRAME |
739 | blr | 908 | blr |
909 | .long 0 | ||
910 | .byte 0,12,4,1,0x80,18,3,0 | ||
911 | .long 0 | ||
740 | 912 | ||
741 | .align 5 | 913 | .align 5 |
742 | Lppc_AES_decrypt: | 914 | Lppc_AES_decrypt: |
743 | lwz $acc00,240($key) | 915 | lwz $acc00,240($key) |
744 | lwz $t0,0($key) | ||
745 | lwz $t1,4($key) | ||
746 | lwz $t2,8($key) | ||
747 | lwz $t3,12($key) | ||
748 | addi $Tbl1,$Tbl0,3 | 916 | addi $Tbl1,$Tbl0,3 |
917 | lwz $t0,0($key) | ||
749 | addi $Tbl2,$Tbl0,2 | 918 | addi $Tbl2,$Tbl0,2 |
919 | lwz $t1,4($key) | ||
750 | addi $Tbl3,$Tbl0,1 | 920 | addi $Tbl3,$Tbl0,1 |
921 | lwz $t2,8($key) | ||
751 | addi $acc00,$acc00,-1 | 922 | addi $acc00,$acc00,-1 |
923 | lwz $t3,12($key) | ||
752 | addi $key,$key,16 | 924 | addi $key,$key,16 |
753 | xor $s0,$s0,$t0 | 925 | xor $s0,$s0,$t0 |
754 | xor $s1,$s1,$t1 | 926 | xor $s1,$s1,$t1 |
@@ -762,44 +934,44 @@ Ldec_loop: | |||
762 | rlwinm $acc02,$s2,`32-24+3`,21,28 | 934 | rlwinm $acc02,$s2,`32-24+3`,21,28 |
763 | rlwinm $acc03,$s3,`32-24+3`,21,28 | 935 | rlwinm $acc03,$s3,`32-24+3`,21,28 |
764 | lwz $t0,0($key) | 936 | lwz $t0,0($key) |
765 | lwz $t1,4($key) | ||
766 | rlwinm $acc04,$s3,`32-16+3`,21,28 | 937 | rlwinm $acc04,$s3,`32-16+3`,21,28 |
938 | lwz $t1,4($key) | ||
767 | rlwinm $acc05,$s0,`32-16+3`,21,28 | 939 | rlwinm $acc05,$s0,`32-16+3`,21,28 |
768 | lwz $t2,8($key) | 940 | lwz $t2,8($key) |
769 | lwz $t3,12($key) | ||
770 | rlwinm $acc06,$s1,`32-16+3`,21,28 | 941 | rlwinm $acc06,$s1,`32-16+3`,21,28 |
942 | lwz $t3,12($key) | ||
771 | rlwinm $acc07,$s2,`32-16+3`,21,28 | 943 | rlwinm $acc07,$s2,`32-16+3`,21,28 |
772 | lwzx $acc00,$Tbl0,$acc00 | 944 | lwzx $acc00,$Tbl0,$acc00 |
773 | lwzx $acc01,$Tbl0,$acc01 | ||
774 | rlwinm $acc08,$s2,`32-8+3`,21,28 | 945 | rlwinm $acc08,$s2,`32-8+3`,21,28 |
946 | lwzx $acc01,$Tbl0,$acc01 | ||
775 | rlwinm $acc09,$s3,`32-8+3`,21,28 | 947 | rlwinm $acc09,$s3,`32-8+3`,21,28 |
776 | lwzx $acc02,$Tbl0,$acc02 | 948 | lwzx $acc02,$Tbl0,$acc02 |
777 | lwzx $acc03,$Tbl0,$acc03 | ||
778 | rlwinm $acc10,$s0,`32-8+3`,21,28 | 949 | rlwinm $acc10,$s0,`32-8+3`,21,28 |
950 | lwzx $acc03,$Tbl0,$acc03 | ||
779 | rlwinm $acc11,$s1,`32-8+3`,21,28 | 951 | rlwinm $acc11,$s1,`32-8+3`,21,28 |
780 | lwzx $acc04,$Tbl1,$acc04 | 952 | lwzx $acc04,$Tbl1,$acc04 |
781 | lwzx $acc05,$Tbl1,$acc05 | ||
782 | rlwinm $acc12,$s1,`0+3`,21,28 | 953 | rlwinm $acc12,$s1,`0+3`,21,28 |
954 | lwzx $acc05,$Tbl1,$acc05 | ||
783 | rlwinm $acc13,$s2,`0+3`,21,28 | 955 | rlwinm $acc13,$s2,`0+3`,21,28 |
784 | lwzx $acc06,$Tbl1,$acc06 | 956 | lwzx $acc06,$Tbl1,$acc06 |
785 | lwzx $acc07,$Tbl1,$acc07 | ||
786 | rlwinm $acc14,$s3,`0+3`,21,28 | 957 | rlwinm $acc14,$s3,`0+3`,21,28 |
958 | lwzx $acc07,$Tbl1,$acc07 | ||
787 | rlwinm $acc15,$s0,`0+3`,21,28 | 959 | rlwinm $acc15,$s0,`0+3`,21,28 |
788 | lwzx $acc08,$Tbl2,$acc08 | 960 | lwzx $acc08,$Tbl2,$acc08 |
789 | lwzx $acc09,$Tbl2,$acc09 | ||
790 | xor $t0,$t0,$acc00 | 961 | xor $t0,$t0,$acc00 |
962 | lwzx $acc09,$Tbl2,$acc09 | ||
791 | xor $t1,$t1,$acc01 | 963 | xor $t1,$t1,$acc01 |
792 | lwzx $acc10,$Tbl2,$acc10 | 964 | lwzx $acc10,$Tbl2,$acc10 |
793 | lwzx $acc11,$Tbl2,$acc11 | ||
794 | xor $t2,$t2,$acc02 | 965 | xor $t2,$t2,$acc02 |
966 | lwzx $acc11,$Tbl2,$acc11 | ||
795 | xor $t3,$t3,$acc03 | 967 | xor $t3,$t3,$acc03 |
796 | lwzx $acc12,$Tbl3,$acc12 | 968 | lwzx $acc12,$Tbl3,$acc12 |
797 | lwzx $acc13,$Tbl3,$acc13 | ||
798 | xor $t0,$t0,$acc04 | 969 | xor $t0,$t0,$acc04 |
970 | lwzx $acc13,$Tbl3,$acc13 | ||
799 | xor $t1,$t1,$acc05 | 971 | xor $t1,$t1,$acc05 |
800 | lwzx $acc14,$Tbl3,$acc14 | 972 | lwzx $acc14,$Tbl3,$acc14 |
801 | lwzx $acc15,$Tbl3,$acc15 | ||
802 | xor $t2,$t2,$acc06 | 973 | xor $t2,$t2,$acc06 |
974 | lwzx $acc15,$Tbl3,$acc15 | ||
803 | xor $t3,$t3,$acc07 | 975 | xor $t3,$t3,$acc07 |
804 | xor $t0,$t0,$acc08 | 976 | xor $t0,$t0,$acc08 |
805 | xor $t1,$t1,$acc09 | 977 | xor $t1,$t1,$acc09 |
@@ -815,56 +987,56 @@ Ldec_loop: | |||
815 | addi $Tbl2,$Tbl0,2048 | 987 | addi $Tbl2,$Tbl0,2048 |
816 | nop | 988 | nop |
817 | lwz $t0,0($key) | 989 | lwz $t0,0($key) |
818 | lwz $t1,4($key) | ||
819 | rlwinm $acc00,$s0,`32-24`,24,31 | 990 | rlwinm $acc00,$s0,`32-24`,24,31 |
991 | lwz $t1,4($key) | ||
820 | rlwinm $acc01,$s1,`32-24`,24,31 | 992 | rlwinm $acc01,$s1,`32-24`,24,31 |
821 | lwz $t2,8($key) | 993 | lwz $t2,8($key) |
822 | lwz $t3,12($key) | ||
823 | rlwinm $acc02,$s2,`32-24`,24,31 | 994 | rlwinm $acc02,$s2,`32-24`,24,31 |
995 | lwz $t3,12($key) | ||
824 | rlwinm $acc03,$s3,`32-24`,24,31 | 996 | rlwinm $acc03,$s3,`32-24`,24,31 |
825 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 | 997 | lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 |
826 | lwz $acc09,`2048+32`($Tbl0) | ||
827 | rlwinm $acc04,$s3,`32-16`,24,31 | 998 | rlwinm $acc04,$s3,`32-16`,24,31 |
999 | lwz $acc09,`2048+32`($Tbl0) | ||
828 | rlwinm $acc05,$s0,`32-16`,24,31 | 1000 | rlwinm $acc05,$s0,`32-16`,24,31 |
829 | lwz $acc10,`2048+64`($Tbl0) | 1001 | lwz $acc10,`2048+64`($Tbl0) |
830 | lwz $acc11,`2048+96`($Tbl0) | ||
831 | lbzx $acc00,$Tbl2,$acc00 | 1002 | lbzx $acc00,$Tbl2,$acc00 |
1003 | lwz $acc11,`2048+96`($Tbl0) | ||
832 | lbzx $acc01,$Tbl2,$acc01 | 1004 | lbzx $acc01,$Tbl2,$acc01 |
833 | lwz $acc12,`2048+128`($Tbl0) | 1005 | lwz $acc12,`2048+128`($Tbl0) |
834 | lwz $acc13,`2048+160`($Tbl0) | ||
835 | rlwinm $acc06,$s1,`32-16`,24,31 | 1006 | rlwinm $acc06,$s1,`32-16`,24,31 |
1007 | lwz $acc13,`2048+160`($Tbl0) | ||
836 | rlwinm $acc07,$s2,`32-16`,24,31 | 1008 | rlwinm $acc07,$s2,`32-16`,24,31 |
837 | lwz $acc14,`2048+192`($Tbl0) | 1009 | lwz $acc14,`2048+192`($Tbl0) |
838 | lwz $acc15,`2048+224`($Tbl0) | ||
839 | rlwinm $acc08,$s2,`32-8`,24,31 | 1010 | rlwinm $acc08,$s2,`32-8`,24,31 |
1011 | lwz $acc15,`2048+224`($Tbl0) | ||
840 | rlwinm $acc09,$s3,`32-8`,24,31 | 1012 | rlwinm $acc09,$s3,`32-8`,24,31 |
841 | lbzx $acc02,$Tbl2,$acc02 | 1013 | lbzx $acc02,$Tbl2,$acc02 |
842 | lbzx $acc03,$Tbl2,$acc03 | ||
843 | rlwinm $acc10,$s0,`32-8`,24,31 | 1014 | rlwinm $acc10,$s0,`32-8`,24,31 |
1015 | lbzx $acc03,$Tbl2,$acc03 | ||
844 | rlwinm $acc11,$s1,`32-8`,24,31 | 1016 | rlwinm $acc11,$s1,`32-8`,24,31 |
845 | lbzx $acc04,$Tbl2,$acc04 | 1017 | lbzx $acc04,$Tbl2,$acc04 |
846 | lbzx $acc05,$Tbl2,$acc05 | ||
847 | rlwinm $acc12,$s1,`0`,24,31 | 1018 | rlwinm $acc12,$s1,`0`,24,31 |
1019 | lbzx $acc05,$Tbl2,$acc05 | ||
848 | rlwinm $acc13,$s2,`0`,24,31 | 1020 | rlwinm $acc13,$s2,`0`,24,31 |
849 | lbzx $acc06,$Tbl2,$acc06 | 1021 | lbzx $acc06,$Tbl2,$acc06 |
850 | lbzx $acc07,$Tbl2,$acc07 | ||
851 | rlwinm $acc14,$s3,`0`,24,31 | 1022 | rlwinm $acc14,$s3,`0`,24,31 |
1023 | lbzx $acc07,$Tbl2,$acc07 | ||
852 | rlwinm $acc15,$s0,`0`,24,31 | 1024 | rlwinm $acc15,$s0,`0`,24,31 |
853 | lbzx $acc08,$Tbl2,$acc08 | 1025 | lbzx $acc08,$Tbl2,$acc08 |
854 | lbzx $acc09,$Tbl2,$acc09 | ||
855 | rlwinm $s0,$acc00,24,0,7 | 1026 | rlwinm $s0,$acc00,24,0,7 |
1027 | lbzx $acc09,$Tbl2,$acc09 | ||
856 | rlwinm $s1,$acc01,24,0,7 | 1028 | rlwinm $s1,$acc01,24,0,7 |
857 | lbzx $acc10,$Tbl2,$acc10 | 1029 | lbzx $acc10,$Tbl2,$acc10 |
858 | lbzx $acc11,$Tbl2,$acc11 | ||
859 | rlwinm $s2,$acc02,24,0,7 | 1030 | rlwinm $s2,$acc02,24,0,7 |
1031 | lbzx $acc11,$Tbl2,$acc11 | ||
860 | rlwinm $s3,$acc03,24,0,7 | 1032 | rlwinm $s3,$acc03,24,0,7 |
861 | lbzx $acc12,$Tbl2,$acc12 | 1033 | lbzx $acc12,$Tbl2,$acc12 |
862 | lbzx $acc13,$Tbl2,$acc13 | ||
863 | rlwimi $s0,$acc04,16,8,15 | 1034 | rlwimi $s0,$acc04,16,8,15 |
1035 | lbzx $acc13,$Tbl2,$acc13 | ||
864 | rlwimi $s1,$acc05,16,8,15 | 1036 | rlwimi $s1,$acc05,16,8,15 |
865 | lbzx $acc14,$Tbl2,$acc14 | 1037 | lbzx $acc14,$Tbl2,$acc14 |
866 | lbzx $acc15,$Tbl2,$acc15 | ||
867 | rlwimi $s2,$acc06,16,8,15 | 1038 | rlwimi $s2,$acc06,16,8,15 |
1039 | lbzx $acc15,$Tbl2,$acc15 | ||
868 | rlwimi $s3,$acc07,16,8,15 | 1040 | rlwimi $s3,$acc07,16,8,15 |
869 | rlwimi $s0,$acc08,8,16,23 | 1041 | rlwimi $s0,$acc08,8,16,23 |
870 | rlwimi $s1,$acc09,8,16,23 | 1042 | rlwimi $s1,$acc09,8,16,23 |
@@ -879,20 +1051,22 @@ Ldec_loop: | |||
879 | xor $s2,$s2,$t2 | 1051 | xor $s2,$s2,$t2 |
880 | xor $s3,$s3,$t3 | 1052 | xor $s3,$s3,$t3 |
881 | blr | 1053 | blr |
1054 | .long 0 | ||
1055 | .byte 0,12,0x14,0,0,0,0,0 | ||
882 | 1056 | ||
883 | .align 4 | 1057 | .align 4 |
884 | Lppc_AES_decrypt_compact: | 1058 | Lppc_AES_decrypt_compact: |
885 | lwz $acc00,240($key) | 1059 | lwz $acc00,240($key) |
886 | lwz $t0,0($key) | ||
887 | lwz $t1,4($key) | ||
888 | lwz $t2,8($key) | ||
889 | lwz $t3,12($key) | ||
890 | addi $Tbl1,$Tbl0,2048 | 1060 | addi $Tbl1,$Tbl0,2048 |
1061 | lwz $t0,0($key) | ||
891 | lis $mask80,0x8080 | 1062 | lis $mask80,0x8080 |
1063 | lwz $t1,4($key) | ||
892 | lis $mask1b,0x1b1b | 1064 | lis $mask1b,0x1b1b |
893 | addi $key,$key,16 | 1065 | lwz $t2,8($key) |
894 | ori $mask80,$mask80,0x8080 | 1066 | ori $mask80,$mask80,0x8080 |
1067 | lwz $t3,12($key) | ||
895 | ori $mask1b,$mask1b,0x1b1b | 1068 | ori $mask1b,$mask1b,0x1b1b |
1069 | addi $key,$key,16 | ||
896 | ___ | 1070 | ___ |
897 | $code.=<<___ if ($SIZE_T==8); | 1071 | $code.=<<___ if ($SIZE_T==8); |
898 | insrdi $mask80,$mask80,32,0 | 1072 | insrdi $mask80,$mask80,32,0 |
@@ -904,10 +1078,10 @@ $code.=<<___; | |||
904 | Ldec_compact_loop: | 1078 | Ldec_compact_loop: |
905 | xor $s0,$s0,$t0 | 1079 | xor $s0,$s0,$t0 |
906 | xor $s1,$s1,$t1 | 1080 | xor $s1,$s1,$t1 |
907 | xor $s2,$s2,$t2 | ||
908 | xor $s3,$s3,$t3 | ||
909 | rlwinm $acc00,$s0,`32-24`,24,31 | 1081 | rlwinm $acc00,$s0,`32-24`,24,31 |
1082 | xor $s2,$s2,$t2 | ||
910 | rlwinm $acc01,$s1,`32-24`,24,31 | 1083 | rlwinm $acc01,$s1,`32-24`,24,31 |
1084 | xor $s3,$s3,$t3 | ||
911 | rlwinm $acc02,$s2,`32-24`,24,31 | 1085 | rlwinm $acc02,$s2,`32-24`,24,31 |
912 | rlwinm $acc03,$s3,`32-24`,24,31 | 1086 | rlwinm $acc03,$s3,`32-24`,24,31 |
913 | rlwinm $acc04,$s3,`32-16`,24,31 | 1087 | rlwinm $acc04,$s3,`32-16`,24,31 |
@@ -915,48 +1089,48 @@ Ldec_compact_loop: | |||
915 | rlwinm $acc06,$s1,`32-16`,24,31 | 1089 | rlwinm $acc06,$s1,`32-16`,24,31 |
916 | rlwinm $acc07,$s2,`32-16`,24,31 | 1090 | rlwinm $acc07,$s2,`32-16`,24,31 |
917 | lbzx $acc00,$Tbl1,$acc00 | 1091 | lbzx $acc00,$Tbl1,$acc00 |
918 | lbzx $acc01,$Tbl1,$acc01 | ||
919 | rlwinm $acc08,$s2,`32-8`,24,31 | 1092 | rlwinm $acc08,$s2,`32-8`,24,31 |
1093 | lbzx $acc01,$Tbl1,$acc01 | ||
920 | rlwinm $acc09,$s3,`32-8`,24,31 | 1094 | rlwinm $acc09,$s3,`32-8`,24,31 |
921 | lbzx $acc02,$Tbl1,$acc02 | 1095 | lbzx $acc02,$Tbl1,$acc02 |
922 | lbzx $acc03,$Tbl1,$acc03 | ||
923 | rlwinm $acc10,$s0,`32-8`,24,31 | 1096 | rlwinm $acc10,$s0,`32-8`,24,31 |
1097 | lbzx $acc03,$Tbl1,$acc03 | ||
924 | rlwinm $acc11,$s1,`32-8`,24,31 | 1098 | rlwinm $acc11,$s1,`32-8`,24,31 |
925 | lbzx $acc04,$Tbl1,$acc04 | 1099 | lbzx $acc04,$Tbl1,$acc04 |
926 | lbzx $acc05,$Tbl1,$acc05 | ||
927 | rlwinm $acc12,$s1,`0`,24,31 | 1100 | rlwinm $acc12,$s1,`0`,24,31 |
1101 | lbzx $acc05,$Tbl1,$acc05 | ||
928 | rlwinm $acc13,$s2,`0`,24,31 | 1102 | rlwinm $acc13,$s2,`0`,24,31 |
929 | lbzx $acc06,$Tbl1,$acc06 | 1103 | lbzx $acc06,$Tbl1,$acc06 |
930 | lbzx $acc07,$Tbl1,$acc07 | ||
931 | rlwinm $acc14,$s3,`0`,24,31 | 1104 | rlwinm $acc14,$s3,`0`,24,31 |
1105 | lbzx $acc07,$Tbl1,$acc07 | ||
932 | rlwinm $acc15,$s0,`0`,24,31 | 1106 | rlwinm $acc15,$s0,`0`,24,31 |
933 | lbzx $acc08,$Tbl1,$acc08 | 1107 | lbzx $acc08,$Tbl1,$acc08 |
934 | lbzx $acc09,$Tbl1,$acc09 | ||
935 | rlwinm $s0,$acc00,24,0,7 | 1108 | rlwinm $s0,$acc00,24,0,7 |
1109 | lbzx $acc09,$Tbl1,$acc09 | ||
936 | rlwinm $s1,$acc01,24,0,7 | 1110 | rlwinm $s1,$acc01,24,0,7 |
937 | lbzx $acc10,$Tbl1,$acc10 | 1111 | lbzx $acc10,$Tbl1,$acc10 |
938 | lbzx $acc11,$Tbl1,$acc11 | ||
939 | rlwinm $s2,$acc02,24,0,7 | 1112 | rlwinm $s2,$acc02,24,0,7 |
1113 | lbzx $acc11,$Tbl1,$acc11 | ||
940 | rlwinm $s3,$acc03,24,0,7 | 1114 | rlwinm $s3,$acc03,24,0,7 |
941 | lbzx $acc12,$Tbl1,$acc12 | 1115 | lbzx $acc12,$Tbl1,$acc12 |
942 | lbzx $acc13,$Tbl1,$acc13 | ||
943 | rlwimi $s0,$acc04,16,8,15 | 1116 | rlwimi $s0,$acc04,16,8,15 |
1117 | lbzx $acc13,$Tbl1,$acc13 | ||
944 | rlwimi $s1,$acc05,16,8,15 | 1118 | rlwimi $s1,$acc05,16,8,15 |
945 | lbzx $acc14,$Tbl1,$acc14 | 1119 | lbzx $acc14,$Tbl1,$acc14 |
946 | lbzx $acc15,$Tbl1,$acc15 | ||
947 | rlwimi $s2,$acc06,16,8,15 | 1120 | rlwimi $s2,$acc06,16,8,15 |
1121 | lbzx $acc15,$Tbl1,$acc15 | ||
948 | rlwimi $s3,$acc07,16,8,15 | 1122 | rlwimi $s3,$acc07,16,8,15 |
949 | rlwimi $s0,$acc08,8,16,23 | 1123 | rlwimi $s0,$acc08,8,16,23 |
950 | rlwimi $s1,$acc09,8,16,23 | 1124 | rlwimi $s1,$acc09,8,16,23 |
951 | rlwimi $s2,$acc10,8,16,23 | 1125 | rlwimi $s2,$acc10,8,16,23 |
952 | rlwimi $s3,$acc11,8,16,23 | 1126 | rlwimi $s3,$acc11,8,16,23 |
953 | lwz $t0,0($key) | 1127 | lwz $t0,0($key) |
954 | lwz $t1,4($key) | ||
955 | or $s0,$s0,$acc12 | 1128 | or $s0,$s0,$acc12 |
1129 | lwz $t1,4($key) | ||
956 | or $s1,$s1,$acc13 | 1130 | or $s1,$s1,$acc13 |
957 | lwz $t2,8($key) | 1131 | lwz $t2,8($key) |
958 | lwz $t3,12($key) | ||
959 | or $s2,$s2,$acc14 | 1132 | or $s2,$s2,$acc14 |
1133 | lwz $t3,12($key) | ||
960 | or $s3,$s3,$acc15 | 1134 | or $s3,$s3,$acc15 |
961 | 1135 | ||
962 | addi $key,$key,16 | 1136 | addi $key,$key,16 |
@@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4); | |||
1030 | and $acc02,$s2,$mask80 | 1204 | and $acc02,$s2,$mask80 |
1031 | and $acc03,$s3,$mask80 | 1205 | and $acc03,$s3,$mask80 |
1032 | srwi $acc04,$acc00,7 # r1>>7 | 1206 | srwi $acc04,$acc00,7 # r1>>7 |
1033 | srwi $acc05,$acc01,7 | ||
1034 | srwi $acc06,$acc02,7 | ||
1035 | srwi $acc07,$acc03,7 | ||
1036 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f | 1207 | andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f |
1208 | srwi $acc05,$acc01,7 | ||
1037 | andc $acc09,$s1,$mask80 | 1209 | andc $acc09,$s1,$mask80 |
1210 | srwi $acc06,$acc02,7 | ||
1038 | andc $acc10,$s2,$mask80 | 1211 | andc $acc10,$s2,$mask80 |
1212 | srwi $acc07,$acc03,7 | ||
1039 | andc $acc11,$s3,$mask80 | 1213 | andc $acc11,$s3,$mask80 |
1040 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) | 1214 | sub $acc00,$acc00,$acc04 # r1-(r1>>7) |
1041 | sub $acc01,$acc01,$acc05 | 1215 | sub $acc01,$acc01,$acc05 |
@@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4); | |||
1059 | and $acc06,$acc02,$mask80 | 1233 | and $acc06,$acc02,$mask80 |
1060 | and $acc07,$acc03,$mask80 | 1234 | and $acc07,$acc03,$mask80 |
1061 | srwi $acc08,$acc04,7 # r1>>7 | 1235 | srwi $acc08,$acc04,7 # r1>>7 |
1062 | srwi $acc09,$acc05,7 | ||
1063 | srwi $acc10,$acc06,7 | ||
1064 | srwi $acc11,$acc07,7 | ||
1065 | andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f | 1236 | andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f |
1237 | srwi $acc09,$acc05,7 | ||
1066 | andc $acc13,$acc01,$mask80 | 1238 | andc $acc13,$acc01,$mask80 |
1239 | srwi $acc10,$acc06,7 | ||
1067 | andc $acc14,$acc02,$mask80 | 1240 | andc $acc14,$acc02,$mask80 |
1241 | srwi $acc11,$acc07,7 | ||
1068 | andc $acc15,$acc03,$mask80 | 1242 | andc $acc15,$acc03,$mask80 |
1069 | sub $acc04,$acc04,$acc08 # r1-(r1>>7) | 1243 | sub $acc04,$acc04,$acc08 # r1-(r1>>7) |
1070 | sub $acc05,$acc05,$acc09 | 1244 | sub $acc05,$acc05,$acc09 |
@@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4); | |||
1085 | 1259 | ||
1086 | and $acc08,$acc04,$mask80 # r1=r4&0x80808080 | 1260 | and $acc08,$acc04,$mask80 # r1=r4&0x80808080 |
1087 | and $acc09,$acc05,$mask80 | 1261 | and $acc09,$acc05,$mask80 |
1088 | and $acc10,$acc06,$mask80 | ||
1089 | and $acc11,$acc07,$mask80 | ||
1090 | srwi $acc12,$acc08,7 # r1>>7 | 1262 | srwi $acc12,$acc08,7 # r1>>7 |
1263 | and $acc10,$acc06,$mask80 | ||
1091 | srwi $acc13,$acc09,7 | 1264 | srwi $acc13,$acc09,7 |
1265 | and $acc11,$acc07,$mask80 | ||
1092 | srwi $acc14,$acc10,7 | 1266 | srwi $acc14,$acc10,7 |
1093 | srwi $acc15,$acc11,7 | ||
1094 | sub $acc08,$acc08,$acc12 # r1-(r1>>7) | 1267 | sub $acc08,$acc08,$acc12 # r1-(r1>>7) |
1268 | srwi $acc15,$acc11,7 | ||
1095 | sub $acc09,$acc09,$acc13 | 1269 | sub $acc09,$acc09,$acc13 |
1096 | sub $acc10,$acc10,$acc14 | 1270 | sub $acc10,$acc10,$acc14 |
1097 | sub $acc11,$acc11,$acc15 | 1271 | sub $acc11,$acc11,$acc15 |
@@ -1124,10 +1298,10 @@ ___ | |||
1124 | $code.=<<___; | 1298 | $code.=<<___; |
1125 | rotrwi $s0,$s0,8 # = ROTATE(r0,8) | 1299 | rotrwi $s0,$s0,8 # = ROTATE(r0,8) |
1126 | rotrwi $s1,$s1,8 | 1300 | rotrwi $s1,$s1,8 |
1127 | rotrwi $s2,$s2,8 | ||
1128 | rotrwi $s3,$s3,8 | ||
1129 | xor $s0,$s0,$acc00 # ^= r2^r0 | 1301 | xor $s0,$s0,$acc00 # ^= r2^r0 |
1302 | rotrwi $s2,$s2,8 | ||
1130 | xor $s1,$s1,$acc01 | 1303 | xor $s1,$s1,$acc01 |
1304 | rotrwi $s3,$s3,8 | ||
1131 | xor $s2,$s2,$acc02 | 1305 | xor $s2,$s2,$acc02 |
1132 | xor $s3,$s3,$acc03 | 1306 | xor $s3,$s3,$acc03 |
1133 | xor $acc00,$acc00,$acc08 | 1307 | xor $acc00,$acc00,$acc08 |
@@ -1135,32 +1309,32 @@ $code.=<<___; | |||
1135 | xor $acc02,$acc02,$acc10 | 1309 | xor $acc02,$acc02,$acc10 |
1136 | xor $acc03,$acc03,$acc11 | 1310 | xor $acc03,$acc03,$acc11 |
1137 | xor $s0,$s0,$acc04 # ^= r4^r0 | 1311 | xor $s0,$s0,$acc04 # ^= r4^r0 |
1138 | xor $s1,$s1,$acc05 | ||
1139 | xor $s2,$s2,$acc06 | ||
1140 | xor $s3,$s3,$acc07 | ||
1141 | rotrwi $acc00,$acc00,24 | 1312 | rotrwi $acc00,$acc00,24 |
1313 | xor $s1,$s1,$acc05 | ||
1142 | rotrwi $acc01,$acc01,24 | 1314 | rotrwi $acc01,$acc01,24 |
1315 | xor $s2,$s2,$acc06 | ||
1143 | rotrwi $acc02,$acc02,24 | 1316 | rotrwi $acc02,$acc02,24 |
1317 | xor $s3,$s3,$acc07 | ||
1144 | rotrwi $acc03,$acc03,24 | 1318 | rotrwi $acc03,$acc03,24 |
1145 | xor $acc04,$acc04,$acc08 | 1319 | xor $acc04,$acc04,$acc08 |
1146 | xor $acc05,$acc05,$acc09 | 1320 | xor $acc05,$acc05,$acc09 |
1147 | xor $acc06,$acc06,$acc10 | 1321 | xor $acc06,$acc06,$acc10 |
1148 | xor $acc07,$acc07,$acc11 | 1322 | xor $acc07,$acc07,$acc11 |
1149 | xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] | 1323 | xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] |
1150 | xor $s1,$s1,$acc09 | ||
1151 | xor $s2,$s2,$acc10 | ||
1152 | xor $s3,$s3,$acc11 | ||
1153 | rotrwi $acc04,$acc04,16 | 1324 | rotrwi $acc04,$acc04,16 |
1325 | xor $s1,$s1,$acc09 | ||
1154 | rotrwi $acc05,$acc05,16 | 1326 | rotrwi $acc05,$acc05,16 |
1327 | xor $s2,$s2,$acc10 | ||
1155 | rotrwi $acc06,$acc06,16 | 1328 | rotrwi $acc06,$acc06,16 |
1329 | xor $s3,$s3,$acc11 | ||
1156 | rotrwi $acc07,$acc07,16 | 1330 | rotrwi $acc07,$acc07,16 |
1157 | xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) | 1331 | xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) |
1158 | xor $s1,$s1,$acc01 | ||
1159 | xor $s2,$s2,$acc02 | ||
1160 | xor $s3,$s3,$acc03 | ||
1161 | rotrwi $acc08,$acc08,8 | 1332 | rotrwi $acc08,$acc08,8 |
1333 | xor $s1,$s1,$acc01 | ||
1162 | rotrwi $acc09,$acc09,8 | 1334 | rotrwi $acc09,$acc09,8 |
1335 | xor $s2,$s2,$acc02 | ||
1163 | rotrwi $acc10,$acc10,8 | 1336 | rotrwi $acc10,$acc10,8 |
1337 | xor $s3,$s3,$acc03 | ||
1164 | rotrwi $acc11,$acc11,8 | 1338 | rotrwi $acc11,$acc11,8 |
1165 | xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) | 1339 | xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) |
1166 | xor $s1,$s1,$acc05 | 1340 | xor $s1,$s1,$acc05 |
@@ -1179,7 +1353,9 @@ Ldec_compact_done: | |||
1179 | xor $s2,$s2,$t2 | 1353 | xor $s2,$s2,$t2 |
1180 | xor $s3,$s3,$t3 | 1354 | xor $s3,$s3,$t3 |
1181 | blr | 1355 | blr |
1182 | .long 0 | 1356 | .long 0 |
1357 | .byte 0,12,0x14,0,0,0,0,0 | ||
1358 | |||
1183 | .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" | 1359 | .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" |
1184 | .align 7 | 1360 | .align 7 |
1185 | ___ | 1361 | ___ |
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl index 7e01889298..445a1e6762 100644 --- a/src/lib/libcrypto/aes/asm/aes-s390x.pl +++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl | |||
@@ -44,12 +44,57 @@ | |||
44 | # Unlike previous version hardware support detection takes place only | 44 | # Unlike previous version hardware support detection takes place only |
45 | # at the moment of key schedule setup, which is denoted in key->rounds. | 45 | # at the moment of key schedule setup, which is denoted in key->rounds. |
46 | # This is done, because deferred key setup can't be made MT-safe, not | 46 | # This is done, because deferred key setup can't be made MT-safe, not |
47 | # for key lengthes longer than 128 bits. | 47 | # for keys longer than 128 bits. |
48 | # | 48 | # |
49 | # Add AES_cbc_encrypt, which gives incredible performance improvement, | 49 | # Add AES_cbc_encrypt, which gives incredible performance improvement, |
50 | # it was measured to be ~6.6x. It's less than previously mentioned 8x, | 50 | # it was measured to be ~6.6x. It's less than previously mentioned 8x, |
51 | # because software implementation was optimized. | 51 | # because software implementation was optimized. |
52 | 52 | ||
53 | # May 2010. | ||
54 | # | ||
55 | # Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x | ||
56 | # performance improvement over "generic" counter mode routine relying | ||
57 | # on single-block, also hardware-assisted, AES_encrypt. "Up to" refers | ||
58 | # to the fact that exact throughput value depends on current stack | ||
59 | # frame alignment within 4KB page. In worst case you get ~75% of the | ||
60 | # maximum, but *on average* it would be as much as ~98%. Meaning that | ||
61 | # worst case is unlike, it's like hitting ravine on plateau. | ||
62 | |||
63 | # November 2010. | ||
64 | # | ||
65 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
66 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
67 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
68 | # application context. The feature is not specific to any particular | ||
69 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
70 | # remains z/Architecture specific. On z990 it was measured to perform | ||
71 | # 2x better than code generated by gcc 4.3. | ||
72 | |||
73 | # December 2010. | ||
74 | # | ||
75 | # Add support for z196 "cipher message with counter" instruction. | ||
76 | # Note however that it's disengaged, because it was measured to | ||
77 | # perform ~12% worse than vanilla km-based code... | ||
78 | |||
79 | # February 2011. | ||
80 | # | ||
81 | # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes | ||
82 | # instructions, which deliver ~70% improvement at 8KB block size over | ||
83 | # vanilla km-based code, 37% - at most like 512-bytes block size. | ||
84 | |||
85 | $flavour = shift; | ||
86 | |||
87 | if ($flavour =~ /3[12]/) { | ||
88 | $SIZE_T=4; | ||
89 | $g=""; | ||
90 | } else { | ||
91 | $SIZE_T=8; | ||
92 | $g="g"; | ||
93 | } | ||
94 | |||
95 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
96 | open STDOUT,">$output"; | ||
97 | |||
53 | $softonly=0; # allow hardware support | 98 | $softonly=0; # allow hardware support |
54 | 99 | ||
55 | $t0="%r0"; $mask="%r0"; | 100 | $t0="%r0"; $mask="%r0"; |
@@ -69,6 +114,8 @@ $rounds="%r13"; | |||
69 | $ra="%r14"; | 114 | $ra="%r14"; |
70 | $sp="%r15"; | 115 | $sp="%r15"; |
71 | 116 | ||
117 | $stdframe=16*$SIZE_T+4*8; | ||
118 | |||
72 | sub _data_word() | 119 | sub _data_word() |
73 | { my $i; | 120 | { my $i; |
74 | while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } | 121 | while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } |
@@ -210,7 +257,7 @@ $code.=<<___ if (!$softonly); | |||
210 | .Lesoft: | 257 | .Lesoft: |
211 | ___ | 258 | ___ |
212 | $code.=<<___; | 259 | $code.=<<___; |
213 | stmg %r3,$ra,24($sp) | 260 | stm${g} %r3,$ra,3*$SIZE_T($sp) |
214 | 261 | ||
215 | llgf $s0,0($inp) | 262 | llgf $s0,0($inp) |
216 | llgf $s1,4($inp) | 263 | llgf $s1,4($inp) |
@@ -220,20 +267,20 @@ $code.=<<___; | |||
220 | larl $tbl,AES_Te | 267 | larl $tbl,AES_Te |
221 | bras $ra,_s390x_AES_encrypt | 268 | bras $ra,_s390x_AES_encrypt |
222 | 269 | ||
223 | lg $out,24($sp) | 270 | l${g} $out,3*$SIZE_T($sp) |
224 | st $s0,0($out) | 271 | st $s0,0($out) |
225 | st $s1,4($out) | 272 | st $s1,4($out) |
226 | st $s2,8($out) | 273 | st $s2,8($out) |
227 | st $s3,12($out) | 274 | st $s3,12($out) |
228 | 275 | ||
229 | lmg %r6,$ra,48($sp) | 276 | lm${g} %r6,$ra,6*$SIZE_T($sp) |
230 | br $ra | 277 | br $ra |
231 | .size AES_encrypt,.-AES_encrypt | 278 | .size AES_encrypt,.-AES_encrypt |
232 | 279 | ||
233 | .type _s390x_AES_encrypt,\@function | 280 | .type _s390x_AES_encrypt,\@function |
234 | .align 16 | 281 | .align 16 |
235 | _s390x_AES_encrypt: | 282 | _s390x_AES_encrypt: |
236 | stg $ra,152($sp) | 283 | st${g} $ra,15*$SIZE_T($sp) |
237 | x $s0,0($key) | 284 | x $s0,0($key) |
238 | x $s1,4($key) | 285 | x $s1,4($key) |
239 | x $s2,8($key) | 286 | x $s2,8($key) |
@@ -397,7 +444,7 @@ _s390x_AES_encrypt: | |||
397 | or $s2,$i3 | 444 | or $s2,$i3 |
398 | or $s3,$t3 | 445 | or $s3,$t3 |
399 | 446 | ||
400 | lg $ra,152($sp) | 447 | l${g} $ra,15*$SIZE_T($sp) |
401 | xr $s0,$t0 | 448 | xr $s0,$t0 |
402 | xr $s1,$t2 | 449 | xr $s1,$t2 |
403 | x $s2,24($key) | 450 | x $s2,24($key) |
@@ -536,7 +583,7 @@ $code.=<<___ if (!$softonly); | |||
536 | .Ldsoft: | 583 | .Ldsoft: |
537 | ___ | 584 | ___ |
538 | $code.=<<___; | 585 | $code.=<<___; |
539 | stmg %r3,$ra,24($sp) | 586 | stm${g} %r3,$ra,3*$SIZE_T($sp) |
540 | 587 | ||
541 | llgf $s0,0($inp) | 588 | llgf $s0,0($inp) |
542 | llgf $s1,4($inp) | 589 | llgf $s1,4($inp) |
@@ -546,20 +593,20 @@ $code.=<<___; | |||
546 | larl $tbl,AES_Td | 593 | larl $tbl,AES_Td |
547 | bras $ra,_s390x_AES_decrypt | 594 | bras $ra,_s390x_AES_decrypt |
548 | 595 | ||
549 | lg $out,24($sp) | 596 | l${g} $out,3*$SIZE_T($sp) |
550 | st $s0,0($out) | 597 | st $s0,0($out) |
551 | st $s1,4($out) | 598 | st $s1,4($out) |
552 | st $s2,8($out) | 599 | st $s2,8($out) |
553 | st $s3,12($out) | 600 | st $s3,12($out) |
554 | 601 | ||
555 | lmg %r6,$ra,48($sp) | 602 | lm${g} %r6,$ra,6*$SIZE_T($sp) |
556 | br $ra | 603 | br $ra |
557 | .size AES_decrypt,.-AES_decrypt | 604 | .size AES_decrypt,.-AES_decrypt |
558 | 605 | ||
559 | .type _s390x_AES_decrypt,\@function | 606 | .type _s390x_AES_decrypt,\@function |
560 | .align 16 | 607 | .align 16 |
561 | _s390x_AES_decrypt: | 608 | _s390x_AES_decrypt: |
562 | stg $ra,152($sp) | 609 | st${g} $ra,15*$SIZE_T($sp) |
563 | x $s0,0($key) | 610 | x $s0,0($key) |
564 | x $s1,4($key) | 611 | x $s1,4($key) |
565 | x $s2,8($key) | 612 | x $s2,8($key) |
@@ -703,7 +750,7 @@ _s390x_AES_decrypt: | |||
703 | nr $i1,$mask | 750 | nr $i1,$mask |
704 | nr $i2,$mask | 751 | nr $i2,$mask |
705 | 752 | ||
706 | lg $ra,152($sp) | 753 | l${g} $ra,15*$SIZE_T($sp) |
707 | or $s1,$t1 | 754 | or $s1,$t1 |
708 | l $t0,16($key) | 755 | l $t0,16($key) |
709 | l $t1,20($key) | 756 | l $t1,20($key) |
@@ -732,14 +779,15 @@ ___ | |||
732 | $code.=<<___; | 779 | $code.=<<___; |
733 | # void AES_set_encrypt_key(const unsigned char *in, int bits, | 780 | # void AES_set_encrypt_key(const unsigned char *in, int bits, |
734 | # AES_KEY *key) { | 781 | # AES_KEY *key) { |
735 | .globl AES_set_encrypt_key | 782 | .globl private_AES_set_encrypt_key |
736 | .type AES_set_encrypt_key,\@function | 783 | .type private_AES_set_encrypt_key,\@function |
737 | .align 16 | 784 | .align 16 |
738 | AES_set_encrypt_key: | 785 | private_AES_set_encrypt_key: |
786 | _s390x_AES_set_encrypt_key: | ||
739 | lghi $t0,0 | 787 | lghi $t0,0 |
740 | clgr $inp,$t0 | 788 | cl${g}r $inp,$t0 |
741 | je .Lminus1 | 789 | je .Lminus1 |
742 | clgr $key,$t0 | 790 | cl${g}r $key,$t0 |
743 | je .Lminus1 | 791 | je .Lminus1 |
744 | 792 | ||
745 | lghi $t0,128 | 793 | lghi $t0,128 |
@@ -789,7 +837,8 @@ $code.=<<___ if (!$softonly); | |||
789 | je 1f | 837 | je 1f |
790 | lg %r1,24($inp) | 838 | lg %r1,24($inp) |
791 | stg %r1,24($key) | 839 | stg %r1,24($key) |
792 | 1: st $bits,236($key) # save bits | 840 | 1: st $bits,236($key) # save bits [for debugging purposes] |
841 | lgr $t0,%r5 | ||
793 | st %r5,240($key) # save km code | 842 | st %r5,240($key) # save km code |
794 | lghi %r2,0 | 843 | lghi %r2,0 |
795 | br %r14 | 844 | br %r14 |
@@ -797,7 +846,7 @@ ___ | |||
797 | $code.=<<___; | 846 | $code.=<<___; |
798 | .align 16 | 847 | .align 16 |
799 | .Lekey_internal: | 848 | .Lekey_internal: |
800 | stmg %r6,%r13,48($sp) # all non-volatile regs | 849 | stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key |
801 | 850 | ||
802 | larl $tbl,AES_Te+2048 | 851 | larl $tbl,AES_Te+2048 |
803 | 852 | ||
@@ -857,8 +906,9 @@ $code.=<<___; | |||
857 | la $key,16($key) # key+=4 | 906 | la $key,16($key) # key+=4 |
858 | la $t3,4($t3) # i++ | 907 | la $t3,4($t3) # i++ |
859 | brct $rounds,.L128_loop | 908 | brct $rounds,.L128_loop |
909 | lghi $t0,10 | ||
860 | lghi %r2,0 | 910 | lghi %r2,0 |
861 | lmg %r6,%r13,48($sp) | 911 | lm${g} %r4,%r13,4*$SIZE_T($sp) |
862 | br $ra | 912 | br $ra |
863 | 913 | ||
864 | .align 16 | 914 | .align 16 |
@@ -905,8 +955,9 @@ $code.=<<___; | |||
905 | st $s2,32($key) | 955 | st $s2,32($key) |
906 | st $s3,36($key) | 956 | st $s3,36($key) |
907 | brct $rounds,.L192_continue | 957 | brct $rounds,.L192_continue |
958 | lghi $t0,12 | ||
908 | lghi %r2,0 | 959 | lghi %r2,0 |
909 | lmg %r6,%r13,48($sp) | 960 | lm${g} %r4,%r13,4*$SIZE_T($sp) |
910 | br $ra | 961 | br $ra |
911 | 962 | ||
912 | .align 16 | 963 | .align 16 |
@@ -967,8 +1018,9 @@ $code.=<<___; | |||
967 | st $s2,40($key) | 1018 | st $s2,40($key) |
968 | st $s3,44($key) | 1019 | st $s3,44($key) |
969 | brct $rounds,.L256_continue | 1020 | brct $rounds,.L256_continue |
1021 | lghi $t0,14 | ||
970 | lghi %r2,0 | 1022 | lghi %r2,0 |
971 | lmg %r6,%r13,48($sp) | 1023 | lm${g} %r4,%r13,4*$SIZE_T($sp) |
972 | br $ra | 1024 | br $ra |
973 | 1025 | ||
974 | .align 16 | 1026 | .align 16 |
@@ -1011,42 +1063,34 @@ $code.=<<___; | |||
1011 | .Lminus1: | 1063 | .Lminus1: |
1012 | lghi %r2,-1 | 1064 | lghi %r2,-1 |
1013 | br $ra | 1065 | br $ra |
1014 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | 1066 | .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key |
1015 | 1067 | ||
1016 | # void AES_set_decrypt_key(const unsigned char *in, int bits, | 1068 | # void AES_set_decrypt_key(const unsigned char *in, int bits, |
1017 | # AES_KEY *key) { | 1069 | # AES_KEY *key) { |
1018 | .globl AES_set_decrypt_key | 1070 | .globl private_AES_set_decrypt_key |
1019 | .type AES_set_decrypt_key,\@function | 1071 | .type private_AES_set_decrypt_key,\@function |
1020 | .align 16 | 1072 | .align 16 |
1021 | AES_set_decrypt_key: | 1073 | private_AES_set_decrypt_key: |
1022 | stg $key,32($sp) # I rely on AES_set_encrypt_key to | 1074 | #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to |
1023 | stg $ra,112($sp) # save non-volatile registers! | 1075 | st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key! |
1024 | bras $ra,AES_set_encrypt_key | 1076 | bras $ra,_s390x_AES_set_encrypt_key |
1025 | lg $key,32($sp) | 1077 | #l${g} $key,4*$SIZE_T($sp) |
1026 | lg $ra,112($sp) | 1078 | l${g} $ra,14*$SIZE_T($sp) |
1027 | ltgr %r2,%r2 | 1079 | ltgr %r2,%r2 |
1028 | bnzr $ra | 1080 | bnzr $ra |
1029 | ___ | 1081 | ___ |
1030 | $code.=<<___ if (!$softonly); | 1082 | $code.=<<___ if (!$softonly); |
1031 | l $t0,240($key) | 1083 | #l $t0,240($key) |
1032 | lhi $t1,16 | 1084 | lhi $t1,16 |
1033 | cr $t0,$t1 | 1085 | cr $t0,$t1 |
1034 | jl .Lgo | 1086 | jl .Lgo |
1035 | oill $t0,0x80 # set "decrypt" bit | 1087 | oill $t0,0x80 # set "decrypt" bit |
1036 | st $t0,240($key) | 1088 | st $t0,240($key) |
1037 | br $ra | 1089 | br $ra |
1038 | |||
1039 | .align 16 | ||
1040 | .Ldkey_internal: | ||
1041 | stg $key,32($sp) | ||
1042 | stg $ra,40($sp) | ||
1043 | bras $ra,.Lekey_internal | ||
1044 | lg $key,32($sp) | ||
1045 | lg $ra,40($sp) | ||
1046 | ___ | 1090 | ___ |
1047 | $code.=<<___; | 1091 | $code.=<<___; |
1048 | 1092 | .align 16 | |
1049 | .Lgo: llgf $rounds,240($key) | 1093 | .Lgo: lgr $rounds,$t0 #llgf $rounds,240($key) |
1050 | la $i1,0($key) | 1094 | la $i1,0($key) |
1051 | sllg $i2,$rounds,4 | 1095 | sllg $i2,$rounds,4 |
1052 | la $i2,0($i2,$key) | 1096 | la $i2,0($i2,$key) |
@@ -1123,13 +1167,14 @@ $code.=<<___; | |||
1123 | la $key,4($key) | 1167 | la $key,4($key) |
1124 | brct $rounds,.Lmix | 1168 | brct $rounds,.Lmix |
1125 | 1169 | ||
1126 | lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! | 1170 | lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! |
1127 | lghi %r2,0 | 1171 | lghi %r2,0 |
1128 | br $ra | 1172 | br $ra |
1129 | .size AES_set_decrypt_key,.-AES_set_decrypt_key | 1173 | .size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key |
1130 | ___ | 1174 | ___ |
1131 | 1175 | ||
1132 | #void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | 1176 | ######################################################################## |
1177 | # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | ||
1133 | # size_t length, const AES_KEY *key, | 1178 | # size_t length, const AES_KEY *key, |
1134 | # unsigned char *ivec, const int enc) | 1179 | # unsigned char *ivec, const int enc) |
1135 | { | 1180 | { |
@@ -1163,7 +1208,7 @@ $code.=<<___ if (!$softonly); | |||
1163 | l %r0,240($key) # load kmc code | 1208 | l %r0,240($key) # load kmc code |
1164 | lghi $key,15 # res=len%16, len-=res; | 1209 | lghi $key,15 # res=len%16, len-=res; |
1165 | ngr $key,$len | 1210 | ngr $key,$len |
1166 | slgr $len,$key | 1211 | sl${g}r $len,$key |
1167 | la %r1,16($sp) # parameter block - ivec || key | 1212 | la %r1,16($sp) # parameter block - ivec || key |
1168 | jz .Lkmc_truncated | 1213 | jz .Lkmc_truncated |
1169 | .long 0xb92f0042 # kmc %r4,%r2 | 1214 | .long 0xb92f0042 # kmc %r4,%r2 |
@@ -1181,34 +1226,34 @@ $code.=<<___ if (!$softonly); | |||
1181 | tmll %r0,0x80 | 1226 | tmll %r0,0x80 |
1182 | jnz .Lkmc_truncated_dec | 1227 | jnz .Lkmc_truncated_dec |
1183 | lghi %r1,0 | 1228 | lghi %r1,0 |
1184 | stg %r1,128($sp) | 1229 | stg %r1,16*$SIZE_T($sp) |
1185 | stg %r1,136($sp) | 1230 | stg %r1,16*$SIZE_T+8($sp) |
1186 | bras %r1,1f | 1231 | bras %r1,1f |
1187 | mvc 128(1,$sp),0($inp) | 1232 | mvc 16*$SIZE_T(1,$sp),0($inp) |
1188 | 1: ex $key,0(%r1) | 1233 | 1: ex $key,0(%r1) |
1189 | la %r1,16($sp) # restore parameter block | 1234 | la %r1,16($sp) # restore parameter block |
1190 | la $inp,128($sp) | 1235 | la $inp,16*$SIZE_T($sp) |
1191 | lghi $len,16 | 1236 | lghi $len,16 |
1192 | .long 0xb92f0042 # kmc %r4,%r2 | 1237 | .long 0xb92f0042 # kmc %r4,%r2 |
1193 | j .Lkmc_done | 1238 | j .Lkmc_done |
1194 | .align 16 | 1239 | .align 16 |
1195 | .Lkmc_truncated_dec: | 1240 | .Lkmc_truncated_dec: |
1196 | stg $out,64($sp) | 1241 | st${g} $out,4*$SIZE_T($sp) |
1197 | la $out,128($sp) | 1242 | la $out,16*$SIZE_T($sp) |
1198 | lghi $len,16 | 1243 | lghi $len,16 |
1199 | .long 0xb92f0042 # kmc %r4,%r2 | 1244 | .long 0xb92f0042 # kmc %r4,%r2 |
1200 | lg $out,64($sp) | 1245 | l${g} $out,4*$SIZE_T($sp) |
1201 | bras %r1,2f | 1246 | bras %r1,2f |
1202 | mvc 0(1,$out),128($sp) | 1247 | mvc 0(1,$out),16*$SIZE_T($sp) |
1203 | 2: ex $key,0(%r1) | 1248 | 2: ex $key,0(%r1) |
1204 | j .Lkmc_done | 1249 | j .Lkmc_done |
1205 | .align 16 | 1250 | .align 16 |
1206 | .Lcbc_software: | 1251 | .Lcbc_software: |
1207 | ___ | 1252 | ___ |
1208 | $code.=<<___; | 1253 | $code.=<<___; |
1209 | stmg $key,$ra,40($sp) | 1254 | stm${g} $key,$ra,5*$SIZE_T($sp) |
1210 | lhi %r0,0 | 1255 | lhi %r0,0 |
1211 | cl %r0,164($sp) | 1256 | cl %r0,`$stdframe+$SIZE_T-4`($sp) |
1212 | je .Lcbc_decrypt | 1257 | je .Lcbc_decrypt |
1213 | 1258 | ||
1214 | larl $tbl,AES_Te | 1259 | larl $tbl,AES_Te |
@@ -1219,10 +1264,10 @@ $code.=<<___; | |||
1219 | llgf $s3,12($ivp) | 1264 | llgf $s3,12($ivp) |
1220 | 1265 | ||
1221 | lghi $t0,16 | 1266 | lghi $t0,16 |
1222 | slgr $len,$t0 | 1267 | sl${g}r $len,$t0 |
1223 | brc 4,.Lcbc_enc_tail # if borrow | 1268 | brc 4,.Lcbc_enc_tail # if borrow |
1224 | .Lcbc_enc_loop: | 1269 | .Lcbc_enc_loop: |
1225 | stmg $inp,$out,16($sp) | 1270 | stm${g} $inp,$out,2*$SIZE_T($sp) |
1226 | x $s0,0($inp) | 1271 | x $s0,0($inp) |
1227 | x $s1,4($inp) | 1272 | x $s1,4($inp) |
1228 | x $s2,8($inp) | 1273 | x $s2,8($inp) |
@@ -1231,7 +1276,7 @@ $code.=<<___; | |||
1231 | 1276 | ||
1232 | bras $ra,_s390x_AES_encrypt | 1277 | bras $ra,_s390x_AES_encrypt |
1233 | 1278 | ||
1234 | lmg $inp,$key,16($sp) | 1279 | lm${g} $inp,$key,2*$SIZE_T($sp) |
1235 | st $s0,0($out) | 1280 | st $s0,0($out) |
1236 | st $s1,4($out) | 1281 | st $s1,4($out) |
1237 | st $s2,8($out) | 1282 | st $s2,8($out) |
@@ -1240,33 +1285,33 @@ $code.=<<___; | |||
1240 | la $inp,16($inp) | 1285 | la $inp,16($inp) |
1241 | la $out,16($out) | 1286 | la $out,16($out) |
1242 | lghi $t0,16 | 1287 | lghi $t0,16 |
1243 | ltgr $len,$len | 1288 | lt${g}r $len,$len |
1244 | jz .Lcbc_enc_done | 1289 | jz .Lcbc_enc_done |
1245 | slgr $len,$t0 | 1290 | sl${g}r $len,$t0 |
1246 | brc 4,.Lcbc_enc_tail # if borrow | 1291 | brc 4,.Lcbc_enc_tail # if borrow |
1247 | j .Lcbc_enc_loop | 1292 | j .Lcbc_enc_loop |
1248 | .align 16 | 1293 | .align 16 |
1249 | .Lcbc_enc_done: | 1294 | .Lcbc_enc_done: |
1250 | lg $ivp,48($sp) | 1295 | l${g} $ivp,6*$SIZE_T($sp) |
1251 | st $s0,0($ivp) | 1296 | st $s0,0($ivp) |
1252 | st $s1,4($ivp) | 1297 | st $s1,4($ivp) |
1253 | st $s2,8($ivp) | 1298 | st $s2,8($ivp) |
1254 | st $s3,12($ivp) | 1299 | st $s3,12($ivp) |
1255 | 1300 | ||
1256 | lmg %r7,$ra,56($sp) | 1301 | lm${g} %r7,$ra,7*$SIZE_T($sp) |
1257 | br $ra | 1302 | br $ra |
1258 | 1303 | ||
1259 | .align 16 | 1304 | .align 16 |
1260 | .Lcbc_enc_tail: | 1305 | .Lcbc_enc_tail: |
1261 | aghi $len,15 | 1306 | aghi $len,15 |
1262 | lghi $t0,0 | 1307 | lghi $t0,0 |
1263 | stg $t0,128($sp) | 1308 | stg $t0,16*$SIZE_T($sp) |
1264 | stg $t0,136($sp) | 1309 | stg $t0,16*$SIZE_T+8($sp) |
1265 | bras $t1,3f | 1310 | bras $t1,3f |
1266 | mvc 128(1,$sp),0($inp) | 1311 | mvc 16*$SIZE_T(1,$sp),0($inp) |
1267 | 3: ex $len,0($t1) | 1312 | 3: ex $len,0($t1) |
1268 | lghi $len,0 | 1313 | lghi $len,0 |
1269 | la $inp,128($sp) | 1314 | la $inp,16*$SIZE_T($sp) |
1270 | j .Lcbc_enc_loop | 1315 | j .Lcbc_enc_loop |
1271 | 1316 | ||
1272 | .align 16 | 1317 | .align 16 |
@@ -1275,10 +1320,10 @@ $code.=<<___; | |||
1275 | 1320 | ||
1276 | lg $t0,0($ivp) | 1321 | lg $t0,0($ivp) |
1277 | lg $t1,8($ivp) | 1322 | lg $t1,8($ivp) |
1278 | stmg $t0,$t1,128($sp) | 1323 | stmg $t0,$t1,16*$SIZE_T($sp) |
1279 | 1324 | ||
1280 | .Lcbc_dec_loop: | 1325 | .Lcbc_dec_loop: |
1281 | stmg $inp,$out,16($sp) | 1326 | stm${g} $inp,$out,2*$SIZE_T($sp) |
1282 | llgf $s0,0($inp) | 1327 | llgf $s0,0($inp) |
1283 | llgf $s1,4($inp) | 1328 | llgf $s1,4($inp) |
1284 | llgf $s2,8($inp) | 1329 | llgf $s2,8($inp) |
@@ -1287,7 +1332,7 @@ $code.=<<___; | |||
1287 | 1332 | ||
1288 | bras $ra,_s390x_AES_decrypt | 1333 | bras $ra,_s390x_AES_decrypt |
1289 | 1334 | ||
1290 | lmg $inp,$key,16($sp) | 1335 | lm${g} $inp,$key,2*$SIZE_T($sp) |
1291 | sllg $s0,$s0,32 | 1336 | sllg $s0,$s0,32 |
1292 | sllg $s2,$s2,32 | 1337 | sllg $s2,$s2,32 |
1293 | lr $s0,$s1 | 1338 | lr $s0,$s1 |
@@ -1295,15 +1340,15 @@ $code.=<<___; | |||
1295 | 1340 | ||
1296 | lg $t0,0($inp) | 1341 | lg $t0,0($inp) |
1297 | lg $t1,8($inp) | 1342 | lg $t1,8($inp) |
1298 | xg $s0,128($sp) | 1343 | xg $s0,16*$SIZE_T($sp) |
1299 | xg $s2,136($sp) | 1344 | xg $s2,16*$SIZE_T+8($sp) |
1300 | lghi $s1,16 | 1345 | lghi $s1,16 |
1301 | slgr $len,$s1 | 1346 | sl${g}r $len,$s1 |
1302 | brc 4,.Lcbc_dec_tail # if borrow | 1347 | brc 4,.Lcbc_dec_tail # if borrow |
1303 | brc 2,.Lcbc_dec_done # if zero | 1348 | brc 2,.Lcbc_dec_done # if zero |
1304 | stg $s0,0($out) | 1349 | stg $s0,0($out) |
1305 | stg $s2,8($out) | 1350 | stg $s2,8($out) |
1306 | stmg $t0,$t1,128($sp) | 1351 | stmg $t0,$t1,16*$SIZE_T($sp) |
1307 | 1352 | ||
1308 | la $inp,16($inp) | 1353 | la $inp,16($inp) |
1309 | la $out,16($out) | 1354 | la $out,16($out) |
@@ -1313,7 +1358,7 @@ $code.=<<___; | |||
1313 | stg $s0,0($out) | 1358 | stg $s0,0($out) |
1314 | stg $s2,8($out) | 1359 | stg $s2,8($out) |
1315 | .Lcbc_dec_exit: | 1360 | .Lcbc_dec_exit: |
1316 | lmg $ivp,$ra,48($sp) | 1361 | lm${g} %r6,$ra,6*$SIZE_T($sp) |
1317 | stmg $t0,$t1,0($ivp) | 1362 | stmg $t0,$t1,0($ivp) |
1318 | 1363 | ||
1319 | br $ra | 1364 | br $ra |
@@ -1321,19 +1366,889 @@ $code.=<<___; | |||
1321 | .align 16 | 1366 | .align 16 |
1322 | .Lcbc_dec_tail: | 1367 | .Lcbc_dec_tail: |
1323 | aghi $len,15 | 1368 | aghi $len,15 |
1324 | stg $s0,128($sp) | 1369 | stg $s0,16*$SIZE_T($sp) |
1325 | stg $s2,136($sp) | 1370 | stg $s2,16*$SIZE_T+8($sp) |
1326 | bras $s1,4f | 1371 | bras $s1,4f |
1327 | mvc 0(1,$out),128($sp) | 1372 | mvc 0(1,$out),16*$SIZE_T($sp) |
1328 | 4: ex $len,0($s1) | 1373 | 4: ex $len,0($s1) |
1329 | j .Lcbc_dec_exit | 1374 | j .Lcbc_dec_exit |
1330 | .size AES_cbc_encrypt,.-AES_cbc_encrypt | 1375 | .size AES_cbc_encrypt,.-AES_cbc_encrypt |
1331 | .comm OPENSSL_s390xcap_P,8,8 | 1376 | ___ |
1377 | } | ||
1378 | ######################################################################## | ||
1379 | # void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out, | ||
1380 | # size_t blocks, const AES_KEY *key, | ||
1381 | # const unsigned char *ivec) | ||
1382 | { | ||
1383 | my $inp="%r2"; | ||
1384 | my $out="%r4"; # blocks and out are swapped | ||
1385 | my $len="%r3"; | ||
1386 | my $key="%r5"; my $iv0="%r5"; | ||
1387 | my $ivp="%r6"; | ||
1388 | my $fp ="%r7"; | ||
1389 | |||
1390 | $code.=<<___; | ||
1391 | .globl AES_ctr32_encrypt | ||
1392 | .type AES_ctr32_encrypt,\@function | ||
1393 | .align 16 | ||
1394 | AES_ctr32_encrypt: | ||
1395 | xgr %r3,%r4 # flip %r3 and %r4, $out and $len | ||
1396 | xgr %r4,%r3 | ||
1397 | xgr %r3,%r4 | ||
1398 | llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case | ||
1399 | ___ | ||
1400 | $code.=<<___ if (!$softonly); | ||
1401 | l %r0,240($key) | ||
1402 | lhi %r1,16 | ||
1403 | clr %r0,%r1 | ||
1404 | jl .Lctr32_software | ||
1405 | |||
1406 | stm${g} %r6,$s3,6*$SIZE_T($sp) | ||
1407 | |||
1408 | slgr $out,$inp | ||
1409 | la %r1,0($key) # %r1 is permanent copy of $key | ||
1410 | lg $iv0,0($ivp) # load ivec | ||
1411 | lg $ivp,8($ivp) | ||
1412 | |||
1413 | # prepare and allocate stack frame at the top of 4K page | ||
1414 | # with 1K reserved for eventual signal handling | ||
1415 | lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer | ||
1416 | lghi $s1,-4096 | ||
1417 | algr $s0,$sp | ||
1418 | lgr $fp,$sp | ||
1419 | ngr $s0,$s1 # align at page boundary | ||
1420 | slgr $fp,$s0 # total buffer size | ||
1421 | lgr $s2,$sp | ||
1422 | lghi $s1,1024+16 # sl[g]fi is extended-immediate facility | ||
1423 | slgr $fp,$s1 # deduct reservation to get usable buffer size | ||
1424 | # buffer size is at lest 256 and at most 3072+256-16 | ||
1425 | |||
1426 | la $sp,1024($s0) # alloca | ||
1427 | srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 | ||
1428 | st${g} $s2,0($sp) # back-chain | ||
1429 | st${g} $fp,$SIZE_T($sp) | ||
1430 | |||
1431 | slgr $len,$fp | ||
1432 | brc 1,.Lctr32_hw_switch # not zero, no borrow | ||
1433 | algr $fp,$len # input is shorter than allocated buffer | ||
1434 | lghi $len,0 | ||
1435 | st${g} $fp,$SIZE_T($sp) | ||
1436 | |||
1437 | .Lctr32_hw_switch: | ||
1438 | ___ | ||
1439 | $code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower | ||
1440 | larl $s0,OPENSSL_s390xcap_P | ||
1441 | lg $s0,8($s0) | ||
1442 | tmhh $s0,0x0004 # check for message_security-assist-4 | ||
1443 | jz .Lctr32_km_loop | ||
1444 | |||
1445 | llgfr $s0,%r0 | ||
1446 | lgr $s1,%r1 | ||
1447 | lghi %r0,0 | ||
1448 | la %r1,16($sp) | ||
1449 | .long 0xb92d2042 # kmctr %r4,%r2,%r2 | ||
1450 | |||
1451 | llihh %r0,0x8000 # check if kmctr supports the function code | ||
1452 | srlg %r0,%r0,0($s0) | ||
1453 | ng %r0,16($sp) | ||
1454 | lgr %r0,$s0 | ||
1455 | lgr %r1,$s1 | ||
1456 | jz .Lctr32_km_loop | ||
1457 | |||
1458 | ####### kmctr code | ||
1459 | algr $out,$inp # restore $out | ||
1460 | lgr $s1,$len # $s1 undertakes $len | ||
1461 | j .Lctr32_kmctr_loop | ||
1462 | .align 16 | ||
1463 | .Lctr32_kmctr_loop: | ||
1464 | la $s2,16($sp) | ||
1465 | lgr $s3,$fp | ||
1466 | .Lctr32_kmctr_prepare: | ||
1467 | stg $iv0,0($s2) | ||
1468 | stg $ivp,8($s2) | ||
1469 | la $s2,16($s2) | ||
1470 | ahi $ivp,1 # 32-bit increment, preserves upper half | ||
1471 | brct $s3,.Lctr32_kmctr_prepare | ||
1472 | |||
1473 | #la $inp,0($inp) # inp | ||
1474 | sllg $len,$fp,4 # len | ||
1475 | #la $out,0($out) # out | ||
1476 | la $s2,16($sp) # iv | ||
1477 | .long 0xb92da042 # kmctr $out,$s2,$inp | ||
1478 | brc 1,.-4 # pay attention to "partial completion" | ||
1479 | |||
1480 | slgr $s1,$fp | ||
1481 | brc 1,.Lctr32_kmctr_loop # not zero, no borrow | ||
1482 | algr $fp,$s1 | ||
1483 | lghi $s1,0 | ||
1484 | brc 4+1,.Lctr32_kmctr_loop # not zero | ||
1485 | |||
1486 | l${g} $sp,0($sp) | ||
1487 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
1488 | br $ra | ||
1489 | .align 16 | ||
1490 | ___ | ||
1491 | $code.=<<___; | ||
1492 | .Lctr32_km_loop: | ||
1493 | la $s2,16($sp) | ||
1494 | lgr $s3,$fp | ||
1495 | .Lctr32_km_prepare: | ||
1496 | stg $iv0,0($s2) | ||
1497 | stg $ivp,8($s2) | ||
1498 | la $s2,16($s2) | ||
1499 | ahi $ivp,1 # 32-bit increment, preserves upper half | ||
1500 | brct $s3,.Lctr32_km_prepare | ||
1501 | |||
1502 | la $s0,16($sp) # inp | ||
1503 | sllg $s1,$fp,4 # len | ||
1504 | la $s2,16($sp) # out | ||
1505 | .long 0xb92e00a8 # km %r10,%r8 | ||
1506 | brc 1,.-4 # pay attention to "partial completion" | ||
1507 | |||
1508 | la $s2,16($sp) | ||
1509 | lgr $s3,$fp | ||
1510 | slgr $s2,$inp | ||
1511 | .Lctr32_km_xor: | ||
1512 | lg $s0,0($inp) | ||
1513 | lg $s1,8($inp) | ||
1514 | xg $s0,0($s2,$inp) | ||
1515 | xg $s1,8($s2,$inp) | ||
1516 | stg $s0,0($out,$inp) | ||
1517 | stg $s1,8($out,$inp) | ||
1518 | la $inp,16($inp) | ||
1519 | brct $s3,.Lctr32_km_xor | ||
1520 | |||
1521 | slgr $len,$fp | ||
1522 | brc 1,.Lctr32_km_loop # not zero, no borrow | ||
1523 | algr $fp,$len | ||
1524 | lghi $len,0 | ||
1525 | brc 4+1,.Lctr32_km_loop # not zero | ||
1526 | |||
1527 | l${g} $s0,0($sp) | ||
1528 | l${g} $s1,$SIZE_T($sp) | ||
1529 | la $s2,16($sp) | ||
1530 | .Lctr32_km_zap: | ||
1531 | stg $s0,0($s2) | ||
1532 | stg $s0,8($s2) | ||
1533 | la $s2,16($s2) | ||
1534 | brct $s1,.Lctr32_km_zap | ||
1535 | |||
1536 | la $sp,0($s0) | ||
1537 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
1538 | br $ra | ||
1539 | .align 16 | ||
1540 | .Lctr32_software: | ||
1541 | ___ | ||
1542 | $code.=<<___; | ||
1543 | stm${g} $key,$ra,5*$SIZE_T($sp) | ||
1544 | sl${g}r $inp,$out | ||
1545 | larl $tbl,AES_Te | ||
1546 | llgf $t1,12($ivp) | ||
1547 | |||
1548 | .Lctr32_loop: | ||
1549 | stm${g} $inp,$out,2*$SIZE_T($sp) | ||
1550 | llgf $s0,0($ivp) | ||
1551 | llgf $s1,4($ivp) | ||
1552 | llgf $s2,8($ivp) | ||
1553 | lgr $s3,$t1 | ||
1554 | st $t1,16*$SIZE_T($sp) | ||
1555 | lgr %r4,$key | ||
1556 | |||
1557 | bras $ra,_s390x_AES_encrypt | ||
1558 | |||
1559 | lm${g} $inp,$ivp,2*$SIZE_T($sp) | ||
1560 | llgf $t1,16*$SIZE_T($sp) | ||
1561 | x $s0,0($inp,$out) | ||
1562 | x $s1,4($inp,$out) | ||
1563 | x $s2,8($inp,$out) | ||
1564 | x $s3,12($inp,$out) | ||
1565 | stm $s0,$s3,0($out) | ||
1566 | |||
1567 | la $out,16($out) | ||
1568 | ahi $t1,1 # 32-bit increment | ||
1569 | brct $len,.Lctr32_loop | ||
1570 | |||
1571 | lm${g} %r6,$ra,6*$SIZE_T($sp) | ||
1572 | br $ra | ||
1573 | .size AES_ctr32_encrypt,.-AES_ctr32_encrypt | ||
1574 | ___ | ||
1575 | } | ||
1576 | |||
1577 | ######################################################################## | ||
1578 | # void AES_xts_encrypt(const char *inp,char *out,size_t len, | ||
1579 | # const AES_KEY *key1, const AES_KEY *key2, | ||
1580 | # const unsigned char iv[16]); | ||
1581 | # | ||
1582 | { | ||
1583 | my $inp="%r2"; | ||
1584 | my $out="%r4"; # len and out are swapped | ||
1585 | my $len="%r3"; | ||
1586 | my $key1="%r5"; # $i1 | ||
1587 | my $key2="%r6"; # $i2 | ||
1588 | my $fp="%r7"; # $i3 | ||
1589 | my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame... | ||
1590 | |||
1591 | $code.=<<___; | ||
1592 | .type _s390x_xts_km,\@function | ||
1593 | .align 16 | ||
1594 | _s390x_xts_km: | ||
1595 | ___ | ||
1596 | $code.=<<___ if(1); | ||
1597 | llgfr $s0,%r0 # put aside the function code | ||
1598 | lghi $s1,0x7f | ||
1599 | nr $s1,%r0 | ||
1600 | lghi %r0,0 # query capability vector | ||
1601 | la %r1,2*$SIZE_T($sp) | ||
1602 | .long 0xb92e0042 # km %r4,%r2 | ||
1603 | llihh %r1,0x8000 | ||
1604 | srlg %r1,%r1,32($s1) # check for 32+function code | ||
1605 | ng %r1,2*$SIZE_T($sp) | ||
1606 | lgr %r0,$s0 # restore the function code | ||
1607 | la %r1,0($key1) # restore $key1 | ||
1608 | jz .Lxts_km_vanilla | ||
1609 | |||
1610 | lmg $i2,$i3,$tweak($sp) # put aside the tweak value | ||
1611 | algr $out,$inp | ||
1612 | |||
1613 | oill %r0,32 # switch to xts function code | ||
1614 | aghi $s1,-18 # | ||
1615 | sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16 | ||
1616 | la %r1,$tweak-16($sp) | ||
1617 | slgr %r1,$s1 # parameter block position | ||
1618 | lmg $s0,$s3,0($key1) # load 256 bits of key material, | ||
1619 | stmg $s0,$s3,0(%r1) # and copy it to parameter block. | ||
1620 | # yes, it contains junk and overlaps | ||
1621 | # with the tweak in 128-bit case. | ||
1622 | # it's done to avoid conditional | ||
1623 | # branch. | ||
1624 | stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value | ||
1625 | |||
1626 | .long 0xb92e0042 # km %r4,%r2 | ||
1627 | brc 1,.-4 # pay attention to "partial completion" | ||
1628 | |||
1629 | lrvg $s0,$tweak+0($sp) # load the last tweak | ||
1630 | lrvg $s1,$tweak+8($sp) | ||
1631 | stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key | ||
1632 | |||
1633 | nill %r0,0xffdf # switch back to original function code | ||
1634 | la %r1,0($key1) # restore pointer to $key1 | ||
1635 | slgr $out,$inp | ||
1636 | |||
1637 | llgc $len,2*$SIZE_T-1($sp) | ||
1638 | nill $len,0x0f # $len%=16 | ||
1639 | br $ra | ||
1640 | |||
1641 | .align 16 | ||
1642 | .Lxts_km_vanilla: | ||
1643 | ___ | ||
1644 | $code.=<<___; | ||
1645 | # prepare and allocate stack frame at the top of 4K page | ||
1646 | # with 1K reserved for eventual signal handling | ||
1647 | lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer | ||
1648 | lghi $s1,-4096 | ||
1649 | algr $s0,$sp | ||
1650 | lgr $fp,$sp | ||
1651 | ngr $s0,$s1 # align at page boundary | ||
1652 | slgr $fp,$s0 # total buffer size | ||
1653 | lgr $s2,$sp | ||
1654 | lghi $s1,1024+16 # sl[g]fi is extended-immediate facility | ||
1655 | slgr $fp,$s1 # deduct reservation to get usable buffer size | ||
1656 | # buffer size is at lest 256 and at most 3072+256-16 | ||
1657 | |||
1658 | la $sp,1024($s0) # alloca | ||
1659 | nill $fp,0xfff0 # round to 16*n | ||
1660 | st${g} $s2,0($sp) # back-chain | ||
1661 | nill $len,0xfff0 # redundant | ||
1662 | st${g} $fp,$SIZE_T($sp) | ||
1663 | |||
1664 | slgr $len,$fp | ||
1665 | brc 1,.Lxts_km_go # not zero, no borrow | ||
1666 | algr $fp,$len # input is shorter than allocated buffer | ||
1667 | lghi $len,0 | ||
1668 | st${g} $fp,$SIZE_T($sp) | ||
1669 | |||
1670 | .Lxts_km_go: | ||
1671 | lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian | ||
1672 | lrvg $s1,$tweak+8($s2) | ||
1673 | |||
1674 | la $s2,16($sp) # vector of ascending tweak values | ||
1675 | slgr $s2,$inp | ||
1676 | srlg $s3,$fp,4 | ||
1677 | j .Lxts_km_start | ||
1678 | |||
1679 | .Lxts_km_loop: | ||
1680 | la $s2,16($sp) | ||
1681 | slgr $s2,$inp | ||
1682 | srlg $s3,$fp,4 | ||
1683 | .Lxts_km_prepare: | ||
1684 | lghi $i1,0x87 | ||
1685 | srag $i2,$s1,63 # broadcast upper bit | ||
1686 | ngr $i1,$i2 # rem | ||
1687 | srlg $i2,$s0,63 # carry bit from lower half | ||
1688 | sllg $s0,$s0,1 | ||
1689 | sllg $s1,$s1,1 | ||
1690 | xgr $s0,$i1 | ||
1691 | ogr $s1,$i2 | ||
1692 | .Lxts_km_start: | ||
1693 | lrvgr $i1,$s0 # flip byte order | ||
1694 | lrvgr $i2,$s1 | ||
1695 | stg $i1,0($s2,$inp) | ||
1696 | stg $i2,8($s2,$inp) | ||
1697 | xg $i1,0($inp) | ||
1698 | xg $i2,8($inp) | ||
1699 | stg $i1,0($out,$inp) | ||
1700 | stg $i2,8($out,$inp) | ||
1701 | la $inp,16($inp) | ||
1702 | brct $s3,.Lxts_km_prepare | ||
1703 | |||
1704 | slgr $inp,$fp # rewind $inp | ||
1705 | la $s2,0($out,$inp) | ||
1706 | lgr $s3,$fp | ||
1707 | .long 0xb92e00aa # km $s2,$s2 | ||
1708 | brc 1,.-4 # pay attention to "partial completion" | ||
1709 | |||
1710 | la $s2,16($sp) | ||
1711 | slgr $s2,$inp | ||
1712 | srlg $s3,$fp,4 | ||
1713 | .Lxts_km_xor: | ||
1714 | lg $i1,0($out,$inp) | ||
1715 | lg $i2,8($out,$inp) | ||
1716 | xg $i1,0($s2,$inp) | ||
1717 | xg $i2,8($s2,$inp) | ||
1718 | stg $i1,0($out,$inp) | ||
1719 | stg $i2,8($out,$inp) | ||
1720 | la $inp,16($inp) | ||
1721 | brct $s3,.Lxts_km_xor | ||
1722 | |||
1723 | slgr $len,$fp | ||
1724 | brc 1,.Lxts_km_loop # not zero, no borrow | ||
1725 | algr $fp,$len | ||
1726 | lghi $len,0 | ||
1727 | brc 4+1,.Lxts_km_loop # not zero | ||
1728 | |||
1729 | l${g} $i1,0($sp) # back-chain | ||
1730 | llgf $fp,`2*$SIZE_T-4`($sp) # bytes used | ||
1731 | la $i2,16($sp) | ||
1732 | srlg $fp,$fp,4 | ||
1733 | .Lxts_km_zap: | ||
1734 | stg $i1,0($i2) | ||
1735 | stg $i1,8($i2) | ||
1736 | la $i2,16($i2) | ||
1737 | brct $fp,.Lxts_km_zap | ||
1738 | |||
1739 | la $sp,0($i1) | ||
1740 | llgc $len,2*$SIZE_T-1($i1) | ||
1741 | nill $len,0x0f # $len%=16 | ||
1742 | bzr $ra | ||
1743 | |||
1744 | # generate one more tweak... | ||
1745 | lghi $i1,0x87 | ||
1746 | srag $i2,$s1,63 # broadcast upper bit | ||
1747 | ngr $i1,$i2 # rem | ||
1748 | srlg $i2,$s0,63 # carry bit from lower half | ||
1749 | sllg $s0,$s0,1 | ||
1750 | sllg $s1,$s1,1 | ||
1751 | xgr $s0,$i1 | ||
1752 | ogr $s1,$i2 | ||
1753 | |||
1754 | ltr $len,$len # clear zero flag | ||
1755 | br $ra | ||
1756 | .size _s390x_xts_km,.-_s390x_xts_km | ||
1757 | |||
1758 | .globl AES_xts_encrypt | ||
1759 | .type AES_xts_encrypt,\@function | ||
1760 | .align 16 | ||
1761 | AES_xts_encrypt: | ||
1762 | xgr %r3,%r4 # flip %r3 and %r4, $out and $len | ||
1763 | xgr %r4,%r3 | ||
1764 | xgr %r3,%r4 | ||
1765 | ___ | ||
1766 | $code.=<<___ if ($SIZE_T==4); | ||
1767 | llgfr $len,$len | ||
1768 | ___ | ||
1769 | $code.=<<___; | ||
1770 | st${g} $len,1*$SIZE_T($sp) # save copy of $len | ||
1771 | srag $len,$len,4 # formally wrong, because it expands | ||
1772 | # sign byte, but who can afford asking | ||
1773 | # to process more than 2^63-1 bytes? | ||
1774 | # I use it, because it sets condition | ||
1775 | # code... | ||
1776 | bcr 8,$ra # abort if zero (i.e. less than 16) | ||
1777 | ___ | ||
1778 | $code.=<<___ if (!$softonly); | ||
1779 | llgf %r0,240($key2) | ||
1780 | lhi %r1,16 | ||
1781 | clr %r0,%r1 | ||
1782 | jl .Lxts_enc_software | ||
1783 | |||
1784 | stm${g} %r6,$s3,6*$SIZE_T($sp) | ||
1785 | st${g} $ra,14*$SIZE_T($sp) | ||
1786 | |||
1787 | sllg $len,$len,4 # $len&=~15 | ||
1788 | slgr $out,$inp | ||
1789 | |||
1790 | # generate the tweak value | ||
1791 | l${g} $s3,$stdframe($sp) # pointer to iv | ||
1792 | la $s2,$tweak($sp) | ||
1793 | lmg $s0,$s1,0($s3) | ||
1794 | lghi $s3,16 | ||
1795 | stmg $s0,$s1,0($s2) | ||
1796 | la %r1,0($key2) # $key2 is not needed anymore | ||
1797 | .long 0xb92e00aa # km $s2,$s2, generate the tweak | ||
1798 | brc 1,.-4 # can this happen? | ||
1799 | |||
1800 | l %r0,240($key1) | ||
1801 | la %r1,0($key1) # $key1 is not needed anymore | ||
1802 | bras $ra,_s390x_xts_km | ||
1803 | jz .Lxts_enc_km_done | ||
1804 | |||
1805 | aghi $inp,-16 # take one step back | ||
1806 | la $i3,0($out,$inp) # put aside real $out | ||
1807 | .Lxts_enc_km_steal: | ||
1808 | llgc $i1,16($inp) | ||
1809 | llgc $i2,0($out,$inp) | ||
1810 | stc $i1,0($out,$inp) | ||
1811 | stc $i2,16($out,$inp) | ||
1812 | la $inp,1($inp) | ||
1813 | brct $len,.Lxts_enc_km_steal | ||
1814 | |||
1815 | la $s2,0($i3) | ||
1816 | lghi $s3,16 | ||
1817 | lrvgr $i1,$s0 # flip byte order | ||
1818 | lrvgr $i2,$s1 | ||
1819 | xg $i1,0($s2) | ||
1820 | xg $i2,8($s2) | ||
1821 | stg $i1,0($s2) | ||
1822 | stg $i2,8($s2) | ||
1823 | .long 0xb92e00aa # km $s2,$s2 | ||
1824 | brc 1,.-4 # can this happen? | ||
1825 | lrvgr $i1,$s0 # flip byte order | ||
1826 | lrvgr $i2,$s1 | ||
1827 | xg $i1,0($i3) | ||
1828 | xg $i2,8($i3) | ||
1829 | stg $i1,0($i3) | ||
1830 | stg $i2,8($i3) | ||
1831 | |||
1832 | .Lxts_enc_km_done: | ||
1833 | l${g} $ra,14*$SIZE_T($sp) | ||
1834 | st${g} $sp,$tweak($sp) # wipe tweak | ||
1835 | st${g} $sp,$tweak($sp) | ||
1836 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
1837 | br $ra | ||
1838 | .align 16 | ||
1839 | .Lxts_enc_software: | ||
1840 | ___ | ||
1841 | $code.=<<___; | ||
1842 | stm${g} %r6,$ra,6*$SIZE_T($sp) | ||
1843 | |||
1844 | slgr $out,$inp | ||
1845 | |||
1846 | xgr $s0,$s0 # clear upper half | ||
1847 | xgr $s1,$s1 | ||
1848 | lrv $s0,$stdframe+4($sp) # load secno | ||
1849 | lrv $s1,$stdframe+0($sp) | ||
1850 | xgr $s2,$s2 | ||
1851 | xgr $s3,$s3 | ||
1852 | stm${g} %r2,%r5,2*$SIZE_T($sp) | ||
1853 | la $key,0($key2) | ||
1854 | larl $tbl,AES_Te | ||
1855 | bras $ra,_s390x_AES_encrypt # generate the tweak | ||
1856 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
1857 | stm $s0,$s3,$tweak($sp) # save the tweak | ||
1858 | j .Lxts_enc_enter | ||
1859 | |||
1860 | .align 16 | ||
1861 | .Lxts_enc_loop: | ||
1862 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
1863 | lrvg $s3,$tweak+8($sp) | ||
1864 | lghi %r1,0x87 | ||
1865 | srag %r0,$s3,63 # broadcast upper bit | ||
1866 | ngr %r1,%r0 # rem | ||
1867 | srlg %r0,$s1,63 # carry bit from lower half | ||
1868 | sllg $s1,$s1,1 | ||
1869 | sllg $s3,$s3,1 | ||
1870 | xgr $s1,%r1 | ||
1871 | ogr $s3,%r0 | ||
1872 | lrvgr $s1,$s1 # flip byte order | ||
1873 | lrvgr $s3,$s3 | ||
1874 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
1875 | stg $s1,$tweak+0($sp) # save the tweak | ||
1876 | llgfr $s1,$s1 | ||
1877 | srlg $s2,$s3,32 | ||
1878 | stg $s3,$tweak+8($sp) | ||
1879 | llgfr $s3,$s3 | ||
1880 | la $inp,16($inp) # $inp+=16 | ||
1881 | .Lxts_enc_enter: | ||
1882 | x $s0,0($inp) # ^=*($inp) | ||
1883 | x $s1,4($inp) | ||
1884 | x $s2,8($inp) | ||
1885 | x $s3,12($inp) | ||
1886 | stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing | ||
1887 | la $key,0($key1) | ||
1888 | bras $ra,_s390x_AES_encrypt | ||
1889 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
1890 | x $s0,$tweak+0($sp) # ^=tweak | ||
1891 | x $s1,$tweak+4($sp) | ||
1892 | x $s2,$tweak+8($sp) | ||
1893 | x $s3,$tweak+12($sp) | ||
1894 | st $s0,0($out,$inp) | ||
1895 | st $s1,4($out,$inp) | ||
1896 | st $s2,8($out,$inp) | ||
1897 | st $s3,12($out,$inp) | ||
1898 | brct${g} $len,.Lxts_enc_loop | ||
1899 | |||
1900 | llgc $len,`2*$SIZE_T-1`($sp) | ||
1901 | nill $len,0x0f # $len%16 | ||
1902 | jz .Lxts_enc_done | ||
1903 | |||
1904 | la $i3,0($inp,$out) # put aside real $out | ||
1905 | .Lxts_enc_steal: | ||
1906 | llgc %r0,16($inp) | ||
1907 | llgc %r1,0($out,$inp) | ||
1908 | stc %r0,0($out,$inp) | ||
1909 | stc %r1,16($out,$inp) | ||
1910 | la $inp,1($inp) | ||
1911 | brct $len,.Lxts_enc_steal | ||
1912 | la $out,0($i3) # restore real $out | ||
1913 | |||
1914 | # generate last tweak... | ||
1915 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
1916 | lrvg $s3,$tweak+8($sp) | ||
1917 | lghi %r1,0x87 | ||
1918 | srag %r0,$s3,63 # broadcast upper bit | ||
1919 | ngr %r1,%r0 # rem | ||
1920 | srlg %r0,$s1,63 # carry bit from lower half | ||
1921 | sllg $s1,$s1,1 | ||
1922 | sllg $s3,$s3,1 | ||
1923 | xgr $s1,%r1 | ||
1924 | ogr $s3,%r0 | ||
1925 | lrvgr $s1,$s1 # flip byte order | ||
1926 | lrvgr $s3,$s3 | ||
1927 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
1928 | stg $s1,$tweak+0($sp) # save the tweak | ||
1929 | llgfr $s1,$s1 | ||
1930 | srlg $s2,$s3,32 | ||
1931 | stg $s3,$tweak+8($sp) | ||
1932 | llgfr $s3,$s3 | ||
1933 | |||
1934 | x $s0,0($out) # ^=*(inp)|stolen cipther-text | ||
1935 | x $s1,4($out) | ||
1936 | x $s2,8($out) | ||
1937 | x $s3,12($out) | ||
1938 | st${g} $out,4*$SIZE_T($sp) | ||
1939 | la $key,0($key1) | ||
1940 | bras $ra,_s390x_AES_encrypt | ||
1941 | l${g} $out,4*$SIZE_T($sp) | ||
1942 | x $s0,`$tweak+0`($sp) # ^=tweak | ||
1943 | x $s1,`$tweak+4`($sp) | ||
1944 | x $s2,`$tweak+8`($sp) | ||
1945 | x $s3,`$tweak+12`($sp) | ||
1946 | st $s0,0($out) | ||
1947 | st $s1,4($out) | ||
1948 | st $s2,8($out) | ||
1949 | st $s3,12($out) | ||
1950 | |||
1951 | .Lxts_enc_done: | ||
1952 | stg $sp,$tweak+0($sp) # wipe tweak | ||
1953 | stg $sp,$twesk+8($sp) | ||
1954 | lm${g} %r6,$ra,6*$SIZE_T($sp) | ||
1955 | br $ra | ||
1956 | .size AES_xts_encrypt,.-AES_xts_encrypt | ||
1957 | ___ | ||
1958 | # void AES_xts_decrypt(const char *inp,char *out,size_t len, | ||
1959 | # const AES_KEY *key1, const AES_KEY *key2,u64 secno); | ||
1960 | # | ||
1961 | $code.=<<___; | ||
1962 | .globl AES_xts_decrypt | ||
1963 | .type AES_xts_decrypt,\@function | ||
1964 | .align 16 | ||
1965 | AES_xts_decrypt: | ||
1966 | xgr %r3,%r4 # flip %r3 and %r4, $out and $len | ||
1967 | xgr %r4,%r3 | ||
1968 | xgr %r3,%r4 | ||
1969 | ___ | ||
1970 | $code.=<<___ if ($SIZE_T==4); | ||
1971 | llgfr $len,$len | ||
1972 | ___ | ||
1973 | $code.=<<___; | ||
1974 | st${g} $len,1*$SIZE_T($sp) # save copy of $len | ||
1975 | aghi $len,-16 | ||
1976 | bcr 4,$ra # abort if less than zero. formally | ||
1977 | # wrong, because $len is unsigned, | ||
1978 | # but who can afford asking to | ||
1979 | # process more than 2^63-1 bytes? | ||
1980 | tmll $len,0x0f | ||
1981 | jnz .Lxts_dec_proceed | ||
1982 | aghi $len,16 | ||
1983 | .Lxts_dec_proceed: | ||
1984 | ___ | ||
1985 | $code.=<<___ if (!$softonly); | ||
1986 | llgf %r0,240($key2) | ||
1987 | lhi %r1,16 | ||
1988 | clr %r0,%r1 | ||
1989 | jl .Lxts_dec_software | ||
1990 | |||
1991 | stm${g} %r6,$s3,6*$SIZE_T($sp) | ||
1992 | st${g} $ra,14*$SIZE_T($sp) | ||
1993 | |||
1994 | nill $len,0xfff0 # $len&=~15 | ||
1995 | slgr $out,$inp | ||
1996 | |||
1997 | # generate the tweak value | ||
1998 | l${g} $s3,$stdframe($sp) # pointer to iv | ||
1999 | la $s2,$tweak($sp) | ||
2000 | lmg $s0,$s1,0($s3) | ||
2001 | lghi $s3,16 | ||
2002 | stmg $s0,$s1,0($s2) | ||
2003 | la %r1,0($key2) # $key2 is not needed past this point | ||
2004 | .long 0xb92e00aa # km $s2,$s2, generate the tweak | ||
2005 | brc 1,.-4 # can this happen? | ||
2006 | |||
2007 | l %r0,240($key1) | ||
2008 | la %r1,0($key1) # $key1 is not needed anymore | ||
2009 | |||
2010 | ltgr $len,$len | ||
2011 | jz .Lxts_dec_km_short | ||
2012 | bras $ra,_s390x_xts_km | ||
2013 | jz .Lxts_dec_km_done | ||
2014 | |||
2015 | lrvgr $s2,$s0 # make copy in reverse byte order | ||
2016 | lrvgr $s3,$s1 | ||
2017 | j .Lxts_dec_km_2ndtweak | ||
2018 | |||
2019 | .Lxts_dec_km_short: | ||
2020 | llgc $len,`2*$SIZE_T-1`($sp) | ||
2021 | nill $len,0x0f # $len%=16 | ||
2022 | lrvg $s0,$tweak+0($sp) # load the tweak | ||
2023 | lrvg $s1,$tweak+8($sp) | ||
2024 | lrvgr $s2,$s0 # make copy in reverse byte order | ||
2025 | lrvgr $s3,$s1 | ||
2026 | |||
2027 | .Lxts_dec_km_2ndtweak: | ||
2028 | lghi $i1,0x87 | ||
2029 | srag $i2,$s1,63 # broadcast upper bit | ||
2030 | ngr $i1,$i2 # rem | ||
2031 | srlg $i2,$s0,63 # carry bit from lower half | ||
2032 | sllg $s0,$s0,1 | ||
2033 | sllg $s1,$s1,1 | ||
2034 | xgr $s0,$i1 | ||
2035 | ogr $s1,$i2 | ||
2036 | lrvgr $i1,$s0 # flip byte order | ||
2037 | lrvgr $i2,$s1 | ||
2038 | |||
2039 | xg $i1,0($inp) | ||
2040 | xg $i2,8($inp) | ||
2041 | stg $i1,0($out,$inp) | ||
2042 | stg $i2,8($out,$inp) | ||
2043 | la $i2,0($out,$inp) | ||
2044 | lghi $i3,16 | ||
2045 | .long 0xb92e0066 # km $i2,$i2 | ||
2046 | brc 1,.-4 # can this happen? | ||
2047 | lrvgr $i1,$s0 | ||
2048 | lrvgr $i2,$s1 | ||
2049 | xg $i1,0($out,$inp) | ||
2050 | xg $i2,8($out,$inp) | ||
2051 | stg $i1,0($out,$inp) | ||
2052 | stg $i2,8($out,$inp) | ||
2053 | |||
2054 | la $i3,0($out,$inp) # put aside real $out | ||
2055 | .Lxts_dec_km_steal: | ||
2056 | llgc $i1,16($inp) | ||
2057 | llgc $i2,0($out,$inp) | ||
2058 | stc $i1,0($out,$inp) | ||
2059 | stc $i2,16($out,$inp) | ||
2060 | la $inp,1($inp) | ||
2061 | brct $len,.Lxts_dec_km_steal | ||
2062 | |||
2063 | lgr $s0,$s2 | ||
2064 | lgr $s1,$s3 | ||
2065 | xg $s0,0($i3) | ||
2066 | xg $s1,8($i3) | ||
2067 | stg $s0,0($i3) | ||
2068 | stg $s1,8($i3) | ||
2069 | la $s0,0($i3) | ||
2070 | lghi $s1,16 | ||
2071 | .long 0xb92e0088 # km $s0,$s0 | ||
2072 | brc 1,.-4 # can this happen? | ||
2073 | xg $s2,0($i3) | ||
2074 | xg $s3,8($i3) | ||
2075 | stg $s2,0($i3) | ||
2076 | stg $s3,8($i3) | ||
2077 | .Lxts_dec_km_done: | ||
2078 | l${g} $ra,14*$SIZE_T($sp) | ||
2079 | st${g} $sp,$tweak($sp) # wipe tweak | ||
2080 | st${g} $sp,$tweak($sp) | ||
2081 | lm${g} %r6,$s3,6*$SIZE_T($sp) | ||
2082 | br $ra | ||
2083 | .align 16 | ||
2084 | .Lxts_dec_software: | ||
2085 | ___ | ||
2086 | $code.=<<___; | ||
2087 | stm${g} %r6,$ra,6*$SIZE_T($sp) | ||
2088 | |||
2089 | srlg $len,$len,4 | ||
2090 | slgr $out,$inp | ||
2091 | |||
2092 | xgr $s0,$s0 # clear upper half | ||
2093 | xgr $s1,$s1 | ||
2094 | lrv $s0,$stdframe+4($sp) # load secno | ||
2095 | lrv $s1,$stdframe+0($sp) | ||
2096 | xgr $s2,$s2 | ||
2097 | xgr $s3,$s3 | ||
2098 | stm${g} %r2,%r5,2*$SIZE_T($sp) | ||
2099 | la $key,0($key2) | ||
2100 | larl $tbl,AES_Te | ||
2101 | bras $ra,_s390x_AES_encrypt # generate the tweak | ||
2102 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
2103 | larl $tbl,AES_Td | ||
2104 | lt${g}r $len,$len | ||
2105 | stm $s0,$s3,$tweak($sp) # save the tweak | ||
2106 | jz .Lxts_dec_short | ||
2107 | j .Lxts_dec_enter | ||
2108 | |||
2109 | .align 16 | ||
2110 | .Lxts_dec_loop: | ||
2111 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
2112 | lrvg $s3,$tweak+8($sp) | ||
2113 | lghi %r1,0x87 | ||
2114 | srag %r0,$s3,63 # broadcast upper bit | ||
2115 | ngr %r1,%r0 # rem | ||
2116 | srlg %r0,$s1,63 # carry bit from lower half | ||
2117 | sllg $s1,$s1,1 | ||
2118 | sllg $s3,$s3,1 | ||
2119 | xgr $s1,%r1 | ||
2120 | ogr $s3,%r0 | ||
2121 | lrvgr $s1,$s1 # flip byte order | ||
2122 | lrvgr $s3,$s3 | ||
2123 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
2124 | stg $s1,$tweak+0($sp) # save the tweak | ||
2125 | llgfr $s1,$s1 | ||
2126 | srlg $s2,$s3,32 | ||
2127 | stg $s3,$tweak+8($sp) | ||
2128 | llgfr $s3,$s3 | ||
2129 | .Lxts_dec_enter: | ||
2130 | x $s0,0($inp) # tweak^=*(inp) | ||
2131 | x $s1,4($inp) | ||
2132 | x $s2,8($inp) | ||
2133 | x $s3,12($inp) | ||
2134 | stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing | ||
2135 | la $key,0($key1) | ||
2136 | bras $ra,_s390x_AES_decrypt | ||
2137 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
2138 | x $s0,$tweak+0($sp) # ^=tweak | ||
2139 | x $s1,$tweak+4($sp) | ||
2140 | x $s2,$tweak+8($sp) | ||
2141 | x $s3,$tweak+12($sp) | ||
2142 | st $s0,0($out,$inp) | ||
2143 | st $s1,4($out,$inp) | ||
2144 | st $s2,8($out,$inp) | ||
2145 | st $s3,12($out,$inp) | ||
2146 | la $inp,16($inp) | ||
2147 | brct${g} $len,.Lxts_dec_loop | ||
2148 | |||
2149 | llgc $len,`2*$SIZE_T-1`($sp) | ||
2150 | nill $len,0x0f # $len%16 | ||
2151 | jz .Lxts_dec_done | ||
2152 | |||
2153 | # generate pair of tweaks... | ||
2154 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
2155 | lrvg $s3,$tweak+8($sp) | ||
2156 | lghi %r1,0x87 | ||
2157 | srag %r0,$s3,63 # broadcast upper bit | ||
2158 | ngr %r1,%r0 # rem | ||
2159 | srlg %r0,$s1,63 # carry bit from lower half | ||
2160 | sllg $s1,$s1,1 | ||
2161 | sllg $s3,$s3,1 | ||
2162 | xgr $s1,%r1 | ||
2163 | ogr $s3,%r0 | ||
2164 | lrvgr $i2,$s1 # flip byte order | ||
2165 | lrvgr $i3,$s3 | ||
2166 | stmg $i2,$i3,$tweak($sp) # save the 1st tweak | ||
2167 | j .Lxts_dec_2ndtweak | ||
2168 | |||
2169 | .align 16 | ||
2170 | .Lxts_dec_short: | ||
2171 | llgc $len,`2*$SIZE_T-1`($sp) | ||
2172 | nill $len,0x0f # $len%16 | ||
2173 | lrvg $s1,$tweak+0($sp) # load the tweak in little-endian | ||
2174 | lrvg $s3,$tweak+8($sp) | ||
2175 | .Lxts_dec_2ndtweak: | ||
2176 | lghi %r1,0x87 | ||
2177 | srag %r0,$s3,63 # broadcast upper bit | ||
2178 | ngr %r1,%r0 # rem | ||
2179 | srlg %r0,$s1,63 # carry bit from lower half | ||
2180 | sllg $s1,$s1,1 | ||
2181 | sllg $s3,$s3,1 | ||
2182 | xgr $s1,%r1 | ||
2183 | ogr $s3,%r0 | ||
2184 | lrvgr $s1,$s1 # flip byte order | ||
2185 | lrvgr $s3,$s3 | ||
2186 | srlg $s0,$s1,32 # smash the tweak to 4x32-bits | ||
2187 | stg $s1,$tweak-16+0($sp) # save the 2nd tweak | ||
2188 | llgfr $s1,$s1 | ||
2189 | srlg $s2,$s3,32 | ||
2190 | stg $s3,$tweak-16+8($sp) | ||
2191 | llgfr $s3,$s3 | ||
2192 | |||
2193 | x $s0,0($inp) # tweak_the_2nd^=*(inp) | ||
2194 | x $s1,4($inp) | ||
2195 | x $s2,8($inp) | ||
2196 | x $s3,12($inp) | ||
2197 | stm${g} %r2,%r3,2*$SIZE_T($sp) | ||
2198 | la $key,0($key1) | ||
2199 | bras $ra,_s390x_AES_decrypt | ||
2200 | lm${g} %r2,%r5,2*$SIZE_T($sp) | ||
2201 | x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd | ||
2202 | x $s1,$tweak-16+4($sp) | ||
2203 | x $s2,$tweak-16+8($sp) | ||
2204 | x $s3,$tweak-16+12($sp) | ||
2205 | st $s0,0($out,$inp) | ||
2206 | st $s1,4($out,$inp) | ||
2207 | st $s2,8($out,$inp) | ||
2208 | st $s3,12($out,$inp) | ||
2209 | |||
2210 | la $i3,0($out,$inp) # put aside real $out | ||
2211 | .Lxts_dec_steal: | ||
2212 | llgc %r0,16($inp) | ||
2213 | llgc %r1,0($out,$inp) | ||
2214 | stc %r0,0($out,$inp) | ||
2215 | stc %r1,16($out,$inp) | ||
2216 | la $inp,1($inp) | ||
2217 | brct $len,.Lxts_dec_steal | ||
2218 | la $out,0($i3) # restore real $out | ||
2219 | |||
2220 | lm $s0,$s3,$tweak($sp) # load the 1st tweak | ||
2221 | x $s0,0($out) # tweak^=*(inp)|stolen cipher-text | ||
2222 | x $s1,4($out) | ||
2223 | x $s2,8($out) | ||
2224 | x $s3,12($out) | ||
2225 | st${g} $out,4*$SIZE_T($sp) | ||
2226 | la $key,0($key1) | ||
2227 | bras $ra,_s390x_AES_decrypt | ||
2228 | l${g} $out,4*$SIZE_T($sp) | ||
2229 | x $s0,$tweak+0($sp) # ^=tweak | ||
2230 | x $s1,$tweak+4($sp) | ||
2231 | x $s2,$tweak+8($sp) | ||
2232 | x $s3,$tweak+12($sp) | ||
2233 | st $s0,0($out) | ||
2234 | st $s1,4($out) | ||
2235 | st $s2,8($out) | ||
2236 | st $s3,12($out) | ||
2237 | stg $sp,$tweak-16+0($sp) # wipe 2nd tweak | ||
2238 | stg $sp,$tweak-16+8($sp) | ||
2239 | .Lxts_dec_done: | ||
2240 | stg $sp,$tweak+0($sp) # wipe tweak | ||
2241 | stg $sp,$twesk+8($sp) | ||
2242 | lm${g} %r6,$ra,6*$SIZE_T($sp) | ||
2243 | br $ra | ||
2244 | .size AES_xts_decrypt,.-AES_xts_decrypt | ||
1332 | ___ | 2245 | ___ |
1333 | } | 2246 | } |
1334 | $code.=<<___; | 2247 | $code.=<<___; |
1335 | .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 2248 | .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
2249 | .comm OPENSSL_s390xcap_P,16,8 | ||
1336 | ___ | 2250 | ___ |
1337 | 2251 | ||
1338 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 2252 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
1339 | print $code; | 2253 | print $code; |
2254 | close STDOUT; # force flush | ||
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl index c57b3a2d6d..403c4d1290 100755 --- a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl +++ b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl | |||
@@ -1176,6 +1176,7 @@ ___ | |||
1176 | # As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have | 1176 | # As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have |
1177 | # undesired effect, so just omit them and sacrifice some portion of | 1177 | # undesired effect, so just omit them and sacrifice some portion of |
1178 | # percent in performance... | 1178 | # percent in performance... |
1179 | $code =~ s/fmovs.*$//gem; | 1179 | $code =~ s/fmovs.*$//gm; |
1180 | 1180 | ||
1181 | print $code; | 1181 | print $code; |
1182 | close STDOUT; # ensure flush | ||
diff --git a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl new file mode 100644 index 0000000000..c6f6b3334a --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl | |||
@@ -0,0 +1,1249 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # June 2011 | ||
11 | # | ||
12 | # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled | ||
13 | # in http://download.intel.com/design/intarch/papers/323686.pdf, is | ||
14 | # that since AESNI-CBC encrypt exhibit *very* low instruction-level | ||
15 | # parallelism, interleaving it with another algorithm would allow to | ||
16 | # utilize processor resources better and achieve better performance. | ||
17 | # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and | ||
18 | # AESNI code is weaved into it. Below are performance numbers in | ||
19 | # cycles per processed byte, less is better, for standalone AESNI-CBC | ||
20 | # encrypt, sum of the latter and standalone SHA1, and "stitched" | ||
21 | # subroutine: | ||
22 | # | ||
23 | # AES-128-CBC +SHA1 stitch gain | ||
24 | # Westmere 3.77[+5.6] 9.37 6.65 +41% | ||
25 | # Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%) | ||
26 | # | ||
27 | # AES-192-CBC | ||
28 | # Westmere 4.51 10.11 6.97 +45% | ||
29 | # Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%) | ||
30 | # | ||
31 | # AES-256-CBC | ||
32 | # Westmere 5.25 10.85 7.25 +50% | ||
33 | # Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%) | ||
34 | # | ||
35 | # (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for | ||
36 | # background information. Above numbers in parentheses are SSSE3 | ||
37 | # results collected on AVX-capable CPU, i.e. apply on OSes that | ||
38 | # don't support AVX. | ||
39 | # | ||
40 | # Needless to mention that it makes no sense to implement "stitched" | ||
41 | # *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1 | ||
42 | # fully utilize parallelism, so stitching would not give any gain | ||
43 | # anyway. Well, there might be some, e.g. because of better cache | ||
44 | # locality... For reference, here are performance results for | ||
45 | # standalone AESNI-CBC decrypt: | ||
46 | # | ||
47 | # AES-128-CBC AES-192-CBC AES-256-CBC | ||
48 | # Westmere 1.31 1.55 1.80 | ||
49 | # Sandy Bridge 0.93 1.06 1.22 | ||
50 | |||
51 | $flavour = shift; | ||
52 | $output = shift; | ||
53 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
54 | |||
55 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
56 | |||
57 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
58 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
59 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
60 | die "can't locate x86_64-xlate.pl"; | ||
61 | |||
62 | $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | ||
63 | =~ /GNU assembler version ([2-9]\.[0-9]+)/ && | ||
64 | $1>=2.19); | ||
65 | $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | ||
66 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && | ||
67 | $1>=2.09); | ||
68 | $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | ||
69 | `ml64 2>&1` =~ /Version ([0-9]+)\./ && | ||
70 | $1>=10); | ||
71 | |||
72 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
73 | |||
74 | # void aesni_cbc_sha1_enc(const void *inp, | ||
75 | # void *out, | ||
76 | # size_t length, | ||
77 | # const AES_KEY *key, | ||
78 | # unsigned char *iv, | ||
79 | # SHA_CTX *ctx, | ||
80 | # const void *in0); | ||
81 | |||
82 | $code.=<<___; | ||
83 | .text | ||
84 | .extern OPENSSL_ia32cap_P | ||
85 | |||
86 | .globl aesni_cbc_sha1_enc | ||
87 | .type aesni_cbc_sha1_enc,\@abi-omnipotent | ||
88 | .align 16 | ||
89 | aesni_cbc_sha1_enc: | ||
90 | # caller should check for SSSE3 and AES-NI bits | ||
91 | mov OPENSSL_ia32cap_P+0(%rip),%r10d | ||
92 | mov OPENSSL_ia32cap_P+4(%rip),%r11d | ||
93 | ___ | ||
94 | $code.=<<___ if ($avx); | ||
95 | and \$`1<<28`,%r11d # mask AVX bit | ||
96 | and \$`1<<30`,%r10d # mask "Intel CPU" bit | ||
97 | or %r11d,%r10d | ||
98 | cmp \$`1<<28|1<<30`,%r10d | ||
99 | je aesni_cbc_sha1_enc_avx | ||
100 | ___ | ||
101 | $code.=<<___; | ||
102 | jmp aesni_cbc_sha1_enc_ssse3 | ||
103 | ret | ||
104 | .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc | ||
105 | ___ | ||
106 | |||
107 | my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); | ||
108 | |||
109 | my $Xi=4; | ||
110 | my @X=map("%xmm$_",(4..7,0..3)); | ||
111 | my @Tx=map("%xmm$_",(8..10)); | ||
112 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
113 | my @T=("%esi","%edi"); | ||
114 | my $j=0; my $jj=0; my $r=0; my $sn=0; | ||
115 | my $K_XX_XX="%r11"; | ||
116 | my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13)); | ||
117 | my @rndkey=("%xmm14","%xmm15"); | ||
118 | |||
119 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm | ||
120 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; | ||
121 | my $arg = pop; | ||
122 | $arg = "\$$arg" if ($arg*1 eq $arg); | ||
123 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; | ||
124 | } | ||
125 | |||
126 | my $_rol=sub { &rol(@_) }; | ||
127 | my $_ror=sub { &ror(@_) }; | ||
128 | |||
129 | $code.=<<___; | ||
130 | .type aesni_cbc_sha1_enc_ssse3,\@function,6 | ||
131 | .align 16 | ||
132 | aesni_cbc_sha1_enc_ssse3: | ||
133 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument | ||
134 | #shr \$6,$len # debugging artefact | ||
135 | #jz .Lepilogue_ssse3 # debugging artefact | ||
136 | push %rbx | ||
137 | push %rbp | ||
138 | push %r12 | ||
139 | push %r13 | ||
140 | push %r14 | ||
141 | push %r15 | ||
142 | lea `-104-($win64?10*16:0)`(%rsp),%rsp | ||
143 | #mov $in0,$inp # debugging artefact | ||
144 | #lea 64(%rsp),$ctx # debugging artefact | ||
145 | ___ | ||
146 | $code.=<<___ if ($win64); | ||
147 | movaps %xmm6,96+0(%rsp) | ||
148 | movaps %xmm7,96+16(%rsp) | ||
149 | movaps %xmm8,96+32(%rsp) | ||
150 | movaps %xmm9,96+48(%rsp) | ||
151 | movaps %xmm10,96+64(%rsp) | ||
152 | movaps %xmm11,96+80(%rsp) | ||
153 | movaps %xmm12,96+96(%rsp) | ||
154 | movaps %xmm13,96+112(%rsp) | ||
155 | movaps %xmm14,96+128(%rsp) | ||
156 | movaps %xmm15,96+144(%rsp) | ||
157 | .Lprologue_ssse3: | ||
158 | ___ | ||
159 | $code.=<<___; | ||
160 | mov $in0,%r12 # reassign arguments | ||
161 | mov $out,%r13 | ||
162 | mov $len,%r14 | ||
163 | mov $key,%r15 | ||
164 | movdqu ($ivp),$iv # load IV | ||
165 | mov $ivp,88(%rsp) # save $ivp | ||
166 | ___ | ||
167 | my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments | ||
168 | my $rounds="${ivp}d"; | ||
169 | $code.=<<___; | ||
170 | shl \$6,$len | ||
171 | sub $in0,$out | ||
172 | mov 240($key),$rounds | ||
173 | add $inp,$len # end of input | ||
174 | |||
175 | lea K_XX_XX(%rip),$K_XX_XX | ||
176 | mov 0($ctx),$A # load context | ||
177 | mov 4($ctx),$B | ||
178 | mov 8($ctx),$C | ||
179 | mov 12($ctx),$D | ||
180 | mov $B,@T[0] # magic seed | ||
181 | mov 16($ctx),$E | ||
182 | |||
183 | movdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
184 | movdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
185 | movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
186 | movdqu 16($inp),@X[-3&7] | ||
187 | movdqu 32($inp),@X[-2&7] | ||
188 | movdqu 48($inp),@X[-1&7] | ||
189 | pshufb @X[2],@X[-4&7] # byte swap | ||
190 | add \$64,$inp | ||
191 | pshufb @X[2],@X[-3&7] | ||
192 | pshufb @X[2],@X[-2&7] | ||
193 | pshufb @X[2],@X[-1&7] | ||
194 | paddd @Tx[1],@X[-4&7] # add K_00_19 | ||
195 | paddd @Tx[1],@X[-3&7] | ||
196 | paddd @Tx[1],@X[-2&7] | ||
197 | movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU | ||
198 | psubd @Tx[1],@X[-4&7] # restore X[] | ||
199 | movdqa @X[-3&7],16(%rsp) | ||
200 | psubd @Tx[1],@X[-3&7] | ||
201 | movdqa @X[-2&7],32(%rsp) | ||
202 | psubd @Tx[1],@X[-2&7] | ||
203 | movups ($key),$rndkey0 # $key[0] | ||
204 | movups 16($key),$rndkey[0] # forward reference | ||
205 | jmp .Loop_ssse3 | ||
206 | ___ | ||
207 | |||
208 | my $aesenc=sub { | ||
209 | use integer; | ||
210 | my ($n,$k)=($r/10,$r%10); | ||
211 | if ($k==0) { | ||
212 | $code.=<<___; | ||
213 | movups `16*$n`($in0),$in # load input | ||
214 | xorps $rndkey0,$in | ||
215 | ___ | ||
216 | $code.=<<___ if ($n); | ||
217 | movups $iv,`16*($n-1)`($out,$in0) # write output | ||
218 | ___ | ||
219 | $code.=<<___; | ||
220 | xorps $in,$iv | ||
221 | aesenc $rndkey[0],$iv | ||
222 | movups `32+16*$k`($key),$rndkey[1] | ||
223 | ___ | ||
224 | } elsif ($k==9) { | ||
225 | $sn++; | ||
226 | $code.=<<___; | ||
227 | cmp \$11,$rounds | ||
228 | jb .Laesenclast$sn | ||
229 | movups `32+16*($k+0)`($key),$rndkey[1] | ||
230 | aesenc $rndkey[0],$iv | ||
231 | movups `32+16*($k+1)`($key),$rndkey[0] | ||
232 | aesenc $rndkey[1],$iv | ||
233 | je .Laesenclast$sn | ||
234 | movups `32+16*($k+2)`($key),$rndkey[1] | ||
235 | aesenc $rndkey[0],$iv | ||
236 | movups `32+16*($k+3)`($key),$rndkey[0] | ||
237 | aesenc $rndkey[1],$iv | ||
238 | .Laesenclast$sn: | ||
239 | aesenclast $rndkey[0],$iv | ||
240 | movups 16($key),$rndkey[1] # forward reference | ||
241 | ___ | ||
242 | } else { | ||
243 | $code.=<<___; | ||
244 | aesenc $rndkey[0],$iv | ||
245 | movups `32+16*$k`($key),$rndkey[1] | ||
246 | ___ | ||
247 | } | ||
248 | $r++; unshift(@rndkey,pop(@rndkey)); | ||
249 | }; | ||
250 | |||
251 | sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 | ||
252 | { use integer; | ||
253 | my $body = shift; | ||
254 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
255 | my ($a,$b,$c,$d,$e); | ||
256 | |||
257 | &movdqa (@X[0],@X[-3&7]); | ||
258 | eval(shift(@insns)); | ||
259 | eval(shift(@insns)); | ||
260 | &movdqa (@Tx[0],@X[-1&7]); | ||
261 | &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
262 | eval(shift(@insns)); | ||
263 | eval(shift(@insns)); | ||
264 | |||
265 | &paddd (@Tx[1],@X[-1&7]); | ||
266 | eval(shift(@insns)); | ||
267 | eval(shift(@insns)); | ||
268 | &psrldq (@Tx[0],4); # "X[-3]", 3 dwords | ||
269 | eval(shift(@insns)); | ||
270 | eval(shift(@insns)); | ||
271 | &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
272 | eval(shift(@insns)); | ||
273 | eval(shift(@insns)); | ||
274 | |||
275 | &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
276 | eval(shift(@insns)); | ||
277 | eval(shift(@insns)); | ||
278 | eval(shift(@insns)); | ||
279 | eval(shift(@insns)); | ||
280 | |||
281 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
282 | eval(shift(@insns)); | ||
283 | eval(shift(@insns)); | ||
284 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
285 | eval(shift(@insns)); | ||
286 | eval(shift(@insns)); | ||
287 | |||
288 | &movdqa (@Tx[2],@X[0]); | ||
289 | &movdqa (@Tx[0],@X[0]); | ||
290 | eval(shift(@insns)); | ||
291 | eval(shift(@insns)); | ||
292 | eval(shift(@insns)); | ||
293 | eval(shift(@insns)); | ||
294 | |||
295 | &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword | ||
296 | &paddd (@X[0],@X[0]); | ||
297 | eval(shift(@insns)); | ||
298 | eval(shift(@insns)); | ||
299 | eval(shift(@insns)); | ||
300 | eval(shift(@insns)); | ||
301 | |||
302 | &psrld (@Tx[0],31); | ||
303 | eval(shift(@insns)); | ||
304 | eval(shift(@insns)); | ||
305 | &movdqa (@Tx[1],@Tx[2]); | ||
306 | eval(shift(@insns)); | ||
307 | eval(shift(@insns)); | ||
308 | |||
309 | &psrld (@Tx[2],30); | ||
310 | &por (@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
311 | eval(shift(@insns)); | ||
312 | eval(shift(@insns)); | ||
313 | eval(shift(@insns)); | ||
314 | eval(shift(@insns)); | ||
315 | |||
316 | &pslld (@Tx[1],2); | ||
317 | &pxor (@X[0],@Tx[2]); | ||
318 | eval(shift(@insns)); | ||
319 | eval(shift(@insns)); | ||
320 | &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
321 | eval(shift(@insns)); | ||
322 | eval(shift(@insns)); | ||
323 | |||
324 | &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 | ||
325 | |||
326 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
327 | |||
328 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
329 | push(@Tx,shift(@Tx)); | ||
330 | } | ||
331 | |||
332 | sub Xupdate_ssse3_32_79() | ||
333 | { use integer; | ||
334 | my $body = shift; | ||
335 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
336 | my ($a,$b,$c,$d,$e); | ||
337 | |||
338 | &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); | ||
339 | eval(shift(@insns)); # body_20_39 | ||
340 | &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
341 | &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" | ||
342 | eval(shift(@insns)); | ||
343 | eval(shift(@insns)); | ||
344 | eval(shift(@insns)); # rol | ||
345 | |||
346 | &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
347 | eval(shift(@insns)); | ||
348 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
349 | if ($Xi%5) { | ||
350 | &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
351 | } else { # ... or load next one | ||
352 | &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
353 | } | ||
354 | &paddd (@Tx[1],@X[-1&7]); | ||
355 | eval(shift(@insns)); # ror | ||
356 | eval(shift(@insns)); | ||
357 | |||
358 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
359 | eval(shift(@insns)); # body_20_39 | ||
360 | eval(shift(@insns)); | ||
361 | eval(shift(@insns)); | ||
362 | eval(shift(@insns)); # rol | ||
363 | |||
364 | &movdqa (@Tx[0],@X[0]); | ||
365 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
366 | eval(shift(@insns)); | ||
367 | eval(shift(@insns)); | ||
368 | eval(shift(@insns)); # ror | ||
369 | eval(shift(@insns)); | ||
370 | |||
371 | &pslld (@X[0],2); | ||
372 | eval(shift(@insns)); # body_20_39 | ||
373 | eval(shift(@insns)); | ||
374 | &psrld (@Tx[0],30); | ||
375 | eval(shift(@insns)); | ||
376 | eval(shift(@insns)); # rol | ||
377 | eval(shift(@insns)); | ||
378 | eval(shift(@insns)); | ||
379 | eval(shift(@insns)); # ror | ||
380 | eval(shift(@insns)); | ||
381 | |||
382 | &por (@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
383 | eval(shift(@insns)); # body_20_39 | ||
384 | eval(shift(@insns)); | ||
385 | &movdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
386 | eval(shift(@insns)); | ||
387 | eval(shift(@insns)); # rol | ||
388 | eval(shift(@insns)); | ||
389 | eval(shift(@insns)); | ||
390 | eval(shift(@insns)); # rol | ||
391 | eval(shift(@insns)); | ||
392 | |||
393 | foreach (@insns) { eval; } # remaining instructions | ||
394 | |||
395 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
396 | push(@Tx,shift(@Tx)); | ||
397 | } | ||
398 | |||
399 | sub Xuplast_ssse3_80() | ||
400 | { use integer; | ||
401 | my $body = shift; | ||
402 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
403 | my ($a,$b,$c,$d,$e); | ||
404 | |||
405 | eval(shift(@insns)); | ||
406 | &paddd (@Tx[1],@X[-1&7]); | ||
407 | eval(shift(@insns)); | ||
408 | eval(shift(@insns)); | ||
409 | eval(shift(@insns)); | ||
410 | eval(shift(@insns)); | ||
411 | |||
412 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
413 | |||
414 | foreach (@insns) { eval; } # remaining instructions | ||
415 | |||
416 | &cmp ($inp,$len); | ||
417 | &je (".Ldone_ssse3"); | ||
418 | |||
419 | unshift(@Tx,pop(@Tx)); | ||
420 | |||
421 | &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask | ||
422 | &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
423 | &movdqu (@X[-4&7],"0($inp)"); # load input | ||
424 | &movdqu (@X[-3&7],"16($inp)"); | ||
425 | &movdqu (@X[-2&7],"32($inp)"); | ||
426 | &movdqu (@X[-1&7],"48($inp)"); | ||
427 | &pshufb (@X[-4&7],@X[2]); # byte swap | ||
428 | &add ($inp,64); | ||
429 | |||
430 | $Xi=0; | ||
431 | } | ||
432 | |||
433 | sub Xloop_ssse3() | ||
434 | { use integer; | ||
435 | my $body = shift; | ||
436 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
437 | my ($a,$b,$c,$d,$e); | ||
438 | |||
439 | eval(shift(@insns)); | ||
440 | eval(shift(@insns)); | ||
441 | &pshufb (@X[($Xi-3)&7],@X[2]); | ||
442 | eval(shift(@insns)); | ||
443 | eval(shift(@insns)); | ||
444 | &paddd (@X[($Xi-4)&7],@Tx[1]); | ||
445 | eval(shift(@insns)); | ||
446 | eval(shift(@insns)); | ||
447 | eval(shift(@insns)); | ||
448 | eval(shift(@insns)); | ||
449 | &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU | ||
450 | eval(shift(@insns)); | ||
451 | eval(shift(@insns)); | ||
452 | &psubd (@X[($Xi-4)&7],@Tx[1]); | ||
453 | |||
454 | foreach (@insns) { eval; } | ||
455 | $Xi++; | ||
456 | } | ||
457 | |||
458 | sub Xtail_ssse3() | ||
459 | { use integer; | ||
460 | my $body = shift; | ||
461 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
462 | my ($a,$b,$c,$d,$e); | ||
463 | |||
464 | foreach (@insns) { eval; } | ||
465 | } | ||
466 | |||
467 | sub body_00_19 () { | ||
468 | use integer; | ||
469 | my ($k,$n); | ||
470 | my @r=( | ||
471 | '($a,$b,$c,$d,$e)=@V;'. | ||
472 | '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer | ||
473 | '&xor ($c,$d);', | ||
474 | '&mov (@T[1],$a);', # $b in next round | ||
475 | '&$_rol ($a,5);', | ||
476 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
477 | '&xor ($c,$d);', # restore $c | ||
478 | '&xor (@T[0],$d);', | ||
479 | '&add ($e,$a);', | ||
480 | '&$_ror ($b,$j?7:2);', # $b>>>2 | ||
481 | '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
482 | ); | ||
483 | $n = scalar(@r); | ||
484 | $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds | ||
485 | @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); | ||
486 | $jj++; | ||
487 | return @r; | ||
488 | } | ||
489 | |||
490 | sub body_20_39 () { | ||
491 | use integer; | ||
492 | my ($k,$n); | ||
493 | my @r=( | ||
494 | '($a,$b,$c,$d,$e)=@V;'. | ||
495 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
496 | '&xor (@T[0],$d);', # ($b^$d) | ||
497 | '&mov (@T[1],$a);', # $b in next round | ||
498 | '&$_rol ($a,5);', | ||
499 | '&xor (@T[0],$c);', # ($b^$d^$c) | ||
500 | '&add ($e,$a);', | ||
501 | '&$_ror ($b,7);', # $b>>>2 | ||
502 | '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
503 | ); | ||
504 | $n = scalar(@r); | ||
505 | $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds | ||
506 | @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); | ||
507 | $jj++; | ||
508 | return @r; | ||
509 | } | ||
510 | |||
511 | sub body_40_59 () { | ||
512 | use integer; | ||
513 | my ($k,$n); | ||
514 | my @r=( | ||
515 | '($a,$b,$c,$d,$e)=@V;'. | ||
516 | '&mov (@T[1],$c);', | ||
517 | '&xor ($c,$d);', | ||
518 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
519 | '&and (@T[1],$d);', | ||
520 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
521 | '&$_ror ($b,7);', # $b>>>2 | ||
522 | '&add ($e,@T[1]);', | ||
523 | '&mov (@T[1],$a);', # $b in next round | ||
524 | '&$_rol ($a,5);', | ||
525 | '&add ($e,@T[0]);', | ||
526 | '&xor ($c,$d);', # restore $c | ||
527 | '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
528 | ); | ||
529 | $n = scalar(@r); | ||
530 | $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds | ||
531 | @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); | ||
532 | $jj++; | ||
533 | return @r; | ||
534 | } | ||
535 | $code.=<<___; | ||
536 | .align 16 | ||
537 | .Loop_ssse3: | ||
538 | ___ | ||
539 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
540 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
541 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
542 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
543 | &Xupdate_ssse3_32_79(\&body_00_19); | ||
544 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
545 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
546 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
547 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
548 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
549 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
550 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
551 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
552 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
553 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
554 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
555 | &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" | ||
556 | |||
557 | $saved_j=$j; @saved_V=@V; | ||
558 | $saved_r=$r; @saved_rndkey=@rndkey; | ||
559 | |||
560 | &Xloop_ssse3(\&body_20_39); | ||
561 | &Xloop_ssse3(\&body_20_39); | ||
562 | &Xloop_ssse3(\&body_20_39); | ||
563 | |||
564 | $code.=<<___; | ||
565 | movups $iv,48($out,$in0) # write output | ||
566 | lea 64($in0),$in0 | ||
567 | |||
568 | add 0($ctx),$A # update context | ||
569 | add 4($ctx),@T[0] | ||
570 | add 8($ctx),$C | ||
571 | add 12($ctx),$D | ||
572 | mov $A,0($ctx) | ||
573 | add 16($ctx),$E | ||
574 | mov @T[0],4($ctx) | ||
575 | mov @T[0],$B # magic seed | ||
576 | mov $C,8($ctx) | ||
577 | mov $D,12($ctx) | ||
578 | mov $E,16($ctx) | ||
579 | jmp .Loop_ssse3 | ||
580 | |||
581 | .align 16 | ||
582 | .Ldone_ssse3: | ||
583 | ___ | ||
584 | $jj=$j=$saved_j; @V=@saved_V; | ||
585 | $r=$saved_r; @rndkey=@saved_rndkey; | ||
586 | |||
587 | &Xtail_ssse3(\&body_20_39); | ||
588 | &Xtail_ssse3(\&body_20_39); | ||
589 | &Xtail_ssse3(\&body_20_39); | ||
590 | |||
591 | $code.=<<___; | ||
592 | movups $iv,48($out,$in0) # write output | ||
593 | mov 88(%rsp),$ivp # restore $ivp | ||
594 | |||
595 | add 0($ctx),$A # update context | ||
596 | add 4($ctx),@T[0] | ||
597 | add 8($ctx),$C | ||
598 | mov $A,0($ctx) | ||
599 | add 12($ctx),$D | ||
600 | mov @T[0],4($ctx) | ||
601 | add 16($ctx),$E | ||
602 | mov $C,8($ctx) | ||
603 | mov $D,12($ctx) | ||
604 | mov $E,16($ctx) | ||
605 | movups $iv,($ivp) # write IV | ||
606 | ___ | ||
607 | $code.=<<___ if ($win64); | ||
608 | movaps 96+0(%rsp),%xmm6 | ||
609 | movaps 96+16(%rsp),%xmm7 | ||
610 | movaps 96+32(%rsp),%xmm8 | ||
611 | movaps 96+48(%rsp),%xmm9 | ||
612 | movaps 96+64(%rsp),%xmm10 | ||
613 | movaps 96+80(%rsp),%xmm11 | ||
614 | movaps 96+96(%rsp),%xmm12 | ||
615 | movaps 96+112(%rsp),%xmm13 | ||
616 | movaps 96+128(%rsp),%xmm14 | ||
617 | movaps 96+144(%rsp),%xmm15 | ||
618 | ___ | ||
619 | $code.=<<___; | ||
620 | lea `104+($win64?10*16:0)`(%rsp),%rsi | ||
621 | mov 0(%rsi),%r15 | ||
622 | mov 8(%rsi),%r14 | ||
623 | mov 16(%rsi),%r13 | ||
624 | mov 24(%rsi),%r12 | ||
625 | mov 32(%rsi),%rbp | ||
626 | mov 40(%rsi),%rbx | ||
627 | lea 48(%rsi),%rsp | ||
628 | .Lepilogue_ssse3: | ||
629 | ret | ||
630 | .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 | ||
631 | ___ | ||
632 | |||
633 | $j=$jj=$r=$sn=0; | ||
634 | |||
635 | if ($avx) { | ||
636 | my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); | ||
637 | |||
638 | my $Xi=4; | ||
639 | my @X=map("%xmm$_",(4..7,0..3)); | ||
640 | my @Tx=map("%xmm$_",(8..10)); | ||
641 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
642 | my @T=("%esi","%edi"); | ||
643 | |||
644 | my $_rol=sub { &shld(@_[0],@_) }; | ||
645 | my $_ror=sub { &shrd(@_[0],@_) }; | ||
646 | |||
647 | $code.=<<___; | ||
648 | .type aesni_cbc_sha1_enc_avx,\@function,6 | ||
649 | .align 16 | ||
650 | aesni_cbc_sha1_enc_avx: | ||
651 | mov `($win64?56:8)`(%rsp),$inp # load 7th argument | ||
652 | #shr \$6,$len # debugging artefact | ||
653 | #jz .Lepilogue_avx # debugging artefact | ||
654 | push %rbx | ||
655 | push %rbp | ||
656 | push %r12 | ||
657 | push %r13 | ||
658 | push %r14 | ||
659 | push %r15 | ||
660 | lea `-104-($win64?10*16:0)`(%rsp),%rsp | ||
661 | #mov $in0,$inp # debugging artefact | ||
662 | #lea 64(%rsp),$ctx # debugging artefact | ||
663 | ___ | ||
664 | $code.=<<___ if ($win64); | ||
665 | movaps %xmm6,96+0(%rsp) | ||
666 | movaps %xmm7,96+16(%rsp) | ||
667 | movaps %xmm8,96+32(%rsp) | ||
668 | movaps %xmm9,96+48(%rsp) | ||
669 | movaps %xmm10,96+64(%rsp) | ||
670 | movaps %xmm11,96+80(%rsp) | ||
671 | movaps %xmm12,96+96(%rsp) | ||
672 | movaps %xmm13,96+112(%rsp) | ||
673 | movaps %xmm14,96+128(%rsp) | ||
674 | movaps %xmm15,96+144(%rsp) | ||
675 | .Lprologue_avx: | ||
676 | ___ | ||
677 | $code.=<<___; | ||
678 | vzeroall | ||
679 | mov $in0,%r12 # reassign arguments | ||
680 | mov $out,%r13 | ||
681 | mov $len,%r14 | ||
682 | mov $key,%r15 | ||
683 | vmovdqu ($ivp),$iv # load IV | ||
684 | mov $ivp,88(%rsp) # save $ivp | ||
685 | ___ | ||
686 | my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments | ||
687 | my $rounds="${ivp}d"; | ||
688 | $code.=<<___; | ||
689 | shl \$6,$len | ||
690 | sub $in0,$out | ||
691 | mov 240($key),$rounds | ||
692 | add \$112,$key # size optimization | ||
693 | add $inp,$len # end of input | ||
694 | |||
695 | lea K_XX_XX(%rip),$K_XX_XX | ||
696 | mov 0($ctx),$A # load context | ||
697 | mov 4($ctx),$B | ||
698 | mov 8($ctx),$C | ||
699 | mov 12($ctx),$D | ||
700 | mov $B,@T[0] # magic seed | ||
701 | mov 16($ctx),$E | ||
702 | |||
703 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
704 | vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
705 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
706 | vmovdqu 16($inp),@X[-3&7] | ||
707 | vmovdqu 32($inp),@X[-2&7] | ||
708 | vmovdqu 48($inp),@X[-1&7] | ||
709 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap | ||
710 | add \$64,$inp | ||
711 | vpshufb @X[2],@X[-3&7],@X[-3&7] | ||
712 | vpshufb @X[2],@X[-2&7],@X[-2&7] | ||
713 | vpshufb @X[2],@X[-1&7],@X[-1&7] | ||
714 | vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 | ||
715 | vpaddd @Tx[1],@X[-3&7],@X[1] | ||
716 | vpaddd @Tx[1],@X[-2&7],@X[2] | ||
717 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU | ||
718 | vmovdqa @X[1],16(%rsp) | ||
719 | vmovdqa @X[2],32(%rsp) | ||
720 | vmovups -112($key),$rndkey0 # $key[0] | ||
721 | vmovups 16-112($key),$rndkey[0] # forward reference | ||
722 | jmp .Loop_avx | ||
723 | ___ | ||
724 | |||
725 | my $aesenc=sub { | ||
726 | use integer; | ||
727 | my ($n,$k)=($r/10,$r%10); | ||
728 | if ($k==0) { | ||
729 | $code.=<<___; | ||
730 | vmovups `16*$n`($in0),$in # load input | ||
731 | vxorps $rndkey0,$in,$in | ||
732 | ___ | ||
733 | $code.=<<___ if ($n); | ||
734 | vmovups $iv,`16*($n-1)`($out,$in0) # write output | ||
735 | ___ | ||
736 | $code.=<<___; | ||
737 | vxorps $in,$iv,$iv | ||
738 | vaesenc $rndkey[0],$iv,$iv | ||
739 | vmovups `32+16*$k-112`($key),$rndkey[1] | ||
740 | ___ | ||
741 | } elsif ($k==9) { | ||
742 | $sn++; | ||
743 | $code.=<<___; | ||
744 | cmp \$11,$rounds | ||
745 | jb .Lvaesenclast$sn | ||
746 | vaesenc $rndkey[0],$iv,$iv | ||
747 | vmovups `32+16*($k+0)-112`($key),$rndkey[1] | ||
748 | vaesenc $rndkey[1],$iv,$iv | ||
749 | vmovups `32+16*($k+1)-112`($key),$rndkey[0] | ||
750 | je .Lvaesenclast$sn | ||
751 | vaesenc $rndkey[0],$iv,$iv | ||
752 | vmovups `32+16*($k+2)-112`($key),$rndkey[1] | ||
753 | vaesenc $rndkey[1],$iv,$iv | ||
754 | vmovups `32+16*($k+3)-112`($key),$rndkey[0] | ||
755 | .Lvaesenclast$sn: | ||
756 | vaesenclast $rndkey[0],$iv,$iv | ||
757 | vmovups 16-112($key),$rndkey[1] # forward reference | ||
758 | ___ | ||
759 | } else { | ||
760 | $code.=<<___; | ||
761 | vaesenc $rndkey[0],$iv,$iv | ||
762 | vmovups `32+16*$k-112`($key),$rndkey[1] | ||
763 | ___ | ||
764 | } | ||
765 | $r++; unshift(@rndkey,pop(@rndkey)); | ||
766 | }; | ||
767 | |||
768 | sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 | ||
769 | { use integer; | ||
770 | my $body = shift; | ||
771 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
772 | my ($a,$b,$c,$d,$e); | ||
773 | |||
774 | eval(shift(@insns)); | ||
775 | eval(shift(@insns)); | ||
776 | &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
777 | eval(shift(@insns)); | ||
778 | eval(shift(@insns)); | ||
779 | |||
780 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
781 | eval(shift(@insns)); | ||
782 | eval(shift(@insns)); | ||
783 | &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords | ||
784 | eval(shift(@insns)); | ||
785 | eval(shift(@insns)); | ||
786 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
787 | eval(shift(@insns)); | ||
788 | eval(shift(@insns)); | ||
789 | |||
790 | &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
791 | eval(shift(@insns)); | ||
792 | eval(shift(@insns)); | ||
793 | eval(shift(@insns)); | ||
794 | eval(shift(@insns)); | ||
795 | |||
796 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
797 | eval(shift(@insns)); | ||
798 | eval(shift(@insns)); | ||
799 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
800 | eval(shift(@insns)); | ||
801 | eval(shift(@insns)); | ||
802 | |||
803 | &vpsrld (@Tx[0],@X[0],31); | ||
804 | eval(shift(@insns)); | ||
805 | eval(shift(@insns)); | ||
806 | eval(shift(@insns)); | ||
807 | eval(shift(@insns)); | ||
808 | |||
809 | &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword | ||
810 | &vpaddd (@X[0],@X[0],@X[0]); | ||
811 | eval(shift(@insns)); | ||
812 | eval(shift(@insns)); | ||
813 | eval(shift(@insns)); | ||
814 | eval(shift(@insns)); | ||
815 | |||
816 | &vpsrld (@Tx[1],@Tx[2],30); | ||
817 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
818 | eval(shift(@insns)); | ||
819 | eval(shift(@insns)); | ||
820 | eval(shift(@insns)); | ||
821 | eval(shift(@insns)); | ||
822 | |||
823 | &vpslld (@Tx[2],@Tx[2],2); | ||
824 | &vpxor (@X[0],@X[0],@Tx[1]); | ||
825 | eval(shift(@insns)); | ||
826 | eval(shift(@insns)); | ||
827 | eval(shift(@insns)); | ||
828 | eval(shift(@insns)); | ||
829 | |||
830 | &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 | ||
831 | eval(shift(@insns)); | ||
832 | eval(shift(@insns)); | ||
833 | &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
834 | eval(shift(@insns)); | ||
835 | eval(shift(@insns)); | ||
836 | |||
837 | |||
838 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
839 | |||
840 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
841 | push(@Tx,shift(@Tx)); | ||
842 | } | ||
843 | |||
844 | sub Xupdate_avx_32_79() | ||
845 | { use integer; | ||
846 | my $body = shift; | ||
847 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
848 | my ($a,$b,$c,$d,$e); | ||
849 | |||
850 | &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" | ||
851 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
852 | eval(shift(@insns)); # body_20_39 | ||
853 | eval(shift(@insns)); | ||
854 | eval(shift(@insns)); | ||
855 | eval(shift(@insns)); # rol | ||
856 | |||
857 | &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
858 | eval(shift(@insns)); | ||
859 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
860 | if ($Xi%5) { | ||
861 | &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
862 | } else { # ... or load next one | ||
863 | &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
864 | } | ||
865 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
866 | eval(shift(@insns)); # ror | ||
867 | eval(shift(@insns)); | ||
868 | |||
869 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
870 | eval(shift(@insns)); # body_20_39 | ||
871 | eval(shift(@insns)); | ||
872 | eval(shift(@insns)); | ||
873 | eval(shift(@insns)); # rol | ||
874 | |||
875 | &vpsrld (@Tx[0],@X[0],30); | ||
876 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
877 | eval(shift(@insns)); | ||
878 | eval(shift(@insns)); | ||
879 | eval(shift(@insns)); # ror | ||
880 | eval(shift(@insns)); | ||
881 | |||
882 | &vpslld (@X[0],@X[0],2); | ||
883 | eval(shift(@insns)); # body_20_39 | ||
884 | eval(shift(@insns)); | ||
885 | eval(shift(@insns)); | ||
886 | eval(shift(@insns)); # rol | ||
887 | eval(shift(@insns)); | ||
888 | eval(shift(@insns)); | ||
889 | eval(shift(@insns)); # ror | ||
890 | eval(shift(@insns)); | ||
891 | |||
892 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
893 | eval(shift(@insns)); # body_20_39 | ||
894 | eval(shift(@insns)); | ||
895 | &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
896 | eval(shift(@insns)); | ||
897 | eval(shift(@insns)); # rol | ||
898 | eval(shift(@insns)); | ||
899 | eval(shift(@insns)); | ||
900 | eval(shift(@insns)); # rol | ||
901 | eval(shift(@insns)); | ||
902 | |||
903 | foreach (@insns) { eval; } # remaining instructions | ||
904 | |||
905 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
906 | push(@Tx,shift(@Tx)); | ||
907 | } | ||
908 | |||
# Emit the last four SHA-1 rounds of a 64-byte block, then either fall
# through to the tail loop or branch back for the next input block.
# Stores the final X[]+K words to the stack for the integer ALU rounds,
# and — if more input remains — preloads and byte-swaps the next block.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;				# closure yielding one round's IALU code strings
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	&vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);			# input exhausted?
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));			# rotate @Tx back one slot

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;					# restart schedule index for next block
}
942 | |||
# Emit one 4-round group of the "plain" message loop: byte-swap one input
# lane, add the round constant, and park X[]+K on the stack for the
# integer rounds interleaved via $body.
sub Xloop_avx()
{ use integer;
  my $body = shift;				# closure yielding one round's IALU code strings
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);	# byte swap next lane
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);	# add K constant
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}
966 | |||
# Emit one 4-round group with no vector work interleaved: just drain the
# 32 integer-ALU instruction snippets produced by the body closure.
sub Xtail_avx()
{ use integer;
  my $gen = shift;			# closure yielding one round's code strings
  my @queue = (&$gen,&$gen,&$gen,&$gen);	# 32 instructions
  my ($a,$b,$c,$d,$e);		# referenced by the eval'd snippets

  eval($_) for @queue;
}
975 | |||
$code.=<<___;
.align	16
.Loop_avx:
___
	# Emit SHA-1 rounds 16..79 with the AES body_* rounds interleaved.
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

	# Snapshot generator state so the .Ldone_avx tail is emitted from the
	# same round/key position (restored before the Xtail_avx calls below).
	$saved_j=$j; @saved_V=@V;
	$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
1004 | |||
# Main-loop tail: flush the last ciphertext block, fold the working
# variables back into the SHA-1 context, and loop.
$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_avx

.align	16
.Ldone_avx:
___
	# Rewind generator state saved before the three Xloop_avx calls so the
	# tail rounds are generated from the identical starting point.
	$jj=$j=$saved_j; @V=@saved_V;
	$r=$saved_r; @rndkey=@saved_rndkey;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
1031 | |||
# Function epilogue: final ciphertext block, context update, IV write-back,
# register clear, win64 XMM restore, and stack unwind.
$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	$iv,($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
___
}	# closes the enclosing conditional block opened earlier in the file
# SHA-1 round constants, each broadcast across four lanes for SIMD use,
# followed by the big-endian byte-swap shuffle mask.
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask

.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
1087 | |||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception-handling support: a handler that restores the
# non-volatile GP and XMM registers when unwinding through either function,
# plus the .pdata/.xdata tables that register it.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer

	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
}
1216 | |||
####################################################################
# Append an x86-64 REX prefix to the opcode byte list when either xmm
# register index needs an extension bit (registers 8..15); append
# nothing for the low registers.
sub rex {
  local *opcode=shift;		# alias caller's opcode byte array
  my ($dst,$src)=@_;

  my $prefix = 0x40;
  $prefix |= 0x04 if ($dst >= 8);	# REX.R
  $prefix |= 0x01 if ($src >= 8);	# REX.B
  push @opcode, $prefix if ($prefix != 0x40);
}
1227 | |||
# Hand-assemble an aesenc/aesenclast xmm,xmm instruction into a ".byte"
# directive (for assemblers that predate AES-NI). Any other matching
# "aes*" mnemonic yields undef; non-matching lines pass through as-is.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);

  if ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc, "aesenclast" => 0xdd
	);
	return undef if (!defined($opcodelet{$1}));
	my ($src,$dst)=($2,$3);
	# REX prefix (inlined rex()) for xmm8..xmm15 operands.
	my $rex=0x40;
	$rex|=0x04 if ($dst>=8);	# REX.R
	$rex|=0x01 if ($src>=8);	# REX.B
	push @opcode,$rex if ($rex!=0x40);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
  }
  return $line;
}
1244 | |||
# Post-process the generated assembly: expand `...` arithmetic at script
# run time, then rewrite aes* instructions as raw .byte sequences.
# NOTE(review): aesni() returns undef for unhandled aes* mnemonics, which
# would replace such a line with nothing — presumably only aesenc and
# aesenclast are ever emitted above; confirm if extending.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86.pl b/src/lib/libcrypto/aes/asm/aesni-x86.pl new file mode 100644 index 0000000000..3dc345b585 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aesni-x86.pl | |||
@@ -0,0 +1,2189 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # This module implements support for Intel AES-NI extension. In | ||
11 | # OpenSSL context it's used with Intel engine, but can also be used as | ||
12 | # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for | ||
13 | # details]. | ||
14 | # | ||
15 | # Performance. | ||
16 | # | ||
17 | # To start with see corresponding paragraph in aesni-x86_64.pl... | ||
18 | # Instead of filling table similar to one found there I've chosen to | ||
19 | # summarize *comparison* results for raw ECB, CTR and CBC benchmarks. | ||
20 | # The simplified table below represents 32-bit performance relative | ||
21 | # to 64-bit one in every given point. Ratios vary for different | ||
22 | # encryption modes, therefore interval values. | ||
23 | # | ||
24 | # 16-byte 64-byte 256-byte 1-KB 8-KB | ||
25 | # 53-67% 67-84% 91-94% 95-98% 97-99.5% | ||
26 | # | ||
27 | # Lower ratios for smaller block sizes are perfectly understandable, | ||
28 | # because function call overhead is higher in 32-bit mode. Largest | ||
29 | # 8-KB block performance is virtually same: 32-bit code is less than | ||
30 | # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. | ||
31 | |||
32 | # January 2011 | ||
33 | # | ||
34 | # See aesni-x86_64.pl for details. Unlike x86_64 version this module | ||
35 | # interleaves at most 6 aes[enc|dec] instructions, because there are | ||
36 | # not enough registers for 8x interleave [which should be optimal for | ||
37 | # Sandy Bridge]. Actually, performance results for 6x interleave | ||
38 | # factor presented in aesni-x86_64.pl (except for CTR) are for this | ||
39 | # module. | ||
40 | |||
41 | # April 2011 | ||
42 | # | ||
43 | # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing | ||
44 | # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. | ||
45 | |||
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Locate this script's directory and pull in the x86 perlasm framework.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# NOTE(review): both branches select movups; the else branch looks like a
# placeholder for the "AES" drop-in flavour — confirm before changing.
if ($PREFIX eq "aesni")	{ $movekey=*movups; }
else			{ $movekey=*movups; }

# 32-bit general-purpose register allocation.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

# XMM register roles; the aliases show xmm5..xmm7 reused as input/IV
# staging by code elsewhere in the file.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";
# AESNI extension: instructions are hand-assembled into raw bytes so the
# output works with assemblers that do not know AES-NI.
# 66 0F 3A DF /r ib — aeskeygenassist xmm1,xmm2,imm8
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    return unless "$dst:$src" =~ /xmm([0-7]):xmm([0-7])/;
    my ($d,$s)=($1,$2);
    &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($d<<3)|$s,$imm);
}
# Shared encoder for the two-operand AES-NI ops: 66 0F 38 <op> /r with a
# register-to-register ModR/M byte. Silently emits nothing unless both
# operands are xmm0..xmm7 (all that 32-bit mode offers).
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    return unless "$dst:$src" =~ /xmm([0-7]):xmm([0-7])/;
    &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);
}
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }
93 | |||
# Inline version of internal aesni_[en|de]crypt1: emits a single-block
# encrypt/decrypt key-schedule loop straight into the caller's stream.
# $ivec, when given, is XORed in first (CBC-style chaining).
{ my $sn;	# sequence number keeping each emitted loop label unique
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	eval"&aes${p}	($inout,$rndkey1)";
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz	(&label("${p}1_loop_$sn"));
    eval"&aes${p}last	($inout,$rndkey1)";
}}
114 | |||
# Emit _aesni_[en|de]crypt1: a fully unrolled single-block routine.
# Dispatches on $rounds (cmp 11): falls into the ${p}128 tail for 10
# rounds, ${p}192 for 12, or runs the extra leading rounds for 14.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	eval"&aes${p}	($inout,$rndkey1)";	# two extra rounds for 256-bit keys
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x10,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x20,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x30,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x40,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x50,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x60,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x70,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	eval"&aes${p}last ($inout,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
160 | |||
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block encrypt: load 16 bytes, run the one-block pipeline
# (inlined or via _aesni_encrypt1), store the result.
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));	# load input block
	&mov	($rounds,&DWP(240,$key));	# round count at offset 240
	&mov	("eax",&wparam(1));		# out
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,"eax"),$inout0);	# store output block
	&ret	();
&function_end_B("${PREFIX}_encrypt");
176 | |||
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block decrypt; mirror image of ${PREFIX}_encrypt above.
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));	# load input block
	&mov	($rounds,&DWP(240,$key));	# round count at offset 240
	&mov	("eax",&wparam(1));		# out
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,"eax"),$inout0);	# store output block
	&ret	();
&function_end_B("${PREFIX}_decrypt");
192 | |||
193 | # _aesni_[en|de]cryptN are private interfaces, N denotes interleave | ||
194 | # factor. Why 3x subroutine were originally used in loops? Even though | ||
195 | # aes[enc|dec] latency was originally 6, it could be scheduled only | ||
196 | # every *2nd* cycle. Thus 3x interleave was the one providing optimal | ||
197 | # utilization, i.e. when subroutine's throughput is virtually same as | ||
198 | # of non-interleaved subroutine [for number of input blocks up to 3]. | ||
199 | # This is why it makes no sense to implement 2x subroutine. | ||
200 | # aes[enc|dec] latency in next processor generation is 8, but the | ||
201 | # instructions can be scheduled every cycle. Optimal interleave for | ||
202 | # new processor is therefore 8x, but it's unfeasible to accommodate it | ||
# in XMM registers addressable in 32-bit mode and therefore 6x is
204 | # used instead... | ||
205 | |||
# Emit _aesni_[en|de]crypt3: three blocks interleaved through the key
# schedule. $rounds is halved because each loop iteration consumes two
# round keys.
sub aesni_generate3
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);		# two round keys per iteration
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);	# whitening
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));

    &set_label("${p}3_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	&$movekey	($rndkey1,&QWP(16,$key));
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p}	($inout2,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("${p}3_loop"));
    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
240 | |||
241 | # 4x interleave is implemented to improve small block performance, | ||
242 | # most notably [and naturally] 4 block by ~30%. One can argue that one | ||
243 | # should have implemented 5x as well, but improvement would be <20%, | ||
244 | # so it's not worth it... | ||
# Emit _aesni_[en|de]crypt4: four blocks interleaved through the key
# schedule; same loop structure as the 3x variant with one more lane.
sub aesni_generate4
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shr		($rounds,1);		# two round keys per iteration
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);	# whitening
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));

    &set_label("${p}4_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	&$movekey	($rndkey1,&QWP(16,$key));
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("${p}4_loop"));

    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}	($inout3,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    eval"&aes${p}last	($inout3,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
285 | |||
# Emit _aesni_[en|de]crypt6: six blocks interleaved (the sweet spot for
# 32-bit mode). The whitening XORs are themselves interleaved with the
# first round, and callers can enter at _aesni_${p}rypt6_enter with the
# first round already issued.
sub aesni_generate6
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);		# two round keys per iteration
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	eval"&aes${p}	($inout0,$rndkey1)";
	&pxor		($inout2,$rndkey0);
	eval"&aes${p}	($inout1,$rndkey1)";
	&pxor		($inout3,$rndkey0);
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	&pxor		($inout4,$rndkey0);
	eval"&aes${p}	($inout3,$rndkey1)";
	&pxor		($inout5,$rndkey0);
	eval"&aes${p}	($inout4,$rndkey1)";
	&$movekey	($rndkey0,&QWP(0,$key));
	eval"&aes${p}	($inout5,$rndkey1)";
	&jmp		(&label("_aesni_${p}rypt6_enter"));

    &set_label("${p}6_loop",16);
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
    &set_label("_aesni_${p}rypt6_enter",16);
	&$movekey	($rndkey1,&QWP(16,$key));
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	eval"&aes${p}	($inout4,$rndkey0)";
	eval"&aes${p}	($inout5,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("${p}6_loop"));

    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}	($inout3,$rndkey1)";
    eval"&aes${p}	($inout4,$rndkey1)";
    eval"&aes${p}	($inout5,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    eval"&aes${p}last	($inout3,$rndkey0)";
    eval"&aes${p}last	($inout4,$rndkey0)";
    eval"&aes${p}last	($inout5,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Instantiate the interleaved bodies used below; "enc" flavours are
# emitted only for the aesni_* interface (presumably the AES drop-in
# build does not need them — confirm against the drop-in mode).
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");
352 | |||
353 | if ($PREFIX eq "aesni") { | ||
354 | ###################################################################### | ||
355 | # void aesni_ecb_encrypt (const void *in, void *out, | ||
356 | # size_t length, const AES_KEY *key, | ||
357 | # int enc); | ||
358 | &function_begin("aesni_ecb_encrypt"); | ||
359 | &mov ($inp,&wparam(0)); | ||
360 | &mov ($out,&wparam(1)); | ||
361 | &mov ($len,&wparam(2)); | ||
362 | &mov ($key,&wparam(3)); | ||
363 | &mov ($rounds_,&wparam(4)); | ||
364 | &and ($len,-16); | ||
365 | &jz (&label("ecb_ret")); | ||
366 | &mov ($rounds,&DWP(240,$key)); | ||
367 | &test ($rounds_,$rounds_); | ||
368 | &jz (&label("ecb_decrypt")); | ||
369 | |||
370 | &mov ($key_,$key); # backup $key | ||
371 | &mov ($rounds_,$rounds); # backup $rounds | ||
372 | &cmp ($len,0x60); | ||
373 | &jb (&label("ecb_enc_tail")); | ||
374 | |||
375 | &movdqu ($inout0,&QWP(0,$inp)); | ||
376 | &movdqu ($inout1,&QWP(0x10,$inp)); | ||
377 | &movdqu ($inout2,&QWP(0x20,$inp)); | ||
378 | &movdqu ($inout3,&QWP(0x30,$inp)); | ||
379 | &movdqu ($inout4,&QWP(0x40,$inp)); | ||
380 | &movdqu ($inout5,&QWP(0x50,$inp)); | ||
381 | &lea ($inp,&DWP(0x60,$inp)); | ||
382 | &sub ($len,0x60); | ||
383 | &jmp (&label("ecb_enc_loop6_enter")); | ||
384 | |||
385 | &set_label("ecb_enc_loop6",16); | ||
386 | &movups (&QWP(0,$out),$inout0); | ||
387 | &movdqu ($inout0,&QWP(0,$inp)); | ||
388 | &movups (&QWP(0x10,$out),$inout1); | ||
389 | &movdqu ($inout1,&QWP(0x10,$inp)); | ||
390 | &movups (&QWP(0x20,$out),$inout2); | ||
391 | &movdqu ($inout2,&QWP(0x20,$inp)); | ||
392 | &movups (&QWP(0x30,$out),$inout3); | ||
393 | &movdqu ($inout3,&QWP(0x30,$inp)); | ||
394 | &movups (&QWP(0x40,$out),$inout4); | ||
395 | &movdqu ($inout4,&QWP(0x40,$inp)); | ||
396 | &movups (&QWP(0x50,$out),$inout5); | ||
397 | &lea ($out,&DWP(0x60,$out)); | ||
398 | &movdqu ($inout5,&QWP(0x50,$inp)); | ||
399 | &lea ($inp,&DWP(0x60,$inp)); | ||
400 | &set_label("ecb_enc_loop6_enter"); | ||
401 | |||
402 | &call ("_aesni_encrypt6"); | ||
403 | |||
404 | &mov ($key,$key_); # restore $key | ||
405 | &mov ($rounds,$rounds_); # restore $rounds | ||
406 | &sub ($len,0x60); | ||
407 | &jnc (&label("ecb_enc_loop6")); | ||
408 | |||
409 | &movups (&QWP(0,$out),$inout0); | ||
410 | &movups (&QWP(0x10,$out),$inout1); | ||
411 | &movups (&QWP(0x20,$out),$inout2); | ||
412 | &movups (&QWP(0x30,$out),$inout3); | ||
413 | &movups (&QWP(0x40,$out),$inout4); | ||
414 | &movups (&QWP(0x50,$out),$inout5); | ||
415 | &lea ($out,&DWP(0x60,$out)); | ||
416 | &add ($len,0x60); | ||
417 | &jz (&label("ecb_ret")); | ||
418 | |||
419 | &set_label("ecb_enc_tail"); | ||
420 | &movups ($inout0,&QWP(0,$inp)); | ||
421 | &cmp ($len,0x20); | ||
422 | &jb (&label("ecb_enc_one")); | ||
423 | &movups ($inout1,&QWP(0x10,$inp)); | ||
424 | &je (&label("ecb_enc_two")); | ||
425 | &movups ($inout2,&QWP(0x20,$inp)); | ||
426 | &cmp ($len,0x40); | ||
427 | &jb (&label("ecb_enc_three")); | ||
428 | &movups ($inout3,&QWP(0x30,$inp)); | ||
429 | &je (&label("ecb_enc_four")); | ||
430 | &movups ($inout4,&QWP(0x40,$inp)); | ||
431 | &xorps ($inout5,$inout5); | ||
432 | &call ("_aesni_encrypt6"); | ||
433 | &movups (&QWP(0,$out),$inout0); | ||
434 | &movups (&QWP(0x10,$out),$inout1); | ||
435 | &movups (&QWP(0x20,$out),$inout2); | ||
436 | &movups (&QWP(0x30,$out),$inout3); | ||
437 | &movups (&QWP(0x40,$out),$inout4); | ||
438 | jmp (&label("ecb_ret")); | ||
439 | |||
440 | &set_label("ecb_enc_one",16); | ||
441 | if ($inline) | ||
442 | { &aesni_inline_generate1("enc"); } | ||
443 | else | ||
444 | { &call ("_aesni_encrypt1"); } | ||
445 | &movups (&QWP(0,$out),$inout0); | ||
446 | &jmp (&label("ecb_ret")); | ||
447 | |||
448 | &set_label("ecb_enc_two",16); | ||
449 | &xorps ($inout2,$inout2); | ||
450 | &call ("_aesni_encrypt3"); | ||
451 | &movups (&QWP(0,$out),$inout0); | ||
452 | &movups (&QWP(0x10,$out),$inout1); | ||
453 | &jmp (&label("ecb_ret")); | ||
454 | |||
455 | &set_label("ecb_enc_three",16); | ||
456 | &call ("_aesni_encrypt3"); | ||
457 | &movups (&QWP(0,$out),$inout0); | ||
458 | &movups (&QWP(0x10,$out),$inout1); | ||
459 | &movups (&QWP(0x20,$out),$inout2); | ||
460 | &jmp (&label("ecb_ret")); | ||
461 | |||
462 | &set_label("ecb_enc_four",16); | ||
463 | &call ("_aesni_encrypt4"); | ||
464 | &movups (&QWP(0,$out),$inout0); | ||
465 | &movups (&QWP(0x10,$out),$inout1); | ||
466 | &movups (&QWP(0x20,$out),$inout2); | ||
467 | &movups (&QWP(0x30,$out),$inout3); | ||
468 | &jmp (&label("ecb_ret")); | ||
469 | ###################################################################### | ||
470 | &set_label("ecb_decrypt",16); | ||
471 | &mov ($key_,$key); # backup $key | ||
472 | &mov ($rounds_,$rounds); # backup $rounds | ||
473 | &cmp ($len,0x60); | ||
474 | &jb (&label("ecb_dec_tail")); | ||
475 | |||
476 | &movdqu ($inout0,&QWP(0,$inp)); | ||
477 | &movdqu ($inout1,&QWP(0x10,$inp)); | ||
478 | &movdqu ($inout2,&QWP(0x20,$inp)); | ||
479 | &movdqu ($inout3,&QWP(0x30,$inp)); | ||
480 | &movdqu ($inout4,&QWP(0x40,$inp)); | ||
481 | &movdqu ($inout5,&QWP(0x50,$inp)); | ||
482 | &lea ($inp,&DWP(0x60,$inp)); | ||
483 | &sub ($len,0x60); | ||
484 | &jmp (&label("ecb_dec_loop6_enter")); | ||
485 | |||
486 | &set_label("ecb_dec_loop6",16); | ||
487 | &movups (&QWP(0,$out),$inout0); | ||
488 | &movdqu ($inout0,&QWP(0,$inp)); | ||
489 | &movups (&QWP(0x10,$out),$inout1); | ||
490 | &movdqu ($inout1,&QWP(0x10,$inp)); | ||
491 | &movups (&QWP(0x20,$out),$inout2); | ||
492 | &movdqu ($inout2,&QWP(0x20,$inp)); | ||
493 | &movups (&QWP(0x30,$out),$inout3); | ||
494 | &movdqu ($inout3,&QWP(0x30,$inp)); | ||
495 | &movups (&QWP(0x40,$out),$inout4); | ||
496 | &movdqu ($inout4,&QWP(0x40,$inp)); | ||
497 | &movups (&QWP(0x50,$out),$inout5); | ||
498 | &lea ($out,&DWP(0x60,$out)); | ||
499 | &movdqu ($inout5,&QWP(0x50,$inp)); | ||
500 | &lea ($inp,&DWP(0x60,$inp)); | ||
501 | &set_label("ecb_dec_loop6_enter"); | ||
502 | |||
503 | &call ("_aesni_decrypt6"); | ||
504 | |||
505 | &mov ($key,$key_); # restore $key | ||
506 | &mov ($rounds,$rounds_); # restore $rounds | ||
507 | &sub ($len,0x60); | ||
508 | &jnc (&label("ecb_dec_loop6")); | ||
509 | |||
510 | &movups (&QWP(0,$out),$inout0); | ||
511 | &movups (&QWP(0x10,$out),$inout1); | ||
512 | &movups (&QWP(0x20,$out),$inout2); | ||
513 | &movups (&QWP(0x30,$out),$inout3); | ||
514 | &movups (&QWP(0x40,$out),$inout4); | ||
515 | &movups (&QWP(0x50,$out),$inout5); | ||
516 | &lea ($out,&DWP(0x60,$out)); | ||
517 | &add ($len,0x60); | ||
518 | &jz (&label("ecb_ret")); | ||
519 | |||
520 | &set_label("ecb_dec_tail"); | ||
521 | &movups ($inout0,&QWP(0,$inp)); | ||
522 | &cmp ($len,0x20); | ||
523 | &jb (&label("ecb_dec_one")); | ||
524 | &movups ($inout1,&QWP(0x10,$inp)); | ||
525 | &je (&label("ecb_dec_two")); | ||
526 | &movups ($inout2,&QWP(0x20,$inp)); | ||
527 | &cmp ($len,0x40); | ||
528 | &jb (&label("ecb_dec_three")); | ||
529 | &movups ($inout3,&QWP(0x30,$inp)); | ||
530 | &je (&label("ecb_dec_four")); | ||
531 | &movups ($inout4,&QWP(0x40,$inp)); | ||
532 | &xorps ($inout5,$inout5); | ||
533 | &call ("_aesni_decrypt6"); | ||
534 | &movups (&QWP(0,$out),$inout0); | ||
535 | &movups (&QWP(0x10,$out),$inout1); | ||
536 | &movups (&QWP(0x20,$out),$inout2); | ||
537 | &movups (&QWP(0x30,$out),$inout3); | ||
538 | &movups (&QWP(0x40,$out),$inout4); | ||
539 | &jmp (&label("ecb_ret")); | ||
540 | |||
541 | &set_label("ecb_dec_one",16); | ||
542 | if ($inline) | ||
543 | { &aesni_inline_generate1("dec"); } | ||
544 | else | ||
545 | { &call ("_aesni_decrypt1"); } | ||
546 | &movups (&QWP(0,$out),$inout0); | ||
547 | &jmp (&label("ecb_ret")); | ||
548 | |||
549 | &set_label("ecb_dec_two",16); | ||
550 | &xorps ($inout2,$inout2); | ||
551 | &call ("_aesni_decrypt3"); | ||
552 | &movups (&QWP(0,$out),$inout0); | ||
553 | &movups (&QWP(0x10,$out),$inout1); | ||
554 | &jmp (&label("ecb_ret")); | ||
555 | |||
556 | &set_label("ecb_dec_three",16); | ||
557 | &call ("_aesni_decrypt3"); | ||
558 | &movups (&QWP(0,$out),$inout0); | ||
559 | &movups (&QWP(0x10,$out),$inout1); | ||
560 | &movups (&QWP(0x20,$out),$inout2); | ||
561 | &jmp (&label("ecb_ret")); | ||
562 | |||
563 | &set_label("ecb_dec_four",16); | ||
564 | &call ("_aesni_decrypt4"); | ||
565 | &movups (&QWP(0,$out),$inout0); | ||
566 | &movups (&QWP(0x10,$out),$inout1); | ||
567 | &movups (&QWP(0x20,$out),$inout2); | ||
568 | &movups (&QWP(0x30,$out),$inout3); | ||
569 | |||
570 | &set_label("ecb_ret"); | ||
571 | &function_end("aesni_ecb_encrypt"); | ||
572 | |||
573 | ###################################################################### | ||
574 | # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, | ||
575 | # size_t blocks, const AES_KEY *key, | ||
576 | # const char *ivec,char *cmac); | ||
577 | # | ||
578 | # Handles only complete blocks, operates on 64-bit counter and | ||
579 | # does not update *ivec! Nor does it finalize CMAC value | ||
580 | # (see engine/eng_aesni.c for details) | ||
581 | # | ||
582 | { my $cmac=$inout1; | ||
# ---------------------------------------------------------------------
# Generate aesni_ccm64_encrypt_blocks(): AES-NI CCM encryption with a
# 64-bit counter.  Per the prototype comment above: complete blocks
# only, *ivec is not updated, and the CMAC value is not finalized.
# Note the register double-duty: $rounds_/$rounds first carry the
# wparam(4)/wparam(5) pointers (ivec/cmac) and are then reused as the
# saved/working round counters.
583 | &function_begin("aesni_ccm64_encrypt_blocks"); | ||
584 | &mov ($inp,&wparam(0)); | ||
585 | &mov ($out,&wparam(1)); | ||
586 | &mov ($len,&wparam(2)); | ||
587 | &mov ($key,&wparam(3)); | ||
588 | &mov ($rounds_,&wparam(4)); | ||
589 | &mov ($rounds,&wparam(5)); | ||
# Carve out an aligned scratch area; the original %esp is preserved at
# 48(%esp) and restored in the epilogue.
590 | &mov ($key_,"esp"); | ||
591 | &sub ("esp",60); | ||
592 | &and ("esp",-16); # align stack | ||
593 | &mov (&DWP(48,"esp"),$key_); | ||
594 | |||
595 | &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec | ||
596 | &movdqu ($cmac,&QWP(0,$rounds)); # load cmac | ||
597 | &mov ($rounds,&DWP(240,$key)); # key->rounds | ||
598 | |||
599 | # compose byte-swap control mask for pshufb on stack | ||
600 | &mov (&DWP(0,"esp"),0x0c0d0e0f); | ||
601 | &mov (&DWP(4,"esp"),0x08090a0b); | ||
602 | &mov (&DWP(8,"esp"),0x04050607); | ||
603 | &mov (&DWP(12,"esp"),0x00010203); | ||
604 | |||
605 | # compose counter increment vector on stack | ||
606 | &mov ($rounds_,1); | ||
607 | &xor ($key_,$key_); | ||
608 | &mov (&DWP(16,"esp"),$rounds_); | ||
609 | &mov (&DWP(20,"esp"),$key_); | ||
610 | &mov (&DWP(24,"esp"),$key_); | ||
611 | &mov (&DWP(28,"esp"),$key_); | ||
612 | |||
# Halve the round count: each pass of ccm64_enc2_loop below consumes
# two round keys.  $inout3 holds the byte-swap mask for the duration;
# the counter is kept byte-swapped in $ivec so paddq can increment it.
613 | &shr ($rounds,1); | ||
614 | &lea ($key_,&DWP(0,$key)); | ||
615 | &movdqa ($inout3,&QWP(0,"esp")); | ||
616 | &movdqa ($inout0,$ivec); | ||
617 | &mov ($rounds_,$rounds); | ||
618 | &pshufb ($ivec,$inout3); | ||
619 | |||
# Per-block loop: absorb the plaintext into the CMAC (cmac ^= inp,
# folded through key[0]) and then run the counter block and the CMAC
# block through the AES rounds in parallel, two aesenc streams per
# round-key pair.
620 | &set_label("ccm64_enc_outer"); | ||
621 | &$movekey ($rndkey0,&QWP(0,$key_)); | ||
622 | &mov ($rounds,$rounds_); | ||
623 | &movups ($in0,&QWP(0,$inp)); | ||
624 | |||
625 | &xorps ($inout0,$rndkey0); | ||
626 | &$movekey ($rndkey1,&QWP(16,$key_)); | ||
627 | &xorps ($rndkey0,$in0); | ||
628 | &lea ($key,&DWP(32,$key_)); | ||
629 | &xorps ($cmac,$rndkey0); # cmac^=inp | ||
630 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
631 | |||
632 | &set_label("ccm64_enc2_loop"); | ||
633 | &aesenc ($inout0,$rndkey1); | ||
634 | &dec ($rounds); | ||
635 | &aesenc ($cmac,$rndkey1); | ||
636 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
637 | &aesenc ($inout0,$rndkey0); | ||
638 | &lea ($key,&DWP(32,$key)); | ||
639 | &aesenc ($cmac,$rndkey0); | ||
640 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
641 | &jnz (&label("ccm64_enc2_loop")); | ||
# Final two rounds unrolled; the (byte-swapped) counter is advanced
# with paddq while the last aesenc/aesenclast pairs retire.
642 | &aesenc ($inout0,$rndkey1); | ||
643 | &aesenc ($cmac,$rndkey1); | ||
644 | &paddq ($ivec,&QWP(16,"esp")); | ||
645 | &aesenclast ($inout0,$rndkey0); | ||
646 | &aesenclast ($cmac,$rndkey0); | ||
647 | |||
648 | &dec ($len); | ||
649 | &lea ($inp,&DWP(16,$inp)); | ||
650 | &xorps ($in0,$inout0); # inp^=E(ivec) | ||
651 | &movdqa ($inout0,$ivec); | ||
652 | &movups (&QWP(0,$out),$in0); # save output | ||
653 | &lea ($out,&DWP(16,$out)); | ||
654 | &pshufb ($inout0,$inout3); | ||
655 | &jnz (&label("ccm64_enc_outer")); | ||
656 | |||
# Epilogue: restore the saved %esp and write the updated (still
# unfinalized) CMAC back through the wparam(5) pointer.
657 | &mov ("esp",&DWP(48,"esp")); | ||
658 | &mov ($out,&wparam(5)); | ||
659 | &movups (&QWP(0,$out),$cmac); | ||
660 | &function_end("aesni_ccm64_encrypt_blocks"); | ||
661 | |||
# ---------------------------------------------------------------------
# Generate aesni_ccm64_decrypt_blocks(): AES-NI CCM decryption with a
# 64-bit counter.  Same caveats as the encrypt path: complete blocks
# only, *ivec not updated, CMAC not finalized.  The code is software-
# pipelined: the first counter block is encrypted before the loop, and
# each loop pass decrypts block N while already running E(counter N+1)
# in parallel with the CMAC update for plaintext N.
662 | &function_begin("aesni_ccm64_decrypt_blocks"); | ||
663 | &mov ($inp,&wparam(0)); | ||
664 | &mov ($out,&wparam(1)); | ||
665 | &mov ($len,&wparam(2)); | ||
666 | &mov ($key,&wparam(3)); | ||
667 | &mov ($rounds_,&wparam(4)); | ||
668 | &mov ($rounds,&wparam(5)); | ||
# Aligned scratch area, original %esp kept at 48(%esp) as in the
# encrypt path.
669 | &mov ($key_,"esp"); | ||
670 | &sub ("esp",60); | ||
671 | &and ("esp",-16); # align stack | ||
672 | &mov (&DWP(48,"esp"),$key_); | ||
673 | |||
674 | &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec | ||
675 | &movdqu ($cmac,&QWP(0,$rounds)); # load cmac | ||
676 | &mov ($rounds,&DWP(240,$key)); # key->rounds | ||
677 | |||
678 | # compose byte-swap control mask for pshufb on stack | ||
679 | &mov (&DWP(0,"esp"),0x0c0d0e0f); | ||
680 | &mov (&DWP(4,"esp"),0x08090a0b); | ||
681 | &mov (&DWP(8,"esp"),0x04050607); | ||
682 | &mov (&DWP(12,"esp"),0x00010203); | ||
683 | |||
684 | # compose counter increment vector on stack | ||
685 | &mov ($rounds_,1); | ||
686 | &xor ($key_,$key_); | ||
687 | &mov (&DWP(16,"esp"),$rounds_); | ||
688 | &mov (&DWP(20,"esp"),$key_); | ||
689 | &mov (&DWP(24,"esp"),$key_); | ||
690 | &mov (&DWP(28,"esp"),$key_); | ||
691 | |||
692 | &movdqa ($inout3,&QWP(0,"esp")); # bswap mask | ||
693 | &movdqa ($inout0,$ivec); | ||
694 | |||
695 | &mov ($key_,$key); | ||
696 | &mov ($rounds_,$rounds); | ||
697 | |||
# Prologue of the pipeline: encrypt the first counter block up front
# (counter kept byte-swapped in $ivec, bumped with paddq), load the
# first ciphertext block and fall into the loop.
698 | &pshufb ($ivec,$inout3); | ||
699 | if ($inline) | ||
700 | { &aesni_inline_generate1("enc"); } | ||
701 | else | ||
702 | { &call ("_aesni_encrypt1"); } | ||
703 | &movups ($in0,&QWP(0,$inp)); # load inp | ||
704 | &paddq ($ivec,&QWP(16,"esp")); | ||
705 | &lea ($inp,&QWP(16,$inp)); | ||
706 | &jmp (&label("ccm64_dec_outer")); | ||
707 | |||
# Recover plaintext for the current block, store it, and set up the
# next counter block.
708 | &set_label("ccm64_dec_outer",16); | ||
709 | &xorps ($in0,$inout0); # inp ^= E(ivec) | ||
710 | &movdqa ($inout0,$ivec); | ||
711 | &mov ($rounds,$rounds_); | ||
712 | &movups (&QWP(0,$out),$in0); # save output | ||
713 | &lea ($out,&DWP(16,$out)); | ||
714 | &pshufb ($inout0,$inout3); | ||
715 | |||
716 | &sub ($len,1); | ||
717 | &jz (&label("ccm64_dec_break")); | ||
718 | |||
# Two parallel aesenc streams per round-key pair: E(next counter) in
# $inout0 and the CMAC update (cmac ^= recovered plaintext) in $cmac.
# $rounds was halved with shr because each loop pass eats two keys.
719 | &$movekey ($rndkey0,&QWP(0,$key_)); | ||
720 | &shr ($rounds,1); | ||
721 | &$movekey ($rndkey1,&QWP(16,$key_)); | ||
722 | &xorps ($in0,$rndkey0); | ||
723 | &lea ($key,&DWP(32,$key_)); | ||
724 | &xorps ($inout0,$rndkey0); | ||
725 | &xorps ($cmac,$in0); # cmac^=out | ||
726 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
727 | |||
728 | &set_label("ccm64_dec2_loop"); | ||
729 | &aesenc ($inout0,$rndkey1); | ||
730 | &dec ($rounds); | ||
731 | &aesenc ($cmac,$rndkey1); | ||
732 | &$movekey ($rndkey1,&QWP(16,$key)); | ||
733 | &aesenc ($inout0,$rndkey0); | ||
734 | &lea ($key,&DWP(32,$key)); | ||
735 | &aesenc ($cmac,$rndkey0); | ||
736 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
737 | &jnz (&label("ccm64_dec2_loop")); | ||
# Last two rounds unrolled; the next ciphertext load and counter bump
# are interleaved to hide their latency.
738 | &movups ($in0,&QWP(0,$inp)); # load inp | ||
739 | &paddq ($ivec,&QWP(16,"esp")); | ||
740 | &aesenc ($inout0,$rndkey1); | ||
741 | &aesenc ($cmac,$rndkey1); | ||
742 | &lea ($inp,&QWP(16,$inp)); | ||
743 | &aesenclast ($inout0,$rndkey0); | ||
744 | &aesenclast ($cmac,$rndkey0); | ||
745 | &jmp (&label("ccm64_dec_outer")); | ||
746 | |||
# Tail: the last block's plaintext ($in0) still has to be folded into
# the CMAC.  NOTE(review): the extra arguments to
# aesni_inline_generate1("enc",$cmac,$in0) presumably select $cmac as
# the state to encrypt with $in0 as input/scratch — confirm against
# the generator's definition earlier in this file.
747 | &set_label("ccm64_dec_break",16); | ||
748 | &mov ($key,$key_); | ||
749 | if ($inline) | ||
750 | { &aesni_inline_generate1("enc",$cmac,$in0); } | ||
751 | else | ||
752 | { &call ("_aesni_encrypt1",$cmac); } | ||
753 | |||
# Epilogue: restore %esp and write the updated CMAC back via
# wparam(5).
754 | &mov ("esp",&DWP(48,"esp")); | ||
755 | &mov ($out,&wparam(5)); | ||
756 | &movups (&QWP(0,$out),$cmac); | ||
757 | &function_end("aesni_ccm64_decrypt_blocks"); | ||
758 | } | ||
759 | |||
760 | ###################################################################### | ||
761 | # void aesni_ctr32_encrypt_blocks (const void *in, void *out, | ||
762 | # size_t blocks, const AES_KEY *key, | ||
763 | # const char *ivec); | ||
764 | # | ||
765 | # Handles only complete blocks, operates on 32-bit counter and | ||
766 | # does not update *ivec! (see engine/eng_aesni.c for details) | ||
767 | # | ||
768 | # stack layout: | ||
769 | # 0 pshufb mask | ||
770 | # 16 vector addend: 0,6,6,6 | ||
771 | # 32 counter-less ivec | ||
772 | # 48 1st triplet of counter vector | ||
773 | # 64 2nd triplet of counter vector | ||
774 | # 80 saved %esp | ||
775 | |||
# ---------------------------------------------------------------------
# Generate aesni_ctr32_encrypt_blocks(): AES-NI CTR mode with a 32-bit
# big-endian counter in the top dword of the ivec.  Complete blocks
# only; *ivec is not updated (see the prototype comment above).  Stack
# layout is documented above the function.  The main loop processes
# six blocks per iteration, keeping the six counter values as two
# 3x32-bit "triplets" (48(%esp)/64(%esp)) that are advanced with a
# single paddd each and byte-swapped with pshufb.
776 | &function_begin("aesni_ctr32_encrypt_blocks"); | ||
777 | &mov ($inp,&wparam(0)); | ||
778 | &mov ($out,&wparam(1)); | ||
779 | &mov ($len,&wparam(2)); | ||
780 | &mov ($key,&wparam(3)); | ||
781 | &mov ($rounds_,&wparam(4)); | ||
# Aligned scratch area; original %esp preserved at 80(%esp) and
# restored at ctr32_ret.
782 | &mov ($key_,"esp"); | ||
783 | &sub ("esp",88); | ||
784 | &and ("esp",-16); # align stack | ||
785 | &mov (&DWP(80,"esp"),$key_); | ||
786 | |||
787 | &cmp ($len,1); | ||
788 | &je (&label("ctr32_one_shortcut")); | ||
789 | |||
790 | &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec | ||
791 | |||
792 | # compose byte-swap control mask for pshufb on stack | ||
793 | &mov (&DWP(0,"esp"),0x0c0d0e0f); | ||
794 | &mov (&DWP(4,"esp"),0x08090a0b); | ||
795 | &mov (&DWP(8,"esp"),0x04050607); | ||
796 | &mov (&DWP(12,"esp"),0x00010203); | ||
797 | |||
798 | # compose counter increment vector on stack | ||
799 | &mov ($rounds,6); | ||
800 | &xor ($key_,$key_); | ||
801 | &mov (&DWP(16,"esp"),$rounds); | ||
802 | &mov (&DWP(20,"esp"),$rounds); | ||
803 | &mov (&DWP(24,"esp"),$rounds); | ||
804 | &mov (&DWP(28,"esp"),$key_); | ||
805 | |||
# Split the ivec: the 32-bit counter is extracted into a GP register
# and zeroed out of $inout5, which then holds the counter-less ivec.
806 | &pextrd ($rounds_,$inout5,3); # pull 32-bit counter | ||
807 | &pinsrd ($inout5,$key_,3); # wipe 32-bit counter | ||
808 | |||
809 | &mov ($rounds,&DWP(240,$key)); # key->rounds | ||
810 | |||
# Counter values n..n+2 go into $rndkey1, n+3..n+5 into $rndkey0
# (host order in the saved copies, byte-swapped in the registers).
811 | # compose 2 vectors of 3x32-bit counters | ||
812 | &bswap ($rounds_); | ||
813 | &pxor ($rndkey1,$rndkey1); | ||
814 | &pxor ($rndkey0,$rndkey0); | ||
815 | &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask | ||
816 | &pinsrd ($rndkey1,$rounds_,0); | ||
817 | &lea ($key_,&DWP(3,$rounds_)); | ||
818 | &pinsrd ($rndkey0,$key_,0); | ||
819 | &inc ($rounds_); | ||
820 | &pinsrd ($rndkey1,$rounds_,1); | ||
821 | &inc ($key_); | ||
822 | &pinsrd ($rndkey0,$key_,1); | ||
823 | &inc ($rounds_); | ||
824 | &pinsrd ($rndkey1,$rounds_,2); | ||
825 | &inc ($key_); | ||
826 | &pinsrd ($rndkey0,$key_,2); | ||
827 | &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet | ||
828 | &pshufb ($rndkey1,$inout0); # byte swap | ||
829 | &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet | ||
830 | &pshufb ($rndkey0,$inout0); # byte swap | ||
831 | |||
832 | &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword | ||
833 | &pshufd ($inout1,$rndkey1,2<<6); | ||
834 | &cmp ($len,6); | ||
835 | &jb (&label("ctr32_tail")); | ||
# Six-block path: halve $rounds (the inlined prologue below consumes
# two round keys per pass) and back up $key/$rounds for the loop.
836 | &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec | ||
837 | &shr ($rounds,1); | ||
838 | &mov ($key_,$key); # backup $key | ||
839 | &mov ($rounds_,$rounds); # backup $rounds | ||
840 | &sub ($len,6); | ||
841 | &jmp (&label("ctr32_loop6")); | ||
842 | |||
# Build the six counter blocks by broadcasting each triplet lane to
# the top dword (pshufd) and merging the counter-less ivec with por.
843 | &set_label("ctr32_loop6",16); | ||
844 | &pshufd ($inout2,$rndkey1,1<<6); | ||
845 | &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec | ||
846 | &pshufd ($inout3,$rndkey0,3<<6); | ||
847 | &por ($inout0,$rndkey1); # merge counter-less ivec | ||
848 | &pshufd ($inout4,$rndkey0,2<<6); | ||
849 | &por ($inout1,$rndkey1); | ||
850 | &pshufd ($inout5,$rndkey0,1<<6); | ||
851 | &por ($inout2,$rndkey1); | ||
852 | &por ($inout3,$rndkey1); | ||
853 | &por ($inout4,$rndkey1); | ||
854 | &por ($inout5,$rndkey1); | ||
855 | |||
856 | # inlining _aesni_encrypt6's prologue gives ~4% improvement... | ||
857 | &$movekey ($rndkey0,&QWP(0,$key_)); | ||
858 | &$movekey ($rndkey1,&QWP(16,$key_)); | ||
859 | &lea ($key,&DWP(32,$key_)); | ||
860 | &dec ($rounds); | ||
861 | &pxor ($inout0,$rndkey0); | ||
862 | &pxor ($inout1,$rndkey0); | ||
863 | &aesenc ($inout0,$rndkey1); | ||
864 | &pxor ($inout2,$rndkey0); | ||
865 | &aesenc ($inout1,$rndkey1); | ||
866 | &pxor ($inout3,$rndkey0); | ||
867 | &aesenc ($inout2,$rndkey1); | ||
868 | &pxor ($inout4,$rndkey0); | ||
869 | &aesenc ($inout3,$rndkey1); | ||
870 | &pxor ($inout5,$rndkey0); | ||
871 | &aesenc ($inout4,$rndkey1); | ||
872 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
873 | &aesenc ($inout5,$rndkey1); | ||
874 | |||
# Jump past _aesni_encrypt6's own prologue (it was inlined above).
875 | &call (&label("_aesni_encrypt6_enter")); | ||
876 | |||
# XOR the keystream with the input and store; counter maintenance for
# the next iteration (paddd both triplets, re-swap) is interleaved
# with the loads/stores to hide latency.
877 | &movups ($rndkey1,&QWP(0,$inp)); | ||
878 | &movups ($rndkey0,&QWP(0x10,$inp)); | ||
879 | &xorps ($inout0,$rndkey1); | ||
880 | &movups ($rndkey1,&QWP(0x20,$inp)); | ||
881 | &xorps ($inout1,$rndkey0); | ||
882 | &movups (&QWP(0,$out),$inout0); | ||
883 | &movdqa ($rndkey0,&QWP(16,"esp")); # load increment | ||
884 | &xorps ($inout2,$rndkey1); | ||
885 | &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet | ||
886 | &movups (&QWP(0x10,$out),$inout1); | ||
887 | &movups (&QWP(0x20,$out),$inout2); | ||
888 | |||
889 | &paddd ($rndkey1,$rndkey0); # 1st triplet increment | ||
890 | &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment | ||
891 | &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask | ||
892 | |||
893 | &movups ($inout1,&QWP(0x30,$inp)); | ||
894 | &movups ($inout2,&QWP(0x40,$inp)); | ||
895 | &xorps ($inout3,$inout1); | ||
896 | &movups ($inout1,&QWP(0x50,$inp)); | ||
897 | &lea ($inp,&DWP(0x60,$inp)); | ||
898 | &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet | ||
899 | &pshufb ($rndkey1,$inout0); # byte swap | ||
900 | &xorps ($inout4,$inout2); | ||
901 | &movups (&QWP(0x30,$out),$inout3); | ||
902 | &xorps ($inout5,$inout1); | ||
903 | &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet | ||
904 | &pshufb ($rndkey0,$inout0); # byte swap | ||
905 | &movups (&QWP(0x40,$out),$inout4); | ||
906 | &pshufd ($inout0,$rndkey1,3<<6); | ||
907 | &movups (&QWP(0x50,$out),$inout5); | ||
908 | &lea ($out,&DWP(0x60,$out)); | ||
909 | |||
910 | &mov ($rounds,$rounds_); | ||
911 | &pshufd ($inout1,$rndkey1,2<<6); | ||
912 | &sub ($len,6); | ||
913 | &jnc (&label("ctr32_loop6")); | ||
914 | |||
# 0-5 blocks remain.  Undo the shr above via lea ($rounds = 2*r+1,
# the form the _aesni_encrypt* helpers use) and reload the saved ivec.
915 | &add ($len,6); | ||
916 | &jz (&label("ctr32_ret")); | ||
917 | &mov ($key,$key_); | ||
918 | &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds | ||
919 | &movdqa ($inout5,&QWP(32,"esp")); # pull counter-less ivec | ||
920 | |||
# Tail dispatch on $len (1..5): $inout0..4 already hold byte-swapped
# counters, only the counter-less ivec still has to be merged in.
921 | &set_label("ctr32_tail"); | ||
922 | &por ($inout0,$inout5); | ||
923 | &cmp ($len,2); | ||
924 | &jb (&label("ctr32_one")); | ||
925 | |||
926 | &pshufd ($inout2,$rndkey1,1<<6); | ||
927 | &por ($inout1,$inout5); | ||
928 | &je (&label("ctr32_two")); | ||
929 | |||
930 | &pshufd ($inout3,$rndkey0,3<<6); | ||
931 | &por ($inout2,$inout5); | ||
932 | &cmp ($len,4); | ||
933 | &jb (&label("ctr32_three")); | ||
934 | |||
935 | &pshufd ($inout4,$rndkey0,2<<6); | ||
936 | &por ($inout3,$inout5); | ||
937 | &je (&label("ctr32_four")); | ||
938 | |||
# Five blocks: encrypt six (the sixth lane is garbage and discarded).
939 | &por ($inout4,$inout5); | ||
940 | &call ("_aesni_encrypt6"); | ||
941 | &movups ($rndkey1,&QWP(0,$inp)); | ||
942 | &movups ($rndkey0,&QWP(0x10,$inp)); | ||
943 | &xorps ($inout0,$rndkey1); | ||
944 | &movups ($rndkey1,&QWP(0x20,$inp)); | ||
945 | &xorps ($inout1,$rndkey0); | ||
946 | &movups ($rndkey0,&QWP(0x30,$inp)); | ||
947 | &xorps ($inout2,$rndkey1); | ||
948 | &movups ($rndkey1,&QWP(0x40,$inp)); | ||
949 | &xorps ($inout3,$rndkey0); | ||
950 | &movups (&QWP(0,$out),$inout0); | ||
951 | &xorps ($inout4,$rndkey1); | ||
952 | &movups (&QWP(0x10,$out),$inout1); | ||
953 | &movups (&QWP(0x20,$out),$inout2); | ||
954 | &movups (&QWP(0x30,$out),$inout3); | ||
955 | &movups (&QWP(0x40,$out),$inout4); | ||
956 | &jmp (&label("ctr32_ret")); | ||
957 | |||
# Single-block fast path taken before any mask/counter setup: the
# ivec is used as-is, no counter manipulation is needed.
958 | &set_label("ctr32_one_shortcut",16); | ||
959 | &movups ($inout0,&QWP(0,$rounds_)); # load ivec | ||
960 | &mov ($rounds,&DWP(240,$key)); | ||
961 | |||
962 | &set_label("ctr32_one"); | ||
963 | if ($inline) | ||
964 | { &aesni_inline_generate1("enc"); } | ||
965 | else | ||
966 | { &call ("_aesni_encrypt1"); } | ||
967 | &movups ($in0,&QWP(0,$inp)); | ||
968 | &xorps ($in0,$inout0); | ||
969 | &movups (&QWP(0,$out),$in0); | ||
970 | &jmp (&label("ctr32_ret")); | ||
971 | |||
# Two blocks: encrypt three, third lane discarded.
972 | &set_label("ctr32_two",16); | ||
973 | &call ("_aesni_encrypt3"); | ||
974 | &movups ($inout3,&QWP(0,$inp)); | ||
975 | &movups ($inout4,&QWP(0x10,$inp)); | ||
976 | &xorps ($inout0,$inout3); | ||
977 | &xorps ($inout1,$inout4); | ||
978 | &movups (&QWP(0,$out),$inout0); | ||
979 | &movups (&QWP(0x10,$out),$inout1); | ||
980 | &jmp (&label("ctr32_ret")); | ||
981 | |||
982 | &set_label("ctr32_three",16); | ||
983 | &call ("_aesni_encrypt3"); | ||
984 | &movups ($inout3,&QWP(0,$inp)); | ||
985 | &movups ($inout4,&QWP(0x10,$inp)); | ||
986 | &xorps ($inout0,$inout3); | ||
987 | &movups ($inout5,&QWP(0x20,$inp)); | ||
988 | &xorps ($inout1,$inout4); | ||
989 | &movups (&QWP(0,$out),$inout0); | ||
990 | &xorps ($inout2,$inout5); | ||
991 | &movups (&QWP(0x10,$out),$inout1); | ||
992 | &movups (&QWP(0x20,$out),$inout2); | ||
993 | &jmp (&label("ctr32_ret")); | ||
994 | |||
995 | &set_label("ctr32_four",16); | ||
996 | &call ("_aesni_encrypt4"); | ||
997 | &movups ($inout4,&QWP(0,$inp)); | ||
998 | &movups ($inout5,&QWP(0x10,$inp)); | ||
999 | &movups ($rndkey1,&QWP(0x20,$inp)); | ||
1000 | &xorps ($inout0,$inout4); | ||
1001 | &movups ($rndkey0,&QWP(0x30,$inp)); | ||
1002 | &xorps ($inout1,$inout5); | ||
1003 | &movups (&QWP(0,$out),$inout0); | ||
1004 | &xorps ($inout2,$rndkey1); | ||
1005 | &movups (&QWP(0x10,$out),$inout1); | ||
1006 | &xorps ($inout3,$rndkey0); | ||
1007 | &movups (&QWP(0x20,$out),$inout2); | ||
1008 | &movups (&QWP(0x30,$out),$inout3); | ||
1009 | |||
# Common exit: restore the %esp saved in the prologue.
1010 | &set_label("ctr32_ret"); | ||
1011 | &mov ("esp",&DWP(80,"esp")); | ||
1012 | &function_end("aesni_ctr32_encrypt_blocks"); | ||
1013 | |||
1014 | ###################################################################### | ||
1015 | # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
1016 | # const AES_KEY *key1, const AES_KEY *key2, | ||
1017 | # const unsigned char iv[16]); | ||
1018 | # | ||
1019 | { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); | ||
1020 | |||
1021 | &function_begin("aesni_xts_encrypt"); | ||
1022 | &mov ($key,&wparam(4)); # key2 | ||
1023 | &mov ($inp,&wparam(5)); # clear-text tweak | ||
1024 | |||
1025 | &mov ($rounds,&DWP(240,$key)); # key2->rounds | ||
1026 | &movups ($inout0,&QWP(0,$inp)); | ||
1027 | if ($inline) | ||
1028 | { &aesni_inline_generate1("enc"); } | ||
1029 | else | ||
1030 | { &call ("_aesni_encrypt1"); } | ||
1031 | |||
1032 | &mov ($inp,&wparam(0)); | ||
1033 | &mov ($out,&wparam(1)); | ||
1034 | &mov ($len,&wparam(2)); | ||
1035 | &mov ($key,&wparam(3)); # key1 | ||
1036 | |||
1037 | &mov ($key_,"esp"); | ||
1038 | &sub ("esp",16*7+8); | ||
1039 | &mov ($rounds,&DWP(240,$key)); # key1->rounds | ||
1040 | &and ("esp",-16); # align stack | ||
1041 | |||
1042 | &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant | ||
1043 | &mov (&DWP(16*6+4,"esp"),0); | ||
1044 | &mov (&DWP(16*6+8,"esp"),1); | ||
1045 | &mov (&DWP(16*6+12,"esp"),0); | ||
1046 | &mov (&DWP(16*7+0,"esp"),$len); # save original $len | ||
1047 | &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp | ||
1048 | |||
1049 | &movdqa ($tweak,$inout0); | ||
1050 | &pxor ($twtmp,$twtmp); | ||
1051 | &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 | ||
1052 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
1053 | |||
1054 | &and ($len,-16); | ||
1055 | &mov ($key_,$key); # backup $key | ||
1056 | &mov ($rounds_,$rounds); # backup $rounds | ||
1057 | &sub ($len,16*6); | ||
1058 | &jc (&label("xts_enc_short")); | ||
1059 | |||
1060 | &shr ($rounds,1); | ||
1061 | &mov ($rounds_,$rounds); | ||
1062 | &jmp (&label("xts_enc_loop6")); | ||
1063 | |||
1064 | &set_label("xts_enc_loop6",16); | ||
1065 | for ($i=0;$i<4;$i++) { | ||
1066 | &pshufd ($twres,$twtmp,0x13); | ||
1067 | &pxor ($twtmp,$twtmp); | ||
1068 | &movdqa (&QWP(16*$i,"esp"),$tweak); | ||
1069 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
1070 | &pand ($twres,$twmask); # isolate carry and residue | ||
1071 | &pcmpgtd ($twtmp,$tweak); # broadcast upper bits | ||
1072 | &pxor ($tweak,$twres); | ||
1073 | } | ||
1074 | &pshufd ($inout5,$twtmp,0x13); | ||
1075 | &movdqa (&QWP(16*$i++,"esp"),$tweak); | ||
1076 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
1077 | &$movekey ($rndkey0,&QWP(0,$key_)); | ||
1078 | &pand ($inout5,$twmask); # isolate carry and residue | ||
1079 | &movups ($inout0,&QWP(0,$inp)); # load input | ||
1080 | &pxor ($inout5,$tweak); | ||
1081 | |||
1082 | # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] | ||
1083 | &movdqu ($inout1,&QWP(16*1,$inp)); | ||
1084 | &xorps ($inout0,$rndkey0); # input^=rndkey[0] | ||
1085 | &movdqu ($inout2,&QWP(16*2,$inp)); | ||
1086 | &pxor ($inout1,$rndkey0); | ||
1087 | &movdqu ($inout3,&QWP(16*3,$inp)); | ||
1088 | &pxor ($inout2,$rndkey0); | ||
1089 | &movdqu ($inout4,&QWP(16*4,$inp)); | ||
1090 | &pxor ($inout3,$rndkey0); | ||
1091 | &movdqu ($rndkey1,&QWP(16*5,$inp)); | ||
1092 | &pxor ($inout4,$rndkey0); | ||
1093 | &lea ($inp,&DWP(16*6,$inp)); | ||
1094 | &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
1095 | &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak | ||
1096 | &pxor ($inout5,$rndkey1); | ||
1097 | |||
1098 | &$movekey ($rndkey1,&QWP(16,$key_)); | ||
1099 | &lea ($key,&DWP(32,$key_)); | ||
1100 | &pxor ($inout1,&QWP(16*1,"esp")); | ||
1101 | &aesenc ($inout0,$rndkey1); | ||
1102 | &pxor ($inout2,&QWP(16*2,"esp")); | ||
1103 | &aesenc ($inout1,$rndkey1); | ||
1104 | &pxor ($inout3,&QWP(16*3,"esp")); | ||
1105 | &dec ($rounds); | ||
1106 | &aesenc ($inout2,$rndkey1); | ||
1107 | &pxor ($inout4,&QWP(16*4,"esp")); | ||
1108 | &aesenc ($inout3,$rndkey1); | ||
1109 | &pxor ($inout5,$rndkey0); | ||
1110 | &aesenc ($inout4,$rndkey1); | ||
1111 | &$movekey ($rndkey0,&QWP(0,$key)); | ||
1112 | &aesenc ($inout5,$rndkey1); | ||
1113 | &call (&label("_aesni_encrypt6_enter")); | ||
1114 | |||
1115 | &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak | ||
1116 | &pxor ($twtmp,$twtmp); | ||
1117 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
1118 | &pcmpgtd ($twtmp,$tweak); # broadcast upper bits | ||
1119 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
1120 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
1121 | &xorps ($inout2,&QWP(16*2,"esp")); | ||
1122 | &movups (&QWP(16*1,$out),$inout1); | ||
1123 | &xorps ($inout3,&QWP(16*3,"esp")); | ||
1124 | &movups (&QWP(16*2,$out),$inout2); | ||
1125 | &xorps ($inout4,&QWP(16*4,"esp")); | ||
1126 | &movups (&QWP(16*3,$out),$inout3); | ||
1127 | &xorps ($inout5,$tweak); | ||
1128 | &movups (&QWP(16*4,$out),$inout4); | ||
1129 | &pshufd ($twres,$twtmp,0x13); | ||
1130 | &movups (&QWP(16*5,$out),$inout5); | ||
1131 | &lea ($out,&DWP(16*6,$out)); | ||
1132 | &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 | ||
1133 | |||
1134 | &pxor ($twtmp,$twtmp); | ||
1135 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
1136 | &pand ($twres,$twmask); # isolate carry and residue | ||
1137 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
1138 | &mov ($rounds,$rounds_); # restore $rounds | ||
1139 | &pxor ($tweak,$twres); | ||
1140 | |||
1141 | &sub ($len,16*6); | ||
1142 | &jnc (&label("xts_enc_loop6")); | ||
1143 | |||
1144 | &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds | ||
1145 | &mov ($key,$key_); # restore $key | ||
1146 | &mov ($rounds_,$rounds); | ||
1147 | |||
1148 | &set_label("xts_enc_short"); | ||
1149 | &add ($len,16*6); | ||
1150 | &jz (&label("xts_enc_done6x")); | ||
1151 | |||
1152 | &movdqa ($inout3,$tweak); # put aside previous tweak | ||
1153 | &cmp ($len,0x20); | ||
1154 | &jb (&label("xts_enc_one")); | ||
1155 | |||
1156 | &pshufd ($twres,$twtmp,0x13); | ||
1157 | &pxor ($twtmp,$twtmp); | ||
1158 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
1159 | &pand ($twres,$twmask); # isolate carry and residue | ||
1160 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
1161 | &pxor ($tweak,$twres); | ||
1162 | &je (&label("xts_enc_two")); | ||
1163 | |||
1164 | &pshufd ($twres,$twtmp,0x13); | ||
1165 | &pxor ($twtmp,$twtmp); | ||
1166 | &movdqa ($inout4,$tweak); # put aside previous tweak | ||
1167 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
1168 | &pand ($twres,$twmask); # isolate carry and residue | ||
1169 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
1170 | &pxor ($tweak,$twres); | ||
1171 | &cmp ($len,0x40); | ||
1172 | &jb (&label("xts_enc_three")); | ||
1173 | |||
1174 | &pshufd ($twres,$twtmp,0x13); | ||
1175 | &pxor ($twtmp,$twtmp); | ||
1176 | &movdqa ($inout5,$tweak); # put aside previous tweak | ||
1177 | &paddq ($tweak,$tweak); # &psllq($tweak,1); | ||
1178 | &pand ($twres,$twmask); # isolate carry and residue | ||
1179 | &pcmpgtd($twtmp,$tweak); # broadcast upper bits | ||
1180 | &pxor ($tweak,$twres); | ||
1181 | &movdqa (&QWP(16*0,"esp"),$inout3); | ||
1182 | &movdqa (&QWP(16*1,"esp"),$inout4); | ||
1183 | &je (&label("xts_enc_four")); | ||
1184 | |||
1185 | &movdqa (&QWP(16*2,"esp"),$inout5); | ||
1186 | &pshufd ($inout5,$twtmp,0x13); | ||
1187 | &movdqa (&QWP(16*3,"esp"),$tweak); | ||
1188 | &paddq ($tweak,$tweak); # &psllq($inout0,1); | ||
1189 | &pand ($inout5,$twmask); # isolate carry and residue | ||
1190 | &pxor ($inout5,$tweak); | ||
1191 | |||
1192 | &movdqu ($inout0,&QWP(16*0,$inp)); # load input | ||
1193 | &movdqu ($inout1,&QWP(16*1,$inp)); | ||
1194 | &movdqu ($inout2,&QWP(16*2,$inp)); | ||
1195 | &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak | ||
1196 | &movdqu ($inout3,&QWP(16*3,$inp)); | ||
1197 | &pxor ($inout1,&QWP(16*1,"esp")); | ||
1198 | &movdqu ($inout4,&QWP(16*4,$inp)); | ||
1199 | &pxor ($inout2,&QWP(16*2,"esp")); | ||
1200 | &lea ($inp,&DWP(16*5,$inp)); | ||
1201 | &pxor ($inout3,&QWP(16*3,"esp")); | ||
1202 | &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak | ||
1203 | &pxor ($inout4,$inout5); | ||
1204 | |||
1205 | &call ("_aesni_encrypt6"); | ||
1206 | |||
1207 | &movaps ($tweak,&QWP(16*4,"esp")); # last tweak | ||
1208 | &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak | ||
1209 | &xorps ($inout1,&QWP(16*1,"esp")); | ||
1210 | &xorps ($inout2,&QWP(16*2,"esp")); | ||
1211 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
1212 | &xorps ($inout3,&QWP(16*3,"esp")); | ||
1213 | &movups (&QWP(16*1,$out),$inout1); | ||
1214 | &xorps ($inout4,$tweak); | ||
1215 | &movups (&QWP(16*2,$out),$inout2); | ||
1216 | &movups (&QWP(16*3,$out),$inout3); | ||
1217 | &movups (&QWP(16*4,$out),$inout4); | ||
1218 | &lea ($out,&DWP(16*5,$out)); | ||
1219 | &jmp (&label("xts_enc_done")); | ||
1220 | |||
1221 | &set_label("xts_enc_one",16); | ||
1222 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
1223 | &lea ($inp,&DWP(16*1,$inp)); | ||
1224 | &xorps ($inout0,$inout3); # input^=tweak | ||
1225 | if ($inline) | ||
1226 | { &aesni_inline_generate1("enc"); } | ||
1227 | else | ||
1228 | { &call ("_aesni_encrypt1"); } | ||
1229 | &xorps ($inout0,$inout3); # output^=tweak | ||
1230 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
1231 | &lea ($out,&DWP(16*1,$out)); | ||
1232 | |||
1233 | &movdqa ($tweak,$inout3); # last tweak | ||
1234 | &jmp (&label("xts_enc_done")); | ||
1235 | |||
1236 | &set_label("xts_enc_two",16); | ||
1237 | &movaps ($inout4,$tweak); # put aside last tweak | ||
1238 | |||
1239 | &movups ($inout0,&QWP(16*0,$inp)); # load input | ||
1240 | &movups ($inout1,&QWP(16*1,$inp)); | ||
1241 | &lea ($inp,&DWP(16*2,$inp)); | ||
1242 | &xorps ($inout0,$inout3); # input^=tweak | ||
1243 | &xorps ($inout1,$inout4); | ||
1244 | &xorps ($inout2,$inout2); | ||
1245 | |||
1246 | &call ("_aesni_encrypt3"); | ||
1247 | |||
1248 | &xorps ($inout0,$inout3); # output^=tweak | ||
1249 | &xorps ($inout1,$inout4); | ||
1250 | &movups (&QWP(16*0,$out),$inout0); # write output | ||
1251 | &movups (&QWP(16*1,$out),$inout1); | ||
1252 | &lea ($out,&DWP(16*2,$out)); | ||
1253 | |||
1254 | &movdqa ($tweak,$inout4); # last tweak | ||
1255 | &jmp (&label("xts_enc_done")); | ||
1256 | |||
# xts_enc_three: encrypt exactly 3 remaining full blocks.  Tweaks for
# blocks 0..2 are in $inout3/$inout4/$tweak; the current $tweak is
# parked in $inout5 so it survives _aesni_encrypt3.
&set_label("xts_enc_three",16);
	&movaps	($inout5,$tweak);		# put aside last tweak
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&lea	($inp,&DWP(16*3,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);

	&call	("_aesni_encrypt3");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&lea	($out,&DWP(16*3,$out));

	&movdqa	($tweak,$inout5);		# last tweak
	&jmp	(&label("xts_enc_done"));

# xts_enc_four: encrypt exactly 4 remaining full blocks.  Tweaks for
# blocks 0..1 were spilled to the stack earlier; blocks 2..3 use
# $inout5 and the parked copy of $tweak in $inout4.
&set_label("xts_enc_four",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movups	($inout3,&QWP(16*3,$inp));
	&lea	($inp,&DWP(16*4,$inp));
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&xorps	($inout3,$inout4);

	&call	("_aesni_encrypt4");

	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,$inout4);
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&lea	($out,&DWP(16*4,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_enc_done"));
1307 | |||
# Epilogue of aesni_xts_encrypt: if len%16 != 0, finish with ciphertext
# stealing (IEEE P1619): the tail bytes swap places with the leading
# bytes of the last full ciphertext block, which is then re-encrypted
# under one further tweak.  Afterwards the stack frame is unwound.
&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&and	($len,15);
	&jz	(&label("xts_enc_ret"));
	&movdqa	($inout3,$tweak);
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&jmp	(&label("xts_enc_steal"));

&set_label("xts_enc_done",16);
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&pxor	($twtmp,$twtmp);
	&and	($len,15);
	&jz	(&label("xts_enc_ret"));

	# Advance the tweak one more GF(2^128) doubling step for the
	# stolen final block.
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&pshufd	($inout3,$twtmp,0x13);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
	&pxor	($inout3,$tweak);

# Byte-by-byte steal loop: copy the remaining input bytes over the
# front of the previous ciphertext block while moving that block's
# bytes into the output tail.
&set_label("xts_enc_steal");
	&movz	($rounds,&BP(0,$inp));
	&movz	($key,&BP(-16,$out));
	&lea	($inp,&DWP(1,$inp));
	&mov	(&BP(-16,$out),&LB($rounds));
	&mov	(&BP(0,$out),&LB($key));
	&lea	($out,&DWP(1,$out));
	&sub	($len,1);
	&jnz	(&label("xts_enc_steal"));

	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds

	# Re-encrypt the patched final full block with the extra tweak.
	&movups	($inout0,&QWP(-16,$out));	# load input
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(-16,$out),$inout0);	# write output

&set_label("xts_enc_ret");
	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
&function_end("aesni_xts_encrypt");
1355 | |||
# void aesni_xts_decrypt(const char *inp,char *out,size_t len,
#			 const AES_KEY *key1,const AES_KEY *key2,
#			 const unsigned char iv[16]);
# XTS-AES decryption, mirror image of aesni_xts_encrypt: the initial
# tweak is the IV encrypted under key2, successive tweaks are obtained
# by doubling in GF(2^128) (paddq + conditional xor with 0x87), data is
# decrypted 6 blocks per iteration with aesdec, and a trailing partial
# block is handled by ciphertext stealing -- with the last two tweaks
# applied in swapped order, as XTS decryption requires.
&function_begin("aesni_xts_decrypt");
	&mov	($key,&wparam(4));		# key2
	&mov	($inp,&wparam(5));		# clear-text tweak

	&mov	($rounds,&DWP(240,$key));	# key2->rounds
	&movups	($inout0,&QWP(0,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));		# key1

	&mov	($key_,"esp");
	&sub	("esp",16*7+8);			# 6 tweak slots + magic + saves
	&and	("esp",-16);			# align stack

	# If there is a partial tail block, set the last full block
	# aside for the ciphertext-stealing path.
	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
	&test	($len,15);
	&setnz	(&LB($rounds_));
	&shl	($rounds_,4);
	&sub	($len,$rounds_);

	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
	&mov	(&DWP(16*6+4,"esp"),0);
	&mov	(&DWP(16*6+8,"esp"),1);
	&mov	(&DWP(16*6+12,"esp"),0);
	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp

	&mov	($rounds,&DWP(240,$key));	# key1->rounds
	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds

	&movdqa	($tweak,$inout0);
	&pxor	($twtmp,$twtmp);
	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits

	&and	($len,-16);
	&sub	($len,16*6);
	&jc	(&label("xts_dec_short"));

	&shr	($rounds,1);			# loop counts half-rounds
	&mov	($rounds_,$rounds);
	&jmp	(&label("xts_dec_loop6"));

# Main loop: 6 blocks per iteration.  Tweaks 0..3 are spilled to the
# stack as they are generated; tweak 4 ends up in $inout5 and tweak 5
# in $tweak.
&set_label("xts_dec_loop6",16);
	for ($i=0;$i<4;$i++) {
	    &pshufd	($twres,$twtmp,0x13);
	    &pxor	($twtmp,$twtmp);
	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
	    &pand	($twres,$twmask);	# isolate carry and residue
	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
	    &pxor	($tweak,$twres);
	}
	&pshufd	($inout5,$twtmp,0x13);
	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&$movekey	($rndkey0,&QWP(0,$key_));
	&pand	($inout5,$twmask);		# isolate carry and residue
	&movups	($inout0,&QWP(0,$inp));		# load input
	&pxor	($inout5,$tweak);

	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
	&movdqu	($inout1,&QWP(16*1,$inp));
	&xorps	($inout0,$rndkey0);		# input^=rndkey[0]
	&movdqu	($inout2,&QWP(16*2,$inp));
	&pxor	($inout1,$rndkey0);
	&movdqu	($inout3,&QWP(16*3,$inp));
	&pxor	($inout2,$rndkey0);
	&movdqu	($inout4,&QWP(16*4,$inp));
	&pxor	($inout3,$rndkey0);
	&movdqu	($rndkey1,&QWP(16*5,$inp));
	&pxor	($inout4,$rndkey0);
	&lea	($inp,&DWP(16*6,$inp));
	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
	&pxor	($inout5,$rndkey1);

	&$movekey	($rndkey1,&QWP(16,$key_));
	&lea	($key,&DWP(32,$key_));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&aesdec	($inout0,$rndkey1);
	&pxor	($inout2,&QWP(16*2,"esp"));
	&aesdec	($inout1,$rndkey1);
	&pxor	($inout3,&QWP(16*3,"esp"));
	&dec	($rounds);
	&aesdec	($inout2,$rndkey1);
	&pxor	($inout4,&QWP(16*4,"esp"));
	&aesdec	($inout3,$rndkey1);
	&pxor	($inout5,$rndkey0);
	&aesdec	($inout4,$rndkey1);
	&$movekey	($rndkey0,&QWP(0,$key));
	&aesdec	($inout5,$rndkey1);
	&call	(&label("_aesni_decrypt6_enter"));

	# Un-whiten outputs with their tweaks, store, and compute the
	# next iteration's first tweak interleaved with the stores.
	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
	&pxor	($twtmp,$twtmp);
	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
	&xorps	($inout1,&QWP(16*1,"esp"));
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout2,&QWP(16*2,"esp"));
	&movups	(&QWP(16*1,$out),$inout1);
	&xorps	($inout3,&QWP(16*3,"esp"));
	&movups	(&QWP(16*2,$out),$inout2);
	&xorps	($inout4,&QWP(16*4,"esp"));
	&movups	(&QWP(16*3,$out),$inout3);
	&xorps	($inout5,$tweak);
	&movups	(&QWP(16*4,$out),$inout4);
	&pshufd	($twres,$twtmp,0x13);
	&movups	(&QWP(16*5,$out),$inout5);
	&lea	($out,&DWP(16*6,$out));
	&movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87

	&pxor	($twtmp,$twtmp);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&mov	($rounds,$rounds_);		# restore $rounds
	&pxor	($tweak,$twres);

	&sub	($len,16*6);
	&jnc	(&label("xts_dec_loop6"));

	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
	&mov	($key,$key_);			# restore $key
	&mov	($rounds_,$rounds);

# 1..5 full blocks remain: advance tweaks as needed and dispatch on
# the residual length.
&set_label("xts_dec_short");
	&add	($len,16*6);
	&jz	(&label("xts_dec_done6x"));

	&movdqa	($inout3,$tweak);		# put aside previous tweak
	&cmp	($len,0x20);
	&jb	(&label("xts_dec_one"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&je	(&label("xts_dec_two"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($inout4,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&cmp	($len,0x40);
	&jb	(&label("xts_dec_three"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($inout5,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&movdqa	(&QWP(16*0,"esp"),$inout3);
	&movdqa	(&QWP(16*1,"esp"),$inout4);
	&je	(&label("xts_dec_four"));

	# Five blocks: spill tweaks 2..3 too, derive the fifth tweak.
	&movdqa	(&QWP(16*2,"esp"),$inout5);
	&pshufd	($inout5,$twtmp,0x13);
	&movdqa	(&QWP(16*3,"esp"),$tweak);
	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
	&pand	($inout5,$twmask);		# isolate carry and residue
	&pxor	($inout5,$tweak);

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movdqu	($inout3,&QWP(16*3,$inp));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&movdqu	($inout4,&QWP(16*4,$inp));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&lea	($inp,&DWP(16*5,$inp));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
	&pxor	($inout4,$inout5);

	&call	("_aesni_decrypt6");

	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,&QWP(16*2,"esp"));
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,&QWP(16*3,"esp"));
	&movups	(&QWP(16*1,$out),$inout1);
	&xorps	($inout4,$tweak);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&movups	(&QWP(16*4,$out),$inout4);
	&lea	($out,&DWP(16*5,$out));
	&jmp	(&label("xts_dec_done"));

&set_label("xts_dec_one",16);
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&lea	($inp,&DWP(16*1,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&lea	($out,&DWP(16*1,$out));

	&movdqa	($tweak,$inout3);		# last tweak
	&jmp	(&label("xts_dec_done"));

&set_label("xts_dec_two",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&lea	($inp,&DWP(16*2,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);

	&call	("_aesni_decrypt3");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&lea	($out,&DWP(16*2,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_dec_done"));

&set_label("xts_dec_three",16);
	&movaps	($inout5,$tweak);		# put aside last tweak
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&lea	($inp,&DWP(16*3,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);

	&call	("_aesni_decrypt3");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&lea	($out,&DWP(16*3,$out));

	&movdqa	($tweak,$inout5);		# last tweak
	&jmp	(&label("xts_dec_done"));

&set_label("xts_dec_four",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movups	($inout3,&QWP(16*3,$inp));
	&lea	($inp,&DWP(16*4,$inp));
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&xorps	($inout3,$inout4);

	&call	("_aesni_decrypt4");

	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,$inout4);
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&lea	($out,&DWP(16*4,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_dec_done"));

# Ciphertext-stealing epilogue.  Unlike encryption, decryption consumes
# the final two tweaks in swapped order: the last *full* block is
# decrypted with tweak n+1 first (xts_dec_only_one_more), then the
# stolen block with tweak n (saved in $inout4).
&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&and	($len,15);
	&jz	(&label("xts_dec_ret"));
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&jmp	(&label("xts_dec_only_one_more"));

&set_label("xts_dec_done",16);
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&pxor	($twtmp,$twtmp);
	&and	($len,15);
	&jz	(&label("xts_dec_ret"));

	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($twmask,&QWP(16*6,"esp"));
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);

&set_label("xts_dec_only_one_more");
	&pshufd	($inout3,$twtmp,0x13);
	&movdqa	($inout4,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($inout3,$twmask);		# isolate carry and residue
	&pxor	($inout3,$tweak);

	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds

	# Decrypt the last full block with the *later* tweak ($inout3).
	&movups	($inout0,&QWP(0,$inp));		# load input
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(0,$out),$inout0);		# write output

# Byte-swap loop moving tail bytes into place for the stolen block.
&set_label("xts_dec_steal");
	&movz	($rounds,&BP(16,$inp));
	&movz	($key,&BP(0,$out));
	&lea	($inp,&DWP(1,$inp));
	&mov	(&BP(0,$out),&LB($rounds));
	&mov	(&BP(16,$out),&LB($key));
	&lea	($out,&DWP(1,$out));
	&sub	($len,1);
	&jnz	(&label("xts_dec_steal"));

	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds

	# Decrypt the patched block with the earlier tweak ($inout4).
	&movups	($inout0,&QWP(0,$out));		# load input
	&xorps	($inout0,$inout4);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout4);		# output^=tweak
	&movups	(&QWP(0,$out),$inout0);		# write output

&set_label("xts_dec_ret");
	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
&function_end("aesni_xts_decrypt");
1717 | } | ||
1718 | } | ||
1719 | |||
1720 | ###################################################################### | ||
1721 | # void $PREFIX_cbc_encrypt (const void *inp, void *out, | ||
1722 | # size_t length, const AES_KEY *key, | ||
1723 | # unsigned char *ivp,const int enc); | ||
&function_begin("${PREFIX}_cbc_encrypt");
	# CBC encrypt/decrypt; enc flag (wparam(5)) selects direction.
	# Encryption is strictly serial (one block at a time); decryption
	# is parallelized six blocks per iteration.  A small aligned
	# scratch frame holds the IV and the original %esp.
	&mov	($inp,&wparam(0));
	&mov	($rounds_,"esp");
	&mov	($out,&wparam(1));
	&sub	($rounds_,24);
	&mov	($len,&wparam(2));
	&and	($rounds_,-16);
	&mov	($key,&wparam(3));
	&mov	($key_,&wparam(4));
	&test	($len,$len);
	&jz	(&label("cbc_abort"));		# len==0: nothing to do

	&cmp	(&wparam(5),0);
	&xchg	($rounds_,"esp");		# alloca
	&movups	($ivec,&QWP(0,$key_));		# load IV
	&mov	($rounds,&DWP(240,$key));
	&mov	($key_,$key);			# backup $key
	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
	&mov	($rounds_,$rounds);		# backup $rounds
	&je	(&label("cbc_decrypt"));

	&movaps	($inout0,$ivec);
	&cmp	($len,16);
	&jb	(&label("cbc_enc_tail"));
	&sub	($len,16);
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movups	($ivec,&QWP(0,$inp));		# input actually
	&lea	($inp,&DWP(16,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
	else
	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
	&mov	($rounds,$rounds_);		# restore $rounds
	&mov	($key,$key_);			# restore $key
	&movups	(&QWP(0,$out),$inout0);		# store output
	&lea	($out,&DWP(16,$out));
	&sub	($len,16);
	&jnc	(&label("cbc_enc_loop"));
	&add	($len,16);
	&jnz	(&label("cbc_enc_tail"));
	&movaps	($ivec,$inout0);
	&jmp	(&label("cbc_ret"));

# Partial final block: copy the tail in place, zero-pad to 16 bytes,
# then run it through the loop once more with $inp==$out.
&set_label("cbc_enc_tail");
	&mov	("ecx",$len);			# zaps $rounds
	&data_word(0xA4F3F689);			# rep movsb
	&mov	("ecx",16);			# zero tail
	&sub	("ecx",$len);
	&xor	("eax","eax");			# zaps $len
	&data_word(0xAAF3F689);			# rep stosb
	&lea	($out,&DWP(-16,$out));		# rewind $out by 1 block
	&mov	($rounds,$rounds_);		# restore $rounds
	&mov	($inp,$out);			# $inp and $out are the same
	&mov	($key,$key_);			# restore $key
	&jmp	(&label("cbc_enc_loop"));
######################################################################
&set_label("cbc_decrypt",16);
	&cmp	($len,0x50);
	&jbe	(&label("cbc_dec_tail"));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_loop6_enter"));

&set_label("cbc_dec_loop6",16);
	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
	&movups	(&QWP(0,$out),$inout5);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_loop6_enter");
	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));

	&call	("_aesni_decrypt6");

	# CBC un-chain: each plaintext ^= previous ciphertext (or IV).
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout4,$rndkey0);
	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
	&xorps	($inout5,$rndkey1);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&lea	($inp,&DWP(0x60,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&mov	($rounds,$rounds_);		# restore $rounds
	&movups	(&QWP(0x30,$out),$inout3);
	&mov	($key,$key_);			# restore $key
	&movups	(&QWP(0x40,$out),$inout4);
	&lea	($out,&DWP(0x50,$out));
	&sub	($len,0x60);
	&ja	(&label("cbc_dec_loop6"));

	&movaps	($inout0,$inout5);
	&movaps	($ivec,$rndkey0);
	&add	($len,0x50);
	&jle	(&label("cbc_dec_tail_collected"));
	&movups	(&QWP(0,$out),$inout0);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&movaps	($in0,$inout0);
	&cmp	($len,0x10);
	&jbe	(&label("cbc_dec_one"));

	&movups	($inout1,&QWP(0x10,$inp));
	&movaps	($in1,$inout1);
	&cmp	($len,0x20);
	&jbe	(&label("cbc_dec_two"));

	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x30);
	&jbe	(&label("cbc_dec_three"));

	&movups	($inout3,&QWP(0x30,$inp));
	&cmp	($len,0x40);
	&jbe	(&label("cbc_dec_four"));

	# Five blocks left: pad with a zero sixth block and reuse the
	# six-wide decrypt.
	&movups	($inout4,&QWP(0x40,$inp));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&movups	($inout0,&QWP(0,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($ivec,&QWP(0x40,$inp));	# IV
	&xorps	($inout4,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&lea	($out,&DWP(0x40,$out));
	&movaps	($inout0,$inout4);
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$ivec);
	&movaps	($ivec,$in0);
	&sub	($len,0x10);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two",16);
	&xorps	($inout2,$inout2);
	&call	("_aesni_decrypt3");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout1);
	&lea	($out,&DWP(0x10,$out));
	&movaps	($ivec,$in1);
	&sub	($len,0x20);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three",16);
	&call	("_aesni_decrypt3");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&xorps	($inout2,$in1);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout2);
	&movups	(&QWP(0x10,$out),$inout1);
	&lea	($out,&DWP(0x20,$out));
	&movups	($ivec,&QWP(0x20,$inp));
	&sub	($len,0x30);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	($rndkey1,&QWP(0x10,$inp));
	&movups	($rndkey0,&QWP(0x20,$inp));
	&xorps	($inout0,$ivec);
	&movups	($ivec,&QWP(0x30,$inp));
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&lea	($out,&DWP(0x30,$out));
	&movaps	($inout0,$inout3);
	&sub	($len,0x40);

&set_label("cbc_dec_tail_collected");
	&and	($len,15);
	&jnz	(&label("cbc_dec_tail_partial"));
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("cbc_ret"));

# Partial last block: stage the full plaintext block on the stack and
# copy only the requested number of bytes out.
&set_label("cbc_dec_tail_partial",16);
	&movaps	(&QWP(0,"esp"),$inout0);
	&mov	("ecx",16);
	&mov	($inp,"esp");
	&sub	("ecx",$len);
	&data_word(0xA4F3F689);			# rep movsb

&set_label("cbc_ret");
	&mov	("esp",&DWP(16,"esp"));		# pull original %esp
	&mov	($key_,&wparam(4));
	&movups	(&QWP(0,$key_),$ivec);		# output IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
1946 | |||
1947 | ###################################################################### | ||
1948 | # Mechanical port from aesni-x86_64.pl. | ||
1949 | # | ||
1950 | # _aesni_set_encrypt_key is private interface, | ||
1951 | # input: | ||
1952 | # "eax" const unsigned char *userKey | ||
1953 | # $rounds int bits | ||
1954 | # $key AES_KEY *key | ||
1955 | # output: | ||
1956 | # "eax" return code | ||
1957 | # $round rounds | ||
1958 | |||
&function_begin_B("_aesni_set_encrypt_key");
	# Reject NULL userKey ("eax") or NULL AES_KEY pointer ($key),
	# then dispatch on key size in bits.
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	&movups	("xmm0",&QWP(0,"eax"));		# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");		# low dword of xmm4 is assumed 0
	&lea	($key,&DWP(16,$key));
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

# AES-128 key schedule: 10 rounds, one aeskeygenassist per round with
# its round constant, expanded by the key_128/key_128_cold helper.
&set_label("10rounds",16);
	&mov	($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);	# round 1
	&call	(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);	# round 2
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);	# round 3
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);	# round 4
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);	# round 5
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);	# round 6
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);	# round 7
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);	# round 8
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);	# round 9
	&call	(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);	# round 10
	&call	(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov	(&DWP(80,$key),$rounds);	# key->rounds = 9
	&xor	("eax","eax");			# return 0 == success
	&ret();
2002 | |||
# key_128/key_128_cold: one AES-128 expansion step.  On entry xmm1
# holds aeskeygenassist output; the shufps/xorps ladder folds the
# previous round key (xmm0) into itself word by word, then xors in the
# rotated+substituted word broadcast from xmm1's top dword.  key_128
# additionally stores the previous round key and advances $key.
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps	("xmm4","xmm0",0b00010000);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&xorps	("xmm0","xmm4");
	&shufps	("xmm1","xmm1",0b11111111);	# critical path
	&xorps	("xmm0","xmm1");
	&ret();
2014 | |||
# AES-192 key schedule: 12 rounds.  Each aeskeygenassist step yields
# material spanning one and a half round keys, expanded alternately by
# the key_192a/key_192b helpers below.
&set_label("12rounds",16);
	&movq	("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&mov	($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);	# round 1,2
	&call	(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);	# round 2,3
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);	# round 4,5
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);	# round 5,6
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);	# round 7,8
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);	# round 8,9
	&call	(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);	# round 10,11
	&call	(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);	# round 11,12
	&call	(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov	(&DWP(48,$key),$rounds);	# key->rounds = 11
	&xor	("eax","eax");			# return 0 == success
	&ret();
2039 | |||
# key_192a/key_192b(+warm): AES-192 expansion helpers.  The 192-bit
# sliding key state lives in xmm0 (128 bits) and the low half of xmm2
# (64 bits); xmm1 holds aeskeygenassist output.  key_192a stores one
# aligned round key and falls into the shared expansion; key_192b first
# repacks the straddling halves (via xmm5/xmm3) into two round keys.
&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea	($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps	("xmm5","xmm2");
&set_label("key_192b_warm");
	&shufps	("xmm4","xmm0",0b00010000);
	&movdqa	("xmm3","xmm2");
	&xorps	("xmm0","xmm4");
	&shufps	("xmm4","xmm0",0b10001100);
	&pslldq	("xmm3",4);
	&xorps	("xmm0","xmm4");
	&pshufd	("xmm1","xmm1",0b01010101);	# critical path
	&pxor	("xmm2","xmm3");
	&pxor	("xmm0","xmm1");
	&pshufd	("xmm3","xmm0",0b11111111);
	&pxor	("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	&movaps	("xmm3","xmm0");
	&shufps	("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps	("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea	($key,&DWP(32,$key));
	&jmp	(&label("key_192b_warm"));
2067 | |||
# AES-256 key schedule: 14 rounds.  Round keys alternate between the
# two 128-bit key halves (xmm0/xmm2); key_256a performs the rcon step
# on xmm2's output, key_256b the intermediate no-rcon step.
&set_label("14rounds",16);
	&movups	("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&mov	($rounds,13);
	&lea	($key,&DWP(16,$key));
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);	# round 2
	&call	(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);	# round 3
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);	# round 4
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);	# round 5
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);	# round 6
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);	# round 7
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);	# round 8
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);	# round 9
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);	# round 10
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);	# round 11
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);	# round 12
	&call	(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);	# round 13
	&call	(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);	# round 14
	&call	(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov	(&DWP(16,$key),$rounds);	# key->rounds = 13
	&xor	("eax","eax");			# return 0 == success
	&ret();
2104 | |||
2105 | &set_label("key_256a",16); | ||
2106 | &$movekey (&QWP(0,$key),"xmm2"); | ||
2107 | &lea ($key,&DWP(16,$key)); | ||
2108 | &set_label("key_256a_cold"); | ||
2109 | &shufps ("xmm4","xmm0",0b00010000); | ||
2110 | &xorps ("xmm0","xmm4"); | ||
2111 | &shufps ("xmm4","xmm0",0b10001100); | ||
2112 | &xorps ("xmm0","xmm4"); | ||
2113 | &shufps ("xmm1","xmm1",0b11111111); # critical path | ||
2114 | &xorps ("xmm0","xmm1"); | ||
2115 | &ret(); | ||
2116 | |||
2117 | &set_label("key_256b",16); | ||
2118 | &$movekey (&QWP(0,$key),"xmm0"); | ||
2119 | &lea ($key,&DWP(16,$key)); | ||
2120 | |||
2121 | &shufps ("xmm4","xmm2",0b00010000); | ||
2122 | &xorps ("xmm2","xmm4"); | ||
2123 | &shufps ("xmm4","xmm2",0b10001100); | ||
2124 | &xorps ("xmm2","xmm4"); | ||
2125 | &shufps ("xmm1","xmm1",0b10101010); # critical path | ||
2126 | &xorps ("xmm2","xmm1"); | ||
2127 | &ret(); | ||
2128 | |||
2129 | &set_label("bad_pointer",4); | ||
2130 | &mov ("eax",-1); | ||
2131 | &ret (); | ||
2132 | &set_label("bad_keybits",4); | ||
2133 | &mov ("eax",-2); | ||
2134 | &ret (); | ||
2135 | &function_end_B("_aesni_set_encrypt_key"); | ||
2136 | |||
2137 | # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, | ||
2138 | # AES_KEY *key) | ||
2139 | &function_begin_B("${PREFIX}_set_encrypt_key"); | ||
2140 | &mov ("eax",&wparam(0)); | ||
2141 | &mov ($rounds,&wparam(1)); | ||
2142 | &mov ($key,&wparam(2)); | ||
2143 | &call ("_aesni_set_encrypt_key"); | ||
2144 | &ret (); | ||
2145 | &function_end_B("${PREFIX}_set_encrypt_key"); | ||
2146 | |||
2147 | # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, | ||
2148 | # AES_KEY *key) | ||
2149 | &function_begin_B("${PREFIX}_set_decrypt_key"); | ||
2150 | &mov ("eax",&wparam(0)); | ||
2151 | &mov ($rounds,&wparam(1)); | ||
2152 | &mov ($key,&wparam(2)); | ||
2153 | &call ("_aesni_set_encrypt_key"); | ||
2154 | &mov ($key,&wparam(2)); | ||
2155 | &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key | ||
2156 | &test ("eax","eax"); | ||
2157 | &jnz (&label("dec_key_ret")); | ||
2158 | &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule | ||
2159 | |||
2160 | &$movekey ("xmm0",&QWP(0,$key)); # just swap | ||
2161 | &$movekey ("xmm1",&QWP(0,"eax")); | ||
2162 | &$movekey (&QWP(0,"eax"),"xmm0"); | ||
2163 | &$movekey (&QWP(0,$key),"xmm1"); | ||
2164 | &lea ($key,&DWP(16,$key)); | ||
2165 | &lea ("eax",&DWP(-16,"eax")); | ||
2166 | |||
2167 | &set_label("dec_key_inverse"); | ||
2168 | &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse | ||
2169 | &$movekey ("xmm1",&QWP(0,"eax")); | ||
2170 | &aesimc ("xmm0","xmm0"); | ||
2171 | &aesimc ("xmm1","xmm1"); | ||
2172 | &lea ($key,&DWP(16,$key)); | ||
2173 | &lea ("eax",&DWP(-16,"eax")); | ||
2174 | &$movekey (&QWP(16,"eax"),"xmm0"); | ||
2175 | &$movekey (&QWP(-16,$key),"xmm1"); | ||
2176 | &cmp ("eax",$key); | ||
2177 | &ja (&label("dec_key_inverse")); | ||
2178 | |||
2179 | &$movekey ("xmm0",&QWP(0,$key)); # inverse middle | ||
2180 | &aesimc ("xmm0","xmm0"); | ||
2181 | &$movekey (&QWP(0,$key),"xmm0"); | ||
2182 | |||
2183 | &xor ("eax","eax"); # return success | ||
2184 | &set_label("dec_key_ret"); | ||
2185 | &ret (); | ||
2186 | &function_end_B("${PREFIX}_set_decrypt_key"); | ||
2187 | &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); | ||
2188 | |||
2189 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl new file mode 100644 index 0000000000..c9c6312fa7 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl | |||
@@ -0,0 +1,3044 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | ################################################################### | ||
4 | ### AES-128 [originally in CTR mode] ### | ||
5 | ### bitsliced implementation for Intel Core 2 processors ### | ||
6 | ### requires support of SSE extensions up to SSSE3 ### | ||
7 | ### Author: Emilia Käsper and Peter Schwabe ### | ||
8 | ### Date: 2009-03-19 ### | ||
9 | ### Public domain ### | ||
10 | ### ### | ||
11 | ### See http://homes.esat.kuleuven.be/~ekasper/#software for ### | ||
12 | ### further information. ### | ||
13 | ################################################################### | ||
14 | # | ||
15 | # September 2011. | ||
16 | # | ||
17 | # Started as transliteration to "perlasm" the original code has | ||
18 | # undergone following changes: | ||
19 | # | ||
20 | # - code was made position-independent; | ||
21 | # - rounds were folded into a loop resulting in >5x size reduction | ||
22 | # from 12.5KB to 2.2KB; | ||
23 | # - above was possibile thanks to mixcolumns() modification that | ||
24 | # allowed to feed its output back to aesenc[last], this was | ||
25 | # achieved at cost of two additional inter-registers moves; | ||
26 | # - some instruction reordering and interleaving; | ||
27 | # - this module doesn't implement key setup subroutine, instead it | ||
28 | # relies on conversion of "conventional" key schedule as returned | ||
29 | # by AES_set_encrypt_key (see discussion below); | ||
30 | # - first and last round keys are treated differently, which allowed | ||
31 | # to skip one shiftrows(), reduce bit-sliced key schedule and | ||
32 | # speed-up conversion by 22%; | ||
33 | # - support for 192- and 256-bit keys was added; | ||
34 | # | ||
35 | # Resulting performance in CPU cycles spent to encrypt one byte out | ||
36 | # of 4096-byte buffer with 128-bit key is: | ||
37 | # | ||
38 | # Emilia's this(*) difference | ||
39 | # | ||
40 | # Core 2 9.30 8.69 +7% | ||
41 | # Nehalem(**) 7.63 6.98 +9% | ||
42 | # Atom 17.1 17.4 -2%(***) | ||
43 | # | ||
44 | # (*) Comparison is not completely fair, because "this" is ECB, | ||
45 | # i.e. no extra processing such as counter values calculation | ||
46 | # and xor-ing input as in Emilia's CTR implementation is | ||
47 | # performed. However, the CTR calculations stand for not more | ||
48 | # than 1% of total time, so comparison is *rather* fair. | ||
49 | # | ||
50 | # (**) Results were collected on Westmere, which is considered to | ||
51 | # be equivalent to Nehalem for this code. | ||
52 | # | ||
53 | # (***) Slowdown on Atom is rather strange per se, because original | ||
54 | # implementation has a number of 9+-bytes instructions, which | ||
55 | # are bad for Atom front-end, and which I eliminated completely. | ||
56 | # In attempt to address deterioration sbox() was tested in FP | ||
57 | # SIMD "domain" (movaps instead of movdqa, xorps instead of | ||
58 | # pxor, etc.). While it resulted in nominal 4% improvement on | ||
59 | # Atom, it hurted Westmere by more than 2x factor. | ||
60 | # | ||
61 | # As for key schedule conversion subroutine. Interface to OpenSSL | ||
62 | # relies on per-invocation on-the-fly conversion. This naturally | ||
63 | # has impact on performance, especially for short inputs. Conversion | ||
64 | # time in CPU cycles and its ratio to CPU cycles spent in 8x block | ||
65 | # function is: | ||
66 | # | ||
67 | # conversion conversion/8x block | ||
68 | # Core 2 240 0.22 | ||
69 | # Nehalem 180 0.20 | ||
70 | # Atom 430 0.19 | ||
71 | # | ||
72 | # The ratio values mean that 128-byte blocks will be processed | ||
73 | # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, | ||
74 | # etc. Then keep in mind that input sizes not divisible by 128 are | ||
75 | # *effectively* slower, especially shortest ones, e.g. consecutive | ||
76 | # 144-byte blocks are processed 44% slower than one would expect, | ||
77 | # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" | ||
78 | # it's still faster than ["hyper-threading-safe" code path in] | ||
79 | # aes-x86_64.pl on all lengths above 64 bytes... | ||
80 | # | ||
81 | # October 2011. | ||
82 | # | ||
83 | # Add decryption procedure. Performance in CPU cycles spent to decrypt | ||
84 | # one byte out of 4096-byte buffer with 128-bit key is: | ||
85 | # | ||
86 | # Core 2 11.0 | ||
87 | # Nehalem 9.16 | ||
88 | # Atom 20.9 | ||
89 | # | ||
90 | # November 2011. | ||
91 | # | ||
92 | # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is | ||
93 | # suboptimal, but XTS is meant to be used with larger blocks... | ||
94 | # | ||
95 | # <appro@openssl.org> | ||
96 | |||
97 | $flavour = shift; | ||
98 | $output = shift; | ||
99 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
100 | |||
101 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
102 | |||
103 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
104 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
105 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
106 | die "can't locate x86_64-xlate.pl"; | ||
107 | |||
108 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
109 | |||
110 | my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); | ||
111 | my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) | ||
112 | my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... | ||
113 | |||
114 | { | ||
115 | my ($key,$rounds,$const)=("%rax","%r10d","%r11"); | ||
116 | |||
117 | sub Sbox { | ||
118 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
119 | # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb | ||
120 | my @b=@_[0..7]; | ||
121 | my @t=@_[8..11]; | ||
122 | my @s=@_[12..15]; | ||
123 | &InBasisChange (@b); | ||
124 | &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); | ||
125 | &OutBasisChange (@b[7,1,4,2,6,5,0,3]); | ||
126 | } | ||
127 | |||
128 | sub InBasisChange { | ||
129 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
130 | # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb | ||
131 | my @b=@_[0..7]; | ||
132 | $code.=<<___; | ||
133 | pxor @b[6], @b[5] | ||
134 | pxor @b[1], @b[2] | ||
135 | pxor @b[0], @b[3] | ||
136 | pxor @b[2], @b[6] | ||
137 | pxor @b[0], @b[5] | ||
138 | |||
139 | pxor @b[3], @b[6] | ||
140 | pxor @b[7], @b[3] | ||
141 | pxor @b[5], @b[7] | ||
142 | pxor @b[4], @b[3] | ||
143 | pxor @b[5], @b[4] | ||
144 | pxor @b[1], @b[3] | ||
145 | |||
146 | pxor @b[7], @b[2] | ||
147 | pxor @b[5], @b[1] | ||
148 | ___ | ||
149 | } | ||
150 | |||
151 | sub OutBasisChange { | ||
152 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
153 | # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb | ||
154 | my @b=@_[0..7]; | ||
155 | $code.=<<___; | ||
156 | pxor @b[6], @b[0] | ||
157 | pxor @b[4], @b[1] | ||
158 | pxor @b[0], @b[2] | ||
159 | pxor @b[6], @b[4] | ||
160 | pxor @b[1], @b[6] | ||
161 | |||
162 | pxor @b[5], @b[1] | ||
163 | pxor @b[3], @b[5] | ||
164 | pxor @b[7], @b[3] | ||
165 | pxor @b[5], @b[7] | ||
166 | pxor @b[5], @b[2] | ||
167 | |||
168 | pxor @b[7], @b[4] | ||
169 | ___ | ||
170 | } | ||
171 | |||
172 | sub InvSbox { | ||
173 | # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | ||
174 | # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb | ||
175 | my @b=@_[0..7]; | ||
176 | my @t=@_[8..11]; | ||
177 | my @s=@_[12..15]; | ||
178 | &InvInBasisChange (@b); | ||
179 | &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); | ||
180 | &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); | ||
181 | } | ||
182 | |||
183 | sub InvInBasisChange { # OutBasisChange in reverse | ||
184 | my @b=@_[5,1,2,6,3,7,0,4]; | ||
185 | $code.=<<___ | ||
186 | pxor @b[7], @b[4] | ||
187 | |||
188 | pxor @b[5], @b[7] | ||
189 | pxor @b[5], @b[2] | ||
190 | pxor @b[7], @b[3] | ||
191 | pxor @b[3], @b[5] | ||
192 | pxor @b[5], @b[1] | ||
193 | |||
194 | pxor @b[1], @b[6] | ||
195 | pxor @b[0], @b[2] | ||
196 | pxor @b[6], @b[4] | ||
197 | pxor @b[6], @b[0] | ||
198 | pxor @b[4], @b[1] | ||
199 | ___ | ||
200 | } | ||
201 | |||
202 | sub InvOutBasisChange { # InBasisChange in reverse | ||
203 | my @b=@_[2,5,7,3,6,1,0,4]; | ||
204 | $code.=<<___; | ||
205 | pxor @b[5], @b[1] | ||
206 | pxor @b[7], @b[2] | ||
207 | |||
208 | pxor @b[1], @b[3] | ||
209 | pxor @b[5], @b[4] | ||
210 | pxor @b[5], @b[7] | ||
211 | pxor @b[4], @b[3] | ||
212 | pxor @b[0], @b[5] | ||
213 | pxor @b[7], @b[3] | ||
214 | pxor @b[2], @b[6] | ||
215 | pxor @b[1], @b[2] | ||
216 | pxor @b[3], @b[6] | ||
217 | |||
218 | pxor @b[0], @b[3] | ||
219 | pxor @b[6], @b[5] | ||
220 | ___ | ||
221 | } | ||
222 | |||
223 | sub Mul_GF4 { | ||
224 | #;************************************************************* | ||
225 | #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * | ||
226 | #;************************************************************* | ||
227 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
228 | $code.=<<___; | ||
229 | movdqa $y0, $t0 | ||
230 | pxor $y1, $t0 | ||
231 | pand $x0, $t0 | ||
232 | pxor $x1, $x0 | ||
233 | pand $y0, $x1 | ||
234 | pand $y1, $x0 | ||
235 | pxor $x1, $x0 | ||
236 | pxor $t0, $x1 | ||
237 | ___ | ||
238 | } | ||
239 | |||
240 | sub Mul_GF4_N { # not used, see next subroutine | ||
241 | # multiply and scale by N | ||
242 | my ($x0,$x1,$y0,$y1,$t0)=@_; | ||
243 | $code.=<<___; | ||
244 | movdqa $y0, $t0 | ||
245 | pxor $y1, $t0 | ||
246 | pand $x0, $t0 | ||
247 | pxor $x1, $x0 | ||
248 | pand $y0, $x1 | ||
249 | pand $y1, $x0 | ||
250 | pxor $x0, $x1 | ||
251 | pxor $t0, $x0 | ||
252 | ___ | ||
253 | } | ||
254 | |||
255 | sub Mul_GF4_N_GF4 { | ||
256 | # interleaved Mul_GF4_N and Mul_GF4 | ||
257 | my ($x0,$x1,$y0,$y1,$t0, | ||
258 | $x2,$x3,$y2,$y3,$t1)=@_; | ||
259 | $code.=<<___; | ||
260 | movdqa $y0, $t0 | ||
261 | movdqa $y2, $t1 | ||
262 | pxor $y1, $t0 | ||
263 | pxor $y3, $t1 | ||
264 | pand $x0, $t0 | ||
265 | pand $x2, $t1 | ||
266 | pxor $x1, $x0 | ||
267 | pxor $x3, $x2 | ||
268 | pand $y0, $x1 | ||
269 | pand $y2, $x3 | ||
270 | pand $y1, $x0 | ||
271 | pand $y3, $x2 | ||
272 | pxor $x0, $x1 | ||
273 | pxor $x3, $x2 | ||
274 | pxor $t0, $x0 | ||
275 | pxor $t1, $x3 | ||
276 | ___ | ||
277 | } | ||
278 | sub Mul_GF16_2 { | ||
279 | my @x=@_[0..7]; | ||
280 | my @y=@_[8..11]; | ||
281 | my @t=@_[12..15]; | ||
282 | $code.=<<___; | ||
283 | movdqa @x[0], @t[0] | ||
284 | movdqa @x[1], @t[1] | ||
285 | ___ | ||
286 | &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); | ||
287 | $code.=<<___; | ||
288 | pxor @x[2], @t[0] | ||
289 | pxor @x[3], @t[1] | ||
290 | pxor @y[2], @y[0] | ||
291 | pxor @y[3], @y[1] | ||
292 | ___ | ||
293 | Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
294 | @x[2], @x[3], @y[2], @y[3], @t[2]); | ||
295 | $code.=<<___; | ||
296 | pxor @t[0], @x[0] | ||
297 | pxor @t[0], @x[2] | ||
298 | pxor @t[1], @x[1] | ||
299 | pxor @t[1], @x[3] | ||
300 | |||
301 | movdqa @x[4], @t[0] | ||
302 | movdqa @x[5], @t[1] | ||
303 | pxor @x[6], @t[0] | ||
304 | pxor @x[7], @t[1] | ||
305 | ___ | ||
306 | &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | ||
307 | @x[6], @x[7], @y[2], @y[3], @t[2]); | ||
308 | $code.=<<___; | ||
309 | pxor @y[2], @y[0] | ||
310 | pxor @y[3], @y[1] | ||
311 | ___ | ||
312 | &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); | ||
313 | $code.=<<___; | ||
314 | pxor @t[0], @x[4] | ||
315 | pxor @t[0], @x[6] | ||
316 | pxor @t[1], @x[5] | ||
317 | pxor @t[1], @x[7] | ||
318 | ___ | ||
319 | } | ||
320 | sub Inv_GF256 { | ||
321 | #;******************************************************************** | ||
322 | #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * | ||
323 | #;******************************************************************** | ||
324 | my @x=@_[0..7]; | ||
325 | my @t=@_[8..11]; | ||
326 | my @s=@_[12..15]; | ||
327 | # direct optimizations from hardware | ||
328 | $code.=<<___; | ||
329 | movdqa @x[4], @t[3] | ||
330 | movdqa @x[5], @t[2] | ||
331 | movdqa @x[1], @t[1] | ||
332 | movdqa @x[7], @s[1] | ||
333 | movdqa @x[0], @s[0] | ||
334 | |||
335 | pxor @x[6], @t[3] | ||
336 | pxor @x[7], @t[2] | ||
337 | pxor @x[3], @t[1] | ||
338 | movdqa @t[3], @s[2] | ||
339 | pxor @x[6], @s[1] | ||
340 | movdqa @t[2], @t[0] | ||
341 | pxor @x[2], @s[0] | ||
342 | movdqa @t[3], @s[3] | ||
343 | |||
344 | por @t[1], @t[2] | ||
345 | por @s[0], @t[3] | ||
346 | pxor @t[0], @s[3] | ||
347 | pand @s[0], @s[2] | ||
348 | pxor @t[1], @s[0] | ||
349 | pand @t[1], @t[0] | ||
350 | pand @s[0], @s[3] | ||
351 | movdqa @x[3], @s[0] | ||
352 | pxor @x[2], @s[0] | ||
353 | pand @s[0], @s[1] | ||
354 | pxor @s[1], @t[3] | ||
355 | pxor @s[1], @t[2] | ||
356 | movdqa @x[4], @s[1] | ||
357 | movdqa @x[1], @s[0] | ||
358 | pxor @x[5], @s[1] | ||
359 | pxor @x[0], @s[0] | ||
360 | movdqa @s[1], @t[1] | ||
361 | pand @s[0], @s[1] | ||
362 | por @s[0], @t[1] | ||
363 | pxor @s[1], @t[0] | ||
364 | pxor @s[3], @t[3] | ||
365 | pxor @s[2], @t[2] | ||
366 | pxor @s[3], @t[1] | ||
367 | movdqa @x[7], @s[0] | ||
368 | pxor @s[2], @t[0] | ||
369 | movdqa @x[6], @s[1] | ||
370 | pxor @s[2], @t[1] | ||
371 | movdqa @x[5], @s[2] | ||
372 | pand @x[3], @s[0] | ||
373 | movdqa @x[4], @s[3] | ||
374 | pand @x[2], @s[1] | ||
375 | pand @x[1], @s[2] | ||
376 | por @x[0], @s[3] | ||
377 | pxor @s[0], @t[3] | ||
378 | pxor @s[1], @t[2] | ||
379 | pxor @s[2], @t[1] | ||
380 | pxor @s[3], @t[0] | ||
381 | |||
382 | #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 | ||
383 | |||
384 | # new smaller inversion | ||
385 | |||
386 | movdqa @t[3], @s[0] | ||
387 | pand @t[1], @t[3] | ||
388 | pxor @t[2], @s[0] | ||
389 | |||
390 | movdqa @t[0], @s[2] | ||
391 | movdqa @s[0], @s[3] | ||
392 | pxor @t[3], @s[2] | ||
393 | pand @s[2], @s[3] | ||
394 | |||
395 | movdqa @t[1], @s[1] | ||
396 | pxor @t[2], @s[3] | ||
397 | pxor @t[0], @s[1] | ||
398 | |||
399 | pxor @t[2], @t[3] | ||
400 | |||
401 | pand @t[3], @s[1] | ||
402 | |||
403 | movdqa @s[2], @t[2] | ||
404 | pxor @t[0], @s[1] | ||
405 | |||
406 | pxor @s[1], @t[2] | ||
407 | pxor @s[1], @t[1] | ||
408 | |||
409 | pand @t[0], @t[2] | ||
410 | |||
411 | pxor @t[2], @s[2] | ||
412 | pxor @t[2], @t[1] | ||
413 | |||
414 | pand @s[3], @s[2] | ||
415 | |||
416 | pxor @s[0], @s[2] | ||
417 | ___ | ||
418 | # output in s3, s2, s1, t1 | ||
419 | |||
420 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 | ||
421 | |||
422 | # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 | ||
423 | &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); | ||
424 | |||
425 | ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb | ||
426 | } | ||
427 | |||
428 | # AES linear components | ||
429 | |||
430 | sub ShiftRows { | ||
431 | my @x=@_[0..7]; | ||
432 | my $mask=pop; | ||
433 | $code.=<<___; | ||
434 | pxor 0x00($key),@x[0] | ||
435 | pxor 0x10($key),@x[1] | ||
436 | pshufb $mask,@x[0] | ||
437 | pxor 0x20($key),@x[2] | ||
438 | pshufb $mask,@x[1] | ||
439 | pxor 0x30($key),@x[3] | ||
440 | pshufb $mask,@x[2] | ||
441 | pxor 0x40($key),@x[4] | ||
442 | pshufb $mask,@x[3] | ||
443 | pxor 0x50($key),@x[5] | ||
444 | pshufb $mask,@x[4] | ||
445 | pxor 0x60($key),@x[6] | ||
446 | pshufb $mask,@x[5] | ||
447 | pxor 0x70($key),@x[7] | ||
448 | pshufb $mask,@x[6] | ||
449 | lea 0x80($key),$key | ||
450 | pshufb $mask,@x[7] | ||
451 | ___ | ||
452 | } | ||
453 | |||
454 | sub MixColumns { | ||
455 | # modified to emit output in order suitable for feeding back to aesenc[last] | ||
456 | my @x=@_[0..7]; | ||
457 | my @t=@_[8..15]; | ||
458 | $code.=<<___; | ||
459 | pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 | ||
460 | pshufd \$0x93, @x[1], @t[1] | ||
461 | pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) | ||
462 | pshufd \$0x93, @x[2], @t[2] | ||
463 | pxor @t[1], @x[1] | ||
464 | pshufd \$0x93, @x[3], @t[3] | ||
465 | pxor @t[2], @x[2] | ||
466 | pshufd \$0x93, @x[4], @t[4] | ||
467 | pxor @t[3], @x[3] | ||
468 | pshufd \$0x93, @x[5], @t[5] | ||
469 | pxor @t[4], @x[4] | ||
470 | pshufd \$0x93, @x[6], @t[6] | ||
471 | pxor @t[5], @x[5] | ||
472 | pshufd \$0x93, @x[7], @t[7] | ||
473 | pxor @t[6], @x[6] | ||
474 | pxor @t[7], @x[7] | ||
475 | |||
476 | pxor @x[0], @t[1] | ||
477 | pxor @x[7], @t[0] | ||
478 | pxor @x[7], @t[1] | ||
479 | pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) | ||
480 | pxor @x[1], @t[2] | ||
481 | pshufd \$0x4E, @x[1], @x[1] | ||
482 | pxor @x[4], @t[5] | ||
483 | pxor @t[0], @x[0] | ||
484 | pxor @x[5], @t[6] | ||
485 | pxor @t[1], @x[1] | ||
486 | pxor @x[3], @t[4] | ||
487 | pshufd \$0x4E, @x[4], @t[0] | ||
488 | pxor @x[6], @t[7] | ||
489 | pshufd \$0x4E, @x[5], @t[1] | ||
490 | pxor @x[2], @t[3] | ||
491 | pshufd \$0x4E, @x[3], @x[4] | ||
492 | pxor @x[7], @t[3] | ||
493 | pshufd \$0x4E, @x[7], @x[5] | ||
494 | pxor @x[7], @t[4] | ||
495 | pshufd \$0x4E, @x[6], @x[3] | ||
496 | pxor @t[4], @t[0] | ||
497 | pshufd \$0x4E, @x[2], @x[6] | ||
498 | pxor @t[5], @t[1] | ||
499 | |||
500 | pxor @t[3], @x[4] | ||
501 | pxor @t[7], @x[5] | ||
502 | pxor @t[6], @x[3] | ||
503 | movdqa @t[0], @x[2] | ||
504 | pxor @t[2], @x[6] | ||
505 | movdqa @t[1], @x[7] | ||
506 | ___ | ||
507 | } | ||
508 | |||
509 | sub InvMixColumns { | ||
510 | my @x=@_[0..7]; | ||
511 | my @t=@_[8..15]; | ||
512 | |||
513 | $code.=<<___; | ||
514 | # multiplication by 0x0e | ||
515 | pshufd \$0x93, @x[7], @t[7] | ||
516 | movdqa @x[2], @t[2] | ||
517 | pxor @x[5], @x[7] # 7 5 | ||
518 | pxor @x[5], @x[2] # 2 5 | ||
519 | pshufd \$0x93, @x[0], @t[0] | ||
520 | movdqa @x[5], @t[5] | ||
521 | pxor @x[0], @x[5] # 5 0 [1] | ||
522 | pxor @x[1], @x[0] # 0 1 | ||
523 | pshufd \$0x93, @x[1], @t[1] | ||
524 | pxor @x[2], @x[1] # 1 25 | ||
525 | pxor @x[6], @x[0] # 01 6 [2] | ||
526 | pxor @x[3], @x[1] # 125 3 [4] | ||
527 | pshufd \$0x93, @x[3], @t[3] | ||
528 | pxor @x[0], @x[2] # 25 016 [3] | ||
529 | pxor @x[7], @x[3] # 3 75 | ||
530 | pxor @x[6], @x[7] # 75 6 [0] | ||
531 | pshufd \$0x93, @x[6], @t[6] | ||
532 | movdqa @x[4], @t[4] | ||
533 | pxor @x[4], @x[6] # 6 4 | ||
534 | pxor @x[3], @x[4] # 4 375 [6] | ||
535 | pxor @x[7], @x[3] # 375 756=36 | ||
536 | pxor @t[5], @x[6] # 64 5 [7] | ||
537 | pxor @t[2], @x[3] # 36 2 | ||
538 | pxor @t[4], @x[3] # 362 4 [5] | ||
539 | pshufd \$0x93, @t[5], @t[5] | ||
540 | ___ | ||
541 | my @y = @x[7,5,0,2,1,3,4,6]; | ||
542 | $code.=<<___; | ||
543 | # multiplication by 0x0b | ||
544 | pxor @y[0], @y[1] | ||
545 | pxor @t[0], @y[0] | ||
546 | pxor @t[1], @y[1] | ||
547 | pshufd \$0x93, @t[2], @t[2] | ||
548 | pxor @t[5], @y[0] | ||
549 | pxor @t[6], @y[1] | ||
550 | pxor @t[7], @y[0] | ||
551 | pshufd \$0x93, @t[4], @t[4] | ||
552 | pxor @t[6], @t[7] # clobber t[7] | ||
553 | pxor @y[0], @y[1] | ||
554 | |||
555 | pxor @t[0], @y[3] | ||
556 | pshufd \$0x93, @t[0], @t[0] | ||
557 | pxor @t[1], @y[2] | ||
558 | pxor @t[1], @y[4] | ||
559 | pxor @t[2], @y[2] | ||
560 | pshufd \$0x93, @t[1], @t[1] | ||
561 | pxor @t[2], @y[3] | ||
562 | pxor @t[2], @y[5] | ||
563 | pxor @t[7], @y[2] | ||
564 | pshufd \$0x93, @t[2], @t[2] | ||
565 | pxor @t[3], @y[3] | ||
566 | pxor @t[3], @y[6] | ||
567 | pxor @t[3], @y[4] | ||
568 | pshufd \$0x93, @t[3], @t[3] | ||
569 | pxor @t[4], @y[7] | ||
570 | pxor @t[4], @y[5] | ||
571 | pxor @t[7], @y[7] | ||
572 | pxor @t[5], @y[3] | ||
573 | pxor @t[4], @y[4] | ||
574 | pxor @t[5], @t[7] # clobber t[7] even more | ||
575 | |||
576 | pxor @t[7], @y[5] | ||
577 | pshufd \$0x93, @t[4], @t[4] | ||
578 | pxor @t[7], @y[6] | ||
579 | pxor @t[7], @y[4] | ||
580 | |||
581 | pxor @t[5], @t[7] | ||
582 | pshufd \$0x93, @t[5], @t[5] | ||
583 | pxor @t[6], @t[7] # restore t[7] | ||
584 | |||
585 | # multiplication by 0x0d | ||
586 | pxor @y[7], @y[4] | ||
587 | pxor @t[4], @y[7] | ||
588 | pshufd \$0x93, @t[6], @t[6] | ||
589 | pxor @t[0], @y[2] | ||
590 | pxor @t[5], @y[7] | ||
591 | pxor @t[2], @y[2] | ||
592 | pshufd \$0x93, @t[7], @t[7] | ||
593 | |||
594 | pxor @y[1], @y[3] | ||
595 | pxor @t[1], @y[1] | ||
596 | pxor @t[0], @y[0] | ||
597 | pxor @t[0], @y[3] | ||
598 | pxor @t[5], @y[1] | ||
599 | pxor @t[5], @y[0] | ||
600 | pxor @t[7], @y[1] | ||
601 | pshufd \$0x93, @t[0], @t[0] | ||
602 | pxor @t[6], @y[0] | ||
603 | pxor @y[1], @y[3] | ||
604 | pxor @t[1], @y[4] | ||
605 | pshufd \$0x93, @t[1], @t[1] | ||
606 | |||
607 | pxor @t[7], @y[7] | ||
608 | pxor @t[2], @y[4] | ||
609 | pxor @t[2], @y[5] | ||
610 | pshufd \$0x93, @t[2], @t[2] | ||
611 | pxor @t[6], @y[2] | ||
612 | pxor @t[3], @t[6] # clobber t[6] | ||
613 | pxor @y[7], @y[4] | ||
614 | pxor @t[6], @y[3] | ||
615 | |||
616 | pxor @t[6], @y[6] | ||
617 | pxor @t[5], @y[5] | ||
618 | pxor @t[4], @y[6] | ||
619 | pshufd \$0x93, @t[4], @t[4] | ||
620 | pxor @t[6], @y[5] | ||
621 | pxor @t[7], @y[6] | ||
622 | pxor @t[3], @t[6] # restore t[6] | ||
623 | |||
624 | pshufd \$0x93, @t[5], @t[5] | ||
625 | pshufd \$0x93, @t[6], @t[6] | ||
626 | pshufd \$0x93, @t[7], @t[7] | ||
627 | pshufd \$0x93, @t[3], @t[3] | ||
628 | |||
629 | # multiplication by 0x09 | ||
630 | pxor @y[1], @y[4] | ||
631 | pxor @y[1], @t[1] # t[1]=y[1] | ||
632 | pxor @t[5], @t[0] # clobber t[0] | ||
633 | pxor @t[5], @t[1] | ||
634 | pxor @t[0], @y[3] | ||
635 | pxor @y[0], @t[0] # t[0]=y[0] | ||
636 | pxor @t[6], @t[1] | ||
637 | pxor @t[7], @t[6] # clobber t[6] | ||
638 | pxor @t[1], @y[4] | ||
639 | pxor @t[4], @y[7] | ||
640 | pxor @y[4], @t[4] # t[4]=y[4] | ||
641 | pxor @t[3], @y[6] | ||
642 | pxor @y[3], @t[3] # t[3]=y[3] | ||
643 | pxor @t[2], @y[5] | ||
644 | pxor @y[2], @t[2] # t[2]=y[2] | ||
645 | pxor @t[7], @t[3] | ||
646 | pxor @y[5], @t[5] # t[5]=y[5] | ||
647 | pxor @t[6], @t[2] | ||
648 | pxor @t[6], @t[5] | ||
649 | pxor @y[6], @t[6] # t[6]=y[6] | ||
650 | pxor @y[7], @t[7] # t[7]=y[7] | ||
651 | |||
652 | movdqa @t[0],@XMM[0] | ||
653 | movdqa @t[1],@XMM[1] | ||
654 | movdqa @t[2],@XMM[2] | ||
655 | movdqa @t[3],@XMM[3] | ||
656 | movdqa @t[4],@XMM[4] | ||
657 | movdqa @t[5],@XMM[5] | ||
658 | movdqa @t[6],@XMM[6] | ||
659 | movdqa @t[7],@XMM[7] | ||
660 | ___ | ||
661 | } | ||
662 | |||
663 | sub aesenc { # not used | ||
664 | my @b=@_[0..7]; | ||
665 | my @t=@_[8..15]; | ||
666 | $code.=<<___; | ||
667 | movdqa 0x30($const),@t[0] # .LSR | ||
668 | ___ | ||
669 | &ShiftRows (@b,@t[0]); | ||
670 | &Sbox (@b,@t); | ||
671 | &MixColumns (@b[0,1,4,6,3,7,2,5],@t); | ||
672 | } | ||
673 | |||
674 | sub aesenclast { # not used | ||
675 | my @b=@_[0..7]; | ||
676 | my @t=@_[8..15]; | ||
677 | $code.=<<___; | ||
678 | movdqa 0x40($const),@t[0] # .LSRM0 | ||
679 | ___ | ||
680 | &ShiftRows (@b,@t[0]); | ||
681 | &Sbox (@b,@t); | ||
682 | $code.=<<___ | ||
683 | pxor 0x00($key),@b[0] | ||
684 | pxor 0x10($key),@b[1] | ||
685 | pxor 0x20($key),@b[4] | ||
686 | pxor 0x30($key),@b[6] | ||
687 | pxor 0x40($key),@b[3] | ||
688 | pxor 0x50($key),@b[7] | ||
689 | pxor 0x60($key),@b[2] | ||
690 | pxor 0x70($key),@b[5] | ||
691 | ___ | ||
692 | } | ||
693 | |||
694 | sub swapmove { | ||
695 | my ($a,$b,$n,$mask,$t)=@_; | ||
696 | $code.=<<___; | ||
697 | movdqa $b,$t | ||
698 | psrlq \$$n,$b | ||
699 | pxor $a,$b | ||
700 | pand $mask,$b | ||
701 | pxor $b,$a | ||
702 | psllq \$$n,$b | ||
703 | pxor $t,$b | ||
704 | ___ | ||
705 | } | ||
706 | sub swapmove2x { | ||
707 | my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; | ||
708 | $code.=<<___; | ||
709 | movdqa $b0,$t0 | ||
710 | psrlq \$$n,$b0 | ||
711 | movdqa $b1,$t1 | ||
712 | psrlq \$$n,$b1 | ||
713 | pxor $a0,$b0 | ||
714 | pxor $a1,$b1 | ||
715 | pand $mask,$b0 | ||
716 | pand $mask,$b1 | ||
717 | pxor $b0,$a0 | ||
718 | psllq \$$n,$b0 | ||
719 | pxor $b1,$a1 | ||
720 | psllq \$$n,$b1 | ||
721 | pxor $t0,$b0 | ||
722 | pxor $t1,$b1 | ||
723 | ___ | ||
724 | } | ||
725 | |||
726 | sub bitslice { | ||
727 | my @x=reverse(@_[0..7]); | ||
728 | my ($t0,$t1,$t2,$t3)=@_[8..11]; | ||
729 | $code.=<<___; | ||
730 | movdqa 0x00($const),$t0 # .LBS0 | ||
731 | movdqa 0x10($const),$t1 # .LBS1 | ||
732 | ___ | ||
733 | &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); | ||
734 | &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | ||
735 | $code.=<<___; | ||
736 | movdqa 0x20($const),$t0 # .LBS2 | ||
737 | ___ | ||
738 | &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); | ||
739 | &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | ||
740 | |||
741 | &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); | ||
742 | &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); | ||
743 | } | ||
744 | |||
745 | $code.=<<___; | ||
746 | .text | ||
747 | |||
748 | .extern asm_AES_encrypt | ||
749 | .extern asm_AES_decrypt | ||
750 | |||
751 | .type _bsaes_encrypt8,\@abi-omnipotent | ||
752 | .align 64 | ||
753 | _bsaes_encrypt8: | ||
754 | lea .LBS0(%rip), $const # constants table | ||
755 | |||
756 | movdqa ($key), @XMM[9] # round 0 key | ||
757 | lea 0x10($key), $key | ||
758 | movdqa 0x50($const), @XMM[8] # .LM0SR | ||
759 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
760 | pxor @XMM[9], @XMM[1] | ||
761 | pshufb @XMM[8], @XMM[0] | ||
762 | pxor @XMM[9], @XMM[2] | ||
763 | pshufb @XMM[8], @XMM[1] | ||
764 | pxor @XMM[9], @XMM[3] | ||
765 | pshufb @XMM[8], @XMM[2] | ||
766 | pxor @XMM[9], @XMM[4] | ||
767 | pshufb @XMM[8], @XMM[3] | ||
768 | pxor @XMM[9], @XMM[5] | ||
769 | pshufb @XMM[8], @XMM[4] | ||
770 | pxor @XMM[9], @XMM[6] | ||
771 | pshufb @XMM[8], @XMM[5] | ||
772 | pxor @XMM[9], @XMM[7] | ||
773 | pshufb @XMM[8], @XMM[6] | ||
774 | pshufb @XMM[8], @XMM[7] | ||
775 | _bsaes_encrypt8_bitslice: | ||
776 | ___ | ||
# Tail of the _bsaes_encrypt8 generator, followed by _bsaes_decrypt8.
# Pattern for both: bit-slice the eight 128-bit inputs, run the round
# loop (ShiftRows/Sbox plus MixColumns while rounds remain), un-slice,
# then XOR in the last round key.  $rounds is decremented before the
# loop and once per iteration; "jl" exits after the final (MixColumns-
# free) round.  The decrypt path uses the inverse transforms and the
# inverse shuffle constants (.LM0ISR/.LISR/.LISRM0).
&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
# .LSR is the normal ShiftRows table; .LSRM0 is selected for the last
# full iteration (when $rounds has just hit zero).
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	# (hence the permuted register order in the bitslice call and the
	# pxor sequence below)
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	# note the different output-register permutation for decryption:
	# lsb > [t0, t1, t6, t4, t2, t7, t3, t5] < msb
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
877 | } | ||
{
# Key-conversion section.  Register aliases local to this scope:
# $out = key-schedule destination, $inp = AES key schedule source,
# $rounds = round counter, $const = constants-table pointer.
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

# bitslice_key: 3-step swapmove network that transposes one broadcast
# round key (replicated in @_[0..7]) into bit-sliced form.  Because all
# eight inputs start identical, the 1-bit and 2-bit swap steps can copy
# registers instead of performing the full swapmove (see the commented-
# out calls), saving work versus the generic &bitslice.
sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

# _bsaes_key_convert: turn a standard AES key schedule ($inp) into the
# bit-sliced representation ($out).  Round 0 key is stored as-is; each
# subsequent round key is expanded to 8x16 bytes by testing each bit
# against the .Lmasks constants (with selective "pnot" fixups).  The
# last round key is left in %xmm6 for the caller to fix up; %xmm7 ends
# up holding the .L63 constant.
$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0,	%xmm8
	movdqa	%xmm1,	%xmm9

	pand	%xmm6,	%xmm8
	pand	%xmm6,	%xmm9
	movdqa	%xmm2,	%xmm10
	pcmpeqb	%xmm0,	%xmm8
	psllq	\$4,	%xmm0		# 0x10...
	movdqa	%xmm3,	%xmm11
	pcmpeqb	%xmm1,	%xmm9
	psllq	\$4,	%xmm1		# 0x20...

	pand	%xmm6,	%xmm10
	pand	%xmm6,	%xmm11
	movdqa	%xmm0,	%xmm12
	pcmpeqb	%xmm2,	%xmm10
	psllq	\$4,	%xmm2		# 0x40...
	movdqa	%xmm1,	%xmm13
	pcmpeqb	%xmm3,	%xmm11
	psllq	\$4,	%xmm3		# 0x80...

	movdqa	%xmm2,	%xmm14
	movdqa	%xmm3,	%xmm15
	pxor	%xmm5,	%xmm8		# "pnot"
	pxor	%xmm5,	%xmm9

	pand	%xmm6,	%xmm12
	pand	%xmm6,	%xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0,	%xmm12
	psrlq	\$4,	%xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1,	%xmm13
	psrlq	\$4,	%xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6,	%xmm14
	pand	%xmm6,	%xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2,	%xmm14
	psrlq	\$4,	%xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3,	%xmm15
	psrlq	\$4,	%xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5,	%xmm13		# "pnot"
	pxor	%xmm5,	%xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}
990 | |||
# Disabled ("if (0 ...)") raw 128-bit-key entry points kept only for
# benchmarking.  They bypass argument/length validation and assume a
# multiple of 0x80 bytes, 10 rounds, and a pre-converted key schedule.
if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
1088 | { | ||
######################################################################
#
# OpenSSL interface
#
# $arg1..$arg6 are the first six C-ABI argument registers for the
# calling convention in effect (Win64 vs. System V AMD64); $inp/$out/
# $len/$key are callee-saved registers used to hold the arguments
# across internal calls.
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

# ECB entry points (emitted only when $ecb is set).  Layout of both:
# prologue (push callee-saved regs, Win64 also saves %xmm6-15), convert
# the key schedule onto the stack, process 8 blocks per _bsaes_en/decrypt8
# call, dispatch 1..7 leftover blocks through the .L*_one...  .L*_six
# tails, fall back to asm_AES_en/decrypt for fewer than 8 blocks total,
# wipe the on-stack key schedule, restore registers.
if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
# bsaes_cbc_encrypt: only the decrypt direction is bit-sliced.  For
# encryption ($arg6 != 0) or inputs shorter than 128 bytes it tail-calls
# asm_AES_cbc_encrypt.  The decrypt path mirrors the ECB layout but
# keeps the IV across iterations: the IV is parked at 0x20(%rbp) before
# each _bsaes_decrypt8 call, ciphertext blocks are re-loaded afterwards
# for the CBC XOR, and the final ciphertext block becomes the next IV
# (returned through (%rbx) at .Lcbc_dec_done).  The single-block tail
# uses asm_AES_decrypt, which doesn't touch %xmm state.
$code.=<<___;
.extern	asm_AES_cbc_encrypt
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
# Restore callee-saved registers and return; then begin the CTR32
# entry point (its body continues past this chunk).
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
1806 | $code.=<<___ if ($win64); | ||
1807 | mov 0xa0(%rsp),$arg5 # pull ivp | ||
1808 | lea -0xa0(%rsp), %rsp | ||
1809 | movaps %xmm6, 0x40(%rsp) | ||
1810 | movaps %xmm7, 0x50(%rsp) | ||
1811 | movaps %xmm8, 0x60(%rsp) | ||
1812 | movaps %xmm9, 0x70(%rsp) | ||
1813 | movaps %xmm10, 0x80(%rsp) | ||
1814 | movaps %xmm11, 0x90(%rsp) | ||
1815 | movaps %xmm12, 0xa0(%rsp) | ||
1816 | movaps %xmm13, 0xb0(%rsp) | ||
1817 | movaps %xmm14, 0xc0(%rsp) | ||
1818 | movaps %xmm15, 0xd0(%rsp) | ||
1819 | .Lctr_enc_body: | ||
1820 | ___ | ||
1821 | $code.=<<___; | ||
1822 | mov %rsp, %rbp # backup %rsp | ||
1823 | movdqu ($arg5), %xmm0 # load counter | ||
1824 | mov 240($arg4), %eax # rounds | ||
1825 | mov $arg1, $inp # backup arguments | ||
1826 | mov $arg2, $out | ||
1827 | mov $arg3, $len | ||
1828 | mov $arg4, $key | ||
1829 | movdqa %xmm0, 0x20(%rbp) # copy counter | ||
1830 | cmp \$8, $arg3 | ||
1831 | jb .Lctr_enc_short | ||
1832 | |||
1833 | mov %eax, %ebx # rounds | ||
1834 | shl \$7, %rax # 128 bytes per inner round key | ||
1835 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
1836 | sub %rax, %rsp | ||
1837 | |||
1838 | mov %rsp, %rax # pass key schedule | ||
1839 | mov $key, %rcx # pass key | ||
1840 | mov %ebx, %r10d # pass rounds | ||
1841 | call _bsaes_key_convert | ||
1842 | pxor %xmm6,%xmm7 # fix up last round key | ||
1843 | movdqa %xmm7,(%rax) # save last round key | ||
1844 | |||
1845 | movdqa (%rsp), @XMM[9] # load round0 key | ||
1846 | lea .LADD1(%rip), %r11 | ||
1847 | movdqa 0x20(%rbp), @XMM[0] # counter copy | ||
1848 | movdqa -0x20(%r11), @XMM[8] # .LSWPUP | ||
1849 | pshufb @XMM[8], @XMM[9] # byte swap upper part | ||
1850 | pshufb @XMM[8], @XMM[0] | ||
1851 | movdqa @XMM[9], (%rsp) # save adjusted round0 key | ||
1852 | jmp .Lctr_enc_loop | ||
1853 | .align 16 | ||
1854 | .Lctr_enc_loop: | ||
1855 | movdqa @XMM[0], 0x20(%rbp) # save counter | ||
1856 | movdqa @XMM[0], @XMM[1] # prepare 8 counter values | ||
1857 | movdqa @XMM[0], @XMM[2] | ||
1858 | paddd 0x00(%r11), @XMM[1] # .LADD1 | ||
1859 | movdqa @XMM[0], @XMM[3] | ||
1860 | paddd 0x10(%r11), @XMM[2] # .LADD2 | ||
1861 | movdqa @XMM[0], @XMM[4] | ||
1862 | paddd 0x20(%r11), @XMM[3] # .LADD3 | ||
1863 | movdqa @XMM[0], @XMM[5] | ||
1864 | paddd 0x30(%r11), @XMM[4] # .LADD4 | ||
1865 | movdqa @XMM[0], @XMM[6] | ||
1866 | paddd 0x40(%r11), @XMM[5] # .LADD5 | ||
1867 | movdqa @XMM[0], @XMM[7] | ||
1868 | paddd 0x50(%r11), @XMM[6] # .LADD6 | ||
1869 | paddd 0x60(%r11), @XMM[7] # .LADD7 | ||
1870 | |||
1871 | # Borrow prologue from _bsaes_encrypt8 to use the opportunity | ||
1872 | # to flip byte order in 32-bit counter | ||
1873 | movdqa (%rsp), @XMM[9] # round 0 key | ||
1874 | lea 0x10(%rsp), %rax # pass key schedule | ||
1875 | movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR | ||
1876 | pxor @XMM[9], @XMM[0] # xor with round0 key | ||
1877 | pxor @XMM[9], @XMM[1] | ||
1878 | pshufb @XMM[8], @XMM[0] | ||
1879 | pxor @XMM[9], @XMM[2] | ||
1880 | pshufb @XMM[8], @XMM[1] | ||
1881 | pxor @XMM[9], @XMM[3] | ||
1882 | pshufb @XMM[8], @XMM[2] | ||
1883 | pxor @XMM[9], @XMM[4] | ||
1884 | pshufb @XMM[8], @XMM[3] | ||
1885 | pxor @XMM[9], @XMM[5] | ||
1886 | pshufb @XMM[8], @XMM[4] | ||
1887 | pxor @XMM[9], @XMM[6] | ||
1888 | pshufb @XMM[8], @XMM[5] | ||
1889 | pxor @XMM[9], @XMM[7] | ||
1890 | pshufb @XMM[8], @XMM[6] | ||
1891 | lea .LBS0(%rip), %r11 # constants table | ||
1892 | pshufb @XMM[8], @XMM[7] | ||
1893 | mov %ebx,%r10d # pass rounds | ||
1894 | |||
1895 | call _bsaes_encrypt8_bitslice | ||
1896 | |||
1897 | sub \$8,$len | ||
1898 | jc .Lctr_enc_loop_done | ||
1899 | |||
1900 | movdqu 0x00($inp), @XMM[8] # load input | ||
1901 | movdqu 0x10($inp), @XMM[9] | ||
1902 | movdqu 0x20($inp), @XMM[10] | ||
1903 | movdqu 0x30($inp), @XMM[11] | ||
1904 | movdqu 0x40($inp), @XMM[12] | ||
1905 | movdqu 0x50($inp), @XMM[13] | ||
1906 | movdqu 0x60($inp), @XMM[14] | ||
1907 | movdqu 0x70($inp), @XMM[15] | ||
1908 | lea 0x80($inp),$inp | ||
1909 | pxor @XMM[0], @XMM[8] | ||
1910 | movdqa 0x20(%rbp), @XMM[0] # load counter | ||
1911 | pxor @XMM[9], @XMM[1] | ||
1912 | movdqu @XMM[8], 0x00($out) # write output | ||
1913 | pxor @XMM[10], @XMM[4] | ||
1914 | movdqu @XMM[1], 0x10($out) | ||
1915 | pxor @XMM[11], @XMM[6] | ||
1916 | movdqu @XMM[4], 0x20($out) | ||
1917 | pxor @XMM[12], @XMM[3] | ||
1918 | movdqu @XMM[6], 0x30($out) | ||
1919 | pxor @XMM[13], @XMM[7] | ||
1920 | movdqu @XMM[3], 0x40($out) | ||
1921 | pxor @XMM[14], @XMM[2] | ||
1922 | movdqu @XMM[7], 0x50($out) | ||
1923 | pxor @XMM[15], @XMM[5] | ||
1924 | movdqu @XMM[2], 0x60($out) | ||
1925 | lea .LADD1(%rip), %r11 | ||
1926 | movdqu @XMM[5], 0x70($out) | ||
1927 | lea 0x80($out), $out | ||
1928 | paddd 0x70(%r11), @XMM[0] # .LADD8 | ||
1929 | jnz .Lctr_enc_loop | ||
1930 | |||
1931 | jmp .Lctr_enc_done | ||
1932 | .align 16 | ||
1933 | .Lctr_enc_loop_done: | ||
1934 | add \$8, $len | ||
1935 | movdqu 0x00($inp), @XMM[8] # load input | ||
1936 | pxor @XMM[8], @XMM[0] | ||
1937 | movdqu @XMM[0], 0x00($out) # write output | ||
1938 | cmp \$2,$len | ||
1939 | jb .Lctr_enc_done | ||
1940 | movdqu 0x10($inp), @XMM[9] | ||
1941 | pxor @XMM[9], @XMM[1] | ||
1942 | movdqu @XMM[1], 0x10($out) | ||
1943 | je .Lctr_enc_done | ||
1944 | movdqu 0x20($inp), @XMM[10] | ||
1945 | pxor @XMM[10], @XMM[4] | ||
1946 | movdqu @XMM[4], 0x20($out) | ||
1947 | cmp \$4,$len | ||
1948 | jb .Lctr_enc_done | ||
1949 | movdqu 0x30($inp), @XMM[11] | ||
1950 | pxor @XMM[11], @XMM[6] | ||
1951 | movdqu @XMM[6], 0x30($out) | ||
1952 | je .Lctr_enc_done | ||
1953 | movdqu 0x40($inp), @XMM[12] | ||
1954 | pxor @XMM[12], @XMM[3] | ||
1955 | movdqu @XMM[3], 0x40($out) | ||
1956 | cmp \$6,$len | ||
1957 | jb .Lctr_enc_done | ||
1958 | movdqu 0x50($inp), @XMM[13] | ||
1959 | pxor @XMM[13], @XMM[7] | ||
1960 | movdqu @XMM[7], 0x50($out) | ||
1961 | je .Lctr_enc_done | ||
1962 | movdqu 0x60($inp), @XMM[14] | ||
1963 | pxor @XMM[14], @XMM[2] | ||
1964 | movdqu @XMM[2], 0x60($out) | ||
1965 | jmp .Lctr_enc_done | ||
1966 | |||
1967 | .align 16 | ||
1968 | .Lctr_enc_short: | ||
1969 | lea 0x20(%rbp), $arg1 | ||
1970 | lea 0x30(%rbp), $arg2 | ||
1971 | lea ($key), $arg3 | ||
1972 | call asm_AES_encrypt | ||
1973 | movdqu ($inp), @XMM[1] | ||
1974 | lea 16($inp), $inp | ||
1975 | mov 0x2c(%rbp), %eax # load 32-bit counter | ||
1976 | bswap %eax | ||
1977 | pxor 0x30(%rbp), @XMM[1] | ||
1978 | inc %eax # increment | ||
1979 | movdqu @XMM[1], ($out) | ||
1980 | bswap %eax | ||
1981 | lea 16($out), $out | ||
1982 | mov %eax, 0x2c(%rsp) # save 32-bit counter | ||
1983 | dec $len | ||
1984 | jnz .Lctr_enc_short | ||
1985 | |||
1986 | .Lctr_enc_done: | ||
1987 | lea (%rsp), %rax | ||
1988 | pxor %xmm0, %xmm0 | ||
1989 | .Lctr_enc_bzero: # wipe key schedule [if any] | ||
1990 | movdqa %xmm0, 0x00(%rax) | ||
1991 | movdqa %xmm0, 0x10(%rax) | ||
1992 | lea 0x20(%rax), %rax | ||
1993 | cmp %rax, %rbp | ||
1994 | ja .Lctr_enc_bzero | ||
1995 | |||
1996 | lea (%rbp),%rsp # restore %rsp | ||
1997 | ___ | ||
1998 | $code.=<<___ if ($win64); | ||
1999 | movaps 0x40(%rbp), %xmm6 | ||
2000 | movaps 0x50(%rbp), %xmm7 | ||
2001 | movaps 0x60(%rbp), %xmm8 | ||
2002 | movaps 0x70(%rbp), %xmm9 | ||
2003 | movaps 0x80(%rbp), %xmm10 | ||
2004 | movaps 0x90(%rbp), %xmm11 | ||
2005 | movaps 0xa0(%rbp), %xmm12 | ||
2006 | movaps 0xb0(%rbp), %xmm13 | ||
2007 | movaps 0xc0(%rbp), %xmm14 | ||
2008 | movaps 0xd0(%rbp), %xmm15 | ||
2009 | lea 0xa0(%rbp), %rsp | ||
2010 | ___ | ||
2011 | $code.=<<___; | ||
2012 | mov 0x48(%rsp), %r15 | ||
2013 | mov 0x50(%rsp), %r14 | ||
2014 | mov 0x58(%rsp), %r13 | ||
2015 | mov 0x60(%rsp), %r12 | ||
2016 | mov 0x68(%rsp), %rbx | ||
2017 | mov 0x70(%rsp), %rax | ||
2018 | lea 0x78(%rsp), %rsp | ||
2019 | mov %rax, %rbp | ||
2020 | .Lctr_enc_epilogue: | ||
2021 | ret | ||
2022 | .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks | ||
2023 | ___ | ||
2024 | ###################################################################### | ||
2025 | # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, | ||
2026 | # const AES_KEY *key1, const AES_KEY *key2, | ||
2027 | # const unsigned char iv[16]); | ||
2028 | # | ||
2029 | my ($twmask,$twres,$twtmp)=@XMM[13..15]; | ||
2030 | $code.=<<___; | ||
2031 | .globl bsaes_xts_encrypt | ||
2032 | .type bsaes_xts_encrypt,\@abi-omnipotent | ||
2033 | .align 16 | ||
2034 | bsaes_xts_encrypt: | ||
2035 | mov %rsp, %rax | ||
2036 | .Lxts_enc_prologue: | ||
2037 | push %rbp | ||
2038 | push %rbx | ||
2039 | push %r12 | ||
2040 | push %r13 | ||
2041 | push %r14 | ||
2042 | push %r15 | ||
2043 | lea -0x48(%rsp), %rsp | ||
2044 | ___ | ||
2045 | $code.=<<___ if ($win64); | ||
2046 | mov 0xa0(%rsp),$arg5 # pull key2 | ||
2047 | mov 0xa8(%rsp),$arg6 # pull ivp | ||
2048 | lea -0xa0(%rsp), %rsp | ||
2049 | movaps %xmm6, 0x40(%rsp) | ||
2050 | movaps %xmm7, 0x50(%rsp) | ||
2051 | movaps %xmm8, 0x60(%rsp) | ||
2052 | movaps %xmm9, 0x70(%rsp) | ||
2053 | movaps %xmm10, 0x80(%rsp) | ||
2054 | movaps %xmm11, 0x90(%rsp) | ||
2055 | movaps %xmm12, 0xa0(%rsp) | ||
2056 | movaps %xmm13, 0xb0(%rsp) | ||
2057 | movaps %xmm14, 0xc0(%rsp) | ||
2058 | movaps %xmm15, 0xd0(%rsp) | ||
2059 | .Lxts_enc_body: | ||
2060 | ___ | ||
2061 | $code.=<<___; | ||
2062 | mov %rsp, %rbp # backup %rsp | ||
2063 | mov $arg1, $inp # backup arguments | ||
2064 | mov $arg2, $out | ||
2065 | mov $arg3, $len | ||
2066 | mov $arg4, $key | ||
2067 | |||
2068 | lea ($arg6), $arg1 | ||
2069 | lea 0x20(%rbp), $arg2 | ||
2070 | lea ($arg5), $arg3 | ||
2071 | call asm_AES_encrypt # generate initial tweak | ||
2072 | |||
2073 | mov 240($key), %eax # rounds | ||
2074 | mov $len, %rbx # backup $len | ||
2075 | |||
2076 | mov %eax, %edx # rounds | ||
2077 | shl \$7, %rax # 128 bytes per inner round key | ||
2078 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
2079 | sub %rax, %rsp | ||
2080 | |||
2081 | mov %rsp, %rax # pass key schedule | ||
2082 | mov $key, %rcx # pass key | ||
2083 | mov %edx, %r10d # pass rounds | ||
2084 | call _bsaes_key_convert | ||
2085 | pxor %xmm6, %xmm7 # fix up last round key | ||
2086 | movdqa %xmm7, (%rax) # save last round key | ||
2087 | |||
2088 | and \$-16, $len | ||
2089 | sub \$0x80, %rsp # place for tweak[8] | ||
2090 | movdqa 0x20(%rbp), @XMM[7] # initial tweak | ||
2091 | |||
2092 | pxor $twtmp, $twtmp | ||
2093 | movdqa .Lxts_magic(%rip), $twmask | ||
2094 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2095 | |||
2096 | sub \$0x80, $len | ||
2097 | jc .Lxts_enc_short | ||
2098 | jmp .Lxts_enc_loop | ||
2099 | |||
2100 | .align 16 | ||
2101 | .Lxts_enc_loop: | ||
2102 | ___ | ||
2103 | for ($i=0;$i<7;$i++) { | ||
2104 | $code.=<<___; | ||
2105 | pshufd \$0x13, $twtmp, $twres | ||
2106 | pxor $twtmp, $twtmp | ||
2107 | movdqa @XMM[7], @XMM[$i] | ||
2108 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
2109 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2110 | pand $twmask, $twres # isolate carry and residue | ||
2111 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2112 | pxor $twres, @XMM[7] | ||
2113 | ___ | ||
2114 | $code.=<<___ if ($i>=1); | ||
2115 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
2116 | ___ | ||
2117 | $code.=<<___ if ($i>=2); | ||
2118 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
2119 | ___ | ||
2120 | } | ||
2121 | $code.=<<___; | ||
2122 | movdqu 0x60($inp), @XMM[8+6] | ||
2123 | pxor @XMM[8+5], @XMM[5] | ||
2124 | movdqu 0x70($inp), @XMM[8+7] | ||
2125 | lea 0x80($inp), $inp | ||
2126 | movdqa @XMM[7], 0x70(%rsp) | ||
2127 | pxor @XMM[8+6], @XMM[6] | ||
2128 | lea 0x80(%rsp), %rax # pass key schedule | ||
2129 | pxor @XMM[8+7], @XMM[7] | ||
2130 | mov %edx, %r10d # pass rounds | ||
2131 | |||
2132 | call _bsaes_encrypt8 | ||
2133 | |||
2134 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2135 | pxor 0x10(%rsp), @XMM[1] | ||
2136 | movdqu @XMM[0], 0x00($out) # write output | ||
2137 | pxor 0x20(%rsp), @XMM[4] | ||
2138 | movdqu @XMM[1], 0x10($out) | ||
2139 | pxor 0x30(%rsp), @XMM[6] | ||
2140 | movdqu @XMM[4], 0x20($out) | ||
2141 | pxor 0x40(%rsp), @XMM[3] | ||
2142 | movdqu @XMM[6], 0x30($out) | ||
2143 | pxor 0x50(%rsp), @XMM[7] | ||
2144 | movdqu @XMM[3], 0x40($out) | ||
2145 | pxor 0x60(%rsp), @XMM[2] | ||
2146 | movdqu @XMM[7], 0x50($out) | ||
2147 | pxor 0x70(%rsp), @XMM[5] | ||
2148 | movdqu @XMM[2], 0x60($out) | ||
2149 | movdqu @XMM[5], 0x70($out) | ||
2150 | lea 0x80($out), $out | ||
2151 | |||
2152 | movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak | ||
2153 | pxor $twtmp, $twtmp | ||
2154 | movdqa .Lxts_magic(%rip), $twmask | ||
2155 | pcmpgtd @XMM[7], $twtmp | ||
2156 | pshufd \$0x13, $twtmp, $twres | ||
2157 | pxor $twtmp, $twtmp | ||
2158 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2159 | pand $twmask, $twres # isolate carry and residue | ||
2160 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2161 | pxor $twres, @XMM[7] | ||
2162 | |||
2163 | sub \$0x80,$len | ||
2164 | jnc .Lxts_enc_loop | ||
2165 | |||
2166 | .Lxts_enc_short: | ||
2167 | add \$0x80, $len | ||
2168 | jz .Lxts_enc_done | ||
2169 | ___ | ||
2170 | for ($i=0;$i<7;$i++) { | ||
2171 | $code.=<<___; | ||
2172 | pshufd \$0x13, $twtmp, $twres | ||
2173 | pxor $twtmp, $twtmp | ||
2174 | movdqa @XMM[7], @XMM[$i] | ||
2175 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
2176 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2177 | pand $twmask, $twres # isolate carry and residue | ||
2178 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2179 | pxor $twres, @XMM[7] | ||
2180 | ___ | ||
2181 | $code.=<<___ if ($i>=1); | ||
2182 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
2183 | cmp \$`0x10*$i`,$len | ||
2184 | je .Lxts_enc_$i | ||
2185 | ___ | ||
2186 | $code.=<<___ if ($i>=2); | ||
2187 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
2188 | ___ | ||
2189 | } | ||
2190 | $code.=<<___; | ||
2191 | movdqu 0x60($inp), @XMM[8+6] | ||
2192 | pxor @XMM[8+5], @XMM[5] | ||
2193 | movdqa @XMM[7], 0x70(%rsp) | ||
2194 | lea 0x70($inp), $inp | ||
2195 | pxor @XMM[8+6], @XMM[6] | ||
2196 | lea 0x80(%rsp), %rax # pass key schedule | ||
2197 | mov %edx, %r10d # pass rounds | ||
2198 | |||
2199 | call _bsaes_encrypt8 | ||
2200 | |||
2201 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2202 | pxor 0x10(%rsp), @XMM[1] | ||
2203 | movdqu @XMM[0], 0x00($out) # write output | ||
2204 | pxor 0x20(%rsp), @XMM[4] | ||
2205 | movdqu @XMM[1], 0x10($out) | ||
2206 | pxor 0x30(%rsp), @XMM[6] | ||
2207 | movdqu @XMM[4], 0x20($out) | ||
2208 | pxor 0x40(%rsp), @XMM[3] | ||
2209 | movdqu @XMM[6], 0x30($out) | ||
2210 | pxor 0x50(%rsp), @XMM[7] | ||
2211 | movdqu @XMM[3], 0x40($out) | ||
2212 | pxor 0x60(%rsp), @XMM[2] | ||
2213 | movdqu @XMM[7], 0x50($out) | ||
2214 | movdqu @XMM[2], 0x60($out) | ||
2215 | lea 0x70($out), $out | ||
2216 | |||
2217 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
2218 | jmp .Lxts_enc_done | ||
2219 | .align 16 | ||
2220 | .Lxts_enc_6: | ||
2221 | pxor @XMM[8+4], @XMM[4] | ||
2222 | lea 0x60($inp), $inp | ||
2223 | pxor @XMM[8+5], @XMM[5] | ||
2224 | lea 0x80(%rsp), %rax # pass key schedule | ||
2225 | mov %edx, %r10d # pass rounds | ||
2226 | |||
2227 | call _bsaes_encrypt8 | ||
2228 | |||
2229 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2230 | pxor 0x10(%rsp), @XMM[1] | ||
2231 | movdqu @XMM[0], 0x00($out) # write output | ||
2232 | pxor 0x20(%rsp), @XMM[4] | ||
2233 | movdqu @XMM[1], 0x10($out) | ||
2234 | pxor 0x30(%rsp), @XMM[6] | ||
2235 | movdqu @XMM[4], 0x20($out) | ||
2236 | pxor 0x40(%rsp), @XMM[3] | ||
2237 | movdqu @XMM[6], 0x30($out) | ||
2238 | pxor 0x50(%rsp), @XMM[7] | ||
2239 | movdqu @XMM[3], 0x40($out) | ||
2240 | movdqu @XMM[7], 0x50($out) | ||
2241 | lea 0x60($out), $out | ||
2242 | |||
2243 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
2244 | jmp .Lxts_enc_done | ||
2245 | .align 16 | ||
2246 | .Lxts_enc_5: | ||
2247 | pxor @XMM[8+3], @XMM[3] | ||
2248 | lea 0x50($inp), $inp | ||
2249 | pxor @XMM[8+4], @XMM[4] | ||
2250 | lea 0x80(%rsp), %rax # pass key schedule | ||
2251 | mov %edx, %r10d # pass rounds | ||
2252 | |||
2253 | call _bsaes_encrypt8 | ||
2254 | |||
2255 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2256 | pxor 0x10(%rsp), @XMM[1] | ||
2257 | movdqu @XMM[0], 0x00($out) # write output | ||
2258 | pxor 0x20(%rsp), @XMM[4] | ||
2259 | movdqu @XMM[1], 0x10($out) | ||
2260 | pxor 0x30(%rsp), @XMM[6] | ||
2261 | movdqu @XMM[4], 0x20($out) | ||
2262 | pxor 0x40(%rsp), @XMM[3] | ||
2263 | movdqu @XMM[6], 0x30($out) | ||
2264 | movdqu @XMM[3], 0x40($out) | ||
2265 | lea 0x50($out), $out | ||
2266 | |||
2267 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
2268 | jmp .Lxts_enc_done | ||
2269 | .align 16 | ||
2270 | .Lxts_enc_4: | ||
2271 | pxor @XMM[8+2], @XMM[2] | ||
2272 | lea 0x40($inp), $inp | ||
2273 | pxor @XMM[8+3], @XMM[3] | ||
2274 | lea 0x80(%rsp), %rax # pass key schedule | ||
2275 | mov %edx, %r10d # pass rounds | ||
2276 | |||
2277 | call _bsaes_encrypt8 | ||
2278 | |||
2279 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2280 | pxor 0x10(%rsp), @XMM[1] | ||
2281 | movdqu @XMM[0], 0x00($out) # write output | ||
2282 | pxor 0x20(%rsp), @XMM[4] | ||
2283 | movdqu @XMM[1], 0x10($out) | ||
2284 | pxor 0x30(%rsp), @XMM[6] | ||
2285 | movdqu @XMM[4], 0x20($out) | ||
2286 | movdqu @XMM[6], 0x30($out) | ||
2287 | lea 0x40($out), $out | ||
2288 | |||
2289 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
2290 | jmp .Lxts_enc_done | ||
2291 | .align 16 | ||
2292 | .Lxts_enc_3: | ||
2293 | pxor @XMM[8+1], @XMM[1] | ||
2294 | lea 0x30($inp), $inp | ||
2295 | pxor @XMM[8+2], @XMM[2] | ||
2296 | lea 0x80(%rsp), %rax # pass key schedule | ||
2297 | mov %edx, %r10d # pass rounds | ||
2298 | |||
2299 | call _bsaes_encrypt8 | ||
2300 | |||
2301 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2302 | pxor 0x10(%rsp), @XMM[1] | ||
2303 | movdqu @XMM[0], 0x00($out) # write output | ||
2304 | pxor 0x20(%rsp), @XMM[4] | ||
2305 | movdqu @XMM[1], 0x10($out) | ||
2306 | movdqu @XMM[4], 0x20($out) | ||
2307 | lea 0x30($out), $out | ||
2308 | |||
2309 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
2310 | jmp .Lxts_enc_done | ||
2311 | .align 16 | ||
2312 | .Lxts_enc_2: | ||
2313 | pxor @XMM[8+0], @XMM[0] | ||
2314 | lea 0x20($inp), $inp | ||
2315 | pxor @XMM[8+1], @XMM[1] | ||
2316 | lea 0x80(%rsp), %rax # pass key schedule | ||
2317 | mov %edx, %r10d # pass rounds | ||
2318 | |||
2319 | call _bsaes_encrypt8 | ||
2320 | |||
2321 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2322 | pxor 0x10(%rsp), @XMM[1] | ||
2323 | movdqu @XMM[0], 0x00($out) # write output | ||
2324 | movdqu @XMM[1], 0x10($out) | ||
2325 | lea 0x20($out), $out | ||
2326 | |||
2327 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
2328 | jmp .Lxts_enc_done | ||
2329 | .align 16 | ||
2330 | .Lxts_enc_1: | ||
2331 | pxor @XMM[0], @XMM[8] | ||
2332 | lea 0x10($inp), $inp | ||
2333 | movdqa @XMM[8], 0x20(%rbp) | ||
2334 | lea 0x20(%rbp), $arg1 | ||
2335 | lea 0x20(%rbp), $arg2 | ||
2336 | lea ($key), $arg3 | ||
2337 | call asm_AES_encrypt # doesn't touch %xmm | ||
2338 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
2339 | #pxor @XMM[8], @XMM[0] | ||
2340 | #lea 0x80(%rsp), %rax # pass key schedule | ||
2341 | #mov %edx, %r10d # pass rounds | ||
2342 | #call _bsaes_encrypt8 | ||
2343 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2344 | movdqu @XMM[0], 0x00($out) # write output | ||
2345 | lea 0x10($out), $out | ||
2346 | |||
2347 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
2348 | |||
2349 | .Lxts_enc_done: | ||
2350 | and \$15, %ebx | ||
2351 | jz .Lxts_enc_ret | ||
2352 | mov $out, %rdx | ||
2353 | |||
2354 | .Lxts_enc_steal: | ||
2355 | movzb ($inp), %eax | ||
2356 | movzb -16(%rdx), %ecx | ||
2357 | lea 1($inp), $inp | ||
2358 | mov %al, -16(%rdx) | ||
2359 | mov %cl, 0(%rdx) | ||
2360 | lea 1(%rdx), %rdx | ||
2361 | sub \$1,%ebx | ||
2362 | jnz .Lxts_enc_steal | ||
2363 | |||
2364 | movdqu -16($out), @XMM[0] | ||
2365 | lea 0x20(%rbp), $arg1 | ||
2366 | pxor @XMM[7], @XMM[0] | ||
2367 | lea 0x20(%rbp), $arg2 | ||
2368 | movdqa @XMM[0], 0x20(%rbp) | ||
2369 | lea ($key), $arg3 | ||
2370 | call asm_AES_encrypt # doesn't touch %xmm | ||
2371 | pxor 0x20(%rbp), @XMM[7] | ||
2372 | movdqu @XMM[7], -16($out) | ||
2373 | |||
2374 | .Lxts_enc_ret: | ||
2375 | lea (%rsp), %rax | ||
2376 | pxor %xmm0, %xmm0 | ||
2377 | .Lxts_enc_bzero: # wipe key schedule [if any] | ||
2378 | movdqa %xmm0, 0x00(%rax) | ||
2379 | movdqa %xmm0, 0x10(%rax) | ||
2380 | lea 0x20(%rax), %rax | ||
2381 | cmp %rax, %rbp | ||
2382 | ja .Lxts_enc_bzero | ||
2383 | |||
2384 | lea (%rbp),%rsp # restore %rsp | ||
2385 | ___ | ||
2386 | $code.=<<___ if ($win64); | ||
2387 | movaps 0x40(%rbp), %xmm6 | ||
2388 | movaps 0x50(%rbp), %xmm7 | ||
2389 | movaps 0x60(%rbp), %xmm8 | ||
2390 | movaps 0x70(%rbp), %xmm9 | ||
2391 | movaps 0x80(%rbp), %xmm10 | ||
2392 | movaps 0x90(%rbp), %xmm11 | ||
2393 | movaps 0xa0(%rbp), %xmm12 | ||
2394 | movaps 0xb0(%rbp), %xmm13 | ||
2395 | movaps 0xc0(%rbp), %xmm14 | ||
2396 | movaps 0xd0(%rbp), %xmm15 | ||
2397 | lea 0xa0(%rbp), %rsp | ||
2398 | ___ | ||
2399 | $code.=<<___; | ||
2400 | mov 0x48(%rsp), %r15 | ||
2401 | mov 0x50(%rsp), %r14 | ||
2402 | mov 0x58(%rsp), %r13 | ||
2403 | mov 0x60(%rsp), %r12 | ||
2404 | mov 0x68(%rsp), %rbx | ||
2405 | mov 0x70(%rsp), %rax | ||
2406 | lea 0x78(%rsp), %rsp | ||
2407 | mov %rax, %rbp | ||
2408 | .Lxts_enc_epilogue: | ||
2409 | ret | ||
2410 | .size bsaes_xts_encrypt,.-bsaes_xts_encrypt | ||
2411 | |||
2412 | .globl bsaes_xts_decrypt | ||
2413 | .type bsaes_xts_decrypt,\@abi-omnipotent | ||
2414 | .align 16 | ||
2415 | bsaes_xts_decrypt: | ||
2416 | mov %rsp, %rax | ||
2417 | .Lxts_dec_prologue: | ||
2418 | push %rbp | ||
2419 | push %rbx | ||
2420 | push %r12 | ||
2421 | push %r13 | ||
2422 | push %r14 | ||
2423 | push %r15 | ||
2424 | lea -0x48(%rsp), %rsp | ||
2425 | ___ | ||
2426 | $code.=<<___ if ($win64); | ||
2427 | mov 0xa0(%rsp),$arg5 # pull key2 | ||
2428 | mov 0xa8(%rsp),$arg6 # pull ivp | ||
2429 | lea -0xa0(%rsp), %rsp | ||
2430 | movaps %xmm6, 0x40(%rsp) | ||
2431 | movaps %xmm7, 0x50(%rsp) | ||
2432 | movaps %xmm8, 0x60(%rsp) | ||
2433 | movaps %xmm9, 0x70(%rsp) | ||
2434 | movaps %xmm10, 0x80(%rsp) | ||
2435 | movaps %xmm11, 0x90(%rsp) | ||
2436 | movaps %xmm12, 0xa0(%rsp) | ||
2437 | movaps %xmm13, 0xb0(%rsp) | ||
2438 | movaps %xmm14, 0xc0(%rsp) | ||
2439 | movaps %xmm15, 0xd0(%rsp) | ||
2440 | .Lxts_dec_body: | ||
2441 | ___ | ||
2442 | $code.=<<___; | ||
2443 | mov %rsp, %rbp # backup %rsp | ||
2444 | mov $arg1, $inp # backup arguments | ||
2445 | mov $arg2, $out | ||
2446 | mov $arg3, $len | ||
2447 | mov $arg4, $key | ||
2448 | |||
2449 | lea ($arg6), $arg1 | ||
2450 | lea 0x20(%rbp), $arg2 | ||
2451 | lea ($arg5), $arg3 | ||
2452 | call asm_AES_encrypt # generate initial tweak | ||
2453 | |||
2454 | mov 240($key), %eax # rounds | ||
2455 | mov $len, %rbx # backup $len | ||
2456 | |||
2457 | mov %eax, %edx # rounds | ||
2458 | shl \$7, %rax # 128 bytes per inner round key | ||
2459 | sub \$`128-32`, %rax # size of bit-sliced key schedule | ||
2460 | sub %rax, %rsp | ||
2461 | |||
2462 | mov %rsp, %rax # pass key schedule | ||
2463 | mov $key, %rcx # pass key | ||
2464 | mov %edx, %r10d # pass rounds | ||
2465 | call _bsaes_key_convert | ||
2466 | pxor (%rsp), %xmm7 # fix up round 0 key | ||
2467 | movdqa %xmm6, (%rax) # save last round key | ||
2468 | movdqa %xmm7, (%rsp) | ||
2469 | |||
2470 | xor %eax, %eax # if ($len%16) len-=16; | ||
2471 | and \$-16, $len | ||
2472 | test \$15, %ebx | ||
2473 | setnz %al | ||
2474 | shl \$4, %rax | ||
2475 | sub %rax, $len | ||
2476 | |||
2477 | sub \$0x80, %rsp # place for tweak[8] | ||
2478 | movdqa 0x20(%rbp), @XMM[7] # initial tweak | ||
2479 | |||
2480 | pxor $twtmp, $twtmp | ||
2481 | movdqa .Lxts_magic(%rip), $twmask | ||
2482 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2483 | |||
2484 | sub \$0x80, $len | ||
2485 | jc .Lxts_dec_short | ||
2486 | jmp .Lxts_dec_loop | ||
2487 | |||
2488 | .align 16 | ||
2489 | .Lxts_dec_loop: | ||
2490 | ___ | ||
2491 | for ($i=0;$i<7;$i++) { | ||
2492 | $code.=<<___; | ||
2493 | pshufd \$0x13, $twtmp, $twres | ||
2494 | pxor $twtmp, $twtmp | ||
2495 | movdqa @XMM[7], @XMM[$i] | ||
2496 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
2497 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2498 | pand $twmask, $twres # isolate carry and residue | ||
2499 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2500 | pxor $twres, @XMM[7] | ||
2501 | ___ | ||
2502 | $code.=<<___ if ($i>=1); | ||
2503 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
2504 | ___ | ||
2505 | $code.=<<___ if ($i>=2); | ||
2506 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
2507 | ___ | ||
2508 | } | ||
2509 | $code.=<<___; | ||
2510 | movdqu 0x60($inp), @XMM[8+6] | ||
2511 | pxor @XMM[8+5], @XMM[5] | ||
2512 | movdqu 0x70($inp), @XMM[8+7] | ||
2513 | lea 0x80($inp), $inp | ||
2514 | movdqa @XMM[7], 0x70(%rsp) | ||
2515 | pxor @XMM[8+6], @XMM[6] | ||
2516 | lea 0x80(%rsp), %rax # pass key schedule | ||
2517 | pxor @XMM[8+7], @XMM[7] | ||
2518 | mov %edx, %r10d # pass rounds | ||
2519 | |||
2520 | call _bsaes_decrypt8 | ||
2521 | |||
2522 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2523 | pxor 0x10(%rsp), @XMM[1] | ||
2524 | movdqu @XMM[0], 0x00($out) # write output | ||
2525 | pxor 0x20(%rsp), @XMM[6] | ||
2526 | movdqu @XMM[1], 0x10($out) | ||
2527 | pxor 0x30(%rsp), @XMM[4] | ||
2528 | movdqu @XMM[6], 0x20($out) | ||
2529 | pxor 0x40(%rsp), @XMM[2] | ||
2530 | movdqu @XMM[4], 0x30($out) | ||
2531 | pxor 0x50(%rsp), @XMM[7] | ||
2532 | movdqu @XMM[2], 0x40($out) | ||
2533 | pxor 0x60(%rsp), @XMM[3] | ||
2534 | movdqu @XMM[7], 0x50($out) | ||
2535 | pxor 0x70(%rsp), @XMM[5] | ||
2536 | movdqu @XMM[3], 0x60($out) | ||
2537 | movdqu @XMM[5], 0x70($out) | ||
2538 | lea 0x80($out), $out | ||
2539 | |||
2540 | movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak | ||
2541 | pxor $twtmp, $twtmp | ||
2542 | movdqa .Lxts_magic(%rip), $twmask | ||
2543 | pcmpgtd @XMM[7], $twtmp | ||
2544 | pshufd \$0x13, $twtmp, $twres | ||
2545 | pxor $twtmp, $twtmp | ||
2546 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2547 | pand $twmask, $twres # isolate carry and residue | ||
2548 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2549 | pxor $twres, @XMM[7] | ||
2550 | |||
2551 | sub \$0x80,$len | ||
2552 | jnc .Lxts_dec_loop | ||
2553 | |||
2554 | .Lxts_dec_short: | ||
2555 | add \$0x80, $len | ||
2556 | jz .Lxts_dec_done | ||
2557 | ___ | ||
2558 | for ($i=0;$i<7;$i++) { | ||
2559 | $code.=<<___; | ||
2560 | pshufd \$0x13, $twtmp, $twres | ||
2561 | pxor $twtmp, $twtmp | ||
2562 | movdqa @XMM[7], @XMM[$i] | ||
2563 | movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] | ||
2564 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2565 | pand $twmask, $twres # isolate carry and residue | ||
2566 | pcmpgtd @XMM[7], $twtmp # broadcast upper bits | ||
2567 | pxor $twres, @XMM[7] | ||
2568 | ___ | ||
2569 | $code.=<<___ if ($i>=1); | ||
2570 | movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] | ||
2571 | cmp \$`0x10*$i`,$len | ||
2572 | je .Lxts_dec_$i | ||
2573 | ___ | ||
2574 | $code.=<<___ if ($i>=2); | ||
2575 | pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] | ||
2576 | ___ | ||
2577 | } | ||
2578 | $code.=<<___; | ||
2579 | movdqu 0x60($inp), @XMM[8+6] | ||
2580 | pxor @XMM[8+5], @XMM[5] | ||
2581 | movdqa @XMM[7], 0x70(%rsp) | ||
2582 | lea 0x70($inp), $inp | ||
2583 | pxor @XMM[8+6], @XMM[6] | ||
2584 | lea 0x80(%rsp), %rax # pass key schedule | ||
2585 | mov %edx, %r10d # pass rounds | ||
2586 | |||
2587 | call _bsaes_decrypt8 | ||
2588 | |||
2589 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2590 | pxor 0x10(%rsp), @XMM[1] | ||
2591 | movdqu @XMM[0], 0x00($out) # write output | ||
2592 | pxor 0x20(%rsp), @XMM[6] | ||
2593 | movdqu @XMM[1], 0x10($out) | ||
2594 | pxor 0x30(%rsp), @XMM[4] | ||
2595 | movdqu @XMM[6], 0x20($out) | ||
2596 | pxor 0x40(%rsp), @XMM[2] | ||
2597 | movdqu @XMM[4], 0x30($out) | ||
2598 | pxor 0x50(%rsp), @XMM[7] | ||
2599 | movdqu @XMM[2], 0x40($out) | ||
2600 | pxor 0x60(%rsp), @XMM[3] | ||
2601 | movdqu @XMM[7], 0x50($out) | ||
2602 | movdqu @XMM[3], 0x60($out) | ||
2603 | lea 0x70($out), $out | ||
2604 | |||
2605 | movdqa 0x70(%rsp), @XMM[7] # next iteration tweak | ||
2606 | jmp .Lxts_dec_done | ||
2607 | .align 16 | ||
2608 | .Lxts_dec_6: | ||
2609 | pxor @XMM[8+4], @XMM[4] | ||
2610 | lea 0x60($inp), $inp | ||
2611 | pxor @XMM[8+5], @XMM[5] | ||
2612 | lea 0x80(%rsp), %rax # pass key schedule | ||
2613 | mov %edx, %r10d # pass rounds | ||
2614 | |||
2615 | call _bsaes_decrypt8 | ||
2616 | |||
2617 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2618 | pxor 0x10(%rsp), @XMM[1] | ||
2619 | movdqu @XMM[0], 0x00($out) # write output | ||
2620 | pxor 0x20(%rsp), @XMM[6] | ||
2621 | movdqu @XMM[1], 0x10($out) | ||
2622 | pxor 0x30(%rsp), @XMM[4] | ||
2623 | movdqu @XMM[6], 0x20($out) | ||
2624 | pxor 0x40(%rsp), @XMM[2] | ||
2625 | movdqu @XMM[4], 0x30($out) | ||
2626 | pxor 0x50(%rsp), @XMM[7] | ||
2627 | movdqu @XMM[2], 0x40($out) | ||
2628 | movdqu @XMM[7], 0x50($out) | ||
2629 | lea 0x60($out), $out | ||
2630 | |||
2631 | movdqa 0x60(%rsp), @XMM[7] # next iteration tweak | ||
2632 | jmp .Lxts_dec_done | ||
2633 | .align 16 | ||
2634 | .Lxts_dec_5: | ||
2635 | pxor @XMM[8+3], @XMM[3] | ||
2636 | lea 0x50($inp), $inp | ||
2637 | pxor @XMM[8+4], @XMM[4] | ||
2638 | lea 0x80(%rsp), %rax # pass key schedule | ||
2639 | mov %edx, %r10d # pass rounds | ||
2640 | |||
2641 | call _bsaes_decrypt8 | ||
2642 | |||
2643 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2644 | pxor 0x10(%rsp), @XMM[1] | ||
2645 | movdqu @XMM[0], 0x00($out) # write output | ||
2646 | pxor 0x20(%rsp), @XMM[6] | ||
2647 | movdqu @XMM[1], 0x10($out) | ||
2648 | pxor 0x30(%rsp), @XMM[4] | ||
2649 | movdqu @XMM[6], 0x20($out) | ||
2650 | pxor 0x40(%rsp), @XMM[2] | ||
2651 | movdqu @XMM[4], 0x30($out) | ||
2652 | movdqu @XMM[2], 0x40($out) | ||
2653 | lea 0x50($out), $out | ||
2654 | |||
2655 | movdqa 0x50(%rsp), @XMM[7] # next iteration tweak | ||
2656 | jmp .Lxts_dec_done | ||
2657 | .align 16 | ||
2658 | .Lxts_dec_4: | ||
2659 | pxor @XMM[8+2], @XMM[2] | ||
2660 | lea 0x40($inp), $inp | ||
2661 | pxor @XMM[8+3], @XMM[3] | ||
2662 | lea 0x80(%rsp), %rax # pass key schedule | ||
2663 | mov %edx, %r10d # pass rounds | ||
2664 | |||
2665 | call _bsaes_decrypt8 | ||
2666 | |||
2667 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2668 | pxor 0x10(%rsp), @XMM[1] | ||
2669 | movdqu @XMM[0], 0x00($out) # write output | ||
2670 | pxor 0x20(%rsp), @XMM[6] | ||
2671 | movdqu @XMM[1], 0x10($out) | ||
2672 | pxor 0x30(%rsp), @XMM[4] | ||
2673 | movdqu @XMM[6], 0x20($out) | ||
2674 | movdqu @XMM[4], 0x30($out) | ||
2675 | lea 0x40($out), $out | ||
2676 | |||
2677 | movdqa 0x40(%rsp), @XMM[7] # next iteration tweak | ||
2678 | jmp .Lxts_dec_done | ||
2679 | .align 16 | ||
2680 | .Lxts_dec_3: | ||
2681 | pxor @XMM[8+1], @XMM[1] | ||
2682 | lea 0x30($inp), $inp | ||
2683 | pxor @XMM[8+2], @XMM[2] | ||
2684 | lea 0x80(%rsp), %rax # pass key schedule | ||
2685 | mov %edx, %r10d # pass rounds | ||
2686 | |||
2687 | call _bsaes_decrypt8 | ||
2688 | |||
2689 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2690 | pxor 0x10(%rsp), @XMM[1] | ||
2691 | movdqu @XMM[0], 0x00($out) # write output | ||
2692 | pxor 0x20(%rsp), @XMM[6] | ||
2693 | movdqu @XMM[1], 0x10($out) | ||
2694 | movdqu @XMM[6], 0x20($out) | ||
2695 | lea 0x30($out), $out | ||
2696 | |||
2697 | movdqa 0x30(%rsp), @XMM[7] # next iteration tweak | ||
2698 | jmp .Lxts_dec_done | ||
2699 | .align 16 | ||
2700 | .Lxts_dec_2: | ||
2701 | pxor @XMM[8+0], @XMM[0] | ||
2702 | lea 0x20($inp), $inp | ||
2703 | pxor @XMM[8+1], @XMM[1] | ||
2704 | lea 0x80(%rsp), %rax # pass key schedule | ||
2705 | mov %edx, %r10d # pass rounds | ||
2706 | |||
2707 | call _bsaes_decrypt8 | ||
2708 | |||
2709 | pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2710 | pxor 0x10(%rsp), @XMM[1] | ||
2711 | movdqu @XMM[0], 0x00($out) # write output | ||
2712 | movdqu @XMM[1], 0x10($out) | ||
2713 | lea 0x20($out), $out | ||
2714 | |||
2715 | movdqa 0x20(%rsp), @XMM[7] # next iteration tweak | ||
2716 | jmp .Lxts_dec_done | ||
2717 | .align 16 | ||
2718 | .Lxts_dec_1: | ||
2719 | pxor @XMM[0], @XMM[8] | ||
2720 | lea 0x10($inp), $inp | ||
2721 | movdqa @XMM[8], 0x20(%rbp) | ||
2722 | lea 0x20(%rbp), $arg1 | ||
2723 | lea 0x20(%rbp), $arg2 | ||
2724 | lea ($key), $arg3 | ||
2725 | call asm_AES_decrypt # doesn't touch %xmm | ||
2726 | pxor 0x20(%rbp), @XMM[0] # ^= tweak[] | ||
2727 | #pxor @XMM[8], @XMM[0] | ||
2728 | #lea 0x80(%rsp), %rax # pass key schedule | ||
2729 | #mov %edx, %r10d # pass rounds | ||
2730 | #call _bsaes_decrypt8 | ||
2731 | #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] | ||
2732 | movdqu @XMM[0], 0x00($out) # write output | ||
2733 | lea 0x10($out), $out | ||
2734 | |||
2735 | movdqa 0x10(%rsp), @XMM[7] # next iteration tweak | ||
2736 | |||
2737 | .Lxts_dec_done: | ||
2738 | and \$15, %ebx | ||
2739 | jz .Lxts_dec_ret | ||
2740 | |||
2741 | pxor $twtmp, $twtmp | ||
2742 | movdqa .Lxts_magic(%rip), $twmask | ||
2743 | pcmpgtd @XMM[7], $twtmp | ||
2744 | pshufd \$0x13, $twtmp, $twres | ||
2745 | movdqa @XMM[7], @XMM[6] | ||
2746 | paddq @XMM[7], @XMM[7] # psllq 1,$tweak | ||
2747 | pand $twmask, $twres # isolate carry and residue | ||
2748 | movdqu ($inp), @XMM[0] | ||
2749 | pxor $twres, @XMM[7] | ||
2750 | |||
2751 | lea 0x20(%rbp), $arg1 | ||
2752 | pxor @XMM[7], @XMM[0] | ||
2753 | lea 0x20(%rbp), $arg2 | ||
2754 | movdqa @XMM[0], 0x20(%rbp) | ||
2755 | lea ($key), $arg3 | ||
2756 | call asm_AES_decrypt # doesn't touch %xmm | ||
2757 | pxor 0x20(%rbp), @XMM[7] | ||
2758 | mov $out, %rdx | ||
2759 | movdqu @XMM[7], ($out) | ||
2760 | |||
2761 | .Lxts_dec_steal: | ||
2762 | movzb 16($inp), %eax | ||
2763 | movzb (%rdx), %ecx | ||
2764 | lea 1($inp), $inp | ||
2765 | mov %al, (%rdx) | ||
2766 | mov %cl, 16(%rdx) | ||
2767 | lea 1(%rdx), %rdx | ||
2768 | sub \$1,%ebx | ||
2769 | jnz .Lxts_dec_steal | ||
2770 | |||
2771 | movdqu ($out), @XMM[0] | ||
2772 | lea 0x20(%rbp), $arg1 | ||
2773 | pxor @XMM[6], @XMM[0] | ||
2774 | lea 0x20(%rbp), $arg2 | ||
2775 | movdqa @XMM[0], 0x20(%rbp) | ||
2776 | lea ($key), $arg3 | ||
2777 | call asm_AES_decrypt # doesn't touch %xmm | ||
2778 | pxor 0x20(%rbp), @XMM[6] | ||
2779 | movdqu @XMM[6], ($out) | ||
2780 | |||
2781 | .Lxts_dec_ret: | ||
2782 | lea (%rsp), %rax | ||
2783 | pxor %xmm0, %xmm0 | ||
2784 | .Lxts_dec_bzero: # wipe key schedule [if any] | ||
2785 | movdqa %xmm0, 0x00(%rax) | ||
2786 | movdqa %xmm0, 0x10(%rax) | ||
2787 | lea 0x20(%rax), %rax | ||
2788 | cmp %rax, %rbp | ||
2789 | ja .Lxts_dec_bzero | ||
2790 | |||
2791 | lea (%rbp),%rsp # restore %rsp | ||
2792 | ___ | ||
2793 | $code.=<<___ if ($win64); | ||
2794 | movaps 0x40(%rbp), %xmm6 | ||
2795 | movaps 0x50(%rbp), %xmm7 | ||
2796 | movaps 0x60(%rbp), %xmm8 | ||
2797 | movaps 0x70(%rbp), %xmm9 | ||
2798 | movaps 0x80(%rbp), %xmm10 | ||
2799 | movaps 0x90(%rbp), %xmm11 | ||
2800 | movaps 0xa0(%rbp), %xmm12 | ||
2801 | movaps 0xb0(%rbp), %xmm13 | ||
2802 | movaps 0xc0(%rbp), %xmm14 | ||
2803 | movaps 0xd0(%rbp), %xmm15 | ||
2804 | lea 0xa0(%rbp), %rsp | ||
2805 | ___ | ||
2806 | $code.=<<___; | ||
2807 | mov 0x48(%rsp), %r15 | ||
2808 | mov 0x50(%rsp), %r14 | ||
2809 | mov 0x58(%rsp), %r13 | ||
2810 | mov 0x60(%rsp), %r12 | ||
2811 | mov 0x68(%rsp), %rbx | ||
2812 | mov 0x70(%rsp), %rax | ||
2813 | lea 0x78(%rsp), %rsp | ||
2814 | mov %rax, %rbp | ||
2815 | .Lxts_dec_epilogue: | ||
2816 | ret | ||
2817 | .size bsaes_xts_decrypt,.-bsaes_xts_decrypt | ||
2818 | ___ | ||
2819 | } | ||
2820 | $code.=<<___; | ||
2821 | .type _bsaes_const,\@object | ||
2822 | .align 64 | ||
2823 | _bsaes_const: | ||
2824 | .LM0ISR: # InvShiftRows constants | ||
2825 | .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 | ||
2826 | .LISRM0: | ||
2827 | .quad 0x01040b0e0205080f, 0x0306090c00070a0d | ||
2828 | .LISR: | ||
2829 | .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 | ||
2830 | .LBS0: # bit-slice constants | ||
2831 | .quad 0x5555555555555555, 0x5555555555555555 | ||
2832 | .LBS1: | ||
2833 | .quad 0x3333333333333333, 0x3333333333333333 | ||
2834 | .LBS2: | ||
2835 | .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f | ||
2836 | .LSR: # shiftrows constants | ||
2837 | .quad 0x0504070600030201, 0x0f0e0d0c0a09080b | ||
2838 | .LSRM0: | ||
2839 | .quad 0x0304090e00050a0f, 0x01060b0c0207080d | ||
2840 | .LM0SR: | ||
2841 | .quad 0x0a0e02060f03070b, 0x0004080c05090d01 | ||
2842 | .LSWPUP: # byte-swap upper dword | ||
2843 | .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 | ||
2844 | .LSWPUPM0SR: | ||
2845 | .quad 0x0a0d02060c03070b, 0x0004080f05090e01 | ||
2846 | .LADD1: # counter increment constants | ||
2847 | .quad 0x0000000000000000, 0x0000000100000000 | ||
2848 | .LADD2: | ||
2849 | .quad 0x0000000000000000, 0x0000000200000000 | ||
2850 | .LADD3: | ||
2851 | .quad 0x0000000000000000, 0x0000000300000000 | ||
2852 | .LADD4: | ||
2853 | .quad 0x0000000000000000, 0x0000000400000000 | ||
2854 | .LADD5: | ||
2855 | .quad 0x0000000000000000, 0x0000000500000000 | ||
2856 | .LADD6: | ||
2857 | .quad 0x0000000000000000, 0x0000000600000000 | ||
2858 | .LADD7: | ||
2859 | .quad 0x0000000000000000, 0x0000000700000000 | ||
2860 | .LADD8: | ||
2861 | .quad 0x0000000000000000, 0x0000000800000000 | ||
2862 | .Lxts_magic: | ||
2863 | .long 0x87,0,1,0 | ||
2864 | .Lmasks: | ||
2865 | .quad 0x0101010101010101, 0x0101010101010101 | ||
2866 | .quad 0x0202020202020202, 0x0202020202020202 | ||
2867 | .quad 0x0404040404040404, 0x0404040404040404 | ||
2868 | .quad 0x0808080808080808, 0x0808080808080808 | ||
2869 | .LM0: | ||
2870 | .quad 0x02060a0e03070b0f, 0x0004080c0105090d | ||
2871 | .L63: | ||
2872 | .quad 0x6363636363636363, 0x6363636363636363 | ||
2873 | .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" | ||
2874 | .align 64 | ||
2875 | .size _bsaes_const,.-_bsaes_const | ||
2876 | ___ | ||
2877 | |||
2878 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
2879 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
2880 | if ($win64) { | ||
2881 | $rec="%rcx"; | ||
2882 | $frame="%rdx"; | ||
2883 | $context="%r8"; | ||
2884 | $disp="%r9"; | ||
2885 | |||
2886 | $code.=<<___; | ||
2887 | .extern __imp_RtlVirtualUnwind | ||
2888 | .type se_handler,\@abi-omnipotent | ||
2889 | .align 16 | ||
2890 | se_handler: | ||
2891 | push %rsi | ||
2892 | push %rdi | ||
2893 | push %rbx | ||
2894 | push %rbp | ||
2895 | push %r12 | ||
2896 | push %r13 | ||
2897 | push %r14 | ||
2898 | push %r15 | ||
2899 | pushfq | ||
2900 | sub \$64,%rsp | ||
2901 | |||
2902 | mov 120($context),%rax # pull context->Rax | ||
2903 | mov 248($context),%rbx # pull context->Rip | ||
2904 | |||
2905 | mov 8($disp),%rsi # disp->ImageBase | ||
2906 | mov 56($disp),%r11 # disp->HandlerData | ||
2907 | |||
2908 | mov 0(%r11),%r10d # HandlerData[0] | ||
2909 | lea (%rsi,%r10),%r10 # prologue label | ||
2910 | cmp %r10,%rbx # context->Rip<prologue label | ||
2911 | jb .Lin_prologue | ||
2912 | |||
2913 | mov 152($context),%rax # pull context->Rsp | ||
2914 | |||
2915 | mov 4(%r11),%r10d # HandlerData[1] | ||
2916 | lea (%rsi,%r10),%r10 # epilogue label | ||
2917 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2918 | jae .Lin_prologue | ||
2919 | |||
2920 | mov 160($context),%rax # pull context->Rbp | ||
2921 | |||
2922 | lea 0x40(%rax),%rsi # %xmm save area | ||
2923 | lea 512($context),%rdi # &context.Xmm6 | ||
2924 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
2925 | .long 0xa548f3fc # cld; rep movsq | ||
2926 | lea 0xa0(%rax),%rax # adjust stack pointer | ||
2927 | |||
2928 | mov 0x70(%rax),%rbp | ||
2929 | mov 0x68(%rax),%rbx | ||
2930 | mov 0x60(%rax),%r12 | ||
2931 | mov 0x58(%rax),%r13 | ||
2932 | mov 0x50(%rax),%r14 | ||
2933 | mov 0x48(%rax),%r15 | ||
2934 | lea 0x78(%rax),%rax # adjust stack pointer | ||
2935 | mov %rbx,144($context) # restore context->Rbx | ||
2936 | mov %rbp,160($context) # restore context->Rbp | ||
2937 | mov %r12,216($context) # restore context->R12 | ||
2938 | mov %r13,224($context) # restore context->R13 | ||
2939 | mov %r14,232($context) # restore context->R14 | ||
2940 | mov %r15,240($context) # restore context->R15 | ||
2941 | |||
2942 | .Lin_prologue: | ||
2943 | mov %rax,152($context) # restore context->Rsp | ||
2944 | |||
2945 | mov 40($disp),%rdi # disp->ContextRecord | ||
2946 | mov $context,%rsi # context | ||
2947 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
2948 | .long 0xa548f3fc # cld; rep movsq | ||
2949 | |||
2950 | mov $disp,%rsi | ||
2951 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
2952 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
2953 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
2954 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
2955 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
2956 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
2957 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
2958 | mov %r10,32(%rsp) # arg5 | ||
2959 | mov %r11,40(%rsp) # arg6 | ||
2960 | mov %r12,48(%rsp) # arg7 | ||
2961 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
2962 | call *__imp_RtlVirtualUnwind(%rip) | ||
2963 | |||
2964 | mov \$1,%eax # ExceptionContinueSearch | ||
2965 | add \$64,%rsp | ||
2966 | popfq | ||
2967 | pop %r15 | ||
2968 | pop %r14 | ||
2969 | pop %r13 | ||
2970 | pop %r12 | ||
2971 | pop %rbp | ||
2972 | pop %rbx | ||
2973 | pop %rdi | ||
2974 | pop %rsi | ||
2975 | ret | ||
2976 | .size se_handler,.-se_handler | ||
2977 | |||
2978 | .section .pdata | ||
2979 | .align 4 | ||
2980 | ___ | ||
2981 | $code.=<<___ if ($ecb); | ||
2982 | .rva .Lecb_enc_prologue | ||
2983 | .rva .Lecb_enc_epilogue | ||
2984 | .rva .Lecb_enc_info | ||
2985 | |||
2986 | .rva .Lecb_dec_prologue | ||
2987 | .rva .Lecb_dec_epilogue | ||
2988 | .rva .Lecb_dec_info | ||
2989 | ___ | ||
2990 | $code.=<<___; | ||
2991 | .rva .Lcbc_dec_prologue | ||
2992 | .rva .Lcbc_dec_epilogue | ||
2993 | .rva .Lcbc_dec_info | ||
2994 | |||
2995 | .rva .Lctr_enc_prologue | ||
2996 | .rva .Lctr_enc_epilogue | ||
2997 | .rva .Lctr_enc_info | ||
2998 | |||
2999 | .rva .Lxts_enc_prologue | ||
3000 | .rva .Lxts_enc_epilogue | ||
3001 | .rva .Lxts_enc_info | ||
3002 | |||
3003 | .rva .Lxts_dec_prologue | ||
3004 | .rva .Lxts_dec_epilogue | ||
3005 | .rva .Lxts_dec_info | ||
3006 | |||
3007 | .section .xdata | ||
3008 | .align 8 | ||
3009 | ___ | ||
3010 | $code.=<<___ if ($ecb); | ||
3011 | .Lecb_enc_info: | ||
3012 | .byte 9,0,0,0 | ||
3013 | .rva se_handler | ||
3014 | .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] | ||
3015 | .Lecb_dec_info: | ||
3016 | .byte 9,0,0,0 | ||
3017 | .rva se_handler | ||
3018 | .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] | ||
3019 | ___ | ||
3020 | $code.=<<___; | ||
3021 | .Lcbc_dec_info: | ||
3022 | .byte 9,0,0,0 | ||
3023 | .rva se_handler | ||
3024 | .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] | ||
3025 | .Lctr_enc_info: | ||
3026 | .byte 9,0,0,0 | ||
3027 | .rva se_handler | ||
3028 | .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] | ||
3029 | .Lxts_enc_info: | ||
3030 | .byte 9,0,0,0 | ||
3031 | .rva se_handler | ||
3032 | .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] | ||
3033 | .Lxts_dec_info: | ||
3034 | .byte 9,0,0,0 | ||
3035 | .rva se_handler | ||
3036 | .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] | ||
3037 | ___ | ||
3038 | } | ||
3039 | |||
3040 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
3041 | |||
3042 | print $code; | ||
3043 | |||
3044 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl new file mode 100644 index 0000000000..1533e2c304 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/vpaes-x86.pl | |||
@@ -0,0 +1,903 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | ###################################################################### | ||
4 | ## Constant-time SSSE3 AES core implementation. | ||
5 | ## version 0.1 | ||
6 | ## | ||
7 | ## By Mike Hamburg (Stanford University), 2009 | ||
8 | ## Public domain. | ||
9 | ## | ||
10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
11 | ## http://crypto.stanford.edu/vpaes/. | ||
12 | |||
13 | ###################################################################### | ||
14 | # September 2011. | ||
15 | # | ||
16 | # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for | ||
17 | # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
18 | # doesn't handle partial vectors (doesn't have to if called from | ||
19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
20 | # schedule structure with the original nor does it make assumption | ||
21 | # about its alignment... | ||
22 | # | ||
23 | # Performance summary. aes-586.pl column lists large-block CBC | ||
24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
25 | # byte processed with 128-bit key, and vpaes-x86.pl column - [also | ||
26 | # large-block CBC] encrypt/decrypt. | ||
27 | # | ||
28 | # aes-586.pl vpaes-x86.pl | ||
29 | # | ||
30 | # Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) | ||
31 | # Nehalem 27.9/40.4/18.1 10.3/12.0 | ||
32 | # Atom 102./119./60.1 64.5/85.3(***) | ||
33 | # | ||
34 | # (*) "Hyper-threading" in the context refers rather to cache shared | ||
35 | # among multiple cores, than to specifically Intel HTT. As vast | ||
36 | # majority of contemporary cores share cache, slower code path | ||
37 | # is common place. In other words "with-hyper-threading-off" | ||
38 | # results are presented mostly for reference purposes. | ||
39 | # | ||
40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
41 | # | ||
42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
43 | # pshufb, yet it's respectable +32%/65% improvement on Core 2 | ||
44 | # and +58%/40% on Atom (as implied, over "hyper-threading-safe" | ||
45 | # code path). | ||
46 | # | ||
47 | # <appro@openssl.org> | ||
48 | |||
49 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
50 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
51 | require "x86asm.pl"; | ||
52 | |||
53 | &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); | ||
54 | |||
55 | $PREFIX="vpaes"; | ||
56 | |||
57 | my ($round, $base, $magic, $key, $const, $inp, $out)= | ||
58 | ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); | ||
59 | |||
60 | &static_label("_vpaes_consts"); | ||
61 | &static_label("_vpaes_schedule_low_round"); | ||
62 | |||
63 | &set_label("_vpaes_consts",64); | ||
64 | $k_inv=-0x30; # inv, inva | ||
65 | &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); | ||
66 | &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); | ||
67 | |||
68 | $k_s0F=-0x10; # s0F | ||
69 | &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); | ||
70 | |||
71 | $k_ipt=0x00; # input transform (lo, hi) | ||
72 | &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); | ||
73 | &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); | ||
74 | |||
75 | $k_sb1=0x20; # sb1u, sb1t | ||
76 | &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); | ||
77 | &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); | ||
78 | $k_sb2=0x40; # sb2u, sb2t | ||
79 | &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); | ||
80 | &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); | ||
81 | $k_sbo=0x60; # sbou, sbot | ||
82 | &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); | ||
83 | &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); | ||
84 | |||
85 | $k_mc_forward=0x80; # mc_forward | ||
86 | &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); | ||
87 | &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); | ||
88 | &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); | ||
89 | &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); | ||
90 | |||
91 | $k_mc_backward=0xc0; # mc_backward | ||
92 | &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); | ||
93 | &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); | ||
94 | &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); | ||
95 | &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); | ||
96 | |||
97 | $k_sr=0x100; # sr | ||
98 | &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); | ||
99 | &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); | ||
100 | &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); | ||
101 | &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); | ||
102 | |||
103 | $k_rcon=0x140; # rcon | ||
104 | &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); | ||
105 | |||
106 | $k_s63=0x150; # s63: all equal to 0x63 transformed | ||
107 | &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); | ||
108 | |||
109 | $k_opt=0x160; # output transform | ||
110 | &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); | ||
111 | &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); | ||
112 | |||
113 | $k_deskew=0x180; # deskew tables: inverts the sbox's "skew" | ||
114 | &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); | ||
115 | &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); | ||
116 | ## | ||
117 | ## Decryption stuff | ||
118 | ## Key schedule constants | ||
119 | ## | ||
120 | $k_dksd=0x1a0; # decryption key schedule: invskew x*D | ||
121 | &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4); | ||
122 | &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA); | ||
123 | $k_dksb=0x1c0; # decryption key schedule: invskew x*B | ||
124 | &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386); | ||
125 | &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F); | ||
126 | $k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63 | ||
127 | &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C); | ||
128 | &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A); | ||
129 | $k_dks9=0x200; # decryption key schedule: invskew x*9 | ||
130 | &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334); | ||
131 | &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC); | ||
132 | |||
133 | ## | ||
134 | ## Decryption stuff | ||
135 | ## Round function constants | ||
136 | ## | ||
137 | $k_dipt=0x220; # decryption input transform | ||
138 | &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E); | ||
139 | &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772); | ||
140 | |||
141 | $k_dsb9=0x240; # decryption sbox output *9*u, *9*t | ||
142 | &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50); | ||
143 | &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E); | ||
144 | $k_dsbd=0x260; # decryption sbox output *D*u, *D*t | ||
145 | &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13); | ||
146 | &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D); | ||
147 | $k_dsbb=0x280; # decryption sbox output *B*u, *B*t | ||
148 | &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6); | ||
149 | &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E); | ||
150 | $k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t | ||
151 | &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004); | ||
152 | &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B); | ||
153 | $k_dsbo=0x2c0; # decryption sbox final output | ||
154 | &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9); | ||
155 | &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159); | ||
156 | &asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); | ||
157 | &align (64); | ||
158 | |||
159 | &function_begin_B("_vpaes_preheat"); | ||
160 | &add ($const,&DWP(0,"esp")); | ||
161 | &movdqa ("xmm7",&QWP($k_inv,$const)); | ||
162 | &movdqa ("xmm6",&QWP($k_s0F,$const)); | ||
163 | &ret (); | ||
164 | &function_end_B("_vpaes_preheat"); | ||
165 | |||
166 | ## | ||
167 | ## _aes_encrypt_core | ||
168 | ## | ||
169 | ## AES-encrypt %xmm0. | ||
170 | ## | ||
171 | ## Inputs: | ||
172 | ## %xmm0 = input | ||
173 | ## %xmm6-%xmm7 as in _vpaes_preheat | ||
174 | ## (%edx) = scheduled keys | ||
175 | ## | ||
176 | ## Output in %xmm0 | ||
177 | ## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx | ||
178 | ## | ||
179 | ## | ||
180 | &function_begin_B("_vpaes_encrypt_core"); | ||
181 | &mov ($magic,16); | ||
182 | &mov ($round,&DWP(240,$key)); | ||
183 | &movdqa ("xmm1","xmm6") | ||
184 | &movdqa ("xmm2",&QWP($k_ipt,$const)); | ||
185 | &pandn ("xmm1","xmm0"); | ||
186 | &movdqu ("xmm5",&QWP(0,$key)); | ||
187 | &psrld ("xmm1",4); | ||
188 | &pand ("xmm0","xmm6"); | ||
189 | &pshufb ("xmm2","xmm0"); | ||
190 | &movdqa ("xmm0",&QWP($k_ipt+16,$const)); | ||
191 | &pshufb ("xmm0","xmm1"); | ||
192 | &pxor ("xmm2","xmm5"); | ||
193 | &pxor ("xmm0","xmm2"); | ||
194 | &add ($key,16); | ||
195 | &lea ($base,&DWP($k_mc_backward,$const)); | ||
196 | &jmp (&label("enc_entry")); | ||
197 | |||
198 | |||
199 | &set_label("enc_loop",16); | ||
200 | # middle of middle round | ||
201 | &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u | ||
202 | &pshufb ("xmm4","xmm2"); # 4 = sb1u | ||
203 | &pxor ("xmm4","xmm5"); # 4 = sb1u + k | ||
204 | &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t | ||
205 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
206 | &pxor ("xmm0","xmm4"); # 0 = A | ||
207 | &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u | ||
208 | &pshufb ("xmm5","xmm2"); # 4 = sb2u | ||
209 | &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] | ||
210 | &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t | ||
211 | &pshufb ("xmm2","xmm3"); # 2 = sb2t | ||
212 | &pxor ("xmm2","xmm5"); # 2 = 2A | ||
213 | &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] | ||
214 | &movdqa ("xmm3","xmm0"); # 3 = A | ||
215 | &pshufb ("xmm0","xmm1"); # 0 = B | ||
216 | &add ($key,16); # next key | ||
217 | &pxor ("xmm0","xmm2"); # 0 = 2A+B | ||
218 | &pshufb ("xmm3","xmm4"); # 3 = D | ||
219 | &add ($magic,16); # next mc | ||
220 | &pxor ("xmm3","xmm0"); # 3 = 2A+B+D | ||
221 | &pshufb ("xmm0","xmm1"); # 0 = 2B+C | ||
222 | &and ($magic,0x30); # ... mod 4 | ||
223 | &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D | ||
224 | &sub ($round,1); # nr-- | ||
225 | |||
226 | &set_label("enc_entry"); | ||
227 | # top of round | ||
228 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
229 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
230 | &psrld ("xmm1",4); # 1 = i | ||
231 | &pand ("xmm0","xmm6"); # 0 = k | ||
232 | &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k | ||
233 | &pshufb ("xmm5","xmm0"); # 2 = a/k | ||
234 | &pxor ("xmm0","xmm1"); # 0 = j | ||
235 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
236 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
237 | &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k | ||
238 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
239 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
240 | &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k | ||
241 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
242 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
243 | &pxor ("xmm2","xmm0"); # 2 = io | ||
244 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
245 | &movdqu ("xmm5",&QWP(0,$key)); | ||
246 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
247 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
248 | &jnz (&label("enc_loop")); | ||
249 | |||
250 | # middle of last round | ||
251 | &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo | ||
252 | &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 | ||
253 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
254 | &pxor ("xmm4","xmm5"); # 4 = sb1u + k | ||
255 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
256 | &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] | ||
257 | &pxor ("xmm0","xmm4"); # 0 = A | ||
258 | &pshufb ("xmm0","xmm1"); | ||
259 | &ret (); | ||
260 | &function_end_B("_vpaes_encrypt_core"); | ||
261 | |||
262 | ## | ||
263 | ## Decryption core | ||
264 | ## | ||
265 | ## Same API as encryption core. | ||
266 | ## | ||
267 | &function_begin_B("_vpaes_decrypt_core"); | ||
268 | &mov ($round,&DWP(240,$key)); | ||
269 | &lea ($base,&DWP($k_dsbd,$const)); | ||
270 | &movdqa ("xmm1","xmm6"); | ||
271 | &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); | ||
272 | &pandn ("xmm1","xmm0"); | ||
273 | &mov ($magic,$round); | ||
274 | &psrld ("xmm1",4) | ||
275 | &movdqu ("xmm5",&QWP(0,$key)); | ||
276 | &shl ($magic,4); | ||
277 | &pand ("xmm0","xmm6"); | ||
278 | &pshufb ("xmm2","xmm0"); | ||
279 | &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); | ||
280 | &xor ($magic,0x30); | ||
281 | &pshufb ("xmm0","xmm1"); | ||
282 | &and ($magic,0x30); | ||
283 | &pxor ("xmm2","xmm5"); | ||
284 | &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); | ||
285 | &pxor ("xmm0","xmm2"); | ||
286 | &add ($key,16); | ||
287 | &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); | ||
288 | &jmp (&label("dec_entry")); | ||
289 | |||
290 | &set_label("dec_loop",16); | ||
291 | ## | ||
292 | ## Inverse mix columns | ||
293 | ## | ||
294 | &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u | ||
295 | &pshufb ("xmm4","xmm2"); # 4 = sb9u | ||
296 | &pxor ("xmm4","xmm0"); | ||
297 | &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t | ||
298 | &pshufb ("xmm0","xmm3"); # 0 = sb9t | ||
299 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
300 | &add ($key,16); # next round key | ||
301 | |||
302 | &pshufb ("xmm0","xmm5"); # MC ch | ||
303 | &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu | ||
304 | &pshufb ("xmm4","xmm2"); # 4 = sbdu | ||
305 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
306 | &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt | ||
307 | &pshufb ("xmm0","xmm3"); # 0 = sbdt | ||
308 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
309 | &sub ($round,1); # nr-- | ||
310 | |||
311 | &pshufb ("xmm0","xmm5"); # MC ch | ||
312 | &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu | ||
313 | &pshufb ("xmm4","xmm2"); # 4 = sbbu | ||
314 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
315 | &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt | ||
316 | &pshufb ("xmm0","xmm3"); # 0 = sbbt | ||
317 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
318 | |||
319 | &pshufb ("xmm0","xmm5"); # MC ch | ||
320 | &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu | ||
321 | &pshufb ("xmm4","xmm2"); # 4 = sbeu | ||
322 | &pxor ("xmm4","xmm0"); # 4 = ch | ||
323 | &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet | ||
324 | &pshufb ("xmm0","xmm3"); # 0 = sbet | ||
325 | &pxor ("xmm0","xmm4"); # 0 = ch | ||
326 | |||
327 | &palignr("xmm5","xmm5",12); | ||
328 | |||
329 | &set_label("dec_entry"); | ||
330 | # top of round | ||
331 | &movdqa ("xmm1","xmm6"); # 1 : i | ||
332 | &pandn ("xmm1","xmm0"); # 1 = i<<4 | ||
333 | &psrld ("xmm1",4); # 1 = i | ||
334 | &pand ("xmm0","xmm6"); # 0 = k | ||
335 | &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k | ||
336 | &pshufb ("xmm2","xmm0"); # 2 = a/k | ||
337 | &pxor ("xmm0","xmm1"); # 0 = j | ||
338 | &movdqa ("xmm3","xmm7"); # 3 : 1/i | ||
339 | &pshufb ("xmm3","xmm1"); # 3 = 1/i | ||
340 | &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k | ||
341 | &movdqa ("xmm4","xmm7"); # 4 : 1/j | ||
342 | &pshufb ("xmm4","xmm0"); # 4 = 1/j | ||
343 | &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k | ||
344 | &movdqa ("xmm2","xmm7"); # 2 : 1/iak | ||
345 | &pshufb ("xmm2","xmm3"); # 2 = 1/iak | ||
346 | &pxor ("xmm2","xmm0"); # 2 = io | ||
347 | &movdqa ("xmm3","xmm7"); # 3 : 1/jak | ||
348 | &pshufb ("xmm3","xmm4"); # 3 = 1/jak | ||
349 | &pxor ("xmm3","xmm1"); # 3 = jo | ||
350 | &movdqu ("xmm0",&QWP(0,$key)); | ||
351 | &jnz (&label("dec_loop")); | ||
352 | |||
353 | # middle of last round | ||
354 | &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou | ||
355 | &pshufb ("xmm4","xmm2"); # 4 = sbou | ||
356 | &pxor ("xmm4","xmm0"); # 4 = sb1u + k | ||
357 | &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot | ||
358 | &movdqa ("xmm2",&QWP(0,$magic)); | ||
359 | &pshufb ("xmm0","xmm3"); # 0 = sb1t | ||
360 | &pxor ("xmm0","xmm4"); # 0 = A | ||
361 | &pshufb ("xmm0","xmm2"); | ||
362 | &ret (); | ||
363 | &function_end_B("_vpaes_decrypt_core"); | ||
364 | |||
365 | ######################################################## | ||
366 | ## ## | ||
367 | ## AES key schedule ## | ||
368 | ## ## | ||
369 | ######################################################## | ||
370 | &function_begin_B("_vpaes_schedule_core"); | ||
371 | &add ($const,&DWP(0,"esp")); | ||
372 | &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) | ||
373 | &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon | ||
374 | |||
375 | # input transform | ||
376 | &movdqa ("xmm3","xmm0"); | ||
377 | &lea ($base,&DWP($k_ipt,$const)); | ||
378 | &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 | ||
379 | &call ("_vpaes_schedule_transform"); | ||
380 | &movdqa ("xmm7","xmm0"); | ||
381 | |||
382 | &test ($out,$out); | ||
383 | &jnz (&label("schedule_am_decrypting")); | ||
384 | |||
385 | # encrypting, output zeroth round key after transform | ||
386 | &movdqu (&QWP(0,$key),"xmm0"); | ||
387 | &jmp (&label("schedule_go")); | ||
388 | |||
389 | &set_label("schedule_am_decrypting"); | ||
390 | # decrypting, output zeroth round key after shiftrows | ||
391 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
392 | &pshufb ("xmm3","xmm1"); | ||
393 | &movdqu (&QWP(0,$key),"xmm3"); | ||
394 | &xor ($magic,0x30); | ||
395 | |||
396 | &set_label("schedule_go"); | ||
397 | &cmp ($round,192); | ||
398 | &ja (&label("schedule_256")); | ||
399 | &je (&label("schedule_192")); | ||
400 | # 128: fall though | ||
401 | |||
402 | ## | ||
403 | ## .schedule_128 | ||
404 | ## | ||
405 | ## 128-bit specific part of key schedule. | ||
406 | ## | ||
407 | ## This schedule is really simple, because all its parts | ||
408 | ## are accomplished by the subroutines. | ||
409 | ## | ||
410 | &set_label("schedule_128"); | ||
411 | &mov ($round,10); | ||
412 | |||
413 | &set_label("loop_schedule_128"); | ||
414 | &call ("_vpaes_schedule_round"); | ||
415 | &dec ($round); | ||
416 | &jz (&label("schedule_mangle_last")); | ||
417 | &call ("_vpaes_schedule_mangle"); # write output | ||
418 | &jmp (&label("loop_schedule_128")); | ||
419 | |||
420 | ## | ||
421 | ## .aes_schedule_192 | ||
422 | ## | ||
423 | ## 192-bit specific part of key schedule. | ||
424 | ## | ||
425 | ## The main body of this schedule is the same as the 128-bit | ||
426 | ## schedule, but with more smearing. The long, high side is | ||
427 | ## stored in %xmm7 as before, and the short, low side is in | ||
428 | ## the high bits of %xmm6. | ||
429 | ## | ||
430 | ## This schedule is somewhat nastier, however, because each | ||
431 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
432 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
433 | ## keys. | ||
434 | ## | ||
435 | &set_label("schedule_192",16); | ||
436 | &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) | ||
437 | &call ("_vpaes_schedule_transform"); # input transform | ||
438 | &movdqa ("xmm6","xmm0"); # save short part | ||
439 | &pxor ("xmm4","xmm4"); # clear 4 | ||
440 | &movhlps("xmm6","xmm4"); # clobber low side with zeros | ||
441 | &mov ($round,4); | ||
442 | |||
443 | &set_label("loop_schedule_192"); | ||
444 | &call ("_vpaes_schedule_round"); | ||
445 | &palignr("xmm0","xmm6",8); | ||
446 | &call ("_vpaes_schedule_mangle"); # save key n | ||
447 | &call ("_vpaes_schedule_192_smear"); | ||
448 | &call ("_vpaes_schedule_mangle"); # save key n+1 | ||
449 | &call ("_vpaes_schedule_round"); | ||
450 | &dec ($round); | ||
451 | &jz (&label("schedule_mangle_last")); | ||
452 | &call ("_vpaes_schedule_mangle"); # save key n+2 | ||
453 | &call ("_vpaes_schedule_192_smear"); | ||
454 | &jmp (&label("loop_schedule_192")); | ||
455 | |||
456 | ## | ||
457 | ## .aes_schedule_256 | ||
458 | ## | ||
459 | ## 256-bit specific part of key schedule. | ||
460 | ## | ||
461 | ## The structure here is very similar to the 128-bit | ||
462 | ## schedule, but with an additional "low side" in | ||
463 | ## %xmm6. The low side's rounds are the same as the | ||
464 | ## high side's, except no rcon and no rotation. | ||
465 | ## | ||
466 | &set_label("schedule_256",16); | ||
467 | &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) | ||
468 | &call ("_vpaes_schedule_transform"); # input transform | ||
469 | &mov ($round,7); | ||
470 | |||
471 | &set_label("loop_schedule_256"); | ||
472 | &call ("_vpaes_schedule_mangle"); # output low result | ||
473 | &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 | ||
474 | |||
475 | # high round | ||
476 | &call ("_vpaes_schedule_round"); | ||
477 | &dec ($round); | ||
478 | &jz (&label("schedule_mangle_last")); | ||
479 | &call ("_vpaes_schedule_mangle"); | ||
480 | |||
481 | # low round. swap xmm7 and xmm6 | ||
482 | &pshufd ("xmm0","xmm0",0xFF); | ||
483 | &movdqa (&QWP(20,"esp"),"xmm7"); | ||
484 | &movdqa ("xmm7","xmm6"); | ||
485 | &call ("_vpaes_schedule_low_round"); | ||
486 | &movdqa ("xmm7",&QWP(20,"esp")); | ||
487 | |||
488 | &jmp (&label("loop_schedule_256")); | ||
489 | |||
490 | ## | ||
491 | ## .aes_schedule_mangle_last | ||
492 | ## | ||
493 | ## Mangler for last round of key schedule | ||
494 | ## Mangles %xmm0 | ||
495 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
496 | ## when decrypting, outputs unskew(%xmm0) | ||
497 | ## | ||
498 | ## Always called right before return... jumps to cleanup and exits | ||
499 | ## | ||
500 | &set_label("schedule_mangle_last",16); | ||
501 | # schedule last round key from xmm0 | ||
502 | &lea ($base,&DWP($k_deskew,$const)); | ||
503 | &test ($out,$out); | ||
504 | &jnz (&label("schedule_mangle_last_dec")); | ||
505 | |||
506 | # encrypting | ||
507 | &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); | ||
508 | &pshufb ("xmm0","xmm1"); # output permute | ||
509 | &lea ($base,&DWP($k_opt,$const)); # prepare to output transform | ||
510 | &add ($key,32); | ||
511 | |||
512 | &set_label("schedule_mangle_last_dec"); | ||
513 | &add ($key,-16); | ||
514 | &pxor ("xmm0",&QWP($k_s63,$const)); | ||
515 | &call ("_vpaes_schedule_transform"); # output transform | ||
516 | &movdqu (&QWP(0,$key),"xmm0"); # save last key | ||
517 | |||
518 | # cleanup | ||
519 | &pxor ("xmm0","xmm0"); | ||
520 | &pxor ("xmm1","xmm1"); | ||
521 | &pxor ("xmm2","xmm2"); | ||
522 | &pxor ("xmm3","xmm3"); | ||
523 | &pxor ("xmm4","xmm4"); | ||
524 | &pxor ("xmm5","xmm5"); | ||
525 | &pxor ("xmm6","xmm6"); | ||
526 | &pxor ("xmm7","xmm7"); | ||
527 | &ret (); | ||
528 | &function_end_B("_vpaes_schedule_core"); | ||
529 | |||
530 | ## | ||
531 | ## .aes_schedule_192_smear | ||
532 | ## | ||
533 | ## Smear the short, low side in the 192-bit key schedule. | ||
534 | ## | ||
535 | ## Inputs: | ||
536 | ## %xmm7: high side, b a x y | ||
537 | ## %xmm6: low side, d c 0 0 | ||
538 | ## %xmm13: 0 | ||
539 | ## | ||
540 | ## Outputs: | ||
541 | ## %xmm6: b+c+d b+c 0 0 | ||
542 | ## %xmm0: b+c+d b+c b a | ||
543 | ## | ||
&function_begin_B("_vpaes_schedule_192_smear");
	# On entry (per the header above): xmm7 = high side "b a x y",
	# xmm6 = short low side "d c 0 0".  Produces the running-sum smear
	# needed because each 192-bit schedule round yields 1.5 round keys.
	&pshufd	("xmm0","xmm6",0x80);		# d c 0 0 -> c 0 0 0
	&pxor	("xmm6","xmm0");		# -> c+d c 0 0
	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
	&movdqa	("xmm0","xmm6");		# full smeared value is returned in xmm0
	&pxor	("xmm1","xmm1");		# xmm1 = 0 (the x86_64 version uses xmm13 for this)
	&movhlps("xmm6","xmm1");		# clobber low side with zeros -> b+c+d b+c 0 0
	&ret	();
&function_end_B("_vpaes_schedule_192_smear");
554 | |||
555 | ## | ||
556 | ## .aes_schedule_round | ||
557 | ## | ||
558 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
559 | ## | ||
560 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
561 | ## then rotates it by one byte and xors into the low dword of | ||
562 | ## %xmm7. | ||
563 | ## | ||
564 | ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for | ||
565 | ## next rcon. | ||
566 | ## | ||
567 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
568 | ## second low, result into third, result into highest. | ||
569 | ## | ||
570 | ## Returns results in %xmm7 = %xmm0. | ||
571 | ## Clobbers %xmm1-%xmm5. | ||
572 | ## | ||
&function_begin_B("_vpaes_schedule_round");
	# One main key-schedule round on xmm0/xmm7 (see contract above).
	# The 32-bit port has only xmm0-xmm7, so the rcon register that the
	# x86_64 version keeps in %xmm8 lives in the stack slot at esp+8.
	# extract rcon from xmm8
	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8
	&pxor	("xmm1","xmm1");
	&palignr("xmm1","xmm2",15);		# xmm1 = low byte of rcon, rest zero
	&palignr("xmm2","xmm2",15);		# rotate rcon for the next round
	&pxor	("xmm7","xmm1");		# mix rcon into the key state

	# rotate
	&pshufd	("xmm0","xmm0",0xFF);		# broadcast high dword
	&palignr("xmm0","xmm0",1);		# rotate it by one byte (RotWord)

	# fall through...
	&movdqa	(&QWP(8,"esp"),"xmm2");		# store rotated rcon back (xmm8)

	# low round: same as high round, but no rotation and no rcon.
	# (Entered directly by the 256-bit schedule's "low" half-rounds.)
&set_label("_vpaes_schedule_low_round");
	# smear xmm7: xor each dword into the next-higher one
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",4);
	&pxor	("xmm7","xmm1");
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",8);
	&pxor	("xmm7","xmm1");
	&pxor	("xmm7",&QWP($k_s63,$const));	# pre-bias with 0x63 (SubBytes affine constant)

	# subbyte: constant-time SubBytes on xmm0 via nibble-table lookups
	&movdqa	("xmm4",&QWP($k_s0F,$const));	# 0x0F nibble mask
	&movdqa	("xmm5",&QWP($k_inv,$const));	# 4 : 1/j
	&movdqa	("xmm1","xmm4");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);			# 1 = i (high nibbles)
	&pand	("xmm0","xmm4");		# 0 = k (low nibbles)
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm5");		# 3 : 1/i
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm5");		# 4 : 1/j
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sbou
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = sbox output

	# add in smeared stuff; result is returned in both xmm0 and xmm7
	&pxor	("xmm0","xmm7");
	&movdqa	("xmm7","xmm0");
	&ret	();
&function_end_B("_vpaes_schedule_round");
632 | |||
633 | ## | ||
634 | ## .aes_schedule_transform | ||
635 | ## | ||
636 | ## Linear-transform %xmm0 according to tables at (%ebx) | ||
637 | ## | ||
638 | ## Output in %xmm0 | ||
639 | ## Clobbers %xmm1, %xmm2 | ||
640 | ## | ||
&function_begin_B("_vpaes_schedule_transform");
	# Linear-transform xmm0 using the pair of 16-byte lookup tables at
	# ($base) and ($base+16): split into low/high nibbles, pshufb each
	# through its table, and xor the halves.  Clobbers xmm1, xmm2.
	&movdqa	("xmm2",&QWP($k_s0F,$const));	# 0x0F nibble mask
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm0");		# keep high nibbles (still shifted)
	&psrld	("xmm1",4);			# xmm1 = high nibbles
	&pand	("xmm0","xmm2");		# xmm0 = low nibbles
	&movdqa	("xmm2",&QWP(0,$base));		# low-nibble table
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP(16,$base));	# high-nibble table
	&pshufb	("xmm0","xmm1");
	&pxor	("xmm0","xmm2");		# combine halves -> result in xmm0
	&ret	();
&function_end_B("_vpaes_schedule_transform");
654 | |||
655 | ## | ||
656 | ## .aes_schedule_mangle | ||
657 | ## | ||
658 | ## Mangle xmm0 from (basis-transformed) standard version | ||
659 | ## to our version. | ||
660 | ## | ||
661 | ## On encrypt, | ||
662 | ## xor with 0x63 | ||
663 | ## multiply by circulant 0,1,1,1 | ||
664 | ## apply shiftrows transform | ||
665 | ## | ||
666 | ## On decrypt, | ||
667 | ## xor with 0x63 | ||
668 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
669 | ## deskew | ||
670 | ## apply shiftrows transform | ||
671 | ## | ||
672 | ## | ||
673 | ## Writes out to (%edx), and increments or decrements it | ||
674 | ## Keeps track of round number mod 4 in %ecx | ||
675 | ## Preserves xmm0 | ||
676 | ## Clobbers xmm1-xmm5 | ||
677 | ## | ||
&function_begin_B("_vpaes_schedule_mangle");
	# Mangle a round key in xmm0 (preserved) and write it to ($key);
	# $out selects encrypt (0) vs decrypt (non-0) mangling, $magic holds
	# the shiftrows table offset (round number mod 4, scaled by 16).
	&movdqa	("xmm4","xmm0");	# save xmm0 for later
	&movdqa	("xmm5",&QWP($k_mc_forward,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_dec"));

	# encrypting: xor with 0x63, then multiply by circulant 0,1,1,1
	# via three successive mc_forward rotations xored together
	&add	($key,16);		# encrypt keys are written in ascending order
	&pxor	("xmm4",&QWP($k_s63,$const));
	&pshufb	("xmm4","xmm5");
	&movdqa	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");

	&jmp	(&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec",16);
	# decrypting: inverse mix columns via the four nibble-table pairs
	# starting at k_dksd, interleaved with mc_forward rotations
	&movdqa	("xmm2",&QWP($k_s0F,$const));
	&lea	($inp,&DWP($k_dksd,$const));
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm4");
	&psrld	("xmm1",4);		# 1 = hi
	&pand	("xmm4","xmm2");	# 4 = lo

	&movdqa	("xmm2",&QWP(0,$inp));
	&pshufb	("xmm2","xmm4");
	&movdqa	("xmm3",&QWP(0x10,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x20,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x30,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x40,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x50,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x60,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x70,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");

	&add	($key,-16);		# decrypt keys are written in descending order

&set_label("schedule_mangle_both");
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));	# shiftrows permutation for this round
	&pshufb	("xmm3","xmm1");
	&add	($magic,-16);
	&and	($magic,0x30);		# step the mod-4 round tracker
	&movdqu	(&QWP(0,$key),"xmm3");	# store mangled round key
	&ret	();
&function_end_B("_vpaes_schedule_mangle");
745 | |||
746 | # | ||
747 | # Interface to OpenSSL | ||
748 | # | ||
&function_begin("${PREFIX}_set_encrypt_key");
	# Public entry: set_encrypt_key(inp, bits, key).  Sets up an aligned
	# scratch area on the stack and runs the schedule in encrypt mode.
	&mov	($inp,&wparam(0));	# inp
	&lea	($base,&DWP(-56,"esp"));# 56 bytes of scratch below esp
	&mov	($round,&wparam(1));	# bits
	&and	($base,-16);		# 16-byte align the scratch area
	&mov	($key,&wparam(2));	# key
	&xchg	($base,"esp");		# alloca
	&mov	(&DWP(48,"esp"),$base);	# remember caller's esp for the epilogue

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
	&mov	($magic,0x30);		# initial shiftrows table offset
	&mov	($out,0);		# direction flag: 0 = encrypt

	# PIC: $const <- &_vpaes_consts+0x30, computed relative to the
	# return address of the call (which is the "pic_point" label).
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_schedule_core");
&set_label("pic_point");

	&mov	("esp",&DWP(48,"esp"));	# restore caller's esp
	&xor	("eax","eax");		# return 0
&function_end("${PREFIX}_set_encrypt_key");
772 | |||
&function_begin("${PREFIX}_set_decrypt_key");
	# Public entry: set_decrypt_key(inp, bits, key).  Same stack setup as
	# set_encrypt_key, but points $key at the END of the schedule (decrypt
	# round keys are written backwards) and runs in decrypt mode.
	&mov	($inp,&wparam(0));	# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($round,&wparam(1));	# bits
	&and	($base,-16);		# 16-byte align the scratch area
	&mov	($key,&wparam(2));	# key
	&xchg	($base,"esp");		# alloca
	&mov	(&DWP(48,"esp"),$base);	# remember caller's esp for the epilogue

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
	&shl	($base,4);
	&lea	($key,&DWP(16,$key,$base));	# point past the last round key

	&mov	($out,1);		# direction flag: 1 = decrypt
	&mov	($magic,$round);	# initial shiftrows offset depends on key size:
	&shr	($magic,1);
	&and	($magic,32);
	&xor	($magic,32);		# nbits==192?0:32;

	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_schedule_core");
&set_label("pic_point");

	&mov	("esp",&DWP(48,"esp"));	# restore caller's esp
	&xor	("eax","eax");		# return 0
&function_end("${PREFIX}_set_decrypt_key");
802 | |||
&function_begin("${PREFIX}_encrypt");
	# Public entry: encrypt one 16-byte block: encrypt(inp, out, key).
	# Loads the constant tables (preheat), then runs the SSSE3 core.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");	# pic_point = this call's return address
&set_label("pic_point");
	&mov	($inp,&wparam(0));	# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));	# out
	&and	($base,-16);		# 16-byte align the scratch area
	&mov	($key,&wparam(2));	# key
	&xchg	($base,"esp");		# alloca
	&mov	(&DWP(48,"esp"),$base);	# remember caller's esp

	&movdqu	("xmm0",&QWP(0,$inp));	# load one (possibly unaligned) block
	&call	("_vpaes_encrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");	# store the encrypted block

	&mov	("esp",&DWP(48,"esp"));	# restore caller's esp
&function_end("${PREFIX}_encrypt");
821 | |||
&function_begin("${PREFIX}_decrypt");
	# Public entry: decrypt one 16-byte block: decrypt(inp, out, key).
	# Mirrors ${PREFIX}_encrypt but runs the decrypt core.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");	# pic_point = this call's return address
&set_label("pic_point");
	&mov	($inp,&wparam(0));	# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));	# out
	&and	($base,-16);		# 16-byte align the scratch area
	&mov	($key,&wparam(2));	# key
	&xchg	($base,"esp");		# alloca
	&mov	(&DWP(48,"esp"),$base);	# remember caller's esp

	&movdqu	("xmm0",&QWP(0,$inp));	# load one (possibly unaligned) block
	&call	("_vpaes_decrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");	# store the decrypted block

	&mov	("esp",&DWP(48,"esp"));	# restore caller's esp
&function_end("${PREFIX}_decrypt");
840 | |||
&function_begin("${PREFIX}_cbc_encrypt");
	# Public entry: CBC en/decrypt: cbc_encrypt(inp, out, len, key, ivp, enc).
	# Processes len in whole 16-byte blocks (any tail is ignored) and
	# writes the final IV back through ivp; returns immediately if
	# len < 16, leaving the IV untouched.
	#
	# Fix: the "save key" store below was missing its statement
	# terminator; Perl parsed it and the following &mov as a single
	# bitwise-AND expression, which only emitted correct code by accident.
	&mov	($inp,&wparam(0));		# inp
	&mov	($out,&wparam(1));		# out
	&mov	($round,&wparam(2));		# len
	&mov	($key,&wparam(3));		# key
	&sub	($round,16);
	&jc	(&label("cbc_abort"));		# len < 16: nothing to do
	&lea	($base,&DWP(-56,"esp"));	# 56 bytes of scratch below esp
	&mov	($const,&wparam(4));		# ivp
	&and	($base,-16);			# 16-byte align the scratch area
	&mov	($magic,&wparam(5));		# enc
	&xchg	($base,"esp");			# alloca
	&movdqu	("xmm1",&QWP(0,$const));	# load IV
	&sub	($out,$inp);			# keep out as a delta from inp
	&mov	(&DWP(48,"esp"),$base);		# remember caller's esp

	&mov	(&DWP(0,"esp"),$out);		# save out
	&mov	(&DWP(4,"esp"),$key);		# save key
	&mov	(&DWP(8,"esp"),$const);		# save ivp
	&mov	($out,$round);			# $out works as $len

	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");		# pic_point = this call's return address
&set_label("pic_point");
	&cmp	($magic,0);			# enc==0 selects decryption
	&je	(&label("cbc_dec_loop"));
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&pxor	("xmm0","xmm1");		# inp^=iv
	&call	("_vpaes_encrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&movdqa	("xmm1","xmm0");		# ciphertext becomes the next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_enc_loop"));
	&jmp	(&label("cbc_done"));

&set_label("cbc_dec_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV (this ciphertext)
	&call	("_vpaes_decrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_dec_loop"));

&set_label("cbc_done");
	&mov	($base,&DWP(8,"esp"));		# restore ivp
	&mov	("esp",&DWP(48,"esp"));		# restore caller's esp
	&movdqu	(&QWP(0,$base),"xmm1");		# write back final IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
902 | |||
903 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl new file mode 100644 index 0000000000..37998db5e1 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl | |||
@@ -0,0 +1,1206 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | ###################################################################### | ||
4 | ## Constant-time SSSE3 AES core implementation. | ||
5 | ## version 0.1 | ||
6 | ## | ||
7 | ## By Mike Hamburg (Stanford University), 2009 | ||
8 | ## Public domain. | ||
9 | ## | ||
10 | ## For details see http://shiftleft.org/papers/vector_aes/ and | ||
11 | ## http://crypto.stanford.edu/vpaes/. | ||
12 | |||
13 | ###################################################################### | ||
14 | # September 2011. | ||
15 | # | ||
16 | # Interface to OpenSSL as "almost" drop-in replacement for | ||
17 | # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt | ||
18 | # doesn't handle partial vectors (doesn't have to if called from | ||
19 | # EVP only). "Drop-in" implies that this module doesn't share key | ||
20 | # schedule structure with the original nor does it make assumption | ||
21 | # about its alignment... | ||
22 | # | ||
23 | # Performance summary. aes-x86_64.pl column lists large-block CBC | ||
24 | # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per | ||
25 | # byte processed with 128-bit key, and vpaes-x86_64.pl column - | ||
26 | # [also large-block CBC] encrypt/decrypt. | ||
27 | # | ||
28 | # aes-x86_64.pl vpaes-x86_64.pl | ||
29 | # | ||
30 | # Core 2(**) 30.5/43.7/14.3 21.8/25.7(***) | ||
31 | # Nehalem 30.5/42.2/14.6 9.8/11.8 | ||
32 | # Atom 63.9/79.0/32.1 64.0/84.8(***) | ||
33 | # | ||
34 | # (*) "Hyper-threading" in the context refers rather to cache shared | ||
35 | # among multiple cores, than to specifically Intel HTT. As vast | ||
36 | # majority of contemporary cores share cache, slower code path | ||
37 | # is common place. In other words "with-hyper-threading-off" | ||
38 | # results are presented mostly for reference purposes. | ||
39 | # | ||
40 | # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. | ||
41 | # | ||
42 | # (***) Less impressive improvement on Core 2 and Atom is due to slow | ||
43 | # pshufb, yet it's respectable +40%/78% improvement on Core 2 | ||
44 | # (as implied, over "hyper-threading-safe" code path). | ||
45 | # | ||
46 | # <appro@openssl.org> | ||
47 | |||
# Command line is [flavour] output-file; a lone argument containing a
# dot is treated as the output filename with no explicit flavour.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets (nasm/masm flavours, mingw64, or a .asm output) need
# different handling downstream.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or in the
# shared perlasm directory, and pipe all generated code through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";	# NOTE(review): 2-arg piped open is the established perlasm idiom

$PREFIX="vpaes";	# symbol prefix for the public entry points
63 | $code.=<<___; | ||
64 | .text | ||
65 | |||
66 | ## | ||
67 | ## _aes_encrypt_core | ||
68 | ## | ||
69 | ## AES-encrypt %xmm0. | ||
70 | ## | ||
71 | ## Inputs: | ||
72 | ## %xmm0 = input | ||
73 | ## %xmm9-%xmm15 as in _vpaes_preheat | ||
74 | ## (%rdx) = scheduled keys | ||
75 | ## | ||
76 | ## Output in %xmm0 | ||
77 | ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax | ||
78 | ## Preserves %xmm6 - %xmm8 so you get some local vectors | ||
79 | ## | ||
80 | ## | ||
81 | .type _vpaes_encrypt_core,\@abi-omnipotent | ||
82 | .align 16 | ||
83 | _vpaes_encrypt_core: | ||
84 | mov %rdx, %r9 | ||
85 | mov \$16, %r11 | ||
86 | mov 240(%rdx),%eax | ||
87 | movdqa %xmm9, %xmm1 | ||
88 | movdqa .Lk_ipt(%rip), %xmm2 # iptlo | ||
89 | pandn %xmm0, %xmm1 | ||
90 | movdqu (%r9), %xmm5 # round0 key | ||
91 | psrld \$4, %xmm1 | ||
92 | pand %xmm9, %xmm0 | ||
93 | pshufb %xmm0, %xmm2 | ||
94 | movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi | ||
95 | pshufb %xmm1, %xmm0 | ||
96 | pxor %xmm5, %xmm2 | ||
97 | pxor %xmm2, %xmm0 | ||
98 | add \$16, %r9 | ||
99 | lea .Lk_mc_backward(%rip),%r10 | ||
100 | jmp .Lenc_entry | ||
101 | |||
102 | .align 16 | ||
103 | .Lenc_loop: | ||
104 | # middle of middle round | ||
105 | movdqa %xmm13, %xmm4 # 4 : sb1u | ||
106 | pshufb %xmm2, %xmm4 # 4 = sb1u | ||
107 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
108 | movdqa %xmm12, %xmm0 # 0 : sb1t | ||
109 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
110 | pxor %xmm4, %xmm0 # 0 = A | ||
111 | movdqa %xmm15, %xmm5 # 4 : sb2u | ||
112 | pshufb %xmm2, %xmm5 # 4 = sb2u | ||
113 | movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] | ||
114 | movdqa %xmm14, %xmm2 # 2 : sb2t | ||
115 | pshufb %xmm3, %xmm2 # 2 = sb2t | ||
116 | pxor %xmm5, %xmm2 # 2 = 2A | ||
117 | movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] | ||
118 | movdqa %xmm0, %xmm3 # 3 = A | ||
119 | pshufb %xmm1, %xmm0 # 0 = B | ||
120 | add \$16, %r9 # next key | ||
121 | pxor %xmm2, %xmm0 # 0 = 2A+B | ||
122 | pshufb %xmm4, %xmm3 # 3 = D | ||
123 | add \$16, %r11 # next mc | ||
124 | pxor %xmm0, %xmm3 # 3 = 2A+B+D | ||
125 | pshufb %xmm1, %xmm0 # 0 = 2B+C | ||
126 | and \$0x30, %r11 # ... mod 4 | ||
127 | pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D | ||
128 | sub \$1,%rax # nr-- | ||
129 | |||
130 | .Lenc_entry: | ||
131 | # top of round | ||
132 | movdqa %xmm9, %xmm1 # 1 : i | ||
133 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
134 | psrld \$4, %xmm1 # 1 = i | ||
135 | pand %xmm9, %xmm0 # 0 = k | ||
136 | movdqa %xmm11, %xmm5 # 2 : a/k | ||
137 | pshufb %xmm0, %xmm5 # 2 = a/k | ||
138 | pxor %xmm1, %xmm0 # 0 = j | ||
139 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
140 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
141 | pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k | ||
142 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
143 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
144 | pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k | ||
145 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
146 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
147 | pxor %xmm0, %xmm2 # 2 = io | ||
148 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
149 | movdqu (%r9), %xmm5 | ||
150 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
151 | pxor %xmm1, %xmm3 # 3 = jo | ||
152 | jnz .Lenc_loop | ||
153 | |||
154 | # middle of last round | ||
155 | movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo | ||
156 | movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 | ||
157 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
158 | pxor %xmm5, %xmm4 # 4 = sb1u + k | ||
159 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
160 | movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] | ||
161 | pxor %xmm4, %xmm0 # 0 = A | ||
162 | pshufb %xmm1, %xmm0 | ||
163 | ret | ||
164 | .size _vpaes_encrypt_core,.-_vpaes_encrypt_core | ||
165 | |||
166 | ## | ||
167 | ## Decryption core | ||
168 | ## | ||
169 | ## Same API as encryption core. | ||
170 | ## | ||
171 | .type _vpaes_decrypt_core,\@abi-omnipotent | ||
172 | .align 16 | ||
173 | _vpaes_decrypt_core: | ||
174 | mov %rdx, %r9 # load key | ||
175 | mov 240(%rdx),%eax | ||
176 | movdqa %xmm9, %xmm1 | ||
177 | movdqa .Lk_dipt(%rip), %xmm2 # iptlo | ||
178 | pandn %xmm0, %xmm1 | ||
179 | mov %rax, %r11 | ||
180 | psrld \$4, %xmm1 | ||
181 | movdqu (%r9), %xmm5 # round0 key | ||
182 | shl \$4, %r11 | ||
183 | pand %xmm9, %xmm0 | ||
184 | pshufb %xmm0, %xmm2 | ||
185 | movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi | ||
186 | xor \$0x30, %r11 | ||
187 | lea .Lk_dsbd(%rip),%r10 | ||
188 | pshufb %xmm1, %xmm0 | ||
189 | and \$0x30, %r11 | ||
190 | pxor %xmm5, %xmm2 | ||
191 | movdqa .Lk_mc_forward+48(%rip), %xmm5 | ||
192 | pxor %xmm2, %xmm0 | ||
193 | add \$16, %r9 | ||
194 | add %r10, %r11 | ||
195 | jmp .Ldec_entry | ||
196 | |||
197 | .align 16 | ||
198 | .Ldec_loop: | ||
199 | ## | ||
200 | ## Inverse mix columns | ||
201 | ## | ||
202 | movdqa -0x20(%r10),%xmm4 # 4 : sb9u | ||
203 | pshufb %xmm2, %xmm4 # 4 = sb9u | ||
204 | pxor %xmm0, %xmm4 | ||
205 | movdqa -0x10(%r10),%xmm0 # 0 : sb9t | ||
206 | pshufb %xmm3, %xmm0 # 0 = sb9t | ||
207 | pxor %xmm4, %xmm0 # 0 = ch | ||
208 | add \$16, %r9 # next round key | ||
209 | |||
210 | pshufb %xmm5, %xmm0 # MC ch | ||
211 | movdqa 0x00(%r10),%xmm4 # 4 : sbdu | ||
212 | pshufb %xmm2, %xmm4 # 4 = sbdu | ||
213 | pxor %xmm0, %xmm4 # 4 = ch | ||
214 | movdqa 0x10(%r10),%xmm0 # 0 : sbdt | ||
215 | pshufb %xmm3, %xmm0 # 0 = sbdt | ||
216 | pxor %xmm4, %xmm0 # 0 = ch | ||
217 | sub \$1,%rax # nr-- | ||
218 | |||
219 | pshufb %xmm5, %xmm0 # MC ch | ||
220 | movdqa 0x20(%r10),%xmm4 # 4 : sbbu | ||
221 | pshufb %xmm2, %xmm4 # 4 = sbbu | ||
222 | pxor %xmm0, %xmm4 # 4 = ch | ||
223 | movdqa 0x30(%r10),%xmm0 # 0 : sbbt | ||
224 | pshufb %xmm3, %xmm0 # 0 = sbbt | ||
225 | pxor %xmm4, %xmm0 # 0 = ch | ||
226 | |||
227 | pshufb %xmm5, %xmm0 # MC ch | ||
228 | movdqa 0x40(%r10),%xmm4 # 4 : sbeu | ||
229 | pshufb %xmm2, %xmm4 # 4 = sbeu | ||
230 | pxor %xmm0, %xmm4 # 4 = ch | ||
231 | movdqa 0x50(%r10),%xmm0 # 0 : sbet | ||
232 | pshufb %xmm3, %xmm0 # 0 = sbet | ||
233 | pxor %xmm4, %xmm0 # 0 = ch | ||
234 | |||
235 | palignr \$12, %xmm5, %xmm5 | ||
236 | |||
237 | .Ldec_entry: | ||
238 | # top of round | ||
239 | movdqa %xmm9, %xmm1 # 1 : i | ||
240 | pandn %xmm0, %xmm1 # 1 = i<<4 | ||
241 | psrld \$4, %xmm1 # 1 = i | ||
242 | pand %xmm9, %xmm0 # 0 = k | ||
243 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
244 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
245 | pxor %xmm1, %xmm0 # 0 = j | ||
246 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
247 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
248 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
249 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
250 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
251 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
252 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
253 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
254 | pxor %xmm0, %xmm2 # 2 = io | ||
255 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
256 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
257 | pxor %xmm1, %xmm3 # 3 = jo | ||
258 | movdqu (%r9), %xmm0 | ||
259 | jnz .Ldec_loop | ||
260 | |||
261 | # middle of last round | ||
262 | movdqa 0x60(%r10), %xmm4 # 3 : sbou | ||
263 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
264 | pxor %xmm0, %xmm4 # 4 = sb1u + k | ||
265 | movdqa 0x70(%r10), %xmm0 # 0 : sbot | ||
266 | movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 | ||
267 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
268 | pxor %xmm4, %xmm0 # 0 = A | ||
269 | pshufb %xmm2, %xmm0 | ||
270 | ret | ||
271 | .size _vpaes_decrypt_core,.-_vpaes_decrypt_core | ||
272 | |||
273 | ######################################################## | ||
274 | ## ## | ||
275 | ## AES key schedule ## | ||
276 | ## ## | ||
277 | ######################################################## | ||
278 | .type _vpaes_schedule_core,\@abi-omnipotent | ||
279 | .align 16 | ||
280 | _vpaes_schedule_core: | ||
281 | # rdi = key | ||
282 | # rsi = size in bits | ||
283 | # rdx = buffer | ||
284 | # rcx = direction. 0=encrypt, 1=decrypt | ||
285 | |||
286 | call _vpaes_preheat # load the tables | ||
287 | movdqa .Lk_rcon(%rip), %xmm8 # load rcon | ||
288 | movdqu (%rdi), %xmm0 # load key (unaligned) | ||
289 | |||
290 | # input transform | ||
291 | movdqa %xmm0, %xmm3 | ||
292 | lea .Lk_ipt(%rip), %r11 | ||
293 | call _vpaes_schedule_transform | ||
294 | movdqa %xmm0, %xmm7 | ||
295 | |||
296 | lea .Lk_sr(%rip),%r10 | ||
297 | test %rcx, %rcx | ||
298 | jnz .Lschedule_am_decrypting | ||
299 | |||
300 | # encrypting, output zeroth round key after transform | ||
301 | movdqu %xmm0, (%rdx) | ||
302 | jmp .Lschedule_go | ||
303 | |||
304 | .Lschedule_am_decrypting: | ||
305 | # decrypting, output zeroth round key after shiftrows | ||
306 | movdqa (%r8,%r10),%xmm1 | ||
307 | pshufb %xmm1, %xmm3 | ||
308 | movdqu %xmm3, (%rdx) | ||
309 | xor \$0x30, %r8 | ||
310 | |||
311 | .Lschedule_go: | ||
312 | cmp \$192, %esi | ||
313 | ja .Lschedule_256 | ||
314 | je .Lschedule_192 | ||
315 | # 128: fall though | ||
316 | |||
317 | ## | ||
318 | ## .schedule_128 | ||
319 | ## | ||
320 | ## 128-bit specific part of key schedule. | ||
321 | ## | ||
322 | ## This schedule is really simple, because all its parts | ||
323 | ## are accomplished by the subroutines. | ||
324 | ## | ||
325 | .Lschedule_128: | ||
326 | mov \$10, %esi | ||
327 | |||
328 | .Loop_schedule_128: | ||
329 | call _vpaes_schedule_round | ||
330 | dec %rsi | ||
331 | jz .Lschedule_mangle_last | ||
332 | call _vpaes_schedule_mangle # write output | ||
333 | jmp .Loop_schedule_128 | ||
334 | |||
335 | ## | ||
336 | ## .aes_schedule_192 | ||
337 | ## | ||
338 | ## 192-bit specific part of key schedule. | ||
339 | ## | ||
340 | ## The main body of this schedule is the same as the 128-bit | ||
341 | ## schedule, but with more smearing. The long, high side is | ||
342 | ## stored in %xmm7 as before, and the short, low side is in | ||
343 | ## the high bits of %xmm6. | ||
344 | ## | ||
345 | ## This schedule is somewhat nastier, however, because each | ||
346 | ## round produces 192 bits of key material, or 1.5 round keys. | ||
347 | ## Therefore, on each cycle we do 2 rounds and produce 3 round | ||
348 | ## keys. | ||
349 | ## | ||
350 | .align 16 | ||
351 | .Lschedule_192: | ||
352 | movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) | ||
353 | call _vpaes_schedule_transform # input transform | ||
354 | movdqa %xmm0, %xmm6 # save short part | ||
355 | pxor %xmm4, %xmm4 # clear 4 | ||
356 | movhlps %xmm4, %xmm6 # clobber low side with zeros | ||
357 | mov \$4, %esi | ||
358 | |||
359 | .Loop_schedule_192: | ||
360 | call _vpaes_schedule_round | ||
361 | palignr \$8,%xmm6,%xmm0 | ||
362 | call _vpaes_schedule_mangle # save key n | ||
363 | call _vpaes_schedule_192_smear | ||
364 | call _vpaes_schedule_mangle # save key n+1 | ||
365 | call _vpaes_schedule_round | ||
366 | dec %rsi | ||
367 | jz .Lschedule_mangle_last | ||
368 | call _vpaes_schedule_mangle # save key n+2 | ||
369 | call _vpaes_schedule_192_smear | ||
370 | jmp .Loop_schedule_192 | ||
371 | |||
372 | ## | ||
373 | ## .aes_schedule_256 | ||
374 | ## | ||
375 | ## 256-bit specific part of key schedule. | ||
376 | ## | ||
377 | ## The structure here is very similar to the 128-bit | ||
378 | ## schedule, but with an additional "low side" in | ||
379 | ## %xmm6. The low side's rounds are the same as the | ||
380 | ## high side's, except no rcon and no rotation. | ||
381 | ## | ||
382 | .align 16 | ||
383 | .Lschedule_256: | ||
384 | movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) | ||
385 | call _vpaes_schedule_transform # input transform | ||
386 | mov \$7, %esi | ||
387 | |||
388 | .Loop_schedule_256: | ||
389 | call _vpaes_schedule_mangle # output low result | ||
390 | movdqa %xmm0, %xmm6 # save cur_lo in xmm6 | ||
391 | |||
392 | # high round | ||
393 | call _vpaes_schedule_round | ||
394 | dec %rsi | ||
395 | jz .Lschedule_mangle_last | ||
396 | call _vpaes_schedule_mangle | ||
397 | |||
398 | # low round. swap xmm7 and xmm6 | ||
399 | pshufd \$0xFF, %xmm0, %xmm0 | ||
400 | movdqa %xmm7, %xmm5 | ||
401 | movdqa %xmm6, %xmm7 | ||
402 | call _vpaes_schedule_low_round | ||
403 | movdqa %xmm5, %xmm7 | ||
404 | |||
405 | jmp .Loop_schedule_256 | ||
406 | |||
407 | |||
408 | ## | ||
409 | ## .aes_schedule_mangle_last | ||
410 | ## | ||
411 | ## Mangler for last round of key schedule | ||
412 | ## Mangles %xmm0 | ||
413 | ## when encrypting, outputs out(%xmm0) ^ 63 | ||
414 | ## when decrypting, outputs unskew(%xmm0) | ||
415 | ## | ||
416 | ## Always called right before return... jumps to cleanup and exits | ||
417 | ## | ||
418 | .align 16 | ||
419 | .Lschedule_mangle_last: | ||
420 | # schedule last round key from xmm0 | ||
421 | lea .Lk_deskew(%rip),%r11 # prepare to deskew | ||
422 | test %rcx, %rcx | ||
423 | jnz .Lschedule_mangle_last_dec | ||
424 | |||
425 | # encrypting | ||
426 | movdqa (%r8,%r10),%xmm1 | ||
427 | pshufb %xmm1, %xmm0 # output permute | ||
428 | lea .Lk_opt(%rip), %r11 # prepare to output transform | ||
429 | add \$32, %rdx | ||
430 | |||
431 | .Lschedule_mangle_last_dec: | ||
432 | add \$-16, %rdx | ||
433 | pxor .Lk_s63(%rip), %xmm0 | ||
434 | call _vpaes_schedule_transform # output transform | ||
435 | movdqu %xmm0, (%rdx) # save last key | ||
436 | |||
437 | # cleanup | ||
438 | pxor %xmm0, %xmm0 | ||
439 | pxor %xmm1, %xmm1 | ||
440 | pxor %xmm2, %xmm2 | ||
441 | pxor %xmm3, %xmm3 | ||
442 | pxor %xmm4, %xmm4 | ||
443 | pxor %xmm5, %xmm5 | ||
444 | pxor %xmm6, %xmm6 | ||
445 | pxor %xmm7, %xmm7 | ||
446 | ret | ||
447 | .size _vpaes_schedule_core,.-_vpaes_schedule_core | ||
448 | |||
449 | ## | ||
450 | ## .aes_schedule_192_smear | ||
451 | ## | ||
452 | ## Smear the short, low side in the 192-bit key schedule. | ||
453 | ## | ||
454 | ## Inputs: | ||
455 | ## %xmm7: high side, b a x y | ||
456 | ## %xmm6: low side, d c 0 0 | ||
457 | ## %xmm13: 0 | ||
458 | ## | ||
459 | ## Outputs: | ||
460 | ## %xmm6: b+c+d b+c 0 0 | ||
461 | ## %xmm0: b+c+d b+c b a | ||
462 | ## | ||
463 | .type _vpaes_schedule_192_smear,\@abi-omnipotent | ||
464 | .align 16 | ||
465 | _vpaes_schedule_192_smear: | ||
466 | pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 | ||
467 | pxor %xmm0, %xmm6 # -> c+d c 0 0 | ||
468 | pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a | ||
469 | pxor %xmm0, %xmm6 # -> b+c+d b+c b a | ||
470 | movdqa %xmm6, %xmm0 | ||
471 | pxor %xmm1, %xmm1 | ||
472 | movhlps %xmm1, %xmm6 # clobber low side with zeros | ||
473 | ret | ||
474 | .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear | ||
475 | |||
476 | ## | ||
477 | ## .aes_schedule_round | ||
478 | ## | ||
479 | ## Runs one main round of the key schedule on %xmm0, %xmm7 | ||
480 | ## | ||
481 | ## Specifically, runs subbytes on the high dword of %xmm0 | ||
482 | ## then rotates it by one byte and xors into the low dword of | ||
483 | ## %xmm7. | ||
484 | ## | ||
485 | ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for | ||
486 | ## next rcon. | ||
487 | ## | ||
488 | ## Smears the dwords of %xmm7 by xoring the low into the | ||
489 | ## second low, result into third, result into highest. | ||
490 | ## | ||
491 | ## Returns results in %xmm7 = %xmm0. | ||
492 | ## Clobbers %xmm1-%xmm4, %r11. | ||
493 | ## | ||
494 | .type _vpaes_schedule_round,\@abi-omnipotent | ||
495 | .align 16 | ||
496 | _vpaes_schedule_round: | ||
497 | # extract rcon from xmm8 | ||
498 | pxor %xmm1, %xmm1 | ||
499 | palignr \$15, %xmm8, %xmm1 | ||
500 | palignr \$15, %xmm8, %xmm8 | ||
501 | pxor %xmm1, %xmm7 | ||
502 | |||
503 | # rotate | ||
504 | pshufd \$0xFF, %xmm0, %xmm0 | ||
505 | palignr \$1, %xmm0, %xmm0 | ||
506 | |||
507 | # fall through... | ||
508 | |||
509 | # low round: same as high round, but no rotation and no rcon. | ||
510 | _vpaes_schedule_low_round: | ||
511 | # smear xmm7 | ||
512 | movdqa %xmm7, %xmm1 | ||
513 | pslldq \$4, %xmm7 | ||
514 | pxor %xmm1, %xmm7 | ||
515 | movdqa %xmm7, %xmm1 | ||
516 | pslldq \$8, %xmm7 | ||
517 | pxor %xmm1, %xmm7 | ||
518 | pxor .Lk_s63(%rip), %xmm7 | ||
519 | |||
520 | # subbytes | ||
521 | movdqa %xmm9, %xmm1 | ||
522 | pandn %xmm0, %xmm1 | ||
523 | psrld \$4, %xmm1 # 1 = i | ||
524 | pand %xmm9, %xmm0 # 0 = k | ||
525 | movdqa %xmm11, %xmm2 # 2 : a/k | ||
526 | pshufb %xmm0, %xmm2 # 2 = a/k | ||
527 | pxor %xmm1, %xmm0 # 0 = j | ||
528 | movdqa %xmm10, %xmm3 # 3 : 1/i | ||
529 | pshufb %xmm1, %xmm3 # 3 = 1/i | ||
530 | pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k | ||
531 | movdqa %xmm10, %xmm4 # 4 : 1/j | ||
532 | pshufb %xmm0, %xmm4 # 4 = 1/j | ||
533 | pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k | ||
534 | movdqa %xmm10, %xmm2 # 2 : 1/iak | ||
535 | pshufb %xmm3, %xmm2 # 2 = 1/iak | ||
536 | pxor %xmm0, %xmm2 # 2 = io | ||
537 | movdqa %xmm10, %xmm3 # 3 : 1/jak | ||
538 | pshufb %xmm4, %xmm3 # 3 = 1/jak | ||
539 | pxor %xmm1, %xmm3 # 3 = jo | ||
540 | movdqa %xmm13, %xmm4 # 4 : sbou | ||
541 | pshufb %xmm2, %xmm4 # 4 = sbou | ||
542 | movdqa %xmm12, %xmm0 # 0 : sbot | ||
543 | pshufb %xmm3, %xmm0 # 0 = sb1t | ||
544 | pxor %xmm4, %xmm0 # 0 = sbox output | ||
545 | |||
546 | # add in smeared stuff | ||
547 | pxor %xmm7, %xmm0 | ||
548 | movdqa %xmm0, %xmm7 | ||
549 | ret | ||
550 | .size _vpaes_schedule_round,.-_vpaes_schedule_round | ||
551 | |||
552 | ## | ||
553 | ## .aes_schedule_transform | ||
554 | ## | ||
555 | ## Linear-transform %xmm0 according to tables at (%r11) | ||
556 | ## | ||
557 | ## Requires that %xmm9 = 0x0F0F... as in preheat | ||
558 | ## Output in %xmm0 | ||
559 | ## Clobbers %xmm1, %xmm2 | ||
560 | ## | ||
561 | .type _vpaes_schedule_transform,\@abi-omnipotent | ||
562 | .align 16 | ||
563 | _vpaes_schedule_transform: | ||
564 | movdqa %xmm9, %xmm1 | ||
565 | pandn %xmm0, %xmm1 | ||
566 | psrld \$4, %xmm1 | ||
567 | pand %xmm9, %xmm0 | ||
568 | movdqa (%r11), %xmm2 # lo | ||
569 | pshufb %xmm0, %xmm2 | ||
570 | movdqa 16(%r11), %xmm0 # hi | ||
571 | pshufb %xmm1, %xmm0 | ||
572 | pxor %xmm2, %xmm0 | ||
573 | ret | ||
574 | .size _vpaes_schedule_transform,.-_vpaes_schedule_transform | ||
575 | |||
576 | ## | ||
577 | ## .aes_schedule_mangle | ||
578 | ## | ||
579 | ## Mangle xmm0 from (basis-transformed) standard version | ||
580 | ## to our version. | ||
581 | ## | ||
582 | ## On encrypt, | ||
583 | ## xor with 0x63 | ||
584 | ## multiply by circulant 0,1,1,1 | ||
585 | ## apply shiftrows transform | ||
586 | ## | ||
587 | ## On decrypt, | ||
588 | ## xor with 0x63 | ||
589 | ## multiply by "inverse mixcolumns" circulant E,B,D,9 | ||
590 | ## deskew | ||
591 | ## apply shiftrows transform | ||
592 | ## | ||
593 | ## | ||
594 | ## Writes out to (%rdx), and increments or decrements it | ||
595 | ## Keeps track of round number mod 4 in %r8 | ||
596 | ## Preserves xmm0 | ||
597 | ## Clobbers xmm1-xmm5 | ||
598 | ## | ||
599 | .type _vpaes_schedule_mangle,\@abi-omnipotent | ||
600 | .align 16 | ||
601 | _vpaes_schedule_mangle: | ||
602 | movdqa %xmm0, %xmm4 # save xmm0 for later | ||
603 | movdqa .Lk_mc_forward(%rip),%xmm5 | ||
604 | test %rcx, %rcx | ||
605 | jnz .Lschedule_mangle_dec | ||
606 | |||
607 | # encrypting | ||
608 | add \$16, %rdx | ||
609 | pxor .Lk_s63(%rip),%xmm4 | ||
610 | pshufb %xmm5, %xmm4 | ||
611 | movdqa %xmm4, %xmm3 | ||
612 | pshufb %xmm5, %xmm4 | ||
613 | pxor %xmm4, %xmm3 | ||
614 | pshufb %xmm5, %xmm4 | ||
615 | pxor %xmm4, %xmm3 | ||
616 | |||
617 | jmp .Lschedule_mangle_both | ||
618 | .align 16 | ||
619 | .Lschedule_mangle_dec: | ||
620 | # inverse mix columns | ||
621 | lea .Lk_dksd(%rip),%r11 | ||
622 | movdqa %xmm9, %xmm1 | ||
623 | pandn %xmm4, %xmm1 | ||
624 | psrld \$4, %xmm1 # 1 = hi | ||
625 | pand %xmm9, %xmm4 # 4 = lo | ||
626 | |||
627 | movdqa 0x00(%r11), %xmm2 | ||
628 | pshufb %xmm4, %xmm2 | ||
629 | movdqa 0x10(%r11), %xmm3 | ||
630 | pshufb %xmm1, %xmm3 | ||
631 | pxor %xmm2, %xmm3 | ||
632 | pshufb %xmm5, %xmm3 | ||
633 | |||
634 | movdqa 0x20(%r11), %xmm2 | ||
635 | pshufb %xmm4, %xmm2 | ||
636 | pxor %xmm3, %xmm2 | ||
637 | movdqa 0x30(%r11), %xmm3 | ||
638 | pshufb %xmm1, %xmm3 | ||
639 | pxor %xmm2, %xmm3 | ||
640 | pshufb %xmm5, %xmm3 | ||
641 | |||
642 | movdqa 0x40(%r11), %xmm2 | ||
643 | pshufb %xmm4, %xmm2 | ||
644 | pxor %xmm3, %xmm2 | ||
645 | movdqa 0x50(%r11), %xmm3 | ||
646 | pshufb %xmm1, %xmm3 | ||
647 | pxor %xmm2, %xmm3 | ||
648 | pshufb %xmm5, %xmm3 | ||
649 | |||
650 | movdqa 0x60(%r11), %xmm2 | ||
651 | pshufb %xmm4, %xmm2 | ||
652 | pxor %xmm3, %xmm2 | ||
653 | movdqa 0x70(%r11), %xmm3 | ||
654 | pshufb %xmm1, %xmm3 | ||
655 | pxor %xmm2, %xmm3 | ||
656 | |||
657 | add \$-16, %rdx | ||
658 | |||
659 | .Lschedule_mangle_both: | ||
660 | movdqa (%r8,%r10),%xmm1 | ||
661 | pshufb %xmm1,%xmm3 | ||
662 | add \$-16, %r8 | ||
663 | and \$0x30, %r8 | ||
664 | movdqu %xmm3, (%rdx) | ||
665 | ret | ||
666 | .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle | ||
667 | |||
668 | # | ||
669 | # Interface to OpenSSL | ||
670 | # | ||
671 | .globl ${PREFIX}_set_encrypt_key | ||
672 | .type ${PREFIX}_set_encrypt_key,\@function,3 | ||
673 | .align 16 | ||
674 | ${PREFIX}_set_encrypt_key: | ||
675 | ___ | ||
676 | $code.=<<___ if ($win64); | ||
677 | lea -0xb8(%rsp),%rsp | ||
678 | movaps %xmm6,0x10(%rsp) | ||
679 | movaps %xmm7,0x20(%rsp) | ||
680 | movaps %xmm8,0x30(%rsp) | ||
681 | movaps %xmm9,0x40(%rsp) | ||
682 | movaps %xmm10,0x50(%rsp) | ||
683 | movaps %xmm11,0x60(%rsp) | ||
684 | movaps %xmm12,0x70(%rsp) | ||
685 | movaps %xmm13,0x80(%rsp) | ||
686 | movaps %xmm14,0x90(%rsp) | ||
687 | movaps %xmm15,0xa0(%rsp) | ||
688 | .Lenc_key_body: | ||
689 | ___ | ||
690 | $code.=<<___; | ||
691 | mov %esi,%eax | ||
692 | shr \$5,%eax | ||
693 | add \$5,%eax | ||
694 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; | ||
695 | |||
696 | mov \$0,%ecx | ||
697 | mov \$0x30,%r8d | ||
698 | call _vpaes_schedule_core | ||
699 | ___ | ||
700 | $code.=<<___ if ($win64); | ||
701 | movaps 0x10(%rsp),%xmm6 | ||
702 | movaps 0x20(%rsp),%xmm7 | ||
703 | movaps 0x30(%rsp),%xmm8 | ||
704 | movaps 0x40(%rsp),%xmm9 | ||
705 | movaps 0x50(%rsp),%xmm10 | ||
706 | movaps 0x60(%rsp),%xmm11 | ||
707 | movaps 0x70(%rsp),%xmm12 | ||
708 | movaps 0x80(%rsp),%xmm13 | ||
709 | movaps 0x90(%rsp),%xmm14 | ||
710 | movaps 0xa0(%rsp),%xmm15 | ||
711 | lea 0xb8(%rsp),%rsp | ||
712 | .Lenc_key_epilogue: | ||
713 | ___ | ||
714 | $code.=<<___; | ||
715 | xor %eax,%eax | ||
716 | ret | ||
717 | .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key | ||
718 | |||
719 | .globl ${PREFIX}_set_decrypt_key | ||
720 | .type ${PREFIX}_set_decrypt_key,\@function,3 | ||
721 | .align 16 | ||
722 | ${PREFIX}_set_decrypt_key: | ||
723 | ___ | ||
724 | $code.=<<___ if ($win64); | ||
725 | lea -0xb8(%rsp),%rsp | ||
726 | movaps %xmm6,0x10(%rsp) | ||
727 | movaps %xmm7,0x20(%rsp) | ||
728 | movaps %xmm8,0x30(%rsp) | ||
729 | movaps %xmm9,0x40(%rsp) | ||
730 | movaps %xmm10,0x50(%rsp) | ||
731 | movaps %xmm11,0x60(%rsp) | ||
732 | movaps %xmm12,0x70(%rsp) | ||
733 | movaps %xmm13,0x80(%rsp) | ||
734 | movaps %xmm14,0x90(%rsp) | ||
735 | movaps %xmm15,0xa0(%rsp) | ||
736 | .Ldec_key_body: | ||
737 | ___ | ||
738 | $code.=<<___; | ||
739 | mov %esi,%eax | ||
740 | shr \$5,%eax | ||
741 | add \$5,%eax | ||
742 | mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; | ||
743 | shl \$4,%eax | ||
744 | lea 16(%rdx,%rax),%rdx | ||
745 | |||
746 | mov \$1,%ecx | ||
747 | mov %esi,%r8d | ||
748 | shr \$1,%r8d | ||
749 | and \$32,%r8d | ||
750 | xor \$32,%r8d # nbits==192?0:32 | ||
751 | call _vpaes_schedule_core | ||
752 | ___ | ||
753 | $code.=<<___ if ($win64); | ||
754 | movaps 0x10(%rsp),%xmm6 | ||
755 | movaps 0x20(%rsp),%xmm7 | ||
756 | movaps 0x30(%rsp),%xmm8 | ||
757 | movaps 0x40(%rsp),%xmm9 | ||
758 | movaps 0x50(%rsp),%xmm10 | ||
759 | movaps 0x60(%rsp),%xmm11 | ||
760 | movaps 0x70(%rsp),%xmm12 | ||
761 | movaps 0x80(%rsp),%xmm13 | ||
762 | movaps 0x90(%rsp),%xmm14 | ||
763 | movaps 0xa0(%rsp),%xmm15 | ||
764 | lea 0xb8(%rsp),%rsp | ||
765 | .Ldec_key_epilogue: | ||
766 | ___ | ||
767 | $code.=<<___; | ||
768 | xor %eax,%eax | ||
769 | ret | ||
770 | .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key | ||
771 | |||
772 | .globl ${PREFIX}_encrypt | ||
773 | .type ${PREFIX}_encrypt,\@function,3 | ||
774 | .align 16 | ||
775 | ${PREFIX}_encrypt: | ||
776 | ___ | ||
777 | $code.=<<___ if ($win64); | ||
778 | lea -0xb8(%rsp),%rsp | ||
779 | movaps %xmm6,0x10(%rsp) | ||
780 | movaps %xmm7,0x20(%rsp) | ||
781 | movaps %xmm8,0x30(%rsp) | ||
782 | movaps %xmm9,0x40(%rsp) | ||
783 | movaps %xmm10,0x50(%rsp) | ||
784 | movaps %xmm11,0x60(%rsp) | ||
785 | movaps %xmm12,0x70(%rsp) | ||
786 | movaps %xmm13,0x80(%rsp) | ||
787 | movaps %xmm14,0x90(%rsp) | ||
788 | movaps %xmm15,0xa0(%rsp) | ||
789 | .Lenc_body: | ||
790 | ___ | ||
791 | $code.=<<___; | ||
792 | movdqu (%rdi),%xmm0 | ||
793 | call _vpaes_preheat | ||
794 | call _vpaes_encrypt_core | ||
795 | movdqu %xmm0,(%rsi) | ||
796 | ___ | ||
797 | $code.=<<___ if ($win64); | ||
798 | movaps 0x10(%rsp),%xmm6 | ||
799 | movaps 0x20(%rsp),%xmm7 | ||
800 | movaps 0x30(%rsp),%xmm8 | ||
801 | movaps 0x40(%rsp),%xmm9 | ||
802 | movaps 0x50(%rsp),%xmm10 | ||
803 | movaps 0x60(%rsp),%xmm11 | ||
804 | movaps 0x70(%rsp),%xmm12 | ||
805 | movaps 0x80(%rsp),%xmm13 | ||
806 | movaps 0x90(%rsp),%xmm14 | ||
807 | movaps 0xa0(%rsp),%xmm15 | ||
808 | lea 0xb8(%rsp),%rsp | ||
809 | .Lenc_epilogue: | ||
810 | ___ | ||
811 | $code.=<<___; | ||
812 | ret | ||
813 | .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt | ||
814 | |||
815 | .globl ${PREFIX}_decrypt | ||
816 | .type ${PREFIX}_decrypt,\@function,3 | ||
817 | .align 16 | ||
818 | ${PREFIX}_decrypt: | ||
819 | ___ | ||
820 | $code.=<<___ if ($win64); | ||
821 | lea -0xb8(%rsp),%rsp | ||
822 | movaps %xmm6,0x10(%rsp) | ||
823 | movaps %xmm7,0x20(%rsp) | ||
824 | movaps %xmm8,0x30(%rsp) | ||
825 | movaps %xmm9,0x40(%rsp) | ||
826 | movaps %xmm10,0x50(%rsp) | ||
827 | movaps %xmm11,0x60(%rsp) | ||
828 | movaps %xmm12,0x70(%rsp) | ||
829 | movaps %xmm13,0x80(%rsp) | ||
830 | movaps %xmm14,0x90(%rsp) | ||
831 | movaps %xmm15,0xa0(%rsp) | ||
832 | .Ldec_body: | ||
833 | ___ | ||
834 | $code.=<<___; | ||
835 | movdqu (%rdi),%xmm0 | ||
836 | call _vpaes_preheat | ||
837 | call _vpaes_decrypt_core | ||
838 | movdqu %xmm0,(%rsi) | ||
839 | ___ | ||
840 | $code.=<<___ if ($win64); | ||
841 | movaps 0x10(%rsp),%xmm6 | ||
842 | movaps 0x20(%rsp),%xmm7 | ||
843 | movaps 0x30(%rsp),%xmm8 | ||
844 | movaps 0x40(%rsp),%xmm9 | ||
845 | movaps 0x50(%rsp),%xmm10 | ||
846 | movaps 0x60(%rsp),%xmm11 | ||
847 | movaps 0x70(%rsp),%xmm12 | ||
848 | movaps 0x80(%rsp),%xmm13 | ||
849 | movaps 0x90(%rsp),%xmm14 | ||
850 | movaps 0xa0(%rsp),%xmm15 | ||
851 | lea 0xb8(%rsp),%rsp | ||
852 | .Ldec_epilogue: | ||
853 | ___ | ||
854 | $code.=<<___; | ||
855 | ret | ||
856 | .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt | ||
857 | ___ | ||
858 | { | ||
859 | my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||
860 | # void AES_cbc_encrypt (const void char *inp, unsigned char *out, | ||
861 | # size_t length, const AES_KEY *key, | ||
862 | # unsigned char *ivp,const int enc); | ||
863 | $code.=<<___; | ||
864 | .globl ${PREFIX}_cbc_encrypt | ||
865 | .type ${PREFIX}_cbc_encrypt,\@function,6 | ||
866 | .align 16 | ||
867 | ${PREFIX}_cbc_encrypt: | ||
868 | xchg $key,$len | ||
869 | ___ | ||
870 | ($len,$key)=($key,$len); | ||
871 | $code.=<<___; | ||
872 | sub \$16,$len | ||
873 | jc .Lcbc_abort | ||
874 | ___ | ||
875 | $code.=<<___ if ($win64); | ||
876 | lea -0xb8(%rsp),%rsp | ||
877 | movaps %xmm6,0x10(%rsp) | ||
878 | movaps %xmm7,0x20(%rsp) | ||
879 | movaps %xmm8,0x30(%rsp) | ||
880 | movaps %xmm9,0x40(%rsp) | ||
881 | movaps %xmm10,0x50(%rsp) | ||
882 | movaps %xmm11,0x60(%rsp) | ||
883 | movaps %xmm12,0x70(%rsp) | ||
884 | movaps %xmm13,0x80(%rsp) | ||
885 | movaps %xmm14,0x90(%rsp) | ||
886 | movaps %xmm15,0xa0(%rsp) | ||
887 | .Lcbc_body: | ||
888 | ___ | ||
889 | $code.=<<___; | ||
890 | movdqu ($ivp),%xmm6 # load IV | ||
891 | sub $inp,$out | ||
892 | call _vpaes_preheat | ||
893 | cmp \$0,${enc}d | ||
894 | je .Lcbc_dec_loop | ||
895 | jmp .Lcbc_enc_loop | ||
896 | .align 16 | ||
897 | .Lcbc_enc_loop: | ||
898 | movdqu ($inp),%xmm0 | ||
899 | pxor %xmm6,%xmm0 | ||
900 | call _vpaes_encrypt_core | ||
901 | movdqa %xmm0,%xmm6 | ||
902 | movdqu %xmm0,($out,$inp) | ||
903 | lea 16($inp),$inp | ||
904 | sub \$16,$len | ||
905 | jnc .Lcbc_enc_loop | ||
906 | jmp .Lcbc_done | ||
907 | .align 16 | ||
908 | .Lcbc_dec_loop: | ||
909 | movdqu ($inp),%xmm0 | ||
910 | movdqa %xmm0,%xmm7 | ||
911 | call _vpaes_decrypt_core | ||
912 | pxor %xmm6,%xmm0 | ||
913 | movdqa %xmm7,%xmm6 | ||
914 | movdqu %xmm0,($out,$inp) | ||
915 | lea 16($inp),$inp | ||
916 | sub \$16,$len | ||
917 | jnc .Lcbc_dec_loop | ||
918 | .Lcbc_done: | ||
919 | movdqu %xmm6,($ivp) # save IV | ||
920 | ___ | ||
921 | $code.=<<___ if ($win64); | ||
922 | movaps 0x10(%rsp),%xmm6 | ||
923 | movaps 0x20(%rsp),%xmm7 | ||
924 | movaps 0x30(%rsp),%xmm8 | ||
925 | movaps 0x40(%rsp),%xmm9 | ||
926 | movaps 0x50(%rsp),%xmm10 | ||
927 | movaps 0x60(%rsp),%xmm11 | ||
928 | movaps 0x70(%rsp),%xmm12 | ||
929 | movaps 0x80(%rsp),%xmm13 | ||
930 | movaps 0x90(%rsp),%xmm14 | ||
931 | movaps 0xa0(%rsp),%xmm15 | ||
932 | lea 0xb8(%rsp),%rsp | ||
933 | .Lcbc_epilogue: | ||
934 | ___ | ||
935 | $code.=<<___; | ||
936 | .Lcbc_abort: | ||
937 | ret | ||
938 | .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt | ||
939 | ___ | ||
940 | } | ||
941 | $code.=<<___; | ||
942 | ## | ||
943 | ## _aes_preheat | ||
944 | ## | ||
945 | ## Fills register %r10 -> .aes_consts (so you can -fPIC) | ||
946 | ## and %xmm9-%xmm15 as specified below. | ||
947 | ## | ||
948 | .type _vpaes_preheat,\@abi-omnipotent | ||
949 | .align 16 | ||
950 | _vpaes_preheat: | ||
951 | lea .Lk_s0F(%rip), %r10 | ||
952 | movdqa -0x20(%r10), %xmm10 # .Lk_inv | ||
953 | movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 | ||
954 | movdqa 0x00(%r10), %xmm9 # .Lk_s0F | ||
955 | movdqa 0x30(%r10), %xmm13 # .Lk_sb1 | ||
956 | movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 | ||
957 | movdqa 0x50(%r10), %xmm15 # .Lk_sb2 | ||
958 | movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 | ||
959 | ret | ||
960 | .size _vpaes_preheat,.-_vpaes_preheat | ||
961 | ######################################################## | ||
962 | ## ## | ||
963 | ## Constants ## | ||
964 | ## ## | ||
965 | ######################################################## | ||
966 | .type _vpaes_consts,\@object | ||
967 | .align 64 | ||
968 | _vpaes_consts: | ||
969 | .Lk_inv: # inv, inva | ||
970 | .quad 0x0E05060F0D080180, 0x040703090A0B0C02 | ||
971 | .quad 0x01040A060F0B0780, 0x030D0E0C02050809 | ||
972 | |||
973 | .Lk_s0F: # s0F | ||
974 | .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F | ||
975 | |||
976 | .Lk_ipt: # input transform (lo, hi) | ||
977 | .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 | ||
978 | .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 | ||
979 | |||
980 | .Lk_sb1: # sb1u, sb1t | ||
981 | .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 | ||
982 | .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF | ||
983 | .Lk_sb2: # sb2u, sb2t | ||
984 | .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD | ||
985 | .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A | ||
986 | .Lk_sbo: # sbou, sbot | ||
987 | .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 | ||
988 | .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA | ||
989 | |||
990 | .Lk_mc_forward: # mc_forward | ||
991 | .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 | ||
992 | .quad 0x080B0A0904070605, 0x000302010C0F0E0D | ||
993 | .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 | ||
994 | .quad 0x000302010C0F0E0D, 0x080B0A0904070605 | ||
995 | |||
996 | .Lk_mc_backward:# mc_backward | ||
997 | .quad 0x0605040702010003, 0x0E0D0C0F0A09080B | ||
998 | .quad 0x020100030E0D0C0F, 0x0A09080B06050407 | ||
999 | .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 | ||
1000 | .quad 0x0A09080B06050407, 0x020100030E0D0C0F | ||
1001 | |||
1002 | .Lk_sr: # sr | ||
1003 | .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 | ||
1004 | .quad 0x030E09040F0A0500, 0x0B06010C07020D08 | ||
1005 | .quad 0x0F060D040B020900, 0x070E050C030A0108 | ||
1006 | .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 | ||
1007 | |||
1008 | .Lk_rcon: # rcon | ||
1009 | .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 | ||
1010 | |||
1011 | .Lk_s63: # s63: all equal to 0x63 transformed | ||
1012 | .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B | ||
1013 | |||
1014 | .Lk_opt: # output transform | ||
1015 | .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 | ||
1016 | .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 | ||
1017 | |||
1018 | .Lk_deskew: # deskew tables: inverts the sbox's "skew" | ||
1019 | .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A | ||
1020 | .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 | ||
1021 | |||
1022 | ## | ||
1023 | ## Decryption stuff | ||
1024 | ## Key schedule constants | ||
1025 | ## | ||
1026 | .Lk_dksd: # decryption key schedule: invskew x*D | ||
1027 | .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 | ||
1028 | .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E | ||
1029 | .Lk_dksb: # decryption key schedule: invskew x*B | ||
1030 | .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 | ||
1031 | .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 | ||
1032 | .Lk_dkse: # decryption key schedule: invskew x*E + 0x63 | ||
1033 | .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 | ||
1034 | .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 | ||
1035 | .Lk_dks9: # decryption key schedule: invskew x*9 | ||
1036 | .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC | ||
1037 | .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE | ||
1038 | |||
1039 | ## | ||
1040 | ## Decryption stuff | ||
1041 | ## Round function constants | ||
1042 | ## | ||
1043 | .Lk_dipt: # decryption input transform | ||
1044 | .quad 0x0F505B040B545F00, 0x154A411E114E451A | ||
1045 | .quad 0x86E383E660056500, 0x12771772F491F194 | ||
1046 | |||
1047 | .Lk_dsb9: # decryption sbox output *9*u, *9*t | ||
1048 | .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 | ||
1049 | .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 | ||
1050 | .Lk_dsbd: # decryption sbox output *D*u, *D*t | ||
1051 | .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 | ||
1052 | .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 | ||
1053 | .Lk_dsbb: # decryption sbox output *B*u, *B*t | ||
1054 | .quad 0xD022649296B44200, 0x602646F6B0F2D404 | ||
1055 | .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B | ||
1056 | .Lk_dsbe: # decryption sbox output *E*u, *E*t | ||
1057 | .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 | ||
1058 | .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 | ||
1059 | .Lk_dsbo: # decryption sbox final output | ||
1060 | .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D | ||
1061 | .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C | ||
1062 | .asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" | ||
1063 | .align 64 | ||
1064 | .size _vpaes_consts,.-_vpaes_consts | ||
1065 | ___ | ||
1066 | |||
1067 | if ($win64) { | ||
1068 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
1069 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
1070 | $rec="%rcx"; | ||
1071 | $frame="%rdx"; | ||
1072 | $context="%r8"; | ||
1073 | $disp="%r9"; | ||
1074 | |||
1075 | $code.=<<___; | ||
1076 | .extern __imp_RtlVirtualUnwind | ||
1077 | .type se_handler,\@abi-omnipotent | ||
1078 | .align 16 | ||
1079 | se_handler: | ||
1080 | push %rsi | ||
1081 | push %rdi | ||
1082 | push %rbx | ||
1083 | push %rbp | ||
1084 | push %r12 | ||
1085 | push %r13 | ||
1086 | push %r14 | ||
1087 | push %r15 | ||
1088 | pushfq | ||
1089 | sub \$64,%rsp | ||
1090 | |||
1091 | mov 120($context),%rax # pull context->Rax | ||
1092 | mov 248($context),%rbx # pull context->Rip | ||
1093 | |||
1094 | mov 8($disp),%rsi # disp->ImageBase | ||
1095 | mov 56($disp),%r11 # disp->HandlerData | ||
1096 | |||
1097 | mov 0(%r11),%r10d # HandlerData[0] | ||
1098 | lea (%rsi,%r10),%r10 # prologue label | ||
1099 | cmp %r10,%rbx # context->Rip<prologue label | ||
1100 | jb .Lin_prologue | ||
1101 | |||
1102 | mov 152($context),%rax # pull context->Rsp | ||
1103 | |||
1104 | mov 4(%r11),%r10d # HandlerData[1] | ||
1105 | lea (%rsi,%r10),%r10 # epilogue label | ||
1106 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
1107 | jae .Lin_prologue | ||
1108 | |||
1109 | lea 16(%rax),%rsi # %xmm save area | ||
1110 | lea 512($context),%rdi # &context.Xmm6 | ||
1111 | mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | ||
1112 | .long 0xa548f3fc # cld; rep movsq | ||
1113 | lea 0xb8(%rax),%rax # adjust stack pointer | ||
1114 | |||
1115 | .Lin_prologue: | ||
1116 | mov 8(%rax),%rdi | ||
1117 | mov 16(%rax),%rsi | ||
1118 | mov %rax,152($context) # restore context->Rsp | ||
1119 | mov %rsi,168($context) # restore context->Rsi | ||
1120 | mov %rdi,176($context) # restore context->Rdi | ||
1121 | |||
1122 | mov 40($disp),%rdi # disp->ContextRecord | ||
1123 | mov $context,%rsi # context | ||
1124 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
1125 | .long 0xa548f3fc # cld; rep movsq | ||
1126 | |||
1127 | mov $disp,%rsi | ||
1128 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
1129 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
1130 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
1131 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
1132 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
1133 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
1134 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
1135 | mov %r10,32(%rsp) # arg5 | ||
1136 | mov %r11,40(%rsp) # arg6 | ||
1137 | mov %r12,48(%rsp) # arg7 | ||
1138 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
1139 | call *__imp_RtlVirtualUnwind(%rip) | ||
1140 | |||
1141 | mov \$1,%eax # ExceptionContinueSearch | ||
1142 | add \$64,%rsp | ||
1143 | popfq | ||
1144 | pop %r15 | ||
1145 | pop %r14 | ||
1146 | pop %r13 | ||
1147 | pop %r12 | ||
1148 | pop %rbp | ||
1149 | pop %rbx | ||
1150 | pop %rdi | ||
1151 | pop %rsi | ||
1152 | ret | ||
1153 | .size se_handler,.-se_handler | ||
1154 | |||
1155 | .section .pdata | ||
1156 | .align 4 | ||
1157 | .rva .LSEH_begin_${PREFIX}_set_encrypt_key | ||
1158 | .rva .LSEH_end_${PREFIX}_set_encrypt_key | ||
1159 | .rva .LSEH_info_${PREFIX}_set_encrypt_key | ||
1160 | |||
1161 | .rva .LSEH_begin_${PREFIX}_set_decrypt_key | ||
1162 | .rva .LSEH_end_${PREFIX}_set_decrypt_key | ||
1163 | .rva .LSEH_info_${PREFIX}_set_decrypt_key | ||
1164 | |||
1165 | .rva .LSEH_begin_${PREFIX}_encrypt | ||
1166 | .rva .LSEH_end_${PREFIX}_encrypt | ||
1167 | .rva .LSEH_info_${PREFIX}_encrypt | ||
1168 | |||
1169 | .rva .LSEH_begin_${PREFIX}_decrypt | ||
1170 | .rva .LSEH_end_${PREFIX}_decrypt | ||
1171 | .rva .LSEH_info_${PREFIX}_decrypt | ||
1172 | |||
1173 | .rva .LSEH_begin_${PREFIX}_cbc_encrypt | ||
1174 | .rva .LSEH_end_${PREFIX}_cbc_encrypt | ||
1175 | .rva .LSEH_info_${PREFIX}_cbc_encrypt | ||
1176 | |||
1177 | .section .xdata | ||
1178 | .align 8 | ||
1179 | .LSEH_info_${PREFIX}_set_encrypt_key: | ||
1180 | .byte 9,0,0,0 | ||
1181 | .rva se_handler | ||
1182 | .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] | ||
1183 | .LSEH_info_${PREFIX}_set_decrypt_key: | ||
1184 | .byte 9,0,0,0 | ||
1185 | .rva se_handler | ||
1186 | .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[] | ||
1187 | .LSEH_info_${PREFIX}_encrypt: | ||
1188 | .byte 9,0,0,0 | ||
1189 | .rva se_handler | ||
1190 | .rva .Lenc_body,.Lenc_epilogue # HandlerData[] | ||
1191 | .LSEH_info_${PREFIX}_decrypt: | ||
1192 | .byte 9,0,0,0 | ||
1193 | .rva se_handler | ||
1194 | .rva .Ldec_body,.Ldec_epilogue # HandlerData[] | ||
1195 | .LSEH_info_${PREFIX}_cbc_encrypt: | ||
1196 | .byte 9,0,0,0 | ||
1197 | .rva se_handler | ||
1198 | .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] | ||
1199 | ___ | ||
1200 | } | ||
1201 | |||
1202 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
1203 | |||
1204 | print $code; | ||
1205 | |||
1206 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/arm_arch.h b/src/lib/libcrypto/arm_arch.h new file mode 100644 index 0000000000..5a83107680 --- /dev/null +++ b/src/lib/libcrypto/arm_arch.h | |||
@@ -0,0 +1,51 @@ | |||
1 | #ifndef __ARM_ARCH_H__ | ||
2 | #define __ARM_ARCH_H__ | ||
3 | |||
4 | #if !defined(__ARM_ARCH__) | ||
5 | # if defined(__CC_ARM) | ||
6 | # define __ARM_ARCH__ __TARGET_ARCH_ARM | ||
7 | # if defined(__BIG_ENDIAN) | ||
8 | # define __ARMEB__ | ||
9 | # else | ||
10 | # define __ARMEL__ | ||
11 | # endif | ||
12 | # elif defined(__GNUC__) | ||
13 | /* | ||
14 | * Why doesn't gcc define __ARM_ARCH__? Instead it defines | ||
15 | * bunch of below macros. See all_architectires[] table in | ||
16 | * gcc/config/arm/arm.c. On a side note it defines | ||
17 | * __ARMEL__/__ARMEB__ for little-/big-endian. | ||
18 | */ | ||
19 | # if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ | ||
20 | defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ | ||
21 | defined(__ARM_ARCH_7EM__) | ||
22 | # define __ARM_ARCH__ 7 | ||
23 | # elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ | ||
24 | defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \ | ||
25 | defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \ | ||
26 | defined(__ARM_ARCH_6T2__) | ||
27 | # define __ARM_ARCH__ 6 | ||
28 | # elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ | ||
29 | defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \ | ||
30 | defined(__ARM_ARCH_5TEJ__) | ||
31 | # define __ARM_ARCH__ 5 | ||
32 | # elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) | ||
33 | # define __ARM_ARCH__ 4 | ||
34 | # else | ||
35 | # error "unsupported ARM architecture" | ||
36 | # endif | ||
37 | # endif | ||
38 | #endif | ||
39 | |||
40 | #ifdef OPENSSL_FIPSCANISTER | ||
41 | #include <openssl/fipssyms.h> | ||
42 | #endif | ||
43 | |||
44 | #if !__ASSEMBLER__ | ||
45 | extern unsigned int OPENSSL_armcap_P; | ||
46 | |||
47 | #define ARMV7_NEON (1<<0) | ||
48 | #define ARMV7_TICK (1<<1) | ||
49 | #endif | ||
50 | |||
51 | #endif | ||
diff --git a/src/lib/libcrypto/armcap.c b/src/lib/libcrypto/armcap.c new file mode 100644 index 0000000000..5258d2fbdd --- /dev/null +++ b/src/lib/libcrypto/armcap.c | |||
@@ -0,0 +1,80 @@ | |||
1 | #include <stdio.h> | ||
2 | #include <stdlib.h> | ||
3 | #include <string.h> | ||
4 | #include <setjmp.h> | ||
5 | #include <signal.h> | ||
6 | #include <crypto.h> | ||
7 | |||
8 | #include "arm_arch.h" | ||
9 | |||
10 | unsigned int OPENSSL_armcap_P; | ||
11 | |||
12 | static sigset_t all_masked; | ||
13 | |||
14 | static sigjmp_buf ill_jmp; | ||
15 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } | ||
16 | |||
17 | /* | ||
18 | * Following subroutines could have been inlined, but it's not all | ||
19 | * ARM compilers support inline assembler... | ||
20 | */ | ||
21 | void _armv7_neon_probe(void); | ||
22 | unsigned int _armv7_tick(void); | ||
23 | |||
24 | unsigned int OPENSSL_rdtsc(void) | ||
25 | { | ||
26 | if (OPENSSL_armcap_P|ARMV7_TICK) | ||
27 | return _armv7_tick(); | ||
28 | else | ||
29 | return 0; | ||
30 | } | ||
31 | |||
32 | #if defined(__GNUC__) && __GNUC__>=2 | ||
33 | void OPENSSL_cpuid_setup(void) __attribute__((constructor)); | ||
34 | #endif | ||
35 | void OPENSSL_cpuid_setup(void) | ||
36 | { | ||
37 | char *e; | ||
38 | struct sigaction ill_oact,ill_act; | ||
39 | sigset_t oset; | ||
40 | static int trigger=0; | ||
41 | |||
42 | if (trigger) return; | ||
43 | trigger=1; | ||
44 | |||
45 | if ((e=getenv("OPENSSL_armcap"))) | ||
46 | { | ||
47 | OPENSSL_armcap_P=strtoul(e,NULL,0); | ||
48 | return; | ||
49 | } | ||
50 | |||
51 | sigfillset(&all_masked); | ||
52 | sigdelset(&all_masked,SIGILL); | ||
53 | sigdelset(&all_masked,SIGTRAP); | ||
54 | sigdelset(&all_masked,SIGFPE); | ||
55 | sigdelset(&all_masked,SIGBUS); | ||
56 | sigdelset(&all_masked,SIGSEGV); | ||
57 | |||
58 | OPENSSL_armcap_P = 0; | ||
59 | |||
60 | memset(&ill_act,0,sizeof(ill_act)); | ||
61 | ill_act.sa_handler = ill_handler; | ||
62 | ill_act.sa_mask = all_masked; | ||
63 | |||
64 | sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); | ||
65 | sigaction(SIGILL,&ill_act,&ill_oact); | ||
66 | |||
67 | if (sigsetjmp(ill_jmp,1) == 0) | ||
68 | { | ||
69 | _armv7_neon_probe(); | ||
70 | OPENSSL_armcap_P |= ARMV7_NEON; | ||
71 | } | ||
72 | if (sigsetjmp(ill_jmp,1) == 0) | ||
73 | { | ||
74 | _armv7_tick(); | ||
75 | OPENSSL_armcap_P |= ARMV7_TICK; | ||
76 | } | ||
77 | |||
78 | sigaction (SIGILL,&ill_oact,NULL); | ||
79 | sigprocmask(SIG_SETMASK,&oset,NULL); | ||
80 | } | ||
diff --git a/src/lib/libcrypto/armv4cpuid.S b/src/lib/libcrypto/armv4cpuid.S new file mode 100644 index 0000000000..2d618deaa4 --- /dev/null +++ b/src/lib/libcrypto/armv4cpuid.S | |||
@@ -0,0 +1,154 @@ | |||
1 | #include "arm_arch.h" | ||
2 | |||
3 | .text | ||
4 | .code 32 | ||
5 | |||
6 | .align 5 | ||
7 | .global _armv7_neon_probe | ||
8 | .type _armv7_neon_probe,%function | ||
9 | _armv7_neon_probe: | ||
10 | .word 0xf26ee1fe @ vorr q15,q15,q15 | ||
11 | .word 0xe12fff1e @ bx lr | ||
12 | .size _armv7_neon_probe,.-_armv7_neon_probe | ||
13 | |||
14 | .global _armv7_tick | ||
15 | .type _armv7_tick,%function | ||
16 | _armv7_tick: | ||
17 | mrc p15,0,r0,c9,c13,0 | ||
18 | .word 0xe12fff1e @ bx lr | ||
19 | .size _armv7_tick,.-_armv7_tick | ||
20 | |||
21 | .global OPENSSL_atomic_add | ||
22 | .type OPENSSL_atomic_add,%function | ||
23 | OPENSSL_atomic_add: | ||
24 | #if __ARM_ARCH__>=6 | ||
25 | .Ladd: ldrex r2,[r0] | ||
26 | add r3,r2,r1 | ||
27 | strex r2,r3,[r0] | ||
28 | cmp r2,#0 | ||
29 | bne .Ladd | ||
30 | mov r0,r3 | ||
31 | .word 0xe12fff1e @ bx lr | ||
32 | #else | ||
33 | stmdb sp!,{r4-r6,lr} | ||
34 | ldr r2,.Lspinlock | ||
35 | adr r3,.Lspinlock | ||
36 | mov r4,r0 | ||
37 | mov r5,r1 | ||
38 | add r6,r3,r2 @ &spinlock | ||
39 | b .+8 | ||
40 | .Lspin: bl sched_yield | ||
41 | mov r0,#-1 | ||
42 | swp r0,r0,[r6] | ||
43 | cmp r0,#0 | ||
44 | bne .Lspin | ||
45 | |||
46 | ldr r2,[r4] | ||
47 | add r2,r2,r5 | ||
48 | str r2,[r4] | ||
49 | str r0,[r6] @ release spinlock | ||
50 | ldmia sp!,{r4-r6,lr} | ||
51 | tst lr,#1 | ||
52 | moveq pc,lr | ||
53 | .word 0xe12fff1e @ bx lr | ||
54 | #endif | ||
55 | .size OPENSSL_atomic_add,.-OPENSSL_atomic_add | ||
56 | |||
57 | .global OPENSSL_cleanse | ||
58 | .type OPENSSL_cleanse,%function | ||
59 | OPENSSL_cleanse: | ||
60 | eor ip,ip,ip | ||
61 | cmp r1,#7 | ||
62 | subhs r1,r1,#4 | ||
63 | bhs .Lot | ||
64 | cmp r1,#0 | ||
65 | beq .Lcleanse_done | ||
66 | .Little: | ||
67 | strb ip,[r0],#1 | ||
68 | subs r1,r1,#1 | ||
69 | bhi .Little | ||
70 | b .Lcleanse_done | ||
71 | |||
72 | .Lot: tst r0,#3 | ||
73 | beq .Laligned | ||
74 | strb ip,[r0],#1 | ||
75 | sub r1,r1,#1 | ||
76 | b .Lot | ||
77 | .Laligned: | ||
78 | str ip,[r0],#4 | ||
79 | subs r1,r1,#4 | ||
80 | bhs .Laligned | ||
81 | adds r1,r1,#4 | ||
82 | bne .Little | ||
83 | .Lcleanse_done: | ||
84 | tst lr,#1 | ||
85 | moveq pc,lr | ||
86 | .word 0xe12fff1e @ bx lr | ||
87 | .size OPENSSL_cleanse,.-OPENSSL_cleanse | ||
88 | |||
89 | .global OPENSSL_wipe_cpu | ||
90 | .type OPENSSL_wipe_cpu,%function | ||
91 | OPENSSL_wipe_cpu: | ||
92 | ldr r0,.LOPENSSL_armcap | ||
93 | adr r1,.LOPENSSL_armcap | ||
94 | ldr r0,[r1,r0] | ||
95 | eor r2,r2,r2 | ||
96 | eor r3,r3,r3 | ||
97 | eor ip,ip,ip | ||
98 | tst r0,#1 | ||
99 | beq .Lwipe_done | ||
100 | .word 0xf3000150 @ veor q0, q0, q0 | ||
101 | .word 0xf3022152 @ veor q1, q1, q1 | ||
102 | .word 0xf3044154 @ veor q2, q2, q2 | ||
103 | .word 0xf3066156 @ veor q3, q3, q3 | ||
104 | .word 0xf34001f0 @ veor q8, q8, q8 | ||
105 | .word 0xf34221f2 @ veor q9, q9, q9 | ||
106 | .word 0xf34441f4 @ veor q10, q10, q10 | ||
107 | .word 0xf34661f6 @ veor q11, q11, q11 | ||
108 | .word 0xf34881f8 @ veor q12, q12, q12 | ||
109 | .word 0xf34aa1fa @ veor q13, q13, q13 | ||
110 | .word 0xf34cc1fc @ veor q14, q14, q14 | ||
111 | .word 0xf34ee1fe @ veor q15, q15, q15 | ||
112 | .Lwipe_done: | ||
113 | mov r0,sp | ||
114 | tst lr,#1 | ||
115 | moveq pc,lr | ||
116 | .word 0xe12fff1e @ bx lr | ||
117 | .size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu | ||
118 | |||
119 | .global OPENSSL_instrument_bus | ||
120 | .type OPENSSL_instrument_bus,%function | ||
121 | OPENSSL_instrument_bus: | ||
122 | eor r0,r0,r0 | ||
123 | tst lr,#1 | ||
124 | moveq pc,lr | ||
125 | .word 0xe12fff1e @ bx lr | ||
126 | .size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus | ||
127 | |||
128 | .global OPENSSL_instrument_bus2 | ||
129 | .type OPENSSL_instrument_bus2,%function | ||
130 | OPENSSL_instrument_bus2: | ||
131 | eor r0,r0,r0 | ||
132 | tst lr,#1 | ||
133 | moveq pc,lr | ||
134 | .word 0xe12fff1e @ bx lr | ||
135 | .size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 | ||
136 | |||
137 | .align 5 | ||
138 | .LOPENSSL_armcap: | ||
139 | .word OPENSSL_armcap_P-.LOPENSSL_armcap | ||
140 | #if __ARM_ARCH__>=6 | ||
141 | .align 5 | ||
142 | #else | ||
143 | .Lspinlock: | ||
144 | .word atomic_add_spinlock-.Lspinlock | ||
145 | .align 5 | ||
146 | |||
147 | .data | ||
148 | .align 2 | ||
149 | atomic_add_spinlock: | ||
150 | .word 0 | ||
151 | #endif | ||
152 | |||
153 | .comm OPENSSL_armcap_P,4,4 | ||
154 | .hidden OPENSSL_armcap_P | ||
diff --git a/src/lib/libcrypto/asn1/ameth_lib.c b/src/lib/libcrypto/asn1/ameth_lib.c index 5a581b90ea..a19e058fca 100644 --- a/src/lib/libcrypto/asn1/ameth_lib.c +++ b/src/lib/libcrypto/asn1/ameth_lib.c | |||
@@ -69,6 +69,7 @@ extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[]; | |||
69 | extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; | 69 | extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; |
70 | extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth; | 70 | extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth; |
71 | extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth; | 71 | extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth; |
72 | extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth; | ||
72 | 73 | ||
73 | /* Keep this sorted in type order !! */ | 74 | /* Keep this sorted in type order !! */ |
74 | static const EVP_PKEY_ASN1_METHOD *standard_methods[] = | 75 | static const EVP_PKEY_ASN1_METHOD *standard_methods[] = |
@@ -90,7 +91,8 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] = | |||
90 | #ifndef OPENSSL_NO_EC | 91 | #ifndef OPENSSL_NO_EC |
91 | &eckey_asn1_meth, | 92 | &eckey_asn1_meth, |
92 | #endif | 93 | #endif |
93 | &hmac_asn1_meth | 94 | &hmac_asn1_meth, |
95 | &cmac_asn1_meth | ||
94 | }; | 96 | }; |
95 | 97 | ||
96 | typedef int sk_cmp_fn_type(const char * const *a, const char * const *b); | 98 | typedef int sk_cmp_fn_type(const char * const *a, const char * const *b); |
@@ -291,6 +293,8 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, | |||
291 | if (!ameth) | 293 | if (!ameth) |
292 | return NULL; | 294 | return NULL; |
293 | 295 | ||
296 | memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD)); | ||
297 | |||
294 | ameth->pkey_id = id; | 298 | ameth->pkey_id = id; |
295 | ameth->pkey_base_id = id; | 299 | ameth->pkey_base_id = id; |
296 | ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC; | 300 | ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC; |
@@ -325,6 +329,9 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, | |||
325 | ameth->old_priv_encode = 0; | 329 | ameth->old_priv_encode = 0; |
326 | ameth->old_priv_decode = 0; | 330 | ameth->old_priv_decode = 0; |
327 | 331 | ||
332 | ameth->item_verify = 0; | ||
333 | ameth->item_sign = 0; | ||
334 | |||
328 | ameth->pkey_size = 0; | 335 | ameth->pkey_size = 0; |
329 | ameth->pkey_bits = 0; | 336 | ameth->pkey_bits = 0; |
330 | 337 | ||
@@ -376,6 +383,9 @@ void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst, | |||
376 | dst->pkey_free = src->pkey_free; | 383 | dst->pkey_free = src->pkey_free; |
377 | dst->pkey_ctrl = src->pkey_ctrl; | 384 | dst->pkey_ctrl = src->pkey_ctrl; |
378 | 385 | ||
386 | dst->item_sign = src->item_sign; | ||
387 | dst->item_verify = src->item_verify; | ||
388 | |||
379 | } | 389 | } |
380 | 390 | ||
381 | void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth) | 391 | void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth) |
diff --git a/src/lib/libcrypto/asn1/asn1_locl.h b/src/lib/libcrypto/asn1/asn1_locl.h index 5aa65e28f5..9fcf0d9530 100644 --- a/src/lib/libcrypto/asn1/asn1_locl.h +++ b/src/lib/libcrypto/asn1/asn1_locl.h | |||
@@ -102,6 +102,10 @@ struct evp_pkey_asn1_method_st | |||
102 | int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); | 102 | int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); |
103 | int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, | 103 | int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, |
104 | ASN1_PCTX *pctx); | 104 | ASN1_PCTX *pctx); |
105 | int (*sig_print)(BIO *out, | ||
106 | const X509_ALGOR *sigalg, const ASN1_STRING *sig, | ||
107 | int indent, ASN1_PCTX *pctx); | ||
108 | |||
105 | 109 | ||
106 | void (*pkey_free)(EVP_PKEY *pkey); | 110 | void (*pkey_free)(EVP_PKEY *pkey); |
107 | int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2); | 111 | int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2); |
@@ -111,6 +115,13 @@ struct evp_pkey_asn1_method_st | |||
111 | int (*old_priv_decode)(EVP_PKEY *pkey, | 115 | int (*old_priv_decode)(EVP_PKEY *pkey, |
112 | const unsigned char **pder, int derlen); | 116 | const unsigned char **pder, int derlen); |
113 | int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder); | 117 | int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder); |
118 | /* Custom ASN1 signature verification */ | ||
119 | int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
120 | X509_ALGOR *a, ASN1_BIT_STRING *sig, | ||
121 | EVP_PKEY *pkey); | ||
122 | int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
123 | X509_ALGOR *alg1, X509_ALGOR *alg2, | ||
124 | ASN1_BIT_STRING *sig); | ||
114 | 125 | ||
115 | } /* EVP_PKEY_ASN1_METHOD */; | 126 | } /* EVP_PKEY_ASN1_METHOD */; |
116 | 127 | ||
diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl new file mode 100644 index 0000000000..c52e0b75b5 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl | |||
@@ -0,0 +1,278 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # May 2011 | ||
11 | # | ||
12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication | ||
13 | # used in bn_gf2m.c. It's kind of low-hanging mechanical port from | ||
14 | # C for the time being... Except that it has two code paths: pure | ||
15 | # integer code suitable for any ARMv4 and later CPU and NEON code | ||
16 | # suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs | ||
17 | # in ~45 cycles on dual-issue core such as Cortex A8, which is ~50% | ||
18 | # faster than compiler-generated code. For ECDH and ECDSA verify (but | ||
19 | # not for ECDSA sign) it means 25%-45% improvement depending on key | ||
20 | # length, more for longer keys. Even though NEON 1x1 multiplication | ||
21 | # runs in even less cycles, ~30, improvement is measurable only on | ||
22 | # longer keys. One has to optimize code elsewhere to get NEON glow... | ||
23 | |||
24 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
25 | open STDOUT,">$output"; | ||
26 | |||
27 | sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } | ||
28 | sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } | ||
29 | sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } | ||
30 | |||
31 | $code=<<___; | ||
32 | #include "arm_arch.h" | ||
33 | |||
34 | .text | ||
35 | .code 32 | ||
36 | |||
37 | #if __ARM_ARCH__>=7 | ||
38 | .fpu neon | ||
39 | |||
40 | .type mul_1x1_neon,%function | ||
41 | .align 5 | ||
42 | mul_1x1_neon: | ||
43 | vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a | ||
44 | vmull.p8 `&Q("d0")`,d16,d17 @ a·bb | ||
45 | vshl.u64 `&Dlo("q2")`,d16,#16 | ||
46 | vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb | ||
47 | vshl.u64 `&Dlo("q3")`,d16,#24 | ||
48 | vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb | ||
49 | vshr.u64 `&Dlo("q1")`,#8 | ||
50 | vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb | ||
51 | vshl.u64 `&Dhi("q1")`,#24 | ||
52 | veor d0,`&Dlo("q1")` | ||
53 | vshr.u64 `&Dlo("q2")`,#16 | ||
54 | veor d0,`&Dhi("q1")` | ||
55 | vshl.u64 `&Dhi("q2")`,#16 | ||
56 | veor d0,`&Dlo("q2")` | ||
57 | vshr.u64 `&Dlo("q3")`,#24 | ||
58 | veor d0,`&Dhi("q2")` | ||
59 | vshl.u64 `&Dhi("q3")`,#8 | ||
60 | veor d0,`&Dlo("q3")` | ||
61 | veor d0,`&Dhi("q3")` | ||
62 | bx lr | ||
63 | .size mul_1x1_neon,.-mul_1x1_neon | ||
64 | #endif | ||
65 | ___ | ||
66 | ################ | ||
67 | # private interface to mul_1x1_ialu | ||
68 | # | ||
69 | $a="r1"; | ||
70 | $b="r0"; | ||
71 | |||
72 | ($a0,$a1,$a2,$a12,$a4,$a14)= | ||
73 | ($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12); | ||
74 | |||
75 | $mask="r12"; | ||
76 | |||
77 | $code.=<<___; | ||
78 | .type mul_1x1_ialu,%function | ||
79 | .align 5 | ||
80 | mul_1x1_ialu: | ||
81 | mov $a0,#0 | ||
82 | bic $a1,$a,#3<<30 @ a1=a&0x3fffffff | ||
83 | str $a0,[sp,#0] @ tab[0]=0 | ||
84 | add $a2,$a1,$a1 @ a2=a1<<1 | ||
85 | str $a1,[sp,#4] @ tab[1]=a1 | ||
86 | eor $a12,$a1,$a2 @ a1^a2 | ||
87 | str $a2,[sp,#8] @ tab[2]=a2 | ||
88 | mov $a4,$a1,lsl#2 @ a4=a1<<2 | ||
89 | str $a12,[sp,#12] @ tab[3]=a1^a2 | ||
90 | eor $a14,$a1,$a4 @ a1^a4 | ||
91 | str $a4,[sp,#16] @ tab[4]=a4 | ||
92 | eor $a0,$a2,$a4 @ a2^a4 | ||
93 | str $a14,[sp,#20] @ tab[5]=a1^a4 | ||
94 | eor $a12,$a12,$a4 @ a1^a2^a4 | ||
95 | str $a0,[sp,#24] @ tab[6]=a2^a4 | ||
96 | and $i0,$mask,$b,lsl#2 | ||
97 | str $a12,[sp,#28] @ tab[7]=a1^a2^a4 | ||
98 | |||
99 | and $i1,$mask,$b,lsr#1 | ||
100 | ldr $lo,[sp,$i0] @ tab[b & 0x7] | ||
101 | and $i0,$mask,$b,lsr#4 | ||
102 | ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7] | ||
103 | and $i1,$mask,$b,lsr#7 | ||
104 | ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7] | ||
105 | eor $lo,$lo,$t1,lsl#3 @ stall | ||
106 | mov $hi,$t1,lsr#29 | ||
107 | ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7] | ||
108 | |||
109 | and $i0,$mask,$b,lsr#10 | ||
110 | eor $lo,$lo,$t0,lsl#6 | ||
111 | eor $hi,$hi,$t0,lsr#26 | ||
112 | ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7] | ||
113 | |||
114 | and $i1,$mask,$b,lsr#13 | ||
115 | eor $lo,$lo,$t1,lsl#9 | ||
116 | eor $hi,$hi,$t1,lsr#23 | ||
117 | ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7] | ||
118 | |||
119 | and $i0,$mask,$b,lsr#16 | ||
120 | eor $lo,$lo,$t0,lsl#12 | ||
121 | eor $hi,$hi,$t0,lsr#20 | ||
122 | ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7] | ||
123 | |||
124 | and $i1,$mask,$b,lsr#19 | ||
125 | eor $lo,$lo,$t1,lsl#15 | ||
126 | eor $hi,$hi,$t1,lsr#17 | ||
127 | ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7] | ||
128 | |||
129 | and $i0,$mask,$b,lsr#22 | ||
130 | eor $lo,$lo,$t0,lsl#18 | ||
131 | eor $hi,$hi,$t0,lsr#14 | ||
132 | ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7] | ||
133 | |||
134 | and $i1,$mask,$b,lsr#25 | ||
135 | eor $lo,$lo,$t1,lsl#21 | ||
136 | eor $hi,$hi,$t1,lsr#11 | ||
137 | ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7] | ||
138 | |||
139 | tst $a,#1<<30 | ||
140 | and $i0,$mask,$b,lsr#28 | ||
141 | eor $lo,$lo,$t0,lsl#24 | ||
142 | eor $hi,$hi,$t0,lsr#8 | ||
143 | ldr $t0,[sp,$i0] @ tab[b >> 30 ] | ||
144 | |||
145 | eorne $lo,$lo,$b,lsl#30 | ||
146 | eorne $hi,$hi,$b,lsr#2 | ||
147 | tst $a,#1<<31 | ||
148 | eor $lo,$lo,$t1,lsl#27 | ||
149 | eor $hi,$hi,$t1,lsr#5 | ||
150 | eorne $lo,$lo,$b,lsl#31 | ||
151 | eorne $hi,$hi,$b,lsr#1 | ||
152 | eor $lo,$lo,$t0,lsl#30 | ||
153 | eor $hi,$hi,$t0,lsr#2 | ||
154 | |||
155 | mov pc,lr | ||
156 | .size mul_1x1_ialu,.-mul_1x1_ialu | ||
157 | ___ | ||
158 | ################ | ||
159 | # void bn_GF2m_mul_2x2(BN_ULONG *r, | ||
160 | # BN_ULONG a1,BN_ULONG a0, | ||
161 | # BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 | ||
162 | |||
163 | ($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); | ||
164 | |||
165 | $code.=<<___; | ||
166 | .global bn_GF2m_mul_2x2 | ||
167 | .type bn_GF2m_mul_2x2,%function | ||
168 | .align 5 | ||
169 | bn_GF2m_mul_2x2: | ||
170 | #if __ARM_ARCH__>=7 | ||
171 | ldr r12,.LOPENSSL_armcap | ||
172 | .Lpic: ldr r12,[pc,r12] | ||
173 | tst r12,#1 | ||
174 | beq .Lialu | ||
175 | |||
176 | veor $A1,$A1 | ||
177 | vmov.32 $B1,r3,r3 @ two copies of b1 | ||
178 | vmov.32 ${A1}[0],r1 @ a1 | ||
179 | |||
180 | veor $A0,$A0 | ||
181 | vld1.32 ${B0}[],[sp,:32] @ two copies of b0 | ||
182 | vmov.32 ${A0}[0],r2 @ a0 | ||
183 | mov r12,lr | ||
184 | |||
185 | vmov d16,$A1 | ||
186 | vmov d17,$B1 | ||
187 | bl mul_1x1_neon @ a1·b1 | ||
188 | vmov $A1B1,d0 | ||
189 | |||
190 | vmov d16,$A0 | ||
191 | vmov d17,$B0 | ||
192 | bl mul_1x1_neon @ a0·b0 | ||
193 | vmov $A0B0,d0 | ||
194 | |||
195 | veor d16,$A0,$A1 | ||
196 | veor d17,$B0,$B1 | ||
197 | veor $A0,$A0B0,$A1B1 | ||
198 | bl mul_1x1_neon @ (a0+a1)·(b0+b1) | ||
199 | |||
200 | veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
201 | vshl.u64 d1,d0,#32 | ||
202 | vshr.u64 d0,d0,#32 | ||
203 | veor $A0B0,d1 | ||
204 | veor $A1B1,d0 | ||
205 | vst1.32 {${A0B0}[0]},[r0,:32]! | ||
206 | vst1.32 {${A0B0}[1]},[r0,:32]! | ||
207 | vst1.32 {${A1B1}[0]},[r0,:32]! | ||
208 | vst1.32 {${A1B1}[1]},[r0,:32] | ||
209 | bx r12 | ||
210 | .align 4 | ||
211 | .Lialu: | ||
212 | #endif | ||
213 | ___ | ||
214 | $ret="r10"; # reassigned 1st argument | ||
215 | $code.=<<___; | ||
216 | stmdb sp!,{r4-r10,lr} | ||
217 | mov $ret,r0 @ reassign 1st argument | ||
218 | mov $b,r3 @ $b=b1 | ||
219 | ldr r3,[sp,#32] @ load b0 | ||
220 | mov $mask,#7<<2 | ||
221 | sub sp,sp,#32 @ allocate tab[8] | ||
222 | |||
223 | bl mul_1x1_ialu @ a1·b1 | ||
224 | str $lo,[$ret,#8] | ||
225 | str $hi,[$ret,#12] | ||
226 | |||
227 | eor $b,$b,r3 @ flip b0 and b1 | ||
228 | eor $a,$a,r2 @ flip a0 and a1 | ||
229 | eor r3,r3,$b | ||
230 | eor r2,r2,$a | ||
231 | eor $b,$b,r3 | ||
232 | eor $a,$a,r2 | ||
233 | bl mul_1x1_ialu @ a0·b0 | ||
234 | str $lo,[$ret] | ||
235 | str $hi,[$ret,#4] | ||
236 | |||
237 | eor $a,$a,r2 | ||
238 | eor $b,$b,r3 | ||
239 | bl mul_1x1_ialu @ (a1+a0)·(b1+b0) | ||
240 | ___ | ||
241 | @r=map("r$_",(6..9)); | ||
242 | $code.=<<___; | ||
243 | ldmia $ret,{@r[0]-@r[3]} | ||
244 | eor $lo,$lo,$hi | ||
245 | eor $hi,$hi,@r[1] | ||
246 | eor $lo,$lo,@r[0] | ||
247 | eor $hi,$hi,@r[2] | ||
248 | eor $lo,$lo,@r[3] | ||
249 | eor $hi,$hi,@r[3] | ||
250 | str $hi,[$ret,#8] | ||
251 | eor $lo,$lo,$hi | ||
252 | add sp,sp,#32 @ destroy tab[8] | ||
253 | str $lo,[$ret,#4] | ||
254 | |||
255 | #if __ARM_ARCH__>=5 | ||
256 | ldmia sp!,{r4-r10,pc} | ||
257 | #else | ||
258 | ldmia sp!,{r4-r10,lr} | ||
259 | tst lr,#1 | ||
260 | moveq pc,lr @ be binary compatible with V4, yet | ||
261 | bx lr @ interoperable with Thumb ISA:-) | ||
262 | #endif | ||
263 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
264 | #if __ARM_ARCH__>=7 | ||
265 | .align 5 | ||
266 | .LOPENSSL_armcap: | ||
267 | .word OPENSSL_armcap_P-(.Lpic+8) | ||
268 | #endif | ||
269 | .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
270 | .align 5 | ||
271 | |||
272 | .comm OPENSSL_armcap_P,4,4 | ||
273 | ___ | ||
274 | |||
275 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
276 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
277 | print $code; | ||
278 | close STDOUT; # enforce flush | ||
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl index 14e0d2d1dd..f78a8b5f0f 100644 --- a/src/lib/libcrypto/bn/asm/armv4-mont.pl +++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl | |||
@@ -23,6 +23,9 @@ | |||
23 | # than 1/2KB. Windows CE port would be trivial, as it's exclusively | 23 | # than 1/2KB. Windows CE port would be trivial, as it's exclusively |
24 | # about decorations, ABI and instruction syntax are identical. | 24 | # about decorations, ABI and instruction syntax are identical. |
25 | 25 | ||
26 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
27 | open STDOUT,">$output"; | ||
28 | |||
26 | $num="r0"; # starts as num argument, but holds &tp[num-1] | 29 | $num="r0"; # starts as num argument, but holds &tp[num-1] |
27 | $ap="r1"; | 30 | $ap="r1"; |
28 | $bp="r2"; $bi="r2"; $rp="r2"; | 31 | $bp="r2"; $bi="r2"; $rp="r2"; |
@@ -89,9 +92,9 @@ bn_mul_mont: | |||
89 | .L1st: | 92 | .L1st: |
90 | ldr $aj,[$ap],#4 @ ap[j],ap++ | 93 | ldr $aj,[$ap],#4 @ ap[j],ap++ |
91 | mov $alo,$ahi | 94 | mov $alo,$ahi |
95 | ldr $nj,[$np],#4 @ np[j],np++ | ||
92 | mov $ahi,#0 | 96 | mov $ahi,#0 |
93 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] | 97 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] |
94 | ldr $nj,[$np],#4 @ np[j],np++ | ||
95 | mov $nhi,#0 | 98 | mov $nhi,#0 |
96 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | 99 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 |
97 | adds $nlo,$nlo,$alo | 100 | adds $nlo,$nlo,$alo |
@@ -101,21 +104,21 @@ bn_mul_mont: | |||
101 | bne .L1st | 104 | bne .L1st |
102 | 105 | ||
103 | adds $nlo,$nlo,$ahi | 106 | adds $nlo,$nlo,$ahi |
107 | ldr $tp,[$_bp] @ restore bp | ||
104 | mov $nhi,#0 | 108 | mov $nhi,#0 |
109 | ldr $n0,[$_n0] @ restore n0 | ||
105 | adc $nhi,$nhi,#0 | 110 | adc $nhi,$nhi,#0 |
106 | ldr $tp,[$_bp] @ restore bp | ||
107 | str $nlo,[$num] @ tp[num-1]= | 111 | str $nlo,[$num] @ tp[num-1]= |
108 | ldr $n0,[$_n0] @ restore n0 | ||
109 | str $nhi,[$num,#4] @ tp[num]= | 112 | str $nhi,[$num,#4] @ tp[num]= |
110 | 113 | ||
111 | .Louter: | 114 | .Louter: |
112 | sub $tj,$num,sp @ "original" $num-1 value | 115 | sub $tj,$num,sp @ "original" $num-1 value |
113 | sub $ap,$ap,$tj @ "rewind" ap to &ap[1] | 116 | sub $ap,$ap,$tj @ "rewind" ap to &ap[1] |
114 | sub $np,$np,$tj @ "rewind" np to &np[1] | ||
115 | ldr $bi,[$tp,#4]! @ *(++bp) | 117 | ldr $bi,[$tp,#4]! @ *(++bp) |
118 | sub $np,$np,$tj @ "rewind" np to &np[1] | ||
116 | ldr $aj,[$ap,#-4] @ ap[0] | 119 | ldr $aj,[$ap,#-4] @ ap[0] |
117 | ldr $nj,[$np,#-4] @ np[0] | ||
118 | ldr $alo,[sp] @ tp[0] | 120 | ldr $alo,[sp] @ tp[0] |
121 | ldr $nj,[$np,#-4] @ np[0] | ||
119 | ldr $tj,[sp,#4] @ tp[1] | 122 | ldr $tj,[sp,#4] @ tp[1] |
120 | 123 | ||
121 | mov $ahi,#0 | 124 | mov $ahi,#0 |
@@ -129,13 +132,13 @@ bn_mul_mont: | |||
129 | .Linner: | 132 | .Linner: |
130 | ldr $aj,[$ap],#4 @ ap[j],ap++ | 133 | ldr $aj,[$ap],#4 @ ap[j],ap++ |
131 | adds $alo,$ahi,$tj @ +=tp[j] | 134 | adds $alo,$ahi,$tj @ +=tp[j] |
135 | ldr $nj,[$np],#4 @ np[j],np++ | ||
132 | mov $ahi,#0 | 136 | mov $ahi,#0 |
133 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] | 137 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] |
134 | ldr $nj,[$np],#4 @ np[j],np++ | ||
135 | mov $nhi,#0 | 138 | mov $nhi,#0 |
136 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | 139 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 |
137 | ldr $tj,[$tp,#8] @ tp[j+1] | ||
138 | adc $ahi,$ahi,#0 | 140 | adc $ahi,$ahi,#0 |
141 | ldr $tj,[$tp,#8] @ tp[j+1] | ||
139 | adds $nlo,$nlo,$alo | 142 | adds $nlo,$nlo,$alo |
140 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | 143 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ |
141 | adc $nlo,$nhi,#0 | 144 | adc $nlo,$nhi,#0 |
@@ -144,13 +147,13 @@ bn_mul_mont: | |||
144 | 147 | ||
145 | adds $nlo,$nlo,$ahi | 148 | adds $nlo,$nlo,$ahi |
146 | mov $nhi,#0 | 149 | mov $nhi,#0 |
150 | ldr $tp,[$_bp] @ restore bp | ||
147 | adc $nhi,$nhi,#0 | 151 | adc $nhi,$nhi,#0 |
152 | ldr $n0,[$_n0] @ restore n0 | ||
148 | adds $nlo,$nlo,$tj | 153 | adds $nlo,$nlo,$tj |
149 | adc $nhi,$nhi,#0 | ||
150 | ldr $tp,[$_bp] @ restore bp | ||
151 | ldr $tj,[$_bpend] @ restore &bp[num] | 154 | ldr $tj,[$_bpend] @ restore &bp[num] |
155 | adc $nhi,$nhi,#0 | ||
152 | str $nlo,[$num] @ tp[num-1]= | 156 | str $nlo,[$num] @ tp[num-1]= |
153 | ldr $n0,[$_n0] @ restore n0 | ||
154 | str $nhi,[$num,#4] @ tp[num]= | 157 | str $nhi,[$num,#4] @ tp[num]= |
155 | 158 | ||
156 | cmp $tp,$tj | 159 | cmp $tp,$tj |
diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl new file mode 100644 index 0000000000..e258658428 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/ia64-mont.pl | |||
@@ -0,0 +1,851 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # January 2010 | ||
11 | # | ||
12 | # "Teaser" Montgomery multiplication module for IA-64. There are | ||
13 | # several possibilities for improvement: | ||
14 | # | ||
15 | # - modulo-scheduling outer loop would eliminate quite a number of | ||
16 | # stalls after ldf8, xma and getf.sig outside inner loop and | ||
17 | # improve shorter key performance; | ||
18 | # - shorter vector support [with input vectors being fetched only | ||
19 | # once] should be added; | ||
20 | # - 2x unroll with help of n0[1] would make the code scalable on | ||
21 | # "wider" IA-64, "wider" than Itanium 2 that is, which is not of | ||
22 | # acute interest, because upcoming Tukwila's individual cores are | ||
23 | # reportedly based on Itanium 2 design; | ||
24 | # - dedicated squaring procedure(?); | ||
25 | # | ||
26 | # January 2010 | ||
27 | # | ||
28 | # Shorter vector support is implemented by zero-padding ap and np | ||
29 | # vectors up to 8 elements, or 512 bits. This means that 256-bit | ||
30 | # inputs will be processed only 2 times faster than 512-bit inputs, | ||
31 | # not 4 [as one would expect, because algorithm complexity is n^2]. | ||
32 | # The reason for padding is that inputs shorter than 512 bits won't | ||
33 | # be processed faster anyway, because minimal critical path of the | ||
34 | # core loop happens to match 512-bit timing. Either way, it resulted | ||
35 | # in >100% improvement of 512-bit RSA sign benchmark and 50% - of | ||
36 | # 1024-bit one [in comparison to original version of *this* module]. | ||
37 | # | ||
38 | # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* | ||
39 | # this module is: | ||
40 | # sign verify sign/s verify/s | ||
41 | # rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4 | ||
42 | # rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0 | ||
43 | # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0 | ||
44 | # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6 | ||
45 | # dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0 | ||
46 | # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4 | ||
47 | # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4 | ||
48 | # | ||
49 | # ... and *without* (but still with ia64.S): | ||
50 | # | ||
51 | # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5 | ||
52 | # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3 | ||
53 | # rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9 | ||
54 | # rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9 | ||
55 | # dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6 | ||
56 | # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2 | ||
57 | # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9 | ||
58 | # | ||
59 | # As it can be seen, RSA sign performance improves by 130-30%, | ||
60 | # hereafter less for longer keys, while verify - by 74-13%. | ||
61 | # DSA performance improves by 115-30%. | ||
62 | |||
63 | if ($^O eq "hpux") { | ||
64 | $ADDP="addp4"; | ||
65 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } | ||
66 | } else { $ADDP="add"; } | ||
67 | |||
68 | $code=<<___; | ||
69 | .explicit | ||
70 | .text | ||
71 | |||
72 | // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap, | ||
73 | // const BN_ULONG *bp,const BN_ULONG *np, | ||
74 | // const BN_ULONG *n0p,int num); | ||
75 | .align 64 | ||
76 | .global bn_mul_mont# | ||
77 | .proc bn_mul_mont# | ||
78 | bn_mul_mont: | ||
79 | .prologue | ||
80 | .body | ||
81 | { .mmi; cmp4.le p6,p7=2,r37;; | ||
82 | (p6) cmp4.lt.unc p8,p9=8,r37 | ||
83 | mov ret0=r0 };; | ||
84 | { .bbb; | ||
85 | (p9) br.cond.dptk.many bn_mul_mont_8 | ||
86 | (p8) br.cond.dpnt.many bn_mul_mont_general | ||
87 | (p7) br.ret.spnt.many b0 };; | ||
88 | .endp bn_mul_mont# | ||
89 | |||
90 | prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11; | ||
91 | |||
92 | rptr=r8; aptr=r9; bptr=r14; nptr=r15; | ||
93 | tptr=r16; // &tp[0] | ||
94 | tp_1=r17; // &tp[-1] | ||
95 | num=r18; len=r19; lc=r20; | ||
96 | topbit=r21; // carry bit from tmp[num] | ||
97 | |||
98 | n0=f6; | ||
99 | m0=f7; | ||
100 | bi=f8; | ||
101 | |||
102 | .align 64 | ||
103 | .local bn_mul_mont_general# | ||
104 | .proc bn_mul_mont_general# | ||
105 | bn_mul_mont_general: | ||
106 | .prologue | ||
107 | { .mmi; .save ar.pfs,prevfs | ||
108 | alloc prevfs=ar.pfs,6,2,0,8 | ||
109 | $ADDP aptr=0,in1 | ||
110 | .save ar.lc,prevlc | ||
111 | mov prevlc=ar.lc } | ||
112 | { .mmi; .vframe prevsp | ||
113 | mov prevsp=sp | ||
114 | $ADDP bptr=0,in2 | ||
115 | .save pr,prevpr | ||
116 | mov prevpr=pr };; | ||
117 | |||
118 | .body | ||
119 | .rotf alo[6],nlo[4],ahi[8],nhi[6] | ||
120 | .rotr a[3],n[3],t[2] | ||
121 | |||
122 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | ||
123 | ldf8 alo[4]=[aptr],16 // ap[0] | ||
124 | $ADDP r30=8,in1 };; | ||
125 | { .mmi; ldf8 alo[3]=[r30],16 // ap[1] | ||
126 | ldf8 alo[2]=[aptr],16 // ap[2] | ||
127 | $ADDP in4=0,in4 };; | ||
128 | { .mmi; ldf8 alo[1]=[r30] // ap[3] | ||
129 | ldf8 n0=[in4] // n0 | ||
130 | $ADDP rptr=0,in0 } | ||
131 | { .mmi; $ADDP nptr=0,in3 | ||
132 | mov r31=16 | ||
133 | zxt4 num=in5 };; | ||
134 | { .mmi; ldf8 nlo[2]=[nptr],8 // np[0] | ||
135 | shladd len=num,3,r0 | ||
136 | shladd r31=num,3,r31 };; | ||
137 | { .mmi; ldf8 nlo[1]=[nptr],8 // np[1] | ||
138 | add lc=-5,num | ||
139 | sub r31=sp,r31 };; | ||
140 | { .mfb; and sp=-16,r31 // alloca | ||
141 | xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0] | ||
142 | nop.b 0 } | ||
143 | { .mfb; nop.m 0 | ||
144 | xmpy.lu alo[4]=alo[4],bi | ||
145 | brp.loop.imp .L1st_ctop,.L1st_cend-16 | ||
146 | };; | ||
147 | { .mfi; nop.m 0 | ||
148 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0] | ||
149 | add tp_1=8,sp } | ||
150 | { .mfi; nop.m 0 | ||
151 | xma.lu alo[3]=alo[3],bi,ahi[2] | ||
152 | mov pr.rot=0x20001f<<16 | ||
153 | // ------^----- (p40) at first (p23) | ||
154 | // ----------^^ p[16:20]=1 | ||
155 | };; | ||
156 | { .mfi; nop.m 0 | ||
157 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0 | ||
158 | mov ar.lc=lc } | ||
159 | { .mfi; nop.m 0 | ||
160 | fcvt.fxu.s1 nhi[1]=f0 | ||
161 | mov ar.ec=8 };; | ||
162 | |||
163 | .align 32 | ||
164 | .L1st_ctop: | ||
165 | .pred.rel "mutex",p40,p42 | ||
166 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | ||
167 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | ||
168 | (p40) add n[2]=n[2],a[2] } // (p23) } | ||
169 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16) | ||
170 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | ||
171 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | ||
172 | { .mfi; (p21) getf.sig a[0]=alo[5] | ||
173 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | ||
174 | (p42) cmp.leu p41,p39=n[2],a[2] } // (p23) | ||
175 | { .mfi; (p23) st8 [tp_1]=n[2],8 | ||
176 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | ||
177 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | ||
178 | { .mmb; (p21) getf.sig n[0]=nlo[3] | ||
179 | (p16) nop.m 0 | ||
180 | br.ctop.sptk .L1st_ctop };; | ||
181 | .L1st_cend: | ||
182 | |||
183 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | ||
184 | getf.sig n[0]=nhi[4] | ||
185 | add num=-1,num };; // num-- | ||
186 | { .mmi; .pred.rel "mutex",p40,p42 | ||
187 | (p40) add n[0]=n[0],a[0] | ||
188 | (p42) add n[0]=n[0],a[0],1 | ||
189 | sub aptr=aptr,len };; // rewind | ||
190 | { .mmi; .pred.rel "mutex",p40,p42 | ||
191 | (p40) cmp.ltu p41,p39=n[0],a[0] | ||
192 | (p42) cmp.leu p41,p39=n[0],a[0] | ||
193 | sub nptr=nptr,len };; | ||
194 | { .mmi; .pred.rel "mutex",p39,p41 | ||
195 | (p39) add topbit=r0,r0 | ||
196 | (p41) add topbit=r0,r0,1 | ||
197 | nop.i 0 } | ||
198 | { .mmi; st8 [tp_1]=n[0] | ||
199 | add tptr=16,sp | ||
200 | add tp_1=8,sp };; | ||
201 | |||
202 | .Louter: | ||
203 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | ||
204 | ldf8 ahi[3]=[tptr] // tp[0] | ||
205 | add r30=8,aptr };; | ||
206 | { .mmi; ldf8 alo[4]=[aptr],16 // ap[0] | ||
207 | ldf8 alo[3]=[r30],16 // ap[1] | ||
208 | add r31=8,nptr };; | ||
209 | { .mfb; ldf8 alo[2]=[aptr],16 // ap[2] | ||
210 | xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0] | ||
211 | brp.loop.imp .Linner_ctop,.Linner_cend-16 | ||
212 | } | ||
213 | { .mfb; ldf8 alo[1]=[r30] // ap[3] | ||
214 | xma.lu alo[4]=alo[4],bi,ahi[3] | ||
215 | clrrrb.pr };; | ||
216 | { .mfi; ldf8 nlo[2]=[nptr],16 // np[0] | ||
217 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i] | ||
218 | nop.i 0 } | ||
219 | { .mfi; ldf8 nlo[1]=[r31] // np[1] | ||
220 | xma.lu alo[3]=alo[3],bi,ahi[2] | ||
221 | mov pr.rot=0x20101f<<16 | ||
222 | // ------^----- (p40) at first (p23) | ||
223 | // --------^--- (p30) at first (p22) | ||
224 | // ----------^^ p[16:20]=1 | ||
225 | };; | ||
226 | { .mfi; st8 [tptr]=r0 // tp[0] is already accounted | ||
227 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0 | ||
228 | mov ar.lc=lc } | ||
229 | { .mfi; | ||
230 | fcvt.fxu.s1 nhi[1]=f0 | ||
231 | mov ar.ec=8 };; | ||
232 | |||
233 | // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in | ||
234 | // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7 | ||
235 | // in the latter case accounts for two-tick pipeline stall, which means | ||
236 | // that its performance would be ~20% lower than optimal one. No | ||
237 | // attempt was made to address this, because original Itanium is | ||
238 | // hardly represented out in the wild... | ||
239 | .align 32 | ||
240 | .Linner_ctop: | ||
241 | .pred.rel "mutex",p40,p42 | ||
242 | .pred.rel "mutex",p30,p32 | ||
243 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | ||
244 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | ||
245 | (p40) add n[2]=n[2],a[2] } // (p23) | ||
246 | { .mfi; (p16) nop.m 0 | ||
247 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | ||
248 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | ||
249 | { .mfi; (p21) getf.sig a[0]=alo[5] | ||
250 | (p16) nop.f 0 | ||
251 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | ||
252 | { .mfi; (p21) ld8 t[0]=[tptr],8 | ||
253 | (p16) nop.f 0 | ||
254 | (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23) | ||
255 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++) | ||
256 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | ||
257 | (p30) add a[1]=a[1],t[1] } // (p22) | ||
258 | { .mfi; (p16) nop.m 0 | ||
259 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | ||
260 | (p32) add a[1]=a[1],t[1],1 };; // (p22) | ||
261 | { .mmi; (p21) getf.sig n[0]=nlo[3] | ||
262 | (p16) nop.m 0 | ||
263 | (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22) | ||
264 | { .mmb; (p23) st8 [tp_1]=n[2],8 | ||
265 | (p32) cmp.leu p31,p29=a[1],t[1] // (p22) | ||
266 | br.ctop.sptk .Linner_ctop };; | ||
267 | .Linner_cend: | ||
268 | |||
269 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | ||
270 | getf.sig n[0]=nhi[4] | ||
271 | nop.i 0 };; | ||
272 | |||
273 | { .mmi; .pred.rel "mutex",p31,p33 | ||
274 | (p31) add a[0]=a[0],topbit | ||
275 | (p33) add a[0]=a[0],topbit,1 | ||
276 | mov topbit=r0 };; | ||
277 | { .mfi; .pred.rel "mutex",p31,p33 | ||
278 | (p31) cmp.ltu p32,p30=a[0],topbit | ||
279 | (p33) cmp.leu p32,p30=a[0],topbit | ||
280 | } | ||
281 | { .mfi; .pred.rel "mutex",p40,p42 | ||
282 | (p40) add n[0]=n[0],a[0] | ||
283 | (p42) add n[0]=n[0],a[0],1 | ||
284 | };; | ||
285 | { .mmi; .pred.rel "mutex",p44,p46 | ||
286 | (p40) cmp.ltu p41,p39=n[0],a[0] | ||
287 | (p42) cmp.leu p41,p39=n[0],a[0] | ||
288 | (p32) add topbit=r0,r0,1 } | ||
289 | |||
290 | { .mmi; st8 [tp_1]=n[0],8 | ||
291 | cmp4.ne p6,p0=1,num | ||
292 | sub aptr=aptr,len };; // rewind | ||
293 | { .mmi; sub nptr=nptr,len | ||
294 | (p41) add topbit=r0,r0,1 | ||
295 | add tptr=16,sp } | ||
296 | { .mmb; add tp_1=8,sp | ||
297 | add num=-1,num // num-- | ||
298 | (p6) br.cond.sptk.many .Louter };; | ||
299 | |||
300 | { .mbb; add lc=4,lc | ||
301 | brp.loop.imp .Lsub_ctop,.Lsub_cend-16 | ||
302 | clrrrb.pr };; | ||
303 | { .mii; nop.m 0 | ||
304 | mov pr.rot=0x10001<<16 | ||
305 | // ------^---- (p33) at first (p17) | ||
306 | mov ar.lc=lc } | ||
307 | { .mii; nop.m 0 | ||
308 | mov ar.ec=3 | ||
309 | nop.i 0 };; | ||
310 | |||
311 | .Lsub_ctop: | ||
312 | .pred.rel "mutex",p33,p35 | ||
313 | { .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++) | ||
314 | (p16) nop.f 0 | ||
315 | (p33) sub n[1]=t[1],n[1] } // (p17) | ||
316 | { .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++) | ||
317 | (p16) nop.f 0 | ||
318 | (p35) sub n[1]=t[1],n[1],1 };; // (p17) | ||
319 | { .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r | ||
320 | (p33) cmp.gtu p34,p32=n[1],t[1] // (p17) | ||
321 | (p18) nop.b 0 } | ||
322 | { .mib; (p18) nop.m 0 | ||
323 | (p35) cmp.geu p34,p32=n[1],t[1] // (p17) | ||
324 | br.ctop.sptk .Lsub_ctop };; | ||
325 | .Lsub_cend: | ||
326 | |||
327 | { .mmb; .pred.rel "mutex",p34,p36 | ||
328 | (p34) sub topbit=topbit,r0 // (p19) | ||
329 | (p36) sub topbit=topbit,r0,1 | ||
330 | brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16 | ||
331 | } | ||
332 | { .mmb; sub rptr=rptr,len // rewind | ||
333 | sub tptr=tptr,len | ||
334 | clrrrb.pr };; | ||
335 | { .mmi; and aptr=tptr,topbit | ||
336 | andcm bptr=rptr,topbit | ||
337 | mov pr.rot=1<<16 };; | ||
338 | { .mii; or nptr=aptr,bptr | ||
339 | mov ar.lc=lc | ||
340 | mov ar.ec=3 };; | ||
341 | |||
342 | .Lcopy_ctop: | ||
343 | { .mmb; (p16) ld8 n[0]=[nptr],8 | ||
344 | (p18) st8 [tptr]=r0,8 | ||
345 | (p16) nop.b 0 } | ||
346 | { .mmb; (p16) nop.m 0 | ||
347 | (p18) st8 [rptr]=n[2],8 | ||
348 | br.ctop.sptk .Lcopy_ctop };; | ||
349 | .Lcopy_cend: | ||
350 | |||
351 | { .mmi; mov ret0=1 // signal "handled" | ||
352 | rum 1<<5 // clear um.mfh | ||
353 | mov ar.lc=prevlc } | ||
354 | { .mib; .restore sp | ||
355 | mov sp=prevsp | ||
356 | mov pr=prevpr,0x1ffff | ||
357 | br.ret.sptk.many b0 };; | ||
358 | .endp bn_mul_mont_general# | ||
359 | |||
360 | a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23; | ||
361 | n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31; | ||
362 | t0=r15; | ||
363 | |||
364 | ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15; | ||
365 | ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23; | ||
366 | |||
367 | .align 64 | ||
368 | .skip 48 // aligns loop body | ||
369 | .local bn_mul_mont_8# | ||
370 | .proc bn_mul_mont_8# | ||
371 | bn_mul_mont_8: | ||
372 | .prologue | ||
373 | { .mmi; .save ar.pfs,prevfs | ||
374 | alloc prevfs=ar.pfs,6,2,0,8 | ||
375 | .vframe prevsp | ||
376 | mov prevsp=sp | ||
377 | .save ar.lc,prevlc | ||
378 | mov prevlc=ar.lc } | ||
379 | { .mmi; add r17=-6*16,sp | ||
380 | add sp=-7*16,sp | ||
381 | .save pr,prevpr | ||
382 | mov prevpr=pr };; | ||
383 | |||
384 | { .mmi; .save.gf 0,0x10 | ||
385 | stf.spill [sp]=f16,-16 | ||
386 | .save.gf 0,0x20 | ||
387 | stf.spill [r17]=f17,32 | ||
388 | add r16=-5*16,prevsp};; | ||
389 | { .mmi; .save.gf 0,0x40 | ||
390 | stf.spill [r16]=f18,32 | ||
391 | .save.gf 0,0x80 | ||
392 | stf.spill [r17]=f19,32 | ||
393 | $ADDP aptr=0,in1 };; | ||
394 | { .mmi; .save.gf 0,0x100 | ||
395 | stf.spill [r16]=f20,32 | ||
396 | .save.gf 0,0x200 | ||
397 | stf.spill [r17]=f21,32 | ||
398 | $ADDP r29=8,in1 };; | ||
399 | { .mmi; .save.gf 0,0x400 | ||
400 | stf.spill [r16]=f22 | ||
401 | .save.gf 0,0x800 | ||
402 | stf.spill [r17]=f23 | ||
403 | $ADDP rptr=0,in0 };; | ||
404 | |||
405 | .body | ||
406 | .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10] | ||
407 | .rotr t[8] | ||
408 | |||
409 | // load input vectors padding them to 8 elements | ||
410 | { .mmi; ldf8 ai0=[aptr],16 // ap[0] | ||
411 | ldf8 ai1=[r29],16 // ap[1] | ||
412 | $ADDP bptr=0,in2 } | ||
413 | { .mmi; $ADDP r30=8,in2 | ||
414 | $ADDP nptr=0,in3 | ||
415 | $ADDP r31=8,in3 };; | ||
416 | { .mmi; ldf8 bj[7]=[bptr],16 // bp[0] | ||
417 | ldf8 bj[6]=[r30],16 // bp[1] | ||
418 | cmp4.le p4,p5=3,in5 } | ||
419 | { .mmi; ldf8 ni0=[nptr],16 // np[0] | ||
420 | ldf8 ni1=[r31],16 // np[1] | ||
421 | cmp4.le p6,p7=4,in5 };; | ||
422 | |||
423 | { .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2] | ||
424 | (p5)fcvt.fxu ai2=f0 | ||
425 | cmp4.le p8,p9=5,in5 } | ||
426 | { .mfi; (p6)ldf8 ai3=[r29],16 // ap[3] | ||
427 | (p7)fcvt.fxu ai3=f0 | ||
428 | cmp4.le p10,p11=6,in5 } | ||
429 | { .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2] | ||
430 | (p5)fcvt.fxu bj[5]=f0 | ||
431 | cmp4.le p12,p13=7,in5 } | ||
432 | { .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3] | ||
433 | (p7)fcvt.fxu bj[4]=f0 | ||
434 | cmp4.le p14,p15=8,in5 } | ||
435 | { .mfi; (p4)ldf8 ni2=[nptr],16 // np[2] | ||
436 | (p5)fcvt.fxu ni2=f0 | ||
437 | addp4 r28=-1,in5 } | ||
438 | { .mfi; (p6)ldf8 ni3=[r31],16 // np[3] | ||
439 | (p7)fcvt.fxu ni3=f0 | ||
440 | $ADDP in4=0,in4 };; | ||
441 | |||
442 | { .mfi; ldf8 n0=[in4] | ||
443 | fcvt.fxu tf[1]=f0 | ||
444 | nop.i 0 } | ||
445 | |||
446 | { .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4] | ||
447 | (p9)fcvt.fxu ai4=f0 | ||
448 | mov t[0]=r0 } | ||
449 | { .mfi; (p10)ldf8 ai5=[r29],16 // ap[5] | ||
450 | (p11)fcvt.fxu ai5=f0 | ||
451 | mov t[1]=r0 } | ||
452 | { .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4] | ||
453 | (p9)fcvt.fxu bj[3]=f0 | ||
454 | mov t[2]=r0 } | ||
455 | { .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5] | ||
456 | (p11)fcvt.fxu bj[2]=f0 | ||
457 | mov t[3]=r0 } | ||
458 | { .mfi; (p8)ldf8 ni4=[nptr],16 // np[4] | ||
459 | (p9)fcvt.fxu ni4=f0 | ||
460 | mov t[4]=r0 } | ||
461 | { .mfi; (p10)ldf8 ni5=[r31],16 // np[5] | ||
462 | (p11)fcvt.fxu ni5=f0 | ||
463 | mov t[5]=r0 };; | ||
464 | |||
465 | { .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6] | ||
466 | (p13)fcvt.fxu ai6=f0 | ||
467 | mov t[6]=r0 } | ||
468 | { .mfi; (p14)ldf8 ai7=[r29],16 // ap[7] | ||
469 | (p15)fcvt.fxu ai7=f0 | ||
470 | mov t[7]=r0 } | ||
471 | { .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6] | ||
472 | (p13)fcvt.fxu bj[1]=f0 | ||
473 | mov ar.lc=r28 } | ||
474 | { .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] | ||
475 | (p15)fcvt.fxu bj[0]=f0 | ||
476 | mov ar.ec=1 } | ||
477 | { .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] | ||
478 | (p13)fcvt.fxu ni6=f0 | ||
479 | mov pr.rot=1<<16 } | ||
480 | { .mfb; (p14)ldf8 ni7=[r31],16 // np[7] | ||
481 | (p15)fcvt.fxu ni7=f0 | ||
482 | brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 | ||
483 | };; | ||
484 | |||
485 | // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt | ||
486 | // to measure with help of Interval Time Counter indicated that the | ||
487 | // factor is a tad higher: 33 or 34, if not 35. Exact measurement and | ||
488 | // addressing the issue is problematic, because I don't have access | ||
489 | // to platform-specific instruction-level profiler. On Itanium it | ||
490 | // should run in 56*n ticks, because of higher xma latency... | ||
491 | .Louter_8_ctop: | ||
492 | .pred.rel "mutex",p40,p42 | ||
493 | .pred.rel "mutex",p48,p50 | ||
494 | { .mfi; (p16) nop.m 0 // 0: | ||
495 | (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0] | ||
496 | (p40) add a3=a3,n3 } // (p17) a3+=n3 | ||
497 | { .mfi; (p42) add a3=a3,n3,1 | ||
498 | (p16) xma.lu alo[0]=ai0,bj[7],tf[1] | ||
499 | (p16) nop.i 0 };; | ||
500 | { .mii; (p17) getf.sig a7=alo[8] // 1: | ||
501 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | ||
502 | (p50) add t[6]=t[6],a3,1 };; | ||
503 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | ||
504 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | ||
505 | (p40) cmp.ltu p43,p41=a3,n3 } | ||
506 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | ||
507 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | ||
508 | (p16) nop.i 0 };; | ||
509 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | ||
510 | (p48) cmp.ltu p51,p49=t[6],a3 | ||
511 | (p50) cmp.leu p51,p49=t[6],a3 };; | ||
512 | .pred.rel "mutex",p41,p43 | ||
513 | .pred.rel "mutex",p49,p51 | ||
514 | { .mfi; (p16) nop.m 0 // 4: | ||
515 | (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i] | ||
516 | (p41) add a4=a4,n4 } // (p17) a4+=n4 | ||
517 | { .mfi; (p43) add a4=a4,n4,1 | ||
518 | (p16) xma.lu alo[1]=ai1,bj[7],ahi[0] | ||
519 | (p16) nop.i 0 };; | ||
520 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | ||
521 | (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0 | ||
522 | (p51) add t[5]=t[5],a4,1 };; | ||
523 | { .mfi; (p16) nop.m 0 // 6: | ||
524 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | ||
525 | (p41) cmp.ltu p42,p40=a4,n4 } | ||
526 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | ||
527 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | ||
528 | (p16) nop.i 0 };; | ||
529 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | ||
530 | (p49) cmp.ltu p50,p48=t[5],a4 | ||
531 | (p51) cmp.leu p50,p48=t[5],a4 };; | ||
532 | .pred.rel "mutex",p40,p42 | ||
533 | .pred.rel "mutex",p48,p50 | ||
534 | { .mfi; (p16) nop.m 0 // 8: | ||
535 | (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i] | ||
536 | (p40) add a5=a5,n5 } // (p17) a5+=n5 | ||
537 | { .mfi; (p42) add a5=a5,n5,1 | ||
538 | (p16) xma.lu alo[2]=ai2,bj[7],ahi[1] | ||
539 | (p16) nop.i 0 };; | ||
540 | { .mii; (p16) getf.sig a1=alo[1] // 9: | ||
541 | (p48) add t[4]=t[4],a5 // (p17) t[4]+=a5 };; | ||
542 | (p50) add t[4]=t[4],a5,1 };; | ||
543 | { .mfi; (p16) nop.m 0 // 10: | ||
544 | (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0 | ||
545 | (p40) cmp.ltu p43,p41=a5,n5 } | ||
546 | { .mfi; (p42) cmp.leu p43,p41=a5,n5 | ||
547 | (p16) xma.lu nlo[0]=ni0,mj[0],alo[0] | ||
548 | (p16) nop.i 0 };; | ||
549 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | ||
550 | (p48) cmp.ltu p51,p49=t[4],a5 | ||
551 | (p50) cmp.leu p51,p49=t[4],a5 };; | ||
552 | .pred.rel "mutex",p41,p43 | ||
553 | .pred.rel "mutex",p49,p51 | ||
554 | { .mfi; (p17) getf.sig n8=nhi[8] // 12: | ||
555 | (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i] | ||
556 | (p41) add a6=a6,n6 } // (p17) a6+=n6 | ||
557 | { .mfi; (p43) add a6=a6,n6,1 | ||
558 | (p16) xma.lu alo[3]=ai3,bj[7],ahi[2] | ||
559 | (p16) nop.i 0 };; | ||
560 | { .mii; (p16) getf.sig a2=alo[2] // 13: | ||
561 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | ||
562 | (p51) add t[3]=t[3],a6,1 };; | ||
563 | { .mfi; (p16) nop.m 0 // 14: | ||
564 | (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0 | ||
565 | (p41) cmp.ltu p42,p40=a6,n6 } | ||
566 | { .mfi; (p43) cmp.leu p42,p40=a6,n6 | ||
567 | (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0] | ||
568 | (p16) nop.i 0 };; | ||
569 | { .mii; (p16) nop.m 0 // 15: | ||
570 | (p49) cmp.ltu p50,p48=t[3],a6 | ||
571 | (p51) cmp.leu p50,p48=t[3],a6 };; | ||
572 | .pred.rel "mutex",p40,p42 | ||
573 | .pred.rel "mutex",p48,p50 | ||
574 | { .mfi; (p16) nop.m 0 // 16: | ||
575 | (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i] | ||
576 | (p40) add a7=a7,n7 } // (p17) a7+=n7 | ||
577 | { .mfi; (p42) add a7=a7,n7,1 | ||
578 | (p16) xma.lu alo[4]=ai4,bj[7],ahi[3] | ||
579 | (p16) nop.i 0 };; | ||
580 | { .mii; (p16) getf.sig a3=alo[3] // 17: | ||
581 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | ||
582 | (p50) add t[2]=t[2],a7,1 };; | ||
583 | { .mfi; (p16) nop.m 0 // 18: | ||
584 | (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0 | ||
585 | (p40) cmp.ltu p43,p41=a7,n7 } | ||
586 | { .mfi; (p42) cmp.leu p43,p41=a7,n7 | ||
587 | (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1] | ||
588 | (p16) nop.i 0 };; | ||
589 | { .mii; (p16) getf.sig n1=nlo[1] // 19: | ||
590 | (p48) cmp.ltu p51,p49=t[2],a7 | ||
591 | (p50) cmp.leu p51,p49=t[2],a7 };; | ||
592 | .pred.rel "mutex",p41,p43 | ||
593 | .pred.rel "mutex",p49,p51 | ||
594 | { .mfi; (p16) nop.m 0 // 20: | ||
595 | (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i] | ||
596 | (p41) add a8=a8,n8 } // (p17) a8+=n8 | ||
597 | { .mfi; (p43) add a8=a8,n8,1 | ||
598 | (p16) xma.lu alo[5]=ai5,bj[7],ahi[4] | ||
599 | (p16) nop.i 0 };; | ||
600 | { .mii; (p16) getf.sig a4=alo[4] // 21: | ||
601 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | ||
602 | (p51) add t[1]=t[1],a8,1 };; | ||
603 | { .mfi; (p16) nop.m 0 // 22: | ||
604 | (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0 | ||
605 | (p41) cmp.ltu p42,p40=a8,n8 } | ||
606 | { .mfi; (p43) cmp.leu p42,p40=a8,n8 | ||
607 | (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2] | ||
608 | (p16) nop.i 0 };; | ||
609 | { .mii; (p16) getf.sig n2=nlo[2] // 23: | ||
610 | (p49) cmp.ltu p50,p48=t[1],a8 | ||
611 | (p51) cmp.leu p50,p48=t[1],a8 };; | ||
612 | { .mfi; (p16) nop.m 0 // 24: | ||
613 | (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i] | ||
614 | (p16) add a1=a1,n1 } // (p16) a1+=n1 | ||
615 | { .mfi; (p16) nop.m 0 | ||
616 | (p16) xma.lu alo[6]=ai6,bj[7],ahi[5] | ||
617 | (p17) mov t[0]=r0 };; | ||
618 | { .mii; (p16) getf.sig a5=alo[5] // 25: | ||
619 | (p16) add t0=t[7],a1 // (p16) t[7]+=a1 | ||
620 | (p42) add t[0]=t[0],r0,1 };; | ||
621 | { .mfi; (p16) setf.sig tf[0]=t0 // 26: | ||
622 | (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0 | ||
623 | (p50) add t[0]=t[0],r0,1 } | ||
624 | { .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1 | ||
625 | (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3] | ||
626 | (p16) nop.i 0 };; | ||
627 | { .mii; (p16) getf.sig n3=nlo[3] // 27: | ||
628 | (p16) cmp.ltu.unc p50,p48=t0,a1 | ||
629 | (p16) nop.i 0 };; | ||
630 | .pred.rel "mutex",p40,p42 | ||
631 | .pred.rel "mutex",p48,p50 | ||
632 | { .mfi; (p16) nop.m 0 // 28: | ||
633 | (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i] | ||
634 | (p40) add a2=a2,n2 } // (p16) a2+=n2 | ||
635 | { .mfi; (p42) add a2=a2,n2,1 | ||
636 | (p16) xma.lu alo[7]=ai7,bj[7],ahi[6] | ||
637 | (p16) nop.i 0 };; | ||
638 | { .mii; (p16) getf.sig a6=alo[6] // 29: | ||
639 | (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2 | ||
640 | (p50) add t[6]=t[6],a2,1 };; | ||
641 | { .mfi; (p16) nop.m 0 // 30: | ||
642 | (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0 | ||
643 | (p40) cmp.ltu p41,p39=a2,n2 } | ||
644 | { .mfi; (p42) cmp.leu p41,p39=a2,n2 | ||
645 | (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4] | ||
646 | (p16) nop.i 0 };; | ||
647 | { .mfi; (p16) getf.sig n4=nlo[4] // 31: | ||
648 | (p16) nop.f 0 | ||
649 | (p48) cmp.ltu p49,p47=t[6],a2 } | ||
650 | { .mfb; (p50) cmp.leu p49,p47=t[6],a2 | ||
651 | (p16) nop.f 0 | ||
652 | br.ctop.sptk.many .Louter_8_ctop };; | ||
653 | .Louter_8_cend: | ||
654 | |||
655 | // above loop has to execute one more time, without (p16), which is | ||
656 | // replaced with merged move of np[8] to GPR bank | ||
657 | .pred.rel "mutex",p40,p42 | ||
658 | .pred.rel "mutex",p48,p50 | ||
659 | { .mmi; (p0) getf.sig n1=ni0 // 0: | ||
660 | (p40) add a3=a3,n3 // (p17) a3+=n3 | ||
661 | (p42) add a3=a3,n3,1 };; | ||
662 | { .mii; (p17) getf.sig a7=alo[8] // 1: | ||
663 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | ||
664 | (p50) add t[6]=t[6],a3,1 };; | ||
665 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | ||
666 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | ||
667 | (p40) cmp.ltu p43,p41=a3,n3 } | ||
668 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | ||
669 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | ||
670 | (p0) nop.i 0 };; | ||
671 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | ||
672 | (p48) cmp.ltu p51,p49=t[6],a3 | ||
673 | (p50) cmp.leu p51,p49=t[6],a3 };; | ||
674 | .pred.rel "mutex",p41,p43 | ||
675 | .pred.rel "mutex",p49,p51 | ||
676 | { .mmi; (p0) getf.sig n2=ni1 // 4: | ||
677 | (p41) add a4=a4,n4 // (p17) a4+=n4 | ||
678 | (p43) add a4=a4,n4,1 };; | ||
679 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | ||
680 | (p0) nop.f 0 | ||
681 | (p51) add t[5]=t[5],a4,1 };; | ||
682 | { .mfi; (p0) getf.sig n3=ni2 // 6: | ||
683 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | ||
684 | (p41) cmp.ltu p42,p40=a4,n4 } | ||
685 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | ||
686 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | ||
687 | (p0) nop.i 0 };; | ||
688 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | ||
689 | (p49) cmp.ltu p50,p48=t[5],a4 | ||
690 | (p51) cmp.leu p50,p48=t[5],a4 };; | ||
691 | .pred.rel "mutex",p40,p42 | ||
692 | .pred.rel "mutex",p48,p50 | ||
693 | { .mii; (p0) getf.sig n4=ni3 // 8: | ||
694 | (p40) add a5=a5,n5 // (p17) a5+=n5 | ||
695 | (p42) add a5=a5,n5,1 };; | ||
696 | { .mii; (p0) nop.m 0 // 9: | ||
697 | (p48) add t[4]=t[4],a5 // (p17) t[4]+=a5 };; | ||
698 | (p50) add t[4]=t[4],a5,1 };; | ||
699 | { .mii; (p0) nop.m 0 // 10: | ||
700 | (p40) cmp.ltu p43,p41=a5,n5 | ||
701 | (p42) cmp.leu p43,p41=a5,n5 };; | ||
702 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | ||
703 | (p48) cmp.ltu p51,p49=t[4],a5 | ||
704 | (p50) cmp.leu p51,p49=t[4],a5 };; | ||
705 | .pred.rel "mutex",p41,p43 | ||
706 | .pred.rel "mutex",p49,p51 | ||
707 | { .mii; (p17) getf.sig n8=nhi[8] // 12: | ||
708 | (p41) add a6=a6,n6 // (p17) a6+=n6 | ||
709 | (p43) add a6=a6,n6,1 };; | ||
710 | { .mii; (p0) getf.sig n5=ni4 // 13: | ||
711 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | ||
712 | (p51) add t[3]=t[3],a6,1 };; | ||
713 | { .mii; (p0) nop.m 0 // 14: | ||
714 | (p41) cmp.ltu p42,p40=a6,n6 | ||
715 | (p43) cmp.leu p42,p40=a6,n6 };; | ||
716 | { .mii; (p0) getf.sig n6=ni5 // 15: | ||
717 | (p49) cmp.ltu p50,p48=t[3],a6 | ||
718 | (p51) cmp.leu p50,p48=t[3],a6 };; | ||
719 | .pred.rel "mutex",p40,p42 | ||
720 | .pred.rel "mutex",p48,p50 | ||
721 | { .mii; (p0) nop.m 0 // 16: | ||
722 | (p40) add a7=a7,n7 // (p17) a7+=n7 | ||
723 | (p42) add a7=a7,n7,1 };; | ||
724 | { .mii; (p0) nop.m 0 // 17: | ||
725 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | ||
726 | (p50) add t[2]=t[2],a7,1 };; | ||
727 | { .mii; (p0) nop.m 0 // 18: | ||
728 | (p40) cmp.ltu p43,p41=a7,n7 | ||
729 | (p42) cmp.leu p43,p41=a7,n7 };; | ||
730 | { .mii; (p0) getf.sig n7=ni6 // 19: | ||
731 | (p48) cmp.ltu p51,p49=t[2],a7 | ||
732 | (p50) cmp.leu p51,p49=t[2],a7 };; | ||
733 | .pred.rel "mutex",p41,p43 | ||
734 | .pred.rel "mutex",p49,p51 | ||
735 | { .mii; (p0) nop.m 0 // 20: | ||
736 | (p41) add a8=a8,n8 // (p17) a8+=n8 | ||
737 | (p43) add a8=a8,n8,1 };; | ||
738 | { .mmi; (p0) nop.m 0 // 21: | ||
739 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | ||
740 | (p51) add t[1]=t[1],a8,1 } | ||
741 | { .mmi; (p17) mov t[0]=r0 | ||
742 | (p41) cmp.ltu p42,p40=a8,n8 | ||
743 | (p43) cmp.leu p42,p40=a8,n8 };; | ||
744 | { .mmi; (p0) getf.sig n8=ni7 // 22: | ||
745 | (p49) cmp.ltu p50,p48=t[1],a8 | ||
746 | (p51) cmp.leu p50,p48=t[1],a8 } | ||
747 | { .mmi; (p42) add t[0]=t[0],r0,1 | ||
748 | (p0) add r16=-7*16,prevsp | ||
749 | (p0) add r17=-6*16,prevsp };; | ||
750 | |||
751 | // subtract np[8] from carrybit|tmp[8] | ||
752 | // carrybit|tmp[8] layout upon exit from above loop is: | ||
753 | // t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) | ||
754 | { .mmi; (p50)add t[0]=t[0],r0,1 | ||
755 | add r18=-5*16,prevsp | ||
756 | sub n1=t0,n1 };; | ||
757 | { .mmi; cmp.gtu p34,p32=n1,t0;; | ||
758 | .pred.rel "mutex",p32,p34 | ||
759 | (p32)sub n2=t[7],n2 | ||
760 | (p34)sub n2=t[7],n2,1 };; | ||
761 | { .mii; (p32)cmp.gtu p35,p33=n2,t[7] | ||
762 | (p34)cmp.geu p35,p33=n2,t[7];; | ||
763 | .pred.rel "mutex",p33,p35 | ||
764 | (p33)sub n3=t[6],n3 } | ||
765 | { .mmi; (p35)sub n3=t[6],n3,1;; | ||
766 | (p33)cmp.gtu p34,p32=n3,t[6] | ||
767 | (p35)cmp.geu p34,p32=n3,t[6] };; | ||
768 | .pred.rel "mutex",p32,p34 | ||
769 | { .mii; (p32)sub n4=t[5],n4 | ||
770 | (p34)sub n4=t[5],n4,1;; | ||
771 | (p32)cmp.gtu p35,p33=n4,t[5] } | ||
772 | { .mmi; (p34)cmp.geu p35,p33=n4,t[5];; | ||
773 | .pred.rel "mutex",p33,p35 | ||
774 | (p33)sub n5=t[4],n5 | ||
775 | (p35)sub n5=t[4],n5,1 };; | ||
776 | { .mii; (p33)cmp.gtu p34,p32=n5,t[4] | ||
777 | (p35)cmp.geu p34,p32=n5,t[4];; | ||
778 | .pred.rel "mutex",p32,p34 | ||
779 | (p32)sub n6=t[3],n6 } | ||
780 | { .mmi; (p34)sub n6=t[3],n6,1;; | ||
781 | (p32)cmp.gtu p35,p33=n6,t[3] | ||
782 | (p34)cmp.geu p35,p33=n6,t[3] };; | ||
783 | .pred.rel "mutex",p33,p35 | ||
784 | { .mii; (p33)sub n7=t[2],n7 | ||
785 | (p35)sub n7=t[2],n7,1;; | ||
786 | (p33)cmp.gtu p34,p32=n7,t[2] } | ||
787 | { .mmi; (p35)cmp.geu p34,p32=n7,t[2];; | ||
788 | .pred.rel "mutex",p32,p34 | ||
789 | (p32)sub n8=t[1],n8 | ||
790 | (p34)sub n8=t[1],n8,1 };; | ||
791 | { .mii; (p32)cmp.gtu p35,p33=n8,t[1] | ||
792 | (p34)cmp.geu p35,p33=n8,t[1];; | ||
793 | .pred.rel "mutex",p33,p35 | ||
794 | (p33)sub a8=t[0],r0 } | ||
795 | { .mmi; (p35)sub a8=t[0],r0,1;; | ||
796 | (p33)cmp.gtu p34,p32=a8,t[0] | ||
797 | (p35)cmp.geu p34,p32=a8,t[0] };; | ||
798 | |||
799 | // save the result, either tmp[num] or tmp[num]-np[num] | ||
800 | .pred.rel "mutex",p32,p34 | ||
801 | { .mmi; (p32)st8 [rptr]=n1,8 | ||
802 | (p34)st8 [rptr]=t0,8 | ||
803 | add r19=-4*16,prevsp};; | ||
804 | { .mmb; (p32)st8 [rptr]=n2,8 | ||
805 | (p34)st8 [rptr]=t[7],8 | ||
806 | (p5)br.cond.dpnt.few .Ldone };; | ||
807 | { .mmb; (p32)st8 [rptr]=n3,8 | ||
808 | (p34)st8 [rptr]=t[6],8 | ||
809 | (p7)br.cond.dpnt.few .Ldone };; | ||
810 | { .mmb; (p32)st8 [rptr]=n4,8 | ||
811 | (p34)st8 [rptr]=t[5],8 | ||
812 | (p9)br.cond.dpnt.few .Ldone };; | ||
813 | { .mmb; (p32)st8 [rptr]=n5,8 | ||
814 | (p34)st8 [rptr]=t[4],8 | ||
815 | (p11)br.cond.dpnt.few .Ldone };; | ||
816 | { .mmb; (p32)st8 [rptr]=n6,8 | ||
817 | (p34)st8 [rptr]=t[3],8 | ||
818 | (p13)br.cond.dpnt.few .Ldone };; | ||
819 | { .mmb; (p32)st8 [rptr]=n7,8 | ||
820 | (p34)st8 [rptr]=t[2],8 | ||
821 | (p15)br.cond.dpnt.few .Ldone };; | ||
822 | { .mmb; (p32)st8 [rptr]=n8,8 | ||
823 | (p34)st8 [rptr]=t[1],8 | ||
824 | nop.b 0 };; | ||
825 | .Ldone: // epilogue | ||
826 | { .mmi; ldf.fill f16=[r16],64 | ||
827 | ldf.fill f17=[r17],64 | ||
828 | nop.i 0 } | ||
829 | { .mmi; ldf.fill f18=[r18],64 | ||
830 | ldf.fill f19=[r19],64 | ||
831 | mov pr=prevpr,0x1ffff };; | ||
832 | { .mmi; ldf.fill f20=[r16] | ||
833 | ldf.fill f21=[r17] | ||
834 | mov ar.lc=prevlc } | ||
835 | { .mmi; ldf.fill f22=[r18] | ||
836 | ldf.fill f23=[r19] | ||
837 | mov ret0=1 } // signal "handled" | ||
838 | { .mib; rum 1<<5 | ||
839 | .restore sp | ||
840 | mov sp=prevsp | ||
841 | br.ret.sptk.many b0 };; | ||
842 | .endp bn_mul_mont_8# | ||
843 | |||
844 | .type copyright#,\@object | ||
845 | copyright: | ||
846 | stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>" | ||
847 | ___ | ||
848 | |||
849 | $output=shift and open STDOUT,">$output"; | ||
850 | print $code; | ||
851 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/mips-mont.pl b/src/lib/libcrypto/bn/asm/mips-mont.pl new file mode 100644 index 0000000000..b944a12b8e --- /dev/null +++ b/src/lib/libcrypto/bn/asm/mips-mont.pl | |||
@@ -0,0 +1,426 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # This module doesn't present direct interest for OpenSSL, because it | ||
11 | # doesn't provide better performance for longer keys, at least not on | ||
12 | # in-order-execution cores. While 512-bit RSA sign operations can be | ||
13 | # 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and | ||
14 | # 4096-bit ones are up to 15% slower. In 32-bit mode it varies from | ||
15 | # 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA | ||
16 | # verify:-( All comparisons are against bn_mul_mont-free assembler. | ||
17 | # The module might be of interest to embedded system developers, as | ||
18 | # the code is smaller than 1KB, yet offers >3x improvement on MIPS64 | ||
19 | # and 75-30% [less for longer keys] on MIPS32 over compiler-generated | ||
20 | # code. | ||
21 | |||
22 | ###################################################################### | ||
23 | # There are a number of MIPS ABIs in use; O32 and N32/64 are the most | ||
24 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
25 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
26 | # manner. Therefore let's stick to NUBI register layout: | ||
27 | # | ||
28 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
29 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
30 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
31 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
32 | # | ||
33 | # The return value is placed in $a0. Following coding rules facilitate | ||
34 | # interoperability: | ||
35 | # | ||
36 | # - never ever touch $tp, "thread pointer", former $gp; | ||
37 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
38 | # old code]; | ||
39 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
40 | # | ||
41 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
42 | # | ||
43 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
44 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
45 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
46 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
47 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
48 | # | ||
49 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
50 | |||
51 | if ($flavour =~ /64|n32/i) { | ||
52 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
53 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
54 | $REG_S="sd"; | ||
55 | $REG_L="ld"; | ||
56 | $SZREG=8; | ||
57 | } else { | ||
58 | $PTR_ADD="add"; | ||
59 | $PTR_SUB="sub"; | ||
60 | $REG_S="sw"; | ||
61 | $REG_L="lw"; | ||
62 | $SZREG=4; | ||
63 | } | ||
64 | $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000; | ||
65 | # | ||
66 | # <appro@openssl.org> | ||
67 | # | ||
68 | ###################################################################### | ||
69 | |||
70 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
71 | open STDOUT,">$output"; | ||
72 | |||
73 | if ($flavour =~ /64|n32/i) { | ||
74 | $LD="ld"; | ||
75 | $ST="sd"; | ||
76 | $MULTU="dmultu"; | ||
77 | $ADDU="daddu"; | ||
78 | $SUBU="dsubu"; | ||
79 | $BNSZ=8; | ||
80 | } else { | ||
81 | $LD="lw"; | ||
82 | $ST="sw"; | ||
83 | $MULTU="multu"; | ||
84 | $ADDU="addu"; | ||
85 | $SUBU="subu"; | ||
86 | $BNSZ=4; | ||
87 | } | ||
88 | |||
89 | # int bn_mul_mont( | ||
90 | $rp=$a0; # BN_ULONG *rp, | ||
91 | $ap=$a1; # const BN_ULONG *ap, | ||
92 | $bp=$a2; # const BN_ULONG *bp, | ||
93 | $np=$a3; # const BN_ULONG *np, | ||
94 | $n0=$a4; # const BN_ULONG *n0, | ||
95 | $num=$a5; # int num); | ||
96 | |||
97 | $lo0=$a6; | ||
98 | $hi0=$a7; | ||
99 | $lo1=$t1; | ||
100 | $hi1=$t2; | ||
101 | $aj=$s0; | ||
102 | $bi=$s1; | ||
103 | $nj=$s2; | ||
104 | $tp=$s3; | ||
105 | $alo=$s4; | ||
106 | $ahi=$s5; | ||
107 | $nlo=$s6; | ||
108 | $nhi=$s7; | ||
109 | $tj=$s8; | ||
110 | $i=$s9; | ||
111 | $j=$s10; | ||
112 | $m1=$s11; | ||
113 | |||
114 | $FRAMESIZE=14; | ||
115 | |||
116 | $code=<<___; | ||
117 | .text | ||
118 | |||
119 | .set noat | ||
120 | .set noreorder | ||
121 | |||
122 | .align 5 | ||
123 | .globl bn_mul_mont | ||
124 | .ent bn_mul_mont | ||
125 | bn_mul_mont: | ||
126 | ___ | ||
127 | $code.=<<___ if ($flavour =~ /o32/i); | ||
128 | lw $n0,16($sp) | ||
129 | lw $num,20($sp) | ||
130 | ___ | ||
131 | $code.=<<___; | ||
132 | slt $at,$num,4 | ||
133 | bnez $at,1f | ||
134 | li $t0,0 | ||
135 | slt $at,$num,17 # on in-order CPU | ||
136 | bnezl $at,bn_mul_mont_internal | ||
137 | nop | ||
138 | 1: jr $ra | ||
139 | li $a0,0 | ||
140 | .end bn_mul_mont | ||
141 | |||
142 | .align 5 | ||
143 | .ent bn_mul_mont_internal | ||
144 | bn_mul_mont_internal: | ||
145 | .frame $fp,$FRAMESIZE*$SZREG,$ra | ||
146 | .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG | ||
147 | $PTR_SUB $sp,$FRAMESIZE*$SZREG | ||
148 | $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp) | ||
149 | $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp) | ||
150 | $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp) | ||
151 | $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp) | ||
152 | $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp) | ||
153 | $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp) | ||
154 | $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp) | ||
155 | $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp) | ||
156 | $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp) | ||
157 | ___ | ||
158 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
159 | $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp) | ||
160 | $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp) | ||
161 | $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp) | ||
162 | $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp) | ||
163 | ___ | ||
164 | $code.=<<___; | ||
165 | move $fp,$sp | ||
166 | |||
167 | .set reorder | ||
168 | $LD $n0,0($n0) | ||
169 | $LD $bi,0($bp) # bp[0] | ||
170 | $LD $aj,0($ap) # ap[0] | ||
171 | $LD $nj,0($np) # np[0] | ||
172 | |||
173 | $PTR_SUB $sp,2*$BNSZ # place for two extra words | ||
174 | sll $num,`log($BNSZ)/log(2)` | ||
175 | li $at,-4096 | ||
176 | $PTR_SUB $sp,$num | ||
177 | and $sp,$at | ||
178 | |||
179 | $MULTU $aj,$bi | ||
180 | $LD $alo,$BNSZ($ap) | ||
181 | $LD $nlo,$BNSZ($np) | ||
182 | mflo $lo0 | ||
183 | mfhi $hi0 | ||
184 | $MULTU $lo0,$n0 | ||
185 | mflo $m1 | ||
186 | |||
187 | $MULTU $alo,$bi | ||
188 | mflo $alo | ||
189 | mfhi $ahi | ||
190 | |||
191 | $MULTU $nj,$m1 | ||
192 | mflo $lo1 | ||
193 | mfhi $hi1 | ||
194 | $MULTU $nlo,$m1 | ||
195 | $ADDU $lo1,$lo0 | ||
196 | sltu $at,$lo1,$lo0 | ||
197 | $ADDU $hi1,$at | ||
198 | mflo $nlo | ||
199 | mfhi $nhi | ||
200 | |||
201 | move $tp,$sp | ||
202 | li $j,2*$BNSZ | ||
203 | .align 4 | ||
204 | .L1st: | ||
205 | .set noreorder | ||
206 | $PTR_ADD $aj,$ap,$j | ||
207 | $PTR_ADD $nj,$np,$j | ||
208 | $LD $aj,($aj) | ||
209 | $LD $nj,($nj) | ||
210 | |||
211 | $MULTU $aj,$bi | ||
212 | $ADDU $lo0,$alo,$hi0 | ||
213 | $ADDU $lo1,$nlo,$hi1 | ||
214 | sltu $at,$lo0,$hi0 | ||
215 | sltu $t0,$lo1,$hi1 | ||
216 | $ADDU $hi0,$ahi,$at | ||
217 | $ADDU $hi1,$nhi,$t0 | ||
218 | mflo $alo | ||
219 | mfhi $ahi | ||
220 | |||
221 | $ADDU $lo1,$lo0 | ||
222 | sltu $at,$lo1,$lo0 | ||
223 | $MULTU $nj,$m1 | ||
224 | $ADDU $hi1,$at | ||
225 | addu $j,$BNSZ | ||
226 | $ST $lo1,($tp) | ||
227 | sltu $t0,$j,$num | ||
228 | mflo $nlo | ||
229 | mfhi $nhi | ||
230 | |||
231 | bnez $t0,.L1st | ||
232 | $PTR_ADD $tp,$BNSZ | ||
233 | .set reorder | ||
234 | |||
235 | $ADDU $lo0,$alo,$hi0 | ||
236 | sltu $at,$lo0,$hi0 | ||
237 | $ADDU $hi0,$ahi,$at | ||
238 | |||
239 | $ADDU $lo1,$nlo,$hi1 | ||
240 | sltu $t0,$lo1,$hi1 | ||
241 | $ADDU $hi1,$nhi,$t0 | ||
242 | $ADDU $lo1,$lo0 | ||
243 | sltu $at,$lo1,$lo0 | ||
244 | $ADDU $hi1,$at | ||
245 | |||
246 | $ST $lo1,($tp) | ||
247 | |||
248 | $ADDU $hi1,$hi0 | ||
249 | sltu $at,$hi1,$hi0 | ||
250 | $ST $hi1,$BNSZ($tp) | ||
251 | $ST $at,2*$BNSZ($tp) | ||
252 | |||
253 | li $i,$BNSZ | ||
254 | .align 4 | ||
255 | .Louter: | ||
256 | $PTR_ADD $bi,$bp,$i | ||
257 | $LD $bi,($bi) | ||
258 | $LD $aj,($ap) | ||
259 | $LD $alo,$BNSZ($ap) | ||
260 | $LD $tj,($sp) | ||
261 | |||
262 | $MULTU $aj,$bi | ||
263 | $LD $nj,($np) | ||
264 | $LD $nlo,$BNSZ($np) | ||
265 | mflo $lo0 | ||
266 | mfhi $hi0 | ||
267 | $ADDU $lo0,$tj | ||
268 | $MULTU $lo0,$n0 | ||
269 | sltu $at,$lo0,$tj | ||
270 | $ADDU $hi0,$at | ||
271 | mflo $m1 | ||
272 | |||
273 | $MULTU $alo,$bi | ||
274 | mflo $alo | ||
275 | mfhi $ahi | ||
276 | |||
277 | $MULTU $nj,$m1 | ||
278 | mflo $lo1 | ||
279 | mfhi $hi1 | ||
280 | |||
281 | $MULTU $nlo,$m1 | ||
282 | $ADDU $lo1,$lo0 | ||
283 | sltu $at,$lo1,$lo0 | ||
284 | $ADDU $hi1,$at | ||
285 | mflo $nlo | ||
286 | mfhi $nhi | ||
287 | |||
288 | move $tp,$sp | ||
289 | li $j,2*$BNSZ | ||
290 | $LD $tj,$BNSZ($tp) | ||
291 | .align 4 | ||
292 | .Linner: | ||
293 | .set noreorder | ||
294 | $PTR_ADD $aj,$ap,$j | ||
295 | $PTR_ADD $nj,$np,$j | ||
296 | $LD $aj,($aj) | ||
297 | $LD $nj,($nj) | ||
298 | |||
299 | $MULTU $aj,$bi | ||
300 | $ADDU $lo0,$alo,$hi0 | ||
301 | $ADDU $lo1,$nlo,$hi1 | ||
302 | sltu $at,$lo0,$hi0 | ||
303 | sltu $t0,$lo1,$hi1 | ||
304 | $ADDU $hi0,$ahi,$at | ||
305 | $ADDU $hi1,$nhi,$t0 | ||
306 | mflo $alo | ||
307 | mfhi $ahi | ||
308 | |||
309 | $ADDU $lo0,$tj | ||
310 | addu $j,$BNSZ | ||
311 | $MULTU $nj,$m1 | ||
312 | sltu $at,$lo0,$tj | ||
313 | $ADDU $lo1,$lo0 | ||
314 | $ADDU $hi0,$at | ||
315 | sltu $t0,$lo1,$lo0 | ||
316 | $LD $tj,2*$BNSZ($tp) | ||
317 | $ADDU $hi1,$t0 | ||
318 | sltu $at,$j,$num | ||
319 | mflo $nlo | ||
320 | mfhi $nhi | ||
321 | $ST $lo1,($tp) | ||
322 | bnez $at,.Linner | ||
323 | $PTR_ADD $tp,$BNSZ | ||
324 | .set reorder | ||
325 | |||
326 | $ADDU $lo0,$alo,$hi0 | ||
327 | sltu $at,$lo0,$hi0 | ||
328 | $ADDU $hi0,$ahi,$at | ||
329 | $ADDU $lo0,$tj | ||
330 | sltu $t0,$lo0,$tj | ||
331 | $ADDU $hi0,$t0 | ||
332 | |||
333 | $LD $tj,2*$BNSZ($tp) | ||
334 | $ADDU $lo1,$nlo,$hi1 | ||
335 | sltu $at,$lo1,$hi1 | ||
336 | $ADDU $hi1,$nhi,$at | ||
337 | $ADDU $lo1,$lo0 | ||
338 | sltu $t0,$lo1,$lo0 | ||
339 | $ADDU $hi1,$t0 | ||
340 | $ST $lo1,($tp) | ||
341 | |||
342 | $ADDU $lo1,$hi1,$hi0 | ||
343 | sltu $hi1,$lo1,$hi0 | ||
344 | $ADDU $lo1,$tj | ||
345 | sltu $at,$lo1,$tj | ||
346 | $ADDU $hi1,$at | ||
347 | $ST $lo1,$BNSZ($tp) | ||
348 | $ST $hi1,2*$BNSZ($tp) | ||
349 | |||
350 | addu $i,$BNSZ | ||
351 | sltu $t0,$i,$num | ||
352 | bnez $t0,.Louter | ||
353 | |||
354 | .set noreorder | ||
355 | $PTR_ADD $tj,$sp,$num # &tp[num] | ||
356 | move $tp,$sp | ||
357 | move $ap,$sp | ||
358 | li $hi0,0 # clear borrow bit | ||
359 | |||
360 | .align 4 | ||
361 | .Lsub: $LD $lo0,($tp) | ||
362 | $LD $lo1,($np) | ||
363 | $PTR_ADD $tp,$BNSZ | ||
364 | $PTR_ADD $np,$BNSZ | ||
365 | $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i] | ||
366 | sgtu $at,$lo1,$lo0 | ||
367 | $SUBU $lo0,$lo1,$hi0 | ||
368 | sgtu $hi0,$lo0,$lo1 | ||
369 | $ST $lo0,($rp) | ||
370 | or $hi0,$at | ||
371 | sltu $at,$tp,$tj | ||
372 | bnez $at,.Lsub | ||
373 | $PTR_ADD $rp,$BNSZ | ||
374 | |||
375 | $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit | ||
376 | move $tp,$sp | ||
377 | $PTR_SUB $rp,$num # restore rp | ||
378 | not $hi1,$hi0 | ||
379 | |||
380 | and $ap,$hi0,$sp | ||
381 | and $bp,$hi1,$rp | ||
382 | or $ap,$ap,$bp # ap=borrow?tp:rp | ||
383 | |||
384 | .align 4 | ||
385 | .Lcopy: $LD $aj,($ap) | ||
386 | $PTR_ADD $ap,$BNSZ | ||
387 | $ST $zero,($tp) | ||
388 | $PTR_ADD $tp,$BNSZ | ||
389 | sltu $at,$tp,$tj | ||
390 | $ST $aj,($rp) | ||
391 | bnez $at,.Lcopy | ||
392 | $PTR_ADD $rp,$BNSZ | ||
393 | |||
394 | li $a0,1 | ||
395 | li $t0,1 | ||
396 | |||
397 | .set noreorder | ||
398 | move $sp,$fp | ||
399 | $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp) | ||
400 | $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp) | ||
401 | $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp) | ||
402 | $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp) | ||
403 | $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp) | ||
404 | $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp) | ||
405 | $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp) | ||
406 | $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp) | ||
407 | $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp) | ||
408 | ___ | ||
409 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
410 | $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp) | ||
411 | $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp) | ||
412 | $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp) | ||
413 | $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp) | ||
414 | ___ | ||
415 | $code.=<<___; | ||
416 | jr $ra | ||
417 | $PTR_ADD $sp,$FRAMESIZE*$SZREG | ||
418 | .end bn_mul_mont_internal | ||
419 | .rdata | ||
420 | .asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>" | ||
421 | ___ | ||
422 | |||
423 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
424 | |||
425 | print $code; | ||
426 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/mips.pl b/src/lib/libcrypto/bn/asm/mips.pl new file mode 100644 index 0000000000..c162a3ec23 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/mips.pl | |||
@@ -0,0 +1,2585 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. | ||
6 | # | ||
7 | # Rights for redistribution and usage in source and binary forms are | ||
8 | # granted according to the OpenSSL license. Warranty of any kind is | ||
9 | # disclaimed. | ||
10 | # ==================================================================== | ||
11 | |||
12 | |||
13 | # July 1999 | ||
14 | # | ||
15 | # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. | ||
16 | # | ||
17 | # The module is designed to work with either of the "new" MIPS ABI(5), | ||
18 | # namely N32 or N64, offered by IRIX 6.x. It's not ment to work under | ||
19 | # IRIX 5.x not only because it doesn't support new ABIs but also | ||
20 | # because 5.x kernels put R4x00 CPU into 32-bit mode and all those | ||
21 | # 64-bit instructions (daddu, dmultu, etc.) found below gonna only | ||
22 | # cause illegal instruction exception:-( | ||
23 | # | ||
24 | # In addition the code depends on preprocessor flags set up by MIPSpro | ||
25 | # compiler driver (either as or cc) and therefore (probably?) can't be | ||
26 | # compiled by the GNU assembler. GNU C driver manages fine though... | ||
27 | # I mean as long as -mmips-as is specified or is the default option, | ||
28 | # because then it simply invokes /usr/bin/as which in turn takes | ||
29 | # perfect care of the preprocessor definitions. Another neat feature | ||
30 | # offered by the MIPSpro assembler is an optimization pass. This gave | ||
31 | # me the opportunity to have the code looking more regular as all those | ||
32 | # architecture dependent instruction rescheduling details were left to | ||
33 | # the assembler. Cool, huh? | ||
34 | # | ||
35 | # Performance improvement is astonishing! 'apps/openssl speed rsa dsa' | ||
36 | # goes way over 3 times faster! | ||
37 | # | ||
38 | # <appro@fy.chalmers.se> | ||
39 | |||
40 | # October 2010 | ||
41 | # | ||
42 | # Adapt the module even for 32-bit ABIs and other OSes. The former was | ||
43 | # achieved by mechanical replacement of 64-bit arithmetic instructions | ||
44 | # such as dmultu, daddu, etc. with their 32-bit counterparts and | ||
45 | # adjusting offsets denoting multiples of BN_ULONG. Above mentioned | ||
46 | # >3x performance improvement naturally does not apply to 32-bit code | ||
47 | # [because there is no instruction 32-bit compiler can't use], one | ||
48 | # has to content with 40-85% improvement depending on benchmark and | ||
49 | # key length, more for longer keys. | ||
50 | |||
51 | $flavour = shift; | ||
52 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
53 | open STDOUT,">$output"; | ||
54 | |||
55 | if ($flavour =~ /64|n32/i) { | ||
56 | $LD="ld"; | ||
57 | $ST="sd"; | ||
58 | $MULTU="dmultu"; | ||
59 | $DIVU="ddivu"; | ||
60 | $ADDU="daddu"; | ||
61 | $SUBU="dsubu"; | ||
62 | $SRL="dsrl"; | ||
63 | $SLL="dsll"; | ||
64 | $BNSZ=8; | ||
65 | $PTR_ADD="daddu"; | ||
66 | $PTR_SUB="dsubu"; | ||
67 | $SZREG=8; | ||
68 | $REG_S="sd"; | ||
69 | $REG_L="ld"; | ||
70 | } else { | ||
71 | $LD="lw"; | ||
72 | $ST="sw"; | ||
73 | $MULTU="multu"; | ||
74 | $DIVU="divu"; | ||
75 | $ADDU="addu"; | ||
76 | $SUBU="subu"; | ||
77 | $SRL="srl"; | ||
78 | $SLL="sll"; | ||
79 | $BNSZ=4; | ||
80 | $PTR_ADD="addu"; | ||
81 | $PTR_SUB="subu"; | ||
82 | $SZREG=4; | ||
83 | $REG_S="sw"; | ||
84 | $REG_L="lw"; | ||
85 | $code=".set mips2\n"; | ||
86 | } | ||
87 | |||
88 | # Below is N32/64 register layout used in the original module. | ||
89 | # | ||
90 | ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
91 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
92 | ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
93 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
94 | ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
95 | ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); | ||
96 | # | ||
97 | # No special adaptation is required for O32. NUBI on the other hand | ||
98 | # is treated by saving/restoring ($v1,$t0..$t3). | ||
99 | |||
100 | $gp=$v1 if ($flavour =~ /nubi/i); | ||
101 | |||
102 | $minus4=$v1; | ||
103 | |||
104 | $code.=<<___; | ||
105 | .rdata | ||
106 | .asciiz "mips3.s, Version 1.2" | ||
107 | .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" | ||
108 | |||
109 | .text | ||
110 | .set noat | ||
111 | |||
112 | .align 5 | ||
113 | .globl bn_mul_add_words | ||
114 | .ent bn_mul_add_words | ||
115 | bn_mul_add_words: | ||
116 | .set noreorder | ||
117 | bgtz $a2,bn_mul_add_words_internal | ||
118 | move $v0,$zero | ||
119 | jr $ra | ||
120 | move $a0,$v0 | ||
121 | .end bn_mul_add_words | ||
122 | |||
123 | .align 5 | ||
124 | .ent bn_mul_add_words_internal | ||
125 | bn_mul_add_words_internal: | ||
126 | ___ | ||
127 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
128 | .frame $sp,6*$SZREG,$ra | ||
129 | .mask 0x8000f008,-$SZREG | ||
130 | .set noreorder | ||
131 | $PTR_SUB $sp,6*$SZREG | ||
132 | $REG_S $ra,5*$SZREG($sp) | ||
133 | $REG_S $t3,4*$SZREG($sp) | ||
134 | $REG_S $t2,3*$SZREG($sp) | ||
135 | $REG_S $t1,2*$SZREG($sp) | ||
136 | $REG_S $t0,1*$SZREG($sp) | ||
137 | $REG_S $gp,0*$SZREG($sp) | ||
138 | ___ | ||
139 | $code.=<<___; | ||
140 | .set reorder | ||
141 | li $minus4,-4 | ||
142 | and $ta0,$a2,$minus4 | ||
143 | $LD $t0,0($a1) | ||
144 | beqz $ta0,.L_bn_mul_add_words_tail | ||
145 | |||
146 | .L_bn_mul_add_words_loop: | ||
147 | $MULTU $t0,$a3 | ||
148 | $LD $t1,0($a0) | ||
149 | $LD $t2,$BNSZ($a1) | ||
150 | $LD $t3,$BNSZ($a0) | ||
151 | $LD $ta0,2*$BNSZ($a1) | ||
152 | $LD $ta1,2*$BNSZ($a0) | ||
153 | $ADDU $t1,$v0 | ||
154 | sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit | ||
155 | # values", but it seems to work fine | ||
156 | # even on 64-bit registers. | ||
157 | mflo $at | ||
158 | mfhi $t0 | ||
159 | $ADDU $t1,$at | ||
160 | $ADDU $v0,$t0 | ||
161 | $MULTU $t2,$a3 | ||
162 | sltu $at,$t1,$at | ||
163 | $ST $t1,0($a0) | ||
164 | $ADDU $v0,$at | ||
165 | |||
166 | $LD $ta2,3*$BNSZ($a1) | ||
167 | $LD $ta3,3*$BNSZ($a0) | ||
168 | $ADDU $t3,$v0 | ||
169 | sltu $v0,$t3,$v0 | ||
170 | mflo $at | ||
171 | mfhi $t2 | ||
172 | $ADDU $t3,$at | ||
173 | $ADDU $v0,$t2 | ||
174 | $MULTU $ta0,$a3 | ||
175 | sltu $at,$t3,$at | ||
176 | $ST $t3,$BNSZ($a0) | ||
177 | $ADDU $v0,$at | ||
178 | |||
179 | subu $a2,4 | ||
180 | $PTR_ADD $a0,4*$BNSZ | ||
181 | $PTR_ADD $a1,4*$BNSZ | ||
182 | $ADDU $ta1,$v0 | ||
183 | sltu $v0,$ta1,$v0 | ||
184 | mflo $at | ||
185 | mfhi $ta0 | ||
186 | $ADDU $ta1,$at | ||
187 | $ADDU $v0,$ta0 | ||
188 | $MULTU $ta2,$a3 | ||
189 | sltu $at,$ta1,$at | ||
190 | $ST $ta1,-2*$BNSZ($a0) | ||
191 | $ADDU $v0,$at | ||
192 | |||
193 | |||
194 | and $ta0,$a2,$minus4 | ||
195 | $ADDU $ta3,$v0 | ||
196 | sltu $v0,$ta3,$v0 | ||
197 | mflo $at | ||
198 | mfhi $ta2 | ||
199 | $ADDU $ta3,$at | ||
200 | $ADDU $v0,$ta2 | ||
201 | sltu $at,$ta3,$at | ||
202 | $ST $ta3,-$BNSZ($a0) | ||
203 | $ADDU $v0,$at | ||
204 | .set noreorder | ||
205 | bgtzl $ta0,.L_bn_mul_add_words_loop | ||
206 | $LD $t0,0($a1) | ||
207 | |||
208 | beqz $a2,.L_bn_mul_add_words_return | ||
209 | nop | ||
210 | |||
211 | .L_bn_mul_add_words_tail: | ||
212 | .set reorder | ||
213 | $LD $t0,0($a1) | ||
214 | $MULTU $t0,$a3 | ||
215 | $LD $t1,0($a0) | ||
216 | subu $a2,1 | ||
217 | $ADDU $t1,$v0 | ||
218 | sltu $v0,$t1,$v0 | ||
219 | mflo $at | ||
220 | mfhi $t0 | ||
221 | $ADDU $t1,$at | ||
222 | $ADDU $v0,$t0 | ||
223 | sltu $at,$t1,$at | ||
224 | $ST $t1,0($a0) | ||
225 | $ADDU $v0,$at | ||
226 | beqz $a2,.L_bn_mul_add_words_return | ||
227 | |||
228 | $LD $t0,$BNSZ($a1) | ||
229 | $MULTU $t0,$a3 | ||
230 | $LD $t1,$BNSZ($a0) | ||
231 | subu $a2,1 | ||
232 | $ADDU $t1,$v0 | ||
233 | sltu $v0,$t1,$v0 | ||
234 | mflo $at | ||
235 | mfhi $t0 | ||
236 | $ADDU $t1,$at | ||
237 | $ADDU $v0,$t0 | ||
238 | sltu $at,$t1,$at | ||
239 | $ST $t1,$BNSZ($a0) | ||
240 | $ADDU $v0,$at | ||
241 | beqz $a2,.L_bn_mul_add_words_return | ||
242 | |||
243 | $LD $t0,2*$BNSZ($a1) | ||
244 | $MULTU $t0,$a3 | ||
245 | $LD $t1,2*$BNSZ($a0) | ||
246 | $ADDU $t1,$v0 | ||
247 | sltu $v0,$t1,$v0 | ||
248 | mflo $at | ||
249 | mfhi $t0 | ||
250 | $ADDU $t1,$at | ||
251 | $ADDU $v0,$t0 | ||
252 | sltu $at,$t1,$at | ||
253 | $ST $t1,2*$BNSZ($a0) | ||
254 | $ADDU $v0,$at | ||
255 | |||
256 | .L_bn_mul_add_words_return: | ||
257 | .set noreorder | ||
258 | ___ | ||
259 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
260 | $REG_L $t3,4*$SZREG($sp) | ||
261 | $REG_L $t2,3*$SZREG($sp) | ||
262 | $REG_L $t1,2*$SZREG($sp) | ||
263 | $REG_L $t0,1*$SZREG($sp) | ||
264 | $REG_L $gp,0*$SZREG($sp) | ||
265 | $PTR_ADD $sp,6*$SZREG | ||
266 | ___ | ||
267 | $code.=<<___; | ||
268 | jr $ra | ||
269 | move $a0,$v0 | ||
270 | .end bn_mul_add_words_internal | ||
271 | |||
272 | .align 5 | ||
273 | .globl bn_mul_words | ||
274 | .ent bn_mul_words | ||
275 | bn_mul_words: | ||
276 | .set noreorder | ||
277 | bgtz $a2,bn_mul_words_internal | ||
278 | move $v0,$zero | ||
279 | jr $ra | ||
280 | move $a0,$v0 | ||
281 | .end bn_mul_words | ||
282 | |||
283 | .align 5 | ||
284 | .ent bn_mul_words_internal | ||
285 | bn_mul_words_internal: | ||
286 | ___ | ||
287 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
288 | .frame $sp,6*$SZREG,$ra | ||
289 | .mask 0x8000f008,-$SZREG | ||
290 | .set noreorder | ||
291 | $PTR_SUB $sp,6*$SZREG | ||
292 | $REG_S $ra,5*$SZREG($sp) | ||
293 | $REG_S $t3,4*$SZREG($sp) | ||
294 | $REG_S $t2,3*$SZREG($sp) | ||
295 | $REG_S $t1,2*$SZREG($sp) | ||
296 | $REG_S $t0,1*$SZREG($sp) | ||
297 | $REG_S $gp,0*$SZREG($sp) | ||
298 | ___ | ||
299 | $code.=<<___; | ||
300 | .set reorder | ||
301 | li $minus4,-4 | ||
302 | and $ta0,$a2,$minus4 | ||
303 | $LD $t0,0($a1) | ||
304 | beqz $ta0,.L_bn_mul_words_tail | ||
305 | |||
306 | .L_bn_mul_words_loop: | ||
307 | $MULTU $t0,$a3 | ||
308 | $LD $t2,$BNSZ($a1) | ||
309 | $LD $ta0,2*$BNSZ($a1) | ||
310 | $LD $ta2,3*$BNSZ($a1) | ||
311 | mflo $at | ||
312 | mfhi $t0 | ||
313 | $ADDU $v0,$at | ||
314 | sltu $t1,$v0,$at | ||
315 | $MULTU $t2,$a3 | ||
316 | $ST $v0,0($a0) | ||
317 | $ADDU $v0,$t1,$t0 | ||
318 | |||
319 | subu $a2,4 | ||
320 | $PTR_ADD $a0,4*$BNSZ | ||
321 | $PTR_ADD $a1,4*$BNSZ | ||
322 | mflo $at | ||
323 | mfhi $t2 | ||
324 | $ADDU $v0,$at | ||
325 | sltu $t3,$v0,$at | ||
326 | $MULTU $ta0,$a3 | ||
327 | $ST $v0,-3*$BNSZ($a0) | ||
328 | $ADDU $v0,$t3,$t2 | ||
329 | |||
330 | mflo $at | ||
331 | mfhi $ta0 | ||
332 | $ADDU $v0,$at | ||
333 | sltu $ta1,$v0,$at | ||
334 | $MULTU $ta2,$a3 | ||
335 | $ST $v0,-2*$BNSZ($a0) | ||
336 | $ADDU $v0,$ta1,$ta0 | ||
337 | |||
338 | and $ta0,$a2,$minus4 | ||
339 | mflo $at | ||
340 | mfhi $ta2 | ||
341 | $ADDU $v0,$at | ||
342 | sltu $ta3,$v0,$at | ||
343 | $ST $v0,-$BNSZ($a0) | ||
344 | $ADDU $v0,$ta3,$ta2 | ||
345 | .set noreorder | ||
346 | bgtzl $ta0,.L_bn_mul_words_loop | ||
347 | $LD $t0,0($a1) | ||
348 | |||
349 | beqz $a2,.L_bn_mul_words_return | ||
350 | nop | ||
351 | |||
352 | .L_bn_mul_words_tail: | ||
353 | .set reorder | ||
354 | $LD $t0,0($a1) | ||
355 | $MULTU $t0,$a3 | ||
356 | subu $a2,1 | ||
357 | mflo $at | ||
358 | mfhi $t0 | ||
359 | $ADDU $v0,$at | ||
360 | sltu $t1,$v0,$at | ||
361 | $ST $v0,0($a0) | ||
362 | $ADDU $v0,$t1,$t0 | ||
363 | beqz $a2,.L_bn_mul_words_return | ||
364 | |||
365 | $LD $t0,$BNSZ($a1) | ||
366 | $MULTU $t0,$a3 | ||
367 | subu $a2,1 | ||
368 | mflo $at | ||
369 | mfhi $t0 | ||
370 | $ADDU $v0,$at | ||
371 | sltu $t1,$v0,$at | ||
372 | $ST $v0,$BNSZ($a0) | ||
373 | $ADDU $v0,$t1,$t0 | ||
374 | beqz $a2,.L_bn_mul_words_return | ||
375 | |||
376 | $LD $t0,2*$BNSZ($a1) | ||
377 | $MULTU $t0,$a3 | ||
378 | mflo $at | ||
379 | mfhi $t0 | ||
380 | $ADDU $v0,$at | ||
381 | sltu $t1,$v0,$at | ||
382 | $ST $v0,2*$BNSZ($a0) | ||
383 | $ADDU $v0,$t1,$t0 | ||
384 | |||
385 | .L_bn_mul_words_return: | ||
386 | .set noreorder | ||
387 | ___ | ||
388 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
389 | $REG_L $t3,4*$SZREG($sp) | ||
390 | $REG_L $t2,3*$SZREG($sp) | ||
391 | $REG_L $t1,2*$SZREG($sp) | ||
392 | $REG_L $t0,1*$SZREG($sp) | ||
393 | $REG_L $gp,0*$SZREG($sp) | ||
394 | $PTR_ADD $sp,6*$SZREG | ||
395 | ___ | ||
396 | $code.=<<___; | ||
397 | jr $ra | ||
398 | move $a0,$v0 | ||
399 | .end bn_mul_words_internal | ||
400 | |||
401 | .align 5 | ||
402 | .globl bn_sqr_words | ||
403 | .ent bn_sqr_words | ||
404 | bn_sqr_words: | ||
405 | .set noreorder | ||
406 | bgtz $a2,bn_sqr_words_internal | ||
407 | move $v0,$zero | ||
408 | jr $ra | ||
409 | move $a0,$v0 | ||
410 | .end bn_sqr_words | ||
411 | |||
412 | .align 5 | ||
413 | .ent bn_sqr_words_internal | ||
414 | bn_sqr_words_internal: | ||
415 | ___ | ||
416 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
417 | .frame $sp,6*$SZREG,$ra | ||
418 | .mask 0x8000f008,-$SZREG | ||
419 | .set noreorder | ||
420 | $PTR_SUB $sp,6*$SZREG | ||
421 | $REG_S $ra,5*$SZREG($sp) | ||
422 | $REG_S $t3,4*$SZREG($sp) | ||
423 | $REG_S $t2,3*$SZREG($sp) | ||
424 | $REG_S $t1,2*$SZREG($sp) | ||
425 | $REG_S $t0,1*$SZREG($sp) | ||
426 | $REG_S $gp,0*$SZREG($sp) | ||
427 | ___ | ||
428 | $code.=<<___; | ||
429 | .set reorder | ||
430 | li $minus4,-4 | ||
431 | and $ta0,$a2,$minus4 | ||
432 | $LD $t0,0($a1) | ||
433 | beqz $ta0,.L_bn_sqr_words_tail | ||
434 | |||
435 | .L_bn_sqr_words_loop: | ||
436 | $MULTU $t0,$t0 | ||
437 | $LD $t2,$BNSZ($a1) | ||
438 | $LD $ta0,2*$BNSZ($a1) | ||
439 | $LD $ta2,3*$BNSZ($a1) | ||
440 | mflo $t1 | ||
441 | mfhi $t0 | ||
442 | $ST $t1,0($a0) | ||
443 | $ST $t0,$BNSZ($a0) | ||
444 | |||
445 | $MULTU $t2,$t2 | ||
446 | subu $a2,4 | ||
447 | $PTR_ADD $a0,8*$BNSZ | ||
448 | $PTR_ADD $a1,4*$BNSZ | ||
449 | mflo $t3 | ||
450 | mfhi $t2 | ||
451 | $ST $t3,-6*$BNSZ($a0) | ||
452 | $ST $t2,-5*$BNSZ($a0) | ||
453 | |||
454 | $MULTU $ta0,$ta0 | ||
455 | mflo $ta1 | ||
456 | mfhi $ta0 | ||
457 | $ST $ta1,-4*$BNSZ($a0) | ||
458 | $ST $ta0,-3*$BNSZ($a0) | ||
459 | |||
460 | |||
461 | $MULTU $ta2,$ta2 | ||
462 | and $ta0,$a2,$minus4 | ||
463 | mflo $ta3 | ||
464 | mfhi $ta2 | ||
465 | $ST $ta3,-2*$BNSZ($a0) | ||
466 | $ST $ta2,-$BNSZ($a0) | ||
467 | |||
468 | .set noreorder | ||
469 | bgtzl $ta0,.L_bn_sqr_words_loop | ||
470 | $LD $t0,0($a1) | ||
471 | |||
472 | beqz $a2,.L_bn_sqr_words_return | ||
473 | nop | ||
474 | |||
475 | .L_bn_sqr_words_tail: | ||
476 | .set reorder | ||
477 | $LD $t0,0($a1) | ||
478 | $MULTU $t0,$t0 | ||
479 | subu $a2,1 | ||
480 | mflo $t1 | ||
481 | mfhi $t0 | ||
482 | $ST $t1,0($a0) | ||
483 | $ST $t0,$BNSZ($a0) | ||
484 | beqz $a2,.L_bn_sqr_words_return | ||
485 | |||
486 | $LD $t0,$BNSZ($a1) | ||
487 | $MULTU $t0,$t0 | ||
488 | subu $a2,1 | ||
489 | mflo $t1 | ||
490 | mfhi $t0 | ||
491 | $ST $t1,2*$BNSZ($a0) | ||
492 | $ST $t0,3*$BNSZ($a0) | ||
493 | beqz $a2,.L_bn_sqr_words_return | ||
494 | |||
495 | $LD $t0,2*$BNSZ($a1) | ||
496 | $MULTU $t0,$t0 | ||
497 | mflo $t1 | ||
498 | mfhi $t0 | ||
499 | $ST $t1,4*$BNSZ($a0) | ||
500 | $ST $t0,5*$BNSZ($a0) | ||
501 | |||
502 | .L_bn_sqr_words_return: | ||
503 | .set noreorder | ||
504 | ___ | ||
505 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
506 | $REG_L $t3,4*$SZREG($sp) | ||
507 | $REG_L $t2,3*$SZREG($sp) | ||
508 | $REG_L $t1,2*$SZREG($sp) | ||
509 | $REG_L $t0,1*$SZREG($sp) | ||
510 | $REG_L $gp,0*$SZREG($sp) | ||
511 | $PTR_ADD $sp,6*$SZREG | ||
512 | ___ | ||
513 | $code.=<<___; | ||
514 | jr $ra | ||
515 | move $a0,$v0 | ||
516 | |||
517 | .end bn_sqr_words_internal | ||
518 | |||
519 | .align 5 | ||
520 | .globl bn_add_words | ||
521 | .ent bn_add_words | ||
522 | bn_add_words: | ||
523 | .set noreorder | ||
524 | bgtz $a3,bn_add_words_internal | ||
525 | move $v0,$zero | ||
526 | jr $ra | ||
527 | move $a0,$v0 | ||
528 | .end bn_add_words | ||
529 | |||
530 | .align 5 | ||
531 | .ent bn_add_words_internal | ||
532 | bn_add_words_internal: | ||
533 | ___ | ||
534 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
535 | .frame $sp,6*$SZREG,$ra | ||
536 | .mask 0x8000f008,-$SZREG | ||
537 | .set noreorder | ||
538 | $PTR_SUB $sp,6*$SZREG | ||
539 | $REG_S $ra,5*$SZREG($sp) | ||
540 | $REG_S $t3,4*$SZREG($sp) | ||
541 | $REG_S $t2,3*$SZREG($sp) | ||
542 | $REG_S $t1,2*$SZREG($sp) | ||
543 | $REG_S $t0,1*$SZREG($sp) | ||
544 | $REG_S $gp,0*$SZREG($sp) | ||
545 | ___ | ||
546 | $code.=<<___; | ||
547 | .set reorder | ||
548 | li $minus4,-4 | ||
549 | and $at,$a3,$minus4 | ||
550 | $LD $t0,0($a1) | ||
551 | beqz $at,.L_bn_add_words_tail | ||
552 | |||
553 | .L_bn_add_words_loop: | ||
554 | $LD $ta0,0($a2) | ||
555 | subu $a3,4 | ||
556 | $LD $t1,$BNSZ($a1) | ||
557 | and $at,$a3,$minus4 | ||
558 | $LD $t2,2*$BNSZ($a1) | ||
559 | $PTR_ADD $a2,4*$BNSZ | ||
560 | $LD $t3,3*$BNSZ($a1) | ||
561 | $PTR_ADD $a0,4*$BNSZ | ||
562 | $LD $ta1,-3*$BNSZ($a2) | ||
563 | $PTR_ADD $a1,4*$BNSZ | ||
564 | $LD $ta2,-2*$BNSZ($a2) | ||
565 | $LD $ta3,-$BNSZ($a2) | ||
566 | $ADDU $ta0,$t0 | ||
567 | sltu $t8,$ta0,$t0 | ||
568 | $ADDU $t0,$ta0,$v0 | ||
569 | sltu $v0,$t0,$ta0 | ||
570 | $ST $t0,-4*$BNSZ($a0) | ||
571 | $ADDU $v0,$t8 | ||
572 | |||
573 | $ADDU $ta1,$t1 | ||
574 | sltu $t9,$ta1,$t1 | ||
575 | $ADDU $t1,$ta1,$v0 | ||
576 | sltu $v0,$t1,$ta1 | ||
577 | $ST $t1,-3*$BNSZ($a0) | ||
578 | $ADDU $v0,$t9 | ||
579 | |||
580 | $ADDU $ta2,$t2 | ||
581 | sltu $t8,$ta2,$t2 | ||
582 | $ADDU $t2,$ta2,$v0 | ||
583 | sltu $v0,$t2,$ta2 | ||
584 | $ST $t2,-2*$BNSZ($a0) | ||
585 | $ADDU $v0,$t8 | ||
586 | |||
587 | $ADDU $ta3,$t3 | ||
588 | sltu $t9,$ta3,$t3 | ||
589 | $ADDU $t3,$ta3,$v0 | ||
590 | sltu $v0,$t3,$ta3 | ||
591 | $ST $t3,-$BNSZ($a0) | ||
592 | $ADDU $v0,$t9 | ||
593 | |||
594 | .set noreorder | ||
595 | bgtzl $at,.L_bn_add_words_loop | ||
596 | $LD $t0,0($a1) | ||
597 | |||
598 | beqz $a3,.L_bn_add_words_return | ||
599 | nop | ||
600 | |||
601 | .L_bn_add_words_tail: | ||
602 | .set reorder | ||
603 | $LD $t0,0($a1) | ||
604 | $LD $ta0,0($a2) | ||
605 | $ADDU $ta0,$t0 | ||
606 | subu $a3,1 | ||
607 | sltu $t8,$ta0,$t0 | ||
608 | $ADDU $t0,$ta0,$v0 | ||
609 | sltu $v0,$t0,$ta0 | ||
610 | $ST $t0,0($a0) | ||
611 | $ADDU $v0,$t8 | ||
612 | beqz $a3,.L_bn_add_words_return | ||
613 | |||
614 | $LD $t1,$BNSZ($a1) | ||
615 | $LD $ta1,$BNSZ($a2) | ||
616 | $ADDU $ta1,$t1 | ||
617 | subu $a3,1 | ||
618 | sltu $t9,$ta1,$t1 | ||
619 | $ADDU $t1,$ta1,$v0 | ||
620 | sltu $v0,$t1,$ta1 | ||
621 | $ST $t1,$BNSZ($a0) | ||
622 | $ADDU $v0,$t9 | ||
623 | beqz $a3,.L_bn_add_words_return | ||
624 | |||
625 | $LD $t2,2*$BNSZ($a1) | ||
626 | $LD $ta2,2*$BNSZ($a2) | ||
627 | $ADDU $ta2,$t2 | ||
628 | sltu $t8,$ta2,$t2 | ||
629 | $ADDU $t2,$ta2,$v0 | ||
630 | sltu $v0,$t2,$ta2 | ||
631 | $ST $t2,2*$BNSZ($a0) | ||
632 | $ADDU $v0,$t8 | ||
633 | |||
634 | .L_bn_add_words_return: | ||
635 | .set noreorder | ||
636 | ___ | ||
637 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
638 | $REG_L $t3,4*$SZREG($sp) | ||
639 | $REG_L $t2,3*$SZREG($sp) | ||
640 | $REG_L $t1,2*$SZREG($sp) | ||
641 | $REG_L $t0,1*$SZREG($sp) | ||
642 | $REG_L $gp,0*$SZREG($sp) | ||
643 | $PTR_ADD $sp,6*$SZREG | ||
644 | ___ | ||
645 | $code.=<<___; | ||
646 | jr $ra | ||
647 | move $a0,$v0 | ||
648 | |||
649 | .end bn_add_words_internal | ||
650 | |||
651 | .align 5 | ||
652 | .globl bn_sub_words | ||
653 | .ent bn_sub_words | ||
654 | bn_sub_words: | ||
655 | .set noreorder | ||
656 | bgtz $a3,bn_sub_words_internal | ||
657 | move $v0,$zero | ||
658 | jr $ra | ||
659 | move $a0,$zero | ||
660 | .end bn_sub_words | ||
661 | |||
662 | .align 5 | ||
663 | .ent bn_sub_words_internal | ||
664 | bn_sub_words_internal: | ||
665 | ___ | ||
666 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
667 | .frame $sp,6*$SZREG,$ra | ||
668 | .mask 0x8000f008,-$SZREG | ||
669 | .set noreorder | ||
670 | $PTR_SUB $sp,6*$SZREG | ||
671 | $REG_S $ra,5*$SZREG($sp) | ||
672 | $REG_S $t3,4*$SZREG($sp) | ||
673 | $REG_S $t2,3*$SZREG($sp) | ||
674 | $REG_S $t1,2*$SZREG($sp) | ||
675 | $REG_S $t0,1*$SZREG($sp) | ||
676 | $REG_S $gp,0*$SZREG($sp) | ||
677 | ___ | ||
678 | $code.=<<___; | ||
679 | .set reorder | ||
680 | li $minus4,-4 | ||
681 | and $at,$a3,$minus4 | ||
682 | $LD $t0,0($a1) | ||
683 | beqz $at,.L_bn_sub_words_tail | ||
684 | |||
685 | .L_bn_sub_words_loop: | ||
686 | $LD $ta0,0($a2) | ||
687 | subu $a3,4 | ||
688 | $LD $t1,$BNSZ($a1) | ||
689 | and $at,$a3,$minus4 | ||
690 | $LD $t2,2*$BNSZ($a1) | ||
691 | $PTR_ADD $a2,4*$BNSZ | ||
692 | $LD $t3,3*$BNSZ($a1) | ||
693 | $PTR_ADD $a0,4*$BNSZ | ||
694 | $LD $ta1,-3*$BNSZ($a2) | ||
695 | $PTR_ADD $a1,4*$BNSZ | ||
696 | $LD $ta2,-2*$BNSZ($a2) | ||
697 | $LD $ta3,-$BNSZ($a2) | ||
698 | sltu $t8,$t0,$ta0 | ||
699 | $SUBU $ta0,$t0,$ta0 | ||
700 | $SUBU $t0,$ta0,$v0 | ||
701 | sgtu $v0,$t0,$ta0 | ||
702 | $ST $t0,-4*$BNSZ($a0) | ||
703 | $ADDU $v0,$t8 | ||
704 | |||
705 | sltu $t9,$t1,$ta1 | ||
706 | $SUBU $ta1,$t1,$ta1 | ||
707 | $SUBU $t1,$ta1,$v0 | ||
708 | sgtu $v0,$t1,$ta1 | ||
709 | $ST $t1,-3*$BNSZ($a0) | ||
710 | $ADDU $v0,$t9 | ||
711 | |||
712 | |||
713 | sltu $t8,$t2,$ta2 | ||
714 | $SUBU $ta2,$t2,$ta2 | ||
715 | $SUBU $t2,$ta2,$v0 | ||
716 | sgtu $v0,$t2,$ta2 | ||
717 | $ST $t2,-2*$BNSZ($a0) | ||
718 | $ADDU $v0,$t8 | ||
719 | |||
720 | sltu $t9,$t3,$ta3 | ||
721 | $SUBU $ta3,$t3,$ta3 | ||
722 | $SUBU $t3,$ta3,$v0 | ||
723 | sgtu $v0,$t3,$ta3 | ||
724 | $ST $t3,-$BNSZ($a0) | ||
725 | $ADDU $v0,$t9 | ||
726 | |||
727 | .set noreorder | ||
728 | bgtzl $at,.L_bn_sub_words_loop | ||
729 | $LD $t0,0($a1) | ||
730 | |||
731 | beqz $a3,.L_bn_sub_words_return | ||
732 | nop | ||
733 | |||
734 | .L_bn_sub_words_tail: | ||
735 | .set reorder | ||
736 | $LD $t0,0($a1) | ||
737 | $LD $ta0,0($a2) | ||
738 | subu $a3,1 | ||
739 | sltu $t8,$t0,$ta0 | ||
740 | $SUBU $ta0,$t0,$ta0 | ||
741 | $SUBU $t0,$ta0,$v0 | ||
742 | sgtu $v0,$t0,$ta0 | ||
743 | $ST $t0,0($a0) | ||
744 | $ADDU $v0,$t8 | ||
745 | beqz $a3,.L_bn_sub_words_return | ||
746 | |||
747 | $LD $t1,$BNSZ($a1) | ||
748 | subu $a3,1 | ||
749 | $LD $ta1,$BNSZ($a2) | ||
750 | sltu $t9,$t1,$ta1 | ||
751 | $SUBU $ta1,$t1,$ta1 | ||
752 | $SUBU $t1,$ta1,$v0 | ||
753 | sgtu $v0,$t1,$ta1 | ||
754 | $ST $t1,$BNSZ($a0) | ||
755 | $ADDU $v0,$t9 | ||
756 | beqz $a3,.L_bn_sub_words_return | ||
757 | |||
758 | $LD $t2,2*$BNSZ($a1) | ||
759 | $LD $ta2,2*$BNSZ($a2) | ||
760 | sltu $t8,$t2,$ta2 | ||
761 | $SUBU $ta2,$t2,$ta2 | ||
762 | $SUBU $t2,$ta2,$v0 | ||
763 | sgtu $v0,$t2,$ta2 | ||
764 | $ST $t2,2*$BNSZ($a0) | ||
765 | $ADDU $v0,$t8 | ||
766 | |||
767 | .L_bn_sub_words_return: | ||
768 | .set noreorder | ||
769 | ___ | ||
770 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
771 | $REG_L $t3,4*$SZREG($sp) | ||
772 | $REG_L $t2,3*$SZREG($sp) | ||
773 | $REG_L $t1,2*$SZREG($sp) | ||
774 | $REG_L $t0,1*$SZREG($sp) | ||
775 | $REG_L $gp,0*$SZREG($sp) | ||
776 | $PTR_ADD $sp,6*$SZREG | ||
777 | ___ | ||
778 | $code.=<<___; | ||
779 | jr $ra | ||
780 | move $a0,$v0 | ||
781 | .end bn_sub_words_internal | ||
782 | |||
783 | .align 5 | ||
784 | .globl bn_div_3_words | ||
785 | .ent bn_div_3_words | ||
786 | bn_div_3_words: | ||
787 | .set noreorder | ||
788 | move $a3,$a0 # we know that bn_div_words does not | ||
789 | # touch $a3, $ta2, $ta3 and preserves $a2 | ||
790 | # so that we can save two arguments | ||
791 | # and return address in registers | ||
792 | # instead of stack:-) | ||
793 | |||
794 | $LD $a0,($a3) | ||
795 | move $ta2,$a1 | ||
796 | bne $a0,$a2,bn_div_3_words_internal | ||
797 | $LD $a1,-$BNSZ($a3) | ||
798 | li $v0,-1 | ||
799 | jr $ra | ||
800 | move $a0,$v0 | ||
801 | .end bn_div_3_words | ||
802 | |||
803 | .align 5 | ||
804 | .ent bn_div_3_words_internal | ||
805 | bn_div_3_words_internal: | ||
806 | ___ | ||
807 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
808 | .frame $sp,6*$SZREG,$ra | ||
809 | .mask 0x8000f008,-$SZREG | ||
810 | .set noreorder | ||
811 | $PTR_SUB $sp,6*$SZREG | ||
812 | $REG_S $ra,5*$SZREG($sp) | ||
813 | $REG_S $t3,4*$SZREG($sp) | ||
814 | $REG_S $t2,3*$SZREG($sp) | ||
815 | $REG_S $t1,2*$SZREG($sp) | ||
816 | $REG_S $t0,1*$SZREG($sp) | ||
817 | $REG_S $gp,0*$SZREG($sp) | ||
818 | ___ | ||
819 | $code.=<<___; | ||
820 | .set reorder | ||
821 | move $ta3,$ra | ||
822 | bal bn_div_words | ||
823 | move $ra,$ta3 | ||
824 | $MULTU $ta2,$v0 | ||
825 | $LD $t2,-2*$BNSZ($a3) | ||
826 | move $ta0,$zero | ||
827 | mfhi $t1 | ||
828 | mflo $t0 | ||
829 | sltu $t8,$t1,$a1 | ||
830 | .L_bn_div_3_words_inner_loop: | ||
831 | bnez $t8,.L_bn_div_3_words_inner_loop_done | ||
832 | sgeu $at,$t2,$t0 | ||
833 | seq $t9,$t1,$a1 | ||
834 | and $at,$t9 | ||
835 | sltu $t3,$t0,$ta2 | ||
836 | $ADDU $a1,$a2 | ||
837 | $SUBU $t1,$t3 | ||
838 | $SUBU $t0,$ta2 | ||
839 | sltu $t8,$t1,$a1 | ||
840 | sltu $ta0,$a1,$a2 | ||
841 | or $t8,$ta0 | ||
842 | .set noreorder | ||
843 | beqzl $at,.L_bn_div_3_words_inner_loop | ||
844 | $SUBU $v0,1 | ||
845 | .set reorder | ||
846 | .L_bn_div_3_words_inner_loop_done: | ||
847 | .set noreorder | ||
848 | ___ | ||
849 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
850 | $REG_L $t3,4*$SZREG($sp) | ||
851 | $REG_L $t2,3*$SZREG($sp) | ||
852 | $REG_L $t1,2*$SZREG($sp) | ||
853 | $REG_L $t0,1*$SZREG($sp) | ||
854 | $REG_L $gp,0*$SZREG($sp) | ||
855 | $PTR_ADD $sp,6*$SZREG | ||
856 | ___ | ||
857 | $code.=<<___; | ||
858 | jr $ra | ||
859 | move $a0,$v0 | ||
860 | .end bn_div_3_words_internal | ||
861 | |||
862 | .align 5 | ||
863 | .globl bn_div_words | ||
864 | .ent bn_div_words | ||
865 | bn_div_words: | ||
866 | .set noreorder | ||
867 | bnez $a2,bn_div_words_internal | ||
868 | li $v0,-1 # I would rather signal div-by-zero | ||
869 | # which can be done with 'break 7' | ||
870 | jr $ra | ||
871 | move $a0,$v0 | ||
872 | .end bn_div_words | ||
873 | |||
874 | .align 5 | ||
875 | .ent bn_div_words_internal | ||
876 | bn_div_words_internal: | ||
877 | ___ | ||
878 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
879 | .frame $sp,6*$SZREG,$ra | ||
880 | .mask 0x8000f008,-$SZREG | ||
881 | .set noreorder | ||
882 | $PTR_SUB $sp,6*$SZREG | ||
883 | $REG_S $ra,5*$SZREG($sp) | ||
884 | $REG_S $t3,4*$SZREG($sp) | ||
885 | $REG_S $t2,3*$SZREG($sp) | ||
886 | $REG_S $t1,2*$SZREG($sp) | ||
887 | $REG_S $t0,1*$SZREG($sp) | ||
888 | $REG_S $gp,0*$SZREG($sp) | ||
889 | ___ | ||
890 | $code.=<<___; | ||
891 | move $v1,$zero | ||
892 | bltz $a2,.L_bn_div_words_body | ||
893 | move $t9,$v1 | ||
894 | $SLL $a2,1 | ||
895 | bgtz $a2,.-4 | ||
896 | addu $t9,1 | ||
897 | |||
898 | .set reorder | ||
899 | negu $t1,$t9 | ||
900 | li $t2,-1 | ||
901 | $SLL $t2,$t1 | ||
902 | and $t2,$a0 | ||
903 | $SRL $at,$a1,$t1 | ||
904 | .set noreorder | ||
905 | bnezl $t2,.+8 | ||
906 | break 6 # signal overflow | ||
907 | .set reorder | ||
908 | $SLL $a0,$t9 | ||
909 | $SLL $a1,$t9 | ||
910 | or $a0,$at | ||
911 | ___ | ||
912 | $QT=$ta0; | ||
913 | $HH=$ta1; | ||
914 | $DH=$v1; | ||
915 | $code.=<<___; | ||
916 | .L_bn_div_words_body: | ||
917 | $SRL $DH,$a2,4*$BNSZ # bits | ||
918 | sgeu $at,$a0,$a2 | ||
919 | .set noreorder | ||
920 | bnezl $at,.+8 | ||
921 | $SUBU $a0,$a2 | ||
922 | .set reorder | ||
923 | |||
924 | li $QT,-1 | ||
925 | $SRL $HH,$a0,4*$BNSZ # bits | ||
926 | $SRL $QT,4*$BNSZ # q=0xffffffff | ||
927 | beq $DH,$HH,.L_bn_div_words_skip_div1 | ||
928 | $DIVU $zero,$a0,$DH | ||
929 | mflo $QT | ||
930 | .L_bn_div_words_skip_div1: | ||
931 | $MULTU $a2,$QT | ||
932 | $SLL $t3,$a0,4*$BNSZ # bits | ||
933 | $SRL $at,$a1,4*$BNSZ # bits | ||
934 | or $t3,$at | ||
935 | mflo $t0 | ||
936 | mfhi $t1 | ||
937 | .L_bn_div_words_inner_loop1: | ||
938 | sltu $t2,$t3,$t0 | ||
939 | seq $t8,$HH,$t1 | ||
940 | sltu $at,$HH,$t1 | ||
941 | and $t2,$t8 | ||
942 | sltu $v0,$t0,$a2 | ||
943 | or $at,$t2 | ||
944 | .set noreorder | ||
945 | beqz $at,.L_bn_div_words_inner_loop1_done | ||
946 | $SUBU $t1,$v0 | ||
947 | $SUBU $t0,$a2 | ||
948 | b .L_bn_div_words_inner_loop1 | ||
949 | $SUBU $QT,1 | ||
950 | .set reorder | ||
951 | .L_bn_div_words_inner_loop1_done: | ||
952 | |||
953 | $SLL $a1,4*$BNSZ # bits | ||
954 | $SUBU $a0,$t3,$t0 | ||
955 | $SLL $v0,$QT,4*$BNSZ # bits | ||
956 | |||
957 | li $QT,-1 | ||
958 | $SRL $HH,$a0,4*$BNSZ # bits | ||
959 | $SRL $QT,4*$BNSZ # q=0xffffffff | ||
960 | beq $DH,$HH,.L_bn_div_words_skip_div2 | ||
961 | $DIVU $zero,$a0,$DH | ||
962 | mflo $QT | ||
963 | .L_bn_div_words_skip_div2: | ||
964 | $MULTU $a2,$QT | ||
965 | $SLL $t3,$a0,4*$BNSZ # bits | ||
966 | $SRL $at,$a1,4*$BNSZ # bits | ||
967 | or $t3,$at | ||
968 | mflo $t0 | ||
969 | mfhi $t1 | ||
970 | .L_bn_div_words_inner_loop2: | ||
971 | sltu $t2,$t3,$t0 | ||
972 | seq $t8,$HH,$t1 | ||
973 | sltu $at,$HH,$t1 | ||
974 | and $t2,$t8 | ||
975 | sltu $v1,$t0,$a2 | ||
976 | or $at,$t2 | ||
977 | .set noreorder | ||
978 | beqz $at,.L_bn_div_words_inner_loop2_done | ||
979 | $SUBU $t1,$v1 | ||
980 | $SUBU $t0,$a2 | ||
981 | b .L_bn_div_words_inner_loop2 | ||
982 | $SUBU $QT,1 | ||
983 | .set reorder | ||
984 | .L_bn_div_words_inner_loop2_done: | ||
985 | |||
986 | $SUBU $a0,$t3,$t0 | ||
987 | or $v0,$QT | ||
988 | $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it | ||
989 | $SRL $a2,$t9 # restore $a2 | ||
990 | |||
991 | .set noreorder | ||
992 | move $a1,$v1 | ||
993 | ___ | ||
994 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
995 | $REG_L $t3,4*$SZREG($sp) | ||
996 | $REG_L $t2,3*$SZREG($sp) | ||
997 | $REG_L $t1,2*$SZREG($sp) | ||
998 | $REG_L $t0,1*$SZREG($sp) | ||
999 | $REG_L $gp,0*$SZREG($sp) | ||
1000 | $PTR_ADD $sp,6*$SZREG | ||
1001 | ___ | ||
1002 | $code.=<<___; | ||
1003 | jr $ra | ||
1004 | move $a0,$v0 | ||
1005 | .end bn_div_words_internal | ||
1006 | ___ | ||
1007 | undef $HH; undef $QT; undef $DH; | ||
1008 | |||
1009 | ($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); | ||
1010 | ($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); | ||
1011 | |||
1012 | ($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 | ||
1013 | ($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 | ||
1014 | |||
1015 | ($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); | ||
1016 | |||
1017 | $code.=<<___; | ||
1018 | |||
1019 | .align 5 | ||
1020 | .globl bn_mul_comba8 | ||
1021 | .ent bn_mul_comba8 | ||
1022 | bn_mul_comba8: | ||
1023 | .set noreorder | ||
1024 | ___ | ||
1025 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
1026 | .frame $sp,12*$SZREG,$ra | ||
1027 | .mask 0x803ff008,-$SZREG | ||
1028 | $PTR_SUB $sp,12*$SZREG | ||
1029 | $REG_S $ra,11*$SZREG($sp) | ||
1030 | $REG_S $s5,10*$SZREG($sp) | ||
1031 | $REG_S $s4,9*$SZREG($sp) | ||
1032 | $REG_S $s3,8*$SZREG($sp) | ||
1033 | $REG_S $s2,7*$SZREG($sp) | ||
1034 | $REG_S $s1,6*$SZREG($sp) | ||
1035 | $REG_S $s0,5*$SZREG($sp) | ||
1036 | $REG_S $t3,4*$SZREG($sp) | ||
1037 | $REG_S $t2,3*$SZREG($sp) | ||
1038 | $REG_S $t1,2*$SZREG($sp) | ||
1039 | $REG_S $t0,1*$SZREG($sp) | ||
1040 | $REG_S $gp,0*$SZREG($sp) | ||
1041 | ___ | ||
1042 | $code.=<<___ if ($flavour !~ /nubi/i); | ||
1043 | .frame $sp,6*$SZREG,$ra | ||
1044 | .mask 0x003f0000,-$SZREG | ||
1045 | $PTR_SUB $sp,6*$SZREG | ||
1046 | $REG_S $s5,5*$SZREG($sp) | ||
1047 | $REG_S $s4,4*$SZREG($sp) | ||
1048 | $REG_S $s3,3*$SZREG($sp) | ||
1049 | $REG_S $s2,2*$SZREG($sp) | ||
1050 | $REG_S $s1,1*$SZREG($sp) | ||
1051 | $REG_S $s0,0*$SZREG($sp) | ||
1052 | ___ | ||
1053 | $code.=<<___; | ||
1054 | |||
1055 | .set reorder | ||
1056 | $LD $a_0,0($a1) # If compiled with -mips3 option on | ||
1057 | # R5000 box assembler barks on this | ||
1058 | # 1ine with "should not have mult/div | ||
1059 | # as last instruction in bb (R10K | ||
1060 | # bug)" warning. If anybody out there | ||
1061 | # has a clue about how to circumvent | ||
1062 | # this do send me a note. | ||
1063 | # <appro\@fy.chalmers.se> | ||
1064 | |||
1065 | $LD $b_0,0($a2) | ||
1066 | $LD $a_1,$BNSZ($a1) | ||
1067 | $LD $a_2,2*$BNSZ($a1) | ||
1068 | $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); | ||
1069 | $LD $a_3,3*$BNSZ($a1) | ||
1070 | $LD $b_1,$BNSZ($a2) | ||
1071 | $LD $b_2,2*$BNSZ($a2) | ||
1072 | $LD $b_3,3*$BNSZ($a2) | ||
1073 | mflo $c_1 | ||
1074 | mfhi $c_2 | ||
1075 | |||
1076 | $LD $a_4,4*$BNSZ($a1) | ||
1077 | $LD $a_5,5*$BNSZ($a1) | ||
1078 | $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); | ||
1079 | $LD $a_6,6*$BNSZ($a1) | ||
1080 | $LD $a_7,7*$BNSZ($a1) | ||
1081 | $LD $b_4,4*$BNSZ($a2) | ||
1082 | $LD $b_5,5*$BNSZ($a2) | ||
1083 | mflo $t_1 | ||
1084 | mfhi $t_2 | ||
1085 | $ADDU $c_2,$t_1 | ||
1086 | sltu $at,$c_2,$t_1 | ||
1087 | $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); | ||
1088 | $ADDU $c_3,$t_2,$at | ||
1089 | $LD $b_6,6*$BNSZ($a2) | ||
1090 | $LD $b_7,7*$BNSZ($a2) | ||
1091 | $ST $c_1,0($a0) # r[0]=c1; | ||
1092 | mflo $t_1 | ||
1093 | mfhi $t_2 | ||
1094 | $ADDU $c_2,$t_1 | ||
1095 | sltu $at,$c_2,$t_1 | ||
1096 | $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); | ||
1097 | $ADDU $t_2,$at | ||
1098 | $ADDU $c_3,$t_2 | ||
1099 | sltu $c_1,$c_3,$t_2 | ||
1100 | $ST $c_2,$BNSZ($a0) # r[1]=c2; | ||
1101 | |||
1102 | mflo $t_1 | ||
1103 | mfhi $t_2 | ||
1104 | $ADDU $c_3,$t_1 | ||
1105 | sltu $at,$c_3,$t_1 | ||
1106 | $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); | ||
1107 | $ADDU $t_2,$at | ||
1108 | $ADDU $c_1,$t_2 | ||
1109 | mflo $t_1 | ||
1110 | mfhi $t_2 | ||
1111 | $ADDU $c_3,$t_1 | ||
1112 | sltu $at,$c_3,$t_1 | ||
1113 | $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); | ||
1114 | $ADDU $t_2,$at | ||
1115 | $ADDU $c_1,$t_2 | ||
1116 | sltu $c_2,$c_1,$t_2 | ||
1117 | mflo $t_1 | ||
1118 | mfhi $t_2 | ||
1119 | $ADDU $c_3,$t_1 | ||
1120 | sltu $at,$c_3,$t_1 | ||
1121 | $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); | ||
1122 | $ADDU $t_2,$at | ||
1123 | $ADDU $c_1,$t_2 | ||
1124 | sltu $at,$c_1,$t_2 | ||
1125 | $ADDU $c_2,$at | ||
1126 | $ST $c_3,2*$BNSZ($a0) # r[2]=c3; | ||
1127 | |||
1128 | mflo $t_1 | ||
1129 | mfhi $t_2 | ||
1130 | $ADDU $c_1,$t_1 | ||
1131 | sltu $at,$c_1,$t_1 | ||
1132 | $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); | ||
1133 | $ADDU $t_2,$at | ||
1134 | $ADDU $c_2,$t_2 | ||
1135 | sltu $c_3,$c_2,$t_2 | ||
1136 | mflo $t_1 | ||
1137 | mfhi $t_2 | ||
1138 | $ADDU $c_1,$t_1 | ||
1139 | sltu $at,$c_1,$t_1 | ||
1140 | $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); | ||
1141 | $ADDU $t_2,$at | ||
1142 | $ADDU $c_2,$t_2 | ||
1143 | sltu $at,$c_2,$t_2 | ||
1144 | $ADDU $c_3,$at | ||
1145 | mflo $t_1 | ||
1146 | mfhi $t_2 | ||
1147 | $ADDU $c_1,$t_1 | ||
1148 | sltu $at,$c_1,$t_1 | ||
1149 | $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); | ||
1150 | $ADDU $t_2,$at | ||
1151 | $ADDU $c_2,$t_2 | ||
1152 | sltu $at,$c_2,$t_2 | ||
1153 | $ADDU $c_3,$at | ||
1154 | mflo $t_1 | ||
1155 | mfhi $t_2 | ||
1156 | $ADDU $c_1,$t_1 | ||
1157 | sltu $at,$c_1,$t_1 | ||
1158 | $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); | ||
1159 | $ADDU $t_2,$at | ||
1160 | $ADDU $c_2,$t_2 | ||
1161 | sltu $at,$c_2,$t_2 | ||
1162 | $ADDU $c_3,$at | ||
1163 | $ST $c_1,3*$BNSZ($a0) # r[3]=c1; | ||
1164 | |||
1165 | mflo $t_1 | ||
1166 | mfhi $t_2 | ||
1167 | $ADDU $c_2,$t_1 | ||
1168 | sltu $at,$c_2,$t_1 | ||
1169 | $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); | ||
1170 | $ADDU $t_2,$at | ||
1171 | $ADDU $c_3,$t_2 | ||
1172 | sltu $c_1,$c_3,$t_2 | ||
1173 | mflo $t_1 | ||
1174 | mfhi $t_2 | ||
1175 | $ADDU $c_2,$t_1 | ||
1176 | sltu $at,$c_2,$t_1 | ||
1177 | $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); | ||
1178 | $ADDU $t_2,$at | ||
1179 | $ADDU $c_3,$t_2 | ||
1180 | sltu $at,$c_3,$t_2 | ||
1181 | $ADDU $c_1,$at | ||
1182 | mflo $t_1 | ||
1183 | mfhi $t_2 | ||
1184 | $ADDU $c_2,$t_1 | ||
1185 | sltu $at,$c_2,$t_1 | ||
1186 | $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); | ||
1187 | $ADDU $t_2,$at | ||
1188 | $ADDU $c_3,$t_2 | ||
1189 | sltu $at,$c_3,$t_2 | ||
1190 | $ADDU $c_1,$at | ||
1191 | mflo $t_1 | ||
1192 | mfhi $t_2 | ||
1193 | $ADDU $c_2,$t_1 | ||
1194 | sltu $at,$c_2,$t_1 | ||
1195 | $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); | ||
1196 | $ADDU $t_2,$at | ||
1197 | $ADDU $c_3,$t_2 | ||
1198 | sltu $at,$c_3,$t_2 | ||
1199 | $ADDU $c_1,$at | ||
1200 | mflo $t_1 | ||
1201 | mfhi $t_2 | ||
1202 | $ADDU $c_2,$t_1 | ||
1203 | sltu $at,$c_2,$t_1 | ||
1204 | $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); | ||
1205 | $ADDU $t_2,$at | ||
1206 | $ADDU $c_3,$t_2 | ||
1207 | sltu $at,$c_3,$t_2 | ||
1208 | $ADDU $c_1,$at | ||
1209 | $ST $c_2,4*$BNSZ($a0) # r[4]=c2; | ||
1210 | |||
1211 | mflo $t_1 | ||
1212 | mfhi $t_2 | ||
1213 | $ADDU $c_3,$t_1 | ||
1214 | sltu $at,$c_3,$t_1 | ||
1215 | $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); | ||
1216 | $ADDU $t_2,$at | ||
1217 | $ADDU $c_1,$t_2 | ||
1218 | sltu $c_2,$c_1,$t_2 | ||
1219 | mflo $t_1 | ||
1220 | mfhi $t_2 | ||
1221 | $ADDU $c_3,$t_1 | ||
1222 | sltu $at,$c_3,$t_1 | ||
1223 | $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); | ||
1224 | $ADDU $t_2,$at | ||
1225 | $ADDU $c_1,$t_2 | ||
1226 | sltu $at,$c_1,$t_2 | ||
1227 | $ADDU $c_2,$at | ||
1228 | mflo $t_1 | ||
1229 | mfhi $t_2 | ||
1230 | $ADDU $c_3,$t_1 | ||
1231 | sltu $at,$c_3,$t_1 | ||
1232 | $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); | ||
1233 | $ADDU $t_2,$at | ||
1234 | $ADDU $c_1,$t_2 | ||
1235 | sltu $at,$c_1,$t_2 | ||
1236 | $ADDU $c_2,$at | ||
1237 | mflo $t_1 | ||
1238 | mfhi $t_2 | ||
1239 | $ADDU $c_3,$t_1 | ||
1240 | sltu $at,$c_3,$t_1 | ||
1241 | $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); | ||
1242 | $ADDU $t_2,$at | ||
1243 | $ADDU $c_1,$t_2 | ||
1244 | sltu $at,$c_1,$t_2 | ||
1245 | $ADDU $c_2,$at | ||
1246 | mflo $t_1 | ||
1247 | mfhi $t_2 | ||
1248 | $ADDU $c_3,$t_1 | ||
1249 | sltu $at,$c_3,$t_1 | ||
1250 | $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); | ||
1251 | $ADDU $t_2,$at | ||
1252 | $ADDU $c_1,$t_2 | ||
1253 | sltu $at,$c_1,$t_2 | ||
1254 | $ADDU $c_2,$at | ||
1255 | mflo $t_1 | ||
1256 | mfhi $t_2 | ||
1257 | $ADDU $c_3,$t_1 | ||
1258 | sltu $at,$c_3,$t_1 | ||
1259 | $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); | ||
1260 | $ADDU $t_2,$at | ||
1261 | $ADDU $c_1,$t_2 | ||
1262 | sltu $at,$c_1,$t_2 | ||
1263 | $ADDU $c_2,$at | ||
1264 | $ST $c_3,5*$BNSZ($a0) # r[5]=c3; | ||
1265 | |||
1266 | mflo $t_1 | ||
1267 | mfhi $t_2 | ||
1268 | $ADDU $c_1,$t_1 | ||
1269 | sltu $at,$c_1,$t_1 | ||
1270 | $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); | ||
1271 | $ADDU $t_2,$at | ||
1272 | $ADDU $c_2,$t_2 | ||
1273 | sltu $c_3,$c_2,$t_2 | ||
1274 | mflo $t_1 | ||
1275 | mfhi $t_2 | ||
1276 | $ADDU $c_1,$t_1 | ||
1277 | sltu $at,$c_1,$t_1 | ||
1278 | $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); | ||
1279 | $ADDU $t_2,$at | ||
1280 | $ADDU $c_2,$t_2 | ||
1281 | sltu $at,$c_2,$t_2 | ||
1282 | $ADDU $c_3,$at | ||
1283 | mflo $t_1 | ||
1284 | mfhi $t_2 | ||
1285 | $ADDU $c_1,$t_1 | ||
1286 | sltu $at,$c_1,$t_1 | ||
1287 | $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); | ||
1288 | $ADDU $t_2,$at | ||
1289 | $ADDU $c_2,$t_2 | ||
1290 | sltu $at,$c_2,$t_2 | ||
1291 | $ADDU $c_3,$at | ||
1292 | mflo $t_1 | ||
1293 | mfhi $t_2 | ||
1294 | $ADDU $c_1,$t_1 | ||
1295 | sltu $at,$c_1,$t_1 | ||
1296 | $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); | ||
1297 | $ADDU $t_2,$at | ||
1298 | $ADDU $c_2,$t_2 | ||
1299 | sltu $at,$c_2,$t_2 | ||
1300 | $ADDU $c_3,$at | ||
1301 | mflo $t_1 | ||
1302 | mfhi $t_2 | ||
1303 | $ADDU $c_1,$t_1 | ||
1304 | sltu $at,$c_1,$t_1 | ||
1305 | $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); | ||
1306 | $ADDU $t_2,$at | ||
1307 | $ADDU $c_2,$t_2 | ||
1308 | sltu $at,$c_2,$t_2 | ||
1309 | $ADDU $c_3,$at | ||
1310 | mflo $t_1 | ||
1311 | mfhi $t_2 | ||
1312 | $ADDU $c_1,$t_1 | ||
1313 | sltu $at,$c_1,$t_1 | ||
1314 | $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); | ||
1315 | $ADDU $t_2,$at | ||
1316 | $ADDU $c_2,$t_2 | ||
1317 | sltu $at,$c_2,$t_2 | ||
1318 | $ADDU $c_3,$at | ||
1319 | mflo $t_1 | ||
1320 | mfhi $t_2 | ||
1321 | $ADDU $c_1,$t_1 | ||
1322 | sltu $at,$c_1,$t_1 | ||
1323 | $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); | ||
1324 | $ADDU $t_2,$at | ||
1325 | $ADDU $c_2,$t_2 | ||
1326 | sltu $at,$c_2,$t_2 | ||
1327 | $ADDU $c_3,$at | ||
1328 | $ST $c_1,6*$BNSZ($a0) # r[6]=c1; | ||
1329 | |||
1330 | mflo $t_1 | ||
1331 | mfhi $t_2 | ||
1332 | $ADDU $c_2,$t_1 | ||
1333 | sltu $at,$c_2,$t_1 | ||
1334 | $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); | ||
1335 | $ADDU $t_2,$at | ||
1336 | $ADDU $c_3,$t_2 | ||
1337 | sltu $c_1,$c_3,$t_2 | ||
1338 | mflo $t_1 | ||
1339 | mfhi $t_2 | ||
1340 | $ADDU $c_2,$t_1 | ||
1341 | sltu $at,$c_2,$t_1 | ||
1342 | $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); | ||
1343 | $ADDU $t_2,$at | ||
1344 | $ADDU $c_3,$t_2 | ||
1345 | sltu $at,$c_3,$t_2 | ||
1346 | $ADDU $c_1,$at | ||
1347 | mflo $t_1 | ||
1348 | mfhi $t_2 | ||
1349 | $ADDU $c_2,$t_1 | ||
1350 | sltu $at,$c_2,$t_1 | ||
1351 | $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); | ||
1352 | $ADDU $t_2,$at | ||
1353 | $ADDU $c_3,$t_2 | ||
1354 | sltu $at,$c_3,$t_2 | ||
1355 | $ADDU $c_1,$at | ||
1356 | mflo $t_1 | ||
1357 | mfhi $t_2 | ||
1358 | $ADDU $c_2,$t_1 | ||
1359 | sltu $at,$c_2,$t_1 | ||
1360 | $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); | ||
1361 | $ADDU $t_2,$at | ||
1362 | $ADDU $c_3,$t_2 | ||
1363 | sltu $at,$c_3,$t_2 | ||
1364 | $ADDU $c_1,$at | ||
1365 | mflo $t_1 | ||
1366 | mfhi $t_2 | ||
1367 | $ADDU $c_2,$t_1 | ||
1368 | sltu $at,$c_2,$t_1 | ||
1369 | $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); | ||
1370 | $ADDU $t_2,$at | ||
1371 | $ADDU $c_3,$t_2 | ||
1372 | sltu $at,$c_3,$t_2 | ||
1373 | $ADDU $c_1,$at | ||
1374 | mflo $t_1 | ||
1375 | mfhi $t_2 | ||
1376 | $ADDU $c_2,$t_1 | ||
1377 | sltu $at,$c_2,$t_1 | ||
1378 | $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); | ||
1379 | $ADDU $t_2,$at | ||
1380 | $ADDU $c_3,$t_2 | ||
1381 | sltu $at,$c_3,$t_2 | ||
1382 | $ADDU $c_1,$at | ||
1383 | mflo $t_1 | ||
1384 | mfhi $t_2 | ||
1385 | $ADDU $c_2,$t_1 | ||
1386 | sltu $at,$c_2,$t_1 | ||
1387 | $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); | ||
1388 | $ADDU $t_2,$at | ||
1389 | $ADDU $c_3,$t_2 | ||
1390 | sltu $at,$c_3,$t_2 | ||
1391 | $ADDU $c_1,$at | ||
1392 | mflo $t_1 | ||
1393 | mfhi $t_2 | ||
1394 | $ADDU $c_2,$t_1 | ||
1395 | sltu $at,$c_2,$t_1 | ||
1396 | $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); | ||
1397 | $ADDU $t_2,$at | ||
1398 | $ADDU $c_3,$t_2 | ||
1399 | sltu $at,$c_3,$t_2 | ||
1400 | $ADDU $c_1,$at | ||
1401 | $ST $c_2,7*$BNSZ($a0) # r[7]=c2; | ||
1402 | |||
1403 | mflo $t_1 | ||
1404 | mfhi $t_2 | ||
1405 | $ADDU $c_3,$t_1 | ||
1406 | sltu $at,$c_3,$t_1 | ||
1407 | $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); | ||
1408 | $ADDU $t_2,$at | ||
1409 | $ADDU $c_1,$t_2 | ||
1410 | sltu $c_2,$c_1,$t_2 | ||
1411 | mflo $t_1 | ||
1412 | mfhi $t_2 | ||
1413 | $ADDU $c_3,$t_1 | ||
1414 | sltu $at,$c_3,$t_1 | ||
1415 | $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); | ||
1416 | $ADDU $t_2,$at | ||
1417 | $ADDU $c_1,$t_2 | ||
1418 | sltu $at,$c_1,$t_2 | ||
1419 | $ADDU $c_2,$at | ||
1420 | mflo $t_1 | ||
1421 | mfhi $t_2 | ||
1422 | $ADDU $c_3,$t_1 | ||
1423 | sltu $at,$c_3,$t_1 | ||
1424 | $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); | ||
1425 | $ADDU $t_2,$at | ||
1426 | $ADDU $c_1,$t_2 | ||
1427 | sltu $at,$c_1,$t_2 | ||
1428 | $ADDU $c_2,$at | ||
1429 | mflo $t_1 | ||
1430 | mfhi $t_2 | ||
1431 | $ADDU $c_3,$t_1 | ||
1432 | sltu $at,$c_3,$t_1 | ||
1433 | $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); | ||
1434 | $ADDU $t_2,$at | ||
1435 | $ADDU $c_1,$t_2 | ||
1436 | sltu $at,$c_1,$t_2 | ||
1437 | $ADDU $c_2,$at | ||
1438 | mflo $t_1 | ||
1439 | mfhi $t_2 | ||
1440 | $ADDU $c_3,$t_1 | ||
1441 | sltu $at,$c_3,$t_1 | ||
1442 | $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); | ||
1443 | $ADDU $t_2,$at | ||
1444 | $ADDU $c_1,$t_2 | ||
1445 | sltu $at,$c_1,$t_2 | ||
1446 | $ADDU $c_2,$at | ||
1447 | mflo $t_1 | ||
1448 | mfhi $t_2 | ||
1449 | $ADDU $c_3,$t_1 | ||
1450 | sltu $at,$c_3,$t_1 | ||
1451 | $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); | ||
1452 | $ADDU $t_2,$at | ||
1453 | $ADDU $c_1,$t_2 | ||
1454 | sltu $at,$c_1,$t_2 | ||
1455 | $ADDU $c_2,$at | ||
1456 | mflo $t_1 | ||
1457 | mfhi $t_2 | ||
1458 | $ADDU $c_3,$t_1 | ||
1459 | sltu $at,$c_3,$t_1 | ||
1460 | $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); | ||
1461 | $ADDU $t_2,$at | ||
1462 | $ADDU $c_1,$t_2 | ||
1463 | sltu $at,$c_1,$t_2 | ||
1464 | $ADDU $c_2,$at | ||
1465 | $ST $c_3,8*$BNSZ($a0) # r[8]=c3; | ||
1466 | |||
1467 | mflo $t_1 | ||
1468 | mfhi $t_2 | ||
1469 | $ADDU $c_1,$t_1 | ||
1470 | sltu $at,$c_1,$t_1 | ||
1471 | $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); | ||
1472 | $ADDU $t_2,$at | ||
1473 | $ADDU $c_2,$t_2 | ||
1474 | sltu $c_3,$c_2,$t_2 | ||
1475 | mflo $t_1 | ||
1476 | mfhi $t_2 | ||
1477 | $ADDU $c_1,$t_1 | ||
1478 | sltu $at,$c_1,$t_1 | ||
1479 | $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); | ||
1480 | $ADDU $t_2,$at | ||
1481 | $ADDU $c_2,$t_2 | ||
1482 | sltu $at,$c_2,$t_2 | ||
1483 | $ADDU $c_3,$at | ||
1484 | mflo $t_1 | ||
1485 | mfhi $t_2 | ||
1486 | $ADDU $c_1,$t_1 | ||
1487 | sltu $at,$c_1,$t_1 | ||
1488 | $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); | ||
1489 | $ADDU $t_2,$at | ||
1490 | $ADDU $c_2,$t_2 | ||
1491 | sltu $at,$c_2,$t_2 | ||
1492 | $ADDU $c_3,$at | ||
1493 | mflo $t_1 | ||
1494 | mfhi $t_2 | ||
1495 | $ADDU $c_1,$t_1 | ||
1496 | sltu $at,$c_1,$t_1 | ||
1497 | $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); | ||
1498 | $ADDU $t_2,$at | ||
1499 | $ADDU $c_2,$t_2 | ||
1500 | sltu $at,$c_2,$t_2 | ||
1501 | $ADDU $c_3,$at | ||
1502 | mflo $t_1 | ||
1503 | mfhi $t_2 | ||
1504 | $ADDU $c_1,$t_1 | ||
1505 | sltu $at,$c_1,$t_1 | ||
1506 | $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); | ||
1507 | $ADDU $t_2,$at | ||
1508 | $ADDU $c_2,$t_2 | ||
1509 | sltu $at,$c_2,$t_2 | ||
1510 | $ADDU $c_3,$at | ||
1511 | mflo $t_1 | ||
1512 | mfhi $t_2 | ||
1513 | $ADDU $c_1,$t_1 | ||
1514 | sltu $at,$c_1,$t_1 | ||
1515 | $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); | ||
1516 | $ADDU $t_2,$at | ||
1517 | $ADDU $c_2,$t_2 | ||
1518 | sltu $at,$c_2,$t_2 | ||
1519 | $ADDU $c_3,$at | ||
1520 | $ST $c_1,9*$BNSZ($a0) # r[9]=c1; | ||
1521 | |||
1522 | mflo $t_1 | ||
1523 | mfhi $t_2 | ||
1524 | $ADDU $c_2,$t_1 | ||
1525 | sltu $at,$c_2,$t_1 | ||
1526 | $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); | ||
1527 | $ADDU $t_2,$at | ||
1528 | $ADDU $c_3,$t_2 | ||
1529 | sltu $c_1,$c_3,$t_2 | ||
1530 | mflo $t_1 | ||
1531 | mfhi $t_2 | ||
1532 | $ADDU $c_2,$t_1 | ||
1533 | sltu $at,$c_2,$t_1 | ||
1534 | $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); | ||
1535 | $ADDU $t_2,$at | ||
1536 | $ADDU $c_3,$t_2 | ||
1537 | sltu $at,$c_3,$t_2 | ||
1538 | $ADDU $c_1,$at | ||
1539 | mflo $t_1 | ||
1540 | mfhi $t_2 | ||
1541 | $ADDU $c_2,$t_1 | ||
1542 | sltu $at,$c_2,$t_1 | ||
1543 | $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); | ||
1544 | $ADDU $t_2,$at | ||
1545 | $ADDU $c_3,$t_2 | ||
1546 | sltu $at,$c_3,$t_2 | ||
1547 | $ADDU $c_1,$at | ||
1548 | mflo $t_1 | ||
1549 | mfhi $t_2 | ||
1550 | $ADDU $c_2,$t_1 | ||
1551 | sltu $at,$c_2,$t_1 | ||
1552 | $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); | ||
1553 | $ADDU $t_2,$at | ||
1554 | $ADDU $c_3,$t_2 | ||
1555 | sltu $at,$c_3,$t_2 | ||
1556 | $ADDU $c_1,$at | ||
1557 | mflo $t_1 | ||
1558 | mfhi $t_2 | ||
1559 | $ADDU $c_2,$t_1 | ||
1560 | sltu $at,$c_2,$t_1 | ||
1561 | $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2); | ||
1562 | $ADDU $t_2,$at | ||
1563 | $ADDU $c_3,$t_2 | ||
1564 | sltu $at,$c_3,$t_2 | ||
1565 | $ADDU $c_1,$at | ||
1566 | $ST $c_2,10*$BNSZ($a0) # r[10]=c2; | ||
1567 | |||
1568 | mflo $t_1 | ||
1569 | mfhi $t_2 | ||
1570 | $ADDU $c_3,$t_1 | ||
1571 | sltu $at,$c_3,$t_1 | ||
1572 | $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); | ||
1573 | $ADDU $t_2,$at | ||
1574 | $ADDU $c_1,$t_2 | ||
1575 | sltu $c_2,$c_1,$t_2 | ||
1576 | mflo $t_1 | ||
1577 | mfhi $t_2 | ||
1578 | $ADDU $c_3,$t_1 | ||
1579 | sltu $at,$c_3,$t_1 | ||
1580 | $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); | ||
1581 | $ADDU $t_2,$at | ||
1582 | $ADDU $c_1,$t_2 | ||
1583 | sltu $at,$c_1,$t_2 | ||
1584 | $ADDU $c_2,$at | ||
1585 | mflo $t_1 | ||
1586 | mfhi $t_2 | ||
1587 | $ADDU $c_3,$t_1 | ||
1588 | sltu $at,$c_3,$t_1 | ||
1589 | $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); | ||
1590 | $ADDU $t_2,$at | ||
1591 | $ADDU $c_1,$t_2 | ||
1592 | sltu $at,$c_1,$t_2 | ||
1593 | $ADDU $c_2,$at | ||
1594 | mflo $t_1 | ||
1595 | mfhi $t_2 | ||
1596 | $ADDU $c_3,$t_1 | ||
1597 | sltu $at,$c_3,$t_1 | ||
1598 | $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); | ||
1599 | $ADDU $t_2,$at | ||
1600 | $ADDU $c_1,$t_2 | ||
1601 | sltu $at,$c_1,$t_2 | ||
1602 | $ADDU $c_2,$at | ||
1603 | $ST $c_3,11*$BNSZ($a0) # r[11]=c3; | ||
1604 | |||
1605 | mflo $t_1 | ||
1606 | mfhi $t_2 | ||
1607 | $ADDU $c_1,$t_1 | ||
1608 | sltu $at,$c_1,$t_1 | ||
1609 | $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); | ||
1610 | $ADDU $t_2,$at | ||
1611 | $ADDU $c_2,$t_2 | ||
1612 | sltu $c_3,$c_2,$t_2 | ||
1613 | mflo $t_1 | ||
1614 | mfhi $t_2 | ||
1615 | $ADDU $c_1,$t_1 | ||
1616 | sltu $at,$c_1,$t_1 | ||
1617 | $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); | ||
1618 | $ADDU $t_2,$at | ||
1619 | $ADDU $c_2,$t_2 | ||
1620 | sltu $at,$c_2,$t_2 | ||
1621 | $ADDU $c_3,$at | ||
1622 | mflo $t_1 | ||
1623 | mfhi $t_2 | ||
1624 | $ADDU $c_1,$t_1 | ||
1625 | sltu $at,$c_1,$t_1 | ||
1626 | $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); | ||
1627 | $ADDU $t_2,$at | ||
1628 | $ADDU $c_2,$t_2 | ||
1629 | sltu $at,$c_2,$t_2 | ||
1630 | $ADDU $c_3,$at | ||
1631 | $ST $c_1,12*$BNSZ($a0) # r[12]=c1; | ||
1632 | |||
1633 | mflo $t_1 | ||
1634 | mfhi $t_2 | ||
1635 | $ADDU $c_2,$t_1 | ||
1636 | sltu $at,$c_2,$t_1 | ||
1637 | $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); | ||
1638 | $ADDU $t_2,$at | ||
1639 | $ADDU $c_3,$t_2 | ||
1640 | sltu $c_1,$c_3,$t_2 | ||
1641 | mflo $t_1 | ||
1642 | mfhi $t_2 | ||
1643 | $ADDU $c_2,$t_1 | ||
1644 | sltu $at,$c_2,$t_1 | ||
1645 | $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); | ||
1646 | $ADDU $t_2,$at | ||
1647 | $ADDU $c_3,$t_2 | ||
1648 | sltu $at,$c_3,$t_2 | ||
1649 | $ADDU $c_1,$at | ||
1650 | $ST $c_2,13*$BNSZ($a0) # r[13]=c2; | ||
1651 | |||
1652 | mflo $t_1 | ||
1653 | mfhi $t_2 | ||
1654 | $ADDU $c_3,$t_1 | ||
1655 | sltu $at,$c_3,$t_1 | ||
1656 | $ADDU $t_2,$at | ||
1657 | $ADDU $c_1,$t_2 | ||
1658 | $ST $c_3,14*$BNSZ($a0) # r[14]=c3; | ||
1659 | $ST $c_1,15*$BNSZ($a0) # r[15]=c1; | ||
1660 | |||
1661 | .set noreorder | ||
1662 | ___ | ||
1663 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
1664 | $REG_L $s5,10*$SZREG($sp) | ||
1665 | $REG_L $s4,9*$SZREG($sp) | ||
1666 | $REG_L $s3,8*$SZREG($sp) | ||
1667 | $REG_L $s2,7*$SZREG($sp) | ||
1668 | $REG_L $s1,6*$SZREG($sp) | ||
1669 | $REG_L $s0,5*$SZREG($sp) | ||
1670 | $REG_L $t3,4*$SZREG($sp) | ||
1671 | $REG_L $t2,3*$SZREG($sp) | ||
1672 | $REG_L $t1,2*$SZREG($sp) | ||
1673 | $REG_L $t0,1*$SZREG($sp) | ||
1674 | $REG_L $gp,0*$SZREG($sp) | ||
1675 | jr $ra | ||
1676 | $PTR_ADD $sp,12*$SZREG | ||
1677 | ___ | ||
1678 | $code.=<<___ if ($flavour !~ /nubi/i); | ||
1679 | $REG_L $s5,5*$SZREG($sp) | ||
1680 | $REG_L $s4,4*$SZREG($sp) | ||
1681 | $REG_L $s3,3*$SZREG($sp) | ||
1682 | $REG_L $s2,2*$SZREG($sp) | ||
1683 | $REG_L $s1,1*$SZREG($sp) | ||
1684 | $REG_L $s0,0*$SZREG($sp) | ||
1685 | jr $ra | ||
1686 | $PTR_ADD $sp,6*$SZREG | ||
1687 | ___ | ||
1688 | $code.=<<___; | ||
1689 | .end bn_mul_comba8 | ||
1690 | |||
1691 | .align 5 | ||
1692 | .globl bn_mul_comba4 | ||
1693 | .ent bn_mul_comba4 | ||
1694 | bn_mul_comba4: | ||
1695 | ___ | ||
1696 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
1697 | .frame $sp,6*$SZREG,$ra | ||
1698 | .mask 0x8000f008,-$SZREG | ||
1699 | .set noreorder | ||
1700 | $PTR_SUB $sp,6*$SZREG | ||
1701 | $REG_S $ra,5*$SZREG($sp) | ||
1702 | $REG_S $t3,4*$SZREG($sp) | ||
1703 | $REG_S $t2,3*$SZREG($sp) | ||
1704 | $REG_S $t1,2*$SZREG($sp) | ||
1705 | $REG_S $t0,1*$SZREG($sp) | ||
1706 | $REG_S $gp,0*$SZREG($sp) | ||
1707 | ___ | ||
1708 | $code.=<<___; | ||
1709 | .set reorder | ||
1710 | $LD $a_0,0($a1) | ||
1711 | $LD $b_0,0($a2) | ||
1712 | $LD $a_1,$BNSZ($a1) | ||
1713 | $LD $a_2,2*$BNSZ($a1) | ||
1714 | $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); | ||
1715 | $LD $a_3,3*$BNSZ($a1) | ||
1716 | $LD $b_1,$BNSZ($a2) | ||
1717 | $LD $b_2,2*$BNSZ($a2) | ||
1718 | $LD $b_3,3*$BNSZ($a2) | ||
1719 | mflo $c_1 | ||
1720 | mfhi $c_2 | ||
1721 | $ST $c_1,0($a0) | ||
1722 | |||
1723 | $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); | ||
1724 | mflo $t_1 | ||
1725 | mfhi $t_2 | ||
1726 | $ADDU $c_2,$t_1 | ||
1727 | sltu $at,$c_2,$t_1 | ||
1728 | $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); | ||
1729 | $ADDU $c_3,$t_2,$at | ||
1730 | mflo $t_1 | ||
1731 | mfhi $t_2 | ||
1732 | $ADDU $c_2,$t_1 | ||
1733 | sltu $at,$c_2,$t_1 | ||
1734 | $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); | ||
1735 | $ADDU $t_2,$at | ||
1736 | $ADDU $c_3,$t_2 | ||
1737 | sltu $c_1,$c_3,$t_2 | ||
1738 | $ST $c_2,$BNSZ($a0) | ||
1739 | |||
1740 | mflo $t_1 | ||
1741 | mfhi $t_2 | ||
1742 | $ADDU $c_3,$t_1 | ||
1743 | sltu $at,$c_3,$t_1 | ||
1744 | $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); | ||
1745 | $ADDU $t_2,$at | ||
1746 | $ADDU $c_1,$t_2 | ||
1747 | mflo $t_1 | ||
1748 | mfhi $t_2 | ||
1749 | $ADDU $c_3,$t_1 | ||
1750 | sltu $at,$c_3,$t_1 | ||
1751 | $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); | ||
1752 | $ADDU $t_2,$at | ||
1753 | $ADDU $c_1,$t_2 | ||
1754 | sltu $c_2,$c_1,$t_2 | ||
1755 | mflo $t_1 | ||
1756 | mfhi $t_2 | ||
1757 | $ADDU $c_3,$t_1 | ||
1758 | sltu $at,$c_3,$t_1 | ||
1759 | $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); | ||
1760 | $ADDU $t_2,$at | ||
1761 | $ADDU $c_1,$t_2 | ||
1762 | sltu $at,$c_1,$t_2 | ||
1763 | $ADDU $c_2,$at | ||
1764 | $ST $c_3,2*$BNSZ($a0) | ||
1765 | |||
1766 | mflo $t_1 | ||
1767 | mfhi $t_2 | ||
1768 | $ADDU $c_1,$t_1 | ||
1769 | sltu $at,$c_1,$t_1 | ||
1770 | $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); | ||
1771 | $ADDU $t_2,$at | ||
1772 | $ADDU $c_2,$t_2 | ||
1773 | sltu $c_3,$c_2,$t_2 | ||
1774 | mflo $t_1 | ||
1775 | mfhi $t_2 | ||
1776 | $ADDU $c_1,$t_1 | ||
1777 | sltu $at,$c_1,$t_1 | ||
1778 | $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); | ||
1779 | $ADDU $t_2,$at | ||
1780 | $ADDU $c_2,$t_2 | ||
1781 | sltu $at,$c_2,$t_2 | ||
1782 | $ADDU $c_3,$at | ||
1783 | mflo $t_1 | ||
1784 | mfhi $t_2 | ||
1785 | $ADDU $c_1,$t_1 | ||
1786 | sltu $at,$c_1,$t_1 | ||
1787 | $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); | ||
1788 | $ADDU $t_2,$at | ||
1789 | $ADDU $c_2,$t_2 | ||
1790 | sltu $at,$c_2,$t_2 | ||
1791 | $ADDU $c_3,$at | ||
1792 | mflo $t_1 | ||
1793 | mfhi $t_2 | ||
1794 | $ADDU $c_1,$t_1 | ||
1795 | sltu $at,$c_1,$t_1 | ||
1796 | $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); | ||
1797 | $ADDU $t_2,$at | ||
1798 | $ADDU $c_2,$t_2 | ||
1799 | sltu $at,$c_2,$t_2 | ||
1800 | $ADDU $c_3,$at | ||
1801 | $ST $c_1,3*$BNSZ($a0) | ||
1802 | |||
1803 | mflo $t_1 | ||
1804 | mfhi $t_2 | ||
1805 | $ADDU $c_2,$t_1 | ||
1806 | sltu $at,$c_2,$t_1 | ||
1807 | $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); | ||
1808 | $ADDU $t_2,$at | ||
1809 | $ADDU $c_3,$t_2 | ||
1810 | sltu $c_1,$c_3,$t_2 | ||
1811 | mflo $t_1 | ||
1812 | mfhi $t_2 | ||
1813 | $ADDU $c_2,$t_1 | ||
1814 | sltu $at,$c_2,$t_1 | ||
1815 | $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); | ||
1816 | $ADDU $t_2,$at | ||
1817 | $ADDU $c_3,$t_2 | ||
1818 | sltu $at,$c_3,$t_2 | ||
1819 | $ADDU $c_1,$at | ||
1820 | mflo $t_1 | ||
1821 | mfhi $t_2 | ||
1822 | $ADDU $c_2,$t_1 | ||
1823 | sltu $at,$c_2,$t_1 | ||
1824 | $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); | ||
1825 | $ADDU $t_2,$at | ||
1826 | $ADDU $c_3,$t_2 | ||
1827 | sltu $at,$c_3,$t_2 | ||
1828 | $ADDU $c_1,$at | ||
1829 | $ST $c_2,4*$BNSZ($a0) | ||
1830 | |||
1831 | mflo $t_1 | ||
1832 | mfhi $t_2 | ||
1833 | $ADDU $c_3,$t_1 | ||
1834 | sltu $at,$c_3,$t_1 | ||
1835 | $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); | ||
1836 | $ADDU $t_2,$at | ||
1837 | $ADDU $c_1,$t_2 | ||
1838 | sltu $c_2,$c_1,$t_2 | ||
1839 | mflo $t_1 | ||
1840 | mfhi $t_2 | ||
1841 | $ADDU $c_3,$t_1 | ||
1842 | sltu $at,$c_3,$t_1 | ||
1843 | $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); | ||
1844 | $ADDU $t_2,$at | ||
1845 | $ADDU $c_1,$t_2 | ||
1846 | sltu $at,$c_1,$t_2 | ||
1847 | $ADDU $c_2,$at | ||
1848 | $ST $c_3,5*$BNSZ($a0) | ||
1849 | |||
1850 | mflo $t_1 | ||
1851 | mfhi $t_2 | ||
1852 | $ADDU $c_1,$t_1 | ||
1853 | sltu $at,$c_1,$t_1 | ||
1854 | $ADDU $t_2,$at | ||
1855 | $ADDU $c_2,$t_2 | ||
1856 | $ST $c_1,6*$BNSZ($a0) | ||
1857 | $ST $c_2,7*$BNSZ($a0) | ||
1858 | |||
1859 | .set noreorder | ||
1860 | ___ | ||
1861 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
1862 | $REG_L $t3,4*$SZREG($sp) | ||
1863 | $REG_L $t2,3*$SZREG($sp) | ||
1864 | $REG_L $t1,2*$SZREG($sp) | ||
1865 | $REG_L $t0,1*$SZREG($sp) | ||
1866 | $REG_L $gp,0*$SZREG($sp) | ||
1867 | $PTR_ADD $sp,6*$SZREG | ||
1868 | ___ | ||
1869 | $code.=<<___; | ||
1870 | jr $ra | ||
1871 | nop | ||
1872 | .end bn_mul_comba4 | ||
1873 | ___ | ||
1874 | |||
1875 | ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); | ||
1876 | |||
1877 | $code.=<<___; | ||
1878 | |||
1879 | .align 5 | ||
1880 | .globl bn_sqr_comba8 | ||
1881 | .ent bn_sqr_comba8 | ||
1882 | bn_sqr_comba8: | ||
1883 | ___ | ||
1884 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
1885 | .frame $sp,6*$SZREG,$ra | ||
1886 | .mask 0x8000f008,-$SZREG | ||
1887 | .set noreorder | ||
1888 | $PTR_SUB $sp,6*$SZREG | ||
1889 | $REG_S $ra,5*$SZREG($sp) | ||
1890 | $REG_S $t3,4*$SZREG($sp) | ||
1891 | $REG_S $t2,3*$SZREG($sp) | ||
1892 | $REG_S $t1,2*$SZREG($sp) | ||
1893 | $REG_S $t0,1*$SZREG($sp) | ||
1894 | $REG_S $gp,0*$SZREG($sp) | ||
1895 | ___ | ||
1896 | $code.=<<___; | ||
1897 | .set reorder | ||
1898 | $LD $a_0,0($a1) | ||
1899 | $LD $a_1,$BNSZ($a1) | ||
1900 | $LD $a_2,2*$BNSZ($a1) | ||
1901 | $LD $a_3,3*$BNSZ($a1) | ||
1902 | |||
1903 | $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); | ||
1904 | $LD $a_4,4*$BNSZ($a1) | ||
1905 | $LD $a_5,5*$BNSZ($a1) | ||
1906 | $LD $a_6,6*$BNSZ($a1) | ||
1907 | $LD $a_7,7*$BNSZ($a1) | ||
1908 | mflo $c_1 | ||
1909 | mfhi $c_2 | ||
1910 | $ST $c_1,0($a0) | ||
1911 | |||
1912 | $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); | ||
1913 | mflo $t_1 | ||
1914 | mfhi $t_2 | ||
1915 | slt $c_1,$t_2,$zero | ||
1916 | $SLL $t_2,1 | ||
1917 | $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); | ||
1918 | slt $a2,$t_1,$zero | ||
1919 | $ADDU $t_2,$a2 | ||
1920 | $SLL $t_1,1 | ||
1921 | $ADDU $c_2,$t_1 | ||
1922 | sltu $at,$c_2,$t_1 | ||
1923 | $ADDU $c_3,$t_2,$at | ||
1924 | $ST $c_2,$BNSZ($a0) | ||
1925 | |||
1926 | mflo $t_1 | ||
1927 | mfhi $t_2 | ||
1928 | slt $c_2,$t_2,$zero | ||
1929 | $SLL $t_2,1 | ||
1930 | $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); | ||
1931 | slt $a2,$t_1,$zero | ||
1932 | $ADDU $t_2,$a2 | ||
1933 | $SLL $t_1,1 | ||
1934 | $ADDU $c_3,$t_1 | ||
1935 | sltu $at,$c_3,$t_1 | ||
1936 | $ADDU $t_2,$at | ||
1937 | $ADDU $c_1,$t_2 | ||
1938 | sltu $at,$c_1,$t_2 | ||
1939 | $ADDU $c_2,$at | ||
1940 | mflo $t_1 | ||
1941 | mfhi $t_2 | ||
1942 | $ADDU $c_3,$t_1 | ||
1943 | sltu $at,$c_3,$t_1 | ||
1944 | $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); | ||
1945 | $ADDU $t_2,$at | ||
1946 | $ADDU $c_1,$t_2 | ||
1947 | sltu $at,$c_1,$t_2 | ||
1948 | $ADDU $c_2,$at | ||
1949 | $ST $c_3,2*$BNSZ($a0) | ||
1950 | |||
1951 | mflo $t_1 | ||
1952 | mfhi $t_2 | ||
1953 | slt $c_3,$t_2,$zero | ||
1954 | $SLL $t_2,1 | ||
1955 | $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); | ||
1956 | slt $a2,$t_1,$zero | ||
1957 | $ADDU $t_2,$a2 | ||
1958 | $SLL $t_1,1 | ||
1959 | $ADDU $c_1,$t_1 | ||
1960 | sltu $at,$c_1,$t_1 | ||
1961 | $ADDU $t_2,$at | ||
1962 | $ADDU $c_2,$t_2 | ||
1963 | sltu $at,$c_2,$t_2 | ||
1964 | $ADDU $c_3,$at | ||
1965 | mflo $t_1 | ||
1966 | mfhi $t_2 | ||
1967 | slt $at,$t_2,$zero | ||
1968 | $ADDU $c_3,$at | ||
1969 | $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); | ||
1970 | $SLL $t_2,1 | ||
1971 | slt $a2,$t_1,$zero | ||
1972 | $ADDU $t_2,$a2 | ||
1973 | $SLL $t_1,1 | ||
1974 | $ADDU $c_1,$t_1 | ||
1975 | sltu $at,$c_1,$t_1 | ||
1976 | $ADDU $t_2,$at | ||
1977 | $ADDU $c_2,$t_2 | ||
1978 | sltu $at,$c_2,$t_2 | ||
1979 | $ADDU $c_3,$at | ||
1980 | $ST $c_1,3*$BNSZ($a0) | ||
1981 | |||
1982 | mflo $t_1 | ||
1983 | mfhi $t_2 | ||
1984 | slt $c_1,$t_2,$zero | ||
1985 | $SLL $t_2,1 | ||
1986 | $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); | ||
1987 | slt $a2,$t_1,$zero | ||
1988 | $ADDU $t_2,$a2 | ||
1989 | $SLL $t_1,1 | ||
1990 | $ADDU $c_2,$t_1 | ||
1991 | sltu $at,$c_2,$t_1 | ||
1992 | $ADDU $t_2,$at | ||
1993 | $ADDU $c_3,$t_2 | ||
1994 | sltu $at,$c_3,$t_2 | ||
1995 | $ADDU $c_1,$at | ||
1996 | mflo $t_1 | ||
1997 | mfhi $t_2 | ||
1998 | slt $at,$t_2,$zero | ||
1999 | $ADDU $c_1,$at | ||
2000 | $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); | ||
2001 | $SLL $t_2,1 | ||
2002 | slt $a2,$t_1,$zero | ||
2003 | $ADDU $t_2,$a2 | ||
2004 | $SLL $t_1,1 | ||
2005 | $ADDU $c_2,$t_1 | ||
2006 | sltu $at,$c_2,$t_1 | ||
2007 | $ADDU $t_2,$at | ||
2008 | $ADDU $c_3,$t_2 | ||
2009 | sltu $at,$c_3,$t_2 | ||
2010 | $ADDU $c_1,$at | ||
2011 | mflo $t_1 | ||
2012 | mfhi $t_2 | ||
2013 | $ADDU $c_2,$t_1 | ||
2014 | sltu $at,$c_2,$t_1 | ||
2015 | $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); | ||
2016 | $ADDU $t_2,$at | ||
2017 | $ADDU $c_3,$t_2 | ||
2018 | sltu $at,$c_3,$t_2 | ||
2019 | $ADDU $c_1,$at | ||
2020 | $ST $c_2,4*$BNSZ($a0) | ||
2021 | |||
2022 | mflo $t_1 | ||
2023 | mfhi $t_2 | ||
2024 | slt $c_2,$t_2,$zero | ||
2025 | $SLL $t_2,1 | ||
2026 | $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); | ||
2027 | slt $a2,$t_1,$zero | ||
2028 | $ADDU $t_2,$a2 | ||
2029 | $SLL $t_1,1 | ||
2030 | $ADDU $c_3,$t_1 | ||
2031 | sltu $at,$c_3,$t_1 | ||
2032 | $ADDU $t_2,$at | ||
2033 | $ADDU $c_1,$t_2 | ||
2034 | sltu $at,$c_1,$t_2 | ||
2035 | $ADDU $c_2,$at | ||
2036 | mflo $t_1 | ||
2037 | mfhi $t_2 | ||
2038 | slt $at,$t_2,$zero | ||
2039 | $ADDU $c_2,$at | ||
2040 | $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); | ||
2041 | $SLL $t_2,1 | ||
2042 | slt $a2,$t_1,$zero | ||
2043 | $ADDU $t_2,$a2 | ||
2044 | $SLL $t_1,1 | ||
2045 | $ADDU $c_3,$t_1 | ||
2046 | sltu $at,$c_3,$t_1 | ||
2047 | $ADDU $t_2,$at | ||
2048 | $ADDU $c_1,$t_2 | ||
2049 | sltu $at,$c_1,$t_2 | ||
2050 | $ADDU $c_2,$at | ||
2051 | mflo $t_1 | ||
2052 | mfhi $t_2 | ||
2053 | slt $at,$t_2,$zero | ||
2054 | $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); | ||
2055 | $ADDU $c_2,$at | ||
2056 | $SLL $t_2,1 | ||
2057 | slt $a2,$t_1,$zero | ||
2058 | $ADDU $t_2,$a2 | ||
2059 | $SLL $t_1,1 | ||
2060 | $ADDU $c_3,$t_1 | ||
2061 | sltu $at,$c_3,$t_1 | ||
2062 | $ADDU $t_2,$at | ||
2063 | $ADDU $c_1,$t_2 | ||
2064 | sltu $at,$c_1,$t_2 | ||
2065 | $ADDU $c_2,$at | ||
2066 | $ST $c_3,5*$BNSZ($a0) | ||
2067 | |||
2068 | mflo $t_1 | ||
2069 | mfhi $t_2 | ||
2070 | slt $c_3,$t_2,$zero | ||
2071 | $SLL $t_2,1 | ||
2072 | $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); | ||
2073 | slt $a2,$t_1,$zero | ||
2074 | $ADDU $t_2,$a2 | ||
2075 | $SLL $t_1,1 | ||
2076 | $ADDU $c_1,$t_1 | ||
2077 | sltu $at,$c_1,$t_1 | ||
2078 | $ADDU $t_2,$at | ||
2079 | $ADDU $c_2,$t_2 | ||
2080 | sltu $at,$c_2,$t_2 | ||
2081 | $ADDU $c_3,$at | ||
2082 | mflo $t_1 | ||
2083 | mfhi $t_2 | ||
2084 | slt $at,$t_2,$zero | ||
2085 | $ADDU $c_3,$at | ||
2086 | $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); | ||
2087 | $SLL $t_2,1 | ||
2088 | slt $a2,$t_1,$zero | ||
2089 | $ADDU $t_2,$a2 | ||
2090 | $SLL $t_1,1 | ||
2091 | $ADDU $c_1,$t_1 | ||
2092 | sltu $at,$c_1,$t_1 | ||
2093 | $ADDU $t_2,$at | ||
2094 | $ADDU $c_2,$t_2 | ||
2095 | sltu $at,$c_2,$t_2 | ||
2096 | $ADDU $c_3,$at | ||
2097 | mflo $t_1 | ||
2098 | mfhi $t_2 | ||
2099 | slt $at,$t_2,$zero | ||
2100 | $ADDU $c_3,$at | ||
2101 | $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); | ||
2102 | $SLL $t_2,1 | ||
2103 | slt $a2,$t_1,$zero | ||
2104 | $ADDU $t_2,$a2 | ||
2105 | $SLL $t_1,1 | ||
2106 | $ADDU $c_1,$t_1 | ||
2107 | sltu $at,$c_1,$t_1 | ||
2108 | $ADDU $t_2,$at | ||
2109 | $ADDU $c_2,$t_2 | ||
2110 | sltu $at,$c_2,$t_2 | ||
2111 | $ADDU $c_3,$at | ||
2112 | mflo $t_1 | ||
2113 | mfhi $t_2 | ||
2114 | $ADDU $c_1,$t_1 | ||
2115 | sltu $at,$c_1,$t_1 | ||
2116 | $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); | ||
2117 | $ADDU $t_2,$at | ||
2118 | $ADDU $c_2,$t_2 | ||
2119 | sltu $at,$c_2,$t_2 | ||
2120 | $ADDU $c_3,$at | ||
2121 | $ST $c_1,6*$BNSZ($a0) | ||
2122 | |||
2123 | mflo $t_1 | ||
2124 | mfhi $t_2 | ||
2125 | slt $c_1,$t_2,$zero | ||
2126 | $SLL $t_2,1 | ||
2127 | $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); | ||
2128 | slt $a2,$t_1,$zero | ||
2129 | $ADDU $t_2,$a2 | ||
2130 | $SLL $t_1,1 | ||
2131 | $ADDU $c_2,$t_1 | ||
2132 | sltu $at,$c_2,$t_1 | ||
2133 | $ADDU $t_2,$at | ||
2134 | $ADDU $c_3,$t_2 | ||
2135 | sltu $at,$c_3,$t_2 | ||
2136 | $ADDU $c_1,$at | ||
2137 | mflo $t_1 | ||
2138 | mfhi $t_2 | ||
2139 | slt $at,$t_2,$zero | ||
2140 | $ADDU $c_1,$at | ||
2141 | $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); | ||
2142 | $SLL $t_2,1 | ||
2143 | slt $a2,$t_1,$zero | ||
2144 | $ADDU $t_2,$a2 | ||
2145 | $SLL $t_1,1 | ||
2146 | $ADDU $c_2,$t_1 | ||
2147 | sltu $at,$c_2,$t_1 | ||
2148 | $ADDU $t_2,$at | ||
2149 | $ADDU $c_3,$t_2 | ||
2150 | sltu $at,$c_3,$t_2 | ||
2151 | $ADDU $c_1,$at | ||
2152 | mflo $t_1 | ||
2153 | mfhi $t_2 | ||
2154 | slt $at,$t_2,$zero | ||
2155 | $ADDU $c_1,$at | ||
2156 | $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); | ||
2157 | $SLL $t_2,1 | ||
2158 | slt $a2,$t_1,$zero | ||
2159 | $ADDU $t_2,$a2 | ||
2160 | $SLL $t_1,1 | ||
2161 | $ADDU $c_2,$t_1 | ||
2162 | sltu $at,$c_2,$t_1 | ||
2163 | $ADDU $t_2,$at | ||
2164 | $ADDU $c_3,$t_2 | ||
2165 | sltu $at,$c_3,$t_2 | ||
2166 | $ADDU $c_1,$at | ||
2167 | mflo $t_1 | ||
2168 | mfhi $t_2 | ||
2169 | slt $at,$t_2,$zero | ||
2170 | $ADDU $c_1,$at | ||
2171 | $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); | ||
2172 | $SLL $t_2,1 | ||
2173 | slt $a2,$t_1,$zero | ||
2174 | $ADDU $t_2,$a2 | ||
2175 | $SLL $t_1,1 | ||
2176 | $ADDU $c_2,$t_1 | ||
2177 | sltu $at,$c_2,$t_1 | ||
2178 | $ADDU $t_2,$at | ||
2179 | $ADDU $c_3,$t_2 | ||
2180 | sltu $at,$c_3,$t_2 | ||
2181 | $ADDU $c_1,$at | ||
2182 | $ST $c_2,7*$BNSZ($a0) | ||
2183 | |||
2184 | mflo $t_1 | ||
2185 | mfhi $t_2 | ||
2186 | slt $c_2,$t_2,$zero | ||
2187 | $SLL $t_2,1 | ||
2188 | $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); | ||
2189 | slt $a2,$t_1,$zero | ||
2190 | $ADDU $t_2,$a2 | ||
2191 | $SLL $t_1,1 | ||
2192 | $ADDU $c_3,$t_1 | ||
2193 | sltu $at,$c_3,$t_1 | ||
2194 | $ADDU $t_2,$at | ||
2195 | $ADDU $c_1,$t_2 | ||
2196 | sltu $at,$c_1,$t_2 | ||
2197 | $ADDU $c_2,$at | ||
2198 | mflo $t_1 | ||
2199 | mfhi $t_2 | ||
2200 | slt $at,$t_2,$zero | ||
2201 | $ADDU $c_2,$at | ||
2202 | $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); | ||
2203 | $SLL $t_2,1 | ||
2204 | slt $a2,$t_1,$zero | ||
2205 | $ADDU $t_2,$a2 | ||
2206 | $SLL $t_1,1 | ||
2207 | $ADDU $c_3,$t_1 | ||
2208 | sltu $at,$c_3,$t_1 | ||
2209 | $ADDU $t_2,$at | ||
2210 | $ADDU $c_1,$t_2 | ||
2211 | sltu $at,$c_1,$t_2 | ||
2212 | $ADDU $c_2,$at | ||
2213 | mflo $t_1 | ||
2214 | mfhi $t_2 | ||
2215 | slt $at,$t_2,$zero | ||
2216 | $ADDU $c_2,$at | ||
2217 | $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); | ||
2218 | $SLL $t_2,1 | ||
2219 | slt $a2,$t_1,$zero | ||
2220 | $ADDU $t_2,$a2 | ||
2221 | $SLL $t_1,1 | ||
2222 | $ADDU $c_3,$t_1 | ||
2223 | sltu $at,$c_3,$t_1 | ||
2224 | $ADDU $t_2,$at | ||
2225 | $ADDU $c_1,$t_2 | ||
2226 | sltu $at,$c_1,$t_2 | ||
2227 | $ADDU $c_2,$at | ||
2228 | mflo $t_1 | ||
2229 | mfhi $t_2 | ||
2230 | $ADDU $c_3,$t_1 | ||
2231 | sltu $at,$c_3,$t_1 | ||
2232 | $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); | ||
2233 | $ADDU $t_2,$at | ||
2234 | $ADDU $c_1,$t_2 | ||
2235 | sltu $at,$c_1,$t_2 | ||
2236 | $ADDU $c_2,$at | ||
2237 | $ST $c_3,8*$BNSZ($a0) | ||
2238 | |||
2239 | mflo $t_1 | ||
2240 | mfhi $t_2 | ||
2241 | slt $c_3,$t_2,$zero | ||
2242 | $SLL $t_2,1 | ||
2243 | $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); | ||
2244 | slt $a2,$t_1,$zero | ||
2245 | $ADDU $t_2,$a2 | ||
2246 | $SLL $t_1,1 | ||
2247 | $ADDU $c_1,$t_1 | ||
2248 | sltu $at,$c_1,$t_1 | ||
2249 | $ADDU $t_2,$at | ||
2250 | $ADDU $c_2,$t_2 | ||
2251 | sltu $at,$c_2,$t_2 | ||
2252 | $ADDU $c_3,$at | ||
2253 | mflo $t_1 | ||
2254 | mfhi $t_2 | ||
2255 | slt $at,$t_2,$zero | ||
2256 | $ADDU $c_3,$at | ||
2257 | $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); | ||
2258 | $SLL $t_2,1 | ||
2259 | slt $a2,$t_1,$zero | ||
2260 | $ADDU $t_2,$a2 | ||
2261 | $SLL $t_1,1 | ||
2262 | $ADDU $c_1,$t_1 | ||
2263 | sltu $at,$c_1,$t_1 | ||
2264 | $ADDU $t_2,$at | ||
2265 | $ADDU $c_2,$t_2 | ||
2266 | sltu $at,$c_2,$t_2 | ||
2267 | $ADDU $c_3,$at | ||
2268 | mflo $t_1 | ||
2269 | mfhi $t_2 | ||
2270 | slt $at,$t_2,$zero | ||
2271 | $ADDU $c_3,$at | ||
2272 | $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); | ||
2273 | $SLL $t_2,1 | ||
2274 | slt $a2,$t_1,$zero | ||
2275 | $ADDU $t_2,$a2 | ||
2276 | $SLL $t_1,1 | ||
2277 | $ADDU $c_1,$t_1 | ||
2278 | sltu $at,$c_1,$t_1 | ||
2279 | $ADDU $t_2,$at | ||
2280 | $ADDU $c_2,$t_2 | ||
2281 | sltu $at,$c_2,$t_2 | ||
2282 | $ADDU $c_3,$at | ||
2283 | $ST $c_1,9*$BNSZ($a0) | ||
2284 | |||
2285 | mflo $t_1 | ||
2286 | mfhi $t_2 | ||
2287 | slt $c_1,$t_2,$zero | ||
2288 | $SLL $t_2,1 | ||
2289 | $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); | ||
2290 | slt $a2,$t_1,$zero | ||
2291 | $ADDU $t_2,$a2 | ||
2292 | $SLL $t_1,1 | ||
2293 | $ADDU $c_2,$t_1 | ||
2294 | sltu $at,$c_2,$t_1 | ||
2295 | $ADDU $t_2,$at | ||
2296 | $ADDU $c_3,$t_2 | ||
2297 | sltu $at,$c_3,$t_2 | ||
2298 | $ADDU $c_1,$at | ||
2299 | mflo $t_1 | ||
2300 | mfhi $t_2 | ||
2301 | slt $at,$t_2,$zero | ||
2302 | $ADDU $c_1,$at | ||
2303 | $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); | ||
2304 | $SLL $t_2,1 | ||
2305 | slt $a2,$t_1,$zero | ||
2306 | $ADDU $t_2,$a2 | ||
2307 | $SLL $t_1,1 | ||
2308 | $ADDU $c_2,$t_1 | ||
2309 | sltu $at,$c_2,$t_1 | ||
2310 | $ADDU $t_2,$at | ||
2311 | $ADDU $c_3,$t_2 | ||
2312 | sltu $at,$c_3,$t_2 | ||
2313 | $ADDU $c_1,$at | ||
2314 | mflo $t_1 | ||
2315 | mfhi $t_2 | ||
2316 | $ADDU $c_2,$t_1 | ||
2317 | sltu $at,$c_2,$t_1 | ||
2318 | $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); | ||
2319 | $ADDU $t_2,$at | ||
2320 | $ADDU $c_3,$t_2 | ||
2321 | sltu $at,$c_3,$t_2 | ||
2322 | $ADDU $c_1,$at | ||
2323 | $ST $c_2,10*$BNSZ($a0) | ||
2324 | |||
2325 | mflo $t_1 | ||
2326 | mfhi $t_2 | ||
2327 | slt $c_2,$t_2,$zero | ||
2328 | $SLL $t_2,1 | ||
2329 | $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); | ||
2330 | slt $a2,$t_1,$zero | ||
2331 | $ADDU $t_2,$a2 | ||
2332 | $SLL $t_1,1 | ||
2333 | $ADDU $c_3,$t_1 | ||
2334 | sltu $at,$c_3,$t_1 | ||
2335 | $ADDU $t_2,$at | ||
2336 | $ADDU $c_1,$t_2 | ||
2337 | sltu $at,$c_1,$t_2 | ||
2338 | $ADDU $c_2,$at | ||
2339 | mflo $t_1 | ||
2340 | mfhi $t_2 | ||
2341 | slt $at,$t_2,$zero | ||
2342 | $ADDU $c_2,$at | ||
2343 | $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); | ||
2344 | $SLL $t_2,1 | ||
2345 | slt $a2,$t_1,$zero | ||
2346 | $ADDU $t_2,$a2 | ||
2347 | $SLL $t_1,1 | ||
2348 | $ADDU $c_3,$t_1 | ||
2349 | sltu $at,$c_3,$t_1 | ||
2350 | $ADDU $t_2,$at | ||
2351 | $ADDU $c_1,$t_2 | ||
2352 | sltu $at,$c_1,$t_2 | ||
2353 | $ADDU $c_2,$at | ||
2354 | $ST $c_3,11*$BNSZ($a0) | ||
2355 | |||
2356 | mflo $t_1 | ||
2357 | mfhi $t_2 | ||
2358 | slt $c_3,$t_2,$zero | ||
2359 | $SLL $t_2,1 | ||
2360 | $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); | ||
2361 | slt $a2,$t_1,$zero | ||
2362 | $ADDU $t_2,$a2 | ||
2363 | $SLL $t_1,1 | ||
2364 | $ADDU $c_1,$t_1 | ||
2365 | sltu $at,$c_1,$t_1 | ||
2366 | $ADDU $t_2,$at | ||
2367 | $ADDU $c_2,$t_2 | ||
2368 | sltu $at,$c_2,$t_2 | ||
2369 | $ADDU $c_3,$at | ||
2370 | mflo $t_1 | ||
2371 | mfhi $t_2 | ||
2372 | $ADDU $c_1,$t_1 | ||
2373 | sltu $at,$c_1,$t_1 | ||
2374 | $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); | ||
2375 | $ADDU $t_2,$at | ||
2376 | $ADDU $c_2,$t_2 | ||
2377 | sltu $at,$c_2,$t_2 | ||
2378 | $ADDU $c_3,$at | ||
2379 | $ST $c_1,12*$BNSZ($a0) | ||
2380 | |||
2381 | mflo $t_1 | ||
2382 | mfhi $t_2 | ||
2383 | slt $c_1,$t_2,$zero | ||
2384 | $SLL $t_2,1 | ||
2385 | $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); | ||
2386 | slt $a2,$t_1,$zero | ||
2387 | $ADDU $t_2,$a2 | ||
2388 | $SLL $t_1,1 | ||
2389 | $ADDU $c_2,$t_1 | ||
2390 | sltu $at,$c_2,$t_1 | ||
2391 | $ADDU $t_2,$at | ||
2392 | $ADDU $c_3,$t_2 | ||
2393 | sltu $at,$c_3,$t_2 | ||
2394 | $ADDU $c_1,$at | ||
2395 | $ST $c_2,13*$BNSZ($a0) | ||
2396 | |||
2397 | mflo $t_1 | ||
2398 | mfhi $t_2 | ||
2399 | $ADDU $c_3,$t_1 | ||
2400 | sltu $at,$c_3,$t_1 | ||
2401 | $ADDU $t_2,$at | ||
2402 | $ADDU $c_1,$t_2 | ||
2403 | $ST $c_3,14*$BNSZ($a0) | ||
2404 | $ST $c_1,15*$BNSZ($a0) | ||
2405 | |||
2406 | .set noreorder | ||
2407 | ___ | ||
2408 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
2409 | $REG_L $t3,4*$SZREG($sp) | ||
2410 | $REG_L $t2,3*$SZREG($sp) | ||
2411 | $REG_L $t1,2*$SZREG($sp) | ||
2412 | $REG_L $t0,1*$SZREG($sp) | ||
2413 | $REG_L $gp,0*$SZREG($sp) | ||
2414 | $PTR_ADD $sp,6*$SZREG | ||
2415 | ___ | ||
2416 | $code.=<<___; | ||
2417 | jr $ra | ||
2418 | nop | ||
2419 | .end bn_sqr_comba8 | ||
2420 | |||
2421 | .align 5 | ||
2422 | .globl bn_sqr_comba4 | ||
2423 | .ent bn_sqr_comba4 | ||
2424 | bn_sqr_comba4: | ||
2425 | ___ | ||
2426 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
2427 | .frame $sp,6*$SZREG,$ra | ||
2428 | .mask 0x8000f008,-$SZREG | ||
2429 | .set noreorder | ||
2430 | $PTR_SUB $sp,6*$SZREG | ||
2431 | $REG_S $ra,5*$SZREG($sp) | ||
2432 | $REG_S $t3,4*$SZREG($sp) | ||
2433 | $REG_S $t2,3*$SZREG($sp) | ||
2434 | $REG_S $t1,2*$SZREG($sp) | ||
2435 | $REG_S $t0,1*$SZREG($sp) | ||
2436 | $REG_S $gp,0*$SZREG($sp) | ||
2437 | ___ | ||
2438 | $code.=<<___; | ||
2439 | .set reorder | ||
2440 | $LD $a_0,0($a1) | ||
2441 | $LD $a_1,$BNSZ($a1) | ||
2442 | $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); | ||
2443 | $LD $a_2,2*$BNSZ($a1) | ||
2444 | $LD $a_3,3*$BNSZ($a1) | ||
2445 | mflo $c_1 | ||
2446 | mfhi $c_2 | ||
2447 | $ST $c_1,0($a0) | ||
2448 | |||
2449 | $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); | ||
2450 | mflo $t_1 | ||
2451 | mfhi $t_2 | ||
2452 | slt $c_1,$t_2,$zero | ||
2453 | $SLL $t_2,1 | ||
2454 | $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); | ||
2455 | slt $a2,$t_1,$zero | ||
2456 | $ADDU $t_2,$a2 | ||
2457 | $SLL $t_1,1 | ||
2458 | $ADDU $c_2,$t_1 | ||
2459 | sltu $at,$c_2,$t_1 | ||
2460 | $ADDU $c_3,$t_2,$at | ||
2461 | $ST $c_2,$BNSZ($a0) | ||
2462 | |||
2463 | mflo $t_1 | ||
2464 | mfhi $t_2 | ||
2465 | slt $c_2,$t_2,$zero | ||
2466 | $SLL $t_2,1 | ||
2467 | $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); | ||
2468 | slt $a2,$t_1,$zero | ||
2469 | $ADDU $t_2,$a2 | ||
2470 | $SLL $t_1,1 | ||
2471 | $ADDU $c_3,$t_1 | ||
2472 | sltu $at,$c_3,$t_1 | ||
2473 | $ADDU $t_2,$at | ||
2474 | $ADDU $c_1,$t_2 | ||
2475 | sltu $at,$c_1,$t_2 | ||
2476 | $ADDU $c_2,$at | ||
2477 | mflo $t_1 | ||
2478 | mfhi $t_2 | ||
2479 | $ADDU $c_3,$t_1 | ||
2480 | sltu $at,$c_3,$t_1 | ||
2481 | $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); | ||
2482 | $ADDU $t_2,$at | ||
2483 | $ADDU $c_1,$t_2 | ||
2484 | sltu $at,$c_1,$t_2 | ||
2485 | $ADDU $c_2,$at | ||
2486 | $ST $c_3,2*$BNSZ($a0) | ||
2487 | |||
2488 | mflo $t_1 | ||
2489 | mfhi $t_2 | ||
2490 | slt $c_3,$t_2,$zero | ||
2491 | $SLL $t_2,1 | ||
2492 | $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3); | ||
2493 | slt $a2,$t_1,$zero | ||
2494 | $ADDU $t_2,$a2 | ||
2495 | $SLL $t_1,1 | ||
2496 | $ADDU $c_1,$t_1 | ||
2497 | sltu $at,$c_1,$t_1 | ||
2498 | $ADDU $t_2,$at | ||
2499 | $ADDU $c_2,$t_2 | ||
2500 | sltu $at,$c_2,$t_2 | ||
2501 | $ADDU $c_3,$at | ||
2502 | mflo $t_1 | ||
2503 | mfhi $t_2 | ||
2504 | slt $at,$t_2,$zero | ||
2505 | $ADDU $c_3,$at | ||
2506 | $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); | ||
2507 | $SLL $t_2,1 | ||
2508 | slt $a2,$t_1,$zero | ||
2509 | $ADDU $t_2,$a2 | ||
2510 | $SLL $t_1,1 | ||
2511 | $ADDU $c_1,$t_1 | ||
2512 | sltu $at,$c_1,$t_1 | ||
2513 | $ADDU $t_2,$at | ||
2514 | $ADDU $c_2,$t_2 | ||
2515 | sltu $at,$c_2,$t_2 | ||
2516 | $ADDU $c_3,$at | ||
2517 | $ST $c_1,3*$BNSZ($a0) | ||
2518 | |||
2519 | mflo $t_1 | ||
2520 | mfhi $t_2 | ||
2521 | slt $c_1,$t_2,$zero | ||
2522 | $SLL $t_2,1 | ||
2523 | $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); | ||
2524 | slt $a2,$t_1,$zero | ||
2525 | $ADDU $t_2,$a2 | ||
2526 | $SLL $t_1,1 | ||
2527 | $ADDU $c_2,$t_1 | ||
2528 | sltu $at,$c_2,$t_1 | ||
2529 | $ADDU $t_2,$at | ||
2530 | $ADDU $c_3,$t_2 | ||
2531 | sltu $at,$c_3,$t_2 | ||
2532 | $ADDU $c_1,$at | ||
2533 | mflo $t_1 | ||
2534 | mfhi $t_2 | ||
2535 | $ADDU $c_2,$t_1 | ||
2536 | sltu $at,$c_2,$t_1 | ||
2537 | $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); | ||
2538 | $ADDU $t_2,$at | ||
2539 | $ADDU $c_3,$t_2 | ||
2540 | sltu $at,$c_3,$t_2 | ||
2541 | $ADDU $c_1,$at | ||
2542 | $ST $c_2,4*$BNSZ($a0) | ||
2543 | |||
2544 | mflo $t_1 | ||
2545 | mfhi $t_2 | ||
2546 | slt $c_2,$t_2,$zero | ||
2547 | $SLL $t_2,1 | ||
2548 | $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); | ||
2549 | slt $a2,$t_1,$zero | ||
2550 | $ADDU $t_2,$a2 | ||
2551 | $SLL $t_1,1 | ||
2552 | $ADDU $c_3,$t_1 | ||
2553 | sltu $at,$c_3,$t_1 | ||
2554 | $ADDU $t_2,$at | ||
2555 | $ADDU $c_1,$t_2 | ||
2556 | sltu $at,$c_1,$t_2 | ||
2557 | $ADDU $c_2,$at | ||
2558 | $ST $c_3,5*$BNSZ($a0) | ||
2559 | |||
2560 | mflo $t_1 | ||
2561 | mfhi $t_2 | ||
2562 | $ADDU $c_1,$t_1 | ||
2563 | sltu $at,$c_1,$t_1 | ||
2564 | $ADDU $t_2,$at | ||
2565 | $ADDU $c_2,$t_2 | ||
2566 | $ST $c_1,6*$BNSZ($a0) | ||
2567 | $ST $c_2,7*$BNSZ($a0) | ||
2568 | |||
2569 | .set noreorder | ||
2570 | ___ | ||
2571 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
2572 | $REG_L $t3,4*$SZREG($sp) | ||
2573 | $REG_L $t2,3*$SZREG($sp) | ||
2574 | $REG_L $t1,2*$SZREG($sp) | ||
2575 | $REG_L $t0,1*$SZREG($sp) | ||
2576 | $REG_L $gp,0*$SZREG($sp) | ||
2577 | $PTR_ADD $sp,6*$SZREG | ||
2578 | ___ | ||
2579 | $code.=<<___; | ||
2580 | jr $ra | ||
2581 | nop | ||
2582 | .end bn_sqr_comba4 | ||
2583 | ___ | ||
2584 | print $code; | ||
2585 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl new file mode 100644 index 0000000000..54aeb01921 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl | |||
@@ -0,0 +1,1496 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # Copyright (c) 2010-2011 Intel Corp. | ||
4 | # Author: Vinodh.Gopal@intel.com | ||
5 | # Jim Guilford | ||
6 | # Erdinc.Ozturk@intel.com | ||
7 | # Maxim.Perminov@intel.com | ||
8 | # | ||
9 | # More information about algorithm used can be found at: | ||
10 | # http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf | ||
11 | # | ||
12 | # ==================================================================== | ||
13 | # Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
14 | # | ||
15 | # Redistribution and use in source and binary forms, with or without | ||
16 | # modification, are permitted provided that the following conditions | ||
17 | # are met: | ||
18 | # | ||
19 | # 1. Redistributions of source code must retain the above copyright | ||
20 | # notice, this list of conditions and the following disclaimer. | ||
21 | # | ||
22 | # 2. Redistributions in binary form must reproduce the above copyright | ||
23 | # notice, this list of conditions and the following disclaimer in | ||
24 | # the documentation and/or other materials provided with the | ||
25 | # distribution. | ||
26 | # | ||
27 | # 3. All advertising materials mentioning features or use of this | ||
28 | # software must display the following acknowledgment: | ||
29 | # "This product includes software developed by the OpenSSL Project | ||
30 | # for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
31 | # | ||
32 | # 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
33 | # endorse or promote products derived from this software without | ||
34 | # prior written permission. For written permission, please contact | ||
35 | # licensing@OpenSSL.org. | ||
36 | # | ||
37 | # 5. Products derived from this software may not be called "OpenSSL" | ||
38 | # nor may "OpenSSL" appear in their names without prior written | ||
39 | # permission of the OpenSSL Project. | ||
40 | # | ||
41 | # 6. Redistributions of any form whatsoever must retain the following | ||
42 | # acknowledgment: | ||
43 | # "This product includes software developed by the OpenSSL Project | ||
44 | # for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
45 | # | ||
46 | # THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
47 | # EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
48 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
49 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
50 | # ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
51 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
52 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
53 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
54 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
55 | # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
56 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
57 | # OF THE POSSIBILITY OF SUCH DAMAGE. | ||
58 | # ==================================================================== | ||
59 | |||
60 | $flavour = shift; | ||
61 | $output = shift; | ||
62 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
63 | |||
64 | my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
65 | |||
66 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
67 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
68 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
69 | die "can't locate x86_64-xlate.pl"; | ||
70 | |||
71 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
72 | |||
73 | use strict; | ||
74 | my $code=".text\n\n"; | ||
75 | my $m=0; | ||
76 | |||
77 | # | ||
78 | # Define x512 macros | ||
79 | # | ||
80 | |||
81 | #MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2 | ||
82 | # | ||
83 | # uses rax, rdx, and args | ||
84 | sub MULSTEP_512_ADD | ||
85 | { | ||
86 | my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_; | ||
87 | my @X=@$x; # make a copy | ||
88 | $code.=<<___; | ||
89 | mov (+8*0)($SRC2), %rax | ||
90 | mul $OP # rdx:rax = %OP * [0] | ||
91 | mov ($ASRC), $X[0] | ||
92 | add %rax, $X[0] | ||
93 | adc \$0, %rdx | ||
94 | mov $X[0], $DST | ||
95 | ___ | ||
96 | for(my $i=1;$i<8;$i++) { | ||
97 | $code.=<<___; | ||
98 | mov %rdx, $TMP | ||
99 | |||
100 | mov (+8*$i)($SRC2), %rax | ||
101 | mul $OP # rdx:rax = %OP * [$i] | ||
102 | mov (+8*$i)($ASRC), $X[$i] | ||
103 | add %rax, $X[$i] | ||
104 | adc \$0, %rdx | ||
105 | add $TMP, $X[$i] | ||
106 | adc \$0, %rdx | ||
107 | ___ | ||
108 | } | ||
109 | $code.=<<___; | ||
110 | mov %rdx, $X[0] | ||
111 | ___ | ||
112 | } | ||
113 | |||
114 | #MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp | ||
115 | # | ||
116 | # uses rax, rdx, and args | ||
117 | sub MULSTEP_512 | ||
118 | { | ||
119 | my ($x, $DST, $SRC2, $OP, $TMP)=@_; | ||
120 | my @X=@$x; # make a copy | ||
121 | $code.=<<___; | ||
122 | mov (+8*0)($SRC2), %rax | ||
123 | mul $OP # rdx:rax = %OP * [0] | ||
124 | add %rax, $X[0] | ||
125 | adc \$0, %rdx | ||
126 | mov $X[0], $DST | ||
127 | ___ | ||
128 | for(my $i=1;$i<8;$i++) { | ||
129 | $code.=<<___; | ||
130 | mov %rdx, $TMP | ||
131 | |||
132 | mov (+8*$i)($SRC2), %rax | ||
133 | mul $OP # rdx:rax = %OP * [$i] | ||
134 | add %rax, $X[$i] | ||
135 | adc \$0, %rdx | ||
136 | add $TMP, $X[$i] | ||
137 | adc \$0, %rdx | ||
138 | ___ | ||
139 | } | ||
140 | $code.=<<___; | ||
141 | mov %rdx, $X[0] | ||
142 | ___ | ||
143 | } | ||
144 | |||
145 | # | ||
146 | # Swizzle Macros | ||
147 | # | ||
148 | |||
149 | # macro to copy data from flat space to swizzled table | ||
150 | #MACRO swizzle pDst, pSrc, tmp1, tmp2 | ||
151 | # pDst and pSrc are modified | ||
152 | sub swizzle | ||
153 | { | ||
154 | my ($pDst, $pSrc, $cnt, $d0)=@_; | ||
155 | $code.=<<___; | ||
156 | mov \$8, $cnt | ||
157 | loop_$m: | ||
158 | mov ($pSrc), $d0 | ||
159 | mov $d0#w, ($pDst) | ||
160 | shr \$16, $d0 | ||
161 | mov $d0#w, (+64*1)($pDst) | ||
162 | shr \$16, $d0 | ||
163 | mov $d0#w, (+64*2)($pDst) | ||
164 | shr \$16, $d0 | ||
165 | mov $d0#w, (+64*3)($pDst) | ||
166 | lea 8($pSrc), $pSrc | ||
167 | lea 64*4($pDst), $pDst | ||
168 | dec $cnt | ||
169 | jnz loop_$m | ||
170 | ___ | ||
171 | |||
172 | $m++; | ||
173 | } | ||
174 | |||
175 | # macro to copy data from swizzled table to flat space | ||
176 | #MACRO unswizzle pDst, pSrc, tmp*3 | ||
177 | sub unswizzle | ||
178 | { | ||
179 | my ($pDst, $pSrc, $cnt, $d0, $d1)=@_; | ||
180 | $code.=<<___; | ||
181 | mov \$4, $cnt | ||
182 | loop_$m: | ||
183 | movzxw (+64*3+256*0)($pSrc), $d0 | ||
184 | movzxw (+64*3+256*1)($pSrc), $d1 | ||
185 | shl \$16, $d0 | ||
186 | shl \$16, $d1 | ||
187 | mov (+64*2+256*0)($pSrc), $d0#w | ||
188 | mov (+64*2+256*1)($pSrc), $d1#w | ||
189 | shl \$16, $d0 | ||
190 | shl \$16, $d1 | ||
191 | mov (+64*1+256*0)($pSrc), $d0#w | ||
192 | mov (+64*1+256*1)($pSrc), $d1#w | ||
193 | shl \$16, $d0 | ||
194 | shl \$16, $d1 | ||
195 | mov (+64*0+256*0)($pSrc), $d0#w | ||
196 | mov (+64*0+256*1)($pSrc), $d1#w | ||
197 | mov $d0, (+8*0)($pDst) | ||
198 | mov $d1, (+8*1)($pDst) | ||
199 | lea 256*2($pSrc), $pSrc | ||
200 | lea 8*2($pDst), $pDst | ||
201 | sub \$1, $cnt | ||
202 | jnz loop_$m | ||
203 | ___ | ||
204 | |||
205 | $m++; | ||
206 | } | ||
207 | |||
208 | # | ||
209 | # Data Structures | ||
210 | # | ||
211 | |||
212 | # Reduce Data | ||
213 | # | ||
214 | # | ||
215 | # Offset Value | ||
216 | # 0C0 Carries | ||
217 | # 0B8 X2[10] | ||
218 | # 0B0 X2[9] | ||
219 | # 0A8 X2[8] | ||
220 | # 0A0 X2[7] | ||
221 | # 098 X2[6] | ||
222 | # 090 X2[5] | ||
223 | # 088 X2[4] | ||
224 | # 080 X2[3] | ||
225 | # 078 X2[2] | ||
226 | # 070 X2[1] | ||
227 | # 068 X2[0] | ||
228 | # 060 X1[12] P[10] | ||
229 | # 058 X1[11] P[9] Z[8] | ||
230 | # 050 X1[10] P[8] Z[7] | ||
231 | # 048 X1[9] P[7] Z[6] | ||
232 | # 040 X1[8] P[6] Z[5] | ||
233 | # 038 X1[7] P[5] Z[4] | ||
234 | # 030 X1[6] P[4] Z[3] | ||
235 | # 028 X1[5] P[3] Z[2] | ||
236 | # 020 X1[4] P[2] Z[1] | ||
237 | # 018 X1[3] P[1] Z[0] | ||
238 | # 010 X1[2] P[0] Y[2] | ||
239 | # 008 X1[1] Q[1] Y[1] | ||
240 | # 000 X1[0] Q[0] Y[0] | ||
241 | |||
242 | my $X1_offset = 0; # 13 qwords | ||
243 | my $X2_offset = $X1_offset + 13*8; # 11 qwords | ||
244 | my $Carries_offset = $X2_offset + 11*8; # 1 qword | ||
245 | my $Q_offset = 0; # 2 qwords | ||
246 | my $P_offset = $Q_offset + 2*8; # 11 qwords | ||
247 | my $Y_offset = 0; # 3 qwords | ||
248 | my $Z_offset = $Y_offset + 3*8; # 9 qwords | ||
249 | |||
250 | my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords) | ||
251 | |||
252 | # | ||
253 | # Stack Frame | ||
254 | # | ||
255 | # | ||
256 | # offset value | ||
257 | # ... <old stack contents> | ||
258 | # ... | ||
259 | # 280 Garray | ||
260 | |||
261 | # 278 tmp16[15] | ||
262 | # ... ... | ||
263 | # 200 tmp16[0] | ||
264 | |||
265 | # 1F8 tmp[7] | ||
266 | # ... ... | ||
267 | # 1C0 tmp[0] | ||
268 | |||
269 | # 1B8 GT[7] | ||
270 | # ... ... | ||
271 | # 180 GT[0] | ||
272 | |||
273 | # 178 Reduce Data | ||
274 | # ... ... | ||
275 | # 0B8 Reduce Data | ||
276 | # 0B0 reserved | ||
277 | # 0A8 reserved | ||
278 | # 0A0 reserved | ||
279 | # 098 reserved | ||
280 | # 090 reserved | ||
281 | # 088 reduce result addr | ||
282 | # 080 exp[8] | ||
283 | |||
284 | # ... | ||
285 | # 048 exp[1] | ||
286 | # 040 exp[0] | ||
287 | |||
288 | # 038 reserved | ||
289 | # 030 loop_idx | ||
290 | # 028 pg | ||
291 | # 020 i | ||
292 | # 018 pData ; arg 4 | ||
293 | # 010 pG ; arg 2 | ||
294 | # 008 pResult ; arg 1 | ||
295 | # 000 rsp ; stack pointer before subtract | ||
296 | |||
297 | my $rsp_offset = 0; | ||
298 | my $pResult_offset = 8*1 + $rsp_offset; | ||
299 | my $pG_offset = 8*1 + $pResult_offset; | ||
300 | my $pData_offset = 8*1 + $pG_offset; | ||
301 | my $i_offset = 8*1 + $pData_offset; | ||
302 | my $pg_offset = 8*1 + $i_offset; | ||
303 | my $loop_idx_offset = 8*1 + $pg_offset; | ||
304 | my $reserved1_offset = 8*1 + $loop_idx_offset; | ||
305 | my $exp_offset = 8*1 + $reserved1_offset; | ||
306 | my $red_result_addr_offset= 8*9 + $exp_offset; | ||
307 | my $reserved2_offset = 8*1 + $red_result_addr_offset; | ||
308 | my $Reduce_Data_offset = 8*5 + $reserved2_offset; | ||
309 | my $GT_offset = $Red_Data_Size + $Reduce_Data_offset; | ||
310 | my $tmp_offset = 8*8 + $GT_offset; | ||
311 | my $tmp16_offset = 8*8 + $tmp_offset; | ||
312 | my $garray_offset = 8*16 + $tmp16_offset; | ||
313 | my $mem_size = 8*8*32 + $garray_offset; | ||
314 | |||
315 | # | ||
316 | # Offsets within Reduce Data | ||
317 | # | ||
318 | # | ||
319 | # struct MODF_2FOLD_MONT_512_C1_DATA { | ||
320 | # UINT64 t[8][8]; | ||
321 | # UINT64 m[8]; | ||
322 | # UINT64 m1[8]; /* 2^768 % m */ | ||
323 | # UINT64 m2[8]; /* 2^640 % m */ | ||
324 | # UINT64 k1[2]; /* (- 1/m) % 2^128 */ | ||
325 | # }; | ||
326 | |||
327 | my $T = 0; | ||
328 | my $M = 512; # = 8 * 8 * 8 | ||
329 | my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */ | ||
330 | my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */ | ||
331 | my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */ | ||
332 | |||
333 | # | ||
334 | # FUNCTIONS | ||
335 | # | ||
336 | |||
{{{
#
# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
#                  and add 512-bits (8 qwords)
#                  to get 640 bits (10 qwords)
# Input: 128-bit mul source: [rdi+8*1], rbp
#        512-bit mul source: [rsi+8*n]
#        512-bit add source: r15, r14, ..., r9, r8
# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
# Clobbers all regs except: rcx, rsi, rdi
#
# Implementation: two chained 64x512 multiply-accumulate steps emitted by
# MULSTEP_512 (defined earlier in this file, outside this hunk).  The
# register list for the second step is rotated by one (r9..r15,r8);
# NOTE(review): this presumably matches a one-position rotation of the
# running accumulator inside MULSTEP_512 -- confirm against its definition.
$code.=<<___;
.type	MULADD_128x512,\@abi-omnipotent
.align	16
MULADD_128x512:
___
	# The first multiplier qword is already in %rbp (see "Input" above);
	# only the second one needs to be loaded from [rdi+8*1].
	&MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
$code.=<<___;
	mov	(+8*1)(%rdi), %rbp
___
	&MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
$code.=<<___;
	ret
.size	MULADD_128x512,.-MULADD_128x512
___
}}}
362 | |||
{{{
#MULADD_256x512	MACRO	pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
#
# Inputs: pDst: Destination  (768 bits, 12 qwords)
#         pA:   Multiplicand (1024 bits, 16 qwords)
#         pB:   Multiplicand (512 bits, 8 qwords)
# Dst = Ah * B + Al
# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
# Uses registers: arguments, RAX, RDX
sub MULADD_256x512
{
	my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;

	# Emit one 64x512 multiply-accumulate step per high qword A[12..15].
	# The very first step also folds in the low half Al (it uses
	# MULSTEP_512_ADD, which takes $pA so it can reach A[7:0]); the
	# remaining three steps accumulate into the register ring only.
	for my $step (0 .. 3) {
		my $src = 12 + $step;
$code.=<<___;
	mov	(+8*$src)($pA), $OP
___
		if ($step == 0) {
			&MULSTEP_512_ADD($X, "(+8*$step)($pDst)", $pB, $pA, $OP, $TMP);
		}
		else {
			&MULSTEP_512($X, "(+8*$step)($pDst)", $pB, $OP, $TMP);
		}
		push(@$X, shift(@$X));	# rotate the register ring by one
	}
}
400 | |||
#
# mont_reduce(UINT64 *x,	/* 1024 bits, 16 qwords */
#	      UINT64 *m,	/* 512 bits, 8 qwords */
#	      MODF_2FOLD_MONT_512_C1_DATA *data,
#	      UINT64 *r)	/* 512 bits, 8 qwords */
# Input:  x (number to be reduced): tmp16 (Implicit)
#         m (modulus): [pM]  (Implicit)
#         data (reduce data): [pData] (Implicit)
# Output: r (result): Address in [red_res_addr]
#         result also in: r9, r8, r15, r14, r13, r12, r11, r10

# The reduction keeps its 512-bit running value in the register ring
# r8..r15; @X names those registers and is rotated in step with the
# multiply helpers below.
my @X=map("%r$_",(8..15));

$code.=<<___;
.type	mont_reduce,\@abi-omnipotent
.align	16
mont_reduce:
___

# mont_reduce is entered via "call", so every %rsp-relative frame offset
# below is biased by 8 to step over the return address.
# NOTE(review): $X1_offset, $X2_offset, $Q_offset and $Carries_offset are
# declared earlier in this file (outside this hunk) as offsets inside the
# Reduce_Data scratch area -- confirm there.
my $STACK_DEPTH = 8;
#
# X1 = Xh * M1 + Xl
$code.=<<___;
	lea	(+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi	# pX1 (Dst) 769 bits, 13 qwords
	mov	(+$pData_offset+$STACK_DEPTH)(%rsp), %rsi	# pM1 (Bsrc) 512 bits, 8 qwords
	add	\$$M1, %rsi
	lea	(+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx	# X (Asrc) 1024 bits, 16 qwords

___

&MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X);	# rotates @X 4 times
# results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]

$code.=<<___;
	xor	%rax, %rax
	# X1 += xl
	add	(+8*8)(%rcx), $X[4]
	adc	(+8*9)(%rcx), $X[5]
	adc	(+8*10)(%rcx), $X[6]
	adc	(+8*11)(%rcx), $X[7]
	adc	\$0, %rax
	# X1 is now rax, r11-r8, r15-r12, tmp16[3:0]

	#
	# check for carry ;; carry stored in rax
	mov	$X[4], (+8*8)(%rdi)	# rdi points to X1
	mov	$X[5], (+8*9)(%rdi)
	mov	$X[6], %rbp
	mov	$X[7], (+8*11)(%rdi)

	mov	%rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)

	mov	(+8*0)(%rdi), $X[4]
	mov	(+8*1)(%rdi), $X[5]
	mov	(+8*2)(%rdi), $X[6]
	mov	(+8*3)(%rdi), $X[7]

	# X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
	# rdi -> X1
	# rsi -> M1

	#
	# X2 = Xh * M2 + Xl
	# do first part (X2 = Xh * M2)
	add	\$8*10, %rdi			# rdi -> pXh ; 128 bits, 2 qwords
						#        Xh is actually { [rdi+8*1], rbp }
	add	\$`$M2-$M1`, %rsi		# rsi -> M2
	lea	(+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx	# rcx -> pX2 ; 641 bits, 11 qwords
___
# Rotate @X right by two: MULADD_256x512 rotated it left four times, so
# after this the ring is (r10..r15,r8,r9) and $X[6],$X[7] name %r8,%r9 --
# the registers the "r9:r8 = X2[1:0]" loads below expect.
unshift(@X,pop(@X));	unshift(@X,pop(@X));
$code.=<<___;

	call	MULADD_128x512			# args in rcx, rdi / rbp, rsi, r15-r8
	# result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
	mov	(+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax

	# X2 += Xl
	add	(+8*8-8*10)(%rdi), $X[6]	# (-8*10) is to adjust rdi -> Xh to Xl
	adc	(+8*9-8*10)(%rdi), $X[7]
	mov	$X[6], (+8*8)(%rcx)
	mov	$X[7], (+8*9)(%rcx)

	adc	%rax, %rax
	mov	%rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)

	lea	(+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi	# rdi -> pQ ; 128 bits, 2 qwords
	add	\$`$K1-$M2`, %rsi		# rsi -> pK1 ; 128 bits, 2 qwords

	# MUL_128x128t128	rdi, rcx, rsi	; Q = X2 * K1 (bottom half)
	# B1:B0 = rsi[1:0] = K1[1:0]
	# A1:A0 = rcx[1:0] = X2[1:0]
	# Result = rdi[1],rbp = Q[1],rbp
	mov	(%rsi), %r8		# B0
	mov	(+8*1)(%rsi), %rbx	# B1

	mov	(%rcx), %rax		# A0
	mul	%r8			# B0
	mov	%rax, %rbp
	mov	%rdx, %r9

	mov	(+8*1)(%rcx), %rax	# A1
	mul	%r8			# B0
	add	%rax, %r9

	mov	(%rcx), %rax		# A0
	mul	%rbx			# B1
	add	%rax, %r9

	mov	%r9, (+8*1)(%rdi)
	# end MUL_128x128t128

	sub	\$`$K1-$M`, %rsi

	mov	(%rcx), $X[6]
	mov	(+8*1)(%rcx), $X[7]	# r9:r8 = X2[1:0]

	call	MULADD_128x512		# args in rcx, rdi / rbp, rsi, r15-r8
	# result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]

	# load first half of m to rdx, rdi, rbx, rax
	# moved this here for efficiency
	mov	(+8*0)(%rsi), %rax
	mov	(+8*1)(%rsi), %rbx
	mov	(+8*2)(%rsi), %rdi
	mov	(+8*3)(%rsi), %rdx

	# continue with reduction
	mov	(+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp

	add	(+8*8)(%rcx), $X[6]
	adc	(+8*9)(%rcx), $X[7]

	#accumulate the final carry to rbp
	adc	%rbp, %rbp

	# Add in overflow corrections: R = (X2>>128) += T[overflow]
	# R = {r9, r8, r15, r14, ..., r10}
	shl	\$3, %rbp
	mov	(+$pData_offset+$STACK_DEPTH)(%rsp), %rcx	# rsi -> Data (and points to T)
	add	%rcx, %rbp			# pT ; 512 bits, 8 qwords, spread out

	# rsi will be used to generate a mask after the addition
	xor	%rsi, %rsi

	add	(+8*8*0)(%rbp), $X[0]
	adc	(+8*8*1)(%rbp), $X[1]
	adc	(+8*8*2)(%rbp), $X[2]
	adc	(+8*8*3)(%rbp), $X[3]
	adc	(+8*8*4)(%rbp), $X[4]
	adc	(+8*8*5)(%rbp), $X[5]
	adc	(+8*8*6)(%rbp), $X[6]
	adc	(+8*8*7)(%rbp), $X[7]

	# if there is a carry:	rsi = 0xFFFFFFFFFFFFFFFF
	# if carry is clear:	rsi = 0x0000000000000000
	sbb	\$0, %rsi

	# if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
	and	%rsi, %rax
	and	%rsi, %rbx
	and	%rsi, %rdi
	and	%rsi, %rdx

	mov	\$1, %rbp
	sub	%rax, $X[0]
	sbb	%rbx, $X[1]
	sbb	%rdi, $X[2]
	sbb	%rdx, $X[3]

	# if there is a borrow:		rbp = 0
	# if there is no borrow:	rbp = 1
	# this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
	sbb	\$0, %rbp

	#load second half of m to rdx, rdi, rbx, rax

	add	\$$M, %rcx
	mov	(+8*4)(%rcx), %rax
	mov	(+8*5)(%rcx), %rbx
	mov	(+8*6)(%rcx), %rdi
	mov	(+8*7)(%rcx), %rdx

	# use the rsi mask as before
	# if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
	and	%rsi, %rax
	and	%rsi, %rbx
	and	%rsi, %rdi
	and	%rsi, %rdx

	# if rbp = 0, there was a borrow before, it is moved to the carry flag
	# if rbp = 1, there was not a borrow before, carry flag is cleared
	sub	\$1, %rbp

	sbb	%rax, $X[4]
	sbb	%rbx, $X[5]
	sbb	%rdi, $X[6]
	sbb	%rdx, $X[7]

	# write R back to memory

	mov	(+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
	mov	$X[0], (+8*0)(%rsi)
	mov	$X[1], (+8*1)(%rsi)
	mov	$X[2], (+8*2)(%rsi)
	mov	$X[3], (+8*3)(%rsi)
	mov	$X[4], (+8*4)(%rsi)
	mov	$X[5], (+8*5)(%rsi)
	mov	$X[6], (+8*6)(%rsi)
	mov	$X[7], (+8*7)(%rsi)

	ret
.size mont_reduce,.-mont_reduce
___
}}}
615 | |||
{{{
#MUL_512x512	MACRO	pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
#
# Inputs: pDst: Destination  (1024 bits, 16 qwords)
#         pA:   Multiplicand (512 bits, 8 qwords)
#         pB:   Multiplicand (512 bits, 8 qwords)
# Uses registers rax, rdx, args
#   B operand in [pB] and also in x7...x0
sub MUL_512x512
{
	# Fix: the original declared a trailing $pDst_o parameter and then
	# immediately re-declared both $pDst and $pDst_o with a second "my",
	# masking the first declarations ('"my" variable masks earlier
	# declaration in same scope' under "use warnings") and silently
	# discarding any 7th argument.  No caller passes one, so the dead
	# parameter is dropped and fresh names are used instead.
	my ($pDstSpec, $pA, $pB, $x, $OP, $TMP)=@_;
	# Split an optional "base+offset" destination spec, e.g.
	# "%rsp+NNN" -> ("%rsp", "NNN"); $pDst_o is "" when no '+' is
	# present.  (The original pattern's "(.*)?" is equivalent to "(.*)"
	# since ".*" always matches.)
	my ($pDst, $pDst_o) = ($pDstSpec =~ m/([^+]*)\+?(.*)/);
	my @X=@$x;	# local copy of the B-operand register ring

	# Column 0: multiply every B qword (held in registers) by A[0];
	# the lowest product qword goes straight to the destination.
$code.=<<___;
	mov	(+8*0)($pA), $OP

	mov	$X[0], %rax
	mul	$OP	# rdx:rax = %OP * [0]
	mov	%rax, (+$pDst_o+8*0)($pDst)
	mov	%rdx, $X[0]
___
	for(my $i=1;$i<8;$i++) {
$code.=<<___;
	mov	$X[$i], %rax
	mul	$OP	# rdx:rax = %OP * [$i]
	add	%rax, $X[$i-1]
	adc	\$0, %rdx
	mov	%rdx, $X[$i]
___
	}

	# Columns 1..7: one MULSTEP_512 (defined earlier in this file) per
	# remaining A qword, rotating the register ring after each step.
	for(my $i=1;$i<8;$i++) {
$code.=<<___;
	mov	(+8*$i)($pA), $OP
___
		&MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
		push(@X,shift(@X));
	}

	# Store the high 512 bits of the 1024-bit product.
$code.=<<___;
	mov	$X[0], (+$pDst_o+8*8)($pDst)
	mov	$X[1], (+$pDst_o+8*9)($pDst)
	mov	$X[2], (+$pDst_o+8*10)($pDst)
	mov	$X[3], (+$pDst_o+8*11)($pDst)
	mov	$X[4], (+$pDst_o+8*12)($pDst)
	mov	$X[5], (+$pDst_o+8*13)($pDst)
	mov	$X[6], (+$pDst_o+8*14)($pDst)
	mov	$X[7], (+$pDst_o+8*15)($pDst)
___
}
668 | |||
#
# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
# Input:  src1: Address of source 1: rdi
#         src2: Address of source 2: rsi
# Output: dst:  Address of destination: [red_res_addr]
#    src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
# Temp:   Clobbers [tmp16], all registers
$code.=<<___;
.type	mont_mul_a3b,\@abi-omnipotent
.align	16
mont_mul_a3b:
	#
	# multiply tmp = src1 * src2
	# For multiply: dst = rcx, src1 = rdi, src2 = rsi
	# stack depth is extra 8 from call
___
	# Emit the full 512x512->1024 multiply into tmp16; the "+8" in the
	# destination spec accounts for the return address pushed by the
	# caller's "call" (see "stack depth" note above).
	&MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
$code.=<<___;
	#
	# Dst = tmp % m
	# Call reduce(tmp, m, data, dst)

	# tail recursion optimization: jmp to mont_reduce and return from there
	jmp	mont_reduce
	# call mont_reduce
	# ret
.size	mont_mul_a3b,.-mont_mul_a3b
___
}}}
698 | |||
{{{
#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
#
# Input in memory [pA] and also in x7...x0
# Uses all argument registers plus rax and rdx
#
# This version computes all of the off-diagonal terms into memory,
# and then it adds in the diagonal terms

sub SQR_512
{
	# Fix: as in MUL_512x512, the original declared a trailing $pDst_o
	# parameter and then re-declared $pDst/$pDst_o with a second "my",
	# masking the earlier declarations (warning under "use warnings")
	# and discarding any 8th argument.  No caller passes one, so the
	# dead parameter is dropped and fresh names are used.
	my ($pDstSpec, $pA, $x, $A, $tmp, $x7, $x6)=@_;
	# Split an optional "base+offset" destination spec, e.g.
	# "%rsp+NNN" -> ("%rsp", "NNN").
	my ($pDst, $pDst_o) = ($pDstSpec =~ m/([^+]*)\+?(.*)/);
	my @X=@$x;	# make a copy of the A-operand register ring

	# Passes 1..7 below compute the off-diagonal partial products
	# A[i]*A[j] (i<j) into the destination and the register ring; the
	# "finalize" section then doubles them and adds the squares A[i]^2.
$code.=<<___;
	# ------------------
	# first pass 01...07
	# ------------------
	mov	$X[0], $A

	mov	$X[1],%rax
	mul	$A
	mov	%rax, (+$pDst_o+8*1)($pDst)
___
	for(my $i=2;$i<8;$i++) {
$code.=<<___;
	mov	%rdx, $X[$i-2]
	mov	$X[$i],%rax
	mul	$A
	add	%rax, $X[$i-2]
	adc	\$0, %rdx
___
	}
$code.=<<___;
	mov	%rdx, $x7

	mov	$X[0], (+$pDst_o+8*2)($pDst)

	# ------------------
	# second pass 12...17
	# ------------------

	mov	(+8*1)($pA), $A

	mov	(+8*2)($pA),%rax
	mul	$A
	add	%rax, $X[1]
	adc	\$0, %rdx
	mov	$X[1], (+$pDst_o+8*3)($pDst)

	mov	%rdx, $X[0]
	mov	(+8*3)($pA),%rax
	mul	$A
	add	%rax, $X[2]
	adc	\$0, %rdx
	add	$X[0], $X[2]
	adc	\$0, %rdx
	mov	$X[2], (+$pDst_o+8*4)($pDst)

	mov	%rdx, $X[0]
	mov	(+8*4)($pA),%rax
	mul	$A
	add	%rax, $X[3]
	adc	\$0, %rdx
	add	$X[0], $X[3]
	adc	\$0, %rdx

	mov	%rdx, $X[0]
	mov	(+8*5)($pA),%rax
	mul	$A
	add	%rax, $X[4]
	adc	\$0, %rdx
	add	$X[0], $X[4]
	adc	\$0, %rdx

	mov	%rdx, $X[0]
	mov	$X[6],%rax
	mul	$A
	add	%rax, $X[5]
	adc	\$0, %rdx
	add	$X[0], $X[5]
	adc	\$0, %rdx

	mov	%rdx, $X[0]
	mov	$X[7],%rax
	mul	$A
	add	%rax, $x7
	adc	\$0, %rdx
	add	$X[0], $x7
	adc	\$0, %rdx

	mov	%rdx, $X[1]

	# ------------------
	# third pass 23...27
	# ------------------
	mov	(+8*2)($pA), $A

	mov	(+8*3)($pA),%rax
	mul	$A
	add	%rax, $X[3]
	adc	\$0, %rdx
	mov	$X[3], (+$pDst_o+8*5)($pDst)

	mov	%rdx, $X[0]
	mov	(+8*4)($pA),%rax
	mul	$A
	add	%rax, $X[4]
	adc	\$0, %rdx
	add	$X[0], $X[4]
	adc	\$0, %rdx
	mov	$X[4], (+$pDst_o+8*6)($pDst)

	mov	%rdx, $X[0]
	mov	(+8*5)($pA),%rax
	mul	$A
	add	%rax, $X[5]
	adc	\$0, %rdx
	add	$X[0], $X[5]
	adc	\$0, %rdx

	mov	%rdx, $X[0]
	mov	$X[6],%rax
	mul	$A
	add	%rax, $x7
	adc	\$0, %rdx
	add	$X[0], $x7
	adc	\$0, %rdx

	mov	%rdx, $X[0]
	mov	$X[7],%rax
	mul	$A
	add	%rax, $X[1]
	adc	\$0, %rdx
	add	$X[0], $X[1]
	adc	\$0, %rdx

	mov	%rdx, $X[2]

	# ------------------
	# fourth pass 34...37
	# ------------------

	mov	(+8*3)($pA), $A

	mov	(+8*4)($pA),%rax
	mul	$A
	add	%rax, $X[5]
	adc	\$0, %rdx
	mov	$X[5], (+$pDst_o+8*7)($pDst)

	mov	%rdx, $X[0]
	mov	(+8*5)($pA),%rax
	mul	$A
	add	%rax, $x7
	adc	\$0, %rdx
	add	$X[0], $x7
	adc	\$0, %rdx
	mov	$x7, (+$pDst_o+8*8)($pDst)

	mov	%rdx, $X[0]
	mov	$X[6],%rax
	mul	$A
	add	%rax, $X[1]
	adc	\$0, %rdx
	add	$X[0], $X[1]
	adc	\$0, %rdx

	mov	%rdx, $X[0]
	mov	$X[7],%rax
	mul	$A
	add	%rax, $X[2]
	adc	\$0, %rdx
	add	$X[0], $X[2]
	adc	\$0, %rdx

	mov	%rdx, $X[5]

	# ------------------
	# fifth pass 45...47
	# ------------------
	mov	(+8*4)($pA), $A

	mov	(+8*5)($pA),%rax
	mul	$A
	add	%rax, $X[1]
	adc	\$0, %rdx
	mov	$X[1], (+$pDst_o+8*9)($pDst)

	mov	%rdx, $X[0]
	mov	$X[6],%rax
	mul	$A
	add	%rax, $X[2]
	adc	\$0, %rdx
	add	$X[0], $X[2]
	adc	\$0, %rdx
	mov	$X[2], (+$pDst_o+8*10)($pDst)

	mov	%rdx, $X[0]
	mov	$X[7],%rax
	mul	$A
	add	%rax, $X[5]
	adc	\$0, %rdx
	add	$X[0], $X[5]
	adc	\$0, %rdx

	mov	%rdx, $X[1]

	# ------------------
	# sixth pass 56...57
	# ------------------
	mov	(+8*5)($pA), $A

	mov	$X[6],%rax
	mul	$A
	add	%rax, $X[5]
	adc	\$0, %rdx
	mov	$X[5], (+$pDst_o+8*11)($pDst)

	mov	%rdx, $X[0]
	mov	$X[7],%rax
	mul	$A
	add	%rax, $X[1]
	adc	\$0, %rdx
	add	$X[0], $X[1]
	adc	\$0, %rdx
	mov	$X[1], (+$pDst_o+8*12)($pDst)

	mov	%rdx, $X[2]

	# ------------------
	# seventh pass 67
	# ------------------
	mov	$X[6], $A

	mov	$X[7],%rax
	mul	$A
	add	%rax, $X[2]
	adc	\$0, %rdx
	mov	$X[2], (+$pDst_o+8*13)($pDst)

	mov	%rdx, (+$pDst_o+8*14)($pDst)

	# start finalize (add in squares, and double off-terms)
	mov	(+$pDst_o+8*1)($pDst), $X[0]
	mov	(+$pDst_o+8*2)($pDst), $X[1]
	mov	(+$pDst_o+8*3)($pDst), $X[2]
	mov	(+$pDst_o+8*4)($pDst), $X[3]
	mov	(+$pDst_o+8*5)($pDst), $X[4]
	mov	(+$pDst_o+8*6)($pDst), $X[5]

	mov	(+8*3)($pA), %rax
	mul	%rax
	mov	%rax, $x6
	mov	%rdx, $X[6]

	add	$X[0], $X[0]
	adc	$X[1], $X[1]
	adc	$X[2], $X[2]
	adc	$X[3], $X[3]
	adc	$X[4], $X[4]
	adc	$X[5], $X[5]
	adc	\$0, $X[6]

	mov	(+8*0)($pA), %rax
	mul	%rax
	mov	%rax, (+$pDst_o+8*0)($pDst)
	mov	%rdx, $A

	mov	(+8*1)($pA), %rax
	mul	%rax

	add	$A, $X[0]
	adc	%rax, $X[1]
	adc	\$0, %rdx

	mov	%rdx, $A
	mov	$X[0], (+$pDst_o+8*1)($pDst)
	mov	$X[1], (+$pDst_o+8*2)($pDst)

	mov	(+8*2)($pA), %rax
	mul	%rax

	add	$A, $X[2]
	adc	%rax, $X[3]
	adc	\$0, %rdx

	mov	%rdx, $A

	mov	$X[2], (+$pDst_o+8*3)($pDst)
	mov	$X[3], (+$pDst_o+8*4)($pDst)

	xor	$tmp, $tmp
	add	$A, $X[4]
	adc	$x6, $X[5]
	adc	\$0, $tmp

	mov	$X[4], (+$pDst_o+8*5)($pDst)
	mov	$X[5], (+$pDst_o+8*6)($pDst)

	# %%tmp has 0/1 in column 7
	# %%A6 has a full value in column 7

	mov	(+$pDst_o+8*7)($pDst), $X[0]
	mov	(+$pDst_o+8*8)($pDst), $X[1]
	mov	(+$pDst_o+8*9)($pDst), $X[2]
	mov	(+$pDst_o+8*10)($pDst), $X[3]
	mov	(+$pDst_o+8*11)($pDst), $X[4]
	mov	(+$pDst_o+8*12)($pDst), $X[5]
	mov	(+$pDst_o+8*13)($pDst), $x6
	mov	(+$pDst_o+8*14)($pDst), $x7

	mov	$X[7], %rax
	mul	%rax
	mov	%rax, $X[7]
	mov	%rdx, $A

	add	$X[0], $X[0]
	adc	$X[1], $X[1]
	adc	$X[2], $X[2]
	adc	$X[3], $X[3]
	adc	$X[4], $X[4]
	adc	$X[5], $X[5]
	adc	$x6, $x6
	adc	$x7, $x7
	adc	\$0, $A

	add	$tmp, $X[0]

	mov	(+8*4)($pA), %rax
	mul	%rax

	add	$X[6], $X[0]
	adc	%rax, $X[1]
	adc	\$0, %rdx

	mov	%rdx, $tmp

	mov	$X[0], (+$pDst_o+8*7)($pDst)
	mov	$X[1], (+$pDst_o+8*8)($pDst)

	mov	(+8*5)($pA), %rax
	mul	%rax

	add	$tmp, $X[2]
	adc	%rax, $X[3]
	adc	\$0, %rdx

	mov	%rdx, $tmp

	mov	$X[2], (+$pDst_o+8*9)($pDst)
	mov	$X[3], (+$pDst_o+8*10)($pDst)

	mov	(+8*6)($pA), %rax
	mul	%rax

	add	$tmp, $X[4]
	adc	%rax, $X[5]
	adc	\$0, %rdx

	mov	$X[4], (+$pDst_o+8*11)($pDst)
	mov	$X[5], (+$pDst_o+8*12)($pDst)

	add	%rdx, $x6
	adc	$X[7], $x7
	adc	\$0, $A

	mov	$x6, (+$pDst_o+8*13)($pDst)
	mov	$x7, (+$pDst_o+8*14)($pDst)
	mov	$A, (+$pDst_o+8*15)($pDst)
___
}
1071 | |||
#
# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
#
# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
#
$code.=<<___;
.type	sqr_reduce,\@abi-omnipotent
.align	16
sqr_reduce:
	mov	(+$pResult_offset+8)(%rsp), %rcx
___
	# Square pResult (loaded above; "+8" steps over the return address)
	# into tmp16, then fall through to the Montgomery reduction.
	&SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
$code.=<<___;
	# tail recursion optimization: jmp to mont_reduce and return from there
	jmp	mont_reduce
	# call mont_reduce
	# ret
.size	sqr_reduce,.-sqr_reduce
___
}}}
1092 | |||
#
# MAIN FUNCTION
#

#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
#           UINT64 *g,   /* 512 bits, 8 qwords */
#           UINT64 *exp, /* 512 bits, 8 qwords */
#           struct mod_ctx_512 *data)

# window size = 5
# table size = 2^5 = 32
#table_entries	equ	32
#table_size	equ	table_entries * 8
#
# Fixed-window exponentiation: builds a 32-entry table G^0..G^31 (in
# Montgomery space) in the on-stack g-array, then walks the exponent in
# 5-bit windows doing 5 squarings plus one table multiply per window.
# Between calls the running 512-bit value lives in r10..r15,r8,r9 (the
# register convention of mont_mul_a3b / sqr_reduce above).
$code.=<<___;
.globl	mod_exp_512
.type	mod_exp_512,\@function,4
mod_exp_512:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# adjust stack down and then align it with cache boundary
	mov	%rsp, %r8
	sub	\$$mem_size, %rsp
	and	\$-64, %rsp

	# store previous stack pointer and arguments
	mov	%r8, (+$rsp_offset)(%rsp)
	mov	%rdi, (+$pResult_offset)(%rsp)
	mov	%rsi, (+$pG_offset)(%rsp)
	mov	%rcx, (+$pData_offset)(%rsp)
.Lbody:
	# transform g into montgomery space
	# GT = reduce(g * C2) = reduce(g * (2^256))
	# reduce expects to have the input in [tmp16]
	pxor	%xmm4, %xmm4
	movdqu	(+16*0)(%rsi), %xmm0
	movdqu	(+16*1)(%rsi), %xmm1
	movdqu	(+16*2)(%rsi), %xmm2
	movdqu	(+16*3)(%rsi), %xmm3
	movdqa	%xmm4, (+$tmp16_offset+16*0)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*1)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*6)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*7)(%rsp)
	movdqa	%xmm0, (+$tmp16_offset+16*2)(%rsp)
	movdqa	%xmm1, (+$tmp16_offset+16*3)(%rsp)
	movdqa	%xmm2, (+$tmp16_offset+16*4)(%rsp)
	movdqa	%xmm3, (+$tmp16_offset+16*5)(%rsp)

	# load pExp before rdx gets blown away
	movdqu	(+16*0)(%rdx), %xmm0
	movdqu	(+16*1)(%rdx), %xmm1
	movdqu	(+16*2)(%rdx), %xmm2
	movdqu	(+16*3)(%rdx), %xmm3

	lea	(+$GT_offset)(%rsp), %rbx
	mov	%rbx, (+$red_result_addr_offset)(%rsp)
	call	mont_reduce

	# Initialize tmp = C
	lea	(+$tmp_offset)(%rsp), %rcx
	xor	%rax, %rax
	mov	%rax, (+8*0)(%rcx)
	mov	%rax, (+8*1)(%rcx)
	mov	%rax, (+8*3)(%rcx)
	mov	%rax, (+8*4)(%rcx)
	mov	%rax, (+8*5)(%rcx)
	mov	%rax, (+8*6)(%rcx)
	mov	%rax, (+8*7)(%rcx)
	mov	%rax, (+$exp_offset+8*8)(%rsp)
	movq	\$1, (+8*2)(%rcx)

	lea	(+$garray_offset)(%rsp), %rbp
	mov	%rcx, %rsi			# pTmp
	mov	%rbp, %rdi			# Garray[][0]
___

	# swizzle/unswizzle are defined earlier in this file (outside this
	# hunk).  NOTE(review): they appear to convert a table entry between
	# linear form and the interleaved g-array layout (entries are stepped
	# by 2 bytes below) -- confirm against their definitions.
	&swizzle("%rdi", "%rcx", "%rax", "%rbx");

	# for (rax = 31; rax != 0; rax--) {
	#     tmp = reduce(tmp * G)
	#     swizzle(pg, tmp);
	#     pg += 2; }
$code.=<<___;
	mov	\$31, %rax
	mov	%rax, (+$i_offset)(%rsp)
	mov	%rbp, (+$pg_offset)(%rsp)
	# rsi -> pTmp
	mov	%rsi, (+$red_result_addr_offset)(%rsp)
	mov	(+8*0)(%rsi), %r10
	mov	(+8*1)(%rsi), %r11
	mov	(+8*2)(%rsi), %r12
	mov	(+8*3)(%rsi), %r13
	mov	(+8*4)(%rsi), %r14
	mov	(+8*5)(%rsi), %r15
	mov	(+8*6)(%rsi), %r8
	mov	(+8*7)(%rsi), %r9
init_loop:
	lea	(+$GT_offset)(%rsp), %rdi
	call	mont_mul_a3b
	lea	(+$tmp_offset)(%rsp), %rsi
	mov	(+$pg_offset)(%rsp), %rbp
	add	\$2, %rbp
	mov	%rbp, (+$pg_offset)(%rsp)
	mov	%rsi, %rcx			# rcx = rsi = addr of tmp
___

	&swizzle("%rbp", "%rcx", "%rax", "%rbx");
$code.=<<___;
	mov	(+$i_offset)(%rsp), %rax
	sub	\$1, %rax
	mov	%rax, (+$i_offset)(%rsp)
	jne	init_loop

	#
	# Copy exponent onto stack
	movdqa	%xmm0, (+$exp_offset+16*0)(%rsp)
	movdqa	%xmm1, (+$exp_offset+16*1)(%rsp)
	movdqa	%xmm2, (+$exp_offset+16*2)(%rsp)
	movdqa	%xmm3, (+$exp_offset+16*3)(%rsp)


	#
	# Do exponentiation
	# Initialize result to G[exp{511:507}]
	mov	(+$exp_offset+62)(%rsp), %eax
	mov	%rax, %rdx
	shr	\$11, %rax
	and	\$0x07FF, %edx
	mov	%edx, (+$exp_offset+62)(%rsp)
	lea	(+$garray_offset)(%rsp,%rax,2), %rsi
	mov	(+$pResult_offset)(%rsp), %rdx
___

	&unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");

	#
	# Loop variables
	# rcx = [loop_idx] = index: 510-5 to 0 by 5
$code.=<<___;
	movq	\$505, (+$loop_idx_offset)(%rsp)

	mov	(+$pResult_offset)(%rsp), %rcx
	mov	%rcx, (+$red_result_addr_offset)(%rsp)
	mov	(+8*0)(%rcx), %r10
	mov	(+8*1)(%rcx), %r11
	mov	(+8*2)(%rcx), %r12
	mov	(+8*3)(%rcx), %r13
	mov	(+8*4)(%rcx), %r14
	mov	(+8*5)(%rcx), %r15
	mov	(+8*6)(%rcx), %r8
	mov	(+8*7)(%rcx), %r9
	jmp	sqr_2

main_loop_a3b:
	call	sqr_reduce
	call	sqr_reduce
	call	sqr_reduce
sqr_2:
	call	sqr_reduce
	call	sqr_reduce

	#
	# Do multiply, first look up proper value in Garray
	mov	(+$loop_idx_offset)(%rsp), %rcx		# bit index
	mov	%rcx, %rax
	shr	\$4, %rax			# rax is word pointer
	mov	(+$exp_offset)(%rsp,%rax,2), %edx
	and	\$15, %rcx
	shrq	%cl, %rdx
	and	\$0x1F, %rdx

	lea	(+$garray_offset)(%rsp,%rdx,2), %rsi
	lea	(+$tmp_offset)(%rsp), %rdx
	mov	%rdx, %rdi
___

	&unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
	# rdi = tmp = pG

	#
	# Call mod_mul_a1(pDst,  pSrc1, pSrc2, pM, pData)
	#                 result result pG     M   Data
$code.=<<___;
	mov	(+$pResult_offset)(%rsp), %rsi
	call	mont_mul_a3b

	#
	# finish loop
	mov	(+$loop_idx_offset)(%rsp), %rcx
	sub	\$5, %rcx
	mov	%rcx, (+$loop_idx_offset)(%rsp)
	jge	main_loop_a3b

	#

end_main_loop_a3b:
	# transform result out of Montgomery space
	# result = reduce(result)
	mov	(+$pResult_offset)(%rsp), %rdx
	pxor	%xmm4, %xmm4
	movdqu	(+16*0)(%rdx), %xmm0
	movdqu	(+16*1)(%rdx), %xmm1
	movdqu	(+16*2)(%rdx), %xmm2
	movdqu	(+16*3)(%rdx), %xmm3
	movdqa	%xmm4, (+$tmp16_offset+16*4)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*5)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*6)(%rsp)
	movdqa	%xmm4, (+$tmp16_offset+16*7)(%rsp)
	movdqa	%xmm0, (+$tmp16_offset+16*0)(%rsp)
	movdqa	%xmm1, (+$tmp16_offset+16*1)(%rsp)
	movdqa	%xmm2, (+$tmp16_offset+16*2)(%rsp)
	movdqa	%xmm3, (+$tmp16_offset+16*3)(%rsp)
	call	mont_reduce

	# If result > m, subract m
	# load result into r15:r8
	mov	(+$pResult_offset)(%rsp), %rax
	mov	(+8*0)(%rax), %r8
	mov	(+8*1)(%rax), %r9
	mov	(+8*2)(%rax), %r10
	mov	(+8*3)(%rax), %r11
	mov	(+8*4)(%rax), %r12
	mov	(+8*5)(%rax), %r13
	mov	(+8*6)(%rax), %r14
	mov	(+8*7)(%rax), %r15

	# subtract m
	mov	(+$pData_offset)(%rsp), %rbx
	add	\$$M, %rbx

	sub	(+8*0)(%rbx), %r8
	sbb	(+8*1)(%rbx), %r9
	sbb	(+8*2)(%rbx), %r10
	sbb	(+8*3)(%rbx), %r11
	sbb	(+8*4)(%rbx), %r12
	sbb	(+8*5)(%rbx), %r13
	sbb	(+8*6)(%rbx), %r14
	sbb	(+8*7)(%rbx), %r15

	# if Carry is clear, replace result with difference
	mov	(+8*0)(%rax), %rsi
	mov	(+8*1)(%rax), %rdi
	mov	(+8*2)(%rax), %rcx
	mov	(+8*3)(%rax), %rdx
	cmovnc	%r8, %rsi
	cmovnc	%r9, %rdi
	cmovnc	%r10, %rcx
	cmovnc	%r11, %rdx
	mov	%rsi, (+8*0)(%rax)
	mov	%rdi, (+8*1)(%rax)
	mov	%rcx, (+8*2)(%rax)
	mov	%rdx, (+8*3)(%rax)

	mov	(+8*4)(%rax), %rsi
	mov	(+8*5)(%rax), %rdi
	mov	(+8*6)(%rax), %rcx
	mov	(+8*7)(%rax), %rdx
	cmovnc	%r12, %rsi
	cmovnc	%r13, %rdi
	cmovnc	%r14, %rcx
	cmovnc	%r15, %rdx
	mov	%rsi, (+8*4)(%rax)
	mov	%rdi, (+8*5)(%rax)
	mov	%rcx, (+8*6)(%rax)
	mov	%rdx, (+8*7)(%rax)

	mov	(+$rsp_offset)(%rsp), %rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbx
	mov	40(%rsi),%rbp
	lea	48(%rsi),%rsp
.Lepilogue:
	ret
.size mod_exp_512, . - mod_exp_512
___
1375 | |||
1376 | if ($win64) { | ||
1377 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
1378 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
1379 | my $rec="%rcx"; | ||
1380 | my $frame="%rdx"; | ||
1381 | my $context="%r8"; | ||
1382 | my $disp="%r9"; | ||
1383 | |||
1384 | $code.=<<___; | ||
1385 | .extern __imp_RtlVirtualUnwind | ||
1386 | .type mod_exp_512_se_handler,\@abi-omnipotent | ||
1387 | .align 16 | ||
1388 | mod_exp_512_se_handler: | ||
1389 | push %rsi | ||
1390 | push %rdi | ||
1391 | push %rbx | ||
1392 | push %rbp | ||
1393 | push %r12 | ||
1394 | push %r13 | ||
1395 | push %r14 | ||
1396 | push %r15 | ||
1397 | pushfq | ||
1398 | sub \$64,%rsp | ||
1399 | |||
1400 | mov 120($context),%rax # pull context->Rax | ||
1401 | mov 248($context),%rbx # pull context->Rip | ||
1402 | |||
1403 | lea .Lbody(%rip),%r10 | ||
1404 | cmp %r10,%rbx # context->Rip<prologue label | ||
1405 | jb .Lin_prologue | ||
1406 | |||
1407 | mov 152($context),%rax # pull context->Rsp | ||
1408 | |||
1409 | lea .Lepilogue(%rip),%r10 | ||
1410 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
1411 | jae .Lin_prologue | ||
1412 | |||
1413 | mov $rsp_offset(%rax),%rax # pull saved Rsp | ||
1414 | |||
1415 | mov 32(%rax),%rbx | ||
1416 | mov 40(%rax),%rbp | ||
1417 | mov 24(%rax),%r12 | ||
1418 | mov 16(%rax),%r13 | ||
1419 | mov 8(%rax),%r14 | ||
1420 | mov 0(%rax),%r15 | ||
1421 | lea 48(%rax),%rax | ||
1422 | mov %rbx,144($context) # restore context->Rbx | ||
1423 | mov %rbp,160($context) # restore context->Rbp | ||
1424 | mov %r12,216($context) # restore context->R12 | ||
1425 | mov %r13,224($context) # restore context->R13 | ||
1426 | mov %r14,232($context) # restore context->R14 | ||
1427 | mov %r15,240($context) # restore context->R15 | ||
1428 | |||
1429 | .Lin_prologue: | ||
1430 | mov 8(%rax),%rdi | ||
1431 | mov 16(%rax),%rsi | ||
1432 | mov %rax,152($context) # restore context->Rsp | ||
1433 | mov %rsi,168($context) # restore context->Rsi | ||
1434 | mov %rdi,176($context) # restore context->Rdi | ||
1435 | |||
1436 | mov 40($disp),%rdi # disp->ContextRecord | ||
1437 | mov $context,%rsi # context | ||
1438 | mov \$154,%ecx # sizeof(CONTEXT) | ||
1439 | .long 0xa548f3fc # cld; rep movsq | ||
1440 | |||
1441 | mov $disp,%rsi | ||
1442 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
1443 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
1444 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
1445 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
1446 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
1447 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
1448 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
1449 | mov %r10,32(%rsp) # arg5 | ||
1450 | mov %r11,40(%rsp) # arg6 | ||
1451 | mov %r12,48(%rsp) # arg7 | ||
1452 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
1453 | call *__imp_RtlVirtualUnwind(%rip) | ||
1454 | |||
1455 | mov \$1,%eax # ExceptionContinueSearch | ||
1456 | add \$64,%rsp | ||
1457 | popfq | ||
1458 | pop %r15 | ||
1459 | pop %r14 | ||
1460 | pop %r13 | ||
1461 | pop %r12 | ||
1462 | pop %rbp | ||
1463 | pop %rbx | ||
1464 | pop %rdi | ||
1465 | pop %rsi | ||
1466 | ret | ||
1467 | .size mod_exp_512_se_handler,.-mod_exp_512_se_handler | ||
1468 | |||
1469 | .section .pdata | ||
1470 | .align 4 | ||
1471 | .rva .LSEH_begin_mod_exp_512 | ||
1472 | .rva .LSEH_end_mod_exp_512 | ||
1473 | .rva .LSEH_info_mod_exp_512 | ||
1474 | |||
1475 | .section .xdata | ||
1476 | .align 8 | ||
1477 | .LSEH_info_mod_exp_512: | ||
1478 | .byte 9,0,0,0 | ||
1479 | .rva mod_exp_512_se_handler | ||
1480 | ___ | ||
1481 | } | ||
1482 | |||
1483 | sub reg_part { | ||
1484 | my ($reg,$conv)=@_; | ||
1485 | if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } | ||
1486 | elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } | ||
1487 | elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } | ||
1488 | elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } | ||
1489 | return $reg; | ||
1490 | } | ||
1491 | |||
1492 | $code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; | ||
1493 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
1494 | $code =~ s/(\(\+[^)]+\))/eval $1/gem; | ||
1495 | print $code; | ||
1496 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/parisc-mont.pl b/src/lib/libcrypto/bn/asm/parisc-mont.pl new file mode 100644 index 0000000000..4a766a87fb --- /dev/null +++ b/src/lib/libcrypto/bn/asm/parisc-mont.pl | |||
@@ -0,0 +1,993 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # On PA-7100LC this module performs ~90-50% better, less for longer | ||
11 | # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means | ||
12 | # that compiler utilized xmpyu instruction to perform 32x32=64-bit | ||
13 | # multiplication, which in turn means that "baseline" performance was | ||
14 | # optimal in respect to instruction set capabilities. Fair comparison | ||
15 | # with vendor compiler is problematic, because OpenSSL doesn't define | ||
16 | # BN_LLONG [presumably] for historical reasons, which drives compiler | ||
17 | # toward 4 times 16x16=32-bit multiplicatons [plus complementary | ||
18 | # shifts and additions] instead. This means that you should observe | ||
19 | # several times improvement over code generated by vendor compiler | ||
20 | # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual | ||
21 | # improvement coefficient was never collected on PA-7100LC, or any | ||
22 | # other 1.1 CPU, because I don't have access to such machine with | ||
23 | # vendor compiler. But to give you a taste, PA-RISC 1.1 code path | ||
24 | # reportedly outperformed code generated by cc +DA1.1 +O3 by factor | ||
25 | # of ~5x on PA-8600. | ||
26 | # | ||
27 | # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is | ||
28 | # reportedly ~2x faster than vendor compiler generated code [according | ||
29 | # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of | ||
30 | # this implementation is actually 32-bit one, in the sense that it | ||
31 | # operates on 32-bit values. But pa-risc2[W].s operates on arrays of | ||
32 | # 64-bit BN_LONGs... How do they interoperate then? No problem. This | ||
33 | # module picks halves of 64-bit values in reverse order and pretends | ||
34 | # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure" | ||
35 | # 64-bit code such as pa-risc2[W].s then? Well, the thing is that | ||
36 | # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do, | ||
37 | # i.e. there is no "wider" multiplication like on most other 64-bit | ||
38 | # platforms. This means that even being effectively 32-bit, this | ||
39 | # implementation performs "64-bit" computational task in same amount | ||
40 | # of arithmetic operations, most notably multiplications. It requires | ||
41 | # more memory references, most notably to tp[num], but this doesn't | ||
42 | # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC | ||
43 | # 2.0 code path, provides virtually same performance as pa-risc2[W].s: | ||
44 | # it's ~10% better for shortest key length and ~10% worse for longest | ||
45 | # one. | ||
46 | # | ||
47 | # In case it wasn't clear. The module has two distinct code paths: | ||
48 | # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit | ||
49 | # additions and 64-bit integer loads, not to mention specific | ||
50 | # instruction scheduling. In 64-bit build naturally only 2.0 code path | ||
51 | # is assembled. In 32-bit application context both code paths are | ||
52 | # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path | ||
53 | # is taken automatically. Also, in 32-bit build the module imposes | ||
54 | # couple of limitations: vector lengths has to be even and vector | ||
55 | # addresses has to be 64-bit aligned. Normally neither is a problem: | ||
56 | # most common key lengths are even and vectors are commonly malloc-ed, | ||
57 | # which ensures alignment. | ||
58 | # | ||
59 | # Special thanks to polarhome.com for providing HP-UX account on | ||
60 | # PA-RISC 1.1 machine, and to correspondent who chose to remain | ||
61 | # anonymous for testing the code on PA-RISC 2.0 machine. | ||
62 | |||
63 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
64 | |||
65 | $flavour = shift; | ||
66 | $output = shift; | ||
67 | |||
68 | open STDOUT,">$output"; | ||
69 | |||
70 | if ($flavour =~ /64/) { | ||
71 | $LEVEL ="2.0W"; | ||
72 | $SIZE_T =8; | ||
73 | $FRAME_MARKER =80; | ||
74 | $SAVED_RP =16; | ||
75 | $PUSH ="std"; | ||
76 | $PUSHMA ="std,ma"; | ||
77 | $POP ="ldd"; | ||
78 | $POPMB ="ldd,mb"; | ||
79 | $BN_SZ =$SIZE_T; | ||
80 | } else { | ||
81 | $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0"; | ||
82 | $SIZE_T =4; | ||
83 | $FRAME_MARKER =48; | ||
84 | $SAVED_RP =20; | ||
85 | $PUSH ="stw"; | ||
86 | $PUSHMA ="stwm"; | ||
87 | $POP ="ldw"; | ||
88 | $POPMB ="ldwm"; | ||
89 | $BN_SZ =$SIZE_T; | ||
90 | if (open CONF,"<${dir}../../opensslconf.h") { | ||
91 | while(<CONF>) { | ||
92 | if (m/#\s*define\s+SIXTY_FOUR_BIT/) { | ||
93 | $BN_SZ=8; | ||
94 | $LEVEL="2.0"; | ||
95 | last; | ||
96 | } | ||
97 | } | ||
98 | close CONF; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | $FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker | ||
103 | # [+ argument transfer] | ||
104 | $LOCALS=$FRAME-$FRAME_MARKER; | ||
105 | $FRAME+=32; # local variables | ||
106 | |||
107 | $tp="%r31"; | ||
108 | $ti1="%r29"; | ||
109 | $ti0="%r28"; | ||
110 | |||
111 | $rp="%r26"; | ||
112 | $ap="%r25"; | ||
113 | $bp="%r24"; | ||
114 | $np="%r23"; | ||
115 | $n0="%r22"; # passed through stack in 32-bit | ||
116 | $num="%r21"; # passed through stack in 32-bit | ||
117 | $idx="%r20"; | ||
118 | $arrsz="%r19"; | ||
119 | |||
120 | $nm1="%r7"; | ||
121 | $nm0="%r6"; | ||
122 | $ab1="%r5"; | ||
123 | $ab0="%r4"; | ||
124 | |||
125 | $fp="%r3"; | ||
126 | $hi1="%r2"; | ||
127 | $hi0="%r1"; | ||
128 | |||
129 | $xfer=$n0; # accomodates [-16..15] offset in fld[dw]s | ||
130 | |||
131 | $fm0="%fr4"; $fti=$fm0; | ||
132 | $fbi="%fr5L"; | ||
133 | $fn0="%fr5R"; | ||
134 | $fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; | ||
135 | $fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; | ||
136 | |||
137 | $code=<<___; | ||
138 | .LEVEL $LEVEL | ||
139 | .SPACE \$TEXT\$ | ||
140 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
141 | |||
142 | .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | ||
143 | .ALIGN 64 | ||
144 | bn_mul_mont | ||
145 | .PROC | ||
146 | .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 | ||
147 | .ENTRY | ||
148 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
149 | $PUSHMA %r3,$FRAME(%sp) | ||
150 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
151 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
152 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
153 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
154 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
155 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
156 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
157 | ldo -$FRAME(%sp),$fp | ||
158 | ___ | ||
159 | $code.=<<___ if ($SIZE_T==4); | ||
160 | ldw `-$FRAME_MARKER-4`($fp),$n0 | ||
161 | ldw `-$FRAME_MARKER-8`($fp),$num | ||
162 | nop | ||
163 | nop ; alignment | ||
164 | ___ | ||
165 | $code.=<<___ if ($BN_SZ==4); | ||
166 | comiclr,<= 6,$num,%r0 ; are vectors long enough? | ||
167 | b L\$abort | ||
168 | ldi 0,%r28 ; signal "unhandled" | ||
169 | add,ev %r0,$num,$num ; is $num even? | ||
170 | b L\$abort | ||
171 | nop | ||
172 | or $ap,$np,$ti1 | ||
173 | extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? | ||
174 | b L\$abort | ||
175 | nop | ||
176 | nop ; alignment | ||
177 | nop | ||
178 | |||
179 | fldws 0($n0),${fn0} | ||
180 | fldws,ma 4($bp),${fbi} ; bp[0] | ||
181 | ___ | ||
182 | $code.=<<___ if ($BN_SZ==8); | ||
183 | comib,> 3,$num,L\$abort ; are vectors long enough? | ||
184 | ldi 0,%r28 ; signal "unhandled" | ||
185 | addl $num,$num,$num ; I operate on 32-bit values | ||
186 | |||
187 | fldws 4($n0),${fn0} ; only low part of n0 | ||
188 | fldws 4($bp),${fbi} ; bp[0] in flipped word order | ||
189 | ___ | ||
190 | $code.=<<___; | ||
191 | fldds 0($ap),${fai} ; ap[0,1] | ||
192 | fldds 0($np),${fni} ; np[0,1] | ||
193 | |||
194 | sh2addl $num,%r0,$arrsz | ||
195 | ldi 31,$hi0 | ||
196 | ldo 36($arrsz),$hi1 ; space for tp[num+1] | ||
197 | andcm $hi1,$hi0,$hi1 ; align | ||
198 | addl $hi1,%sp,%sp | ||
199 | $PUSH $fp,-$SIZE_T(%sp) | ||
200 | |||
201 | ldo `$LOCALS+16`($fp),$xfer | ||
202 | ldo `$LOCALS+32+4`($fp),$tp | ||
203 | |||
204 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] | ||
205 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] | ||
206 | xmpyu ${fn0},${fab0}R,${fm0} | ||
207 | |||
208 | addl $arrsz,$ap,$ap ; point at the end | ||
209 | addl $arrsz,$np,$np | ||
210 | subi 0,$arrsz,$idx ; j=0 | ||
211 | ldo 8($idx),$idx ; j++++ | ||
212 | |||
213 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
214 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
215 | fstds ${fab0},-16($xfer) | ||
216 | fstds ${fnm0},-8($xfer) | ||
217 | fstds ${fab1},0($xfer) | ||
218 | fstds ${fnm1},8($xfer) | ||
219 | flddx $idx($ap),${fai} ; ap[2,3] | ||
220 | flddx $idx($np),${fni} ; np[2,3] | ||
221 | ___ | ||
222 | $code.=<<___ if ($BN_SZ==4); | ||
223 | mtctl $hi0,%cr11 ; $hi0 still holds 31 | ||
224 | extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 | ||
225 | b L\$parisc11 | ||
226 | nop | ||
227 | ___ | ||
228 | $code.=<<___; # PA-RISC 2.0 code-path | ||
229 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
230 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
231 | ldd -16($xfer),$ab0 | ||
232 | fstds ${fab0},-16($xfer) | ||
233 | |||
234 | extrd,u $ab0,31,32,$hi0 | ||
235 | extrd,u $ab0,63,32,$ab0 | ||
236 | ldd -8($xfer),$nm0 | ||
237 | fstds ${fnm0},-8($xfer) | ||
238 | ldo 8($idx),$idx ; j++++ | ||
239 | addl $ab0,$nm0,$nm0 ; low part is discarded | ||
240 | extrd,u $nm0,31,32,$hi1 | ||
241 | |||
242 | L\$1st | ||
243 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] | ||
244 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
245 | ldd 0($xfer),$ab1 | ||
246 | fstds ${fab1},0($xfer) | ||
247 | addl $hi0,$ab1,$ab1 | ||
248 | extrd,u $ab1,31,32,$hi0 | ||
249 | ldd 8($xfer),$nm1 | ||
250 | fstds ${fnm1},8($xfer) | ||
251 | extrd,u $ab1,63,32,$ab1 | ||
252 | addl $hi1,$nm1,$nm1 | ||
253 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
254 | flddx $idx($np),${fni} ; np[j,j+1] | ||
255 | addl $ab1,$nm1,$nm1 | ||
256 | extrd,u $nm1,31,32,$hi1 | ||
257 | |||
258 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
259 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
260 | ldd -16($xfer),$ab0 | ||
261 | fstds ${fab0},-16($xfer) | ||
262 | addl $hi0,$ab0,$ab0 | ||
263 | extrd,u $ab0,31,32,$hi0 | ||
264 | ldd -8($xfer),$nm0 | ||
265 | fstds ${fnm0},-8($xfer) | ||
266 | extrd,u $ab0,63,32,$ab0 | ||
267 | addl $hi1,$nm0,$nm0 | ||
268 | stw $nm1,-4($tp) ; tp[j-1] | ||
269 | addl $ab0,$nm0,$nm0 | ||
270 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
271 | addib,<> 8,$idx,L\$1st ; j++++ | ||
272 | extrd,u $nm0,31,32,$hi1 | ||
273 | |||
274 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] | ||
275 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
276 | ldd 0($xfer),$ab1 | ||
277 | fstds ${fab1},0($xfer) | ||
278 | addl $hi0,$ab1,$ab1 | ||
279 | extrd,u $ab1,31,32,$hi0 | ||
280 | ldd 8($xfer),$nm1 | ||
281 | fstds ${fnm1},8($xfer) | ||
282 | extrd,u $ab1,63,32,$ab1 | ||
283 | addl $hi1,$nm1,$nm1 | ||
284 | ldd -16($xfer),$ab0 | ||
285 | addl $ab1,$nm1,$nm1 | ||
286 | ldd -8($xfer),$nm0 | ||
287 | extrd,u $nm1,31,32,$hi1 | ||
288 | |||
289 | addl $hi0,$ab0,$ab0 | ||
290 | extrd,u $ab0,31,32,$hi0 | ||
291 | stw $nm1,-4($tp) ; tp[j-1] | ||
292 | extrd,u $ab0,63,32,$ab0 | ||
293 | addl $hi1,$nm0,$nm0 | ||
294 | ldd 0($xfer),$ab1 | ||
295 | addl $ab0,$nm0,$nm0 | ||
296 | ldd,mb 8($xfer),$nm1 | ||
297 | extrd,u $nm0,31,32,$hi1 | ||
298 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
299 | |||
300 | ldo -1($num),$num ; i-- | ||
301 | subi 0,$arrsz,$idx ; j=0 | ||
302 | ___ | ||
303 | $code.=<<___ if ($BN_SZ==4); | ||
304 | fldws,ma 4($bp),${fbi} ; bp[1] | ||
305 | ___ | ||
306 | $code.=<<___ if ($BN_SZ==8); | ||
307 | fldws 0($bp),${fbi} ; bp[1] in flipped word order | ||
308 | ___ | ||
309 | $code.=<<___; | ||
310 | flddx $idx($ap),${fai} ; ap[0,1] | ||
311 | flddx $idx($np),${fni} ; np[0,1] | ||
312 | fldws 8($xfer),${fti}R ; tp[0] | ||
313 | addl $hi0,$ab1,$ab1 | ||
314 | extrd,u $ab1,31,32,$hi0 | ||
315 | extrd,u $ab1,63,32,$ab1 | ||
316 | ldo 8($idx),$idx ; j++++ | ||
317 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] | ||
318 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] | ||
319 | addl $hi1,$nm1,$nm1 | ||
320 | addl $ab1,$nm1,$nm1 | ||
321 | extrd,u $nm1,31,32,$hi1 | ||
322 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
323 | stw $nm1,-4($tp) ; tp[j-1] | ||
324 | |||
325 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
326 | fcpy,sgl %fr0,${fab0}L | ||
327 | addl $hi1,$hi0,$hi0 | ||
328 | extrd,u $hi0,31,32,$hi1 | ||
329 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
330 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
331 | stw $hi0,0($tp) | ||
332 | stw $hi1,4($tp) | ||
333 | |||
334 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
335 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
336 | xmpyu ${fn0},${fab0}R,${fm0} | ||
337 | ldo `$LOCALS+32+4`($fp),$tp | ||
338 | L\$outer | ||
339 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
340 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
341 | fstds ${fab0},-16($xfer) ; 33-bit value | ||
342 | fstds ${fnm0},-8($xfer) | ||
343 | flddx $idx($ap),${fai} ; ap[2] | ||
344 | flddx $idx($np),${fni} ; np[2] | ||
345 | ldo 8($idx),$idx ; j++++ | ||
346 | ldd -16($xfer),$ab0 ; 33-bit value | ||
347 | ldd -8($xfer),$nm0 | ||
348 | ldw 0($xfer),$hi0 ; high part | ||
349 | |||
350 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
351 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
352 | extrd,u $ab0,31,32,$ti0 ; carry bit | ||
353 | extrd,u $ab0,63,32,$ab0 | ||
354 | fstds ${fab1},0($xfer) | ||
355 | addl $ti0,$hi0,$hi0 ; account carry bit | ||
356 | fstds ${fnm1},8($xfer) | ||
357 | addl $ab0,$nm0,$nm0 ; low part is discarded | ||
358 | ldw 0($tp),$ti1 ; tp[1] | ||
359 | extrd,u $nm0,31,32,$hi1 | ||
360 | fstds ${fab0},-16($xfer) | ||
361 | fstds ${fnm0},-8($xfer) | ||
362 | |||
363 | L\$inner | ||
364 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] | ||
365 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
366 | ldd 0($xfer),$ab1 | ||
367 | fstds ${fab1},0($xfer) | ||
368 | addl $hi0,$ti1,$ti1 | ||
369 | addl $ti1,$ab1,$ab1 | ||
370 | ldd 8($xfer),$nm1 | ||
371 | fstds ${fnm1},8($xfer) | ||
372 | extrd,u $ab1,31,32,$hi0 | ||
373 | extrd,u $ab1,63,32,$ab1 | ||
374 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
375 | flddx $idx($np),${fni} ; np[j,j+1] | ||
376 | addl $hi1,$nm1,$nm1 | ||
377 | addl $ab1,$nm1,$nm1 | ||
378 | ldw 4($tp),$ti0 ; tp[j] | ||
379 | stw $nm1,-4($tp) ; tp[j-1] | ||
380 | |||
381 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
382 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
383 | ldd -16($xfer),$ab0 | ||
384 | fstds ${fab0},-16($xfer) | ||
385 | addl $hi0,$ti0,$ti0 | ||
386 | addl $ti0,$ab0,$ab0 | ||
387 | ldd -8($xfer),$nm0 | ||
388 | fstds ${fnm0},-8($xfer) | ||
389 | extrd,u $ab0,31,32,$hi0 | ||
390 | extrd,u $nm1,31,32,$hi1 | ||
391 | ldw 8($tp),$ti1 ; tp[j] | ||
392 | extrd,u $ab0,63,32,$ab0 | ||
393 | addl $hi1,$nm0,$nm0 | ||
394 | addl $ab0,$nm0,$nm0 | ||
395 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
396 | addib,<> 8,$idx,L\$inner ; j++++ | ||
397 | extrd,u $nm0,31,32,$hi1 | ||
398 | |||
399 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] | ||
400 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
401 | ldd 0($xfer),$ab1 | ||
402 | fstds ${fab1},0($xfer) | ||
403 | addl $hi0,$ti1,$ti1 | ||
404 | addl $ti1,$ab1,$ab1 | ||
405 | ldd 8($xfer),$nm1 | ||
406 | fstds ${fnm1},8($xfer) | ||
407 | extrd,u $ab1,31,32,$hi0 | ||
408 | extrd,u $ab1,63,32,$ab1 | ||
409 | ldw 4($tp),$ti0 ; tp[j] | ||
410 | addl $hi1,$nm1,$nm1 | ||
411 | addl $ab1,$nm1,$nm1 | ||
412 | ldd -16($xfer),$ab0 | ||
413 | ldd -8($xfer),$nm0 | ||
414 | extrd,u $nm1,31,32,$hi1 | ||
415 | |||
416 | addl $hi0,$ab0,$ab0 | ||
417 | addl $ti0,$ab0,$ab0 | ||
418 | stw $nm1,-4($tp) ; tp[j-1] | ||
419 | extrd,u $ab0,31,32,$hi0 | ||
420 | ldw 8($tp),$ti1 ; tp[j] | ||
421 | extrd,u $ab0,63,32,$ab0 | ||
422 | addl $hi1,$nm0,$nm0 | ||
423 | ldd 0($xfer),$ab1 | ||
424 | addl $ab0,$nm0,$nm0 | ||
425 | ldd,mb 8($xfer),$nm1 | ||
426 | extrd,u $nm0,31,32,$hi1 | ||
427 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
428 | |||
429 | addib,= -1,$num,L\$outerdone ; i-- | ||
430 | subi 0,$arrsz,$idx ; j=0 | ||
431 | ___ | ||
432 | $code.=<<___ if ($BN_SZ==4); | ||
433 | fldws,ma 4($bp),${fbi} ; bp[i] | ||
434 | ___ | ||
435 | $code.=<<___ if ($BN_SZ==8); | ||
436 | ldi 12,$ti0 ; bp[i] in flipped word order | ||
437 | addl,ev %r0,$num,$num | ||
438 | ldi -4,$ti0 | ||
439 | addl $ti0,$bp,$bp | ||
440 | fldws 0($bp),${fbi} | ||
441 | ___ | ||
442 | $code.=<<___; | ||
443 | flddx $idx($ap),${fai} ; ap[0] | ||
444 | addl $hi0,$ab1,$ab1 | ||
445 | flddx $idx($np),${fni} ; np[0] | ||
446 | fldws 8($xfer),${fti}R ; tp[0] | ||
447 | addl $ti1,$ab1,$ab1 | ||
448 | extrd,u $ab1,31,32,$hi0 | ||
449 | extrd,u $ab1,63,32,$ab1 | ||
450 | |||
451 | ldo 8($idx),$idx ; j++++ | ||
452 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] | ||
453 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] | ||
454 | ldw 4($tp),$ti0 ; tp[j] | ||
455 | |||
456 | addl $hi1,$nm1,$nm1 | ||
457 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
458 | addl $ab1,$nm1,$nm1 | ||
459 | extrd,u $nm1,31,32,$hi1 | ||
460 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
461 | fcpy,sgl %fr0,${fab0}L | ||
462 | stw $nm1,-4($tp) ; tp[j-1] | ||
463 | |||
464 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
465 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
466 | addl $hi1,$hi0,$hi0 | ||
467 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
468 | addl $ti0,$hi0,$hi0 | ||
469 | extrd,u $hi0,31,32,$hi1 | ||
470 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
471 | stw $hi0,0($tp) | ||
472 | stw $hi1,4($tp) | ||
473 | xmpyu ${fn0},${fab0}R,${fm0} | ||
474 | |||
475 | b L\$outer | ||
476 | ldo `$LOCALS+32+4`($fp),$tp | ||
477 | |||
478 | L\$outerdone | ||
479 | addl $hi0,$ab1,$ab1 | ||
480 | addl $ti1,$ab1,$ab1 | ||
481 | extrd,u $ab1,31,32,$hi0 | ||
482 | extrd,u $ab1,63,32,$ab1 | ||
483 | |||
484 | ldw 4($tp),$ti0 ; tp[j] | ||
485 | |||
486 | addl $hi1,$nm1,$nm1 | ||
487 | addl $ab1,$nm1,$nm1 | ||
488 | extrd,u $nm1,31,32,$hi1 | ||
489 | stw $nm1,-4($tp) ; tp[j-1] | ||
490 | |||
491 | addl $hi1,$hi0,$hi0 | ||
492 | addl $ti0,$hi0,$hi0 | ||
493 | extrd,u $hi0,31,32,$hi1 | ||
494 | stw $hi0,0($tp) | ||
495 | stw $hi1,4($tp) | ||
496 | |||
497 | ldo `$LOCALS+32`($fp),$tp | ||
498 | sub %r0,%r0,%r0 ; clear borrow | ||
499 | ___ | ||
500 | $code.=<<___ if ($BN_SZ==4); | ||
501 | ldws,ma 4($tp),$ti0 | ||
502 | extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? | ||
503 | b L\$sub_pa11 | ||
504 | addl $tp,$arrsz,$tp | ||
505 | L\$sub | ||
506 | ldwx $idx($np),$hi0 | ||
507 | subb $ti0,$hi0,$hi1 | ||
508 | ldwx $idx($tp),$ti0 | ||
509 | addib,<> 4,$idx,L\$sub | ||
510 | stws,ma $hi1,4($rp) | ||
511 | |||
512 | subb $ti0,%r0,$hi1 | ||
513 | ldo -4($tp),$tp | ||
514 | ___ | ||
515 | $code.=<<___ if ($BN_SZ==8); | ||
516 | ldd,ma 8($tp),$ti0 | ||
517 | L\$sub | ||
518 | ldd $idx($np),$hi0 | ||
519 | shrpd $ti0,$ti0,32,$ti0 ; flip word order | ||
520 | std $ti0,-8($tp) ; save flipped value | ||
521 | sub,db $ti0,$hi0,$hi1 | ||
522 | ldd,ma 8($tp),$ti0 | ||
523 | addib,<> 8,$idx,L\$sub | ||
524 | std,ma $hi1,8($rp) | ||
525 | |||
526 | extrd,u $ti0,31,32,$ti0 ; carry in flipped word order | ||
527 | sub,db $ti0,%r0,$hi1 | ||
528 | ldo -8($tp),$tp | ||
529 | ___ | ||
530 | $code.=<<___; | ||
531 | and $tp,$hi1,$ap | ||
532 | andcm $rp,$hi1,$bp | ||
533 | or $ap,$bp,$np | ||
534 | |||
535 | sub $rp,$arrsz,$rp ; rewind rp | ||
536 | subi 0,$arrsz,$idx | ||
537 | ldo `$LOCALS+32`($fp),$tp | ||
538 | L\$copy | ||
539 | ldd $idx($np),$hi0 | ||
540 | std,ma %r0,8($tp) | ||
541 | addib,<> 8,$idx,.-8 ; L\$copy | ||
542 | std,ma $hi0,8($rp) | ||
543 | ___ | ||
544 | |||
545 | if ($BN_SZ==4) { # PA-RISC 1.1 code-path | ||
546 | $ablo=$ab0; | ||
547 | $abhi=$ab1; | ||
548 | $nmlo0=$nm0; | ||
549 | $nmhi0=$nm1; | ||
550 | $nmlo1="%r9"; | ||
551 | $nmhi1="%r8"; | ||
552 | |||
553 | $code.=<<___; | ||
554 | b L\$done | ||
555 | nop | ||
556 | |||
557 | .ALIGN 8 | ||
558 | L\$parisc11 | ||
559 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
560 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
561 | ldw -12($xfer),$ablo | ||
562 | ldw -16($xfer),$hi0 | ||
563 | ldw -4($xfer),$nmlo0 | ||
564 | ldw -8($xfer),$nmhi0 | ||
565 | fstds ${fab0},-16($xfer) | ||
566 | fstds ${fnm0},-8($xfer) | ||
567 | |||
568 | ldo 8($idx),$idx ; j++++ | ||
569 | add $ablo,$nmlo0,$nmlo0 ; discarded | ||
570 | addc %r0,$nmhi0,$hi1 | ||
571 | ldw 4($xfer),$ablo | ||
572 | ldw 0($xfer),$abhi | ||
573 | nop | ||
574 | |||
575 | L\$1st_pa11 | ||
576 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] | ||
577 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
578 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
579 | flddx $idx($np),${fni} ; np[j,j+1] | ||
580 | add $hi0,$ablo,$ablo | ||
581 | ldw 12($xfer),$nmlo1 | ||
582 | addc %r0,$abhi,$hi0 | ||
583 | ldw 8($xfer),$nmhi1 | ||
584 | add $ablo,$nmlo1,$nmlo1 | ||
585 | fstds ${fab1},0($xfer) | ||
586 | addc %r0,$nmhi1,$nmhi1 | ||
587 | fstds ${fnm1},8($xfer) | ||
588 | add $hi1,$nmlo1,$nmlo1 | ||
589 | ldw -12($xfer),$ablo | ||
590 | addc %r0,$nmhi1,$hi1 | ||
591 | ldw -16($xfer),$abhi | ||
592 | |||
593 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
594 | ldw -4($xfer),$nmlo0 | ||
595 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
596 | ldw -8($xfer),$nmhi0 | ||
597 | add $hi0,$ablo,$ablo | ||
598 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
599 | addc %r0,$abhi,$hi0 | ||
600 | fstds ${fab0},-16($xfer) | ||
601 | add $ablo,$nmlo0,$nmlo0 | ||
602 | fstds ${fnm0},-8($xfer) | ||
603 | addc %r0,$nmhi0,$nmhi0 | ||
604 | ldw 0($xfer),$abhi | ||
605 | add $hi1,$nmlo0,$nmlo0 | ||
606 | ldw 4($xfer),$ablo | ||
607 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
608 | addib,<> 8,$idx,L\$1st_pa11 ; j++++ | ||
609 | addc %r0,$nmhi0,$hi1 | ||
610 | |||
611 | ldw 8($xfer),$nmhi1 | ||
612 | ldw 12($xfer),$nmlo1 | ||
613 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] | ||
614 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
615 | add $hi0,$ablo,$ablo | ||
616 | fstds ${fab1},0($xfer) | ||
617 | addc %r0,$abhi,$hi0 | ||
618 | fstds ${fnm1},8($xfer) | ||
619 | add $ablo,$nmlo1,$nmlo1 | ||
620 | ldw -16($xfer),$abhi | ||
621 | addc %r0,$nmhi1,$nmhi1 | ||
622 | ldw -12($xfer),$ablo | ||
623 | add $hi1,$nmlo1,$nmlo1 | ||
624 | ldw -8($xfer),$nmhi0 | ||
625 | addc %r0,$nmhi1,$hi1 | ||
626 | ldw -4($xfer),$nmlo0 | ||
627 | |||
628 | add $hi0,$ablo,$ablo | ||
629 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
630 | addc %r0,$abhi,$hi0 | ||
631 | ldw 0($xfer),$abhi | ||
632 | add $ablo,$nmlo0,$nmlo0 | ||
633 | ldw 4($xfer),$ablo | ||
634 | addc %r0,$nmhi0,$nmhi0 | ||
635 | ldws,mb 8($xfer),$nmhi1 | ||
636 | add $hi1,$nmlo0,$nmlo0 | ||
637 | ldw 4($xfer),$nmlo1 | ||
638 | addc %r0,$nmhi0,$hi1 | ||
639 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
640 | |||
641 | ldo -1($num),$num ; i-- | ||
642 | subi 0,$arrsz,$idx ; j=0 | ||
643 | |||
644 | fldws,ma 4($bp),${fbi} ; bp[1] | ||
645 | flddx $idx($ap),${fai} ; ap[0,1] | ||
646 | flddx $idx($np),${fni} ; np[0,1] | ||
647 | fldws 8($xfer),${fti}R ; tp[0] | ||
648 | add $hi0,$ablo,$ablo | ||
649 | addc %r0,$abhi,$hi0 | ||
650 | ldo 8($idx),$idx ; j++++ | ||
651 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] | ||
652 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] | ||
653 | add $hi1,$nmlo1,$nmlo1 | ||
654 | addc %r0,$nmhi1,$nmhi1 | ||
655 | add $ablo,$nmlo1,$nmlo1 | ||
656 | addc %r0,$nmhi1,$hi1 | ||
657 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
658 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
659 | |||
660 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
661 | fcpy,sgl %fr0,${fab0}L | ||
662 | add $hi1,$hi0,$hi0 | ||
663 | addc %r0,%r0,$hi1 | ||
664 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
665 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
666 | stw $hi0,0($tp) | ||
667 | stw $hi1,4($tp) | ||
668 | |||
669 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
670 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
671 | xmpyu ${fn0},${fab0}R,${fm0} | ||
672 | ldo `$LOCALS+32+4`($fp),$tp | ||
673 | L\$outer_pa11 | ||
674 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
675 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
676 | fstds ${fab0},-16($xfer) ; 33-bit value | ||
677 | fstds ${fnm0},-8($xfer) | ||
678 | flddx $idx($ap),${fai} ; ap[2,3] | ||
679 | flddx $idx($np),${fni} ; np[2,3] | ||
680 | ldw -16($xfer),$abhi ; carry bit actually | ||
681 | ldo 8($idx),$idx ; j++++ | ||
682 | ldw -12($xfer),$ablo | ||
683 | ldw -8($xfer),$nmhi0 | ||
684 | ldw -4($xfer),$nmlo0 | ||
685 | ldw 0($xfer),$hi0 ; high part | ||
686 | |||
687 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
688 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
689 | fstds ${fab1},0($xfer) | ||
690 | addl $abhi,$hi0,$hi0 ; account carry bit | ||
691 | fstds ${fnm1},8($xfer) | ||
692 | add $ablo,$nmlo0,$nmlo0 ; discarded | ||
693 | ldw 0($tp),$ti1 ; tp[1] | ||
694 | addc %r0,$nmhi0,$hi1 | ||
695 | fstds ${fab0},-16($xfer) | ||
696 | fstds ${fnm0},-8($xfer) | ||
697 | ldw 4($xfer),$ablo | ||
698 | ldw 0($xfer),$abhi | ||
699 | |||
700 | L\$inner_pa11 | ||
701 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] | ||
702 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
703 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
704 | flddx $idx($np),${fni} ; np[j,j+1] | ||
705 | add $hi0,$ablo,$ablo | ||
706 | ldw 4($tp),$ti0 ; tp[j] | ||
707 | addc %r0,$abhi,$abhi | ||
708 | ldw 12($xfer),$nmlo1 | ||
709 | add $ti1,$ablo,$ablo | ||
710 | ldw 8($xfer),$nmhi1 | ||
711 | addc %r0,$abhi,$hi0 | ||
712 | fstds ${fab1},0($xfer) | ||
713 | add $ablo,$nmlo1,$nmlo1 | ||
714 | fstds ${fnm1},8($xfer) | ||
715 | addc %r0,$nmhi1,$nmhi1 | ||
716 | ldw -12($xfer),$ablo | ||
717 | add $hi1,$nmlo1,$nmlo1 | ||
718 | ldw -16($xfer),$abhi | ||
719 | addc %r0,$nmhi1,$hi1 | ||
720 | |||
721 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
722 | ldw 8($tp),$ti1 ; tp[j] | ||
723 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
724 | ldw -4($xfer),$nmlo0 | ||
725 | add $hi0,$ablo,$ablo | ||
726 | ldw -8($xfer),$nmhi0 | ||
727 | addc %r0,$abhi,$abhi | ||
728 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
729 | add $ti0,$ablo,$ablo | ||
730 | fstds ${fab0},-16($xfer) | ||
731 | addc %r0,$abhi,$hi0 | ||
732 | fstds ${fnm0},-8($xfer) | ||
733 | add $ablo,$nmlo0,$nmlo0 | ||
734 | ldw 4($xfer),$ablo | ||
735 | addc %r0,$nmhi0,$nmhi0 | ||
736 | ldw 0($xfer),$abhi | ||
737 | add $hi1,$nmlo0,$nmlo0 | ||
738 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
739 | addib,<> 8,$idx,L\$inner_pa11 ; j++++ | ||
740 | addc %r0,$nmhi0,$hi1 | ||
741 | |||
742 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] | ||
743 | ldw 12($xfer),$nmlo1 | ||
744 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
745 | ldw 8($xfer),$nmhi1 | ||
746 | add $hi0,$ablo,$ablo | ||
747 | ldw 4($tp),$ti0 ; tp[j] | ||
748 | addc %r0,$abhi,$abhi | ||
749 | fstds ${fab1},0($xfer) | ||
750 | add $ti1,$ablo,$ablo | ||
751 | fstds ${fnm1},8($xfer) | ||
752 | addc %r0,$abhi,$hi0 | ||
753 | ldw -16($xfer),$abhi | ||
754 | add $ablo,$nmlo1,$nmlo1 | ||
755 | ldw -12($xfer),$ablo | ||
756 | addc %r0,$nmhi1,$nmhi1 | ||
757 | ldw -8($xfer),$nmhi0 | ||
758 | add $hi1,$nmlo1,$nmlo1 | ||
759 | ldw -4($xfer),$nmlo0 | ||
760 | addc %r0,$nmhi1,$hi1 | ||
761 | |||
762 | add $hi0,$ablo,$ablo | ||
763 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
764 | addc %r0,$abhi,$abhi | ||
765 | add $ti0,$ablo,$ablo | ||
766 | ldw 8($tp),$ti1 ; tp[j] | ||
767 | addc %r0,$abhi,$hi0 | ||
768 | ldw 0($xfer),$abhi | ||
769 | add $ablo,$nmlo0,$nmlo0 | ||
770 | ldw 4($xfer),$ablo | ||
771 | addc %r0,$nmhi0,$nmhi0 | ||
772 | ldws,mb 8($xfer),$nmhi1 | ||
773 | add $hi1,$nmlo0,$nmlo0 | ||
774 | ldw 4($xfer),$nmlo1 | ||
775 | addc %r0,$nmhi0,$hi1 | ||
776 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
777 | |||
778 | addib,= -1,$num,L\$outerdone_pa11; i-- | ||
779 | subi 0,$arrsz,$idx ; j=0 | ||
780 | |||
781 | fldws,ma 4($bp),${fbi} ; bp[i] | ||
782 | flddx $idx($ap),${fai} ; ap[0] | ||
783 | add $hi0,$ablo,$ablo | ||
784 | addc %r0,$abhi,$abhi | ||
785 | flddx $idx($np),${fni} ; np[0] | ||
786 | fldws 8($xfer),${fti}R ; tp[0] | ||
787 | add $ti1,$ablo,$ablo | ||
788 | addc %r0,$abhi,$hi0 | ||
789 | |||
790 | ldo 8($idx),$idx ; j++++ | ||
791 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] | ||
792 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] | ||
793 | ldw 4($tp),$ti0 ; tp[j] | ||
794 | |||
795 | add $hi1,$nmlo1,$nmlo1 | ||
796 | addc %r0,$nmhi1,$nmhi1 | ||
797 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
798 | add $ablo,$nmlo1,$nmlo1 | ||
799 | addc %r0,$nmhi1,$hi1 | ||
800 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
801 | fcpy,sgl %fr0,${fab0}L | ||
802 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
803 | |||
804 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
805 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
806 | add $hi1,$hi0,$hi0 | ||
807 | addc %r0,%r0,$hi1 | ||
808 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
809 | add $ti0,$hi0,$hi0 | ||
810 | addc %r0,$hi1,$hi1 | ||
811 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
812 | stw $hi0,0($tp) | ||
813 | stw $hi1,4($tp) | ||
814 | xmpyu ${fn0},${fab0}R,${fm0} | ||
815 | |||
816 | b L\$outer_pa11 | ||
817 | ldo `$LOCALS+32+4`($fp),$tp | ||
818 | |||
819 | L\$outerdone_pa11 | ||
820 | add $hi0,$ablo,$ablo | ||
821 | addc %r0,$abhi,$abhi | ||
822 | add $ti1,$ablo,$ablo | ||
823 | addc %r0,$abhi,$hi0 | ||
824 | |||
825 | ldw 4($tp),$ti0 ; tp[j] | ||
826 | |||
827 | add $hi1,$nmlo1,$nmlo1 | ||
828 | addc %r0,$nmhi1,$nmhi1 | ||
829 | add $ablo,$nmlo1,$nmlo1 | ||
830 | addc %r0,$nmhi1,$hi1 | ||
831 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
832 | |||
833 | add $hi1,$hi0,$hi0 | ||
834 | addc %r0,%r0,$hi1 | ||
835 | add $ti0,$hi0,$hi0 | ||
836 | addc %r0,$hi1,$hi1 | ||
837 | stw $hi0,0($tp) | ||
838 | stw $hi1,4($tp) | ||
839 | |||
840 | ldo `$LOCALS+32+4`($fp),$tp | ||
841 | sub %r0,%r0,%r0 ; clear borrow | ||
842 | ldw -4($tp),$ti0 | ||
843 | addl $tp,$arrsz,$tp | ||
844 | L\$sub_pa11 | ||
845 | ldwx $idx($np),$hi0 | ||
846 | subb $ti0,$hi0,$hi1 | ||
847 | ldwx $idx($tp),$ti0 | ||
848 | addib,<> 4,$idx,L\$sub_pa11 | ||
849 | stws,ma $hi1,4($rp) | ||
850 | |||
851 | subb $ti0,%r0,$hi1 | ||
852 | ldo -4($tp),$tp | ||
853 | and $tp,$hi1,$ap | ||
854 | andcm $rp,$hi1,$bp | ||
855 | or $ap,$bp,$np | ||
856 | |||
857 | sub $rp,$arrsz,$rp ; rewind rp | ||
858 | subi 0,$arrsz,$idx | ||
859 | ldo `$LOCALS+32`($fp),$tp | ||
860 | L\$copy_pa11 | ||
861 | ldwx $idx($np),$hi0 | ||
862 | stws,ma %r0,4($tp) | ||
863 | addib,<> 4,$idx,L\$copy_pa11 | ||
864 | stws,ma $hi0,4($rp) | ||
865 | |||
866 | nop ; alignment | ||
867 | L\$done | ||
868 | ___ | ||
869 | } | ||
870 | |||
871 | $code.=<<___; | ||
872 | ldi 1,%r28 ; signal "handled" | ||
873 | ldo $FRAME($fp),%sp ; destroy tp[num+1] | ||
874 | |||
875 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
876 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
877 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
878 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
879 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
880 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
881 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
882 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
883 | L\$abort | ||
884 | bv (%r2) | ||
885 | .EXIT | ||
886 | $POPMB -$FRAME(%sp),%r3 | ||
887 | .PROCEND | ||
888 | .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
889 | ___ | ||
890 | |||
891 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
892 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
893 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
894 | # directive... | ||
895 | |||
896 | my $ldd = sub { | ||
897 | my ($mod,$args) = @_; | ||
898 | my $orig = "ldd$mod\t$args"; | ||
899 | |||
900 | if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 | ||
901 | { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; | ||
902 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
903 | } | ||
904 | elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 | ||
905 | { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; | ||
906 | $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset | ||
907 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
908 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
909 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
910 | } | ||
911 | else { "\t".$orig; } | ||
912 | }; | ||
913 | |||
914 | my $std = sub { | ||
915 | my ($mod,$args) = @_; | ||
916 | my $orig = "std$mod\t$args"; | ||
917 | |||
918 | if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 | ||
919 | { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); | ||
920 | $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset | ||
921 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
922 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
923 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
924 | } | ||
925 | else { "\t".$orig; } | ||
926 | }; | ||
927 | |||
928 | my $extrd = sub { | ||
929 | my ($mod,$args) = @_; | ||
930 | my $orig = "extrd$mod\t$args"; | ||
931 | |||
932 | # I only have ",u" completer, it's implicitly encoded... | ||
933 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 | ||
934 | { my $opcode=(0x36<<26)|($1<<21)|($4<<16); | ||
935 | my $len=32-$3; | ||
936 | $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos | ||
937 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
938 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
939 | } | ||
940 | elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 | ||
941 | { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); | ||
942 | my $len=32-$2; | ||
943 | $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len | ||
944 | $opcode |= (1<<13) if ($mod =~ /,\**=/); | ||
945 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
946 | } | ||
947 | else { "\t".$orig; } | ||
948 | }; | ||
949 | |||
950 | my $shrpd = sub { | ||
951 | my ($mod,$args) = @_; | ||
952 | my $orig = "shrpd$mod\t$args"; | ||
953 | |||
954 | if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 | ||
955 | { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; | ||
956 | my $cpos=63-$3; | ||
957 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa | ||
958 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
959 | } | ||
960 | else { "\t".$orig; } | ||
961 | }; | ||
962 | |||
963 | my $sub = sub { | ||
964 | my ($mod,$args) = @_; | ||
965 | my $orig = "sub$mod\t$args"; | ||
966 | |||
967 | if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { | ||
968 | my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; | ||
969 | $opcode|=(1<<10); # e1 | ||
970 | $opcode|=(1<<8); # e2 | ||
971 | $opcode|=(1<<5); # d | ||
972 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig | ||
973 | } | ||
974 | else { "\t".$orig; } | ||
975 | }; | ||
976 | |||
977 | sub assemble { | ||
978 | my ($mnemonic,$mod,$args)=@_; | ||
979 | my $opcode = eval("\$$mnemonic"); | ||
980 | |||
981 | ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; | ||
982 | } | ||
983 | |||
984 | foreach (split("\n",$code)) { | ||
985 | s/\`([^\`]*)\`/eval $1/ge; | ||
986 | # flip word order in 64-bit mode... | ||
987 | s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); | ||
988 | # assemble 2.0 instructions in 32-bit mode... | ||
989 | s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); | ||
990 | |||
991 | print $_,"\n"; | ||
992 | } | ||
993 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl index 7849eae959..f9b6992ccc 100644 --- a/src/lib/libcrypto/bn/asm/ppc-mont.pl +++ b/src/lib/libcrypto/bn/asm/ppc-mont.pl | |||
@@ -31,7 +31,6 @@ if ($flavour =~ /32/) { | |||
31 | $BNSZ= $BITS/8; | 31 | $BNSZ= $BITS/8; |
32 | $SIZE_T=4; | 32 | $SIZE_T=4; |
33 | $RZONE= 224; | 33 | $RZONE= 224; |
34 | $FRAME= $SIZE_T*16; | ||
35 | 34 | ||
36 | $LD= "lwz"; # load | 35 | $LD= "lwz"; # load |
37 | $LDU= "lwzu"; # load and update | 36 | $LDU= "lwzu"; # load and update |
@@ -51,7 +50,6 @@ if ($flavour =~ /32/) { | |||
51 | $BNSZ= $BITS/8; | 50 | $BNSZ= $BITS/8; |
52 | $SIZE_T=8; | 51 | $SIZE_T=8; |
53 | $RZONE= 288; | 52 | $RZONE= 288; |
54 | $FRAME= $SIZE_T*16; | ||
55 | 53 | ||
56 | # same as above, but 64-bit mnemonics... | 54 | # same as above, but 64-bit mnemonics... |
57 | $LD= "ld"; # load | 55 | $LD= "ld"; # load |
@@ -69,6 +67,9 @@ if ($flavour =~ /32/) { | |||
69 | $POP= $LD; | 67 | $POP= $LD; |
70 | } else { die "nonsense $flavour"; } | 68 | } else { die "nonsense $flavour"; } |
71 | 69 | ||
70 | $FRAME=8*$SIZE_T+$RZONE; | ||
71 | $LOCALS=8*$SIZE_T; | ||
72 | |||
72 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | 73 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
73 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | 74 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or |
74 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | 75 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or |
@@ -89,18 +90,18 @@ $aj="r10"; | |||
89 | $nj="r11"; | 90 | $nj="r11"; |
90 | $tj="r12"; | 91 | $tj="r12"; |
91 | # non-volatile registers | 92 | # non-volatile registers |
92 | $i="r14"; | 93 | $i="r20"; |
93 | $j="r15"; | 94 | $j="r21"; |
94 | $tp="r16"; | 95 | $tp="r22"; |
95 | $m0="r17"; | 96 | $m0="r23"; |
96 | $m1="r18"; | 97 | $m1="r24"; |
97 | $lo0="r19"; | 98 | $lo0="r25"; |
98 | $hi0="r20"; | 99 | $hi0="r26"; |
99 | $lo1="r21"; | 100 | $lo1="r27"; |
100 | $hi1="r22"; | 101 | $hi1="r28"; |
101 | $alo="r23"; | 102 | $alo="r29"; |
102 | $ahi="r24"; | 103 | $ahi="r30"; |
103 | $nlo="r25"; | 104 | $nlo="r31"; |
104 | # | 105 | # |
105 | $nhi="r0"; | 106 | $nhi="r0"; |
106 | 107 | ||
@@ -108,42 +109,48 @@ $code=<<___; | |||
108 | .machine "any" | 109 | .machine "any" |
109 | .text | 110 | .text |
110 | 111 | ||
111 | .globl .bn_mul_mont | 112 | .globl .bn_mul_mont_int |
112 | .align 4 | 113 | .align 4 |
113 | .bn_mul_mont: | 114 | .bn_mul_mont_int: |
114 | cmpwi $num,4 | 115 | cmpwi $num,4 |
115 | mr $rp,r3 ; $rp is reassigned | 116 | mr $rp,r3 ; $rp is reassigned |
116 | li r3,0 | 117 | li r3,0 |
117 | bltlr | 118 | bltlr |
118 | 119 | ___ | |
120 | $code.=<<___ if ($BNSZ==4); | ||
121 | cmpwi $num,32 ; longer key performance is not better | ||
122 | bgelr | ||
123 | ___ | ||
124 | $code.=<<___; | ||
119 | slwi $num,$num,`log($BNSZ)/log(2)` | 125 | slwi $num,$num,`log($BNSZ)/log(2)` |
120 | li $tj,-4096 | 126 | li $tj,-4096 |
121 | addi $ovf,$num,`$FRAME+$RZONE` | 127 | addi $ovf,$num,$FRAME |
122 | subf $ovf,$ovf,$sp ; $sp-$ovf | 128 | subf $ovf,$ovf,$sp ; $sp-$ovf |
123 | and $ovf,$ovf,$tj ; minimize TLB usage | 129 | and $ovf,$ovf,$tj ; minimize TLB usage |
124 | subf $ovf,$sp,$ovf ; $ovf-$sp | 130 | subf $ovf,$sp,$ovf ; $ovf-$sp |
131 | mr $tj,$sp | ||
125 | srwi $num,$num,`log($BNSZ)/log(2)` | 132 | srwi $num,$num,`log($BNSZ)/log(2)` |
126 | $STUX $sp,$sp,$ovf | 133 | $STUX $sp,$sp,$ovf |
127 | 134 | ||
128 | $PUSH r14,`4*$SIZE_T`($sp) | 135 | $PUSH r20,`-12*$SIZE_T`($tj) |
129 | $PUSH r15,`5*$SIZE_T`($sp) | 136 | $PUSH r21,`-11*$SIZE_T`($tj) |
130 | $PUSH r16,`6*$SIZE_T`($sp) | 137 | $PUSH r22,`-10*$SIZE_T`($tj) |
131 | $PUSH r17,`7*$SIZE_T`($sp) | 138 | $PUSH r23,`-9*$SIZE_T`($tj) |
132 | $PUSH r18,`8*$SIZE_T`($sp) | 139 | $PUSH r24,`-8*$SIZE_T`($tj) |
133 | $PUSH r19,`9*$SIZE_T`($sp) | 140 | $PUSH r25,`-7*$SIZE_T`($tj) |
134 | $PUSH r20,`10*$SIZE_T`($sp) | 141 | $PUSH r26,`-6*$SIZE_T`($tj) |
135 | $PUSH r21,`11*$SIZE_T`($sp) | 142 | $PUSH r27,`-5*$SIZE_T`($tj) |
136 | $PUSH r22,`12*$SIZE_T`($sp) | 143 | $PUSH r28,`-4*$SIZE_T`($tj) |
137 | $PUSH r23,`13*$SIZE_T`($sp) | 144 | $PUSH r29,`-3*$SIZE_T`($tj) |
138 | $PUSH r24,`14*$SIZE_T`($sp) | 145 | $PUSH r30,`-2*$SIZE_T`($tj) |
139 | $PUSH r25,`15*$SIZE_T`($sp) | 146 | $PUSH r31,`-1*$SIZE_T`($tj) |
140 | 147 | ||
141 | $LD $n0,0($n0) ; pull n0[0] value | 148 | $LD $n0,0($n0) ; pull n0[0] value |
142 | addi $num,$num,-2 ; adjust $num for counter register | 149 | addi $num,$num,-2 ; adjust $num for counter register |
143 | 150 | ||
144 | $LD $m0,0($bp) ; m0=bp[0] | 151 | $LD $m0,0($bp) ; m0=bp[0] |
145 | $LD $aj,0($ap) ; ap[0] | 152 | $LD $aj,0($ap) ; ap[0] |
146 | addi $tp,$sp,$FRAME | 153 | addi $tp,$sp,$LOCALS |
147 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] | 154 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] |
148 | $UMULH $hi0,$aj,$m0 | 155 | $UMULH $hi0,$aj,$m0 |
149 | 156 | ||
@@ -205,8 +212,8 @@ L1st: | |||
205 | Louter: | 212 | Louter: |
206 | $LDX $m0,$bp,$i ; m0=bp[i] | 213 | $LDX $m0,$bp,$i ; m0=bp[i] |
207 | $LD $aj,0($ap) ; ap[0] | 214 | $LD $aj,0($ap) ; ap[0] |
208 | addi $tp,$sp,$FRAME | 215 | addi $tp,$sp,$LOCALS |
209 | $LD $tj,$FRAME($sp) ; tp[0] | 216 | $LD $tj,$LOCALS($sp); tp[0] |
210 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] | 217 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] |
211 | $UMULH $hi0,$aj,$m0 | 218 | $UMULH $hi0,$aj,$m0 |
212 | $LD $aj,$BNSZ($ap) ; ap[1] | 219 | $LD $aj,$BNSZ($ap) ; ap[1] |
@@ -273,7 +280,7 @@ Linner: | |||
273 | 280 | ||
274 | addi $num,$num,2 ; restore $num | 281 | addi $num,$num,2 ; restore $num |
275 | subfc $j,$j,$j ; j=0 and "clear" XER[CA] | 282 | subfc $j,$j,$j ; j=0 and "clear" XER[CA] |
276 | addi $tp,$sp,$FRAME | 283 | addi $tp,$sp,$LOCALS |
277 | mtctr $num | 284 | mtctr $num |
278 | 285 | ||
279 | .align 4 | 286 | .align 4 |
@@ -299,23 +306,27 @@ Lcopy: ; copy or in-place refresh | |||
299 | addi $j,$j,$BNSZ | 306 | addi $j,$j,$BNSZ |
300 | bdnz- Lcopy | 307 | bdnz- Lcopy |
301 | 308 | ||
302 | $POP r14,`4*$SIZE_T`($sp) | 309 | $POP $tj,0($sp) |
303 | $POP r15,`5*$SIZE_T`($sp) | ||
304 | $POP r16,`6*$SIZE_T`($sp) | ||
305 | $POP r17,`7*$SIZE_T`($sp) | ||
306 | $POP r18,`8*$SIZE_T`($sp) | ||
307 | $POP r19,`9*$SIZE_T`($sp) | ||
308 | $POP r20,`10*$SIZE_T`($sp) | ||
309 | $POP r21,`11*$SIZE_T`($sp) | ||
310 | $POP r22,`12*$SIZE_T`($sp) | ||
311 | $POP r23,`13*$SIZE_T`($sp) | ||
312 | $POP r24,`14*$SIZE_T`($sp) | ||
313 | $POP r25,`15*$SIZE_T`($sp) | ||
314 | $POP $sp,0($sp) | ||
315 | li r3,1 | 310 | li r3,1 |
311 | $POP r20,`-12*$SIZE_T`($tj) | ||
312 | $POP r21,`-11*$SIZE_T`($tj) | ||
313 | $POP r22,`-10*$SIZE_T`($tj) | ||
314 | $POP r23,`-9*$SIZE_T`($tj) | ||
315 | $POP r24,`-8*$SIZE_T`($tj) | ||
316 | $POP r25,`-7*$SIZE_T`($tj) | ||
317 | $POP r26,`-6*$SIZE_T`($tj) | ||
318 | $POP r27,`-5*$SIZE_T`($tj) | ||
319 | $POP r28,`-4*$SIZE_T`($tj) | ||
320 | $POP r29,`-3*$SIZE_T`($tj) | ||
321 | $POP r30,`-2*$SIZE_T`($tj) | ||
322 | $POP r31,`-1*$SIZE_T`($tj) | ||
323 | mr $sp,$tj | ||
316 | blr | 324 | blr |
317 | .long 0 | 325 | .long 0 |
318 | .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" | 326 | .byte 0,12,4,0,0x80,12,6,0 |
327 | .long 0 | ||
328 | |||
329 | .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>" | ||
319 | ___ | 330 | ___ |
320 | 331 | ||
321 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 332 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl index f4093177e6..1249ce2299 100644 --- a/src/lib/libcrypto/bn/asm/ppc.pl +++ b/src/lib/libcrypto/bn/asm/ppc.pl | |||
@@ -389,7 +389,9 @@ $data=<<EOF; | |||
389 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1 | 389 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1 |
390 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2 | 390 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2 |
391 | blr | 391 | blr |
392 | .long 0x00000000 | 392 | .long 0 |
393 | .byte 0,12,0x14,0,0,0,2,0 | ||
394 | .long 0 | ||
393 | 395 | ||
394 | # | 396 | # |
395 | # NOTE: The following label name should be changed to | 397 | # NOTE: The following label name should be changed to |
@@ -814,8 +816,9 @@ $data=<<EOF; | |||
814 | 816 | ||
815 | 817 | ||
816 | blr | 818 | blr |
817 | 819 | .long 0 | |
818 | .long 0x00000000 | 820 | .byte 0,12,0x14,0,0,0,2,0 |
821 | .long 0 | ||
819 | 822 | ||
820 | # | 823 | # |
821 | # NOTE: The following label name should be changed to | 824 | # NOTE: The following label name should be changed to |
@@ -966,7 +969,9 @@ $data=<<EOF; | |||
966 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1 | 969 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1 |
967 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2 | 970 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2 |
968 | blr | 971 | blr |
969 | .long 0x00000000 | 972 | .long 0 |
973 | .byte 0,12,0x14,0,0,0,3,0 | ||
974 | .long 0 | ||
970 | 975 | ||
971 | # | 976 | # |
972 | # NOTE: The following label name should be changed to | 977 | # NOTE: The following label name should be changed to |
@@ -1502,7 +1507,9 @@ $data=<<EOF; | |||
1502 | $ST r12,`14*$BNSZ`(r3) #r[14]=c3; | 1507 | $ST r12,`14*$BNSZ`(r3) #r[14]=c3; |
1503 | $ST r10,`15*$BNSZ`(r3) #r[15]=c1; | 1508 | $ST r10,`15*$BNSZ`(r3) #r[15]=c1; |
1504 | blr | 1509 | blr |
1505 | .long 0x00000000 | 1510 | .long 0 |
1511 | .byte 0,12,0x14,0,0,0,3,0 | ||
1512 | .long 0 | ||
1506 | 1513 | ||
1507 | # | 1514 | # |
1508 | # NOTE: The following label name should be changed to | 1515 | # NOTE: The following label name should be changed to |
@@ -1550,8 +1557,9 @@ Lppcasm_sub_adios: | |||
1550 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 | 1557 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 |
1551 | andi. r3,r3,1 # keep only last bit. | 1558 | andi. r3,r3,1 # keep only last bit. |
1552 | blr | 1559 | blr |
1553 | .long 0x00000000 | 1560 | .long 0 |
1554 | 1561 | .byte 0,12,0x14,0,0,0,4,0 | |
1562 | .long 0 | ||
1555 | 1563 | ||
1556 | # | 1564 | # |
1557 | # NOTE: The following label name should be changed to | 1565 | # NOTE: The following label name should be changed to |
@@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop: | |||
1594 | Lppcasm_add_adios: | 1602 | Lppcasm_add_adios: |
1595 | addze r3,r0 #return carry bit. | 1603 | addze r3,r0 #return carry bit. |
1596 | blr | 1604 | blr |
1597 | .long 0x00000000 | 1605 | .long 0 |
1606 | .byte 0,12,0x14,0,0,0,4,0 | ||
1607 | .long 0 | ||
1598 | 1608 | ||
1599 | # | 1609 | # |
1600 | # NOTE: The following label name should be changed to | 1610 | # NOTE: The following label name should be changed to |
@@ -1707,7 +1717,9 @@ Lppcasm_div8: | |||
1707 | Lppcasm_div9: | 1717 | Lppcasm_div9: |
1708 | or r3,r8,r0 | 1718 | or r3,r8,r0 |
1709 | blr | 1719 | blr |
1710 | .long 0x00000000 | 1720 | .long 0 |
1721 | .byte 0,12,0x14,0,0,0,3,0 | ||
1722 | .long 0 | ||
1711 | 1723 | ||
1712 | # | 1724 | # |
1713 | # NOTE: The following label name should be changed to | 1725 | # NOTE: The following label name should be changed to |
@@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop: | |||
1746 | bdnz- Lppcasm_sqr_mainloop | 1758 | bdnz- Lppcasm_sqr_mainloop |
1747 | Lppcasm_sqr_adios: | 1759 | Lppcasm_sqr_adios: |
1748 | blr | 1760 | blr |
1749 | .long 0x00000000 | 1761 | .long 0 |
1750 | 1762 | .byte 0,12,0x14,0,0,0,3,0 | |
1763 | .long 0 | ||
1751 | 1764 | ||
1752 | # | 1765 | # |
1753 | # NOTE: The following label name should be changed to | 1766 | # NOTE: The following label name should be changed to |
@@ -1850,7 +1863,9 @@ Lppcasm_mw_REM: | |||
1850 | Lppcasm_mw_OVER: | 1863 | Lppcasm_mw_OVER: |
1851 | addi r3,r12,0 | 1864 | addi r3,r12,0 |
1852 | blr | 1865 | blr |
1853 | .long 0x00000000 | 1866 | .long 0 |
1867 | .byte 0,12,0x14,0,0,0,4,0 | ||
1868 | .long 0 | ||
1854 | 1869 | ||
1855 | # | 1870 | # |
1856 | # NOTE: The following label name should be changed to | 1871 | # NOTE: The following label name should be changed to |
@@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover: | |||
1973 | Lppcasm_maw_adios: | 1988 | Lppcasm_maw_adios: |
1974 | addi r3,r12,0 | 1989 | addi r3,r12,0 |
1975 | blr | 1990 | blr |
1976 | .long 0x00000000 | 1991 | .long 0 |
1992 | .byte 0,12,0x14,0,0,0,4,0 | ||
1993 | .long 0 | ||
1977 | .align 4 | 1994 | .align 4 |
1978 | EOF | 1995 | EOF |
1979 | $data =~ s/\`([^\`]*)\`/eval $1/gem; | 1996 | $data =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl index 3449b35855..a14e769ad0 100644 --- a/src/lib/libcrypto/bn/asm/ppc64-mont.pl +++ b/src/lib/libcrypto/bn/asm/ppc64-mont.pl | |||
@@ -45,23 +45,40 @@ | |||
45 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive | 45 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive |
46 | # in absolute terms, but it's apparently the way Power 6 is... | 46 | # in absolute terms, but it's apparently the way Power 6 is... |
47 | 47 | ||
48 | # December 2009 | ||
49 | |||
50 | # Adapted for 32-bit build this module delivers 25-120%, yes, more | ||
51 | # than *twice* for longer keys, performance improvement over 32-bit | ||
52 | # ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes | ||
53 | # even 64-bit integer operations and the trouble is that most PPC | ||
54 | # operating systems don't preserve upper halves of general purpose | ||
55 | # registers upon 32-bit signal delivery. They do preserve them upon | ||
56 | # context switch, but not signalling:-( This means that asynchronous | ||
57 | # signals have to be blocked upon entry to this subroutine. Signal | ||
58 | # masking (and of course complementary unmasking) has quite an impact | ||
59 | # on performance, naturally larger for shorter keys. It's so severe | ||
60 | # that 512-bit key performance can be as low as 1/3 of expected one. | ||
61 | # This is why this routine can be engaged for longer key operations | ||
62 | # only on these OSes, see crypto/ppccap.c for further details. MacOS X | ||
63 | # is an exception from this and doesn't require signal masking, and | ||
64 | # that's where above improvement coefficients were collected. For | ||
65 | # others alternative would be to break dependence on upper halves of | ||
66 | # GPRs by sticking to 32-bit integer operations... | ||
67 | |||
48 | $flavour = shift; | 68 | $flavour = shift; |
49 | 69 | ||
50 | if ($flavour =~ /32/) { | 70 | if ($flavour =~ /32/) { |
51 | $SIZE_T=4; | 71 | $SIZE_T=4; |
52 | $RZONE= 224; | 72 | $RZONE= 224; |
53 | $FRAME= $SIZE_T*12+8*12; | 73 | $fname= "bn_mul_mont_fpu64"; |
54 | $fname= "bn_mul_mont_ppc64"; | ||
55 | 74 | ||
56 | $STUX= "stwux"; # store indexed and update | 75 | $STUX= "stwux"; # store indexed and update |
57 | $PUSH= "stw"; | 76 | $PUSH= "stw"; |
58 | $POP= "lwz"; | 77 | $POP= "lwz"; |
59 | die "not implemented yet"; | ||
60 | } elsif ($flavour =~ /64/) { | 78 | } elsif ($flavour =~ /64/) { |
61 | $SIZE_T=8; | 79 | $SIZE_T=8; |
62 | $RZONE= 288; | 80 | $RZONE= 288; |
63 | $FRAME= $SIZE_T*12+8*12; | 81 | $fname= "bn_mul_mont_fpu64"; |
64 | $fname= "bn_mul_mont"; | ||
65 | 82 | ||
66 | # same as above, but 64-bit mnemonics... | 83 | # same as above, but 64-bit mnemonics... |
67 | $STUX= "stdux"; # store indexed and update | 84 | $STUX= "stdux"; # store indexed and update |
@@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl"; | |||
76 | 93 | ||
77 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | 94 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
78 | 95 | ||
79 | $FRAME=($FRAME+63)&~63; | 96 | $FRAME=64; # padded frame header |
80 | $TRANSFER=16*8; | 97 | $TRANSFER=16*8; |
81 | 98 | ||
82 | $carry="r0"; | 99 | $carry="r0"; |
@@ -93,16 +110,16 @@ $tp="r10"; | |||
93 | $j="r11"; | 110 | $j="r11"; |
94 | $i="r12"; | 111 | $i="r12"; |
95 | # non-volatile registers | 112 | # non-volatile registers |
96 | $nap_d="r14"; # interleaved ap and np in double format | 113 | $nap_d="r22"; # interleaved ap and np in double format |
97 | $a0="r15"; # ap[0] | 114 | $a0="r23"; # ap[0] |
98 | $t0="r16"; # temporary registers | 115 | $t0="r24"; # temporary registers |
99 | $t1="r17"; | 116 | $t1="r25"; |
100 | $t2="r18"; | 117 | $t2="r26"; |
101 | $t3="r19"; | 118 | $t3="r27"; |
102 | $t4="r20"; | 119 | $t4="r28"; |
103 | $t5="r21"; | 120 | $t5="r29"; |
104 | $t6="r22"; | 121 | $t6="r30"; |
105 | $t7="r23"; | 122 | $t7="r31"; |
106 | 123 | ||
107 | # PPC offers enough register bank capacity to unroll inner loops twice | 124 | # PPC offers enough register bank capacity to unroll inner loops twice |
108 | # | 125 | # |
@@ -132,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; | |||
132 | $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; | 149 | $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; |
133 | $dota="f8"; $dotb="f9"; | 150 | $dota="f8"; $dotb="f9"; |
134 | $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; | 151 | $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; |
135 | $N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; | 152 | $N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; |
136 | $T0a="f18"; $T0b="f19"; | 153 | $T0a="f24"; $T0b="f25"; |
137 | $T1a="f20"; $T1b="f21"; | 154 | $T1a="f26"; $T1b="f27"; |
138 | $T2a="f22"; $T2b="f23"; | 155 | $T2a="f28"; $T2b="f29"; |
139 | $T3a="f24"; $T3b="f25"; | 156 | $T3a="f30"; $T3b="f31"; |
140 | 157 | ||
141 | # sp----------->+-------------------------------+ | 158 | # sp----------->+-------------------------------+ |
142 | # | saved sp | | 159 | # | saved sp | |
143 | # +-------------------------------+ | 160 | # +-------------------------------+ |
144 | # | | | ||
145 | # +-------------------------------+ | ||
146 | # | 10 saved gpr, r14-r23 | | ||
147 | # . . | ||
148 | # . . | ||
149 | # +12*size_t +-------------------------------+ | ||
150 | # | 12 saved fpr, f14-f25 | | ||
151 | # . . | 161 | # . . |
152 | # . . | 162 | # +64 +-------------------------------+ |
153 | # +12*8 +-------------------------------+ | ||
154 | # | padding to 64 byte boundary | | ||
155 | # . . | ||
156 | # +X +-------------------------------+ | ||
157 | # | 16 gpr<->fpr transfer zone | | 163 | # | 16 gpr<->fpr transfer zone | |
158 | # . . | 164 | # . . |
159 | # . . | 165 | # . . |
@@ -173,6 +179,16 @@ $T3a="f24"; $T3b="f25"; | |||
173 | # . . | 179 | # . . |
174 | # . . | 180 | # . . |
175 | # +-------------------------------+ | 181 | # +-------------------------------+ |
182 | # . . | ||
183 | # -12*size_t +-------------------------------+ | ||
184 | # | 10 saved gpr, r22-r31 | | ||
185 | # . . | ||
186 | # . . | ||
187 | # -12*8 +-------------------------------+ | ||
188 | # | 12 saved fpr, f20-f31 | | ||
189 | # . . | ||
190 | # . . | ||
191 | # +-------------------------------+ | ||
176 | 192 | ||
177 | $code=<<___; | 193 | $code=<<___; |
178 | .machine "any" | 194 | .machine "any" |
@@ -181,14 +197,14 @@ $code=<<___; | |||
181 | .globl .$fname | 197 | .globl .$fname |
182 | .align 5 | 198 | .align 5 |
183 | .$fname: | 199 | .$fname: |
184 | cmpwi $num,4 | 200 | cmpwi $num,`3*8/$SIZE_T` |
185 | mr $rp,r3 ; $rp is reassigned | 201 | mr $rp,r3 ; $rp is reassigned |
186 | li r3,0 ; possible "not handled" return code | 202 | li r3,0 ; possible "not handled" return code |
187 | bltlr- | 203 | bltlr- |
188 | andi. r0,$num,1 ; $num has to be even | 204 | andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" |
189 | bnelr- | 205 | bnelr- |
190 | 206 | ||
191 | slwi $num,$num,3 ; num*=8 | 207 | slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) |
192 | li $i,-4096 | 208 | li $i,-4096 |
193 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num | 209 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num |
194 | add $tp,$tp,$num ; place for tp[num+1] | 210 | add $tp,$tp,$num ; place for tp[num+1] |
@@ -196,35 +212,50 @@ $code=<<___; | |||
196 | subf $tp,$tp,$sp ; $sp-$tp | 212 | subf $tp,$tp,$sp ; $sp-$tp |
197 | and $tp,$tp,$i ; minimize TLB usage | 213 | and $tp,$tp,$i ; minimize TLB usage |
198 | subf $tp,$sp,$tp ; $tp-$sp | 214 | subf $tp,$sp,$tp ; $tp-$sp |
215 | mr $i,$sp | ||
199 | $STUX $sp,$sp,$tp ; alloca | 216 | $STUX $sp,$sp,$tp ; alloca |
200 | 217 | ||
201 | $PUSH r14,`2*$SIZE_T`($sp) | 218 | $PUSH r22,`-12*8-10*$SIZE_T`($i) |
202 | $PUSH r15,`3*$SIZE_T`($sp) | 219 | $PUSH r23,`-12*8-9*$SIZE_T`($i) |
203 | $PUSH r16,`4*$SIZE_T`($sp) | 220 | $PUSH r24,`-12*8-8*$SIZE_T`($i) |
204 | $PUSH r17,`5*$SIZE_T`($sp) | 221 | $PUSH r25,`-12*8-7*$SIZE_T`($i) |
205 | $PUSH r18,`6*$SIZE_T`($sp) | 222 | $PUSH r26,`-12*8-6*$SIZE_T`($i) |
206 | $PUSH r19,`7*$SIZE_T`($sp) | 223 | $PUSH r27,`-12*8-5*$SIZE_T`($i) |
207 | $PUSH r20,`8*$SIZE_T`($sp) | 224 | $PUSH r28,`-12*8-4*$SIZE_T`($i) |
208 | $PUSH r21,`9*$SIZE_T`($sp) | 225 | $PUSH r29,`-12*8-3*$SIZE_T`($i) |
209 | $PUSH r22,`10*$SIZE_T`($sp) | 226 | $PUSH r30,`-12*8-2*$SIZE_T`($i) |
210 | $PUSH r23,`11*$SIZE_T`($sp) | 227 | $PUSH r31,`-12*8-1*$SIZE_T`($i) |
211 | stfd f14,`12*$SIZE_T+0`($sp) | 228 | stfd f20,`-12*8`($i) |
212 | stfd f15,`12*$SIZE_T+8`($sp) | 229 | stfd f21,`-11*8`($i) |
213 | stfd f16,`12*$SIZE_T+16`($sp) | 230 | stfd f22,`-10*8`($i) |
214 | stfd f17,`12*$SIZE_T+24`($sp) | 231 | stfd f23,`-9*8`($i) |
215 | stfd f18,`12*$SIZE_T+32`($sp) | 232 | stfd f24,`-8*8`($i) |
216 | stfd f19,`12*$SIZE_T+40`($sp) | 233 | stfd f25,`-7*8`($i) |
217 | stfd f20,`12*$SIZE_T+48`($sp) | 234 | stfd f26,`-6*8`($i) |
218 | stfd f21,`12*$SIZE_T+56`($sp) | 235 | stfd f27,`-5*8`($i) |
219 | stfd f22,`12*$SIZE_T+64`($sp) | 236 | stfd f28,`-4*8`($i) |
220 | stfd f23,`12*$SIZE_T+72`($sp) | 237 | stfd f29,`-3*8`($i) |
221 | stfd f24,`12*$SIZE_T+80`($sp) | 238 | stfd f30,`-2*8`($i) |
222 | stfd f25,`12*$SIZE_T+88`($sp) | 239 | stfd f31,`-1*8`($i) |
223 | 240 | ___ | |
241 | $code.=<<___ if ($SIZE_T==8); | ||
224 | ld $a0,0($ap) ; pull ap[0] value | 242 | ld $a0,0($ap) ; pull ap[0] value |
225 | ld $n0,0($n0) ; pull n0[0] value | 243 | ld $n0,0($n0) ; pull n0[0] value |
226 | ld $t3,0($bp) ; bp[0] | 244 | ld $t3,0($bp) ; bp[0] |
227 | 245 | ___ | |
246 | $code.=<<___ if ($SIZE_T==4); | ||
247 | mr $t1,$n0 | ||
248 | lwz $a0,0($ap) ; pull ap[0,1] value | ||
249 | lwz $t0,4($ap) | ||
250 | lwz $n0,0($t1) ; pull n0[0,1] value | ||
251 | lwz $t1,4($t1) | ||
252 | lwz $t3,0($bp) ; bp[0,1] | ||
253 | lwz $t2,4($bp) | ||
254 | insrdi $a0,$t0,32,0 | ||
255 | insrdi $n0,$t1,32,0 | ||
256 | insrdi $t3,$t2,32,0 | ||
257 | ___ | ||
258 | $code.=<<___; | ||
228 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64` | 259 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64` |
229 | li $i,-64 | 260 | li $i,-64 |
230 | add $nap_d,$tp,$num | 261 | add $nap_d,$tp,$num |
@@ -258,6 +289,8 @@ $code=<<___; | |||
258 | std $t5,`$FRAME+40`($sp) | 289 | std $t5,`$FRAME+40`($sp) |
259 | std $t6,`$FRAME+48`($sp) | 290 | std $t6,`$FRAME+48`($sp) |
260 | std $t7,`$FRAME+56`($sp) | 291 | std $t7,`$FRAME+56`($sp) |
292 | ___ | ||
293 | $code.=<<___ if ($SIZE_T==8); | ||
261 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | 294 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair |
262 | lwz $t1,0($ap) | 295 | lwz $t1,0($ap) |
263 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | 296 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair |
@@ -266,6 +299,18 @@ $code=<<___; | |||
266 | lwz $t5,0($np) | 299 | lwz $t5,0($np) |
267 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | 300 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair |
268 | lwz $t7,8($np) | 301 | lwz $t7,8($np) |
302 | ___ | ||
303 | $code.=<<___ if ($SIZE_T==4); | ||
304 | lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs | ||
305 | lwz $t1,4($ap) | ||
306 | lwz $t2,8($ap) | ||
307 | lwz $t3,12($ap) | ||
308 | lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs | ||
309 | lwz $t5,4($np) | ||
310 | lwz $t6,8($np) | ||
311 | lwz $t7,12($np) | ||
312 | ___ | ||
313 | $code.=<<___; | ||
269 | lfd $ba,`$FRAME+0`($sp) | 314 | lfd $ba,`$FRAME+0`($sp) |
270 | lfd $bb,`$FRAME+8`($sp) | 315 | lfd $bb,`$FRAME+8`($sp) |
271 | lfd $bc,`$FRAME+16`($sp) | 316 | lfd $bc,`$FRAME+16`($sp) |
@@ -374,6 +419,8 @@ $code=<<___; | |||
374 | 419 | ||
375 | .align 5 | 420 | .align 5 |
376 | L1st: | 421 | L1st: |
422 | ___ | ||
423 | $code.=<<___ if ($SIZE_T==8); | ||
377 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | 424 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair |
378 | lwz $t1,0($ap) | 425 | lwz $t1,0($ap) |
379 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | 426 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair |
@@ -382,6 +429,18 @@ L1st: | |||
382 | lwz $t5,0($np) | 429 | lwz $t5,0($np) |
383 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | 430 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair |
384 | lwz $t7,8($np) | 431 | lwz $t7,8($np) |
432 | ___ | ||
433 | $code.=<<___ if ($SIZE_T==4); | ||
434 | lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs | ||
435 | lwz $t1,4($ap) | ||
436 | lwz $t2,8($ap) | ||
437 | lwz $t3,12($ap) | ||
438 | lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs | ||
439 | lwz $t5,4($np) | ||
440 | lwz $t6,8($np) | ||
441 | lwz $t7,12($np) | ||
442 | ___ | ||
443 | $code.=<<___; | ||
385 | std $t0,`$FRAME+64`($sp) | 444 | std $t0,`$FRAME+64`($sp) |
386 | std $t1,`$FRAME+72`($sp) | 445 | std $t1,`$FRAME+72`($sp) |
387 | std $t2,`$FRAME+80`($sp) | 446 | std $t2,`$FRAME+80`($sp) |
@@ -559,7 +618,17 @@ L1st: | |||
559 | li $i,8 ; i=1 | 618 | li $i,8 ; i=1 |
560 | .align 5 | 619 | .align 5 |
561 | Louter: | 620 | Louter: |
621 | ___ | ||
622 | $code.=<<___ if ($SIZE_T==8); | ||
562 | ldx $t3,$bp,$i ; bp[i] | 623 | ldx $t3,$bp,$i ; bp[i] |
624 | ___ | ||
625 | $code.=<<___ if ($SIZE_T==4); | ||
626 | add $t0,$bp,$i | ||
627 | lwz $t3,0($t0) ; bp[i,i+1] | ||
628 | lwz $t0,4($t0) | ||
629 | insrdi $t3,$t0,32,0 | ||
630 | ___ | ||
631 | $code.=<<___; | ||
563 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] | 632 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] |
564 | mulld $t7,$a0,$t3 ; ap[0]*bp[i] | 633 | mulld $t7,$a0,$t3 ; ap[0]*bp[i] |
565 | 634 | ||
@@ -761,6 +830,13 @@ Linner: | |||
761 | stfd $T0b,`$FRAME+8`($sp) | 830 | stfd $T0b,`$FRAME+8`($sp) |
762 | add $t7,$t7,$carry | 831 | add $t7,$t7,$carry |
763 | addc $t3,$t0,$t1 | 832 | addc $t3,$t0,$t1 |
833 | ___ | ||
834 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
835 | extrdi $t0,$t0,32,0 | ||
836 | extrdi $t1,$t1,32,0 | ||
837 | adde $t0,$t0,$t1 | ||
838 | ___ | ||
839 | $code.=<<___; | ||
764 | stfd $T1a,`$FRAME+16`($sp) | 840 | stfd $T1a,`$FRAME+16`($sp) |
765 | stfd $T1b,`$FRAME+24`($sp) | 841 | stfd $T1b,`$FRAME+24`($sp) |
766 | insrdi $t4,$t7,16,0 ; 64..127 bits | 842 | insrdi $t4,$t7,16,0 ; 64..127 bits |
@@ -768,6 +844,13 @@ Linner: | |||
768 | stfd $T2a,`$FRAME+32`($sp) | 844 | stfd $T2a,`$FRAME+32`($sp) |
769 | stfd $T2b,`$FRAME+40`($sp) | 845 | stfd $T2b,`$FRAME+40`($sp) |
770 | adde $t5,$t4,$t2 | 846 | adde $t5,$t4,$t2 |
847 | ___ | ||
848 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
849 | extrdi $t4,$t4,32,0 | ||
850 | extrdi $t2,$t2,32,0 | ||
851 | adde $t4,$t4,$t2 | ||
852 | ___ | ||
853 | $code.=<<___; | ||
771 | stfd $T3a,`$FRAME+48`($sp) | 854 | stfd $T3a,`$FRAME+48`($sp) |
772 | stfd $T3b,`$FRAME+56`($sp) | 855 | stfd $T3b,`$FRAME+56`($sp) |
773 | addze $carry,$carry | 856 | addze $carry,$carry |
@@ -816,7 +899,21 @@ Linner: | |||
816 | ld $t7,`$FRAME+72`($sp) | 899 | ld $t7,`$FRAME+72`($sp) |
817 | 900 | ||
818 | addc $t3,$t0,$t1 | 901 | addc $t3,$t0,$t1 |
902 | ___ | ||
903 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
904 | extrdi $t0,$t0,32,0 | ||
905 | extrdi $t1,$t1,32,0 | ||
906 | adde $t0,$t0,$t1 | ||
907 | ___ | ||
908 | $code.=<<___; | ||
819 | adde $t5,$t4,$t2 | 909 | adde $t5,$t4,$t2 |
910 | ___ | ||
911 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
912 | extrdi $t4,$t4,32,0 | ||
913 | extrdi $t2,$t2,32,0 | ||
914 | adde $t4,$t4,$t2 | ||
915 | ___ | ||
916 | $code.=<<___; | ||
820 | addze $carry,$carry | 917 | addze $carry,$carry |
821 | 918 | ||
822 | std $t3,-16($tp) ; tp[j-1] | 919 | std $t3,-16($tp) ; tp[j-1] |
@@ -835,7 +932,9 @@ Linner: | |||
835 | subf $nap_d,$t7,$nap_d ; rewind pointer | 932 | subf $nap_d,$t7,$nap_d ; rewind pointer |
836 | cmpw $i,$num | 933 | cmpw $i,$num |
837 | blt- Louter | 934 | blt- Louter |
935 | ___ | ||
838 | 936 | ||
937 | $code.=<<___ if ($SIZE_T==8); | ||
839 | subf $np,$num,$np ; rewind np | 938 | subf $np,$num,$np ; rewind np |
840 | addi $j,$j,1 ; restore counter | 939 | addi $j,$j,1 ; restore counter |
841 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] | 940 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] |
@@ -883,34 +982,105 @@ Lcopy: ; copy or in-place refresh | |||
883 | stdx $i,$t4,$i | 982 | stdx $i,$t4,$i |
884 | addi $i,$i,16 | 983 | addi $i,$i,16 |
885 | bdnz- Lcopy | 984 | bdnz- Lcopy |
985 | ___ | ||
986 | $code.=<<___ if ($SIZE_T==4); | ||
987 | subf $np,$num,$np ; rewind np | ||
988 | addi $j,$j,1 ; restore counter | ||
989 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] | ||
990 | addi $tp,$sp,`$FRAME+$TRANSFER` | ||
991 | addi $np,$np,-4 | ||
992 | addi $rp,$rp,-4 | ||
993 | addi $ap,$sp,`$FRAME+$TRANSFER+4` | ||
994 | mtctr $j | ||
995 | |||
996 | .align 4 | ||
997 | Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order | ||
998 | ldu $t2,16($tp) | ||
999 | lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order | ||
1000 | lwz $t5,8($np) | ||
1001 | lwz $t6,12($np) | ||
1002 | lwzu $t7,16($np) | ||
1003 | extrdi $t1,$t0,32,0 | ||
1004 | extrdi $t3,$t2,32,0 | ||
1005 | subfe $t4,$t4,$t0 ; tp[j]-np[j] | ||
1006 | stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order | ||
1007 | subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] | ||
1008 | stw $t1,8($ap) | ||
1009 | subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] | ||
1010 | stw $t2,12($ap) | ||
1011 | subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] | ||
1012 | stwu $t3,16($ap) | ||
1013 | stw $t4,4($rp) | ||
1014 | stw $t5,8($rp) | ||
1015 | stw $t6,12($rp) | ||
1016 | stwu $t7,16($rp) | ||
1017 | bdnz- Lsub | ||
1018 | |||
1019 | li $i,0 | ||
1020 | subfe $ovf,$i,$ovf ; handle upmost overflow bit | ||
1021 | addi $tp,$sp,`$FRAME+$TRANSFER+4` | ||
1022 | subf $rp,$num,$rp ; rewind rp | ||
1023 | and $ap,$tp,$ovf | ||
1024 | andc $np,$rp,$ovf | ||
1025 | or $ap,$ap,$np ; ap=borrow?tp:rp | ||
1026 | addi $tp,$sp,`$FRAME+$TRANSFER` | ||
1027 | mtctr $j | ||
1028 | |||
1029 | .align 4 | ||
1030 | Lcopy: ; copy or in-place refresh | ||
1031 | lwz $t0,4($ap) | ||
1032 | lwz $t1,8($ap) | ||
1033 | lwz $t2,12($ap) | ||
1034 | lwzu $t3,16($ap) | ||
1035 | std $i,8($nap_d) ; zap nap_d | ||
1036 | std $i,16($nap_d) | ||
1037 | std $i,24($nap_d) | ||
1038 | std $i,32($nap_d) | ||
1039 | std $i,40($nap_d) | ||
1040 | std $i,48($nap_d) | ||
1041 | std $i,56($nap_d) | ||
1042 | stdu $i,64($nap_d) | ||
1043 | stw $t0,4($rp) | ||
1044 | stw $t1,8($rp) | ||
1045 | stw $t2,12($rp) | ||
1046 | stwu $t3,16($rp) | ||
1047 | std $i,8($tp) ; zap tp at once | ||
1048 | stdu $i,16($tp) | ||
1049 | bdnz- Lcopy | ||
1050 | ___ | ||
886 | 1051 | ||
887 | $POP r14,`2*$SIZE_T`($sp) | 1052 | $code.=<<___; |
888 | $POP r15,`3*$SIZE_T`($sp) | 1053 | $POP $i,0($sp) |
889 | $POP r16,`4*$SIZE_T`($sp) | ||
890 | $POP r17,`5*$SIZE_T`($sp) | ||
891 | $POP r18,`6*$SIZE_T`($sp) | ||
892 | $POP r19,`7*$SIZE_T`($sp) | ||
893 | $POP r20,`8*$SIZE_T`($sp) | ||
894 | $POP r21,`9*$SIZE_T`($sp) | ||
895 | $POP r22,`10*$SIZE_T`($sp) | ||
896 | $POP r23,`11*$SIZE_T`($sp) | ||
897 | lfd f14,`12*$SIZE_T+0`($sp) | ||
898 | lfd f15,`12*$SIZE_T+8`($sp) | ||
899 | lfd f16,`12*$SIZE_T+16`($sp) | ||
900 | lfd f17,`12*$SIZE_T+24`($sp) | ||
901 | lfd f18,`12*$SIZE_T+32`($sp) | ||
902 | lfd f19,`12*$SIZE_T+40`($sp) | ||
903 | lfd f20,`12*$SIZE_T+48`($sp) | ||
904 | lfd f21,`12*$SIZE_T+56`($sp) | ||
905 | lfd f22,`12*$SIZE_T+64`($sp) | ||
906 | lfd f23,`12*$SIZE_T+72`($sp) | ||
907 | lfd f24,`12*$SIZE_T+80`($sp) | ||
908 | lfd f25,`12*$SIZE_T+88`($sp) | ||
909 | $POP $sp,0($sp) | ||
910 | li r3,1 ; signal "handled" | 1054 | li r3,1 ; signal "handled" |
1055 | $POP r22,`-12*8-10*$SIZE_T`($i) | ||
1056 | $POP r23,`-12*8-9*$SIZE_T`($i) | ||
1057 | $POP r24,`-12*8-8*$SIZE_T`($i) | ||
1058 | $POP r25,`-12*8-7*$SIZE_T`($i) | ||
1059 | $POP r26,`-12*8-6*$SIZE_T`($i) | ||
1060 | $POP r27,`-12*8-5*$SIZE_T`($i) | ||
1061 | $POP r28,`-12*8-4*$SIZE_T`($i) | ||
1062 | $POP r29,`-12*8-3*$SIZE_T`($i) | ||
1063 | $POP r30,`-12*8-2*$SIZE_T`($i) | ||
1064 | $POP r31,`-12*8-1*$SIZE_T`($i) | ||
1065 | lfd f20,`-12*8`($i) | ||
1066 | lfd f21,`-11*8`($i) | ||
1067 | lfd f22,`-10*8`($i) | ||
1068 | lfd f23,`-9*8`($i) | ||
1069 | lfd f24,`-8*8`($i) | ||
1070 | lfd f25,`-7*8`($i) | ||
1071 | lfd f26,`-6*8`($i) | ||
1072 | lfd f27,`-5*8`($i) | ||
1073 | lfd f28,`-4*8`($i) | ||
1074 | lfd f29,`-3*8`($i) | ||
1075 | lfd f30,`-2*8`($i) | ||
1076 | lfd f31,`-1*8`($i) | ||
1077 | mr $sp,$i | ||
911 | blr | 1078 | blr |
912 | .long 0 | 1079 | .long 0 |
913 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" | 1080 | .byte 0,12,4,0,0x8c,10,6,0 |
1081 | .long 0 | ||
1082 | |||
1083 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" | ||
914 | ___ | 1084 | ___ |
915 | 1085 | ||
916 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 1086 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl new file mode 100644 index 0000000000..cd9f13eca2 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl | |||
@@ -0,0 +1,221 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # May 2011 | ||
11 | # | ||
12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
14 | # the time being... gcc 4.3 appeared to generate poor code, therefore | ||
15 | # the effort. And indeed, the module delivers 55%-90%(*) improvement | ||
16 | # on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit | ||
17 | # key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196. | ||
18 | # This is for 64-bit build. In 32-bit "highgprs" case improvement is | ||
19 | # even higher, for example on z990 it was measured 80%-150%. ECDSA | ||
20 | # sign is modest 9%-12% faster. Keep in mind that these coefficients | ||
21 | # are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is | ||
22 | # burnt in it... | ||
23 | # | ||
24 | # (*) gcc 4.1 was observed to deliver better results than gcc 4.3, | ||
25 | # so that improvement coefficients can vary from one specific | ||
26 | # setup to another. | ||
27 | |||
28 | $flavour = shift; | ||
29 | |||
30 | if ($flavour =~ /3[12]/) { | ||
31 | $SIZE_T=4; | ||
32 | $g=""; | ||
33 | } else { | ||
34 | $SIZE_T=8; | ||
35 | $g="g"; | ||
36 | } | ||
37 | |||
38 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
39 | open STDOUT,">$output"; | ||
40 | |||
41 | $stdframe=16*$SIZE_T+4*8; | ||
42 | |||
43 | $rp="%r2"; | ||
44 | $a1="%r3"; | ||
45 | $a0="%r4"; | ||
46 | $b1="%r5"; | ||
47 | $b0="%r6"; | ||
48 | |||
49 | $ra="%r14"; | ||
50 | $sp="%r15"; | ||
51 | |||
52 | @T=("%r0","%r1"); | ||
53 | @i=("%r12","%r13"); | ||
54 | |||
55 | ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11)); | ||
56 | ($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8; | ||
57 | |||
58 | $code.=<<___; | ||
59 | .text | ||
60 | |||
61 | .type _mul_1x1,\@function | ||
62 | .align 16 | ||
63 | _mul_1x1: | ||
64 | lgr $a1,$a | ||
65 | sllg $a2,$a,1 | ||
66 | sllg $a4,$a,2 | ||
67 | sllg $a8,$a,3 | ||
68 | |||
69 | srag $lo,$a1,63 # broadcast 63rd bit | ||
70 | nihh $a1,0x1fff | ||
71 | srag @i[0],$a2,63 # broadcast 62nd bit | ||
72 | nihh $a2,0x3fff | ||
73 | srag @i[1],$a4,63 # broadcast 61st bit | ||
74 | nihh $a4,0x7fff | ||
75 | ngr $lo,$b | ||
76 | ngr @i[0],$b | ||
77 | ngr @i[1],$b | ||
78 | |||
79 | lghi @T[0],0 | ||
80 | lgr $a12,$a1 | ||
81 | stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0 | ||
82 | xgr $a12,$a2 | ||
83 | stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1 | ||
84 | lgr $a48,$a4 | ||
85 | stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2 | ||
86 | xgr $a48,$a8 | ||
87 | stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2 | ||
88 | xgr $a1,$a4 | ||
89 | |||
90 | stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4 | ||
91 | xgr $a2,$a4 | ||
92 | stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4 | ||
93 | xgr $a12,$a4 | ||
94 | stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4 | ||
95 | xgr $a1,$a48 | ||
96 | stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4 | ||
97 | xgr $a2,$a48 | ||
98 | |||
99 | stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8 | ||
100 | xgr $a12,$a48 | ||
101 | stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8 | ||
102 | xgr $a1,$a4 | ||
103 | stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8 | ||
104 | xgr $a2,$a4 | ||
105 | stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8 | ||
106 | |||
107 | xgr $a12,$a4 | ||
108 | stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8 | ||
109 | srlg $hi,$lo,1 | ||
110 | stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8 | ||
111 | sllg $lo,$lo,63 | ||
112 | stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8 | ||
113 | srlg @T[0],@i[0],2 | ||
114 | stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8 | ||
115 | |||
116 | lghi $mask,`0xf<<3` | ||
117 | sllg $a1,@i[0],62 | ||
118 | sllg @i[0],$b,3 | ||
119 | srlg @T[1],@i[1],3 | ||
120 | ngr @i[0],$mask | ||
121 | sllg $a2,@i[1],61 | ||
122 | srlg @i[1],$b,4-3 | ||
123 | xgr $hi,@T[0] | ||
124 | ngr @i[1],$mask | ||
125 | xgr $lo,$a1 | ||
126 | xgr $hi,@T[1] | ||
127 | xgr $lo,$a2 | ||
128 | |||
129 | xg $lo,$stdframe(@i[0],$sp) | ||
130 | srlg @i[0],$b,8-3 | ||
131 | ngr @i[0],$mask | ||
132 | ___ | ||
133 | for($n=1;$n<14;$n++) { | ||
134 | $code.=<<___; | ||
135 | lg @T[1],$stdframe(@i[1],$sp) | ||
136 | srlg @i[1],$b,`($n+2)*4`-3 | ||
137 | sllg @T[0],@T[1],`$n*4` | ||
138 | ngr @i[1],$mask | ||
139 | srlg @T[1],@T[1],`64-$n*4` | ||
140 | xgr $lo,@T[0] | ||
141 | xgr $hi,@T[1] | ||
142 | ___ | ||
143 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
144 | } | ||
145 | $code.=<<___; | ||
146 | lg @T[1],$stdframe(@i[1],$sp) | ||
147 | sllg @T[0],@T[1],`$n*4` | ||
148 | srlg @T[1],@T[1],`64-$n*4` | ||
149 | xgr $lo,@T[0] | ||
150 | xgr $hi,@T[1] | ||
151 | |||
152 | lg @T[0],$stdframe(@i[0],$sp) | ||
153 | sllg @T[1],@T[0],`($n+1)*4` | ||
154 | srlg @T[0],@T[0],`64-($n+1)*4` | ||
155 | xgr $lo,@T[1] | ||
156 | xgr $hi,@T[0] | ||
157 | |||
158 | br $ra | ||
159 | .size _mul_1x1,.-_mul_1x1 | ||
160 | |||
161 | .globl bn_GF2m_mul_2x2 | ||
162 | .type bn_GF2m_mul_2x2,\@function | ||
163 | .align 16 | ||
164 | bn_GF2m_mul_2x2: | ||
165 | stm${g} %r3,%r15,3*$SIZE_T($sp) | ||
166 | |||
167 | lghi %r1,-$stdframe-128 | ||
168 | la %r0,0($sp) | ||
169 | la $sp,0(%r1,$sp) # alloca | ||
170 | st${g} %r0,0($sp) # back chain | ||
171 | ___ | ||
172 | if ($SIZE_T==8) { | ||
173 | my @r=map("%r$_",(6..9)); | ||
174 | $code.=<<___; | ||
175 | bras $ra,_mul_1x1 # a1·b1 | ||
176 | stmg $lo,$hi,16($rp) | ||
177 | |||
178 | lg $a,`$stdframe+128+4*$SIZE_T`($sp) | ||
179 | lg $b,`$stdframe+128+6*$SIZE_T`($sp) | ||
180 | bras $ra,_mul_1x1 # a0·b0 | ||
181 | stmg $lo,$hi,0($rp) | ||
182 | |||
183 | lg $a,`$stdframe+128+3*$SIZE_T`($sp) | ||
184 | lg $b,`$stdframe+128+5*$SIZE_T`($sp) | ||
185 | xg $a,`$stdframe+128+4*$SIZE_T`($sp) | ||
186 | xg $b,`$stdframe+128+6*$SIZE_T`($sp) | ||
187 | bras $ra,_mul_1x1 # (a0+a1)·(b0+b1) | ||
188 | lmg @r[0],@r[3],0($rp) | ||
189 | |||
190 | xgr $lo,$hi | ||
191 | xgr $hi,@r[1] | ||
192 | xgr $lo,@r[0] | ||
193 | xgr $hi,@r[2] | ||
194 | xgr $lo,@r[3] | ||
195 | xgr $hi,@r[3] | ||
196 | xgr $lo,$hi | ||
197 | stg $hi,16($rp) | ||
198 | stg $lo,8($rp) | ||
199 | ___ | ||
200 | } else { | ||
201 | $code.=<<___; | ||
202 | sllg %r3,%r3,32 | ||
203 | sllg %r5,%r5,32 | ||
204 | or %r3,%r4 | ||
205 | or %r5,%r6 | ||
206 | bras $ra,_mul_1x1 | ||
207 | rllg $lo,$lo,32 | ||
208 | rllg $hi,$hi,32 | ||
209 | stmg $lo,$hi,0($rp) | ||
210 | ___ | ||
211 | } | ||
212 | $code.=<<___; | ||
213 | lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp) | ||
214 | br $ra | ||
215 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
216 | .string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
217 | ___ | ||
218 | |||
219 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
220 | print $code; | ||
221 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl index f61246f5b6..9fd64e81ee 100644 --- a/src/lib/libcrypto/bn/asm/s390x-mont.pl +++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl | |||
@@ -32,6 +32,33 @@ | |||
32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, | 32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, |
33 | # make inner loops counter-based. | 33 | # make inner loops counter-based. |
34 | 34 | ||
35 | # November 2010. | ||
36 | # | ||
37 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
38 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
39 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
40 | # application context. The feature is not specific to any particular | ||
41 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
42 | # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG | ||
43 | # is achieved by swapping words after 64-bit loads, follow _dswap-s. | ||
44 | # On z990 it was measured to perform 2.6-2.2 times better than | ||
45 | # compiler-generated code, less for longer keys... | ||
46 | |||
47 | $flavour = shift; | ||
48 | |||
49 | if ($flavour =~ /3[12]/) { | ||
50 | $SIZE_T=4; | ||
51 | $g=""; | ||
52 | } else { | ||
53 | $SIZE_T=8; | ||
54 | $g="g"; | ||
55 | } | ||
56 | |||
57 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
58 | open STDOUT,">$output"; | ||
59 | |||
60 | $stdframe=16*$SIZE_T+4*8; | ||
61 | |||
35 | $mn0="%r0"; | 62 | $mn0="%r0"; |
36 | $num="%r1"; | 63 | $num="%r1"; |
37 | 64 | ||
@@ -60,34 +87,44 @@ $code.=<<___; | |||
60 | .globl bn_mul_mont | 87 | .globl bn_mul_mont |
61 | .type bn_mul_mont,\@function | 88 | .type bn_mul_mont,\@function |
62 | bn_mul_mont: | 89 | bn_mul_mont: |
63 | lgf $num,164($sp) # pull $num | 90 | lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num |
64 | sla $num,3 # $num to enumerate bytes | 91 | sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes |
65 | la $bp,0($num,$bp) | 92 | la $bp,0($num,$bp) |
66 | 93 | ||
67 | stg %r2,16($sp) | 94 | st${g} %r2,2*$SIZE_T($sp) |
68 | 95 | ||
69 | cghi $num,16 # | 96 | cghi $num,16 # |
70 | lghi %r2,0 # | 97 | lghi %r2,0 # |
71 | blr %r14 # if($num<16) return 0; | 98 | blr %r14 # if($num<16) return 0; |
99 | ___ | ||
100 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
101 | tmll $num,4 | ||
102 | bnzr %r14 # if ($num&1) return 0; | ||
103 | ___ | ||
104 | $code.=<<___ if ($flavour !~ /3[12]/); | ||
72 | cghi $num,96 # | 105 | cghi $num,96 # |
73 | bhr %r14 # if($num>96) return 0; | 106 | bhr %r14 # if($num>96) return 0; |
107 | ___ | ||
108 | $code.=<<___; | ||
109 | stm${g} %r3,%r15,3*$SIZE_T($sp) | ||
74 | 110 | ||
75 | stmg %r3,%r15,24($sp) | 111 | lghi $rp,-$stdframe-8 # leave room for carry bit |
76 | |||
77 | lghi $rp,-160-8 # leave room for carry bit | ||
78 | lcgr $j,$num # -$num | 112 | lcgr $j,$num # -$num |
79 | lgr %r0,$sp | 113 | lgr %r0,$sp |
80 | la $rp,0($rp,$sp) | 114 | la $rp,0($rp,$sp) |
81 | la $sp,0($j,$rp) # alloca | 115 | la $sp,0($j,$rp) # alloca |
82 | stg %r0,0($sp) # back chain | 116 | st${g} %r0,0($sp) # back chain |
83 | 117 | ||
84 | sra $num,3 # restore $num | 118 | sra $num,3 # restore $num |
85 | la $bp,0($j,$bp) # restore $bp | 119 | la $bp,0($j,$bp) # restore $bp |
86 | ahi $num,-1 # adjust $num for inner loop | 120 | ahi $num,-1 # adjust $num for inner loop |
87 | lg $n0,0($n0) # pull n0 | 121 | lg $n0,0($n0) # pull n0 |
122 | _dswap $n0 | ||
88 | 123 | ||
89 | lg $bi,0($bp) | 124 | lg $bi,0($bp) |
125 | _dswap $bi | ||
90 | lg $alo,0($ap) | 126 | lg $alo,0($ap) |
127 | _dswap $alo | ||
91 | mlgr $ahi,$bi # ap[0]*bp[0] | 128 | mlgr $ahi,$bi # ap[0]*bp[0] |
92 | lgr $AHI,$ahi | 129 | lgr $AHI,$ahi |
93 | 130 | ||
@@ -95,6 +132,7 @@ bn_mul_mont: | |||
95 | msgr $mn0,$n0 | 132 | msgr $mn0,$n0 |
96 | 133 | ||
97 | lg $nlo,0($np) # | 134 | lg $nlo,0($np) # |
135 | _dswap $nlo | ||
98 | mlgr $nhi,$mn0 # np[0]*m1 | 136 | mlgr $nhi,$mn0 # np[0]*m1 |
99 | algr $nlo,$alo # +="tp[0]" | 137 | algr $nlo,$alo # +="tp[0]" |
100 | lghi $NHI,0 | 138 | lghi $NHI,0 |
@@ -106,12 +144,14 @@ bn_mul_mont: | |||
106 | .align 16 | 144 | .align 16 |
107 | .L1st: | 145 | .L1st: |
108 | lg $alo,0($j,$ap) | 146 | lg $alo,0($j,$ap) |
147 | _dswap $alo | ||
109 | mlgr $ahi,$bi # ap[j]*bp[0] | 148 | mlgr $ahi,$bi # ap[j]*bp[0] |
110 | algr $alo,$AHI | 149 | algr $alo,$AHI |
111 | lghi $AHI,0 | 150 | lghi $AHI,0 |
112 | alcgr $AHI,$ahi | 151 | alcgr $AHI,$ahi |
113 | 152 | ||
114 | lg $nlo,0($j,$np) | 153 | lg $nlo,0($j,$np) |
154 | _dswap $nlo | ||
115 | mlgr $nhi,$mn0 # np[j]*m1 | 155 | mlgr $nhi,$mn0 # np[j]*m1 |
116 | algr $nlo,$NHI | 156 | algr $nlo,$NHI |
117 | lghi $NHI,0 | 157 | lghi $NHI,0 |
@@ -119,22 +159,24 @@ bn_mul_mont: | |||
119 | algr $nlo,$alo | 159 | algr $nlo,$alo |
120 | alcgr $NHI,$nhi | 160 | alcgr $NHI,$nhi |
121 | 161 | ||
122 | stg $nlo,160-8($j,$sp) # tp[j-1]= | 162 | stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= |
123 | la $j,8($j) # j++ | 163 | la $j,8($j) # j++ |
124 | brct $count,.L1st | 164 | brct $count,.L1st |
125 | 165 | ||
126 | algr $NHI,$AHI | 166 | algr $NHI,$AHI |
127 | lghi $AHI,0 | 167 | lghi $AHI,0 |
128 | alcgr $AHI,$AHI # upmost overflow bit | 168 | alcgr $AHI,$AHI # upmost overflow bit |
129 | stg $NHI,160-8($j,$sp) | 169 | stg $NHI,$stdframe-8($j,$sp) |
130 | stg $AHI,160($j,$sp) | 170 | stg $AHI,$stdframe($j,$sp) |
131 | la $bp,8($bp) # bp++ | 171 | la $bp,8($bp) # bp++ |
132 | 172 | ||
133 | .Louter: | 173 | .Louter: |
134 | lg $bi,0($bp) # bp[i] | 174 | lg $bi,0($bp) # bp[i] |
175 | _dswap $bi | ||
135 | lg $alo,0($ap) | 176 | lg $alo,0($ap) |
177 | _dswap $alo | ||
136 | mlgr $ahi,$bi # ap[0]*bp[i] | 178 | mlgr $ahi,$bi # ap[0]*bp[i] |
137 | alg $alo,160($sp) # +=tp[0] | 179 | alg $alo,$stdframe($sp) # +=tp[0] |
138 | lghi $AHI,0 | 180 | lghi $AHI,0 |
139 | alcgr $AHI,$ahi | 181 | alcgr $AHI,$ahi |
140 | 182 | ||
@@ -142,6 +184,7 @@ bn_mul_mont: | |||
142 | msgr $mn0,$n0 # tp[0]*n0 | 184 | msgr $mn0,$n0 # tp[0]*n0 |
143 | 185 | ||
144 | lg $nlo,0($np) # np[0] | 186 | lg $nlo,0($np) # np[0] |
187 | _dswap $nlo | ||
145 | mlgr $nhi,$mn0 # np[0]*m1 | 188 | mlgr $nhi,$mn0 # np[0]*m1 |
146 | algr $nlo,$alo # +="tp[0]" | 189 | algr $nlo,$alo # +="tp[0]" |
147 | lghi $NHI,0 | 190 | lghi $NHI,0 |
@@ -153,14 +196,16 @@ bn_mul_mont: | |||
153 | .align 16 | 196 | .align 16 |
154 | .Linner: | 197 | .Linner: |
155 | lg $alo,0($j,$ap) | 198 | lg $alo,0($j,$ap) |
199 | _dswap $alo | ||
156 | mlgr $ahi,$bi # ap[j]*bp[i] | 200 | mlgr $ahi,$bi # ap[j]*bp[i] |
157 | algr $alo,$AHI | 201 | algr $alo,$AHI |
158 | lghi $AHI,0 | 202 | lghi $AHI,0 |
159 | alcgr $ahi,$AHI | 203 | alcgr $ahi,$AHI |
160 | alg $alo,160($j,$sp)# +=tp[j] | 204 | alg $alo,$stdframe($j,$sp)# +=tp[j] |
161 | alcgr $AHI,$ahi | 205 | alcgr $AHI,$ahi |
162 | 206 | ||
163 | lg $nlo,0($j,$np) | 207 | lg $nlo,0($j,$np) |
208 | _dswap $nlo | ||
164 | mlgr $nhi,$mn0 # np[j]*m1 | 209 | mlgr $nhi,$mn0 # np[j]*m1 |
165 | algr $nlo,$NHI | 210 | algr $nlo,$NHI |
166 | lghi $NHI,0 | 211 | lghi $NHI,0 |
@@ -168,31 +213,33 @@ bn_mul_mont: | |||
168 | algr $nlo,$alo # +="tp[j]" | 213 | algr $nlo,$alo # +="tp[j]" |
169 | alcgr $NHI,$nhi | 214 | alcgr $NHI,$nhi |
170 | 215 | ||
171 | stg $nlo,160-8($j,$sp) # tp[j-1]= | 216 | stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= |
172 | la $j,8($j) # j++ | 217 | la $j,8($j) # j++ |
173 | brct $count,.Linner | 218 | brct $count,.Linner |
174 | 219 | ||
175 | algr $NHI,$AHI | 220 | algr $NHI,$AHI |
176 | lghi $AHI,0 | 221 | lghi $AHI,0 |
177 | alcgr $AHI,$AHI | 222 | alcgr $AHI,$AHI |
178 | alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit | 223 | alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit |
179 | lghi $ahi,0 | 224 | lghi $ahi,0 |
180 | alcgr $AHI,$ahi # new upmost overflow bit | 225 | alcgr $AHI,$ahi # new upmost overflow bit |
181 | stg $NHI,160-8($j,$sp) | 226 | stg $NHI,$stdframe-8($j,$sp) |
182 | stg $AHI,160($j,$sp) | 227 | stg $AHI,$stdframe($j,$sp) |
183 | 228 | ||
184 | la $bp,8($bp) # bp++ | 229 | la $bp,8($bp) # bp++ |
185 | clg $bp,160+8+32($j,$sp) # compare to &bp[num] | 230 | cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num] |
186 | jne .Louter | 231 | jne .Louter |
187 | 232 | ||
188 | lg $rp,160+8+16($j,$sp) # reincarnate rp | 233 | l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp |
189 | la $ap,160($sp) | 234 | la $ap,$stdframe($sp) |
190 | ahi $num,1 # restore $num, incidentally clears "borrow" | 235 | ahi $num,1 # restore $num, incidentally clears "borrow" |
191 | 236 | ||
192 | la $j,0(%r0) | 237 | la $j,0(%r0) |
193 | lr $count,$num | 238 | lr $count,$num |
194 | .Lsub: lg $alo,0($j,$ap) | 239 | .Lsub: lg $alo,0($j,$ap) |
195 | slbg $alo,0($j,$np) | 240 | lg $nlo,0($j,$np) |
241 | _dswap $nlo | ||
242 | slbgr $alo,$nlo | ||
196 | stg $alo,0($j,$rp) | 243 | stg $alo,0($j,$rp) |
197 | la $j,8($j) | 244 | la $j,8($j) |
198 | brct $count,.Lsub | 245 | brct $count,.Lsub |
@@ -207,19 +254,24 @@ bn_mul_mont: | |||
207 | 254 | ||
208 | la $j,0(%r0) | 255 | la $j,0(%r0) |
209 | lgr $count,$num | 256 | lgr $count,$num |
210 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh | 257 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh |
211 | stg $j,160($j,$sp) # zap tp | 258 | _dswap $alo |
259 | stg $j,$stdframe($j,$sp) # zap tp | ||
212 | stg $alo,0($j,$rp) | 260 | stg $alo,0($j,$rp) |
213 | la $j,8($j) | 261 | la $j,8($j) |
214 | brct $count,.Lcopy | 262 | brct $count,.Lcopy |
215 | 263 | ||
216 | la %r1,160+8+48($j,$sp) | 264 | la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp) |
217 | lmg %r6,%r15,0(%r1) | 265 | lm${g} %r6,%r15,0(%r1) |
218 | lghi %r2,1 # signal "processed" | 266 | lghi %r2,1 # signal "processed" |
219 | br %r14 | 267 | br %r14 |
220 | .size bn_mul_mont,.-bn_mul_mont | 268 | .size bn_mul_mont,.-bn_mul_mont |
221 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 269 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
222 | ___ | 270 | ___ |
223 | 271 | ||
224 | print $code; | 272 | foreach (split("\n",$code)) { |
273 | s/\`([^\`]*)\`/eval $1/ge; | ||
274 | s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; | ||
275 | print $_,"\n"; | ||
276 | } | ||
225 | close STDOUT; | 277 | close STDOUT; |
diff --git a/src/lib/libcrypto/bn/asm/x86-gf2m.pl b/src/lib/libcrypto/bn/asm/x86-gf2m.pl new file mode 100644 index 0000000000..808a1e5969 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86-gf2m.pl | |||
@@ -0,0 +1,313 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # May 2011 | ||
11 | # | ||
12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
14 | # the time being... Except that it has three code paths: pure integer | ||
15 | # code suitable for any x86 CPU, MMX code suitable for PIII and later | ||
16 | # and PCLMULQDQ suitable for Westmere and later. Improvement varies | ||
17 | # from one benchmark and µ-arch to another. Below are interval values | ||
18 | # for 163- and 571-bit ECDH benchmarks relative to compiler-generated | ||
19 | # code: | ||
20 | # | ||
21 | # PIII 16%-30% | ||
22 | # P4 12%-12% | ||
23 | # Opteron 18%-40% | ||
24 | # Core2 19%-44% | ||
25 | # Atom 38%-64% | ||
26 | # Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) | ||
27 | # Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) | ||
28 | # | ||
29 | # Note that above improvement coefficients are not coefficients for | ||
30 | # bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result | ||
31 | # of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark | ||
32 | # is more and more dominated by other subroutines, most notably by | ||
33 | # BN_GF2m_mod[_mul]_arr... | ||
34 | |||
35 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
36 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
37 | require "x86asm.pl"; | ||
38 | |||
39 | &asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); | ||
40 | |||
41 | $sse2=0; | ||
42 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
43 | |||
44 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
45 | |||
46 | $a="eax"; | ||
47 | $b="ebx"; | ||
48 | ($a1,$a2,$a4)=("ecx","edx","ebp"); | ||
49 | |||
50 | $R="mm0"; | ||
51 | @T=("mm1","mm2"); | ||
52 | ($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); | ||
53 | @i=("esi","edi"); | ||
54 | |||
55 | if (!$x86only) { | ||
56 | &function_begin_B("_mul_1x1_mmx"); | ||
57 | &sub ("esp",32+4); | ||
58 | &mov ($a1,$a); | ||
59 | &lea ($a2,&DWP(0,$a,$a)); | ||
60 | &and ($a1,0x3fffffff); | ||
61 | &lea ($a4,&DWP(0,$a2,$a2)); | ||
62 | &mov (&DWP(0*4,"esp"),0); | ||
63 | &and ($a2,0x7fffffff); | ||
64 | &movd ($A,$a); | ||
65 | &movd ($B,$b); | ||
66 | &mov (&DWP(1*4,"esp"),$a1); # a1 | ||
67 | &xor ($a1,$a2); # a1^a2 | ||
68 | &pxor ($B31,$B31); | ||
69 | &pxor ($B30,$B30); | ||
70 | &mov (&DWP(2*4,"esp"),$a2); # a2 | ||
71 | &xor ($a2,$a4); # a2^a4 | ||
72 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | ||
73 | &pcmpgtd($B31,$A); # broadcast 31st bit | ||
74 | &paddd ($A,$A); # $A<<=1 | ||
75 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | ||
76 | &mov (&DWP(4*4,"esp"),$a4); # a4 | ||
77 | &xor ($a4,$a2); # a2=a4^a2^a4 | ||
78 | &pand ($B31,$B); | ||
79 | &pcmpgtd($B30,$A); # broadcast 30th bit | ||
80 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | ||
81 | &xor ($a4,$a1); # a1^a2^a4 | ||
82 | &psllq ($B31,31); | ||
83 | &pand ($B30,$B); | ||
84 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | ||
85 | &mov (@i[0],0x7); | ||
86 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | ||
87 | &mov ($a4,@i[0]); | ||
88 | &and (@i[0],$b); | ||
89 | &shr ($b,3); | ||
90 | &mov (@i[1],$a4); | ||
91 | &psllq ($B30,30); | ||
92 | &and (@i[1],$b); | ||
93 | &shr ($b,3); | ||
94 | &movd ($R,&DWP(0,"esp",@i[0],4)); | ||
95 | &mov (@i[0],$a4); | ||
96 | &and (@i[0],$b); | ||
97 | &shr ($b,3); | ||
98 | for($n=1;$n<9;$n++) { | ||
99 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | ||
100 | &mov (@i[1],$a4); | ||
101 | &psllq (@T[1],3*$n); | ||
102 | &and (@i[1],$b); | ||
103 | &shr ($b,3); | ||
104 | &pxor ($R,@T[1]); | ||
105 | |||
106 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
107 | } | ||
108 | &movd (@T[1],&DWP(0,"esp",@i[1],4)); | ||
109 | &pxor ($R,$B30); | ||
110 | &psllq (@T[1],3*$n++); | ||
111 | &pxor ($R,@T[1]); | ||
112 | |||
113 | &movd (@T[0],&DWP(0,"esp",@i[0],4)); | ||
114 | &pxor ($R,$B31); | ||
115 | &psllq (@T[0],3*$n); | ||
116 | &add ("esp",32+4); | ||
117 | &pxor ($R,@T[0]); | ||
118 | &ret (); | ||
119 | &function_end_B("_mul_1x1_mmx"); | ||
120 | } | ||
121 | |||
122 | ($lo,$hi)=("eax","edx"); | ||
123 | @T=("ecx","ebp"); | ||
124 | |||
125 | &function_begin_B("_mul_1x1_ialu"); | ||
126 | &sub ("esp",32+4); | ||
127 | &mov ($a1,$a); | ||
128 | &lea ($a2,&DWP(0,$a,$a)); | ||
129 | &lea ($a4,&DWP(0,"",$a,4)); | ||
130 | &and ($a1,0x3fffffff); | ||
131 | &lea (@i[1],&DWP(0,$lo,$lo)); | ||
132 | &sar ($lo,31); # broadcast 31st bit | ||
133 | &mov (&DWP(0*4,"esp"),0); | ||
134 | &and ($a2,0x7fffffff); | ||
135 | &mov (&DWP(1*4,"esp"),$a1); # a1 | ||
136 | &xor ($a1,$a2); # a1^a2 | ||
137 | &mov (&DWP(2*4,"esp"),$a2); # a2 | ||
138 | &xor ($a2,$a4); # a2^a4 | ||
139 | &mov (&DWP(3*4,"esp"),$a1); # a1^a2 | ||
140 | &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 | ||
141 | &mov (&DWP(4*4,"esp"),$a4); # a4 | ||
142 | &xor ($a4,$a2); # a2=a4^a2^a4 | ||
143 | &mov (&DWP(5*4,"esp"),$a1); # a1^a4 | ||
144 | &xor ($a4,$a1); # a1^a2^a4 | ||
145 | &sar (@i[1],31); # broardcast 30th bit | ||
146 | &and ($lo,$b); | ||
147 | &mov (&DWP(6*4,"esp"),$a2); # a2^a4 | ||
148 | &and (@i[1],$b); | ||
149 | &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 | ||
150 | &mov ($hi,$lo); | ||
151 | &shl ($lo,31); | ||
152 | &mov (@T[0],@i[1]); | ||
153 | &shr ($hi,1); | ||
154 | |||
155 | &mov (@i[0],0x7); | ||
156 | &shl (@i[1],30); | ||
157 | &and (@i[0],$b); | ||
158 | &shr (@T[0],2); | ||
159 | &xor ($lo,@i[1]); | ||
160 | |||
161 | &shr ($b,3); | ||
162 | &mov (@i[1],0x7); # 5-byte instruction!? | ||
163 | &and (@i[1],$b); | ||
164 | &shr ($b,3); | ||
165 | &xor ($hi,@T[0]); | ||
166 | &xor ($lo,&DWP(0,"esp",@i[0],4)); | ||
167 | &mov (@i[0],0x7); | ||
168 | &and (@i[0],$b); | ||
169 | &shr ($b,3); | ||
170 | for($n=1;$n<9;$n++) { | ||
171 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | ||
172 | &mov (@i[1],0x7); | ||
173 | &mov (@T[0],@T[1]); | ||
174 | &shl (@T[1],3*$n); | ||
175 | &and (@i[1],$b); | ||
176 | &shr (@T[0],32-3*$n); | ||
177 | &xor ($lo,@T[1]); | ||
178 | &shr ($b,3); | ||
179 | &xor ($hi,@T[0]); | ||
180 | |||
181 | push(@i,shift(@i)); push(@T,shift(@T)); | ||
182 | } | ||
183 | &mov (@T[1],&DWP(0,"esp",@i[1],4)); | ||
184 | &mov (@T[0],@T[1]); | ||
185 | &shl (@T[1],3*$n); | ||
186 | &mov (@i[1],&DWP(0,"esp",@i[0],4)); | ||
187 | &shr (@T[0],32-3*$n); $n++; | ||
188 | &mov (@i[0],@i[1]); | ||
189 | &xor ($lo,@T[1]); | ||
190 | &shl (@i[1],3*$n); | ||
191 | &xor ($hi,@T[0]); | ||
192 | &shr (@i[0],32-3*$n); | ||
193 | &xor ($lo,@i[1]); | ||
194 | &xor ($hi,@i[0]); | ||
195 | |||
196 | &add ("esp",32+4); | ||
197 | &ret (); | ||
198 | &function_end_B("_mul_1x1_ialu"); | ||
199 | |||
200 | # void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); | ||
201 | &function_begin_B("bn_GF2m_mul_2x2"); | ||
202 | if (!$x86only) { | ||
203 | &picmeup("edx","OPENSSL_ia32cap_P"); | ||
204 | &mov ("eax",&DWP(0,"edx")); | ||
205 | &mov ("edx",&DWP(4,"edx")); | ||
206 | &test ("eax",1<<23); # check MMX bit | ||
207 | &jz (&label("ialu")); | ||
208 | if ($sse2) { | ||
209 | &test ("eax",1<<24); # check FXSR bit | ||
210 | &jz (&label("mmx")); | ||
211 | &test ("edx",1<<1); # check PCLMULQDQ bit | ||
212 | &jz (&label("mmx")); | ||
213 | |||
214 | &movups ("xmm0",&QWP(8,"esp")); | ||
215 | &shufps ("xmm0","xmm0",0b10110001); | ||
216 | &pclmulqdq ("xmm0","xmm0",1); | ||
217 | &mov ("eax",&DWP(4,"esp")); | ||
218 | &movups (&QWP(0,"eax"),"xmm0"); | ||
219 | &ret (); | ||
220 | |||
221 | &set_label("mmx",16); | ||
222 | } | ||
223 | &push ("ebp"); | ||
224 | &push ("ebx"); | ||
225 | &push ("esi"); | ||
226 | &push ("edi"); | ||
227 | &mov ($a,&wparam(1)); | ||
228 | &mov ($b,&wparam(3)); | ||
229 | &call ("_mul_1x1_mmx"); # a1·b1 | ||
230 | &movq ("mm7",$R); | ||
231 | |||
232 | &mov ($a,&wparam(2)); | ||
233 | &mov ($b,&wparam(4)); | ||
234 | &call ("_mul_1x1_mmx"); # a0·b0 | ||
235 | &movq ("mm6",$R); | ||
236 | |||
237 | &mov ($a,&wparam(1)); | ||
238 | &mov ($b,&wparam(3)); | ||
239 | &xor ($a,&wparam(2)); | ||
240 | &xor ($b,&wparam(4)); | ||
241 | &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) | ||
242 | &pxor ($R,"mm7"); | ||
243 | &mov ($a,&wparam(0)); | ||
244 | &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 | ||
245 | |||
246 | &movq ($A,$R); | ||
247 | &psllq ($R,32); | ||
248 | &pop ("edi"); | ||
249 | &psrlq ($A,32); | ||
250 | &pop ("esi"); | ||
251 | &pxor ($R,"mm6"); | ||
252 | &pop ("ebx"); | ||
253 | &pxor ($A,"mm7"); | ||
254 | &movq (&QWP(0,$a),$R); | ||
255 | &pop ("ebp"); | ||
256 | &movq (&QWP(8,$a),$A); | ||
257 | &emms (); | ||
258 | &ret (); | ||
259 | &set_label("ialu",16); | ||
260 | } | ||
261 | &push ("ebp"); | ||
262 | &push ("ebx"); | ||
263 | &push ("esi"); | ||
264 | &push ("edi"); | ||
265 | &stack_push(4+1); | ||
266 | |||
267 | &mov ($a,&wparam(1)); | ||
268 | &mov ($b,&wparam(3)); | ||
269 | &call ("_mul_1x1_ialu"); # a1·b1 | ||
270 | &mov (&DWP(8,"esp"),$lo); | ||
271 | &mov (&DWP(12,"esp"),$hi); | ||
272 | |||
273 | &mov ($a,&wparam(2)); | ||
274 | &mov ($b,&wparam(4)); | ||
275 | &call ("_mul_1x1_ialu"); # a0·b0 | ||
276 | &mov (&DWP(0,"esp"),$lo); | ||
277 | &mov (&DWP(4,"esp"),$hi); | ||
278 | |||
279 | &mov ($a,&wparam(1)); | ||
280 | &mov ($b,&wparam(3)); | ||
281 | &xor ($a,&wparam(2)); | ||
282 | &xor ($b,&wparam(4)); | ||
283 | &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) | ||
284 | |||
285 | &mov ("ebp",&wparam(0)); | ||
286 | @r=("ebx","ecx","edi","esi"); | ||
287 | &mov (@r[0],&DWP(0,"esp")); | ||
288 | &mov (@r[1],&DWP(4,"esp")); | ||
289 | &mov (@r[2],&DWP(8,"esp")); | ||
290 | &mov (@r[3],&DWP(12,"esp")); | ||
291 | |||
292 | &xor ($lo,$hi); | ||
293 | &xor ($hi,@r[1]); | ||
294 | &xor ($lo,@r[0]); | ||
295 | &mov (&DWP(0,"ebp"),@r[0]); | ||
296 | &xor ($hi,@r[2]); | ||
297 | &mov (&DWP(12,"ebp"),@r[3]); | ||
298 | &xor ($lo,@r[3]); | ||
299 | &stack_pop(4+1); | ||
300 | &xor ($hi,@r[3]); | ||
301 | &pop ("edi"); | ||
302 | &xor ($lo,$hi); | ||
303 | &pop ("esi"); | ||
304 | &mov (&DWP(8,"ebp"),$hi); | ||
305 | &pop ("ebx"); | ||
306 | &mov (&DWP(4,"ebp"),$lo); | ||
307 | &pop ("ebp"); | ||
308 | &ret (); | ||
309 | &function_end_B("bn_GF2m_mul_2x2"); | ||
310 | |||
311 | &asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
312 | |||
313 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl new file mode 100644 index 0000000000..1658acbbdd --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl | |||
@@ -0,0 +1,389 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # May 2011 | ||
11 | # | ||
12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication used | ||
13 | # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for | ||
14 | # the time being... Except that it has two code paths: code suitable | ||
15 | # for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and | ||
16 | # later. Improvement varies from one benchmark and µ-arch to another. | ||
17 | # Vanilla code path is at most 20% faster than compiler-generated code | ||
18 | # [not very impressive], while PCLMULQDQ - whole 85%-160% better on | ||
19 | # 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that | ||
20 | # these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not | ||
21 | # all CPU time is burnt in it... | ||
22 | |||
23 | $flavour = shift; | ||
24 | $output = shift; | ||
25 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
26 | |||
27 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
28 | |||
29 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
30 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
31 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
32 | die "can't locate x86_64-xlate.pl"; | ||
33 | |||
34 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
35 | |||
36 | ($lo,$hi)=("%rax","%rdx"); $a=$lo; | ||
37 | ($i0,$i1)=("%rsi","%rdi"); | ||
38 | ($t0,$t1)=("%rbx","%rcx"); | ||
39 | ($b,$mask)=("%rbp","%r8"); | ||
40 | ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15)); | ||
41 | ($R,$Tx)=("%xmm0","%xmm1"); | ||
42 | |||
43 | $code.=<<___; | ||
44 | .text | ||
45 | |||
46 | .type _mul_1x1,\@abi-omnipotent | ||
47 | .align 16 | ||
48 | _mul_1x1: | ||
49 | sub \$128+8,%rsp | ||
50 | mov \$-1,$a1 | ||
51 | lea ($a,$a),$i0 | ||
52 | shr \$3,$a1 | ||
53 | lea (,$a,4),$i1 | ||
54 | and $a,$a1 # a1=a&0x1fffffffffffffff | ||
55 | lea (,$a,8),$a8 | ||
56 | sar \$63,$a # broadcast 63rd bit | ||
57 | lea ($a1,$a1),$a2 | ||
58 | sar \$63,$i0 # broadcast 62nd bit | ||
59 | lea (,$a1,4),$a4 | ||
60 | and $b,$a | ||
61 | sar \$63,$i1 # boardcast 61st bit | ||
62 | mov $a,$hi # $a is $lo | ||
63 | shl \$63,$lo | ||
64 | and $b,$i0 | ||
65 | shr \$1,$hi | ||
66 | mov $i0,$t1 | ||
67 | shl \$62,$i0 | ||
68 | and $b,$i1 | ||
69 | shr \$2,$t1 | ||
70 | xor $i0,$lo | ||
71 | mov $i1,$t0 | ||
72 | shl \$61,$i1 | ||
73 | xor $t1,$hi | ||
74 | shr \$3,$t0 | ||
75 | xor $i1,$lo | ||
76 | xor $t0,$hi | ||
77 | |||
78 | mov $a1,$a12 | ||
79 | movq \$0,0(%rsp) # tab[0]=0 | ||
80 | xor $a2,$a12 # a1^a2 | ||
81 | mov $a1,8(%rsp) # tab[1]=a1 | ||
82 | mov $a4,$a48 | ||
83 | mov $a2,16(%rsp) # tab[2]=a2 | ||
84 | xor $a8,$a48 # a4^a8 | ||
85 | mov $a12,24(%rsp) # tab[3]=a1^a2 | ||
86 | |||
87 | xor $a4,$a1 | ||
88 | mov $a4,32(%rsp) # tab[4]=a4 | ||
89 | xor $a4,$a2 | ||
90 | mov $a1,40(%rsp) # tab[5]=a1^a4 | ||
91 | xor $a4,$a12 | ||
92 | mov $a2,48(%rsp) # tab[6]=a2^a4 | ||
93 | xor $a48,$a1 # a1^a4^a4^a8=a1^a8 | ||
94 | mov $a12,56(%rsp) # tab[7]=a1^a2^a4 | ||
95 | xor $a48,$a2 # a2^a4^a4^a8=a1^a8 | ||
96 | |||
97 | mov $a8,64(%rsp) # tab[8]=a8 | ||
98 | xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8 | ||
99 | mov $a1,72(%rsp) # tab[9]=a1^a8 | ||
100 | xor $a4,$a1 # a1^a8^a4 | ||
101 | mov $a2,80(%rsp) # tab[10]=a2^a8 | ||
102 | xor $a4,$a2 # a2^a8^a4 | ||
103 | mov $a12,88(%rsp) # tab[11]=a1^a2^a8 | ||
104 | |||
105 | xor $a4,$a12 # a1^a2^a8^a4 | ||
106 | mov $a48,96(%rsp) # tab[12]=a4^a8 | ||
107 | mov $mask,$i0 | ||
108 | mov $a1,104(%rsp) # tab[13]=a1^a4^a8 | ||
109 | and $b,$i0 | ||
110 | mov $a2,112(%rsp) # tab[14]=a2^a4^a8 | ||
111 | shr \$4,$b | ||
112 | mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8 | ||
113 | mov $mask,$i1 | ||
114 | and $b,$i1 | ||
115 | shr \$4,$b | ||
116 | |||
117 | movq (%rsp,$i0,8),$R # half of calculations is done in SSE2 | ||
118 | mov $mask,$i0 | ||
119 | and $b,$i0 | ||
120 | shr \$4,$b | ||
121 | ___ | ||
122 | for ($n=1;$n<8;$n++) { | ||
123 | $code.=<<___; | ||
124 | mov (%rsp,$i1,8),$t1 | ||
125 | mov $mask,$i1 | ||
126 | mov $t1,$t0 | ||
127 | shl \$`8*$n-4`,$t1 | ||
128 | and $b,$i1 | ||
129 | movq (%rsp,$i0,8),$Tx | ||
130 | shr \$`64-(8*$n-4)`,$t0 | ||
131 | xor $t1,$lo | ||
132 | pslldq \$$n,$Tx | ||
133 | mov $mask,$i0 | ||
134 | shr \$4,$b | ||
135 | xor $t0,$hi | ||
136 | and $b,$i0 | ||
137 | shr \$4,$b | ||
138 | pxor $Tx,$R | ||
139 | ___ | ||
140 | } | ||
141 | $code.=<<___; | ||
142 | mov (%rsp,$i1,8),$t1 | ||
143 | mov $t1,$t0 | ||
144 | shl \$`8*$n-4`,$t1 | ||
145 | movq $R,$i0 | ||
146 | shr \$`64-(8*$n-4)`,$t0 | ||
147 | xor $t1,$lo | ||
148 | psrldq \$8,$R | ||
149 | xor $t0,$hi | ||
150 | movq $R,$i1 | ||
151 | xor $i0,$lo | ||
152 | xor $i1,$hi | ||
153 | |||
154 | add \$128+8,%rsp | ||
155 | ret | ||
156 | .Lend_mul_1x1: | ||
157 | .size _mul_1x1,.-_mul_1x1 | ||
158 | ___ | ||
159 | |||
160 | ($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order | ||
161 | ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order | ||
162 | |||
163 | $code.=<<___; | ||
164 | .extern OPENSSL_ia32cap_P | ||
165 | .globl bn_GF2m_mul_2x2 | ||
166 | .type bn_GF2m_mul_2x2,\@abi-omnipotent | ||
167 | .align 16 | ||
168 | bn_GF2m_mul_2x2: | ||
169 | mov OPENSSL_ia32cap_P(%rip),%rax | ||
170 | bt \$33,%rax | ||
171 | jnc .Lvanilla_mul_2x2 | ||
172 | |||
173 | movq $a1,%xmm0 | ||
174 | movq $b1,%xmm1 | ||
175 | movq $a0,%xmm2 | ||
176 | ___ | ||
177 | $code.=<<___ if ($win64); | ||
178 | movq 40(%rsp),%xmm3 | ||
179 | ___ | ||
180 | $code.=<<___ if (!$win64); | ||
181 | movq $b0,%xmm3 | ||
182 | ___ | ||
183 | $code.=<<___; | ||
184 | movdqa %xmm0,%xmm4 | ||
185 | movdqa %xmm1,%xmm5 | ||
186 | pclmulqdq \$0,%xmm1,%xmm0 # a1·b1 | ||
187 | pxor %xmm2,%xmm4 | ||
188 | pxor %xmm3,%xmm5 | ||
189 | pclmulqdq \$0,%xmm3,%xmm2 # a0·b0 | ||
190 | pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1) | ||
191 | xorps %xmm0,%xmm4 | ||
192 | xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
193 | movdqa %xmm4,%xmm5 | ||
194 | pslldq \$8,%xmm4 | ||
195 | psrldq \$8,%xmm5 | ||
196 | pxor %xmm4,%xmm2 | ||
197 | pxor %xmm5,%xmm0 | ||
198 | movdqu %xmm2,0($rp) | ||
199 | movdqu %xmm0,16($rp) | ||
200 | ret | ||
201 | |||
202 | .align 16 | ||
203 | .Lvanilla_mul_2x2: | ||
204 | lea -8*17(%rsp),%rsp | ||
205 | ___ | ||
206 | $code.=<<___ if ($win64); | ||
207 | mov `8*17+40`(%rsp),$b0 | ||
208 | mov %rdi,8*15(%rsp) | ||
209 | mov %rsi,8*16(%rsp) | ||
210 | ___ | ||
211 | $code.=<<___; | ||
212 | mov %r14,8*10(%rsp) | ||
213 | mov %r13,8*11(%rsp) | ||
214 | mov %r12,8*12(%rsp) | ||
215 | mov %rbp,8*13(%rsp) | ||
216 | mov %rbx,8*14(%rsp) | ||
217 | .Lbody_mul_2x2: | ||
218 | mov $rp,32(%rsp) # save the arguments | ||
219 | mov $a1,40(%rsp) | ||
220 | mov $a0,48(%rsp) | ||
221 | mov $b1,56(%rsp) | ||
222 | mov $b0,64(%rsp) | ||
223 | |||
224 | mov \$0xf,$mask | ||
225 | mov $a1,$a | ||
226 | mov $b1,$b | ||
227 | call _mul_1x1 # a1·b1 | ||
228 | mov $lo,16(%rsp) | ||
229 | mov $hi,24(%rsp) | ||
230 | |||
231 | mov 48(%rsp),$a | ||
232 | mov 64(%rsp),$b | ||
233 | call _mul_1x1 # a0·b0 | ||
234 | mov $lo,0(%rsp) | ||
235 | mov $hi,8(%rsp) | ||
236 | |||
237 | mov 40(%rsp),$a | ||
238 | mov 56(%rsp),$b | ||
239 | xor 48(%rsp),$a | ||
240 | xor 64(%rsp),$b | ||
241 | call _mul_1x1 # (a0+a1)·(b0+b1) | ||
242 | ___ | ||
243 | @r=("%rbx","%rcx","%rdi","%rsi"); | ||
244 | $code.=<<___; | ||
245 | mov 0(%rsp),@r[0] | ||
246 | mov 8(%rsp),@r[1] | ||
247 | mov 16(%rsp),@r[2] | ||
248 | mov 24(%rsp),@r[3] | ||
249 | mov 32(%rsp),%rbp | ||
250 | |||
251 | xor $hi,$lo | ||
252 | xor @r[1],$hi | ||
253 | xor @r[0],$lo | ||
254 | mov @r[0],0(%rbp) | ||
255 | xor @r[2],$hi | ||
256 | mov @r[3],24(%rbp) | ||
257 | xor @r[3],$lo | ||
258 | xor @r[3],$hi | ||
259 | xor $hi,$lo | ||
260 | mov $hi,16(%rbp) | ||
261 | mov $lo,8(%rbp) | ||
262 | |||
263 | mov 8*10(%rsp),%r14 | ||
264 | mov 8*11(%rsp),%r13 | ||
265 | mov 8*12(%rsp),%r12 | ||
266 | mov 8*13(%rsp),%rbp | ||
267 | mov 8*14(%rsp),%rbx | ||
268 | ___ | ||
269 | $code.=<<___ if ($win64); | ||
270 | mov 8*15(%rsp),%rdi | ||
271 | mov 8*16(%rsp),%rsi | ||
272 | ___ | ||
273 | $code.=<<___; | ||
274 | lea 8*17(%rsp),%rsp | ||
275 | ret | ||
276 | .Lend_mul_2x2: | ||
277 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
278 | .asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
279 | .align 16 | ||
280 | ___ | ||
281 | |||
282 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
283 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
284 | if ($win64) { | ||
285 | $rec="%rcx"; | ||
286 | $frame="%rdx"; | ||
287 | $context="%r8"; | ||
288 | $disp="%r9"; | ||
289 | |||
290 | $code.=<<___; | ||
291 | .extern __imp_RtlVirtualUnwind | ||
292 | |||
293 | .type se_handler,\@abi-omnipotent | ||
294 | .align 16 | ||
295 | se_handler: | ||
296 | push %rsi | ||
297 | push %rdi | ||
298 | push %rbx | ||
299 | push %rbp | ||
300 | push %r12 | ||
301 | push %r13 | ||
302 | push %r14 | ||
303 | push %r15 | ||
304 | pushfq | ||
305 | sub \$64,%rsp | ||
306 | |||
307 | mov 152($context),%rax # pull context->Rsp | ||
308 | mov 248($context),%rbx # pull context->Rip | ||
309 | |||
310 | lea .Lbody_mul_2x2(%rip),%r10 | ||
311 | cmp %r10,%rbx # context->Rip<"prologue" label | ||
312 | jb .Lin_prologue | ||
313 | |||
314 | mov 8*10(%rax),%r14 # mimic epilogue | ||
315 | mov 8*11(%rax),%r13 | ||
316 | mov 8*12(%rax),%r12 | ||
317 | mov 8*13(%rax),%rbp | ||
318 | mov 8*14(%rax),%rbx | ||
319 | mov 8*15(%rax),%rdi | ||
320 | mov 8*16(%rax),%rsi | ||
321 | |||
322 | mov %rbx,144($context) # restore context->Rbx | ||
323 | mov %rbp,160($context) # restore context->Rbp | ||
324 | mov %rsi,168($context) # restore context->Rsi | ||
325 | mov %rdi,176($context) # restore context->Rdi | ||
326 | mov %r12,216($context) # restore context->R12 | ||
327 | mov %r13,224($context) # restore context->R13 | ||
328 | mov %r14,232($context) # restore context->R14 | ||
329 | |||
330 | .Lin_prologue: | ||
331 | lea 8*17(%rax),%rax | ||
332 | mov %rax,152($context) # restore context->Rsp | ||
333 | |||
334 | mov 40($disp),%rdi # disp->ContextRecord | ||
335 | mov $context,%rsi # context | ||
336 | mov \$154,%ecx # sizeof(CONTEXT) | ||
337 | .long 0xa548f3fc # cld; rep movsq | ||
338 | |||
339 | mov $disp,%rsi | ||
340 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
341 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
342 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
343 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
344 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
345 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
346 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
347 | mov %r10,32(%rsp) # arg5 | ||
348 | mov %r11,40(%rsp) # arg6 | ||
349 | mov %r12,48(%rsp) # arg7 | ||
350 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
351 | call *__imp_RtlVirtualUnwind(%rip) | ||
352 | |||
353 | mov \$1,%eax # ExceptionContinueSearch | ||
354 | add \$64,%rsp | ||
355 | popfq | ||
356 | pop %r15 | ||
357 | pop %r14 | ||
358 | pop %r13 | ||
359 | pop %r12 | ||
360 | pop %rbp | ||
361 | pop %rbx | ||
362 | pop %rdi | ||
363 | pop %rsi | ||
364 | ret | ||
365 | .size se_handler,.-se_handler | ||
366 | |||
367 | .section .pdata | ||
368 | .align 4 | ||
369 | .rva _mul_1x1 | ||
370 | .rva .Lend_mul_1x1 | ||
371 | .rva .LSEH_info_1x1 | ||
372 | |||
373 | .rva .Lvanilla_mul_2x2 | ||
374 | .rva .Lend_mul_2x2 | ||
375 | .rva .LSEH_info_2x2 | ||
376 | .section .xdata | ||
377 | .align 8 | ||
378 | .LSEH_info_1x1: | ||
379 | .byte 0x01,0x07,0x02,0x00 | ||
380 | .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8 | ||
381 | .LSEH_info_2x2: | ||
382 | .byte 9,0,0,0 | ||
383 | .rva se_handler | ||
384 | ___ | ||
385 | } | ||
386 | |||
387 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
388 | print $code; | ||
389 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl index 3b7a6f243f..5d79b35e1c 100755 --- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl +++ b/src/lib/libcrypto/bn/asm/x86_64-mont.pl | |||
@@ -1,7 +1,7 @@ | |||
1 | #!/usr/bin/env perl | 1 | #!/usr/bin/env perl |
2 | 2 | ||
3 | # ==================================================================== | 3 | # ==================================================================== |
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
5 | # project. The module is, however, dual licensed under OpenSSL and | 5 | # project. The module is, however, dual licensed under OpenSSL and |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | 7 | # details see http://www.openssl.org/~appro/cryptogams/. |
@@ -15,6 +15,20 @@ | |||
15 | # respectful 50%. It remains to be seen if loop unrolling and | 15 | # respectful 50%. It remains to be seen if loop unrolling and |
16 | # dedicated squaring routine can provide further improvement... | 16 | # dedicated squaring routine can provide further improvement... |
17 | 17 | ||
18 | # July 2011. | ||
19 | # | ||
20 | # Add dedicated squaring procedure. Performance improvement varies | ||
21 | # from platform to platform, but in average it's ~5%/15%/25%/33% | ||
22 | # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. | ||
23 | |||
24 | # August 2011. | ||
25 | # | ||
26 | # Unroll and modulo-schedule inner loops in such manner that they | ||
27 | # are "fallen through" for input lengths of 8, which is critical for | ||
28 | # 1024-bit RSA *sign*. Average performance improvement in comparison | ||
29 | # to *initial* version of this module from 2005 is ~0%/30%/40%/45% | ||
30 | # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. | ||
31 | |||
18 | $flavour = shift; | 32 | $flavour = shift; |
19 | $output = shift; | 33 | $output = shift; |
20 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | 34 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
@@ -37,7 +51,6 @@ $n0="%r8"; # const BN_ULONG *n0, | |||
37 | $num="%r9"; # int num); | 51 | $num="%r9"; # int num); |
38 | $lo0="%r10"; | 52 | $lo0="%r10"; |
39 | $hi0="%r11"; | 53 | $hi0="%r11"; |
40 | $bp="%r12"; # reassign $bp | ||
41 | $hi1="%r13"; | 54 | $hi1="%r13"; |
42 | $i="%r14"; | 55 | $i="%r14"; |
43 | $j="%r15"; | 56 | $j="%r15"; |
@@ -51,6 +64,16 @@ $code=<<___; | |||
51 | .type bn_mul_mont,\@function,6 | 64 | .type bn_mul_mont,\@function,6 |
52 | .align 16 | 65 | .align 16 |
53 | bn_mul_mont: | 66 | bn_mul_mont: |
67 | test \$3,${num}d | ||
68 | jnz .Lmul_enter | ||
69 | cmp \$8,${num}d | ||
70 | jb .Lmul_enter | ||
71 | cmp $ap,$bp | ||
72 | jne .Lmul4x_enter | ||
73 | jmp .Lsqr4x_enter | ||
74 | |||
75 | .align 16 | ||
76 | .Lmul_enter: | ||
54 | push %rbx | 77 | push %rbx |
55 | push %rbp | 78 | push %rbp |
56 | push %r12 | 79 | push %r12 |
@@ -66,48 +89,66 @@ bn_mul_mont: | |||
66 | and \$-1024,%rsp # minimize TLB usage | 89 | and \$-1024,%rsp # minimize TLB usage |
67 | 90 | ||
68 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp | 91 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp |
69 | .Lprologue: | 92 | .Lmul_body: |
70 | mov %rdx,$bp # $bp reassigned, remember? | 93 | mov $bp,%r12 # reassign $bp |
71 | 94 | ___ | |
95 | $bp="%r12"; | ||
96 | $code.=<<___; | ||
72 | mov ($n0),$n0 # pull n0[0] value | 97 | mov ($n0),$n0 # pull n0[0] value |
98 | mov ($bp),$m0 # m0=bp[0] | ||
99 | mov ($ap),%rax | ||
73 | 100 | ||
74 | xor $i,$i # i=0 | 101 | xor $i,$i # i=0 |
75 | xor $j,$j # j=0 | 102 | xor $j,$j # j=0 |
76 | 103 | ||
77 | mov ($bp),$m0 # m0=bp[0] | 104 | mov $n0,$m1 |
78 | mov ($ap),%rax | ||
79 | mulq $m0 # ap[0]*bp[0] | 105 | mulq $m0 # ap[0]*bp[0] |
80 | mov %rax,$lo0 | 106 | mov %rax,$lo0 |
81 | mov %rdx,$hi0 | 107 | mov ($np),%rax |
82 | 108 | ||
83 | imulq $n0,%rax # "tp[0]"*n0 | 109 | imulq $lo0,$m1 # "tp[0]"*n0 |
84 | mov %rax,$m1 | 110 | mov %rdx,$hi0 |
85 | 111 | ||
86 | mulq ($np) # np[0]*m1 | 112 | mulq $m1 # np[0]*m1 |
87 | add $lo0,%rax # discarded | 113 | add %rax,$lo0 # discarded |
114 | mov 8($ap),%rax | ||
88 | adc \$0,%rdx | 115 | adc \$0,%rdx |
89 | mov %rdx,$hi1 | 116 | mov %rdx,$hi1 |
90 | 117 | ||
91 | lea 1($j),$j # j++ | 118 | lea 1($j),$j # j++ |
119 | jmp .L1st_enter | ||
120 | |||
121 | .align 16 | ||
92 | .L1st: | 122 | .L1st: |
123 | add %rax,$hi1 | ||
93 | mov ($ap,$j,8),%rax | 124 | mov ($ap,$j,8),%rax |
94 | mulq $m0 # ap[j]*bp[0] | ||
95 | add $hi0,%rax | ||
96 | adc \$0,%rdx | 125 | adc \$0,%rdx |
97 | mov %rax,$lo0 | 126 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] |
127 | mov $lo0,$hi0 | ||
128 | adc \$0,%rdx | ||
129 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
130 | mov %rdx,$hi1 | ||
131 | |||
132 | .L1st_enter: | ||
133 | mulq $m0 # ap[j]*bp[0] | ||
134 | add %rax,$hi0 | ||
98 | mov ($np,$j,8),%rax | 135 | mov ($np,$j,8),%rax |
99 | mov %rdx,$hi0 | 136 | adc \$0,%rdx |
137 | lea 1($j),$j # j++ | ||
138 | mov %rdx,$lo0 | ||
100 | 139 | ||
101 | mulq $m1 # np[j]*m1 | 140 | mulq $m1 # np[j]*m1 |
102 | add $hi1,%rax | 141 | cmp $num,$j |
103 | lea 1($j),$j # j++ | 142 | jne .L1st |
143 | |||
144 | add %rax,$hi1 | ||
145 | mov ($ap),%rax # ap[0] | ||
104 | adc \$0,%rdx | 146 | adc \$0,%rdx |
105 | add $lo0,%rax # np[j]*m1+ap[j]*bp[0] | 147 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] |
106 | adc \$0,%rdx | 148 | adc \$0,%rdx |
107 | mov %rax,-16(%rsp,$j,8) # tp[j-1] | 149 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] |
108 | cmp $num,$j | ||
109 | mov %rdx,$hi1 | 150 | mov %rdx,$hi1 |
110 | jl .L1st | 151 | mov $lo0,$hi0 |
111 | 152 | ||
112 | xor %rdx,%rdx | 153 | xor %rdx,%rdx |
113 | add $hi0,$hi1 | 154 | add $hi0,$hi1 |
@@ -116,50 +157,64 @@ bn_mul_mont: | |||
116 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | 157 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit |
117 | 158 | ||
118 | lea 1($i),$i # i++ | 159 | lea 1($i),$i # i++ |
119 | .align 4 | 160 | jmp .Louter |
161 | .align 16 | ||
120 | .Louter: | 162 | .Louter: |
121 | xor $j,$j # j=0 | ||
122 | |||
123 | mov ($bp,$i,8),$m0 # m0=bp[i] | 163 | mov ($bp,$i,8),$m0 # m0=bp[i] |
124 | mov ($ap),%rax # ap[0] | 164 | xor $j,$j # j=0 |
165 | mov $n0,$m1 | ||
166 | mov (%rsp),$lo0 | ||
125 | mulq $m0 # ap[0]*bp[i] | 167 | mulq $m0 # ap[0]*bp[i] |
126 | add (%rsp),%rax # ap[0]*bp[i]+tp[0] | 168 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] |
169 | mov ($np),%rax | ||
127 | adc \$0,%rdx | 170 | adc \$0,%rdx |
128 | mov %rax,$lo0 | ||
129 | mov %rdx,$hi0 | ||
130 | 171 | ||
131 | imulq $n0,%rax # tp[0]*n0 | 172 | imulq $lo0,$m1 # tp[0]*n0 |
132 | mov %rax,$m1 | 173 | mov %rdx,$hi0 |
133 | 174 | ||
134 | mulq ($np,$j,8) # np[0]*m1 | 175 | mulq $m1 # np[0]*m1 |
135 | add $lo0,%rax # discarded | 176 | add %rax,$lo0 # discarded |
136 | mov 8(%rsp),$lo0 # tp[1] | 177 | mov 8($ap),%rax |
137 | adc \$0,%rdx | 178 | adc \$0,%rdx |
179 | mov 8(%rsp),$lo0 # tp[1] | ||
138 | mov %rdx,$hi1 | 180 | mov %rdx,$hi1 |
139 | 181 | ||
140 | lea 1($j),$j # j++ | 182 | lea 1($j),$j # j++ |
141 | .align 4 | 183 | jmp .Linner_enter |
184 | |||
185 | .align 16 | ||
142 | .Linner: | 186 | .Linner: |
187 | add %rax,$hi1 | ||
143 | mov ($ap,$j,8),%rax | 188 | mov ($ap,$j,8),%rax |
144 | mulq $m0 # ap[j]*bp[i] | ||
145 | add $hi0,%rax | ||
146 | adc \$0,%rdx | 189 | adc \$0,%rdx |
147 | add %rax,$lo0 # ap[j]*bp[i]+tp[j] | 190 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] |
191 | mov (%rsp,$j,8),$lo0 | ||
192 | adc \$0,%rdx | ||
193 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
194 | mov %rdx,$hi1 | ||
195 | |||
196 | .Linner_enter: | ||
197 | mulq $m0 # ap[j]*bp[i] | ||
198 | add %rax,$hi0 | ||
148 | mov ($np,$j,8),%rax | 199 | mov ($np,$j,8),%rax |
149 | adc \$0,%rdx | 200 | adc \$0,%rdx |
201 | add $hi0,$lo0 # ap[j]*bp[i]+tp[j] | ||
150 | mov %rdx,$hi0 | 202 | mov %rdx,$hi0 |
203 | adc \$0,$hi0 | ||
204 | lea 1($j),$j # j++ | ||
151 | 205 | ||
152 | mulq $m1 # np[j]*m1 | 206 | mulq $m1 # np[j]*m1 |
153 | add $hi1,%rax | 207 | cmp $num,$j |
154 | lea 1($j),$j # j++ | 208 | jne .Linner |
155 | adc \$0,%rdx | 209 | |
156 | add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j] | 210 | add %rax,$hi1 |
211 | mov ($ap),%rax # ap[0] | ||
157 | adc \$0,%rdx | 212 | adc \$0,%rdx |
213 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
158 | mov (%rsp,$j,8),$lo0 | 214 | mov (%rsp,$j,8),$lo0 |
159 | cmp $num,$j | 215 | adc \$0,%rdx |
160 | mov %rax,-16(%rsp,$j,8) # tp[j-1] | 216 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] |
161 | mov %rdx,$hi1 | 217 | mov %rdx,$hi1 |
162 | jl .Linner | ||
163 | 218 | ||
164 | xor %rdx,%rdx | 219 | xor %rdx,%rdx |
165 | add $hi0,$hi1 | 220 | add $hi0,$hi1 |
@@ -173,35 +228,449 @@ bn_mul_mont: | |||
173 | cmp $num,$i | 228 | cmp $num,$i |
174 | jl .Louter | 229 | jl .Louter |
175 | 230 | ||
176 | lea (%rsp),$ap # borrow ap for tp | ||
177 | lea -1($num),$j # j=num-1 | ||
178 | |||
179 | mov ($ap),%rax # tp[0] | ||
180 | xor $i,$i # i=0 and clear CF! | 231 | xor $i,$i # i=0 and clear CF! |
232 | mov (%rsp),%rax # tp[0] | ||
233 | lea (%rsp),$ap # borrow ap for tp | ||
234 | mov $num,$j # j=num | ||
181 | jmp .Lsub | 235 | jmp .Lsub |
182 | .align 16 | 236 | .align 16 |
183 | .Lsub: sbb ($np,$i,8),%rax | 237 | .Lsub: sbb ($np,$i,8),%rax |
184 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] | 238 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] |
185 | dec $j # doesn't affect CF! | ||
186 | mov 8($ap,$i,8),%rax # tp[i+1] | 239 | mov 8($ap,$i,8),%rax # tp[i+1] |
187 | lea 1($i),$i # i++ | 240 | lea 1($i),$i # i++ |
188 | jge .Lsub | 241 | dec $j # doesnn't affect CF! |
242 | jnz .Lsub | ||
189 | 243 | ||
190 | sbb \$0,%rax # handle upmost overflow bit | 244 | sbb \$0,%rax # handle upmost overflow bit |
245 | xor $i,$i | ||
191 | and %rax,$ap | 246 | and %rax,$ap |
192 | not %rax | 247 | not %rax |
193 | mov $rp,$np | 248 | mov $rp,$np |
194 | and %rax,$np | 249 | and %rax,$np |
195 | lea -1($num),$j | 250 | mov $num,$j # j=num |
196 | or $np,$ap # ap=borrow?tp:rp | 251 | or $np,$ap # ap=borrow?tp:rp |
197 | .align 16 | 252 | .align 16 |
198 | .Lcopy: # copy or in-place refresh | 253 | .Lcopy: # copy or in-place refresh |
254 | mov ($ap,$i,8),%rax | ||
255 | mov $i,(%rsp,$i,8) # zap temporary vector | ||
256 | mov %rax,($rp,$i,8) # rp[i]=tp[i] | ||
257 | lea 1($i),$i | ||
258 | sub \$1,$j | ||
259 | jnz .Lcopy | ||
260 | |||
261 | mov 8(%rsp,$num,8),%rsi # restore %rsp | ||
262 | mov \$1,%rax | ||
263 | mov (%rsi),%r15 | ||
264 | mov 8(%rsi),%r14 | ||
265 | mov 16(%rsi),%r13 | ||
266 | mov 24(%rsi),%r12 | ||
267 | mov 32(%rsi),%rbp | ||
268 | mov 40(%rsi),%rbx | ||
269 | lea 48(%rsi),%rsp | ||
270 | .Lmul_epilogue: | ||
271 | ret | ||
272 | .size bn_mul_mont,.-bn_mul_mont | ||
273 | ___ | ||
274 | {{{ | ||
275 | my @A=("%r10","%r11"); | ||
276 | my @N=("%r13","%rdi"); | ||
277 | $code.=<<___; | ||
278 | .type bn_mul4x_mont,\@function,6 | ||
279 | .align 16 | ||
280 | bn_mul4x_mont: | ||
281 | .Lmul4x_enter: | ||
282 | push %rbx | ||
283 | push %rbp | ||
284 | push %r12 | ||
285 | push %r13 | ||
286 | push %r14 | ||
287 | push %r15 | ||
288 | |||
289 | mov ${num}d,${num}d | ||
290 | lea 4($num),%r10 | ||
291 | mov %rsp,%r11 | ||
292 | neg %r10 | ||
293 | lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) | ||
294 | and \$-1024,%rsp # minimize TLB usage | ||
295 | |||
296 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp | ||
297 | .Lmul4x_body: | ||
298 | mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp | ||
299 | mov %rdx,%r12 # reassign $bp | ||
300 | ___ | ||
301 | $bp="%r12"; | ||
302 | $code.=<<___; | ||
303 | mov ($n0),$n0 # pull n0[0] value | ||
304 | mov ($bp),$m0 # m0=bp[0] | ||
305 | mov ($ap),%rax | ||
306 | |||
307 | xor $i,$i # i=0 | ||
308 | xor $j,$j # j=0 | ||
309 | |||
310 | mov $n0,$m1 | ||
311 | mulq $m0 # ap[0]*bp[0] | ||
312 | mov %rax,$A[0] | ||
313 | mov ($np),%rax | ||
314 | |||
315 | imulq $A[0],$m1 # "tp[0]"*n0 | ||
316 | mov %rdx,$A[1] | ||
317 | |||
318 | mulq $m1 # np[0]*m1 | ||
319 | add %rax,$A[0] # discarded | ||
320 | mov 8($ap),%rax | ||
321 | adc \$0,%rdx | ||
322 | mov %rdx,$N[1] | ||
323 | |||
324 | mulq $m0 | ||
325 | add %rax,$A[1] | ||
326 | mov 8($np),%rax | ||
327 | adc \$0,%rdx | ||
328 | mov %rdx,$A[0] | ||
329 | |||
330 | mulq $m1 | ||
331 | add %rax,$N[1] | ||
332 | mov 16($ap),%rax | ||
333 | adc \$0,%rdx | ||
334 | add $A[1],$N[1] | ||
335 | lea 4($j),$j # j++ | ||
336 | adc \$0,%rdx | ||
337 | mov $N[1],(%rsp) | ||
338 | mov %rdx,$N[0] | ||
339 | jmp .L1st4x | ||
340 | .align 16 | ||
341 | .L1st4x: | ||
342 | mulq $m0 # ap[j]*bp[0] | ||
343 | add %rax,$A[0] | ||
344 | mov -16($np,$j,8),%rax | ||
345 | adc \$0,%rdx | ||
346 | mov %rdx,$A[1] | ||
347 | |||
348 | mulq $m1 # np[j]*m1 | ||
349 | add %rax,$N[0] | ||
350 | mov -8($ap,$j,8),%rax | ||
351 | adc \$0,%rdx | ||
352 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
353 | adc \$0,%rdx | ||
354 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
355 | mov %rdx,$N[1] | ||
356 | |||
357 | mulq $m0 # ap[j]*bp[0] | ||
358 | add %rax,$A[1] | ||
359 | mov -8($np,$j,8),%rax | ||
360 | adc \$0,%rdx | ||
361 | mov %rdx,$A[0] | ||
362 | |||
363 | mulq $m1 # np[j]*m1 | ||
364 | add %rax,$N[1] | ||
199 | mov ($ap,$j,8),%rax | 365 | mov ($ap,$j,8),%rax |
200 | mov %rax,($rp,$j,8) # rp[i]=tp[i] | 366 | adc \$0,%rdx |
201 | mov $i,(%rsp,$j,8) # zap temporary vector | 367 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] |
368 | adc \$0,%rdx | ||
369 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
370 | mov %rdx,$N[0] | ||
371 | |||
372 | mulq $m0 # ap[j]*bp[0] | ||
373 | add %rax,$A[0] | ||
374 | mov ($np,$j,8),%rax | ||
375 | adc \$0,%rdx | ||
376 | mov %rdx,$A[1] | ||
377 | |||
378 | mulq $m1 # np[j]*m1 | ||
379 | add %rax,$N[0] | ||
380 | mov 8($ap,$j,8),%rax | ||
381 | adc \$0,%rdx | ||
382 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
383 | adc \$0,%rdx | ||
384 | mov $N[0],-8(%rsp,$j,8) # tp[j-1] | ||
385 | mov %rdx,$N[1] | ||
386 | |||
387 | mulq $m0 # ap[j]*bp[0] | ||
388 | add %rax,$A[1] | ||
389 | mov 8($np,$j,8),%rax | ||
390 | adc \$0,%rdx | ||
391 | lea 4($j),$j # j++ | ||
392 | mov %rdx,$A[0] | ||
393 | |||
394 | mulq $m1 # np[j]*m1 | ||
395 | add %rax,$N[1] | ||
396 | mov -16($ap,$j,8),%rax | ||
397 | adc \$0,%rdx | ||
398 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
399 | adc \$0,%rdx | ||
400 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
401 | mov %rdx,$N[0] | ||
402 | cmp $num,$j | ||
403 | jl .L1st4x | ||
404 | |||
405 | mulq $m0 # ap[j]*bp[0] | ||
406 | add %rax,$A[0] | ||
407 | mov -16($np,$j,8),%rax | ||
408 | adc \$0,%rdx | ||
409 | mov %rdx,$A[1] | ||
410 | |||
411 | mulq $m1 # np[j]*m1 | ||
412 | add %rax,$N[0] | ||
413 | mov -8($ap,$j,8),%rax | ||
414 | adc \$0,%rdx | ||
415 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
416 | adc \$0,%rdx | ||
417 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
418 | mov %rdx,$N[1] | ||
419 | |||
420 | mulq $m0 # ap[j]*bp[0] | ||
421 | add %rax,$A[1] | ||
422 | mov -8($np,$j,8),%rax | ||
423 | adc \$0,%rdx | ||
424 | mov %rdx,$A[0] | ||
425 | |||
426 | mulq $m1 # np[j]*m1 | ||
427 | add %rax,$N[1] | ||
428 | mov ($ap),%rax # ap[0] | ||
429 | adc \$0,%rdx | ||
430 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
431 | adc \$0,%rdx | ||
432 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
433 | mov %rdx,$N[0] | ||
434 | |||
435 | xor $N[1],$N[1] | ||
436 | add $A[0],$N[0] | ||
437 | adc \$0,$N[1] | ||
438 | mov $N[0],-8(%rsp,$j,8) | ||
439 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
440 | |||
441 | lea 1($i),$i # i++ | ||
442 | .align 4 | ||
443 | .Louter4x: | ||
444 | mov ($bp,$i,8),$m0 # m0=bp[i] | ||
445 | xor $j,$j # j=0 | ||
446 | mov (%rsp),$A[0] | ||
447 | mov $n0,$m1 | ||
448 | mulq $m0 # ap[0]*bp[i] | ||
449 | add %rax,$A[0] # ap[0]*bp[i]+tp[0] | ||
450 | mov ($np),%rax | ||
451 | adc \$0,%rdx | ||
452 | |||
453 | imulq $A[0],$m1 # tp[0]*n0 | ||
454 | mov %rdx,$A[1] | ||
455 | |||
456 | mulq $m1 # np[0]*m1 | ||
457 | add %rax,$A[0] # "$N[0]", discarded | ||
458 | mov 8($ap),%rax | ||
459 | adc \$0,%rdx | ||
460 | mov %rdx,$N[1] | ||
461 | |||
462 | mulq $m0 # ap[j]*bp[i] | ||
463 | add %rax,$A[1] | ||
464 | mov 8($np),%rax | ||
465 | adc \$0,%rdx | ||
466 | add 8(%rsp),$A[1] # +tp[1] | ||
467 | adc \$0,%rdx | ||
468 | mov %rdx,$A[0] | ||
469 | |||
470 | mulq $m1 # np[j]*m1 | ||
471 | add %rax,$N[1] | ||
472 | mov 16($ap),%rax | ||
473 | adc \$0,%rdx | ||
474 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
475 | lea 4($j),$j # j+=2 | ||
476 | adc \$0,%rdx | ||
477 | mov $N[1],(%rsp) # tp[j-1] | ||
478 | mov %rdx,$N[0] | ||
479 | jmp .Linner4x | ||
480 | .align 16 | ||
481 | .Linner4x: | ||
482 | mulq $m0 # ap[j]*bp[i] | ||
483 | add %rax,$A[0] | ||
484 | mov -16($np,$j,8),%rax | ||
485 | adc \$0,%rdx | ||
486 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
487 | adc \$0,%rdx | ||
488 | mov %rdx,$A[1] | ||
489 | |||
490 | mulq $m1 # np[j]*m1 | ||
491 | add %rax,$N[0] | ||
492 | mov -8($ap,$j,8),%rax | ||
493 | adc \$0,%rdx | ||
494 | add $A[0],$N[0] | ||
495 | adc \$0,%rdx | ||
496 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
497 | mov %rdx,$N[1] | ||
498 | |||
499 | mulq $m0 # ap[j]*bp[i] | ||
500 | add %rax,$A[1] | ||
501 | mov -8($np,$j,8),%rax | ||
502 | adc \$0,%rdx | ||
503 | add -8(%rsp,$j,8),$A[1] | ||
504 | adc \$0,%rdx | ||
505 | mov %rdx,$A[0] | ||
506 | |||
507 | mulq $m1 # np[j]*m1 | ||
508 | add %rax,$N[1] | ||
509 | mov ($ap,$j,8),%rax | ||
510 | adc \$0,%rdx | ||
511 | add $A[1],$N[1] | ||
512 | adc \$0,%rdx | ||
513 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
514 | mov %rdx,$N[0] | ||
515 | |||
516 | mulq $m0 # ap[j]*bp[i] | ||
517 | add %rax,$A[0] | ||
518 | mov ($np,$j,8),%rax | ||
519 | adc \$0,%rdx | ||
520 | add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
521 | adc \$0,%rdx | ||
522 | mov %rdx,$A[1] | ||
523 | |||
524 | mulq $m1 # np[j]*m1 | ||
525 | add %rax,$N[0] | ||
526 | mov 8($ap,$j,8),%rax | ||
527 | adc \$0,%rdx | ||
528 | add $A[0],$N[0] | ||
529 | adc \$0,%rdx | ||
530 | mov $N[0],-8(%rsp,$j,8) # tp[j-1] | ||
531 | mov %rdx,$N[1] | ||
532 | |||
533 | mulq $m0 # ap[j]*bp[i] | ||
534 | add %rax,$A[1] | ||
535 | mov 8($np,$j,8),%rax | ||
536 | adc \$0,%rdx | ||
537 | add 8(%rsp,$j,8),$A[1] | ||
538 | adc \$0,%rdx | ||
539 | lea 4($j),$j # j++ | ||
540 | mov %rdx,$A[0] | ||
541 | |||
542 | mulq $m1 # np[j]*m1 | ||
543 | add %rax,$N[1] | ||
544 | mov -16($ap,$j,8),%rax | ||
545 | adc \$0,%rdx | ||
546 | add $A[1],$N[1] | ||
547 | adc \$0,%rdx | ||
548 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
549 | mov %rdx,$N[0] | ||
550 | cmp $num,$j | ||
551 | jl .Linner4x | ||
552 | |||
553 | mulq $m0 # ap[j]*bp[i] | ||
554 | add %rax,$A[0] | ||
555 | mov -16($np,$j,8),%rax | ||
556 | adc \$0,%rdx | ||
557 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
558 | adc \$0,%rdx | ||
559 | mov %rdx,$A[1] | ||
560 | |||
561 | mulq $m1 # np[j]*m1 | ||
562 | add %rax,$N[0] | ||
563 | mov -8($ap,$j,8),%rax | ||
564 | adc \$0,%rdx | ||
565 | add $A[0],$N[0] | ||
566 | adc \$0,%rdx | ||
567 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
568 | mov %rdx,$N[1] | ||
569 | |||
570 | mulq $m0 # ap[j]*bp[i] | ||
571 | add %rax,$A[1] | ||
572 | mov -8($np,$j,8),%rax | ||
573 | adc \$0,%rdx | ||
574 | add -8(%rsp,$j,8),$A[1] | ||
575 | adc \$0,%rdx | ||
576 | lea 1($i),$i # i++ | ||
577 | mov %rdx,$A[0] | ||
578 | |||
579 | mulq $m1 # np[j]*m1 | ||
580 | add %rax,$N[1] | ||
581 | mov ($ap),%rax # ap[0] | ||
582 | adc \$0,%rdx | ||
583 | add $A[1],$N[1] | ||
584 | adc \$0,%rdx | ||
585 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
586 | mov %rdx,$N[0] | ||
587 | |||
588 | xor $N[1],$N[1] | ||
589 | add $A[0],$N[0] | ||
590 | adc \$0,$N[1] | ||
591 | add (%rsp,$num,8),$N[0] # pull upmost overflow bit | ||
592 | adc \$0,$N[1] | ||
593 | mov $N[0],-8(%rsp,$j,8) | ||
594 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
595 | |||
596 | cmp $num,$i | ||
597 | jl .Louter4x | ||
598 | ___ | ||
599 | { | ||
600 | my @ri=("%rax","%rdx",$m0,$m1); | ||
601 | $code.=<<___; | ||
602 | mov 16(%rsp,$num,8),$rp # restore $rp | ||
603 | mov 0(%rsp),@ri[0] # tp[0] | ||
604 | pxor %xmm0,%xmm0 | ||
605 | mov 8(%rsp),@ri[1] # tp[1] | ||
606 | shr \$2,$num # num/=4 | ||
607 | lea (%rsp),$ap # borrow ap for tp | ||
608 | xor $i,$i # i=0 and clear CF! | ||
609 | |||
610 | sub 0($np),@ri[0] | ||
611 | mov 16($ap),@ri[2] # tp[2] | ||
612 | mov 24($ap),@ri[3] # tp[3] | ||
613 | sbb 8($np),@ri[1] | ||
614 | lea -1($num),$j # j=num/4-1 | ||
615 | jmp .Lsub4x | ||
616 | .align 16 | ||
617 | .Lsub4x: | ||
618 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
619 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
620 | sbb 16($np,$i,8),@ri[2] | ||
621 | mov 32($ap,$i,8),@ri[0] # tp[i+1] | ||
622 | mov 40($ap,$i,8),@ri[1] | ||
623 | sbb 24($np,$i,8),@ri[3] | ||
624 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
625 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
626 | sbb 32($np,$i,8),@ri[0] | ||
627 | mov 48($ap,$i,8),@ri[2] | ||
628 | mov 56($ap,$i,8),@ri[3] | ||
629 | sbb 40($np,$i,8),@ri[1] | ||
630 | lea 4($i),$i # i++ | ||
631 | dec $j # doesnn't affect CF! | ||
632 | jnz .Lsub4x | ||
633 | |||
634 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
635 | mov 32($ap,$i,8),@ri[0] # load overflow bit | ||
636 | sbb 16($np,$i,8),@ri[2] | ||
637 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
638 | sbb 24($np,$i,8),@ri[3] | ||
639 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
640 | |||
641 | sbb \$0,@ri[0] # handle upmost overflow bit | ||
642 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
643 | xor $i,$i # i=0 | ||
644 | and @ri[0],$ap | ||
645 | not @ri[0] | ||
646 | mov $rp,$np | ||
647 | and @ri[0],$np | ||
648 | lea -1($num),$j | ||
649 | or $np,$ap # ap=borrow?tp:rp | ||
650 | |||
651 | movdqu ($ap),%xmm1 | ||
652 | movdqa %xmm0,(%rsp) | ||
653 | movdqu %xmm1,($rp) | ||
654 | jmp .Lcopy4x | ||
655 | .align 16 | ||
656 | .Lcopy4x: # copy or in-place refresh | ||
657 | movdqu 16($ap,$i),%xmm2 | ||
658 | movdqu 32($ap,$i),%xmm1 | ||
659 | movdqa %xmm0,16(%rsp,$i) | ||
660 | movdqu %xmm2,16($rp,$i) | ||
661 | movdqa %xmm0,32(%rsp,$i) | ||
662 | movdqu %xmm1,32($rp,$i) | ||
663 | lea 32($i),$i | ||
202 | dec $j | 664 | dec $j |
203 | jge .Lcopy | 665 | jnz .Lcopy4x |
204 | 666 | ||
667 | shl \$2,$num | ||
668 | movdqu 16($ap,$i),%xmm2 | ||
669 | movdqa %xmm0,16(%rsp,$i) | ||
670 | movdqu %xmm2,16($rp,$i) | ||
671 | ___ | ||
672 | } | ||
673 | $code.=<<___; | ||
205 | mov 8(%rsp,$num,8),%rsi # restore %rsp | 674 | mov 8(%rsp,$num,8),%rsi # restore %rsp |
206 | mov \$1,%rax | 675 | mov \$1,%rax |
207 | mov (%rsi),%r15 | 676 | mov (%rsi),%r15 |
@@ -211,9 +680,823 @@ bn_mul_mont: | |||
211 | mov 32(%rsi),%rbp | 680 | mov 32(%rsi),%rbp |
212 | mov 40(%rsi),%rbx | 681 | mov 40(%rsi),%rbx |
213 | lea 48(%rsi),%rsp | 682 | lea 48(%rsi),%rsp |
214 | .Lepilogue: | 683 | .Lmul4x_epilogue: |
215 | ret | 684 | ret |
216 | .size bn_mul_mont,.-bn_mul_mont | 685 | .size bn_mul4x_mont,.-bn_mul4x_mont |
686 | ___ | ||
687 | }}} | ||
688 | {{{ | ||
689 | ###################################################################### | ||
690 | # void bn_sqr4x_mont( | ||
691 | my $rptr="%rdi"; # const BN_ULONG *rptr, | ||
692 | my $aptr="%rsi"; # const BN_ULONG *aptr, | ||
693 | my $bptr="%rdx"; # not used | ||
694 | my $nptr="%rcx"; # const BN_ULONG *nptr, | ||
695 | my $n0 ="%r8"; # const BN_ULONG *n0); | ||
696 | my $num ="%r9"; # int num, has to be divisible by 4 and | ||
697 | # not less than 8 | ||
698 | |||
699 | my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); | ||
700 | my @A0=("%r10","%r11"); | ||
701 | my @A1=("%r12","%r13"); | ||
702 | my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); | ||
703 | |||
704 | $code.=<<___; | ||
705 | .type bn_sqr4x_mont,\@function,6 | ||
706 | .align 16 | ||
707 | bn_sqr4x_mont: | ||
708 | .Lsqr4x_enter: | ||
709 | push %rbx | ||
710 | push %rbp | ||
711 | push %r12 | ||
712 | push %r13 | ||
713 | push %r14 | ||
714 | push %r15 | ||
715 | |||
716 | shl \$3,${num}d # convert $num to bytes | ||
717 | xor %r10,%r10 | ||
718 | mov %rsp,%r11 # put aside %rsp | ||
719 | sub $num,%r10 # -$num | ||
720 | mov ($n0),$n0 # *n0 | ||
721 | lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num) | ||
722 | and \$-1024,%rsp # minimize TLB usage | ||
723 | ############################################################## | ||
724 | # Stack layout | ||
725 | # | ||
726 | # +0 saved $num, used in reduction section | ||
727 | # +8 &t[2*$num], used in reduction section | ||
728 | # +32 saved $rptr | ||
729 | # +40 saved $nptr | ||
730 | # +48 saved *n0 | ||
731 | # +56 saved %rsp | ||
732 | # +64 t[2*$num] | ||
733 | # | ||
734 | mov $rptr,32(%rsp) # save $rptr | ||
735 | mov $nptr,40(%rsp) | ||
736 | mov $n0, 48(%rsp) | ||
737 | mov %r11, 56(%rsp) # save original %rsp | ||
738 | .Lsqr4x_body: | ||
739 | ############################################################## | ||
740 | # Squaring part: | ||
741 | # | ||
742 | # a) multiply-n-add everything but a[i]*a[i]; | ||
743 | # b) shift result of a) by 1 to the left and accumulate | ||
744 | # a[i]*a[i] products; | ||
745 | # | ||
746 | lea 32(%r10),$i # $i=-($num-32) | ||
747 | lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] | ||
748 | |||
749 | mov $num,$j # $j=$num | ||
750 | |||
751 | # comments apply to $num==8 case | ||
752 | mov -32($aptr,$i),$a0 # a[0] | ||
753 | lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | ||
754 | mov -24($aptr,$i),%rax # a[1] | ||
755 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | ||
756 | mov -16($aptr,$i),$ai # a[2] | ||
757 | mov %rax,$a1 | ||
758 | |||
759 | mul $a0 # a[1]*a[0] | ||
760 | mov %rax,$A0[0] # a[1]*a[0] | ||
761 | mov $ai,%rax # a[2] | ||
762 | mov %rdx,$A0[1] | ||
763 | mov $A0[0],-24($tptr,$i) # t[1] | ||
764 | |||
765 | xor $A0[0],$A0[0] | ||
766 | mul $a0 # a[2]*a[0] | ||
767 | add %rax,$A0[1] | ||
768 | mov $ai,%rax | ||
769 | adc %rdx,$A0[0] | ||
770 | mov $A0[1],-16($tptr,$i) # t[2] | ||
771 | |||
772 | lea -16($i),$j # j=-16 | ||
773 | |||
774 | |||
775 | mov 8($aptr,$j),$ai # a[3] | ||
776 | mul $a1 # a[2]*a[1] | ||
777 | mov %rax,$A1[0] # a[2]*a[1]+t[3] | ||
778 | mov $ai,%rax | ||
779 | mov %rdx,$A1[1] | ||
780 | |||
781 | xor $A0[1],$A0[1] | ||
782 | add $A1[0],$A0[0] | ||
783 | lea 16($j),$j | ||
784 | adc \$0,$A0[1] | ||
785 | mul $a0 # a[3]*a[0] | ||
786 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | ||
787 | mov $ai,%rax | ||
788 | adc %rdx,$A0[1] | ||
789 | mov $A0[0],-8($tptr,$j) # t[3] | ||
790 | jmp .Lsqr4x_1st | ||
791 | |||
792 | .align 16 | ||
793 | .Lsqr4x_1st: | ||
794 | mov ($aptr,$j),$ai # a[4] | ||
795 | xor $A1[0],$A1[0] | ||
796 | mul $a1 # a[3]*a[1] | ||
797 | add %rax,$A1[1] # a[3]*a[1]+t[4] | ||
798 | mov $ai,%rax | ||
799 | adc %rdx,$A1[0] | ||
800 | |||
801 | xor $A0[0],$A0[0] | ||
802 | add $A1[1],$A0[1] | ||
803 | adc \$0,$A0[0] | ||
804 | mul $a0 # a[4]*a[0] | ||
805 | add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] | ||
806 | mov $ai,%rax # a[3] | ||
807 | adc %rdx,$A0[0] | ||
808 | mov $A0[1],($tptr,$j) # t[4] | ||
809 | |||
810 | |||
811 | mov 8($aptr,$j),$ai # a[5] | ||
812 | xor $A1[1],$A1[1] | ||
813 | mul $a1 # a[4]*a[3] | ||
814 | add %rax,$A1[0] # a[4]*a[3]+t[5] | ||
815 | mov $ai,%rax | ||
816 | adc %rdx,$A1[1] | ||
817 | |||
818 | xor $A0[1],$A0[1] | ||
819 | add $A1[0],$A0[0] | ||
820 | adc \$0,$A0[1] | ||
821 | mul $a0 # a[5]*a[2] | ||
822 | add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] | ||
823 | mov $ai,%rax | ||
824 | adc %rdx,$A0[1] | ||
825 | mov $A0[0],8($tptr,$j) # t[5] | ||
826 | |||
827 | mov 16($aptr,$j),$ai # a[6] | ||
828 | xor $A1[0],$A1[0] | ||
829 | mul $a1 # a[5]*a[3] | ||
830 | add %rax,$A1[1] # a[5]*a[3]+t[6] | ||
831 | mov $ai,%rax | ||
832 | adc %rdx,$A1[0] | ||
833 | |||
834 | xor $A0[0],$A0[0] | ||
835 | add $A1[1],$A0[1] | ||
836 | adc \$0,$A0[0] | ||
837 | mul $a0 # a[6]*a[2] | ||
838 | add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] | ||
839 | mov $ai,%rax # a[3] | ||
840 | adc %rdx,$A0[0] | ||
841 | mov $A0[1],16($tptr,$j) # t[6] | ||
842 | |||
843 | |||
844 | mov 24($aptr,$j),$ai # a[7] | ||
845 | xor $A1[1],$A1[1] | ||
846 | mul $a1 # a[6]*a[5] | ||
847 | add %rax,$A1[0] # a[6]*a[5]+t[7] | ||
848 | mov $ai,%rax | ||
849 | adc %rdx,$A1[1] | ||
850 | |||
851 | xor $A0[1],$A0[1] | ||
852 | add $A1[0],$A0[0] | ||
853 | lea 32($j),$j | ||
854 | adc \$0,$A0[1] | ||
855 | mul $a0 # a[7]*a[4] | ||
856 | add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] | ||
857 | mov $ai,%rax | ||
858 | adc %rdx,$A0[1] | ||
859 | mov $A0[0],-8($tptr,$j) # t[7] | ||
860 | |||
861 | cmp \$0,$j | ||
862 | jne .Lsqr4x_1st | ||
863 | |||
864 | xor $A1[0],$A1[0] | ||
865 | add $A0[1],$A1[1] | ||
866 | adc \$0,$A1[0] | ||
867 | mul $a1 # a[7]*a[5] | ||
868 | add %rax,$A1[1] | ||
869 | adc %rdx,$A1[0] | ||
870 | |||
871 | mov $A1[1],($tptr) # t[8] | ||
872 | lea 16($i),$i | ||
873 | mov $A1[0],8($tptr) # t[9] | ||
874 | jmp .Lsqr4x_outer | ||
875 | |||
876 | .align 16 | ||
877 | .Lsqr4x_outer: # comments apply to $num==6 case | ||
878 | mov -32($aptr,$i),$a0 # a[0] | ||
879 | lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | ||
880 | mov -24($aptr,$i),%rax # a[1] | ||
881 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | ||
882 | mov -16($aptr,$i),$ai # a[2] | ||
883 | mov %rax,$a1 | ||
884 | |||
885 | mov -24($tptr,$i),$A0[0] # t[1] | ||
886 | xor $A0[1],$A0[1] | ||
887 | mul $a0 # a[1]*a[0] | ||
888 | add %rax,$A0[0] # a[1]*a[0]+t[1] | ||
889 | mov $ai,%rax # a[2] | ||
890 | adc %rdx,$A0[1] | ||
891 | mov $A0[0],-24($tptr,$i) # t[1] | ||
892 | |||
893 | xor $A0[0],$A0[0] | ||
894 | add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] | ||
895 | adc \$0,$A0[0] | ||
896 | mul $a0 # a[2]*a[0] | ||
897 | add %rax,$A0[1] | ||
898 | mov $ai,%rax | ||
899 | adc %rdx,$A0[0] | ||
900 | mov $A0[1],-16($tptr,$i) # t[2] | ||
901 | |||
902 | lea -16($i),$j # j=-16 | ||
903 | xor $A1[0],$A1[0] | ||
904 | |||
905 | |||
906 | mov 8($aptr,$j),$ai # a[3] | ||
907 | xor $A1[1],$A1[1] | ||
908 | add 8($tptr,$j),$A1[0] | ||
909 | adc \$0,$A1[1] | ||
910 | mul $a1 # a[2]*a[1] | ||
911 | add %rax,$A1[0] # a[2]*a[1]+t[3] | ||
912 | mov $ai,%rax | ||
913 | adc %rdx,$A1[1] | ||
914 | |||
915 | xor $A0[1],$A0[1] | ||
916 | add $A1[0],$A0[0] | ||
917 | adc \$0,$A0[1] | ||
918 | mul $a0 # a[3]*a[0] | ||
919 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | ||
920 | mov $ai,%rax | ||
921 | adc %rdx,$A0[1] | ||
922 | mov $A0[0],8($tptr,$j) # t[3] | ||
923 | |||
924 | lea 16($j),$j | ||
925 | jmp .Lsqr4x_inner | ||
926 | |||
927 | .align 16 | ||
928 | .Lsqr4x_inner: | ||
929 | mov ($aptr,$j),$ai # a[4] | ||
930 | xor $A1[0],$A1[0] | ||
931 | add ($tptr,$j),$A1[1] | ||
932 | adc \$0,$A1[0] | ||
933 | mul $a1 # a[3]*a[1] | ||
934 | add %rax,$A1[1] # a[3]*a[1]+t[4] | ||
935 | mov $ai,%rax | ||
936 | adc %rdx,$A1[0] | ||
937 | |||
938 | xor $A0[0],$A0[0] | ||
939 | add $A1[1],$A0[1] | ||
940 | adc \$0,$A0[0] | ||
941 | mul $a0 # a[4]*a[0] | ||
942 | add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] | ||
943 | mov $ai,%rax # a[3] | ||
944 | adc %rdx,$A0[0] | ||
945 | mov $A0[1],($tptr,$j) # t[4] | ||
946 | |||
947 | mov 8($aptr,$j),$ai # a[5] | ||
948 | xor $A1[1],$A1[1] | ||
949 | add 8($tptr,$j),$A1[0] | ||
950 | adc \$0,$A1[1] | ||
951 | mul $a1 # a[4]*a[3] | ||
952 | add %rax,$A1[0] # a[4]*a[3]+t[5] | ||
953 | mov $ai,%rax | ||
954 | adc %rdx,$A1[1] | ||
955 | |||
956 | xor $A0[1],$A0[1] | ||
957 | add $A1[0],$A0[0] | ||
958 | lea 16($j),$j # j++ | ||
959 | adc \$0,$A0[1] | ||
960 | mul $a0 # a[5]*a[2] | ||
961 | add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] | ||
962 | mov $ai,%rax | ||
963 | adc %rdx,$A0[1] | ||
964 | mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below | ||
965 | |||
966 | cmp \$0,$j | ||
967 | jne .Lsqr4x_inner | ||
968 | |||
969 | xor $A1[0],$A1[0] | ||
970 | add $A0[1],$A1[1] | ||
971 | adc \$0,$A1[0] | ||
972 | mul $a1 # a[5]*a[3] | ||
973 | add %rax,$A1[1] | ||
974 | adc %rdx,$A1[0] | ||
975 | |||
976 | mov $A1[1],($tptr) # t[6], "preloaded t[2]" below | ||
977 | mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below | ||
978 | |||
979 | add \$16,$i | ||
980 | jnz .Lsqr4x_outer | ||
981 | |||
982 | # comments apply to $num==4 case | ||
983 | mov -32($aptr),$a0 # a[0] | ||
984 | lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] | ||
985 | mov -24($aptr),%rax # a[1] | ||
986 | lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] | ||
987 | mov -16($aptr),$ai # a[2] | ||
988 | mov %rax,$a1 | ||
989 | |||
990 | xor $A0[1],$A0[1] | ||
991 | mul $a0 # a[1]*a[0] | ||
992 | add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] | ||
993 | mov $ai,%rax # a[2] | ||
994 | adc %rdx,$A0[1] | ||
995 | mov $A0[0],-24($tptr) # t[1] | ||
996 | |||
997 | xor $A0[0],$A0[0] | ||
998 | add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] | ||
999 | adc \$0,$A0[0] | ||
1000 | mul $a0 # a[2]*a[0] | ||
1001 | add %rax,$A0[1] | ||
1002 | mov $ai,%rax | ||
1003 | adc %rdx,$A0[0] | ||
1004 | mov $A0[1],-16($tptr) # t[2] | ||
1005 | |||
1006 | mov -8($aptr),$ai # a[3] | ||
1007 | mul $a1 # a[2]*a[1] | ||
1008 | add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] | ||
1009 | mov $ai,%rax | ||
1010 | adc \$0,%rdx | ||
1011 | |||
1012 | xor $A0[1],$A0[1] | ||
1013 | add $A1[0],$A0[0] | ||
1014 | mov %rdx,$A1[1] | ||
1015 | adc \$0,$A0[1] | ||
1016 | mul $a0 # a[3]*a[0] | ||
1017 | add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] | ||
1018 | mov $ai,%rax | ||
1019 | adc %rdx,$A0[1] | ||
1020 | mov $A0[0],-8($tptr) # t[3] | ||
1021 | |||
1022 | xor $A1[0],$A1[0] | ||
1023 | add $A0[1],$A1[1] | ||
1024 | adc \$0,$A1[0] | ||
1025 | mul $a1 # a[3]*a[1] | ||
1026 | add %rax,$A1[1] | ||
1027 | mov -16($aptr),%rax # a[2] | ||
1028 | adc %rdx,$A1[0] | ||
1029 | |||
1030 | mov $A1[1],($tptr) # t[4] | ||
1031 | mov $A1[0],8($tptr) # t[5] | ||
1032 | |||
1033 | mul $ai # a[2]*a[3] | ||
1034 | ___ | ||
1035 | { | ||
1036 | my ($shift,$carry)=($a0,$a1); | ||
1037 | my @S=(@A1,$ai,$n0); | ||
1038 | $code.=<<___; | ||
1039 | add \$16,$i | ||
1040 | xor $shift,$shift | ||
1041 | sub $num,$i # $i=16-$num | ||
1042 | xor $carry,$carry | ||
1043 | |||
1044 | add $A1[0],%rax # t[5] | ||
1045 | adc \$0,%rdx | ||
1046 | mov %rax,8($tptr) # t[5] | ||
1047 | mov %rdx,16($tptr) # t[6] | ||
1048 | mov $carry,24($tptr) # t[7] | ||
1049 | |||
1050 | mov -16($aptr,$i),%rax # a[0] | ||
1051 | lea 64(%rsp,$num,2),$tptr | ||
1052 | xor $A0[0],$A0[0] # t[0] | ||
1053 | mov -24($tptr,$i,2),$A0[1] # t[1] | ||
1054 | |||
1055 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
1056 | shr \$63,$A0[0] | ||
1057 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
1058 | shr \$63,$A0[1] | ||
1059 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
1060 | mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
1061 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
1062 | mul %rax # a[i]*a[i] | ||
1063 | neg $carry # mov $carry,cf | ||
1064 | mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
1065 | adc %rax,$S[0] | ||
1066 | mov -8($aptr,$i),%rax # a[i+1] # prefetch | ||
1067 | mov $S[0],-32($tptr,$i,2) | ||
1068 | adc %rdx,$S[1] | ||
1069 | |||
1070 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | ||
1071 | mov $S[1],-24($tptr,$i,2) | ||
1072 | sbb $carry,$carry # mov cf,$carry | ||
1073 | shr \$63,$A0[0] | ||
1074 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
1075 | shr \$63,$A0[1] | ||
1076 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
1077 | mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
1078 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
1079 | mul %rax # a[i]*a[i] | ||
1080 | neg $carry # mov $carry,cf | ||
1081 | mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
1082 | adc %rax,$S[2] | ||
1083 | mov 0($aptr,$i),%rax # a[i+1] # prefetch | ||
1084 | mov $S[2],-16($tptr,$i,2) | ||
1085 | adc %rdx,$S[3] | ||
1086 | lea 16($i),$i | ||
1087 | mov $S[3],-40($tptr,$i,2) | ||
1088 | sbb $carry,$carry # mov cf,$carry | ||
1089 | jmp .Lsqr4x_shift_n_add | ||
1090 | |||
1091 | .align 16 | ||
1092 | .Lsqr4x_shift_n_add: | ||
1093 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
1094 | shr \$63,$A0[0] | ||
1095 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
1096 | shr \$63,$A0[1] | ||
1097 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
1098 | mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
1099 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
1100 | mul %rax # a[i]*a[i] | ||
1101 | neg $carry # mov $carry,cf | ||
1102 | mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
1103 | adc %rax,$S[0] | ||
1104 | mov -8($aptr,$i),%rax # a[i+1] # prefetch | ||
1105 | mov $S[0],-32($tptr,$i,2) | ||
1106 | adc %rdx,$S[1] | ||
1107 | |||
1108 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | ||
1109 | mov $S[1],-24($tptr,$i,2) | ||
1110 | sbb $carry,$carry # mov cf,$carry | ||
1111 | shr \$63,$A0[0] | ||
1112 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
1113 | shr \$63,$A0[1] | ||
1114 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
1115 | mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
1116 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
1117 | mul %rax # a[i]*a[i] | ||
1118 | neg $carry # mov $carry,cf | ||
1119 | mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
1120 | adc %rax,$S[2] | ||
1121 | mov 0($aptr,$i),%rax # a[i+1] # prefetch | ||
1122 | mov $S[2],-16($tptr,$i,2) | ||
1123 | adc %rdx,$S[3] | ||
1124 | |||
1125 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
1126 | mov $S[3],-8($tptr,$i,2) | ||
1127 | sbb $carry,$carry # mov cf,$carry | ||
1128 | shr \$63,$A0[0] | ||
1129 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
1130 | shr \$63,$A0[1] | ||
1131 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
1132 | mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
1133 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
1134 | mul %rax # a[i]*a[i] | ||
1135 | neg $carry # mov $carry,cf | ||
1136 | mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
1137 | adc %rax,$S[0] | ||
1138 | mov 8($aptr,$i),%rax # a[i+1] # prefetch | ||
1139 | mov $S[0],0($tptr,$i,2) | ||
1140 | adc %rdx,$S[1] | ||
1141 | |||
1142 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift | ||
1143 | mov $S[1],8($tptr,$i,2) | ||
1144 | sbb $carry,$carry # mov cf,$carry | ||
1145 | shr \$63,$A0[0] | ||
1146 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
1147 | shr \$63,$A0[1] | ||
1148 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
1149 | mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch | ||
1150 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
1151 | mul %rax # a[i]*a[i] | ||
1152 | neg $carry # mov $carry,cf | ||
1153 | mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch | ||
1154 | adc %rax,$S[2] | ||
1155 | mov 16($aptr,$i),%rax # a[i+1] # prefetch | ||
1156 | mov $S[2],16($tptr,$i,2) | ||
1157 | adc %rdx,$S[3] | ||
1158 | mov $S[3],24($tptr,$i,2) | ||
1159 | sbb $carry,$carry # mov cf,$carry | ||
1160 | add \$32,$i | ||
1161 | jnz .Lsqr4x_shift_n_add | ||
1162 | |||
1163 | lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift | ||
1164 | shr \$63,$A0[0] | ||
1165 | lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | | ||
1166 | shr \$63,$A0[1] | ||
1167 | or $A0[0],$S[1] # | t[2*i]>>63 | ||
1168 | mov -16($tptr),$A0[0] # t[2*i+2] # prefetch | ||
1169 | mov $A0[1],$shift # shift=t[2*i+1]>>63 | ||
1170 | mul %rax # a[i]*a[i] | ||
1171 | neg $carry # mov $carry,cf | ||
1172 | mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch | ||
1173 | adc %rax,$S[0] | ||
1174 | mov -8($aptr),%rax # a[i+1] # prefetch | ||
1175 | mov $S[0],-32($tptr) | ||
1176 | adc %rdx,$S[1] | ||
1177 | |||
1178 | lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift | ||
1179 | mov $S[1],-24($tptr) | ||
1180 | sbb $carry,$carry # mov cf,$carry | ||
1181 | shr \$63,$A0[0] | ||
1182 | lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | | ||
1183 | shr \$63,$A0[1] | ||
1184 | or $A0[0],$S[3] # | t[2*i]>>63 | ||
1185 | mul %rax # a[i]*a[i] | ||
1186 | neg $carry # mov $carry,cf | ||
1187 | adc %rax,$S[2] | ||
1188 | adc %rdx,$S[3] | ||
1189 | mov $S[2],-16($tptr) | ||
1190 | mov $S[3],-8($tptr) | ||
1191 | ___ | ||
1192 | } | ||
1193 | ############################################################## | ||
1194 | # Montgomery reduction part, "word-by-word" algorithm. | ||
1195 | # | ||
1196 | { | ||
1197 | my ($topbit,$nptr)=("%rbp",$aptr); | ||
1198 | my ($m0,$m1)=($a0,$a1); | ||
1199 | my @Ni=("%rbx","%r9"); | ||
1200 | $code.=<<___; | ||
1201 | mov 40(%rsp),$nptr # restore $nptr | ||
1202 | mov 48(%rsp),$n0 # restore *n0 | ||
1203 | xor $j,$j | ||
1204 | mov $num,0(%rsp) # save $num | ||
1205 | sub $num,$j # $j=-$num | ||
1206 | mov 64(%rsp),$A0[0] # t[0] # modsched # | ||
1207 | mov $n0,$m0 # # modsched # | ||
1208 | lea 64(%rsp,$num,2),%rax # end of t[] buffer | ||
1209 | lea 64(%rsp,$num),$tptr # end of t[] window | ||
1210 | mov %rax,8(%rsp) # save end of t[] buffer | ||
1211 | lea ($nptr,$num),$nptr # end of n[] buffer | ||
1212 | xor $topbit,$topbit # $topbit=0 | ||
1213 | |||
1214 | mov 0($nptr,$j),%rax # n[0] # modsched # | ||
1215 | mov 8($nptr,$j),$Ni[1] # n[1] # modsched # | ||
1216 | imulq $A0[0],$m0 # m0=t[0]*n0 # modsched # | ||
1217 | mov %rax,$Ni[0] # # modsched # | ||
1218 | jmp .Lsqr4x_mont_outer | ||
1219 | |||
1220 | .align 16 | ||
1221 | .Lsqr4x_mont_outer: | ||
1222 | xor $A0[1],$A0[1] | ||
1223 | mul $m0 # n[0]*m0 | ||
1224 | add %rax,$A0[0] # n[0]*m0+t[0] | ||
1225 | mov $Ni[1],%rax | ||
1226 | adc %rdx,$A0[1] | ||
1227 | mov $n0,$m1 | ||
1228 | |||
1229 | xor $A0[0],$A0[0] | ||
1230 | add 8($tptr,$j),$A0[1] | ||
1231 | adc \$0,$A0[0] | ||
1232 | mul $m0 # n[1]*m0 | ||
1233 | add %rax,$A0[1] # n[1]*m0+t[1] | ||
1234 | mov $Ni[0],%rax | ||
1235 | adc %rdx,$A0[0] | ||
1236 | |||
1237 | imulq $A0[1],$m1 | ||
1238 | |||
1239 | mov 16($nptr,$j),$Ni[0] # n[2] | ||
1240 | xor $A1[1],$A1[1] | ||
1241 | add $A0[1],$A1[0] | ||
1242 | adc \$0,$A1[1] | ||
1243 | mul $m1 # n[0]*m1 | ||
1244 | add %rax,$A1[0] # n[0]*m1+"t[1]" | ||
1245 | mov $Ni[0],%rax | ||
1246 | adc %rdx,$A1[1] | ||
1247 | mov $A1[0],8($tptr,$j) # "t[1]" | ||
1248 | |||
1249 | xor $A0[1],$A0[1] | ||
1250 | add 16($tptr,$j),$A0[0] | ||
1251 | adc \$0,$A0[1] | ||
1252 | mul $m0 # n[2]*m0 | ||
1253 | add %rax,$A0[0] # n[2]*m0+t[2] | ||
1254 | mov $Ni[1],%rax | ||
1255 | adc %rdx,$A0[1] | ||
1256 | |||
1257 | mov 24($nptr,$j),$Ni[1] # n[3] | ||
1258 | xor $A1[0],$A1[0] | ||
1259 | add $A0[0],$A1[1] | ||
1260 | adc \$0,$A1[0] | ||
1261 | mul $m1 # n[1]*m1 | ||
1262 | add %rax,$A1[1] # n[1]*m1+"t[2]" | ||
1263 | mov $Ni[1],%rax | ||
1264 | adc %rdx,$A1[0] | ||
1265 | mov $A1[1],16($tptr,$j) # "t[2]" | ||
1266 | |||
1267 | xor $A0[0],$A0[0] | ||
1268 | add 24($tptr,$j),$A0[1] | ||
1269 | lea 32($j),$j | ||
1270 | adc \$0,$A0[0] | ||
1271 | mul $m0 # n[3]*m0 | ||
1272 | add %rax,$A0[1] # n[3]*m0+t[3] | ||
1273 | mov $Ni[0],%rax | ||
1274 | adc %rdx,$A0[0] | ||
1275 | jmp .Lsqr4x_mont_inner | ||
1276 | |||
1277 | .align 16 | ||
1278 | .Lsqr4x_mont_inner: | ||
1279 | mov ($nptr,$j),$Ni[0] # n[4] | ||
1280 | xor $A1[1],$A1[1] | ||
1281 | add $A0[1],$A1[0] | ||
1282 | adc \$0,$A1[1] | ||
1283 | mul $m1 # n[2]*m1 | ||
1284 | add %rax,$A1[0] # n[2]*m1+"t[3]" | ||
1285 | mov $Ni[0],%rax | ||
1286 | adc %rdx,$A1[1] | ||
1287 | mov $A1[0],-8($tptr,$j) # "t[3]" | ||
1288 | |||
1289 | xor $A0[1],$A0[1] | ||
1290 | add ($tptr,$j),$A0[0] | ||
1291 | adc \$0,$A0[1] | ||
1292 | mul $m0 # n[4]*m0 | ||
1293 | add %rax,$A0[0] # n[4]*m0+t[4] | ||
1294 | mov $Ni[1],%rax | ||
1295 | adc %rdx,$A0[1] | ||
1296 | |||
1297 | mov 8($nptr,$j),$Ni[1] # n[5] | ||
1298 | xor $A1[0],$A1[0] | ||
1299 | add $A0[0],$A1[1] | ||
1300 | adc \$0,$A1[0] | ||
1301 | mul $m1 # n[3]*m1 | ||
1302 | add %rax,$A1[1] # n[3]*m1+"t[4]" | ||
1303 | mov $Ni[1],%rax | ||
1304 | adc %rdx,$A1[0] | ||
1305 | mov $A1[1],($tptr,$j) # "t[4]" | ||
1306 | |||
1307 | xor $A0[0],$A0[0] | ||
1308 | add 8($tptr,$j),$A0[1] | ||
1309 | adc \$0,$A0[0] | ||
1310 | mul $m0 # n[5]*m0 | ||
1311 | add %rax,$A0[1] # n[5]*m0+t[5] | ||
1312 | mov $Ni[0],%rax | ||
1313 | adc %rdx,$A0[0] | ||
1314 | |||
1315 | |||
1316 | mov 16($nptr,$j),$Ni[0] # n[6] | ||
1317 | xor $A1[1],$A1[1] | ||
1318 | add $A0[1],$A1[0] | ||
1319 | adc \$0,$A1[1] | ||
1320 | mul $m1 # n[4]*m1 | ||
1321 | add %rax,$A1[0] # n[4]*m1+"t[5]" | ||
1322 | mov $Ni[0],%rax | ||
1323 | adc %rdx,$A1[1] | ||
1324 | mov $A1[0],8($tptr,$j) # "t[5]" | ||
1325 | |||
1326 | xor $A0[1],$A0[1] | ||
1327 | add 16($tptr,$j),$A0[0] | ||
1328 | adc \$0,$A0[1] | ||
1329 | mul $m0 # n[6]*m0 | ||
1330 | add %rax,$A0[0] # n[6]*m0+t[6] | ||
1331 | mov $Ni[1],%rax | ||
1332 | adc %rdx,$A0[1] | ||
1333 | |||
1334 | mov 24($nptr,$j),$Ni[1] # n[7] | ||
1335 | xor $A1[0],$A1[0] | ||
1336 | add $A0[0],$A1[1] | ||
1337 | adc \$0,$A1[0] | ||
1338 | mul $m1 # n[5]*m1 | ||
1339 | add %rax,$A1[1] # n[5]*m1+"t[6]" | ||
1340 | mov $Ni[1],%rax | ||
1341 | adc %rdx,$A1[0] | ||
1342 | mov $A1[1],16($tptr,$j) # "t[6]" | ||
1343 | |||
1344 | xor $A0[0],$A0[0] | ||
1345 | add 24($tptr,$j),$A0[1] | ||
1346 | lea 32($j),$j | ||
1347 | adc \$0,$A0[0] | ||
1348 | mul $m0 # n[7]*m0 | ||
1349 | add %rax,$A0[1] # n[7]*m0+t[7] | ||
1350 | mov $Ni[0],%rax | ||
1351 | adc %rdx,$A0[0] | ||
1352 | cmp \$0,$j | ||
1353 | jne .Lsqr4x_mont_inner | ||
1354 | |||
1355 | sub 0(%rsp),$j # $j=-$num # modsched # | ||
1356 | mov $n0,$m0 # # modsched # | ||
1357 | |||
1358 | xor $A1[1],$A1[1] | ||
1359 | add $A0[1],$A1[0] | ||
1360 | adc \$0,$A1[1] | ||
1361 | mul $m1 # n[6]*m1 | ||
1362 | add %rax,$A1[0] # n[6]*m1+"t[7]" | ||
1363 | mov $Ni[1],%rax | ||
1364 | adc %rdx,$A1[1] | ||
1365 | mov $A1[0],-8($tptr) # "t[7]" | ||
1366 | |||
1367 | xor $A0[1],$A0[1] | ||
1368 | add ($tptr),$A0[0] # +t[8] | ||
1369 | adc \$0,$A0[1] | ||
1370 | mov 0($nptr,$j),$Ni[0] # n[0] # modsched # | ||
1371 | add $topbit,$A0[0] | ||
1372 | adc \$0,$A0[1] | ||
1373 | |||
1374 | imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched # | ||
1375 | xor $A1[0],$A1[0] | ||
1376 | mov 8($nptr,$j),$Ni[1] # n[1] # modsched # | ||
1377 | add $A0[0],$A1[1] | ||
1378 | mov 16($tptr,$j),$A0[0] # t[0] # modsched # | ||
1379 | adc \$0,$A1[0] | ||
1380 | mul $m1 # n[7]*m1 | ||
1381 | add %rax,$A1[1] # n[7]*m1+"t[8]" | ||
1382 | mov $Ni[0],%rax # # modsched # | ||
1383 | adc %rdx,$A1[0] | ||
1384 | mov $A1[1],($tptr) # "t[8]" | ||
1385 | |||
1386 | xor $topbit,$topbit | ||
1387 | add 8($tptr),$A1[0] # +t[9] | ||
1388 | adc $topbit,$topbit | ||
1389 | add $A0[1],$A1[0] | ||
1390 | lea 16($tptr),$tptr # "t[$num]>>128" | ||
1391 | adc \$0,$topbit | ||
1392 | mov $A1[0],-8($tptr) # "t[9]" | ||
1393 | cmp 8(%rsp),$tptr # are we done? | ||
1394 | jb .Lsqr4x_mont_outer | ||
1395 | |||
1396 | mov 0(%rsp),$num # restore $num | ||
1397 | mov $topbit,($tptr) # save $topbit | ||
1398 | ___ | ||
1399 | } | ||
1400 | ############################################################## | ||
1401 | # Post-condition, 4x unrolled copy from bn_mul_mont | ||
1402 | # | ||
1403 | { | ||
1404 | my ($tptr,$nptr)=("%rbx",$aptr); | ||
1405 | my @ri=("%rax","%rdx","%r10","%r11"); | ||
1406 | $code.=<<___; | ||
1407 | mov 64(%rsp,$num),@ri[0] # tp[0] | ||
1408 | lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result | ||
1409 | mov 40(%rsp),$nptr # restore $nptr | ||
1410 | shr \$5,$num # num/4 | ||
1411 | mov 8($tptr),@ri[1] # t[1] | ||
1412 | xor $i,$i # i=0 and clear CF! | ||
1413 | |||
1414 | mov 32(%rsp),$rptr # restore $rptr | ||
1415 | sub 0($nptr),@ri[0] | ||
1416 | mov 16($tptr),@ri[2] # t[2] | ||
1417 | mov 24($tptr),@ri[3] # t[3] | ||
1418 | sbb 8($nptr),@ri[1] | ||
1419 | lea -1($num),$j # j=num/4-1 | ||
1420 | jmp .Lsqr4x_sub | ||
1421 | .align 16 | ||
1422 | .Lsqr4x_sub: | ||
1423 | mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
1424 | mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
1425 | sbb 16($nptr,$i,8),@ri[2] | ||
1426 | mov 32($tptr,$i,8),@ri[0] # tp[i+1] | ||
1427 | mov 40($tptr,$i,8),@ri[1] | ||
1428 | sbb 24($nptr,$i,8),@ri[3] | ||
1429 | mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
1430 | mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
1431 | sbb 32($nptr,$i,8),@ri[0] | ||
1432 | mov 48($tptr,$i,8),@ri[2] | ||
1433 | mov 56($tptr,$i,8),@ri[3] | ||
1434 | sbb 40($nptr,$i,8),@ri[1] | ||
1435 | lea 4($i),$i # i++ | ||
1436 | dec $j # doesn't affect CF! | ||
1437 | jnz .Lsqr4x_sub | ||
1438 | |||
1439 | mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
1440 | mov 32($tptr,$i,8),@ri[0] # load overflow bit | ||
1441 | sbb 16($nptr,$i,8),@ri[2] | ||
1442 | mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
1443 | sbb 24($nptr,$i,8),@ri[3] | ||
1444 | mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
1445 | |||
1446 | sbb \$0,@ri[0] # handle upmost overflow bit | ||
1447 | mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] | ||
1448 | xor $i,$i # i=0 | ||
1449 | and @ri[0],$tptr | ||
1450 | not @ri[0] | ||
1451 | mov $rptr,$nptr | ||
1452 | and @ri[0],$nptr | ||
1453 | lea -1($num),$j | ||
1454 | or $nptr,$tptr # tp=borrow?tp:rp | ||
1455 | |||
1456 | pxor %xmm0,%xmm0 | ||
1457 | lea 64(%rsp,$num,8),$nptr | ||
1458 | movdqu ($tptr),%xmm1 | ||
1459 | lea ($nptr,$num,8),$nptr | ||
1460 | movdqa %xmm0,64(%rsp) # zap lower half of temporary vector | ||
1461 | movdqa %xmm0,($nptr) # zap upper half of temporary vector | ||
1462 | movdqu %xmm1,($rptr) | ||
1463 | jmp .Lsqr4x_copy | ||
1464 | .align 16 | ||
1465 | .Lsqr4x_copy: # copy or in-place refresh | ||
1466 | movdqu 16($tptr,$i),%xmm2 | ||
1467 | movdqu 32($tptr,$i),%xmm1 | ||
1468 | movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector | ||
1469 | movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector | ||
1470 | movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector | ||
1471 | movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector | ||
1472 | movdqu %xmm2,16($rptr,$i) | ||
1473 | movdqu %xmm1,32($rptr,$i) | ||
1474 | lea 32($i),$i | ||
1475 | dec $j | ||
1476 | jnz .Lsqr4x_copy | ||
1477 | |||
1478 | movdqu 16($tptr,$i),%xmm2 | ||
1479 | movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector | ||
1480 | movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector | ||
1481 | movdqu %xmm2,16($rptr,$i) | ||
1482 | ___ | ||
1483 | } | ||
1484 | $code.=<<___; | ||
1485 | mov 56(%rsp),%rsi # restore %rsp | ||
1486 | mov \$1,%rax | ||
1487 | mov 0(%rsi),%r15 | ||
1488 | mov 8(%rsi),%r14 | ||
1489 | mov 16(%rsi),%r13 | ||
1490 | mov 24(%rsi),%r12 | ||
1491 | mov 32(%rsi),%rbp | ||
1492 | mov 40(%rsi),%rbx | ||
1493 | lea 48(%rsi),%rsp | ||
1494 | .Lsqr4x_epilogue: | ||
1495 | ret | ||
1496 | .size bn_sqr4x_mont,.-bn_sqr4x_mont | ||
1497 | ___ | ||
1498 | }}} | ||
1499 | $code.=<<___; | ||
217 | .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | 1500 | .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
218 | .align 16 | 1501 | .align 16 |
219 | ___ | 1502 | ___ |
@@ -228,9 +1511,9 @@ $disp="%r9"; | |||
228 | 1511 | ||
229 | $code.=<<___; | 1512 | $code.=<<___; |
230 | .extern __imp_RtlVirtualUnwind | 1513 | .extern __imp_RtlVirtualUnwind |
231 | .type se_handler,\@abi-omnipotent | 1514 | .type mul_handler,\@abi-omnipotent |
232 | .align 16 | 1515 | .align 16 |
233 | se_handler: | 1516 | mul_handler: |
234 | push %rsi | 1517 | push %rsi |
235 | push %rdi | 1518 | push %rdi |
236 | push %rbx | 1519 | push %rbx |
@@ -245,15 +1528,20 @@ se_handler: | |||
245 | mov 120($context),%rax # pull context->Rax | 1528 | mov 120($context),%rax # pull context->Rax |
246 | mov 248($context),%rbx # pull context->Rip | 1529 | mov 248($context),%rbx # pull context->Rip |
247 | 1530 | ||
248 | lea .Lprologue(%rip),%r10 | 1531 | mov 8($disp),%rsi # disp->ImageBase |
249 | cmp %r10,%rbx # context->Rip<.Lprologue | 1532 | mov 56($disp),%r11 # disp->HandlerData |
250 | jb .Lin_prologue | 1533 | |
1534 | mov 0(%r11),%r10d # HandlerData[0] | ||
1535 | lea (%rsi,%r10),%r10 # end of prologue label | ||
1536 | cmp %r10,%rbx # context->Rip<end of prologue label | ||
1537 | jb .Lcommon_seh_tail | ||
251 | 1538 | ||
252 | mov 152($context),%rax # pull context->Rsp | 1539 | mov 152($context),%rax # pull context->Rsp |
253 | 1540 | ||
254 | lea .Lepilogue(%rip),%r10 | 1541 | mov 4(%r11),%r10d # HandlerData[1] |
255 | cmp %r10,%rbx # context->Rip>=.Lepilogue | 1542 | lea (%rsi,%r10),%r10 # epilogue label |
256 | jae .Lin_prologue | 1543 | cmp %r10,%rbx # context->Rip>=epilogue label |
1544 | jae .Lcommon_seh_tail | ||
257 | 1545 | ||
258 | mov 192($context),%r10 # pull $num | 1546 | mov 192($context),%r10 # pull $num |
259 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | 1547 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer |
@@ -272,7 +1560,53 @@ se_handler: | |||
272 | mov %r14,232($context) # restore context->R14 | 1560 | mov %r14,232($context) # restore context->R14 |
273 | mov %r15,240($context) # restore context->R15 | 1561 | mov %r15,240($context) # restore context->R15 |
274 | 1562 | ||
275 | .Lin_prologue: | 1563 | jmp .Lcommon_seh_tail |
1564 | .size mul_handler,.-mul_handler | ||
1565 | |||
1566 | .type sqr_handler,\@abi-omnipotent | ||
1567 | .align 16 | ||
1568 | sqr_handler: | ||
1569 | push %rsi | ||
1570 | push %rdi | ||
1571 | push %rbx | ||
1572 | push %rbp | ||
1573 | push %r12 | ||
1574 | push %r13 | ||
1575 | push %r14 | ||
1576 | push %r15 | ||
1577 | pushfq | ||
1578 | sub \$64,%rsp | ||
1579 | |||
1580 | mov 120($context),%rax # pull context->Rax | ||
1581 | mov 248($context),%rbx # pull context->Rip | ||
1582 | |||
1583 | lea .Lsqr4x_body(%rip),%r10 | ||
1584 | cmp %r10,%rbx # context->Rip<.Lsqr_body | ||
1585 | jb .Lcommon_seh_tail | ||
1586 | |||
1587 | mov 152($context),%rax # pull context->Rsp | ||
1588 | |||
1589 | lea .Lsqr4x_epilogue(%rip),%r10 | ||
1590 | cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue | ||
1591 | jae .Lcommon_seh_tail | ||
1592 | |||
1593 | mov 56(%rax),%rax # pull saved stack pointer | ||
1594 | lea 48(%rax),%rax | ||
1595 | |||
1596 | mov -8(%rax),%rbx | ||
1597 | mov -16(%rax),%rbp | ||
1598 | mov -24(%rax),%r12 | ||
1599 | mov -32(%rax),%r13 | ||
1600 | mov -40(%rax),%r14 | ||
1601 | mov -48(%rax),%r15 | ||
1602 | mov %rbx,144($context) # restore context->Rbx | ||
1603 | mov %rbp,160($context) # restore context->Rbp | ||
1604 | mov %r12,216($context) # restore context->R12 | ||
1605 | mov %r13,224($context) # restore context->R13 | ||
1606 | mov %r14,232($context) # restore context->R14 | ||
1607 | mov %r15,240($context) # restore context->R15 | ||
1608 | |||
1609 | .Lcommon_seh_tail: | ||
276 | mov 8(%rax),%rdi | 1610 | mov 8(%rax),%rdi |
277 | mov 16(%rax),%rsi | 1611 | mov 16(%rax),%rsi |
278 | mov %rax,152($context) # restore context->Rsp | 1612 | mov %rax,152($context) # restore context->Rsp |
@@ -310,7 +1644,7 @@ se_handler: | |||
310 | pop %rdi | 1644 | pop %rdi |
311 | pop %rsi | 1645 | pop %rsi |
312 | ret | 1646 | ret |
313 | .size se_handler,.-se_handler | 1647 | .size sqr_handler,.-sqr_handler |
314 | 1648 | ||
315 | .section .pdata | 1649 | .section .pdata |
316 | .align 4 | 1650 | .align 4 |
@@ -318,11 +1652,27 @@ se_handler: | |||
318 | .rva .LSEH_end_bn_mul_mont | 1652 | .rva .LSEH_end_bn_mul_mont |
319 | .rva .LSEH_info_bn_mul_mont | 1653 | .rva .LSEH_info_bn_mul_mont |
320 | 1654 | ||
1655 | .rva .LSEH_begin_bn_mul4x_mont | ||
1656 | .rva .LSEH_end_bn_mul4x_mont | ||
1657 | .rva .LSEH_info_bn_mul4x_mont | ||
1658 | |||
1659 | .rva .LSEH_begin_bn_sqr4x_mont | ||
1660 | .rva .LSEH_end_bn_sqr4x_mont | ||
1661 | .rva .LSEH_info_bn_sqr4x_mont | ||
1662 | |||
321 | .section .xdata | 1663 | .section .xdata |
322 | .align 8 | 1664 | .align 8 |
323 | .LSEH_info_bn_mul_mont: | 1665 | .LSEH_info_bn_mul_mont: |
324 | .byte 9,0,0,0 | 1666 | .byte 9,0,0,0 |
325 | .rva se_handler | 1667 | .rva mul_handler |
1668 | .rva .Lmul_body,.Lmul_epilogue # HandlerData[] | ||
1669 | .LSEH_info_bn_mul4x_mont: | ||
1670 | .byte 9,0,0,0 | ||
1671 | .rva mul_handler | ||
1672 | .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] | ||
1673 | .LSEH_info_bn_sqr4x_mont: | ||
1674 | .byte 9,0,0,0 | ||
1675 | .rva sqr_handler | ||
326 | ___ | 1676 | ___ |
327 | } | 1677 | } |
328 | 1678 | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl new file mode 100755 index 0000000000..057cda28aa --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl | |||
@@ -0,0 +1,1070 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # August 2011. | ||
11 | # | ||
12 | # Companion to x86_64-mont.pl that optimizes cache-timing attack | ||
13 | # countermeasures. The subroutines are produced by replacing bp[i] | ||
14 | # references in their x86_64-mont.pl counterparts with cache-neutral | ||
15 | # references to powers table computed in BN_mod_exp_mont_consttime. | ||
16 | # In addition subroutine that scatters elements of the powers table | ||
17 | # is implemented, so that scatter-/gathering can be tuned without | ||
18 | # bn_exp.c modifications. | ||
19 | |||
20 | $flavour = shift; | ||
21 | $output = shift; | ||
22 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
23 | |||
24 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
25 | |||
26 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
27 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
28 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
29 | die "can't locate x86_64-xlate.pl"; | ||
30 | |||
31 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
32 | |||
33 | # int bn_mul_mont_gather5( | ||
34 | $rp="%rdi"; # BN_ULONG *rp, | ||
35 | $ap="%rsi"; # const BN_ULONG *ap, | ||
36 | $bp="%rdx"; # const BN_ULONG *bp, | ||
37 | $np="%rcx"; # const BN_ULONG *np, | ||
38 | $n0="%r8"; # const BN_ULONG *n0, | ||
39 | $num="%r9"; # int num, | ||
40 | # int idx); # 0 to 2^5-1, "index" in $bp holding | ||
41 | # pre-computed powers of a', interlaced | ||
42 | # in such manner that b[0] is $bp[idx], | ||
43 | # b[1] is [2^5+idx], etc. | ||
44 | $lo0="%r10"; | ||
45 | $hi0="%r11"; | ||
46 | $hi1="%r13"; | ||
47 | $i="%r14"; | ||
48 | $j="%r15"; | ||
49 | $m0="%rbx"; | ||
50 | $m1="%rbp"; | ||
51 | |||
52 | $code=<<___; | ||
53 | .text | ||
54 | |||
55 | .globl bn_mul_mont_gather5 | ||
56 | .type bn_mul_mont_gather5,\@function,6 | ||
57 | .align 64 | ||
58 | bn_mul_mont_gather5: | ||
59 | test \$3,${num}d | ||
60 | jnz .Lmul_enter | ||
61 | cmp \$8,${num}d | ||
62 | jb .Lmul_enter | ||
63 | jmp .Lmul4x_enter | ||
64 | |||
65 | .align 16 | ||
66 | .Lmul_enter: | ||
67 | mov ${num}d,${num}d | ||
68 | mov `($win64?56:8)`(%rsp),%r10d # load 7th argument | ||
69 | push %rbx | ||
70 | push %rbp | ||
71 | push %r12 | ||
72 | push %r13 | ||
73 | push %r14 | ||
74 | push %r15 | ||
75 | ___ | ||
76 | $code.=<<___ if ($win64); | ||
77 | lea -0x28(%rsp),%rsp | ||
78 | movaps %xmm6,(%rsp) | ||
79 | movaps %xmm7,0x10(%rsp) | ||
80 | .Lmul_alloca: | ||
81 | ___ | ||
82 | $code.=<<___; | ||
83 | mov %rsp,%rax | ||
84 | lea 2($num),%r11 | ||
85 | neg %r11 | ||
86 | lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) | ||
87 | and \$-1024,%rsp # minimize TLB usage | ||
88 | |||
89 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | ||
90 | .Lmul_body: | ||
91 | mov $bp,%r12 # reassign $bp | ||
92 | ___ | ||
93 | $bp="%r12"; | ||
94 | $STRIDE=2**5*8; # 5 is "window size" | ||
95 | $N=$STRIDE/4; # should match cache line size | ||
96 | $code.=<<___; | ||
97 | mov %r10,%r11 | ||
98 | shr \$`log($N/8)/log(2)`,%r10 | ||
99 | and \$`$N/8-1`,%r11 | ||
100 | not %r10 | ||
101 | lea .Lmagic_masks(%rip),%rax | ||
102 | and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" | ||
103 | lea 96($bp,%r11,8),$bp # pointer within 1st cache line | ||
104 | movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which | ||
105 | movq 8(%rax,%r10,8),%xmm5 # cache line contains element | ||
106 | movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument | ||
107 | movq 24(%rax,%r10,8),%xmm7 | ||
108 | |||
109 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
110 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
111 | pand %xmm4,%xmm0 | ||
112 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
113 | pand %xmm5,%xmm1 | ||
114 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
115 | pand %xmm6,%xmm2 | ||
116 | por %xmm1,%xmm0 | ||
117 | pand %xmm7,%xmm3 | ||
118 | por %xmm2,%xmm0 | ||
119 | lea $STRIDE($bp),$bp | ||
120 | por %xmm3,%xmm0 | ||
121 | |||
122 | movq %xmm0,$m0 # m0=bp[0] | ||
123 | |||
124 | mov ($n0),$n0 # pull n0[0] value | ||
125 | mov ($ap),%rax | ||
126 | |||
127 | xor $i,$i # i=0 | ||
128 | xor $j,$j # j=0 | ||
129 | |||
130 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
131 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
132 | pand %xmm4,%xmm0 | ||
133 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
134 | pand %xmm5,%xmm1 | ||
135 | |||
136 | mov $n0,$m1 | ||
137 | mulq $m0 # ap[0]*bp[0] | ||
138 | mov %rax,$lo0 | ||
139 | mov ($np),%rax | ||
140 | |||
141 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
142 | pand %xmm6,%xmm2 | ||
143 | por %xmm1,%xmm0 | ||
144 | pand %xmm7,%xmm3 | ||
145 | |||
146 | imulq $lo0,$m1 # "tp[0]"*n0 | ||
147 | mov %rdx,$hi0 | ||
148 | |||
149 | por %xmm2,%xmm0 | ||
150 | lea $STRIDE($bp),$bp | ||
151 | por %xmm3,%xmm0 | ||
152 | |||
153 | mulq $m1 # np[0]*m1 | ||
154 | add %rax,$lo0 # discarded | ||
155 | mov 8($ap),%rax | ||
156 | adc \$0,%rdx | ||
157 | mov %rdx,$hi1 | ||
158 | |||
159 | lea 1($j),$j # j++ | ||
160 | jmp .L1st_enter | ||
161 | |||
162 | .align 16 | ||
163 | .L1st: | ||
164 | add %rax,$hi1 | ||
165 | mov ($ap,$j,8),%rax | ||
166 | adc \$0,%rdx | ||
167 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] | ||
168 | mov $lo0,$hi0 | ||
169 | adc \$0,%rdx | ||
170 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
171 | mov %rdx,$hi1 | ||
172 | |||
173 | .L1st_enter: | ||
174 | mulq $m0 # ap[j]*bp[0] | ||
175 | add %rax,$hi0 | ||
176 | mov ($np,$j,8),%rax | ||
177 | adc \$0,%rdx | ||
178 | lea 1($j),$j # j++ | ||
179 | mov %rdx,$lo0 | ||
180 | |||
181 | mulq $m1 # np[j]*m1 | ||
182 | cmp $num,$j | ||
183 | jne .L1st | ||
184 | |||
185 | movq %xmm0,$m0 # bp[1] | ||
186 | |||
187 | add %rax,$hi1 | ||
188 | mov ($ap),%rax # ap[0] | ||
189 | adc \$0,%rdx | ||
190 | add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] | ||
191 | adc \$0,%rdx | ||
192 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
193 | mov %rdx,$hi1 | ||
194 | mov $lo0,$hi0 | ||
195 | |||
196 | xor %rdx,%rdx | ||
197 | add $hi0,$hi1 | ||
198 | adc \$0,%rdx | ||
199 | mov $hi1,-8(%rsp,$num,8) | ||
200 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | ||
201 | |||
202 | lea 1($i),$i # i++ | ||
203 | jmp .Louter | ||
204 | .align 16 | ||
205 | .Louter: | ||
206 | xor $j,$j # j=0 | ||
207 | mov $n0,$m1 | ||
208 | mov (%rsp),$lo0 | ||
209 | |||
210 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
211 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
212 | pand %xmm4,%xmm0 | ||
213 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
214 | pand %xmm5,%xmm1 | ||
215 | |||
216 | mulq $m0 # ap[0]*bp[i] | ||
217 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] | ||
218 | mov ($np),%rax | ||
219 | adc \$0,%rdx | ||
220 | |||
221 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
222 | pand %xmm6,%xmm2 | ||
223 | por %xmm1,%xmm0 | ||
224 | pand %xmm7,%xmm3 | ||
225 | |||
226 | imulq $lo0,$m1 # tp[0]*n0 | ||
227 | mov %rdx,$hi0 | ||
228 | |||
229 | por %xmm2,%xmm0 | ||
230 | lea $STRIDE($bp),$bp | ||
231 | por %xmm3,%xmm0 | ||
232 | |||
233 | mulq $m1 # np[0]*m1 | ||
234 | add %rax,$lo0 # discarded | ||
235 | mov 8($ap),%rax | ||
236 | adc \$0,%rdx | ||
237 | mov 8(%rsp),$lo0 # tp[1] | ||
238 | mov %rdx,$hi1 | ||
239 | |||
240 | lea 1($j),$j # j++ | ||
241 | jmp .Linner_enter | ||
242 | |||
243 | .align 16 | ||
244 | .Linner: | ||
245 | add %rax,$hi1 | ||
246 | mov ($ap,$j,8),%rax | ||
247 | adc \$0,%rdx | ||
248 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
249 | mov (%rsp,$j,8),$lo0 | ||
250 | adc \$0,%rdx | ||
251 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
252 | mov %rdx,$hi1 | ||
253 | |||
254 | .Linner_enter: | ||
255 | mulq $m0 # ap[j]*bp[i] | ||
256 | add %rax,$hi0 | ||
257 | mov ($np,$j,8),%rax | ||
258 | adc \$0,%rdx | ||
259 | add $hi0,$lo0 # ap[j]*bp[i]+tp[j] | ||
260 | mov %rdx,$hi0 | ||
261 | adc \$0,$hi0 | ||
262 | lea 1($j),$j # j++ | ||
263 | |||
264 | mulq $m1 # np[j]*m1 | ||
265 | cmp $num,$j | ||
266 | jne .Linner | ||
267 | |||
268 | movq %xmm0,$m0 # bp[i+1] | ||
269 | |||
270 | add %rax,$hi1 | ||
271 | mov ($ap),%rax # ap[0] | ||
272 | adc \$0,%rdx | ||
273 | add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
274 | mov (%rsp,$j,8),$lo0 | ||
275 | adc \$0,%rdx | ||
276 | mov $hi1,-16(%rsp,$j,8) # tp[j-1] | ||
277 | mov %rdx,$hi1 | ||
278 | |||
279 | xor %rdx,%rdx | ||
280 | add $hi0,$hi1 | ||
281 | adc \$0,%rdx | ||
282 | add $lo0,$hi1 # pull upmost overflow bit | ||
283 | adc \$0,%rdx | ||
284 | mov $hi1,-8(%rsp,$num,8) | ||
285 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | ||
286 | |||
287 | lea 1($i),$i # i++ | ||
288 | cmp $num,$i | ||
289 | jl .Louter | ||
290 | |||
291 | xor $i,$i # i=0 and clear CF! | ||
292 | mov (%rsp),%rax # tp[0] | ||
293 | lea (%rsp),$ap # borrow ap for tp | ||
294 | mov $num,$j # j=num | ||
295 | jmp .Lsub | ||
296 | .align 16 | ||
297 | .Lsub: sbb ($np,$i,8),%rax | ||
298 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
299 | mov 8($ap,$i,8),%rax # tp[i+1] | ||
300 | lea 1($i),$i # i++ | ||
301 | dec $j # doesnn't affect CF! | ||
302 | jnz .Lsub | ||
303 | |||
304 | sbb \$0,%rax # handle upmost overflow bit | ||
305 | xor $i,$i | ||
306 | and %rax,$ap | ||
307 | not %rax | ||
308 | mov $rp,$np | ||
309 | and %rax,$np | ||
310 | mov $num,$j # j=num | ||
311 | or $np,$ap # ap=borrow?tp:rp | ||
312 | .align 16 | ||
313 | .Lcopy: # copy or in-place refresh | ||
314 | mov ($ap,$i,8),%rax | ||
315 | mov $i,(%rsp,$i,8) # zap temporary vector | ||
316 | mov %rax,($rp,$i,8) # rp[i]=tp[i] | ||
317 | lea 1($i),$i | ||
318 | sub \$1,$j | ||
319 | jnz .Lcopy | ||
320 | |||
321 | mov 8(%rsp,$num,8),%rsi # restore %rsp | ||
322 | mov \$1,%rax | ||
323 | ___ | ||
324 | $code.=<<___ if ($win64); | ||
325 | movaps (%rsi),%xmm6 | ||
326 | movaps 0x10(%rsi),%xmm7 | ||
327 | lea 0x28(%rsi),%rsi | ||
328 | ___ | ||
329 | $code.=<<___; | ||
330 | mov (%rsi),%r15 | ||
331 | mov 8(%rsi),%r14 | ||
332 | mov 16(%rsi),%r13 | ||
333 | mov 24(%rsi),%r12 | ||
334 | mov 32(%rsi),%rbp | ||
335 | mov 40(%rsi),%rbx | ||
336 | lea 48(%rsi),%rsp | ||
337 | .Lmul_epilogue: | ||
338 | ret | ||
339 | .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 | ||
340 | ___ | ||
341 | {{{ | ||
342 | my @A=("%r10","%r11"); | ||
343 | my @N=("%r13","%rdi"); | ||
344 | $code.=<<___; | ||
345 | .type bn_mul4x_mont_gather5,\@function,6 | ||
346 | .align 16 | ||
347 | bn_mul4x_mont_gather5: | ||
348 | .Lmul4x_enter: | ||
349 | mov ${num}d,${num}d | ||
350 | mov `($win64?56:8)`(%rsp),%r10d # load 7th argument | ||
351 | push %rbx | ||
352 | push %rbp | ||
353 | push %r12 | ||
354 | push %r13 | ||
355 | push %r14 | ||
356 | push %r15 | ||
357 | ___ | ||
358 | $code.=<<___ if ($win64); | ||
359 | lea -0x28(%rsp),%rsp | ||
360 | movaps %xmm6,(%rsp) | ||
361 | movaps %xmm7,0x10(%rsp) | ||
362 | .Lmul4x_alloca: | ||
363 | ___ | ||
364 | $code.=<<___; | ||
365 | mov %rsp,%rax | ||
366 | lea 4($num),%r11 | ||
367 | neg %r11 | ||
368 | lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) | ||
369 | and \$-1024,%rsp # minimize TLB usage | ||
370 | |||
371 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | ||
372 | .Lmul4x_body: | ||
373 | mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp | ||
374 | mov %rdx,%r12 # reassign $bp | ||
375 | ___ | ||
376 | $bp="%r12"; | ||
377 | $STRIDE=2**5*8; # 5 is "window size" | ||
378 | $N=$STRIDE/4; # should match cache line size | ||
379 | $code.=<<___; | ||
380 | mov %r10,%r11 | ||
381 | shr \$`log($N/8)/log(2)`,%r10 | ||
382 | and \$`$N/8-1`,%r11 | ||
383 | not %r10 | ||
384 | lea .Lmagic_masks(%rip),%rax | ||
385 | and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" | ||
386 | lea 96($bp,%r11,8),$bp # pointer within 1st cache line | ||
387 | movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which | ||
388 | movq 8(%rax,%r10,8),%xmm5 # cache line contains element | ||
389 | movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument | ||
390 | movq 24(%rax,%r10,8),%xmm7 | ||
391 | |||
392 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
393 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
394 | pand %xmm4,%xmm0 | ||
395 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
396 | pand %xmm5,%xmm1 | ||
397 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
398 | pand %xmm6,%xmm2 | ||
399 | por %xmm1,%xmm0 | ||
400 | pand %xmm7,%xmm3 | ||
401 | por %xmm2,%xmm0 | ||
402 | lea $STRIDE($bp),$bp | ||
403 | por %xmm3,%xmm0 | ||
404 | |||
405 | movq %xmm0,$m0 # m0=bp[0] | ||
406 | mov ($n0),$n0 # pull n0[0] value | ||
407 | mov ($ap),%rax | ||
408 | |||
409 | xor $i,$i # i=0 | ||
410 | xor $j,$j # j=0 | ||
411 | |||
412 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
413 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
414 | pand %xmm4,%xmm0 | ||
415 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
416 | pand %xmm5,%xmm1 | ||
417 | |||
418 | mov $n0,$m1 | ||
419 | mulq $m0 # ap[0]*bp[0] | ||
420 | mov %rax,$A[0] | ||
421 | mov ($np),%rax | ||
422 | |||
423 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
424 | pand %xmm6,%xmm2 | ||
425 | por %xmm1,%xmm0 | ||
426 | pand %xmm7,%xmm3 | ||
427 | |||
428 | imulq $A[0],$m1 # "tp[0]"*n0 | ||
429 | mov %rdx,$A[1] | ||
430 | |||
431 | por %xmm2,%xmm0 | ||
432 | lea $STRIDE($bp),$bp | ||
433 | por %xmm3,%xmm0 | ||
434 | |||
435 | mulq $m1 # np[0]*m1 | ||
436 | add %rax,$A[0] # discarded | ||
437 | mov 8($ap),%rax | ||
438 | adc \$0,%rdx | ||
439 | mov %rdx,$N[1] | ||
440 | |||
441 | mulq $m0 | ||
442 | add %rax,$A[1] | ||
443 | mov 8($np),%rax | ||
444 | adc \$0,%rdx | ||
445 | mov %rdx,$A[0] | ||
446 | |||
447 | mulq $m1 | ||
448 | add %rax,$N[1] | ||
449 | mov 16($ap),%rax | ||
450 | adc \$0,%rdx | ||
451 | add $A[1],$N[1] | ||
452 | lea 4($j),$j # j++ | ||
453 | adc \$0,%rdx | ||
454 | mov $N[1],(%rsp) | ||
455 | mov %rdx,$N[0] | ||
456 | jmp .L1st4x | ||
457 | .align 16 | ||
458 | .L1st4x: | ||
459 | mulq $m0 # ap[j]*bp[0] | ||
460 | add %rax,$A[0] | ||
461 | mov -16($np,$j,8),%rax | ||
462 | adc \$0,%rdx | ||
463 | mov %rdx,$A[1] | ||
464 | |||
465 | mulq $m1 # np[j]*m1 | ||
466 | add %rax,$N[0] | ||
467 | mov -8($ap,$j,8),%rax | ||
468 | adc \$0,%rdx | ||
469 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
470 | adc \$0,%rdx | ||
471 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
472 | mov %rdx,$N[1] | ||
473 | |||
474 | mulq $m0 # ap[j]*bp[0] | ||
475 | add %rax,$A[1] | ||
476 | mov -8($np,$j,8),%rax | ||
477 | adc \$0,%rdx | ||
478 | mov %rdx,$A[0] | ||
479 | |||
480 | mulq $m1 # np[j]*m1 | ||
481 | add %rax,$N[1] | ||
482 | mov ($ap,$j,8),%rax | ||
483 | adc \$0,%rdx | ||
484 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
485 | adc \$0,%rdx | ||
486 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
487 | mov %rdx,$N[0] | ||
488 | |||
489 | mulq $m0 # ap[j]*bp[0] | ||
490 | add %rax,$A[0] | ||
491 | mov ($np,$j,8),%rax | ||
492 | adc \$0,%rdx | ||
493 | mov %rdx,$A[1] | ||
494 | |||
495 | mulq $m1 # np[j]*m1 | ||
496 | add %rax,$N[0] | ||
497 | mov 8($ap,$j,8),%rax | ||
498 | adc \$0,%rdx | ||
499 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
500 | adc \$0,%rdx | ||
501 | mov $N[0],-8(%rsp,$j,8) # tp[j-1] | ||
502 | mov %rdx,$N[1] | ||
503 | |||
504 | mulq $m0 # ap[j]*bp[0] | ||
505 | add %rax,$A[1] | ||
506 | mov 8($np,$j,8),%rax | ||
507 | adc \$0,%rdx | ||
508 | lea 4($j),$j # j++ | ||
509 | mov %rdx,$A[0] | ||
510 | |||
511 | mulq $m1 # np[j]*m1 | ||
512 | add %rax,$N[1] | ||
513 | mov -16($ap,$j,8),%rax | ||
514 | adc \$0,%rdx | ||
515 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
516 | adc \$0,%rdx | ||
517 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
518 | mov %rdx,$N[0] | ||
519 | cmp $num,$j | ||
520 | jl .L1st4x | ||
521 | |||
522 | mulq $m0 # ap[j]*bp[0] | ||
523 | add %rax,$A[0] | ||
524 | mov -16($np,$j,8),%rax | ||
525 | adc \$0,%rdx | ||
526 | mov %rdx,$A[1] | ||
527 | |||
528 | mulq $m1 # np[j]*m1 | ||
529 | add %rax,$N[0] | ||
530 | mov -8($ap,$j,8),%rax | ||
531 | adc \$0,%rdx | ||
532 | add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] | ||
533 | adc \$0,%rdx | ||
534 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
535 | mov %rdx,$N[1] | ||
536 | |||
537 | mulq $m0 # ap[j]*bp[0] | ||
538 | add %rax,$A[1] | ||
539 | mov -8($np,$j,8),%rax | ||
540 | adc \$0,%rdx | ||
541 | mov %rdx,$A[0] | ||
542 | |||
543 | mulq $m1 # np[j]*m1 | ||
544 | add %rax,$N[1] | ||
545 | mov ($ap),%rax # ap[0] | ||
546 | adc \$0,%rdx | ||
547 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] | ||
548 | adc \$0,%rdx | ||
549 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
550 | mov %rdx,$N[0] | ||
551 | |||
552 | movq %xmm0,$m0 # bp[1] | ||
553 | |||
554 | xor $N[1],$N[1] | ||
555 | add $A[0],$N[0] | ||
556 | adc \$0,$N[1] | ||
557 | mov $N[0],-8(%rsp,$j,8) | ||
558 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
559 | |||
560 | lea 1($i),$i # i++ | ||
561 | .align 4 | ||
562 | .Louter4x: | ||
563 | xor $j,$j # j=0 | ||
564 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
565 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
566 | pand %xmm4,%xmm0 | ||
567 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
568 | pand %xmm5,%xmm1 | ||
569 | |||
570 | mov (%rsp),$A[0] | ||
571 | mov $n0,$m1 | ||
572 | mulq $m0 # ap[0]*bp[i] | ||
573 | add %rax,$A[0] # ap[0]*bp[i]+tp[0] | ||
574 | mov ($np),%rax | ||
575 | adc \$0,%rdx | ||
576 | |||
577 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
578 | pand %xmm6,%xmm2 | ||
579 | por %xmm1,%xmm0 | ||
580 | pand %xmm7,%xmm3 | ||
581 | |||
582 | imulq $A[0],$m1 # tp[0]*n0 | ||
583 | mov %rdx,$A[1] | ||
584 | |||
585 | por %xmm2,%xmm0 | ||
586 | lea $STRIDE($bp),$bp | ||
587 | por %xmm3,%xmm0 | ||
588 | |||
589 | mulq $m1 # np[0]*m1 | ||
590 | add %rax,$A[0] # "$N[0]", discarded | ||
591 | mov 8($ap),%rax | ||
592 | adc \$0,%rdx | ||
593 | mov %rdx,$N[1] | ||
594 | |||
595 | mulq $m0 # ap[j]*bp[i] | ||
596 | add %rax,$A[1] | ||
597 | mov 8($np),%rax | ||
598 | adc \$0,%rdx | ||
599 | add 8(%rsp),$A[1] # +tp[1] | ||
600 | adc \$0,%rdx | ||
601 | mov %rdx,$A[0] | ||
602 | |||
603 | mulq $m1 # np[j]*m1 | ||
604 | add %rax,$N[1] | ||
605 | mov 16($ap),%rax | ||
606 | adc \$0,%rdx | ||
607 | add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
608 | lea 4($j),$j # j+=2 | ||
609 | adc \$0,%rdx | ||
610 | mov %rdx,$N[0] | ||
611 | jmp .Linner4x | ||
612 | .align 16 | ||
613 | .Linner4x: | ||
614 | mulq $m0 # ap[j]*bp[i] | ||
615 | add %rax,$A[0] | ||
616 | mov -16($np,$j,8),%rax | ||
617 | adc \$0,%rdx | ||
618 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
619 | adc \$0,%rdx | ||
620 | mov %rdx,$A[1] | ||
621 | |||
622 | mulq $m1 # np[j]*m1 | ||
623 | add %rax,$N[0] | ||
624 | mov -8($ap,$j,8),%rax | ||
625 | adc \$0,%rdx | ||
626 | add $A[0],$N[0] | ||
627 | adc \$0,%rdx | ||
628 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
629 | mov %rdx,$N[1] | ||
630 | |||
631 | mulq $m0 # ap[j]*bp[i] | ||
632 | add %rax,$A[1] | ||
633 | mov -8($np,$j,8),%rax | ||
634 | adc \$0,%rdx | ||
635 | add -8(%rsp,$j,8),$A[1] | ||
636 | adc \$0,%rdx | ||
637 | mov %rdx,$A[0] | ||
638 | |||
639 | mulq $m1 # np[j]*m1 | ||
640 | add %rax,$N[1] | ||
641 | mov ($ap,$j,8),%rax | ||
642 | adc \$0,%rdx | ||
643 | add $A[1],$N[1] | ||
644 | adc \$0,%rdx | ||
645 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
646 | mov %rdx,$N[0] | ||
647 | |||
648 | mulq $m0 # ap[j]*bp[i] | ||
649 | add %rax,$A[0] | ||
650 | mov ($np,$j,8),%rax | ||
651 | adc \$0,%rdx | ||
652 | add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
653 | adc \$0,%rdx | ||
654 | mov %rdx,$A[1] | ||
655 | |||
656 | mulq $m1 # np[j]*m1 | ||
657 | add %rax,$N[0] | ||
658 | mov 8($ap,$j,8),%rax | ||
659 | adc \$0,%rdx | ||
660 | add $A[0],$N[0] | ||
661 | adc \$0,%rdx | ||
662 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
663 | mov %rdx,$N[1] | ||
664 | |||
665 | mulq $m0 # ap[j]*bp[i] | ||
666 | add %rax,$A[1] | ||
667 | mov 8($np,$j,8),%rax | ||
668 | adc \$0,%rdx | ||
669 | add 8(%rsp,$j,8),$A[1] | ||
670 | adc \$0,%rdx | ||
671 | lea 4($j),$j # j++ | ||
672 | mov %rdx,$A[0] | ||
673 | |||
674 | mulq $m1 # np[j]*m1 | ||
675 | add %rax,$N[1] | ||
676 | mov -16($ap,$j,8),%rax | ||
677 | adc \$0,%rdx | ||
678 | add $A[1],$N[1] | ||
679 | adc \$0,%rdx | ||
680 | mov $N[0],-40(%rsp,$j,8) # tp[j-1] | ||
681 | mov %rdx,$N[0] | ||
682 | cmp $num,$j | ||
683 | jl .Linner4x | ||
684 | |||
685 | mulq $m0 # ap[j]*bp[i] | ||
686 | add %rax,$A[0] | ||
687 | mov -16($np,$j,8),%rax | ||
688 | adc \$0,%rdx | ||
689 | add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] | ||
690 | adc \$0,%rdx | ||
691 | mov %rdx,$A[1] | ||
692 | |||
693 | mulq $m1 # np[j]*m1 | ||
694 | add %rax,$N[0] | ||
695 | mov -8($ap,$j,8),%rax | ||
696 | adc \$0,%rdx | ||
697 | add $A[0],$N[0] | ||
698 | adc \$0,%rdx | ||
699 | mov $N[1],-32(%rsp,$j,8) # tp[j-1] | ||
700 | mov %rdx,$N[1] | ||
701 | |||
702 | mulq $m0 # ap[j]*bp[i] | ||
703 | add %rax,$A[1] | ||
704 | mov -8($np,$j,8),%rax | ||
705 | adc \$0,%rdx | ||
706 | add -8(%rsp,$j,8),$A[1] | ||
707 | adc \$0,%rdx | ||
708 | lea 1($i),$i # i++ | ||
709 | mov %rdx,$A[0] | ||
710 | |||
711 | mulq $m1 # np[j]*m1 | ||
712 | add %rax,$N[1] | ||
713 | mov ($ap),%rax # ap[0] | ||
714 | adc \$0,%rdx | ||
715 | add $A[1],$N[1] | ||
716 | adc \$0,%rdx | ||
717 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | ||
718 | mov %rdx,$N[0] | ||
719 | |||
720 | movq %xmm0,$m0 # bp[i+1] | ||
721 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | ||
722 | |||
723 | xor $N[1],$N[1] | ||
724 | add $A[0],$N[0] | ||
725 | adc \$0,$N[1] | ||
726 | add (%rsp,$num,8),$N[0] # pull upmost overflow bit | ||
727 | adc \$0,$N[1] | ||
728 | mov $N[0],-8(%rsp,$j,8) | ||
729 | mov $N[1],(%rsp,$j,8) # store upmost overflow bit | ||
730 | |||
731 | cmp $num,$i | ||
732 | jl .Louter4x | ||
733 | ___ | ||
734 | { | ||
735 | my @ri=("%rax","%rdx",$m0,$m1); | ||
736 | $code.=<<___; | ||
737 | mov 16(%rsp,$num,8),$rp # restore $rp | ||
738 | mov 0(%rsp),@ri[0] # tp[0] | ||
739 | pxor %xmm0,%xmm0 | ||
740 | mov 8(%rsp),@ri[1] # tp[1] | ||
741 | shr \$2,$num # num/=4 | ||
742 | lea (%rsp),$ap # borrow ap for tp | ||
743 | xor $i,$i # i=0 and clear CF! | ||
744 | |||
745 | sub 0($np),@ri[0] | ||
746 | mov 16($ap),@ri[2] # tp[2] | ||
747 | mov 24($ap),@ri[3] # tp[3] | ||
748 | sbb 8($np),@ri[1] | ||
749 | lea -1($num),$j # j=num/4-1 | ||
750 | jmp .Lsub4x | ||
751 | .align 16 | ||
752 | .Lsub4x: | ||
753 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
754 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
755 | sbb 16($np,$i,8),@ri[2] | ||
756 | mov 32($ap,$i,8),@ri[0] # tp[i+1] | ||
757 | mov 40($ap,$i,8),@ri[1] | ||
758 | sbb 24($np,$i,8),@ri[3] | ||
759 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
760 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
761 | sbb 32($np,$i,8),@ri[0] | ||
762 | mov 48($ap,$i,8),@ri[2] | ||
763 | mov 56($ap,$i,8),@ri[3] | ||
764 | sbb 40($np,$i,8),@ri[1] | ||
765 | lea 4($i),$i # i++ | ||
766 | dec $j # doesnn't affect CF! | ||
767 | jnz .Lsub4x | ||
768 | |||
769 | mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
770 | mov 32($ap,$i,8),@ri[0] # load overflow bit | ||
771 | sbb 16($np,$i,8),@ri[2] | ||
772 | mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
773 | sbb 24($np,$i,8),@ri[3] | ||
774 | mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
775 | |||
776 | sbb \$0,@ri[0] # handle upmost overflow bit | ||
777 | mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
778 | xor $i,$i # i=0 | ||
779 | and @ri[0],$ap | ||
780 | not @ri[0] | ||
781 | mov $rp,$np | ||
782 | and @ri[0],$np | ||
783 | lea -1($num),$j | ||
784 | or $np,$ap # ap=borrow?tp:rp | ||
785 | |||
786 | movdqu ($ap),%xmm1 | ||
787 | movdqa %xmm0,(%rsp) | ||
788 | movdqu %xmm1,($rp) | ||
789 | jmp .Lcopy4x | ||
790 | .align 16 | ||
791 | .Lcopy4x: # copy or in-place refresh | ||
792 | movdqu 16($ap,$i),%xmm2 | ||
793 | movdqu 32($ap,$i),%xmm1 | ||
794 | movdqa %xmm0,16(%rsp,$i) | ||
795 | movdqu %xmm2,16($rp,$i) | ||
796 | movdqa %xmm0,32(%rsp,$i) | ||
797 | movdqu %xmm1,32($rp,$i) | ||
798 | lea 32($i),$i | ||
799 | dec $j | ||
800 | jnz .Lcopy4x | ||
801 | |||
802 | shl \$2,$num | ||
803 | movdqu 16($ap,$i),%xmm2 | ||
804 | movdqa %xmm0,16(%rsp,$i) | ||
805 | movdqu %xmm2,16($rp,$i) | ||
806 | ___ | ||
807 | } | ||
808 | $code.=<<___; | ||
809 | mov 8(%rsp,$num,8),%rsi # restore %rsp | ||
810 | mov \$1,%rax | ||
811 | ___ | ||
812 | $code.=<<___ if ($win64); | ||
813 | movaps (%rsi),%xmm6 | ||
814 | movaps 0x10(%rsi),%xmm7 | ||
815 | lea 0x28(%rsi),%rsi | ||
816 | ___ | ||
817 | $code.=<<___; | ||
818 | mov (%rsi),%r15 | ||
819 | mov 8(%rsi),%r14 | ||
820 | mov 16(%rsi),%r13 | ||
821 | mov 24(%rsi),%r12 | ||
822 | mov 32(%rsi),%rbp | ||
823 | mov 40(%rsi),%rbx | ||
824 | lea 48(%rsi),%rsp | ||
825 | .Lmul4x_epilogue: | ||
826 | ret | ||
827 | .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 | ||
828 | ___ | ||
829 | }}} | ||
830 | |||
831 | { | ||
832 | my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order | ||
833 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | ||
834 | my $out=$inp; | ||
835 | my $STRIDE=2**5*8; | ||
836 | my $N=$STRIDE/4; | ||
837 | |||
838 | $code.=<<___; | ||
839 | .globl bn_scatter5 | ||
840 | .type bn_scatter5,\@abi-omnipotent | ||
841 | .align 16 | ||
842 | bn_scatter5: | ||
843 | cmp \$0, $num | ||
844 | jz .Lscatter_epilogue | ||
845 | lea ($tbl,$idx,8),$tbl | ||
846 | .Lscatter: | ||
847 | mov ($inp),%rax | ||
848 | lea 8($inp),$inp | ||
849 | mov %rax,($tbl) | ||
850 | lea 32*8($tbl),$tbl | ||
851 | sub \$1,$num | ||
852 | jnz .Lscatter | ||
853 | .Lscatter_epilogue: | ||
854 | ret | ||
855 | .size bn_scatter5,.-bn_scatter5 | ||
856 | |||
857 | .globl bn_gather5 | ||
858 | .type bn_gather5,\@abi-omnipotent | ||
859 | .align 16 | ||
860 | bn_gather5: | ||
861 | ___ | ||
862 | $code.=<<___ if ($win64); | ||
863 | .LSEH_begin_bn_gather5: | ||
864 | # I can't trust assembler to use specific encoding:-( | ||
865 | .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp | ||
866 | .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) | ||
867 | .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) | ||
868 | ___ | ||
869 | $code.=<<___; | ||
870 | mov $idx,%r11 | ||
871 | shr \$`log($N/8)/log(2)`,$idx | ||
872 | and \$`$N/8-1`,%r11 | ||
873 | not $idx | ||
874 | lea .Lmagic_masks(%rip),%rax | ||
875 | and \$`2**5/($N/8)-1`,$idx # 5 is "window size" | ||
876 | lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line | ||
877 | movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which | ||
878 | movq 8(%rax,$idx,8),%xmm5 # cache line contains element | ||
879 | movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument | ||
880 | movq 24(%rax,$idx,8),%xmm7 | ||
881 | jmp .Lgather | ||
882 | .align 16 | ||
883 | .Lgather: | ||
884 | movq `0*$STRIDE/4-96`($tbl),%xmm0 | ||
885 | movq `1*$STRIDE/4-96`($tbl),%xmm1 | ||
886 | pand %xmm4,%xmm0 | ||
887 | movq `2*$STRIDE/4-96`($tbl),%xmm2 | ||
888 | pand %xmm5,%xmm1 | ||
889 | movq `3*$STRIDE/4-96`($tbl),%xmm3 | ||
890 | pand %xmm6,%xmm2 | ||
891 | por %xmm1,%xmm0 | ||
892 | pand %xmm7,%xmm3 | ||
893 | por %xmm2,%xmm0 | ||
894 | lea $STRIDE($tbl),$tbl | ||
895 | por %xmm3,%xmm0 | ||
896 | |||
897 | movq %xmm0,($out) # m0=bp[0] | ||
898 | lea 8($out),$out | ||
899 | sub \$1,$num | ||
900 | jnz .Lgather | ||
901 | ___ | ||
902 | $code.=<<___ if ($win64); | ||
903 | movaps %xmm6,(%rsp) | ||
904 | movaps %xmm7,0x10(%rsp) | ||
905 | lea 0x28(%rsp),%rsp | ||
906 | ___ | ||
907 | $code.=<<___; | ||
908 | ret | ||
909 | .LSEH_end_bn_gather5: | ||
910 | .size bn_gather5,.-bn_gather5 | ||
911 | ___ | ||
912 | } | ||
913 | $code.=<<___; | ||
914 | .align 64 | ||
915 | .Lmagic_masks: | ||
916 | .long 0,0, 0,0, 0,0, -1,-1 | ||
917 | .long 0,0, 0,0, 0,0, 0,0 | ||
918 | .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
919 | ___ | ||
920 | |||
921 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
922 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
923 | if ($win64) { | ||
924 | $rec="%rcx"; | ||
925 | $frame="%rdx"; | ||
926 | $context="%r8"; | ||
927 | $disp="%r9"; | ||
928 | |||
929 | $code.=<<___; | ||
930 | .extern __imp_RtlVirtualUnwind | ||
931 | .type mul_handler,\@abi-omnipotent | ||
932 | .align 16 | ||
933 | mul_handler: | ||
934 | push %rsi | ||
935 | push %rdi | ||
936 | push %rbx | ||
937 | push %rbp | ||
938 | push %r12 | ||
939 | push %r13 | ||
940 | push %r14 | ||
941 | push %r15 | ||
942 | pushfq | ||
943 | sub \$64,%rsp | ||
944 | |||
945 | mov 120($context),%rax # pull context->Rax | ||
946 | mov 248($context),%rbx # pull context->Rip | ||
947 | |||
948 | mov 8($disp),%rsi # disp->ImageBase | ||
949 | mov 56($disp),%r11 # disp->HandlerData | ||
950 | |||
951 | mov 0(%r11),%r10d # HandlerData[0] | ||
952 | lea (%rsi,%r10),%r10 # end of prologue label | ||
953 | cmp %r10,%rbx # context->Rip<end of prologue label | ||
954 | jb .Lcommon_seh_tail | ||
955 | |||
956 | lea `40+48`(%rax),%rax | ||
957 | |||
958 | mov 4(%r11),%r10d # HandlerData[1] | ||
959 | lea (%rsi,%r10),%r10 # end of alloca label | ||
960 | cmp %r10,%rbx # context->Rip<end of alloca label | ||
961 | jb .Lcommon_seh_tail | ||
962 | |||
963 | mov 152($context),%rax # pull context->Rsp | ||
964 | |||
965 | mov 8(%r11),%r10d # HandlerData[2] | ||
966 | lea (%rsi,%r10),%r10 # epilogue label | ||
967 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
968 | jae .Lcommon_seh_tail | ||
969 | |||
970 | mov 192($context),%r10 # pull $num | ||
971 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | ||
972 | |||
973 | movaps (%rax),%xmm0 | ||
974 | movaps 16(%rax),%xmm1 | ||
975 | lea `40+48`(%rax),%rax | ||
976 | |||
977 | mov -8(%rax),%rbx | ||
978 | mov -16(%rax),%rbp | ||
979 | mov -24(%rax),%r12 | ||
980 | mov -32(%rax),%r13 | ||
981 | mov -40(%rax),%r14 | ||
982 | mov -48(%rax),%r15 | ||
983 | mov %rbx,144($context) # restore context->Rbx | ||
984 | mov %rbp,160($context) # restore context->Rbp | ||
985 | mov %r12,216($context) # restore context->R12 | ||
986 | mov %r13,224($context) # restore context->R13 | ||
987 | mov %r14,232($context) # restore context->R14 | ||
988 | mov %r15,240($context) # restore context->R15 | ||
989 | movups %xmm0,512($context) # restore context->Xmm6 | ||
990 | movups %xmm1,528($context) # restore context->Xmm7 | ||
991 | |||
992 | .Lcommon_seh_tail: | ||
993 | mov 8(%rax),%rdi | ||
994 | mov 16(%rax),%rsi | ||
995 | mov %rax,152($context) # restore context->Rsp | ||
996 | mov %rsi,168($context) # restore context->Rsi | ||
997 | mov %rdi,176($context) # restore context->Rdi | ||
998 | |||
999 | mov 40($disp),%rdi # disp->ContextRecord | ||
1000 | mov $context,%rsi # context | ||
1001 | mov \$154,%ecx # sizeof(CONTEXT) | ||
1002 | .long 0xa548f3fc # cld; rep movsq | ||
1003 | |||
1004 | mov $disp,%rsi | ||
1005 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
1006 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
1007 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
1008 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
1009 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
1010 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
1011 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
1012 | mov %r10,32(%rsp) # arg5 | ||
1013 | mov %r11,40(%rsp) # arg6 | ||
1014 | mov %r12,48(%rsp) # arg7 | ||
1015 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
1016 | call *__imp_RtlVirtualUnwind(%rip) | ||
1017 | |||
1018 | mov \$1,%eax # ExceptionContinueSearch | ||
1019 | add \$64,%rsp | ||
1020 | popfq | ||
1021 | pop %r15 | ||
1022 | pop %r14 | ||
1023 | pop %r13 | ||
1024 | pop %r12 | ||
1025 | pop %rbp | ||
1026 | pop %rbx | ||
1027 | pop %rdi | ||
1028 | pop %rsi | ||
1029 | ret | ||
1030 | .size mul_handler,.-mul_handler | ||
1031 | |||
1032 | .section .pdata | ||
1033 | .align 4 | ||
1034 | .rva .LSEH_begin_bn_mul_mont_gather5 | ||
1035 | .rva .LSEH_end_bn_mul_mont_gather5 | ||
1036 | .rva .LSEH_info_bn_mul_mont_gather5 | ||
1037 | |||
1038 | .rva .LSEH_begin_bn_mul4x_mont_gather5 | ||
1039 | .rva .LSEH_end_bn_mul4x_mont_gather5 | ||
1040 | .rva .LSEH_info_bn_mul4x_mont_gather5 | ||
1041 | |||
1042 | .rva .LSEH_begin_bn_gather5 | ||
1043 | .rva .LSEH_end_bn_gather5 | ||
1044 | .rva .LSEH_info_bn_gather5 | ||
1045 | |||
1046 | .section .xdata | ||
1047 | .align 8 | ||
1048 | .LSEH_info_bn_mul_mont_gather5: | ||
1049 | .byte 9,0,0,0 | ||
1050 | .rva mul_handler | ||
1051 | .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[] | ||
1052 | .align 8 | ||
1053 | .LSEH_info_bn_mul4x_mont_gather5: | ||
1054 | .byte 9,0,0,0 | ||
1055 | .rva mul_handler | ||
1056 | .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] | ||
1057 | .align 8 | ||
1058 | .LSEH_info_bn_gather5: | ||
1059 | .byte 0x01,0x0d,0x05,0x00 | ||
1060 | .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 | ||
1061 | .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 | ||
1062 | .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 | ||
1063 | .align 8 | ||
1064 | ___ | ||
1065 | } | ||
1066 | |||
1067 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
1068 | |||
1069 | print $code; | ||
1070 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86.pl b/src/lib/libcrypto/camellia/asm/cmll-x86.pl index 027302ac86..c314d62312 100644 --- a/src/lib/libcrypto/camellia/asm/cmll-x86.pl +++ b/src/lib/libcrypto/camellia/asm/cmll-x86.pl | |||
@@ -723,11 +723,11 @@ my $bias=int(@T[0])?shift(@T):0; | |||
723 | &function_end("Camellia_Ekeygen"); | 723 | &function_end("Camellia_Ekeygen"); |
724 | 724 | ||
725 | if ($OPENSSL) { | 725 | if ($OPENSSL) { |
726 | # int Camellia_set_key ( | 726 | # int private_Camellia_set_key ( |
727 | # const unsigned char *userKey, | 727 | # const unsigned char *userKey, |
728 | # int bits, | 728 | # int bits, |
729 | # CAMELLIA_KEY *key) | 729 | # CAMELLIA_KEY *key) |
730 | &function_begin_B("Camellia_set_key"); | 730 | &function_begin_B("private_Camellia_set_key"); |
731 | &push ("ebx"); | 731 | &push ("ebx"); |
732 | &mov ("ecx",&wparam(0)); # pull arguments | 732 | &mov ("ecx",&wparam(0)); # pull arguments |
733 | &mov ("ebx",&wparam(1)); | 733 | &mov ("ebx",&wparam(1)); |
@@ -760,7 +760,7 @@ if ($OPENSSL) { | |||
760 | &set_label("done",4); | 760 | &set_label("done",4); |
761 | &pop ("ebx"); | 761 | &pop ("ebx"); |
762 | &ret (); | 762 | &ret (); |
763 | &function_end_B("Camellia_set_key"); | 763 | &function_end_B("private_Camellia_set_key"); |
764 | } | 764 | } |
765 | 765 | ||
766 | @SBOX=( | 766 | @SBOX=( |
diff --git a/src/lib/libcrypto/camellia/camellia.h b/src/lib/libcrypto/camellia/camellia.h index cf0457dd97..67911e0adf 100644 --- a/src/lib/libcrypto/camellia/camellia.h +++ b/src/lib/libcrypto/camellia/camellia.h | |||
@@ -88,6 +88,10 @@ struct camellia_key_st | |||
88 | }; | 88 | }; |
89 | typedef struct camellia_key_st CAMELLIA_KEY; | 89 | typedef struct camellia_key_st CAMELLIA_KEY; |
90 | 90 | ||
91 | #ifdef OPENSSL_FIPS | ||
92 | int private_Camellia_set_key(const unsigned char *userKey, const int bits, | ||
93 | CAMELLIA_KEY *key); | ||
94 | #endif | ||
91 | int Camellia_set_key(const unsigned char *userKey, const int bits, | 95 | int Camellia_set_key(const unsigned char *userKey, const int bits, |
92 | CAMELLIA_KEY *key); | 96 | CAMELLIA_KEY *key); |
93 | 97 | ||
diff --git a/src/lib/libcrypto/camellia/cmll_locl.h b/src/lib/libcrypto/camellia/cmll_locl.h index 4a4d880d16..246b6ce1d8 100644 --- a/src/lib/libcrypto/camellia/cmll_locl.h +++ b/src/lib/libcrypto/camellia/cmll_locl.h | |||
@@ -71,7 +71,8 @@ | |||
71 | typedef unsigned int u32; | 71 | typedef unsigned int u32; |
72 | typedef unsigned char u8; | 72 | typedef unsigned char u8; |
73 | 73 | ||
74 | int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, KEY_TABLE_TYPE keyTable); | 74 | int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, |
75 | KEY_TABLE_TYPE keyTable); | ||
75 | void Camellia_EncryptBlock_Rounds(int grandRounds, const u8 plaintext[], | 76 | void Camellia_EncryptBlock_Rounds(int grandRounds, const u8 plaintext[], |
76 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); | 77 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); |
77 | void Camellia_DecryptBlock_Rounds(int grandRounds, const u8 ciphertext[], | 78 | void Camellia_DecryptBlock_Rounds(int grandRounds, const u8 ciphertext[], |
@@ -80,4 +81,6 @@ void Camellia_EncryptBlock(int keyBitLength, const u8 plaintext[], | |||
80 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); | 81 | const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); |
81 | void Camellia_DecryptBlock(int keyBitLength, const u8 ciphertext[], | 82 | void Camellia_DecryptBlock(int keyBitLength, const u8 ciphertext[], |
82 | const KEY_TABLE_TYPE keyTable, u8 plaintext[]); | 83 | const KEY_TABLE_TYPE keyTable, u8 plaintext[]); |
84 | int private_Camellia_set_key(const unsigned char *userKey, const int bits, | ||
85 | CAMELLIA_KEY *key); | ||
83 | #endif /* #ifndef HEADER_CAMELLIA_LOCL_H */ | 86 | #endif /* #ifndef HEADER_CAMELLIA_LOCL_H */ |
diff --git a/src/lib/libcrypto/camellia/cmll_misc.c b/src/lib/libcrypto/camellia/cmll_misc.c index f44689124b..f44d48564c 100644 --- a/src/lib/libcrypto/camellia/cmll_misc.c +++ b/src/lib/libcrypto/camellia/cmll_misc.c | |||
@@ -50,12 +50,13 @@ | |||
50 | */ | 50 | */ |
51 | 51 | ||
52 | #include <openssl/opensslv.h> | 52 | #include <openssl/opensslv.h> |
53 | #include <openssl/crypto.h> | ||
53 | #include <openssl/camellia.h> | 54 | #include <openssl/camellia.h> |
54 | #include "cmll_locl.h" | 55 | #include "cmll_locl.h" |
55 | 56 | ||
56 | const char CAMELLIA_version[]="CAMELLIA" OPENSSL_VERSION_PTEXT; | 57 | const char CAMELLIA_version[]="CAMELLIA" OPENSSL_VERSION_PTEXT; |
57 | 58 | ||
58 | int Camellia_set_key(const unsigned char *userKey, const int bits, | 59 | int private_Camellia_set_key(const unsigned char *userKey, const int bits, |
59 | CAMELLIA_KEY *key) | 60 | CAMELLIA_KEY *key) |
60 | { | 61 | { |
61 | if(!userKey || !key) | 62 | if(!userKey || !key) |
diff --git a/src/lib/libcrypto/cmac/cm_ameth.c b/src/lib/libcrypto/cmac/cm_ameth.c new file mode 100644 index 0000000000..0b8e5670b0 --- /dev/null +++ b/src/lib/libcrypto/cmac/cm_ameth.c | |||
@@ -0,0 +1,97 @@ | |||
1 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
2 | * project 2010. | ||
3 | */ | ||
4 | /* ==================================================================== | ||
5 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
6 | * | ||
7 | * Redistribution and use in source and binary forms, with or without | ||
8 | * modification, are permitted provided that the following conditions | ||
9 | * are met: | ||
10 | * | ||
11 | * 1. Redistributions of source code must retain the above copyright | ||
12 | * notice, this list of conditions and the following disclaimer. | ||
13 | * | ||
14 | * 2. Redistributions in binary form must reproduce the above copyright | ||
15 | * notice, this list of conditions and the following disclaimer in | ||
16 | * the documentation and/or other materials provided with the | ||
17 | * distribution. | ||
18 | * | ||
19 | * 3. All advertising materials mentioning features or use of this | ||
20 | * software must display the following acknowledgment: | ||
21 | * "This product includes software developed by the OpenSSL Project | ||
22 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
23 | * | ||
24 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
25 | * endorse or promote products derived from this software without | ||
26 | * prior written permission. For written permission, please contact | ||
27 | * licensing@OpenSSL.org. | ||
28 | * | ||
29 | * 5. Products derived from this software may not be called "OpenSSL" | ||
30 | * nor may "OpenSSL" appear in their names without prior written | ||
31 | * permission of the OpenSSL Project. | ||
32 | * | ||
33 | * 6. Redistributions of any form whatsoever must retain the following | ||
34 | * acknowledgment: | ||
35 | * "This product includes software developed by the OpenSSL Project | ||
36 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
37 | * | ||
38 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
39 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
40 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
41 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
42 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
43 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
44 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
45 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
46 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
47 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
48 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
49 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
50 | * ==================================================================== | ||
51 | */ | ||
52 | |||
53 | #include <stdio.h> | ||
54 | #include "cryptlib.h" | ||
55 | #include <openssl/evp.h> | ||
56 | #include <openssl/cmac.h> | ||
57 | #include "asn1_locl.h" | ||
58 | |||
59 | /* CMAC "ASN1" method. This is just here to indicate the | ||
60 | * maximum CMAC output length and to free up a CMAC | ||
61 | * key. | ||
62 | */ | ||
63 | |||
64 | static int cmac_size(const EVP_PKEY *pkey) | ||
65 | { | ||
66 | return EVP_MAX_BLOCK_LENGTH; | ||
67 | } | ||
68 | |||
69 | static void cmac_key_free(EVP_PKEY *pkey) | ||
70 | { | ||
71 | CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr; | ||
72 | if (cmctx) | ||
73 | CMAC_CTX_free(cmctx); | ||
74 | } | ||
75 | |||
76 | const EVP_PKEY_ASN1_METHOD cmac_asn1_meth = | ||
77 | { | ||
78 | EVP_PKEY_CMAC, | ||
79 | EVP_PKEY_CMAC, | ||
80 | 0, | ||
81 | |||
82 | "CMAC", | ||
83 | "OpenSSL CMAC method", | ||
84 | |||
85 | 0,0,0,0, | ||
86 | |||
87 | 0,0,0, | ||
88 | |||
89 | cmac_size, | ||
90 | 0, | ||
91 | 0,0,0,0,0,0,0, | ||
92 | |||
93 | cmac_key_free, | ||
94 | 0, | ||
95 | 0,0 | ||
96 | }; | ||
97 | |||
diff --git a/src/lib/libcrypto/cmac/cm_pmeth.c b/src/lib/libcrypto/cmac/cm_pmeth.c new file mode 100644 index 0000000000..072228ec7f --- /dev/null +++ b/src/lib/libcrypto/cmac/cm_pmeth.c | |||
@@ -0,0 +1,224 @@ | |||
1 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
2 | * project 2010. | ||
3 | */ | ||
4 | /* ==================================================================== | ||
5 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
6 | * | ||
7 | * Redistribution and use in source and binary forms, with or without | ||
8 | * modification, are permitted provided that the following conditions | ||
9 | * are met: | ||
10 | * | ||
11 | * 1. Redistributions of source code must retain the above copyright | ||
12 | * notice, this list of conditions and the following disclaimer. | ||
13 | * | ||
14 | * 2. Redistributions in binary form must reproduce the above copyright | ||
15 | * notice, this list of conditions and the following disclaimer in | ||
16 | * the documentation and/or other materials provided with the | ||
17 | * distribution. | ||
18 | * | ||
19 | * 3. All advertising materials mentioning features or use of this | ||
20 | * software must display the following acknowledgment: | ||
21 | * "This product includes software developed by the OpenSSL Project | ||
22 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
23 | * | ||
24 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
25 | * endorse or promote products derived from this software without | ||
26 | * prior written permission. For written permission, please contact | ||
27 | * licensing@OpenSSL.org. | ||
28 | * | ||
29 | * 5. Products derived from this software may not be called "OpenSSL" | ||
30 | * nor may "OpenSSL" appear in their names without prior written | ||
31 | * permission of the OpenSSL Project. | ||
32 | * | ||
33 | * 6. Redistributions of any form whatsoever must retain the following | ||
34 | * acknowledgment: | ||
35 | * "This product includes software developed by the OpenSSL Project | ||
36 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
37 | * | ||
38 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
39 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
40 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
41 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
42 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
43 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
44 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
45 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
46 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
47 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
48 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
49 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
50 | * ==================================================================== | ||
51 | */ | ||
52 | |||
53 | #include <stdio.h> | ||
54 | #include "cryptlib.h" | ||
55 | #include <openssl/x509.h> | ||
56 | #include <openssl/x509v3.h> | ||
57 | #include <openssl/evp.h> | ||
58 | #include <openssl/cmac.h> | ||
59 | #include "evp_locl.h" | ||
60 | |||
61 | /* The context structure and "key" is simply a CMAC_CTX */ | ||
62 | |||
63 | static int pkey_cmac_init(EVP_PKEY_CTX *ctx) | ||
64 | { | ||
65 | ctx->data = CMAC_CTX_new(); | ||
66 | if (!ctx->data) | ||
67 | return 0; | ||
68 | ctx->keygen_info_count = 0; | ||
69 | return 1; | ||
70 | } | ||
71 | |||
72 | static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) | ||
73 | { | ||
74 | if (!pkey_cmac_init(dst)) | ||
75 | return 0; | ||
76 | if (!CMAC_CTX_copy(dst->data, src->data)) | ||
77 | return 0; | ||
78 | return 1; | ||
79 | } | ||
80 | |||
81 | static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx) | ||
82 | { | ||
83 | CMAC_CTX_free(ctx->data); | ||
84 | } | ||
85 | |||
86 | static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) | ||
87 | { | ||
88 | CMAC_CTX *cmkey = CMAC_CTX_new(); | ||
89 | CMAC_CTX *cmctx = ctx->data; | ||
90 | if (!cmkey) | ||
91 | return 0; | ||
92 | if (!CMAC_CTX_copy(cmkey, cmctx)) | ||
93 | { | ||
94 | CMAC_CTX_free(cmkey); | ||
95 | return 0; | ||
96 | } | ||
97 | EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey); | ||
98 | |||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) | ||
103 | { | ||
104 | if (!CMAC_Update(ctx->pctx->data, data, count)) | ||
105 | return 0; | ||
106 | return 1; | ||
107 | } | ||
108 | |||
109 | static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx) | ||
110 | { | ||
111 | EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT); | ||
112 | mctx->update = int_update; | ||
113 | return 1; | ||
114 | } | ||
115 | |||
116 | static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | ||
117 | EVP_MD_CTX *mctx) | ||
118 | { | ||
119 | return CMAC_Final(ctx->data, sig, siglen); | ||
120 | } | ||
121 | |||
122 | static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | ||
123 | { | ||
124 | CMAC_CTX *cmctx = ctx->data; | ||
125 | switch (type) | ||
126 | { | ||
127 | |||
128 | case EVP_PKEY_CTRL_SET_MAC_KEY: | ||
129 | if (!p2 || p1 < 0) | ||
130 | return 0; | ||
131 | if (!CMAC_Init(cmctx, p2, p1, NULL, NULL)) | ||
132 | return 0; | ||
133 | break; | ||
134 | |||
135 | case EVP_PKEY_CTRL_CIPHER: | ||
136 | if (!CMAC_Init(cmctx, NULL, 0, p2, ctx->engine)) | ||
137 | return 0; | ||
138 | break; | ||
139 | |||
140 | case EVP_PKEY_CTRL_MD: | ||
141 | if (ctx->pkey && !CMAC_CTX_copy(ctx->data, | ||
142 | (CMAC_CTX *)ctx->pkey->pkey.ptr)) | ||
143 | return 0; | ||
144 | if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL)) | ||
145 | return 0; | ||
146 | break; | ||
147 | |||
148 | default: | ||
149 | return -2; | ||
150 | |||
151 | } | ||
152 | return 1; | ||
153 | } | ||
154 | |||
155 | static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx, | ||
156 | const char *type, const char *value) | ||
157 | { | ||
158 | if (!value) | ||
159 | { | ||
160 | return 0; | ||
161 | } | ||
162 | if (!strcmp(type, "key")) | ||
163 | { | ||
164 | void *p = (void *)value; | ||
165 | return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, | ||
166 | strlen(p), p); | ||
167 | } | ||
168 | if (!strcmp(type, "cipher")) | ||
169 | { | ||
170 | const EVP_CIPHER *c; | ||
171 | c = EVP_get_cipherbyname(value); | ||
172 | if (!c) | ||
173 | return 0; | ||
174 | return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c); | ||
175 | } | ||
176 | if (!strcmp(type, "hexkey")) | ||
177 | { | ||
178 | unsigned char *key; | ||
179 | int r; | ||
180 | long keylen; | ||
181 | key = string_to_hex(value, &keylen); | ||
182 | if (!key) | ||
183 | return 0; | ||
184 | r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key); | ||
185 | OPENSSL_free(key); | ||
186 | return r; | ||
187 | } | ||
188 | return -2; | ||
189 | } | ||
190 | |||
191 | const EVP_PKEY_METHOD cmac_pkey_meth = | ||
192 | { | ||
193 | EVP_PKEY_CMAC, | ||
194 | EVP_PKEY_FLAG_SIGCTX_CUSTOM, | ||
195 | pkey_cmac_init, | ||
196 | pkey_cmac_copy, | ||
197 | pkey_cmac_cleanup, | ||
198 | |||
199 | 0, 0, | ||
200 | |||
201 | 0, | ||
202 | pkey_cmac_keygen, | ||
203 | |||
204 | 0, 0, | ||
205 | |||
206 | 0, 0, | ||
207 | |||
208 | 0,0, | ||
209 | |||
210 | cmac_signctx_init, | ||
211 | cmac_signctx, | ||
212 | |||
213 | 0,0, | ||
214 | |||
215 | 0,0, | ||
216 | |||
217 | 0,0, | ||
218 | |||
219 | 0,0, | ||
220 | |||
221 | pkey_cmac_ctrl, | ||
222 | pkey_cmac_ctrl_str | ||
223 | |||
224 | }; | ||
diff --git a/src/lib/libcrypto/cmac/cmac.c b/src/lib/libcrypto/cmac/cmac.c new file mode 100644 index 0000000000..8b72b09681 --- /dev/null +++ b/src/lib/libcrypto/cmac/cmac.c | |||
@@ -0,0 +1,308 @@ | |||
1 | /* crypto/cmac/cmac.c */ | ||
2 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
3 | * project. | ||
4 | */ | ||
5 | /* ==================================================================== | ||
6 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
7 | * | ||
8 | * Redistribution and use in source and binary forms, with or without | ||
9 | * modification, are permitted provided that the following conditions | ||
10 | * are met: | ||
11 | * | ||
12 | * 1. Redistributions of source code must retain the above copyright | ||
13 | * notice, this list of conditions and the following disclaimer. | ||
14 | * | ||
15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
16 | * notice, this list of conditions and the following disclaimer in | ||
17 | * the documentation and/or other materials provided with the | ||
18 | * distribution. | ||
19 | * | ||
20 | * 3. All advertising materials mentioning features or use of this | ||
21 | * software must display the following acknowledgment: | ||
22 | * "This product includes software developed by the OpenSSL Project | ||
23 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
24 | * | ||
25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
26 | * endorse or promote products derived from this software without | ||
27 | * prior written permission. For written permission, please contact | ||
28 | * licensing@OpenSSL.org. | ||
29 | * | ||
30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
31 | * nor may "OpenSSL" appear in their names without prior written | ||
32 | * permission of the OpenSSL Project. | ||
33 | * | ||
34 | * 6. Redistributions of any form whatsoever must retain the following | ||
35 | * acknowledgment: | ||
36 | * "This product includes software developed by the OpenSSL Project | ||
37 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
38 | * | ||
39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
51 | * ==================================================================== | ||
52 | */ | ||
53 | |||
54 | #include <stdio.h> | ||
55 | #include <stdlib.h> | ||
56 | #include <string.h> | ||
57 | #include "cryptlib.h" | ||
58 | #include <openssl/cmac.h> | ||
59 | |||
60 | #ifdef OPENSSL_FIPS | ||
61 | #include <openssl/fips.h> | ||
62 | #endif | ||
63 | |||
64 | struct CMAC_CTX_st | ||
65 | { | ||
66 | /* Cipher context to use */ | ||
67 | EVP_CIPHER_CTX cctx; | ||
68 | /* Keys k1 and k2 */ | ||
69 | unsigned char k1[EVP_MAX_BLOCK_LENGTH]; | ||
70 | unsigned char k2[EVP_MAX_BLOCK_LENGTH]; | ||
71 | /* Temporary block */ | ||
72 | unsigned char tbl[EVP_MAX_BLOCK_LENGTH]; | ||
73 | /* Last (possibly partial) block */ | ||
74 | unsigned char last_block[EVP_MAX_BLOCK_LENGTH]; | ||
75 | /* Number of bytes in last block: -1 means context not initialised */ | ||
76 | int nlast_block; | ||
77 | }; | ||
78 | |||
79 | |||
80 | /* Make temporary keys K1 and K2 */ | ||
81 | |||
82 | static void make_kn(unsigned char *k1, unsigned char *l, int bl) | ||
83 | { | ||
84 | int i; | ||
85 | /* Shift block to left, including carry */ | ||
86 | for (i = 0; i < bl; i++) | ||
87 | { | ||
88 | k1[i] = l[i] << 1; | ||
89 | if (i < bl - 1 && l[i + 1] & 0x80) | ||
90 | k1[i] |= 1; | ||
91 | } | ||
92 | /* If MSB set fixup with R */ | ||
93 | if (l[0] & 0x80) | ||
94 | k1[bl - 1] ^= bl == 16 ? 0x87 : 0x1b; | ||
95 | } | ||
96 | |||
97 | CMAC_CTX *CMAC_CTX_new(void) | ||
98 | { | ||
99 | CMAC_CTX *ctx; | ||
100 | ctx = OPENSSL_malloc(sizeof(CMAC_CTX)); | ||
101 | if (!ctx) | ||
102 | return NULL; | ||
103 | EVP_CIPHER_CTX_init(&ctx->cctx); | ||
104 | ctx->nlast_block = -1; | ||
105 | return ctx; | ||
106 | } | ||
107 | |||
108 | void CMAC_CTX_cleanup(CMAC_CTX *ctx) | ||
109 | { | ||
110 | #ifdef OPENSSL_FIPS | ||
111 | if (FIPS_mode() && !ctx->cctx.engine) | ||
112 | { | ||
113 | FIPS_cmac_ctx_cleanup(ctx); | ||
114 | return; | ||
115 | } | ||
116 | #endif | ||
117 | EVP_CIPHER_CTX_cleanup(&ctx->cctx); | ||
118 | OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH); | ||
119 | OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH); | ||
120 | OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH); | ||
121 | OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH); | ||
122 | ctx->nlast_block = -1; | ||
123 | } | ||
124 | |||
125 | EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx) | ||
126 | { | ||
127 | return &ctx->cctx; | ||
128 | } | ||
129 | |||
130 | void CMAC_CTX_free(CMAC_CTX *ctx) | ||
131 | { | ||
132 | CMAC_CTX_cleanup(ctx); | ||
133 | OPENSSL_free(ctx); | ||
134 | } | ||
135 | |||
136 | int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in) | ||
137 | { | ||
138 | int bl; | ||
139 | if (in->nlast_block == -1) | ||
140 | return 0; | ||
141 | if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx)) | ||
142 | return 0; | ||
143 | bl = EVP_CIPHER_CTX_block_size(&in->cctx); | ||
144 | memcpy(out->k1, in->k1, bl); | ||
145 | memcpy(out->k2, in->k2, bl); | ||
146 | memcpy(out->tbl, in->tbl, bl); | ||
147 | memcpy(out->last_block, in->last_block, bl); | ||
148 | out->nlast_block = in->nlast_block; | ||
149 | return 1; | ||
150 | } | ||
151 | |||
152 | int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, | ||
153 | const EVP_CIPHER *cipher, ENGINE *impl) | ||
154 | { | ||
155 | static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH]; | ||
156 | #ifdef OPENSSL_FIPS | ||
157 | if (FIPS_mode()) | ||
158 | { | ||
159 | /* If we have an ENGINE need to allow non FIPS */ | ||
160 | if ((impl || ctx->cctx.engine) | ||
161 | && !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)) | ||
162 | |||
163 | { | ||
164 | EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS); | ||
165 | return 0; | ||
166 | } | ||
167 | /* Other algorithm blocking will be done in FIPS_cmac_init, | ||
168 | * via FIPS_cipherinit(). | ||
169 | */ | ||
170 | if (!impl && !ctx->cctx.engine) | ||
171 | return FIPS_cmac_init(ctx, key, keylen, cipher, NULL); | ||
172 | } | ||
173 | #endif | ||
174 | /* All zeros means restart */ | ||
175 | if (!key && !cipher && !impl && keylen == 0) | ||
176 | { | ||
177 | /* Not initialised */ | ||
178 | if (ctx->nlast_block == -1) | ||
179 | return 0; | ||
180 | if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv)) | ||
181 | return 0; | ||
182 | memset(ctx->tbl, 0, EVP_CIPHER_CTX_block_size(&ctx->cctx)); | ||
183 | ctx->nlast_block = 0; | ||
184 | return 1; | ||
185 | } | ||
186 | /* Initialiase context */ | ||
187 | if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL)) | ||
188 | return 0; | ||
189 | /* Non-NULL key means initialisation complete */ | ||
190 | if (key) | ||
191 | { | ||
192 | int bl; | ||
193 | if (!EVP_CIPHER_CTX_cipher(&ctx->cctx)) | ||
194 | return 0; | ||
195 | if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen)) | ||
196 | return 0; | ||
197 | if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv)) | ||
198 | return 0; | ||
199 | bl = EVP_CIPHER_CTX_block_size(&ctx->cctx); | ||
200 | if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl)) | ||
201 | return 0; | ||
202 | make_kn(ctx->k1, ctx->tbl, bl); | ||
203 | make_kn(ctx->k2, ctx->k1, bl); | ||
204 | OPENSSL_cleanse(ctx->tbl, bl); | ||
205 | /* Reset context again ready for first data block */ | ||
206 | if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv)) | ||
207 | return 0; | ||
208 | /* Zero tbl so resume works */ | ||
209 | memset(ctx->tbl, 0, bl); | ||
210 | ctx->nlast_block = 0; | ||
211 | } | ||
212 | return 1; | ||
213 | } | ||
214 | |||
215 | int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen) | ||
216 | { | ||
217 | const unsigned char *data = in; | ||
218 | size_t bl; | ||
219 | #ifdef OPENSSL_FIPS | ||
220 | if (FIPS_mode() && !ctx->cctx.engine) | ||
221 | return FIPS_cmac_update(ctx, in, dlen); | ||
222 | #endif | ||
223 | if (ctx->nlast_block == -1) | ||
224 | return 0; | ||
225 | if (dlen == 0) | ||
226 | return 1; | ||
227 | bl = EVP_CIPHER_CTX_block_size(&ctx->cctx); | ||
228 | /* Copy into partial block if we need to */ | ||
229 | if (ctx->nlast_block > 0) | ||
230 | { | ||
231 | size_t nleft; | ||
232 | nleft = bl - ctx->nlast_block; | ||
233 | if (dlen < nleft) | ||
234 | nleft = dlen; | ||
235 | memcpy(ctx->last_block + ctx->nlast_block, data, nleft); | ||
236 | dlen -= nleft; | ||
237 | ctx->nlast_block += nleft; | ||
238 | /* If no more to process return */ | ||
239 | if (dlen == 0) | ||
240 | return 1; | ||
241 | data += nleft; | ||
242 | /* Else not final block so encrypt it */ | ||
243 | if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl)) | ||
244 | return 0; | ||
245 | } | ||
246 | /* Encrypt all but one of the complete blocks left */ | ||
247 | while(dlen > bl) | ||
248 | { | ||
249 | if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl)) | ||
250 | return 0; | ||
251 | dlen -= bl; | ||
252 | data += bl; | ||
253 | } | ||
254 | /* Copy any data left to last block buffer */ | ||
255 | memcpy(ctx->last_block, data, dlen); | ||
256 | ctx->nlast_block = dlen; | ||
257 | return 1; | ||
258 | |||
259 | } | ||
260 | |||
261 | int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen) | ||
262 | { | ||
263 | int i, bl, lb; | ||
264 | #ifdef OPENSSL_FIPS | ||
265 | if (FIPS_mode() && !ctx->cctx.engine) | ||
266 | return FIPS_cmac_final(ctx, out, poutlen); | ||
267 | #endif | ||
268 | if (ctx->nlast_block == -1) | ||
269 | return 0; | ||
270 | bl = EVP_CIPHER_CTX_block_size(&ctx->cctx); | ||
271 | *poutlen = (size_t)bl; | ||
272 | if (!out) | ||
273 | return 1; | ||
274 | lb = ctx->nlast_block; | ||
275 | /* Is last block complete? */ | ||
276 | if (lb == bl) | ||
277 | { | ||
278 | for (i = 0; i < bl; i++) | ||
279 | out[i] = ctx->last_block[i] ^ ctx->k1[i]; | ||
280 | } | ||
281 | else | ||
282 | { | ||
283 | ctx->last_block[lb] = 0x80; | ||
284 | if (bl - lb > 1) | ||
285 | memset(ctx->last_block + lb + 1, 0, bl - lb - 1); | ||
286 | for (i = 0; i < bl; i++) | ||
287 | out[i] = ctx->last_block[i] ^ ctx->k2[i]; | ||
288 | } | ||
289 | if (!EVP_Cipher(&ctx->cctx, out, out, bl)) | ||
290 | { | ||
291 | OPENSSL_cleanse(out, bl); | ||
292 | return 0; | ||
293 | } | ||
294 | return 1; | ||
295 | } | ||
296 | |||
297 | int CMAC_resume(CMAC_CTX *ctx) | ||
298 | { | ||
299 | if (ctx->nlast_block == -1) | ||
300 | return 0; | ||
301 | /* The buffer "tbl" containes the last fully encrypted block | ||
302 | * which is the last IV (or all zeroes if no last encrypted block). | ||
303 | * The last block has not been modified since CMAC_final(). | ||
304 | * So reinitliasing using the last decrypted block will allow | ||
305 | * CMAC to continue after calling CMAC_Final(). | ||
306 | */ | ||
307 | return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl); | ||
308 | } | ||
diff --git a/src/lib/libcrypto/cmac/cmac.h b/src/lib/libcrypto/cmac/cmac.h new file mode 100644 index 0000000000..712e92dced --- /dev/null +++ b/src/lib/libcrypto/cmac/cmac.h | |||
@@ -0,0 +1,82 @@ | |||
1 | /* crypto/cmac/cmac.h */ | ||
2 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
3 | * project. | ||
4 | */ | ||
5 | /* ==================================================================== | ||
6 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
7 | * | ||
8 | * Redistribution and use in source and binary forms, with or without | ||
9 | * modification, are permitted provided that the following conditions | ||
10 | * are met: | ||
11 | * | ||
12 | * 1. Redistributions of source code must retain the above copyright | ||
13 | * notice, this list of conditions and the following disclaimer. | ||
14 | * | ||
15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
16 | * notice, this list of conditions and the following disclaimer in | ||
17 | * the documentation and/or other materials provided with the | ||
18 | * distribution. | ||
19 | * | ||
20 | * 3. All advertising materials mentioning features or use of this | ||
21 | * software must display the following acknowledgment: | ||
22 | * "This product includes software developed by the OpenSSL Project | ||
23 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
24 | * | ||
25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
26 | * endorse or promote products derived from this software without | ||
27 | * prior written permission. For written permission, please contact | ||
28 | * licensing@OpenSSL.org. | ||
29 | * | ||
30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
31 | * nor may "OpenSSL" appear in their names without prior written | ||
32 | * permission of the OpenSSL Project. | ||
33 | * | ||
34 | * 6. Redistributions of any form whatsoever must retain the following | ||
35 | * acknowledgment: | ||
36 | * "This product includes software developed by the OpenSSL Project | ||
37 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
38 | * | ||
39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
51 | * ==================================================================== | ||
52 | */ | ||
53 | |||
54 | |||
55 | #ifndef HEADER_CMAC_H | ||
56 | #define HEADER_CMAC_H | ||
57 | |||
58 | #ifdef __cplusplus | ||
59 | extern "C" { | ||
60 | #endif | ||
61 | |||
62 | #include <openssl/evp.h> | ||
63 | |||
64 | /* Opaque */ | ||
65 | typedef struct CMAC_CTX_st CMAC_CTX; | ||
66 | |||
67 | CMAC_CTX *CMAC_CTX_new(void); | ||
68 | void CMAC_CTX_cleanup(CMAC_CTX *ctx); | ||
69 | void CMAC_CTX_free(CMAC_CTX *ctx); | ||
70 | EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx); | ||
71 | int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in); | ||
72 | |||
73 | int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, | ||
74 | const EVP_CIPHER *cipher, ENGINE *impl); | ||
75 | int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen); | ||
76 | int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen); | ||
77 | int CMAC_resume(CMAC_CTX *ctx); | ||
78 | |||
79 | #ifdef __cplusplus | ||
80 | } | ||
81 | #endif | ||
82 | #endif | ||
diff --git a/src/lib/libcrypto/cms/cms.h b/src/lib/libcrypto/cms/cms.h index 09c45d0412..36994fa6a2 100644 --- a/src/lib/libcrypto/cms/cms.h +++ b/src/lib/libcrypto/cms/cms.h | |||
@@ -111,6 +111,7 @@ DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo) | |||
111 | #define CMS_PARTIAL 0x4000 | 111 | #define CMS_PARTIAL 0x4000 |
112 | #define CMS_REUSE_DIGEST 0x8000 | 112 | #define CMS_REUSE_DIGEST 0x8000 |
113 | #define CMS_USE_KEYID 0x10000 | 113 | #define CMS_USE_KEYID 0x10000 |
114 | #define CMS_DEBUG_DECRYPT 0x20000 | ||
114 | 115 | ||
115 | const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms); | 116 | const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms); |
116 | 117 | ||
@@ -184,6 +185,8 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert); | |||
184 | int CMS_decrypt_set1_key(CMS_ContentInfo *cms, | 185 | int CMS_decrypt_set1_key(CMS_ContentInfo *cms, |
185 | unsigned char *key, size_t keylen, | 186 | unsigned char *key, size_t keylen, |
186 | unsigned char *id, size_t idlen); | 187 | unsigned char *id, size_t idlen); |
188 | int CMS_decrypt_set1_password(CMS_ContentInfo *cms, | ||
189 | unsigned char *pass, ossl_ssize_t passlen); | ||
187 | 190 | ||
188 | STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms); | 191 | STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms); |
189 | int CMS_RecipientInfo_type(CMS_RecipientInfo *ri); | 192 | int CMS_RecipientInfo_type(CMS_RecipientInfo *ri); |
@@ -219,6 +222,16 @@ int CMS_RecipientInfo_set0_key(CMS_RecipientInfo *ri, | |||
219 | int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri, | 222 | int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri, |
220 | const unsigned char *id, size_t idlen); | 223 | const unsigned char *id, size_t idlen); |
221 | 224 | ||
225 | int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, | ||
226 | unsigned char *pass, | ||
227 | ossl_ssize_t passlen); | ||
228 | |||
229 | CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms, | ||
230 | int iter, int wrap_nid, int pbe_nid, | ||
231 | unsigned char *pass, | ||
232 | ossl_ssize_t passlen, | ||
233 | const EVP_CIPHER *kekciph); | ||
234 | |||
222 | int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri); | 235 | int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri); |
223 | 236 | ||
224 | int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out, | 237 | int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out, |
@@ -330,6 +343,7 @@ void ERR_load_CMS_strings(void); | |||
330 | #define CMS_F_CHECK_CONTENT 99 | 343 | #define CMS_F_CHECK_CONTENT 99 |
331 | #define CMS_F_CMS_ADD0_CERT 164 | 344 | #define CMS_F_CMS_ADD0_CERT 164 |
332 | #define CMS_F_CMS_ADD0_RECIPIENT_KEY 100 | 345 | #define CMS_F_CMS_ADD0_RECIPIENT_KEY 100 |
346 | #define CMS_F_CMS_ADD0_RECIPIENT_PASSWORD 165 | ||
333 | #define CMS_F_CMS_ADD1_RECEIPTREQUEST 158 | 347 | #define CMS_F_CMS_ADD1_RECEIPTREQUEST 158 |
334 | #define CMS_F_CMS_ADD1_RECIPIENT_CERT 101 | 348 | #define CMS_F_CMS_ADD1_RECIPIENT_CERT 101 |
335 | #define CMS_F_CMS_ADD1_SIGNER 102 | 349 | #define CMS_F_CMS_ADD1_SIGNER 102 |
@@ -344,6 +358,7 @@ void ERR_load_CMS_strings(void); | |||
344 | #define CMS_F_CMS_DATAINIT 111 | 358 | #define CMS_F_CMS_DATAINIT 111 |
345 | #define CMS_F_CMS_DECRYPT 112 | 359 | #define CMS_F_CMS_DECRYPT 112 |
346 | #define CMS_F_CMS_DECRYPT_SET1_KEY 113 | 360 | #define CMS_F_CMS_DECRYPT_SET1_KEY 113 |
361 | #define CMS_F_CMS_DECRYPT_SET1_PASSWORD 166 | ||
347 | #define CMS_F_CMS_DECRYPT_SET1_PKEY 114 | 362 | #define CMS_F_CMS_DECRYPT_SET1_PKEY 114 |
348 | #define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX 115 | 363 | #define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX 115 |
349 | #define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO 116 | 364 | #define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO 116 |
@@ -378,7 +393,9 @@ void ERR_load_CMS_strings(void); | |||
378 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT 141 | 393 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT 141 |
379 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS 142 | 394 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS 142 |
380 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID 143 | 395 | #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID 143 |
396 | #define CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT 167 | ||
381 | #define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144 | 397 | #define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144 |
398 | #define CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD 168 | ||
382 | #define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145 | 399 | #define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145 |
383 | #define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146 | 400 | #define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146 |
384 | #define CMS_F_CMS_SET_DETACHED 147 | 401 | #define CMS_F_CMS_SET_DETACHED 147 |
@@ -419,6 +436,7 @@ void ERR_load_CMS_strings(void); | |||
419 | #define CMS_R_ERROR_SETTING_KEY 115 | 436 | #define CMS_R_ERROR_SETTING_KEY 115 |
420 | #define CMS_R_ERROR_SETTING_RECIPIENTINFO 116 | 437 | #define CMS_R_ERROR_SETTING_RECIPIENTINFO 116 |
421 | #define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH 117 | 438 | #define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH 117 |
439 | #define CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER 176 | ||
422 | #define CMS_R_INVALID_KEY_LENGTH 118 | 440 | #define CMS_R_INVALID_KEY_LENGTH 118 |
423 | #define CMS_R_MD_BIO_INIT_ERROR 119 | 441 | #define CMS_R_MD_BIO_INIT_ERROR 119 |
424 | #define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH 120 | 442 | #define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH 120 |
@@ -431,6 +449,7 @@ void ERR_load_CMS_strings(void); | |||
431 | #define CMS_R_NOT_ENCRYPTED_DATA 122 | 449 | #define CMS_R_NOT_ENCRYPTED_DATA 122 |
432 | #define CMS_R_NOT_KEK 123 | 450 | #define CMS_R_NOT_KEK 123 |
433 | #define CMS_R_NOT_KEY_TRANSPORT 124 | 451 | #define CMS_R_NOT_KEY_TRANSPORT 124 |
452 | #define CMS_R_NOT_PWRI 177 | ||
434 | #define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125 | 453 | #define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125 |
435 | #define CMS_R_NO_CIPHER 126 | 454 | #define CMS_R_NO_CIPHER 126 |
436 | #define CMS_R_NO_CONTENT 127 | 455 | #define CMS_R_NO_CONTENT 127 |
@@ -443,6 +462,7 @@ void ERR_load_CMS_strings(void); | |||
443 | #define CMS_R_NO_MATCHING_RECIPIENT 132 | 462 | #define CMS_R_NO_MATCHING_RECIPIENT 132 |
444 | #define CMS_R_NO_MATCHING_SIGNATURE 166 | 463 | #define CMS_R_NO_MATCHING_SIGNATURE 166 |
445 | #define CMS_R_NO_MSGSIGDIGEST 167 | 464 | #define CMS_R_NO_MSGSIGDIGEST 167 |
465 | #define CMS_R_NO_PASSWORD 178 | ||
446 | #define CMS_R_NO_PRIVATE_KEY 133 | 466 | #define CMS_R_NO_PRIVATE_KEY 133 |
447 | #define CMS_R_NO_PUBLIC_KEY 134 | 467 | #define CMS_R_NO_PUBLIC_KEY 134 |
448 | #define CMS_R_NO_RECEIPT_REQUEST 168 | 468 | #define CMS_R_NO_RECEIPT_REQUEST 168 |
@@ -466,10 +486,12 @@ void ERR_load_CMS_strings(void); | |||
466 | #define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM 151 | 486 | #define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM 151 |
467 | #define CMS_R_UNSUPPORTED_CONTENT_TYPE 152 | 487 | #define CMS_R_UNSUPPORTED_CONTENT_TYPE 152 |
468 | #define CMS_R_UNSUPPORTED_KEK_ALGORITHM 153 | 488 | #define CMS_R_UNSUPPORTED_KEK_ALGORITHM 153 |
489 | #define CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM 179 | ||
469 | #define CMS_R_UNSUPPORTED_RECIPIENT_TYPE 154 | 490 | #define CMS_R_UNSUPPORTED_RECIPIENT_TYPE 154 |
470 | #define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE 155 | 491 | #define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE 155 |
471 | #define CMS_R_UNSUPPORTED_TYPE 156 | 492 | #define CMS_R_UNSUPPORTED_TYPE 156 |
472 | #define CMS_R_UNWRAP_ERROR 157 | 493 | #define CMS_R_UNWRAP_ERROR 157 |
494 | #define CMS_R_UNWRAP_FAILURE 180 | ||
473 | #define CMS_R_VERIFICATION_FAILURE 158 | 495 | #define CMS_R_VERIFICATION_FAILURE 158 |
474 | #define CMS_R_WRAP_ERROR 159 | 496 | #define CMS_R_WRAP_ERROR 159 |
475 | 497 | ||
diff --git a/src/lib/libcrypto/cms/cms_asn1.c b/src/lib/libcrypto/cms/cms_asn1.c index fcba4dcbcc..cfe67fb6c1 100644 --- a/src/lib/libcrypto/cms/cms_asn1.c +++ b/src/lib/libcrypto/cms/cms_asn1.c | |||
@@ -237,6 +237,15 @@ static int cms_ri_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it, | |||
237 | OPENSSL_free(kekri->key); | 237 | OPENSSL_free(kekri->key); |
238 | } | 238 | } |
239 | } | 239 | } |
240 | else if (ri->type == CMS_RECIPINFO_PASS) | ||
241 | { | ||
242 | CMS_PasswordRecipientInfo *pwri = ri->d.pwri; | ||
243 | if (pwri->pass) | ||
244 | { | ||
245 | OPENSSL_cleanse(pwri->pass, pwri->passlen); | ||
246 | OPENSSL_free(pwri->pass); | ||
247 | } | ||
248 | } | ||
240 | } | 249 | } |
241 | return 1; | 250 | return 1; |
242 | } | 251 | } |
diff --git a/src/lib/libcrypto/cms/cms_enc.c b/src/lib/libcrypto/cms/cms_enc.c index bab26235bd..f873ce3794 100644 --- a/src/lib/libcrypto/cms/cms_enc.c +++ b/src/lib/libcrypto/cms/cms_enc.c | |||
@@ -73,6 +73,8 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) | |||
73 | const EVP_CIPHER *ciph; | 73 | const EVP_CIPHER *ciph; |
74 | X509_ALGOR *calg = ec->contentEncryptionAlgorithm; | 74 | X509_ALGOR *calg = ec->contentEncryptionAlgorithm; |
75 | unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL; | 75 | unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL; |
76 | unsigned char *tkey = NULL; | ||
77 | size_t tkeylen; | ||
76 | 78 | ||
77 | int ok = 0; | 79 | int ok = 0; |
78 | 80 | ||
@@ -137,32 +139,57 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) | |||
137 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); | 139 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); |
138 | goto err; | 140 | goto err; |
139 | } | 141 | } |
140 | 142 | tkeylen = EVP_CIPHER_CTX_key_length(ctx); | |
141 | 143 | /* Generate random session key */ | |
142 | if (enc && !ec->key) | 144 | if (!enc || !ec->key) |
143 | { | 145 | { |
144 | /* Generate random key */ | 146 | tkey = OPENSSL_malloc(tkeylen); |
145 | if (!ec->keylen) | 147 | if (!tkey) |
146 | ec->keylen = EVP_CIPHER_CTX_key_length(ctx); | ||
147 | ec->key = OPENSSL_malloc(ec->keylen); | ||
148 | if (!ec->key) | ||
149 | { | 148 | { |
150 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, | 149 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, |
151 | ERR_R_MALLOC_FAILURE); | 150 | ERR_R_MALLOC_FAILURE); |
152 | goto err; | 151 | goto err; |
153 | } | 152 | } |
154 | if (EVP_CIPHER_CTX_rand_key(ctx, ec->key) <= 0) | 153 | if (EVP_CIPHER_CTX_rand_key(ctx, tkey) <= 0) |
155 | goto err; | 154 | goto err; |
156 | keep_key = 1; | ||
157 | } | 155 | } |
158 | else if (ec->keylen != (unsigned int)EVP_CIPHER_CTX_key_length(ctx)) | 156 | |
157 | if (!ec->key) | ||
158 | { | ||
159 | ec->key = tkey; | ||
160 | ec->keylen = tkeylen; | ||
161 | tkey = NULL; | ||
162 | if (enc) | ||
163 | keep_key = 1; | ||
164 | else | ||
165 | ERR_clear_error(); | ||
166 | |||
167 | } | ||
168 | |||
169 | if (ec->keylen != tkeylen) | ||
159 | { | 170 | { |
160 | /* If necessary set key length */ | 171 | /* If necessary set key length */ |
161 | if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0) | 172 | if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0) |
162 | { | 173 | { |
163 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, | 174 | /* Only reveal failure if debugging so we don't |
164 | CMS_R_INVALID_KEY_LENGTH); | 175 | * leak information which may be useful in MMA. |
165 | goto err; | 176 | */ |
177 | if (enc || ec->debug) | ||
178 | { | ||
179 | CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, | ||
180 | CMS_R_INVALID_KEY_LENGTH); | ||
181 | goto err; | ||
182 | } | ||
183 | else | ||
184 | { | ||
185 | /* Use random key */ | ||
186 | OPENSSL_cleanse(ec->key, ec->keylen); | ||
187 | OPENSSL_free(ec->key); | ||
188 | ec->key = tkey; | ||
189 | ec->keylen = tkeylen; | ||
190 | tkey = NULL; | ||
191 | ERR_clear_error(); | ||
192 | } | ||
166 | } | 193 | } |
167 | } | 194 | } |
168 | 195 | ||
@@ -198,6 +225,11 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) | |||
198 | OPENSSL_free(ec->key); | 225 | OPENSSL_free(ec->key); |
199 | ec->key = NULL; | 226 | ec->key = NULL; |
200 | } | 227 | } |
228 | if (tkey) | ||
229 | { | ||
230 | OPENSSL_cleanse(tkey, tkeylen); | ||
231 | OPENSSL_free(tkey); | ||
232 | } | ||
201 | if (ok) | 233 | if (ok) |
202 | return b; | 234 | return b; |
203 | BIO_free(b); | 235 | BIO_free(b); |
diff --git a/src/lib/libcrypto/cms/cms_env.c b/src/lib/libcrypto/cms/cms_env.c index b3237d4b94..be20b1c024 100644 --- a/src/lib/libcrypto/cms/cms_env.c +++ b/src/lib/libcrypto/cms/cms_env.c | |||
@@ -65,14 +65,13 @@ | |||
65 | /* CMS EnvelopedData Utilities */ | 65 | /* CMS EnvelopedData Utilities */ |
66 | 66 | ||
67 | DECLARE_ASN1_ITEM(CMS_EnvelopedData) | 67 | DECLARE_ASN1_ITEM(CMS_EnvelopedData) |
68 | DECLARE_ASN1_ITEM(CMS_RecipientInfo) | ||
69 | DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo) | 68 | DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo) |
70 | DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo) | 69 | DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo) |
71 | DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute) | 70 | DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute) |
72 | 71 | ||
73 | DECLARE_STACK_OF(CMS_RecipientInfo) | 72 | DECLARE_STACK_OF(CMS_RecipientInfo) |
74 | 73 | ||
75 | static CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms) | 74 | CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms) |
76 | { | 75 | { |
77 | if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped) | 76 | if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped) |
78 | { | 77 | { |
@@ -371,6 +370,8 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms, | |||
371 | unsigned char *ek = NULL; | 370 | unsigned char *ek = NULL; |
372 | size_t eklen; | 371 | size_t eklen; |
373 | int ret = 0; | 372 | int ret = 0; |
373 | CMS_EncryptedContentInfo *ec; | ||
374 | ec = cms->d.envelopedData->encryptedContentInfo; | ||
374 | 375 | ||
375 | if (ktri->pkey == NULL) | 376 | if (ktri->pkey == NULL) |
376 | { | 377 | { |
@@ -417,8 +418,14 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms, | |||
417 | 418 | ||
418 | ret = 1; | 419 | ret = 1; |
419 | 420 | ||
420 | cms->d.envelopedData->encryptedContentInfo->key = ek; | 421 | if (ec->key) |
421 | cms->d.envelopedData->encryptedContentInfo->keylen = eklen; | 422 | { |
423 | OPENSSL_cleanse(ec->key, ec->keylen); | ||
424 | OPENSSL_free(ec->key); | ||
425 | } | ||
426 | |||
427 | ec->key = ek; | ||
428 | ec->keylen = eklen; | ||
422 | 429 | ||
423 | err: | 430 | err: |
424 | if (pctx) | 431 | if (pctx) |
@@ -786,6 +793,9 @@ int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri) | |||
786 | case CMS_RECIPINFO_KEK: | 793 | case CMS_RECIPINFO_KEK: |
787 | return cms_RecipientInfo_kekri_decrypt(cms, ri); | 794 | return cms_RecipientInfo_kekri_decrypt(cms, ri); |
788 | 795 | ||
796 | case CMS_RECIPINFO_PASS: | ||
797 | return cms_RecipientInfo_pwri_crypt(cms, ri, 0); | ||
798 | |||
789 | default: | 799 | default: |
790 | CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT, | 800 | CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT, |
791 | CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE); | 801 | CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE); |
@@ -829,6 +839,10 @@ BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms) | |||
829 | r = cms_RecipientInfo_kekri_encrypt(cms, ri); | 839 | r = cms_RecipientInfo_kekri_encrypt(cms, ri); |
830 | break; | 840 | break; |
831 | 841 | ||
842 | case CMS_RECIPINFO_PASS: | ||
843 | r = cms_RecipientInfo_pwri_crypt(cms, ri, 1); | ||
844 | break; | ||
845 | |||
832 | default: | 846 | default: |
833 | CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO, | 847 | CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO, |
834 | CMS_R_UNSUPPORTED_RECIPIENT_TYPE); | 848 | CMS_R_UNSUPPORTED_RECIPIENT_TYPE); |
diff --git a/src/lib/libcrypto/cms/cms_err.c b/src/lib/libcrypto/cms/cms_err.c index ff7b0309e5..8330ead7ed 100644 --- a/src/lib/libcrypto/cms/cms_err.c +++ b/src/lib/libcrypto/cms/cms_err.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* crypto/cms/cms_err.c */ | 1 | /* crypto/cms/cms_err.c */ |
2 | /* ==================================================================== | 2 | /* ==================================================================== |
3 | * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved. |
4 | * | 4 | * |
5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
@@ -73,6 +73,7 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
73 | {ERR_FUNC(CMS_F_CHECK_CONTENT), "CHECK_CONTENT"}, | 73 | {ERR_FUNC(CMS_F_CHECK_CONTENT), "CHECK_CONTENT"}, |
74 | {ERR_FUNC(CMS_F_CMS_ADD0_CERT), "CMS_add0_cert"}, | 74 | {ERR_FUNC(CMS_F_CMS_ADD0_CERT), "CMS_add0_cert"}, |
75 | {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY), "CMS_add0_recipient_key"}, | 75 | {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY), "CMS_add0_recipient_key"}, |
76 | {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD), "CMS_add0_recipient_password"}, | ||
76 | {ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST), "CMS_add1_ReceiptRequest"}, | 77 | {ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST), "CMS_add1_ReceiptRequest"}, |
77 | {ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT), "CMS_add1_recipient_cert"}, | 78 | {ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT), "CMS_add1_recipient_cert"}, |
78 | {ERR_FUNC(CMS_F_CMS_ADD1_SIGNER), "CMS_add1_signer"}, | 79 | {ERR_FUNC(CMS_F_CMS_ADD1_SIGNER), "CMS_add1_signer"}, |
@@ -87,6 +88,7 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
87 | {ERR_FUNC(CMS_F_CMS_DATAINIT), "CMS_dataInit"}, | 88 | {ERR_FUNC(CMS_F_CMS_DATAINIT), "CMS_dataInit"}, |
88 | {ERR_FUNC(CMS_F_CMS_DECRYPT), "CMS_decrypt"}, | 89 | {ERR_FUNC(CMS_F_CMS_DECRYPT), "CMS_decrypt"}, |
89 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY), "CMS_decrypt_set1_key"}, | 90 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY), "CMS_decrypt_set1_key"}, |
91 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PASSWORD), "CMS_decrypt_set1_password"}, | ||
90 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY), "CMS_decrypt_set1_pkey"}, | 92 | {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY), "CMS_decrypt_set1_pkey"}, |
91 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX), "cms_DigestAlgorithm_find_ctx"}, | 93 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX), "cms_DigestAlgorithm_find_ctx"}, |
92 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO), "cms_DigestAlgorithm_init_bio"}, | 94 | {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO), "cms_DigestAlgorithm_init_bio"}, |
@@ -105,7 +107,7 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
105 | {ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES), "CMS_GET0_CERTIFICATE_CHOICES"}, | 107 | {ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES), "CMS_GET0_CERTIFICATE_CHOICES"}, |
106 | {ERR_FUNC(CMS_F_CMS_GET0_CONTENT), "CMS_get0_content"}, | 108 | {ERR_FUNC(CMS_F_CMS_GET0_CONTENT), "CMS_get0_content"}, |
107 | {ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE), "CMS_GET0_ECONTENT_TYPE"}, | 109 | {ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE), "CMS_GET0_ECONTENT_TYPE"}, |
108 | {ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "CMS_GET0_ENVELOPED"}, | 110 | {ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "cms_get0_enveloped"}, |
109 | {ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES), "CMS_GET0_REVOCATION_CHOICES"}, | 111 | {ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES), "CMS_GET0_REVOCATION_CHOICES"}, |
110 | {ERR_FUNC(CMS_F_CMS_GET0_SIGNED), "CMS_GET0_SIGNED"}, | 112 | {ERR_FUNC(CMS_F_CMS_GET0_SIGNED), "CMS_GET0_SIGNED"}, |
111 | {ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1), "cms_msgSigDigest_add1"}, | 113 | {ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1), "cms_msgSigDigest_add1"}, |
@@ -121,7 +123,9 @@ static ERR_STRING_DATA CMS_str_functs[]= | |||
121 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT), "CMS_RECIPIENTINFO_KTRI_ENCRYPT"}, | 123 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT), "CMS_RECIPIENTINFO_KTRI_ENCRYPT"}, |
122 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS), "CMS_RecipientInfo_ktri_get0_algs"}, | 124 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS), "CMS_RecipientInfo_ktri_get0_algs"}, |
123 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID), "CMS_RecipientInfo_ktri_get0_signer_id"}, | 125 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID), "CMS_RecipientInfo_ktri_get0_signer_id"}, |
126 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT), "cms_RecipientInfo_pwri_crypt"}, | ||
124 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY), "CMS_RecipientInfo_set0_key"}, | 127 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY), "CMS_RecipientInfo_set0_key"}, |
128 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD), "CMS_RecipientInfo_set0_password"}, | ||
125 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY), "CMS_RecipientInfo_set0_pkey"}, | 129 | {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY), "CMS_RecipientInfo_set0_pkey"}, |
126 | {ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"}, | 130 | {ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"}, |
127 | {ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"}, | 131 | {ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"}, |
@@ -165,6 +169,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
165 | {ERR_REASON(CMS_R_ERROR_SETTING_KEY) ,"error setting key"}, | 169 | {ERR_REASON(CMS_R_ERROR_SETTING_KEY) ,"error setting key"}, |
166 | {ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"}, | 170 | {ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"}, |
167 | {ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"}, | 171 | {ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"}, |
172 | {ERR_REASON(CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER),"invalid key encryption parameter"}, | ||
168 | {ERR_REASON(CMS_R_INVALID_KEY_LENGTH) ,"invalid key length"}, | 173 | {ERR_REASON(CMS_R_INVALID_KEY_LENGTH) ,"invalid key length"}, |
169 | {ERR_REASON(CMS_R_MD_BIO_INIT_ERROR) ,"md bio init error"}, | 174 | {ERR_REASON(CMS_R_MD_BIO_INIT_ERROR) ,"md bio init error"}, |
170 | {ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"}, | 175 | {ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"}, |
@@ -177,6 +182,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
177 | {ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA) ,"not encrypted data"}, | 182 | {ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA) ,"not encrypted data"}, |
178 | {ERR_REASON(CMS_R_NOT_KEK) ,"not kek"}, | 183 | {ERR_REASON(CMS_R_NOT_KEK) ,"not kek"}, |
179 | {ERR_REASON(CMS_R_NOT_KEY_TRANSPORT) ,"not key transport"}, | 184 | {ERR_REASON(CMS_R_NOT_KEY_TRANSPORT) ,"not key transport"}, |
185 | {ERR_REASON(CMS_R_NOT_PWRI) ,"not pwri"}, | ||
180 | {ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"}, | 186 | {ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"}, |
181 | {ERR_REASON(CMS_R_NO_CIPHER) ,"no cipher"}, | 187 | {ERR_REASON(CMS_R_NO_CIPHER) ,"no cipher"}, |
182 | {ERR_REASON(CMS_R_NO_CONTENT) ,"no content"}, | 188 | {ERR_REASON(CMS_R_NO_CONTENT) ,"no content"}, |
@@ -189,6 +195,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
189 | {ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"}, | 195 | {ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"}, |
190 | {ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"}, | 196 | {ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"}, |
191 | {ERR_REASON(CMS_R_NO_MSGSIGDIGEST) ,"no msgsigdigest"}, | 197 | {ERR_REASON(CMS_R_NO_MSGSIGDIGEST) ,"no msgsigdigest"}, |
198 | {ERR_REASON(CMS_R_NO_PASSWORD) ,"no password"}, | ||
192 | {ERR_REASON(CMS_R_NO_PRIVATE_KEY) ,"no private key"}, | 199 | {ERR_REASON(CMS_R_NO_PRIVATE_KEY) ,"no private key"}, |
193 | {ERR_REASON(CMS_R_NO_PUBLIC_KEY) ,"no public key"}, | 200 | {ERR_REASON(CMS_R_NO_PUBLIC_KEY) ,"no public key"}, |
194 | {ERR_REASON(CMS_R_NO_RECEIPT_REQUEST) ,"no receipt request"}, | 201 | {ERR_REASON(CMS_R_NO_RECEIPT_REQUEST) ,"no receipt request"}, |
@@ -212,10 +219,12 @@ static ERR_STRING_DATA CMS_str_reasons[]= | |||
212 | {ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"}, | 219 | {ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"}, |
213 | {ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"}, | 220 | {ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"}, |
214 | {ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"}, | 221 | {ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"}, |
222 | {ERR_REASON(CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM),"unsupported key encryption algorithm"}, | ||
215 | {ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"}, | 223 | {ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"}, |
216 | {ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"}, | 224 | {ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"}, |
217 | {ERR_REASON(CMS_R_UNSUPPORTED_TYPE) ,"unsupported type"}, | 225 | {ERR_REASON(CMS_R_UNSUPPORTED_TYPE) ,"unsupported type"}, |
218 | {ERR_REASON(CMS_R_UNWRAP_ERROR) ,"unwrap error"}, | 226 | {ERR_REASON(CMS_R_UNWRAP_ERROR) ,"unwrap error"}, |
227 | {ERR_REASON(CMS_R_UNWRAP_FAILURE) ,"unwrap failure"}, | ||
219 | {ERR_REASON(CMS_R_VERIFICATION_FAILURE) ,"verification failure"}, | 228 | {ERR_REASON(CMS_R_VERIFICATION_FAILURE) ,"verification failure"}, |
220 | {ERR_REASON(CMS_R_WRAP_ERROR) ,"wrap error"}, | 229 | {ERR_REASON(CMS_R_WRAP_ERROR) ,"wrap error"}, |
221 | {0,NULL} | 230 | {0,NULL} |
diff --git a/src/lib/libcrypto/cms/cms_lcl.h b/src/lib/libcrypto/cms/cms_lcl.h index c8ecfa724a..a9f9730157 100644 --- a/src/lib/libcrypto/cms/cms_lcl.h +++ b/src/lib/libcrypto/cms/cms_lcl.h | |||
@@ -175,6 +175,8 @@ struct CMS_EncryptedContentInfo_st | |||
175 | const EVP_CIPHER *cipher; | 175 | const EVP_CIPHER *cipher; |
176 | unsigned char *key; | 176 | unsigned char *key; |
177 | size_t keylen; | 177 | size_t keylen; |
178 | /* Set to 1 if we are debugging decrypt and don't fake keys for MMA */ | ||
179 | int debug; | ||
178 | }; | 180 | }; |
179 | 181 | ||
180 | struct CMS_RecipientInfo_st | 182 | struct CMS_RecipientInfo_st |
@@ -273,6 +275,9 @@ struct CMS_PasswordRecipientInfo_st | |||
273 | X509_ALGOR *keyDerivationAlgorithm; | 275 | X509_ALGOR *keyDerivationAlgorithm; |
274 | X509_ALGOR *keyEncryptionAlgorithm; | 276 | X509_ALGOR *keyEncryptionAlgorithm; |
275 | ASN1_OCTET_STRING *encryptedKey; | 277 | ASN1_OCTET_STRING *encryptedKey; |
278 | /* Extra info: password to use */ | ||
279 | unsigned char *pass; | ||
280 | size_t passlen; | ||
276 | }; | 281 | }; |
277 | 282 | ||
278 | struct CMS_OtherRecipientInfo_st | 283 | struct CMS_OtherRecipientInfo_st |
@@ -411,6 +416,8 @@ DECLARE_ASN1_ITEM(CMS_SignerInfo) | |||
411 | DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber) | 416 | DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber) |
412 | DECLARE_ASN1_ITEM(CMS_Attributes_Sign) | 417 | DECLARE_ASN1_ITEM(CMS_Attributes_Sign) |
413 | DECLARE_ASN1_ITEM(CMS_Attributes_Verify) | 418 | DECLARE_ASN1_ITEM(CMS_Attributes_Verify) |
419 | DECLARE_ASN1_ITEM(CMS_RecipientInfo) | ||
420 | DECLARE_ASN1_ITEM(CMS_PasswordRecipientInfo) | ||
414 | DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber) | 421 | DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber) |
415 | 422 | ||
416 | #define CMS_SIGNERINFO_ISSUER_SERIAL 0 | 423 | #define CMS_SIGNERINFO_ISSUER_SERIAL 0 |
@@ -454,6 +461,11 @@ int cms_msgSigDigest_add1(CMS_SignerInfo *dest, CMS_SignerInfo *src); | |||
454 | ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si); | 461 | ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si); |
455 | 462 | ||
456 | BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms); | 463 | BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms); |
464 | CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms); | ||
465 | |||
466 | /* PWRI routines */ | ||
467 | int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri, | ||
468 | int en_de); | ||
457 | 469 | ||
458 | #ifdef __cplusplus | 470 | #ifdef __cplusplus |
459 | } | 471 | } |
diff --git a/src/lib/libcrypto/cms/cms_lib.c b/src/lib/libcrypto/cms/cms_lib.c index d00fe0f87b..f88e8f3b52 100644 --- a/src/lib/libcrypto/cms/cms_lib.c +++ b/src/lib/libcrypto/cms/cms_lib.c | |||
@@ -412,8 +412,7 @@ int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain, | |||
412 | */ | 412 | */ |
413 | || EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid) | 413 | || EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid) |
414 | { | 414 | { |
415 | EVP_MD_CTX_copy_ex(mctx, mtmp); | 415 | return EVP_MD_CTX_copy_ex(mctx, mtmp); |
416 | return 1; | ||
417 | } | 416 | } |
418 | chain = BIO_next(chain); | 417 | chain = BIO_next(chain); |
419 | } | 418 | } |
diff --git a/src/lib/libcrypto/cms/cms_pwri.c b/src/lib/libcrypto/cms/cms_pwri.c new file mode 100644 index 0000000000..b79612a12d --- /dev/null +++ b/src/lib/libcrypto/cms/cms_pwri.c | |||
@@ -0,0 +1,454 @@ | |||
1 | /* crypto/cms/cms_pwri.c */ | ||
2 | /* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | ||
3 | * project. | ||
4 | */ | ||
5 | /* ==================================================================== | ||
6 | * Copyright (c) 2009 The OpenSSL Project. All rights reserved. | ||
7 | * | ||
8 | * Redistribution and use in source and binary forms, with or without | ||
9 | * modification, are permitted provided that the following conditions | ||
10 | * are met: | ||
11 | * | ||
12 | * 1. Redistributions of source code must retain the above copyright | ||
13 | * notice, this list of conditions and the following disclaimer. | ||
14 | * | ||
15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
16 | * notice, this list of conditions and the following disclaimer in | ||
17 | * the documentation and/or other materials provided with the | ||
18 | * distribution. | ||
19 | * | ||
20 | * 3. All advertising materials mentioning features or use of this | ||
21 | * software must display the following acknowledgment: | ||
22 | * "This product includes software developed by the OpenSSL Project | ||
23 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
24 | * | ||
25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
26 | * endorse or promote products derived from this software without | ||
27 | * prior written permission. For written permission, please contact | ||
28 | * licensing@OpenSSL.org. | ||
29 | * | ||
30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
31 | * nor may "OpenSSL" appear in their names without prior written | ||
32 | * permission of the OpenSSL Project. | ||
33 | * | ||
34 | * 6. Redistributions of any form whatsoever must retain the following | ||
35 | * acknowledgment: | ||
36 | * "This product includes software developed by the OpenSSL Project | ||
37 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
38 | * | ||
39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
51 | * ==================================================================== | ||
52 | */ | ||
53 | |||
54 | #include "cryptlib.h" | ||
55 | #include <openssl/asn1t.h> | ||
56 | #include <openssl/pem.h> | ||
57 | #include <openssl/x509v3.h> | ||
58 | #include <openssl/err.h> | ||
59 | #include <openssl/cms.h> | ||
60 | #include <openssl/rand.h> | ||
61 | #include <openssl/aes.h> | ||
62 | #include "cms_lcl.h" | ||
63 | #include "asn1_locl.h" | ||
64 | |||
65 | int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, | ||
66 | unsigned char *pass, ossl_ssize_t passlen) | ||
67 | { | ||
68 | CMS_PasswordRecipientInfo *pwri; | ||
69 | if (ri->type != CMS_RECIPINFO_PASS) | ||
70 | { | ||
71 | CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD, CMS_R_NOT_PWRI); | ||
72 | return 0; | ||
73 | } | ||
74 | |||
75 | pwri = ri->d.pwri; | ||
76 | pwri->pass = pass; | ||
77 | if (pass && passlen < 0) | ||
78 | passlen = strlen((char *)pass); | ||
79 | pwri->passlen = passlen; | ||
80 | return 1; | ||
81 | } | ||
82 | |||
83 | CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms, | ||
84 | int iter, int wrap_nid, int pbe_nid, | ||
85 | unsigned char *pass, | ||
86 | ossl_ssize_t passlen, | ||
87 | const EVP_CIPHER *kekciph) | ||
88 | { | ||
89 | CMS_RecipientInfo *ri = NULL; | ||
90 | CMS_EnvelopedData *env; | ||
91 | CMS_PasswordRecipientInfo *pwri; | ||
92 | EVP_CIPHER_CTX ctx; | ||
93 | X509_ALGOR *encalg = NULL; | ||
94 | unsigned char iv[EVP_MAX_IV_LENGTH]; | ||
95 | int ivlen; | ||
96 | env = cms_get0_enveloped(cms); | ||
97 | if (!env) | ||
98 | goto err; | ||
99 | |||
100 | if (wrap_nid <= 0) | ||
101 | wrap_nid = NID_id_alg_PWRI_KEK; | ||
102 | |||
103 | if (pbe_nid <= 0) | ||
104 | pbe_nid = NID_id_pbkdf2; | ||
105 | |||
106 | /* Get from enveloped data */ | ||
107 | if (kekciph == NULL) | ||
108 | kekciph = env->encryptedContentInfo->cipher; | ||
109 | |||
110 | if (kekciph == NULL) | ||
111 | { | ||
112 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, CMS_R_NO_CIPHER); | ||
113 | return NULL; | ||
114 | } | ||
115 | if (wrap_nid != NID_id_alg_PWRI_KEK) | ||
116 | { | ||
117 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
118 | CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM); | ||
119 | return NULL; | ||
120 | } | ||
121 | |||
122 | /* Setup algorithm identifier for cipher */ | ||
123 | encalg = X509_ALGOR_new(); | ||
124 | EVP_CIPHER_CTX_init(&ctx); | ||
125 | |||
126 | if (EVP_EncryptInit_ex(&ctx, kekciph, NULL, NULL, NULL) <= 0) | ||
127 | { | ||
128 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_EVP_LIB); | ||
129 | goto err; | ||
130 | } | ||
131 | |||
132 | ivlen = EVP_CIPHER_CTX_iv_length(&ctx); | ||
133 | |||
134 | if (ivlen > 0) | ||
135 | { | ||
136 | if (RAND_pseudo_bytes(iv, ivlen) <= 0) | ||
137 | goto err; | ||
138 | if (EVP_EncryptInit_ex(&ctx, NULL, NULL, NULL, iv) <= 0) | ||
139 | { | ||
140 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
141 | ERR_R_EVP_LIB); | ||
142 | goto err; | ||
143 | } | ||
144 | encalg->parameter = ASN1_TYPE_new(); | ||
145 | if (!encalg->parameter) | ||
146 | { | ||
147 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
148 | ERR_R_MALLOC_FAILURE); | ||
149 | goto err; | ||
150 | } | ||
151 | if (EVP_CIPHER_param_to_asn1(&ctx, encalg->parameter) <= 0) | ||
152 | { | ||
153 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, | ||
154 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); | ||
155 | goto err; | ||
156 | } | ||
157 | } | ||
158 | |||
159 | |||
160 | encalg->algorithm = OBJ_nid2obj(EVP_CIPHER_CTX_type(&ctx)); | ||
161 | |||
162 | EVP_CIPHER_CTX_cleanup(&ctx); | ||
163 | |||
164 | /* Initialize recipient info */ | ||
165 | ri = M_ASN1_new_of(CMS_RecipientInfo); | ||
166 | if (!ri) | ||
167 | goto merr; | ||
168 | |||
169 | ri->d.pwri = M_ASN1_new_of(CMS_PasswordRecipientInfo); | ||
170 | if (!ri->d.pwri) | ||
171 | goto merr; | ||
172 | ri->type = CMS_RECIPINFO_PASS; | ||
173 | |||
174 | pwri = ri->d.pwri; | ||
175 | /* Since this is overwritten, free up empty structure already there */ | ||
176 | X509_ALGOR_free(pwri->keyEncryptionAlgorithm); | ||
177 | pwri->keyEncryptionAlgorithm = X509_ALGOR_new(); | ||
178 | if (!pwri->keyEncryptionAlgorithm) | ||
179 | goto merr; | ||
180 | pwri->keyEncryptionAlgorithm->algorithm = OBJ_nid2obj(wrap_nid); | ||
181 | pwri->keyEncryptionAlgorithm->parameter = ASN1_TYPE_new(); | ||
182 | if (!pwri->keyEncryptionAlgorithm->parameter) | ||
183 | goto merr; | ||
184 | |||
185 | if(!ASN1_item_pack(encalg, ASN1_ITEM_rptr(X509_ALGOR), | ||
186 | &pwri->keyEncryptionAlgorithm->parameter->value.sequence)) | ||
187 | goto merr; | ||
188 | pwri->keyEncryptionAlgorithm->parameter->type = V_ASN1_SEQUENCE; | ||
189 | |||
190 | X509_ALGOR_free(encalg); | ||
191 | encalg = NULL; | ||
192 | |||
193 | /* Setup PBE algorithm */ | ||
194 | |||
195 | pwri->keyDerivationAlgorithm = PKCS5_pbkdf2_set(iter, NULL, 0, -1, -1); | ||
196 | |||
197 | if (!pwri->keyDerivationAlgorithm) | ||
198 | goto err; | ||
199 | |||
200 | CMS_RecipientInfo_set0_password(ri, pass, passlen); | ||
201 | pwri->version = 0; | ||
202 | |||
203 | if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri)) | ||
204 | goto merr; | ||
205 | |||
206 | return ri; | ||
207 | |||
208 | merr: | ||
209 | CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_MALLOC_FAILURE); | ||
210 | err: | ||
211 | EVP_CIPHER_CTX_cleanup(&ctx); | ||
212 | if (ri) | ||
213 | M_ASN1_free_of(ri, CMS_RecipientInfo); | ||
214 | if (encalg) | ||
215 | X509_ALGOR_free(encalg); | ||
216 | return NULL; | ||
217 | |||
218 | } | ||
219 | |||
220 | /* This is an implementation of the key wrapping mechanism in RFC3211, | ||
221 | * at some point this should go into EVP. | ||
222 | */ | ||
223 | |||
224 | static int kek_unwrap_key(unsigned char *out, size_t *outlen, | ||
225 | const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx) | ||
226 | { | ||
227 | size_t blocklen = EVP_CIPHER_CTX_block_size(ctx); | ||
228 | unsigned char *tmp; | ||
229 | int outl, rv = 0; | ||
230 | if (inlen < 2 * blocklen) | ||
231 | { | ||
232 | /* too small */ | ||
233 | return 0; | ||
234 | } | ||
235 | if (inlen % blocklen) | ||
236 | { | ||
237 | /* Invalid size */ | ||
238 | return 0; | ||
239 | } | ||
240 | tmp = OPENSSL_malloc(inlen); | ||
241 | /* setup IV by decrypting last two blocks */ | ||
242 | EVP_DecryptUpdate(ctx, tmp + inlen - 2 * blocklen, &outl, | ||
243 | in + inlen - 2 * blocklen, blocklen * 2); | ||
244 | /* Do a decrypt of last decrypted block to set IV to correct value | ||
245 | * output it to start of buffer so we don't corrupt decrypted block | ||
246 | * this works because buffer is at least two block lengths long. | ||
247 | */ | ||
248 | EVP_DecryptUpdate(ctx, tmp, &outl, | ||
249 | tmp + inlen - blocklen, blocklen); | ||
250 | /* Can now decrypt first n - 1 blocks */ | ||
251 | EVP_DecryptUpdate(ctx, tmp, &outl, in, inlen - blocklen); | ||
252 | |||
253 | /* Reset IV to original value */ | ||
254 | EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, NULL); | ||
255 | /* Decrypt again */ | ||
256 | EVP_DecryptUpdate(ctx, tmp, &outl, tmp, inlen); | ||
257 | /* Check check bytes */ | ||
258 | if (((tmp[1] ^ tmp[4]) & (tmp[2] ^ tmp[5]) & (tmp[3] ^ tmp[6])) != 0xff) | ||
259 | { | ||
260 | /* Check byte failure */ | ||
261 | goto err; | ||
262 | } | ||
263 | if (inlen < (size_t)(tmp[0] - 4 )) | ||
264 | { | ||
265 | /* Invalid length value */ | ||
266 | goto err; | ||
267 | } | ||
268 | *outlen = (size_t)tmp[0]; | ||
269 | memcpy(out, tmp + 4, *outlen); | ||
270 | rv = 1; | ||
271 | err: | ||
272 | OPENSSL_cleanse(tmp, inlen); | ||
273 | OPENSSL_free(tmp); | ||
274 | return rv; | ||
275 | |||
276 | } | ||
277 | |||
278 | static int kek_wrap_key(unsigned char *out, size_t *outlen, | ||
279 | const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx) | ||
280 | { | ||
281 | size_t blocklen = EVP_CIPHER_CTX_block_size(ctx); | ||
282 | size_t olen; | ||
283 | int dummy; | ||
284 | /* First decide length of output buffer: need header and round up to | ||
285 | * multiple of block length. | ||
286 | */ | ||
287 | olen = (inlen + 4 + blocklen - 1)/blocklen; | ||
288 | olen *= blocklen; | ||
289 | if (olen < 2 * blocklen) | ||
290 | { | ||
291 | /* Key too small */ | ||
292 | return 0; | ||
293 | } | ||
294 | if (inlen > 0xFF) | ||
295 | { | ||
296 | /* Key too large */ | ||
297 | return 0; | ||
298 | } | ||
299 | if (out) | ||
300 | { | ||
301 | /* Set header */ | ||
302 | out[0] = (unsigned char)inlen; | ||
303 | out[1] = in[0] ^ 0xFF; | ||
304 | out[2] = in[1] ^ 0xFF; | ||
305 | out[3] = in[2] ^ 0xFF; | ||
306 | memcpy(out + 4, in, inlen); | ||
307 | /* Add random padding to end */ | ||
308 | if (olen > inlen + 4) | ||
309 | RAND_pseudo_bytes(out + 4 + inlen, olen - 4 - inlen); | ||
310 | /* Encrypt twice */ | ||
311 | EVP_EncryptUpdate(ctx, out, &dummy, out, olen); | ||
312 | EVP_EncryptUpdate(ctx, out, &dummy, out, olen); | ||
313 | } | ||
314 | |||
315 | *outlen = olen; | ||
316 | |||
317 | return 1; | ||
318 | } | ||
319 | |||
320 | /* Encrypt/Decrypt content key in PWRI recipient info */ | ||
321 | |||
322 | int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri, | ||
323 | int en_de) | ||
324 | { | ||
325 | CMS_EncryptedContentInfo *ec; | ||
326 | CMS_PasswordRecipientInfo *pwri; | ||
327 | const unsigned char *p = NULL; | ||
328 | int plen; | ||
329 | int r = 0; | ||
330 | X509_ALGOR *algtmp, *kekalg = NULL; | ||
331 | EVP_CIPHER_CTX kekctx; | ||
332 | const EVP_CIPHER *kekcipher; | ||
333 | unsigned char *key = NULL; | ||
334 | size_t keylen; | ||
335 | |||
336 | ec = cms->d.envelopedData->encryptedContentInfo; | ||
337 | |||
338 | pwri = ri->d.pwri; | ||
339 | EVP_CIPHER_CTX_init(&kekctx); | ||
340 | |||
341 | if (!pwri->pass) | ||
342 | { | ||
343 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, CMS_R_NO_PASSWORD); | ||
344 | return 0; | ||
345 | } | ||
346 | algtmp = pwri->keyEncryptionAlgorithm; | ||
347 | |||
348 | if (!algtmp || OBJ_obj2nid(algtmp->algorithm) != NID_id_alg_PWRI_KEK) | ||
349 | { | ||
350 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
351 | CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM); | ||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | if (algtmp->parameter->type == V_ASN1_SEQUENCE) | ||
356 | { | ||
357 | p = algtmp->parameter->value.sequence->data; | ||
358 | plen = algtmp->parameter->value.sequence->length; | ||
359 | kekalg = d2i_X509_ALGOR(NULL, &p, plen); | ||
360 | } | ||
361 | if (kekalg == NULL) | ||
362 | { | ||
363 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
364 | CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER); | ||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | kekcipher = EVP_get_cipherbyobj(kekalg->algorithm); | ||
369 | |||
370 | if(!kekcipher) | ||
371 | { | ||
372 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
373 | CMS_R_UNKNOWN_CIPHER); | ||
374 | goto err; | ||
375 | } | ||
376 | |||
377 | /* Fixup cipher based on AlgorithmIdentifier to set IV etc */ | ||
378 | if (!EVP_CipherInit_ex(&kekctx, kekcipher, NULL, NULL, NULL, en_de)) | ||
379 | goto err; | ||
380 | EVP_CIPHER_CTX_set_padding(&kekctx, 0); | ||
381 | if(EVP_CIPHER_asn1_to_param(&kekctx, kekalg->parameter) < 0) | ||
382 | { | ||
383 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
384 | CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); | ||
385 | goto err; | ||
386 | } | ||
387 | |||
388 | algtmp = pwri->keyDerivationAlgorithm; | ||
389 | |||
390 | /* Finish password based key derivation to setup key in "ctx" */ | ||
391 | |||
392 | if (EVP_PBE_CipherInit(algtmp->algorithm, | ||
393 | (char *)pwri->pass, pwri->passlen, | ||
394 | algtmp->parameter, &kekctx, en_de) < 0) | ||
395 | { | ||
396 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, ERR_R_EVP_LIB); | ||
397 | goto err; | ||
398 | } | ||
399 | |||
400 | /* Finally wrap/unwrap the key */ | ||
401 | |||
402 | if (en_de) | ||
403 | { | ||
404 | |||
405 | if (!kek_wrap_key(NULL, &keylen, ec->key, ec->keylen, &kekctx)) | ||
406 | goto err; | ||
407 | |||
408 | key = OPENSSL_malloc(keylen); | ||
409 | |||
410 | if (!key) | ||
411 | goto err; | ||
412 | |||
413 | if (!kek_wrap_key(key, &keylen, ec->key, ec->keylen, &kekctx)) | ||
414 | goto err; | ||
415 | pwri->encryptedKey->data = key; | ||
416 | pwri->encryptedKey->length = keylen; | ||
417 | } | ||
418 | else | ||
419 | { | ||
420 | key = OPENSSL_malloc(pwri->encryptedKey->length); | ||
421 | |||
422 | if (!key) | ||
423 | { | ||
424 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
425 | ERR_R_MALLOC_FAILURE); | ||
426 | goto err; | ||
427 | } | ||
428 | if (!kek_unwrap_key(key, &keylen, | ||
429 | pwri->encryptedKey->data, | ||
430 | pwri->encryptedKey->length, &kekctx)) | ||
431 | { | ||
432 | CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, | ||
433 | CMS_R_UNWRAP_FAILURE); | ||
434 | goto err; | ||
435 | } | ||
436 | |||
437 | ec->key = key; | ||
438 | ec->keylen = keylen; | ||
439 | |||
440 | } | ||
441 | |||
442 | r = 1; | ||
443 | |||
444 | err: | ||
445 | |||
446 | EVP_CIPHER_CTX_cleanup(&kekctx); | ||
447 | |||
448 | if (!r && key) | ||
449 | OPENSSL_free(key); | ||
450 | X509_ALGOR_free(kekalg); | ||
451 | |||
452 | return r; | ||
453 | |||
454 | } | ||
diff --git a/src/lib/libcrypto/cms/cms_sd.c b/src/lib/libcrypto/cms/cms_sd.c index e3192b9c57..77fbd13596 100644 --- a/src/lib/libcrypto/cms/cms_sd.c +++ b/src/lib/libcrypto/cms/cms_sd.c | |||
@@ -641,7 +641,8 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms, | |||
641 | cms->d.signedData->encapContentInfo->eContentType; | 641 | cms->d.signedData->encapContentInfo->eContentType; |
642 | unsigned char md[EVP_MAX_MD_SIZE]; | 642 | unsigned char md[EVP_MAX_MD_SIZE]; |
643 | unsigned int mdlen; | 643 | unsigned int mdlen; |
644 | EVP_DigestFinal_ex(&mctx, md, &mdlen); | 644 | if (!EVP_DigestFinal_ex(&mctx, md, &mdlen)) |
645 | goto err; | ||
645 | if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest, | 646 | if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest, |
646 | V_ASN1_OCTET_STRING, | 647 | V_ASN1_OCTET_STRING, |
647 | md, mdlen)) | 648 | md, mdlen)) |
diff --git a/src/lib/libcrypto/dh/dh_ameth.c b/src/lib/libcrypto/dh/dh_ameth.c index 377caf96c9..02ec2d47b4 100644 --- a/src/lib/libcrypto/dh/dh_ameth.c +++ b/src/lib/libcrypto/dh/dh_ameth.c | |||
@@ -493,6 +493,7 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth = | |||
493 | dh_copy_parameters, | 493 | dh_copy_parameters, |
494 | dh_cmp_parameters, | 494 | dh_cmp_parameters, |
495 | dh_param_print, | 495 | dh_param_print, |
496 | 0, | ||
496 | 497 | ||
497 | int_dh_free, | 498 | int_dh_free, |
498 | 0 | 499 | 0 |
diff --git a/src/lib/libcrypto/dsa/dsa_ameth.c b/src/lib/libcrypto/dsa/dsa_ameth.c index 6413aae46e..376156ec5e 100644 --- a/src/lib/libcrypto/dsa/dsa_ameth.c +++ b/src/lib/libcrypto/dsa/dsa_ameth.c | |||
@@ -542,6 +542,52 @@ static int old_dsa_priv_encode(const EVP_PKEY *pkey, unsigned char **pder) | |||
542 | return i2d_DSAPrivateKey(pkey->pkey.dsa, pder); | 542 | return i2d_DSAPrivateKey(pkey->pkey.dsa, pder); |
543 | } | 543 | } |
544 | 544 | ||
545 | static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg, | ||
546 | const ASN1_STRING *sig, | ||
547 | int indent, ASN1_PCTX *pctx) | ||
548 | { | ||
549 | DSA_SIG *dsa_sig; | ||
550 | const unsigned char *p; | ||
551 | if (!sig) | ||
552 | { | ||
553 | if (BIO_puts(bp, "\n") <= 0) | ||
554 | return 0; | ||
555 | else | ||
556 | return 1; | ||
557 | } | ||
558 | p = sig->data; | ||
559 | dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length); | ||
560 | if (dsa_sig) | ||
561 | { | ||
562 | int rv = 0; | ||
563 | size_t buf_len = 0; | ||
564 | unsigned char *m=NULL; | ||
565 | update_buflen(dsa_sig->r, &buf_len); | ||
566 | update_buflen(dsa_sig->s, &buf_len); | ||
567 | m = OPENSSL_malloc(buf_len+10); | ||
568 | if (m == NULL) | ||
569 | { | ||
570 | DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE); | ||
571 | goto err; | ||
572 | } | ||
573 | |||
574 | if (BIO_write(bp, "\n", 1) != 1) | ||
575 | goto err; | ||
576 | |||
577 | if (!ASN1_bn_print(bp,"r: ",dsa_sig->r,m,indent)) | ||
578 | goto err; | ||
579 | if (!ASN1_bn_print(bp,"s: ",dsa_sig->s,m,indent)) | ||
580 | goto err; | ||
581 | rv = 1; | ||
582 | err: | ||
583 | if (m) | ||
584 | OPENSSL_free(m); | ||
585 | DSA_SIG_free(dsa_sig); | ||
586 | return rv; | ||
587 | } | ||
588 | return X509_signature_dump(bp, sig, indent); | ||
589 | } | ||
590 | |||
545 | static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) | 591 | static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) |
546 | { | 592 | { |
547 | switch (op) | 593 | switch (op) |
@@ -647,6 +693,7 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[] = | |||
647 | dsa_copy_parameters, | 693 | dsa_copy_parameters, |
648 | dsa_cmp_parameters, | 694 | dsa_cmp_parameters, |
649 | dsa_param_print, | 695 | dsa_param_print, |
696 | dsa_sig_print, | ||
650 | 697 | ||
651 | int_dsa_free, | 698 | int_dsa_free, |
652 | dsa_pkey_ctrl, | 699 | dsa_pkey_ctrl, |
diff --git a/src/lib/libcrypto/dsa/dsa_locl.h b/src/lib/libcrypto/dsa/dsa_locl.h index 2b8cfee3db..21e2e45242 100644 --- a/src/lib/libcrypto/dsa/dsa_locl.h +++ b/src/lib/libcrypto/dsa/dsa_locl.h | |||
@@ -56,4 +56,5 @@ | |||
56 | 56 | ||
57 | int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, | 57 | int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, |
58 | const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len, | 58 | const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len, |
59 | unsigned char *seed_out, | ||
59 | int *counter_ret, unsigned long *h_ret, BN_GENCB *cb); | 60 | int *counter_ret, unsigned long *h_ret, BN_GENCB *cb); |
diff --git a/src/lib/libcrypto/dsa/dsa_pmeth.c b/src/lib/libcrypto/dsa/dsa_pmeth.c index e2df54fec6..715d8d675b 100644 --- a/src/lib/libcrypto/dsa/dsa_pmeth.c +++ b/src/lib/libcrypto/dsa/dsa_pmeth.c | |||
@@ -189,7 +189,9 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
189 | EVP_MD_type((const EVP_MD *)p2) != NID_dsa && | 189 | EVP_MD_type((const EVP_MD *)p2) != NID_dsa && |
190 | EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA && | 190 | EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA && |
191 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && | 191 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && |
192 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256) | 192 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && |
193 | EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && | ||
194 | EVP_MD_type((const EVP_MD *)p2) != NID_sha512) | ||
193 | { | 195 | { |
194 | DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE); | 196 | DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE); |
195 | return 0; | 197 | return 0; |
@@ -253,7 +255,7 @@ static int pkey_dsa_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) | |||
253 | if (!dsa) | 255 | if (!dsa) |
254 | return 0; | 256 | return 0; |
255 | ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd, | 257 | ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd, |
256 | NULL, 0, NULL, NULL, pcb); | 258 | NULL, 0, NULL, NULL, NULL, pcb); |
257 | if (ret) | 259 | if (ret) |
258 | EVP_PKEY_assign_DSA(pkey, dsa); | 260 | EVP_PKEY_assign_DSA(pkey, dsa); |
259 | else | 261 | else |
diff --git a/src/lib/libcrypto/ec/ec2_mult.c b/src/lib/libcrypto/ec/ec2_mult.c index e12b9b284a..26f4a783fc 100644 --- a/src/lib/libcrypto/ec/ec2_mult.c +++ b/src/lib/libcrypto/ec/ec2_mult.c | |||
@@ -71,6 +71,8 @@ | |||
71 | 71 | ||
72 | #include "ec_lcl.h" | 72 | #include "ec_lcl.h" |
73 | 73 | ||
74 | #ifndef OPENSSL_NO_EC2M | ||
75 | |||
74 | 76 | ||
75 | /* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective | 77 | /* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective |
76 | * coordinates. | 78 | * coordinates. |
@@ -384,3 +386,5 @@ int ec_GF2m_have_precompute_mult(const EC_GROUP *group) | |||
384 | { | 386 | { |
385 | return ec_wNAF_have_precompute_mult(group); | 387 | return ec_wNAF_have_precompute_mult(group); |
386 | } | 388 | } |
389 | |||
390 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ec2_oct.c b/src/lib/libcrypto/ec/ec2_oct.c new file mode 100644 index 0000000000..f1d75e5ddf --- /dev/null +++ b/src/lib/libcrypto/ec/ec2_oct.c | |||
@@ -0,0 +1,407 @@ | |||
1 | /* crypto/ec/ec2_oct.c */ | ||
2 | /* ==================================================================== | ||
3 | * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. | ||
4 | * | ||
5 | * The Elliptic Curve Public-Key Crypto Library (ECC Code) included | ||
6 | * herein is developed by SUN MICROSYSTEMS, INC., and is contributed | ||
7 | * to the OpenSSL project. | ||
8 | * | ||
9 | * The ECC Code is licensed pursuant to the OpenSSL open source | ||
10 | * license provided below. | ||
11 | * | ||
12 | * The software is originally written by Sheueling Chang Shantz and | ||
13 | * Douglas Stebila of Sun Microsystems Laboratories. | ||
14 | * | ||
15 | */ | ||
16 | /* ==================================================================== | ||
17 | * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. | ||
18 | * | ||
19 | * Redistribution and use in source and binary forms, with or without | ||
20 | * modification, are permitted provided that the following conditions | ||
21 | * are met: | ||
22 | * | ||
23 | * 1. Redistributions of source code must retain the above copyright | ||
24 | * notice, this list of conditions and the following disclaimer. | ||
25 | * | ||
26 | * 2. Redistributions in binary form must reproduce the above copyright | ||
27 | * notice, this list of conditions and the following disclaimer in | ||
28 | * the documentation and/or other materials provided with the | ||
29 | * distribution. | ||
30 | * | ||
31 | * 3. All advertising materials mentioning features or use of this | ||
32 | * software must display the following acknowledgment: | ||
33 | * "This product includes software developed by the OpenSSL Project | ||
34 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
35 | * | ||
36 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
37 | * endorse or promote products derived from this software without | ||
38 | * prior written permission. For written permission, please contact | ||
39 | * openssl-core@openssl.org. | ||
40 | * | ||
41 | * 5. Products derived from this software may not be called "OpenSSL" | ||
42 | * nor may "OpenSSL" appear in their names without prior written | ||
43 | * permission of the OpenSSL Project. | ||
44 | * | ||
45 | * 6. Redistributions of any form whatsoever must retain the following | ||
46 | * acknowledgment: | ||
47 | * "This product includes software developed by the OpenSSL Project | ||
48 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
49 | * | ||
50 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
51 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
52 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
53 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
54 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
55 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
56 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
57 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
58 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
59 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
60 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
61 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
62 | * ==================================================================== | ||
63 | * | ||
64 | * This product includes cryptographic software written by Eric Young | ||
65 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
66 | * Hudson (tjh@cryptsoft.com). | ||
67 | * | ||
68 | */ | ||
69 | |||
70 | #include <openssl/err.h> | ||
71 | |||
72 | #include "ec_lcl.h" | ||
73 | |||
74 | #ifndef OPENSSL_NO_EC2M | ||
75 | |||
76 | /* Calculates and sets the affine coordinates of an EC_POINT from the given | ||
77 | * compressed coordinates. Uses algorithm 2.3.4 of SEC 1. | ||
78 | * Note that the simple implementation only uses affine coordinates. | ||
79 | * | ||
80 | * The method is from the following publication: | ||
81 | * | ||
82 | * Harper, Menezes, Vanstone: | ||
83 | * "Public-Key Cryptosystems with Very Small Key Lengths", | ||
84 | * EUROCRYPT '92, Springer-Verlag LNCS 658, | ||
85 | * published February 1993 | ||
86 | * | ||
87 | * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe | ||
88 | * the same method, but claim no priority date earlier than July 29, 1994 | ||
89 | * (and additionally fail to cite the EUROCRYPT '92 publication as prior art). | ||
90 | */ | ||
91 | int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point, | ||
92 | const BIGNUM *x_, int y_bit, BN_CTX *ctx) | ||
93 | { | ||
94 | BN_CTX *new_ctx = NULL; | ||
95 | BIGNUM *tmp, *x, *y, *z; | ||
96 | int ret = 0, z0; | ||
97 | |||
98 | /* clear error queue */ | ||
99 | ERR_clear_error(); | ||
100 | |||
101 | if (ctx == NULL) | ||
102 | { | ||
103 | ctx = new_ctx = BN_CTX_new(); | ||
104 | if (ctx == NULL) | ||
105 | return 0; | ||
106 | } | ||
107 | |||
108 | y_bit = (y_bit != 0) ? 1 : 0; | ||
109 | |||
110 | BN_CTX_start(ctx); | ||
111 | tmp = BN_CTX_get(ctx); | ||
112 | x = BN_CTX_get(ctx); | ||
113 | y = BN_CTX_get(ctx); | ||
114 | z = BN_CTX_get(ctx); | ||
115 | if (z == NULL) goto err; | ||
116 | |||
117 | if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err; | ||
118 | if (BN_is_zero(x)) | ||
119 | { | ||
120 | if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err; | ||
121 | } | ||
122 | else | ||
123 | { | ||
124 | if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err; | ||
125 | if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err; | ||
126 | if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err; | ||
127 | if (!BN_GF2m_add(tmp, x, tmp)) goto err; | ||
128 | if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx)) | ||
129 | { | ||
130 | unsigned long err = ERR_peek_last_error(); | ||
131 | |||
132 | if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION) | ||
133 | { | ||
134 | ERR_clear_error(); | ||
135 | ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); | ||
136 | } | ||
137 | else | ||
138 | ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB); | ||
139 | goto err; | ||
140 | } | ||
141 | z0 = (BN_is_odd(z)) ? 1 : 0; | ||
142 | if (!group->meth->field_mul(group, y, x, z, ctx)) goto err; | ||
143 | if (z0 != y_bit) | ||
144 | { | ||
145 | if (!BN_GF2m_add(y, y, x)) goto err; | ||
146 | } | ||
147 | } | ||
148 | |||
149 | if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; | ||
150 | |||
151 | ret = 1; | ||
152 | |||
153 | err: | ||
154 | BN_CTX_end(ctx); | ||
155 | if (new_ctx != NULL) | ||
156 | BN_CTX_free(new_ctx); | ||
157 | return ret; | ||
158 | } | ||
159 | |||
160 | |||
161 | /* Converts an EC_POINT to an octet string. | ||
162 | * If buf is NULL, the encoded length will be returned. | ||
163 | * If the length len of buf is smaller than required an error will be returned. | ||
164 | */ | ||
165 | size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, | ||
166 | unsigned char *buf, size_t len, BN_CTX *ctx) | ||
167 | { | ||
168 | size_t ret; | ||
169 | BN_CTX *new_ctx = NULL; | ||
170 | int used_ctx = 0; | ||
171 | BIGNUM *x, *y, *yxi; | ||
172 | size_t field_len, i, skip; | ||
173 | |||
174 | if ((form != POINT_CONVERSION_COMPRESSED) | ||
175 | && (form != POINT_CONVERSION_UNCOMPRESSED) | ||
176 | && (form != POINT_CONVERSION_HYBRID)) | ||
177 | { | ||
178 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM); | ||
179 | goto err; | ||
180 | } | ||
181 | |||
182 | if (EC_POINT_is_at_infinity(group, point)) | ||
183 | { | ||
184 | /* encodes to a single 0 octet */ | ||
185 | if (buf != NULL) | ||
186 | { | ||
187 | if (len < 1) | ||
188 | { | ||
189 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); | ||
190 | return 0; | ||
191 | } | ||
192 | buf[0] = 0; | ||
193 | } | ||
194 | return 1; | ||
195 | } | ||
196 | |||
197 | |||
198 | /* ret := required output buffer length */ | ||
199 | field_len = (EC_GROUP_get_degree(group) + 7) / 8; | ||
200 | ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; | ||
201 | |||
202 | /* if 'buf' is NULL, just return required length */ | ||
203 | if (buf != NULL) | ||
204 | { | ||
205 | if (len < ret) | ||
206 | { | ||
207 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); | ||
208 | goto err; | ||
209 | } | ||
210 | |||
211 | if (ctx == NULL) | ||
212 | { | ||
213 | ctx = new_ctx = BN_CTX_new(); | ||
214 | if (ctx == NULL) | ||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | BN_CTX_start(ctx); | ||
219 | used_ctx = 1; | ||
220 | x = BN_CTX_get(ctx); | ||
221 | y = BN_CTX_get(ctx); | ||
222 | yxi = BN_CTX_get(ctx); | ||
223 | if (yxi == NULL) goto err; | ||
224 | |||
225 | if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; | ||
226 | |||
227 | buf[0] = form; | ||
228 | if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x)) | ||
229 | { | ||
230 | if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; | ||
231 | if (BN_is_odd(yxi)) buf[0]++; | ||
232 | } | ||
233 | |||
234 | i = 1; | ||
235 | |||
236 | skip = field_len - BN_num_bytes(x); | ||
237 | if (skip > field_len) | ||
238 | { | ||
239 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
240 | goto err; | ||
241 | } | ||
242 | while (skip > 0) | ||
243 | { | ||
244 | buf[i++] = 0; | ||
245 | skip--; | ||
246 | } | ||
247 | skip = BN_bn2bin(x, buf + i); | ||
248 | i += skip; | ||
249 | if (i != 1 + field_len) | ||
250 | { | ||
251 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
252 | goto err; | ||
253 | } | ||
254 | |||
255 | if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID) | ||
256 | { | ||
257 | skip = field_len - BN_num_bytes(y); | ||
258 | if (skip > field_len) | ||
259 | { | ||
260 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
261 | goto err; | ||
262 | } | ||
263 | while (skip > 0) | ||
264 | { | ||
265 | buf[i++] = 0; | ||
266 | skip--; | ||
267 | } | ||
268 | skip = BN_bn2bin(y, buf + i); | ||
269 | i += skip; | ||
270 | } | ||
271 | |||
272 | if (i != ret) | ||
273 | { | ||
274 | ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
275 | goto err; | ||
276 | } | ||
277 | } | ||
278 | |||
279 | if (used_ctx) | ||
280 | BN_CTX_end(ctx); | ||
281 | if (new_ctx != NULL) | ||
282 | BN_CTX_free(new_ctx); | ||
283 | return ret; | ||
284 | |||
285 | err: | ||
286 | if (used_ctx) | ||
287 | BN_CTX_end(ctx); | ||
288 | if (new_ctx != NULL) | ||
289 | BN_CTX_free(new_ctx); | ||
290 | return 0; | ||
291 | } | ||
292 | |||
293 | |||
294 | /* Converts an octet string representation to an EC_POINT. | ||
295 | * Note that the simple implementation only uses affine coordinates. | ||
296 | */ | ||
297 | int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point, | ||
298 | const unsigned char *buf, size_t len, BN_CTX *ctx) | ||
299 | { | ||
300 | point_conversion_form_t form; | ||
301 | int y_bit; | ||
302 | BN_CTX *new_ctx = NULL; | ||
303 | BIGNUM *x, *y, *yxi; | ||
304 | size_t field_len, enc_len; | ||
305 | int ret = 0; | ||
306 | |||
307 | if (len == 0) | ||
308 | { | ||
309 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); | ||
310 | return 0; | ||
311 | } | ||
312 | form = buf[0]; | ||
313 | y_bit = form & 1; | ||
314 | form = form & ~1U; | ||
315 | if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) | ||
316 | && (form != POINT_CONVERSION_UNCOMPRESSED) | ||
317 | && (form != POINT_CONVERSION_HYBRID)) | ||
318 | { | ||
319 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
320 | return 0; | ||
321 | } | ||
322 | if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) | ||
323 | { | ||
324 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
325 | return 0; | ||
326 | } | ||
327 | |||
328 | if (form == 0) | ||
329 | { | ||
330 | if (len != 1) | ||
331 | { | ||
332 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
333 | return 0; | ||
334 | } | ||
335 | |||
336 | return EC_POINT_set_to_infinity(group, point); | ||
337 | } | ||
338 | |||
339 | field_len = (EC_GROUP_get_degree(group) + 7) / 8; | ||
340 | enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; | ||
341 | |||
342 | if (len != enc_len) | ||
343 | { | ||
344 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
345 | return 0; | ||
346 | } | ||
347 | |||
348 | if (ctx == NULL) | ||
349 | { | ||
350 | ctx = new_ctx = BN_CTX_new(); | ||
351 | if (ctx == NULL) | ||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | BN_CTX_start(ctx); | ||
356 | x = BN_CTX_get(ctx); | ||
357 | y = BN_CTX_get(ctx); | ||
358 | yxi = BN_CTX_get(ctx); | ||
359 | if (yxi == NULL) goto err; | ||
360 | |||
361 | if (!BN_bin2bn(buf + 1, field_len, x)) goto err; | ||
362 | if (BN_ucmp(x, &group->field) >= 0) | ||
363 | { | ||
364 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
365 | goto err; | ||
366 | } | ||
367 | |||
368 | if (form == POINT_CONVERSION_COMPRESSED) | ||
369 | { | ||
370 | if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err; | ||
371 | } | ||
372 | else | ||
373 | { | ||
374 | if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; | ||
375 | if (BN_ucmp(y, &group->field) >= 0) | ||
376 | { | ||
377 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
378 | goto err; | ||
379 | } | ||
380 | if (form == POINT_CONVERSION_HYBRID) | ||
381 | { | ||
382 | if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; | ||
383 | if (y_bit != BN_is_odd(yxi)) | ||
384 | { | ||
385 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
386 | goto err; | ||
387 | } | ||
388 | } | ||
389 | |||
390 | if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; | ||
391 | } | ||
392 | |||
393 | if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ | ||
394 | { | ||
395 | ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); | ||
396 | goto err; | ||
397 | } | ||
398 | |||
399 | ret = 1; | ||
400 | |||
401 | err: | ||
402 | BN_CTX_end(ctx); | ||
403 | if (new_ctx != NULL) | ||
404 | BN_CTX_free(new_ctx); | ||
405 | return ret; | ||
406 | } | ||
407 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ec_ameth.c b/src/lib/libcrypto/ec/ec_ameth.c index c00f7d746c..83909c1853 100644 --- a/src/lib/libcrypto/ec/ec_ameth.c +++ b/src/lib/libcrypto/ec/ec_ameth.c | |||
@@ -651,6 +651,7 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth = | |||
651 | ec_copy_parameters, | 651 | ec_copy_parameters, |
652 | ec_cmp_parameters, | 652 | ec_cmp_parameters, |
653 | eckey_param_print, | 653 | eckey_param_print, |
654 | 0, | ||
654 | 655 | ||
655 | int_ec_free, | 656 | int_ec_free, |
656 | ec_pkey_ctrl, | 657 | ec_pkey_ctrl, |
diff --git a/src/lib/libcrypto/ec/ec_asn1.c b/src/lib/libcrypto/ec/ec_asn1.c index ae55539859..175eec5342 100644 --- a/src/lib/libcrypto/ec/ec_asn1.c +++ b/src/lib/libcrypto/ec/ec_asn1.c | |||
@@ -83,7 +83,7 @@ int EC_GROUP_get_basis_type(const EC_GROUP *group) | |||
83 | /* everything else is currently not supported */ | 83 | /* everything else is currently not supported */ |
84 | return 0; | 84 | return 0; |
85 | } | 85 | } |
86 | 86 | #ifndef OPENSSL_NO_EC2M | |
87 | int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) | 87 | int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) |
88 | { | 88 | { |
89 | if (group == NULL) | 89 | if (group == NULL) |
@@ -101,7 +101,6 @@ int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) | |||
101 | 101 | ||
102 | return 1; | 102 | return 1; |
103 | } | 103 | } |
104 | |||
105 | int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, | 104 | int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, |
106 | unsigned int *k2, unsigned int *k3) | 105 | unsigned int *k2, unsigned int *k3) |
107 | { | 106 | { |
@@ -124,7 +123,7 @@ int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, | |||
124 | 123 | ||
125 | return 1; | 124 | return 1; |
126 | } | 125 | } |
127 | 126 | #endif | |
128 | 127 | ||
129 | 128 | ||
130 | /* some structures needed for the asn1 encoding */ | 129 | /* some structures needed for the asn1 encoding */ |
@@ -340,6 +339,12 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field) | |||
340 | } | 339 | } |
341 | } | 340 | } |
342 | else /* nid == NID_X9_62_characteristic_two_field */ | 341 | else /* nid == NID_X9_62_characteristic_two_field */ |
342 | #ifdef OPENSSL_NO_EC2M | ||
343 | { | ||
344 | ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED); | ||
345 | goto err; | ||
346 | } | ||
347 | #else | ||
343 | { | 348 | { |
344 | int field_type; | 349 | int field_type; |
345 | X9_62_CHARACTERISTIC_TWO *char_two; | 350 | X9_62_CHARACTERISTIC_TWO *char_two; |
@@ -419,6 +424,7 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field) | |||
419 | } | 424 | } |
420 | } | 425 | } |
421 | } | 426 | } |
427 | #endif | ||
422 | 428 | ||
423 | ok = 1; | 429 | ok = 1; |
424 | 430 | ||
@@ -456,6 +462,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve) | |||
456 | goto err; | 462 | goto err; |
457 | } | 463 | } |
458 | } | 464 | } |
465 | #ifndef OPENSSL_NO_EC2M | ||
459 | else /* nid == NID_X9_62_characteristic_two_field */ | 466 | else /* nid == NID_X9_62_characteristic_two_field */ |
460 | { | 467 | { |
461 | if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL)) | 468 | if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL)) |
@@ -464,7 +471,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve) | |||
464 | goto err; | 471 | goto err; |
465 | } | 472 | } |
466 | } | 473 | } |
467 | 474 | #endif | |
468 | len_1 = (size_t)BN_num_bytes(tmp_1); | 475 | len_1 = (size_t)BN_num_bytes(tmp_1); |
469 | len_2 = (size_t)BN_num_bytes(tmp_2); | 476 | len_2 = (size_t)BN_num_bytes(tmp_2); |
470 | 477 | ||
@@ -775,8 +782,13 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params) | |||
775 | 782 | ||
776 | /* get the field parameters */ | 783 | /* get the field parameters */ |
777 | tmp = OBJ_obj2nid(params->fieldID->fieldType); | 784 | tmp = OBJ_obj2nid(params->fieldID->fieldType); |
778 | |||
779 | if (tmp == NID_X9_62_characteristic_two_field) | 785 | if (tmp == NID_X9_62_characteristic_two_field) |
786 | #ifdef OPENSSL_NO_EC2M | ||
787 | { | ||
788 | ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED); | ||
789 | goto err; | ||
790 | } | ||
791 | #else | ||
780 | { | 792 | { |
781 | X9_62_CHARACTERISTIC_TWO *char_two; | 793 | X9_62_CHARACTERISTIC_TWO *char_two; |
782 | 794 | ||
@@ -862,6 +874,7 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params) | |||
862 | /* create the EC_GROUP structure */ | 874 | /* create the EC_GROUP structure */ |
863 | ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL); | 875 | ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL); |
864 | } | 876 | } |
877 | #endif | ||
865 | else if (tmp == NID_X9_62_prime_field) | 878 | else if (tmp == NID_X9_62_prime_field) |
866 | { | 879 | { |
867 | /* we have a curve over a prime field */ | 880 | /* we have a curve over a prime field */ |
@@ -1065,6 +1078,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len) | |||
1065 | if ((group = ec_asn1_pkparameters2group(params)) == NULL) | 1078 | if ((group = ec_asn1_pkparameters2group(params)) == NULL) |
1066 | { | 1079 | { |
1067 | ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE); | 1080 | ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE); |
1081 | ECPKPARAMETERS_free(params); | ||
1068 | return NULL; | 1082 | return NULL; |
1069 | } | 1083 | } |
1070 | 1084 | ||
diff --git a/src/lib/libcrypto/ec/ec_curve.c b/src/lib/libcrypto/ec/ec_curve.c index 23274e4031..c72fb2697c 100644 --- a/src/lib/libcrypto/ec/ec_curve.c +++ b/src/lib/libcrypto/ec/ec_curve.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * Written by Nils Larsch for the OpenSSL project. | 3 | * Written by Nils Larsch for the OpenSSL project. |
4 | */ | 4 | */ |
5 | /* ==================================================================== | 5 | /* ==================================================================== |
6 | * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved. | 6 | * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved. |
7 | * | 7 | * |
8 | * Redistribution and use in source and binary forms, with or without | 8 | * Redistribution and use in source and binary forms, with or without |
9 | * modification, are permitted provided that the following conditions | 9 | * modification, are permitted provided that the following conditions |
@@ -72,6 +72,7 @@ | |||
72 | #include "ec_lcl.h" | 72 | #include "ec_lcl.h" |
73 | #include <openssl/err.h> | 73 | #include <openssl/err.h> |
74 | #include <openssl/obj_mac.h> | 74 | #include <openssl/obj_mac.h> |
75 | #include <openssl/opensslconf.h> | ||
75 | 76 | ||
76 | typedef struct { | 77 | typedef struct { |
77 | int field_type, /* either NID_X9_62_prime_field or | 78 | int field_type, /* either NID_X9_62_prime_field or |
@@ -703,6 +704,8 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+28*6]; } | |||
703 | 0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D } | 704 | 0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D } |
704 | }; | 705 | }; |
705 | 706 | ||
707 | #ifndef OPENSSL_NO_EC2M | ||
708 | |||
706 | /* characteristic two curves */ | 709 | /* characteristic two curves */ |
707 | static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; } | 710 | static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; } |
708 | _EC_SECG_CHAR2_113R1 = { | 711 | _EC_SECG_CHAR2_113R1 = { |
@@ -1300,7 +1303,7 @@ static const struct { EC_CURVE_DATA h; unsigned char data[20+21*6]; } | |||
1300 | { 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */ | 1303 | { 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */ |
1301 | 0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD, | 1304 | 0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD, |
1302 | 1305 | ||
1303 | 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */ | 1306 | 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */ |
1304 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, | 1307 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, |
1305 | 0x07, | 1308 | 0x07, |
1306 | 0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */ | 1309 | 0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */ |
@@ -1817,103 +1820,128 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+24*6]; } | |||
1817 | 0xBA,0xFC,0xA7,0x5E } | 1820 | 0xBA,0xFC,0xA7,0x5E } |
1818 | }; | 1821 | }; |
1819 | 1822 | ||
1823 | #endif | ||
1824 | |||
1820 | typedef struct _ec_list_element_st { | 1825 | typedef struct _ec_list_element_st { |
1821 | int nid; | 1826 | int nid; |
1822 | const EC_CURVE_DATA *data; | 1827 | const EC_CURVE_DATA *data; |
1828 | const EC_METHOD *(*meth)(void); | ||
1823 | const char *comment; | 1829 | const char *comment; |
1824 | } ec_list_element; | 1830 | } ec_list_element; |
1825 | 1831 | ||
1826 | static const ec_list_element curve_list[] = { | 1832 | static const ec_list_element curve_list[] = { |
1827 | /* prime field curves */ | 1833 | /* prime field curves */ |
1828 | /* secg curves */ | 1834 | /* secg curves */ |
1829 | { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"}, | 1835 | { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" }, |
1830 | { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"}, | 1836 | { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" }, |
1831 | { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"}, | 1837 | { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" }, |
1832 | { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"}, | 1838 | { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" }, |
1833 | { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"}, | 1839 | { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" }, |
1834 | { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"}, | 1840 | { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" }, |
1835 | { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"}, | 1841 | { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" }, |
1836 | /* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */ | 1842 | /* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */ |
1837 | { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"}, | 1843 | { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" }, |
1838 | { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"}, | 1844 | { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" }, |
1839 | { NID_secp224r1, &_EC_NIST_PRIME_224.h, "NIST/SECG curve over a 224 bit prime field"}, | 1845 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 |
1840 | { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"}, | 1846 | { NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" }, |
1847 | #else | ||
1848 | { NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" }, | ||
1849 | #endif | ||
1850 | { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" }, | ||
1841 | /* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */ | 1851 | /* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */ |
1842 | { NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"}, | 1852 | { NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" }, |
1843 | { NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"}, | 1853 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 |
1854 | { NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" }, | ||
1855 | #else | ||
1856 | { NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" }, | ||
1857 | #endif | ||
1844 | /* X9.62 curves */ | 1858 | /* X9.62 curves */ |
1845 | { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"}, | 1859 | { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" }, |
1846 | { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"}, | 1860 | { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" }, |
1847 | { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"}, | 1861 | { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" }, |
1848 | { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"}, | 1862 | { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" }, |
1849 | { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"}, | 1863 | { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" }, |
1850 | { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"}, | 1864 | { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" }, |
1851 | { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"}, | 1865 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 |
1866 | { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" }, | ||
1867 | #else | ||
1868 | { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" }, | ||
1869 | #endif | ||
1870 | #ifndef OPENSSL_NO_EC2M | ||
1852 | /* characteristic two field curves */ | 1871 | /* characteristic two field curves */ |
1853 | /* NIST/SECG curves */ | 1872 | /* NIST/SECG curves */ |
1854 | { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"}, | 1873 | { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" }, |
1855 | { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"}, | 1874 | { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" }, |
1856 | { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"}, | 1875 | { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" }, |
1857 | { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"}, | 1876 | { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" }, |
1858 | { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field" }, | 1877 | { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" }, |
1859 | { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"}, | 1878 | { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" }, |
1860 | { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, "NIST/SECG curve over a 163 bit binary field" }, | 1879 | { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" }, |
1861 | { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"}, | 1880 | { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" }, |
1862 | { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"}, | 1881 | { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" }, |
1863 | { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field" }, | 1882 | { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, |
1864 | { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field" }, | 1883 | { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, |
1865 | { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"}, | 1884 | { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" }, |
1866 | { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, "NIST/SECG curve over a 283 bit binary field" }, | 1885 | { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" }, |
1867 | { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, "NIST/SECG curve over a 283 bit binary field" }, | 1886 | { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" }, |
1868 | { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, "NIST/SECG curve over a 409 bit binary field" }, | 1887 | { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" }, |
1869 | { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, "NIST/SECG curve over a 409 bit binary field" }, | 1888 | { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" }, |
1870 | { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, "NIST/SECG curve over a 571 bit binary field" }, | 1889 | { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" }, |
1871 | { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, "NIST/SECG curve over a 571 bit binary field" }, | 1890 | { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary field" }, |
1872 | /* X9.62 curves */ | 1891 | /* X9.62 curves */ |
1873 | { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"}, | 1892 | { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" }, |
1874 | { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"}, | 1893 | { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" }, |
1875 | { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"}, | 1894 | { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" }, |
1876 | { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"}, | 1895 | { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" }, |
1877 | { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"}, | 1896 | { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" }, |
1878 | { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"}, | 1897 | { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" }, |
1879 | { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"}, | 1898 | { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" }, |
1880 | { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"}, | 1899 | { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" }, |
1881 | { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"}, | 1900 | { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" }, |
1882 | { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"}, | 1901 | { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" }, |
1883 | { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"}, | 1902 | { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" }, |
1884 | { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"}, | 1903 | { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" }, |
1885 | { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"}, | 1904 | { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" }, |
1886 | { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"}, | 1905 | { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" }, |
1887 | { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"}, | 1906 | { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" }, |
1888 | { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"}, | 1907 | { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" }, |
1889 | /* the WAP/WTLS curves | 1908 | /* the WAP/WTLS curves |
1890 | * [unlike SECG, spec has its own OIDs for curves from X9.62] */ | 1909 | * [unlike SECG, spec has its own OIDs for curves from X9.62] */ |
1891 | { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"}, | 1910 | { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" }, |
1892 | { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field"}, | 1911 | { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" }, |
1893 | { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"}, | 1912 | { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" }, |
1894 | { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"}, | 1913 | { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" }, |
1895 | { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"}, | 1914 | #endif |
1896 | { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"}, | 1915 | { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" }, |
1897 | { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"}, | 1916 | { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" }, |
1898 | { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" }, | 1917 | { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" }, |
1899 | { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"}, | 1918 | { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" }, |
1900 | { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"}, | 1919 | #ifndef OPENSSL_NO_EC2M |
1901 | { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"}, | 1920 | { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, |
1921 | { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, | ||
1922 | #endif | ||
1923 | { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curvs over a 224 bit prime field" }, | ||
1924 | #ifndef OPENSSL_NO_EC2M | ||
1902 | /* IPSec curves */ | 1925 | /* IPSec curves */ |
1903 | { NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"}, | 1926 | { NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n" |
1904 | { NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"}, | 1927 | "\tNot suitable for ECDSA.\n\tQuestionable extension field!" }, |
1928 | { NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n" | ||
1929 | "\tNot suitable for ECDSA.\n\tQuestionable extension field!" }, | ||
1930 | #endif | ||
1905 | }; | 1931 | }; |
1906 | 1932 | ||
1907 | #define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element)) | 1933 | #define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element)) |
1908 | 1934 | ||
1909 | static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | 1935 | static EC_GROUP *ec_group_new_from_data(const ec_list_element curve) |
1910 | { | 1936 | { |
1911 | EC_GROUP *group=NULL; | 1937 | EC_GROUP *group=NULL; |
1912 | EC_POINT *P=NULL; | 1938 | EC_POINT *P=NULL; |
1913 | BN_CTX *ctx=NULL; | 1939 | BN_CTX *ctx=NULL; |
1914 | BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL; | 1940 | BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL; |
1915 | int ok=0; | 1941 | int ok=0; |
1916 | int seed_len,param_len; | 1942 | int seed_len,param_len; |
1943 | const EC_METHOD *meth; | ||
1944 | const EC_CURVE_DATA *data; | ||
1917 | const unsigned char *params; | 1945 | const unsigned char *params; |
1918 | 1946 | ||
1919 | if ((ctx = BN_CTX_new()) == NULL) | 1947 | if ((ctx = BN_CTX_new()) == NULL) |
@@ -1922,10 +1950,11 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
1922 | goto err; | 1950 | goto err; |
1923 | } | 1951 | } |
1924 | 1952 | ||
1953 | data = curve.data; | ||
1925 | seed_len = data->seed_len; | 1954 | seed_len = data->seed_len; |
1926 | param_len = data->param_len; | 1955 | param_len = data->param_len; |
1927 | params = (const unsigned char *)(data+1); /* skip header */ | 1956 | params = (const unsigned char *)(data+1); /* skip header */ |
1928 | params += seed_len; /* skip seed */ | 1957 | params += seed_len; /* skip seed */ |
1929 | 1958 | ||
1930 | if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL)) | 1959 | if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL)) |
1931 | || !(a = BN_bin2bn(params+1*param_len, param_len, NULL)) | 1960 | || !(a = BN_bin2bn(params+1*param_len, param_len, NULL)) |
@@ -1935,7 +1964,17 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
1935 | goto err; | 1964 | goto err; |
1936 | } | 1965 | } |
1937 | 1966 | ||
1938 | if (data->field_type == NID_X9_62_prime_field) | 1967 | if (curve.meth != 0) |
1968 | { | ||
1969 | meth = curve.meth(); | ||
1970 | if (((group = EC_GROUP_new(meth)) == NULL) || | ||
1971 | (!(group->meth->group_set_curve(group, p, a, b, ctx)))) | ||
1972 | { | ||
1973 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); | ||
1974 | goto err; | ||
1975 | } | ||
1976 | } | ||
1977 | else if (data->field_type == NID_X9_62_prime_field) | ||
1939 | { | 1978 | { |
1940 | if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL) | 1979 | if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL) |
1941 | { | 1980 | { |
@@ -1943,6 +1982,7 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
1943 | goto err; | 1982 | goto err; |
1944 | } | 1983 | } |
1945 | } | 1984 | } |
1985 | #ifndef OPENSSL_NO_EC2M | ||
1946 | else /* field_type == NID_X9_62_characteristic_two_field */ | 1986 | else /* field_type == NID_X9_62_characteristic_two_field */ |
1947 | { | 1987 | { |
1948 | if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL) | 1988 | if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL) |
@@ -1951,20 +1991,21 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) | |||
1951 | goto err; | 1991 | goto err; |
1952 | } | 1992 | } |
1953 | } | 1993 | } |
1994 | #endif | ||
1954 | 1995 | ||
1955 | if ((P = EC_POINT_new(group)) == NULL) | 1996 | if ((P = EC_POINT_new(group)) == NULL) |
1956 | { | 1997 | { |
1957 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); | 1998 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); |
1958 | goto err; | 1999 | goto err; |
1959 | } | 2000 | } |
1960 | 2001 | ||
1961 | if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL)) | 2002 | if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL)) |
1962 | || !(y = BN_bin2bn(params+4*param_len, param_len, NULL))) | 2003 | || !(y = BN_bin2bn(params+4*param_len, param_len, NULL))) |
1963 | { | 2004 | { |
1964 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB); | 2005 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB); |
1965 | goto err; | 2006 | goto err; |
1966 | } | 2007 | } |
1967 | if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx)) | 2008 | if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx)) |
1968 | { | 2009 | { |
1969 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); | 2010 | ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); |
1970 | goto err; | 2011 | goto err; |
@@ -2025,7 +2066,7 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid) | |||
2025 | for (i=0; i<curve_list_length; i++) | 2066 | for (i=0; i<curve_list_length; i++) |
2026 | if (curve_list[i].nid == nid) | 2067 | if (curve_list[i].nid == nid) |
2027 | { | 2068 | { |
2028 | ret = ec_group_new_from_data(curve_list[i].data); | 2069 | ret = ec_group_new_from_data(curve_list[i]); |
2029 | break; | 2070 | break; |
2030 | } | 2071 | } |
2031 | 2072 | ||
diff --git a/src/lib/libcrypto/ec/ec_key.c b/src/lib/libcrypto/ec/ec_key.c index 522802c07a..bf9fd2dc2c 100644 --- a/src/lib/libcrypto/ec/ec_key.c +++ b/src/lib/libcrypto/ec/ec_key.c | |||
@@ -64,7 +64,9 @@ | |||
64 | #include <string.h> | 64 | #include <string.h> |
65 | #include "ec_lcl.h" | 65 | #include "ec_lcl.h" |
66 | #include <openssl/err.h> | 66 | #include <openssl/err.h> |
67 | #include <string.h> | 67 | #ifdef OPENSSL_FIPS |
68 | #include <openssl/fips.h> | ||
69 | #endif | ||
68 | 70 | ||
69 | EC_KEY *EC_KEY_new(void) | 71 | EC_KEY *EC_KEY_new(void) |
70 | { | 72 | { |
@@ -78,6 +80,7 @@ EC_KEY *EC_KEY_new(void) | |||
78 | } | 80 | } |
79 | 81 | ||
80 | ret->version = 1; | 82 | ret->version = 1; |
83 | ret->flags = 0; | ||
81 | ret->group = NULL; | 84 | ret->group = NULL; |
82 | ret->pub_key = NULL; | 85 | ret->pub_key = NULL; |
83 | ret->priv_key= NULL; | 86 | ret->priv_key= NULL; |
@@ -197,6 +200,7 @@ EC_KEY *EC_KEY_copy(EC_KEY *dest, const EC_KEY *src) | |||
197 | dest->enc_flag = src->enc_flag; | 200 | dest->enc_flag = src->enc_flag; |
198 | dest->conv_form = src->conv_form; | 201 | dest->conv_form = src->conv_form; |
199 | dest->version = src->version; | 202 | dest->version = src->version; |
203 | dest->flags = src->flags; | ||
200 | 204 | ||
201 | return dest; | 205 | return dest; |
202 | } | 206 | } |
@@ -237,6 +241,11 @@ int EC_KEY_generate_key(EC_KEY *eckey) | |||
237 | BIGNUM *priv_key = NULL, *order = NULL; | 241 | BIGNUM *priv_key = NULL, *order = NULL; |
238 | EC_POINT *pub_key = NULL; | 242 | EC_POINT *pub_key = NULL; |
239 | 243 | ||
244 | #ifdef OPENSSL_FIPS | ||
245 | if (FIPS_mode()) | ||
246 | return FIPS_ec_key_generate_key(eckey); | ||
247 | #endif | ||
248 | |||
240 | if (!eckey || !eckey->group) | 249 | if (!eckey || !eckey->group) |
241 | { | 250 | { |
242 | ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER); | 251 | ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER); |
@@ -371,6 +380,82 @@ err: | |||
371 | return(ok); | 380 | return(ok); |
372 | } | 381 | } |
373 | 382 | ||
383 | int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y) | ||
384 | { | ||
385 | BN_CTX *ctx = NULL; | ||
386 | BIGNUM *tx, *ty; | ||
387 | EC_POINT *point = NULL; | ||
388 | int ok = 0, tmp_nid, is_char_two = 0; | ||
389 | |||
390 | if (!key || !key->group || !x || !y) | ||
391 | { | ||
392 | ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, | ||
393 | ERR_R_PASSED_NULL_PARAMETER); | ||
394 | return 0; | ||
395 | } | ||
396 | ctx = BN_CTX_new(); | ||
397 | if (!ctx) | ||
398 | goto err; | ||
399 | |||
400 | point = EC_POINT_new(key->group); | ||
401 | |||
402 | if (!point) | ||
403 | goto err; | ||
404 | |||
405 | tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group)); | ||
406 | |||
407 | if (tmp_nid == NID_X9_62_characteristic_two_field) | ||
408 | is_char_two = 1; | ||
409 | |||
410 | tx = BN_CTX_get(ctx); | ||
411 | ty = BN_CTX_get(ctx); | ||
412 | #ifndef OPENSSL_NO_EC2M | ||
413 | if (is_char_two) | ||
414 | { | ||
415 | if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point, | ||
416 | x, y, ctx)) | ||
417 | goto err; | ||
418 | if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point, | ||
419 | tx, ty, ctx)) | ||
420 | goto err; | ||
421 | } | ||
422 | else | ||
423 | #endif | ||
424 | { | ||
425 | if (!EC_POINT_set_affine_coordinates_GFp(key->group, point, | ||
426 | x, y, ctx)) | ||
427 | goto err; | ||
428 | if (!EC_POINT_get_affine_coordinates_GFp(key->group, point, | ||
429 | tx, ty, ctx)) | ||
430 | goto err; | ||
431 | } | ||
432 | /* Check if retrieved coordinates match originals: if not values | ||
433 | * are out of range. | ||
434 | */ | ||
435 | if (BN_cmp(x, tx) || BN_cmp(y, ty)) | ||
436 | { | ||
437 | ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, | ||
438 | EC_R_COORDINATES_OUT_OF_RANGE); | ||
439 | goto err; | ||
440 | } | ||
441 | |||
442 | if (!EC_KEY_set_public_key(key, point)) | ||
443 | goto err; | ||
444 | |||
445 | if (EC_KEY_check_key(key) == 0) | ||
446 | goto err; | ||
447 | |||
448 | ok = 1; | ||
449 | |||
450 | err: | ||
451 | if (ctx) | ||
452 | BN_CTX_free(ctx); | ||
453 | if (point) | ||
454 | EC_POINT_free(point); | ||
455 | return ok; | ||
456 | |||
457 | } | ||
458 | |||
374 | const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) | 459 | const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) |
375 | { | 460 | { |
376 | return key->group; | 461 | return key->group; |
@@ -461,3 +546,18 @@ int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx) | |||
461 | return 0; | 546 | return 0; |
462 | return EC_GROUP_precompute_mult(key->group, ctx); | 547 | return EC_GROUP_precompute_mult(key->group, ctx); |
463 | } | 548 | } |
549 | |||
550 | int EC_KEY_get_flags(const EC_KEY *key) | ||
551 | { | ||
552 | return key->flags; | ||
553 | } | ||
554 | |||
555 | void EC_KEY_set_flags(EC_KEY *key, int flags) | ||
556 | { | ||
557 | key->flags |= flags; | ||
558 | } | ||
559 | |||
560 | void EC_KEY_clear_flags(EC_KEY *key, int flags) | ||
561 | { | ||
562 | key->flags &= ~flags; | ||
563 | } | ||
diff --git a/src/lib/libcrypto/ec/ec_oct.c b/src/lib/libcrypto/ec/ec_oct.c new file mode 100644 index 0000000000..fd9db0798d --- /dev/null +++ b/src/lib/libcrypto/ec/ec_oct.c | |||
@@ -0,0 +1,199 @@ | |||
1 | /* crypto/ec/ec_lib.c */ | ||
2 | /* | ||
3 | * Originally written by Bodo Moeller for the OpenSSL project. | ||
4 | */ | ||
5 | /* ==================================================================== | ||
6 | * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved. | ||
7 | * | ||
8 | * Redistribution and use in source and binary forms, with or without | ||
9 | * modification, are permitted provided that the following conditions | ||
10 | * are met: | ||
11 | * | ||
12 | * 1. Redistributions of source code must retain the above copyright | ||
13 | * notice, this list of conditions and the following disclaimer. | ||
14 | * | ||
15 | * 2. Redistributions in binary form must reproduce the above copyright | ||
16 | * notice, this list of conditions and the following disclaimer in | ||
17 | * the documentation and/or other materials provided with the | ||
18 | * distribution. | ||
19 | * | ||
20 | * 3. All advertising materials mentioning features or use of this | ||
21 | * software must display the following acknowledgment: | ||
22 | * "This product includes software developed by the OpenSSL Project | ||
23 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
24 | * | ||
25 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
26 | * endorse or promote products derived from this software without | ||
27 | * prior written permission. For written permission, please contact | ||
28 | * openssl-core@openssl.org. | ||
29 | * | ||
30 | * 5. Products derived from this software may not be called "OpenSSL" | ||
31 | * nor may "OpenSSL" appear in their names without prior written | ||
32 | * permission of the OpenSSL Project. | ||
33 | * | ||
34 | * 6. Redistributions of any form whatsoever must retain the following | ||
35 | * acknowledgment: | ||
36 | * "This product includes software developed by the OpenSSL Project | ||
37 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
38 | * | ||
39 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
40 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
42 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
43 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
44 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
45 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
46 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
47 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
48 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
49 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
50 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
51 | * ==================================================================== | ||
52 | * | ||
53 | * This product includes cryptographic software written by Eric Young | ||
54 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
55 | * Hudson (tjh@cryptsoft.com). | ||
56 | * | ||
57 | */ | ||
58 | /* ==================================================================== | ||
59 | * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. | ||
60 | * Binary polynomial ECC support in OpenSSL originally developed by | ||
61 | * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project. | ||
62 | */ | ||
63 | |||
64 | #include <string.h> | ||
65 | |||
66 | #include <openssl/err.h> | ||
67 | #include <openssl/opensslv.h> | ||
68 | |||
69 | #include "ec_lcl.h" | ||
70 | |||
71 | int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point, | ||
72 | const BIGNUM *x, int y_bit, BN_CTX *ctx) | ||
73 | { | ||
74 | if (group->meth->point_set_compressed_coordinates == 0 | ||
75 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
76 | { | ||
77 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
78 | return 0; | ||
79 | } | ||
80 | if (group->meth != point->meth) | ||
81 | { | ||
82 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS); | ||
83 | return 0; | ||
84 | } | ||
85 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
86 | { | ||
87 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
88 | return ec_GFp_simple_set_compressed_coordinates( | ||
89 | group, point, x, y_bit, ctx); | ||
90 | else | ||
91 | #ifdef OPENSSL_NO_EC2M | ||
92 | { | ||
93 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED); | ||
94 | return 0; | ||
95 | } | ||
96 | #else | ||
97 | return ec_GF2m_simple_set_compressed_coordinates( | ||
98 | group, point, x, y_bit, ctx); | ||
99 | #endif | ||
100 | } | ||
101 | return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); | ||
102 | } | ||
103 | |||
104 | #ifndef OPENSSL_NO_EC2M | ||
105 | int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point, | ||
106 | const BIGNUM *x, int y_bit, BN_CTX *ctx) | ||
107 | { | ||
108 | if (group->meth->point_set_compressed_coordinates == 0 | ||
109 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
110 | { | ||
111 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
112 | return 0; | ||
113 | } | ||
114 | if (group->meth != point->meth) | ||
115 | { | ||
116 | ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS); | ||
117 | return 0; | ||
118 | } | ||
119 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
120 | { | ||
121 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
122 | return ec_GFp_simple_set_compressed_coordinates( | ||
123 | group, point, x, y_bit, ctx); | ||
124 | else | ||
125 | return ec_GF2m_simple_set_compressed_coordinates( | ||
126 | group, point, x, y_bit, ctx); | ||
127 | } | ||
128 | return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); | ||
129 | } | ||
130 | #endif | ||
131 | |||
132 | size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, | ||
133 | unsigned char *buf, size_t len, BN_CTX *ctx) | ||
134 | { | ||
135 | if (group->meth->point2oct == 0 | ||
136 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
137 | { | ||
138 | ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
139 | return 0; | ||
140 | } | ||
141 | if (group->meth != point->meth) | ||
142 | { | ||
143 | ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS); | ||
144 | return 0; | ||
145 | } | ||
146 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
147 | { | ||
148 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
149 | return ec_GFp_simple_point2oct(group, point, | ||
150 | form, buf, len, ctx); | ||
151 | else | ||
152 | #ifdef OPENSSL_NO_EC2M | ||
153 | { | ||
154 | ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED); | ||
155 | return 0; | ||
156 | } | ||
157 | #else | ||
158 | return ec_GF2m_simple_point2oct(group, point, | ||
159 | form, buf, len, ctx); | ||
160 | #endif | ||
161 | } | ||
162 | |||
163 | return group->meth->point2oct(group, point, form, buf, len, ctx); | ||
164 | } | ||
165 | |||
166 | |||
167 | int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point, | ||
168 | const unsigned char *buf, size_t len, BN_CTX *ctx) | ||
169 | { | ||
170 | if (group->meth->oct2point == 0 | ||
171 | && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) | ||
172 | { | ||
173 | ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); | ||
174 | return 0; | ||
175 | } | ||
176 | if (group->meth != point->meth) | ||
177 | { | ||
178 | ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS); | ||
179 | return 0; | ||
180 | } | ||
181 | if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) | ||
182 | { | ||
183 | if (group->meth->field_type == NID_X9_62_prime_field) | ||
184 | return ec_GFp_simple_oct2point(group, point, | ||
185 | buf, len, ctx); | ||
186 | else | ||
187 | #ifdef OPENSSL_NO_EC2M | ||
188 | { | ||
189 | ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED); | ||
190 | return 0; | ||
191 | } | ||
192 | #else | ||
193 | return ec_GF2m_simple_oct2point(group, point, | ||
194 | buf, len, ctx); | ||
195 | #endif | ||
196 | } | ||
197 | return group->meth->oct2point(group, point, buf, len, ctx); | ||
198 | } | ||
199 | |||
diff --git a/src/lib/libcrypto/ec/ec_pmeth.c b/src/lib/libcrypto/ec/ec_pmeth.c index f433076ca1..d1ed66c37e 100644 --- a/src/lib/libcrypto/ec/ec_pmeth.c +++ b/src/lib/libcrypto/ec/ec_pmeth.c | |||
@@ -221,6 +221,7 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
221 | 221 | ||
222 | case EVP_PKEY_CTRL_MD: | 222 | case EVP_PKEY_CTRL_MD: |
223 | if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 && | 223 | if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 && |
224 | EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 && | ||
224 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && | 225 | EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && |
225 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && | 226 | EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && |
226 | EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && | 227 | EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && |
diff --git a/src/lib/libcrypto/ec/eck_prn.c b/src/lib/libcrypto/ec/eck_prn.c index 7d3e175ae7..06de8f3959 100644 --- a/src/lib/libcrypto/ec/eck_prn.c +++ b/src/lib/libcrypto/ec/eck_prn.c | |||
@@ -207,7 +207,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off) | |||
207 | reason = ERR_R_MALLOC_FAILURE; | 207 | reason = ERR_R_MALLOC_FAILURE; |
208 | goto err; | 208 | goto err; |
209 | } | 209 | } |
210 | 210 | #ifndef OPENSSL_NO_EC2M | |
211 | if (is_char_two) | 211 | if (is_char_two) |
212 | { | 212 | { |
213 | if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx)) | 213 | if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx)) |
@@ -217,6 +217,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off) | |||
217 | } | 217 | } |
218 | } | 218 | } |
219 | else /* prime field */ | 219 | else /* prime field */ |
220 | #endif | ||
220 | { | 221 | { |
221 | if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx)) | 222 | if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx)) |
222 | { | 223 | { |
diff --git a/src/lib/libcrypto/ec/ecp_nistp224.c b/src/lib/libcrypto/ec/ecp_nistp224.c new file mode 100644 index 0000000000..b5ff56c252 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp224.c | |||
@@ -0,0 +1,1658 @@ | |||
1 | /* crypto/ec/ecp_nistp224.c */ | ||
2 | /* | ||
3 | * Written by Emilia Kasper (Google) for the OpenSSL project. | ||
4 | */ | ||
5 | /* Copyright 2011 Google Inc. | ||
6 | * | ||
7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
8 | * | ||
9 | * you may not use this file except in compliance with the License. | ||
10 | * You may obtain a copy of the License at | ||
11 | * | ||
12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
13 | * | ||
14 | * Unless required by applicable law or agreed to in writing, software | ||
15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
17 | * See the License for the specific language governing permissions and | ||
18 | * limitations under the License. | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication | ||
23 | * | ||
24 | * Inspired by Daniel J. Bernstein's public domain nistp224 implementation | ||
25 | * and Adam Langley's public domain 64-bit C implementation of curve25519 | ||
26 | */ | ||
27 | |||
28 | #include <openssl/opensslconf.h> | ||
29 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
30 | |||
31 | #ifndef OPENSSL_SYS_VMS | ||
32 | #include <stdint.h> | ||
33 | #else | ||
34 | #include <inttypes.h> | ||
35 | #endif | ||
36 | |||
37 | #include <string.h> | ||
38 | #include <openssl/err.h> | ||
39 | #include "ec_lcl.h" | ||
40 | |||
41 | #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) | ||
42 | /* even with gcc, the typedef won't work for 32-bit platforms */ | ||
43 | typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ | ||
44 | #else | ||
45 | #error "Need GCC 3.1 or later to define type uint128_t" | ||
46 | #endif | ||
47 | |||
48 | typedef uint8_t u8; | ||
49 | typedef uint64_t u64; | ||
50 | typedef int64_t s64; | ||
51 | |||
52 | |||
53 | /******************************************************************************/ | ||
54 | /* INTERNAL REPRESENTATION OF FIELD ELEMENTS | ||
55 | * | ||
56 | * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3 | ||
57 | * using 64-bit coefficients called 'limbs', | ||
58 | * and sometimes (for multiplication results) as | ||
59 | * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6 | ||
60 | * using 128-bit coefficients called 'widelimbs'. | ||
61 | * A 4-limb representation is an 'felem'; | ||
62 | * a 7-widelimb representation is a 'widefelem'. | ||
63 | * Even within felems, bits of adjacent limbs overlap, and we don't always | ||
64 | * reduce the representations: we ensure that inputs to each felem | ||
65 | * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60, | ||
66 | * and fit into a 128-bit word without overflow. The coefficients are then | ||
67 | * again partially reduced to obtain an felem satisfying a_i < 2^57. | ||
68 | * We only reduce to the unique minimal representation at the end of the | ||
69 | * computation. | ||
70 | */ | ||
71 | |||
72 | typedef uint64_t limb; | ||
73 | typedef uint128_t widelimb; | ||
74 | |||
75 | typedef limb felem[4]; | ||
76 | typedef widelimb widefelem[7]; | ||
77 | |||
78 | /* Field element represented as a byte arrary. | ||
79 | * 28*8 = 224 bits is also the group order size for the elliptic curve, | ||
80 | * and we also use this type for scalars for point multiplication. | ||
81 | */ | ||
82 | typedef u8 felem_bytearray[28]; | ||
83 | |||
84 | static const felem_bytearray nistp224_curve_params[5] = { | ||
85 | {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, /* p */ | ||
86 | 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00, | ||
87 | 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01}, | ||
88 | {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, /* a */ | ||
89 | 0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFF,0xFF, | ||
90 | 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE}, | ||
91 | {0xB4,0x05,0x0A,0x85,0x0C,0x04,0xB3,0xAB,0xF5,0x41, /* b */ | ||
92 | 0x32,0x56,0x50,0x44,0xB0,0xB7,0xD7,0xBF,0xD8,0xBA, | ||
93 | 0x27,0x0B,0x39,0x43,0x23,0x55,0xFF,0xB4}, | ||
94 | {0xB7,0x0E,0x0C,0xBD,0x6B,0xB4,0xBF,0x7F,0x32,0x13, /* x */ | ||
95 | 0x90,0xB9,0x4A,0x03,0xC1,0xD3,0x56,0xC2,0x11,0x22, | ||
96 | 0x34,0x32,0x80,0xD6,0x11,0x5C,0x1D,0x21}, | ||
97 | {0xbd,0x37,0x63,0x88,0xb5,0xf7,0x23,0xfb,0x4c,0x22, /* y */ | ||
98 | 0xdf,0xe6,0xcd,0x43,0x75,0xa0,0x5a,0x07,0x47,0x64, | ||
99 | 0x44,0xd5,0x81,0x99,0x85,0x00,0x7e,0x34} | ||
100 | }; | ||
101 | |||
102 | /* Precomputed multiples of the standard generator | ||
103 | * Points are given in coordinates (X, Y, Z) where Z normally is 1 | ||
104 | * (0 for the point at infinity). | ||
105 | * For each field element, slice a_0 is word 0, etc. | ||
106 | * | ||
107 | * The table has 2 * 16 elements, starting with the following: | ||
108 | * index | bits | point | ||
109 | * ------+---------+------------------------------ | ||
110 | * 0 | 0 0 0 0 | 0G | ||
111 | * 1 | 0 0 0 1 | 1G | ||
112 | * 2 | 0 0 1 0 | 2^56G | ||
113 | * 3 | 0 0 1 1 | (2^56 + 1)G | ||
114 | * 4 | 0 1 0 0 | 2^112G | ||
115 | * 5 | 0 1 0 1 | (2^112 + 1)G | ||
116 | * 6 | 0 1 1 0 | (2^112 + 2^56)G | ||
117 | * 7 | 0 1 1 1 | (2^112 + 2^56 + 1)G | ||
118 | * 8 | 1 0 0 0 | 2^168G | ||
119 | * 9 | 1 0 0 1 | (2^168 + 1)G | ||
120 | * 10 | 1 0 1 0 | (2^168 + 2^56)G | ||
121 | * 11 | 1 0 1 1 | (2^168 + 2^56 + 1)G | ||
122 | * 12 | 1 1 0 0 | (2^168 + 2^112)G | ||
123 | * 13 | 1 1 0 1 | (2^168 + 2^112 + 1)G | ||
124 | * 14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G | ||
125 | * 15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G | ||
126 | * followed by a copy of this with each element multiplied by 2^28. | ||
127 | * | ||
128 | * The reason for this is so that we can clock bits into four different | ||
129 | * locations when doing simple scalar multiplies against the base point, | ||
130 | * and then another four locations using the second 16 elements. | ||
131 | */ | ||
132 | static const felem gmul[2][16][3] = | ||
133 | {{{{0, 0, 0, 0}, | ||
134 | {0, 0, 0, 0}, | ||
135 | {0, 0, 0, 0}}, | ||
136 | {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf}, | ||
137 | {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723}, | ||
138 | {1, 0, 0, 0}}, | ||
139 | {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5}, | ||
140 | {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321}, | ||
141 | {1, 0, 0, 0}}, | ||
142 | {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748}, | ||
143 | {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17}, | ||
144 | {1, 0, 0, 0}}, | ||
145 | {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe}, | ||
146 | {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b}, | ||
147 | {1, 0, 0, 0}}, | ||
148 | {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3}, | ||
149 | {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a}, | ||
150 | {1, 0, 0, 0}}, | ||
151 | {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c}, | ||
152 | {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244}, | ||
153 | {1, 0, 0, 0}}, | ||
154 | {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849}, | ||
155 | {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112}, | ||
156 | {1, 0, 0, 0}}, | ||
157 | {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47}, | ||
158 | {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394}, | ||
159 | {1, 0, 0, 0}}, | ||
160 | {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d}, | ||
161 | {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7}, | ||
162 | {1, 0, 0, 0}}, | ||
163 | {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24}, | ||
164 | {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881}, | ||
165 | {1, 0, 0, 0}}, | ||
166 | {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984}, | ||
167 | {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369}, | ||
168 | {1, 0, 0, 0}}, | ||
169 | {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3}, | ||
170 | {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60}, | ||
171 | {1, 0, 0, 0}}, | ||
172 | {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057}, | ||
173 | {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9}, | ||
174 | {1, 0, 0, 0}}, | ||
175 | {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9}, | ||
176 | {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc}, | ||
177 | {1, 0, 0, 0}}, | ||
178 | {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58}, | ||
179 | {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558}, | ||
180 | {1, 0, 0, 0}}}, | ||
181 | {{{0, 0, 0, 0}, | ||
182 | {0, 0, 0, 0}, | ||
183 | {0, 0, 0, 0}}, | ||
184 | {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31}, | ||
185 | {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d}, | ||
186 | {1, 0, 0, 0}}, | ||
187 | {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3}, | ||
188 | {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a}, | ||
189 | {1, 0, 0, 0}}, | ||
190 | {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33}, | ||
191 | {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100}, | ||
192 | {1, 0, 0, 0}}, | ||
193 | {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5}, | ||
194 | {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea}, | ||
195 | {1, 0, 0, 0}}, | ||
196 | {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be}, | ||
197 | {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51}, | ||
198 | {1, 0, 0, 0}}, | ||
199 | {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1}, | ||
200 | {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb}, | ||
201 | {1, 0, 0, 0}}, | ||
202 | {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233}, | ||
203 | {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def}, | ||
204 | {1, 0, 0, 0}}, | ||
205 | {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae}, | ||
206 | {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45}, | ||
207 | {1, 0, 0, 0}}, | ||
208 | {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e}, | ||
209 | {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb}, | ||
210 | {1, 0, 0, 0}}, | ||
211 | {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de}, | ||
212 | {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3}, | ||
213 | {1, 0, 0, 0}}, | ||
214 | {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05}, | ||
215 | {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58}, | ||
216 | {1, 0, 0, 0}}, | ||
217 | {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb}, | ||
218 | {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0}, | ||
219 | {1, 0, 0, 0}}, | ||
220 | {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9}, | ||
221 | {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea}, | ||
222 | {1, 0, 0, 0}}, | ||
223 | {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba}, | ||
224 | {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405}, | ||
225 | {1, 0, 0, 0}}, | ||
226 | {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e}, | ||
227 | {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e}, | ||
228 | {1, 0, 0, 0}}}}; | ||
229 | |||
230 | /* Precomputation for the group generator. */ | ||
231 | typedef struct { | ||
232 | felem g_pre_comp[2][16][3]; | ||
233 | int references; | ||
234 | } NISTP224_PRE_COMP; | ||
235 | |||
236 | const EC_METHOD *EC_GFp_nistp224_method(void) | ||
237 | { | ||
238 | static const EC_METHOD ret = { | ||
239 | EC_FLAGS_DEFAULT_OCT, | ||
240 | NID_X9_62_prime_field, | ||
241 | ec_GFp_nistp224_group_init, | ||
242 | ec_GFp_simple_group_finish, | ||
243 | ec_GFp_simple_group_clear_finish, | ||
244 | ec_GFp_nist_group_copy, | ||
245 | ec_GFp_nistp224_group_set_curve, | ||
246 | ec_GFp_simple_group_get_curve, | ||
247 | ec_GFp_simple_group_get_degree, | ||
248 | ec_GFp_simple_group_check_discriminant, | ||
249 | ec_GFp_simple_point_init, | ||
250 | ec_GFp_simple_point_finish, | ||
251 | ec_GFp_simple_point_clear_finish, | ||
252 | ec_GFp_simple_point_copy, | ||
253 | ec_GFp_simple_point_set_to_infinity, | ||
254 | ec_GFp_simple_set_Jprojective_coordinates_GFp, | ||
255 | ec_GFp_simple_get_Jprojective_coordinates_GFp, | ||
256 | ec_GFp_simple_point_set_affine_coordinates, | ||
257 | ec_GFp_nistp224_point_get_affine_coordinates, | ||
258 | 0 /* point_set_compressed_coordinates */, | ||
259 | 0 /* point2oct */, | ||
260 | 0 /* oct2point */, | ||
261 | ec_GFp_simple_add, | ||
262 | ec_GFp_simple_dbl, | ||
263 | ec_GFp_simple_invert, | ||
264 | ec_GFp_simple_is_at_infinity, | ||
265 | ec_GFp_simple_is_on_curve, | ||
266 | ec_GFp_simple_cmp, | ||
267 | ec_GFp_simple_make_affine, | ||
268 | ec_GFp_simple_points_make_affine, | ||
269 | ec_GFp_nistp224_points_mul, | ||
270 | ec_GFp_nistp224_precompute_mult, | ||
271 | ec_GFp_nistp224_have_precompute_mult, | ||
272 | ec_GFp_nist_field_mul, | ||
273 | ec_GFp_nist_field_sqr, | ||
274 | 0 /* field_div */, | ||
275 | 0 /* field_encode */, | ||
276 | 0 /* field_decode */, | ||
277 | 0 /* field_set_to_one */ }; | ||
278 | |||
279 | return &ret; | ||
280 | } | ||
281 | |||
282 | /* Helper functions to convert field elements to/from internal representation */ | ||
283 | static void bin28_to_felem(felem out, const u8 in[28]) | ||
284 | { | ||
285 | out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff; | ||
286 | out[1] = (*((const uint64_t *)(in+7))) & 0x00ffffffffffffff; | ||
287 | out[2] = (*((const uint64_t *)(in+14))) & 0x00ffffffffffffff; | ||
288 | out[3] = (*((const uint64_t *)(in+21))) & 0x00ffffffffffffff; | ||
289 | } | ||
290 | |||
291 | static void felem_to_bin28(u8 out[28], const felem in) | ||
292 | { | ||
293 | unsigned i; | ||
294 | for (i = 0; i < 7; ++i) | ||
295 | { | ||
296 | out[i] = in[0]>>(8*i); | ||
297 | out[i+7] = in[1]>>(8*i); | ||
298 | out[i+14] = in[2]>>(8*i); | ||
299 | out[i+21] = in[3]>>(8*i); | ||
300 | } | ||
301 | } | ||
302 | |||
303 | /* To preserve endianness when using BN_bn2bin and BN_bin2bn */ | ||
304 | static void flip_endian(u8 *out, const u8 *in, unsigned len) | ||
305 | { | ||
306 | unsigned i; | ||
307 | for (i = 0; i < len; ++i) | ||
308 | out[i] = in[len-1-i]; | ||
309 | } | ||
310 | |||
311 | /* From OpenSSL BIGNUM to internal representation */ | ||
312 | static int BN_to_felem(felem out, const BIGNUM *bn) | ||
313 | { | ||
314 | felem_bytearray b_in; | ||
315 | felem_bytearray b_out; | ||
316 | unsigned num_bytes; | ||
317 | |||
318 | /* BN_bn2bin eats leading zeroes */ | ||
319 | memset(b_out, 0, sizeof b_out); | ||
320 | num_bytes = BN_num_bytes(bn); | ||
321 | if (num_bytes > sizeof b_out) | ||
322 | { | ||
323 | ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); | ||
324 | return 0; | ||
325 | } | ||
326 | if (BN_is_negative(bn)) | ||
327 | { | ||
328 | ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); | ||
329 | return 0; | ||
330 | } | ||
331 | num_bytes = BN_bn2bin(bn, b_in); | ||
332 | flip_endian(b_out, b_in, num_bytes); | ||
333 | bin28_to_felem(out, b_out); | ||
334 | return 1; | ||
335 | } | ||
336 | |||
337 | /* From internal representation to OpenSSL BIGNUM */ | ||
338 | static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) | ||
339 | { | ||
340 | felem_bytearray b_in, b_out; | ||
341 | felem_to_bin28(b_in, in); | ||
342 | flip_endian(b_out, b_in, sizeof b_out); | ||
343 | return BN_bin2bn(b_out, sizeof b_out, out); | ||
344 | } | ||
345 | |||
346 | /******************************************************************************/ | ||
347 | /* FIELD OPERATIONS | ||
348 | * | ||
349 | * Field operations, using the internal representation of field elements. | ||
350 | * NB! These operations are specific to our point multiplication and cannot be | ||
351 | * expected to be correct in general - e.g., multiplication with a large scalar | ||
352 | * will cause an overflow. | ||
353 | * | ||
354 | */ | ||
355 | |||
356 | static void felem_one(felem out) | ||
357 | { | ||
358 | out[0] = 1; | ||
359 | out[1] = 0; | ||
360 | out[2] = 0; | ||
361 | out[3] = 0; | ||
362 | } | ||
363 | |||
364 | static void felem_assign(felem out, const felem in) | ||
365 | { | ||
366 | out[0] = in[0]; | ||
367 | out[1] = in[1]; | ||
368 | out[2] = in[2]; | ||
369 | out[3] = in[3]; | ||
370 | } | ||
371 | |||
372 | /* Sum two field elements: out += in */ | ||
373 | static void felem_sum(felem out, const felem in) | ||
374 | { | ||
375 | out[0] += in[0]; | ||
376 | out[1] += in[1]; | ||
377 | out[2] += in[2]; | ||
378 | out[3] += in[3]; | ||
379 | } | ||
380 | |||
381 | /* Get negative value: out = -in */ | ||
382 | /* Assumes in[i] < 2^57 */ | ||
383 | static void felem_neg(felem out, const felem in) | ||
384 | { | ||
385 | static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2); | ||
386 | static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2); | ||
387 | static const limb two58m42m2 = (((limb) 1) << 58) - | ||
388 | (((limb) 1) << 42) - (((limb) 1) << 2); | ||
389 | |||
390 | /* Set to 0 mod 2^224-2^96+1 to ensure out > in */ | ||
391 | out[0] = two58p2 - in[0]; | ||
392 | out[1] = two58m42m2 - in[1]; | ||
393 | out[2] = two58m2 - in[2]; | ||
394 | out[3] = two58m2 - in[3]; | ||
395 | } | ||
396 | |||
397 | /* Subtract field elements: out -= in */ | ||
398 | /* Assumes in[i] < 2^57 */ | ||
399 | static void felem_diff(felem out, const felem in) | ||
400 | { | ||
401 | static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2); | ||
402 | static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2); | ||
403 | static const limb two58m42m2 = (((limb) 1) << 58) - | ||
404 | (((limb) 1) << 42) - (((limb) 1) << 2); | ||
405 | |||
406 | /* Add 0 mod 2^224-2^96+1 to ensure out > in */ | ||
407 | out[0] += two58p2; | ||
408 | out[1] += two58m42m2; | ||
409 | out[2] += two58m2; | ||
410 | out[3] += two58m2; | ||
411 | |||
412 | out[0] -= in[0]; | ||
413 | out[1] -= in[1]; | ||
414 | out[2] -= in[2]; | ||
415 | out[3] -= in[3]; | ||
416 | } | ||
417 | |||
418 | /* Subtract in unreduced 128-bit mode: out -= in */ | ||
419 | /* Assumes in[i] < 2^119 */ | ||
420 | static void widefelem_diff(widefelem out, const widefelem in) | ||
421 | { | ||
422 | static const widelimb two120 = ((widelimb) 1) << 120; | ||
423 | static const widelimb two120m64 = (((widelimb) 1) << 120) - | ||
424 | (((widelimb) 1) << 64); | ||
425 | static const widelimb two120m104m64 = (((widelimb) 1) << 120) - | ||
426 | (((widelimb) 1) << 104) - (((widelimb) 1) << 64); | ||
427 | |||
428 | /* Add 0 mod 2^224-2^96+1 to ensure out > in */ | ||
429 | out[0] += two120; | ||
430 | out[1] += two120m64; | ||
431 | out[2] += two120m64; | ||
432 | out[3] += two120; | ||
433 | out[4] += two120m104m64; | ||
434 | out[5] += two120m64; | ||
435 | out[6] += two120m64; | ||
436 | |||
437 | out[0] -= in[0]; | ||
438 | out[1] -= in[1]; | ||
439 | out[2] -= in[2]; | ||
440 | out[3] -= in[3]; | ||
441 | out[4] -= in[4]; | ||
442 | out[5] -= in[5]; | ||
443 | out[6] -= in[6]; | ||
444 | } | ||
445 | |||
446 | /* Subtract in mixed mode: out128 -= in64 */ | ||
447 | /* in[i] < 2^63 */ | ||
448 | static void felem_diff_128_64(widefelem out, const felem in) | ||
449 | { | ||
450 | static const widelimb two64p8 = (((widelimb) 1) << 64) + | ||
451 | (((widelimb) 1) << 8); | ||
452 | static const widelimb two64m8 = (((widelimb) 1) << 64) - | ||
453 | (((widelimb) 1) << 8); | ||
454 | static const widelimb two64m48m8 = (((widelimb) 1) << 64) - | ||
455 | (((widelimb) 1) << 48) - (((widelimb) 1) << 8); | ||
456 | |||
457 | /* Add 0 mod 2^224-2^96+1 to ensure out > in */ | ||
458 | out[0] += two64p8; | ||
459 | out[1] += two64m48m8; | ||
460 | out[2] += two64m8; | ||
461 | out[3] += two64m8; | ||
462 | |||
463 | out[0] -= in[0]; | ||
464 | out[1] -= in[1]; | ||
465 | out[2] -= in[2]; | ||
466 | out[3] -= in[3]; | ||
467 | } | ||
468 | |||
469 | /* Multiply a field element by a scalar: out = out * scalar | ||
470 | * The scalars we actually use are small, so results fit without overflow */ | ||
471 | static void felem_scalar(felem out, const limb scalar) | ||
472 | { | ||
473 | out[0] *= scalar; | ||
474 | out[1] *= scalar; | ||
475 | out[2] *= scalar; | ||
476 | out[3] *= scalar; | ||
477 | } | ||
478 | |||
479 | /* Multiply an unreduced field element by a scalar: out = out * scalar | ||
480 | * The scalars we actually use are small, so results fit without overflow */ | ||
481 | static void widefelem_scalar(widefelem out, const widelimb scalar) | ||
482 | { | ||
483 | out[0] *= scalar; | ||
484 | out[1] *= scalar; | ||
485 | out[2] *= scalar; | ||
486 | out[3] *= scalar; | ||
487 | out[4] *= scalar; | ||
488 | out[5] *= scalar; | ||
489 | out[6] *= scalar; | ||
490 | } | ||
491 | |||
492 | /* Square a field element: out = in^2 */ | ||
493 | static void felem_square(widefelem out, const felem in) | ||
494 | { | ||
495 | limb tmp0, tmp1, tmp2; | ||
496 | tmp0 = 2 * in[0]; tmp1 = 2 * in[1]; tmp2 = 2 * in[2]; | ||
497 | out[0] = ((widelimb) in[0]) * in[0]; | ||
498 | out[1] = ((widelimb) in[0]) * tmp1; | ||
499 | out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1]; | ||
500 | out[3] = ((widelimb) in[3]) * tmp0 + | ||
501 | ((widelimb) in[1]) * tmp2; | ||
502 | out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2]; | ||
503 | out[5] = ((widelimb) in[3]) * tmp2; | ||
504 | out[6] = ((widelimb) in[3]) * in[3]; | ||
505 | } | ||
506 | |||
507 | /* Multiply two field elements: out = in1 * in2 */ | ||
508 | static void felem_mul(widefelem out, const felem in1, const felem in2) | ||
509 | { | ||
510 | out[0] = ((widelimb) in1[0]) * in2[0]; | ||
511 | out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0]; | ||
512 | out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] + | ||
513 | ((widelimb) in1[2]) * in2[0]; | ||
514 | out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] + | ||
515 | ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0]; | ||
516 | out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] + | ||
517 | ((widelimb) in1[3]) * in2[1]; | ||
518 | out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2]; | ||
519 | out[6] = ((widelimb) in1[3]) * in2[3]; | ||
520 | } | ||
521 | |||
522 | /* Reduce seven 128-bit coefficients to four 64-bit coefficients. | ||
523 | * Requires in[i] < 2^126, | ||
524 | * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */ | ||
525 | static void felem_reduce(felem out, const widefelem in) | ||
526 | { | ||
527 | static const widelimb two127p15 = (((widelimb) 1) << 127) + | ||
528 | (((widelimb) 1) << 15); | ||
529 | static const widelimb two127m71 = (((widelimb) 1) << 127) - | ||
530 | (((widelimb) 1) << 71); | ||
531 | static const widelimb two127m71m55 = (((widelimb) 1) << 127) - | ||
532 | (((widelimb) 1) << 71) - (((widelimb) 1) << 55); | ||
533 | widelimb output[5]; | ||
534 | |||
535 | /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */ | ||
536 | output[0] = in[0] + two127p15; | ||
537 | output[1] = in[1] + two127m71m55; | ||
538 | output[2] = in[2] + two127m71; | ||
539 | output[3] = in[3]; | ||
540 | output[4] = in[4]; | ||
541 | |||
542 | /* Eliminate in[4], in[5], in[6] */ | ||
543 | output[4] += in[6] >> 16; | ||
544 | output[3] += (in[6] & 0xffff) << 40; | ||
545 | output[2] -= in[6]; | ||
546 | |||
547 | output[3] += in[5] >> 16; | ||
548 | output[2] += (in[5] & 0xffff) << 40; | ||
549 | output[1] -= in[5]; | ||
550 | |||
551 | output[2] += output[4] >> 16; | ||
552 | output[1] += (output[4] & 0xffff) << 40; | ||
553 | output[0] -= output[4]; | ||
554 | |||
555 | /* Carry 2 -> 3 -> 4 */ | ||
556 | output[3] += output[2] >> 56; | ||
557 | output[2] &= 0x00ffffffffffffff; | ||
558 | |||
559 | output[4] = output[3] >> 56; | ||
560 | output[3] &= 0x00ffffffffffffff; | ||
561 | |||
562 | /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */ | ||
563 | |||
564 | /* Eliminate output[4] */ | ||
565 | output[2] += output[4] >> 16; | ||
566 | /* output[2] < 2^56 + 2^56 = 2^57 */ | ||
567 | output[1] += (output[4] & 0xffff) << 40; | ||
568 | output[0] -= output[4]; | ||
569 | |||
570 | /* Carry 0 -> 1 -> 2 -> 3 */ | ||
571 | output[1] += output[0] >> 56; | ||
572 | out[0] = output[0] & 0x00ffffffffffffff; | ||
573 | |||
574 | output[2] += output[1] >> 56; | ||
575 | /* output[2] < 2^57 + 2^72 */ | ||
576 | out[1] = output[1] & 0x00ffffffffffffff; | ||
577 | output[3] += output[2] >> 56; | ||
578 | /* output[3] <= 2^56 + 2^16 */ | ||
579 | out[2] = output[2] & 0x00ffffffffffffff; | ||
580 | |||
581 | /* out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, | ||
582 | * out[3] <= 2^56 + 2^16 (due to final carry), | ||
583 | * so out < 2*p */ | ||
584 | out[3] = output[3]; | ||
585 | } | ||
586 | |||
587 | static void felem_square_reduce(felem out, const felem in) | ||
588 | { | ||
589 | widefelem tmp; | ||
590 | felem_square(tmp, in); | ||
591 | felem_reduce(out, tmp); | ||
592 | } | ||
593 | |||
594 | static void felem_mul_reduce(felem out, const felem in1, const felem in2) | ||
595 | { | ||
596 | widefelem tmp; | ||
597 | felem_mul(tmp, in1, in2); | ||
598 | felem_reduce(out, tmp); | ||
599 | } | ||
600 | |||
601 | /* Reduce to unique minimal representation. | ||
602 | * Requires 0 <= in < 2*p (always call felem_reduce first) */ | ||
603 | static void felem_contract(felem out, const felem in) | ||
604 | { | ||
605 | static const int64_t two56 = ((limb) 1) << 56; | ||
606 | /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */ | ||
607 | /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */ | ||
608 | int64_t tmp[4], a; | ||
609 | tmp[0] = in[0]; | ||
610 | tmp[1] = in[1]; | ||
611 | tmp[2] = in[2]; | ||
612 | tmp[3] = in[3]; | ||
613 | /* Case 1: a = 1 iff in >= 2^224 */ | ||
614 | a = (in[3] >> 56); | ||
615 | tmp[0] -= a; | ||
616 | tmp[1] += a << 40; | ||
617 | tmp[3] &= 0x00ffffffffffffff; | ||
618 | /* Case 2: a = 0 iff p <= in < 2^224, i.e., | ||
619 | * the high 128 bits are all 1 and the lower part is non-zero */ | ||
620 | a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) | | ||
621 | (((int64_t)(in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63); | ||
622 | a &= 0x00ffffffffffffff; | ||
623 | /* turn a into an all-one mask (if a = 0) or an all-zero mask */ | ||
624 | a = (a - 1) >> 63; | ||
625 | /* subtract 2^224 - 2^96 + 1 if a is all-one*/ | ||
626 | tmp[3] &= a ^ 0xffffffffffffffff; | ||
627 | tmp[2] &= a ^ 0xffffffffffffffff; | ||
628 | tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff; | ||
629 | tmp[0] -= 1 & a; | ||
630 | |||
631 | /* eliminate negative coefficients: if tmp[0] is negative, tmp[1] must | ||
632 | * be non-zero, so we only need one step */ | ||
633 | a = tmp[0] >> 63; | ||
634 | tmp[0] += two56 & a; | ||
635 | tmp[1] -= 1 & a; | ||
636 | |||
637 | /* carry 1 -> 2 -> 3 */ | ||
638 | tmp[2] += tmp[1] >> 56; | ||
639 | tmp[1] &= 0x00ffffffffffffff; | ||
640 | |||
641 | tmp[3] += tmp[2] >> 56; | ||
642 | tmp[2] &= 0x00ffffffffffffff; | ||
643 | |||
644 | /* Now 0 <= out < p */ | ||
645 | out[0] = tmp[0]; | ||
646 | out[1] = tmp[1]; | ||
647 | out[2] = tmp[2]; | ||
648 | out[3] = tmp[3]; | ||
649 | } | ||
650 | |||
651 | /* Zero-check: returns 1 if input is 0, and 0 otherwise. | ||
652 | * We know that field elements are reduced to in < 2^225, | ||
653 | * so we only need to check three cases: 0, 2^224 - 2^96 + 1, | ||
654 | * and 2^225 - 2^97 + 2 */ | ||
655 | static limb felem_is_zero(const felem in) | ||
656 | { | ||
657 | limb zero, two224m96p1, two225m97p2; | ||
658 | |||
659 | zero = in[0] | in[1] | in[2] | in[3]; | ||
660 | zero = (((int64_t)(zero) - 1) >> 63) & 1; | ||
661 | two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000) | ||
662 | | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff); | ||
663 | two224m96p1 = (((int64_t)(two224m96p1) - 1) >> 63) & 1; | ||
664 | two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000) | ||
665 | | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff); | ||
666 | two225m97p2 = (((int64_t)(two225m97p2) - 1) >> 63) & 1; | ||
667 | return (zero | two224m96p1 | two225m97p2); | ||
668 | } | ||
669 | |||
670 | static limb felem_is_zero_int(const felem in) | ||
671 | { | ||
672 | return (int) (felem_is_zero(in) & ((limb)1)); | ||
673 | } | ||
674 | |||
/* Invert a field element */
/* Computation chain copied from djb's code */
/* Computes out = in^(p-2) mod p (Fermat's little theorem) using a fixed
 * chain of squarings and multiplications, so the sequence of operations —
 * and hence the timing — is independent of the value of in.  The comment
 * after each step records the exponent accumulated so far. */
static void felem_inv(felem out, const felem in)
{
	felem ftmp, ftmp2, ftmp3, ftmp4;
	widefelem tmp;
	unsigned i;

	felem_square(tmp, in); felem_reduce(ftmp, tmp);		/* 2 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^2 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp);	/* 2^4 - 2 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^5 - 4 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^6 - 8 */
	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp);	/* 2^6 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp);	/* 2^7 - 2 */
	for (i = 0; i < 5; ++i)					/* 2^12 - 2^6 */
	{
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}
	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp2, tmp);	/* 2^12 - 1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^13 - 2 */
	for (i = 0; i < 11; ++i)				/* 2^24 - 2^12 */
	{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);
	}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^24 - 1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^25 - 2 */
	for (i = 0; i < 23; ++i)				/* 2^48 - 2^24 */
	{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);
	}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^48 - 1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp);	/* 2^49 - 2 */
	for (i = 0; i < 47; ++i)				/* 2^96 - 2^48 */
	{
		felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);
	}
	/* ftmp3 = in^(2^96 - 1); kept live for the final multiplication */
	felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);	/* 2^96 - 1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp);	/* 2^97 - 2 */
	for (i = 0; i < 23; ++i)				/* 2^120 - 2^24 */
	{
		felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);
	}
	felem_mul(tmp, ftmp2, ftmp4); felem_reduce(ftmp2, tmp);	/* 2^120 - 1 */
	for (i = 0; i < 6; ++i)					/* 2^126 - 2^6 */
	{
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}
	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp);	/* 2^126 - 1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^127 - 2 */
	felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp);	/* 2^127 - 1 */
	for (i = 0; i < 97; ++i)				/* 2^224 - 2^97 */
	{
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}
	/* final exponent is p - 2 = 2^224 - 2^96 - 1 */
	felem_mul(tmp, ftmp, ftmp3); felem_reduce(out, tmp);	/* 2^224 - 2^96 - 1 */
}
734 | |||
735 | /* Copy in constant time: | ||
736 | * if icopy == 1, copy in to out, | ||
737 | * if icopy == 0, copy out to itself. */ | ||
738 | static void | ||
739 | copy_conditional(felem out, const felem in, limb icopy) | ||
740 | { | ||
741 | unsigned i; | ||
742 | /* icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one */ | ||
743 | const limb copy = -icopy; | ||
744 | for (i = 0; i < 4; ++i) | ||
745 | { | ||
746 | const limb tmp = copy & (in[i] ^ out[i]); | ||
747 | out[i] ^= tmp; | ||
748 | } | ||
749 | } | ||
750 | |||
751 | /******************************************************************************/ | ||
752 | /* ELLIPTIC CURVE POINT OPERATIONS | ||
753 | * | ||
754 | * Points are represented in Jacobian projective coordinates: | ||
755 | * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3), | ||
756 | * or to the point at infinity if Z == 0. | ||
757 | * | ||
758 | */ | ||
759 | |||
/* Double an elliptic curve point:
 * (X', Y', Z') = 2 * (X, Y, Z), where
 * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
 * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested).
 *
 * The interleaved bound comments (e.g. "ftmp[i] < 2^59") track the maximum
 * limb magnitude after each step; they justify that no intermediate value
 * overflows before the next felem_reduce.  Do not reorder operations
 * without re-deriving these bounds. */
static void
point_double(felem x_out, felem y_out, felem z_out,
	const felem x_in, const felem y_in, const felem z_in)
{
	widefelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;

	/* local copies so x_out may alias x_in */
	felem_assign(ftmp, x_in);
	felem_assign(ftmp2, x_in);

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);

	/* beta = x*gamma */
	felem_mul(tmp, x_in, gamma);
	felem_reduce(beta, tmp);

	/* alpha = 3*(x-delta)*(x+delta) */
	felem_diff(ftmp, delta);
	/* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
	felem_sum(ftmp2, delta);
	/* ftmp2[i] < 2^57 + 2^57 = 2^58 */
	felem_scalar(ftmp2, 3);
	/* ftmp2[i] < 3 * 2^58 < 2^60 */
	felem_mul(tmp, ftmp, ftmp2);
	/* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
	felem_reduce(alpha, tmp);

	/* x' = alpha^2 - 8*beta */
	felem_square(tmp, alpha);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
	felem_assign(ftmp, beta);
	felem_scalar(ftmp, 8);
	/* ftmp[i] < 8 * 2^57 = 2^60 */
	felem_diff_128_64(tmp, ftmp);
	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
	felem_reduce(x_out, tmp);

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum(delta, gamma);
	/* delta[i] < 2^57 + 2^57 = 2^58 */
	felem_assign(ftmp, y_in);
	felem_sum(ftmp, z_in);
	/* ftmp[i] < 2^57 + 2^57 = 2^58 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
	felem_diff_128_64(tmp, delta);
	/* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
	felem_reduce(z_out, tmp);

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar(beta, 4);
	/* beta[i] < 4 * 2^57 = 2^59 */
	felem_diff(beta, x_out);
	/* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
	felem_mul(tmp, alpha, beta);
	/* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
	felem_square(tmp2, gamma);
	/* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
	widefelem_scalar(tmp2, 8);
	/* tmp2[i] < 8 * 2^116 = 2^119 */
	widefelem_diff(tmp, tmp2);
	/* tmp[i] < 2^119 + 2^120 < 2^121 */
	felem_reduce(y_out, tmp);
}
837 | |||
/* Add two elliptic curve points:
 * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 * Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 *
 * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 */

/* This function is not entirely constant-time:
 * it includes a branch for checking whether the two input points are equal,
 * (while not equal to the point at infinity).
 * This case never happens during single point multiplication,
 * so there is no timing leak for ECDH or ECDSA signing. */
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const felem x2, const felem y2, const felem z2)
{
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
	widefelem tmp, tmp2;
	limb z1_is_zero, z2_is_zero, x_equal, y_equal;

	if (!mixed)
	{
		/* ftmp2 = z2^2 */
		felem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);

		/* ftmp4 = z2^3 */
		felem_mul(tmp, ftmp2, z2);
		felem_reduce(ftmp4, tmp);

		/* ftmp4 = z2^3*y1 */
		felem_mul(tmp2, ftmp4, y1);
		felem_reduce(ftmp4, tmp2);

		/* ftmp2 = z2^2*x1 */
		felem_mul(tmp2, ftmp2, x1);
		felem_reduce(ftmp2, tmp2);
	}
	else
	{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* ftmp4 = z2^3*y1 */
		felem_assign(ftmp4, y1);

		/* ftmp2 = z2^2*x1 */
		felem_assign(ftmp2, x1);
	}

	/* ftmp = z1^2 */
	felem_square(tmp, z1);
	felem_reduce(ftmp, tmp);

	/* ftmp3 = z1^3 */
	felem_mul(tmp, ftmp, z1);
	felem_reduce(ftmp3, tmp);

	/* tmp = z1^3*y2 */
	felem_mul(tmp, ftmp3, y2);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

	/* ftmp3 = z1^3*y2 - z2^3*y1 */
	felem_diff_128_64(tmp, ftmp4);
	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
	felem_reduce(ftmp3, tmp);

	/* tmp = z1^2*x2 */
	felem_mul(tmp, ftmp, x2);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

	/* ftmp = z1^2*x2 - z2^2*x1 */
	felem_diff_128_64(tmp, ftmp2);
	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
	felem_reduce(ftmp, tmp);

	/* the formulae are incorrect if the points are equal
	 * so we check for this and do doubling if this happens */
	x_equal = felem_is_zero(ftmp);
	y_equal = felem_is_zero(ftmp3);
	z1_is_zero = felem_is_zero(z1);
	z2_is_zero = felem_is_zero(z2);
	/* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
	{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
	}

	/* ftmp5 = z1*z2 */
	if (!mixed)
	{
		felem_mul(tmp, z1, z2);
		felem_reduce(ftmp5, tmp);
	}
	else
	{
		/* special case z2 = 0 is handled later */
		felem_assign(ftmp5, z1);
	}

	/* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
	felem_mul(tmp, ftmp, ftmp5);
	felem_reduce(z_out, tmp);

	/* ftmp = (z1^2*x2 - z2^2*x1)^2 */
	felem_assign(ftmp5, ftmp);
	felem_square(tmp, ftmp);
	felem_reduce(ftmp, tmp);

	/* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
	felem_mul(tmp, ftmp, ftmp5);
	felem_reduce(ftmp5, tmp);

	/* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
	felem_mul(tmp, ftmp2, ftmp);
	felem_reduce(ftmp2, tmp);

	/* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
	felem_mul(tmp, ftmp4, ftmp5);
	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */

	/* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
	felem_square(tmp2, ftmp3);
	/* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */

	/* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
	felem_diff_128_64(tmp2, ftmp5);
	/* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */

	/* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
	felem_assign(ftmp5, ftmp2);
	felem_scalar(ftmp5, 2);
	/* ftmp5[i] < 2 * 2^57 = 2^58 */

	/* x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
	   2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
	felem_diff_128_64(tmp2, ftmp5);
	/* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
	felem_reduce(x_out, tmp2);

	/* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
	felem_diff(ftmp2, x_out);
	/* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */

	/* tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) */
	felem_mul(tmp2, ftmp3, ftmp2);
	/* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */

	/* y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
	   z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
	widefelem_diff(tmp2, tmp);
	/* tmp2[i] < 2^118 + 2^120 < 2^121 */
	felem_reduce(y_out, tmp2);

	/* the result (x_out, y_out, z_out) is incorrect if one of the inputs is
	 * the point at infinity, so we need to check for this separately */

	/* if point 1 is at infinity, copy point 2 to output, and vice versa;
	 * done with constant-time conditional copies, not branches */
	copy_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
}
1010 | |||
1011 | /* select_point selects the |idx|th point from a precomputation table and | ||
1012 | * copies it to out. */ | ||
1013 | static void select_point(const u64 idx, unsigned int size, const felem pre_comp[/*size*/][3], felem out[3]) | ||
1014 | { | ||
1015 | unsigned i, j; | ||
1016 | limb *outlimbs = &out[0][0]; | ||
1017 | memset(outlimbs, 0, 3 * sizeof(felem)); | ||
1018 | |||
1019 | for (i = 0; i < size; i++) | ||
1020 | { | ||
1021 | const limb *inlimbs = &pre_comp[i][0][0]; | ||
1022 | u64 mask = i ^ idx; | ||
1023 | mask |= mask >> 4; | ||
1024 | mask |= mask >> 2; | ||
1025 | mask |= mask >> 1; | ||
1026 | mask &= 1; | ||
1027 | mask--; | ||
1028 | for (j = 0; j < 4 * 3; j++) | ||
1029 | outlimbs[j] |= inlimbs[j] & mask; | ||
1030 | } | ||
1031 | } | ||
1032 | |||
1033 | /* get_bit returns the |i|th bit in |in| */ | ||
1034 | static char get_bit(const felem_bytearray in, unsigned i) | ||
1035 | { | ||
1036 | if (i >= 224) | ||
1037 | return 0; | ||
1038 | return (in[i >> 3] >> (i & 7)) & 1; | ||
1039 | } | ||
1040 | |||
/* Interleaved point multiplication using precomputed point multiples:
 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out.
 *
 * Point selections use select_point/copy_conditional, so table lookups are
 * constant-time; the loop structure depends only on num_points and whether
 * g_scalar is non-NULL, not on scalar values. */
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[2][16][3])
{
	int i, skip;
	unsigned num;
	unsigned gen_mul = (g_scalar != NULL);
	felem nq[3], tmp[4];
	u64 bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (two in each of the last 28 rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1; /* save two point operations in the first round */
	for (i = (num_points ? 220 : 27); i >= 0; --i)
	{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator */
		if (gen_mul && (i <= 27))
		{
			/* first, look 28 bits upwards: a 4-bit window spread across
			 * the scalar at 56-bit strides */
			bits = get_bit(g_scalar, i + 196) << 3;
			bits |= get_bit(g_scalar, i + 140) << 2;
			bits |= get_bit(g_scalar, i + 84) << 1;
			bits |= get_bit(g_scalar, i + 28);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[1], tmp);

			if (!skip)
			{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
			}
			else
			{
				/* first accumulation: just load the point */
				memcpy(nq, tmp, 3 * sizeof(felem));
				skip = 0;
			}

			/* second, look at the current position */
			bits = get_bit(g_scalar, i + 168) << 3;
			bits |= get_bit(g_scalar, i + 112) << 2;
			bits |= get_bit(g_scalar, i + 56) << 1;
			bits |= get_bit(g_scalar, i);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[0], tmp);
			point_add(nq[0], nq[1], nq[2],
				nq[0], nq[1], nq[2],
				1 /* mixed */, tmp[0], tmp[1], tmp[2]);
		}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
		{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
			{
				/* 6-bit sliding window; recoded into sign/digit so
				 * only multiples 0..16 are needed in the table */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract */
				select_point(digit, 17, pre_comp[num], tmp);
				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_conditional(tmp[1], tmp[3], sign);

				if (!skip)
				{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
				}
				else
				{
					memcpy(nq, tmp, 3 * sizeof(felem));
					skip = 0;
				}
			}
		}
	}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
}
1143 | |||
1144 | /******************************************************************************/ | ||
1145 | /* FUNCTIONS TO MANAGE PRECOMPUTATION | ||
1146 | */ | ||
1147 | |||
1148 | static NISTP224_PRE_COMP *nistp224_pre_comp_new() | ||
1149 | { | ||
1150 | NISTP224_PRE_COMP *ret = NULL; | ||
1151 | ret = (NISTP224_PRE_COMP *) OPENSSL_malloc(sizeof *ret); | ||
1152 | if (!ret) | ||
1153 | { | ||
1154 | ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); | ||
1155 | return ret; | ||
1156 | } | ||
1157 | memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); | ||
1158 | ret->references = 1; | ||
1159 | return ret; | ||
1160 | } | ||
1161 | |||
1162 | static void *nistp224_pre_comp_dup(void *src_) | ||
1163 | { | ||
1164 | NISTP224_PRE_COMP *src = src_; | ||
1165 | |||
1166 | /* no need to actually copy, these objects never change! */ | ||
1167 | CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); | ||
1168 | |||
1169 | return src_; | ||
1170 | } | ||
1171 | |||
1172 | static void nistp224_pre_comp_free(void *pre_) | ||
1173 | { | ||
1174 | int i; | ||
1175 | NISTP224_PRE_COMP *pre = pre_; | ||
1176 | |||
1177 | if (!pre) | ||
1178 | return; | ||
1179 | |||
1180 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
1181 | if (i > 0) | ||
1182 | return; | ||
1183 | |||
1184 | OPENSSL_free(pre); | ||
1185 | } | ||
1186 | |||
1187 | static void nistp224_pre_comp_clear_free(void *pre_) | ||
1188 | { | ||
1189 | int i; | ||
1190 | NISTP224_PRE_COMP *pre = pre_; | ||
1191 | |||
1192 | if (!pre) | ||
1193 | return; | ||
1194 | |||
1195 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
1196 | if (i > 0) | ||
1197 | return; | ||
1198 | |||
1199 | OPENSSL_cleanse(pre, sizeof *pre); | ||
1200 | OPENSSL_free(pre); | ||
1201 | } | ||
1202 | |||
1203 | /******************************************************************************/ | ||
1204 | /* OPENSSL EC_METHOD FUNCTIONS | ||
1205 | */ | ||
1206 | |||
1207 | int ec_GFp_nistp224_group_init(EC_GROUP *group) | ||
1208 | { | ||
1209 | int ret; | ||
1210 | ret = ec_GFp_simple_group_init(group); | ||
1211 | group->a_is_minus3 = 1; | ||
1212 | return ret; | ||
1213 | } | ||
1214 | |||
1215 | int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, | ||
1216 | const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) | ||
1217 | { | ||
1218 | int ret = 0; | ||
1219 | BN_CTX *new_ctx = NULL; | ||
1220 | BIGNUM *curve_p, *curve_a, *curve_b; | ||
1221 | |||
1222 | if (ctx == NULL) | ||
1223 | if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; | ||
1224 | BN_CTX_start(ctx); | ||
1225 | if (((curve_p = BN_CTX_get(ctx)) == NULL) || | ||
1226 | ((curve_a = BN_CTX_get(ctx)) == NULL) || | ||
1227 | ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err; | ||
1228 | BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p); | ||
1229 | BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a); | ||
1230 | BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b); | ||
1231 | if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || | ||
1232 | (BN_cmp(curve_b, b))) | ||
1233 | { | ||
1234 | ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE, | ||
1235 | EC_R_WRONG_CURVE_PARAMETERS); | ||
1236 | goto err; | ||
1237 | } | ||
1238 | group->field_mod_func = BN_nist_mod_224; | ||
1239 | ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); | ||
1240 | err: | ||
1241 | BN_CTX_end(ctx); | ||
1242 | if (new_ctx != NULL) | ||
1243 | BN_CTX_free(new_ctx); | ||
1244 | return ret; | ||
1245 | } | ||
1246 | |||
/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 * (X', Y') = (X/Z^2, Y/Z^3).  Either of x or y may be NULL if that
 * coordinate is not wanted.  Returns 1 on success, 0 on error (point at
 * infinity, or a BN conversion failure). */
int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
{
	felem z1, z2, x_in, y_in, x_out, y_out;
	widefelem tmp;

	if (EC_POINT_is_at_infinity(group, point))
	{
		ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
			EC_R_POINT_AT_INFINITY);
		return 0;
	}
	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
		(!BN_to_felem(z1, &point->Z))) return 0;
	/* z2 = Z^-1; only one field inversion is performed */
	felem_inv(z2, z1);
	/* z1 = Z^-2 */
	felem_square(tmp, z2); felem_reduce(z1, tmp);
	/* x = X * Z^-2 */
	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
	felem_contract(x_out, x_in);
	if (x != NULL)
	{
		if (!felem_to_BN(x, x_out)) {
			ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
				ERR_R_BN_LIB);
			return 0;
		}
	}
	/* z1 = Z^-3, reusing z1 = Z^-2 and z2 = Z^-1 from above */
	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
	/* y = Y * Z^-3 */
	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
	felem_contract(y_out, y_in);
	if (y != NULL)
	{
		if (!felem_to_BN(y, y_out)) {
			ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
				ERR_R_BN_LIB);
			return 0;
		}
	}
	return 1;
}
1288 | |||
/* Convert num Jacobian points to affine form (Z == 1) in a single batch,
 * sharing one field inversion via the generic nistp helper.  tmp_felems
 * must have room for num+1 scratch elements.  The casts adapt this file's
 * felem-typed primitives to the helper's void-pointer callback interface. */
static void make_points_affine(size_t num, felem points[/*num*/][3], felem tmp_felems[/*num+1*/])
{
	/* Runs in constant time, unless an input is the point at infinity
	 * (which normally shouldn't happen). */
	ec_GFp_nistp_points_make_affine_internal(
		num,
		points,
		sizeof(felem),
		tmp_felems,
		(void (*)(void *)) felem_one,
		(int (*)(const void *)) felem_is_zero_int,
		(void (*)(void *, const void *)) felem_assign,
		(void (*)(void *, const void *)) felem_square_reduce,
		(void (*)(void *, const void *, const void *)) felem_mul_reduce,
		(void (*)(void *, const void *)) felem_inv,
		(void (*)(void *, const void *)) felem_contract);
}
1306 | |||
/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
 * Result is stored in r (r can equal one of the inputs).
 *
 * If scalar is non-NULL and stored precomputation (or the built-in table
 * gmul) matches the group generator, the generator term uses the fast
 * precomputed path; otherwise the generator is appended as one more
 * ordinary point.  Scalars outside [0, 2^224) are first reduced mod the
 * group order via BN_nnmod, a path that is not constant-time (noted
 * inline).  Returns 1 on success, 0 on error. */
int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
	const BIGNUM *scalars[], BN_CTX *ctx)
{
	int ret = 0;
	int j;
	unsigned i;
	int mixed = 0;
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y, *z, *tmp_scalar;
	felem_bytearray g_secret;
	felem_bytearray *secrets = NULL;
	felem (*pre_comp)[17][3] = NULL;
	felem *tmp_felems = NULL;
	felem_bytearray tmp;
	unsigned num_bytes;
	int have_pre_comp = 0;
	size_t num_points = num;
	felem x_in, y_in, z_in, x_out, y_out, z_out;
	NISTP224_PRE_COMP *pre = NULL;
	const felem (*g_pre_comp)[16][3] = NULL;
	EC_POINT *generator = NULL;
	const EC_POINT *p = NULL;
	const BIGNUM *p_scalar = NULL;

	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL) ||
		((z = BN_CTX_get(ctx)) == NULL) ||
		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
		goto err;

	if (scalar != NULL)
	{
		pre = EC_EX_DATA_get_data(group->extra_data,
			nistp224_pre_comp_dup, nistp224_pre_comp_free,
			nistp224_pre_comp_clear_free);
		if (pre)
			/* we have precomputation, try to use it */
			g_pre_comp = (const felem (*)[16][3]) pre->g_pre_comp;
		else
			/* try to use the standard precomputation */
			g_pre_comp = &gmul[0];
		generator = EC_POINT_new(group);
		if (generator == NULL)
			goto err;
		/* get the generator from precomputation */
		if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
			!felem_to_BN(y, g_pre_comp[0][1][1]) ||
			!felem_to_BN(z, g_pre_comp[0][1][2]))
		{
			ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
			goto err;
		}
		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
				generator, x, y, z, ctx))
			goto err;
		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
			/* precomputation matches generator */
			have_pre_comp = 1;
		else
			/* we don't have valid precomputation:
			 * treat the generator as a random point */
			num_points = num_points + 1;
	}

	if (num_points > 0)
	{
		if (num_points >= 3)
		{
			/* unless we precompute multiples for just one or two points,
			 * converting those into affine form is time well spent */
			mixed = 1;
		}
		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
		if (mixed)
			tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
		{
			ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
			goto err;
		}

		/* we treat NULL scalars as 0, and NULL points as points at infinity,
		 * i.e., they contribute nothing to the linear combination */
		memset(secrets, 0, num_points * sizeof(felem_bytearray));
		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
		for (i = 0; i < num_points; ++i)
		{
			if (i == num)
				/* the generator */
			{
				p = EC_GROUP_get0_generator(group);
				p_scalar = scalar;
			}
			else
				/* the i^th point */
			{
				p = points[i];
				p_scalar = scalars[i];
			}
			if ((p_scalar != NULL) && (p != NULL))
			{
				/* reduce scalar to 0 <= scalar < 2^224 */
				if ((BN_num_bits(p_scalar) > 224) || (BN_is_negative(p_scalar)))
				{
					/* this is an unusual input, and we don't guarantee
					 * constant-timeness */
					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
					{
						ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
						goto err;
					}
					num_bytes = BN_bn2bin(tmp_scalar, tmp);
				}
				else
					num_bytes = BN_bn2bin(p_scalar, tmp);
				/* batch_mul expects little-endian scalar bytes */
				flip_endian(secrets[i], tmp, num_bytes);
				/* precompute multiples 1*P .. 16*P of this point */
				if ((!BN_to_felem(x_out, &p->X)) ||
					(!BN_to_felem(y_out, &p->Y)) ||
					(!BN_to_felem(z_out, &p->Z))) goto err;
				felem_assign(pre_comp[i][1][0], x_out);
				felem_assign(pre_comp[i][1][1], y_out);
				felem_assign(pre_comp[i][1][2], z_out);
				for (j = 2; j <= 16; ++j)
				{
					/* odd j: (j-1)*P + P; even j: double of (j/2)*P */
					if (j & 1)
					{
						point_add(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
							0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
					}
					else
					{
						point_double(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
					}
				}
			}
		}
		if (mixed)
			make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
	}

	/* the scalar for the generator */
	if ((scalar != NULL) && (have_pre_comp))
	{
		memset(g_secret, 0, sizeof g_secret);
		/* reduce scalar to 0 <= scalar < 2^224 */
		if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar)))
		{
			/* this is an unusual input, and we don't guarantee
			 * constant-timeness */
			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
			{
				ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
				goto err;
			}
			num_bytes = BN_bn2bin(tmp_scalar, tmp);
		}
		else
			num_bytes = BN_bn2bin(scalar, tmp);
		flip_endian(g_secret, tmp, num_bytes);
		/* do the multiplication with generator precomputation*/
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			g_secret,
			mixed, (const felem (*)[17][3]) pre_comp,
			g_pre_comp);
	}
	else
		/* do the multiplication without generator precomputation */
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
	/* reduce the output to its unique minimal representation */
	felem_contract(x_in, x_out);
	felem_contract(y_in, y_out);
	felem_contract(z_in, z_out);
	if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
		(!felem_to_BN(z, z_in)))
	{
		ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
		goto err;
	}
	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (secrets != NULL)
		OPENSSL_free(secrets);
	if (pre_comp != NULL)
		OPENSSL_free(pre_comp);
	if (tmp_felems != NULL)
		OPENSSL_free(tmp_felems);
	return ret;
}
1516 | |||
/* ec_GFp_nistp224_precompute_mult computes and caches multiples of the group
 * generator to speed up later generator multiplications.
 * Returns 1 on success, 0 on error. Any previously cached table for this
 * group is discarded first. */
int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
	{
	int ret = 0;
	NISTP224_PRE_COMP *pre = NULL;
	int i, j;
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y;
	EC_POINT *generator = NULL;
	felem tmp_felems[32];

	/* throw away old precomputation */
	EC_EX_DATA_free_data(&group->extra_data, nistp224_pre_comp_dup,
		nistp224_pre_comp_free, nistp224_pre_comp_clear_free);
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL))
		goto err;
	/* get the generator */
	if (group->generator == NULL) goto err;
	generator = EC_POINT_new(group);
	if (generator == NULL)
		goto err;
	/* nistp224_curve_params[3]/[4] are the standard generator's affine
	 * x and y coordinates, big-endian */
	BN_bin2bn(nistp224_curve_params[3], sizeof (felem_bytearray), x);
	BN_bin2bn(nistp224_curve_params[4], sizeof (felem_bytearray), y);
	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
		goto err;
	if ((pre = nistp224_pre_comp_new()) == NULL)
		goto err;
	/* if the generator is the standard one, use built-in precomputation */
	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
		{
		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
		ret = 1;
		/* NOTE(review): this jumps to err with ret == 1, so |pre| is
		 * freed rather than stored via EC_EX_DATA_set_data below --
		 * presumably the multiplication code falls back to the static
		 * gmul table itself; verify against ec_GFp_nistp224_points_mul. */
		goto err;
		}
	if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) ||
		(!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) ||
		(!BN_to_felem(pre->g_pre_comp[0][1][2], &group->generator->Z)))
		goto err;
	/* compute 2^56*G, 2^112*G, 2^168*G for the first table,
	 * 2^28*G, 2^84*G, 2^140*G, 2^196*G for the second one
	 */
	for (i = 1; i <= 8; i <<= 1)
		{
		/* table[1][i] = 2^28 * table[0][i]: one doubling here ... */
		point_double(
			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
			pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
		for (j = 0; j < 27; ++j)
			{
			/* ... plus 27 in-place doublings (28 in total) */
			point_double(
				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
			}
		if (i == 8)
			break;
		/* table[0][2i] = 2^28 * table[1][i], again 28 doublings */
		point_double(
			pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
		for (j = 0; j < 27; ++j)
			{
			point_double(
				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]);
			}
		}
	for (i = 0; i < 2; i++)
		{
		/* g_pre_comp[i][0] is the point at infinity */
		memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
		/* the remaining multiples */
		/* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
		point_add(
			pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
			pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
			pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
			0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
			pre->g_pre_comp[i][2][2]);
		/* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
		point_add(
			pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
			pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
			pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
			0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
			pre->g_pre_comp[i][2][2]);
		/* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
		point_add(
			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
			pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
			pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
			0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
			pre->g_pre_comp[i][4][2]);
		/* 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G */
		point_add(
			pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
			pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
			pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
			0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
			pre->g_pre_comp[i][2][2]);
		for (j = 1; j < 8; ++j)
			{
			/* odd multiples: add G resp. 2^28*G */
			point_add(
				pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1],
				pre->g_pre_comp[i][2*j+1][2], pre->g_pre_comp[i][2*j][0],
				pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2],
				0, pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
				pre->g_pre_comp[i][1][2]);
			}
		}
	/* convert the 31 non-infinity table entries to affine form in one batch */
	make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);

	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup,
		nistp224_pre_comp_free, nistp224_pre_comp_clear_free))
		goto err;
	ret = 1;
	pre = NULL;	/* ownership transferred to group->extra_data */
 err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (pre)
		nistp224_pre_comp_free(pre);
	return ret;
	}
1645 | |||
1646 | int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group) | ||
1647 | { | ||
1648 | if (EC_EX_DATA_get_data(group->extra_data, nistp224_pre_comp_dup, | ||
1649 | nistp224_pre_comp_free, nistp224_pre_comp_clear_free) | ||
1650 | != NULL) | ||
1651 | return 1; | ||
1652 | else | ||
1653 | return 0; | ||
1654 | } | ||
1655 | |||
#else
/* Keep the translation unit non-empty (some compilers warn otherwise)
 * when this implementation is compiled out. */
static void *dummy=&dummy;
#endif
diff --git a/src/lib/libcrypto/ec/ecp_nistp256.c b/src/lib/libcrypto/ec/ecp_nistp256.c new file mode 100644 index 0000000000..4bc0f5dce0 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp256.c | |||
@@ -0,0 +1,2171 @@ | |||
1 | /* crypto/ec/ecp_nistp256.c */ | ||
2 | /* | ||
3 | * Written by Adam Langley (Google) for the OpenSSL project | ||
4 | */ | ||
5 | /* Copyright 2011 Google Inc. | ||
6 | * | ||
7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
8 | * | ||
9 | * you may not use this file except in compliance with the License. | ||
10 | * You may obtain a copy of the License at | ||
11 | * | ||
12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
13 | * | ||
14 | * Unless required by applicable law or agreed to in writing, software | ||
15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
17 | * See the License for the specific language governing permissions and | ||
18 | * limitations under the License. | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication | ||
23 | * | ||
24 | * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. | ||
25 | * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 | ||
26 | * work which got its smarts from Daniel J. Bernstein's work on the same. | ||
27 | */ | ||
28 | |||
29 | #include <openssl/opensslconf.h> | ||
30 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
31 | |||
32 | #ifndef OPENSSL_SYS_VMS | ||
33 | #include <stdint.h> | ||
34 | #else | ||
35 | #include <inttypes.h> | ||
36 | #endif | ||
37 | |||
38 | #include <string.h> | ||
39 | #include <openssl/err.h> | ||
40 | #include "ec_lcl.h" | ||
41 | |||
/* 128-bit integer support is required for the 64x64->128 multiplications
 * below; gcc provides __uint128_t/__int128_t on 64-bit targets only. */
#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
/* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;	/* nonstandard; implemented by gcc on 64-bit platforms */
typedef __int128_t int128_t;
#else
#error "Need GCC 3.1 or later to define type uint128_t"
#endif

/* Short fixed-width aliases used throughout this file. */
typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int64_t s64;

/* The underlying field.
 *
 * P256 operates over GF(2^256-2^224+2^192+2^96-1). We can serialise an element
 * of this field into 32 bytes. We call this an felem_bytearray. */

typedef u8 felem_bytearray[32];

/* These are the parameters of P256, taken from FIPS 186-3, page 86. These
 * values are big-endian. */
static const felem_bytearray nistp256_curve_params[5] = {
	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
	 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
	 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
	{0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
	 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
	 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
	 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
	{0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
	 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
	 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
	 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
	{0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
	 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
	 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
	 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
	};

/* The representation of field elements.
 * ------------------------------------
 *
 * We represent field elements with either four 128-bit values, eight 128-bit
 * values, or four 64-bit values. The field element represented is:
 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
 * or:
 *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
 *
 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
 * apart, but are 128-bits wide, the most significant bits of each limb overlap
 * with the least significant bits of the next.
 *
 * A field element with four limbs is an 'felem'. One with eight limbs is a
 * 'longfelem'
 *
 * A field element with four, 64-bit values is called a 'smallfelem'. Small
 * values are used as intermediate values before multiplication.
 */

#define NLIMBS 4

typedef uint128_t limb;
typedef limb felem[NLIMBS];
typedef limb longfelem[NLIMBS * 2];
typedef u64 smallfelem[NLIMBS];

/* This is the value of the prime as four 64-bit words, little-endian. */
static const u64 kPrime[4] = { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
static const limb bottom32bits = 0xffffffff;
static const u64 bottom63bits = 0x7ffffffffffffffful;
118 | |||
119 | /* bin32_to_felem takes a little-endian byte array and converts it into felem | ||
120 | * form. This assumes that the CPU is little-endian. */ | ||
121 | static void bin32_to_felem(felem out, const u8 in[32]) | ||
122 | { | ||
123 | out[0] = *((u64*) &in[0]); | ||
124 | out[1] = *((u64*) &in[8]); | ||
125 | out[2] = *((u64*) &in[16]); | ||
126 | out[3] = *((u64*) &in[24]); | ||
127 | } | ||
128 | |||
129 | /* smallfelem_to_bin32 takes a smallfelem and serialises into a little endian, | ||
130 | * 32 byte array. This assumes that the CPU is little-endian. */ | ||
131 | static void smallfelem_to_bin32(u8 out[32], const smallfelem in) | ||
132 | { | ||
133 | *((u64*) &out[0]) = in[0]; | ||
134 | *((u64*) &out[8]) = in[1]; | ||
135 | *((u64*) &out[16]) = in[2]; | ||
136 | *((u64*) &out[24]) = in[3]; | ||
137 | } | ||
138 | |||
139 | /* To preserve endianness when using BN_bn2bin and BN_bin2bn */ | ||
140 | static void flip_endian(u8 *out, const u8 *in, unsigned len) | ||
141 | { | ||
142 | unsigned i; | ||
143 | for (i = 0; i < len; ++i) | ||
144 | out[i] = in[len-1-i]; | ||
145 | } | ||
146 | |||
147 | /* BN_to_felem converts an OpenSSL BIGNUM into an felem */ | ||
148 | static int BN_to_felem(felem out, const BIGNUM *bn) | ||
149 | { | ||
150 | felem_bytearray b_in; | ||
151 | felem_bytearray b_out; | ||
152 | unsigned num_bytes; | ||
153 | |||
154 | /* BN_bn2bin eats leading zeroes */ | ||
155 | memset(b_out, 0, sizeof b_out); | ||
156 | num_bytes = BN_num_bytes(bn); | ||
157 | if (num_bytes > sizeof b_out) | ||
158 | { | ||
159 | ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); | ||
160 | return 0; | ||
161 | } | ||
162 | if (BN_is_negative(bn)) | ||
163 | { | ||
164 | ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); | ||
165 | return 0; | ||
166 | } | ||
167 | num_bytes = BN_bn2bin(bn, b_in); | ||
168 | flip_endian(b_out, b_in, num_bytes); | ||
169 | bin32_to_felem(out, b_out); | ||
170 | return 1; | ||
171 | } | ||
172 | |||
173 | /* felem_to_BN converts an felem into an OpenSSL BIGNUM */ | ||
174 | static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in) | ||
175 | { | ||
176 | felem_bytearray b_in, b_out; | ||
177 | smallfelem_to_bin32(b_in, in); | ||
178 | flip_endian(b_out, b_in, sizeof b_out); | ||
179 | return BN_bin2bn(b_out, sizeof b_out, out); | ||
180 | } | ||
181 | |||
182 | |||
183 | /* Field operations | ||
184 | * ---------------- */ | ||
185 | |||
186 | static void smallfelem_one(smallfelem out) | ||
187 | { | ||
188 | out[0] = 1; | ||
189 | out[1] = 0; | ||
190 | out[2] = 0; | ||
191 | out[3] = 0; | ||
192 | } | ||
193 | |||
194 | static void smallfelem_assign(smallfelem out, const smallfelem in) | ||
195 | { | ||
196 | out[0] = in[0]; | ||
197 | out[1] = in[1]; | ||
198 | out[2] = in[2]; | ||
199 | out[3] = in[3]; | ||
200 | } | ||
201 | |||
202 | static void felem_assign(felem out, const felem in) | ||
203 | { | ||
204 | out[0] = in[0]; | ||
205 | out[1] = in[1]; | ||
206 | out[2] = in[2]; | ||
207 | out[3] = in[3]; | ||
208 | } | ||
209 | |||
210 | /* felem_sum sets out = out + in. */ | ||
211 | static void felem_sum(felem out, const felem in) | ||
212 | { | ||
213 | out[0] += in[0]; | ||
214 | out[1] += in[1]; | ||
215 | out[2] += in[2]; | ||
216 | out[3] += in[3]; | ||
217 | } | ||
218 | |||
219 | /* felem_small_sum sets out = out + in. */ | ||
220 | static void felem_small_sum(felem out, const smallfelem in) | ||
221 | { | ||
222 | out[0] += in[0]; | ||
223 | out[1] += in[1]; | ||
224 | out[2] += in[2]; | ||
225 | out[3] += in[3]; | ||
226 | } | ||
227 | |||
228 | /* felem_scalar sets out = out * scalar */ | ||
229 | static void felem_scalar(felem out, const u64 scalar) | ||
230 | { | ||
231 | out[0] *= scalar; | ||
232 | out[1] *= scalar; | ||
233 | out[2] *= scalar; | ||
234 | out[3] *= scalar; | ||
235 | } | ||
236 | |||
237 | /* longfelem_scalar sets out = out * scalar */ | ||
238 | static void longfelem_scalar(longfelem out, const u64 scalar) | ||
239 | { | ||
240 | out[0] *= scalar; | ||
241 | out[1] *= scalar; | ||
242 | out[2] *= scalar; | ||
243 | out[3] *= scalar; | ||
244 | out[4] *= scalar; | ||
245 | out[5] *= scalar; | ||
246 | out[6] *= scalar; | ||
247 | out[7] *= scalar; | ||
248 | } | ||
249 | |||
250 | #define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9) | ||
251 | #define two105 (((limb)1) << 105) | ||
252 | #define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9) | ||
253 | |||
254 | /* zero105 is 0 mod p */ | ||
255 | static const felem zero105 = { two105m41m9, two105, two105m41p9, two105m41p9 }; | ||
256 | |||
257 | /* smallfelem_neg sets |out| to |-small| | ||
258 | * On exit: | ||
259 | * out[i] < out[i] + 2^105 | ||
260 | */ | ||
261 | static void smallfelem_neg(felem out, const smallfelem small) | ||
262 | { | ||
263 | /* In order to prevent underflow, we subtract from 0 mod p. */ | ||
264 | out[0] = zero105[0] - small[0]; | ||
265 | out[1] = zero105[1] - small[1]; | ||
266 | out[2] = zero105[2] - small[2]; | ||
267 | out[3] = zero105[3] - small[3]; | ||
268 | } | ||
269 | |||
270 | /* felem_diff subtracts |in| from |out| | ||
271 | * On entry: | ||
272 | * in[i] < 2^104 | ||
273 | * On exit: | ||
274 | * out[i] < out[i] + 2^105 | ||
275 | */ | ||
276 | static void felem_diff(felem out, const felem in) | ||
277 | { | ||
278 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
279 | out[0] += zero105[0]; | ||
280 | out[1] += zero105[1]; | ||
281 | out[2] += zero105[2]; | ||
282 | out[3] += zero105[3]; | ||
283 | |||
284 | out[0] -= in[0]; | ||
285 | out[1] -= in[1]; | ||
286 | out[2] -= in[2]; | ||
287 | out[3] -= in[3]; | ||
288 | } | ||
289 | |||
290 | #define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11) | ||
291 | #define two107 (((limb)1) << 107) | ||
292 | #define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11) | ||
293 | |||
294 | /* zero107 is 0 mod p */ | ||
295 | static const felem zero107 = { two107m43m11, two107, two107m43p11, two107m43p11 }; | ||
296 | |||
297 | /* An alternative felem_diff for larger inputs |in| | ||
298 | * felem_diff_zero107 subtracts |in| from |out| | ||
299 | * On entry: | ||
300 | * in[i] < 2^106 | ||
301 | * On exit: | ||
302 | * out[i] < out[i] + 2^107 | ||
303 | */ | ||
304 | static void felem_diff_zero107(felem out, const felem in) | ||
305 | { | ||
306 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
307 | out[0] += zero107[0]; | ||
308 | out[1] += zero107[1]; | ||
309 | out[2] += zero107[2]; | ||
310 | out[3] += zero107[3]; | ||
311 | |||
312 | out[0] -= in[0]; | ||
313 | out[1] -= in[1]; | ||
314 | out[2] -= in[2]; | ||
315 | out[3] -= in[3]; | ||
316 | } | ||
317 | |||
318 | /* longfelem_diff subtracts |in| from |out| | ||
319 | * On entry: | ||
320 | * in[i] < 7*2^67 | ||
321 | * On exit: | ||
322 | * out[i] < out[i] + 2^70 + 2^40 | ||
323 | */ | ||
324 | static void longfelem_diff(longfelem out, const longfelem in) | ||
325 | { | ||
326 | static const limb two70m8p6 = (((limb)1) << 70) - (((limb)1) << 8) + (((limb)1) << 6); | ||
327 | static const limb two70p40 = (((limb)1) << 70) + (((limb)1) << 40); | ||
328 | static const limb two70 = (((limb)1) << 70); | ||
329 | static const limb two70m40m38p6 = (((limb)1) << 70) - (((limb)1) << 40) - (((limb)1) << 38) + (((limb)1) << 6); | ||
330 | static const limb two70m6 = (((limb)1) << 70) - (((limb)1) << 6); | ||
331 | |||
332 | /* add 0 mod p to avoid underflow */ | ||
333 | out[0] += two70m8p6; | ||
334 | out[1] += two70p40; | ||
335 | out[2] += two70; | ||
336 | out[3] += two70m40m38p6; | ||
337 | out[4] += two70m6; | ||
338 | out[5] += two70m6; | ||
339 | out[6] += two70m6; | ||
340 | out[7] += two70m6; | ||
341 | |||
342 | /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */ | ||
343 | out[0] -= in[0]; | ||
344 | out[1] -= in[1]; | ||
345 | out[2] -= in[2]; | ||
346 | out[3] -= in[3]; | ||
347 | out[4] -= in[4]; | ||
348 | out[5] -= in[5]; | ||
349 | out[6] -= in[6]; | ||
350 | out[7] -= in[7]; | ||
351 | } | ||
352 | |||
353 | #define two64m0 (((limb)1) << 64) - 1 | ||
354 | #define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1 | ||
355 | #define two64m46 (((limb)1) << 64) - (((limb)1) << 46) | ||
356 | #define two64m32 (((limb)1) << 64) - (((limb)1) << 32) | ||
357 | |||
358 | /* zero110 is 0 mod p */ | ||
359 | static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 }; | ||
360 | |||
361 | /* felem_shrink converts an felem into a smallfelem. The result isn't quite | ||
362 | * minimal as the value may be greater than p. | ||
363 | * | ||
364 | * On entry: | ||
365 | * in[i] < 2^109 | ||
366 | * On exit: | ||
367 | * out[i] < 2^64 | ||
368 | */ | ||
/* felem_shrink is constant-time: the conditional subtraction of kPrime is
 * done with masks derived from arithmetic, never with a data-dependent
 * branch, since the input may be secret. */
static void felem_shrink(smallfelem out, const felem in)
	{
	felem tmp;
	u64 a, b, mask;
	s64 high, low;
	static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */

	/* Carry 2->3 */
	tmp[3] = zero110[3] + in[3] + ((u64) (in[2] >> 64));
	/* tmp[3] < 2^110 */

	tmp[2] = zero110[2] + (u64) in[2];
	tmp[0] = zero110[0] + in[0];
	tmp[1] = zero110[1] + in[1];
	/* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */

	/* We perform two partial reductions where we eliminate the
	 * high-word of tmp[3]. We don't update the other words till the end.
	 */
	a = tmp[3] >> 64; /* a < 2^46 */
	tmp[3] = (u64) tmp[3];
	tmp[3] -= a;
	tmp[3] += ((limb)a) << 32;
	/* tmp[3] < 2^79 */

	b = a;
	a = tmp[3] >> 64; /* a < 2^15 */
	b += a; /* b < 2^46 + 2^15 < 2^47 */
	tmp[3] = (u64) tmp[3];
	tmp[3] -= a;
	tmp[3] += ((limb)a) << 32;
	/* tmp[3] < 2^64 + 2^47 */

	/* This adjusts the other two words to complete the two partial
	 * reductions. */
	tmp[0] += b;
	tmp[1] -= (((limb)b) << 32);

	/* In order to make space in tmp[3] for the carry from 2 -> 3, we
	 * conditionally subtract kPrime if tmp[3] is large enough. */
	high = tmp[3] >> 64;
	/* As tmp[3] < 2^65, high is either 1 or 0 */
	high <<= 63;
	high >>= 63;
	/* high is:
	 *   all ones   if the high word of tmp[3] is 1
	 *   all zeros  if the high word of tmp[3] if 0 */
	low = tmp[3];
	mask = low >> 63;
	/* mask is:
	 *   all ones   if the MSB of low is 1
	 *   all zeros  if the MSB of low if 0 */
	low &= bottom63bits;
	low -= kPrime3Test;
	/* if low was greater than kPrime3Test then the MSB is zero */
	low = ~low;
	low >>= 63;
	/* low is:
	 *   all ones   if low was > kPrime3Test
	 *   all zeros  if low was <= kPrime3Test */
	mask = (mask & low) | high;
	tmp[0] -= mask & kPrime[0];
	tmp[1] -= mask & kPrime[1];
	/* kPrime[2] is zero, so omitted */
	tmp[3] -= mask & kPrime[3];
	/* tmp[3] < 2**64 - 2**32 + 1 */

	/* propagate the carries limb by limb, truncating each to 64 bits */
	tmp[1] += ((u64) (tmp[0] >> 64)); tmp[0] = (u64) tmp[0];
	tmp[2] += ((u64) (tmp[1] >> 64)); tmp[1] = (u64) tmp[1];
	tmp[3] += ((u64) (tmp[2] >> 64)); tmp[2] = (u64) tmp[2];
	/* tmp[i] < 2^64 */

	out[0] = tmp[0];
	out[1] = tmp[1];
	out[2] = tmp[2];
	out[3] = tmp[3];
	}
446 | |||
447 | /* smallfelem_expand converts a smallfelem to an felem */ | ||
448 | static void smallfelem_expand(felem out, const smallfelem in) | ||
449 | { | ||
450 | out[0] = in[0]; | ||
451 | out[1] = in[1]; | ||
452 | out[2] = in[2]; | ||
453 | out[3] = in[3]; | ||
454 | } | ||
455 | |||
456 | /* smallfelem_square sets |out| = |small|^2 | ||
457 | * On entry: | ||
458 | * small[i] < 2^64 | ||
459 | * On exit: | ||
460 | * out[i] < 7 * 2^64 < 2^67 | ||
461 | */ | ||
462 | static void smallfelem_square(longfelem out, const smallfelem small) | ||
463 | { | ||
464 | limb a; | ||
465 | u64 high, low; | ||
466 | |||
467 | a = ((uint128_t) small[0]) * small[0]; | ||
468 | low = a; | ||
469 | high = a >> 64; | ||
470 | out[0] = low; | ||
471 | out[1] = high; | ||
472 | |||
473 | a = ((uint128_t) small[0]) * small[1]; | ||
474 | low = a; | ||
475 | high = a >> 64; | ||
476 | out[1] += low; | ||
477 | out[1] += low; | ||
478 | out[2] = high; | ||
479 | |||
480 | a = ((uint128_t) small[0]) * small[2]; | ||
481 | low = a; | ||
482 | high = a >> 64; | ||
483 | out[2] += low; | ||
484 | out[2] *= 2; | ||
485 | out[3] = high; | ||
486 | |||
487 | a = ((uint128_t) small[0]) * small[3]; | ||
488 | low = a; | ||
489 | high = a >> 64; | ||
490 | out[3] += low; | ||
491 | out[4] = high; | ||
492 | |||
493 | a = ((uint128_t) small[1]) * small[2]; | ||
494 | low = a; | ||
495 | high = a >> 64; | ||
496 | out[3] += low; | ||
497 | out[3] *= 2; | ||
498 | out[4] += high; | ||
499 | |||
500 | a = ((uint128_t) small[1]) * small[1]; | ||
501 | low = a; | ||
502 | high = a >> 64; | ||
503 | out[2] += low; | ||
504 | out[3] += high; | ||
505 | |||
506 | a = ((uint128_t) small[1]) * small[3]; | ||
507 | low = a; | ||
508 | high = a >> 64; | ||
509 | out[4] += low; | ||
510 | out[4] *= 2; | ||
511 | out[5] = high; | ||
512 | |||
513 | a = ((uint128_t) small[2]) * small[3]; | ||
514 | low = a; | ||
515 | high = a >> 64; | ||
516 | out[5] += low; | ||
517 | out[5] *= 2; | ||
518 | out[6] = high; | ||
519 | out[6] += high; | ||
520 | |||
521 | a = ((uint128_t) small[2]) * small[2]; | ||
522 | low = a; | ||
523 | high = a >> 64; | ||
524 | out[4] += low; | ||
525 | out[5] += high; | ||
526 | |||
527 | a = ((uint128_t) small[3]) * small[3]; | ||
528 | low = a; | ||
529 | high = a >> 64; | ||
530 | out[6] += low; | ||
531 | out[7] = high; | ||
532 | } | ||
533 | |||
534 | /* felem_square sets |out| = |in|^2 | ||
535 | * On entry: | ||
536 | * in[i] < 2^109 | ||
537 | * On exit: | ||
538 | * out[i] < 7 * 2^64 < 2^67 | ||
539 | */ | ||
540 | static void felem_square(longfelem out, const felem in) | ||
541 | { | ||
542 | u64 small[4]; | ||
543 | felem_shrink(small, in); | ||
544 | smallfelem_square(out, small); | ||
545 | } | ||
546 | |||
547 | /* smallfelem_mul sets |out| = |small1| * |small2| | ||
548 | * On entry: | ||
549 | * small1[i] < 2^64 | ||
550 | * small2[i] < 2^64 | ||
551 | * On exit: | ||
552 | * out[i] < 7 * 2^64 < 2^67 | ||
553 | */ | ||
554 | static void smallfelem_mul(longfelem out, const smallfelem small1, const smallfelem small2) | ||
555 | { | ||
556 | limb a; | ||
557 | u64 high, low; | ||
558 | |||
559 | a = ((uint128_t) small1[0]) * small2[0]; | ||
560 | low = a; | ||
561 | high = a >> 64; | ||
562 | out[0] = low; | ||
563 | out[1] = high; | ||
564 | |||
565 | |||
566 | a = ((uint128_t) small1[0]) * small2[1]; | ||
567 | low = a; | ||
568 | high = a >> 64; | ||
569 | out[1] += low; | ||
570 | out[2] = high; | ||
571 | |||
572 | a = ((uint128_t) small1[1]) * small2[0]; | ||
573 | low = a; | ||
574 | high = a >> 64; | ||
575 | out[1] += low; | ||
576 | out[2] += high; | ||
577 | |||
578 | |||
579 | a = ((uint128_t) small1[0]) * small2[2]; | ||
580 | low = a; | ||
581 | high = a >> 64; | ||
582 | out[2] += low; | ||
583 | out[3] = high; | ||
584 | |||
585 | a = ((uint128_t) small1[1]) * small2[1]; | ||
586 | low = a; | ||
587 | high = a >> 64; | ||
588 | out[2] += low; | ||
589 | out[3] += high; | ||
590 | |||
591 | a = ((uint128_t) small1[2]) * small2[0]; | ||
592 | low = a; | ||
593 | high = a >> 64; | ||
594 | out[2] += low; | ||
595 | out[3] += high; | ||
596 | |||
597 | |||
598 | a = ((uint128_t) small1[0]) * small2[3]; | ||
599 | low = a; | ||
600 | high = a >> 64; | ||
601 | out[3] += low; | ||
602 | out[4] = high; | ||
603 | |||
604 | a = ((uint128_t) small1[1]) * small2[2]; | ||
605 | low = a; | ||
606 | high = a >> 64; | ||
607 | out[3] += low; | ||
608 | out[4] += high; | ||
609 | |||
610 | a = ((uint128_t) small1[2]) * small2[1]; | ||
611 | low = a; | ||
612 | high = a >> 64; | ||
613 | out[3] += low; | ||
614 | out[4] += high; | ||
615 | |||
616 | a = ((uint128_t) small1[3]) * small2[0]; | ||
617 | low = a; | ||
618 | high = a >> 64; | ||
619 | out[3] += low; | ||
620 | out[4] += high; | ||
621 | |||
622 | |||
623 | a = ((uint128_t) small1[1]) * small2[3]; | ||
624 | low = a; | ||
625 | high = a >> 64; | ||
626 | out[4] += low; | ||
627 | out[5] = high; | ||
628 | |||
629 | a = ((uint128_t) small1[2]) * small2[2]; | ||
630 | low = a; | ||
631 | high = a >> 64; | ||
632 | out[4] += low; | ||
633 | out[5] += high; | ||
634 | |||
635 | a = ((uint128_t) small1[3]) * small2[1]; | ||
636 | low = a; | ||
637 | high = a >> 64; | ||
638 | out[4] += low; | ||
639 | out[5] += high; | ||
640 | |||
641 | |||
642 | a = ((uint128_t) small1[2]) * small2[3]; | ||
643 | low = a; | ||
644 | high = a >> 64; | ||
645 | out[5] += low; | ||
646 | out[6] = high; | ||
647 | |||
648 | a = ((uint128_t) small1[3]) * small2[2]; | ||
649 | low = a; | ||
650 | high = a >> 64; | ||
651 | out[5] += low; | ||
652 | out[6] += high; | ||
653 | |||
654 | |||
655 | a = ((uint128_t) small1[3]) * small2[3]; | ||
656 | low = a; | ||
657 | high = a >> 64; | ||
658 | out[6] += low; | ||
659 | out[7] = high; | ||
660 | } | ||
661 | |||
662 | /* felem_mul sets |out| = |in1| * |in2| | ||
663 | * On entry: | ||
664 | * in1[i] < 2^109 | ||
665 | * in2[i] < 2^109 | ||
666 | * On exit: | ||
667 | * out[i] < 7 * 2^64 < 2^67 | ||
668 | */ | ||
669 | static void felem_mul(longfelem out, const felem in1, const felem in2) | ||
670 | { | ||
671 | smallfelem small1, small2; | ||
672 | felem_shrink(small1, in1); | ||
673 | felem_shrink(small2, in2); | ||
674 | smallfelem_mul(out, small1, small2); | ||
675 | } | ||
676 | |||
677 | /* felem_small_mul sets |out| = |small1| * |in2| | ||
678 | * On entry: | ||
679 | * small1[i] < 2^64 | ||
680 | * in2[i] < 2^109 | ||
681 | * On exit: | ||
682 | * out[i] < 7 * 2^64 < 2^67 | ||
683 | */ | ||
684 | static void felem_small_mul(longfelem out, const smallfelem small1, const felem in2) | ||
685 | { | ||
686 | smallfelem small2; | ||
687 | felem_shrink(small2, in2); | ||
688 | smallfelem_mul(out, small1, small2); | ||
689 | } | ||
690 | |||
691 | #define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4) | ||
692 | #define two100 (((limb)1) << 100) | ||
693 | #define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4) | ||
694 | /* zero100 is 0 mod p */ | ||
695 | static const felem zero100 = { two100m36m4, two100, two100m36p4, two100m36p4 }; | ||
696 | |||
697 | /* Internal function for the different flavours of felem_reduce. | ||
698 | * felem_reduce_ reduces the higher coefficients in[4]-in[7]. | ||
699 | * On entry: | ||
700 | * out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7] | ||
701 | * out[1] >= in[7] + 2^32*in[4] | ||
702 | * out[2] >= in[5] + 2^32*in[5] | ||
703 | * out[3] >= in[4] + 2^32*in[5] + 2^32*in[6] | ||
704 | * On exit: | ||
705 | * out[0] <= out[0] + in[4] + 2^32*in[5] | ||
706 | * out[1] <= out[1] + in[5] + 2^33*in[6] | ||
707 | * out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7] | ||
708 | * out[3] <= out[3] + 2^32*in[4] + 3*in[7] | ||
709 | */ | ||
/* Since p = 2^256 - 2^224 + 2^192 + 2^96 - 1, we have
 * 2^256 = 2^224 - 2^192 - 2^96 + 1 (mod p), and each higher input word
 * in[4]..in[7] folds back into the four low words according to that
 * identity; the bracketed lists below give the (bit position, sign)
 * contributions of 2^256, 2^320, 2^384 and 2^448 respectively. */
static void felem_reduce_(felem out, const longfelem in)
	{
	int128_t c;
	/* combine common terms from below */
	c = in[4] + (in[5] << 32);
	out[0] += c;
	out[3] -= c;

	c = in[5] - in[7];
	out[1] += c;
	out[2] -= c;

	/* the remaining terms */
	/* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
	out[1] -= (in[4] << 32);
	out[3] += (in[4] << 32);

	/* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
	out[2] -= (in[5] << 32);

	/* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
	out[0] -= in[6];
	out[0] -= (in[6] << 32);
	out[1] += (in[6] << 33);
	out[2] += (in[6] * 2);
	out[3] -= (in[6] << 32);

	/* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
	out[0] -= in[7];
	out[0] -= (in[7] << 32);
	out[2] += (in[7] << 33);
	out[3] += (in[7] * 3);
	}
743 | |||
/* felem_reduce converts a longfelem into an felem.
 * To be called directly after felem_square or felem_mul.
 * On entry:
 *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2^64
 * On exit:
 *   out[i] < 2^101
 */
static void felem_reduce(felem out, const longfelem in)
	{
	/* Add zero100 (0 mod p) so the signed adjustments made inside
	 * felem_reduce_ cannot drive any coefficient negative. */
	out[0] = zero100[0] + in[0];
	out[1] = zero100[1] + in[1];
	out[2] = zero100[2] + in[2];
	out[3] = zero100[3] + in[3];

	felem_reduce_(out, in);

	/* out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
	 * out[1] > 2^100 - 2^64 - 7*2^96 > 0
	 * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
	 * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
	 *
	 * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
	 * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
	 * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
	 * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
	 */
	}
772 | |||
/* felem_reduce_zero105 converts a larger longfelem into an felem.
 * Same structure as felem_reduce, but offsets by zero105 (defined earlier
 * in this file; 0 mod p) to absorb the wider input range.
 * On entry:
 *   in[0] < 2^71
 * On exit:
 *   out[i] < 2^106
 */
static void felem_reduce_zero105(felem out, const longfelem in)
	{
	/* Add a multiple of p so felem_reduce_'s subtractions stay
	 * non-negative. */
	out[0] = zero105[0] + in[0];
	out[1] = zero105[1] + in[1];
	out[2] = zero105[2] + in[2];
	out[3] = zero105[3] + in[3];

	felem_reduce_(out, in);

	/* out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
	 * out[1] > 2^105 - 2^71 - 2^103 > 0
	 * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
	 * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
	 *
	 * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
	 * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
	 * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
	 * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
	 */
	}
799 | |||
800 | /* subtract_u64 sets *result = *result - v and *carry to one if the subtraction | ||
801 | * underflowed. */ | ||
802 | static void subtract_u64(u64* result, u64* carry, u64 v) | ||
803 | { | ||
804 | uint128_t r = *result; | ||
805 | r -= v; | ||
806 | *carry = (r >> 64) & 1; | ||
807 | *result = (u64) r; | ||
808 | } | ||
809 | |||
/* felem_contract converts |in| to its unique, minimal representation
 * (the canonical value in [0, p)), in constant time.
 * On entry:
 *   in[i] < 2^109
 */
static void felem_contract(smallfelem out, const felem in)
	{
	unsigned i;
	u64 all_equal_so_far = 0, result = 0, carry;

	felem_shrink(out, in);
	/* small is minimal except that the value might be > p */

	all_equal_so_far--;	/* unsigned wrap: start as all-ones mask */
	/* We are doing a constant time test if out >= kPrime. We need to
	 * compare each u64, from most-significant to least significant. For
	 * each one, if all words so far have been equal (m is all ones) then a
	 * non-equal result is the answer. Otherwise we continue. */
	for (i = 3; i < 4; i--)	/* i is unsigned: terminates when i wraps below 0 */
		{
		u64 equal;
		uint128_t a = ((uint128_t) kPrime[i]) - out[i];
		/* if out[i] > kPrime[i] then a will underflow and the high
		 * 64-bits will all be set. */
		result |= all_equal_so_far & ((u64) (a >> 64));

		/* if kPrime[i] == out[i] then |equal| will be all zeros and
		 * the decrement will make it all ones. */
		equal = kPrime[i] ^ out[i];
		equal--;
		/* Fold so bit 63 survives only if the words were equal,
		 * then arithmetic-shift to smear it into a full mask. */
		equal &= equal << 32;
		equal &= equal << 16;
		equal &= equal << 8;
		equal &= equal << 4;
		equal &= equal << 2;
		equal &= equal << 1;
		equal = ((s64) equal) >> 63;

		all_equal_so_far &= equal;
		}

	/* if all_equal_so_far is still all ones then the two values are equal
	 * and so out >= kPrime is true. */
	result |= all_equal_so_far;

	/* if out >= kPrime then we subtract kPrime; the masked operand is 0
	 * otherwise, so this runs unconditionally with borrow propagation. */
	subtract_u64(&out[0], &carry, result & kPrime[0]);
	subtract_u64(&out[1], &carry, carry);
	subtract_u64(&out[2], &carry, carry);
	subtract_u64(&out[3], &carry, carry);

	subtract_u64(&out[1], &carry, result & kPrime[1]);
	subtract_u64(&out[2], &carry, carry);
	subtract_u64(&out[3], &carry, carry);

	subtract_u64(&out[2], &carry, result & kPrime[2]);
	subtract_u64(&out[3], &carry, carry);

	subtract_u64(&out[3], &carry, result & kPrime[3]);
	}
869 | |||
870 | static void smallfelem_square_contract(smallfelem out, const smallfelem in) | ||
871 | { | ||
872 | longfelem longtmp; | ||
873 | felem tmp; | ||
874 | |||
875 | smallfelem_square(longtmp, in); | ||
876 | felem_reduce(tmp, longtmp); | ||
877 | felem_contract(out, tmp); | ||
878 | } | ||
879 | |||
880 | static void smallfelem_mul_contract(smallfelem out, const smallfelem in1, const smallfelem in2) | ||
881 | { | ||
882 | longfelem longtmp; | ||
883 | felem tmp; | ||
884 | |||
885 | smallfelem_mul(longtmp, in1, in2); | ||
886 | felem_reduce(tmp, longtmp); | ||
887 | felem_contract(out, tmp); | ||
888 | } | ||
889 | |||
/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 * otherwise, in constant time. A minimal-but-unreduced input equal to p is
 * also recognised as zero.
 * On entry:
 *   small[i] < 2^64
 */
static limb smallfelem_is_zero(const smallfelem small)
	{
	limb result;
	u64 is_p;

	u64 is_zero = small[0] | small[1] | small[2] | small[3];
	/* Standard constant-time trick: decrementing 0 gives all-ones, then
	 * the shift-AND cascade leaves bit 63 set only when the original OR
	 * was zero, and the arithmetic shift smears it into a full mask. */
	is_zero--;
	is_zero &= is_zero << 32;
	is_zero &= is_zero << 16;
	is_zero &= is_zero << 8;
	is_zero &= is_zero << 4;
	is_zero &= is_zero << 2;
	is_zero &= is_zero << 1;
	is_zero = ((s64) is_zero) >> 63;

	/* Same mask construction, but testing equality with kPrime. */
	is_p = (small[0] ^ kPrime[0]) |
	    (small[1] ^ kPrime[1]) |
	    (small[2] ^ kPrime[2]) |
	    (small[3] ^ kPrime[3]);
	is_p--;
	is_p &= is_p << 32;
	is_p &= is_p << 16;
	is_p &= is_p << 8;
	is_p &= is_p << 4;
	is_p &= is_p << 2;
	is_p &= is_p << 1;
	is_p = ((s64) is_p) >> 63;

	is_zero |= is_p;

	/* Widen the 64-bit mask into both halves of the 128-bit limb. */
	result = is_zero;
	result |= ((limb) is_zero) << 64;
	return result;
	}
929 | |||
930 | static int smallfelem_is_zero_int(const smallfelem small) | ||
931 | { | ||
932 | return (int) (smallfelem_is_zero(small) & ((limb)1)); | ||
933 | } | ||
934 | |||
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * Implemented as a fixed addition chain of squarings and multiplications;
 * each trailing comment tracks the exponent accumulated so far, and each
 * e_I snapshot holds |in|^{2^I - 1} for reuse later in the chain. */
static void felem_inv(felem out, const felem in)
	{
	felem ftmp, ftmp2;
	/* each e_I will hold |in|^{2^I - 1} */
	felem e2, e4, e8, e16, e32, e64;
	longfelem tmp;
	unsigned i;

	felem_square(tmp, in); felem_reduce(ftmp, tmp);			/* 2^1 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);		/* 2^2 - 2^0 */
	felem_assign(e2, ftmp);
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^3 - 2^1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^4 - 2^2 */
	felem_mul(tmp, ftmp, e2); felem_reduce(ftmp, tmp);		/* 2^4 - 2^0 */
	felem_assign(e4, ftmp);
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^5 - 2^1 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^6 - 2^2 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^7 - 2^3 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^8 - 2^4 */
	felem_mul(tmp, ftmp, e4); felem_reduce(ftmp, tmp);		/* 2^8 - 2^0 */
	felem_assign(e8, ftmp);
	for (i = 0; i < 8; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^16 - 2^8 */
	felem_mul(tmp, ftmp, e8); felem_reduce(ftmp, tmp);		/* 2^16 - 2^0 */
	felem_assign(e16, ftmp);
	for (i = 0; i < 16; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^32 - 2^16 */
	felem_mul(tmp, ftmp, e16); felem_reduce(ftmp, tmp);		/* 2^32 - 2^0 */
	felem_assign(e32, ftmp);
	for (i = 0; i < 32; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^64 - 2^32 */
	felem_assign(e64, ftmp);
	felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp);		/* 2^64 - 2^32 + 2^0 */
	for (i = 0; i < 192; i++) {
		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
	}								/* 2^256 - 2^224 + 2^192 */

	/* Build the low part of the exponent in ftmp2 from the snapshots. */
	felem_mul(tmp, e64, e32); felem_reduce(ftmp2, tmp);		/* 2^64 - 2^0 */
	for (i = 0; i < 16; i++) {
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}								/* 2^80 - 2^16 */
	felem_mul(tmp, ftmp2, e16); felem_reduce(ftmp2, tmp);		/* 2^80 - 2^0 */
	for (i = 0; i < 8; i++) {
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}								/* 2^88 - 2^8 */
	felem_mul(tmp, ftmp2, e8); felem_reduce(ftmp2, tmp);		/* 2^88 - 2^0 */
	for (i = 0; i < 4; i++) {
		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
	}								/* 2^92 - 2^4 */
	felem_mul(tmp, ftmp2, e4); felem_reduce(ftmp2, tmp);		/* 2^92 - 2^0 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^93 - 2^1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^94 - 2^2 */
	felem_mul(tmp, ftmp2, e2); felem_reduce(ftmp2, tmp);		/* 2^94 - 2^0 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^95 - 2^1 */
	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^96 - 2^2 */
	felem_mul(tmp, ftmp2, in); felem_reduce(ftmp2, tmp);		/* 2^96 - 3 */

	felem_mul(tmp, ftmp2, ftmp); felem_reduce(out, tmp);		/* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
	}
1004 | |||
1005 | static void smallfelem_inv_contract(smallfelem out, const smallfelem in) | ||
1006 | { | ||
1007 | felem tmp; | ||
1008 | |||
1009 | smallfelem_expand(tmp, in); | ||
1010 | felem_inv(tmp, tmp); | ||
1011 | felem_contract(out, tmp); | ||
1012 | } | ||
1013 | |||
1014 | /* Group operations | ||
1015 | * ---------------- | ||
1016 | * | ||
1017 | * Building on top of the field operations we have the operations on the | ||
1018 | * elliptic curve group itself. Points on the curve are represented in Jacobian | ||
1019 | * coordinates */ | ||
1020 | |||
/* point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
 * while x_out == y_in is not (maybe this works, but it's not tested).
 *
 * The trailing bound comments track the maximum coefficient magnitude at
 * each step, justifying when reductions/shrinks are needed. */
static void
point_double(felem x_out, felem y_out, felem z_out,
	const felem x_in, const felem y_in, const felem z_in)
	{
	longfelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;
	smallfelem small1, small2;

	felem_assign(ftmp, x_in);
	/* ftmp[i] < 2^106 */
	felem_assign(ftmp2, x_in);
	/* ftmp2[i] < 2^106 */

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);
	/* delta[i] < 2^101 */

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);
	/* gamma[i] < 2^101 */
	felem_shrink(small1, gamma);

	/* beta = x*gamma */
	felem_small_mul(tmp, small1, x_in);
	felem_reduce(beta, tmp);
	/* beta[i] < 2^101 */

	/* alpha = 3*(x-delta)*(x+delta) */
	felem_diff(ftmp, delta);
	/* ftmp[i] < 2^105 + 2^106 < 2^107 */
	felem_sum(ftmp2, delta);
	/* ftmp2[i] < 2^105 + 2^106 < 2^107 */
	felem_scalar(ftmp2, 3);
	/* ftmp2[i] < 3 * 2^107 < 2^109 */
	felem_mul(tmp, ftmp, ftmp2);
	felem_reduce(alpha, tmp);
	/* alpha[i] < 2^101 */
	felem_shrink(small2, alpha);

	/* x' = alpha^2 - 8*beta */
	smallfelem_square(tmp, small2);
	felem_reduce(x_out, tmp);
	felem_assign(ftmp, beta);
	felem_scalar(ftmp, 8);
	/* ftmp[i] < 8 * 2^101 = 2^104 */
	felem_diff(x_out, ftmp);
	/* x_out[i] < 2^105 + 2^101 < 2^106 */

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum(delta, gamma);
	/* delta[i] < 2^101 + 2^101 = 2^102 */
	felem_assign(ftmp, y_in);
	felem_sum(ftmp, z_in);
	/* ftmp[i] < 2^106 + 2^106 = 2^107 */
	felem_square(tmp, ftmp);
	felem_reduce(z_out, tmp);
	felem_diff(z_out, delta);
	/* z_out[i] < 2^105 + 2^101 < 2^106 */

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar(beta, 4);
	/* beta[i] < 4 * 2^101 = 2^103 */
	felem_diff_zero107(beta, x_out);
	/* beta[i] < 2^107 + 2^103 < 2^108 */
	felem_small_mul(tmp, small2, beta);
	/* tmp[i] < 7 * 2^64 < 2^67 */
	smallfelem_square(tmp2, small1);
	/* tmp2[i] < 7 * 2^64 */
	longfelem_scalar(tmp2, 8);
	/* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
	longfelem_diff(tmp, tmp2);
	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
	felem_reduce_zero105(y_out, tmp);
	/* y_out[i] < 2^106 */
	}
1105 | |||
1106 | /* point_double_small is the same as point_double, except that it operates on | ||
1107 | * smallfelems */ | ||
1108 | static void | ||
1109 | point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out, | ||
1110 | const smallfelem x_in, const smallfelem y_in, const smallfelem z_in) | ||
1111 | { | ||
1112 | felem felem_x_out, felem_y_out, felem_z_out; | ||
1113 | felem felem_x_in, felem_y_in, felem_z_in; | ||
1114 | |||
1115 | smallfelem_expand(felem_x_in, x_in); | ||
1116 | smallfelem_expand(felem_y_in, y_in); | ||
1117 | smallfelem_expand(felem_z_in, z_in); | ||
1118 | point_double(felem_x_out, felem_y_out, felem_z_out, | ||
1119 | felem_x_in, felem_y_in, felem_z_in); | ||
1120 | felem_shrink(x_out, felem_x_out); | ||
1121 | felem_shrink(y_out, felem_y_out); | ||
1122 | felem_shrink(z_out, felem_z_out); | ||
1123 | } | ||
1124 | |||
1125 | /* copy_conditional copies in to out iff mask is all ones. */ | ||
1126 | static void | ||
1127 | copy_conditional(felem out, const felem in, limb mask) | ||
1128 | { | ||
1129 | unsigned i; | ||
1130 | for (i = 0; i < NLIMBS; ++i) | ||
1131 | { | ||
1132 | const limb tmp = mask & (in[i] ^ out[i]); | ||
1133 | out[i] ^= tmp; | ||
1134 | } | ||
1135 | } | ||
1136 | |||
/* copy_small_conditional copies in to out iff mask is all ones.
 * Constant time. Note the mixed widths: |in| limbs are 64-bit, so the
 * mask is truncated to 64 bits (mask64) before masking them, while the
 * 128-bit ~mask keeps or clears the full-width |out| limbs. This relies
 * on mask being all-ones or all-zero. */
static void
copy_small_conditional(felem out, const smallfelem in, limb mask)
	{
	unsigned i;
	const u64 mask64 = mask;
	for (i = 0; i < NLIMBS; ++i)
		{
		out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
		}
	}
1148 | |||
/* point_add calculates (x1, y1, z1) + (x2, y2, z2)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
 *
 * This function includes a branch for checking whether the two input points
 * are equal, (while not equal to the point at infinity). This case never
 * happens during single point multiplication, so there is no timing leak for
 * ECDH or ECDSA signing. */
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const smallfelem x2, const smallfelem y2, const smallfelem z2)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	longfelem tmp, tmp2;
	smallfelem small1, small2, small3, small4, small5;
	limb x_equal, y_equal, z1_is_zero, z2_is_zero;

	felem_shrink(small3, z1);

	z1_is_zero = smallfelem_is_zero(small3);
	z2_is_zero = smallfelem_is_zero(z2);

	/* ftmp = z1z1 = z1**2 */
	smallfelem_square(tmp, small3);
	felem_reduce(ftmp, tmp);
	/* ftmp[i] < 2^101 */
	felem_shrink(small1, ftmp);

	if(!mixed)
		{
		/* General case: z2 is an arbitrary field element. */

		/* ftmp2 = z2z2 = z2**2 */
		smallfelem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);
		/* ftmp2[i] < 2^101 */
		felem_shrink(small2, ftmp2);

		felem_shrink(small5, x1);

		/* u1 = ftmp3 = x1*z2z2 */
		smallfelem_mul(tmp, small5, small2);
		felem_reduce(ftmp3, tmp);
		/* ftmp3[i] < 2^101 */

		/* ftmp5 = z1 + z2 */
		felem_assign(ftmp5, z1);
		felem_small_sum(ftmp5, z2);
		/* ftmp5[i] < 2^107 */

		/* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
		felem_square(tmp, ftmp5);
		felem_reduce(ftmp5, tmp);
		/* ftmp2 = z2z2 + z1z1 */
		felem_sum(ftmp2, ftmp);
		/* ftmp2[i] < 2^101 + 2^101 = 2^102 */
		felem_diff(ftmp5, ftmp2);
		/* ftmp5[i] < 2^105 + 2^101 < 2^106 */

		/* ftmp2 = z2 * z2z2 */
		smallfelem_mul(tmp, small2, z2);
		felem_reduce(ftmp2, tmp);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_mul(tmp, y1, ftmp2);
		felem_reduce(ftmp6, tmp);
		/* ftmp6[i] < 2^101 */
		}
	else
		{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* u1 = ftmp3 = x1*z2z2 */
		felem_assign(ftmp3, x1);
		/* ftmp3[i] < 2^106 */

		/* ftmp5 = 2z1z2 */
		felem_assign(ftmp5, z1);
		felem_scalar(ftmp5, 2);
		/* ftmp5[i] < 2*2^106 = 2^107 */

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_assign(ftmp6, y1);
		/* ftmp6[i] < 2^106 */
		}

	/* u2 = x2*z1z1 */
	smallfelem_mul(tmp, x2, small1);
	felem_reduce(ftmp4, tmp);

	/* h = ftmp4 = u2 - u1 */
	felem_diff_zero107(ftmp4, ftmp3);
	/* ftmp4[i] < 2^107 + 2^101 < 2^108 */
	felem_shrink(small4, ftmp4);

	x_equal = smallfelem_is_zero(small4);

	/* z_out = ftmp5 * h */
	felem_small_mul(tmp, small4, ftmp5);
	felem_reduce(z_out, tmp);
	/* z_out[i] < 2^101 */

	/* ftmp = z1 * z1z1 */
	smallfelem_mul(tmp, small1, small3);
	felem_reduce(ftmp, tmp);

	/* s2 = ftmp5 = y2 * z1**3 */
	felem_small_mul(tmp, y2, ftmp);
	felem_reduce(ftmp5, tmp);

	/* r = ftmp5 = (s2 - s1)*2 */
	felem_diff_zero107(ftmp5, ftmp6);
	/* ftmp5[i] < 2^107 + 2^107 = 2^108*/
	felem_scalar(ftmp5, 2);
	/* ftmp5[i] < 2^109 */
	felem_shrink(small1, ftmp5);
	y_equal = smallfelem_is_zero(small1);

	/* Same point (and neither is infinity): the addition formula
	 * degenerates, so fall back to doubling. */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
		}

	/* I = ftmp = (2h)**2 */
	felem_assign(ftmp, ftmp4);
	felem_scalar(ftmp, 2);
	/* ftmp[i] < 2*2^108 = 2^109 */
	felem_square(tmp, ftmp);
	felem_reduce(ftmp, tmp);

	/* J = ftmp2 = h * I */
	felem_mul(tmp, ftmp4, ftmp);
	felem_reduce(ftmp2, tmp);

	/* V = ftmp4 = U1 * I */
	felem_mul(tmp, ftmp3, ftmp);
	felem_reduce(ftmp4, tmp);

	/* x_out = r**2 - J - 2V */
	smallfelem_square(tmp, small1);
	felem_reduce(x_out, tmp);
	felem_assign(ftmp3, ftmp4);
	felem_scalar(ftmp4, 2);
	felem_sum(ftmp4, ftmp2);
	/* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
	felem_diff(x_out, ftmp4);
	/* x_out[i] < 2^105 + 2^101 */

	/* y_out = r(V-x_out) - 2 * s1 * J */
	felem_diff_zero107(ftmp3, x_out);
	/* ftmp3[i] < 2^107 + 2^101 < 2^108 */
	felem_small_mul(tmp, small1, ftmp3);
	felem_mul(tmp2, ftmp6, ftmp2);
	longfelem_scalar(tmp2, 2);
	/* tmp2[i] < 2*2^67 = 2^68 */
	longfelem_diff(tmp, tmp2);
	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
	felem_reduce_zero105(y_out, tmp);
	/* y_out[i] < 2^106 */

	/* Handle the point-at-infinity inputs with constant-time selects:
	 * if z1 == 0 the result is (x2, y2, z2); if z2 == 0 it is
	 * (x1, y1, z1). */
	copy_small_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_small_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_small_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
	}
1320 | |||
1321 | /* point_add_small is the same as point_add, except that it operates on | ||
1322 | * smallfelems */ | ||
1323 | static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3, | ||
1324 | smallfelem x1, smallfelem y1, smallfelem z1, | ||
1325 | smallfelem x2, smallfelem y2, smallfelem z2) | ||
1326 | { | ||
1327 | felem felem_x3, felem_y3, felem_z3; | ||
1328 | felem felem_x1, felem_y1, felem_z1; | ||
1329 | smallfelem_expand(felem_x1, x1); | ||
1330 | smallfelem_expand(felem_y1, y1); | ||
1331 | smallfelem_expand(felem_z1, z1); | ||
1332 | point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0, x2, y2, z2); | ||
1333 | felem_shrink(x3, felem_x3); | ||
1334 | felem_shrink(y3, felem_y3); | ||
1335 | felem_shrink(z3, felem_z3); | ||
1336 | } | ||
1337 | |||
1338 | /* Base point pre computation | ||
1339 | * -------------------------- | ||
1340 | * | ||
1341 | * Two different sorts of precomputed tables are used in the following code. | ||
1342 | * Each contain various points on the curve, where each point is three field | ||
1343 | * elements (x, y, z). | ||
1344 | * | ||
1345 | * For the base point table, z is usually 1 (0 for the point at infinity). | ||
1346 | * This table has 2 * 16 elements, starting with the following: | ||
1347 | * index | bits | point | ||
1348 | * ------+---------+------------------------------ | ||
1349 | * 0 | 0 0 0 0 | 0G | ||
1350 | * 1 | 0 0 0 1 | 1G | ||
1351 | * 2 | 0 0 1 0 | 2^64G | ||
1352 | * 3 | 0 0 1 1 | (2^64 + 1)G | ||
1353 | * 4 | 0 1 0 0 | 2^128G | ||
1354 | * 5 | 0 1 0 1 | (2^128 + 1)G | ||
1355 | * 6 | 0 1 1 0 | (2^128 + 2^64)G | ||
1356 | * 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G | ||
1357 | * 8 | 1 0 0 0 | 2^192G | ||
1358 | * 9 | 1 0 0 1 | (2^192 + 1)G | ||
1359 | * 10 | 1 0 1 0 | (2^192 + 2^64)G | ||
1360 | * 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G | ||
1361 | * 12 | 1 1 0 0 | (2^192 + 2^128)G | ||
1362 | * 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G | ||
1363 | * 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G | ||
1364 | * 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G | ||
1365 | * followed by a copy of this with each element multiplied by 2^32. | ||
1366 | * | ||
1367 | * The reason for this is so that we can clock bits into four different | ||
1368 | * locations when doing simple scalar multiplies against the base point, | ||
1369 | * and then another four locations using the second 16 elements. | ||
1370 | * | ||
1371 | * Tables for other points have table[i] = iG for i in 0 .. 16. */ | ||
1372 | |||
/* gmul is the table of precomputed base points */
/* Layout: gmul[window][index][coordinate]. Each point is (x, y, z) as four
 * 64-bit little-endian limbs; z is {1,0,0,0} for affine entries and all-zero
 * for the point at infinity (index 0). See the table description above. */
static const smallfelem gmul[2][16][3] =
{{{{0, 0, 0, 0},
   {0, 0, 0, 0},
   {0, 0, 0, 0}},
  {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2, 0x6b17d1f2e12c4247},
   {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16, 0x4fe342e2fe1a7f9b},
   {1, 0, 0, 0}},
  {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de, 0x0fa822bc2811aaa5},
   {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b, 0xbff44ae8f5dba80d},
   {1, 0, 0, 0}},
  {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789, 0x300a4bbc89d6726f},
   {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f, 0x72aac7e0d09b4644},
   {1, 0, 0, 0}},
  {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e, 0x447d739beedb5e67},
   {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7, 0x2d4825ab834131ee},
   {1, 0, 0, 0}},
  {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60, 0xef9519328a9c72ff},
   {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c, 0x611e9fc37dbb2c9b},
   {1, 0, 0, 0}},
  {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf, 0x550663797b51f5d8},
   {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5, 0x157164848aecb851},
   {1, 0, 0, 0}},
  {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391, 0xeb5d7745b21141ea},
   {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee, 0xeafd72ebdbecc17b},
   {1, 0, 0, 0}},
  {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5, 0xa6d39677a7849276},
   {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf, 0x674f84749b0b8816},
   {1, 0, 0, 0}},
  {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb, 0x4e769e7672c9ddad},
   {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281, 0x42b99082de830663},
   {1, 0, 0, 0}},
  {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478, 0x78878ef61c6ce04d},
   {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def, 0xb6cb3f5d7b72c321},
   {1, 0, 0, 0}},
  {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae, 0x0c88bc4d716b1287},
   {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa, 0xdd5ddea3f3901dc6},
   {1, 0, 0, 0}},
  {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3, 0x68f344af6b317466},
   {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3, 0x31b9c405f8540a20},
   {1, 0, 0, 0}},
  {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0, 0x4052bf4b6f461db9},
   {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8, 0xfecf4d5190b0fc61},
   {1, 0, 0, 0}},
  {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a, 0x1eddbae2c802e41a},
   {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0, 0x43104d86560ebcfc},
   {1, 0, 0, 0}},
  {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a, 0xb48e26b484f7a21c},
   {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668, 0xfac015404d4d3dab},
   {1, 0, 0, 0}}},
 {{{0, 0, 0, 0},
   {0, 0, 0, 0},
   {0, 0, 0, 0}},
  {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da, 0x7fe36b40af22af89},
   {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1, 0xe697d45825b63624},
   {1, 0, 0, 0}},
  {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902, 0x4a5b506612a677a6},
   {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40, 0xeb13461ceac089f1},
   {1, 0, 0, 0}},
  {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857, 0x0781b8291c6a220a},
   {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434, 0x690cde8df0151593},
   {1, 0, 0, 0}},
  {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326, 0x8a535f566ec73617},
   {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf, 0x0455c08468b08bd7},
   {1, 0, 0, 0}},
  {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279, 0x06bada7ab77f8276},
   {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70, 0x5b476dfd0e6cb18a},
   {1, 0, 0, 0}},
  {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8, 0x3e29864e8a2ec908},
   {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed, 0x239b90ea3dc31e7e},
   {1, 0, 0, 0}},
  {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4, 0x820f4dd949f72ff7},
   {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3, 0x140406ec783a05ec},
   {1, 0, 0, 0}},
  {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe, 0x68f6b8542783dfee},
   {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028, 0xcbe1feba92e40ce6},
   {1, 0, 0, 0}},
  {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927, 0xd0b2f94d2f420109},
   {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a, 0x971459828b0719e5},
   {1, 0, 0, 0}},
  {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687, 0x961610004a866aba},
   {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c, 0x7acb9fadcee75e44},
   {1, 0, 0, 0}},
  {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea, 0x24eb9acca333bf5b},
   {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d, 0x69f891c5acd079cc},
   {1, 0, 0, 0}},
  {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514, 0xe51f547c5972a107},
   {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06, 0x1c309a2b25bb1387},
   {1, 0, 0, 0}},
  {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828, 0x20b87b8aa2c4e503},
   {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044, 0xf5c6fa49919776be},
   {1, 0, 0, 0}},
  {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56, 0x1ed7d1b9332010b9},
   {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24, 0x3a2b03f03217257a},
   {1, 0, 0, 0}},
  {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b, 0x15fee545c78dd9f6},
   {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb, 0x4ab5b6b2b8753f81},
   {1, 0, 0, 0}}}};
1471 | |||
/* select_point selects the |idx|th point from a precomputation table and
 * copies it to out.
 *
 * The copy is performed in constant time: every one of the |size| table
 * entries is read and combined under a mask, so the memory-access pattern
 * does not depend on the secret index |idx|. */
static void select_point(const u64 idx, unsigned int size, const smallfelem pre_comp[16][3], smallfelem out[3])
	{
	unsigned i, j;
	u64 *outlimbs = &out[0][0];
	/* start from the all-zero limb vector; exactly one entry is OR-ed in */
	memset(outlimbs, 0, 3 * sizeof(smallfelem));

	for (i = 0; i < size; i++)
		{
		const u64 *inlimbs = (u64*) &pre_comp[i][0][0];
		u64 mask = i ^ idx;
		/* collapse any non-zero difference in the low bits into bit 0 */
		mask |= mask >> 4;
		mask |= mask >> 2;
		mask |= mask >> 1;
		mask &= 1;
		/* mask is now 0 when i == idx, 1 otherwise; decrementing turns
		 * this into all-ones (select) resp. all-zeros (skip) */
		mask--;
		for (j = 0; j < NLIMBS * 3; j++)
			outlimbs[j] |= inlimbs[j] & mask;
		}
	}
1493 | |||
1494 | /* get_bit returns the |i|th bit in |in| */ | ||
1495 | static char get_bit(const felem_bytearray in, int i) | ||
1496 | { | ||
1497 | if ((i < 0) || (i >= 256)) | ||
1498 | return 0; | ||
1499 | return (in[i >> 3] >> (i & 7)) & 1; | ||
1500 | } | ||
1501 | |||
/* Interleaved point multiplication using precomputed point multiples:
 * The small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[],
 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const smallfelem pre_comp[][17][3], const smallfelem g_pre_comp[2][16][3])
	{
	int i, skip;
	unsigned num, gen_mul = (g_scalar != NULL);
	felem nq[3], ftmp;
	smallfelem tmp[3];
	u64 bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (two in each of the last 32 rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1; /* save two point operations in the first round */
	for (i = (num_points ? 255 : 31); i >= 0; --i)
		{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator */
		if (gen_mul && (i <= 31))
			{
			/* first, look 32 bits upwards: assemble a 4-bit window
			 * from bits i+32, i+96, i+160, i+224 of the scalar */
			bits = get_bit(g_scalar, i + 224) << 3;
			bits |= get_bit(g_scalar, i + 160) << 2;
			bits |= get_bit(g_scalar, i + 96) << 1;
			bits |= get_bit(g_scalar, i + 32);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[1], tmp);

			if (!skip)
				{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
				}
			else
				{
				/* nq is still infinity: assigning is cheaper
				 * than adding to the zero point */
				smallfelem_expand(nq[0], tmp[0]);
				smallfelem_expand(nq[1], tmp[1]);
				smallfelem_expand(nq[2], tmp[2]);
				skip = 0;
				}

			/* second, look at the current position: window from
			 * bits i, i+64, i+128, i+192 */
			bits = get_bit(g_scalar, i + 192) << 3;
			bits |= get_bit(g_scalar, i + 128) << 2;
			bits |= get_bit(g_scalar, i + 64) << 1;
			bits |= get_bit(g_scalar, i);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp[0], tmp);
			point_add(nq[0], nq[1], nq[2],
				nq[0], nq[1], nq[2],
				1 /* mixed */, tmp[0], tmp[1], tmp[2]);
			}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				{
				/* 6-bit signed window: bits i-1 .. i+4 */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract, in constant time */
				select_point(digit, 17, pre_comp[num], tmp);
				smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
				felem_contract(tmp[1], ftmp);

				if (!skip)
					{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
					}
				else
					{
					smallfelem_expand(nq[0], tmp[0]);
					smallfelem_expand(nq[1], tmp[1]);
					smallfelem_expand(nq[2], tmp[2]);
					skip = 0;
					}
				}
			}
		}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
	}
1609 | |||
/* Precomputation for the group generator. */
typedef struct {
	/* Precomputed multiples of the generator used by batch_mul();
	 * built by ec_GFp_nistp256_precompute_mult(). */
	smallfelem g_pre_comp[2][16][3];
	/* reference count, manipulated via CRYPTO_add under
	 * CRYPTO_LOCK_EC_PRE_COMP */
	int references;
} NISTP256_PRE_COMP;
1615 | |||
/* EC_GFp_nistp256_method returns the method table for the 64-bit
 * constant-time NIST P-256 implementation.  Entries left 0 fall back to
 * behaviour provided elsewhere; the remaining slots mix curve-specific
 * routines with the generic ec_GFp_simple_* / ec_GFp_nist_* helpers. */
const EC_METHOD *EC_GFp_nistp256_method(void)
	{
	static const EC_METHOD ret = {
		EC_FLAGS_DEFAULT_OCT,
		NID_X9_62_prime_field,
		ec_GFp_nistp256_group_init,
		ec_GFp_simple_group_finish,
		ec_GFp_simple_group_clear_finish,
		ec_GFp_nist_group_copy,
		ec_GFp_nistp256_group_set_curve,
		ec_GFp_simple_group_get_curve,
		ec_GFp_simple_group_get_degree,
		ec_GFp_simple_group_check_discriminant,
		ec_GFp_simple_point_init,
		ec_GFp_simple_point_finish,
		ec_GFp_simple_point_clear_finish,
		ec_GFp_simple_point_copy,
		ec_GFp_simple_point_set_to_infinity,
		ec_GFp_simple_set_Jprojective_coordinates_GFp,
		ec_GFp_simple_get_Jprojective_coordinates_GFp,
		ec_GFp_simple_point_set_affine_coordinates,
		ec_GFp_nistp256_point_get_affine_coordinates,
		0 /* point_set_compressed_coordinates */,
		0 /* point2oct */,
		0 /* oct2point */,
		ec_GFp_simple_add,
		ec_GFp_simple_dbl,
		ec_GFp_simple_invert,
		ec_GFp_simple_is_at_infinity,
		ec_GFp_simple_is_on_curve,
		ec_GFp_simple_cmp,
		ec_GFp_simple_make_affine,
		ec_GFp_simple_points_make_affine,
		ec_GFp_nistp256_points_mul,
		ec_GFp_nistp256_precompute_mult,
		ec_GFp_nistp256_have_precompute_mult,
		ec_GFp_nist_field_mul,
		ec_GFp_nist_field_sqr,
		0 /* field_div */,
		0 /* field_encode */,
		0 /* field_decode */,
		0 /* field_set_to_one */ };

	return &ret;
	}
1661 | |||
1662 | /******************************************************************************/ | ||
1663 | /* FUNCTIONS TO MANAGE PRECOMPUTATION | ||
1664 | */ | ||
1665 | |||
1666 | static NISTP256_PRE_COMP *nistp256_pre_comp_new() | ||
1667 | { | ||
1668 | NISTP256_PRE_COMP *ret = NULL; | ||
1669 | ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret); | ||
1670 | if (!ret) | ||
1671 | { | ||
1672 | ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); | ||
1673 | return ret; | ||
1674 | } | ||
1675 | memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); | ||
1676 | ret->references = 1; | ||
1677 | return ret; | ||
1678 | } | ||
1679 | |||
1680 | static void *nistp256_pre_comp_dup(void *src_) | ||
1681 | { | ||
1682 | NISTP256_PRE_COMP *src = src_; | ||
1683 | |||
1684 | /* no need to actually copy, these objects never change! */ | ||
1685 | CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); | ||
1686 | |||
1687 | return src_; | ||
1688 | } | ||
1689 | |||
1690 | static void nistp256_pre_comp_free(void *pre_) | ||
1691 | { | ||
1692 | int i; | ||
1693 | NISTP256_PRE_COMP *pre = pre_; | ||
1694 | |||
1695 | if (!pre) | ||
1696 | return; | ||
1697 | |||
1698 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
1699 | if (i > 0) | ||
1700 | return; | ||
1701 | |||
1702 | OPENSSL_free(pre); | ||
1703 | } | ||
1704 | |||
1705 | static void nistp256_pre_comp_clear_free(void *pre_) | ||
1706 | { | ||
1707 | int i; | ||
1708 | NISTP256_PRE_COMP *pre = pre_; | ||
1709 | |||
1710 | if (!pre) | ||
1711 | return; | ||
1712 | |||
1713 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
1714 | if (i > 0) | ||
1715 | return; | ||
1716 | |||
1717 | OPENSSL_cleanse(pre, sizeof *pre); | ||
1718 | OPENSSL_free(pre); | ||
1719 | } | ||
1720 | |||
1721 | /******************************************************************************/ | ||
1722 | /* OPENSSL EC_METHOD FUNCTIONS | ||
1723 | */ | ||
1724 | |||
1725 | int ec_GFp_nistp256_group_init(EC_GROUP *group) | ||
1726 | { | ||
1727 | int ret; | ||
1728 | ret = ec_GFp_simple_group_init(group); | ||
1729 | group->a_is_minus3 = 1; | ||
1730 | return ret; | ||
1731 | } | ||
1732 | |||
/* Set the curve parameters of |group|.  This implementation is hard-wired
 * to NIST P-256: the supplied (p, a, b) must match the built-in
 * nistp256_curve_params, otherwise EC_R_WRONG_CURVE_PARAMETERS is raised.
 * A temporary BN_CTX is created if |ctx| is NULL.
 * Returns 1 on success, 0 on error. */
int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
	{
	int ret = 0;
	BN_CTX *new_ctx = NULL;
	BIGNUM *curve_p, *curve_a, *curve_b;

	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
		((curve_a = BN_CTX_get(ctx)) == NULL) ||
		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
	/* load the canonical P-256 parameters for comparison */
	BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
	BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
	BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
		(BN_cmp(curve_b, b)))
		{
		ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
			EC_R_WRONG_CURVE_PARAMETERS);
		goto err;
		}
	/* use the specialised NIST reduction for this field */
	group->field_mod_func = BN_nist_mod_256;
	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
err:
	BN_CTX_end(ctx);	/* balances the BN_CTX_start above */
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	return ret;
	}
1764 | |||
/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 * (X', Y') = (X/Z^2, Y/Z^3).  Either of |x| and |y| may be NULL if the
 * caller only wants one coordinate.  Fails (returning 0) for the point at
 * infinity, on conversion failure, or on BN errors. */
int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
	{
	felem z1, z2, x_in, y_in;
	smallfelem x_out, y_out;
	longfelem tmp;

	if (EC_POINT_is_at_infinity(group, point))
		{
		ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
			EC_R_POINT_AT_INFINITY);
		return 0;
		}
	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
		(!BN_to_felem(z1, &point->Z))) return 0;
	/* z2 = 1/Z; z1 = 1/Z^2; x = X/Z^2 */
	felem_inv(z2, z1);
	felem_square(tmp, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
	felem_contract(x_out, x_in);
	if (x != NULL)
		{
		if (!smallfelem_to_BN(x, x_out)) {
			ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
				ERR_R_BN_LIB);
			return 0;
		}
		}
	/* z1 = 1/Z^3; y = Y/Z^3 */
	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
	felem_contract(y_out, y_in);
	if (y != NULL)
		{
		if (!smallfelem_to_BN(y, y_out))
			{
			ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
				ERR_R_BN_LIB);
			return 0;
			}
		}
	return 1;
	}
1808 | |||
/* make_points_affine converts |num| Jacobian points to affine form in one
 * batch (a single field inversion shared via the generic helper).
 * |tmp_smallfelems| must provide room for num+1 scratch elements. */
static void make_points_affine(size_t num, smallfelem points[/* num */][3], smallfelem tmp_smallfelems[/* num+1 */])
	{
	/* Runs in constant time, unless an input is the point at infinity
	 * (which normally shouldn't happen). */
	/* The casts adapt the smallfelem-typed callbacks to the generic
	 * void* interface of the shared nistp helper. */
	ec_GFp_nistp_points_make_affine_internal(
		num,
		points,
		sizeof(smallfelem),
		tmp_smallfelems,
		(void (*)(void *)) smallfelem_one,
		(int (*)(const void *)) smallfelem_is_zero_int,
		(void (*)(void *, const void *)) smallfelem_assign,
		(void (*)(void *, const void *)) smallfelem_square_contract,
		(void (*)(void *, const void *, const void *)) smallfelem_mul_contract,
		(void (*)(void *, const void *)) smallfelem_inv_contract,
		(void (*)(void *, const void *)) smallfelem_assign /* nothing to contract */);
	}
1826 | |||
/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
 * Result is stored in r (r can equal one of the inputs).
 *
 * If |scalar| is non-NULL, the generator multiple is computed with the
 * stored (or built-in) precomputation table when it matches the group's
 * generator; otherwise the generator is treated as one more ordinary
 * point.  Returns 1 on success, 0 on error. */
int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
	const BIGNUM *scalars[], BN_CTX *ctx)
	{
	int ret = 0;
	int j;
	int mixed = 0;
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y, *z, *tmp_scalar;
	felem_bytearray g_secret;
	felem_bytearray *secrets = NULL;
	smallfelem (*pre_comp)[17][3] = NULL;
	smallfelem *tmp_smallfelems = NULL;
	felem_bytearray tmp;
	unsigned i, num_bytes;
	int have_pre_comp = 0;
	size_t num_points = num;
	smallfelem x_in, y_in, z_in;
	felem x_out, y_out, z_out;
	NISTP256_PRE_COMP *pre = NULL;
	const smallfelem (*g_pre_comp)[16][3] = NULL;
	EC_POINT *generator = NULL;
	const EC_POINT *p = NULL;
	const BIGNUM *p_scalar = NULL;

	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL) ||
		((z = BN_CTX_get(ctx)) == NULL) ||
		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
		goto err;

	if (scalar != NULL)
		{
		pre = EC_EX_DATA_get_data(group->extra_data,
			nistp256_pre_comp_dup, nistp256_pre_comp_free,
			nistp256_pre_comp_clear_free);
		if (pre)
			/* we have precomputation, try to use it */
			g_pre_comp = (const smallfelem (*)[16][3]) pre->g_pre_comp;
		else
			/* try to use the standard precomputation */
			g_pre_comp = &gmul[0];
		generator = EC_POINT_new(group);
		if (generator == NULL)
			goto err;
		/* get the generator from precomputation */
		if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
			!smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
			!smallfelem_to_BN(z, g_pre_comp[0][1][2]))
			{
			ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
			goto err;
			}
		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
				generator, x, y, z, ctx))
			goto err;
		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
			/* precomputation matches generator */
			have_pre_comp = 1;
		else
			/* we don't have valid precomputation:
			 * treat the generator as a random point */
			num_points++;
		}
	if (num_points > 0)
		{
		if (num_points >= 3)
			{
			/* unless we precompute multiples for just one or two points,
			 * converting those into affine form is time well spent */
			mixed = 1;
			}
		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
		if (mixed)
			tmp_smallfelems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_smallfelems == NULL)))
			{
			ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
			goto err;
			}

		/* we treat NULL scalars as 0, and NULL points as points at infinity,
		 * i.e., they contribute nothing to the linear combination */
		memset(secrets, 0, num_points * sizeof(felem_bytearray));
		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
		for (i = 0; i < num_points; ++i)
			{
			if (i == num)
				/* we didn't have a valid precomputation, so we pick
				 * the generator */
				{
				p = EC_GROUP_get0_generator(group);
				p_scalar = scalar;
				}
			else
				/* the i^th point */
				{
				p = points[i];
				p_scalar = scalars[i];
				}
			if ((p_scalar != NULL) && (p != NULL))
				{
				/* reduce scalar to 0 <= scalar < 2^256 */
				if ((BN_num_bits(p_scalar) > 256) || (BN_is_negative(p_scalar)))
					{
					/* this is an unusual input, and we don't guarantee
					 * constant-timeness */
					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
						{
						ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
						goto err;
						}
					num_bytes = BN_bn2bin(tmp_scalar, tmp);
					}
				else
					num_bytes = BN_bn2bin(p_scalar, tmp);
				flip_endian(secrets[i], tmp, num_bytes);
				/* precompute multiples: entry j of the table holds j*P,
				 * built by alternating doublings and additions */
				if ((!BN_to_felem(x_out, &p->X)) ||
					(!BN_to_felem(y_out, &p->Y)) ||
					(!BN_to_felem(z_out, &p->Z))) goto err;
				felem_shrink(pre_comp[i][1][0], x_out);
				felem_shrink(pre_comp[i][1][1], y_out);
				felem_shrink(pre_comp[i][1][2], z_out);
				for (j = 2; j <= 16; ++j)
					{
					if (j & 1)
						{
						/* odd j: j*P = P + (j-1)*P */
						point_add_small(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
							pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
						}
					else
						{
						/* even j: j*P = 2*(j/2)*P */
						point_double_small(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
						}
					}
				}
			}
		if (mixed)
			make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
		}

	/* the scalar for the generator */
	if ((scalar != NULL) && (have_pre_comp))
		{
		memset(g_secret, 0, sizeof(g_secret));
		/* reduce scalar to 0 <= scalar < 2^256 */
		if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar)))
			{
			/* this is an unusual input, and we don't guarantee
			 * constant-timeness */
			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
				{
				ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
				goto err;
				}
			num_bytes = BN_bn2bin(tmp_scalar, tmp);
			}
		else
			num_bytes = BN_bn2bin(scalar, tmp);
		flip_endian(g_secret, tmp, num_bytes);
		/* do the multiplication with generator precomputation*/
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			g_secret,
			mixed, (const smallfelem (*)[17][3]) pre_comp,
			g_pre_comp);
		}
	else
		/* do the multiplication without generator precomputation */
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			NULL, mixed, (const smallfelem (*)[17][3]) pre_comp, NULL);
	/* reduce the output to its unique minimal representation */
	felem_contract(x_in, x_out);
	felem_contract(y_in, y_out);
	felem_contract(z_in, z_out);
	if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
		(!smallfelem_to_BN(z, z_in)))
		{
		ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
		goto err;
		}
	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (secrets != NULL)
		OPENSSL_free(secrets);
	if (pre_comp != NULL)
		OPENSSL_free(pre_comp);
	if (tmp_smallfelems != NULL)
		OPENSSL_free(tmp_smallfelems);
	return ret;
	}
2036 | |||
/* Build and attach the generator precomputation table for |group|.
 * Table layout (as consumed by batch_mul/select_point):
 *   g_pre_comp[0][j] holds combinations of G, 2^64*G, 2^128*G, 2^192*G,
 *   g_pre_comp[1][j] the same pattern shifted by 2^32,
 * indexed by the 4-bit combination j.  If the group uses the standard
 * generator, the built-in gmul table is copied instead of recomputed.
 * Returns 1 on success, 0 on error. */
int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
	{
	int ret = 0;
	NISTP256_PRE_COMP *pre = NULL;
	int i, j;
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y;
	EC_POINT *generator = NULL;
	smallfelem tmp_smallfelems[32];
	felem x_tmp, y_tmp, z_tmp;

	/* throw away old precomputation */
	EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
		nistp256_pre_comp_free, nistp256_pre_comp_clear_free);
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL))
		goto err;
	/* get the generator */
	if (group->generator == NULL) goto err;
	generator = EC_POINT_new(group);
	if (generator == NULL)
		goto err;
	/* build the standard generator from the stored curve parameters,
	 * to compare against the group's generator */
	BN_bin2bn(nistp256_curve_params[3], sizeof (felem_bytearray), x);
	BN_bin2bn(nistp256_curve_params[4], sizeof (felem_bytearray), y);
	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
		goto err;
	if ((pre = nistp256_pre_comp_new()) == NULL)
		goto err;
	/* if the generator is the standard one, use built-in precomputation */
	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
		{
		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
		ret = 1;
		goto err;	/* shared exit path; ret already set to success */
		}
	if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
		(!BN_to_felem(y_tmp, &group->generator->Y)) ||
		(!BN_to_felem(z_tmp, &group->generator->Z)))
		goto err;
	felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
	felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
	felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
	/* compute 2^64*G, 2^128*G, 2^192*G for the first table,
	 * 2^32*G, 2^96*G, 2^160*G, 2^224*G for the second one
	 */
	for (i = 1; i <= 8; i <<= 1)
		{
		/* one doubling plus 31 more: multiply table[0][i] by 2^32
		 * into table[1][i] */
		point_double_small(
			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
			pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
		for (j = 0; j < 31; ++j)
			{
			point_double_small(
				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
			}
		if (i == 8)
			break;
		/* and another 2^32 to get table[0][2i] */
		point_double_small(
			pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
		for (j = 0; j < 31; ++j)
			{
			point_double_small(
				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]);
			}
		}
	for (i = 0; i < 2; i++)
		{
		/* g_pre_comp[i][0] is the point at infinity */
		memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
		/* the remaining multiples */
		/* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
		point_add_small(
			pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], pre->g_pre_comp[i][6][2],
			pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
		/* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
		point_add_small(
			pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], pre->g_pre_comp[i][10][2],
			pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
		/* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
		point_add_small(
			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
			pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
			pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2]);
		/* 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G */
		point_add_small(
			pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], pre->g_pre_comp[i][14][2],
			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
		for (j = 1; j < 8; ++j)
			{
			/* odd multiples: add G resp. 2^32*G */
			point_add_small(
				pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], pre->g_pre_comp[i][2*j+1][2],
				pre->g_pre_comp[i][2*j][0], pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2],
				pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], pre->g_pre_comp[i][1][2]);
			}
		}
	/* convert all 31 non-infinity entries to affine form in one batch */
	make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);

	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
			nistp256_pre_comp_free, nistp256_pre_comp_clear_free))
		goto err;
	ret = 1;
	pre = NULL;	/* ownership transferred to group->extra_data */
 err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (pre)
		nistp256_pre_comp_free(pre);
	return ret;
	}
2159 | |||
2160 | int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group) | ||
2161 | { | ||
2162 | if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup, | ||
2163 | nistp256_pre_comp_free, nistp256_pre_comp_clear_free) | ||
2164 | != NULL) | ||
2165 | return 1; | ||
2166 | else | ||
2167 | return 0; | ||
2168 | } | ||
2169 | #else | ||
2170 | static void *dummy=&dummy; | ||
2171 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_nistp521.c b/src/lib/libcrypto/ec/ecp_nistp521.c new file mode 100644 index 0000000000..178b655f7f --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp521.c | |||
@@ -0,0 +1,2025 @@ | |||
1 | /* crypto/ec/ecp_nistp521.c */ | ||
2 | /* | ||
3 | * Written by Adam Langley (Google) for the OpenSSL project | ||
4 | */ | ||
5 | /* Copyright 2011 Google Inc. | ||
6 | * | ||
7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
8 | * | ||
9 | * you may not use this file except in compliance with the License. | ||
10 | * You may obtain a copy of the License at | ||
11 | * | ||
12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
13 | * | ||
14 | * Unless required by applicable law or agreed to in writing, software | ||
15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
17 | * See the License for the specific language governing permissions and | ||
18 | * limitations under the License. | ||
19 | */ | ||
20 | |||
21 | /* | ||
22 | * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication | ||
23 | * | ||
24 | * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. | ||
25 | * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 | ||
26 | * work which got its smarts from Daniel J. Bernstein's work on the same. | ||
27 | */ | ||
28 | |||
29 | #include <openssl/opensslconf.h> | ||
30 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
31 | |||
32 | #ifndef OPENSSL_SYS_VMS | ||
33 | #include <stdint.h> | ||
34 | #else | ||
35 | #include <inttypes.h> | ||
36 | #endif | ||
37 | |||
38 | #include <string.h> | ||
39 | #include <openssl/err.h> | ||
40 | #include "ec_lcl.h" | ||
41 | |||
#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
/* even with gcc, the typedef won't work for 32-bit platforms */
typedef __uint128_t uint128_t;	/* nonstandard; implemented by gcc on 64-bit platforms */
#else
#error "Need GCC 3.1 or later to define type uint128_t"
#endif

/* Short aliases for the fixed-width integer types used throughout this
 * file. s64 is needed for arithmetic right shifts in the constant-time
 * comparison helpers below. */
typedef uint8_t u8;
typedef uint64_t u64;
typedef int64_t s64;
/* The underlying field.
 *
 * P521 operates over GF(2^521-1). We can serialise an element of this field
 * into 66 bytes where the most significant byte contains only a single bit. We
 * call this an felem_bytearray. */

/* 66 bytes = ceil(521/8); byte order depends on context (the curve-param
 * tables below are big-endian, the bin66 helpers are little-endian). */
typedef u8 felem_bytearray[66];
60 | |||
/* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
 * These values are big-endian. */
/* Order of entries: p (field prime), a (curve coefficient, = -3 mod p),
 * b (curve coefficient), x and y (affine coordinates of the base point G). */
static const felem_bytearray nistp521_curve_params[5] =
	{
	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* p */
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff},
	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* a = -3 */
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	 0xff, 0xfc},
	{0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c,  /* b */
	 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
	 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
	 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
	 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
	 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
	 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
	 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
	 0x3f, 0x00},
	{0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04,  /* x */
	 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
	 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
	 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
	 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
	 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
	 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
	 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
	 0xbd, 0x66},
	{0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b,  /* y */
	 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
	 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
	 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
	 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
	 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
	 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
	 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
	 0x66, 0x50}
	};
111 | |||
/* The representation of field elements.
 * ------------------------------------
 *
 * We represent field elements with nine values. These values are either 64 or
 * 128 bits and the field element represented is:
 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p)
 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 * 58 bits apart, but are greater than 58 bits in length, the most significant
 * bits of each limb overlap with the least significant bits of the next.
 *
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 * 'largefelem' */

/* 9 limbs * 58 bits = 522 bits >= 521, so the representation is redundant
 * (non-unique); felem_contract produces the canonical form. */
#define NLIMBS 9

typedef uint64_t limb;
typedef limb felem[NLIMBS];
typedef uint128_t largefelem[NLIMBS];

/* Masks for the 57/58 low-order bits of a limb; the top limb of a reduced
 * value holds at most 57 bits (9*58 - 521 = 1 spare bit overall). */
static const limb bottom57bits = 0x1ffffffffffffff;
static const limb bottom58bits = 0x3ffffffffffffff;
134 | /* bin66_to_felem takes a little-endian byte array and converts it into felem | ||
135 | * form. This assumes that the CPU is little-endian. */ | ||
136 | static void bin66_to_felem(felem out, const u8 in[66]) | ||
137 | { | ||
138 | out[0] = (*((limb*) &in[0])) & bottom58bits; | ||
139 | out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits; | ||
140 | out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits; | ||
141 | out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits; | ||
142 | out[4] = (*((limb*) &in[29])) & bottom58bits; | ||
143 | out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits; | ||
144 | out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits; | ||
145 | out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits; | ||
146 | out[8] = (*((limb*) &in[58])) & bottom57bits; | ||
147 | } | ||
148 | |||
149 | /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte | ||
150 | * array. This assumes that the CPU is little-endian. */ | ||
151 | static void felem_to_bin66(u8 out[66], const felem in) | ||
152 | { | ||
153 | memset(out, 0, 66); | ||
154 | (*((limb*) &out[0])) = in[0]; | ||
155 | (*((limb*) &out[7])) |= in[1] << 2; | ||
156 | (*((limb*) &out[14])) |= in[2] << 4; | ||
157 | (*((limb*) &out[21])) |= in[3] << 6; | ||
158 | (*((limb*) &out[29])) = in[4]; | ||
159 | (*((limb*) &out[36])) |= in[5] << 2; | ||
160 | (*((limb*) &out[43])) |= in[6] << 4; | ||
161 | (*((limb*) &out[50])) |= in[7] << 6; | ||
162 | (*((limb*) &out[58])) = in[8]; | ||
163 | } | ||
164 | |||
165 | /* To preserve endianness when using BN_bn2bin and BN_bin2bn */ | ||
166 | static void flip_endian(u8 *out, const u8 *in, unsigned len) | ||
167 | { | ||
168 | unsigned i; | ||
169 | for (i = 0; i < len; ++i) | ||
170 | out[i] = in[len-1-i]; | ||
171 | } | ||
172 | |||
173 | /* BN_to_felem converts an OpenSSL BIGNUM into an felem */ | ||
174 | static int BN_to_felem(felem out, const BIGNUM *bn) | ||
175 | { | ||
176 | felem_bytearray b_in; | ||
177 | felem_bytearray b_out; | ||
178 | unsigned num_bytes; | ||
179 | |||
180 | /* BN_bn2bin eats leading zeroes */ | ||
181 | memset(b_out, 0, sizeof b_out); | ||
182 | num_bytes = BN_num_bytes(bn); | ||
183 | if (num_bytes > sizeof b_out) | ||
184 | { | ||
185 | ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); | ||
186 | return 0; | ||
187 | } | ||
188 | if (BN_is_negative(bn)) | ||
189 | { | ||
190 | ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); | ||
191 | return 0; | ||
192 | } | ||
193 | num_bytes = BN_bn2bin(bn, b_in); | ||
194 | flip_endian(b_out, b_in, num_bytes); | ||
195 | bin66_to_felem(out, b_out); | ||
196 | return 1; | ||
197 | } | ||
198 | |||
199 | /* felem_to_BN converts an felem into an OpenSSL BIGNUM */ | ||
200 | static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) | ||
201 | { | ||
202 | felem_bytearray b_in, b_out; | ||
203 | felem_to_bin66(b_in, in); | ||
204 | flip_endian(b_out, b_in, sizeof b_out); | ||
205 | return BN_bin2bn(b_out, sizeof b_out, out); | ||
206 | } | ||
207 | |||
208 | |||
209 | /* Field operations | ||
210 | * ---------------- */ | ||
211 | |||
212 | static void felem_one(felem out) | ||
213 | { | ||
214 | out[0] = 1; | ||
215 | out[1] = 0; | ||
216 | out[2] = 0; | ||
217 | out[3] = 0; | ||
218 | out[4] = 0; | ||
219 | out[5] = 0; | ||
220 | out[6] = 0; | ||
221 | out[7] = 0; | ||
222 | out[8] = 0; | ||
223 | } | ||
224 | |||
225 | static void felem_assign(felem out, const felem in) | ||
226 | { | ||
227 | out[0] = in[0]; | ||
228 | out[1] = in[1]; | ||
229 | out[2] = in[2]; | ||
230 | out[3] = in[3]; | ||
231 | out[4] = in[4]; | ||
232 | out[5] = in[5]; | ||
233 | out[6] = in[6]; | ||
234 | out[7] = in[7]; | ||
235 | out[8] = in[8]; | ||
236 | } | ||
237 | |||
238 | /* felem_sum64 sets out = out + in. */ | ||
239 | static void felem_sum64(felem out, const felem in) | ||
240 | { | ||
241 | out[0] += in[0]; | ||
242 | out[1] += in[1]; | ||
243 | out[2] += in[2]; | ||
244 | out[3] += in[3]; | ||
245 | out[4] += in[4]; | ||
246 | out[5] += in[5]; | ||
247 | out[6] += in[6]; | ||
248 | out[7] += in[7]; | ||
249 | out[8] += in[8]; | ||
250 | } | ||
251 | |||
252 | /* felem_scalar sets out = in * scalar */ | ||
253 | static void felem_scalar(felem out, const felem in, limb scalar) | ||
254 | { | ||
255 | out[0] = in[0] * scalar; | ||
256 | out[1] = in[1] * scalar; | ||
257 | out[2] = in[2] * scalar; | ||
258 | out[3] = in[3] * scalar; | ||
259 | out[4] = in[4] * scalar; | ||
260 | out[5] = in[5] * scalar; | ||
261 | out[6] = in[6] * scalar; | ||
262 | out[7] = in[7] * scalar; | ||
263 | out[8] = in[8] * scalar; | ||
264 | } | ||
265 | |||
266 | /* felem_scalar64 sets out = out * scalar */ | ||
267 | static void felem_scalar64(felem out, limb scalar) | ||
268 | { | ||
269 | out[0] *= scalar; | ||
270 | out[1] *= scalar; | ||
271 | out[2] *= scalar; | ||
272 | out[3] *= scalar; | ||
273 | out[4] *= scalar; | ||
274 | out[5] *= scalar; | ||
275 | out[6] *= scalar; | ||
276 | out[7] *= scalar; | ||
277 | out[8] *= scalar; | ||
278 | } | ||
279 | |||
280 | /* felem_scalar128 sets out = out * scalar */ | ||
281 | static void felem_scalar128(largefelem out, limb scalar) | ||
282 | { | ||
283 | out[0] *= scalar; | ||
284 | out[1] *= scalar; | ||
285 | out[2] *= scalar; | ||
286 | out[3] *= scalar; | ||
287 | out[4] *= scalar; | ||
288 | out[5] *= scalar; | ||
289 | out[6] *= scalar; | ||
290 | out[7] *= scalar; | ||
291 | out[8] *= scalar; | ||
292 | } | ||
293 | |||
294 | /* felem_neg sets |out| to |-in| | ||
295 | * On entry: | ||
296 | * in[i] < 2^59 + 2^14 | ||
297 | * On exit: | ||
298 | * out[i] < 2^62 | ||
299 | */ | ||
300 | static void felem_neg(felem out, const felem in) | ||
301 | { | ||
302 | /* In order to prevent underflow, we subtract from 0 mod p. */ | ||
303 | static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5); | ||
304 | static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4); | ||
305 | |||
306 | out[0] = two62m3 - in[0]; | ||
307 | out[1] = two62m2 - in[1]; | ||
308 | out[2] = two62m2 - in[2]; | ||
309 | out[3] = two62m2 - in[3]; | ||
310 | out[4] = two62m2 - in[4]; | ||
311 | out[5] = two62m2 - in[5]; | ||
312 | out[6] = two62m2 - in[6]; | ||
313 | out[7] = two62m2 - in[7]; | ||
314 | out[8] = two62m2 - in[8]; | ||
315 | } | ||
316 | |||
317 | /* felem_diff64 subtracts |in| from |out| | ||
318 | * On entry: | ||
319 | * in[i] < 2^59 + 2^14 | ||
320 | * On exit: | ||
321 | * out[i] < out[i] + 2^62 | ||
322 | */ | ||
323 | static void felem_diff64(felem out, const felem in) | ||
324 | { | ||
325 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
326 | static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5); | ||
327 | static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4); | ||
328 | |||
329 | out[0] += two62m3 - in[0]; | ||
330 | out[1] += two62m2 - in[1]; | ||
331 | out[2] += two62m2 - in[2]; | ||
332 | out[3] += two62m2 - in[3]; | ||
333 | out[4] += two62m2 - in[4]; | ||
334 | out[5] += two62m2 - in[5]; | ||
335 | out[6] += two62m2 - in[6]; | ||
336 | out[7] += two62m2 - in[7]; | ||
337 | out[8] += two62m2 - in[8]; | ||
338 | } | ||
339 | |||
340 | /* felem_diff_128_64 subtracts |in| from |out| | ||
341 | * On entry: | ||
342 | * in[i] < 2^62 + 2^17 | ||
343 | * On exit: | ||
344 | * out[i] < out[i] + 2^63 | ||
345 | */ | ||
346 | static void felem_diff_128_64(largefelem out, const felem in) | ||
347 | { | ||
348 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
349 | static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5); | ||
350 | static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4); | ||
351 | |||
352 | out[0] += two63m6 - in[0]; | ||
353 | out[1] += two63m5 - in[1]; | ||
354 | out[2] += two63m5 - in[2]; | ||
355 | out[3] += two63m5 - in[3]; | ||
356 | out[4] += two63m5 - in[4]; | ||
357 | out[5] += two63m5 - in[5]; | ||
358 | out[6] += two63m5 - in[6]; | ||
359 | out[7] += two63m5 - in[7]; | ||
360 | out[8] += two63m5 - in[8]; | ||
361 | } | ||
362 | |||
363 | /* felem_diff_128_64 subtracts |in| from |out| | ||
364 | * On entry: | ||
365 | * in[i] < 2^126 | ||
366 | * On exit: | ||
367 | * out[i] < out[i] + 2^127 - 2^69 | ||
368 | */ | ||
369 | static void felem_diff128(largefelem out, const largefelem in) | ||
370 | { | ||
371 | /* In order to prevent underflow, we add 0 mod p before subtracting. */ | ||
372 | static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70); | ||
373 | static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69); | ||
374 | |||
375 | out[0] += (two127m70 - in[0]); | ||
376 | out[1] += (two127m69 - in[1]); | ||
377 | out[2] += (two127m69 - in[2]); | ||
378 | out[3] += (two127m69 - in[3]); | ||
379 | out[4] += (two127m69 - in[4]); | ||
380 | out[5] += (two127m69 - in[5]); | ||
381 | out[6] += (two127m69 - in[6]); | ||
382 | out[7] += (two127m69 - in[7]); | ||
383 | out[8] += (two127m69 - in[8]); | ||
384 | } | ||
385 | |||
/* felem_square sets |out| = |in|^2
 * On entry:
 *   in[i] < 2^62
 * On exit:
 *   out[i] < 17 * max(in[i]) * max(in[i])
 */
static void felem_square(largefelem out, const felem in)
	{
	felem inx2, inx4;
	felem_scalar(inx2, in, 2);
	felem_scalar(inx4, in, 4);

	/* We have many cases where we want to do
	 *   in[x] * in[y] +
	 *   in[y] * in[x]
	 * This is obviously just
	 *   2 * in[x] * in[y]
	 * However, rather than do the doubling on the 128 bit result, we
	 * double one of the inputs to the multiplication by reading from
	 * |inx2| */

	/* Schoolbook product terms whose weight is below 2^521; out[k]
	 * accumulates every in[x]*in[y] with x+y == k. */
	out[0] = ((uint128_t) in[0]) * in[0];
	out[1] = ((uint128_t) in[0]) * inx2[1];
	out[2] = ((uint128_t) in[0]) * inx2[2] +
		 ((uint128_t) in[1]) * in[1];
	out[3] = ((uint128_t) in[0]) * inx2[3] +
		 ((uint128_t) in[1]) * inx2[2];
	out[4] = ((uint128_t) in[0]) * inx2[4] +
		 ((uint128_t) in[1]) * inx2[3] +
		 ((uint128_t) in[2]) * in[2];
	out[5] = ((uint128_t) in[0]) * inx2[5] +
		 ((uint128_t) in[1]) * inx2[4] +
		 ((uint128_t) in[2]) * inx2[3];
	out[6] = ((uint128_t) in[0]) * inx2[6] +
		 ((uint128_t) in[1]) * inx2[5] +
		 ((uint128_t) in[2]) * inx2[4] +
		 ((uint128_t) in[3]) * in[3];
	out[7] = ((uint128_t) in[0]) * inx2[7] +
		 ((uint128_t) in[1]) * inx2[6] +
		 ((uint128_t) in[2]) * inx2[5] +
		 ((uint128_t) in[3]) * inx2[4];
	out[8] = ((uint128_t) in[0]) * inx2[8] +
		 ((uint128_t) in[1]) * inx2[7] +
		 ((uint128_t) in[2]) * inx2[6] +
		 ((uint128_t) in[3]) * inx2[5] +
		 ((uint128_t) in[4]) * in[4];

	/* The remaining limbs fall above 2^521, with the first falling at
	 * 2^522. They correspond to locations one bit up from the limbs
	 * produced above so we would have to multiply by two to align them.
	 * Again, rather than operate on the 128-bit result, we double one of
	 * the inputs to the multiplication. If we want to double for both this
	 * reason, and the reason above, then we end up multiplying by four. */

	/* 9 */
	out[0] += ((uint128_t) in[1]) * inx4[8] +
		  ((uint128_t) in[2]) * inx4[7] +
		  ((uint128_t) in[3]) * inx4[6] +
		  ((uint128_t) in[4]) * inx4[5];

	/* 10 */
	out[1] += ((uint128_t) in[2]) * inx4[8] +
		  ((uint128_t) in[3]) * inx4[7] +
		  ((uint128_t) in[4]) * inx4[6] +
		  ((uint128_t) in[5]) * inx2[5];	/* diagonal term: only the wrap doubling applies */

	/* 11 */
	out[2] += ((uint128_t) in[3]) * inx4[8] +
		  ((uint128_t) in[4]) * inx4[7] +
		  ((uint128_t) in[5]) * inx4[6];

	/* 12 */
	out[3] += ((uint128_t) in[4]) * inx4[8] +
		  ((uint128_t) in[5]) * inx4[7] +
		  ((uint128_t) in[6]) * inx2[6];	/* diagonal term */

	/* 13 */
	out[4] += ((uint128_t) in[5]) * inx4[8] +
		  ((uint128_t) in[6]) * inx4[7];

	/* 14 */
	out[5] += ((uint128_t) in[6]) * inx4[8] +
		  ((uint128_t) in[7]) * inx2[7];	/* diagonal term */

	/* 15 */
	out[6] += ((uint128_t) in[7]) * inx4[8];

	/* 16 */
	out[7] += ((uint128_t) in[8]) * inx2[8];	/* diagonal term */
	}
476 | |||
/* felem_mul sets |out| = |in1| * |in2|
 * On entry:
 *   in1[i] < 2^64
 *   in2[i] < 2^63
 * On exit:
 *   out[i] < 17 * max(in1[i]) * max(in2[i])
 */
static void felem_mul(largefelem out, const felem in1, const felem in2)
	{
	felem in2x2;
	felem_scalar(in2x2, in2, 2);

	/* Schoolbook terms below 2^521: out[k] sums in1[x]*in2[y] with
	 * x+y == k. */
	out[0] = ((uint128_t) in1[0]) * in2[0];

	out[1] = ((uint128_t) in1[0]) * in2[1] +
		 ((uint128_t) in1[1]) * in2[0];

	out[2] = ((uint128_t) in1[0]) * in2[2] +
		 ((uint128_t) in1[1]) * in2[1] +
		 ((uint128_t) in1[2]) * in2[0];

	out[3] = ((uint128_t) in1[0]) * in2[3] +
		 ((uint128_t) in1[1]) * in2[2] +
		 ((uint128_t) in1[2]) * in2[1] +
		 ((uint128_t) in1[3]) * in2[0];

	out[4] = ((uint128_t) in1[0]) * in2[4] +
		 ((uint128_t) in1[1]) * in2[3] +
		 ((uint128_t) in1[2]) * in2[2] +
		 ((uint128_t) in1[3]) * in2[1] +
		 ((uint128_t) in1[4]) * in2[0];

	out[5] = ((uint128_t) in1[0]) * in2[5] +
		 ((uint128_t) in1[1]) * in2[4] +
		 ((uint128_t) in1[2]) * in2[3] +
		 ((uint128_t) in1[3]) * in2[2] +
		 ((uint128_t) in1[4]) * in2[1] +
		 ((uint128_t) in1[5]) * in2[0];

	out[6] = ((uint128_t) in1[0]) * in2[6] +
		 ((uint128_t) in1[1]) * in2[5] +
		 ((uint128_t) in1[2]) * in2[4] +
		 ((uint128_t) in1[3]) * in2[3] +
		 ((uint128_t) in1[4]) * in2[2] +
		 ((uint128_t) in1[5]) * in2[1] +
		 ((uint128_t) in1[6]) * in2[0];

	out[7] = ((uint128_t) in1[0]) * in2[7] +
		 ((uint128_t) in1[1]) * in2[6] +
		 ((uint128_t) in1[2]) * in2[5] +
		 ((uint128_t) in1[3]) * in2[4] +
		 ((uint128_t) in1[4]) * in2[3] +
		 ((uint128_t) in1[5]) * in2[2] +
		 ((uint128_t) in1[6]) * in2[1] +
		 ((uint128_t) in1[7]) * in2[0];

	out[8] = ((uint128_t) in1[0]) * in2[8] +
		 ((uint128_t) in1[1]) * in2[7] +
		 ((uint128_t) in1[2]) * in2[6] +
		 ((uint128_t) in1[3]) * in2[5] +
		 ((uint128_t) in1[4]) * in2[4] +
		 ((uint128_t) in1[5]) * in2[3] +
		 ((uint128_t) in1[6]) * in2[2] +
		 ((uint128_t) in1[7]) * in2[1] +
		 ((uint128_t) in1[8]) * in2[0];

	/* See comment in felem_square about the use of in2x2 here: terms at
	 * weight 2^(522+...) wrap to one bit above the corresponding low limb
	 * (2^522 == 2 mod p), hence the doubled second operand. */

	out[0] += ((uint128_t) in1[1]) * in2x2[8] +
		  ((uint128_t) in1[2]) * in2x2[7] +
		  ((uint128_t) in1[3]) * in2x2[6] +
		  ((uint128_t) in1[4]) * in2x2[5] +
		  ((uint128_t) in1[5]) * in2x2[4] +
		  ((uint128_t) in1[6]) * in2x2[3] +
		  ((uint128_t) in1[7]) * in2x2[2] +
		  ((uint128_t) in1[8]) * in2x2[1];

	out[1] += ((uint128_t) in1[2]) * in2x2[8] +
		  ((uint128_t) in1[3]) * in2x2[7] +
		  ((uint128_t) in1[4]) * in2x2[6] +
		  ((uint128_t) in1[5]) * in2x2[5] +
		  ((uint128_t) in1[6]) * in2x2[4] +
		  ((uint128_t) in1[7]) * in2x2[3] +
		  ((uint128_t) in1[8]) * in2x2[2];

	out[2] += ((uint128_t) in1[3]) * in2x2[8] +
		  ((uint128_t) in1[4]) * in2x2[7] +
		  ((uint128_t) in1[5]) * in2x2[6] +
		  ((uint128_t) in1[6]) * in2x2[5] +
		  ((uint128_t) in1[7]) * in2x2[4] +
		  ((uint128_t) in1[8]) * in2x2[3];

	out[3] += ((uint128_t) in1[4]) * in2x2[8] +
		  ((uint128_t) in1[5]) * in2x2[7] +
		  ((uint128_t) in1[6]) * in2x2[6] +
		  ((uint128_t) in1[7]) * in2x2[5] +
		  ((uint128_t) in1[8]) * in2x2[4];

	out[4] += ((uint128_t) in1[5]) * in2x2[8] +
		  ((uint128_t) in1[6]) * in2x2[7] +
		  ((uint128_t) in1[7]) * in2x2[6] +
		  ((uint128_t) in1[8]) * in2x2[5];

	out[5] += ((uint128_t) in1[6]) * in2x2[8] +
		  ((uint128_t) in1[7]) * in2x2[7] +
		  ((uint128_t) in1[8]) * in2x2[6];

	out[6] += ((uint128_t) in1[7]) * in2x2[8] +
		  ((uint128_t) in1[8]) * in2x2[7];

	out[7] += ((uint128_t) in1[8]) * in2x2[8];
	}
589 | |||
/* Mask for the low 52 bits: a 128-bit limb splits at bit 64, and
 * 64 - (58 + 6) leaves 52 significant low bits in the high half after the
 * 6-bit realignment below. */
static const limb bottom52bits = 0xfffffffffffff;

/* felem_reduce converts a largefelem to an felem.
 * On entry:
 *   in[i] < 2^128
 * On exit:
 *   out[i] < 2^59 + 2^14
 */
static void felem_reduce(felem out, const largefelem in)
	{
	u64 overflow1, overflow2;

	/* Keep the bottom 58 bits of each 128-bit limb in place. */
	out[0] = ((limb) in[0]) & bottom58bits;
	out[1] = ((limb) in[1]) & bottom58bits;
	out[2] = ((limb) in[2]) & bottom58bits;
	out[3] = ((limb) in[3]) & bottom58bits;
	out[4] = ((limb) in[4]) & bottom58bits;
	out[5] = ((limb) in[5]) & bottom58bits;
	out[6] = ((limb) in[6]) & bottom58bits;
	out[7] = ((limb) in[7]) & bottom58bits;
	out[8] = ((limb) in[8]) & bottom58bits;

	/* out[i] < 2^58 */

	/* Propagate the excess of each 128-bit limb into the next two limbs:
	 * bits 58..63 go into the next limb directly, bits 64..115 go into
	 * the next limb shifted up by 6 (since 64 = 58 + 6), and bits
	 * 116..127 land two limbs up. */
	out[1] += ((limb) in[0]) >> 58;
	out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
	/* out[1] < 2^58 + 2^6 + 2^58
	 *	  = 2^59 + 2^6 */
	out[2] += ((limb) (in[0] >> 64)) >> 52;

	out[2] += ((limb) in[1]) >> 58;
	out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
	out[3] += ((limb) (in[1] >> 64)) >> 52;

	out[3] += ((limb) in[2]) >> 58;
	out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
	out[4] += ((limb) (in[2] >> 64)) >> 52;

	out[4] += ((limb) in[3]) >> 58;
	out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
	out[5] += ((limb) (in[3] >> 64)) >> 52;

	out[5] += ((limb) in[4]) >> 58;
	out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
	out[6] += ((limb) (in[4] >> 64)) >> 52;

	out[6] += ((limb) in[5]) >> 58;
	out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
	out[7] += ((limb) (in[5] >> 64)) >> 52;

	out[7] += ((limb) in[6]) >> 58;
	out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
	out[8] += ((limb) (in[6] >> 64)) >> 52;

	out[8] += ((limb) in[7]) >> 58;
	out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
	/* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
	 *	      < 2^59 + 2^13 */
	overflow1 = ((limb) (in[7] >> 64)) >> 52;

	overflow1 += ((limb) in[8]) >> 58;
	overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
	overflow2 = ((limb) (in[8] >> 64)) >> 52;

	/* The overflows sit at weight 2^522 and 2^580; since
	 * 2^522 == 2 (mod p), they wrap to the bottom doubled. */
	overflow1 <<= 1;	/* overflow1 < 2^13 + 2^7 + 2^59 */
	overflow2 <<= 1;	/* overflow2 < 2^13 */

	out[0] += overflow1;	/* out[0] < 2^60 */
	out[1] += overflow2;	/* out[1] < 2^59 + 2^6 + 2^13 */

	out[1] += out[0] >> 58; out[0] &= bottom58bits;
	/* out[0] < 2^58
	 * out[1] < 2^59 + 2^6 + 2^13 + 2^2
	 *	  < 2^59 + 2^14 */
	}
665 | |||
666 | static void felem_square_reduce(felem out, const felem in) | ||
667 | { | ||
668 | largefelem tmp; | ||
669 | felem_square(tmp, in); | ||
670 | felem_reduce(out, tmp); | ||
671 | } | ||
672 | |||
673 | static void felem_mul_reduce(felem out, const felem in1, const felem in2) | ||
674 | { | ||
675 | largefelem tmp; | ||
676 | felem_mul(tmp, in1, in2); | ||
677 | felem_reduce(out, tmp); | ||
678 | } | ||
679 | |||
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The exponent p - 2 = 2^521 - 3 is reached by an addition chain; the
 * comments track the exponent held in each temporary as 2^a - 2^b.
 */
static void felem_inv(felem out, const felem in)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4;
	largefelem tmp;
	unsigned i;

	felem_square(tmp, in); felem_reduce(ftmp, tmp);		/* 2^1 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^2 - 2^0 */
	felem_assign(ftmp2, ftmp);
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^1 */
	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^0 */
	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^4 - 2^1 */

	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^3 - 2^1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^2 */
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^0 */

	felem_assign(ftmp2, ftmp3);
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^5 - 2^1 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^6 - 2^2 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^7 - 2^3 */
	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^4 */
	felem_assign(ftmp4, ftmp3);
	felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);	/* 2^8 - 2^1 */
	felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);	/* 2^9 - 2^2 */
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	/* Repeatedly double the exponent span and fold in the saved copy:
	 * after each pair of loop+mul, ftmp3 holds 2^(2k) - 2^0. */
	for (i = 0; i < 8; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^8 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 16; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^16 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 32; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^32 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 64; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^64 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 128; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^128 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^0 */
	felem_assign(ftmp2, ftmp3);

	for (i = 0; i < 256; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^256 */
		}
	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^0 */

	for (i = 0; i < 9; i++)
		{
		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^9 */
		}
	/* (the original comments on the next two lines said 2^512; 2^521 is
	 * correct: (2^521-2^9) + (2^9-2^2) and then - 2^0 twice more) */
	felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^2 */
	felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);	/* 2^521 - 3 = p - 2 */
	}
763 | |||
/* This is 2^521-1, expressed as an felem: eight full 58-bit limbs and a
 * 57-bit top limb. Used for constant-time comparison against p. */
static const felem kPrime =
	{
	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
	0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
	};
771 | |||
/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 * otherwise. Runs in constant time (no data-dependent branches).
 * On entry:
 *   in[i] < 2^59 + 2^14
 */
static limb felem_is_zero(const felem in)
	{
	felem ftmp;
	limb is_zero, is_p;
	felem_assign(ftmp, in);

	/* Carry-propagate so each limb fits its nominal width; the top
	 * limb's excess wraps to the bottom since 2^521 == 1 (mod p). */
	ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
	/* ftmp[8] < 2^57 */
	ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
	ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
	ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
	ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
	ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
	ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
	ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
	ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
	/* ftmp[8] < 2^57 + 4 */

	/* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
	 * greater than our bound for ftmp[8]. Therefore we only have to check
	 * if the value is zero or 2^521-1. */

	/* OR all limbs together: the result is 0 iff every limb is 0. */
	is_zero = 0;
	is_zero |= ftmp[0];
	is_zero |= ftmp[1];
	is_zero |= ftmp[2];
	is_zero |= ftmp[3];
	is_zero |= ftmp[4];
	is_zero |= ftmp[5];
	is_zero |= ftmp[6];
	is_zero |= ftmp[7];
	is_zero |= ftmp[8];

	is_zero--;
	/* We know that ftmp[i] < 2^63, therefore the only way that the top bit
	 * can be set is if is_zero was 0 before the decrement. */
	is_zero = ((s64) is_zero) >> 63;	/* arithmetic shift smears the top bit into a mask */

	/* Same trick against kPrime: XOR-accumulate, so is_p == 0 iff
	 * ftmp == 2^521 - 1 limb-for-limb. */
	is_p = ftmp[0] ^ kPrime[0];
	is_p |= ftmp[1] ^ kPrime[1];
	is_p |= ftmp[2] ^ kPrime[2];
	is_p |= ftmp[3] ^ kPrime[3];
	is_p |= ftmp[4] ^ kPrime[4];
	is_p |= ftmp[5] ^ kPrime[5];
	is_p |= ftmp[6] ^ kPrime[6];
	is_p |= ftmp[7] ^ kPrime[7];
	is_p |= ftmp[8] ^ kPrime[8];

	is_p--;
	is_p = ((s64) is_p) >> 63;

	is_zero |= is_p;
	return is_zero;
	}
831 | |||
832 | static int felem_is_zero_int(const felem in) | ||
833 | { | ||
834 | return (int) (felem_is_zero(in) & ((limb)1)); | ||
835 | } | ||
836 | |||
837 | /* felem_contract converts |in| to its unique, minimal representation. | ||
838 | * On entry: | ||
839 | * in[i] < 2^59 + 2^14 | ||
840 | */ | ||
841 | static void felem_contract(felem out, const felem in) | ||
842 | { | ||
843 | limb is_p, is_greater, sign; | ||
844 | static const limb two58 = ((limb)1) << 58; | ||
845 | |||
846 | felem_assign(out, in); | ||
847 | |||
848 | out[0] += out[8] >> 57; out[8] &= bottom57bits; | ||
849 | /* out[8] < 2^57 */ | ||
850 | out[1] += out[0] >> 58; out[0] &= bottom58bits; | ||
851 | out[2] += out[1] >> 58; out[1] &= bottom58bits; | ||
852 | out[3] += out[2] >> 58; out[2] &= bottom58bits; | ||
853 | out[4] += out[3] >> 58; out[3] &= bottom58bits; | ||
854 | out[5] += out[4] >> 58; out[4] &= bottom58bits; | ||
855 | out[6] += out[5] >> 58; out[5] &= bottom58bits; | ||
856 | out[7] += out[6] >> 58; out[6] &= bottom58bits; | ||
857 | out[8] += out[7] >> 58; out[7] &= bottom58bits; | ||
858 | /* out[8] < 2^57 + 4 */ | ||
859 | |||
860 | /* If the value is greater than 2^521-1 then we have to subtract | ||
861 | * 2^521-1 out. See the comments in felem_is_zero regarding why we | ||
862 | * don't test for other multiples of the prime. */ | ||
863 | |||
864 | /* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */ | ||
865 | |||
866 | is_p = out[0] ^ kPrime[0]; | ||
867 | is_p |= out[1] ^ kPrime[1]; | ||
868 | is_p |= out[2] ^ kPrime[2]; | ||
869 | is_p |= out[3] ^ kPrime[3]; | ||
870 | is_p |= out[4] ^ kPrime[4]; | ||
871 | is_p |= out[5] ^ kPrime[5]; | ||
872 | is_p |= out[6] ^ kPrime[6]; | ||
873 | is_p |= out[7] ^ kPrime[7]; | ||
874 | is_p |= out[8] ^ kPrime[8]; | ||
875 | |||
876 | is_p--; | ||
877 | is_p &= is_p << 32; | ||
878 | is_p &= is_p << 16; | ||
879 | is_p &= is_p << 8; | ||
880 | is_p &= is_p << 4; | ||
881 | is_p &= is_p << 2; | ||
882 | is_p &= is_p << 1; | ||
883 | is_p = ((s64) is_p) >> 63; | ||
884 | is_p = ~is_p; | ||
885 | |||
886 | /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */ | ||
887 | |||
888 | out[0] &= is_p; | ||
889 | out[1] &= is_p; | ||
890 | out[2] &= is_p; | ||
891 | out[3] &= is_p; | ||
892 | out[4] &= is_p; | ||
893 | out[5] &= is_p; | ||
894 | out[6] &= is_p; | ||
895 | out[7] &= is_p; | ||
896 | out[8] &= is_p; | ||
897 | |||
898 | /* In order to test that |out| >= 2^521-1 we need only test if out[8] | ||
899 | * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */ | ||
900 | is_greater = out[8] >> 57; | ||
901 | is_greater |= is_greater << 32; | ||
902 | is_greater |= is_greater << 16; | ||
903 | is_greater |= is_greater << 8; | ||
904 | is_greater |= is_greater << 4; | ||
905 | is_greater |= is_greater << 2; | ||
906 | is_greater |= is_greater << 1; | ||
907 | is_greater = ((s64) is_greater) >> 63; | ||
908 | |||
909 | out[0] -= kPrime[0] & is_greater; | ||
910 | out[1] -= kPrime[1] & is_greater; | ||
911 | out[2] -= kPrime[2] & is_greater; | ||
912 | out[3] -= kPrime[3] & is_greater; | ||
913 | out[4] -= kPrime[4] & is_greater; | ||
914 | out[5] -= kPrime[5] & is_greater; | ||
915 | out[6] -= kPrime[6] & is_greater; | ||
916 | out[7] -= kPrime[7] & is_greater; | ||
917 | out[8] -= kPrime[8] & is_greater; | ||
918 | |||
919 | /* Eliminate negative coefficients */ | ||
920 | sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign); | ||
921 | sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign); | ||
922 | sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign); | ||
923 | sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign); | ||
924 | sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign); | ||
925 | sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign); | ||
926 | sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign); | ||
927 | sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign); | ||
928 | sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign); | ||
929 | sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign); | ||
930 | sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign); | ||
931 | } | ||
932 | |||
933 | /* Group operations | ||
934 | * ---------------- | ||
935 | * | ||
936 | * Building on top of the field operations we have the operations on the | ||
937 | * elliptic curve group itself. Points on the curve are represented in Jacobian | ||
938 | * coordinates */ | ||
939 | |||
/* point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested).
 * The inline bounds track the magnitude of each intermediate so that no
 * limb product overflows the 128-bit accumulators. */
static void
point_double(felem x_out, felem y_out, felem z_out,
	     const felem x_in, const felem y_in, const felem z_in)
	{
	largefelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;

	felem_assign(ftmp, x_in);
	felem_assign(ftmp2, x_in);

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);  /* delta[i] < 2^59 + 2^14 */

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);  /* gamma[i] < 2^59 + 2^14 */

	/* beta = x*gamma */
	felem_mul(tmp, x_in, gamma);
	felem_reduce(beta, tmp);  /* beta[i] < 2^59 + 2^14 */

	/* alpha = 3*(x-delta)*(x+delta) -- valid because the curve's a = -3 */
	felem_diff64(ftmp, delta);
	/* ftmp[i] < 2^61 */
	felem_sum64(ftmp2, delta);
	/* ftmp2[i] < 2^60 + 2^15 */
	felem_scalar64(ftmp2, 3);
	/* ftmp2[i] < 3*2^60 + 3*2^15 */
	felem_mul(tmp, ftmp, ftmp2);
	/* tmp[i] < 17(3*2^121 + 3*2^76)
	 *        = 61*2^121 + 61*2^76
	 *        < 64*2^121 + 64*2^76
	 *        = 2^127 + 2^82
	 *        < 2^128 */
	felem_reduce(alpha, tmp);

	/* x' = alpha^2 - 8*beta */
	felem_square(tmp, alpha);
	/* tmp[i] < 17*2^120
	 *        < 2^125 */
	felem_assign(ftmp, beta);
	felem_scalar64(ftmp, 8);
	/* ftmp[i] < 2^62 + 2^17 */
	felem_diff_128_64(tmp, ftmp);
	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
	felem_reduce(x_out, tmp);

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum64(delta, gamma);
	/* delta[i] < 2^60 + 2^15 */
	felem_assign(ftmp, y_in);
	felem_sum64(ftmp, z_in);
	/* ftmp[i] < 2^60 + 2^15 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17(2^122)
	 *        < 2^127 */
	felem_diff_128_64(tmp, delta);
	/* tmp[i] < 2^127 + 2^63 */
	felem_reduce(z_out, tmp);

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar64(beta, 4);
	/* beta[i] < 2^61 + 2^16 */
	felem_diff64(beta, x_out);
	/* beta[i] < 2^61 + 2^60 + 2^16 */
	felem_mul(tmp, alpha, beta);
	/* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
	 *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
	 *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	 *        < 2^128 */
	felem_square(tmp2, gamma);
	/* tmp2[i] < 17*(2^59 + 2^14)^2
	 *         = 17*(2^118 + 2^74 + 2^28) */
	felem_scalar128(tmp2, 8);
	/* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
	 *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
	 *         < 2^126 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	 *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
	 *          2^74 + 2^69 + 2^34 + 2^30
	 *        < 2^128 */
	felem_reduce(y_out, tmp);
	}
1032 | |||
1033 | /* copy_conditional copies in to out iff mask is all ones. */ | ||
1034 | static void | ||
1035 | copy_conditional(felem out, const felem in, limb mask) | ||
1036 | { | ||
1037 | unsigned i; | ||
1038 | for (i = 0; i < NLIMBS; ++i) | ||
1039 | { | ||
1040 | const limb tmp = mask & (in[i] ^ out[i]); | ||
1041 | out[i] ^= tmp; | ||
1042 | } | ||
1043 | } | ||
1044 | |||
/* point_add calculates (x1, y1, z1) + (x2, y2, z2)
 *
 * The method is taken from
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
 *
 * This function includes a branch for checking whether the two input points
 * are equal (while not equal to the point at infinity). This case never
 * happens during single point multiplication, so there is no timing leak for
 * ECDH or ECDSA signing. */
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const felem x2, const felem y2, const felem z2)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	largefelem tmp, tmp2;
	limb x_equal, y_equal, z1_is_zero, z2_is_zero;

	/* Masks used at the end to patch up the point-at-infinity cases. */
	z1_is_zero = felem_is_zero(z1);
	z2_is_zero = felem_is_zero(z2);

	/* ftmp = z1z1 = z1**2 */
	felem_square(tmp, z1);
	felem_reduce(ftmp, tmp);

	if (!mixed)
		{
		/* ftmp2 = z2z2 = z2**2 */
		felem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);

		/* u1 = ftmp3 = x1*z2z2 */
		felem_mul(tmp, x1, ftmp2);
		felem_reduce(ftmp3, tmp);

		/* ftmp5 = z1 + z2 */
		felem_assign(ftmp5, z1);
		felem_sum64(ftmp5, z2);
		/* ftmp5[i] < 2^61 */

		/* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
		felem_square(tmp, ftmp5);
		/* tmp[i] < 17*2^122 */
		felem_diff_128_64(tmp, ftmp);
		/* tmp[i] < 17*2^122 + 2^63 */
		felem_diff_128_64(tmp, ftmp2);
		/* tmp[i] < 17*2^122 + 2^64 */
		felem_reduce(ftmp5, tmp);

		/* ftmp2 = z2 * z2z2 */
		felem_mul(tmp, ftmp2, z2);
		felem_reduce(ftmp2, tmp);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_mul(tmp, y1, ftmp2);
		felem_reduce(ftmp6, tmp);
		}
	else
		{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* u1 = ftmp3 = x1*z2z2 */
		felem_assign(ftmp3, x1);

		/* ftmp5 = 2*z1z2 */
		felem_scalar(ftmp5, z1, 2);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_assign(ftmp6, y1);
		}

	/* u2 = x2*z1z1 */
	felem_mul(tmp, x2, ftmp);
	/* tmp[i] < 17*2^120 */

	/* h = ftmp4 = u2 - u1 */
	felem_diff_128_64(tmp, ftmp3);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp4, tmp);

	x_equal = felem_is_zero(ftmp4);

	/* z_out = ftmp5 * h */
	felem_mul(tmp, ftmp5, ftmp4);
	felem_reduce(z_out, tmp);

	/* ftmp = z1 * z1z1 */
	felem_mul(tmp, ftmp, z1);
	felem_reduce(ftmp, tmp);

	/* s2 = tmp = y2 * z1**3 */
	felem_mul(tmp, y2, ftmp);
	/* tmp[i] < 17*2^120 */

	/* r = ftmp5 = (s2 - s1)*2 */
	felem_diff_128_64(tmp, ftmp6);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp5, tmp);
	y_equal = felem_is_zero(ftmp5);
	felem_scalar64(ftmp5, 2);
	/* ftmp5[i] < 2^61 */

	/* The doubling formula must be used when both affine coordinates
	 * match and neither input is the point at infinity (see the header
	 * comment regarding the timing of this branch). */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
		}

	/* I = ftmp = (2h)**2 */
	felem_assign(ftmp, ftmp4);
	felem_scalar64(ftmp, 2);
	/* ftmp[i] < 2^61 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17*2^122 */
	felem_reduce(ftmp, tmp);

	/* J = ftmp2 = h * I */
	felem_mul(tmp, ftmp4, ftmp);
	felem_reduce(ftmp2, tmp);

	/* V = ftmp4 = U1 * I */
	felem_mul(tmp, ftmp3, ftmp);
	felem_reduce(ftmp4, tmp);

	/* x_out = r**2 - J - 2V */
	felem_square(tmp, ftmp5);
	/* tmp[i] < 17*2^122 */
	felem_diff_128_64(tmp, ftmp2);
	/* tmp[i] < 17*2^122 + 2^63 */
	felem_assign(ftmp3, ftmp4);
	felem_scalar64(ftmp4, 2);
	/* ftmp4[i] < 2^61 */
	felem_diff_128_64(tmp, ftmp4);
	/* tmp[i] < 17*2^122 + 2^64 */
	felem_reduce(x_out, tmp);

	/* y_out = r(V-x_out) - 2 * s1 * J */
	felem_diff64(ftmp3, x_out);
	/* ftmp3[i] < 2^60 + 2^60
	 *          = 2^61 */
	felem_mul(tmp, ftmp5, ftmp3);
	/* tmp[i] < 17*2^122 */
	felem_mul(tmp2, ftmp6, ftmp2);
	/* tmp2[i] < 17*2^120 */
	felem_scalar128(tmp2, 2);
	/* tmp2[i] < 17*2^121 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17*2^122
	 *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
	 *        < 2^127 */
	felem_reduce(y_out, tmp);

	/* If either input was the point at infinity, the result is the other
	 * input; patch that in using constant-time selects on the masks
	 * computed at the top. */
	copy_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
	}
1207 | |||
1208 | /* Base point pre computation | ||
1209 | * -------------------------- | ||
1210 | * | ||
1211 | * Two different sorts of precomputed tables are used in the following code. | ||
1212 | * Each contain various points on the curve, where each point is three field | ||
1213 | * elements (x, y, z). | ||
1214 | * | ||
1215 | * For the base point table, z is usually 1 (0 for the point at infinity). | ||
1216 | * This table has 16 elements: | ||
1217 | * index | bits | point | ||
1218 | * ------+---------+------------------------------ | ||
1219 | * 0 | 0 0 0 0 | 0G | ||
1220 | * 1 | 0 0 0 1 | 1G | ||
1221 | * 2 | 0 0 1 0 | 2^130G | ||
1222 | * 3 | 0 0 1 1 | (2^130 + 1)G | ||
1223 | * 4 | 0 1 0 0 | 2^260G | ||
1224 | * 5 | 0 1 0 1 | (2^260 + 1)G | ||
1225 | * 6 | 0 1 1 0 | (2^260 + 2^130)G | ||
1226 | * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G | ||
1227 | * 8 | 1 0 0 0 | 2^390G | ||
1228 | * 9 | 1 0 0 1 | (2^390 + 1)G | ||
1229 | * 10 | 1 0 1 0 | (2^390 + 2^130)G | ||
1230 | * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G | ||
1231 | * 12 | 1 1 0 0 | (2^390 + 2^260)G | ||
1232 | * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G | ||
1233 | * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G | ||
1234 | * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G | ||
1235 | * | ||
1236 | * The reason for this is so that we can clock bits into four different | ||
1237 | * locations when doing simple scalar multiplies against the base point. | ||
1238 | * | ||
1239 | * Tables for other points have table[i] = iG for i in 0 .. 16. */ | ||
1240 | |||
/* gmul is the table of precomputed base points described above: entry [i]
 * holds the combination of 2^390*G, 2^260*G, 2^130*G and G selected by the
 * four bits of i, stored as (x, y, z) with z = 1 (entry 0 is the point at
 * infinity, all zeros). */
static const felem gmul[16][3] =
	{{{0, 0, 0, 0, 0, 0, 0, 0, 0},
	  {0, 0, 0, 0, 0, 0, 0, 0, 0},
	  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
	   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
	   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
	  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
	   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
	   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
	   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
	   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
	  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
	   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
	   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
	   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
	   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
	  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
	   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
	   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
	   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
	   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
	  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
	   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
	   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
	   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
	   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
	  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
	   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
	   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
	   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
	   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
	  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
	   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
	   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
	   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
	   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
	  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
	   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
	   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
	   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
	   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
	  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
	   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
	   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
	   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
	   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
	  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
	   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
	   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
	   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
	   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
	  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
	   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
	   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
	   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
	   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
	  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
	   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
	   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
	   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
	   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
	  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
	   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
	   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
	   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
	   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
	  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
	   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
	   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
	   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
	   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
	  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
	   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
	   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
	   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
	   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
	  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
	   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
	   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1351 | |||
1352 | /* select_point selects the |idx|th point from a precomputation table and | ||
1353 | * copies it to out. */ | ||
1354 | static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3], | ||
1355 | felem out[3]) | ||
1356 | { | ||
1357 | unsigned i, j; | ||
1358 | limb *outlimbs = &out[0][0]; | ||
1359 | memset(outlimbs, 0, 3 * sizeof(felem)); | ||
1360 | |||
1361 | for (i = 0; i < size; i++) | ||
1362 | { | ||
1363 | const limb *inlimbs = &pre_comp[i][0][0]; | ||
1364 | limb mask = i ^ idx; | ||
1365 | mask |= mask >> 4; | ||
1366 | mask |= mask >> 2; | ||
1367 | mask |= mask >> 1; | ||
1368 | mask &= 1; | ||
1369 | mask--; | ||
1370 | for (j = 0; j < NLIMBS * 3; j++) | ||
1371 | outlimbs[j] |= inlimbs[j] & mask; | ||
1372 | } | ||
1373 | } | ||
1374 | |||
1375 | /* get_bit returns the |i|th bit in |in| */ | ||
1376 | static char get_bit(const felem_bytearray in, int i) | ||
1377 | { | ||
1378 | if (i < 0) | ||
1379 | return 0; | ||
1380 | return (in[i >> 3] >> (i & 7)) & 1; | ||
1381 | } | ||
1382 | |||
/* Interleaved point multiplication using precomputed point multiples:
 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
	{
	int i, skip;
	unsigned num, gen_mul = (g_scalar != NULL);
	felem nq[3], tmp[4];
	limb bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (last quarter of rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1;	/* save two point operations in the first round */
	/* With no arbitrary points only the low 131 doublings are needed,
	 * since the generator table already encodes the higher windows. */
	for (i = (num_points ? 520 : 130); i >= 0; --i)
		{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator */
		if (gen_mul && (i <= 130))
			{
			/* Clock one bit from each 130-bit quarter of the
			 * scalar into the 4-bit table index. */
			bits = get_bit(g_scalar, i + 390) << 3;
			if (i < 130)
				{
				bits |= get_bit(g_scalar, i + 260) << 2;
				bits |= get_bit(g_scalar, i + 130) << 1;
				bits |= get_bit(g_scalar, i);
				}
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp, tmp);
			if (!skip)
				{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
				}
			else
				{
				/* First addition: just take the selected point. */
				memcpy(nq, tmp, 3 * sizeof(felem));
				skip = 0;
				}
			}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				{
				/* Gather a signed 5-bit window (plus the bit
				 * below it) and recode it to sign/digit form. */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract, in constant time */
				select_point(digit, 17, pre_comp[num], tmp);
				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_conditional(tmp[1], tmp[3], (-(limb) sign));

				if (!skip)
					{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
					}
				else
					{
					memcpy(nq, tmp, 3 * sizeof(felem));
					skip = 0;
					}
				}
			}
		}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
	}
1474 | |||
1475 | |||
/* Precomputation for the group generator: the 16-entry table consumed by
 * batch_mul, plus a reference count (managed via CRYPTO_add under
 * CRYPTO_LOCK_EC_PRE_COMP by the functions below). */
typedef struct {
	felem g_pre_comp[16][3];
	int references;
} NISTP521_PRE_COMP;
1481 | |||
/* EC_GFp_nistp521_method returns the EC_METHOD implementing P-521 using
 * this file's 64-bit constant-time field arithmetic. Slots set to 0 are
 * not provided by this method. */
const EC_METHOD *EC_GFp_nistp521_method(void)
	{
	static const EC_METHOD ret = {
		EC_FLAGS_DEFAULT_OCT,
		NID_X9_62_prime_field,
		ec_GFp_nistp521_group_init,
		ec_GFp_simple_group_finish,
		ec_GFp_simple_group_clear_finish,
		ec_GFp_nist_group_copy,
		ec_GFp_nistp521_group_set_curve,
		ec_GFp_simple_group_get_curve,
		ec_GFp_simple_group_get_degree,
		ec_GFp_simple_group_check_discriminant,
		ec_GFp_simple_point_init,
		ec_GFp_simple_point_finish,
		ec_GFp_simple_point_clear_finish,
		ec_GFp_simple_point_copy,
		ec_GFp_simple_point_set_to_infinity,
		ec_GFp_simple_set_Jprojective_coordinates_GFp,
		ec_GFp_simple_get_Jprojective_coordinates_GFp,
		ec_GFp_simple_point_set_affine_coordinates,
		ec_GFp_nistp521_point_get_affine_coordinates,
		0 /* point_set_compressed_coordinates */,
		0 /* point2oct */,
		0 /* oct2point */,
		ec_GFp_simple_add,
		ec_GFp_simple_dbl,
		ec_GFp_simple_invert,
		ec_GFp_simple_is_at_infinity,
		ec_GFp_simple_is_on_curve,
		ec_GFp_simple_cmp,
		ec_GFp_simple_make_affine,
		ec_GFp_simple_points_make_affine,
		ec_GFp_nistp521_points_mul,
		ec_GFp_nistp521_precompute_mult,
		ec_GFp_nistp521_have_precompute_mult,
		ec_GFp_nist_field_mul,
		ec_GFp_nist_field_sqr,
		0 /* field_div */,
		0 /* field_encode */,
		0 /* field_decode */,
		0 /* field_set_to_one */ };

	return &ret;
	}
1527 | |||
1528 | |||
1529 | /******************************************************************************/ | ||
1530 | /* FUNCTIONS TO MANAGE PRECOMPUTATION | ||
1531 | */ | ||
1532 | |||
1533 | static NISTP521_PRE_COMP *nistp521_pre_comp_new() | ||
1534 | { | ||
1535 | NISTP521_PRE_COMP *ret = NULL; | ||
1536 | ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP)); | ||
1537 | if (!ret) | ||
1538 | { | ||
1539 | ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); | ||
1540 | return ret; | ||
1541 | } | ||
1542 | memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); | ||
1543 | ret->references = 1; | ||
1544 | return ret; | ||
1545 | } | ||
1546 | |||
1547 | static void *nistp521_pre_comp_dup(void *src_) | ||
1548 | { | ||
1549 | NISTP521_PRE_COMP *src = src_; | ||
1550 | |||
1551 | /* no need to actually copy, these objects never change! */ | ||
1552 | CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); | ||
1553 | |||
1554 | return src_; | ||
1555 | } | ||
1556 | |||
1557 | static void nistp521_pre_comp_free(void *pre_) | ||
1558 | { | ||
1559 | int i; | ||
1560 | NISTP521_PRE_COMP *pre = pre_; | ||
1561 | |||
1562 | if (!pre) | ||
1563 | return; | ||
1564 | |||
1565 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
1566 | if (i > 0) | ||
1567 | return; | ||
1568 | |||
1569 | OPENSSL_free(pre); | ||
1570 | } | ||
1571 | |||
1572 | static void nistp521_pre_comp_clear_free(void *pre_) | ||
1573 | { | ||
1574 | int i; | ||
1575 | NISTP521_PRE_COMP *pre = pre_; | ||
1576 | |||
1577 | if (!pre) | ||
1578 | return; | ||
1579 | |||
1580 | i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); | ||
1581 | if (i > 0) | ||
1582 | return; | ||
1583 | |||
1584 | OPENSSL_cleanse(pre, sizeof(*pre)); | ||
1585 | OPENSSL_free(pre); | ||
1586 | } | ||
1587 | |||
1588 | /******************************************************************************/ | ||
1589 | /* OPENSSL EC_METHOD FUNCTIONS | ||
1590 | */ | ||
1591 | |||
1592 | int ec_GFp_nistp521_group_init(EC_GROUP *group) | ||
1593 | { | ||
1594 | int ret; | ||
1595 | ret = ec_GFp_simple_group_init(group); | ||
1596 | group->a_is_minus3 = 1; | ||
1597 | return ret; | ||
1598 | } | ||
1599 | |||
1600 | int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, | ||
1601 | const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) | ||
1602 | { | ||
1603 | int ret = 0; | ||
1604 | BN_CTX *new_ctx = NULL; | ||
1605 | BIGNUM *curve_p, *curve_a, *curve_b; | ||
1606 | |||
1607 | if (ctx == NULL) | ||
1608 | if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; | ||
1609 | BN_CTX_start(ctx); | ||
1610 | if (((curve_p = BN_CTX_get(ctx)) == NULL) || | ||
1611 | ((curve_a = BN_CTX_get(ctx)) == NULL) || | ||
1612 | ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err; | ||
1613 | BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p); | ||
1614 | BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a); | ||
1615 | BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b); | ||
1616 | if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || | ||
1617 | (BN_cmp(curve_b, b))) | ||
1618 | { | ||
1619 | ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE, | ||
1620 | EC_R_WRONG_CURVE_PARAMETERS); | ||
1621 | goto err; | ||
1622 | } | ||
1623 | group->field_mod_func = BN_nist_mod_521; | ||
1624 | ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); | ||
1625 | err: | ||
1626 | BN_CTX_end(ctx); | ||
1627 | if (new_ctx != NULL) | ||
1628 | BN_CTX_free(new_ctx); | ||
1629 | return ret; | ||
1630 | } | ||
1631 | |||
/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 * (X', Y') = (X/Z^2, Y/Z^3).
 * Either of x/y may be NULL when the caller wants only one coordinate.
 * Returns 1 on success; 0 on error (point at infinity, or BN conversion
 * failure). */
int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
	{
	felem z1, z2, x_in, y_in, x_out, y_out;
	largefelem tmp;

	/* the point at infinity has no affine representation */
	if (EC_POINT_is_at_infinity(group, point))
		{
		ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
			EC_R_POINT_AT_INFINITY);
		return 0;
		}
	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
		(!BN_to_felem(z1, &point->Z))) return 0;
	/* z2 = 1/Z -- one field inversion serves both coordinates */
	felem_inv(z2, z1);
	/* z1 = 1/Z^2, then x_in = X/Z^2 */
	felem_square(tmp, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
	/* contract to the unique minimal representation before export */
	felem_contract(x_out, x_in);
	if (x != NULL)
		{
		if (!felem_to_BN(x, x_out))
			{
			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
			return 0;
			}
		}
	/* z1 = 1/Z^3, then y_in = Y/Z^3 */
	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
	felem_contract(y_out, y_in);
	if (y != NULL)
		{
		if (!felem_to_BN(y, y_out))
			{
			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
			return 0;
			}
		}
	return 1;
	}
1673 | |||
/* Convert 'num' Jacobian points to affine form in place, sharing a single
 * field inversion among them (Montgomery's trick).  tmp_felems must hold
 * at least num+1 scratch field elements.  Delegates to the shared nistp
 * helper, handing over this curve's field-element primitives as untyped
 * callbacks. */
static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
	{
	/* Runs in constant time, unless an input is the point at infinity
	 * (which normally shouldn't happen). */
	ec_GFp_nistp_points_make_affine_internal(
		num,
		points,
		sizeof(felem),
		tmp_felems,
		(void (*)(void *)) felem_one,
		(int (*)(const void *)) felem_is_zero_int,
		(void (*)(void *, const void *)) felem_assign,
		(void (*)(void *, const void *)) felem_square_reduce,
		(void (*)(void *, const void *, const void *)) felem_mul_reduce,
		(void (*)(void *, const void *)) felem_inv,
		(void (*)(void *, const void *)) felem_contract);
	}
1691 | |||
/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values.
 * Result is stored in r (r can equal one of the inputs).
 *
 * If usable precomputation for the group generator is available (either
 * attached to the group or the built-in table for the standard generator),
 * the generator term is handled via that table; otherwise the generator is
 * folded into the batch as an extra "random" point.  Scalars are expected
 * in the range [0, 2^521); out-of-range or negative scalars are reduced mod
 * the group order first (not constant-time -- see comments below).
 * Returns 1 on success, 0 on error. */
int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
	const BIGNUM *scalars[], BN_CTX *ctx)
	{
	int ret = 0;
	int j;
	int mixed = 0;			/* nonzero => precomputed multiples converted to affine */
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y, *z, *tmp_scalar;
	felem_bytearray g_secret;	/* little-endian scalar for the generator */
	felem_bytearray *secrets = NULL;	/* little-endian scalars, one per batch point */
	felem (*pre_comp)[17][3] = NULL;	/* multiples 0..16 of each batch point */
	felem *tmp_felems = NULL;	/* scratch for make_points_affine */
	felem_bytearray tmp;
	unsigned i, num_bytes;
	int have_pre_comp = 0;
	size_t num_points = num;
	felem x_in, y_in, z_in, x_out, y_out, z_out;
	NISTP521_PRE_COMP *pre = NULL;
	felem (*g_pre_comp)[3] = NULL;
	EC_POINT *generator = NULL;
	const EC_POINT *p = NULL;
	const BIGNUM *p_scalar = NULL;

	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL) ||
		((z = BN_CTX_get(ctx)) == NULL) ||
		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
		goto err;

	if (scalar != NULL)
		{
		pre = EC_EX_DATA_get_data(group->extra_data,
			nistp521_pre_comp_dup, nistp521_pre_comp_free,
			nistp521_pre_comp_clear_free);
		if (pre)
			/* we have precomputation, try to use it */
			g_pre_comp = &pre->g_pre_comp[0];
		else
			/* try to use the standard precomputation */
			g_pre_comp = (felem (*)[3]) gmul;
		generator = EC_POINT_new(group);
		if (generator == NULL)
			goto err;
		/* get the generator from precomputation */
		if (!felem_to_BN(x, g_pre_comp[1][0]) ||
			!felem_to_BN(y, g_pre_comp[1][1]) ||
			!felem_to_BN(z, g_pre_comp[1][2]))
			{
			ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
			goto err;
			}
		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
			generator, x, y, z, ctx))
			goto err;
		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
			/* precomputation matches generator */
			have_pre_comp = 1;
		else
			/* we don't have valid precomputation:
			 * treat the generator as a random point */
			num_points++;
		}

	if (num_points > 0)
		{
		if (num_points >= 2)
			{
			/* unless we precompute multiples for just one point,
			 * converting those into affine form is time well spent */
			mixed = 1;
			}
		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
		if (mixed)
			/* one scratch element per precomputed point, plus one extra */
			tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
			{
			ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
			goto err;
			}

		/* we treat NULL scalars as 0, and NULL points as points at infinity,
		 * i.e., they contribute nothing to the linear combination */
		memset(secrets, 0, num_points * sizeof(felem_bytearray));
		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
		for (i = 0; i < num_points; ++i)
			{
			if (i == num)
				/* we didn't have a valid precomputation, so we pick
				 * the generator */
				{
				p = EC_GROUP_get0_generator(group);
				p_scalar = scalar;
				}
			else
				/* the i^th point */
				{
				p = points[i];
				p_scalar = scalars[i];
				}
			if ((p_scalar != NULL) && (p != NULL))
				{
				/* reduce scalar to 0 <= scalar < 2^521 */
				if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
					{
					/* this is an unusual input, and we don't guarantee
					 * constant-timeness */
					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
						{
						ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
						goto err;
						}
					num_bytes = BN_bn2bin(tmp_scalar, tmp);
					}
				else
					num_bytes = BN_bn2bin(p_scalar, tmp);
				/* BN_bn2bin is big-endian; batch_mul wants little-endian */
				flip_endian(secrets[i], tmp, num_bytes);
				/* precompute multiples */
				if ((!BN_to_felem(x_out, &p->X)) ||
					(!BN_to_felem(y_out, &p->Y)) ||
					(!BN_to_felem(z_out, &p->Z))) goto err;
				memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
				memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
				memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
				/* pre_comp[i][j] = j*p: odd j via p + (j-1)*p,
				 * even j via doubling (j/2)*p */
				for (j = 2; j <= 16; ++j)
					{
					if (j & 1)
						{
						point_add(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
							0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
						}
					else
						{
						point_double(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
						}
					}
				}
			}
		if (mixed)
			make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
		}

	/* the scalar for the generator */
	if ((scalar != NULL) && (have_pre_comp))
		{
		memset(g_secret, 0, sizeof(g_secret));
		/* reduce scalar to 0 <= scalar < 2^521 */
		if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
			{
			/* this is an unusual input, and we don't guarantee
			 * constant-timeness */
			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
				{
				ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
				goto err;
				}
			num_bytes = BN_bn2bin(tmp_scalar, tmp);
			}
		else
			num_bytes = BN_bn2bin(scalar, tmp);
		flip_endian(g_secret, tmp, num_bytes);
		/* do the multiplication with generator precomputation*/
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			g_secret,
			mixed, (const felem (*)[17][3]) pre_comp,
			(const felem (*)[3]) g_pre_comp);
		}
	else
		/* do the multiplication without generator precomputation */
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
	/* reduce the output to its unique minimal representation */
	felem_contract(x_in, x_out);
	felem_contract(y_in, y_out);
	felem_contract(z_in, z_out);
	if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
		(!felem_to_BN(z, z_in)))
		{
		ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
		goto err;
		}
	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (secrets != NULL)
		OPENSSL_free(secrets);
	if (pre_comp != NULL)
		OPENSSL_free(pre_comp);
	if (tmp_felems != NULL)
		OPENSSL_free(tmp_felems);
	return ret;
	}
1901 | |||
/* Precompute a table of generator multiples for this group and attach it as
 * EC_EX_DATA, replacing any previous table.  If the group's generator is the
 * standard P-521 generator, the built-in table 'gmul' is copied; otherwise
 * the table entries g_pre_comp[j] for j in 1..15 (multiples built from G,
 * 2^130*G, 2^260*G and 2^390*G) are computed here and converted to affine
 * form.  Returns 1 on success, 0 on error.
 * Note: the 'err' label is also reached on the success paths; 'ret' carries
 * the outcome and 'pre' is set to NULL once ownership passes to the group. */
int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
	{
	int ret = 0;
	NISTP521_PRE_COMP *pre = NULL;
	int i, j;
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y;
	EC_POINT *generator = NULL;
	felem tmp_felems[16];	/* scratch for make_points_affine(15, ...): 15+1 elements */

	/* throw away old precomputation */
	EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
		nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL))
		goto err;
	/* get the generator */
	if (group->generator == NULL) goto err;
	generator = EC_POINT_new(group);
	if (generator == NULL)
		goto err;
	/* build the standard generator for comparison with the group's */
	BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
	BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
		goto err;
	if ((pre = nistp521_pre_comp_new()) == NULL)
		goto err;
	/* if the generator is the standard one, use built-in precomputation */
	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
		{
		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
		ret = 1;
		goto err;
		}
	if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
		(!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
		(!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
		goto err;
	/* compute 2^130*G, 2^260*G, 2^390*G */
	/* g_pre_comp[2i] = 2^130 * g_pre_comp[i]: one doubling here plus 129 below */
	for (i = 1; i <= 4; i <<= 1)
		{
		point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
			pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
			pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
		for (j = 0; j < 129; ++j)
			{
			point_double(pre->g_pre_comp[2*i][0],
				pre->g_pre_comp[2*i][1],
				pre->g_pre_comp[2*i][2],
				pre->g_pre_comp[2*i][0],
				pre->g_pre_comp[2*i][1],
				pre->g_pre_comp[2*i][2]);
			}
		}
	/* g_pre_comp[0] is the point at infinity */
	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
	/* the remaining multiples */
	/* 2^130*G + 2^260*G */
	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
		pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
		pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	/* 2^130*G + 2^390*G */
	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
		pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	/* 2^260*G + 2^390*G */
	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
		pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
		0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
		pre->g_pre_comp[4][2]);
	/* 2^130*G + 2^260*G + 2^390*G */
	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
		pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
		pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	for (i = 1; i < 8; ++i)
		{
		/* odd multiples: add G */
		point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
			pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
			pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
			0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
			pre->g_pre_comp[1][2]);
		}
	/* convert entries 1..15 to affine form (entry 0 is infinity) */
	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);

	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
		nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
		goto err;
	ret = 1;
	/* ownership transferred to the group's extra_data */
	pre = NULL;
err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (pre)
		nistp521_pre_comp_free(pre);
	return ret;
	}
2012 | |||
2013 | int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group) | ||
2014 | { | ||
2015 | if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup, | ||
2016 | nistp521_pre_comp_free, nistp521_pre_comp_clear_free) | ||
2017 | != NULL) | ||
2018 | return 1; | ||
2019 | else | ||
2020 | return 0; | ||
2021 | } | ||
2022 | |||
2023 | #else | ||
2024 | static void *dummy=&dummy; | ||
2025 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_nistputil.c b/src/lib/libcrypto/ec/ecp_nistputil.c new file mode 100644 index 0000000000..c8140c807f --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistputil.c | |||
@@ -0,0 +1,197 @@ | |||
1 | /* crypto/ec/ecp_nistputil.c */ | ||
2 | /* | ||
3 | * Written by Bodo Moeller for the OpenSSL project. | ||
4 | */ | ||
5 | /* Copyright 2011 Google Inc. | ||
6 | * | ||
7 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
8 | * | ||
9 | * you may not use this file except in compliance with the License. | ||
10 | * You may obtain a copy of the License at | ||
11 | * | ||
12 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
13 | * | ||
14 | * Unless required by applicable law or agreed to in writing, software | ||
15 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
17 | * See the License for the specific language governing permissions and | ||
18 | * limitations under the License. | ||
19 | */ | ||
20 | |||
21 | #include <openssl/opensslconf.h> | ||
22 | #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 | ||
23 | |||
24 | /* | ||
25 | * Common utility functions for ecp_nistp224.c, ecp_nistp256.c, ecp_nistp521.c. | ||
26 | */ | ||
27 | |||
28 | #include <stddef.h> | ||
29 | #include "ec_lcl.h" | ||
30 | |||
/* Convert an array of points into affine coordinates.
 * (If the point at infinity is found (Z = 0), it remains unchanged.)
 * This function is essentially an equivalent to EC_POINTs_make_affine(), but
 * works with the internal representation of points as used by ecp_nistp###.c
 * rather than with (BIGNUM-based) EC_POINT data structures.
 *
 * point_array is the input/output buffer ('num' points in projective form,
 * i.e. three coordinates each), based on an internal representation of
 * field elements of size 'felem_size'.
 *
 * tmp_felems needs to point to a temporary array of 'num'+1 field elements
 * for storage of intermediate values.
 *
 * The algorithm is Montgomery's simultaneous-inversion trick: a forward
 * pass accumulates prefix products of the Z coordinates, a single felem_inv
 * inverts the full product, and a backward pass peels off individual 1/Z(i)
 * values -- one inversion total instead of one per point. */
void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
	size_t felem_size, void *tmp_felems,
	void (*felem_one)(void *out),
	int (*felem_is_zero)(const void *in),
	void (*felem_assign)(void *out, const void *in),
	void (*felem_square)(void *out, const void *in),
	void (*felem_mul)(void *out, const void *in1, const void *in2),
	void (*felem_inv)(void *out, const void *in),
	void (*felem_contract)(void *out, const void *in))
	{
	int i = 0;

	/* byte-offset accessors into the caller's untyped buffers */
#define tmp_felem(I) (&((char *)tmp_felems)[(I) * felem_size])
#define X(I) (&((char *)point_array)[3*(I) * felem_size])
#define Y(I) (&((char *)point_array)[(3*(I) + 1) * felem_size])
#define Z(I) (&((char *)point_array)[(3*(I) + 2) * felem_size])

	/* forward pass: prefix products of the Z coordinates */
	if (!felem_is_zero(Z(0)))
		felem_assign(tmp_felem(0), Z(0));
	else
		felem_one(tmp_felem(0));
	for (i = 1; i < (int)num; i++)
		{
		if (!felem_is_zero(Z(i)))
			felem_mul(tmp_felem(i), tmp_felem(i-1), Z(i));
		else
			felem_assign(tmp_felem(i), tmp_felem(i-1));
		}
	/* Now each tmp_felem(i) is the product of Z(0) .. Z(i), skipping any zero-valued factors:
	 * if Z(i) = 0, we essentially pretend that Z(i) = 1 */

	/* the one and only field inversion */
	felem_inv(tmp_felem(num-1), tmp_felem(num-1));
	/* backward pass: recover each 1/Z(i) and convert the point */
	for (i = num - 1; i >= 0; i--)
		{
		if (i > 0)
			/* tmp_felem(i-1) is the product of Z(0) .. Z(i-1),
			 * tmp_felem(i) is the inverse of the product of Z(0) .. Z(i)
			 */
			felem_mul(tmp_felem(num), tmp_felem(i-1), tmp_felem(i)); /* 1/Z(i) */
		else
			felem_assign(tmp_felem(num), tmp_felem(0)); /* 1/Z(0) */

		if (!felem_is_zero(Z(i)))
			{
			if (i > 0)
				/* For next iteration, replace tmp_felem(i-1) by its inverse */
				felem_mul(tmp_felem(i-1), tmp_felem(i), Z(i));

			/* Convert point (X, Y, Z) into affine form (X/(Z^2), Y/(Z^3), 1) */
			felem_square(Z(i), tmp_felem(num)); /* 1/(Z^2) */
			felem_mul(X(i), X(i), Z(i)); /* X/(Z^2) */
			felem_mul(Z(i), Z(i), tmp_felem(num)); /* 1/(Z^3) */
			felem_mul(Y(i), Y(i), Z(i)); /* Y/(Z^3) */
			felem_contract(X(i), X(i));
			felem_contract(Y(i), Y(i));
			felem_one(Z(i));
			}
		else
			{
			if (i > 0)
				/* For next iteration, replace tmp_felem(i-1) by its inverse */
				felem_assign(tmp_felem(i-1), tmp_felem(i));
			}
		}
	}
109 | |||
110 | /* | ||
111 | * This function looks at 5+1 scalar bits (5 current, 1 adjacent less | ||
112 | * significant bit), and recodes them into a signed digit for use in fast point | ||
113 | * multiplication: the use of signed rather than unsigned digits means that | ||
114 | * fewer points need to be precomputed, given that point inversion is easy | ||
115 | * (a precomputed point dP makes -dP available as well). | ||
116 | * | ||
117 | * BACKGROUND: | ||
118 | * | ||
119 | * Signed digits for multiplication were introduced by Booth ("A signed binary | ||
120 | * multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, | ||
121 | * pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. | ||
122 | * Booth's original encoding did not generally improve the density of nonzero | ||
123 | * digits over the binary representation, and was merely meant to simplify the | ||
124 | * handling of signed factors given in two's complement; but it has since been | ||
125 | * shown to be the basis of various signed-digit representations that do have | ||
126 | * further advantages, including the wNAF, using the following general approach: | ||
127 | * | ||
128 | * (1) Given a binary representation | ||
129 | * | ||
130 | * b_k ... b_2 b_1 b_0, | ||
131 | * | ||
132 | * of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 | ||
133 | * by using bit-wise subtraction as follows: | ||
134 | * | ||
135 | * b_k b_(k-1) ... b_2 b_1 b_0 | ||
136 | * - b_k ... b_3 b_2 b_1 b_0 | ||
137 | * ------------------------------------- | ||
138 | * s_k b_(k-1) ... s_3 s_2 s_1 s_0 | ||
139 | * | ||
140 | * A left-shift followed by subtraction of the original value yields a new | ||
141 | * representation of the same value, using signed bits s_i = b_(i+1) - b_i. | ||
142 | * This representation from Booth's paper has since appeared in the | ||
143 | * literature under a variety of different names including "reversed binary | ||
144 | * form", "alternating greedy expansion", "mutual opposite form", and | ||
145 | * "sign-alternating {+-1}-representation". | ||
146 | * | ||
147 | * An interesting property is that among the nonzero bits, values 1 and -1 | ||
148 | * strictly alternate. | ||
149 | * | ||
150 | * (2) Various window schemes can be applied to the Booth representation of | ||
151 | * integers: for example, right-to-left sliding windows yield the wNAF | ||
152 | * (a signed-digit encoding independently discovered by various researchers | ||
153 | * in the 1990s), and left-to-right sliding windows yield a left-to-right | ||
154 | * equivalent of the wNAF (independently discovered by various researchers | ||
155 | * around 2004). | ||
156 | * | ||
157 | * To prevent leaking information through side channels in point multiplication, | ||
158 | * we need to recode the given integer into a regular pattern: sliding windows | ||
159 | * as in wNAFs won't do, we need their fixed-window equivalent -- which is a few | ||
160 | * decades older: we'll be using the so-called "modified Booth encoding" due to | ||
161 | * MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 | ||
162 | * (1961), pp. 67-91), in a radix-2^5 setting. That is, we always combine five | ||
163 | * signed bits into a signed digit: | ||
164 | * | ||
165 | * s_(4j + 4) s_(4j + 3) s_(4j + 2) s_(4j + 1) s_(4j) | ||
166 | * | ||
167 | * The sign-alternating property implies that the resulting digit values are | ||
168 | * integers from -16 to 16. | ||
169 | * | ||
170 | * Of course, we don't actually need to compute the signed digits s_i as an | ||
171 | * intermediate step (that's just a nice way to see how this scheme relates | ||
172 | * to the wNAF): a direct computation obtains the recoded digit from the | ||
173 | * six bits b_(4j + 4) ... b_(4j - 1). | ||
174 | * | ||
 * This function takes those six bits as an integer (0 .. 63), writing the
 * recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute
 * value, in the range 0 .. 16). Note that this integer essentially provides the
178 | * input bits "shifted to the left" by one position: for example, the input to | ||
179 | * compute the least significant recoded digit, given that there's no bit b_-1, | ||
180 | * has to be b_4 b_3 b_2 b_1 b_0 0. | ||
181 | * | ||
182 | */ | ||
/* Recode a 6-bit window value (the five current scalar bits shifted left by
 * one, with the adjacent less significant bit in the low position) into a
 * signed digit for fixed-window point multiplication: *sign is 0 for
 * positive / 1 for negative, *digit is the magnitude.  Branch-free, so it
 * runs in constant time regardless of the secret input. */
void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in)
	{
	unsigned char neg_mask, mag;

	/* all-ones when the top (6th) bit of 'in' is set, all-zeros otherwise */
	neg_mask = ~((in >> 5) - 1);
	/* magnitude candidate for the negated case: 2^6 - 1 - in */
	mag = (1 << 6) - in - 1;
	/* constant-time select between the negated form and 'in' itself */
	mag = (mag & neg_mask) | (in & ~neg_mask);
	/* halve, folding the borrowed low bit back in (rounds up) */
	mag = (mag >> 1) + (mag & 1);

	*digit = mag;
	*sign = neg_mask & 1;
	}
195 | #else | ||
196 | static void *dummy=&dummy; | ||
197 | #endif | ||
diff --git a/src/lib/libcrypto/ec/ecp_oct.c b/src/lib/libcrypto/ec/ecp_oct.c new file mode 100644 index 0000000000..374a0ee731 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_oct.c | |||
@@ -0,0 +1,433 @@ | |||
1 | /* crypto/ec/ecp_oct.c */ | ||
2 | /* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de> | ||
3 | * for the OpenSSL project. | ||
4 | * Includes code written by Bodo Moeller for the OpenSSL project. | ||
5 | */ | ||
6 | /* ==================================================================== | ||
7 | * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. | ||
8 | * | ||
9 | * Redistribution and use in source and binary forms, with or without | ||
10 | * modification, are permitted provided that the following conditions | ||
11 | * are met: | ||
12 | * | ||
13 | * 1. Redistributions of source code must retain the above copyright | ||
14 | * notice, this list of conditions and the following disclaimer. | ||
15 | * | ||
16 | * 2. Redistributions in binary form must reproduce the above copyright | ||
17 | * notice, this list of conditions and the following disclaimer in | ||
18 | * the documentation and/or other materials provided with the | ||
19 | * distribution. | ||
20 | * | ||
21 | * 3. All advertising materials mentioning features or use of this | ||
22 | * software must display the following acknowledgment: | ||
23 | * "This product includes software developed by the OpenSSL Project | ||
24 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
25 | * | ||
26 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
27 | * endorse or promote products derived from this software without | ||
28 | * prior written permission. For written permission, please contact | ||
29 | * openssl-core@openssl.org. | ||
30 | * | ||
31 | * 5. Products derived from this software may not be called "OpenSSL" | ||
32 | * nor may "OpenSSL" appear in their names without prior written | ||
33 | * permission of the OpenSSL Project. | ||
34 | * | ||
35 | * 6. Redistributions of any form whatsoever must retain the following | ||
36 | * acknowledgment: | ||
37 | * "This product includes software developed by the OpenSSL Project | ||
38 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
39 | * | ||
40 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
41 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
42 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
43 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
44 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
45 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
46 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
47 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
49 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
50 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
51 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
52 | * ==================================================================== | ||
53 | * | ||
54 | * This product includes cryptographic software written by Eric Young | ||
55 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
56 | * Hudson (tjh@cryptsoft.com). | ||
57 | * | ||
58 | */ | ||
59 | /* ==================================================================== | ||
60 | * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. | ||
61 | * Portions of this software developed by SUN MICROSYSTEMS, INC., | ||
62 | * and contributed to the OpenSSL project. | ||
63 | */ | ||
64 | |||
65 | #include <openssl/err.h> | ||
66 | #include <openssl/symhacks.h> | ||
67 | |||
68 | #include "ec_lcl.h" | ||
69 | |||
70 | int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point, | ||
71 | const BIGNUM *x_, int y_bit, BN_CTX *ctx) | ||
72 | { | ||
73 | BN_CTX *new_ctx = NULL; | ||
74 | BIGNUM *tmp1, *tmp2, *x, *y; | ||
75 | int ret = 0; | ||
76 | |||
77 | /* clear error queue*/ | ||
78 | ERR_clear_error(); | ||
79 | |||
80 | if (ctx == NULL) | ||
81 | { | ||
82 | ctx = new_ctx = BN_CTX_new(); | ||
83 | if (ctx == NULL) | ||
84 | return 0; | ||
85 | } | ||
86 | |||
87 | y_bit = (y_bit != 0); | ||
88 | |||
89 | BN_CTX_start(ctx); | ||
90 | tmp1 = BN_CTX_get(ctx); | ||
91 | tmp2 = BN_CTX_get(ctx); | ||
92 | x = BN_CTX_get(ctx); | ||
93 | y = BN_CTX_get(ctx); | ||
94 | if (y == NULL) goto err; | ||
95 | |||
96 | /* Recover y. We have a Weierstrass equation | ||
97 | * y^2 = x^3 + a*x + b, | ||
98 | * so y is one of the square roots of x^3 + a*x + b. | ||
99 | */ | ||
100 | |||
101 | /* tmp1 := x^3 */ | ||
102 | if (!BN_nnmod(x, x_, &group->field,ctx)) goto err; | ||
103 | if (group->meth->field_decode == 0) | ||
104 | { | ||
105 | /* field_{sqr,mul} work on standard representation */ | ||
106 | if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err; | ||
107 | if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err; | ||
108 | } | ||
109 | else | ||
110 | { | ||
111 | if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err; | ||
112 | if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err; | ||
113 | } | ||
114 | |||
115 | /* tmp1 := tmp1 + a*x */ | ||
116 | if (group->a_is_minus3) | ||
117 | { | ||
118 | if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err; | ||
119 | if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err; | ||
120 | if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err; | ||
121 | } | ||
122 | else | ||
123 | { | ||
124 | if (group->meth->field_decode) | ||
125 | { | ||
126 | if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err; | ||
127 | if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err; | ||
128 | } | ||
129 | else | ||
130 | { | ||
131 | /* field_mul works on standard representation */ | ||
132 | if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err; | ||
133 | } | ||
134 | |||
135 | if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err; | ||
136 | } | ||
137 | |||
138 | /* tmp1 := tmp1 + b */ | ||
139 | if (group->meth->field_decode) | ||
140 | { | ||
141 | if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err; | ||
142 | if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err; | ||
143 | } | ||
144 | else | ||
145 | { | ||
146 | if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err; | ||
147 | } | ||
148 | |||
149 | if (!BN_mod_sqrt(y, tmp1, &group->field, ctx)) | ||
150 | { | ||
151 | unsigned long err = ERR_peek_last_error(); | ||
152 | |||
153 | if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE) | ||
154 | { | ||
155 | ERR_clear_error(); | ||
156 | ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); | ||
157 | } | ||
158 | else | ||
159 | ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB); | ||
160 | goto err; | ||
161 | } | ||
162 | |||
163 | if (y_bit != BN_is_odd(y)) | ||
164 | { | ||
165 | if (BN_is_zero(y)) | ||
166 | { | ||
167 | int kron; | ||
168 | |||
169 | kron = BN_kronecker(x, &group->field, ctx); | ||
170 | if (kron == -2) goto err; | ||
171 | |||
172 | if (kron == 1) | ||
173 | ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT); | ||
174 | else | ||
175 | /* BN_mod_sqrt() should have cought this error (not a square) */ | ||
176 | ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); | ||
177 | goto err; | ||
178 | } | ||
179 | if (!BN_usub(y, &group->field, y)) goto err; | ||
180 | } | ||
181 | if (y_bit != BN_is_odd(y)) | ||
182 | { | ||
183 | ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR); | ||
184 | goto err; | ||
185 | } | ||
186 | |||
187 | if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; | ||
188 | |||
189 | ret = 1; | ||
190 | |||
191 | err: | ||
192 | BN_CTX_end(ctx); | ||
193 | if (new_ctx != NULL) | ||
194 | BN_CTX_free(new_ctx); | ||
195 | return ret; | ||
196 | } | ||
197 | |||
198 | |||
199 | size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, | ||
200 | unsigned char *buf, size_t len, BN_CTX *ctx) | ||
201 | { | ||
202 | size_t ret; | ||
203 | BN_CTX *new_ctx = NULL; | ||
204 | int used_ctx = 0; | ||
205 | BIGNUM *x, *y; | ||
206 | size_t field_len, i, skip; | ||
207 | |||
208 | if ((form != POINT_CONVERSION_COMPRESSED) | ||
209 | && (form != POINT_CONVERSION_UNCOMPRESSED) | ||
210 | && (form != POINT_CONVERSION_HYBRID)) | ||
211 | { | ||
212 | ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM); | ||
213 | goto err; | ||
214 | } | ||
215 | |||
216 | if (EC_POINT_is_at_infinity(group, point)) | ||
217 | { | ||
218 | /* encodes to a single 0 octet */ | ||
219 | if (buf != NULL) | ||
220 | { | ||
221 | if (len < 1) | ||
222 | { | ||
223 | ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); | ||
224 | return 0; | ||
225 | } | ||
226 | buf[0] = 0; | ||
227 | } | ||
228 | return 1; | ||
229 | } | ||
230 | |||
231 | |||
232 | /* ret := required output buffer length */ | ||
233 | field_len = BN_num_bytes(&group->field); | ||
234 | ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; | ||
235 | |||
236 | /* if 'buf' is NULL, just return required length */ | ||
237 | if (buf != NULL) | ||
238 | { | ||
239 | if (len < ret) | ||
240 | { | ||
241 | ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); | ||
242 | goto err; | ||
243 | } | ||
244 | |||
245 | if (ctx == NULL) | ||
246 | { | ||
247 | ctx = new_ctx = BN_CTX_new(); | ||
248 | if (ctx == NULL) | ||
249 | return 0; | ||
250 | } | ||
251 | |||
252 | BN_CTX_start(ctx); | ||
253 | used_ctx = 1; | ||
254 | x = BN_CTX_get(ctx); | ||
255 | y = BN_CTX_get(ctx); | ||
256 | if (y == NULL) goto err; | ||
257 | |||
258 | if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; | ||
259 | |||
260 | if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y)) | ||
261 | buf[0] = form + 1; | ||
262 | else | ||
263 | buf[0] = form; | ||
264 | |||
265 | i = 1; | ||
266 | |||
267 | skip = field_len - BN_num_bytes(x); | ||
268 | if (skip > field_len) | ||
269 | { | ||
270 | ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
271 | goto err; | ||
272 | } | ||
273 | while (skip > 0) | ||
274 | { | ||
275 | buf[i++] = 0; | ||
276 | skip--; | ||
277 | } | ||
278 | skip = BN_bn2bin(x, buf + i); | ||
279 | i += skip; | ||
280 | if (i != 1 + field_len) | ||
281 | { | ||
282 | ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
283 | goto err; | ||
284 | } | ||
285 | |||
286 | if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID) | ||
287 | { | ||
288 | skip = field_len - BN_num_bytes(y); | ||
289 | if (skip > field_len) | ||
290 | { | ||
291 | ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
292 | goto err; | ||
293 | } | ||
294 | while (skip > 0) | ||
295 | { | ||
296 | buf[i++] = 0; | ||
297 | skip--; | ||
298 | } | ||
299 | skip = BN_bn2bin(y, buf + i); | ||
300 | i += skip; | ||
301 | } | ||
302 | |||
303 | if (i != ret) | ||
304 | { | ||
305 | ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); | ||
306 | goto err; | ||
307 | } | ||
308 | } | ||
309 | |||
310 | if (used_ctx) | ||
311 | BN_CTX_end(ctx); | ||
312 | if (new_ctx != NULL) | ||
313 | BN_CTX_free(new_ctx); | ||
314 | return ret; | ||
315 | |||
316 | err: | ||
317 | if (used_ctx) | ||
318 | BN_CTX_end(ctx); | ||
319 | if (new_ctx != NULL) | ||
320 | BN_CTX_free(new_ctx); | ||
321 | return 0; | ||
322 | } | ||
323 | |||
324 | |||
325 | int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point, | ||
326 | const unsigned char *buf, size_t len, BN_CTX *ctx) | ||
327 | { | ||
328 | point_conversion_form_t form; | ||
329 | int y_bit; | ||
330 | BN_CTX *new_ctx = NULL; | ||
331 | BIGNUM *x, *y; | ||
332 | size_t field_len, enc_len; | ||
333 | int ret = 0; | ||
334 | |||
335 | if (len == 0) | ||
336 | { | ||
337 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); | ||
338 | return 0; | ||
339 | } | ||
340 | form = buf[0]; | ||
341 | y_bit = form & 1; | ||
342 | form = form & ~1U; | ||
343 | if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) | ||
344 | && (form != POINT_CONVERSION_UNCOMPRESSED) | ||
345 | && (form != POINT_CONVERSION_HYBRID)) | ||
346 | { | ||
347 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
348 | return 0; | ||
349 | } | ||
350 | if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) | ||
351 | { | ||
352 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
353 | return 0; | ||
354 | } | ||
355 | |||
356 | if (form == 0) | ||
357 | { | ||
358 | if (len != 1) | ||
359 | { | ||
360 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
361 | return 0; | ||
362 | } | ||
363 | |||
364 | return EC_POINT_set_to_infinity(group, point); | ||
365 | } | ||
366 | |||
367 | field_len = BN_num_bytes(&group->field); | ||
368 | enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; | ||
369 | |||
370 | if (len != enc_len) | ||
371 | { | ||
372 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
373 | return 0; | ||
374 | } | ||
375 | |||
376 | if (ctx == NULL) | ||
377 | { | ||
378 | ctx = new_ctx = BN_CTX_new(); | ||
379 | if (ctx == NULL) | ||
380 | return 0; | ||
381 | } | ||
382 | |||
383 | BN_CTX_start(ctx); | ||
384 | x = BN_CTX_get(ctx); | ||
385 | y = BN_CTX_get(ctx); | ||
386 | if (y == NULL) goto err; | ||
387 | |||
388 | if (!BN_bin2bn(buf + 1, field_len, x)) goto err; | ||
389 | if (BN_ucmp(x, &group->field) >= 0) | ||
390 | { | ||
391 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
392 | goto err; | ||
393 | } | ||
394 | |||
395 | if (form == POINT_CONVERSION_COMPRESSED) | ||
396 | { | ||
397 | if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err; | ||
398 | } | ||
399 | else | ||
400 | { | ||
401 | if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; | ||
402 | if (BN_ucmp(y, &group->field) >= 0) | ||
403 | { | ||
404 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
405 | goto err; | ||
406 | } | ||
407 | if (form == POINT_CONVERSION_HYBRID) | ||
408 | { | ||
409 | if (y_bit != BN_is_odd(y)) | ||
410 | { | ||
411 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); | ||
412 | goto err; | ||
413 | } | ||
414 | } | ||
415 | |||
416 | if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; | ||
417 | } | ||
418 | |||
419 | if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ | ||
420 | { | ||
421 | ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); | ||
422 | goto err; | ||
423 | } | ||
424 | |||
425 | ret = 1; | ||
426 | |||
427 | err: | ||
428 | BN_CTX_end(ctx); | ||
429 | if (new_ctx != NULL) | ||
430 | BN_CTX_free(new_ctx); | ||
431 | return ret; | ||
432 | } | ||
433 | |||
diff --git a/src/lib/libcrypto/ecdh/ecdh.h b/src/lib/libcrypto/ecdh/ecdh.h index b4b58ee65b..8887102c0b 100644 --- a/src/lib/libcrypto/ecdh/ecdh.h +++ b/src/lib/libcrypto/ecdh/ecdh.h | |||
@@ -109,11 +109,13 @@ void ERR_load_ECDH_strings(void); | |||
109 | /* Error codes for the ECDH functions. */ | 109 | /* Error codes for the ECDH functions. */ |
110 | 110 | ||
111 | /* Function codes. */ | 111 | /* Function codes. */ |
112 | #define ECDH_F_ECDH_CHECK 102 | ||
112 | #define ECDH_F_ECDH_COMPUTE_KEY 100 | 113 | #define ECDH_F_ECDH_COMPUTE_KEY 100 |
113 | #define ECDH_F_ECDH_DATA_NEW_METHOD 101 | 114 | #define ECDH_F_ECDH_DATA_NEW_METHOD 101 |
114 | 115 | ||
115 | /* Reason codes. */ | 116 | /* Reason codes. */ |
116 | #define ECDH_R_KDF_FAILED 102 | 117 | #define ECDH_R_KDF_FAILED 102 |
118 | #define ECDH_R_NON_FIPS_METHOD 103 | ||
117 | #define ECDH_R_NO_PRIVATE_VALUE 100 | 119 | #define ECDH_R_NO_PRIVATE_VALUE 100 |
118 | #define ECDH_R_POINT_ARITHMETIC_FAILURE 101 | 120 | #define ECDH_R_POINT_ARITHMETIC_FAILURE 101 |
119 | 121 | ||
diff --git a/src/lib/libcrypto/ecdh/ech_err.c b/src/lib/libcrypto/ecdh/ech_err.c index 6f4b0c9953..3bd247398d 100644 --- a/src/lib/libcrypto/ecdh/ech_err.c +++ b/src/lib/libcrypto/ecdh/ech_err.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* crypto/ecdh/ech_err.c */ | 1 | /* crypto/ecdh/ech_err.c */ |
2 | /* ==================================================================== | 2 | /* ==================================================================== |
3 | * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. |
4 | * | 4 | * |
5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
@@ -70,6 +70,7 @@ | |||
70 | 70 | ||
71 | static ERR_STRING_DATA ECDH_str_functs[]= | 71 | static ERR_STRING_DATA ECDH_str_functs[]= |
72 | { | 72 | { |
73 | {ERR_FUNC(ECDH_F_ECDH_CHECK), "ECDH_CHECK"}, | ||
73 | {ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"}, | 74 | {ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"}, |
74 | {ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"}, | 75 | {ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"}, |
75 | {0,NULL} | 76 | {0,NULL} |
@@ -78,6 +79,7 @@ static ERR_STRING_DATA ECDH_str_functs[]= | |||
78 | static ERR_STRING_DATA ECDH_str_reasons[]= | 79 | static ERR_STRING_DATA ECDH_str_reasons[]= |
79 | { | 80 | { |
80 | {ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"}, | 81 | {ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"}, |
82 | {ERR_REASON(ECDH_R_NON_FIPS_METHOD) ,"non fips method"}, | ||
81 | {ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"}, | 83 | {ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"}, |
82 | {ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"}, | 84 | {ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"}, |
83 | {0,NULL} | 85 | {0,NULL} |
diff --git a/src/lib/libcrypto/ecdh/ech_lib.c b/src/lib/libcrypto/ecdh/ech_lib.c index 4d8ea03d3d..dadbfd3c49 100644 --- a/src/lib/libcrypto/ecdh/ech_lib.c +++ b/src/lib/libcrypto/ecdh/ech_lib.c | |||
@@ -73,6 +73,9 @@ | |||
73 | #include <openssl/engine.h> | 73 | #include <openssl/engine.h> |
74 | #endif | 74 | #endif |
75 | #include <openssl/err.h> | 75 | #include <openssl/err.h> |
76 | #ifdef OPENSSL_FIPS | ||
77 | #include <openssl/fips.h> | ||
78 | #endif | ||
76 | 79 | ||
77 | const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT; | 80 | const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT; |
78 | 81 | ||
@@ -90,7 +93,16 @@ void ECDH_set_default_method(const ECDH_METHOD *meth) | |||
90 | const ECDH_METHOD *ECDH_get_default_method(void) | 93 | const ECDH_METHOD *ECDH_get_default_method(void) |
91 | { | 94 | { |
92 | if(!default_ECDH_method) | 95 | if(!default_ECDH_method) |
96 | { | ||
97 | #ifdef OPENSSL_FIPS | ||
98 | if (FIPS_mode()) | ||
99 | return FIPS_ecdh_openssl(); | ||
100 | else | ||
101 | return ECDH_OpenSSL(); | ||
102 | #else | ||
93 | default_ECDH_method = ECDH_OpenSSL(); | 103 | default_ECDH_method = ECDH_OpenSSL(); |
104 | #endif | ||
105 | } | ||
94 | return default_ECDH_method; | 106 | return default_ECDH_method; |
95 | } | 107 | } |
96 | 108 | ||
@@ -215,6 +227,14 @@ ECDH_DATA *ecdh_check(EC_KEY *key) | |||
215 | } | 227 | } |
216 | else | 228 | else |
217 | ecdh_data = (ECDH_DATA *)data; | 229 | ecdh_data = (ECDH_DATA *)data; |
230 | #ifdef OPENSSL_FIPS | ||
231 | if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD) | ||
232 | && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW)) | ||
233 | { | ||
234 | ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD); | ||
235 | return NULL; | ||
236 | } | ||
237 | #endif | ||
218 | 238 | ||
219 | 239 | ||
220 | return ecdh_data; | 240 | return ecdh_data; |
diff --git a/src/lib/libcrypto/ecdh/ech_locl.h b/src/lib/libcrypto/ecdh/ech_locl.h index f658526a7e..f6cad6a894 100644 --- a/src/lib/libcrypto/ecdh/ech_locl.h +++ b/src/lib/libcrypto/ecdh/ech_locl.h | |||
@@ -75,6 +75,14 @@ struct ecdh_method | |||
75 | char *app_data; | 75 | char *app_data; |
76 | }; | 76 | }; |
77 | 77 | ||
78 | /* If this flag is set the ECDH method is FIPS compliant and can be used | ||
79 | * in FIPS mode. This is set in the validated module method. If an | ||
80 | * application sets this flag in its own methods it is its responsibility | ||
81 | * to ensure the result is compliant. | ||
82 | */ | ||
83 | |||
84 | #define ECDH_FLAG_FIPS_METHOD 0x1 | ||
85 | |||
78 | typedef struct ecdh_data_st { | 86 | typedef struct ecdh_data_st { |
79 | /* EC_KEY_METH_DATA part */ | 87 | /* EC_KEY_METH_DATA part */ |
80 | int (*init)(EC_KEY *); | 88 | int (*init)(EC_KEY *); |
diff --git a/src/lib/libcrypto/ecdsa/ecdsa.h b/src/lib/libcrypto/ecdsa/ecdsa.h index e61c539812..7fb5254b62 100644 --- a/src/lib/libcrypto/ecdsa/ecdsa.h +++ b/src/lib/libcrypto/ecdsa/ecdsa.h | |||
@@ -238,6 +238,7 @@ void ERR_load_ECDSA_strings(void); | |||
238 | /* Error codes for the ECDSA functions. */ | 238 | /* Error codes for the ECDSA functions. */ |
239 | 239 | ||
240 | /* Function codes. */ | 240 | /* Function codes. */ |
241 | #define ECDSA_F_ECDSA_CHECK 104 | ||
241 | #define ECDSA_F_ECDSA_DATA_NEW_METHOD 100 | 242 | #define ECDSA_F_ECDSA_DATA_NEW_METHOD 100 |
242 | #define ECDSA_F_ECDSA_DO_SIGN 101 | 243 | #define ECDSA_F_ECDSA_DO_SIGN 101 |
243 | #define ECDSA_F_ECDSA_DO_VERIFY 102 | 244 | #define ECDSA_F_ECDSA_DO_VERIFY 102 |
@@ -249,6 +250,7 @@ void ERR_load_ECDSA_strings(void); | |||
249 | #define ECDSA_R_ERR_EC_LIB 102 | 250 | #define ECDSA_R_ERR_EC_LIB 102 |
250 | #define ECDSA_R_MISSING_PARAMETERS 103 | 251 | #define ECDSA_R_MISSING_PARAMETERS 103 |
251 | #define ECDSA_R_NEED_NEW_SETUP_VALUES 106 | 252 | #define ECDSA_R_NEED_NEW_SETUP_VALUES 106 |
253 | #define ECDSA_R_NON_FIPS_METHOD 107 | ||
252 | #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104 | 254 | #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104 |
253 | #define ECDSA_R_SIGNATURE_MALLOC_FAILED 105 | 255 | #define ECDSA_R_SIGNATURE_MALLOC_FAILED 105 |
254 | 256 | ||
diff --git a/src/lib/libcrypto/ecdsa/ecs_err.c b/src/lib/libcrypto/ecdsa/ecs_err.c index 98e38d537f..81542e6d15 100644 --- a/src/lib/libcrypto/ecdsa/ecs_err.c +++ b/src/lib/libcrypto/ecdsa/ecs_err.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* crypto/ecdsa/ecs_err.c */ | 1 | /* crypto/ecdsa/ecs_err.c */ |
2 | /* ==================================================================== | 2 | /* ==================================================================== |
3 | * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. | 3 | * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. |
4 | * | 4 | * |
5 | * Redistribution and use in source and binary forms, with or without | 5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions | 6 | * modification, are permitted provided that the following conditions |
@@ -70,6 +70,7 @@ | |||
70 | 70 | ||
71 | static ERR_STRING_DATA ECDSA_str_functs[]= | 71 | static ERR_STRING_DATA ECDSA_str_functs[]= |
72 | { | 72 | { |
73 | {ERR_FUNC(ECDSA_F_ECDSA_CHECK), "ECDSA_CHECK"}, | ||
73 | {ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"}, | 74 | {ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"}, |
74 | {ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"}, | 75 | {ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"}, |
75 | {ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"}, | 76 | {ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"}, |
@@ -84,6 +85,7 @@ static ERR_STRING_DATA ECDSA_str_reasons[]= | |||
84 | {ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"}, | 85 | {ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"}, |
85 | {ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"}, | 86 | {ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"}, |
86 | {ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"}, | 87 | {ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"}, |
88 | {ERR_REASON(ECDSA_R_NON_FIPS_METHOD) ,"non fips method"}, | ||
87 | {ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"}, | 89 | {ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"}, |
88 | {ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"}, | 90 | {ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"}, |
89 | {0,NULL} | 91 | {0,NULL} |
diff --git a/src/lib/libcrypto/ecdsa/ecs_lib.c b/src/lib/libcrypto/ecdsa/ecs_lib.c index 2ebae3aa27..e477da430b 100644 --- a/src/lib/libcrypto/ecdsa/ecs_lib.c +++ b/src/lib/libcrypto/ecdsa/ecs_lib.c | |||
@@ -60,6 +60,9 @@ | |||
60 | #endif | 60 | #endif |
61 | #include <openssl/err.h> | 61 | #include <openssl/err.h> |
62 | #include <openssl/bn.h> | 62 | #include <openssl/bn.h> |
63 | #ifdef OPENSSL_FIPS | ||
64 | #include <openssl/fips.h> | ||
65 | #endif | ||
63 | 66 | ||
64 | const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT; | 67 | const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT; |
65 | 68 | ||
@@ -77,7 +80,16 @@ void ECDSA_set_default_method(const ECDSA_METHOD *meth) | |||
77 | const ECDSA_METHOD *ECDSA_get_default_method(void) | 80 | const ECDSA_METHOD *ECDSA_get_default_method(void) |
78 | { | 81 | { |
79 | if(!default_ECDSA_method) | 82 | if(!default_ECDSA_method) |
83 | { | ||
84 | #ifdef OPENSSL_FIPS | ||
85 | if (FIPS_mode()) | ||
86 | return FIPS_ecdsa_openssl(); | ||
87 | else | ||
88 | return ECDSA_OpenSSL(); | ||
89 | #else | ||
80 | default_ECDSA_method = ECDSA_OpenSSL(); | 90 | default_ECDSA_method = ECDSA_OpenSSL(); |
91 | #endif | ||
92 | } | ||
81 | return default_ECDSA_method; | 93 | return default_ECDSA_method; |
82 | } | 94 | } |
83 | 95 | ||
@@ -193,7 +205,14 @@ ECDSA_DATA *ecdsa_check(EC_KEY *key) | |||
193 | } | 205 | } |
194 | else | 206 | else |
195 | ecdsa_data = (ECDSA_DATA *)data; | 207 | ecdsa_data = (ECDSA_DATA *)data; |
196 | 208 | #ifdef OPENSSL_FIPS | |
209 | if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD) | ||
210 | && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW)) | ||
211 | { | ||
212 | ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD); | ||
213 | return NULL; | ||
214 | } | ||
215 | #endif | ||
197 | 216 | ||
198 | return ecdsa_data; | 217 | return ecdsa_data; |
199 | } | 218 | } |
diff --git a/src/lib/libcrypto/ecdsa/ecs_locl.h b/src/lib/libcrypto/ecdsa/ecs_locl.h index 3a69a840e2..cb3be13cfc 100644 --- a/src/lib/libcrypto/ecdsa/ecs_locl.h +++ b/src/lib/libcrypto/ecdsa/ecs_locl.h | |||
@@ -82,6 +82,14 @@ struct ecdsa_method | |||
82 | char *app_data; | 82 | char *app_data; |
83 | }; | 83 | }; |
84 | 84 | ||
85 | /* If this flag is set the ECDSA method is FIPS compliant and can be used | ||
86 | * in FIPS mode. This is set in the validated module method. If an | ||
87 | * application sets this flag in its own methods it is its responsibility | ||
88 | * to ensure the result is compliant. | ||
89 | */ | ||
90 | |||
91 | #define ECDSA_FLAG_FIPS_METHOD 0x1 | ||
92 | |||
85 | typedef struct ecdsa_data_st { | 93 | typedef struct ecdsa_data_st { |
86 | /* EC_KEY_METH_DATA part */ | 94 | /* EC_KEY_METH_DATA part */ |
87 | int (*init)(EC_KEY *); | 95 | int (*init)(EC_KEY *); |
diff --git a/src/lib/libcrypto/ecdsa/ecs_ossl.c b/src/lib/libcrypto/ecdsa/ecs_ossl.c index 1bbf328de5..7725935610 100644 --- a/src/lib/libcrypto/ecdsa/ecs_ossl.c +++ b/src/lib/libcrypto/ecdsa/ecs_ossl.c | |||
@@ -167,6 +167,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, | |||
167 | goto err; | 167 | goto err; |
168 | } | 168 | } |
169 | } | 169 | } |
170 | #ifndef OPENSSL_NO_EC2M | ||
170 | else /* NID_X9_62_characteristic_two_field */ | 171 | else /* NID_X9_62_characteristic_two_field */ |
171 | { | 172 | { |
172 | if (!EC_POINT_get_affine_coordinates_GF2m(group, | 173 | if (!EC_POINT_get_affine_coordinates_GF2m(group, |
@@ -176,6 +177,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, | |||
176 | goto err; | 177 | goto err; |
177 | } | 178 | } |
178 | } | 179 | } |
180 | #endif | ||
179 | if (!BN_nnmod(r, X, order, ctx)) | 181 | if (!BN_nnmod(r, X, order, ctx)) |
180 | { | 182 | { |
181 | ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB); | 183 | ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB); |
@@ -454,6 +456,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len, | |||
454 | goto err; | 456 | goto err; |
455 | } | 457 | } |
456 | } | 458 | } |
459 | #ifndef OPENSSL_NO_EC2M | ||
457 | else /* NID_X9_62_characteristic_two_field */ | 460 | else /* NID_X9_62_characteristic_two_field */ |
458 | { | 461 | { |
459 | if (!EC_POINT_get_affine_coordinates_GF2m(group, | 462 | if (!EC_POINT_get_affine_coordinates_GF2m(group, |
@@ -463,7 +466,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len, | |||
463 | goto err; | 466 | goto err; |
464 | } | 467 | } |
465 | } | 468 | } |
466 | 469 | #endif | |
467 | if (!BN_nnmod(u1, X, order, ctx)) | 470 | if (!BN_nnmod(u1, X, order, ctx)) |
468 | { | 471 | { |
469 | ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB); | 472 | ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB); |
diff --git a/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c b/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c new file mode 100644 index 0000000000..710fb79baf --- /dev/null +++ b/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c | |||
@@ -0,0 +1,406 @@ | |||
1 | /* ==================================================================== | ||
2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
3 | * | ||
4 | * Redistribution and use in source and binary forms, with or without | ||
5 | * modification, are permitted provided that the following conditions | ||
6 | * are met: | ||
7 | * | ||
8 | * 1. Redistributions of source code must retain the above copyright | ||
9 | * notice, this list of conditions and the following disclaimer. | ||
10 | * | ||
11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
12 | * notice, this list of conditions and the following disclaimer in | ||
13 | * the documentation and/or other materials provided with the | ||
14 | * distribution. | ||
15 | * | ||
16 | * 3. All advertising materials mentioning features or use of this | ||
17 | * software must display the following acknowledgment: | ||
18 | * "This product includes software developed by the OpenSSL Project | ||
19 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
20 | * | ||
21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
22 | * endorse or promote products derived from this software without | ||
23 | * prior written permission. For written permission, please contact | ||
24 | * licensing@OpenSSL.org. | ||
25 | * | ||
26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
27 | * nor may "OpenSSL" appear in their names without prior written | ||
28 | * permission of the OpenSSL Project. | ||
29 | * | ||
30 | * 6. Redistributions of any form whatsoever must retain the following | ||
31 | * acknowledgment: | ||
32 | * "This product includes software developed by the OpenSSL Project | ||
33 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
34 | * | ||
35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
47 | * ==================================================================== | ||
48 | */ | ||
49 | |||
50 | #include <openssl/opensslconf.h> | ||
51 | |||
52 | #include <stdio.h> | ||
53 | #include <string.h> | ||
54 | |||
55 | #if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1) | ||
56 | |||
57 | #include <openssl/evp.h> | ||
58 | #include <openssl/objects.h> | ||
59 | #include <openssl/aes.h> | ||
60 | #include <openssl/sha.h> | ||
61 | #include "evp_locl.h" | ||
62 | |||
63 | #ifndef EVP_CIPH_FLAG_AEAD_CIPHER | ||
64 | #define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 | ||
65 | #define EVP_CTRL_AEAD_TLS1_AAD 0x16 | ||
66 | #define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 | ||
67 | #endif | ||
68 | |||
69 | #if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1) | ||
70 | #define EVP_CIPH_FLAG_DEFAULT_ASN1 0 | ||
71 | #endif | ||
72 | |||
73 | #define TLS1_1_VERSION 0x0302 | ||
74 | |||
75 | typedef struct | ||
76 | { | ||
77 | AES_KEY ks; | ||
78 | SHA_CTX head,tail,md; | ||
79 | size_t payload_length; /* AAD length in decrypt case */ | ||
80 | union { | ||
81 | unsigned int tls_ver; | ||
82 | unsigned char tls_aad[16]; /* 13 used */ | ||
83 | } aux; | ||
84 | } EVP_AES_HMAC_SHA1; | ||
85 | |||
86 | #define NO_PAYLOAD_LENGTH ((size_t)-1) | ||
87 | |||
88 | #if defined(AES_ASM) && ( \ | ||
89 | defined(__x86_64) || defined(__x86_64__) || \ | ||
90 | defined(_M_AMD64) || defined(_M_X64) || \ | ||
91 | defined(__INTEL__) ) | ||
92 | |||
93 | extern unsigned int OPENSSL_ia32cap_P[2]; | ||
94 | #define AESNI_CAPABLE (1<<(57-32)) | ||
95 | |||
96 | int aesni_set_encrypt_key(const unsigned char *userKey, int bits, | ||
97 | AES_KEY *key); | ||
98 | int aesni_set_decrypt_key(const unsigned char *userKey, int bits, | ||
99 | AES_KEY *key); | ||
100 | |||
101 | void aesni_cbc_encrypt(const unsigned char *in, | ||
102 | unsigned char *out, | ||
103 | size_t length, | ||
104 | const AES_KEY *key, | ||
105 | unsigned char *ivec, int enc); | ||
106 | |||
107 | void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks, | ||
108 | const AES_KEY *key, unsigned char iv[16], | ||
109 | SHA_CTX *ctx,const void *in0); | ||
110 | |||
111 | #define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data) | ||
112 | |||
113 | static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx, | ||
114 | const unsigned char *inkey, | ||
115 | const unsigned char *iv, int enc) | ||
116 | { | ||
117 | EVP_AES_HMAC_SHA1 *key = data(ctx); | ||
118 | int ret; | ||
119 | |||
120 | if (enc) | ||
121 | ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks); | ||
122 | else | ||
123 | ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks); | ||
124 | |||
125 | SHA1_Init(&key->head); /* handy when benchmarking */ | ||
126 | key->tail = key->head; | ||
127 | key->md = key->head; | ||
128 | |||
129 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
130 | |||
131 | return ret<0?0:1; | ||
132 | } | ||
133 | |||
134 | #define STITCHED_CALL | ||
135 | |||
136 | #if !defined(STITCHED_CALL) | ||
137 | #define aes_off 0 | ||
138 | #endif | ||
139 | |||
140 | void sha1_block_data_order (void *c,const void *p,size_t len); | ||
141 | |||
142 | static void sha1_update(SHA_CTX *c,const void *data,size_t len) | ||
143 | { const unsigned char *ptr = data; | ||
144 | size_t res; | ||
145 | |||
146 | if ((res = c->num)) { | ||
147 | res = SHA_CBLOCK-res; | ||
148 | if (len<res) res=len; | ||
149 | SHA1_Update (c,ptr,res); | ||
150 | ptr += res; | ||
151 | len -= res; | ||
152 | } | ||
153 | |||
154 | res = len % SHA_CBLOCK; | ||
155 | len -= res; | ||
156 | |||
157 | if (len) { | ||
158 | sha1_block_data_order(c,ptr,len/SHA_CBLOCK); | ||
159 | |||
160 | ptr += len; | ||
161 | c->Nh += len>>29; | ||
162 | c->Nl += len<<=3; | ||
163 | if (c->Nl<(unsigned int)len) c->Nh++; | ||
164 | } | ||
165 | |||
166 | if (res) | ||
167 | SHA1_Update(c,ptr,res); | ||
168 | } | ||
169 | |||
170 | #define SHA1_Update sha1_update | ||
171 | |||
172 | static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, | ||
173 | const unsigned char *in, size_t len) | ||
174 | { | ||
175 | EVP_AES_HMAC_SHA1 *key = data(ctx); | ||
176 | unsigned int l; | ||
177 | size_t plen = key->payload_length, | ||
178 | iv = 0, /* explicit IV in TLS 1.1 and later */ | ||
179 | sha_off = 0; | ||
180 | #if defined(STITCHED_CALL) | ||
181 | size_t aes_off = 0, | ||
182 | blocks; | ||
183 | |||
184 | sha_off = SHA_CBLOCK-key->md.num; | ||
185 | #endif | ||
186 | |||
187 | if (len%AES_BLOCK_SIZE) return 0; | ||
188 | |||
189 | if (ctx->encrypt) { | ||
190 | if (plen==NO_PAYLOAD_LENGTH) | ||
191 | plen = len; | ||
192 | else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)) | ||
193 | return 0; | ||
194 | else if (key->aux.tls_ver >= TLS1_1_VERSION) | ||
195 | iv = AES_BLOCK_SIZE; | ||
196 | |||
197 | #if defined(STITCHED_CALL) | ||
198 | if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) { | ||
199 | SHA1_Update(&key->md,in+iv,sha_off); | ||
200 | |||
201 | aesni_cbc_sha1_enc(in,out,blocks,&key->ks, | ||
202 | ctx->iv,&key->md,in+iv+sha_off); | ||
203 | blocks *= SHA_CBLOCK; | ||
204 | aes_off += blocks; | ||
205 | sha_off += blocks; | ||
206 | key->md.Nh += blocks>>29; | ||
207 | key->md.Nl += blocks<<=3; | ||
208 | if (key->md.Nl<(unsigned int)blocks) key->md.Nh++; | ||
209 | } else { | ||
210 | sha_off = 0; | ||
211 | } | ||
212 | #endif | ||
213 | sha_off += iv; | ||
214 | SHA1_Update(&key->md,in+sha_off,plen-sha_off); | ||
215 | |||
216 | if (plen!=len) { /* "TLS" mode of operation */ | ||
217 | if (in!=out) | ||
218 | memcpy(out+aes_off,in+aes_off,plen-aes_off); | ||
219 | |||
220 | /* calculate HMAC and append it to payload */ | ||
221 | SHA1_Final(out+plen,&key->md); | ||
222 | key->md = key->tail; | ||
223 | SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH); | ||
224 | SHA1_Final(out+plen,&key->md); | ||
225 | |||
226 | /* pad the payload|hmac */ | ||
227 | plen += SHA_DIGEST_LENGTH; | ||
228 | for (l=len-plen-1;plen<len;plen++) out[plen]=l; | ||
229 | /* encrypt HMAC|padding at once */ | ||
230 | aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off, | ||
231 | &key->ks,ctx->iv,1); | ||
232 | } else { | ||
233 | aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off, | ||
234 | &key->ks,ctx->iv,1); | ||
235 | } | ||
236 | } else { | ||
237 | unsigned char mac[SHA_DIGEST_LENGTH]; | ||
238 | |||
239 | /* decrypt HMAC|padding at once */ | ||
240 | aesni_cbc_encrypt(in,out,len, | ||
241 | &key->ks,ctx->iv,0); | ||
242 | |||
243 | if (plen) { /* "TLS" mode of operation */ | ||
244 | /* figure out payload length */ | ||
245 | if (len<(size_t)(out[len-1]+1+SHA_DIGEST_LENGTH)) | ||
246 | return 0; | ||
247 | |||
248 | len -= (out[len-1]+1+SHA_DIGEST_LENGTH); | ||
249 | |||
250 | if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3]) | ||
251 | >= TLS1_1_VERSION) { | ||
252 | len -= AES_BLOCK_SIZE; | ||
253 | iv = AES_BLOCK_SIZE; | ||
254 | } | ||
255 | |||
256 | key->aux.tls_aad[plen-2] = len>>8; | ||
257 | key->aux.tls_aad[plen-1] = len; | ||
258 | |||
259 | /* calculate HMAC and verify it */ | ||
260 | key->md = key->head; | ||
261 | SHA1_Update(&key->md,key->aux.tls_aad,plen); | ||
262 | SHA1_Update(&key->md,out+iv,len); | ||
263 | SHA1_Final(mac,&key->md); | ||
264 | |||
265 | key->md = key->tail; | ||
266 | SHA1_Update(&key->md,mac,SHA_DIGEST_LENGTH); | ||
267 | SHA1_Final(mac,&key->md); | ||
268 | |||
269 | if (memcmp(out+iv+len,mac,SHA_DIGEST_LENGTH)) | ||
270 | return 0; | ||
271 | } else { | ||
272 | SHA1_Update(&key->md,out,len); | ||
273 | } | ||
274 | } | ||
275 | |||
276 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
277 | |||
278 | return 1; | ||
279 | } | ||
280 | |||
281 | static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) | ||
282 | { | ||
283 | EVP_AES_HMAC_SHA1 *key = data(ctx); | ||
284 | |||
285 | switch (type) | ||
286 | { | ||
287 | case EVP_CTRL_AEAD_SET_MAC_KEY: | ||
288 | { | ||
289 | unsigned int i; | ||
290 | unsigned char hmac_key[64]; | ||
291 | |||
292 | memset (hmac_key,0,sizeof(hmac_key)); | ||
293 | |||
294 | if (arg > (int)sizeof(hmac_key)) { | ||
295 | SHA1_Init(&key->head); | ||
296 | SHA1_Update(&key->head,ptr,arg); | ||
297 | SHA1_Final(hmac_key,&key->head); | ||
298 | } else { | ||
299 | memcpy(hmac_key,ptr,arg); | ||
300 | } | ||
301 | |||
302 | for (i=0;i<sizeof(hmac_key);i++) | ||
303 | hmac_key[i] ^= 0x36; /* ipad */ | ||
304 | SHA1_Init(&key->head); | ||
305 | SHA1_Update(&key->head,hmac_key,sizeof(hmac_key)); | ||
306 | |||
307 | for (i=0;i<sizeof(hmac_key);i++) | ||
308 | hmac_key[i] ^= 0x36^0x5c; /* opad */ | ||
309 | SHA1_Init(&key->tail); | ||
310 | SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key)); | ||
311 | |||
312 | return 1; | ||
313 | } | ||
314 | case EVP_CTRL_AEAD_TLS1_AAD: | ||
315 | { | ||
316 | unsigned char *p=ptr; | ||
317 | unsigned int len=p[arg-2]<<8|p[arg-1]; | ||
318 | |||
319 | if (ctx->encrypt) | ||
320 | { | ||
321 | key->payload_length = len; | ||
322 | if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) { | ||
323 | len -= AES_BLOCK_SIZE; | ||
324 | p[arg-2] = len>>8; | ||
325 | p[arg-1] = len; | ||
326 | } | ||
327 | key->md = key->head; | ||
328 | SHA1_Update(&key->md,p,arg); | ||
329 | |||
330 | return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE) | ||
331 | - len); | ||
332 | } | ||
333 | else | ||
334 | { | ||
335 | if (arg>13) arg = 13; | ||
336 | memcpy(key->aux.tls_aad,ptr,arg); | ||
337 | key->payload_length = arg; | ||
338 | |||
339 | return SHA_DIGEST_LENGTH; | ||
340 | } | ||
341 | } | ||
342 | default: | ||
343 | return -1; | ||
344 | } | ||
345 | } | ||
346 | |||
347 | static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher = | ||
348 | { | ||
349 | #ifdef NID_aes_128_cbc_hmac_sha1 | ||
350 | NID_aes_128_cbc_hmac_sha1, | ||
351 | #else | ||
352 | NID_undef, | ||
353 | #endif | ||
354 | 16,16,16, | ||
355 | EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER, | ||
356 | aesni_cbc_hmac_sha1_init_key, | ||
357 | aesni_cbc_hmac_sha1_cipher, | ||
358 | NULL, | ||
359 | sizeof(EVP_AES_HMAC_SHA1), | ||
360 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv, | ||
361 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv, | ||
362 | aesni_cbc_hmac_sha1_ctrl, | ||
363 | NULL | ||
364 | }; | ||
365 | |||
366 | static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher = | ||
367 | { | ||
368 | #ifdef NID_aes_256_cbc_hmac_sha1 | ||
369 | NID_aes_256_cbc_hmac_sha1, | ||
370 | #else | ||
371 | NID_undef, | ||
372 | #endif | ||
373 | 16,32,16, | ||
374 | EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER, | ||
375 | aesni_cbc_hmac_sha1_init_key, | ||
376 | aesni_cbc_hmac_sha1_cipher, | ||
377 | NULL, | ||
378 | sizeof(EVP_AES_HMAC_SHA1), | ||
379 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv, | ||
380 | EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv, | ||
381 | aesni_cbc_hmac_sha1_ctrl, | ||
382 | NULL | ||
383 | }; | ||
384 | |||
385 | const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void) | ||
386 | { | ||
387 | return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE? | ||
388 | &aesni_128_cbc_hmac_sha1_cipher:NULL); | ||
389 | } | ||
390 | |||
391 | const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void) | ||
392 | { | ||
393 | return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE? | ||
394 | &aesni_256_cbc_hmac_sha1_cipher:NULL); | ||
395 | } | ||
396 | #else | ||
397 | const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void) | ||
398 | { | ||
399 | return NULL; | ||
400 | } | ||
401 | const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void) | ||
402 | { | ||
403 | return NULL; | ||
404 | } | ||
405 | #endif | ||
406 | #endif | ||
diff --git a/src/lib/libcrypto/evp/e_rc4_hmac_md5.c b/src/lib/libcrypto/evp/e_rc4_hmac_md5.c new file mode 100644 index 0000000000..56563191ba --- /dev/null +++ b/src/lib/libcrypto/evp/e_rc4_hmac_md5.c | |||
@@ -0,0 +1,298 @@ | |||
1 | /* ==================================================================== | ||
2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
3 | * | ||
4 | * Redistribution and use in source and binary forms, with or without | ||
5 | * modification, are permitted provided that the following conditions | ||
6 | * are met: | ||
7 | * | ||
8 | * 1. Redistributions of source code must retain the above copyright | ||
9 | * notice, this list of conditions and the following disclaimer. | ||
10 | * | ||
11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
12 | * notice, this list of conditions and the following disclaimer in | ||
13 | * the documentation and/or other materials provided with the | ||
14 | * distribution. | ||
15 | * | ||
16 | * 3. All advertising materials mentioning features or use of this | ||
17 | * software must display the following acknowledgment: | ||
18 | * "This product includes software developed by the OpenSSL Project | ||
19 | * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | ||
20 | * | ||
21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
22 | * endorse or promote products derived from this software without | ||
23 | * prior written permission. For written permission, please contact | ||
24 | * licensing@OpenSSL.org. | ||
25 | * | ||
26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
27 | * nor may "OpenSSL" appear in their names without prior written | ||
28 | * permission of the OpenSSL Project. | ||
29 | * | ||
30 | * 6. Redistributions of any form whatsoever must retain the following | ||
31 | * acknowledgment: | ||
32 | * "This product includes software developed by the OpenSSL Project | ||
33 | * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | ||
34 | * | ||
35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
47 | * ==================================================================== | ||
48 | */ | ||
49 | |||
50 | #include <openssl/opensslconf.h> | ||
51 | |||
52 | #include <stdio.h> | ||
53 | #include <string.h> | ||
54 | |||
55 | #if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5) | ||
56 | |||
57 | #include <openssl/evp.h> | ||
58 | #include <openssl/objects.h> | ||
59 | #include <openssl/rc4.h> | ||
60 | #include <openssl/md5.h> | ||
61 | |||
62 | #ifndef EVP_CIPH_FLAG_AEAD_CIPHER | ||
63 | #define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 | ||
64 | #define EVP_CTRL_AEAD_TLS1_AAD 0x16 | ||
65 | #define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 | ||
66 | #endif | ||
67 | |||
68 | /* FIXME: surely this is available elsewhere? */ | ||
69 | #define EVP_RC4_KEY_SIZE 16 | ||
70 | |||
71 | typedef struct | ||
72 | { | ||
73 | RC4_KEY ks; | ||
74 | MD5_CTX head,tail,md; | ||
75 | size_t payload_length; | ||
76 | } EVP_RC4_HMAC_MD5; | ||
77 | |||
78 | #define NO_PAYLOAD_LENGTH ((size_t)-1) | ||
79 | |||
80 | void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out, | ||
81 | MD5_CTX *ctx,const void *inp,size_t blocks); | ||
82 | |||
83 | #define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data) | ||
84 | |||
85 | static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx, | ||
86 | const unsigned char *inkey, | ||
87 | const unsigned char *iv, int enc) | ||
88 | { | ||
89 | EVP_RC4_HMAC_MD5 *key = data(ctx); | ||
90 | |||
91 | RC4_set_key(&key->ks,EVP_CIPHER_CTX_key_length(ctx), | ||
92 | inkey); | ||
93 | |||
94 | MD5_Init(&key->head); /* handy when benchmarking */ | ||
95 | key->tail = key->head; | ||
96 | key->md = key->head; | ||
97 | |||
98 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
99 | |||
100 | return 1; | ||
101 | } | ||
102 | |||
103 | #if !defined(OPENSSL_NO_ASM) && ( \ | ||
104 | defined(__x86_64) || defined(__x86_64__) || \ | ||
105 | defined(_M_AMD64) || defined(_M_X64) || \ | ||
106 | defined(__INTEL__) ) && \ | ||
107 | !(defined(__APPLE__) && defined(__MACH__)) | ||
108 | #define STITCHED_CALL | ||
109 | #endif | ||
110 | |||
111 | #if !defined(STITCHED_CALL) | ||
112 | #define rc4_off 0 | ||
113 | #define md5_off 0 | ||
114 | #endif | ||
115 | |||
116 | static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, | ||
117 | const unsigned char *in, size_t len) | ||
118 | { | ||
119 | EVP_RC4_HMAC_MD5 *key = data(ctx); | ||
120 | #if defined(STITCHED_CALL) | ||
121 | size_t rc4_off = 32-1-(key->ks.x&(32-1)), /* 32 is $MOD from rc4_md5-x86_64.pl */ | ||
122 | md5_off = MD5_CBLOCK-key->md.num, | ||
123 | blocks; | ||
124 | unsigned int l; | ||
125 | extern unsigned int OPENSSL_ia32cap_P[]; | ||
126 | #endif | ||
127 | size_t plen = key->payload_length; | ||
128 | |||
129 | if (plen!=NO_PAYLOAD_LENGTH && len!=(plen+MD5_DIGEST_LENGTH)) return 0; | ||
130 | |||
131 | if (ctx->encrypt) { | ||
132 | if (plen==NO_PAYLOAD_LENGTH) plen = len; | ||
133 | #if defined(STITCHED_CALL) | ||
134 | /* cipher has to "fall behind" */ | ||
135 | if (rc4_off>md5_off) md5_off+=MD5_CBLOCK; | ||
136 | |||
137 | if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK) && | ||
138 | (OPENSSL_ia32cap_P[0]&(1<<20))==0) { | ||
139 | MD5_Update(&key->md,in,md5_off); | ||
140 | RC4(&key->ks,rc4_off,in,out); | ||
141 | |||
142 | rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off, | ||
143 | &key->md,in+md5_off,blocks); | ||
144 | blocks *= MD5_CBLOCK; | ||
145 | rc4_off += blocks; | ||
146 | md5_off += blocks; | ||
147 | key->md.Nh += blocks>>29; | ||
148 | key->md.Nl += blocks<<=3; | ||
149 | if (key->md.Nl<(unsigned int)blocks) key->md.Nh++; | ||
150 | } else { | ||
151 | rc4_off = 0; | ||
152 | md5_off = 0; | ||
153 | } | ||
154 | #endif | ||
155 | MD5_Update(&key->md,in+md5_off,plen-md5_off); | ||
156 | |||
157 | if (plen!=len) { /* "TLS" mode of operation */ | ||
158 | if (in!=out) | ||
159 | memcpy(out+rc4_off,in+rc4_off,plen-rc4_off); | ||
160 | |||
161 | /* calculate HMAC and append it to payload */ | ||
162 | MD5_Final(out+plen,&key->md); | ||
163 | key->md = key->tail; | ||
164 | MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH); | ||
165 | MD5_Final(out+plen,&key->md); | ||
166 | /* encrypt HMAC at once */ | ||
167 | RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off); | ||
168 | } else { | ||
169 | RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off); | ||
170 | } | ||
171 | } else { | ||
172 | unsigned char mac[MD5_DIGEST_LENGTH]; | ||
173 | #if defined(STITCHED_CALL) | ||
174 | /* digest has to "fall behind" */ | ||
175 | if (md5_off>rc4_off) rc4_off += 2*MD5_CBLOCK; | ||
176 | else rc4_off += MD5_CBLOCK; | ||
177 | |||
178 | if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK) && | ||
179 | (OPENSSL_ia32cap_P[0]&(1<<20))==0) { | ||
180 | RC4(&key->ks,rc4_off,in,out); | ||
181 | MD5_Update(&key->md,out,md5_off); | ||
182 | |||
183 | rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off, | ||
184 | &key->md,out+md5_off,blocks); | ||
185 | blocks *= MD5_CBLOCK; | ||
186 | rc4_off += blocks; | ||
187 | md5_off += blocks; | ||
188 | l = (key->md.Nl+(blocks<<3))&0xffffffffU; | ||
189 | if (l<key->md.Nl) key->md.Nh++; | ||
190 | key->md.Nl = l; | ||
191 | key->md.Nh += blocks>>29; | ||
192 | } else { | ||
193 | md5_off=0; | ||
194 | rc4_off=0; | ||
195 | } | ||
196 | #endif | ||
197 | /* decrypt HMAC at once */ | ||
198 | RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off); | ||
199 | if (plen!=NO_PAYLOAD_LENGTH) { /* "TLS" mode of operation */ | ||
200 | MD5_Update(&key->md,out+md5_off,plen-md5_off); | ||
201 | |||
202 | /* calculate HMAC and verify it */ | ||
203 | MD5_Final(mac,&key->md); | ||
204 | key->md = key->tail; | ||
205 | MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH); | ||
206 | MD5_Final(mac,&key->md); | ||
207 | |||
208 | if (memcmp(out+plen,mac,MD5_DIGEST_LENGTH)) | ||
209 | return 0; | ||
210 | } else { | ||
211 | MD5_Update(&key->md,out+md5_off,len-md5_off); | ||
212 | } | ||
213 | } | ||
214 | |||
215 | key->payload_length = NO_PAYLOAD_LENGTH; | ||
216 | |||
217 | return 1; | ||
218 | } | ||
219 | |||
220 | static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) | ||
221 | { | ||
222 | EVP_RC4_HMAC_MD5 *key = data(ctx); | ||
223 | |||
224 | switch (type) | ||
225 | { | ||
226 | case EVP_CTRL_AEAD_SET_MAC_KEY: | ||
227 | { | ||
228 | unsigned int i; | ||
229 | unsigned char hmac_key[64]; | ||
230 | |||
231 | memset (hmac_key,0,sizeof(hmac_key)); | ||
232 | |||
233 | if (arg > (int)sizeof(hmac_key)) { | ||
234 | MD5_Init(&key->head); | ||
235 | MD5_Update(&key->head,ptr,arg); | ||
236 | MD5_Final(hmac_key,&key->head); | ||
237 | } else { | ||
238 | memcpy(hmac_key,ptr,arg); | ||
239 | } | ||
240 | |||
241 | for (i=0;i<sizeof(hmac_key);i++) | ||
242 | hmac_key[i] ^= 0x36; /* ipad */ | ||
243 | MD5_Init(&key->head); | ||
244 | MD5_Update(&key->head,hmac_key,sizeof(hmac_key)); | ||
245 | |||
246 | for (i=0;i<sizeof(hmac_key);i++) | ||
247 | hmac_key[i] ^= 0x36^0x5c; /* opad */ | ||
248 | MD5_Init(&key->tail); | ||
249 | MD5_Update(&key->tail,hmac_key,sizeof(hmac_key)); | ||
250 | |||
251 | return 1; | ||
252 | } | ||
253 | case EVP_CTRL_AEAD_TLS1_AAD: | ||
254 | { | ||
255 | unsigned char *p=ptr; | ||
256 | unsigned int len=p[arg-2]<<8|p[arg-1]; | ||
257 | |||
258 | if (!ctx->encrypt) | ||
259 | { | ||
260 | len -= MD5_DIGEST_LENGTH; | ||
261 | p[arg-2] = len>>8; | ||
262 | p[arg-1] = len; | ||
263 | } | ||
264 | key->payload_length=len; | ||
265 | key->md = key->head; | ||
266 | MD5_Update(&key->md,p,arg); | ||
267 | |||
268 | return MD5_DIGEST_LENGTH; | ||
269 | } | ||
270 | default: | ||
271 | return -1; | ||
272 | } | ||
273 | } | ||
274 | |||
275 | static EVP_CIPHER r4_hmac_md5_cipher= | ||
276 | { | ||
277 | #ifdef NID_rc4_hmac_md5 | ||
278 | NID_rc4_hmac_md5, | ||
279 | #else | ||
280 | NID_undef, | ||
281 | #endif | ||
282 | 1,EVP_RC4_KEY_SIZE,0, | ||
283 | EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER, | ||
284 | rc4_hmac_md5_init_key, | ||
285 | rc4_hmac_md5_cipher, | ||
286 | NULL, | ||
287 | sizeof(EVP_RC4_HMAC_MD5), | ||
288 | NULL, | ||
289 | NULL, | ||
290 | rc4_hmac_md5_ctrl, | ||
291 | NULL | ||
292 | }; | ||
293 | |||
294 | const EVP_CIPHER *EVP_rc4_hmac_md5(void) | ||
295 | { | ||
296 | return(&r4_hmac_md5_cipher); | ||
297 | } | ||
298 | #endif | ||
diff --git a/src/lib/libcrypto/evp/m_ecdsa.c b/src/lib/libcrypto/evp/m_ecdsa.c index 8d87a49ebe..4b15fb0f6c 100644 --- a/src/lib/libcrypto/evp/m_ecdsa.c +++ b/src/lib/libcrypto/evp/m_ecdsa.c | |||
@@ -116,6 +116,8 @@ | |||
116 | #include <openssl/x509.h> | 116 | #include <openssl/x509.h> |
117 | 117 | ||
118 | #ifndef OPENSSL_NO_SHA | 118 | #ifndef OPENSSL_NO_SHA |
119 | #ifndef OPENSSL_FIPS | ||
120 | |||
119 | static int init(EVP_MD_CTX *ctx) | 121 | static int init(EVP_MD_CTX *ctx) |
120 | { return SHA1_Init(ctx->md_data); } | 122 | { return SHA1_Init(ctx->md_data); } |
121 | 123 | ||
@@ -146,3 +148,4 @@ const EVP_MD *EVP_ecdsa(void) | |||
146 | return(&ecdsa_md); | 148 | return(&ecdsa_md); |
147 | } | 149 | } |
148 | #endif | 150 | #endif |
151 | #endif | ||
diff --git a/src/lib/libcrypto/evp/m_wp.c b/src/lib/libcrypto/evp/m_wp.c index 1ce47c040b..c51bc2d5d1 100644 --- a/src/lib/libcrypto/evp/m_wp.c +++ b/src/lib/libcrypto/evp/m_wp.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <openssl/objects.h> | 9 | #include <openssl/objects.h> |
10 | #include <openssl/x509.h> | 10 | #include <openssl/x509.h> |
11 | #include <openssl/whrlpool.h> | 11 | #include <openssl/whrlpool.h> |
12 | #include "evp_locl.h" | ||
12 | 13 | ||
13 | static int init(EVP_MD_CTX *ctx) | 14 | static int init(EVP_MD_CTX *ctx) |
14 | { return WHIRLPOOL_Init(ctx->md_data); } | 15 | { return WHIRLPOOL_Init(ctx->md_data); } |
diff --git a/src/lib/libcrypto/evp/pmeth_gn.c b/src/lib/libcrypto/evp/pmeth_gn.c index 5d74161a09..4651c81370 100644 --- a/src/lib/libcrypto/evp/pmeth_gn.c +++ b/src/lib/libcrypto/evp/pmeth_gn.c | |||
@@ -199,7 +199,7 @@ int EVP_PKEY_CTX_get_keygen_info(EVP_PKEY_CTX *ctx, int idx) | |||
199 | } | 199 | } |
200 | 200 | ||
201 | EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, | 201 | EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, |
202 | unsigned char *key, int keylen) | 202 | const unsigned char *key, int keylen) |
203 | { | 203 | { |
204 | EVP_PKEY_CTX *mac_ctx = NULL; | 204 | EVP_PKEY_CTX *mac_ctx = NULL; |
205 | EVP_PKEY *mac_key = NULL; | 205 | EVP_PKEY *mac_key = NULL; |
@@ -209,7 +209,8 @@ EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, | |||
209 | if (EVP_PKEY_keygen_init(mac_ctx) <= 0) | 209 | if (EVP_PKEY_keygen_init(mac_ctx) <= 0) |
210 | goto merr; | 210 | goto merr; |
211 | if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN, | 211 | if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN, |
212 | EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key) <= 0) | 212 | EVP_PKEY_CTRL_SET_MAC_KEY, |
213 | keylen, (void *)key) <= 0) | ||
213 | goto merr; | 214 | goto merr; |
214 | if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0) | 215 | if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0) |
215 | goto merr; | 216 | goto merr; |
diff --git a/src/lib/libcrypto/evp/pmeth_lib.c b/src/lib/libcrypto/evp/pmeth_lib.c index 5481d4b8a5..acfa7b6f87 100644 --- a/src/lib/libcrypto/evp/pmeth_lib.c +++ b/src/lib/libcrypto/evp/pmeth_lib.c | |||
@@ -73,7 +73,7 @@ DECLARE_STACK_OF(EVP_PKEY_METHOD) | |||
73 | STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL; | 73 | STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL; |
74 | 74 | ||
75 | extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth; | 75 | extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth; |
76 | extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth; | 76 | extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth, cmac_pkey_meth; |
77 | 77 | ||
78 | static const EVP_PKEY_METHOD *standard_methods[] = | 78 | static const EVP_PKEY_METHOD *standard_methods[] = |
79 | { | 79 | { |
@@ -90,6 +90,7 @@ static const EVP_PKEY_METHOD *standard_methods[] = | |||
90 | &ec_pkey_meth, | 90 | &ec_pkey_meth, |
91 | #endif | 91 | #endif |
92 | &hmac_pkey_meth, | 92 | &hmac_pkey_meth, |
93 | &cmac_pkey_meth | ||
93 | }; | 94 | }; |
94 | 95 | ||
95 | DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *, | 96 | DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *, |
@@ -203,6 +204,8 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags) | |||
203 | if (!pmeth) | 204 | if (!pmeth) |
204 | return NULL; | 205 | return NULL; |
205 | 206 | ||
207 | memset(pmeth, 0, sizeof(EVP_PKEY_METHOD)); | ||
208 | |||
206 | pmeth->pkey_id = id; | 209 | pmeth->pkey_id = id; |
207 | pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC; | 210 | pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC; |
208 | 211 | ||
@@ -235,6 +238,56 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags) | |||
235 | return pmeth; | 238 | return pmeth; |
236 | } | 239 | } |
237 | 240 | ||
241 | void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags, | ||
242 | const EVP_PKEY_METHOD *meth) | ||
243 | { | ||
244 | if (ppkey_id) | ||
245 | *ppkey_id = meth->pkey_id; | ||
246 | if (pflags) | ||
247 | *pflags = meth->flags; | ||
248 | } | ||
249 | |||
250 | void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src) | ||
251 | { | ||
252 | |||
253 | dst->init = src->init; | ||
254 | dst->copy = src->copy; | ||
255 | dst->cleanup = src->cleanup; | ||
256 | |||
257 | dst->paramgen_init = src->paramgen_init; | ||
258 | dst->paramgen = src->paramgen; | ||
259 | |||
260 | dst->keygen_init = src->keygen_init; | ||
261 | dst->keygen = src->keygen; | ||
262 | |||
263 | dst->sign_init = src->sign_init; | ||
264 | dst->sign = src->sign; | ||
265 | |||
266 | dst->verify_init = src->verify_init; | ||
267 | dst->verify = src->verify; | ||
268 | |||
269 | dst->verify_recover_init = src->verify_recover_init; | ||
270 | dst->verify_recover = src->verify_recover; | ||
271 | |||
272 | dst->signctx_init = src->signctx_init; | ||
273 | dst->signctx = src->signctx; | ||
274 | |||
275 | dst->verifyctx_init = src->verifyctx_init; | ||
276 | dst->verifyctx = src->verifyctx; | ||
277 | |||
278 | dst->encrypt_init = src->encrypt_init; | ||
279 | dst->encrypt = src->encrypt; | ||
280 | |||
281 | dst->decrypt_init = src->decrypt_init; | ||
282 | dst->decrypt = src->decrypt; | ||
283 | |||
284 | dst->derive_init = src->derive_init; | ||
285 | dst->derive = src->derive; | ||
286 | |||
287 | dst->ctrl = src->ctrl; | ||
288 | dst->ctrl_str = src->ctrl_str; | ||
289 | } | ||
290 | |||
238 | void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth) | 291 | void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth) |
239 | { | 292 | { |
240 | if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC)) | 293 | if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC)) |
diff --git a/src/lib/libcrypto/hmac/hm_ameth.c b/src/lib/libcrypto/hmac/hm_ameth.c index 6d8a89149e..e03f24aeda 100644 --- a/src/lib/libcrypto/hmac/hm_ameth.c +++ b/src/lib/libcrypto/hmac/hm_ameth.c | |||
@@ -153,7 +153,7 @@ const EVP_PKEY_ASN1_METHOD hmac_asn1_meth = | |||
153 | 153 | ||
154 | hmac_size, | 154 | hmac_size, |
155 | 0, | 155 | 0, |
156 | 0,0,0,0,0,0, | 156 | 0,0,0,0,0,0,0, |
157 | 157 | ||
158 | hmac_key_free, | 158 | hmac_key_free, |
159 | hmac_pkey_ctrl, | 159 | hmac_pkey_ctrl, |
diff --git a/src/lib/libcrypto/hmac/hm_pmeth.c b/src/lib/libcrypto/hmac/hm_pmeth.c index 71e8567a14..0daa44511d 100644 --- a/src/lib/libcrypto/hmac/hm_pmeth.c +++ b/src/lib/libcrypto/hmac/hm_pmeth.c | |||
@@ -100,7 +100,8 @@ static int pkey_hmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) | |||
100 | dctx = dst->data; | 100 | dctx = dst->data; |
101 | dctx->md = sctx->md; | 101 | dctx->md = sctx->md; |
102 | HMAC_CTX_init(&dctx->ctx); | 102 | HMAC_CTX_init(&dctx->ctx); |
103 | HMAC_CTX_copy(&dctx->ctx, &sctx->ctx); | 103 | if (!HMAC_CTX_copy(&dctx->ctx, &sctx->ctx)) |
104 | return 0; | ||
104 | if (sctx->ktmp.data) | 105 | if (sctx->ktmp.data) |
105 | { | 106 | { |
106 | if (!ASN1_OCTET_STRING_set(&dctx->ktmp, | 107 | if (!ASN1_OCTET_STRING_set(&dctx->ktmp, |
@@ -141,7 +142,8 @@ static int pkey_hmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) | |||
141 | static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) | 142 | static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) |
142 | { | 143 | { |
143 | HMAC_PKEY_CTX *hctx = ctx->pctx->data; | 144 | HMAC_PKEY_CTX *hctx = ctx->pctx->data; |
144 | HMAC_Update(&hctx->ctx, data, count); | 145 | if (!HMAC_Update(&hctx->ctx, data, count)) |
146 | return 0; | ||
145 | return 1; | 147 | return 1; |
146 | } | 148 | } |
147 | 149 | ||
@@ -167,7 +169,8 @@ static int hmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
167 | if (!sig) | 169 | if (!sig) |
168 | return 1; | 170 | return 1; |
169 | 171 | ||
170 | HMAC_Final(&hctx->ctx, sig, &hlen); | 172 | if (!HMAC_Final(&hctx->ctx, sig, &hlen)) |
173 | return 0; | ||
171 | *siglen = (size_t)hlen; | 174 | *siglen = (size_t)hlen; |
172 | return 1; | 175 | return 1; |
173 | } | 176 | } |
@@ -192,8 +195,9 @@ static int pkey_hmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
192 | 195 | ||
193 | case EVP_PKEY_CTRL_DIGESTINIT: | 196 | case EVP_PKEY_CTRL_DIGESTINIT: |
194 | key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr; | 197 | key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr; |
195 | HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md, | 198 | if (!HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md, |
196 | ctx->engine); | 199 | ctx->engine)) |
200 | return 0; | ||
197 | break; | 201 | break; |
198 | 202 | ||
199 | default: | 203 | default: |
diff --git a/src/lib/libcrypto/ia64cpuid.S b/src/lib/libcrypto/ia64cpuid.S index d705fff7ee..7832b9b640 100644 --- a/src/lib/libcrypto/ia64cpuid.S +++ b/src/lib/libcrypto/ia64cpuid.S | |||
@@ -26,7 +26,7 @@ OPENSSL_atomic_add: | |||
26 | { .mii; mov ar.ccv=r2 | 26 | { .mii; mov ar.ccv=r2 |
27 | add r8=r2,r33 | 27 | add r8=r2,r33 |
28 | mov r3=r2 };; | 28 | mov r3=r2 };; |
29 | { .mmi; mf | 29 | { .mmi; mf;; |
30 | cmpxchg4.acq r2=[r32],r8,ar.ccv | 30 | cmpxchg4.acq r2=[r32],r8,ar.ccv |
31 | nop.i 0 };; | 31 | nop.i 0 };; |
32 | { .mib; cmp.ne p6,p0=r2,r3 | 32 | { .mib; cmp.ne p6,p0=r2,r3 |
diff --git a/src/lib/libcrypto/idea/i_cbc.c b/src/lib/libcrypto/idea/i_cbc.c new file mode 100644 index 0000000000..ecb9cb8b83 --- /dev/null +++ b/src/lib/libcrypto/idea/i_cbc.c | |||
@@ -0,0 +1,168 @@ | |||
1 | /* crypto/idea/i_cbc.c */ | ||
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * This package is an SSL implementation written | ||
6 | * by Eric Young (eay@cryptsoft.com). | ||
7 | * The implementation was written so as to conform with Netscapes SSL. | ||
8 | * | ||
9 | * This library is free for commercial and non-commercial use as long as | ||
10 | * the following conditions are aheared to. The following conditions | ||
11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
13 | * included with this distribution is covered by the same copyright terms | ||
14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
15 | * | ||
16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
17 | * the code are not to be removed. | ||
18 | * If this package is used in a product, Eric Young should be given attribution | ||
19 | * as the author of the parts of the library used. | ||
20 | * This can be in the form of a textual message at program startup or | ||
21 | * in documentation (online or textual) provided with the package. | ||
22 | * | ||
23 | * Redistribution and use in source and binary forms, with or without | ||
24 | * modification, are permitted provided that the following conditions | ||
25 | * are met: | ||
26 | * 1. Redistributions of source code must retain the copyright | ||
27 | * notice, this list of conditions and the following disclaimer. | ||
28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
29 | * notice, this list of conditions and the following disclaimer in the | ||
30 | * documentation and/or other materials provided with the distribution. | ||
31 | * 3. All advertising materials mentioning features or use of this software | ||
32 | * must display the following acknowledgement: | ||
33 | * "This product includes cryptographic software written by | ||
34 | * Eric Young (eay@cryptsoft.com)" | ||
35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
36 | * being used are not cryptographic related :-). | ||
37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
38 | * the apps directory (application code) you must include an acknowledgement: | ||
39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
40 | * | ||
41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
51 | * SUCH DAMAGE. | ||
52 | * | ||
53 | * The licence and distribution terms for any publically available version or | ||
54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
55 | * copied and put under another distribution licence | ||
56 | * [including the GNU Public Licence.] | ||
57 | */ | ||
58 | |||
59 | #include <openssl/idea.h> | ||
60 | #include "idea_lcl.h" | ||
61 | |||
62 | void idea_cbc_encrypt(const unsigned char *in, unsigned char *out, long length, | ||
63 | IDEA_KEY_SCHEDULE *ks, unsigned char *iv, int encrypt) | ||
64 | { | ||
65 | register unsigned long tin0,tin1; | ||
66 | register unsigned long tout0,tout1,xor0,xor1; | ||
67 | register long l=length; | ||
68 | unsigned long tin[2]; | ||
69 | |||
70 | if (encrypt) | ||
71 | { | ||
72 | n2l(iv,tout0); | ||
73 | n2l(iv,tout1); | ||
74 | iv-=8; | ||
75 | for (l-=8; l>=0; l-=8) | ||
76 | { | ||
77 | n2l(in,tin0); | ||
78 | n2l(in,tin1); | ||
79 | tin0^=tout0; | ||
80 | tin1^=tout1; | ||
81 | tin[0]=tin0; | ||
82 | tin[1]=tin1; | ||
83 | idea_encrypt(tin,ks); | ||
84 | tout0=tin[0]; l2n(tout0,out); | ||
85 | tout1=tin[1]; l2n(tout1,out); | ||
86 | } | ||
87 | if (l != -8) | ||
88 | { | ||
89 | n2ln(in,tin0,tin1,l+8); | ||
90 | tin0^=tout0; | ||
91 | tin1^=tout1; | ||
92 | tin[0]=tin0; | ||
93 | tin[1]=tin1; | ||
94 | idea_encrypt(tin,ks); | ||
95 | tout0=tin[0]; l2n(tout0,out); | ||
96 | tout1=tin[1]; l2n(tout1,out); | ||
97 | } | ||
98 | l2n(tout0,iv); | ||
99 | l2n(tout1,iv); | ||
100 | } | ||
101 | else | ||
102 | { | ||
103 | n2l(iv,xor0); | ||
104 | n2l(iv,xor1); | ||
105 | iv-=8; | ||
106 | for (l-=8; l>=0; l-=8) | ||
107 | { | ||
108 | n2l(in,tin0); tin[0]=tin0; | ||
109 | n2l(in,tin1); tin[1]=tin1; | ||
110 | idea_encrypt(tin,ks); | ||
111 | tout0=tin[0]^xor0; | ||
112 | tout1=tin[1]^xor1; | ||
113 | l2n(tout0,out); | ||
114 | l2n(tout1,out); | ||
115 | xor0=tin0; | ||
116 | xor1=tin1; | ||
117 | } | ||
118 | if (l != -8) | ||
119 | { | ||
120 | n2l(in,tin0); tin[0]=tin0; | ||
121 | n2l(in,tin1); tin[1]=tin1; | ||
122 | idea_encrypt(tin,ks); | ||
123 | tout0=tin[0]^xor0; | ||
124 | tout1=tin[1]^xor1; | ||
125 | l2nn(tout0,tout1,out,l+8); | ||
126 | xor0=tin0; | ||
127 | xor1=tin1; | ||
128 | } | ||
129 | l2n(xor0,iv); | ||
130 | l2n(xor1,iv); | ||
131 | } | ||
132 | tin0=tin1=tout0=tout1=xor0=xor1=0; | ||
133 | tin[0]=tin[1]=0; | ||
134 | } | ||
135 | |||
136 | void idea_encrypt(unsigned long *d, IDEA_KEY_SCHEDULE *key) | ||
137 | { | ||
138 | register IDEA_INT *p; | ||
139 | register unsigned long x1,x2,x3,x4,t0,t1,ul; | ||
140 | |||
141 | x2=d[0]; | ||
142 | x1=(x2>>16); | ||
143 | x4=d[1]; | ||
144 | x3=(x4>>16); | ||
145 | |||
146 | p= &(key->data[0][0]); | ||
147 | |||
148 | E_IDEA(0); | ||
149 | E_IDEA(1); | ||
150 | E_IDEA(2); | ||
151 | E_IDEA(3); | ||
152 | E_IDEA(4); | ||
153 | E_IDEA(5); | ||
154 | E_IDEA(6); | ||
155 | E_IDEA(7); | ||
156 | |||
157 | x1&=0xffff; | ||
158 | idea_mul(x1,x1,*p,ul); p++; | ||
159 | |||
160 | t0= x3+ *(p++); | ||
161 | t1= x2+ *(p++); | ||
162 | |||
163 | x4&=0xffff; | ||
164 | idea_mul(x4,x4,*p,ul); | ||
165 | |||
166 | d[0]=(t0&0xffff)|((x1&0xffff)<<16); | ||
167 | d[1]=(x4&0xffff)|((t1&0xffff)<<16); | ||
168 | } | ||
diff --git a/src/lib/libcrypto/idea/i_cfb64.c b/src/lib/libcrypto/idea/i_cfb64.c new file mode 100644 index 0000000000..66d49d520e --- /dev/null +++ b/src/lib/libcrypto/idea/i_cfb64.c | |||
@@ -0,0 +1,122 @@ | |||
1 | /* crypto/idea/i_cfb64.c */ | ||
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * This package is an SSL implementation written | ||
6 | * by Eric Young (eay@cryptsoft.com). | ||
7 | * The implementation was written so as to conform with Netscapes SSL. | ||
8 | * | ||
9 | * This library is free for commercial and non-commercial use as long as | ||
10 | * the following conditions are aheared to. The following conditions | ||
11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
13 | * included with this distribution is covered by the same copyright terms | ||
14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
15 | * | ||
16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
17 | * the code are not to be removed. | ||
18 | * If this package is used in a product, Eric Young should be given attribution | ||
19 | * as the author of the parts of the library used. | ||
20 | * This can be in the form of a textual message at program startup or | ||
21 | * in documentation (online or textual) provided with the package. | ||
22 | * | ||
23 | * Redistribution and use in source and binary forms, with or without | ||
24 | * modification, are permitted provided that the following conditions | ||
25 | * are met: | ||
26 | * 1. Redistributions of source code must retain the copyright | ||
27 | * notice, this list of conditions and the following disclaimer. | ||
28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
29 | * notice, this list of conditions and the following disclaimer in the | ||
30 | * documentation and/or other materials provided with the distribution. | ||
31 | * 3. All advertising materials mentioning features or use of this software | ||
32 | * must display the following acknowledgement: | ||
33 | * "This product includes cryptographic software written by | ||
34 | * Eric Young (eay@cryptsoft.com)" | ||
35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
36 | * being used are not cryptographic related :-). | ||
37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
38 | * the apps directory (application code) you must include an acknowledgement: | ||
39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
40 | * | ||
41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
51 | * SUCH DAMAGE. | ||
52 | * | ||
53 | * The licence and distribution terms for any publically available version or | ||
54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
55 | * copied and put under another distribution licence | ||
56 | * [including the GNU Public Licence.] | ||
57 | */ | ||
58 | |||
59 | #include <openssl/idea.h> | ||
60 | #include "idea_lcl.h" | ||
61 | |||
62 | /* The input and output encrypted as though 64bit cfb mode is being | ||
63 | * used. The extra state information to record how much of the | ||
64 | * 64bit block we have used is contained in *num; | ||
65 | */ | ||
66 | |||
67 | void idea_cfb64_encrypt(const unsigned char *in, unsigned char *out, | ||
68 | long length, IDEA_KEY_SCHEDULE *schedule, | ||
69 | unsigned char *ivec, int *num, int encrypt) | ||
70 | { | ||
71 | register unsigned long v0,v1,t; | ||
72 | register int n= *num; | ||
73 | register long l=length; | ||
74 | unsigned long ti[2]; | ||
75 | unsigned char *iv,c,cc; | ||
76 | |||
77 | iv=(unsigned char *)ivec; | ||
78 | if (encrypt) | ||
79 | { | ||
80 | while (l--) | ||
81 | { | ||
82 | if (n == 0) | ||
83 | { | ||
84 | n2l(iv,v0); ti[0]=v0; | ||
85 | n2l(iv,v1); ti[1]=v1; | ||
86 | idea_encrypt((unsigned long *)ti,schedule); | ||
87 | iv=(unsigned char *)ivec; | ||
88 | t=ti[0]; l2n(t,iv); | ||
89 | t=ti[1]; l2n(t,iv); | ||
90 | iv=(unsigned char *)ivec; | ||
91 | } | ||
92 | c= *(in++)^iv[n]; | ||
93 | *(out++)=c; | ||
94 | iv[n]=c; | ||
95 | n=(n+1)&0x07; | ||
96 | } | ||
97 | } | ||
98 | else | ||
99 | { | ||
100 | while (l--) | ||
101 | { | ||
102 | if (n == 0) | ||
103 | { | ||
104 | n2l(iv,v0); ti[0]=v0; | ||
105 | n2l(iv,v1); ti[1]=v1; | ||
106 | idea_encrypt((unsigned long *)ti,schedule); | ||
107 | iv=(unsigned char *)ivec; | ||
108 | t=ti[0]; l2n(t,iv); | ||
109 | t=ti[1]; l2n(t,iv); | ||
110 | iv=(unsigned char *)ivec; | ||
111 | } | ||
112 | cc= *(in++); | ||
113 | c=iv[n]; | ||
114 | iv[n]=cc; | ||
115 | *(out++)=c^cc; | ||
116 | n=(n+1)&0x07; | ||
117 | } | ||
118 | } | ||
119 | v0=v1=ti[0]=ti[1]=t=c=cc=0; | ||
120 | *num=n; | ||
121 | } | ||
122 | |||
diff --git a/src/lib/libcrypto/idea/i_ecb.c b/src/lib/libcrypto/idea/i_ecb.c new file mode 100644 index 0000000000..fef38230a7 --- /dev/null +++ b/src/lib/libcrypto/idea/i_ecb.c | |||
@@ -0,0 +1,85 @@ | |||
1 | /* crypto/idea/i_ecb.c */ | ||
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * This package is an SSL implementation written | ||
6 | * by Eric Young (eay@cryptsoft.com). | ||
7 | * The implementation was written so as to conform with Netscapes SSL. | ||
8 | * | ||
9 | * This library is free for commercial and non-commercial use as long as | ||
10 | * the following conditions are aheared to. The following conditions | ||
11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
13 | * included with this distribution is covered by the same copyright terms | ||
14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
15 | * | ||
16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
17 | * the code are not to be removed. | ||
18 | * If this package is used in a product, Eric Young should be given attribution | ||
19 | * as the author of the parts of the library used. | ||
20 | * This can be in the form of a textual message at program startup or | ||
21 | * in documentation (online or textual) provided with the package. | ||
22 | * | ||
23 | * Redistribution and use in source and binary forms, with or without | ||
24 | * modification, are permitted provided that the following conditions | ||
25 | * are met: | ||
26 | * 1. Redistributions of source code must retain the copyright | ||
27 | * notice, this list of conditions and the following disclaimer. | ||
28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
29 | * notice, this list of conditions and the following disclaimer in the | ||
30 | * documentation and/or other materials provided with the distribution. | ||
31 | * 3. All advertising materials mentioning features or use of this software | ||
32 | * must display the following acknowledgement: | ||
33 | * "This product includes cryptographic software written by | ||
34 | * Eric Young (eay@cryptsoft.com)" | ||
35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
36 | * being used are not cryptographic related :-). | ||
37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
38 | * the apps directory (application code) you must include an acknowledgement: | ||
39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
40 | * | ||
41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
51 | * SUCH DAMAGE. | ||
52 | * | ||
53 | * The licence and distribution terms for any publically available version or | ||
54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
55 | * copied and put under another distribution licence | ||
56 | * [including the GNU Public Licence.] | ||
57 | */ | ||
58 | |||
59 | #include <openssl/idea.h> | ||
60 | #include "idea_lcl.h" | ||
61 | #include <openssl/opensslv.h> | ||
62 | |||
63 | const char IDEA_version[]="IDEA" OPENSSL_VERSION_PTEXT; | ||
64 | |||
65 | const char *idea_options(void) | ||
66 | { | ||
67 | if (sizeof(short) != sizeof(IDEA_INT)) | ||
68 | return("idea(int)"); | ||
69 | else | ||
70 | return("idea(short)"); | ||
71 | } | ||
72 | |||
73 | void idea_ecb_encrypt(const unsigned char *in, unsigned char *out, | ||
74 | IDEA_KEY_SCHEDULE *ks) | ||
75 | { | ||
76 | unsigned long l0,l1,d[2]; | ||
77 | |||
78 | n2l(in,l0); d[0]=l0; | ||
79 | n2l(in,l1); d[1]=l1; | ||
80 | idea_encrypt(d,ks); | ||
81 | l0=d[0]; l2n(l0,out); | ||
82 | l1=d[1]; l2n(l1,out); | ||
83 | l0=l1=d[0]=d[1]=0; | ||
84 | } | ||
85 | |||
diff --git a/src/lib/libcrypto/idea/i_ofb64.c b/src/lib/libcrypto/idea/i_ofb64.c new file mode 100644 index 0000000000..e749e88e34 --- /dev/null +++ b/src/lib/libcrypto/idea/i_ofb64.c | |||
@@ -0,0 +1,111 @@ | |||
1 | /* crypto/idea/i_ofb64.c */ | ||
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * This package is an SSL implementation written | ||
6 | * by Eric Young (eay@cryptsoft.com). | ||
7 | * The implementation was written so as to conform with Netscapes SSL. | ||
8 | * | ||
9 | * This library is free for commercial and non-commercial use as long as | ||
10 | * the following conditions are aheared to. The following conditions | ||
11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
13 | * included with this distribution is covered by the same copyright terms | ||
14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
15 | * | ||
16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
17 | * the code are not to be removed. | ||
18 | * If this package is used in a product, Eric Young should be given attribution | ||
19 | * as the author of the parts of the library used. | ||
20 | * This can be in the form of a textual message at program startup or | ||
21 | * in documentation (online or textual) provided with the package. | ||
22 | * | ||
23 | * Redistribution and use in source and binary forms, with or without | ||
24 | * modification, are permitted provided that the following conditions | ||
25 | * are met: | ||
26 | * 1. Redistributions of source code must retain the copyright | ||
27 | * notice, this list of conditions and the following disclaimer. | ||
28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
29 | * notice, this list of conditions and the following disclaimer in the | ||
30 | * documentation and/or other materials provided with the distribution. | ||
31 | * 3. All advertising materials mentioning features or use of this software | ||
32 | * must display the following acknowledgement: | ||
33 | * "This product includes cryptographic software written by | ||
34 | * Eric Young (eay@cryptsoft.com)" | ||
35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
36 | * being used are not cryptographic related :-). | ||
37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
38 | * the apps directory (application code) you must include an acknowledgement: | ||
39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
40 | * | ||
41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
51 | * SUCH DAMAGE. | ||
52 | * | ||
53 | * The licence and distribution terms for any publically available version or | ||
54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
55 | * copied and put under another distribution licence | ||
56 | * [including the GNU Public Licence.] | ||
57 | */ | ||
58 | |||
59 | #include <openssl/idea.h> | ||
60 | #include "idea_lcl.h" | ||
61 | |||
62 | /* The input and output encrypted as though 64bit ofb mode is being | ||
63 | * used. The extra state information to record how much of the | ||
64 | * 64bit block we have used is contained in *num; | ||
65 | */ | ||
66 | void idea_ofb64_encrypt(const unsigned char *in, unsigned char *out, | ||
67 | long length, IDEA_KEY_SCHEDULE *schedule, | ||
68 | unsigned char *ivec, int *num) | ||
69 | { | ||
70 | register unsigned long v0,v1,t; | ||
71 | register int n= *num; | ||
72 | register long l=length; | ||
73 | unsigned char d[8]; | ||
74 | register char *dp; | ||
75 | unsigned long ti[2]; | ||
76 | unsigned char *iv; | ||
77 | int save=0; | ||
78 | |||
79 | iv=(unsigned char *)ivec; | ||
80 | n2l(iv,v0); | ||
81 | n2l(iv,v1); | ||
82 | ti[0]=v0; | ||
83 | ti[1]=v1; | ||
84 | dp=(char *)d; | ||
85 | l2n(v0,dp); | ||
86 | l2n(v1,dp); | ||
87 | while (l--) | ||
88 | { | ||
89 | if (n == 0) | ||
90 | { | ||
91 | idea_encrypt((unsigned long *)ti,schedule); | ||
92 | dp=(char *)d; | ||
93 | t=ti[0]; l2n(t,dp); | ||
94 | t=ti[1]; l2n(t,dp); | ||
95 | save++; | ||
96 | } | ||
97 | *(out++)= *(in++)^d[n]; | ||
98 | n=(n+1)&0x07; | ||
99 | } | ||
100 | if (save) | ||
101 | { | ||
102 | v0=ti[0]; | ||
103 | v1=ti[1]; | ||
104 | iv=(unsigned char *)ivec; | ||
105 | l2n(v0,iv); | ||
106 | l2n(v1,iv); | ||
107 | } | ||
108 | t=v0=v1=ti[0]=ti[1]=0; | ||
109 | *num=n; | ||
110 | } | ||
111 | |||
diff --git a/src/lib/libcrypto/idea/i_skey.c b/src/lib/libcrypto/idea/i_skey.c new file mode 100644 index 0000000000..afb830964d --- /dev/null +++ b/src/lib/libcrypto/idea/i_skey.c | |||
@@ -0,0 +1,164 @@ | |||
1 | /* crypto/idea/i_skey.c */ | ||
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * This package is an SSL implementation written | ||
6 | * by Eric Young (eay@cryptsoft.com). | ||
7 | * The implementation was written so as to conform with Netscapes SSL. | ||
8 | * | ||
9 | * This library is free for commercial and non-commercial use as long as | ||
10 | * the following conditions are aheared to. The following conditions | ||
11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
13 | * included with this distribution is covered by the same copyright terms | ||
14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
15 | * | ||
16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
17 | * the code are not to be removed. | ||
18 | * If this package is used in a product, Eric Young should be given attribution | ||
19 | * as the author of the parts of the library used. | ||
20 | * This can be in the form of a textual message at program startup or | ||
21 | * in documentation (online or textual) provided with the package. | ||
22 | * | ||
23 | * Redistribution and use in source and binary forms, with or without | ||
24 | * modification, are permitted provided that the following conditions | ||
25 | * are met: | ||
26 | * 1. Redistributions of source code must retain the copyright | ||
27 | * notice, this list of conditions and the following disclaimer. | ||
28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
29 | * notice, this list of conditions and the following disclaimer in the | ||
30 | * documentation and/or other materials provided with the distribution. | ||
31 | * 3. All advertising materials mentioning features or use of this software | ||
32 | * must display the following acknowledgement: | ||
33 | * "This product includes cryptographic software written by | ||
34 | * Eric Young (eay@cryptsoft.com)" | ||
35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
36 | * being used are not cryptographic related :-). | ||
37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
38 | * the apps directory (application code) you must include an acknowledgement: | ||
39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
40 | * | ||
41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
51 | * SUCH DAMAGE. | ||
52 | * | ||
53 | * The licence and distribution terms for any publically available version or | ||
54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
55 | * copied and put under another distribution licence | ||
56 | * [including the GNU Public Licence.] | ||
57 | */ | ||
58 | |||
59 | #include <openssl/crypto.h> | ||
60 | #include <openssl/idea.h> | ||
61 | #include "idea_lcl.h" | ||
62 | |||
63 | static IDEA_INT inverse(unsigned int xin); | ||
64 | void idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks) | ||
65 | #ifdef OPENSSL_FIPS | ||
66 | { | ||
67 | fips_cipher_abort(IDEA); | ||
68 | private_idea_set_encrypt_key(key, ks); | ||
69 | } | ||
70 | void private_idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks) | ||
71 | #endif | ||
72 | { | ||
73 | int i; | ||
74 | register IDEA_INT *kt,*kf,r0,r1,r2; | ||
75 | |||
76 | kt= &(ks->data[0][0]); | ||
77 | n2s(key,kt[0]); n2s(key,kt[1]); n2s(key,kt[2]); n2s(key,kt[3]); | ||
78 | n2s(key,kt[4]); n2s(key,kt[5]); n2s(key,kt[6]); n2s(key,kt[7]); | ||
79 | |||
80 | kf=kt; | ||
81 | kt+=8; | ||
82 | for (i=0; i<6; i++) | ||
83 | { | ||
84 | r2= kf[1]; | ||
85 | r1= kf[2]; | ||
86 | *(kt++)= ((r2<<9) | (r1>>7))&0xffff; | ||
87 | r0= kf[3]; | ||
88 | *(kt++)= ((r1<<9) | (r0>>7))&0xffff; | ||
89 | r1= kf[4]; | ||
90 | *(kt++)= ((r0<<9) | (r1>>7))&0xffff; | ||
91 | r0= kf[5]; | ||
92 | *(kt++)= ((r1<<9) | (r0>>7))&0xffff; | ||
93 | r1= kf[6]; | ||
94 | *(kt++)= ((r0<<9) | (r1>>7))&0xffff; | ||
95 | r0= kf[7]; | ||
96 | *(kt++)= ((r1<<9) | (r0>>7))&0xffff; | ||
97 | r1= kf[0]; | ||
98 | if (i >= 5) break; | ||
99 | *(kt++)= ((r0<<9) | (r1>>7))&0xffff; | ||
100 | *(kt++)= ((r1<<9) | (r2>>7))&0xffff; | ||
101 | kf+=8; | ||
102 | } | ||
103 | } | ||
104 | |||
105 | void idea_set_decrypt_key(IDEA_KEY_SCHEDULE *ek, IDEA_KEY_SCHEDULE *dk) | ||
106 | { | ||
107 | int r; | ||
108 | register IDEA_INT *fp,*tp,t; | ||
109 | |||
110 | tp= &(dk->data[0][0]); | ||
111 | fp= &(ek->data[8][0]); | ||
112 | for (r=0; r<9; r++) | ||
113 | { | ||
114 | *(tp++)=inverse(fp[0]); | ||
115 | *(tp++)=((int)(0x10000L-fp[2])&0xffff); | ||
116 | *(tp++)=((int)(0x10000L-fp[1])&0xffff); | ||
117 | *(tp++)=inverse(fp[3]); | ||
118 | if (r == 8) break; | ||
119 | fp-=6; | ||
120 | *(tp++)=fp[4]; | ||
121 | *(tp++)=fp[5]; | ||
122 | } | ||
123 | |||
124 | tp= &(dk->data[0][0]); | ||
125 | t=tp[1]; | ||
126 | tp[1]=tp[2]; | ||
127 | tp[2]=t; | ||
128 | |||
129 | t=tp[49]; | ||
130 | tp[49]=tp[50]; | ||
131 | tp[50]=t; | ||
132 | } | ||
133 | |||
134 | /* taken directly from the 'paper' I'll have a look at it later */ | ||
135 | static IDEA_INT inverse(unsigned int xin) | ||
136 | { | ||
137 | long n1,n2,q,r,b1,b2,t; | ||
138 | |||
139 | if (xin == 0) | ||
140 | b2=0; | ||
141 | else | ||
142 | { | ||
143 | n1=0x10001; | ||
144 | n2=xin; | ||
145 | b2=1; | ||
146 | b1=0; | ||
147 | |||
148 | do { | ||
149 | r=(n1%n2); | ||
150 | q=(n1-r)/n2; | ||
151 | if (r == 0) | ||
152 | { if (b2 < 0) b2=0x10001+b2; } | ||
153 | else | ||
154 | { | ||
155 | n1=n2; | ||
156 | n2=r; | ||
157 | t=b2; | ||
158 | b2=b1-q*b2; | ||
159 | b1=t; | ||
160 | } | ||
161 | } while (r != 0); | ||
162 | } | ||
163 | return((IDEA_INT)b2); | ||
164 | } | ||
diff --git a/src/lib/libcrypto/idea/idea_lcl.h b/src/lib/libcrypto/idea/idea_lcl.h new file mode 100644 index 0000000000..f3dbfa67e9 --- /dev/null +++ b/src/lib/libcrypto/idea/idea_lcl.h | |||
@@ -0,0 +1,215 @@ | |||
1 | /* crypto/idea/idea_lcl.h */ | ||
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * This package is an SSL implementation written | ||
6 | * by Eric Young (eay@cryptsoft.com). | ||
7 | * The implementation was written so as to conform with Netscapes SSL. | ||
8 | * | ||
9 | * This library is free for commercial and non-commercial use as long as | ||
10 | * the following conditions are aheared to. The following conditions | ||
11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
13 | * included with this distribution is covered by the same copyright terms | ||
14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
15 | * | ||
16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
17 | * the code are not to be removed. | ||
18 | * If this package is used in a product, Eric Young should be given attribution | ||
19 | * as the author of the parts of the library used. | ||
20 | * This can be in the form of a textual message at program startup or | ||
21 | * in documentation (online or textual) provided with the package. | ||
22 | * | ||
23 | * Redistribution and use in source and binary forms, with or without | ||
24 | * modification, are permitted provided that the following conditions | ||
25 | * are met: | ||
26 | * 1. Redistributions of source code must retain the copyright | ||
27 | * notice, this list of conditions and the following disclaimer. | ||
28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
29 | * notice, this list of conditions and the following disclaimer in the | ||
30 | * documentation and/or other materials provided with the distribution. | ||
31 | * 3. All advertising materials mentioning features or use of this software | ||
32 | * must display the following acknowledgement: | ||
33 | * "This product includes cryptographic software written by | ||
34 | * Eric Young (eay@cryptsoft.com)" | ||
35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
36 | * being used are not cryptographic related :-). | ||
37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
38 | * the apps directory (application code) you must include an acknowledgement: | ||
39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
40 | * | ||
41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
51 | * SUCH DAMAGE. | ||
52 | * | ||
53 | * The licence and distribution terms for any publically available version or | ||
54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
55 | * copied and put under another distribution licence | ||
56 | * [including the GNU Public Licence.] | ||
57 | */ | ||
58 | |||
/* The new form of this macro (check if the a*b == 0) was suggested by
 * Colin Plumb <colin@nyx10.cs.du.edu> */
/* Removal of the inner if is from Wei Dai 24/4/96 */
/* IDEA multiplication of a and b modulo 0x10001 (2^16+1), result in r.
 * In IDEA's representation the value 0 stands for 2^16.  The nonzero
 * path reduces the 32-bit product via lo-hi folding; the else branch
 * handles the case where a or b represents 2^16. */
#define idea_mul(r,a,b,ul) \
ul=(unsigned long)a*b; \
if (ul != 0) \
	{ \
	r=(ul&0xffff)-(ul>>16); \
	r-=((r)>>16); \
	} \
else \
	r=(-(int)a-b+1); /* assuming a or b is 0 and in range */
71 | |||
72 | #ifdef undef | ||
73 | #define idea_mul(r,a,b,ul,sl) \ | ||
74 | if (a == 0) r=(0x10001-b)&0xffff; \ | ||
75 | else if (b == 0) r=(0x10001-a)&0xffff; \ | ||
76 | else { \ | ||
77 | ul=(unsigned long)a*b; \ | ||
78 | sl=(ul&0xffff)-(ul>>16); \ | ||
79 | if (sl <= 0) sl+=0x10001; \ | ||
80 | r=sl; \ | ||
81 | } | ||
82 | #endif | ||
83 | |||
/* 7/12/95 - Many thanks to Rhys Weatherley <rweather@us.oracle.com>
 * for pointing out that I was assuming little-endian byte order for
 * all quantities, when IDEA actually uses big-endian.  Nowhere does
 * the spec mention this: it is written entirely in terms of 16-bit
 * numbers, and even the worked example does not use byte streams for
 * its input :-(.  If you byte-swap each pair of input, key and iv
 * bytes, these functions produce the same output as the old version :-(.
 */
93 | |||
/* NOTE - c is not incremented as per n2l */
/* Load up to n (1..8) bytes from c, big-endian, into the two 32-bit
 * halves l1 (bytes 0-3) and l2 (bytes 4-7).  The switch cases fall
 * through deliberately: entering at 'case n' consumes exactly n bytes,
 * walking c backward from c+n. */
#define n2ln(c,l1,l2,n) { \
	c+=n; \
	l1=l2=0; \
	switch (n) { \
	case 8: l2 =((unsigned long)(*(--(c))))    ; \
	case 7: l2|=((unsigned long)(*(--(c))))<< 8; \
	case 6: l2|=((unsigned long)(*(--(c))))<<16; \
	case 5: l2|=((unsigned long)(*(--(c))))<<24; \
	case 4: l1 =((unsigned long)(*(--(c))))    ; \
	case 3: l1|=((unsigned long)(*(--(c))))<< 8; \
	case 2: l1|=((unsigned long)(*(--(c))))<<16; \
	case 1: l1|=((unsigned long)(*(--(c))))<<24; \
		} \
	}

/* NOTE - c is not incremented as per l2n */
/* Inverse of n2ln: store up to n (1..8) bytes of l1/l2 to c,
 * big-endian, again relying on deliberate case fall-through. */
#define l2nn(l1,l2,c,n) { \
	c+=n; \
	switch (n) { \
	case 8: *(--(c))=(unsigned char)(((l2)    )&0xff); \
	case 7: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \
	case 6: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \
	case 5: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \
	case 4: *(--(c))=(unsigned char)(((l1)    )&0xff); \
	case 3: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \
	case 2: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \
	case 1: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \
		} \
	}
124 | |||
/* n2l: read a 32-bit value from c, big-endian, advancing c by 4. */
#undef n2l
#define n2l(c,l)        (l =((unsigned long)(*((c)++)))<<24L, \
                         l|=((unsigned long)(*((c)++)))<<16L, \
                         l|=((unsigned long)(*((c)++)))<< 8L, \
                         l|=((unsigned long)(*((c)++))))

/* l2n: write a 32-bit value to c, big-endian, advancing c by 4. */
#undef l2n
#define l2n(l,c)        (*((c)++)=(unsigned char)(((l)>>24L)&0xff), \
                         *((c)++)=(unsigned char)(((l)>>16L)&0xff), \
                         *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \
                         *((c)++)=(unsigned char)(((l)    )&0xff))

/* s2n: write a 16-bit value to c, least-significant byte first.
 * NOTE(review): s2n is little-endian while n2s below is big-endian,
 * so they are NOT byte-order inverses of each other - confirm which
 * callers (if any) rely on s2n. */
#undef s2n
#define s2n(l,c)        (*((c)++)=(unsigned char)(((l)    )&0xff), \
                         *((c)++)=(unsigned char)(((l)>> 8L)&0xff))

/* n2s: read a 16-bit value from c, big-endian, advancing c by 2. */
#undef n2s
#define n2s(c,l)        (l =((IDEA_INT)(*((c)++)))<< 8L, \
                         l|=((IDEA_INT)(*((c)++)))      )
144 | |||
145 | #ifdef undef | ||
146 | /* NOTE - c is not incremented as per c2l */ | ||
147 | #define c2ln(c,l1,l2,n) { \ | ||
148 | c+=n; \ | ||
149 | l1=l2=0; \ | ||
150 | switch (n) { \ | ||
151 | case 8: l2 =((unsigned long)(*(--(c))))<<24; \ | ||
152 | case 7: l2|=((unsigned long)(*(--(c))))<<16; \ | ||
153 | case 6: l2|=((unsigned long)(*(--(c))))<< 8; \ | ||
154 | case 5: l2|=((unsigned long)(*(--(c)))); \ | ||
155 | case 4: l1 =((unsigned long)(*(--(c))))<<24; \ | ||
156 | case 3: l1|=((unsigned long)(*(--(c))))<<16; \ | ||
157 | case 2: l1|=((unsigned long)(*(--(c))))<< 8; \ | ||
158 | case 1: l1|=((unsigned long)(*(--(c)))); \ | ||
159 | } \ | ||
160 | } | ||
161 | |||
162 | /* NOTE - c is not incremented as per l2c */ | ||
163 | #define l2cn(l1,l2,c,n) { \ | ||
164 | c+=n; \ | ||
165 | switch (n) { \ | ||
166 | case 8: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \ | ||
167 | case 7: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \ | ||
168 | case 6: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \ | ||
169 | case 5: *(--(c))=(unsigned char)(((l2) )&0xff); \ | ||
170 | case 4: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \ | ||
171 | case 3: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \ | ||
172 | case 2: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \ | ||
173 | case 1: *(--(c))=(unsigned char)(((l1) )&0xff); \ | ||
174 | } \ | ||
175 | } | ||
176 | |||
177 | #undef c2s | ||
178 | #define c2s(c,l) (l =((unsigned long)(*((c)++))) , \ | ||
179 | l|=((unsigned long)(*((c)++)))<< 8L) | ||
180 | |||
181 | #undef s2c | ||
182 | #define s2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ | ||
183 | *((c)++)=(unsigned char)(((l)>> 8L)&0xff)) | ||
184 | |||
185 | #undef c2l | ||
186 | #define c2l(c,l) (l =((unsigned long)(*((c)++))) , \ | ||
187 | l|=((unsigned long)(*((c)++)))<< 8L, \ | ||
188 | l|=((unsigned long)(*((c)++)))<<16L, \ | ||
189 | l|=((unsigned long)(*((c)++)))<<24L) | ||
190 | |||
191 | #undef l2c | ||
192 | #define l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ | ||
193 | *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ | ||
194 | *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ | ||
195 | *((c)++)=(unsigned char)(((l)>>24L)&0xff)) | ||
196 | #endif | ||
197 | |||
/* One full IDEA encryption round on the four 16-bit words x1..x4,
 * consuming six subkeys through p (advanced by 6 per invocation).
 * Uses idea_mul for multiplication mod 2^16+1; ul is scratch.
 * The final three lines perform the MA-output mixing together with
 * the swap of the two middle words. */
#define E_IDEA(num) \
	x1&=0xffff; \
	idea_mul(x1,x1,*p,ul); p++; \
	x2+= *(p++); \
	x3+= *(p++); \
	x4&=0xffff; \
	idea_mul(x4,x4,*p,ul); p++; \
	t0=(x1^x3)&0xffff; \
	idea_mul(t0,t0,*p,ul); p++; \
	t1=(t0+(x2^x4))&0xffff; \
	idea_mul(t1,t1,*p,ul); p++; \
	t0+=t1; \
	x1^=t1; \
	x4^=t0; \
	ul=x2^t0; /* do the swap to x3 */ \
	x2=x3^t1; \
	x3=ul;
215 | |||
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl new file mode 100644 index 0000000000..6358b2750f --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-alpha.pl | |||
@@ -0,0 +1,451 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # March 2010 | ||
11 | # | ||
12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
14 | # uses 256 bytes per-key table [+128 bytes shared table]. Even though | ||
15 | # loops are aggressively modulo-scheduled in respect to references to | ||
16 | # Htbl and Z.hi updates for 8 cycles per byte, measured performance is | ||
17 | # ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic | ||
18 | # scheduling "glitch," because uprofile(1) indicates uniform sample | ||
19 | # distribution, as if all instruction bundles execute in 1.5 cycles. | ||
20 | # Meaning that it could have been even faster, yet 12 cycles is ~60% | ||
21 | # better than gcc-generated code and ~80% than code generated by vendor | ||
22 | # compiler. | ||
23 | |||
# Symbolic names for the Alpha registers used by the generated code.
# Temporaries (caller-saved):
$cnt  = 'v0';   # $0, byte counter inside the multiply loops
$t0   = 't0';
$t1   = 't1';
$t2   = 't2';
$Thi0 = 't3';   # $4, Htbl entry for the low nibble
$Tlo0 = 't4';
$Thi1 = 't5';   #     Htbl entry for the high nibble
$Tlo1 = 't6';
$rem  = 't7';   # $8, rem_4bit reduction constant
#################
# Argument registers and remaining temporaries:
$Xi   = 'a0';   # $16, input argument block
$Htbl = 'a1';
$inp  = 'a2';
$len  = 'a3';
$nlo  = 'a4';   # $20, nibble-derived table offsets
$nhi  = 'a5';
$Zhi  = 't8';   #      128-bit accumulator Z
$Zlo  = 't9';
$Xhi  = 't10';  # $24, 128-bit input Xi
$Xlo  = 't11';
$remp = 't12';  #      pointer into rem_4bit
$rem_4bit = 'AT';  # $28
46 | |||
{ my $N;	# instantiation counter; makes emitted labels unique per call

# Emit one copy of the core 128-bit GHASH multiply: folds all 16 bytes
# of Xi into the accumulator Zhi:Zlo using the per-key 4-bit table at
# Htbl and the rem_4bit reduction constants.  Processes the low half
# ($Xlo, bytes 7..0, loop .Looplo) and then the high half ($Xhi,
# loop .Loophi).  Inlined twice (by gcm_gmult_4bit and gcm_ghash_4bit),
# hence the $N suffix on labels.
sub loop() {

$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}
246 | |||
247 | $code=<<___; | ||
248 | #ifdef __linux__ | ||
249 | #include <asm/regdef.h> | ||
250 | #else | ||
251 | #include <asm.h> | ||
252 | #include <regdef.h> | ||
253 | #endif | ||
254 | |||
255 | .text | ||
256 | |||
257 | .set noat | ||
258 | .set noreorder | ||
259 | .globl gcm_gmult_4bit | ||
260 | .align 4 | ||
261 | .ent gcm_gmult_4bit | ||
262 | gcm_gmult_4bit: | ||
263 | .frame sp,0,ra | ||
264 | .prologue 0 | ||
265 | |||
266 | ldq $Xlo,8($Xi) | ||
267 | ldq $Xhi,0($Xi) | ||
268 | |||
269 | br $rem_4bit,.Lpic1 | ||
270 | .Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit) | ||
271 | ___ | ||
272 | |||
&loop();	# inline the shared multiply loop (labels uniquified via $N)
274 | |||
275 | $code.=<<___; | ||
276 | srl $Zlo,24,$t0 # byte swap | ||
277 | srl $Zlo,8,$t1 | ||
278 | |||
279 | sll $Zlo,8,$t2 | ||
280 | sll $Zlo,24,$Zlo | ||
281 | zapnot $t0,0x11,$t0 | ||
282 | zapnot $t1,0x22,$t1 | ||
283 | |||
284 | zapnot $Zlo,0x88,$Zlo | ||
285 | or $t0,$t1,$t0 | ||
286 | zapnot $t2,0x44,$t2 | ||
287 | |||
288 | or $Zlo,$t0,$Zlo | ||
289 | srl $Zhi,24,$t0 | ||
290 | srl $Zhi,8,$t1 | ||
291 | |||
292 | or $Zlo,$t2,$Zlo | ||
293 | sll $Zhi,8,$t2 | ||
294 | sll $Zhi,24,$Zhi | ||
295 | |||
296 | srl $Zlo,32,$Xlo | ||
297 | sll $Zlo,32,$Zlo | ||
298 | |||
299 | zapnot $t0,0x11,$t0 | ||
300 | zapnot $t1,0x22,$t1 | ||
301 | or $Zlo,$Xlo,$Xlo | ||
302 | |||
303 | zapnot $Zhi,0x88,$Zhi | ||
304 | or $t0,$t1,$t0 | ||
305 | zapnot $t2,0x44,$t2 | ||
306 | |||
307 | or $Zhi,$t0,$Zhi | ||
308 | or $Zhi,$t2,$Zhi | ||
309 | |||
310 | srl $Zhi,32,$Xhi | ||
311 | sll $Zhi,32,$Zhi | ||
312 | |||
313 | or $Zhi,$Xhi,$Xhi | ||
314 | stq $Xlo,8($Xi) | ||
315 | stq $Xhi,0($Xi) | ||
316 | |||
317 | ret (ra) | ||
318 | .end gcm_gmult_4bit | ||
319 | ___ | ||
320 | |||
# Callee-saved registers carry the unaligned input halves across loop
# iterations; saved/restored in gcm_ghash_4bit's prologue/epilogue.
$inhi="s0";
$inlo="s1";
323 | |||
324 | $code.=<<___; | ||
325 | .globl gcm_ghash_4bit | ||
326 | .align 4 | ||
327 | .ent gcm_ghash_4bit | ||
328 | gcm_ghash_4bit: | ||
329 | lda sp,-32(sp) | ||
330 | stq ra,0(sp) | ||
331 | stq s0,8(sp) | ||
332 | stq s1,16(sp) | ||
333 | .mask 0x04000600,-32 | ||
334 | .frame sp,32,ra | ||
335 | .prologue 0 | ||
336 | |||
337 | ldq_u $inhi,0($inp) | ||
338 | ldq_u $Thi0,7($inp) | ||
339 | ldq_u $inlo,8($inp) | ||
340 | ldq_u $Tlo0,15($inp) | ||
341 | ldq $Xhi,0($Xi) | ||
342 | ldq $Xlo,8($Xi) | ||
343 | |||
344 | br $rem_4bit,.Lpic2 | ||
345 | .Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit) | ||
346 | |||
347 | .Louter: | ||
348 | extql $inhi,$inp,$inhi | ||
349 | extqh $Thi0,$inp,$Thi0 | ||
350 | or $inhi,$Thi0,$inhi | ||
351 | lda $inp,16($inp) | ||
352 | |||
353 | extql $inlo,$inp,$inlo | ||
354 | extqh $Tlo0,$inp,$Tlo0 | ||
355 | or $inlo,$Tlo0,$inlo | ||
356 | subq $len,16,$len | ||
357 | |||
358 | xor $Xlo,$inlo,$Xlo | ||
359 | xor $Xhi,$inhi,$Xhi | ||
360 | ___ | ||
361 | |||
362 | &loop(); | ||
363 | |||
364 | $code.=<<___; | ||
365 | srl $Zlo,24,$t0 # byte swap | ||
366 | srl $Zlo,8,$t1 | ||
367 | |||
368 | sll $Zlo,8,$t2 | ||
369 | sll $Zlo,24,$Zlo | ||
370 | zapnot $t0,0x11,$t0 | ||
371 | zapnot $t1,0x22,$t1 | ||
372 | |||
373 | zapnot $Zlo,0x88,$Zlo | ||
374 | or $t0,$t1,$t0 | ||
375 | zapnot $t2,0x44,$t2 | ||
376 | |||
377 | or $Zlo,$t0,$Zlo | ||
378 | srl $Zhi,24,$t0 | ||
379 | srl $Zhi,8,$t1 | ||
380 | |||
381 | or $Zlo,$t2,$Zlo | ||
382 | sll $Zhi,8,$t2 | ||
383 | sll $Zhi,24,$Zhi | ||
384 | |||
385 | srl $Zlo,32,$Xlo | ||
386 | sll $Zlo,32,$Zlo | ||
387 | beq $len,.Ldone | ||
388 | |||
389 | zapnot $t0,0x11,$t0 | ||
390 | zapnot $t1,0x22,$t1 | ||
391 | or $Zlo,$Xlo,$Xlo | ||
392 | ldq_u $inhi,0($inp) | ||
393 | |||
394 | zapnot $Zhi,0x88,$Zhi | ||
395 | or $t0,$t1,$t0 | ||
396 | zapnot $t2,0x44,$t2 | ||
397 | ldq_u $Thi0,7($inp) | ||
398 | |||
399 | or $Zhi,$t0,$Zhi | ||
400 | or $Zhi,$t2,$Zhi | ||
401 | ldq_u $inlo,8($inp) | ||
402 | ldq_u $Tlo0,15($inp) | ||
403 | |||
404 | srl $Zhi,32,$Xhi | ||
405 | sll $Zhi,32,$Zhi | ||
406 | |||
407 | or $Zhi,$Xhi,$Xhi | ||
408 | br zero,.Louter | ||
409 | |||
410 | .Ldone: | ||
411 | zapnot $t0,0x11,$t0 | ||
412 | zapnot $t1,0x22,$t1 | ||
413 | or $Zlo,$Xlo,$Xlo | ||
414 | |||
415 | zapnot $Zhi,0x88,$Zhi | ||
416 | or $t0,$t1,$t0 | ||
417 | zapnot $t2,0x44,$t2 | ||
418 | |||
419 | or $Zhi,$t0,$Zhi | ||
420 | or $Zhi,$t2,$Zhi | ||
421 | |||
422 | srl $Zhi,32,$Xhi | ||
423 | sll $Zhi,32,$Zhi | ||
424 | |||
425 | or $Zhi,$Xhi,$Xhi | ||
426 | |||
427 | stq $Xlo,8($Xi) | ||
428 | stq $Xhi,0($Xi) | ||
429 | |||
430 | .set noreorder | ||
431 | /*ldq ra,0(sp)*/ | ||
432 | ldq s0,8(sp) | ||
433 | ldq s1,16(sp) | ||
434 | lda sp,32(sp) | ||
435 | ret (ra) | ||
436 | .end gcm_ghash_4bit | ||
437 | |||
438 | .align 4 | ||
439 | rem_4bit: | ||
440 | .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | ||
441 | .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | ||
442 | .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | ||
443 | .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | ||
444 | .ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | ||
445 | .align 4 | ||
446 | |||
447 | ___ | ||
# Emit the generated assembly: to the file named on the command line,
# or to the existing STDOUT when no argument is given.  The previous
# two-arg open was unchecked, so a failed open silently discarded all
# output; buffered write errors were likewise lost by the bare close.
if ($output=shift) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing output: $!";
451 | |||
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl new file mode 100644 index 0000000000..d91586ee29 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-armv4.pl | |||
@@ -0,0 +1,429 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # April 2010 | ||
11 | # | ||
12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
14 | # uses 256 bytes per-key table [+32 bytes shared table]. There is no | ||
15 | # experimental performance data available yet. The only approximation | ||
16 | # that can be made at this point is based on code size. Inner loop is | ||
17 | # 32 instructions long and on single-issue core should execute in <40 | ||
18 | # cycles. Having verified that gcc 3.4 didn't unroll corresponding | ||
19 | # loop, this assembler loop body was found to be ~3x smaller than | ||
20 | # compiler-generated one... | ||
21 | # | ||
22 | # July 2010 | ||
23 | # | ||
24 | # Rescheduling for dual-issue pipeline resulted in 8.5% improvement on | ||
25 | # Cortex A8 core and ~25 cycles per processed byte (which was observed | ||
26 | # to be ~3 times faster than gcc-generated code:-) | ||
27 | # | ||
28 | # February 2011 | ||
29 | # | ||
30 | # Profiler-assisted and platform-specific optimization resulted in 7% | ||
31 | # improvement on Cortex A8 core and ~23.5 cycles per byte. | ||
32 | # | ||
33 | # March 2011 | ||
34 | # | ||
35 | # Add NEON implementation featuring polynomial multiplication, i.e. no | ||
36 | # lookup tables involved. On Cortex A8 it was measured to process one | ||
37 | # byte in 15 cycles or 55% faster than integer-only code. | ||
38 | |||
39 | # ==================================================================== | ||
40 | # Note about "528B" variant. In ARM case it makes lesser sense to | ||
41 | # implement it for following reasons: | ||
42 | # | ||
43 | # - performance improvement won't be anywhere near 50%, because 128- | ||
44 | # bit shift operation is neatly fused with 128-bit xor here, and | ||
45 | # "538B" variant would eliminate only 4-5 instructions out of 32 | ||
46 | # in the inner loop (meaning that estimated improvement is ~15%); | ||
47 | # - ARM-based systems are often embedded ones and extra memory | ||
48 | # consumption might be unappreciated (for so little improvement); | ||
49 | # | ||
50 | # Byte order [in]dependence. ========================================= | ||
51 | # | ||
52 | # Caller is expected to maintain specific *dword* order in Htable, | ||
53 | # namely with *least* significant dword of 128-bit value at *lower* | ||
54 | # address. This differs completely from C code and has everything to | ||
55 | # do with ldm instruction and order in which dwords are "consumed" by | ||
56 | # algorithm. *Byte* order within these dwords in turn is whatever | ||
57 | # *native* byte order on current platform. See gcm128.c for working | ||
58 | # example... | ||
59 | |||
# Skip leading non-filename arguments (e.g. compiler flags) until one
# looks like an output filename, then redirect STDOUT to it.  The
# previous two-arg open was unchecked: on failure STDOUT ended up
# closed and every generated line was silently discarded.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,'>',$output or die "can't open output file: $!";
62 | |||
# ARM register assignment for the generated code.
$Xi="r0";	# argument block: r0 = Xi (128-bit hash value)
$Htbl="r1";	#                 r1 = per-key 4-bit lookup table
$inp="r2";	#                 r2 = input pointer (ghash only)
$len="r3";	#                 r3 = input length  (ghash only)

$Zll="r4";	# variables: 128-bit accumulator Z, four 32-bit words
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";	# 128-bit table entry loaded from Htbl
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";	# low/high-nibble table offsets
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter

# gcm_gmult_4bit takes no inp/len arguments, so those registers are
# reused there for the rem_4bit pointer and the loop counter.
$rem_4bit=$inp;	# used in gcm_gmult_4bit
$cnt=$len;
83 | |||
# Emit code that byte-reverses the four 32-bit words of Z and stores
# them back to Xi[0..15].  On ARMv7 little-endian a single rev+str per
# word suffices; big-endian stores directly; otherwise fall back to
# four strb's per word.  Each caller-supplied argument string is
# emitted as one filler instruction after each word's store sequence.
sub Zsmash() {
	my $offset = 12;		# Xi byte offset, walked 12,8,4,0
	my @filler = @_;
	foreach my $word ($Zll,$Zlh,$Zhl,$Zhh) {
		$code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$word,$word
	str	$word,[$Xi,#$offset]
#elif defined(__ARMEB__)
	str	$word,[$Xi,#$offset]
#else
	mov	$Tlh,$word,lsr#8
	strb	$word,[$Xi,#$offset+3]
	mov	$Thl,$word,lsr#16
	strb	$Tlh,[$Xi,#$offset+2]
	mov	$Thh,$word,lsr#24
	strb	$Thl,[$Xi,#$offset+1]
	strb	$Thh,[$Xi,#$offset]
#endif
___
		$code .= "\t" . shift(@filler) . "\n";
		$offset -= 4;
	}
}
108 | |||
109 | $code=<<___; | ||
110 | #include "arm_arch.h" | ||
111 | |||
112 | .text | ||
113 | .code 32 | ||
114 | |||
115 | .type rem_4bit,%object | ||
116 | .align 5 | ||
117 | rem_4bit: | ||
118 | .short 0x0000,0x1C20,0x3840,0x2460 | ||
119 | .short 0x7080,0x6CA0,0x48C0,0x54E0 | ||
120 | .short 0xE100,0xFD20,0xD940,0xC560 | ||
121 | .short 0x9180,0x8DA0,0xA9C0,0xB5E0 | ||
122 | .size rem_4bit,.-rem_4bit | ||
123 | |||
124 | .type rem_4bit_get,%function | ||
125 | rem_4bit_get: | ||
126 | sub $rem_4bit,pc,#8 | ||
127 | sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit | ||
128 | b .Lrem_4bit_got | ||
129 | nop | ||
130 | .size rem_4bit_get,.-rem_4bit_get | ||
131 | |||
132 | .global gcm_ghash_4bit | ||
133 | .type gcm_ghash_4bit,%function | ||
134 | gcm_ghash_4bit: | ||
135 | sub r12,pc,#8 | ||
136 | add $len,$inp,$len @ $len to point at the end | ||
137 | stmdb sp!,{r3-r11,lr} @ save $len/end too | ||
138 | sub r12,r12,#48 @ &rem_4bit | ||
139 | |||
140 | ldmia r12,{r4-r11} @ copy rem_4bit ... | ||
141 | stmdb sp!,{r4-r11} @ ... to stack | ||
142 | |||
143 | ldrb $nlo,[$inp,#15] | ||
144 | ldrb $nhi,[$Xi,#15] | ||
145 | .Louter: | ||
146 | eor $nlo,$nlo,$nhi | ||
147 | and $nhi,$nlo,#0xf0 | ||
148 | and $nlo,$nlo,#0x0f | ||
149 | mov $cnt,#14 | ||
150 | |||
151 | add $Zhh,$Htbl,$nlo,lsl#4 | ||
152 | ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] | ||
153 | add $Thh,$Htbl,$nhi | ||
154 | ldrb $nlo,[$inp,#14] | ||
155 | |||
156 | and $nhi,$Zll,#0xf @ rem | ||
157 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
158 | add $nhi,$nhi,$nhi | ||
159 | eor $Zll,$Tll,$Zll,lsr#4 | ||
160 | ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] | ||
161 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
162 | ldrb $nhi,[$Xi,#14] | ||
163 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
164 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
165 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
166 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
167 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
168 | eor $nlo,$nlo,$nhi | ||
169 | and $nhi,$nlo,#0xf0 | ||
170 | and $nlo,$nlo,#0x0f | ||
171 | eor $Zhh,$Zhh,$Tll,lsl#16 | ||
172 | |||
173 | .Linner: | ||
174 | add $Thh,$Htbl,$nlo,lsl#4 | ||
175 | and $nlo,$Zll,#0xf @ rem | ||
176 | subs $cnt,$cnt,#1 | ||
177 | add $nlo,$nlo,$nlo | ||
178 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] | ||
179 | eor $Zll,$Tll,$Zll,lsr#4 | ||
180 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
181 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
182 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
183 | ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] | ||
184 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
185 | ldrplb $nlo,[$inp,$cnt] | ||
186 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
187 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
188 | |||
189 | add $Thh,$Htbl,$nhi | ||
190 | and $nhi,$Zll,#0xf @ rem | ||
191 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
192 | add $nhi,$nhi,$nhi | ||
193 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
194 | eor $Zll,$Tll,$Zll,lsr#4 | ||
195 | ldrplb $Tll,[$Xi,$cnt] | ||
196 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
197 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
198 | ldrh $Tlh,[sp,$nhi] | ||
199 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
200 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
201 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
202 | eorpl $nlo,$nlo,$Tll | ||
203 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
204 | andpl $nhi,$nlo,#0xf0 | ||
205 | andpl $nlo,$nlo,#0x0f | ||
206 | eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] | ||
207 | bpl .Linner | ||
208 | |||
209 | ldr $len,[sp,#32] @ re-load $len/end | ||
210 | add $inp,$inp,#16 | ||
211 | mov $nhi,$Zll | ||
212 | ___ | ||
# Store Z back to Xi; the two filler instructions compare $inp against
# the end pointer and, if more input remains, preload byte 15 of the
# next block.
&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
214 | $code.=<<___; | ||
215 | bne .Louter | ||
216 | |||
217 | add sp,sp,#36 | ||
218 | #if __ARM_ARCH__>=5 | ||
219 | ldmia sp!,{r4-r11,pc} | ||
220 | #else | ||
221 | ldmia sp!,{r4-r11,lr} | ||
222 | tst lr,#1 | ||
223 | moveq pc,lr @ be binary compatible with V4, yet | ||
224 | bx lr @ interoperable with Thumb ISA:-) | ||
225 | #endif | ||
226 | .size gcm_ghash_4bit,.-gcm_ghash_4bit | ||
227 | |||
228 | .global gcm_gmult_4bit | ||
229 | .type gcm_gmult_4bit,%function | ||
230 | gcm_gmult_4bit: | ||
231 | stmdb sp!,{r4-r11,lr} | ||
232 | ldrb $nlo,[$Xi,#15] | ||
233 | b rem_4bit_get | ||
234 | .Lrem_4bit_got: | ||
235 | and $nhi,$nlo,#0xf0 | ||
236 | and $nlo,$nlo,#0x0f | ||
237 | mov $cnt,#14 | ||
238 | |||
239 | add $Zhh,$Htbl,$nlo,lsl#4 | ||
240 | ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] | ||
241 | ldrb $nlo,[$Xi,#14] | ||
242 | |||
243 | add $Thh,$Htbl,$nhi | ||
244 | and $nhi,$Zll,#0xf @ rem | ||
245 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
246 | add $nhi,$nhi,$nhi | ||
247 | eor $Zll,$Tll,$Zll,lsr#4 | ||
248 | ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] | ||
249 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
250 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
251 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
252 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
253 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
254 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
255 | and $nhi,$nlo,#0xf0 | ||
256 | eor $Zhh,$Zhh,$Tll,lsl#16 | ||
257 | and $nlo,$nlo,#0x0f | ||
258 | |||
259 | .Loop: | ||
260 | add $Thh,$Htbl,$nlo,lsl#4 | ||
261 | and $nlo,$Zll,#0xf @ rem | ||
262 | subs $cnt,$cnt,#1 | ||
263 | add $nlo,$nlo,$nlo | ||
264 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] | ||
265 | eor $Zll,$Tll,$Zll,lsr#4 | ||
266 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
267 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
268 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
269 | ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] | ||
270 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
271 | ldrplb $nlo,[$Xi,$cnt] | ||
272 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
273 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
274 | |||
275 | add $Thh,$Htbl,$nhi | ||
276 | and $nhi,$Zll,#0xf @ rem | ||
277 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
278 | add $nhi,$nhi,$nhi | ||
279 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
280 | eor $Zll,$Tll,$Zll,lsr#4 | ||
281 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
282 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
283 | ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] | ||
284 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
285 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
286 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
287 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
288 | andpl $nhi,$nlo,#0xf0 | ||
289 | andpl $nlo,$nlo,#0x0f | ||
290 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
291 | bpl .Loop | ||
292 | ___ | ||
293 | &Zsmash(); | ||
294 | $code.=<<___; | ||
295 | #if __ARM_ARCH__>=5 | ||
296 | ldmia sp!,{r4-r11,pc} | ||
297 | #else | ||
298 | ldmia sp!,{r4-r11,lr} | ||
299 | tst lr,#1 | ||
300 | moveq pc,lr @ be binary compatible with V4, yet | ||
301 | bx lr @ interoperable with Thumb ISA:-) | ||
302 | #endif | ||
303 | .size gcm_gmult_4bit,.-gcm_gmult_4bit | ||
304 | ___ | ||
305 | { | ||
306 | my $cnt=$Htbl; # $Htbl is used once in the very beginning | ||
307 | |||
308 | my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); | ||
309 | my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); | ||
310 | |||
311 | # Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit | ||
312 | # in Zo. Or should I say "top bit", because GHASH is specified in | ||
313 | # reverse bit order? Otherwise straightforward 128-bt H by one input | ||
314 | # byte multiplication and modulo-reduction, times 16. | ||
315 | |||
316 | sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } | ||
317 | sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } | ||
318 | sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } | ||
319 | |||
320 | $code.=<<___; | ||
321 | #if __ARM_ARCH__>=7 | ||
322 | .fpu neon | ||
323 | |||
324 | .global gcm_gmult_neon | ||
325 | .type gcm_gmult_neon,%function | ||
326 | .align 4 | ||
327 | gcm_gmult_neon: | ||
328 | sub $Htbl,#16 @ point at H in GCM128_CTX | ||
329 | vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi | ||
330 | vmov.i32 $mod,#0xe1 @ our irreducible polynomial | ||
331 | vld1.64 `&Dlo("$IN")`,[$Xi,:64]! | ||
332 | vshr.u64 $mod,#32 | ||
333 | vldmia $Htbl,{$Hhi-$Hlo} @ load H | ||
334 | veor $zero,$zero | ||
335 | #ifdef __ARMEL__ | ||
336 | vrev64.8 $IN,$IN | ||
337 | #endif | ||
338 | veor $Qpost,$Qpost | ||
339 | veor $R,$R | ||
340 | mov $cnt,#16 | ||
341 | veor $Z,$Z | ||
342 | mov $len,#16 | ||
343 | veor $Zo,$Zo | ||
344 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
345 | b .Linner_neon | ||
346 | .size gcm_gmult_neon,.-gcm_gmult_neon | ||
347 | |||
348 | .global gcm_ghash_neon | ||
349 | .type gcm_ghash_neon,%function | ||
350 | .align 4 | ||
351 | gcm_ghash_neon: | ||
352 | vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi | ||
353 | vmov.i32 $mod,#0xe1 @ our irreducible polynomial | ||
354 | vld1.64 `&Dlo("$Z")`,[$Xi,:64]! | ||
355 | vshr.u64 $mod,#32 | ||
356 | vldmia $Xi,{$Hhi-$Hlo} @ load H | ||
357 | veor $zero,$zero | ||
358 | nop | ||
359 | #ifdef __ARMEL__ | ||
360 | vrev64.8 $Z,$Z | ||
361 | #endif | ||
362 | .Louter_neon: | ||
363 | vld1.64 `&Dhi($IN)`,[$inp]! @ load inp | ||
364 | veor $Qpost,$Qpost | ||
365 | vld1.64 `&Dlo($IN)`,[$inp]! | ||
366 | veor $R,$R | ||
367 | mov $cnt,#16 | ||
368 | #ifdef __ARMEL__ | ||
369 | vrev64.8 $IN,$IN | ||
370 | #endif | ||
371 | veor $Zo,$Zo | ||
372 | veor $IN,$Z @ inp^=Xi | ||
373 | veor $Z,$Z | ||
374 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
375 | .Linner_neon: | ||
376 | subs $cnt,$cnt,#1 | ||
377 | vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i] | ||
378 | vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i] | ||
379 | vext.8 $IN,$zero,#1 @ IN>>=8 | ||
380 | |||
381 | veor $Z,$Qpost @ modulo-scheduled part | ||
382 | vshl.i64 `&Dlo("$R")`,#48 | ||
383 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
384 | veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")` | ||
385 | |||
386 | veor `&Dhi("$Z")`,`&Dlo("$R")` | ||
387 | vuzp.8 $Qlo,$Qhi | ||
388 | vsli.8 $Zo,$T,#1 @ compose the "carry" byte | ||
389 | vext.8 $Z,$zero,#1 @ Z>>=8 | ||
390 | |||
391 | vmull.p8 $R,$Zo,$mod @ "carry"·0xe1 | ||
392 | vshr.u8 $Zo,$T,#7 @ save Z's bottom bit | ||
393 | vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8 | ||
394 | veor $Z,$Qhi | ||
395 | bne .Linner_neon | ||
396 | |||
397 | veor $Z,$Qpost @ modulo-scheduled artefact | ||
398 | vshl.i64 `&Dlo("$R")`,#48 | ||
399 | veor `&Dhi("$Z")`,`&Dlo("$R")` | ||
400 | |||
401 | @ finalization, normalize Z:Zo | ||
402 | vand $Zo,$mod @ suffices to mask the bit | ||
403 | vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 | ||
404 | vshl.i64 $Z,#1 | ||
405 | subs $len,#16 | ||
406 | vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1 | ||
407 | bne .Louter_neon | ||
408 | |||
409 | #ifdef __ARMEL__ | ||
410 | vrev64.8 $Z,$Z | ||
411 | #endif | ||
412 | sub $Xi,#16 | ||
413 | vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi | ||
414 | vst1.64 `&Dlo("$Z")`,[$Xi,:64] | ||
415 | |||
416 | bx lr | ||
417 | .size gcm_ghash_neon,.-gcm_ghash_neon | ||
418 | #endif | ||
419 | ___ | ||
420 | } | ||
421 | $code.=<<___; | ||
422 | .asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
423 | .align 2 | ||
424 | ___ | ||
425 | |||
426 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
427 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
428 | print $code; | ||
429 | close STDOUT; # enforce flush | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl new file mode 100755 index 0000000000..0354c95444 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-ia64.pl | |||
@@ -0,0 +1,463 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # March 2010 | ||
11 | # | ||
12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
14 | # uses 256 bytes per-key table [+128 bytes shared table]. Streamed | ||
15 | # GHASH performance was measured to be 6.67 cycles per processed byte | ||
16 | # on Itanium 2, which is >90% better than Microsoft compiler generated | ||
17 | # code. To anchor to something else sha1-ia64.pl module processes one | ||
18 | # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per | ||
19 | # byte. | ||
20 | |||
21 | # September 2010 | ||
22 | # | ||
23 | # It was originally thought that it makes lesser sense to implement | ||
24 | # "528B" variant on Itanium 2 for following reason. Because number of | ||
25 | # functional units is naturally limited, it appeared impossible to | ||
26 | # implement "528B" loop in 4 cycles, only in 5. This would mean that | ||
27 | # theoretically performance improvement couldn't be more than 20%. | ||
28 | # But occasionally you prove yourself wrong:-) I figured out a way to | ||
29 | # fold couple of instructions and having freed yet another instruction | ||
30 | # slot by unrolling the loop... Resulting performance is 4.45 cycles | ||
31 | # per processed byte and 50% better than "256B" version. On original | ||
32 | # Itanium performance should remain the same as the "256B" version, | ||
33 | # i.e. ~8.5 cycles. | ||
34 | |||
35 | $output=shift and (open STDOUT,">$output" or die "can't open $output: $!"); | ||
36 | |||
37 | if ($^O eq "hpux") { | ||
38 | $ADDP="addp4"; | ||
39 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } | ||
40 | } else { $ADDP="add"; } | ||
41 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); | ||
42 | $big_endian=0 if (/\-DL_ENDIAN/); } | ||
43 | if (!defined($big_endian)) | ||
44 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
45 | |||
46 | sub loop() { | ||
47 | my $label=shift; | ||
48 | my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp | ||
49 | |||
50 | # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e. | ||
51 | # in scalable manner;-) Naturally assuming data in L1 cache... | ||
52 | # Special note about 'dep' instruction, which is used to construct | ||
53 | # &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128 | ||
54 | # bytes boundary and lower 7 bits of its address are guaranteed to | ||
55 | # be zero. | ||
56 | $code.=<<___; | ||
57 | $label: | ||
58 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | ||
59 | (p19) dep rem=Zlo,rem_4bitp,3,4 } | ||
60 | { .mfi; (p19) xor Zhi=Zhi,Hhi | ||
61 | ($p17) xor xi[1]=xi[1],in[1] };; | ||
62 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | ||
63 | (p19) shrp Zlo=Zhi,Zlo,4 } | ||
64 | { .mfi; (p19) ld8 rem=[rem] | ||
65 | (p18) and Hi[1]=mask0xf0,xi[2] };; | ||
66 | { .mmi; ($p16) ld1 in[0]=[inp],-1 | ||
67 | (p18) xor Zlo=Zlo,Hlo | ||
68 | (p19) shr.u Zhi=Zhi,4 } | ||
69 | { .mib; (p19) xor Hhi=Hhi,rem | ||
70 | (p18) add Hi[1]=Htbl,Hi[1] };; | ||
71 | |||
72 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | ||
73 | (p18) dep rem=Zlo,rem_4bitp,3,4 } | ||
74 | { .mfi; (p17) shladd Hi[0]=xi[1],4,r0 | ||
75 | (p18) xor Zhi=Zhi,Hhi };; | ||
76 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | ||
77 | (p18) shrp Zlo=Zhi,Zlo,4 } | ||
78 | { .mfi; (p18) ld8 rem=[rem] | ||
79 | (p17) and Hi[0]=mask0xf0,Hi[0] };; | ||
80 | { .mmi; (p16) ld1 xi[0]=[Xi],-1 | ||
81 | (p18) xor Zlo=Zlo,Hlo | ||
82 | (p18) shr.u Zhi=Zhi,4 } | ||
83 | { .mib; (p18) xor Hhi=Hhi,rem | ||
84 | (p17) add Hi[0]=Htbl,Hi[0] | ||
85 | br.ctop.sptk $label };; | ||
86 | ___ | ||
87 | } | ||
88 | |||
89 | $code=<<___; | ||
90 | .explicit | ||
91 | .text | ||
92 | |||
93 | prevfs=r2; prevlc=r3; prevpr=r8; | ||
94 | mask0xf0=r21; | ||
95 | rem=r22; rem_4bitp=r23; | ||
96 | Xi=r24; Htbl=r25; | ||
97 | inp=r26; end=r27; | ||
98 | Hhi=r28; Hlo=r29; | ||
99 | Zhi=r30; Zlo=r31; | ||
100 | |||
101 | .align 128 | ||
102 | .skip 16 // aligns loop body | ||
103 | .global gcm_gmult_4bit# | ||
104 | .proc gcm_gmult_4bit# | ||
105 | gcm_gmult_4bit: | ||
106 | .prologue | ||
107 | { .mmi; .save ar.pfs,prevfs | ||
108 | alloc prevfs=ar.pfs,2,6,0,8 | ||
109 | $ADDP Xi=15,in0 // &Xi[15] | ||
110 | mov rem_4bitp=ip } | ||
111 | { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo | ||
112 | .save ar.lc,prevlc | ||
113 | mov prevlc=ar.lc | ||
114 | .save pr,prevpr | ||
115 | mov prevpr=pr };; | ||
116 | |||
117 | .body | ||
118 | .rotr in[3],xi[3],Hi[2] | ||
119 | |||
120 | { .mib; ld1 xi[2]=[Xi],-1 // Xi[15] | ||
121 | mov mask0xf0=0xf0 | ||
122 | brp.loop.imp .Loop1,.Lend1-16};; | ||
123 | { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] | ||
124 | };; | ||
125 | { .mii; shladd Hi[1]=xi[2],4,r0 | ||
126 | mov pr.rot=0x7<<16 | ||
127 | mov ar.lc=13 };; | ||
128 | { .mii; and Hi[1]=mask0xf0,Hi[1] | ||
129 | mov ar.ec=3 | ||
130 | xor Zlo=Zlo,Zlo };; | ||
131 | { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo | ||
132 | add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp | ||
133 | xor Zhi=Zhi,Zhi };; | ||
134 | ___ | ||
135 | &loop (".Loop1",1); | ||
136 | $code.=<<___; | ||
137 | .Lend1: | ||
138 | { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact | ||
139 | { .mib; mux1 Zlo=Zlo,\@rev };; | ||
140 | { .mib; mux1 Zhi=Zhi,\@rev };; | ||
141 | { .mmi; add Hlo=9,Xi;; // ;; is here to prevent | ||
142 | add Hhi=1,Xi };; // pipeline flush on Itanium | ||
143 | { .mib; st8 [Hlo]=Zlo | ||
144 | mov pr=prevpr,0x1ffff };; | ||
145 | { .mib; st8 [Hhi]=Zhi | ||
146 | mov ar.lc=prevlc | ||
147 | br.ret.sptk.many b0 };; | ||
148 | .endp gcm_gmult_4bit# | ||
149 | ___ | ||
150 | |||
151 | ###################################################################### | ||
152 | # "528B" (well, "512B" actualy) streamed GHASH | ||
153 | # | ||
154 | $Xip="in0"; | ||
155 | $Htbl="in1"; | ||
156 | $inp="in2"; | ||
157 | $len="in3"; | ||
158 | $rem_8bit="loc0"; | ||
159 | $mask0xff="loc1"; | ||
160 | ($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum"); | ||
161 | |||
162 | sub load_htable() { | ||
163 | for (my $i=0;$i<8;$i++) { | ||
164 | $code.=<<___; | ||
165 | { .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi | ||
166 | ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo | ||
167 | { .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi | ||
168 | ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo | ||
169 | ___ | ||
170 | $code.=shift if (($i+$#_)==7); | ||
171 | $code.="\t};;\n" | ||
172 | } | ||
173 | } | ||
174 | |||
175 | $code.=<<___; | ||
176 | prevsp=r3; | ||
177 | |||
178 | .align 32 | ||
179 | .skip 16 // aligns loop body | ||
180 | .global gcm_ghash_4bit# | ||
181 | .proc gcm_ghash_4bit# | ||
182 | gcm_ghash_4bit: | ||
183 | .prologue | ||
184 | { .mmi; .save ar.pfs,prevfs | ||
185 | alloc prevfs=ar.pfs,4,2,0,0 | ||
186 | .vframe prevsp | ||
187 | mov prevsp=sp | ||
188 | mov $rem_8bit=ip };; | ||
189 | .body | ||
190 | { .mfi; $ADDP r8=0+0,$Htbl | ||
191 | $ADDP r9=0+8,$Htbl } | ||
192 | { .mfi; $ADDP r10=128+0,$Htbl | ||
193 | $ADDP r11=128+8,$Htbl };; | ||
194 | ___ | ||
195 | &load_htable( | ||
196 | " $ADDP $Xip=15,$Xip", # &Xi[15] | ||
197 | " $ADDP $len=$len,$inp", # &inp[len] | ||
198 | " $ADDP $inp=15,$inp", # &inp[15] | ||
199 | " mov $mask0xff=0xff", | ||
200 | " add sp=-512,sp", | ||
201 | " andcm sp=sp,$mask0xff", # align stack frame | ||
202 | " add r14=0,sp", | ||
203 | " add r15=8,sp"); | ||
204 | $code.=<<___; | ||
205 | { .mmi; $sum 1<<1 // go big-endian | ||
206 | add r8=256+0,sp | ||
207 | add r9=256+8,sp } | ||
208 | { .mmi; add r10=256+128+0,sp | ||
209 | add r11=256+128+8,sp | ||
210 | add $len=-17,$len };; | ||
211 | ___ | ||
212 | for($i=0;$i<8;$i++) { # generate first half of Hshr4[] | ||
213 | my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1)); | ||
214 | $code.=<<___; | ||
215 | { .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo | ||
216 | st8 [r9]=$rhi,16 // Htable[$i].hi | ||
217 | shrp $rlo=$rhi,$rlo,4 }//;; | ||
218 | { .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo | ||
219 | stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi | ||
220 | shr.u $rhi=$rhi,4 };; | ||
221 | { .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4 | ||
222 | st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4 | ||
223 | ___ | ||
224 | } | ||
225 | $code.=<<___; | ||
226 | { .mmi; ld8 r16=[r8],16 // Htable[8].lo | ||
227 | ld8 r17=[r9],16 };; // Htable[8].hi | ||
228 | { .mmi; ld8 r18=[r8],16 // Htable[9].lo | ||
229 | ld8 r19=[r9],16 } // Htable[9].hi | ||
230 | { .mmi; rum 1<<5 // clear um.mfh | ||
231 | shrp r16=r17,r16,4 };; | ||
232 | ___ | ||
233 | for($i=0;$i<6;$i++) { # generate second half of Hshr4[] | ||
234 | $code.=<<___; | ||
235 | { .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo | ||
236 | ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi | ||
237 | shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; | ||
238 | { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 | ||
239 | st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 | ||
240 | shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } | ||
241 | ___ | ||
242 | } | ||
243 | $code.=<<___; | ||
244 | { .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; | ||
245 | { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 | ||
246 | st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 | ||
247 | shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } | ||
248 | { .mmi; add $Htbl=256,sp // &Htable[0] | ||
249 | add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit | ||
250 | shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };; | ||
251 | { .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4 | ||
252 | st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4 | ||
253 | ___ | ||
254 | |||
255 | $in="r15"; | ||
256 | @xi=("r16","r17"); | ||
257 | @rem=("r18","r19"); | ||
258 | ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25"); | ||
259 | ($Atbl,$Btbl)=("r26","r27"); | ||
260 | |||
261 | $code.=<<___; # (p16) | ||
262 | { .mmi; ld1 $in=[$inp],-1 //(p16) *inp-- | ||
263 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
264 | cmp.eq p0,p6=r0,r0 };; // clear p6 | ||
265 | ___ | ||
266 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
267 | |||
268 | $code.=<<___; # (p16),(p17) | ||
269 | { .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
270 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
271 | { .mii; ld1 $in=[$inp],-1 //(p16) *inp-- | ||
272 | dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo | ||
273 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
274 | .align 32 | ||
275 | .LOOP: | ||
276 | { .mmi; | ||
277 | (p6) st8 [$Xip]=$Zhi,13 | ||
278 | xor $Zlo=$Zlo,$Zlo | ||
279 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo | ||
280 | ___ | ||
281 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
282 | |||
283 | $code.=<<___; # (p16),(p17),(p18) | ||
284 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
285 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
286 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
287 | { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
288 | dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo | ||
289 | { .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
290 | xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo | ||
291 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
292 | ld1 $in=[$inp],-1 } //(p16) *inp-- | ||
293 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
294 | mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
295 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
296 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
297 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
298 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
299 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
300 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
301 | ___ | ||
302 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
303 | |||
304 | for ($i=1;$i<14;$i++) { | ||
305 | # Above and below fragments are derived from this one by removing | ||
306 | # unsuitable (p??) instructions. | ||
307 | $code.=<<___; # (p16),(p17),(p18),(p19) | ||
308 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
309 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
310 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
311 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
312 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
313 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
314 | { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
315 | ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
316 | dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo | ||
317 | { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
318 | xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo | ||
319 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
320 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
321 | ld1 $in=[$inp],-1 //(p16) *inp-- | ||
322 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
323 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
324 | xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
325 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
326 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
327 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
328 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
329 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
330 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
331 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
332 | ___ | ||
333 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
334 | } | ||
335 | |||
336 | $code.=<<___; # (p17),(p18),(p19) | ||
337 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
338 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
339 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
340 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
341 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
342 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
343 | { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
344 | ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
345 | dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo | ||
346 | { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
347 | xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo | ||
348 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
349 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
350 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
351 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
352 | xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
353 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
354 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
355 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
356 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
357 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
358 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
359 | ___ | ||
360 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
361 | |||
362 | $code.=<<___; # (p18),(p19) | ||
363 | { .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
364 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
365 | { .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
366 | xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo | ||
367 | { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
368 | xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo | ||
369 | { .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
370 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
371 | { .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi | ||
372 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
373 | { .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4 | ||
374 | xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi | ||
375 | { .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi | ||
376 | shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4) | ||
377 | { .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
378 | xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
379 | ___ | ||
380 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
381 | |||
382 | $code.=<<___; # (p19) | ||
383 | { .mmi; cmp.ltu p6,p0=$inp,$len | ||
384 | add $inp=32,$inp | ||
385 | shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4 | ||
386 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
387 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
388 | add $Xip=9,$Xip };; // &Xi.lo | ||
389 | { .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
390 | (p6) ld1 $in=[$inp],-1 //[p16] *inp-- | ||
391 | (p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14] | ||
392 | { .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi | ||
393 | (p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15] | ||
394 | { .mmi; st8 [$Xip]=$Zlo,-8 | ||
395 | (p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i] | ||
396 | shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48 | ||
397 | { .mmi; | ||
398 | (p6) ld1 $in=[$inp],-1 //[p16] *inp-- | ||
399 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
400 | (p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo | ||
401 | { .mib; | ||
402 | (p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0 | ||
403 | (p6) br.cond.dptk.many .LOOP };; | ||
404 | |||
405 | { .mib; st8 [$Xip]=$Zhi };; | ||
406 | { .mib; $rum 1<<1 // return to little-endian | ||
407 | .restore sp | ||
408 | mov sp=prevsp | ||
409 | br.ret.sptk.many b0 };; | ||
410 | .endp gcm_ghash_4bit# | ||
411 | ___ | ||
412 | $code.=<<___; | ||
413 | .align 128 | ||
414 | .type rem_4bit#,\@object | ||
415 | rem_4bit: | ||
416 | data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | ||
417 | data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | ||
418 | data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | ||
419 | data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | ||
420 | .size rem_4bit#,128 | ||
421 | .type rem_8bit#,\@object | ||
422 | rem_8bit: | ||
423 | data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E | ||
424 | data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E | ||
425 | data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E | ||
426 | data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E | ||
427 | data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E | ||
428 | data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E | ||
429 | data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E | ||
430 | data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E | ||
431 | data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE | ||
432 | data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE | ||
433 | data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE | ||
434 | data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE | ||
435 | data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E | ||
436 | data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E | ||
437 | data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE | ||
438 | data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE | ||
439 | data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E | ||
440 | data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E | ||
441 | data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E | ||
442 | data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E | ||
443 | data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E | ||
444 | data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E | ||
445 | data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E | ||
446 | data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E | ||
447 | data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE | ||
448 | data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE | ||
449 | data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE | ||
450 | data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE | ||
451 | data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E | ||
452 | data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E | ||
453 | data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE | ||
454 | data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE | ||
455 | .size rem_8bit#,512 | ||
456 | stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>" | ||
457 | ___ | ||
458 | |||
459 | $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); | ||
460 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
461 | |||
462 | print $code; | ||
463 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl new file mode 100644 index 0000000000..8c7454ee93 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-parisc.pl | |||
@@ -0,0 +1,730 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # April 2010 | ||
11 | # | ||
12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
14 | # uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC | ||
15 | # it processes one byte in 19.6 cycles, which is more than twice as | ||
16 | # fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for | ||
17 | # 8 cycles, but measured performance on PA-8600 system is ~9 cycles per | ||
18 | # processed byte. This is ~2.2x faster than 64-bit code generated by | ||
19 | # vendor compiler (which used to be very hard to beat:-). | ||
20 | # | ||
21 | # Special thanks to polarhome.com for providing HP-UX account. | ||
22 | |||
23 | $flavour = shift; | ||
24 | $output = shift; | ||
25 | open STDOUT,">$output"; | ||
26 | |||
27 | if ($flavour =~ /64/) { | ||
28 | $LEVEL ="2.0W"; | ||
29 | $SIZE_T =8; | ||
30 | $FRAME_MARKER =80; | ||
31 | $SAVED_RP =16; | ||
32 | $PUSH ="std"; | ||
33 | $PUSHMA ="std,ma"; | ||
34 | $POP ="ldd"; | ||
35 | $POPMB ="ldd,mb"; | ||
36 | $NREGS =6; | ||
37 | } else { | ||
38 | $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0"; | ||
39 | $SIZE_T =4; | ||
40 | $FRAME_MARKER =48; | ||
41 | $SAVED_RP =20; | ||
42 | $PUSH ="stw"; | ||
43 | $PUSHMA ="stwm"; | ||
44 | $POP ="ldw"; | ||
45 | $POPMB ="ldwm"; | ||
46 | $NREGS =11; | ||
47 | } | ||
48 | |||
49 | $FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker | ||
50 | # [+ argument transfer] | ||
51 | |||
52 | ################# volatile registers | ||
53 | $Xi="%r26"; # argument block | ||
54 | $Htbl="%r25"; | ||
55 | $inp="%r24"; | ||
56 | $len="%r23"; | ||
57 | $Hhh=$Htbl; # variables | ||
58 | $Hll="%r22"; | ||
59 | $Zhh="%r21"; | ||
60 | $Zll="%r20"; | ||
61 | $cnt="%r19"; | ||
62 | $rem_4bit="%r28"; | ||
63 | $rem="%r29"; | ||
64 | $mask0xf0="%r31"; | ||
65 | |||
66 | ################# preserved registers | ||
67 | $Thh="%r1"; | ||
68 | $Tll="%r2"; | ||
69 | $nlo="%r3"; | ||
70 | $nhi="%r4"; | ||
71 | $byte="%r5"; | ||
72 | if ($SIZE_T==4) { | ||
73 | $Zhl="%r6"; | ||
74 | $Zlh="%r7"; | ||
75 | $Hhl="%r8"; | ||
76 | $Hlh="%r9"; | ||
77 | $Thl="%r10"; | ||
78 | $Tlh="%r11"; | ||
79 | } | ||
80 | $rem2="%r6"; # used in PA-RISC 2.0 code | ||
81 | |||
82 | $code.=<<___; | ||
83 | .LEVEL $LEVEL | ||
84 | .SPACE \$TEXT\$ | ||
85 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
86 | |||
87 | .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR | ||
88 | .ALIGN 64 | ||
89 | gcm_gmult_4bit | ||
90 | .PROC | ||
91 | .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS | ||
92 | .ENTRY | ||
93 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
94 | $PUSHMA %r3,$FRAME(%sp) | ||
95 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
96 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
97 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
98 | ___ | ||
99 | $code.=<<___ if ($SIZE_T==4); | ||
100 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
101 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
102 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
103 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
104 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
105 | ___ | ||
106 | $code.=<<___; | ||
107 | blr %r0,$rem_4bit | ||
108 | ldi 3,$rem | ||
109 | L\$pic_gmult | ||
110 | andcm $rem_4bit,$rem,$rem_4bit | ||
111 | addl $inp,$len,$len | ||
112 | ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit | ||
113 | ldi 0xf0,$mask0xf0 | ||
114 | ___ | ||
115 | $code.=<<___ if ($SIZE_T==4); | ||
116 | ldi 31,$rem | ||
117 | mtctl $rem,%cr11 | ||
118 | extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 | ||
119 | b L\$parisc1_gmult | ||
120 | nop | ||
121 | ___ | ||
122 | |||
123 | $code.=<<___; | ||
124 | ldb 15($Xi),$nlo | ||
125 | ldo 8($Htbl),$Hll | ||
126 | |||
127 | and $mask0xf0,$nlo,$nhi | ||
128 | depd,z $nlo,59,4,$nlo | ||
129 | |||
130 | ldd $nlo($Hll),$Zll | ||
131 | ldd $nlo($Hhh),$Zhh | ||
132 | |||
133 | depd,z $Zll,60,4,$rem | ||
134 | shrpd $Zhh,$Zll,4,$Zll | ||
135 | extrd,u $Zhh,59,60,$Zhh | ||
136 | ldb 14($Xi),$nlo | ||
137 | |||
138 | ldd $nhi($Hll),$Tll | ||
139 | ldd $nhi($Hhh),$Thh | ||
140 | and $mask0xf0,$nlo,$nhi | ||
141 | depd,z $nlo,59,4,$nlo | ||
142 | |||
143 | xor $Tll,$Zll,$Zll | ||
144 | xor $Thh,$Zhh,$Zhh | ||
145 | ldd $rem($rem_4bit),$rem | ||
146 | b L\$oop_gmult_pa2 | ||
147 | ldi 13,$cnt | ||
148 | |||
149 | .ALIGN 8 | ||
150 | L\$oop_gmult_pa2 | ||
151 | xor $rem,$Zhh,$Zhh ; moved here to work around gas bug | ||
152 | depd,z $Zll,60,4,$rem | ||
153 | |||
154 | shrpd $Zhh,$Zll,4,$Zll | ||
155 | extrd,u $Zhh,59,60,$Zhh | ||
156 | ldd $nlo($Hll),$Tll | ||
157 | ldd $nlo($Hhh),$Thh | ||
158 | |||
159 | xor $Tll,$Zll,$Zll | ||
160 | xor $Thh,$Zhh,$Zhh | ||
161 | ldd $rem($rem_4bit),$rem | ||
162 | |||
163 | xor $rem,$Zhh,$Zhh | ||
164 | depd,z $Zll,60,4,$rem | ||
165 | ldbx $cnt($Xi),$nlo | ||
166 | |||
167 | shrpd $Zhh,$Zll,4,$Zll | ||
168 | extrd,u $Zhh,59,60,$Zhh | ||
169 | ldd $nhi($Hll),$Tll | ||
170 | ldd $nhi($Hhh),$Thh | ||
171 | |||
172 | and $mask0xf0,$nlo,$nhi | ||
173 | depd,z $nlo,59,4,$nlo | ||
174 | ldd $rem($rem_4bit),$rem | ||
175 | |||
176 | xor $Tll,$Zll,$Zll | ||
177 | addib,uv -1,$cnt,L\$oop_gmult_pa2 | ||
178 | xor $Thh,$Zhh,$Zhh | ||
179 | |||
180 | xor $rem,$Zhh,$Zhh | ||
181 | depd,z $Zll,60,4,$rem | ||
182 | |||
183 | shrpd $Zhh,$Zll,4,$Zll | ||
184 | extrd,u $Zhh,59,60,$Zhh | ||
185 | ldd $nlo($Hll),$Tll | ||
186 | ldd $nlo($Hhh),$Thh | ||
187 | |||
188 | xor $Tll,$Zll,$Zll | ||
189 | xor $Thh,$Zhh,$Zhh | ||
190 | ldd $rem($rem_4bit),$rem | ||
191 | |||
192 | xor $rem,$Zhh,$Zhh | ||
193 | depd,z $Zll,60,4,$rem | ||
194 | |||
195 | shrpd $Zhh,$Zll,4,$Zll | ||
196 | extrd,u $Zhh,59,60,$Zhh | ||
197 | ldd $nhi($Hll),$Tll | ||
198 | ldd $nhi($Hhh),$Thh | ||
199 | |||
200 | xor $Tll,$Zll,$Zll | ||
201 | xor $Thh,$Zhh,$Zhh | ||
202 | ldd $rem($rem_4bit),$rem | ||
203 | |||
204 | xor $rem,$Zhh,$Zhh | ||
205 | std $Zll,8($Xi) | ||
206 | std $Zhh,0($Xi) | ||
207 | ___ | ||
208 | |||
209 | $code.=<<___ if ($SIZE_T==4); | ||
210 | b L\$done_gmult | ||
211 | nop | ||
212 | |||
213 | L\$parisc1_gmult | ||
214 | ldb 15($Xi),$nlo | ||
215 | ldo 12($Htbl),$Hll | ||
216 | ldo 8($Htbl),$Hlh | ||
217 | ldo 4($Htbl),$Hhl | ||
218 | |||
219 | and $mask0xf0,$nlo,$nhi | ||
220 | zdep $nlo,27,4,$nlo | ||
221 | |||
222 | ldwx $nlo($Hll),$Zll | ||
223 | ldwx $nlo($Hlh),$Zlh | ||
224 | ldwx $nlo($Hhl),$Zhl | ||
225 | ldwx $nlo($Hhh),$Zhh | ||
226 | zdep $Zll,28,4,$rem | ||
227 | ldb 14($Xi),$nlo | ||
228 | ldwx $rem($rem_4bit),$rem | ||
229 | shrpw $Zlh,$Zll,4,$Zll | ||
230 | ldwx $nhi($Hll),$Tll | ||
231 | shrpw $Zhl,$Zlh,4,$Zlh | ||
232 | ldwx $nhi($Hlh),$Tlh | ||
233 | shrpw $Zhh,$Zhl,4,$Zhl | ||
234 | ldwx $nhi($Hhl),$Thl | ||
235 | extru $Zhh,27,28,$Zhh | ||
236 | ldwx $nhi($Hhh),$Thh | ||
237 | xor $rem,$Zhh,$Zhh | ||
238 | and $mask0xf0,$nlo,$nhi | ||
239 | zdep $nlo,27,4,$nlo | ||
240 | |||
241 | xor $Tll,$Zll,$Zll | ||
242 | ldwx $nlo($Hll),$Tll | ||
243 | xor $Tlh,$Zlh,$Zlh | ||
244 | ldwx $nlo($Hlh),$Tlh | ||
245 | xor $Thl,$Zhl,$Zhl | ||
246 | b L\$oop_gmult_pa1 | ||
247 | ldi 13,$cnt | ||
248 | |||
249 | .ALIGN 8 | ||
250 | L\$oop_gmult_pa1 | ||
251 | zdep $Zll,28,4,$rem | ||
252 | ldwx $nlo($Hhl),$Thl | ||
253 | xor $Thh,$Zhh,$Zhh | ||
254 | ldwx $rem($rem_4bit),$rem | ||
255 | shrpw $Zlh,$Zll,4,$Zll | ||
256 | ldwx $nlo($Hhh),$Thh | ||
257 | shrpw $Zhl,$Zlh,4,$Zlh | ||
258 | ldbx $cnt($Xi),$nlo | ||
259 | xor $Tll,$Zll,$Zll | ||
260 | ldwx $nhi($Hll),$Tll | ||
261 | shrpw $Zhh,$Zhl,4,$Zhl | ||
262 | xor $Tlh,$Zlh,$Zlh | ||
263 | ldwx $nhi($Hlh),$Tlh | ||
264 | extru $Zhh,27,28,$Zhh | ||
265 | xor $Thl,$Zhl,$Zhl | ||
266 | ldwx $nhi($Hhl),$Thl | ||
267 | xor $rem,$Zhh,$Zhh | ||
268 | zdep $Zll,28,4,$rem | ||
269 | xor $Thh,$Zhh,$Zhh | ||
270 | ldwx $nhi($Hhh),$Thh | ||
271 | shrpw $Zlh,$Zll,4,$Zll | ||
272 | ldwx $rem($rem_4bit),$rem | ||
273 | shrpw $Zhl,$Zlh,4,$Zlh | ||
274 | shrpw $Zhh,$Zhl,4,$Zhl | ||
275 | and $mask0xf0,$nlo,$nhi | ||
276 | extru $Zhh,27,28,$Zhh | ||
277 | zdep $nlo,27,4,$nlo | ||
278 | xor $Tll,$Zll,$Zll | ||
279 | ldwx $nlo($Hll),$Tll | ||
280 | xor $Tlh,$Zlh,$Zlh | ||
281 | ldwx $nlo($Hlh),$Tlh | ||
282 | xor $rem,$Zhh,$Zhh | ||
283 | addib,uv -1,$cnt,L\$oop_gmult_pa1 | ||
284 | xor $Thl,$Zhl,$Zhl | ||
285 | |||
286 | zdep $Zll,28,4,$rem | ||
287 | ldwx $nlo($Hhl),$Thl | ||
288 | xor $Thh,$Zhh,$Zhh | ||
289 | ldwx $rem($rem_4bit),$rem | ||
290 | shrpw $Zlh,$Zll,4,$Zll | ||
291 | ldwx $nlo($Hhh),$Thh | ||
292 | shrpw $Zhl,$Zlh,4,$Zlh | ||
293 | xor $Tll,$Zll,$Zll | ||
294 | ldwx $nhi($Hll),$Tll | ||
295 | shrpw $Zhh,$Zhl,4,$Zhl | ||
296 | xor $Tlh,$Zlh,$Zlh | ||
297 | ldwx $nhi($Hlh),$Tlh | ||
298 | extru $Zhh,27,28,$Zhh | ||
299 | xor $rem,$Zhh,$Zhh | ||
300 | xor $Thl,$Zhl,$Zhl | ||
301 | ldwx $nhi($Hhl),$Thl | ||
302 | xor $Thh,$Zhh,$Zhh | ||
303 | ldwx $nhi($Hhh),$Thh | ||
304 | zdep $Zll,28,4,$rem | ||
305 | ldwx $rem($rem_4bit),$rem | ||
306 | shrpw $Zlh,$Zll,4,$Zll | ||
307 | shrpw $Zhl,$Zlh,4,$Zlh | ||
308 | shrpw $Zhh,$Zhl,4,$Zhl | ||
309 | extru $Zhh,27,28,$Zhh | ||
310 | xor $Tll,$Zll,$Zll | ||
311 | xor $Tlh,$Zlh,$Zlh | ||
312 | xor $rem,$Zhh,$Zhh | ||
313 | stw $Zll,12($Xi) | ||
314 | xor $Thl,$Zhl,$Zhl | ||
315 | stw $Zlh,8($Xi) | ||
316 | xor $Thh,$Zhh,$Zhh | ||
317 | stw $Zhl,4($Xi) | ||
318 | stw $Zhh,0($Xi) | ||
319 | ___ | ||
320 | $code.=<<___; | ||
321 | L\$done_gmult | ||
322 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
323 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
324 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
325 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
326 | ___ | ||
327 | $code.=<<___ if ($SIZE_T==4); | ||
328 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
329 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
330 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
331 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
332 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
333 | ___ | ||
334 | $code.=<<___; | ||
335 | bv (%r2) | ||
336 | .EXIT | ||
337 | $POPMB -$FRAME(%sp),%r3 | ||
338 | .PROCEND | ||
339 | |||
340 | .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | ||
341 | .ALIGN 64 | ||
342 | gcm_ghash_4bit | ||
343 | .PROC | ||
344 | .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11 | ||
345 | .ENTRY | ||
346 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
347 | $PUSHMA %r3,$FRAME(%sp) | ||
348 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
349 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
350 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
351 | ___ | ||
352 | $code.=<<___ if ($SIZE_T==4); | ||
353 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
354 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
355 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
356 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
357 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
358 | ___ | ||
359 | $code.=<<___; | ||
360 | blr %r0,$rem_4bit | ||
361 | ldi 3,$rem | ||
362 | L\$pic_ghash | ||
363 | andcm $rem_4bit,$rem,$rem_4bit | ||
364 | addl $inp,$len,$len | ||
365 | ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit | ||
366 | ldi 0xf0,$mask0xf0 | ||
367 | ___ | ||
368 | $code.=<<___ if ($SIZE_T==4); | ||
369 | ldi 31,$rem | ||
370 | mtctl $rem,%cr11 | ||
371 | extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 | ||
372 | b L\$parisc1_ghash | ||
373 | nop | ||
374 | ___ | ||
375 | |||
376 | $code.=<<___; | ||
377 | ldb 15($Xi),$nlo | ||
378 | ldo 8($Htbl),$Hll | ||
379 | |||
380 | L\$outer_ghash_pa2 | ||
381 | ldb 15($inp),$nhi | ||
382 | xor $nhi,$nlo,$nlo | ||
383 | and $mask0xf0,$nlo,$nhi | ||
384 | depd,z $nlo,59,4,$nlo | ||
385 | |||
386 | ldd $nlo($Hll),$Zll | ||
387 | ldd $nlo($Hhh),$Zhh | ||
388 | |||
389 | depd,z $Zll,60,4,$rem | ||
390 | shrpd $Zhh,$Zll,4,$Zll | ||
391 | extrd,u $Zhh,59,60,$Zhh | ||
392 | ldb 14($Xi),$nlo | ||
393 | ldb 14($inp),$byte | ||
394 | |||
395 | ldd $nhi($Hll),$Tll | ||
396 | ldd $nhi($Hhh),$Thh | ||
397 | xor $byte,$nlo,$nlo | ||
398 | and $mask0xf0,$nlo,$nhi | ||
399 | depd,z $nlo,59,4,$nlo | ||
400 | |||
401 | xor $Tll,$Zll,$Zll | ||
402 | xor $Thh,$Zhh,$Zhh | ||
403 | ldd $rem($rem_4bit),$rem | ||
404 | b L\$oop_ghash_pa2 | ||
405 | ldi 13,$cnt | ||
406 | |||
407 | .ALIGN 8 | ||
408 | L\$oop_ghash_pa2 | ||
409 | xor $rem,$Zhh,$Zhh ; moved here to work around gas bug | ||
410 | depd,z $Zll,60,4,$rem2 | ||
411 | |||
412 | shrpd $Zhh,$Zll,4,$Zll | ||
413 | extrd,u $Zhh,59,60,$Zhh | ||
414 | ldd $nlo($Hll),$Tll | ||
415 | ldd $nlo($Hhh),$Thh | ||
416 | |||
417 | xor $Tll,$Zll,$Zll | ||
418 | xor $Thh,$Zhh,$Zhh | ||
419 | ldbx $cnt($Xi),$nlo | ||
420 | ldbx $cnt($inp),$byte | ||
421 | |||
422 | depd,z $Zll,60,4,$rem | ||
423 | shrpd $Zhh,$Zll,4,$Zll | ||
424 | ldd $rem2($rem_4bit),$rem2 | ||
425 | |||
426 | xor $rem2,$Zhh,$Zhh | ||
427 | xor $byte,$nlo,$nlo | ||
428 | ldd $nhi($Hll),$Tll | ||
429 | ldd $nhi($Hhh),$Thh | ||
430 | |||
431 | and $mask0xf0,$nlo,$nhi | ||
432 | depd,z $nlo,59,4,$nlo | ||
433 | |||
434 | extrd,u $Zhh,59,60,$Zhh | ||
435 | xor $Tll,$Zll,$Zll | ||
436 | |||
437 | ldd $rem($rem_4bit),$rem | ||
438 | addib,uv -1,$cnt,L\$oop_ghash_pa2 | ||
439 | xor $Thh,$Zhh,$Zhh | ||
440 | |||
441 | xor $rem,$Zhh,$Zhh | ||
442 | depd,z $Zll,60,4,$rem2 | ||
443 | |||
444 | shrpd $Zhh,$Zll,4,$Zll | ||
445 | extrd,u $Zhh,59,60,$Zhh | ||
446 | ldd $nlo($Hll),$Tll | ||
447 | ldd $nlo($Hhh),$Thh | ||
448 | |||
449 | xor $Tll,$Zll,$Zll | ||
450 | xor $Thh,$Zhh,$Zhh | ||
451 | |||
452 | depd,z $Zll,60,4,$rem | ||
453 | shrpd $Zhh,$Zll,4,$Zll | ||
454 | ldd $rem2($rem_4bit),$rem2 | ||
455 | |||
456 | xor $rem2,$Zhh,$Zhh | ||
457 | ldd $nhi($Hll),$Tll | ||
458 | ldd $nhi($Hhh),$Thh | ||
459 | |||
460 | extrd,u $Zhh,59,60,$Zhh | ||
461 | xor $Tll,$Zll,$Zll | ||
462 | xor $Thh,$Zhh,$Zhh | ||
463 | ldd $rem($rem_4bit),$rem | ||
464 | |||
465 | xor $rem,$Zhh,$Zhh | ||
466 | std $Zll,8($Xi) | ||
467 | ldo 16($inp),$inp | ||
468 | std $Zhh,0($Xi) | ||
469 | cmpb,*<> $inp,$len,L\$outer_ghash_pa2 | ||
470 | copy $Zll,$nlo | ||
471 | ___ | ||
472 | |||
473 | $code.=<<___ if ($SIZE_T==4); | ||
474 | b L\$done_ghash | ||
475 | nop | ||
476 | |||
477 | L\$parisc1_ghash | ||
478 | ldb 15($Xi),$nlo | ||
479 | ldo 12($Htbl),$Hll | ||
480 | ldo 8($Htbl),$Hlh | ||
481 | ldo 4($Htbl),$Hhl | ||
482 | |||
483 | L\$outer_ghash_pa1 | ||
484 | ldb 15($inp),$byte | ||
485 | xor $byte,$nlo,$nlo | ||
486 | and $mask0xf0,$nlo,$nhi | ||
487 | zdep $nlo,27,4,$nlo | ||
488 | |||
489 | ldwx $nlo($Hll),$Zll | ||
490 | ldwx $nlo($Hlh),$Zlh | ||
491 | ldwx $nlo($Hhl),$Zhl | ||
492 | ldwx $nlo($Hhh),$Zhh | ||
493 | zdep $Zll,28,4,$rem | ||
494 | ldb 14($Xi),$nlo | ||
495 | ldb 14($inp),$byte | ||
496 | ldwx $rem($rem_4bit),$rem | ||
497 | shrpw $Zlh,$Zll,4,$Zll | ||
498 | ldwx $nhi($Hll),$Tll | ||
499 | shrpw $Zhl,$Zlh,4,$Zlh | ||
500 | ldwx $nhi($Hlh),$Tlh | ||
501 | shrpw $Zhh,$Zhl,4,$Zhl | ||
502 | ldwx $nhi($Hhl),$Thl | ||
503 | extru $Zhh,27,28,$Zhh | ||
504 | ldwx $nhi($Hhh),$Thh | ||
505 | xor $byte,$nlo,$nlo | ||
506 | xor $rem,$Zhh,$Zhh | ||
507 | and $mask0xf0,$nlo,$nhi | ||
508 | zdep $nlo,27,4,$nlo | ||
509 | |||
510 | xor $Tll,$Zll,$Zll | ||
511 | ldwx $nlo($Hll),$Tll | ||
512 | xor $Tlh,$Zlh,$Zlh | ||
513 | ldwx $nlo($Hlh),$Tlh | ||
514 | xor $Thl,$Zhl,$Zhl | ||
515 | b L\$oop_ghash_pa1 | ||
516 | ldi 13,$cnt | ||
517 | |||
518 | .ALIGN 8 | ||
519 | L\$oop_ghash_pa1 | ||
520 | zdep $Zll,28,4,$rem | ||
521 | ldwx $nlo($Hhl),$Thl | ||
522 | xor $Thh,$Zhh,$Zhh | ||
523 | ldwx $rem($rem_4bit),$rem | ||
524 | shrpw $Zlh,$Zll,4,$Zll | ||
525 | ldwx $nlo($Hhh),$Thh | ||
526 | shrpw $Zhl,$Zlh,4,$Zlh | ||
527 | ldbx $cnt($Xi),$nlo | ||
528 | xor $Tll,$Zll,$Zll | ||
529 | ldwx $nhi($Hll),$Tll | ||
530 | shrpw $Zhh,$Zhl,4,$Zhl | ||
531 | ldbx $cnt($inp),$byte | ||
532 | xor $Tlh,$Zlh,$Zlh | ||
533 | ldwx $nhi($Hlh),$Tlh | ||
534 | extru $Zhh,27,28,$Zhh | ||
535 | xor $Thl,$Zhl,$Zhl | ||
536 | ldwx $nhi($Hhl),$Thl | ||
537 | xor $rem,$Zhh,$Zhh | ||
538 | zdep $Zll,28,4,$rem | ||
539 | xor $Thh,$Zhh,$Zhh | ||
540 | ldwx $nhi($Hhh),$Thh | ||
541 | shrpw $Zlh,$Zll,4,$Zll | ||
542 | ldwx $rem($rem_4bit),$rem | ||
543 | shrpw $Zhl,$Zlh,4,$Zlh | ||
544 | xor $byte,$nlo,$nlo | ||
545 | shrpw $Zhh,$Zhl,4,$Zhl | ||
546 | and $mask0xf0,$nlo,$nhi | ||
547 | extru $Zhh,27,28,$Zhh | ||
548 | zdep $nlo,27,4,$nlo | ||
549 | xor $Tll,$Zll,$Zll | ||
550 | ldwx $nlo($Hll),$Tll | ||
551 | xor $Tlh,$Zlh,$Zlh | ||
552 | ldwx $nlo($Hlh),$Tlh | ||
553 | xor $rem,$Zhh,$Zhh | ||
554 | addib,uv -1,$cnt,L\$oop_ghash_pa1 | ||
555 | xor $Thl,$Zhl,$Zhl | ||
556 | |||
557 | zdep $Zll,28,4,$rem | ||
558 | ldwx $nlo($Hhl),$Thl | ||
559 | xor $Thh,$Zhh,$Zhh | ||
560 | ldwx $rem($rem_4bit),$rem | ||
561 | shrpw $Zlh,$Zll,4,$Zll | ||
562 | ldwx $nlo($Hhh),$Thh | ||
563 | shrpw $Zhl,$Zlh,4,$Zlh | ||
564 | xor $Tll,$Zll,$Zll | ||
565 | ldwx $nhi($Hll),$Tll | ||
566 | shrpw $Zhh,$Zhl,4,$Zhl | ||
567 | xor $Tlh,$Zlh,$Zlh | ||
568 | ldwx $nhi($Hlh),$Tlh | ||
569 | extru $Zhh,27,28,$Zhh | ||
570 | xor $rem,$Zhh,$Zhh | ||
571 | xor $Thl,$Zhl,$Zhl | ||
572 | ldwx $nhi($Hhl),$Thl | ||
573 | xor $Thh,$Zhh,$Zhh | ||
574 | ldwx $nhi($Hhh),$Thh | ||
575 | zdep $Zll,28,4,$rem | ||
576 | ldwx $rem($rem_4bit),$rem | ||
577 | shrpw $Zlh,$Zll,4,$Zll | ||
578 | shrpw $Zhl,$Zlh,4,$Zlh | ||
579 | shrpw $Zhh,$Zhl,4,$Zhl | ||
580 | extru $Zhh,27,28,$Zhh | ||
581 | xor $Tll,$Zll,$Zll | ||
582 | xor $Tlh,$Zlh,$Zlh | ||
583 | xor $rem,$Zhh,$Zhh | ||
584 | stw $Zll,12($Xi) | ||
585 | xor $Thl,$Zhl,$Zhl | ||
586 | stw $Zlh,8($Xi) | ||
587 | xor $Thh,$Zhh,$Zhh | ||
588 | stw $Zhl,4($Xi) | ||
589 | ldo 16($inp),$inp | ||
590 | stw $Zhh,0($Xi) | ||
591 | comb,<> $inp,$len,L\$outer_ghash_pa1 | ||
592 | copy $Zll,$nlo | ||
593 | ___ | ||
594 | $code.=<<___; | ||
595 | L\$done_ghash | ||
596 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
597 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
598 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
599 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
600 | ___ | ||
601 | $code.=<<___ if ($SIZE_T==4); | ||
602 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
603 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
604 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
605 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
606 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
607 | ___ | ||
608 | $code.=<<___; | ||
609 | bv (%r2) | ||
610 | .EXIT | ||
611 | $POPMB -$FRAME(%sp),%r3 | ||
612 | .PROCEND | ||
613 | |||
614 | .ALIGN 64 | ||
615 | L\$rem_4bit | ||
616 | .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 | ||
617 | .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 | ||
618 | .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 | ||
619 | .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 | ||
620 | .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>" | ||
621 | .ALIGN 64 | ||
622 | ___ | ||
623 | |||
624 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
625 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
626 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
627 | # directive... | ||
628 | |||
629 | my $ldd = sub { | ||
630 | my ($mod,$args) = @_; | ||
631 | my $orig = "ldd$mod\t$args"; | ||
632 | |||
633 | if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 | ||
634 | { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; | ||
635 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
636 | } | ||
637 | elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 | ||
638 | { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; | ||
639 | $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset | ||
640 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
641 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
642 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
643 | } | ||
644 | else { "\t".$orig; } | ||
645 | }; | ||
646 | |||
647 | my $std = sub { | ||
648 | my ($mod,$args) = @_; | ||
649 | my $orig = "std$mod\t$args"; | ||
650 | |||
651 | if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices | ||
652 | { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); | ||
653 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
654 | } | ||
655 | else { "\t".$orig; } | ||
656 | }; | ||
657 | |||
658 | my $extrd = sub { | ||
659 | my ($mod,$args) = @_; | ||
660 | my $orig = "extrd$mod\t$args"; | ||
661 | |||
662 | # I only have ",u" completer, it's implicitly encoded... | ||
663 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 | ||
664 | { my $opcode=(0x36<<26)|($1<<21)|($4<<16); | ||
665 | my $len=32-$3; | ||
666 | $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos | ||
667 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
668 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
669 | } | ||
670 | elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 | ||
671 | { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); | ||
672 | my $len=32-$2; | ||
673 | $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len | ||
674 | $opcode |= (1<<13) if ($mod =~ /,\**=/); | ||
675 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
676 | } | ||
677 | else { "\t".$orig; } | ||
678 | }; | ||
679 | |||
680 | my $shrpd = sub { | ||
681 | my ($mod,$args) = @_; | ||
682 | my $orig = "shrpd$mod\t$args"; | ||
683 | |||
684 | if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 | ||
685 | { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; | ||
686 | my $cpos=63-$3; | ||
687 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa | ||
688 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
689 | } | ||
690 | elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 | ||
691 | { sprintf "\t.WORD\t0x%08x\t; %s", | ||
692 | (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; | ||
693 | } | ||
694 | else { "\t".$orig; } | ||
695 | }; | ||
696 | |||
697 | my $depd = sub { | ||
698 | my ($mod,$args) = @_; | ||
699 | my $orig = "depd$mod\t$args"; | ||
700 | |||
701 | # I only have ",z" completer, it's impicitly encoded... | ||
702 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16 | ||
703 | { my $opcode=(0x3c<<26)|($4<<21)|($1<<16); | ||
704 | my $cpos=63-$2; | ||
705 | my $len=32-$3; | ||
706 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos | ||
707 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
708 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
709 | } | ||
710 | else { "\t".$orig; } | ||
711 | }; | ||
712 | |||
713 | sub assemble { | ||
714 | my ($mnemonic,$mod,$args)=@_; | ||
715 | my $opcode = eval("\$$mnemonic"); | ||
716 | |||
717 | ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; | ||
718 | } | ||
719 | |||
720 | foreach (split("\n",$code)) { | ||
721 | s/\`([^\`]*)\`/eval $1/ge; | ||
722 | if ($SIZE_T==4) { | ||
723 | s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e; | ||
724 | s/cmpb,\*/comb,/; | ||
725 | s/,\*/,/; | ||
726 | } | ||
727 | print $_,"\n"; | ||
728 | } | ||
729 | |||
730 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl new file mode 100644 index 0000000000..6a40d5d89c --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-s390x.pl | |||
@@ -0,0 +1,262 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # September 2010. | ||
11 | # | ||
12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
14 | # uses 256 bytes per-key table [+128 bytes shared table]. Performance | ||
15 | # was measured to be ~18 cycles per processed byte on z10, which is | ||
16 | # almost 40% better than gcc-generated code. It should be noted that | ||
17 | # 18 cycles is worse result than expected: loop is scheduled for 12 | ||
18 | # and the result should be close to 12. In the lack of instruction- | ||
19 | # level profiling data it's impossible to tell why... | ||
20 | |||
21 | # November 2010. | ||
22 | # | ||
23 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
24 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
25 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
26 | # application context. The feature is not specific to any particular | ||
27 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
28 | # remains z/Architecture specific. On z990 it was measured to perform | ||
29 | # 2.8x better than 32-bit code generated by gcc 4.3. | ||
30 | |||
31 | # March 2011. | ||
32 | # | ||
33 | # Support for hardware KIMD-GHASH is verified to produce correct | ||
34 | # result and therefore is engaged. On z196 it was measured to process | ||
35 | # 8KB buffer ~7 faster than software implementation. It's not as | ||
36 | # impressive for smaller buffer sizes and for smallest 16-bytes buffer | ||
37 | # it's actually almost 2 times slower. Which is the reason why | ||
38 | # KIMD-GHASH is not used in gcm_gmult_4bit. | ||
39 | |||
40 | $flavour = shift; | ||
41 | |||
42 | if ($flavour =~ /3[12]/) { | ||
43 | $SIZE_T=4; | ||
44 | $g=""; | ||
45 | } else { | ||
46 | $SIZE_T=8; | ||
47 | $g="g"; | ||
48 | } | ||
49 | |||
50 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
51 | open STDOUT,">$output"; | ||
52 | |||
53 | $softonly=0; | ||
54 | |||
55 | $Zhi="%r0"; | ||
56 | $Zlo="%r1"; | ||
57 | |||
58 | $Xi="%r2"; # argument block | ||
59 | $Htbl="%r3"; | ||
60 | $inp="%r4"; | ||
61 | $len="%r5"; | ||
62 | |||
63 | $rem0="%r6"; # variables | ||
64 | $rem1="%r7"; | ||
65 | $nlo="%r8"; | ||
66 | $nhi="%r9"; | ||
67 | $xi="%r10"; | ||
68 | $cnt="%r11"; | ||
69 | $tmp="%r12"; | ||
70 | $x78="%r13"; | ||
71 | $rem_4bit="%r14"; | ||
72 | |||
73 | $sp="%r15"; | ||
74 | |||
75 | $code.=<<___; | ||
76 | .text | ||
77 | |||
78 | .globl gcm_gmult_4bit | ||
79 | .align 32 | ||
80 | gcm_gmult_4bit: | ||
81 | ___ | ||
82 | $code.=<<___ if(!$softonly && 0); # hardware is slow for single block... | ||
83 | larl %r1,OPENSSL_s390xcap_P | ||
84 | lg %r0,0(%r1) | ||
85 | tmhl %r0,0x4000 # check for message-security-assist | ||
86 | jz .Lsoft_gmult | ||
87 | lghi %r0,0 | ||
88 | la %r1,16($sp) | ||
89 | .long 0xb93e0004 # kimd %r0,%r4 | ||
90 | lg %r1,24($sp) | ||
91 | tmhh %r1,0x4000 # check for function 65 | ||
92 | jz .Lsoft_gmult | ||
93 | stg %r0,16($sp) # arrange 16 bytes of zero input | ||
94 | stg %r0,24($sp) | ||
95 | lghi %r0,65 # function 65 | ||
96 | la %r1,0($Xi) # H lies right after Xi in gcm128_context | ||
97 | la $inp,16($sp) | ||
98 | lghi $len,16 | ||
99 | .long 0xb93e0004 # kimd %r0,$inp | ||
100 | brc 1,.-4 # pay attention to "partial completion" | ||
101 | br %r14 | ||
102 | .align 32 | ||
103 | .Lsoft_gmult: | ||
104 | ___ | ||
105 | $code.=<<___; | ||
106 | stm${g} %r6,%r14,6*$SIZE_T($sp) | ||
107 | |||
108 | aghi $Xi,-1 | ||
109 | lghi $len,1 | ||
110 | lghi $x78,`0xf<<3` | ||
111 | larl $rem_4bit,rem_4bit | ||
112 | |||
113 | lg $Zlo,8+1($Xi) # Xi | ||
114 | j .Lgmult_shortcut | ||
115 | .type gcm_gmult_4bit,\@function | ||
116 | .size gcm_gmult_4bit,(.-gcm_gmult_4bit) | ||
117 | |||
118 | .globl gcm_ghash_4bit | ||
119 | .align 32 | ||
120 | gcm_ghash_4bit: | ||
121 | ___ | ||
122 | $code.=<<___ if(!$softonly); | ||
123 | larl %r1,OPENSSL_s390xcap_P | ||
124 | lg %r0,0(%r1) | ||
125 | tmhl %r0,0x4000 # check for message-security-assist | ||
126 | jz .Lsoft_ghash | ||
127 | lghi %r0,0 | ||
128 | la %r1,16($sp) | ||
129 | .long 0xb93e0004 # kimd %r0,%r4 | ||
130 | lg %r1,24($sp) | ||
131 | tmhh %r1,0x4000 # check for function 65 | ||
132 | jz .Lsoft_ghash | ||
133 | lghi %r0,65 # function 65 | ||
134 | la %r1,0($Xi) # H lies right after Xi in gcm128_context | ||
135 | .long 0xb93e0004 # kimd %r0,$inp | ||
136 | brc 1,.-4 # pay attention to "partial completion" | ||
137 | br %r14 | ||
138 | .align 32 | ||
139 | .Lsoft_ghash: | ||
140 | ___ | ||
141 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
142 | llgfr $len,$len | ||
143 | ___ | ||
144 | $code.=<<___; | ||
145 | stm${g} %r6,%r14,6*$SIZE_T($sp) | ||
146 | |||
147 | aghi $Xi,-1 | ||
148 | srlg $len,$len,4 | ||
149 | lghi $x78,`0xf<<3` | ||
150 | larl $rem_4bit,rem_4bit | ||
151 | |||
152 | lg $Zlo,8+1($Xi) # Xi | ||
153 | lg $Zhi,0+1($Xi) | ||
154 | lghi $tmp,0 | ||
155 | .Louter: | ||
156 | xg $Zhi,0($inp) # Xi ^= inp | ||
157 | xg $Zlo,8($inp) | ||
158 | xgr $Zhi,$tmp | ||
159 | stg $Zlo,8+1($Xi) | ||
160 | stg $Zhi,0+1($Xi) | ||
161 | |||
162 | .Lgmult_shortcut: | ||
163 | lghi $tmp,0xf0 | ||
164 | sllg $nlo,$Zlo,4 | ||
165 | srlg $xi,$Zlo,8 # extract second byte | ||
166 | ngr $nlo,$tmp | ||
167 | lgr $nhi,$Zlo | ||
168 | lghi $cnt,14 | ||
169 | ngr $nhi,$tmp | ||
170 | |||
171 | lg $Zlo,8($nlo,$Htbl) | ||
172 | lg $Zhi,0($nlo,$Htbl) | ||
173 | |||
174 | sllg $nlo,$xi,4 | ||
175 | sllg $rem0,$Zlo,3 | ||
176 | ngr $nlo,$tmp | ||
177 | ngr $rem0,$x78 | ||
178 | ngr $xi,$tmp | ||
179 | |||
180 | sllg $tmp,$Zhi,60 | ||
181 | srlg $Zlo,$Zlo,4 | ||
182 | srlg $Zhi,$Zhi,4 | ||
183 | xg $Zlo,8($nhi,$Htbl) | ||
184 | xg $Zhi,0($nhi,$Htbl) | ||
185 | lgr $nhi,$xi | ||
186 | sllg $rem1,$Zlo,3 | ||
187 | xgr $Zlo,$tmp | ||
188 | ngr $rem1,$x78 | ||
189 | j .Lghash_inner | ||
190 | .align 16 | ||
191 | .Lghash_inner: | ||
192 | srlg $Zlo,$Zlo,4 | ||
193 | sllg $tmp,$Zhi,60 | ||
194 | xg $Zlo,8($nlo,$Htbl) | ||
195 | srlg $Zhi,$Zhi,4 | ||
196 | llgc $xi,0($cnt,$Xi) | ||
197 | xg $Zhi,0($nlo,$Htbl) | ||
198 | sllg $nlo,$xi,4 | ||
199 | xg $Zhi,0($rem0,$rem_4bit) | ||
200 | nill $nlo,0xf0 | ||
201 | sllg $rem0,$Zlo,3 | ||
202 | xgr $Zlo,$tmp | ||
203 | ngr $rem0,$x78 | ||
204 | nill $xi,0xf0 | ||
205 | |||
206 | sllg $tmp,$Zhi,60 | ||
207 | srlg $Zlo,$Zlo,4 | ||
208 | srlg $Zhi,$Zhi,4 | ||
209 | xg $Zlo,8($nhi,$Htbl) | ||
210 | xg $Zhi,0($nhi,$Htbl) | ||
211 | lgr $nhi,$xi | ||
212 | xg $Zhi,0($rem1,$rem_4bit) | ||
213 | sllg $rem1,$Zlo,3 | ||
214 | xgr $Zlo,$tmp | ||
215 | ngr $rem1,$x78 | ||
216 | brct $cnt,.Lghash_inner | ||
217 | |||
218 | sllg $tmp,$Zhi,60 | ||
219 | srlg $Zlo,$Zlo,4 | ||
220 | srlg $Zhi,$Zhi,4 | ||
221 | xg $Zlo,8($nlo,$Htbl) | ||
222 | xg $Zhi,0($nlo,$Htbl) | ||
223 | sllg $xi,$Zlo,3 | ||
224 | xg $Zhi,0($rem0,$rem_4bit) | ||
225 | xgr $Zlo,$tmp | ||
226 | ngr $xi,$x78 | ||
227 | |||
228 | sllg $tmp,$Zhi,60 | ||
229 | srlg $Zlo,$Zlo,4 | ||
230 | srlg $Zhi,$Zhi,4 | ||
231 | xg $Zlo,8($nhi,$Htbl) | ||
232 | xg $Zhi,0($nhi,$Htbl) | ||
233 | xgr $Zlo,$tmp | ||
234 | xg $Zhi,0($rem1,$rem_4bit) | ||
235 | |||
236 | lg $tmp,0($xi,$rem_4bit) | ||
237 | la $inp,16($inp) | ||
238 | sllg $tmp,$tmp,4 # correct last rem_4bit[rem] | ||
239 | brctg $len,.Louter | ||
240 | |||
241 | xgr $Zhi,$tmp | ||
242 | stg $Zlo,8+1($Xi) | ||
243 | stg $Zhi,0+1($Xi) | ||
244 | lm${g} %r6,%r14,6*$SIZE_T($sp) | ||
245 | br %r14 | ||
246 | .type gcm_ghash_4bit,\@function | ||
247 | .size gcm_ghash_4bit,(.-gcm_ghash_4bit) | ||
248 | |||
249 | .align 64 | ||
250 | rem_4bit: | ||
251 | .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0 | ||
252 | .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0 | ||
253 | .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0 | ||
254 | .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0 | ||
255 | .type rem_4bit,\@object | ||
256 | .size rem_4bit,(.-rem_4bit) | ||
257 | .string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
258 | ___ | ||
259 | |||
260 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
261 | print $code; | ||
262 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl new file mode 100644 index 0000000000..70e7b044a3 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl | |||
@@ -0,0 +1,330 @@ | |||
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1		this assembler
#
# 32-bit build	566			50	(+1000%)
# 64-bit build	56			50	(+12%)
#
# I don't quite understand why difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for Z vector (see C code) even in 32-bit build... Oh well, it only
# means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled in respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.

# Select ABI-dependent constants from the compiler flags forwarded on
# the command line: the SPARC v9 64-bit ABI uses a 2047-byte stack
# bias and a larger minimum frame than the 32-bit ABI.
$bits=32;
for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)  { $bias=2047; $frame=192; }
else            { $bias=0;    $frame=112; }

# The generated assembler is written to the file named by the first
# remaining command-line argument.
$output=shift;
open STDOUT,">$output";

# Register allocation for the assembler templates below.
$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

# gcm_ghash_4bit(Xi, Htable, inp, len): streamed GHASH.  Absorbs len
# bytes from inp into the 16-byte accumulator Xi using the per-key
# Htable.  rem_4bit holds the 16 reduction constants, pre-shifted
# left by 16 (see the backtick expressions, evaluated at generation
# time).  Anything between backticks inside the template is evaluated
# as Perl by the substitution at the bottom of this file.
$code.=<<___;
.section ".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

# $inp/$len are ghash-only arguments; undefine them so the gmult
# template below cannot accidentally interpolate them.
undef $inp;
undef $len;

# gcm_gmult_4bit(Xi, Htable): a single multiplication Xi = Xi * H in
# GF(2^128), same inner structure as gcm_ghash_4bit but reading
# nibbles from Xi itself instead of xoring in fresh input.
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
.asciz	"GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Evaluate the backtick-quoted Perl expressions embedded in the
# templates (constant folding) and emit the final assembler.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl new file mode 100644 index 0000000000..6b09669d47 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-x86.pl | |||
@@ -0,0 +1,1342 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # March, May, June 2010 | ||
11 | # | ||
12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
14 | # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two | ||
15 | # code paths: vanilla x86 and vanilla MMX. Former will be executed on | ||
16 | # 486 and Pentium, latter on all others. MMX GHASH features so called | ||
17 | # "528B" variant of "4-bit" method utilizing additional 256+16 bytes | ||
18 | # of per-key storage [+512 bytes shared table]. Performance results | ||
19 | # are for streamed GHASH subroutine and are expressed in cycles per | ||
20 | # processed byte, less is better: | ||
21 | # | ||
22 | # gcc 2.95.3(*) MMX assembler x86 assembler | ||
23 | # | ||
24 | # Pentium 105/111(**) - 50 | ||
25 | # PIII 68 /75 12.2 24 | ||
26 | # P4 125/125 17.8 84(***) | ||
27 | # Opteron 66 /70 10.1 30 | ||
28 | # Core2 54 /67 8.4 18 | ||
29 | # | ||
30 | # (*) gcc 3.4.x was observed to generate few percent slower code, | ||
31 | # which is one of reasons why 2.95.3 results were chosen, | ||
32 | # another reason is lack of 3.4.x results for older CPUs; | ||
33 | # comparison with MMX results is not completely fair, because C | ||
34 | # results are for vanilla "256B" implementation, while | ||
35 | # assembler results are for "528B";-) | ||
36 | # (**) second number is result for code compiled with -fPIC flag, | ||
37 | # which is actually more relevant, because assembler code is | ||
38 | # position-independent; | ||
39 | # (***) see comment in non-MMX routine for further details; | ||
40 | # | ||
41 | # To summarize, it's >2-5 times faster than gcc-generated code. To | ||
42 | # anchor it to something else SHA1 assembler processes one byte in | ||
43 | # 11-13 cycles on contemporary x86 cores. As for choice of MMX in | ||
44 | # particular, see comment at the end of the file... | ||
45 | |||
46 | # May 2010 | ||
47 | # | ||
48 | # Add PCLMULQDQ version performing at 2.10 cycles per processed byte. | ||
49 | # The question is how close is it to theoretical limit? The pclmulqdq | ||
50 | # instruction latency appears to be 14 cycles and there can't be more | ||
51 | # than 2 of them executing at any given time. This means that single | ||
52 | # Karatsuba multiplication would take 28 cycles *plus* few cycles for | ||
53 | # pre- and post-processing. Then multiplication has to be followed by | ||
54 | # modulo-reduction. Given that aggregated reduction method [see | ||
55 | # "Carry-less Multiplication and Its Usage for Computing the GCM Mode" | ||
56 | # white paper by Intel] allows you to perform reduction only once in | ||
57 | # a while we can assume that asymptotic performance can be estimated | ||
58 | # as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction | ||
59 | # and Naggr is the aggregation factor. | ||
60 | # | ||
61 | # Before we proceed to this implementation let's have closer look at | ||
62 | # the best-performing code suggested by Intel in their white paper. | ||
63 | # By tracing inter-register dependencies Tmod is estimated as ~19 | ||
64 | # cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per | ||
65 | # processed byte. As implied, this is quite optimistic estimate, | ||
66 | # because it does not account for Karatsuba pre- and post-processing, | ||
67 | # which for a single multiplication is ~5 cycles. Unfortunately Intel | ||
68 | # does not provide performance data for GHASH alone. But benchmarking | ||
69 | # AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt | ||
70 | # alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that | ||
71 | # the result accounts even for pre-computing of degrees of the hash | ||
72 | # key H, but its portion is negligible at 16KB buffer size. | ||
73 | # | ||
74 | # Moving on to the implementation in question. Tmod is estimated as | ||
75 | # ~13 cycles and Naggr is 2, giving asymptotic performance of ... | ||
76 | # 2.16. How is it possible that measured performance is better than | ||
77 | # optimistic theoretical estimate? There is one thing Intel failed | ||
78 | # to recognize. By serializing GHASH with CTR in same subroutine | ||
79 | # former's performance is really limited to above (Tmul + Tmod/Naggr) | ||
80 | # equation. But if GHASH procedure is detached, the modulo-reduction | ||
81 | # can be interleaved with Naggr-1 multiplications at instruction level | ||
82 | # and under ideal conditions even disappear from the equation. So that | ||
83 | # optimistic theoretical estimate for this implementation is ... | ||
84 | # 28/16=1.75, and not 2.16. Well, it's probably way too optimistic, | ||
85 | # at least for such small Naggr. I'd argue that (28+Tproc/Naggr), | ||
86 | # where Tproc is time required for Karatsuba pre- and post-processing, | ||
87 | # is more realistic estimate. In this case it gives ... 1.91 cycles. | ||
88 | # Or in other words, depending on how well we can interleave reduction | ||
89 | # and one of the two multiplications the performance should be between | ||
90 | # 1.91 and 2.16. As already mentioned, this implementation processes | ||
91 | # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart | ||
92 | # - in 2.02. x86_64 performance is better, because larger register | ||
93 | # bank allows to interleave reduction and multiplication better. | ||
94 | # | ||
95 | # Does it make sense to increase Naggr? To start with it's virtually | ||
96 | # impossible in 32-bit mode, because of limited register bank | ||
97 | # capacity. Otherwise improvement has to be weighed against slower | ||
98 | # setup, as well as code size and complexity increase. As even | ||
99 | # optimistic estimate doesn't promise 30% performance improvement, | ||
100 | # there are currently no plans to increase Naggr. | ||
101 | # | ||
102 | # Special thanks to David Woodhouse <dwmw2@infradead.org> for | ||
103 | # providing access to a Westmere-based system on behalf of Intel | ||
104 | # Open Source Technology Centre. | ||
105 | |||
106 | # January 2010 | ||
107 | # | ||
108 | # Tweaked to optimize transitions between integer and FP operations | ||
109 | # on same XMM register, PCLMULQDQ subroutine was measured to process | ||
110 | # one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. | ||
111 | # The minor regression on Westmere is outweighed by ~15% improvement | ||
112 | # on Sandy Bridge. Strangely enough attempt to modify 64-bit code in | ||
113 | # similar manner resulted in almost 20% degradation on Sandy Bridge, | ||
114 | # where original 64-bit code processes one byte in 1.95 cycles. | ||
115 | |||
# Locate our own directory so the shared perlasm helper can be found
# relative to this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# Last command-line argument "386" restricts output to plain-x86
# (no-MMX) code paths.
&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

# MMX/SSE2 code paths are emitted only when the build passes
# -DOPENSSL_IA32_SSE2.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# The four 32-bit limbs of the 128-bit Z accumulator (hh = most
# significant word ... ll = least significant word).
($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
$inp  = "edi";
$Htbl = "esi";

$unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
		# than unrolled, which has to be weighted against
		# 2.5x x86-specific code size reduction.
132 | |||
# Emit the scalar (non-MMX) 4-bit GHASH multiplication loop.
# $off is the byte offset in the stack frame where the 16-byte Xi
# copy was staged; the rem_4bit reduction table is expected at
# $off+16 (see deposit_rem_4bit).  On entry $Zll holds the
# pre-scaled first nibble index into Htable; on exit the four Z
# limbs hold the byte-swapped result.  Emits either a folded loop
# (two iterations per pass, counter borrowed from $inp) or a fully
# unrolled 31-step sequence, selected by the file-level $unroll flag.
sub x86_loop {
    my $off = shift;
    my $rem = "eax";

	&mov	($Zhh,&DWP(4,$Htbl,$Zll));
	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
	&mov	($Zll,&DWP(8,$Htbl,$Zll));
	&xor	($rem,$rem);	# avoid partial register stalls on PIII

	# shrd practically kills P4, 2.5x deterioration, but P4 has
	# MMX code-path to execute. shrd runs tad faster [than twice
	# the shifts, move's and or's] on pre-MMX Pentium (as well as
	# PIII and Core2), *but* minimizes code size, spares register
	# and thus allows to fold the loop...
	if (!$unroll) {
	my $cnt = $inp;	# $inp is free to double as loop counter here
	&mov	($cnt,15);	# counts Xi bytes 15..0, two nibbles per pass
	&jmp	(&label("x86_loop"));
	&set_label("x86_loop",16);
	    for($i=1;$i<=2;$i++) {
	    &mov	(&LB($rem),&LB($Zll));
	    &shrd	($Zll,$Zlh,4);	# shift 128-bit Z right by one nibble
	    &and	(&LB($rem),0xf);
	    &shrd	($Zlh,$Zhl,4);
	    &shrd	($Zhl,$Zhh,4);
	    &shr	($Zhh,4);
	    &xor	($Zhh,&DWP($off+16,"esp",$rem,4));	# fold in rem_4bit[rem]

	    &mov	(&LB($rem),&BP($off,"esp",$cnt));	# next Xi byte
	    if ($i&1) {
		&and	(&LB($rem),0xf0);	# high nibble, already scaled by 16
	    } else {
		&shl	(&LB($rem),4);		# low nibble, scale by 16
	    }

	    &xor	($Zll,&DWP(8,$Htbl,$rem));	# xor in Htable[nibble]
	    &xor	($Zlh,&DWP(12,$Htbl,$rem));
	    &xor	($Zhl,&DWP(0,$Htbl,$rem));
	    &xor	($Zhh,&DWP(4,$Htbl,$rem));

	    if ($i&1) {
		&dec	($cnt);
		&js	(&label("x86_break"));
	    } else {
		&jmp	(&label("x86_loop"));
	    }
	    }
	&set_label("x86_break",16);
	} else {
	    for($i=1;$i<32;$i++) {
	    &comment($i);
	    &mov	(&LB($rem),&LB($Zll));
	    &shrd	($Zll,$Zlh,4);
	    &and	(&LB($rem),0xf);
	    &shrd	($Zlh,$Zhl,4);
	    &shrd	($Zhl,$Zhh,4);
	    &shr	($Zhh,4);
	    &xor	($Zhh,&DWP($off+16,"esp",$rem,4));

	    if ($i&1) {
		&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
		&and	(&LB($rem),0xf0);
	    } else {
		&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
		&shl	(&LB($rem),4);
	    }

	    &xor	($Zll,&DWP(8,$Htbl,$rem));
	    &xor	($Zlh,&DWP(12,$Htbl,$rem));
	    &xor	($Zhl,&DWP(0,$Htbl,$rem));
	    &xor	($Zhh,&DWP(4,$Htbl,$rem));
	    }
	}
	# Convert result to big-endian byte order expected by callers.
	&bswap	($Zll);
	&bswap	($Zlh);
	&bswap	($Zhl);
	if (!$x86only) {
	    &bswap	($Zhh);
	} else {
	    # NOTE(review): 386-only build detours through eax instead of
	    # a direct bswap of $Zhh — presumably an assembler/target
	    # limitation; confirm against x86asm.pl before changing.
	    &mov	("eax",$Zhh);
	    &bswap	("eax");
	    &mov	($Zhh,"eax");
	}
}
218 | |||
# When the unrolled variant is selected, emit it once as a real
# subroutine so both gcm_gmult and gcm_ghash below can share it
# via call instead of duplicating ~31 unrolled steps twice.
if ($unroll) {
    &function_begin_B("_x86_gmult_4bit_inner");
	&x86_loop(4);
	&ret	();
    &function_end_B("_x86_gmult_4bit_inner");
}
225 | |||
# Emit code that writes the 16-entry rem_4bit reduction table onto
# the stack, one dword per entry, starting at byte offset $bias.
# Entry i is the GF(2^128) reduction constant for 4-bit remainder i,
# pre-shifted left by 16 so x86_loop can xor it straight into the
# top word of Z.
sub deposit_rem_4bit {
	my $bias = shift;
	my @rem_4bit = (0x0000,0x1C20,0x3840,0x2460,
			0x7080,0x6CA0,0x48C0,0x54E0,
			0xE100,0xFD20,0xD940,0xC560,
			0x9180,0x8DA0,0xA9C0,0xB5E0);

	for my $i (0..$#rem_4bit) {
		&mov	(&DWP($bias+4*$i,"esp"),$rem_4bit[$i]<<16);
	}
}
246 | |||
# Plain-x86 entry points get a "_x86" suffix when MMX variants are
# also being emitted, so run-time dispatch can pick between them.
$suffix = $x86only ? "" : "_x86";

# gcm_gmult_4bit[_x86](Xi, Htable): one GF(2^128) multiplication of
# the 16-byte Xi block by the hashed key.  Xi and the rem_4bit table
# are staged on the stack for x86_loop.
&function_begin("gcm_gmult_4bit".$suffix);
	&stack_push(16+4+1);			# +1 for stack alignment
	&mov	($inp,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable

	&mov	($Zhh,&DWP(0,$inp));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$inp));
	&mov	($Zlh,&DWP(8,$inp));
	&mov	($Zll,&DWP(12,$inp));

	&deposit_rem_4bit(16);			# rem_4bit table at esp+16

	&mov	(&DWP(0,"esp"),$Zhh);		# copy Xi[16] on stack
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(12,"esp"),$Zll);
	# (Xi[15] & 0xf) << 4: first-nibble Htable index, pre-scaled
	# for the 16-byte table stride.
	&shr	($Zll,20);
	&and	($Zll,0xf0);

	if ($unroll) {
		&call	("_x86_gmult_4bit_inner");
	} else {
		&x86_loop(0);
		&mov	($inp,&wparam(0));	# $inp was clobbered as loop counter
	}

	&mov	(&DWP(12,$inp),$Zll);		# store byte-swapped result to Xi
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
	&stack_pop(16+4+1);
&function_end("gcm_gmult_4bit".$suffix);

# gcm_ghash_4bit[_x86](Xi, Htable, inp, len): streamed GHASH.  For
# each 16-byte block of inp: Xi ^= block, then Xi = Xi * H via the
# same stack-staged x86_loop.
&function_begin("gcm_ghash_4bit".$suffix);
	&stack_push(16+4+1);			# +1 for 64-bit alignment
	&mov	($Zll,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable
	&mov	($inp,&wparam(2));		# load in
	&mov	("ecx",&wparam(3));		# load len
	&add	("ecx",$inp);
	&mov	(&wparam(3),"ecx");		# wparam(3) now = end-of-input pointer

	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zll));
	&mov	($Zlh,&DWP(8,$Zll));
	&mov	($Zll,&DWP(12,$Zll));

	&deposit_rem_4bit(16);

	&set_label("x86_outer_loop",16);
	&xor	($Zll,&DWP(12,$inp));		# xor with input
	&xor	($Zlh,&DWP(8,$inp));
	&xor	($Zhl,&DWP(4,$inp));
	&xor	($Zhh,&DWP(0,$inp));
	&mov	(&DWP(12,"esp"),$Zll);		# dump it on stack
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(0,"esp"),$Zhh);

	&shr	($Zll,20);			# first-nibble index, as in gmult
	&and	($Zll,0xf0);

	if ($unroll) {
		&call	("_x86_gmult_4bit_inner");
	} else {
		&x86_loop(0);
		&mov	($inp,&wparam(2));	# reload input pointer after loop
	}
	&lea	($inp,&DWP(16,$inp));
	&cmp	($inp,&wparam(3));
	&mov	(&wparam(2),$inp)	if (!$unroll);
	&jb	(&label("x86_outer_loop"));

	&mov	($inp,&wparam(0));		# load Xi
	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
	&stack_pop(16+4+1);
&function_end("gcm_ghash_4bit".$suffix);
329 | |||
330 | if (!$x86only) {{{ | ||
331 | |||
332 | &static_label("rem_4bit"); | ||
333 | |||
334 | if (!$sse2) {{ # pure-MMX "May" version... | ||
335 | |||
336 | $S=12; # shift factor for rem_4bit | ||
337 | |||
338 | &function_begin_B("_mmx_gmult_4bit_inner"); | ||
339 | # MMX version performs 3.5 times better on P4 (see comment in non-MMX | ||
340 | # routine for further details), 100% better on Opteron, ~70% better | ||
341 | # on Core2 and PIII... In other words effort is considered to be well | ||
342 | # spent... Since initial release the loop was unrolled in order to | ||
343 | # "liberate" register previously used as loop counter. Instead it's | ||
344 | # used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'. | ||
345 | # The path involves move of Z.lo from MMX to integer register, | ||
346 | # effective address calculation and finally merge of value to Z.hi. | ||
347 | # Reference to rem_4bit is scheduled so late that I had to >>4 | ||
348 | # rem_4bit elements. This resulted in 20-45% improvement | ||
349 | # on contemporary µ-archs. | ||
350 | { | ||
351 | my $cnt; | ||
352 | my $rem_4bit = "eax"; | ||
353 | my @rem = ($Zhh,$Zll); | ||
354 | my $nhi = $Zhl; | ||
355 | my $nlo = $Zlh; | ||
356 | |||
357 | my ($Zlo,$Zhi) = ("mm0","mm1"); | ||
358 | my $tmp = "mm2"; | ||
359 | |||
360 | &xor ($nlo,$nlo); # avoid partial register stalls on PIII | ||
361 | &mov ($nhi,$Zll); | ||
362 | &mov (&LB($nlo),&LB($nhi)); | ||
363 | &shl (&LB($nlo),4); | ||
364 | &and ($nhi,0xf0); | ||
365 | &movq ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
366 | &movq ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
367 | &movd ($rem[0],$Zlo); | ||
368 | |||
369 | for ($cnt=28;$cnt>=-2;$cnt--) { | ||
370 | my $odd = $cnt&1; | ||
371 | my $nix = $odd ? $nlo : $nhi; | ||
372 | |||
373 | &shl (&LB($nlo),4) if ($odd); | ||
374 | &psrlq ($Zlo,4); | ||
375 | &movq ($tmp,$Zhi); | ||
376 | &psrlq ($Zhi,4); | ||
377 | &pxor ($Zlo,&QWP(8,$Htbl,$nix)); | ||
378 | &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0); | ||
379 | &psllq ($tmp,60); | ||
380 | &and ($nhi,0xf0) if ($odd); | ||
381 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28); | ||
382 | &and ($rem[0],0xf); | ||
383 | &pxor ($Zhi,&QWP(0,$Htbl,$nix)); | ||
384 | &mov ($nhi,$nlo) if (!$odd && $cnt>=0); | ||
385 | &movd ($rem[1],$Zlo); | ||
386 | &pxor ($Zlo,$tmp); | ||
387 | |||
388 | push (@rem,shift(@rem)); # "rotate" registers | ||
389 | } | ||
390 | |||
391 | &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem] | ||
392 | |||
393 | &psrlq ($Zlo,32); # lower part of Zlo is already there | ||
394 | &movd ($Zhl,$Zhi); | ||
395 | &psrlq ($Zhi,32); | ||
396 | &movd ($Zlh,$Zlo); | ||
397 | &movd ($Zhh,$Zhi); | ||
398 | &shl ($inp,4); # compensate for rem_4bit[i] being >>4 | ||
399 | |||
400 | &bswap ($Zll); | ||
401 | &bswap ($Zhl); | ||
402 | &bswap ($Zlh); | ||
403 | &xor ($Zhh,$inp); | ||
404 | &bswap ($Zhh); | ||
405 | |||
406 | &ret (); | ||
407 | } | ||
408 | &function_end_B("_mmx_gmult_4bit_inner"); | ||
409 | |||
410 | &function_begin("gcm_gmult_4bit_mmx"); | ||
411 | &mov ($inp,&wparam(0)); # load Xi | ||
412 | &mov ($Htbl,&wparam(1)); # load Htable | ||
413 | |||
414 | &call (&label("pic_point")); | ||
415 | &set_label("pic_point"); | ||
416 | &blindpop("eax"); | ||
417 | &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); | ||
418 | |||
419 | &movz ($Zll,&BP(15,$inp)); | ||
420 | |||
421 | &call ("_mmx_gmult_4bit_inner"); | ||
422 | |||
423 | &mov ($inp,&wparam(0)); # load Xi | ||
424 | &emms (); | ||
425 | &mov (&DWP(12,$inp),$Zll); | ||
426 | &mov (&DWP(4,$inp),$Zhl); | ||
427 | &mov (&DWP(8,$inp),$Zlh); | ||
428 | &mov (&DWP(0,$inp),$Zhh); | ||
429 | &function_end("gcm_gmult_4bit_mmx"); | ||
430 | |||
431 | # Streamed version performs 20% better on P4, 7% on Opteron, | ||
432 | # 10% on Core2 and PIII... | ||
433 | &function_begin("gcm_ghash_4bit_mmx"); | ||
434 | &mov ($Zhh,&wparam(0)); # load Xi | ||
435 | &mov ($Htbl,&wparam(1)); # load Htable | ||
436 | &mov ($inp,&wparam(2)); # load in | ||
437 | &mov ($Zlh,&wparam(3)); # load len | ||
438 | |||
439 | &call (&label("pic_point")); | ||
440 | &set_label("pic_point"); | ||
441 | &blindpop("eax"); | ||
442 | &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); | ||
443 | |||
444 | &add ($Zlh,$inp); | ||
445 | &mov (&wparam(3),$Zlh); # len to point at the end of input | ||
446 | &stack_push(4+1); # +1 for stack alignment | ||
447 | |||
448 | &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] | ||
449 | &mov ($Zhl,&DWP(4,$Zhh)); | ||
450 | &mov ($Zlh,&DWP(8,$Zhh)); | ||
451 | &mov ($Zhh,&DWP(0,$Zhh)); | ||
452 | &jmp (&label("mmx_outer_loop")); | ||
453 | |||
454 | &set_label("mmx_outer_loop",16); | ||
455 | &xor ($Zll,&DWP(12,$inp)); | ||
456 | &xor ($Zhl,&DWP(4,$inp)); | ||
457 | &xor ($Zlh,&DWP(8,$inp)); | ||
458 | &xor ($Zhh,&DWP(0,$inp)); | ||
459 | &mov (&wparam(2),$inp); | ||
460 | &mov (&DWP(12,"esp"),$Zll); | ||
461 | &mov (&DWP(4,"esp"),$Zhl); | ||
462 | &mov (&DWP(8,"esp"),$Zlh); | ||
463 | &mov (&DWP(0,"esp"),$Zhh); | ||
464 | |||
465 | &mov ($inp,"esp"); | ||
466 | &shr ($Zll,24); | ||
467 | |||
468 | &call ("_mmx_gmult_4bit_inner"); | ||
469 | |||
470 | &mov ($inp,&wparam(2)); | ||
471 | &lea ($inp,&DWP(16,$inp)); | ||
472 | &cmp ($inp,&wparam(3)); | ||
473 | &jb (&label("mmx_outer_loop")); | ||
474 | |||
475 | &mov ($inp,&wparam(0)); # load Xi | ||
476 | &emms (); | ||
477 | &mov (&DWP(12,$inp),$Zll); | ||
478 | &mov (&DWP(4,$inp),$Zhl); | ||
479 | &mov (&DWP(8,$inp),$Zlh); | ||
480 | &mov (&DWP(0,$inp),$Zhh); | ||
481 | |||
482 | &stack_pop(4+1); | ||
483 | &function_end("gcm_ghash_4bit_mmx"); | ||
484 | |||
485 | }} else {{ # "June" MMX version... | ||
486 | # ... has slower "April" gcm_gmult_4bit_mmx with folded | ||
487 | # loop. This is done to conserve code size... | ||
488 | $S=16; # shift factor for rem_4bit | ||
489 | |||
490 | sub mmx_loop() { | ||
491 | # MMX version performs 2.8 times better on P4 (see comment in non-MMX | ||
492 | # routine for further details), 40% better on Opteron and Core2, 50% | ||
493 | # better on PIII... In other words effort is considered to be well | ||
494 | # spent... | ||
495 | my $inp = shift; | ||
496 | my $rem_4bit = shift; | ||
497 | my $cnt = $Zhh; | ||
498 | my $nhi = $Zhl; | ||
499 | my $nlo = $Zlh; | ||
500 | my $rem = $Zll; | ||
501 | |||
502 | my ($Zlo,$Zhi) = ("mm0","mm1"); | ||
503 | my $tmp = "mm2"; | ||
504 | |||
505 | &xor ($nlo,$nlo); # avoid partial register stalls on PIII | ||
506 | &mov ($nhi,$Zll); | ||
507 | &mov (&LB($nlo),&LB($nhi)); | ||
508 | &mov ($cnt,14); | ||
509 | &shl (&LB($nlo),4); | ||
510 | &and ($nhi,0xf0); | ||
511 | &movq ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
512 | &movq ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
513 | &movd ($rem,$Zlo); | ||
514 | &jmp (&label("mmx_loop")); | ||
515 | |||
516 | &set_label("mmx_loop",16); | ||
517 | &psrlq ($Zlo,4); | ||
518 | &and ($rem,0xf); | ||
519 | &movq ($tmp,$Zhi); | ||
520 | &psrlq ($Zhi,4); | ||
521 | &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); | ||
522 | &mov (&LB($nlo),&BP(0,$inp,$cnt)); | ||
523 | &psllq ($tmp,60); | ||
524 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
525 | &dec ($cnt); | ||
526 | &movd ($rem,$Zlo); | ||
527 | &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); | ||
528 | &mov ($nhi,$nlo); | ||
529 | &pxor ($Zlo,$tmp); | ||
530 | &js (&label("mmx_break")); | ||
531 | |||
532 | &shl (&LB($nlo),4); | ||
533 | &and ($rem,0xf); | ||
534 | &psrlq ($Zlo,4); | ||
535 | &and ($nhi,0xf0); | ||
536 | &movq ($tmp,$Zhi); | ||
537 | &psrlq ($Zhi,4); | ||
538 | &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
539 | &psllq ($tmp,60); | ||
540 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
541 | &movd ($rem,$Zlo); | ||
542 | &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
543 | &pxor ($Zlo,$tmp); | ||
544 | &jmp (&label("mmx_loop")); | ||
545 | |||
546 | &set_label("mmx_break",16); | ||
547 | &shl (&LB($nlo),4); | ||
548 | &and ($rem,0xf); | ||
549 | &psrlq ($Zlo,4); | ||
550 | &and ($nhi,0xf0); | ||
551 | &movq ($tmp,$Zhi); | ||
552 | &psrlq ($Zhi,4); | ||
553 | &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); | ||
554 | &psllq ($tmp,60); | ||
555 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
556 | &movd ($rem,$Zlo); | ||
557 | &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); | ||
558 | &pxor ($Zlo,$tmp); | ||
559 | |||
560 | &psrlq ($Zlo,4); | ||
561 | &and ($rem,0xf); | ||
562 | &movq ($tmp,$Zhi); | ||
563 | &psrlq ($Zhi,4); | ||
564 | &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); | ||
565 | &psllq ($tmp,60); | ||
566 | &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); | ||
567 | &movd ($rem,$Zlo); | ||
568 | &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); | ||
569 | &pxor ($Zlo,$tmp); | ||
570 | |||
571 | &psrlq ($Zlo,32); # lower part of Zlo is already there | ||
572 | &movd ($Zhl,$Zhi); | ||
573 | &psrlq ($Zhi,32); | ||
574 | &movd ($Zlh,$Zlo); | ||
575 | &movd ($Zhh,$Zhi); | ||
576 | |||
577 | &bswap ($Zll); | ||
578 | &bswap ($Zhl); | ||
579 | &bswap ($Zlh); | ||
580 | &bswap ($Zhh); | ||
581 | } | ||
582 | |||
583 | &function_begin("gcm_gmult_4bit_mmx"); | ||
584 | &mov ($inp,&wparam(0)); # load Xi | ||
585 | &mov ($Htbl,&wparam(1)); # load Htable | ||
586 | |||
587 | &call (&label("pic_point")); | ||
588 | &set_label("pic_point"); | ||
589 | &blindpop("eax"); | ||
590 | &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); | ||
591 | |||
592 | &movz ($Zll,&BP(15,$inp)); | ||
593 | |||
594 | &mmx_loop($inp,"eax"); | ||
595 | |||
596 | &emms (); | ||
597 | &mov (&DWP(12,$inp),$Zll); | ||
598 | &mov (&DWP(4,$inp),$Zhl); | ||
599 | &mov (&DWP(8,$inp),$Zlh); | ||
600 | &mov (&DWP(0,$inp),$Zhh); | ||
601 | &function_end("gcm_gmult_4bit_mmx"); | ||
602 | |||
603 | ###################################################################### | ||
604 | # Below subroutine is "528B" variant of "4-bit" GCM GHASH function | ||
605 | # (see gcm128.c for details). It provides further 20-40% performance | ||
606 | # improvement over above mentioned "May" version. | ||
607 | |||
608 | &static_label("rem_8bit"); | ||
609 | |||
610 | &function_begin("gcm_ghash_4bit_mmx"); | ||
611 | { my ($Zlo,$Zhi) = ("mm7","mm6"); | ||
612 | my $rem_8bit = "esi"; | ||
613 | my $Htbl = "ebx"; | ||
614 | |||
615 | # parameter block | ||
616 | &mov ("eax",&wparam(0)); # Xi | ||
617 | &mov ("ebx",&wparam(1)); # Htable | ||
618 | &mov ("ecx",&wparam(2)); # inp | ||
619 | &mov ("edx",&wparam(3)); # len | ||
620 | &mov ("ebp","esp"); # original %esp | ||
621 | &call (&label("pic_point")); | ||
622 | &set_label ("pic_point"); | ||
623 | &blindpop ($rem_8bit); | ||
624 | &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit)); | ||
625 | |||
626 | &sub ("esp",512+16+16); # allocate stack frame... | ||
627 | &and ("esp",-64); # ...and align it | ||
628 | &sub ("esp",16); # place for (u8)(H[]<<4) | ||
629 | |||
630 | &add ("edx","ecx"); # pointer to the end of input | ||
631 | &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi | ||
632 | &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len | ||
633 | &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp | ||
634 | |||
635 | { my @lo = ("mm0","mm1","mm2"); | ||
636 | my @hi = ("mm3","mm4","mm5"); | ||
637 | my @tmp = ("mm6","mm7"); | ||
638 | my $off1=0,$off2=0,$i; | ||
639 | |||
640 | &add ($Htbl,128); # optimize for size | ||
641 | &lea ("edi",&DWP(16+128,"esp")); | ||
642 | &lea ("ebp",&DWP(16+256+128,"esp")); | ||
643 | |||
644 | # decompose Htable (low and high parts are kept separately), | ||
645 | # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack... | ||
646 | for ($i=0;$i<18;$i++) { | ||
647 | |||
648 | &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16); | ||
649 | &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16); | ||
650 | &psllq ($tmp[1],60) if ($i>1); | ||
651 | &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16); | ||
652 | &por ($lo[2],$tmp[1]) if ($i>1); | ||
653 | &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17); | ||
654 | &psrlq ($lo[1],4) if ($i>0 && $i<17); | ||
655 | &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17); | ||
656 | &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17); | ||
657 | &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1); | ||
658 | &psrlq ($hi[1],4) if ($i>0 && $i<17); | ||
659 | &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1); | ||
660 | &shl ("edx",4) if ($i<16); | ||
661 | &mov (&BP($i,"esp"),&LB("edx")) if ($i<16); | ||
662 | |||
663 | unshift (@lo,pop(@lo)); # "rotate" registers | ||
664 | unshift (@hi,pop(@hi)); | ||
665 | unshift (@tmp,pop(@tmp)); | ||
666 | $off1 += 8 if ($i>0); | ||
667 | $off2 += 8 if ($i>1); | ||
668 | } | ||
669 | } | ||
670 | |||
671 | &movq ($Zhi,&QWP(0,"eax")); | ||
672 | &mov ("ebx",&DWP(8,"eax")); | ||
673 | &mov ("edx",&DWP(12,"eax")); # load Xi | ||
674 | |||
675 | &set_label("outer",16); | ||
676 | { my $nlo = "eax"; | ||
677 | my $dat = "edx"; | ||
678 | my @nhi = ("edi","ebp"); | ||
679 | my @rem = ("ebx","ecx"); | ||
680 | my @red = ("mm0","mm1","mm2"); | ||
681 | my $tmp = "mm3"; | ||
682 | |||
683 | &xor ($dat,&DWP(12,"ecx")); # merge input data | ||
684 | &xor ("ebx",&DWP(8,"ecx")); | ||
685 | &pxor ($Zhi,&QWP(0,"ecx")); | ||
686 | &lea ("ecx",&DWP(16,"ecx")); # inp+=16 | ||
687 | #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi | ||
688 | &mov (&DWP(528+8,"esp"),"ebx"); | ||
689 | &movq (&QWP(528+0,"esp"),$Zhi); | ||
690 | &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp | ||
691 | |||
692 | &xor ($nlo,$nlo); | ||
693 | &rol ($dat,8); | ||
694 | &mov (&LB($nlo),&LB($dat)); | ||
695 | &mov ($nhi[1],$nlo); | ||
696 | &and (&LB($nlo),0x0f); | ||
697 | &shr ($nhi[1],4); | ||
698 | &pxor ($red[0],$red[0]); | ||
699 | &rol ($dat,8); # next byte | ||
700 | &pxor ($red[1],$red[1]); | ||
701 | &pxor ($red[2],$red[2]); | ||
702 | |||
703 | # Just like in "May" verson modulo-schedule for critical path in | ||
704 | # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor' | ||
705 | # is scheduled so late that rem_8bit[] has to be shifted *right* | ||
706 | # by 16, which is why last argument to pinsrw is 2, which | ||
707 | # corresponds to <<32=<<48>>16... | ||
708 | for ($j=11,$i=0;$i<15;$i++) { | ||
709 | |||
710 | if ($i>0) { | ||
711 | &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] | ||
712 | &rol ($dat,8); # next byte | ||
713 | &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
714 | |||
715 | &pxor ($Zlo,$tmp); | ||
716 | &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); | ||
717 | &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) | ||
718 | } else { | ||
719 | &movq ($Zlo,&QWP(16,"esp",$nlo,8)); | ||
720 | &movq ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
721 | } | ||
722 | |||
723 | &mov (&LB($nlo),&LB($dat)); | ||
724 | &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0); | ||
725 | |||
726 | &movd ($rem[0],$Zlo); | ||
727 | &movz ($rem[1],&LB($rem[1])) if ($i>0); | ||
728 | &psrlq ($Zlo,8); # Z>>=8 | ||
729 | |||
730 | &movq ($tmp,$Zhi); | ||
731 | &mov ($nhi[0],$nlo); | ||
732 | &psrlq ($Zhi,8); | ||
733 | |||
734 | &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4 | ||
735 | &and (&LB($nlo),0x0f); | ||
736 | &psllq ($tmp,56); | ||
737 | |||
738 | &pxor ($Zhi,$red[1]) if ($i>1); | ||
739 | &shr ($nhi[0],4); | ||
740 | &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0); | ||
741 | |||
742 | unshift (@red,pop(@red)); # "rotate" registers | ||
743 | unshift (@rem,pop(@rem)); | ||
744 | unshift (@nhi,pop(@nhi)); | ||
745 | } | ||
746 | |||
747 | &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] | ||
748 | &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); | ||
749 | &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) | ||
750 | |||
751 | &pxor ($Zlo,$tmp); | ||
752 | &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); | ||
753 | &movz ($rem[1],&LB($rem[1])); | ||
754 | |||
755 | &pxor ($red[2],$red[2]); # clear 2nd word | ||
756 | &psllq ($red[1],4); | ||
757 | |||
758 | &movd ($rem[0],$Zlo); | ||
759 | &psrlq ($Zlo,4); # Z>>=4 | ||
760 | |||
761 | &movq ($tmp,$Zhi); | ||
762 | &psrlq ($Zhi,4); | ||
763 | &shl ($rem[0],4); # rem<<4 | ||
764 | |||
765 | &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi] | ||
766 | &psllq ($tmp,60); | ||
767 | &movz ($rem[0],&LB($rem[0])); | ||
768 | |||
769 | &pxor ($Zlo,$tmp); | ||
770 | &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8)); | ||
771 | |||
772 | &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2); | ||
773 | &pxor ($Zhi,$red[1]); | ||
774 | |||
775 | &movd ($dat,$Zlo); | ||
776 | &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48 | ||
777 | |||
778 | &psllq ($red[0],12); # correct by <<16>>4 | ||
779 | &pxor ($Zhi,$red[0]); | ||
780 | &psrlq ($Zlo,32); | ||
781 | &pxor ($Zhi,$red[2]); | ||
782 | |||
783 | &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp | ||
784 | &movd ("ebx",$Zlo); | ||
785 | &movq ($tmp,$Zhi); # 01234567 | ||
786 | &psllw ($Zhi,8); # 1.3.5.7. | ||
787 | &psrlw ($tmp,8); # .0.2.4.6 | ||
788 | &por ($Zhi,$tmp); # 10325476 | ||
789 | &bswap ($dat); | ||
790 | &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 | ||
791 | &bswap ("ebx"); | ||
792 | |||
793 | &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? | ||
794 | &jne (&label("outer")); | ||
795 | } | ||
796 | |||
797 | &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi | ||
798 | &mov (&DWP(12,"eax"),"edx"); | ||
799 | &mov (&DWP(8,"eax"),"ebx"); | ||
800 | &movq (&QWP(0,"eax"),$Zhi); | ||
801 | |||
802 | &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp | ||
803 | &emms (); | ||
804 | } | ||
805 | &function_end("gcm_ghash_4bit_mmx"); | ||
806 | }} | ||
807 | |||
808 | if ($sse2) {{ | ||
809 | ###################################################################### | ||
810 | # PCLMULQDQ version. | ||
811 | |||
812 | $Xip="eax"; | ||
813 | $Htbl="edx"; | ||
814 | $const="ecx"; | ||
815 | $inp="esi"; | ||
816 | $len="ebx"; | ||
817 | |||
818 | ($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; | ||
819 | ($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); | ||
820 | ($Xn,$Xhn)=("xmm6","xmm7"); | ||
821 | |||
822 | &static_label("bswap"); | ||
823 | |||
824 | sub clmul64x64_T2 { # minimal "register" pressure | ||
825 | my ($Xhi,$Xi,$Hkey)=@_; | ||
826 | |||
827 | &movdqa ($Xhi,$Xi); # | ||
828 | &pshufd ($T1,$Xi,0b01001110); | ||
829 | &pshufd ($T2,$Hkey,0b01001110); | ||
830 | &pxor ($T1,$Xi); # | ||
831 | &pxor ($T2,$Hkey); | ||
832 | |||
833 | &pclmulqdq ($Xi,$Hkey,0x00); ####### | ||
834 | &pclmulqdq ($Xhi,$Hkey,0x11); ####### | ||
835 | &pclmulqdq ($T1,$T2,0x00); ####### | ||
836 | &xorps ($T1,$Xi); # | ||
837 | &xorps ($T1,$Xhi); # | ||
838 | |||
839 | &movdqa ($T2,$T1); # | ||
840 | &psrldq ($T1,8); | ||
841 | &pslldq ($T2,8); # | ||
842 | &pxor ($Xhi,$T1); | ||
843 | &pxor ($Xi,$T2); # | ||
844 | } | ||
845 | |||
846 | sub clmul64x64_T3 { | ||
847 | # Even though this subroutine offers visually better ILP, it | ||
848 | # was empirically found to be a tad slower than above version. | ||
849 | # At least in gcm_ghash_clmul context. But it's just as well, | ||
850 | # because loop modulo-scheduling is possible only thanks to | ||
851 | # minimized "register" pressure... | ||
852 | my ($Xhi,$Xi,$Hkey)=@_; | ||
853 | |||
854 | &movdqa ($T1,$Xi); # | ||
855 | &movdqa ($Xhi,$Xi); | ||
856 | &pclmulqdq ($Xi,$Hkey,0x00); ####### | ||
857 | &pclmulqdq ($Xhi,$Hkey,0x11); ####### | ||
858 | &pshufd ($T2,$T1,0b01001110); # | ||
859 | &pshufd ($T3,$Hkey,0b01001110); | ||
860 | &pxor ($T2,$T1); # | ||
861 | &pxor ($T3,$Hkey); | ||
862 | &pclmulqdq ($T2,$T3,0x00); ####### | ||
863 | &pxor ($T2,$Xi); # | ||
864 | &pxor ($T2,$Xhi); # | ||
865 | |||
866 | &movdqa ($T3,$T2); # | ||
867 | &psrldq ($T2,8); | ||
868 | &pslldq ($T3,8); # | ||
869 | &pxor ($Xhi,$T2); | ||
870 | &pxor ($Xi,$T3); # | ||
871 | } | ||
872 | |||
873 | if (1) { # Algorithm 9 with <<1 twist. | ||
874 | # Reduction is shorter and uses only two | ||
875 | # temporary registers, which makes it better | ||
876 | # candidate for interleaving with 64x64 | ||
877 | # multiplication. Pre-modulo-scheduled loop | ||
878 | # was found to be ~20% faster than Algorithm 5 | ||
879 | # below. Algorithm 9 was therefore chosen for | ||
880 | # further optimization... | ||
881 | |||
882 | sub reduction_alg9 { # 17/13 times faster than Intel version | ||
883 | my ($Xhi,$Xi) = @_; | ||
884 | |||
885 | # 1st phase | ||
886 | &movdqa ($T1,$Xi) # | ||
887 | &psllq ($Xi,1); | ||
888 | &pxor ($Xi,$T1); # | ||
889 | &psllq ($Xi,5); # | ||
890 | &pxor ($Xi,$T1); # | ||
891 | &psllq ($Xi,57); # | ||
892 | &movdqa ($T2,$Xi); # | ||
893 | &pslldq ($Xi,8); | ||
894 | &psrldq ($T2,8); # | ||
895 | &pxor ($Xi,$T1); | ||
896 | &pxor ($Xhi,$T2); # | ||
897 | |||
898 | # 2nd phase | ||
899 | &movdqa ($T2,$Xi); | ||
900 | &psrlq ($Xi,5); | ||
901 | &pxor ($Xi,$T2); # | ||
902 | &psrlq ($Xi,1); # | ||
903 | &pxor ($Xi,$T2); # | ||
904 | &pxor ($T2,$Xhi); | ||
905 | &psrlq ($Xi,1); # | ||
906 | &pxor ($Xi,$T2); # | ||
907 | } | ||
908 | |||
909 | &function_begin_B("gcm_init_clmul"); | ||
910 | &mov ($Htbl,&wparam(0)); | ||
911 | &mov ($Xip,&wparam(1)); | ||
912 | |||
913 | &call (&label("pic")); | ||
914 | &set_label("pic"); | ||
915 | &blindpop ($const); | ||
916 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
917 | |||
918 | &movdqu ($Hkey,&QWP(0,$Xip)); | ||
919 | &pshufd ($Hkey,$Hkey,0b01001110);# dword swap | ||
920 | |||
921 | # <<1 twist | ||
922 | &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword | ||
923 | &movdqa ($T1,$Hkey); | ||
924 | &psllq ($Hkey,1); | ||
925 | &pxor ($T3,$T3); # | ||
926 | &psrlq ($T1,63); | ||
927 | &pcmpgtd ($T3,$T2); # broadcast carry bit | ||
928 | &pslldq ($T1,8); | ||
929 | &por ($Hkey,$T1); # H<<=1 | ||
930 | |||
931 | # magic reduction | ||
932 | &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial | ||
933 | &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial | ||
934 | |||
935 | # calculate H^2 | ||
936 | &movdqa ($Xi,$Hkey); | ||
937 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
938 | &reduction_alg9 ($Xhi,$Xi); | ||
939 | |||
940 | &movdqu (&QWP(0,$Htbl),$Hkey); # save H | ||
941 | &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 | ||
942 | |||
943 | &ret (); | ||
944 | &function_end_B("gcm_init_clmul"); | ||
945 | |||
946 | &function_begin_B("gcm_gmult_clmul"); | ||
947 | &mov ($Xip,&wparam(0)); | ||
948 | &mov ($Htbl,&wparam(1)); | ||
949 | |||
950 | &call (&label("pic")); | ||
951 | &set_label("pic"); | ||
952 | &blindpop ($const); | ||
953 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
954 | |||
955 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
956 | &movdqa ($T3,&QWP(0,$const)); | ||
957 | &movups ($Hkey,&QWP(0,$Htbl)); | ||
958 | &pshufb ($Xi,$T3); | ||
959 | |||
960 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); | ||
961 | &reduction_alg9 ($Xhi,$Xi); | ||
962 | |||
963 | &pshufb ($Xi,$T3); | ||
964 | &movdqu (&QWP(0,$Xip),$Xi); | ||
965 | |||
966 | &ret (); | ||
967 | &function_end_B("gcm_gmult_clmul"); | ||
968 | |||
969 | &function_begin("gcm_ghash_clmul"); | ||
970 | &mov ($Xip,&wparam(0)); | ||
971 | &mov ($Htbl,&wparam(1)); | ||
972 | &mov ($inp,&wparam(2)); | ||
973 | &mov ($len,&wparam(3)); | ||
974 | |||
975 | &call (&label("pic")); | ||
976 | &set_label("pic"); | ||
977 | &blindpop ($const); | ||
978 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
979 | |||
980 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
981 | &movdqa ($T3,&QWP(0,$const)); | ||
982 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
983 | &pshufb ($Xi,$T3); | ||
984 | |||
985 | &sub ($len,0x10); | ||
986 | &jz (&label("odd_tail")); | ||
987 | |||
988 | ####### | ||
989 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = | ||
990 | # [(H*Ii+1) + (H*Xi+1)] mod P = | ||
991 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P | ||
992 | # | ||
993 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
994 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
995 | &pshufb ($T1,$T3); | ||
996 | &pshufb ($Xn,$T3); | ||
997 | &pxor ($Xi,$T1); # Ii+Xi | ||
998 | |||
999 | &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
1000 | &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
1001 | |||
1002 | &lea ($inp,&DWP(32,$inp)); # i+=2 | ||
1003 | &sub ($len,0x20); | ||
1004 | &jbe (&label("even_tail")); | ||
1005 | |||
1006 | &set_label("mod_loop"); | ||
1007 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
1008 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
1009 | &movups ($Hkey,&QWP(0,$Htbl)); # load H | ||
1010 | |||
1011 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
1012 | &pxor ($Xhi,$Xhn); | ||
1013 | |||
1014 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
1015 | &pshufb ($T1,$T3); | ||
1016 | &pshufb ($Xn,$T3); | ||
1017 | |||
1018 | &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 | ||
1019 | &movdqa ($Xhn,$Xn); | ||
1020 | &pxor ($Xhi,$T1); # "Ii+Xi", consume early | ||
1021 | |||
1022 | &movdqa ($T1,$Xi) #&reduction_alg9($Xhi,$Xi); 1st phase | ||
1023 | &psllq ($Xi,1); | ||
1024 | &pxor ($Xi,$T1); # | ||
1025 | &psllq ($Xi,5); # | ||
1026 | &pxor ($Xi,$T1); # | ||
1027 | &pclmulqdq ($Xn,$Hkey,0x00); ####### | ||
1028 | &psllq ($Xi,57); # | ||
1029 | &movdqa ($T2,$Xi); # | ||
1030 | &pslldq ($Xi,8); | ||
1031 | &psrldq ($T2,8); # | ||
1032 | &pxor ($Xi,$T1); | ||
1033 | &pshufd ($T1,$T3,0b01001110); | ||
1034 | &pxor ($Xhi,$T2); # | ||
1035 | &pxor ($T1,$T3); | ||
1036 | &pshufd ($T3,$Hkey,0b01001110); | ||
1037 | &pxor ($T3,$Hkey); # | ||
1038 | |||
1039 | &pclmulqdq ($Xhn,$Hkey,0x11); ####### | ||
1040 | &movdqa ($T2,$Xi); # 2nd phase | ||
1041 | &psrlq ($Xi,5); | ||
1042 | &pxor ($Xi,$T2); # | ||
1043 | &psrlq ($Xi,1); # | ||
1044 | &pxor ($Xi,$T2); # | ||
1045 | &pxor ($T2,$Xhi); | ||
1046 | &psrlq ($Xi,1); # | ||
1047 | &pxor ($Xi,$T2); # | ||
1048 | |||
1049 | &pclmulqdq ($T1,$T3,0x00); ####### | ||
1050 | &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
1051 | &xorps ($T1,$Xn); # | ||
1052 | &xorps ($T1,$Xhn); # | ||
1053 | |||
1054 | &movdqa ($T3,$T1); # | ||
1055 | &psrldq ($T1,8); | ||
1056 | &pslldq ($T3,8); # | ||
1057 | &pxor ($Xhn,$T1); | ||
1058 | &pxor ($Xn,$T3); # | ||
1059 | &movdqa ($T3,&QWP(0,$const)); | ||
1060 | |||
1061 | &lea ($inp,&DWP(32,$inp)); | ||
1062 | &sub ($len,0x20); | ||
1063 | &ja (&label("mod_loop")); | ||
1064 | |||
1065 | &set_label("even_tail"); | ||
1066 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
1067 | |||
1068 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
1069 | &pxor ($Xhi,$Xhn); | ||
1070 | |||
1071 | &reduction_alg9 ($Xhi,$Xi); | ||
1072 | |||
1073 | &test ($len,$len); | ||
1074 | &jnz (&label("done")); | ||
1075 | |||
1076 | &movups ($Hkey,&QWP(0,$Htbl)); # load H | ||
1077 | &set_label("odd_tail"); | ||
1078 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
1079 | &pshufb ($T1,$T3); | ||
1080 | &pxor ($Xi,$T1); # Ii+Xi | ||
1081 | |||
1082 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) | ||
1083 | &reduction_alg9 ($Xhi,$Xi); | ||
1084 | |||
1085 | &set_label("done"); | ||
1086 | &pshufb ($Xi,$T3); | ||
1087 | &movdqu (&QWP(0,$Xip),$Xi); | ||
1088 | &function_end("gcm_ghash_clmul"); | ||
1089 | |||
1090 | } else { # Algorith 5. Kept for reference purposes. | ||
1091 | |||
1092 | sub reduction_alg5 { # 19/16 times faster than Intel version | ||
1093 | my ($Xhi,$Xi)=@_; | ||
1094 | |||
1095 | # <<1 | ||
1096 | &movdqa ($T1,$Xi); # | ||
1097 | &movdqa ($T2,$Xhi); | ||
1098 | &pslld ($Xi,1); | ||
1099 | &pslld ($Xhi,1); # | ||
1100 | &psrld ($T1,31); | ||
1101 | &psrld ($T2,31); # | ||
1102 | &movdqa ($T3,$T1); | ||
1103 | &pslldq ($T1,4); | ||
1104 | &psrldq ($T3,12); # | ||
1105 | &pslldq ($T2,4); | ||
1106 | &por ($Xhi,$T3); # | ||
1107 | &por ($Xi,$T1); | ||
1108 | &por ($Xhi,$T2); # | ||
1109 | |||
1110 | # 1st phase | ||
1111 | &movdqa ($T1,$Xi); | ||
1112 | &movdqa ($T2,$Xi); | ||
1113 | &movdqa ($T3,$Xi); # | ||
1114 | &pslld ($T1,31); | ||
1115 | &pslld ($T2,30); | ||
1116 | &pslld ($Xi,25); # | ||
1117 | &pxor ($T1,$T2); | ||
1118 | &pxor ($T1,$Xi); # | ||
1119 | &movdqa ($T2,$T1); # | ||
1120 | &pslldq ($T1,12); | ||
1121 | &psrldq ($T2,4); # | ||
1122 | &pxor ($T3,$T1); | ||
1123 | |||
1124 | # 2nd phase | ||
1125 | &pxor ($Xhi,$T3); # | ||
1126 | &movdqa ($Xi,$T3); | ||
1127 | &movdqa ($T1,$T3); | ||
1128 | &psrld ($Xi,1); # | ||
1129 | &psrld ($T1,2); | ||
1130 | &psrld ($T3,7); # | ||
1131 | &pxor ($Xi,$T1); | ||
1132 | &pxor ($Xhi,$T2); | ||
1133 | &pxor ($Xi,$T3); # | ||
1134 | &pxor ($Xi,$Xhi); # | ||
1135 | } | ||
1136 | |||
1137 | &function_begin_B("gcm_init_clmul"); | ||
1138 | &mov ($Htbl,&wparam(0)); | ||
1139 | &mov ($Xip,&wparam(1)); | ||
1140 | |||
1141 | &call (&label("pic")); | ||
1142 | &set_label("pic"); | ||
1143 | &blindpop ($const); | ||
1144 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
1145 | |||
1146 | &movdqu ($Hkey,&QWP(0,$Xip)); | ||
1147 | &pshufd ($Hkey,$Hkey,0b01001110);# dword swap | ||
1148 | |||
1149 | # calculate H^2 | ||
1150 | &movdqa ($Xi,$Hkey); | ||
1151 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); | ||
1152 | &reduction_alg5 ($Xhi,$Xi); | ||
1153 | |||
1154 | &movdqu (&QWP(0,$Htbl),$Hkey); # save H | ||
1155 | &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 | ||
1156 | |||
1157 | &ret (); | ||
1158 | &function_end_B("gcm_init_clmul"); | ||
1159 | |||
1160 | &function_begin_B("gcm_gmult_clmul"); | ||
1161 | &mov ($Xip,&wparam(0)); | ||
1162 | &mov ($Htbl,&wparam(1)); | ||
1163 | |||
1164 | &call (&label("pic")); | ||
1165 | &set_label("pic"); | ||
1166 | &blindpop ($const); | ||
1167 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
1168 | |||
1169 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
1170 | &movdqa ($Xn,&QWP(0,$const)); | ||
1171 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
1172 | &pshufb ($Xi,$Xn); | ||
1173 | |||
1174 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); | ||
1175 | &reduction_alg5 ($Xhi,$Xi); | ||
1176 | |||
1177 | &pshufb ($Xi,$Xn); | ||
1178 | &movdqu (&QWP(0,$Xip),$Xi); | ||
1179 | |||
1180 | &ret (); | ||
1181 | &function_end_B("gcm_gmult_clmul"); | ||
1182 | |||
1183 | &function_begin("gcm_ghash_clmul"); | ||
1184 | &mov ($Xip,&wparam(0)); | ||
1185 | &mov ($Htbl,&wparam(1)); | ||
1186 | &mov ($inp,&wparam(2)); | ||
1187 | &mov ($len,&wparam(3)); | ||
1188 | |||
1189 | &call (&label("pic")); | ||
1190 | &set_label("pic"); | ||
1191 | &blindpop ($const); | ||
1192 | &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); | ||
1193 | |||
1194 | &movdqu ($Xi,&QWP(0,$Xip)); | ||
1195 | &movdqa ($T3,&QWP(0,$const)); | ||
1196 | &movdqu ($Hkey,&QWP(0,$Htbl)); | ||
1197 | &pshufb ($Xi,$T3); | ||
1198 | |||
1199 | &sub ($len,0x10); | ||
1200 | &jz (&label("odd_tail")); | ||
1201 | |||
1202 | ####### | ||
1203 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = | ||
1204 | # [(H*Ii+1) + (H*Xi+1)] mod P = | ||
1205 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P | ||
1206 | # | ||
1207 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
1208 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
1209 | &pshufb ($T1,$T3); | ||
1210 | &pshufb ($Xn,$T3); | ||
1211 | &pxor ($Xi,$T1); # Ii+Xi | ||
1212 | |||
1213 | &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
1214 | &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
1215 | |||
1216 | &sub ($len,0x20); | ||
1217 | &lea ($inp,&DWP(32,$inp)); # i+=2 | ||
1218 | &jbe (&label("even_tail")); | ||
1219 | |||
1220 | &set_label("mod_loop"); | ||
1221 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
1222 | &movdqu ($Hkey,&QWP(0,$Htbl)); # load H | ||
1223 | |||
1224 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
1225 | &pxor ($Xhi,$Xhn); | ||
1226 | |||
1227 | &reduction_alg5 ($Xhi,$Xi); | ||
1228 | |||
1229 | ####### | ||
1230 | &movdqa ($T3,&QWP(0,$const)); | ||
1231 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
1232 | &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 | ||
1233 | &pshufb ($T1,$T3); | ||
1234 | &pshufb ($Xn,$T3); | ||
1235 | &pxor ($Xi,$T1); # Ii+Xi | ||
1236 | |||
1237 | &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 | ||
1238 | &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 | ||
1239 | |||
1240 | &sub ($len,0x20); | ||
1241 | &lea ($inp,&DWP(32,$inp)); | ||
1242 | &ja (&label("mod_loop")); | ||
1243 | |||
1244 | &set_label("even_tail"); | ||
1245 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) | ||
1246 | |||
1247 | &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) | ||
1248 | &pxor ($Xhi,$Xhn); | ||
1249 | |||
1250 | &reduction_alg5 ($Xhi,$Xi); | ||
1251 | |||
1252 | &movdqa ($T3,&QWP(0,$const)); | ||
1253 | &test ($len,$len); | ||
1254 | &jnz (&label("done")); | ||
1255 | |||
1256 | &movdqu ($Hkey,&QWP(0,$Htbl)); # load H | ||
1257 | &set_label("odd_tail"); | ||
1258 | &movdqu ($T1,&QWP(0,$inp)); # Ii | ||
1259 | &pshufb ($T1,$T3); | ||
1260 | &pxor ($Xi,$T1); # Ii+Xi | ||
1261 | |||
1262 | &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) | ||
1263 | &reduction_alg5 ($Xhi,$Xi); | ||
1264 | |||
1265 | &movdqa ($T3,&QWP(0,$const)); | ||
1266 | &set_label("done"); | ||
1267 | &pshufb ($Xi,$T3); | ||
1268 | &movdqu (&QWP(0,$Xip),$Xi); | ||
1269 | &function_end("gcm_ghash_clmul"); | ||
1270 | |||
1271 | } | ||
1272 | |||
1273 | &set_label("bswap",64); | ||
1274 | &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | ||
1275 | &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial | ||
1276 | }} # $sse2 | ||
1277 | |||
1278 | &set_label("rem_4bit",64); | ||
1279 | &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); | ||
1280 | &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); | ||
1281 | &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); | ||
1282 | &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); | ||
1283 | &set_label("rem_8bit",64); | ||
1284 | &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); | ||
1285 | &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); | ||
1286 | &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E); | ||
1287 | &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E); | ||
1288 | &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E); | ||
1289 | &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E); | ||
1290 | &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E); | ||
1291 | &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E); | ||
1292 | &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE); | ||
1293 | &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE); | ||
1294 | &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE); | ||
1295 | &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE); | ||
1296 | &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E); | ||
1297 | &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E); | ||
1298 | &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE); | ||
1299 | &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE); | ||
1300 | &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E); | ||
1301 | &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E); | ||
1302 | &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E); | ||
1303 | &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E); | ||
1304 | &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E); | ||
1305 | &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E); | ||
1306 | &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E); | ||
1307 | &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E); | ||
1308 | &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE); | ||
1309 | &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE); | ||
1310 | &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE); | ||
1311 | &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE); | ||
1312 | &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E); | ||
1313 | &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); | ||
1314 | &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); | ||
1315 | &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); | ||
1316 | }}} # !$x86only | ||
1317 | |||
1318 | &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
1319 | &asm_finish(); | ||
1320 | |||
1321 | # A question was risen about choice of vanilla MMX. Or rather why wasn't | ||
1322 | # SSE2 chosen instead? In addition to the fact that MMX runs on legacy | ||
1323 | # CPUs such as PIII, "4-bit" MMX version was observed to provide better | ||
1324 | # performance than *corresponding* SSE2 one even on contemporary CPUs. | ||
1325 | # SSE2 results were provided by Peter-Michael Hager. He maintains SSE2 | ||
1326 | # implementation featuring full range of lookup-table sizes, but with | ||
1327 | # per-invocation lookup table setup. Latter means that table size is | ||
1328 | # chosen depending on how much data is to be hashed in every given call, | ||
1329 | # more data - larger table. Best reported result for Core2 is ~4 cycles | ||
1330 | # per processed byte out of 64KB block. This number accounts even for | ||
1331 | # 64KB table setup overhead. As discussed in gcm128.c we choose to be | ||
1332 | # more conservative in respect to lookup table sizes, but how do the | ||
1333 | # results compare? Minimalistic "256B" MMX version delivers ~11 cycles | ||
1334 | # on same platform. As also discussed in gcm128.c, next in line "8-bit | ||
1335 | # Shoup's" or "4KB" method should deliver twice the performance of | ||
1336 | # "256B" one, in other words not worse than ~6 cycles per byte. It | ||
1337 | # should be also be noted that in SSE2 case improvement can be "super- | ||
1338 | # linear," i.e. more than twice, mostly because >>8 maps to single | ||
1339 | # instruction on SSE2 register. This is unlike "4-bit" case when >>4 | ||
1340 | # maps to same amount of instructions in both MMX and SSE2 cases. | ||
1341 | # Bottom line is that switch to SSE2 is considered to be justifiable | ||
1342 | # only in case we choose to implement "8-bit" method... | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl new file mode 100644 index 0000000000..a5ae180882 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl | |||
@@ -0,0 +1,805 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # March, June 2010 | ||
11 | # | ||
12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
13 | # single multiplication operation in GF(2^128). "4-bit" means that | ||
14 | # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH | ||
15 | # function features so called "528B" variant utilizing additional | ||
16 | # 256+16 bytes of per-key storage [+512 bytes shared table]. | ||
17 | # Performance results are for this streamed GHASH subroutine and are | ||
18 | # expressed in cycles per processed byte, less is better: | ||
19 | # | ||
20 | # gcc 3.4.x(*) assembler | ||
21 | # | ||
22 | # P4 28.6 14.0 +100% | ||
23 | # Opteron 19.3 7.7 +150% | ||
24 | # Core2 17.8 8.1(**) +120% | ||
25 | # | ||
26 | # (*) comparison is not completely fair, because C results are | ||
27 | # for vanilla "256B" implementation, while assembler results | ||
28 | # are for "528B";-) | ||
29 | # (**) it's a mystery [to me] why the Core2 result is not the same as for | ||
30 | # Opteron; | ||
31 | |||
32 | # May 2010 | ||
33 | # | ||
34 | # Add PCLMULQDQ version performing at 2.02 cycles per processed byte. | ||
35 | # See ghash-x86.pl for background information and details about coding | ||
36 | # techniques. | ||
37 | # | ||
38 | # Special thanks to David Woodhouse <dwmw2@infradead.org> for | ||
39 | # providing access to a Westmere-based system on behalf of Intel | ||
40 | # Open Source Technology Centre. | ||
41 | |||
# Command-line handling: arguments are the perlasm "flavour" (e.g. elf,
# macosx, mingw64, nasm) followed by the output file name.  A single
# argument containing a dot is taken to be the output file itself.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 ABI (and the SEH unwind data emitted later) is wanted for
# [nm]asm/mingw64 flavours or an explicit .asm output file.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator relative to this script's own
# path, trying the side-by-side location first, then the perlasm dir.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all generated code through the translator.  The original ignored
# open() failure, which would silently discard every subsequent print
# and leave an empty/garbled output file — fail loudly instead.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
54 | |||
# common register layout
# NOTE(review): these Perl scalars hold x86_64 register names that are
# interpolated into the generated assembly for the "4-bit" routines.
$nlo="%rax";		# table index built from the low nibble of a byte
$nhi="%rbx";		# table index built from the high nibble
$Zlo="%r8";		# low 64 bits of the accumulated Z value
$Zhi="%r9";		# high 64 bits of the accumulated Z value
$tmp="%r10";		# scratch
$rem_4bit = "%r11";	# pointer to the .Lrem_4bit reduction table

$Xi="%rdi";		# 1st SysV argument: pointer to the Xi block
$Htbl="%rsi";		# 2nd SysV argument: per-key lookup table

# per-function register layout
$cnt="%rcx";		# byte loop counter (counts down from 14)
$rem="%rdx";		# reduction-table index
69 | |||
# LB - map a 64/32-bit general-purpose register name to its low-byte
# (8-bit) alias, e.g. %rax/%eax -> %al, %rsi -> %sil, %r10/%r10d -> %r10b.
# Returns the input unchanged if nothing matches.
#
# Fixes vs. original: replacement text used the deprecated \1 form
# (which warns "\1 better written as $1" under warnings); and the sub
# carried an empty () prototype despite taking an argument.  Every call
# site uses &LB(...), which bypasses prototypes, so dropping the
# prototype is backward compatible and less misleading.
sub LB {
	my $r = shift;
	$r =~ s/%[er]([a-d])x/%${1}l/	or	# classic a/b/c/d registers
	$r =~ s/%[er]([sd]i)/%${1}l/	or	# %sil / %dil
	$r =~ s/%[er](bp)/%${1}l/	or	# %bpl
	$r =~ s/%(r[0-9]+)[d]?/%${1}b/;		# numbered regs %r8..%r15
	$r;
}
74 | |||
# AUTOLOAD - catch-all for calls to undefined subs such as &mov(...),
# &xor(...): the missing sub's name becomes the instruction mnemonic and
# the call is rendered as one assembly line appended to the global $code.
# Arguments are given destination-first (Intel-style); the pop + reverse
# emits them in AT&T (source-first) operand order.  A purely numeric
# argument is prefixed with '$' to mark it as an immediate.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);	# bare number -> immediate
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
81 | |||
# loop() - emit the inner "4-bit" GF(2^128) multiplication loop that
# walks the 16 bytes at $inp (high byte first, counter starts at 14
# after byte 15 is pre-loaded by the caller), doing two table lookups
# per byte (low nibble via $nlo, high nibble via $nhi) with .Lrem_4bit
# reduction, and finishes with a byte swap of the Z accumulator.
# $N makes the .Loop/.Lbreak labels unique per instantiation.
{ my $N;
  sub loop() {
  my $inp = shift;

	$N++;
$code.=<<___;
	xor	$nlo,$nlo
	xor	$nhi,$nhi
	mov	`&LB("$Zlo")`,`&LB("$nlo")`
	mov	`&LB("$Zlo")`,`&LB("$nhi")`
	shl	\$4,`&LB("$nlo")`
	mov	\$14,$cnt
	mov	8($Htbl,$nlo),$Zlo
	mov	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	mov	$Zlo,$rem
	jmp	.Loop$N

.align	16
.Loop$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	mov	($inp,$cnt),`&LB("$nlo")`
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	mov	`&LB("$nlo")`,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	shl	\$4,`&LB("$nlo")`
	xor	$tmp,$Zlo
	dec	$cnt
	js	.Lbreak$N

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo
	jmp	.Loop$N

.align	16
.Lbreak$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	xor	$tmp,$Zlo
	xor	($rem_4bit,$rem,8),$Zhi

	bswap	$Zlo
	bswap	$Zhi
___
}}
159 | |||
# Emit gcm_gmult_4bit(u64 Xi[2], const u128 Htbl[16]): a single GHASH
# multiplication of Xi by H, using the shared loop() body above.
# %rbp/%r12 are pushed (though unused) so the same Win64 SEH handler
# stack layout can be reused — see the comment in the prologue.
$code=<<___;
.text

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,\@function,2
.align	16
gcm_gmult_4bit:
	push	%rbx
	push	%rbp		# %rbp and %r12 are pushed exclusively in
	push	%r12		# order to reuse Win64 exception handler...
.Lgmult_prologue:

	movzb	15($Xi),$Zlo
	lea	.Lrem_4bit(%rip),$rem_4bit
___
	&loop	($Xi);
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	mov	16(%rsp),%rbx
	lea	24(%rsp),%rsp
.Lgmult_epilogue:
	ret
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
186 | |||
# per-function register layout for gcm_ghash_4bit
$inp="%rdx";		# 3rd SysV argument: input data
$len="%rcx";		# 4th SysV argument: byte count
$rem_8bit=$rem_4bit;	# %r11 doubles as the .Lrem_8bit table pointer

# Emit the gcm_ghash_4bit prologue: save all callee-saved registers and
# reserve 280 bytes of stack for the tables built by the body below.
$code.=<<___;
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,\@function,4
.align	16
gcm_ghash_4bit:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$280,%rsp
.Lghash_prologue:
	mov	$inp,%r14		# reassign couple of args
	mov	$len,%r15
___
# Body of gcm_ghash_4bit.  The setup loop below precomputes, on the
# stack, what appears to be a >>4-shifted copy of Htbl (at $Hshr4) plus
# a small nibble table at (%rsp) — NOTE(review): inferred from the
# shr/shl-by-4 dance over all 16 Htbl entries; confirm against gcm128.c.
# The outer loop then consumes input 16 bytes at a time, alternating
# two register pairs (@nhi/@rem are "rotated" each iteration) so
# independent lookups can overlap.
{ my $inp="%r14";
  my $dat="%edx";
  my $len="%r15";
  my @nhi=("%ebx","%ecx");
  my @rem=("%r12","%r13");
  my $Hshr4="%rbp";

	&sub	($Htbl,-128);		# size optimization
	&lea	($Hshr4,"16+128(%rsp)");
	{ my @lo =($nlo,$nhi);
	  my @hi =($Zlo,$Zhi);

	  &xor	($dat,$dat);
	  # Software-pipelined setup: each $i stage loads one Htbl entry
	  # and stores the shifted result of the entry loaded two stages
	  # earlier, hence the $i>0/$i>1/$i<16/$i<17 guards.
	  for ($i=0,$j=-2;$i<18;$i++,$j++) {
	    &mov	("$j(%rsp)",&LB($dat))		if ($i>1);
	    &or		($lo[0],$tmp)			if ($i>1);
	    &mov	(&LB($dat),&LB($lo[1]))		if ($i>0 && $i<17);
	    &shr	($lo[1],4)			if ($i>0 && $i<17);
	    &mov	($tmp,$hi[1])			if ($i>0 && $i<17);
	    &shr	($hi[1],4)			if ($i>0 && $i<17);
	    &mov	("8*$j($Hshr4)",$hi[0])		if ($i>1);
	    &mov	($hi[0],"16*$i+0-128($Htbl)")	if ($i<16);
	    &shl	(&LB($dat),4)			if ($i>0 && $i<17);
	    &mov	("8*$j-128($Hshr4)",$lo[0])	if ($i>1);
	    &mov	($lo[0],"16*$i+8-128($Htbl)")	if ($i<16);
	    &shl	($tmp,60)			if ($i>0 && $i<17);

	    push	(@lo,shift(@lo));
	    push	(@hi,shift(@hi));
	  }
	}
	&add	($Htbl,-128);
	&mov	($Zlo,"8($Xi)");
	&mov	($Zhi,"0($Xi)");
	&add	($len,$inp);		# pointer to the end of data
	&lea	($rem_8bit,".Lrem_8bit(%rip)");
	&jmp	(".Louter_loop");

$code.=".align	16\n.Louter_loop:\n";
	# Fold one 16-byte input block into Xi, store the intermediate
	# back to ($Xi), then run the byte-at-a-time multiply below.
	&xor	($Zhi,"($inp)");
	&mov	("%rdx","8($inp)");
	&lea	($inp,"16($inp)");
	&xor	("%rdx",$Zlo);
	&mov	("($Xi)",$Zhi);
	&mov	("8($Xi)","%rdx");
	&shr	("%rdx",32);

	&xor	($nlo,$nlo);
	&rol	($dat,8);
	&mov	(&LB($nlo),&LB($dat));
	&movz	($nhi[0],&LB($dat));
	&shl	(&LB($nlo),4);
	&shr	($nhi[0],4);

	# 15 unrolled byte stages; $dat is refilled from ($Xi) every
	# 4th stage ($j counts 11,10,... and triggers on multiples of 4).
	for ($j=11,$i=0;$i<15;$i++) {
	    &rol	($dat,8);
	    &xor	($Zlo,"8($Htbl,$nlo)")		if ($i>0);
	    &xor	($Zhi,"($Htbl,$nlo)")		if ($i>0);
	    &mov	($Zlo,"8($Htbl,$nlo)")		if ($i==0);
	    &mov	($Zhi,"($Htbl,$nlo)")		if ($i==0);

	    &mov	(&LB($nlo),&LB($dat));
	    &xor	($Zlo,$tmp)			if ($i>0);
	    &movzw	($rem[1],"($rem_8bit,$rem[1],2)")	if ($i>0);

	    &movz	($nhi[1],&LB($dat));
	    &shl	(&LB($nlo),4);
	    &movzb	($rem[0],"(%rsp,$nhi[0])");

	    &shr	($nhi[1],4)			if ($i<14);
	    &and	($nhi[1],0xf0)			if ($i==14);
	    &shl	($rem[1],48)			if ($i>0);
	    &xor	($rem[0],$Zlo);

	    &mov	($tmp,$Zhi);
	    &xor	($Zhi,$rem[1])			if ($i>0);
	    &shr	($Zlo,8);

	    &movz	($rem[0],&LB($rem[0]));
	    &mov	($dat,"$j($Xi)")		if (--$j%4==0);
	    &shr	($Zhi,8);

	    &xor	($Zlo,"-128($Hshr4,$nhi[0],8)");
	    &shl	($tmp,56);
	    &xor	($Zhi,"($Hshr4,$nhi[0],8)");

	    unshift	(@nhi,pop(@nhi));		# "rotate" registers
	    unshift	(@rem,pop(@rem));
	}
	# Final (16th) stage plus trailing 4-bit reduction and byte swap.
	&movzw	($rem[1],"($rem_8bit,$rem[1],2)");
	&xor	($Zlo,"8($Htbl,$nlo)");
	&xor	($Zhi,"($Htbl,$nlo)");

	&shl	($rem[1],48);
	&xor	($Zlo,$tmp);

	&xor	($Zhi,$rem[1]);
	&movz	($rem[0],&LB($Zlo));
	&shr	($Zlo,4);

	&mov	($tmp,$Zhi);
	&shl	(&LB($rem[0]),4);
	&shr	($Zhi,4);

	&xor	($Zlo,"8($Htbl,$nhi[0])");
	&movzw	($rem[0],"($rem_8bit,$rem[0],2)");
	&shl	($tmp,60);

	&xor	($Zhi,"($Htbl,$nhi[0])");
	&xor	($Zlo,$tmp);
	&shl	($rem[0],48);

	&bswap	($Zlo);
	&xor	($Zhi,$rem[0]);

	&bswap	($Zhi);
	&cmp	($inp,$len);
	&jb	(".Louter_loop");
}
# Epilogue: store the result and restore callee-saved registers.
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	lea	280(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lghash_epilogue:
	ret
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
___
343 | |||
######################################################################
# PCLMULQDQ version.

# Argument registers differ between the Windows and SysV ABIs; the
# clmul entry points are \@abi-omnipotent, so pick per $win64.
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

# XMM register layout shared by the clmul subroutines below.
($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
# clmul64x64_T2 - emit a 128x128-bit carry-less multiplication
# ($Xhi:$Xi) = $Xi * $Hkey using three PCLMULQDQs (Karatsuba).
# When $modulo is defined, the caller has already emitted the
# movdqa/pshufd/pxor preparation interleaved with other work, so the
# first heredoc is skipped.
sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$modulo)=@_;

$code.=<<___ if (!defined($modulo));
	movdqa	$Xi,$Xhi		#
	pshufd	\$0b01001110,$Xi,$T1
	pshufd	\$0b01001110,$Hkey,$T2
	pxor	$Xi,$T1			#
	pxor	$Hkey,$T2
___
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$T2,$T1		#######
	pxor	$Xi,$T1			#
	pxor	$Xhi,$T1		#

	movdqa	$T1,$T2			#
	psrldq	\$8,$T1
	pslldq	\$8,$T2			#
	pxor	$T1,$Xhi
	pxor	$T2,$Xi			#
___
}
377 | |||
# reduction_alg9 - emit the two-phase Montgomery-style reduction of the
# 256-bit product ($Xhi:$Xi) modulo the GHASH polynomial, folding the
# result back into $Xi.  Clobbers $T1/$T2.
sub reduction_alg9 {	# 17/13 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa	$Xi,$T1			#
	psllq	\$1,$Xi
	pxor	$T1,$Xi			#
	psllq	\$5,$Xi			#
	pxor	$T1,$Xi			#
	psllq	\$57,$Xi		#
	movdqa	$Xi,$T2			#
	pslldq	\$8,$Xi
	psrldq	\$8,$T2			#
	pxor	$T1,$Xi
	pxor	$T2,$Xhi		#

	# 2nd phase
	movdqa	$Xi,$T2
	psrlq	\$5,$Xi
	pxor	$T2,$Xi			#
	psrlq	\$1,$Xi			#
	pxor	$T2,$Xi			#
	pxor	$Xhi,$T2
	psrlq	\$1,$Xi			#
	pxor	$T2,$Xi			#
___
}
406 | |||
# Emit gcm_init_clmul(u128 Htbl[], const u64 Xi[2]): load the hash key,
# apply the <<1 "twist" (with conditional reduction by 0x1c2<<120 when
# the carry bit is set), compute H^2, and store H and H^2 in Htbl for
# use by the ghash routines.
{ my ($Htbl,$Xip)=@_4args;

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
	movdqu	($Xip),$Hkey
	pshufd	\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd	\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa	$Hkey,$T1
	psllq	\$1,$Hkey
	pxor	$T3,$T3			#
	psrlq	\$63,$T1
	pcmpgtd	$T2,$T3			# broadcast carry bit
	pslldq	\$8,$T1
	por	$T1,$Hkey		# H<<=1

	# magic reduction
	pand	.L0x1c2_polynomial(%rip),$T3
	pxor	$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	movdqa	$Hkey,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqu	$Hkey,($Htbl)		# save H
	movdqu	$Xi,16($Htbl)		# save H^2
	ret
.size	gcm_init_clmul,.-gcm_init_clmul
___
}
443 | |||
# Emit gcm_gmult_clmul(u64 Xi[2], const u128 Htbl[]): one GHASH
# multiplication Xi = Xi*H via PCLMULQDQ, with byte-swaps on load and
# store (GHASH is big-endian, the SSE registers are little-endian).
{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
	movdqu	($Xip),$Xi
	movdqa	.Lbswap_mask(%rip),$T3
	movdqu	($Htbl),$Hkey
	pshufb	$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufb	$T3,$Xi
	movdqu	$Xi,($Xip)
	ret
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}
465 | |||
# Emit gcm_ghash_clmul(u64 Xi[2], const u128 Htbl[], const u8 *inp,
# size_t len): streamed GHASH processing two blocks per iteration
# using H and H^2 (loaded from Htbl), with the reduction of one pair
# interleaved with the multiplication of the next.  On Win64 the
# prologue/epilogue save/restore xmm6-xmm10; the prologue is emitted
# as raw .byte so the SEH unwind info below matches the exact encoding.
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my $Xn="%xmm6";
  my $Xhn="%xmm7";
  my $Hkey2="%xmm8";
  my $T1n="%xmm9";
  my $T2n="%xmm10";

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	16
gcm_ghash_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_ghash_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x58		#sub	\$0x58,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20	#movaps	%xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30	#movaps	%xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40	#movaps	%xmm10,0x40(%rsp)
___
$code.=<<___;
	movdqa	.Lbswap_mask(%rip),$T3

	movdqu	($Xip),$Xi
	movdqu	($Htbl),$Hkey
	pshufb	$T3,$Xi

	sub	\$0x10,$len
	jz	.Lodd_tail

	movdqu	16($Htbl),$Hkey2
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu	($inp),$T1		# Ii
	movdqu	16($inp),$Xn		# Ii+1
	pshufb	$T3,$T1
	pshufb	$T3,$Xn
	pxor	$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
$code.=<<___;
	movdqa	$Xi,$Xhi		#
	pshufd	\$0b01001110,$Xi,$T1
	pshufd	\$0b01001110,$Hkey2,$T2
	pxor	$Xi,$T1			#
	pxor	$Hkey2,$T2

	lea	32($inp),$inp		# i+=2
	sub	\$0x20,$len
	jbe	.Leven_tail

.Lmod_loop:
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
$code.=<<___;
	movdqu	($inp),$T1		# Ii
	pxor	$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
	pxor	$Xhn,$Xhi

	movdqu	16($inp),$Xn		# Ii+1
	pshufb	$T3,$T1
	pshufb	$T3,$Xn

	movdqa	$Xn,$Xhn		#
	pshufd	\$0b01001110,$Xn,$T1n
	pshufd	\$0b01001110,$Hkey,$T2n
	pxor	$Xn,$T1n		#
	pxor	$Hkey,$T2n
	pxor	$T1,$Xhi		# "Ii+Xi", consume early

	movdqa	$Xi,$T1			# 1st phase
	psllq	\$1,$Xi
	pxor	$T1,$Xi			#
	psllq	\$5,$Xi			#
	pxor	$T1,$Xi			#
	pclmulqdq	\$0x00,$Hkey,$Xn	#######
	psllq	\$57,$Xi		#
	movdqa	$Xi,$T2			#
	pslldq	\$8,$Xi
	psrldq	\$8,$T2			#
	pxor	$T1,$Xi
	pxor	$T2,$Xhi		#

	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	movdqa	$Xi,$T2			# 2nd phase
	psrlq	\$5,$Xi
	pxor	$T2,$Xi			#
	psrlq	\$1,$Xi			#
	pxor	$T2,$Xi			#
	pxor	$Xhi,$T2
	psrlq	\$1,$Xi			#
	pxor	$T2,$Xi			#

	pclmulqdq	\$0x00,$T2n,$T1n	#######
	movdqa	$Xi,$Xhi		#
	pshufd	\$0b01001110,$Xi,$T1
	pshufd	\$0b01001110,$Hkey2,$T2
	pxor	$Xi,$T1			#
	pxor	$Hkey2,$T2

	pxor	$Xn,$T1n		#
	pxor	$Xhn,$T1n		#
	movdqa	$T1n,$T2n		#
	psrldq	\$8,$T1n
	pslldq	\$8,$T2n		#
	pxor	$T1n,$Xhn
	pxor	$T2n,$Xn		#

	lea	32($inp),$inp
	sub	\$0x20,$len
	ja	.Lmod_loop

.Leven_tail:
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
$code.=<<___;
	pxor	$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
	pxor	$Xhn,$Xhi
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test	$len,$len
	jnz	.Ldone

.Lodd_tail:
	movdqu	($inp),$T1		# Ii
	pshufb	$T3,$T1
	pxor	$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb	$T3,$Xi
	movdqu	$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	add	\$0x58,%rsp
___
$code.=<<___;
	ret
.LSEH_end_gcm_ghash_clmul:
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
622 | |||
# Emit the shared constant tables: the byte-swap shuffle mask, the GHASH
# polynomial constant used by gcm_init_clmul's "magic reduction", and
# the .Lrem_4bit / .Lrem_8bit reduction tables used by the 4-bit code.
# The `...` shift expressions are expanded by the final eval pass.
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.align	64
.type	.Lrem_4bit,\@object
.Lrem_4bit:
	.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
	.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
	.long	0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
	.long	0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
.type	.Lrem_8bit,\@object
.Lrem_8bit:
	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
674 | |||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception-handling support: emit se_handler (which
# restores %rbx/%rbp/%r12 and the stack pointer when an exception hits
# inside the 4-bit functions' prologue..epilogue window) plus the
# .pdata/.xdata unwind tables for all three public entry points.
# gcm_ghash_clmul uses packed UNWIND_INFO opcodes (the second
# .LSEH_info block) instead of se_handler, mirroring the hand-encoded
# .byte prologue above.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	24(%rax),%rax		# adjust "rsp"

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_gcm_gmult_4bit
	.rva	.LSEH_end_gcm_gmult_4bit
	.rva	.LSEH_info_gcm_gmult_4bit

	.rva	.LSEH_begin_gcm_ghash_4bit
	.rva	.LSEH_end_gcm_ghash_4bit
	.rva	.LSEH_info_gcm_ghash_4bit

	.rva	.LSEH_begin_gcm_ghash_clmul
	.rva	.LSEH_end_gcm_ghash_clmul
	.rva	.LSEH_info_gcm_ghash_clmul

.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
.LSEH_info_gcm_ghash_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
.LSEH_info_gcm_ghash_clmul:
	.byte	0x01,0x1f,0x0b,0x00
	.byte	0x1f,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x19,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x13,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0d,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps (rsp),xmm6
	.byte	0x04,0xa2,0x00,0x00	#sub rsp,0x58
___
}
800 | |||
# Expand the `...` constructs that remain in $code (register-alias
# helpers and constant arithmetic such as `0x1C20<<16`) via eval, then
# emit everything to STDOUT, which is a pipe into x86_64-xlate.pl.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Detect a failed/dying translator or a buffered write error: close()
# on a write pipe is where such failures surface, and the original
# ignored them, which could leave a silently truncated output file.
close STDOUT or die "error closing STDOUT: $!";
diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c index 8f8bd563b9..3d3782cbe1 100644 --- a/src/lib/libcrypto/modes/cbc128.c +++ b/src/lib/libcrypto/modes/cbc128.c | |||
@@ -48,7 +48,8 @@ | |||
48 | * | 48 | * |
49 | */ | 49 | */ |
50 | 50 | ||
51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
52 | #include "modes_lcl.h" | ||
52 | #include <string.h> | 53 | #include <string.h> |
53 | 54 | ||
54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
@@ -58,12 +59,7 @@ | |||
58 | #endif | 59 | #endif |
59 | #include <assert.h> | 60 | #include <assert.h> |
60 | 61 | ||
61 | #define STRICT_ALIGNMENT 1 | 62 | #ifndef STRICT_ALIGNMENT |
62 | #if defined(__i386) || defined(__i386__) || \ | ||
63 | defined(__x86_64) || defined(__x86_64__) || \ | ||
64 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
65 | defined(__s390__) || defined(__s390x__) | ||
66 | # undef STRICT_ALIGNMENT | ||
67 | # define STRICT_ALIGNMENT 0 | 63 | # define STRICT_ALIGNMENT 0 |
68 | #endif | 64 | #endif |
69 | 65 | ||
diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c new file mode 100644 index 0000000000..c9b35e5b35 --- /dev/null +++ b/src/lib/libcrypto/modes/ccm128.c | |||
@@ -0,0 +1,441 @@ | |||
1 | /* ==================================================================== | ||
2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
3 | * | ||
4 | * Redistribution and use in source and binary forms, with or without | ||
5 | * modification, are permitted provided that the following conditions | ||
6 | * are met: | ||
7 | * | ||
8 | * 1. Redistributions of source code must retain the above copyright | ||
9 | * notice, this list of conditions and the following disclaimer. | ||
10 | * | ||
11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
12 | * notice, this list of conditions and the following disclaimer in | ||
13 | * the documentation and/or other materials provided with the | ||
14 | * distribution. | ||
15 | * | ||
16 | * 3. All advertising materials mentioning features or use of this | ||
17 | * software must display the following acknowledgment: | ||
18 | * "This product includes software developed by the OpenSSL Project | ||
19 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
20 | * | ||
21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
22 | * endorse or promote products derived from this software without | ||
23 | * prior written permission. For written permission, please contact | ||
24 | * openssl-core@openssl.org. | ||
25 | * | ||
26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
27 | * nor may "OpenSSL" appear in their names without prior written | ||
28 | * permission of the OpenSSL Project. | ||
29 | * | ||
30 | * 6. Redistributions of any form whatsoever must retain the following | ||
31 | * acknowledgment: | ||
32 | * "This product includes software developed by the OpenSSL Project | ||
33 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
34 | * | ||
35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
47 | * ==================================================================== | ||
48 | */ | ||
49 | |||
50 | #include <openssl/crypto.h> | ||
51 | #include "modes_lcl.h" | ||
52 | #include <string.h> | ||
53 | |||
54 | #ifndef MODES_DEBUG | ||
55 | # ifndef NDEBUG | ||
56 | # define NDEBUG | ||
57 | # endif | ||
58 | #endif | ||
59 | #include <assert.h> | ||
60 | |||
61 | /* First you setup M and L parameters and pass the key schedule. | ||
62 | * This is called once per session setup... */ | ||
63 | void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, | ||
64 | unsigned int M,unsigned int L,void *key,block128_f block) | ||
65 | { | ||
66 | memset(ctx->nonce.c,0,sizeof(ctx->nonce.c)); | ||
67 | ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3; | ||
68 | ctx->blocks = 0; | ||
69 | ctx->block = block; | ||
70 | ctx->key = key; | ||
71 | } | ||
72 | |||
73 | /* !!! Following interfaces are to be called *once* per packet !!! */ | ||
74 | |||
75 | /* Then you setup per-message nonce and pass the length of the message */ | ||
76 | int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, | ||
77 | const unsigned char *nonce,size_t nlen,size_t mlen) | ||
78 | { | ||
79 | unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */ | ||
80 | |||
81 | if (nlen<(14-L)) return -1; /* nonce is too short */ | ||
82 | |||
83 | if (sizeof(mlen)==8 && L>=3) { | ||
84 | ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8))); | ||
85 | ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8))); | ||
86 | ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8))); | ||
87 | ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8))); | ||
88 | } | ||
89 | else | ||
90 | *(u32*)(&ctx->nonce.c[8]) = 0; | ||
91 | |||
92 | ctx->nonce.c[12] = (u8)(mlen>>24); | ||
93 | ctx->nonce.c[13] = (u8)(mlen>>16); | ||
94 | ctx->nonce.c[14] = (u8)(mlen>>8); | ||
95 | ctx->nonce.c[15] = (u8)mlen; | ||
96 | |||
97 | ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */ | ||
98 | memcpy(&ctx->nonce.c[1],nonce,14-L); | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | /* Then you pass additional authentication data, this is optional */ | ||
104 | void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, | ||
105 | const unsigned char *aad,size_t alen) | ||
106 | { unsigned int i; | ||
107 | block128_f block = ctx->block; | ||
108 | |||
109 | if (alen==0) return; | ||
110 | |||
111 | ctx->nonce.c[0] |= 0x40; /* set Adata flag */ | ||
112 | (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key), | ||
113 | ctx->blocks++; | ||
114 | |||
115 | if (alen<(0x10000-0x100)) { | ||
116 | ctx->cmac.c[0] ^= (u8)(alen>>8); | ||
117 | ctx->cmac.c[1] ^= (u8)alen; | ||
118 | i=2; | ||
119 | } | ||
120 | else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) { | ||
121 | ctx->cmac.c[0] ^= 0xFF; | ||
122 | ctx->cmac.c[1] ^= 0xFF; | ||
123 | ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8))); | ||
124 | ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8))); | ||
125 | ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8))); | ||
126 | ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8))); | ||
127 | ctx->cmac.c[6] ^= (u8)(alen>>24); | ||
128 | ctx->cmac.c[7] ^= (u8)(alen>>16); | ||
129 | ctx->cmac.c[8] ^= (u8)(alen>>8); | ||
130 | ctx->cmac.c[9] ^= (u8)alen; | ||
131 | i=10; | ||
132 | } | ||
133 | else { | ||
134 | ctx->cmac.c[0] ^= 0xFF; | ||
135 | ctx->cmac.c[1] ^= 0xFE; | ||
136 | ctx->cmac.c[2] ^= (u8)(alen>>24); | ||
137 | ctx->cmac.c[3] ^= (u8)(alen>>16); | ||
138 | ctx->cmac.c[4] ^= (u8)(alen>>8); | ||
139 | ctx->cmac.c[5] ^= (u8)alen; | ||
140 | i=6; | ||
141 | } | ||
142 | |||
143 | do { | ||
144 | for(;i<16 && alen;++i,++aad,--alen) | ||
145 | ctx->cmac.c[i] ^= *aad; | ||
146 | (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key), | ||
147 | ctx->blocks++; | ||
148 | i=0; | ||
149 | } while (alen); | ||
150 | } | ||
151 | |||
152 | /* Finally you encrypt or decrypt the message */ | ||
153 | |||
154 | /* counter part of nonce may not be larger than L*8 bits, | ||
155 | * L is not larger than 8, therefore 64-bit counter... */ | ||
156 | static void ctr64_inc(unsigned char *counter) { | ||
157 | unsigned int n=8; | ||
158 | u8 c; | ||
159 | |||
160 | counter += 8; | ||
161 | do { | ||
162 | --n; | ||
163 | c = counter[n]; | ||
164 | ++c; | ||
165 | counter[n] = c; | ||
166 | if (c) return; | ||
167 | } while (n); | ||
168 | } | ||
169 | |||
170 | int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, | ||
171 | const unsigned char *inp, unsigned char *out, | ||
172 | size_t len) | ||
173 | { | ||
174 | size_t n; | ||
175 | unsigned int i,L; | ||
176 | unsigned char flags0 = ctx->nonce.c[0]; | ||
177 | block128_f block = ctx->block; | ||
178 | void * key = ctx->key; | ||
179 | union { u64 u[2]; u8 c[16]; } scratch; | ||
180 | |||
181 | if (!(flags0&0x40)) | ||
182 | (*block)(ctx->nonce.c,ctx->cmac.c,key), | ||
183 | ctx->blocks++; | ||
184 | |||
185 | ctx->nonce.c[0] = L = flags0&7; | ||
186 | for (n=0,i=15-L;i<15;++i) { | ||
187 | n |= ctx->nonce.c[i]; | ||
188 | ctx->nonce.c[i]=0; | ||
189 | n <<= 8; | ||
190 | } | ||
191 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
192 | ctx->nonce.c[15]=1; | ||
193 | |||
194 | if (n!=len) return -1; /* length mismatch */ | ||
195 | |||
196 | ctx->blocks += ((len+15)>>3)|1; | ||
197 | if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */ | ||
198 | |||
199 | while (len>=16) { | ||
200 | #if defined(STRICT_ALIGNMENT) | ||
201 | union { u64 u[2]; u8 c[16]; } temp; | ||
202 | |||
203 | memcpy (temp.c,inp,16); | ||
204 | ctx->cmac.u[0] ^= temp.u[0]; | ||
205 | ctx->cmac.u[1] ^= temp.u[1]; | ||
206 | #else | ||
207 | ctx->cmac.u[0] ^= ((u64*)inp)[0]; | ||
208 | ctx->cmac.u[1] ^= ((u64*)inp)[1]; | ||
209 | #endif | ||
210 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
211 | (*block)(ctx->nonce.c,scratch.c,key); | ||
212 | ctr64_inc(ctx->nonce.c); | ||
213 | #if defined(STRICT_ALIGNMENT) | ||
214 | temp.u[0] ^= scratch.u[0]; | ||
215 | temp.u[1] ^= scratch.u[1]; | ||
216 | memcpy(out,temp.c,16); | ||
217 | #else | ||
218 | ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]; | ||
219 | ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]; | ||
220 | #endif | ||
221 | inp += 16; | ||
222 | out += 16; | ||
223 | len -= 16; | ||
224 | } | ||
225 | |||
226 | if (len) { | ||
227 | for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; | ||
228 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
229 | (*block)(ctx->nonce.c,scratch.c,key); | ||
230 | for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i]; | ||
231 | } | ||
232 | |||
233 | for (i=15-L;i<16;++i) | ||
234 | ctx->nonce.c[i]=0; | ||
235 | |||
236 | (*block)(ctx->nonce.c,scratch.c,key); | ||
237 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
238 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
239 | |||
240 | ctx->nonce.c[0] = flags0; | ||
241 | |||
242 | return 0; | ||
243 | } | ||
244 | |||
245 | int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, | ||
246 | const unsigned char *inp, unsigned char *out, | ||
247 | size_t len) | ||
248 | { | ||
249 | size_t n; | ||
250 | unsigned int i,L; | ||
251 | unsigned char flags0 = ctx->nonce.c[0]; | ||
252 | block128_f block = ctx->block; | ||
253 | void * key = ctx->key; | ||
254 | union { u64 u[2]; u8 c[16]; } scratch; | ||
255 | |||
256 | if (!(flags0&0x40)) | ||
257 | (*block)(ctx->nonce.c,ctx->cmac.c,key); | ||
258 | |||
259 | ctx->nonce.c[0] = L = flags0&7; | ||
260 | for (n=0,i=15-L;i<15;++i) { | ||
261 | n |= ctx->nonce.c[i]; | ||
262 | ctx->nonce.c[i]=0; | ||
263 | n <<= 8; | ||
264 | } | ||
265 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
266 | ctx->nonce.c[15]=1; | ||
267 | |||
268 | if (n!=len) return -1; | ||
269 | |||
270 | while (len>=16) { | ||
271 | #if defined(STRICT_ALIGNMENT) | ||
272 | union { u64 u[2]; u8 c[16]; } temp; | ||
273 | #endif | ||
274 | (*block)(ctx->nonce.c,scratch.c,key); | ||
275 | ctr64_inc(ctx->nonce.c); | ||
276 | #if defined(STRICT_ALIGNMENT) | ||
277 | memcpy (temp.c,inp,16); | ||
278 | ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]); | ||
279 | ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]); | ||
280 | memcpy (out,scratch.c,16); | ||
281 | #else | ||
282 | ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]); | ||
283 | ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]); | ||
284 | #endif | ||
285 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
286 | |||
287 | inp += 16; | ||
288 | out += 16; | ||
289 | len -= 16; | ||
290 | } | ||
291 | |||
292 | if (len) { | ||
293 | (*block)(ctx->nonce.c,scratch.c,key); | ||
294 | for (i=0; i<len; ++i) | ||
295 | ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); | ||
296 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
297 | } | ||
298 | |||
299 | for (i=15-L;i<16;++i) | ||
300 | ctx->nonce.c[i]=0; | ||
301 | |||
302 | (*block)(ctx->nonce.c,scratch.c,key); | ||
303 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
304 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
305 | |||
306 | ctx->nonce.c[0] = flags0; | ||
307 | |||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | static void ctr64_add (unsigned char *counter,size_t inc) | ||
312 | { size_t n=8, val=0; | ||
313 | |||
314 | counter += 8; | ||
315 | do { | ||
316 | --n; | ||
317 | val += counter[n] + (inc&0xff); | ||
318 | counter[n] = (unsigned char)val; | ||
319 | val >>= 8; /* carry bit */ | ||
320 | inc >>= 8; | ||
321 | } while(n && (inc || val)); | ||
322 | } | ||
323 | |||
324 | int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, | ||
325 | const unsigned char *inp, unsigned char *out, | ||
326 | size_t len,ccm128_f stream) | ||
327 | { | ||
328 | size_t n; | ||
329 | unsigned int i,L; | ||
330 | unsigned char flags0 = ctx->nonce.c[0]; | ||
331 | block128_f block = ctx->block; | ||
332 | void * key = ctx->key; | ||
333 | union { u64 u[2]; u8 c[16]; } scratch; | ||
334 | |||
335 | if (!(flags0&0x40)) | ||
336 | (*block)(ctx->nonce.c,ctx->cmac.c,key), | ||
337 | ctx->blocks++; | ||
338 | |||
339 | ctx->nonce.c[0] = L = flags0&7; | ||
340 | for (n=0,i=15-L;i<15;++i) { | ||
341 | n |= ctx->nonce.c[i]; | ||
342 | ctx->nonce.c[i]=0; | ||
343 | n <<= 8; | ||
344 | } | ||
345 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
346 | ctx->nonce.c[15]=1; | ||
347 | |||
348 | if (n!=len) return -1; /* length mismatch */ | ||
349 | |||
350 | ctx->blocks += ((len+15)>>3)|1; | ||
351 | if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */ | ||
352 | |||
353 | if ((n=len/16)) { | ||
354 | (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); | ||
355 | n *= 16; | ||
356 | inp += n; | ||
357 | out += n; | ||
358 | len -= n; | ||
359 | if (len) ctr64_add(ctx->nonce.c,n/16); | ||
360 | } | ||
361 | |||
362 | if (len) { | ||
363 | for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; | ||
364 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
365 | (*block)(ctx->nonce.c,scratch.c,key); | ||
366 | for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i]; | ||
367 | } | ||
368 | |||
369 | for (i=15-L;i<16;++i) | ||
370 | ctx->nonce.c[i]=0; | ||
371 | |||
372 | (*block)(ctx->nonce.c,scratch.c,key); | ||
373 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
374 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
375 | |||
376 | ctx->nonce.c[0] = flags0; | ||
377 | |||
378 | return 0; | ||
379 | } | ||
380 | |||
381 | int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, | ||
382 | const unsigned char *inp, unsigned char *out, | ||
383 | size_t len,ccm128_f stream) | ||
384 | { | ||
385 | size_t n; | ||
386 | unsigned int i,L; | ||
387 | unsigned char flags0 = ctx->nonce.c[0]; | ||
388 | block128_f block = ctx->block; | ||
389 | void * key = ctx->key; | ||
390 | union { u64 u[2]; u8 c[16]; } scratch; | ||
391 | |||
392 | if (!(flags0&0x40)) | ||
393 | (*block)(ctx->nonce.c,ctx->cmac.c,key); | ||
394 | |||
395 | ctx->nonce.c[0] = L = flags0&7; | ||
396 | for (n=0,i=15-L;i<15;++i) { | ||
397 | n |= ctx->nonce.c[i]; | ||
398 | ctx->nonce.c[i]=0; | ||
399 | n <<= 8; | ||
400 | } | ||
401 | n |= ctx->nonce.c[15]; /* reconstructed length */ | ||
402 | ctx->nonce.c[15]=1; | ||
403 | |||
404 | if (n!=len) return -1; | ||
405 | |||
406 | if ((n=len/16)) { | ||
407 | (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); | ||
408 | n *= 16; | ||
409 | inp += n; | ||
410 | out += n; | ||
411 | len -= n; | ||
412 | if (len) ctr64_add(ctx->nonce.c,n/16); | ||
413 | } | ||
414 | |||
415 | if (len) { | ||
416 | (*block)(ctx->nonce.c,scratch.c,key); | ||
417 | for (i=0; i<len; ++i) | ||
418 | ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); | ||
419 | (*block)(ctx->cmac.c,ctx->cmac.c,key); | ||
420 | } | ||
421 | |||
422 | for (i=15-L;i<16;++i) | ||
423 | ctx->nonce.c[i]=0; | ||
424 | |||
425 | (*block)(ctx->nonce.c,scratch.c,key); | ||
426 | ctx->cmac.u[0] ^= scratch.u[0]; | ||
427 | ctx->cmac.u[1] ^= scratch.u[1]; | ||
428 | |||
429 | ctx->nonce.c[0] = flags0; | ||
430 | |||
431 | return 0; | ||
432 | } | ||
433 | |||
434 | size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len) | ||
435 | { unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */ | ||
436 | |||
437 | M *= 2; M += 2; | ||
438 | if (len<M) return 0; | ||
439 | memcpy(tag,ctx->cmac.c,M); | ||
440 | return M; | ||
441 | } | ||
diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c index e5938c6137..4e6f5d35e1 100644 --- a/src/lib/libcrypto/modes/cfb128.c +++ b/src/lib/libcrypto/modes/cfb128.c | |||
@@ -48,7 +48,8 @@ | |||
48 | * | 48 | * |
49 | */ | 49 | */ |
50 | 50 | ||
51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
52 | #include "modes_lcl.h" | ||
52 | #include <string.h> | 53 | #include <string.h> |
53 | 54 | ||
54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
@@ -58,14 +59,6 @@ | |||
58 | #endif | 59 | #endif |
59 | #include <assert.h> | 60 | #include <assert.h> |
60 | 61 | ||
61 | #define STRICT_ALIGNMENT | ||
62 | #if defined(__i386) || defined(__i386__) || \ | ||
63 | defined(__x86_64) || defined(__x86_64__) || \ | ||
64 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
65 | defined(__s390__) || defined(__s390x__) | ||
66 | # undef STRICT_ALIGNMENT | ||
67 | #endif | ||
68 | |||
69 | /* The input and output encrypted as though 128bit cfb mode is being | 62 | /* The input and output encrypted as though 128bit cfb mode is being |
70 | * used. The extra state information to record how much of the | 63 | * used. The extra state information to record how much of the |
71 | * 128bit block we have used is contained in *num; | 64 | * 128bit block we have used is contained in *num; |
diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c index 932037f551..ee642c5863 100644 --- a/src/lib/libcrypto/modes/ctr128.c +++ b/src/lib/libcrypto/modes/ctr128.c | |||
@@ -48,7 +48,8 @@ | |||
48 | * | 48 | * |
49 | */ | 49 | */ |
50 | 50 | ||
51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
52 | #include "modes_lcl.h" | ||
52 | #include <string.h> | 53 | #include <string.h> |
53 | 54 | ||
54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
@@ -58,17 +59,6 @@ | |||
58 | #endif | 59 | #endif |
59 | #include <assert.h> | 60 | #include <assert.h> |
60 | 61 | ||
61 | typedef unsigned int u32; | ||
62 | typedef unsigned char u8; | ||
63 | |||
64 | #define STRICT_ALIGNMENT | ||
65 | #if defined(__i386) || defined(__i386__) || \ | ||
66 | defined(__x86_64) || defined(__x86_64__) || \ | ||
67 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
68 | defined(__s390__) || defined(__s390x__) | ||
69 | # undef STRICT_ALIGNMENT | ||
70 | #endif | ||
71 | |||
72 | /* NOTE: the IV/counter CTR mode is big-endian. The code itself | 62 | /* NOTE: the IV/counter CTR mode is big-endian. The code itself |
73 | * is endian-neutral. */ | 63 | * is endian-neutral. */ |
74 | 64 | ||
@@ -182,3 +172,81 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, | |||
182 | 172 | ||
183 | *num=n; | 173 | *num=n; |
184 | } | 174 | } |
175 | |||
176 | /* increment upper 96 bits of 128-bit counter by 1 */ | ||
177 | static void ctr96_inc(unsigned char *counter) { | ||
178 | u32 n=12; | ||
179 | u8 c; | ||
180 | |||
181 | do { | ||
182 | --n; | ||
183 | c = counter[n]; | ||
184 | ++c; | ||
185 | counter[n] = c; | ||
186 | if (c) return; | ||
187 | } while (n); | ||
188 | } | ||
189 | |||
190 | void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, | ||
191 | size_t len, const void *key, | ||
192 | unsigned char ivec[16], unsigned char ecount_buf[16], | ||
193 | unsigned int *num, ctr128_f func) | ||
194 | { | ||
195 | unsigned int n,ctr32; | ||
196 | |||
197 | assert(in && out && key && ecount_buf && num); | ||
198 | assert(*num < 16); | ||
199 | |||
200 | n = *num; | ||
201 | |||
202 | while (n && len) { | ||
203 | *(out++) = *(in++) ^ ecount_buf[n]; | ||
204 | --len; | ||
205 | n = (n+1) % 16; | ||
206 | } | ||
207 | |||
208 | ctr32 = GETU32(ivec+12); | ||
209 | while (len>=16) { | ||
210 | size_t blocks = len/16; | ||
211 | /* | ||
212 | * 1<<28 is just a not-so-small yet not-so-large number... | ||
213 | * Below condition is practically never met, but it has to | ||
214 | * be checked for code correctness. | ||
215 | */ | ||
216 | if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28)) | ||
217 | blocks = (1U<<28); | ||
218 | /* | ||
219 | * As (*func) operates on 32-bit counter, caller | ||
220 | * has to handle overflow. 'if' below detects the | ||
221 | * overflow, which is then handled by limiting the | ||
222 | * amount of blocks to the exact overflow point... | ||
223 | */ | ||
224 | ctr32 += (u32)blocks; | ||
225 | if (ctr32 < blocks) { | ||
226 | blocks -= ctr32; | ||
227 | ctr32 = 0; | ||
228 | } | ||
229 | (*func)(in,out,blocks,key,ivec); | ||
230 | /* (*ctr) does not update ivec, caller does: */ | ||
231 | PUTU32(ivec+12,ctr32); | ||
232 | /* ... overflow was detected, propogate carry. */ | ||
233 | if (ctr32 == 0) ctr96_inc(ivec); | ||
234 | blocks *= 16; | ||
235 | len -= blocks; | ||
236 | out += blocks; | ||
237 | in += blocks; | ||
238 | } | ||
239 | if (len) { | ||
240 | memset(ecount_buf,0,16); | ||
241 | (*func)(ecount_buf,ecount_buf,1,key,ivec); | ||
242 | ++ctr32; | ||
243 | PUTU32(ivec+12,ctr32); | ||
244 | if (ctr32 == 0) ctr96_inc(ivec); | ||
245 | while (len--) { | ||
246 | out[n] = in[n] ^ ecount_buf[n]; | ||
247 | ++n; | ||
248 | } | ||
249 | } | ||
250 | |||
251 | *num=n; | ||
252 | } | ||
diff --git a/src/lib/libcrypto/modes/cts128.c b/src/lib/libcrypto/modes/cts128.c index e0430f9fdc..c0e1f3696c 100644 --- a/src/lib/libcrypto/modes/cts128.c +++ b/src/lib/libcrypto/modes/cts128.c | |||
@@ -5,7 +5,8 @@ | |||
5 | * forms are granted according to the OpenSSL license. | 5 | * forms are granted according to the OpenSSL license. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "modes.h" | 8 | #include <openssl/crypto.h> |
9 | #include "modes_lcl.h" | ||
9 | #include <string.h> | 10 | #include <string.h> |
10 | 11 | ||
11 | #ifndef MODES_DEBUG | 12 | #ifndef MODES_DEBUG |
@@ -23,8 +24,9 @@ | |||
23 | * deviates from mentioned RFCs. Most notably it allows input to be | 24 | * deviates from mentioned RFCs. Most notably it allows input to be |
24 | * of block length and it doesn't flip the order of the last two | 25 | * of block length and it doesn't flip the order of the last two |
25 | * blocks. CTS is being discussed even in ECB context, but it's not | 26 | * blocks. CTS is being discussed even in ECB context, but it's not |
26 | * adopted for any known application. This implementation complies | 27 | * adopted for any known application. This implementation provides |
27 | * with mentioned RFCs and [as such] extends CBC mode. | 28 | * two interfaces: one compliant with above mentioned RFCs and one |
29 | * compliant with the NIST proposal, both extending CBC mode. | ||
28 | */ | 30 | */ |
29 | 31 | ||
30 | size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, | 32 | size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, |
@@ -54,6 +56,34 @@ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, | |||
54 | return len+residue; | 56 | return len+residue; |
55 | } | 57 | } |
56 | 58 | ||
59 | size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, | ||
60 | size_t len, const void *key, | ||
61 | unsigned char ivec[16], block128_f block) | ||
62 | { size_t residue, n; | ||
63 | |||
64 | assert (in && out && key && ivec); | ||
65 | |||
66 | if (len < 16) return 0; | ||
67 | |||
68 | residue=len%16; | ||
69 | |||
70 | len -= residue; | ||
71 | |||
72 | CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block); | ||
73 | |||
74 | if (residue==0) return len; | ||
75 | |||
76 | in += len; | ||
77 | out += len; | ||
78 | |||
79 | for (n=0; n<residue; ++n) | ||
80 | ivec[n] ^= in[n]; | ||
81 | (*block)(ivec,ivec,key); | ||
82 | memcpy(out-16+residue,ivec,16); | ||
83 | |||
84 | return len+residue; | ||
85 | } | ||
86 | |||
57 | size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, | 87 | size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, |
58 | size_t len, const void *key, | 88 | size_t len, const void *key, |
59 | unsigned char ivec[16], cbc128_f cbc) | 89 | unsigned char ivec[16], cbc128_f cbc) |
@@ -90,6 +120,41 @@ size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out, | |||
90 | return len+residue; | 120 | return len+residue; |
91 | } | 121 | } |
92 | 122 | ||
123 | size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, | ||
124 | size_t len, const void *key, | ||
125 | unsigned char ivec[16], cbc128_f cbc) | ||
126 | { size_t residue; | ||
127 | union { size_t align; unsigned char c[16]; } tmp; | ||
128 | |||
129 | assert (in && out && key && ivec); | ||
130 | |||
131 | if (len < 16) return 0; | ||
132 | |||
133 | residue=len%16; | ||
134 | |||
135 | len -= residue; | ||
136 | |||
137 | (*cbc)(in,out,len,key,ivec,1); | ||
138 | |||
139 | if (residue==0) return len; | ||
140 | |||
141 | in += len; | ||
142 | out += len; | ||
143 | |||
144 | #if defined(CBC_HANDLES_TRUNCATED_IO) | ||
145 | (*cbc)(in,out-16+residue,residue,key,ivec,1); | ||
146 | #else | ||
147 | { | ||
148 | size_t n; | ||
149 | for (n=0; n<16; n+=sizeof(size_t)) | ||
150 | *(size_t *)(tmp.c+n) = 0; | ||
151 | memcpy(tmp.c,in,residue); | ||
152 | } | ||
153 | (*cbc)(tmp.c,out-16+residue,16,key,ivec,1); | ||
154 | #endif | ||
155 | return len+residue; | ||
156 | } | ||
157 | |||
93 | size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, | 158 | size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, |
94 | size_t len, const void *key, | 159 | size_t len, const void *key, |
95 | unsigned char ivec[16], block128_f block) | 160 | unsigned char ivec[16], block128_f block) |
@@ -125,7 +190,51 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, | |||
125 | for(residue+=16; n<residue; ++n) | 190 | for(residue+=16; n<residue; ++n) |
126 | out[n] = tmp.c[n] ^ in[n]; | 191 | out[n] = tmp.c[n] ^ in[n]; |
127 | 192 | ||
128 | return len+residue-16; | 193 | return 16+len+residue; |
194 | } | ||
195 | |||
196 | size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, | ||
197 | size_t len, const void *key, | ||
198 | unsigned char ivec[16], block128_f block) | ||
199 | { size_t residue, n; | ||
200 | union { size_t align; unsigned char c[32]; } tmp; | ||
201 | |||
202 | assert (in && out && key && ivec); | ||
203 | |||
204 | if (len<16) return 0; | ||
205 | |||
206 | residue=len%16; | ||
207 | |||
208 | if (residue==0) { | ||
209 | CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); | ||
210 | return len; | ||
211 | } | ||
212 | |||
213 | len -= 16+residue; | ||
214 | |||
215 | if (len) { | ||
216 | CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block); | ||
217 | in += len; | ||
218 | out += len; | ||
219 | } | ||
220 | |||
221 | (*block)(in+residue,tmp.c+16,key); | ||
222 | |||
223 | for (n=0; n<16; n+=sizeof(size_t)) | ||
224 | *(size_t *)(tmp.c+n) = *(size_t *)(tmp.c+16+n); | ||
225 | memcpy(tmp.c,in,residue); | ||
226 | (*block)(tmp.c,tmp.c,key); | ||
227 | |||
228 | for(n=0; n<16; ++n) { | ||
229 | unsigned char c = in[n]; | ||
230 | out[n] = tmp.c[n] ^ ivec[n]; | ||
231 | ivec[n] = in[n+residue]; | ||
232 | tmp.c[n] = c; | ||
233 | } | ||
234 | for(residue+=16; n<residue; ++n) | ||
235 | out[n] = tmp.c[n] ^ tmp.c[n-16]; | ||
236 | |||
237 | return 16+len+residue; | ||
129 | } | 238 | } |
130 | 239 | ||
131 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, | 240 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, |
@@ -160,7 +269,47 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, | |||
160 | (*cbc)(tmp.c,tmp.c,32,key,ivec,0); | 269 | (*cbc)(tmp.c,tmp.c,32,key,ivec,0); |
161 | memcpy(out,tmp.c,16+residue); | 270 | memcpy(out,tmp.c,16+residue); |
162 | #endif | 271 | #endif |
163 | return len+residue; | 272 | return 16+len+residue; |
273 | } | ||
274 | |||
275 | size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, | ||
276 | size_t len, const void *key, | ||
277 | unsigned char ivec[16], cbc128_f cbc) | ||
278 | { size_t residue, n; | ||
279 | union { size_t align; unsigned char c[32]; } tmp; | ||
280 | |||
281 | assert (in && out && key && ivec); | ||
282 | |||
283 | if (len<16) return 0; | ||
284 | |||
285 | residue=len%16; | ||
286 | |||
287 | if (residue==0) { | ||
288 | (*cbc)(in,out,len,key,ivec,0); | ||
289 | return len; | ||
290 | } | ||
291 | |||
292 | len -= 16+residue; | ||
293 | |||
294 | if (len) { | ||
295 | (*cbc)(in,out,len,key,ivec,0); | ||
296 | in += len; | ||
297 | out += len; | ||
298 | } | ||
299 | |||
300 | for (n=16; n<32; n+=sizeof(size_t)) | ||
301 | *(size_t *)(tmp.c+n) = 0; | ||
302 | /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */ | ||
303 | (*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0); | ||
304 | |||
305 | memcpy(tmp.c,in,residue); | ||
306 | #if defined(CBC_HANDLES_TRUNCATED_IO) | ||
307 | (*cbc)(tmp.c,out,16+residue,key,ivec,0); | ||
308 | #else | ||
309 | (*cbc)(tmp.c,tmp.c,32,key,ivec,0); | ||
310 | memcpy(out,tmp.c,16+residue); | ||
311 | #endif | ||
312 | return 16+len+residue; | ||
164 | } | 313 | } |
165 | 314 | ||
166 | #if defined(SELFTEST) | 315 | #if defined(SELFTEST) |
@@ -200,9 +349,8 @@ static const unsigned char vector_64[64] = | |||
200 | static AES_KEY encks, decks; | 349 | static AES_KEY encks, decks; |
201 | 350 | ||
202 | void test_vector(const unsigned char *vector,size_t len) | 351 | void test_vector(const unsigned char *vector,size_t len) |
203 | { unsigned char cleartext[64]; | 352 | { unsigned char iv[sizeof(test_iv)]; |
204 | unsigned char iv[sizeof(test_iv)]; | 353 | unsigned char cleartext[64],ciphertext[64]; |
205 | unsigned char ciphertext[64]; | ||
206 | size_t tail; | 354 | size_t tail; |
207 | 355 | ||
208 | printf("vector_%d\n",len); fflush(stdout); | 356 | printf("vector_%d\n",len); fflush(stdout); |
@@ -243,7 +391,57 @@ void test_vector(const unsigned char *vector,size_t len) | |||
243 | fprintf(stderr,"iv_%d mismatch\n",len), exit(4); | 391 | fprintf(stderr,"iv_%d mismatch\n",len), exit(4); |
244 | } | 392 | } |
245 | 393 | ||
246 | main() | 394 | void test_nistvector(const unsigned char *vector,size_t len) |
395 | { unsigned char iv[sizeof(test_iv)]; | ||
396 | unsigned char cleartext[64],ciphertext[64],nistvector[64]; | ||
397 | size_t tail; | ||
398 | |||
399 | printf("nistvector_%d\n",len); fflush(stdout); | ||
400 | |||
401 | if ((tail=len%16) == 0) tail = 16; | ||
402 | |||
403 | len -= 16 + tail; | ||
404 | memcpy(nistvector,vector,len); | ||
405 | /* flip two last blocks */ | ||
406 | memcpy(nistvector+len,vector+len+16,tail); | ||
407 | memcpy(nistvector+len+tail,vector+len,16); | ||
408 | len += 16 + tail; | ||
409 | tail = 16; | ||
410 | |||
411 | /* test block-based encryption */ | ||
412 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
413 | CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt); | ||
414 | if (memcmp(ciphertext,nistvector,len)) | ||
415 | fprintf(stderr,"output_%d mismatch\n",len), exit(1); | ||
416 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
417 | fprintf(stderr,"iv_%d mismatch\n",len), exit(1); | ||
418 | |||
419 | /* test block-based decryption */ | ||
420 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
421 | CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt); | ||
422 | if (memcmp(cleartext,test_input,len)) | ||
423 | fprintf(stderr,"input_%d mismatch\n",len), exit(2); | ||
424 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
425 | fprintf(stderr,"iv_%d mismatch\n",len), exit(2); | ||
426 | |||
427 | /* test streamed encryption */ | ||
428 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
429 | CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt); | ||
430 | if (memcmp(ciphertext,nistvector,len)) | ||
431 | fprintf(stderr,"output_%d mismatch\n",len), exit(3); | ||
432 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
433 | fprintf(stderr,"iv_%d mismatch\n",len), exit(3); | ||
434 | |||
435 | /* test streamed decryption */ | ||
436 | memcpy(iv,test_iv,sizeof(test_iv)); | ||
437 | CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt); | ||
438 | if (memcmp(cleartext,test_input,len)) | ||
439 | fprintf(stderr,"input_%d mismatch\n",len), exit(4); | ||
440 | if (memcmp(iv,nistvector+len-tail,sizeof(iv))) | ||
441 | fprintf(stderr,"iv_%d mismatch\n",len), exit(4); | ||
442 | } | ||
443 | |||
444 | int main() | ||
247 | { | 445 | { |
248 | AES_set_encrypt_key(test_key,128,&encks); | 446 | AES_set_encrypt_key(test_key,128,&encks); |
249 | AES_set_decrypt_key(test_key,128,&decks); | 447 | AES_set_decrypt_key(test_key,128,&decks); |
@@ -254,6 +452,14 @@ main() | |||
254 | test_vector(vector_47,sizeof(vector_47)); | 452 | test_vector(vector_47,sizeof(vector_47)); |
255 | test_vector(vector_48,sizeof(vector_48)); | 453 | test_vector(vector_48,sizeof(vector_48)); |
256 | test_vector(vector_64,sizeof(vector_64)); | 454 | test_vector(vector_64,sizeof(vector_64)); |
257 | exit(0); | 455 | |
456 | test_nistvector(vector_17,sizeof(vector_17)); | ||
457 | test_nistvector(vector_31,sizeof(vector_31)); | ||
458 | test_nistvector(vector_32,sizeof(vector_32)); | ||
459 | test_nistvector(vector_47,sizeof(vector_47)); | ||
460 | test_nistvector(vector_48,sizeof(vector_48)); | ||
461 | test_nistvector(vector_64,sizeof(vector_64)); | ||
462 | |||
463 | return 0; | ||
258 | } | 464 | } |
259 | #endif | 465 | #endif |
diff --git a/src/lib/libcrypto/modes/gcm128.c b/src/lib/libcrypto/modes/gcm128.c new file mode 100644 index 0000000000..7d6d034970 --- /dev/null +++ b/src/lib/libcrypto/modes/gcm128.c | |||
@@ -0,0 +1,1757 @@ | |||
1 | /* ==================================================================== | ||
2 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
3 | * | ||
4 | * Redistribution and use in source and binary forms, with or without | ||
5 | * modification, are permitted provided that the following conditions | ||
6 | * are met: | ||
7 | * | ||
8 | * 1. Redistributions of source code must retain the above copyright | ||
9 | * notice, this list of conditions and the following disclaimer. | ||
10 | * | ||
11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
12 | * notice, this list of conditions and the following disclaimer in | ||
13 | * the documentation and/or other materials provided with the | ||
14 | * distribution. | ||
15 | * | ||
16 | * 3. All advertising materials mentioning features or use of this | ||
17 | * software must display the following acknowledgment: | ||
18 | * "This product includes software developed by the OpenSSL Project | ||
19 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
20 | * | ||
21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
22 | * endorse or promote products derived from this software without | ||
23 | * prior written permission. For written permission, please contact | ||
24 | * openssl-core@openssl.org. | ||
25 | * | ||
26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
27 | * nor may "OpenSSL" appear in their names without prior written | ||
28 | * permission of the OpenSSL Project. | ||
29 | * | ||
30 | * 6. Redistributions of any form whatsoever must retain the following | ||
31 | * acknowledgment: | ||
32 | * "This product includes software developed by the OpenSSL Project | ||
33 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
34 | * | ||
35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
47 | * ==================================================================== | ||
48 | */ | ||
49 | |||
50 | #define OPENSSL_FIPSAPI | ||
51 | |||
52 | #include <openssl/crypto.h> | ||
53 | #include "modes_lcl.h" | ||
54 | #include <string.h> | ||
55 | |||
56 | #ifndef MODES_DEBUG | ||
57 | # ifndef NDEBUG | ||
58 | # define NDEBUG | ||
59 | # endif | ||
60 | #endif | ||
61 | #include <assert.h> | ||
62 | |||
63 | #if defined(BSWAP4) && defined(STRICT_ALIGNMENT) | ||
64 | /* redefine, because alignment is ensured */ | ||
65 | #undef GETU32 | ||
66 | #define GETU32(p) BSWAP4(*(const u32 *)(p)) | ||
67 | #undef PUTU32 | ||
68 | #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) | ||
69 | #endif | ||
70 | |||
71 | #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16)) | ||
72 | #define REDUCE1BIT(V) do { \ | ||
73 | if (sizeof(size_t)==8) { \ | ||
74 | u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \ | ||
75 | V.lo = (V.hi<<63)|(V.lo>>1); \ | ||
76 | V.hi = (V.hi>>1 )^T; \ | ||
77 | } \ | ||
78 | else { \ | ||
79 | u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \ | ||
80 | V.lo = (V.hi<<63)|(V.lo>>1); \ | ||
81 | V.hi = (V.hi>>1 )^((u64)T<<32); \ | ||
82 | } \ | ||
83 | } while(0) | ||
84 | |||
85 | /* | ||
86 | * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should | ||
87 | * never be set to 8. 8 is effectively reserved for testing purposes. | ||
88 | * TABLE_BITS>1 are lookup-table-driven implementations referred to as | ||
89 | * "Shoup's" in GCM specification. In other words OpenSSL does not cover | ||
90 | * whole spectrum of possible table driven implementations. Why? In | ||
91 | * non-"Shoup's" case memory access pattern is segmented in such manner, | ||
92 | * that it's trivial to see that cache timing information can reveal | ||
93 | * fair portion of intermediate hash value. Given that ciphertext is | ||
94 | * always available to attacker, it's possible for him to attempt to | ||
95 | * deduce secret parameter H and if successful, tamper with messages | ||
96 | * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's | ||
97 | * not as trivial, but there is no reason to believe that it's resistant | ||
98 | * to cache-timing attack. And the thing about "8-bit" implementation is | ||
99 | * that it consumes 16 (sixteen) times more memory, 4KB per individual | ||
100 | * key + 1KB shared. Well, on pros side it should be twice as fast as | ||
101 | * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version | ||
102 | * was observed to run ~75% faster, closer to 100% for commercial | ||
103 | * compilers... Yet "4-bit" procedure is preferred, because it's | ||
104 | * believed to provide better security-performance balance and adequate | ||
105 | * all-round performance. "All-round" refers to things like: | ||
106 | * | ||
107 | * - shorter setup time effectively improves overall timing for | ||
108 | * handling short messages; | ||
109 | * - larger table allocation can become unbearable because of VM | ||
110 | * subsystem penalties (for example on Windows large enough free | ||
111 | * results in VM working set trimming, meaning that consequent | ||
112 | * malloc would immediately incur working set expansion); | ||
113 | * - larger table has larger cache footprint, which can affect | ||
114 | * performance of other code paths (not necessarily even from same | ||
115 | * thread in Hyper-Threading world); | ||
116 | * | ||
117 | * Value of 1 is not appropriate for performance reasons. | ||
118 | */ | ||
119 | #if TABLE_BITS==8 | ||
120 | |||
121 | static void gcm_init_8bit(u128 Htable[256], u64 H[2]) | ||
122 | { | ||
123 | int i, j; | ||
124 | u128 V; | ||
125 | |||
126 | Htable[0].hi = 0; | ||
127 | Htable[0].lo = 0; | ||
128 | V.hi = H[0]; | ||
129 | V.lo = H[1]; | ||
130 | |||
131 | for (Htable[128]=V, i=64; i>0; i>>=1) { | ||
132 | REDUCE1BIT(V); | ||
133 | Htable[i] = V; | ||
134 | } | ||
135 | |||
136 | for (i=2; i<256; i<<=1) { | ||
137 | u128 *Hi = Htable+i, H0 = *Hi; | ||
138 | for (j=1; j<i; ++j) { | ||
139 | Hi[j].hi = H0.hi^Htable[j].hi; | ||
140 | Hi[j].lo = H0.lo^Htable[j].lo; | ||
141 | } | ||
142 | } | ||
143 | } | ||
144 | |||
/*
 * GHASH multiplication, "8-bit Shoup" flavour: Xi = Xi * H, with H
 * represented by the 256-entry table built by gcm_init_8bit().
 * Xi is kept in big-endian (wire) byte order in memory; the product Z
 * is accumulated in host order and byte-swapped back on little-endian
 * machines before being stored.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;	/* walk Xi from last byte toward first */
	size_t rem, n = *xi;
	const union { long one; char little; } is_endian = {1};
	/* Reduction constants for an 8-bit shift; PACK() pre-positions each
	 * 16-bit value in the top bits of a size_t so the 64-bit path can
	 * XOR it into Z.hi directly. */
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		/* accumulate the table entry selected by the current byte */
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;	/* all 16 bytes consumed */

		n = *(--xi);

		/* shift Z right by one byte; the byte that falls off the low
		 * end is folded back in through rem_8bit (polynomial reduction) */
		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	/* store Z back into Xi in big-endian byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
252 | #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable) | ||
253 | |||
254 | #elif TABLE_BITS==4 | ||
255 | |||
/*
 * Build the 16-entry multiplication table for the "4-bit Shoup" GHASH
 * variant: Htable[n] = n*H in GF(2^128), H in host byte order.  The
 * small-footprint path uses loops; the default path unrolls them.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	/* powers of x: Htable[8]=H, then halve the index per REDUCE1BIT */
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	/* remaining entries by linearity: Htable[i+j] = Htable[i]^Htable[j] */
	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	/* same construction, fully unrolled */
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
	Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
	Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
	int j;
	const union { long one; char little; } is_endian = {1};

	/* swap (little-endian) or rotate (big-endian) each entry's halves */
	if (is_endian.little)
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
	else
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo<<32|V.lo>>32;
			Htable[j].lo = V.hi<<32|V.hi>>32;
		}
	}
#endif
}
327 | |||
328 | #ifndef GHASH_ASM | ||
329 | static const size_t rem_4bit[16] = { | ||
330 | PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460), | ||
331 | PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0), | ||
332 | PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560), | ||
333 | PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) }; | ||
334 | |||
/*
 * Single-block GHASH multiplication, 4-bit table flavour: Xi = Xi * H.
 * Processes Xi one nibble at a time, two nibbles (one byte) per loop
 * iteration, using rem_4bit for the polynomial reduction after each
 * 4-bit shift.  Xi is big-endian in memory.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;
	const union { long one; char little; } is_endian = {1};

	/* split the last byte of Xi into low/high nibbles */
	nlo = ((const u8 *)Xi)[15];
	nhi = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		/* shift Z right 4 bits, reduce, then fold in the high nibble */
		rem = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0) break;	/* byte 0 done */

		nlo = ((const u8 *)Xi)[cnt];
		nhi = nlo>>4;
		nlo &= 0xf;

		/* second shift/reduce, then fold in the low nibble */
		rem = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	/* write Z back to Xi in big-endian byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32); PUTU32(p,v);
		v = (u32)(Z.hi); PUTU32(p+4,v);
		v = (u32)(Z.lo>>32); PUTU32(p+8,v);
		v = (u32)(Z.lo); PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
397 | |||
398 | #if !defined(OPENSSL_SMALL_FOOTPRINT) | ||
399 | /* | ||
400 | * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for | ||
401 | * details... Compiler-generated code doesn't seem to give any | ||
402 | * performance improvement, at least not on x86[_64]. It's here | ||
403 | * mostly as reference and a placeholder for possible future | ||
404 | * non-trivial optimization[s]... | ||
405 | */ | ||
/*
 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as reference and a placeholder for possible future
 * non-trivial optimization[s]...
 *
 * Hashes len bytes (len assumed to be a positive multiple of 16 —
 * TODO confirm against callers) from inp into Xi: for each 16-byte
 * block, Xi = (Xi ^ block) * H.  Two implementations are selected by
 * the "#if 1" below; both share the store-back epilogue and the
 * trailing do/while condition.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
	/* same nibble-at-a-time walk as gcm_gmult_4bit, except each
	 * byte of Xi is XORed with the input block on the fly */
	cnt = 15;
	nlo = ((const u8 *)Xi)[15];
	nlo ^= inp[15];
	nhi = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0) break;

		nlo = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi = nlo>>4;
		nlo &= 0xf;

		rem = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
    u8 Hshl4[16];	/* Htable shifted left by 4 bits */
    static const unsigned short rem_8bit[256] = {
	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
	Z.hi = Htable[cnt].hi;
	Z.lo = Htable[cnt].lo;
	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
	Hshr4[cnt].hi = (Z.hi>>4);
	Hshl4[cnt] = (u8)(Z.lo<<4);
    }

    do {
	/* bytes 15..1: one full byte of shift per iteration, reduction
	 * via the 8-bit table indexed by rem XOR the pre-shifted Htable */
	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
		nlo = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo&0xff;

		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);

		Z.hi ^= Hshr4[nhi].hi;
		Z.lo ^= Hshr4[nhi].lo;
		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
	}

	/* byte 0: final 4-bit shift and reduction */
	nlo = ((const u8 *)Xi)[0];
	nlo ^= inp[0];
	nhi = nlo>>4;
	nlo &= 0xf;

	Z.hi ^= Htable[nlo].hi;
	Z.lo ^= Htable[nlo].lo;

	rem = (size_t)Z.lo&0xf;

	Z.lo = (Z.hi<<60)|(Z.lo>>4);
	Z.hi = (Z.hi>>4);

	Z.hi ^= Htable[nhi].hi;
	Z.lo ^= Htable[nhi].lo;
	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

	/* shared epilogue: store Z into Xi big-endian, advance input */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32); PUTU32(p,v);
		v = (u32)(Z.hi); PUTU32(p+4,v);
		v = (u32)(Z.lo>>32); PUTU32(p+8,v);
		v = (u32)(Z.lo); PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
    } while (inp+=16, len-=16);
}
568 | #endif | ||
569 | #else | ||
570 | void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]); | ||
571 | void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); | ||
572 | #endif | ||
573 | |||
574 | #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable) | ||
575 | #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT) | ||
576 | #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len) | ||
577 | /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache | ||
578 | * trashing effect. In other words idea is to hash data while it's | ||
579 | * still in L1 cache after encryption pass... */ | ||
580 | #define GHASH_CHUNK (3*1024) | ||
581 | #endif | ||
582 | |||
583 | #else /* TABLE_BITS */ | ||
584 | |||
/*
 * Bit-at-a-time GHASH multiplication (TABLE_BITS==1): Xi = Xi * H with
 * no precomputed table — 128 shift/XOR steps per block.  Xi is
 * big-endian in memory; it is loaded word-by-word into host order,
 * each bit selects whether V (a running multiple of H) is XORed into
 * the accumulator Z.
 */
static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int i,j;
	const long *xi = (const long *)Xi;
	const union { long one; char little; } is_endian = {1};

	V.hi = H[0]; /* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j=0; j<16/sizeof(long); ++j) {
		/* load the next machine word of Xi in host byte order */
		if (is_endian.little) {
			if (sizeof(long)==8) {
#ifdef BSWAP8
				X = (long)(BSWAP8(xi[j]));
#else
				const u8 *p = (const u8 *)(xi+j);
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
			}
			else {
				const u8 *p = (const u8 *)(xi+j);
				X = (long)GETU32(p);
			}
		}
		else
			X = xi[j];

		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			/* M is all-ones if the top bit of X is set, else zero
			 * (arithmetic right shift of the sign bit) */
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			REDUCE1BIT(V);	/* V *= x, reduced mod the GCM polynomial */
		}
	}

	/* store Z back into Xi in big-endian byte order */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32); PUTU32(p,v);
		v = (u32)(Z.hi); PUTU32(p+4,v);
		v = (u32)(Z.lo>>32); PUTU32(p+8,v);
		v = (u32)(Z.lo); PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
641 | #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u) | ||
642 | |||
643 | #endif | ||
644 | |||
645 | #if TABLE_BITS==4 && defined(GHASH_ASM) | ||
646 | # if !defined(I386_ONLY) && \ | ||
647 | (defined(__i386) || defined(__i386__) || \ | ||
648 | defined(__x86_64) || defined(__x86_64__) || \ | ||
649 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64)) | ||
650 | # define GHASH_ASM_X86_OR_64 | ||
651 | # define GCM_FUNCREF_4BIT | ||
652 | extern unsigned int OPENSSL_ia32cap_P[2]; | ||
653 | |||
654 | void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]); | ||
655 | void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]); | ||
656 | void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); | ||
657 | |||
658 | # if defined(__i386) || defined(__i386__) || defined(_M_IX86) | ||
659 | # define GHASH_ASM_X86 | ||
660 | void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]); | ||
661 | void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); | ||
662 | |||
663 | void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]); | ||
664 | void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); | ||
665 | # endif | ||
666 | # elif defined(__arm__) || defined(__arm) | ||
667 | # include "arm_arch.h" | ||
668 | # if __ARM_ARCH__>=7 | ||
669 | # define GHASH_ASM_ARM | ||
670 | # define GCM_FUNCREF_4BIT | ||
671 | void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]); | ||
672 | void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); | ||
673 | # endif | ||
674 | # endif | ||
675 | #endif | ||
676 | |||
677 | #ifdef GCM_FUNCREF_4BIT | ||
678 | # undef GCM_MUL | ||
679 | # define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable) | ||
680 | # ifdef GHASH | ||
681 | # undef GHASH | ||
682 | # define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len) | ||
683 | # endif | ||
684 | #endif | ||
685 | |||
/*
 * Initialize a GCM128 context: zero the state, record the block cipher
 * and its key, derive the hash subkey H = E_K(0^128), convert H to host
 * byte order, and select/precompute the GHASH implementation according
 * to TABLE_BITS and runtime CPU capabilities.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	const union { long one; char little; } is_endian = {1};

	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key = key;

	/* H = E_K(0^128): ctx->H.c is all zeros after the memset */
	(*block)(ctx->H.c,ctx->H.c,key);

	if (is_endian.little) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p) <<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif TABLE_BITS==4
# if defined(GHASH_ASM_X86_OR_64)
# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	/* prefer carry-less multiplication when the CPU supports it */
	if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
# endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# if defined(GHASH_ASM_X86) /* x86 only */
#  if defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
#  else
	if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
#  endif
	ctx->gmult = gcm_gmult_4bit_mmx;
	ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
	ctx->gmult = gcm_gmult_4bit_x86;
	ctx->ghash = gcm_ghash_4bit_x86;
	}
# else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
# endif
# elif defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
755 | |||
/*
 * Set the IV for a new GCM operation and reset all per-message state.
 * A 12-byte IV is used directly as Y0 with a 32-bit counter of 1
 * appended; any other length is GHASHed together with its bit length
 * to derive Y0.  Also precomputes EK0 = E_K(Y0) (used for the final
 * tag) and leaves Yi holding the incremented counter block Y1.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
#endif

	/* reset per-message state */
	ctx->Yi.u[0] = 0;
	ctx->Yi.u[1] = 0;
	ctx->Xi.u[0] = 0;
	ctx->Xi.u[1] = 0;
	ctx->len.u[0] = 0; /* AAD length */
	ctx->len.u[1] = 0; /* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		/* recommended IV size: Y0 = IV || 0^31 || 1 */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		/* Y0 = GHASH(IV || pad || 64-bit bit-length of IV) */
		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;	/* IV length in bits */
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1] ^= BSWAP8(len0);
#else
			ctx->Yi.c[8] ^= (u8)(len0>>56);
			ctx->Yi.c[9] ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1] ^= len0;

		GCM_MUL(ctx,Yi);

		/* extract the 32-bit counter from the derived Y0 */
		if (is_endian.little)
			ctr = GETU32(ctx->Yi.c+12);
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 = E_K(Y0), kept for the final tag computation */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	/* store the incremented counter back into Yi (now Y1) */
	if (is_endian.little)
		PUTU32(ctx->Yi.c+12,ctr);
	else
		ctx->Yi.d[3] = ctr;
}
825 | |||
/*
 * CRYPTO_gcm128_aad: absorb |len| bytes of additional authenticated data
 * (AAD) into the GHASH accumulator.  May be called repeatedly on
 * consecutive chunks, but all AAD must be supplied before the first
 * encrypt/decrypt call for the message.
 *
 * Returns 0 on success, -2 if payload data has already been processed
 * (ctx->len.u[1] is non-zero), or -1 if the cumulative AAD length would
 * exceed 2^61 bytes or overflow the 64-bit accumulator.
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	/* AAD is only permitted before any payload has been processed */
	if (ctx->len.u[1]) return -2;

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	/* first complete a partial block left over from a previous call;
	 * ctx->ares counts the AAD bytes already XORed into Xi */
	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	/* bulk-hash all complete 16-byte blocks in one assembler call */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	/* buffer a trailing partial block in Xi; it is completed by more
	 * AAD or finalized on the first encrypt/decrypt call */
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
882 | |||
/*
 * CRYPTO_gcm128_encrypt: encrypt |len| bytes from |in| to |out| with the
 * block cipher in CTR mode and fold the produced ciphertext into the
 * GHASH accumulator.  May be called repeatedly on consecutive chunks of
 * arbitrary size; unused keystream from a partial block is carried over
 * in ctx->EKi with ctx->mres counting the bytes already consumed.
 * |in| and |out| may be the same buffer.
 *
 * Returns 0 on success, -1 if the total message length would exceed
 * GCM's per-message limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pick up the 32-bit counter from the last 4 bytes of Yi */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* drain leftover keystream bytes from the previous call */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-at-a-time XOR below requires aligned pointers;
		 * fall through to the byte-wise loop otherwise */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* fast path: encrypt a whole chunk, then hash the produced
		 * ciphertext with one bulk GHASH call */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* no bulk GHASH available: hash each ciphertext block as
		 * it is produced */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(ctx->Xi.c+i) ^=
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* final partial block: generate keystream, consume |len|
		 * bytes of it; remainder stays in EKi for the next call */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* portable byte-wise fallback (small footprint or unaligned data) */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1032 | |||
/*
 * CRYPTO_gcm128_decrypt: decrypt |len| bytes from |in| to |out|.  Mirror
 * image of CRYPTO_gcm128_encrypt, except that the *ciphertext* (the
 * input) is what gets folded into the GHASH accumulator, before or as it
 * is decrypted.  May be called repeatedly on consecutive chunks;
 * partial-block keystream residue is tracked in ctx->mres.
 * |in| and |out| may be the same buffer.
 *
 * Returns 0 on success, -1 if the total message length would exceed
 * GCM's per-message limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pick up the 32-bit counter from the last 4 bytes of Yi */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* drain leftover keystream bytes from the previous call;
		 * the ciphertext byte is hashed, the plaintext emitted */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-at-a-time XOR below requires aligned pointers */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* fast path: hash the incoming ciphertext chunk first
		 * (it must be consumed before |out| overwrites it when
		 * decrypting in place), then decrypt it */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		/* no bulk GHASH: hash each ciphertext block as it is read */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t)) {
				size_t c = *(size_t *)(in+i);
				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
				*(size_t *)(ctx->Xi.c+i) ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* final partial block: generate keystream, consume |len|
		 * bytes; remainder stays in EKi for the next call */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* portable byte-wise fallback (small footprint or unaligned data) */
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1185 | |||
/*
 * CRYPTO_gcm128_encrypt_ctr32: variant of CRYPTO_gcm128_encrypt that
 * delegates bulk keystream generation to the caller-supplied |stream|
 * function (ctr128_f), which encrypts a run of consecutive counter
 * blocks in one call (e.g. a hardware-accelerated CTR routine).  Only
 * the low 32 bits of the counter in Yi are maintained here.  Partial
 * trailing blocks fall back to single invocations of ctx->block, with
 * the residue tracked in ctx->mres as in the generic path.
 *
 * Returns 0 on success, -1 if the total message length would exceed
 * GCM's per-message limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* drain leftover keystream bytes from the previous call */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* encrypt a whole chunk via |stream|, then bulk-hash the produced
	 * ciphertext */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining complete blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	/* final partial block via a single block-cipher call; remainder of
	 * the keystream stays in EKi for the next call */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1284 | |||
/*
 * CRYPTO_gcm128_decrypt_ctr32: variant of CRYPTO_gcm128_decrypt that
 * delegates bulk keystream generation to the caller-supplied |stream|
 * function (ctr128_f).  The ciphertext (input) is hashed *before* it is
 * decrypted — required for correct in-place operation — and only the low
 * 32 bits of the counter in Yi are maintained here.
 *
 * Returns 0 on success, -1 if the total message length would exceed
 * GCM's per-message limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* drain leftover keystream bytes from the previous call */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* bulk-hash the ciphertext chunk, then decrypt it via |stream| */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining complete blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	/* final partial block via a single block-cipher call; remainder of
	 * the keystream stays in EKi for the next call */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1390 | |||
1391 | int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, | ||
1392 | size_t len) | ||
1393 | { | ||
1394 | const union { long one; char little; } is_endian = {1}; | ||
1395 | u64 alen = ctx->len.u[0]<<3; | ||
1396 | u64 clen = ctx->len.u[1]<<3; | ||
1397 | #ifdef GCM_FUNCREF_4BIT | ||
1398 | void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; | ||
1399 | #endif | ||
1400 | |||
1401 | if (ctx->mres) | ||
1402 | GCM_MUL(ctx,Xi); | ||
1403 | |||
1404 | if (is_endian.little) { | ||
1405 | #ifdef BSWAP8 | ||
1406 | alen = BSWAP8(alen); | ||
1407 | clen = BSWAP8(clen); | ||
1408 | #else | ||
1409 | u8 *p = ctx->len.c; | ||
1410 | |||
1411 | ctx->len.u[0] = alen; | ||
1412 | ctx->len.u[1] = clen; | ||
1413 | |||
1414 | alen = (u64)GETU32(p) <<32|GETU32(p+4); | ||
1415 | clen = (u64)GETU32(p+8)<<32|GETU32(p+12); | ||
1416 | #endif | ||
1417 | } | ||
1418 | |||
1419 | ctx->Xi.u[0] ^= alen; | ||
1420 | ctx->Xi.u[1] ^= clen; | ||
1421 | GCM_MUL(ctx,Xi); | ||
1422 | |||
1423 | ctx->Xi.u[0] ^= ctx->EK0.u[0]; | ||
1424 | ctx->Xi.u[1] ^= ctx->EK0.u[1]; | ||
1425 | |||
1426 | if (tag && len<=sizeof(ctx->Xi)) | ||
1427 | return memcmp(ctx->Xi.c,tag,len); | ||
1428 | else | ||
1429 | return -1; | ||
1430 | } | ||
1431 | |||
1432 | void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) | ||
1433 | { | ||
1434 | CRYPTO_gcm128_finish(ctx, NULL, 0); | ||
1435 | memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c)); | ||
1436 | } | ||
1437 | |||
1438 | GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block) | ||
1439 | { | ||
1440 | GCM128_CONTEXT *ret; | ||
1441 | |||
1442 | if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT)))) | ||
1443 | CRYPTO_gcm128_init(ret,key,block); | ||
1444 | |||
1445 | return ret; | ||
1446 | } | ||
1447 | |||
1448 | void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx) | ||
1449 | { | ||
1450 | if (ctx) { | ||
1451 | OPENSSL_cleanse(ctx,sizeof(*ctx)); | ||
1452 | OPENSSL_free(ctx); | ||
1453 | } | ||
1454 | } | ||
1455 | |||
1456 | #if defined(SELFTEST) | ||
1457 | #include <stdio.h> | ||
1458 | #include <openssl/aes.h> | ||
1459 | |||
/*
 * Known-answer test vectors for the SELFTEST driver.  Each case supplies
 * key K, plaintext P, AAD A, IV and the expected ciphertext C and tag T;
 * NULL/empty entries denote absent inputs.  Cases 1-6 use 128-bit keys,
 * 7-12 use 192-bit keys, 13-18 use 256-bit keys, exercising 96-bit,
 * 64-bit and 480-bit IVs.  NOTE(review): these appear to be the standard
 * GCM specification test vectors (McGrew & Viega) -- verify against that
 * document before editing any byte.
 */
/* Test Case 1 */
static const u8	K1[16],
		*P1=NULL,
		*A1=NULL,
		IV1[12],
		*C1=NULL,
		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8	P2[16],
		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};

/* Test Case 3 */
#define A3 A2
static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};

/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};

/* Test Case 5 */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};

/* Test Case 6 */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};

/* Test Case 7 */
static const u8 K7[24],
		*P7=NULL,
		*A7=NULL,
		IV7[12],
		*C7=NULL,
		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8	P8[16],
		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9 */
#define A9 A8
static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};

/* Test Case 10 */
#define K10 K9
#define IV10 IV9
static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11 */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};

/* Test Case 12 */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};

/* Test Case 13 */
static const u8	K13[32],
		*P13=NULL,
		*A13=NULL,
		IV13[12],
		*C13=NULL,
		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14 */
#define K14 K13
#define A14 A13
static const u8	P14[16],
		IV14[12],
		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};

/* Test Case 15 */
#define A15 A14
static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/* Test Case 16 */
#define K16 K15
#define IV16 IV15
static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/* Test Case 17 */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18 */
#define K18 K17
#define P18 P17
#define A18 A17
1662 | static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa, | ||
1663 | 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28, | ||
1664 | 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54, | ||
1665 | 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b}, | ||
1666 | C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20, | ||
1667 | 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4, | ||
1668 | 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde, | ||
1669 | 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f}, | ||
1670 | T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a}; | ||
1671 | |||
1672 | #define TEST_CASE(n) do { \ | ||
1673 | u8 out[sizeof(P##n)]; \ | ||
1674 | AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \ | ||
1675 | CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \ | ||
1676 | CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ | ||
1677 | memset(out,0,sizeof(out)); \ | ||
1678 | if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ | ||
1679 | if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \ | ||
1680 | if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ | ||
1681 | (C##n && memcmp(out,C##n,sizeof(out)))) \ | ||
1682 | ret++, printf ("encrypt test#%d failed.\n",n); \ | ||
1683 | CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ | ||
1684 | memset(out,0,sizeof(out)); \ | ||
1685 | if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ | ||
1686 | if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \ | ||
1687 | if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ | ||
1688 | (P##n && memcmp(out,P##n,sizeof(out)))) \ | ||
1689 | ret++, printf ("decrypt test#%d failed.\n",n); \ | ||
1690 | } while(0) | ||
1691 | |||
1692 | int main() | ||
1693 | { | ||
1694 | GCM128_CONTEXT ctx; | ||
1695 | AES_KEY key; | ||
1696 | int ret=0; | ||
1697 | |||
1698 | TEST_CASE(1); | ||
1699 | TEST_CASE(2); | ||
1700 | TEST_CASE(3); | ||
1701 | TEST_CASE(4); | ||
1702 | TEST_CASE(5); | ||
1703 | TEST_CASE(6); | ||
1704 | TEST_CASE(7); | ||
1705 | TEST_CASE(8); | ||
1706 | TEST_CASE(9); | ||
1707 | TEST_CASE(10); | ||
1708 | TEST_CASE(11); | ||
1709 | TEST_CASE(12); | ||
1710 | TEST_CASE(13); | ||
1711 | TEST_CASE(14); | ||
1712 | TEST_CASE(15); | ||
1713 | TEST_CASE(16); | ||
1714 | TEST_CASE(17); | ||
1715 | TEST_CASE(18); | ||
1716 | |||
1717 | #ifdef OPENSSL_CPUID_OBJ | ||
1718 | { | ||
1719 | size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc(); | ||
1720 | union { u64 u; u8 c[1024]; } buf; | ||
1721 | int i; | ||
1722 | |||
1723 | AES_set_encrypt_key(K1,sizeof(K1)*8,&key); | ||
1724 | CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); | ||
1725 | CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1)); | ||
1726 | |||
1727 | CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); | ||
1728 | start = OPENSSL_rdtsc(); | ||
1729 | CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); | ||
1730 | gcm_t = OPENSSL_rdtsc() - start; | ||
1731 | |||
1732 | CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), | ||
1733 | &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, | ||
1734 | (block128_f)AES_encrypt); | ||
1735 | start = OPENSSL_rdtsc(); | ||
1736 | CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), | ||
1737 | &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, | ||
1738 | (block128_f)AES_encrypt); | ||
1739 | ctr_t = OPENSSL_rdtsc() - start; | ||
1740 | |||
1741 | printf("%.2f-%.2f=%.2f\n", | ||
1742 | gcm_t/(double)sizeof(buf), | ||
1743 | ctr_t/(double)sizeof(buf), | ||
1744 | (gcm_t-ctr_t)/(double)sizeof(buf)); | ||
1745 | #ifdef GHASH | ||
1746 | GHASH(&ctx,buf.c,sizeof(buf)); | ||
1747 | start = OPENSSL_rdtsc(); | ||
1748 | for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf)); | ||
1749 | gcm_t = OPENSSL_rdtsc() - start; | ||
1750 | printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i); | ||
1751 | #endif | ||
1752 | } | ||
1753 | #endif | ||
1754 | |||
1755 | return ret; | ||
1756 | } | ||
1757 | #endif | ||
diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h index af8d97d795..f18215bb2b 100644 --- a/src/lib/libcrypto/modes/modes.h +++ b/src/lib/libcrypto/modes/modes.h | |||
@@ -15,6 +15,14 @@ typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out, | |||
15 | size_t len, const void *key, | 15 | size_t len, const void *key, |
16 | unsigned char ivec[16], int enc); | 16 | unsigned char ivec[16], int enc); |
17 | 17 | ||
18 | typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out, | ||
19 | size_t blocks, const void *key, | ||
20 | const unsigned char ivec[16]); | ||
21 | |||
22 | typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out, | ||
23 | size_t blocks, const void *key, | ||
24 | const unsigned char ivec[16],unsigned char cmac[16]); | ||
25 | |||
18 | void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, | 26 | void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, |
19 | size_t len, const void *key, | 27 | size_t len, const void *key, |
20 | unsigned char ivec[16], block128_f block); | 28 | unsigned char ivec[16], block128_f block); |
@@ -27,6 +35,11 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, | |||
27 | unsigned char ivec[16], unsigned char ecount_buf[16], | 35 | unsigned char ivec[16], unsigned char ecount_buf[16], |
28 | unsigned int *num, block128_f block); | 36 | unsigned int *num, block128_f block); |
29 | 37 | ||
38 | void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, | ||
39 | size_t len, const void *key, | ||
40 | unsigned char ivec[16], unsigned char ecount_buf[16], | ||
41 | unsigned int *num, ctr128_f ctr); | ||
42 | |||
30 | void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, | 43 | void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, |
31 | size_t len, const void *key, | 44 | size_t len, const void *key, |
32 | unsigned char ivec[16], int *num, | 45 | unsigned char ivec[16], int *num, |
@@ -57,3 +70,66 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, | |||
57 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, | 70 | size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, |
58 | size_t len, const void *key, | 71 | size_t len, const void *key, |
59 | unsigned char ivec[16], cbc128_f cbc); | 72 | unsigned char ivec[16], cbc128_f cbc); |
73 | |||
74 | size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, | ||
75 | size_t len, const void *key, | ||
76 | unsigned char ivec[16], block128_f block); | ||
77 | size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, | ||
78 | size_t len, const void *key, | ||
79 | unsigned char ivec[16], cbc128_f cbc); | ||
80 | size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, | ||
81 | size_t len, const void *key, | ||
82 | unsigned char ivec[16], block128_f block); | ||
83 | size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, | ||
84 | size_t len, const void *key, | ||
85 | unsigned char ivec[16], cbc128_f cbc); | ||
86 | |||
87 | typedef struct gcm128_context GCM128_CONTEXT; | ||
88 | |||
89 | GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block); | ||
90 | void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block); | ||
91 | void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, | ||
92 | size_t len); | ||
93 | int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, | ||
94 | size_t len); | ||
95 | int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, | ||
96 | const unsigned char *in, unsigned char *out, | ||
97 | size_t len); | ||
98 | int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, | ||
99 | const unsigned char *in, unsigned char *out, | ||
100 | size_t len); | ||
101 | int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, | ||
102 | const unsigned char *in, unsigned char *out, | ||
103 | size_t len, ctr128_f stream); | ||
104 | int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, | ||
105 | const unsigned char *in, unsigned char *out, | ||
106 | size_t len, ctr128_f stream); | ||
107 | int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, | ||
108 | size_t len); | ||
109 | void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len); | ||
110 | void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx); | ||
111 | |||
112 | typedef struct ccm128_context CCM128_CONTEXT; | ||
113 | |||
114 | void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, | ||
115 | unsigned int M, unsigned int L, void *key,block128_f block); | ||
116 | int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, | ||
117 | const unsigned char *nonce, size_t nlen, size_t mlen); | ||
118 | void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, | ||
119 | const unsigned char *aad, size_t alen); | ||
120 | int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, | ||
121 | const unsigned char *inp, unsigned char *out, size_t len); | ||
122 | int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, | ||
123 | const unsigned char *inp, unsigned char *out, size_t len); | ||
124 | int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, | ||
125 | const unsigned char *inp, unsigned char *out, size_t len, | ||
126 | ccm128_f stream); | ||
127 | int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, | ||
128 | const unsigned char *inp, unsigned char *out, size_t len, | ||
129 | ccm128_f stream); | ||
130 | size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len); | ||
131 | |||
132 | typedef struct xts128_context XTS128_CONTEXT; | ||
133 | |||
134 | int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], | ||
135 | const unsigned char *inp, unsigned char *out, size_t len, int enc); | ||
diff --git a/src/lib/libcrypto/modes/modes_lcl.h b/src/lib/libcrypto/modes/modes_lcl.h new file mode 100644 index 0000000000..b6dc3c336f --- /dev/null +++ b/src/lib/libcrypto/modes/modes_lcl.h | |||
@@ -0,0 +1,131 @@ | |||
1 | /* ==================================================================== | ||
2 | * Copyright (c) 2010 The OpenSSL Project. All rights reserved. | ||
3 | * | ||
4 | * Redistribution and use is governed by OpenSSL license. | ||
5 | * ==================================================================== | ||
6 | */ | ||
7 | |||
8 | #include <openssl/modes.h> | ||
9 | |||
10 | |||
11 | #if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) | ||
12 | typedef __int64 i64; | ||
13 | typedef unsigned __int64 u64; | ||
14 | #define U64(C) C##UI64 | ||
15 | #elif defined(__arch64__) | ||
16 | typedef long i64; | ||
17 | typedef unsigned long u64; | ||
18 | #define U64(C) C##UL | ||
19 | #else | ||
20 | typedef long long i64; | ||
21 | typedef unsigned long long u64; | ||
22 | #define U64(C) C##ULL | ||
23 | #endif | ||
24 | |||
25 | typedef unsigned int u32; | ||
26 | typedef unsigned char u8; | ||
27 | |||
28 | #define STRICT_ALIGNMENT 1 | ||
29 | #if defined(__i386) || defined(__i386__) || \ | ||
30 | defined(__x86_64) || defined(__x86_64__) || \ | ||
31 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
32 | defined(__s390__) || defined(__s390x__) || \ | ||
33 | ( (defined(__arm__) || defined(__arm)) && \ | ||
34 | (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ | ||
35 | defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)) ) | ||
36 | # undef STRICT_ALIGNMENT | ||
37 | #endif | ||
38 | |||
39 | #if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) | ||
40 | #if defined(__GNUC__) && __GNUC__>=2 | ||
41 | # if defined(__x86_64) || defined(__x86_64__) | ||
42 | # define BSWAP8(x) ({ u64 ret=(x); \ | ||
43 | asm ("bswapq %0" \ | ||
44 | : "+r"(ret)); ret; }) | ||
45 | # define BSWAP4(x) ({ u32 ret=(x); \ | ||
46 | asm ("bswapl %0" \ | ||
47 | : "+r"(ret)); ret; }) | ||
48 | # elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY) | ||
49 | # define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ | ||
50 | asm ("bswapl %0; bswapl %1" \ | ||
51 | : "+r"(hi),"+r"(lo)); \ | ||
52 | (u64)hi<<32|lo; }) | ||
53 | # define BSWAP4(x) ({ u32 ret=(x); \ | ||
54 | asm ("bswapl %0" \ | ||
55 | : "+r"(ret)); ret; }) | ||
56 | # elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT) | ||
57 | # define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ | ||
58 | asm ("rev %0,%0; rev %1,%1" \ | ||
59 | : "+r"(hi),"+r"(lo)); \ | ||
60 | (u64)hi<<32|lo; }) | ||
61 | # define BSWAP4(x) ({ u32 ret; \ | ||
62 | asm ("rev %0,%1" \ | ||
63 | : "=r"(ret) : "r"((u32)(x))); \ | ||
64 | ret; }) | ||
65 | # endif | ||
66 | #elif defined(_MSC_VER) | ||
67 | # if _MSC_VER>=1300 | ||
68 | # pragma intrinsic(_byteswap_uint64,_byteswap_ulong) | ||
69 | # define BSWAP8(x) _byteswap_uint64((u64)(x)) | ||
70 | # define BSWAP4(x) _byteswap_ulong((u32)(x)) | ||
71 | # elif defined(_M_IX86) | ||
72 | __inline u32 _bswap4(u32 val) { | ||
73 | _asm mov eax,val | ||
74 | _asm bswap eax | ||
75 | } | ||
76 | # define BSWAP4(x) _bswap4(x) | ||
77 | # endif | ||
78 | #endif | ||
79 | #endif | ||
80 | |||
81 | #if defined(BSWAP4) && !defined(STRICT_ALIGNMENT) | ||
82 | #define GETU32(p) BSWAP4(*(const u32 *)(p)) | ||
83 | #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) | ||
84 | #else | ||
85 | #define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) | ||
86 | #define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) | ||
87 | #endif | ||
88 | |||
89 | /* GCM definitions */ | ||
90 | |||
91 | typedef struct { u64 hi,lo; } u128; | ||
92 | |||
93 | #ifdef TABLE_BITS | ||
94 | #undef TABLE_BITS | ||
95 | #endif | ||
96 | /* | ||
97 | * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should | ||
98 | * never be set to 8 [or 1]. For further information see gcm128.c. | ||
99 | */ | ||
100 | #define TABLE_BITS 4 | ||
101 | |||
102 | struct gcm128_context { | ||
103 | /* Following 6 names follow names in GCM specification */ | ||
104 | union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,len, | ||
105 | Xi,H; | ||
106 | /* Relative position of Xi, H and pre-computed Htable is used | ||
107 | * in some assembler modules, i.e. don't change the order! */ | ||
108 | #if TABLE_BITS==8 | ||
109 | u128 Htable[256]; | ||
110 | #else | ||
111 | u128 Htable[16]; | ||
112 | void (*gmult)(u64 Xi[2],const u128 Htable[16]); | ||
113 | void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); | ||
114 | #endif | ||
115 | unsigned int mres, ares; | ||
116 | block128_f block; | ||
117 | void *key; | ||
118 | }; | ||
119 | |||
120 | struct xts128_context { | ||
121 | void *key1, *key2; | ||
122 | block128_f block1,block2; | ||
123 | }; | ||
124 | |||
125 | struct ccm128_context { | ||
126 | union { u64 u[2]; u8 c[16]; } nonce, cmac; | ||
127 | u64 blocks; | ||
128 | block128_f block; | ||
129 | void *key; | ||
130 | }; | ||
131 | |||
diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c index c732e2ec58..01c01702c4 100644 --- a/src/lib/libcrypto/modes/ofb128.c +++ b/src/lib/libcrypto/modes/ofb128.c | |||
@@ -48,7 +48,8 @@ | |||
48 | * | 48 | * |
49 | */ | 49 | */ |
50 | 50 | ||
51 | #include "modes.h" | 51 | #include <openssl/crypto.h> |
52 | #include "modes_lcl.h" | ||
52 | #include <string.h> | 53 | #include <string.h> |
53 | 54 | ||
54 | #ifndef MODES_DEBUG | 55 | #ifndef MODES_DEBUG |
@@ -58,14 +59,6 @@ | |||
58 | #endif | 59 | #endif |
59 | #include <assert.h> | 60 | #include <assert.h> |
60 | 61 | ||
61 | #define STRICT_ALIGNMENT | ||
62 | #if defined(__i386) || defined(__i386__) || \ | ||
63 | defined(__x86_64) || defined(__x86_64__) || \ | ||
64 | defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ | ||
65 | defined(__s390__) || defined(__s390x__) | ||
66 | # undef STRICT_ALIGNMENT | ||
67 | #endif | ||
68 | |||
69 | /* The input and output encrypted as though 128bit ofb mode is being | 62 | /* The input and output encrypted as though 128bit ofb mode is being |
70 | * used. The extra state information to record how much of the | 63 | * used. The extra state information to record how much of the |
71 | * 128bit block we have used is contained in *num; | 64 | * 128bit block we have used is contained in *num; |
diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c new file mode 100644 index 0000000000..9cf27a25e9 --- /dev/null +++ b/src/lib/libcrypto/modes/xts128.c | |||
@@ -0,0 +1,187 @@ | |||
1 | /* ==================================================================== | ||
2 | * Copyright (c) 2011 The OpenSSL Project. All rights reserved. | ||
3 | * | ||
4 | * Redistribution and use in source and binary forms, with or without | ||
5 | * modification, are permitted provided that the following conditions | ||
6 | * are met: | ||
7 | * | ||
8 | * 1. Redistributions of source code must retain the above copyright | ||
9 | * notice, this list of conditions and the following disclaimer. | ||
10 | * | ||
11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
12 | * notice, this list of conditions and the following disclaimer in | ||
13 | * the documentation and/or other materials provided with the | ||
14 | * distribution. | ||
15 | * | ||
16 | * 3. All advertising materials mentioning features or use of this | ||
17 | * software must display the following acknowledgment: | ||
18 | * "This product includes software developed by the OpenSSL Project | ||
19 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
20 | * | ||
21 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
22 | * endorse or promote products derived from this software without | ||
23 | * prior written permission. For written permission, please contact | ||
24 | * openssl-core@openssl.org. | ||
25 | * | ||
26 | * 5. Products derived from this software may not be called "OpenSSL" | ||
27 | * nor may "OpenSSL" appear in their names without prior written | ||
28 | * permission of the OpenSSL Project. | ||
29 | * | ||
30 | * 6. Redistributions of any form whatsoever must retain the following | ||
31 | * acknowledgment: | ||
32 | * "This product includes software developed by the OpenSSL Project | ||
33 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
34 | * | ||
35 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
36 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
38 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
39 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
40 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
41 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
42 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
43 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
44 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
45 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
46 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
47 | * ==================================================================== | ||
48 | */ | ||
49 | |||
50 | #include <openssl/crypto.h> | ||
51 | #include "modes_lcl.h" | ||
52 | #include <string.h> | ||
53 | |||
54 | #ifndef MODES_DEBUG | ||
55 | # ifndef NDEBUG | ||
56 | # define NDEBUG | ||
57 | # endif | ||
58 | #endif | ||
59 | #include <assert.h> | ||
60 | |||
61 | int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], | ||
62 | const unsigned char *inp, unsigned char *out, | ||
63 | size_t len, int enc) | ||
64 | { | ||
65 | const union { long one; char little; } is_endian = {1}; | ||
66 | union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch; | ||
67 | unsigned int i; | ||
68 | |||
69 | if (len<16) return -1; | ||
70 | |||
71 | memcpy(tweak.c, iv, 16); | ||
72 | |||
73 | (*ctx->block2)(tweak.c,tweak.c,ctx->key2); | ||
74 | |||
75 | if (!enc && (len%16)) len-=16; | ||
76 | |||
77 | while (len>=16) { | ||
78 | #if defined(STRICT_ALIGNMENT) | ||
79 | memcpy(scratch.c,inp,16); | ||
80 | scratch.u[0] ^= tweak.u[0]; | ||
81 | scratch.u[1] ^= tweak.u[1]; | ||
82 | #else | ||
83 | scratch.u[0] = ((u64*)inp)[0]^tweak.u[0]; | ||
84 | scratch.u[1] = ((u64*)inp)[1]^tweak.u[1]; | ||
85 | #endif | ||
86 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
87 | #if defined(STRICT_ALIGNMENT) | ||
88 | scratch.u[0] ^= tweak.u[0]; | ||
89 | scratch.u[1] ^= tweak.u[1]; | ||
90 | memcpy(out,scratch.c,16); | ||
91 | #else | ||
92 | ((u64*)out)[0] = scratch.u[0]^=tweak.u[0]; | ||
93 | ((u64*)out)[1] = scratch.u[1]^=tweak.u[1]; | ||
94 | #endif | ||
95 | inp += 16; | ||
96 | out += 16; | ||
97 | len -= 16; | ||
98 | |||
99 | if (len==0) return 0; | ||
100 | |||
101 | if (is_endian.little) { | ||
102 | unsigned int carry,res; | ||
103 | |||
104 | res = 0x87&(((int)tweak.d[3])>>31); | ||
105 | carry = (unsigned int)(tweak.u[0]>>63); | ||
106 | tweak.u[0] = (tweak.u[0]<<1)^res; | ||
107 | tweak.u[1] = (tweak.u[1]<<1)|carry; | ||
108 | } | ||
109 | else { | ||
110 | size_t c; | ||
111 | |||
112 | for (c=0,i=0;i<16;++i) { | ||
113 | /*+ substitutes for |, because c is 1 bit */ | ||
114 | c += ((size_t)tweak.c[i])<<1; | ||
115 | tweak.c[i] = (u8)c; | ||
116 | c = c>>8; | ||
117 | } | ||
118 | tweak.c[0] ^= (u8)(0x87&(0-c)); | ||
119 | } | ||
120 | } | ||
121 | if (enc) { | ||
122 | for (i=0;i<len;++i) { | ||
123 | u8 c = inp[i]; | ||
124 | out[i] = scratch.c[i]; | ||
125 | scratch.c[i] = c; | ||
126 | } | ||
127 | scratch.u[0] ^= tweak.u[0]; | ||
128 | scratch.u[1] ^= tweak.u[1]; | ||
129 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
130 | scratch.u[0] ^= tweak.u[0]; | ||
131 | scratch.u[1] ^= tweak.u[1]; | ||
132 | memcpy(out-16,scratch.c,16); | ||
133 | } | ||
134 | else { | ||
135 | union { u64 u[2]; u8 c[16]; } tweak1; | ||
136 | |||
137 | if (is_endian.little) { | ||
138 | unsigned int carry,res; | ||
139 | |||
140 | res = 0x87&(((int)tweak.d[3])>>31); | ||
141 | carry = (unsigned int)(tweak.u[0]>>63); | ||
142 | tweak1.u[0] = (tweak.u[0]<<1)^res; | ||
143 | tweak1.u[1] = (tweak.u[1]<<1)|carry; | ||
144 | } | ||
145 | else { | ||
146 | size_t c; | ||
147 | |||
148 | for (c=0,i=0;i<16;++i) { | ||
149 | /*+ substitutes for |, because c is 1 bit */ | ||
150 | c += ((size_t)tweak.c[i])<<1; | ||
151 | tweak1.c[i] = (u8)c; | ||
152 | c = c>>8; | ||
153 | } | ||
154 | tweak1.c[0] ^= (u8)(0x87&(0-c)); | ||
155 | } | ||
156 | #if defined(STRICT_ALIGNMENT) | ||
157 | memcpy(scratch.c,inp,16); | ||
158 | scratch.u[0] ^= tweak1.u[0]; | ||
159 | scratch.u[1] ^= tweak1.u[1]; | ||
160 | #else | ||
161 | scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0]; | ||
162 | scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1]; | ||
163 | #endif | ||
164 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
165 | scratch.u[0] ^= tweak1.u[0]; | ||
166 | scratch.u[1] ^= tweak1.u[1]; | ||
167 | |||
168 | for (i=0;i<len;++i) { | ||
169 | u8 c = inp[16+i]; | ||
170 | out[16+i] = scratch.c[i]; | ||
171 | scratch.c[i] = c; | ||
172 | } | ||
173 | scratch.u[0] ^= tweak.u[0]; | ||
174 | scratch.u[1] ^= tweak.u[1]; | ||
175 | (*ctx->block1)(scratch.c,scratch.c,ctx->key1); | ||
176 | #if defined(STRICT_ALIGNMENT) | ||
177 | scratch.u[0] ^= tweak.u[0]; | ||
178 | scratch.u[1] ^= tweak.u[1]; | ||
179 | memcpy (out,scratch.c,16); | ||
180 | #else | ||
181 | ((u64*)out)[0] = scratch.u[0]^tweak.u[0]; | ||
182 | ((u64*)out)[1] = scratch.u[1]^tweak.u[1]; | ||
183 | #endif | ||
184 | } | ||
185 | |||
186 | return 0; | ||
187 | } | ||
diff --git a/src/lib/libcrypto/objects/obj_xref.c b/src/lib/libcrypto/objects/obj_xref.c index 152eca5c67..9f744bcede 100644 --- a/src/lib/libcrypto/objects/obj_xref.c +++ b/src/lib/libcrypto/objects/obj_xref.c | |||
@@ -110,8 +110,10 @@ int OBJ_find_sigid_algs(int signid, int *pdig_nid, int *ppkey_nid) | |||
110 | #endif | 110 | #endif |
111 | if (rv == NULL) | 111 | if (rv == NULL) |
112 | return 0; | 112 | return 0; |
113 | *pdig_nid = rv->hash_id; | 113 | if (pdig_nid) |
114 | *ppkey_nid = rv->pkey_id; | 114 | *pdig_nid = rv->hash_id; |
115 | if (ppkey_nid) | ||
116 | *ppkey_nid = rv->pkey_id; | ||
115 | return 1; | 117 | return 1; |
116 | } | 118 | } |
117 | 119 | ||
@@ -144,7 +146,8 @@ int OBJ_find_sigid_by_algs(int *psignid, int dig_nid, int pkey_nid) | |||
144 | #endif | 146 | #endif |
145 | if (rv == NULL) | 147 | if (rv == NULL) |
146 | return 0; | 148 | return 0; |
147 | *psignid = (*rv)->sign_id; | 149 | if (psignid) |
150 | *psignid = (*rv)->sign_id; | ||
148 | return 1; | 151 | return 1; |
149 | } | 152 | } |
150 | 153 | ||
diff --git a/src/lib/libcrypto/objects/obj_xref.h b/src/lib/libcrypto/objects/obj_xref.h index d5b9b8e198..e23938c296 100644 --- a/src/lib/libcrypto/objects/obj_xref.h +++ b/src/lib/libcrypto/objects/obj_xref.h | |||
@@ -38,10 +38,12 @@ static const nid_triple sigoid_srt[] = | |||
38 | {NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94}, | 38 | {NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94}, |
39 | {NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc}, | 39 | {NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc}, |
40 | {NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc}, | 40 | {NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc}, |
41 | {NID_rsassaPss, NID_undef, NID_rsaEncryption}, | ||
41 | }; | 42 | }; |
42 | 43 | ||
43 | static const nid_triple * const sigoid_srt_xref[] = | 44 | static const nid_triple * const sigoid_srt_xref[] = |
44 | { | 45 | { |
46 | &sigoid_srt[29], | ||
45 | &sigoid_srt[17], | 47 | &sigoid_srt[17], |
46 | &sigoid_srt[18], | 48 | &sigoid_srt[18], |
47 | &sigoid_srt[0], | 49 | &sigoid_srt[0], |
diff --git a/src/lib/libcrypto/objects/obj_xref.txt b/src/lib/libcrypto/objects/obj_xref.txt index e45b3d34b9..cb917182ee 100644 --- a/src/lib/libcrypto/objects/obj_xref.txt +++ b/src/lib/libcrypto/objects/obj_xref.txt | |||
@@ -13,6 +13,10 @@ sha512WithRSAEncryption sha512 rsaEncryption | |||
13 | sha224WithRSAEncryption sha224 rsaEncryption | 13 | sha224WithRSAEncryption sha224 rsaEncryption |
14 | mdc2WithRSA mdc2 rsaEncryption | 14 | mdc2WithRSA mdc2 rsaEncryption |
15 | ripemd160WithRSA ripemd160 rsaEncryption | 15 | ripemd160WithRSA ripemd160 rsaEncryption |
16 | # For PSS the digest algorithm can vary and depends on the included | ||
17 | # AlgorithmIdentifier. The digest "undef" indicates the public key | ||
18 | # method should handle this explicitly. | ||
19 | rsassaPss undef rsaEncryption | ||
16 | 20 | ||
17 | # Alternative deprecated OIDs. By using the older "rsa" OID this | 21 | # Alternative deprecated OIDs. By using the older "rsa" OID this |
18 | # type will be recognized by not normally used. | 22 | # type will be recognized by not normally used. |
diff --git a/src/lib/libcrypto/pariscid.pl b/src/lib/libcrypto/pariscid.pl new file mode 100644 index 0000000000..477ec9b87d --- /dev/null +++ b/src/lib/libcrypto/pariscid.pl | |||
@@ -0,0 +1,224 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | $flavour = shift; | ||
4 | $output = shift; | ||
5 | open STDOUT,">$output"; | ||
6 | |||
7 | if ($flavour =~ /64/) { | ||
8 | $LEVEL ="2.0W"; | ||
9 | $SIZE_T =8; | ||
10 | $ST ="std"; | ||
11 | } else { | ||
12 | $LEVEL ="1.1"; | ||
13 | $SIZE_T =4; | ||
14 | $ST ="stw"; | ||
15 | } | ||
16 | |||
17 | $rp="%r2"; | ||
18 | $sp="%r30"; | ||
19 | $rv="%r28"; | ||
20 | |||
21 | $code=<<___; | ||
22 | .LEVEL $LEVEL | ||
23 | .SPACE \$TEXT\$ | ||
24 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
25 | |||
26 | .EXPORT OPENSSL_cpuid_setup,ENTRY | ||
27 | .ALIGN 8 | ||
28 | OPENSSL_cpuid_setup | ||
29 | .PROC | ||
30 | .CALLINFO NO_CALLS | ||
31 | .ENTRY | ||
32 | bv ($rp) | ||
33 | .EXIT | ||
34 | nop | ||
35 | .PROCEND | ||
36 | |||
37 | .EXPORT OPENSSL_rdtsc,ENTRY | ||
38 | .ALIGN 8 | ||
39 | OPENSSL_rdtsc | ||
40 | .PROC | ||
41 | .CALLINFO NO_CALLS | ||
42 | .ENTRY | ||
43 | mfctl %cr16,$rv | ||
44 | bv ($rp) | ||
45 | .EXIT | ||
46 | nop | ||
47 | .PROCEND | ||
48 | |||
49 | .EXPORT OPENSSL_wipe_cpu,ENTRY | ||
50 | .ALIGN 8 | ||
51 | OPENSSL_wipe_cpu | ||
52 | .PROC | ||
53 | .CALLINFO NO_CALLS | ||
54 | .ENTRY | ||
55 | xor %r0,%r0,%r1 | ||
56 | fcpy,dbl %fr0,%fr4 | ||
57 | xor %r0,%r0,%r19 | ||
58 | fcpy,dbl %fr0,%fr5 | ||
59 | xor %r0,%r0,%r20 | ||
60 | fcpy,dbl %fr0,%fr6 | ||
61 | xor %r0,%r0,%r21 | ||
62 | fcpy,dbl %fr0,%fr7 | ||
63 | xor %r0,%r0,%r22 | ||
64 | fcpy,dbl %fr0,%fr8 | ||
65 | xor %r0,%r0,%r23 | ||
66 | fcpy,dbl %fr0,%fr9 | ||
67 | xor %r0,%r0,%r24 | ||
68 | fcpy,dbl %fr0,%fr10 | ||
69 | xor %r0,%r0,%r25 | ||
70 | fcpy,dbl %fr0,%fr11 | ||
71 | xor %r0,%r0,%r26 | ||
72 | fcpy,dbl %fr0,%fr22 | ||
73 | xor %r0,%r0,%r29 | ||
74 | fcpy,dbl %fr0,%fr23 | ||
75 | xor %r0,%r0,%r31 | ||
76 | fcpy,dbl %fr0,%fr24 | ||
77 | fcpy,dbl %fr0,%fr25 | ||
78 | fcpy,dbl %fr0,%fr26 | ||
79 | fcpy,dbl %fr0,%fr27 | ||
80 | fcpy,dbl %fr0,%fr28 | ||
81 | fcpy,dbl %fr0,%fr29 | ||
82 | fcpy,dbl %fr0,%fr30 | ||
83 | fcpy,dbl %fr0,%fr31 | ||
84 | bv ($rp) | ||
85 | .EXIT | ||
86 | ldo 0($sp),$rv | ||
87 | .PROCEND | ||
88 | ___ | ||
89 | { | ||
90 | my $inp="%r26"; | ||
91 | my $len="%r25"; | ||
92 | |||
93 | $code.=<<___; | ||
94 | .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR | ||
95 | .ALIGN 8 | ||
96 | OPENSSL_cleanse | ||
97 | .PROC | ||
98 | .CALLINFO NO_CALLS | ||
99 | .ENTRY | ||
100 | cmpib,*= 0,$len,Ldone | ||
101 | nop | ||
102 | cmpib,*>>= 15,$len,Little | ||
103 | ldi $SIZE_T-1,%r1 | ||
104 | |||
105 | Lalign | ||
106 | and,*<> $inp,%r1,%r28 | ||
107 | b,n Laligned | ||
108 | stb %r0,0($inp) | ||
109 | ldo -1($len),$len | ||
110 | b Lalign | ||
111 | ldo 1($inp),$inp | ||
112 | |||
113 | Laligned | ||
114 | andcm $len,%r1,%r28 | ||
115 | Lot | ||
116 | $ST %r0,0($inp) | ||
117 | addib,*<> -$SIZE_T,%r28,Lot | ||
118 | ldo $SIZE_T($inp),$inp | ||
119 | |||
120 | and,*<> $len,%r1,$len | ||
121 | b,n Ldone | ||
122 | Little | ||
123 | stb %r0,0($inp) | ||
124 | addib,*<> -1,$len,Little | ||
125 | ldo 1($inp),$inp | ||
126 | Ldone | ||
127 | bv ($rp) | ||
128 | .EXIT | ||
129 | nop | ||
130 | .PROCEND | ||
131 | ___ | ||
132 | } | ||
133 | { | ||
134 | my ($out,$cnt,$max)=("%r26","%r25","%r24"); | ||
135 | my ($tick,$lasttick)=("%r23","%r22"); | ||
136 | my ($diff,$lastdiff)=("%r21","%r20"); | ||
137 | |||
138 | $code.=<<___; | ||
139 | .EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR | ||
140 | .ALIGN 8 | ||
141 | OPENSSL_instrument_bus | ||
142 | .PROC | ||
143 | .CALLINFO NO_CALLS | ||
144 | .ENTRY | ||
145 | copy $cnt,$rv | ||
146 | mfctl %cr16,$tick | ||
147 | copy $tick,$lasttick | ||
148 | ldi 0,$diff | ||
149 | |||
150 | fdc 0($out) | ||
151 | ldw 0($out),$tick | ||
152 | add $diff,$tick,$tick | ||
153 | stw $tick,0($out) | ||
154 | Loop | ||
155 | mfctl %cr16,$tick | ||
156 | sub $tick,$lasttick,$diff | ||
157 | copy $tick,$lasttick | ||
158 | |||
159 | fdc 0($out) | ||
160 | ldw 0($out),$tick | ||
161 | add $diff,$tick,$tick | ||
162 | stw $tick,0($out) | ||
163 | |||
164 | addib,<> -1,$cnt,Loop | ||
165 | addi 4,$out,$out | ||
166 | |||
167 | bv ($rp) | ||
168 | .EXIT | ||
169 | sub $rv,$cnt,$rv | ||
170 | .PROCEND | ||
171 | |||
172 | .EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR | ||
173 | .ALIGN 8 | ||
174 | OPENSSL_instrument_bus2 | ||
175 | .PROC | ||
176 | .CALLINFO NO_CALLS | ||
177 | .ENTRY | ||
178 | copy $cnt,$rv | ||
179 | sub %r0,$cnt,$cnt | ||
180 | |||
181 | mfctl %cr16,$tick | ||
182 | copy $tick,$lasttick | ||
183 | ldi 0,$diff | ||
184 | |||
185 | fdc 0($out) | ||
186 | ldw 0($out),$tick | ||
187 | add $diff,$tick,$tick | ||
188 | stw $tick,0($out) | ||
189 | |||
190 | mfctl %cr16,$tick | ||
191 | sub $tick,$lasttick,$diff | ||
192 | copy $tick,$lasttick | ||
193 | Loop2 | ||
194 | copy $diff,$lastdiff | ||
195 | fdc 0($out) | ||
196 | ldw 0($out),$tick | ||
197 | add $diff,$tick,$tick | ||
198 | stw $tick,0($out) | ||
199 | |||
200 | addib,= -1,$max,Ldone2 | ||
201 | nop | ||
202 | |||
203 | mfctl %cr16,$tick | ||
204 | sub $tick,$lasttick,$diff | ||
205 | copy $tick,$lasttick | ||
206 | cmpclr,<> $lastdiff,$diff,$tick | ||
207 | ldi 1,$tick | ||
208 | |||
209 | ldi 1,%r1 | ||
210 | xor %r1,$tick,$tick | ||
211 | addb,<> $tick,$cnt,Loop2 | ||
212 | shladd,l $tick,2,$out,$out | ||
213 | Ldone2 | ||
214 | bv ($rp) | ||
215 | .EXIT | ||
216 | add $rv,$cnt,$rv | ||
217 | .PROCEND | ||
218 | ___ | ||
219 | } | ||
220 | $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); | ||
221 | $code =~ s/,\*/,/gm if ($SIZE_T==4); | ||
222 | print $code; | ||
223 | close STDOUT; | ||
224 | |||
diff --git a/src/lib/libcrypto/pem/pvkfmt.c b/src/lib/libcrypto/pem/pvkfmt.c index 5f130c4528..b1bf71a5da 100644 --- a/src/lib/libcrypto/pem/pvkfmt.c +++ b/src/lib/libcrypto/pem/pvkfmt.c | |||
@@ -709,13 +709,16 @@ static int derive_pvk_key(unsigned char *key, | |||
709 | const unsigned char *pass, int passlen) | 709 | const unsigned char *pass, int passlen) |
710 | { | 710 | { |
711 | EVP_MD_CTX mctx; | 711 | EVP_MD_CTX mctx; |
712 | int rv = 1; | ||
712 | EVP_MD_CTX_init(&mctx); | 713 | EVP_MD_CTX_init(&mctx); |
713 | EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL); | 714 | if (!EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL) |
714 | EVP_DigestUpdate(&mctx, salt, saltlen); | 715 | || !EVP_DigestUpdate(&mctx, salt, saltlen) |
715 | EVP_DigestUpdate(&mctx, pass, passlen); | 716 | || !EVP_DigestUpdate(&mctx, pass, passlen) |
716 | EVP_DigestFinal_ex(&mctx, key, NULL); | 717 | || !EVP_DigestFinal_ex(&mctx, key, NULL)) |
718 | rv = 0; | ||
719 | |||
717 | EVP_MD_CTX_cleanup(&mctx); | 720 | EVP_MD_CTX_cleanup(&mctx); |
718 | return 1; | 721 | return rv; |
719 | } | 722 | } |
720 | 723 | ||
721 | 724 | ||
@@ -727,11 +730,12 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in, | |||
727 | const unsigned char *p = *in; | 730 | const unsigned char *p = *in; |
728 | unsigned int magic; | 731 | unsigned int magic; |
729 | unsigned char *enctmp = NULL, *q; | 732 | unsigned char *enctmp = NULL, *q; |
733 | EVP_CIPHER_CTX cctx; | ||
734 | EVP_CIPHER_CTX_init(&cctx); | ||
730 | if (saltlen) | 735 | if (saltlen) |
731 | { | 736 | { |
732 | char psbuf[PEM_BUFSIZE]; | 737 | char psbuf[PEM_BUFSIZE]; |
733 | unsigned char keybuf[20]; | 738 | unsigned char keybuf[20]; |
734 | EVP_CIPHER_CTX cctx; | ||
735 | int enctmplen, inlen; | 739 | int enctmplen, inlen; |
736 | if (cb) | 740 | if (cb) |
737 | inlen=cb(psbuf,PEM_BUFSIZE,0,u); | 741 | inlen=cb(psbuf,PEM_BUFSIZE,0,u); |
@@ -757,37 +761,41 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in, | |||
757 | p += 8; | 761 | p += 8; |
758 | inlen = keylen - 8; | 762 | inlen = keylen - 8; |
759 | q = enctmp + 8; | 763 | q = enctmp + 8; |
760 | EVP_CIPHER_CTX_init(&cctx); | 764 | if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL)) |
761 | EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL); | 765 | goto err; |
762 | EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen); | 766 | if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen)) |
763 | EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen); | 767 | goto err; |
768 | if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen)) | ||
769 | goto err; | ||
764 | magic = read_ledword((const unsigned char **)&q); | 770 | magic = read_ledword((const unsigned char **)&q); |
765 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) | 771 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) |
766 | { | 772 | { |
767 | q = enctmp + 8; | 773 | q = enctmp + 8; |
768 | memset(keybuf + 5, 0, 11); | 774 | memset(keybuf + 5, 0, 11); |
769 | EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, | 775 | if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, |
770 | NULL); | 776 | NULL)) |
777 | goto err; | ||
771 | OPENSSL_cleanse(keybuf, 20); | 778 | OPENSSL_cleanse(keybuf, 20); |
772 | EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen); | 779 | if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen)) |
773 | EVP_DecryptFinal_ex(&cctx, q + enctmplen, | 780 | goto err; |
774 | &enctmplen); | 781 | if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, |
782 | &enctmplen)) | ||
783 | goto err; | ||
775 | magic = read_ledword((const unsigned char **)&q); | 784 | magic = read_ledword((const unsigned char **)&q); |
776 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) | 785 | if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) |
777 | { | 786 | { |
778 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
779 | PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT); | 787 | PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT); |
780 | goto err; | 788 | goto err; |
781 | } | 789 | } |
782 | } | 790 | } |
783 | else | 791 | else |
784 | OPENSSL_cleanse(keybuf, 20); | 792 | OPENSSL_cleanse(keybuf, 20); |
785 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
786 | p = enctmp; | 793 | p = enctmp; |
787 | } | 794 | } |
788 | 795 | ||
789 | ret = b2i_PrivateKey(&p, keylen); | 796 | ret = b2i_PrivateKey(&p, keylen); |
790 | err: | 797 | err: |
798 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
791 | if (enctmp && saltlen) | 799 | if (enctmp && saltlen) |
792 | OPENSSL_free(enctmp); | 800 | OPENSSL_free(enctmp); |
793 | return ret; | 801 | return ret; |
@@ -841,6 +849,8 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, | |||
841 | { | 849 | { |
842 | int outlen = 24, pklen; | 850 | int outlen = 24, pklen; |
843 | unsigned char *p, *salt = NULL; | 851 | unsigned char *p, *salt = NULL; |
852 | EVP_CIPHER_CTX cctx; | ||
853 | EVP_CIPHER_CTX_init(&cctx); | ||
844 | if (enclevel) | 854 | if (enclevel) |
845 | outlen += PVK_SALTLEN; | 855 | outlen += PVK_SALTLEN; |
846 | pklen = do_i2b(NULL, pk, 0); | 856 | pklen = do_i2b(NULL, pk, 0); |
@@ -885,7 +895,6 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, | |||
885 | { | 895 | { |
886 | char psbuf[PEM_BUFSIZE]; | 896 | char psbuf[PEM_BUFSIZE]; |
887 | unsigned char keybuf[20]; | 897 | unsigned char keybuf[20]; |
888 | EVP_CIPHER_CTX cctx; | ||
889 | int enctmplen, inlen; | 898 | int enctmplen, inlen; |
890 | if (cb) | 899 | if (cb) |
891 | inlen=cb(psbuf,PEM_BUFSIZE,1,u); | 900 | inlen=cb(psbuf,PEM_BUFSIZE,1,u); |
@@ -902,16 +911,19 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, | |||
902 | if (enclevel == 1) | 911 | if (enclevel == 1) |
903 | memset(keybuf + 5, 0, 11); | 912 | memset(keybuf + 5, 0, 11); |
904 | p = salt + PVK_SALTLEN + 8; | 913 | p = salt + PVK_SALTLEN + 8; |
905 | EVP_CIPHER_CTX_init(&cctx); | 914 | if (!EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL)) |
906 | EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL); | 915 | goto error; |
907 | OPENSSL_cleanse(keybuf, 20); | 916 | OPENSSL_cleanse(keybuf, 20); |
908 | EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8); | 917 | if (!EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8)) |
909 | EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen); | 918 | goto error; |
910 | EVP_CIPHER_CTX_cleanup(&cctx); | 919 | if (!EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen)) |
920 | goto error; | ||
911 | } | 921 | } |
922 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
912 | return outlen; | 923 | return outlen; |
913 | 924 | ||
914 | error: | 925 | error: |
926 | EVP_CIPHER_CTX_cleanup(&cctx); | ||
915 | return -1; | 927 | return -1; |
916 | } | 928 | } |
917 | 929 | ||
diff --git a/src/lib/libcrypto/perlasm/ppc-xlate.pl b/src/lib/libcrypto/perlasm/ppc-xlate.pl index 4579671c97..a3edd982b6 100755 --- a/src/lib/libcrypto/perlasm/ppc-xlate.pl +++ b/src/lib/libcrypto/perlasm/ppc-xlate.pl | |||
@@ -31,10 +31,9 @@ my $globl = sub { | |||
31 | $ret .= ".type $name,\@function"; | 31 | $ret .= ".type $name,\@function"; |
32 | last; | 32 | last; |
33 | }; | 33 | }; |
34 | /linux.*64/ && do { $ret .= ".globl .$name\n"; | 34 | /linux.*64/ && do { $ret .= ".globl $name\n"; |
35 | $ret .= ".type .$name,\@function\n"; | 35 | $ret .= ".type $name,\@function\n"; |
36 | $ret .= ".section \".opd\",\"aw\"\n"; | 36 | $ret .= ".section \".opd\",\"aw\"\n"; |
37 | $ret .= ".globl $name\n"; | ||
38 | $ret .= ".align 3\n"; | 37 | $ret .= ".align 3\n"; |
39 | $ret .= "$name:\n"; | 38 | $ret .= "$name:\n"; |
40 | $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; | 39 | $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; |
@@ -62,6 +61,14 @@ my $machine = sub { | |||
62 | } | 61 | } |
63 | ".machine $arch"; | 62 | ".machine $arch"; |
64 | }; | 63 | }; |
64 | my $size = sub { | ||
65 | if ($flavour =~ /linux.*32/) | ||
66 | { shift; | ||
67 | ".size " . join(",",@_); | ||
68 | } | ||
69 | else | ||
70 | { ""; } | ||
71 | }; | ||
65 | my $asciz = sub { | 72 | my $asciz = sub { |
66 | shift; | 73 | shift; |
67 | my $line = join(",",@_); | 74 | my $line = join(",",@_); |
diff --git a/src/lib/libcrypto/ppccap.c b/src/lib/libcrypto/ppccap.c new file mode 100644 index 0000000000..ab89ccaa12 --- /dev/null +++ b/src/lib/libcrypto/ppccap.c | |||
@@ -0,0 +1,115 @@ | |||
1 | #include <stdio.h> | ||
2 | #include <stdlib.h> | ||
3 | #include <string.h> | ||
4 | #include <setjmp.h> | ||
5 | #include <signal.h> | ||
6 | #include <crypto.h> | ||
7 | #include <openssl/bn.h> | ||
8 | |||
9 | #define PPC_FPU64 (1<<0) | ||
10 | #define PPC_ALTIVEC (1<<1) | ||
11 | |||
12 | static int OPENSSL_ppccap_P = 0; | ||
13 | |||
14 | static sigset_t all_masked; | ||
15 | |||
16 | #ifdef OPENSSL_BN_ASM_MONT | ||
17 | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num) | ||
18 | { | ||
19 | int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); | ||
20 | int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); | ||
21 | |||
22 | if (sizeof(size_t)==4) | ||
23 | { | ||
24 | #if (defined(__APPLE__) && defined(__MACH__)) | ||
25 | if (num>=8 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64)) | ||
26 | return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); | ||
27 | #else | ||
28 | /* boundary of 32 was experimentally determined on | ||
29 | Linux 2.6.22, might have to be adjusted on AIX... */ | ||
30 | if (num>=32 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64)) | ||
31 | { | ||
32 | sigset_t oset; | ||
33 | int ret; | ||
34 | |||
35 | sigprocmask(SIG_SETMASK,&all_masked,&oset); | ||
36 | ret=bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); | ||
37 | sigprocmask(SIG_SETMASK,&oset,NULL); | ||
38 | |||
39 | return ret; | ||
40 | } | ||
41 | #endif | ||
42 | } | ||
43 | else if ((OPENSSL_ppccap_P&PPC_FPU64)) | ||
44 | /* this is a "must" on POWER6, but run-time detection | ||
45 | * is not implemented yet... */ | ||
46 | return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); | ||
47 | |||
48 | return bn_mul_mont_int(rp,ap,bp,np,n0,num); | ||
49 | } | ||
50 | #endif | ||
51 | |||
52 | static sigjmp_buf ill_jmp; | ||
53 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } | ||
54 | |||
55 | void OPENSSL_ppc64_probe(void); | ||
56 | |||
57 | void OPENSSL_cpuid_setup(void) | ||
58 | { | ||
59 | char *e; | ||
60 | struct sigaction ill_oact,ill_act; | ||
61 | sigset_t oset; | ||
62 | static int trigger=0; | ||
63 | |||
64 | if (trigger) return; | ||
65 | trigger=1; | ||
66 | |||
67 | sigfillset(&all_masked); | ||
68 | sigdelset(&all_masked,SIGILL); | ||
69 | sigdelset(&all_masked,SIGTRAP); | ||
70 | #ifdef SIGEMT | ||
71 | sigdelset(&all_masked,SIGEMT); | ||
72 | #endif | ||
73 | sigdelset(&all_masked,SIGFPE); | ||
74 | sigdelset(&all_masked,SIGBUS); | ||
75 | sigdelset(&all_masked,SIGSEGV); | ||
76 | |||
77 | if ((e=getenv("OPENSSL_ppccap"))) | ||
78 | { | ||
79 | OPENSSL_ppccap_P=strtoul(e,NULL,0); | ||
80 | return; | ||
81 | } | ||
82 | |||
83 | OPENSSL_ppccap_P = 0; | ||
84 | |||
85 | memset(&ill_act,0,sizeof(ill_act)); | ||
86 | ill_act.sa_handler = ill_handler; | ||
87 | ill_act.sa_mask = all_masked; | ||
88 | |||
89 | sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); | ||
90 | sigaction(SIGILL,&ill_act,&ill_oact); | ||
91 | |||
92 | if (sizeof(size_t)==4) | ||
93 | { | ||
94 | if (sigsetjmp(ill_jmp,1) == 0) | ||
95 | { | ||
96 | OPENSSL_ppc64_probe(); | ||
97 | OPENSSL_ppccap_P |= PPC_FPU64; | ||
98 | } | ||
99 | } | ||
100 | else | ||
101 | { | ||
102 | /* | ||
103 | * Wanted code detecting POWER6 CPU and setting PPC_FPU64 | ||
104 | */ | ||
105 | } | ||
106 | |||
107 | if (sigsetjmp(ill_jmp,1) == 0) | ||
108 | { | ||
109 | OPENSSL_altivec_probe(); | ||
110 | OPENSSL_ppccap_P |= PPC_ALTIVEC; | ||
111 | } | ||
112 | |||
113 | sigaction (SIGILL,&ill_oact,NULL); | ||
114 | sigprocmask(SIG_SETMASK,&oset,NULL); | ||
115 | } | ||
diff --git a/src/lib/libcrypto/ppccpuid.pl b/src/lib/libcrypto/ppccpuid.pl index 369e1d0df9..4ba736a1d1 100755 --- a/src/lib/libcrypto/ppccpuid.pl +++ b/src/lib/libcrypto/ppccpuid.pl | |||
@@ -23,36 +23,67 @@ $code=<<___; | |||
23 | .machine "any" | 23 | .machine "any" |
24 | .text | 24 | .text |
25 | 25 | ||
26 | .globl .OPENSSL_cpuid_setup | 26 | .globl .OPENSSL_ppc64_probe |
27 | .align 4 | 27 | .align 4 |
28 | .OPENSSL_cpuid_setup: | 28 | .OPENSSL_ppc64_probe: |
29 | fcfid f1,f1 | ||
30 | extrdi r0,r0,32,0 | ||
29 | blr | 31 | blr |
32 | .long 0 | ||
33 | .byte 0,12,0x14,0,0,0,0,0 | ||
34 | |||
35 | .globl .OPENSSL_altivec_probe | ||
36 | .align 4 | ||
37 | .OPENSSL_altivec_probe: | ||
38 | .long 0x10000484 # vor v0,v0,v0 | ||
39 | blr | ||
40 | .long 0 | ||
41 | .byte 0,12,0x14,0,0,0,0,0 | ||
30 | 42 | ||
31 | .globl .OPENSSL_wipe_cpu | 43 | .globl .OPENSSL_wipe_cpu |
32 | .align 4 | 44 | .align 4 |
33 | .OPENSSL_wipe_cpu: | 45 | .OPENSSL_wipe_cpu: |
34 | xor r0,r0,r0 | 46 | xor r0,r0,r0 |
47 | fmr f0,f31 | ||
48 | fmr f1,f31 | ||
49 | fmr f2,f31 | ||
35 | mr r3,r1 | 50 | mr r3,r1 |
51 | fmr f3,f31 | ||
36 | xor r4,r4,r4 | 52 | xor r4,r4,r4 |
53 | fmr f4,f31 | ||
37 | xor r5,r5,r5 | 54 | xor r5,r5,r5 |
55 | fmr f5,f31 | ||
38 | xor r6,r6,r6 | 56 | xor r6,r6,r6 |
57 | fmr f6,f31 | ||
39 | xor r7,r7,r7 | 58 | xor r7,r7,r7 |
59 | fmr f7,f31 | ||
40 | xor r8,r8,r8 | 60 | xor r8,r8,r8 |
61 | fmr f8,f31 | ||
41 | xor r9,r9,r9 | 62 | xor r9,r9,r9 |
63 | fmr f9,f31 | ||
42 | xor r10,r10,r10 | 64 | xor r10,r10,r10 |
65 | fmr f10,f31 | ||
43 | xor r11,r11,r11 | 66 | xor r11,r11,r11 |
67 | fmr f11,f31 | ||
44 | xor r12,r12,r12 | 68 | xor r12,r12,r12 |
69 | fmr f12,f31 | ||
70 | fmr f13,f31 | ||
45 | blr | 71 | blr |
72 | .long 0 | ||
73 | .byte 0,12,0x14,0,0,0,0,0 | ||
46 | 74 | ||
47 | .globl .OPENSSL_atomic_add | 75 | .globl .OPENSSL_atomic_add |
48 | .align 4 | 76 | .align 4 |
49 | .OPENSSL_atomic_add: | 77 | .OPENSSL_atomic_add: |
50 | Loop: lwarx r5,0,r3 | 78 | Ladd: lwarx r5,0,r3 |
51 | add r0,r4,r5 | 79 | add r0,r4,r5 |
52 | stwcx. r0,0,r3 | 80 | stwcx. r0,0,r3 |
53 | bne- Loop | 81 | bne- Ladd |
54 | $SIGNX r3,r0 | 82 | $SIGNX r3,r0 |
55 | blr | 83 | blr |
84 | .long 0 | ||
85 | .byte 0,12,0x14,0,0,0,2,0 | ||
86 | .long 0 | ||
56 | 87 | ||
57 | .globl .OPENSSL_rdtsc | 88 | .globl .OPENSSL_rdtsc |
58 | .align 4 | 89 | .align 4 |
@@ -60,6 +91,8 @@ Loop: lwarx r5,0,r3 | |||
60 | mftb r3 | 91 | mftb r3 |
61 | mftbu r4 | 92 | mftbu r4 |
62 | blr | 93 | blr |
94 | .long 0 | ||
95 | .byte 0,12,0x14,0,0,0,0,0 | ||
63 | 96 | ||
64 | .globl .OPENSSL_cleanse | 97 | .globl .OPENSSL_cleanse |
65 | .align 4 | 98 | .align 4 |
@@ -72,7 +105,7 @@ Loop: lwarx r5,0,r3 | |||
72 | Little: mtctr r4 | 105 | Little: mtctr r4 |
73 | stb r0,0(r3) | 106 | stb r0,0(r3) |
74 | addi r3,r3,1 | 107 | addi r3,r3,1 |
75 | bdnz- \$-8 | 108 | bdnz \$-8 |
76 | blr | 109 | blr |
77 | Lot: andi. r5,r3,3 | 110 | Lot: andi. r5,r3,3 |
78 | beq Laligned | 111 | beq Laligned |
@@ -85,10 +118,13 @@ Laligned: | |||
85 | mtctr r5 | 118 | mtctr r5 |
86 | stw r0,0(r3) | 119 | stw r0,0(r3) |
87 | addi r3,r3,4 | 120 | addi r3,r3,4 |
88 | bdnz- \$-8 | 121 | bdnz \$-8 |
89 | andi. r4,r4,3 | 122 | andi. r4,r4,3 |
90 | bne Little | 123 | bne Little |
91 | blr | 124 | blr |
125 | .long 0 | ||
126 | .byte 0,12,0x14,0,0,0,2,0 | ||
127 | .long 0 | ||
92 | ___ | 128 | ___ |
93 | 129 | ||
94 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 130 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl new file mode 100644 index 0000000000..7f684092d4 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl | |||
@@ -0,0 +1,631 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # June 2011 | ||
11 | # | ||
12 | # This is RC4+MD5 "stitch" implementation. The idea, as spelled in | ||
13 | # http://download.intel.com/design/intarch/papers/323686.pdf, is that | ||
14 | # since both algorithms exhibit instruction-level parallelism, ILP, | ||
15 | # below theoretical maximum, interleaving them would allow to utilize | ||
16 | # processor resources better and achieve better performance. RC4 | ||
17 | # instruction sequence is virtually identical to rc4-x86_64.pl, which | ||
18 | # is heavily based on submission by Maxim Perminov, Maxim Locktyukhin | ||
19 | # and Jim Guilford of Intel. MD5 is fresh implementation aiming to | ||
20 | # minimize register usage, which was used as "main thread" with RC4 | ||
21 | # weaved into it, one RC4 round per one MD5 round. In addition to the | ||
22 | # stiched subroutine the script can generate standalone replacement | ||
23 | # md5_block_asm_data_order and RC4. Below are performance numbers in | ||
24 | # cycles per processed byte, less is better, for these the standalone | ||
25 | # subroutines, sum of them, and stitched one: | ||
26 | # | ||
27 | # RC4 MD5 RC4+MD5 stitch gain | ||
28 | # Opteron 6.5(*) 5.4 11.9 7.0 +70%(*) | ||
29 | # Core2 6.5 5.8 12.3 7.7 +60% | ||
30 | # Westmere 4.3 5.2 9.5 7.0 +36% | ||
31 | # Sandy Bridge 4.2 5.5 9.7 6.8 +43% | ||
32 | # Atom 9.3 6.5 15.8 11.1 +42% | ||
33 | # | ||
34 | # (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement | ||
35 | # is +53%... | ||
36 | |||
37 | my ($rc4,$md5)=(1,1); # what to generate? | ||
38 | my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(), | ||
39 | # but its result is discarded. Idea here is | ||
40 | # to be able to use 'openssl speed rc4' for | ||
41 | # benchmarking the stitched subroutine... | ||
42 | |||
43 | my $flavour = shift; | ||
44 | my $output = shift; | ||
45 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
46 | |||
47 | my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
48 | |||
49 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; | ||
50 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
51 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
52 | die "can't locate x86_64-xlate.pl"; | ||
53 | |||
54 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
55 | |||
56 | my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs); | ||
57 | |||
58 | if ($rc4 && !$md5) { | ||
59 | ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx"); | ||
60 | $func="RC4"; $nargs=4; | ||
61 | } elsif ($md5 && !$rc4) { | ||
62 | ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx"); | ||
63 | $func="md5_block_asm_data_order"; $nargs=3; | ||
64 | } else { | ||
65 | ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||
66 | $func="rc4_md5_enc"; $nargs=6; | ||
67 | # void rc4_md5_enc( | ||
68 | # RC4_KEY *key, # | ||
69 | # const void *in0, # RC4 input | ||
70 | # void *out, # RC4 output | ||
71 | # MD5_CTX *ctx, # | ||
72 | # const void *inp, # MD5 input | ||
73 | # size_t len); # number of 64-byte blocks | ||
74 | } | ||
75 | |||
76 | my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee, | ||
77 | 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501, | ||
78 | 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be, | ||
79 | 0x6b901122,0xfd987193,0xa679438e,0x49b40821, | ||
80 | |||
81 | 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa, | ||
82 | 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8, | ||
83 | 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed, | ||
84 | 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a, | ||
85 | |||
86 | 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c, | ||
87 | 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70, | ||
88 | 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05, | ||
89 | 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665, | ||
90 | |||
91 | 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039, | ||
92 | 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1, | ||
93 | 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1, | ||
94 | 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 ); | ||
95 | |||
96 | my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers | ||
97 | my $tmp="%r12d"; | ||
98 | |||
99 | my @XX=("%rbp","%rsi"); # RC4 registers | ||
100 | my @TX=("%rax","%rbx"); | ||
101 | my $YY="%rcx"; | ||
102 | my $TY="%rdx"; | ||
103 | |||
104 | my $MOD=32; # 16, 32 or 64 | ||
105 | |||
106 | $code.=<<___; | ||
107 | .text | ||
108 | .align 16 | ||
109 | |||
110 | .globl $func | ||
111 | .type $func,\@function,$nargs | ||
112 | $func: | ||
113 | cmp \$0,$len | ||
114 | je .Labort | ||
115 | push %rbx | ||
116 | push %rbp | ||
117 | push %r12 | ||
118 | push %r13 | ||
119 | push %r14 | ||
120 | push %r15 | ||
121 | sub \$40,%rsp | ||
122 | .Lbody: | ||
123 | ___ | ||
124 | if ($rc4) { | ||
125 | $code.=<<___; | ||
126 | $D#md5# mov $ctx,%r11 # reassign arguments | ||
127 | mov $len,%r12 | ||
128 | mov $in0,%r13 | ||
129 | mov $out,%r14 | ||
130 | $D#md5# mov $inp,%r15 | ||
131 | ___ | ||
132 | $ctx="%r11" if ($md5); # reassign arguments | ||
133 | $len="%r12"; | ||
134 | $in0="%r13"; | ||
135 | $out="%r14"; | ||
136 | $inp="%r15" if ($md5); | ||
137 | $inp=$in0 if (!$md5); | ||
138 | $code.=<<___; | ||
139 | xor $XX[0],$XX[0] | ||
140 | xor $YY,$YY | ||
141 | |||
142 | lea 8($dat),$dat | ||
143 | mov -8($dat),$XX[0]#b | ||
144 | mov -4($dat),$YY#b | ||
145 | |||
146 | inc $XX[0]#b | ||
147 | sub $in0,$out | ||
148 | movl ($dat,$XX[0],4),$TX[0]#d | ||
149 | ___ | ||
150 | $code.=<<___ if (!$md5); | ||
151 | xor $TX[1],$TX[1] | ||
152 | test \$-128,$len | ||
153 | jz .Loop1 | ||
154 | sub $XX[0],$TX[1] | ||
155 | and \$`$MOD-1`,$TX[1] | ||
156 | jz .Loop${MOD}_is_hot | ||
157 | sub $TX[1],$len | ||
158 | .Loop${MOD}_warmup: | ||
159 | add $TX[0]#b,$YY#b | ||
160 | movl ($dat,$YY,4),$TY#d | ||
161 | movl $TX[0]#d,($dat,$YY,4) | ||
162 | movl $TY#d,($dat,$XX[0],4) | ||
163 | add $TY#b,$TX[0]#b | ||
164 | inc $XX[0]#b | ||
165 | movl ($dat,$TX[0],4),$TY#d | ||
166 | movl ($dat,$XX[0],4),$TX[0]#d | ||
167 | xorb ($in0),$TY#b | ||
168 | movb $TY#b,($out,$in0) | ||
169 | lea 1($in0),$in0 | ||
170 | dec $TX[1] | ||
171 | jnz .Loop${MOD}_warmup | ||
172 | |||
173 | mov $YY,$TX[1] | ||
174 | xor $YY,$YY | ||
175 | mov $TX[1]#b,$YY#b | ||
176 | |||
177 | .Loop${MOD}_is_hot: | ||
178 | mov $len,32(%rsp) # save original $len | ||
179 | shr \$6,$len # number of 64-byte blocks | ||
180 | ___ | ||
181 | if ($D && !$md5) { # stitch in dummy MD5 | ||
182 | $md5=1; | ||
183 | $ctx="%r11"; | ||
184 | $inp="%r15"; | ||
185 | $code.=<<___; | ||
186 | mov %rsp,$ctx | ||
187 | mov $in0,$inp | ||
188 | ___ | ||
189 | } | ||
190 | } | ||
191 | $code.=<<___; | ||
192 | #rc4# add $TX[0]#b,$YY#b | ||
193 | #rc4# lea ($dat,$XX[0],4),$XX[1] | ||
194 | shl \$6,$len | ||
195 | add $inp,$len # pointer to the end of input | ||
196 | mov $len,16(%rsp) | ||
197 | |||
198 | #md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX | ||
199 | #md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX | ||
200 | #md5# mov 1*4($ctx),$V[1] | ||
201 | #md5# mov 2*4($ctx),$V[2] | ||
202 | #md5# mov 3*4($ctx),$V[3] | ||
203 | jmp .Loop | ||
204 | |||
205 | .align 16 | ||
206 | .Loop: | ||
207 | #md5# mov $V[0],0*4(%rsp) # put aside current hash value | ||
208 | #md5# mov $V[1],1*4(%rsp) | ||
209 | #md5# mov $V[2],2*4(%rsp) | ||
210 | #md5# mov $V[3],$tmp # forward reference | ||
211 | #md5# mov $V[3],3*4(%rsp) | ||
212 | ___ | ||
213 | |||
214 | sub R0 { | ||
215 | my ($i,$a,$b,$c,$d)=@_; | ||
216 | my @rot0=(7,12,17,22); | ||
217 | my $j=$i%16; | ||
218 | my $k=$i%$MOD; | ||
219 | my $xmm="%xmm".($j&1); | ||
220 | $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15); | ||
221 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
222 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
223 | $code.=<<___; | ||
224 | #rc4# movl ($dat,$YY,4),$TY#d | ||
225 | #md5# xor $c,$tmp | ||
226 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
227 | #md5# and $b,$tmp | ||
228 | #md5# add 4*`$j`($inp),$a | ||
229 | #rc4# add $TY#b,$TX[0]#b | ||
230 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
231 | #md5# add \$$K[$i],$a | ||
232 | #md5# xor $d,$tmp | ||
233 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
234 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
235 | #md5# add $tmp,$a | ||
236 | #rc4# add $TX[1]#b,$YY#b | ||
237 | #md5# rol \$$rot0[$j%4],$a | ||
238 | #md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference | ||
239 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
240 | #md5# add $b,$a | ||
241 | ___ | ||
242 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
243 | mov $YY,$XX[1] | ||
244 | xor $YY,$YY # keyword to partial register | ||
245 | mov $XX[1]#b,$YY#b | ||
246 | lea ($dat,$XX[0],4),$XX[1] | ||
247 | ___ | ||
248 | $code.=<<___ if ($rc4 && $j==15); | ||
249 | psllq \$8,%xmm1 | ||
250 | pxor %xmm0,%xmm2 | ||
251 | pxor %xmm1,%xmm2 | ||
252 | ___ | ||
253 | } | ||
254 | sub R1 { | ||
255 | my ($i,$a,$b,$c,$d)=@_; | ||
256 | my @rot1=(5,9,14,20); | ||
257 | my $j=$i%16; | ||
258 | my $k=$i%$MOD; | ||
259 | my $xmm="%xmm".($j&1); | ||
260 | $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15); | ||
261 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
262 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
263 | $code.=<<___; | ||
264 | #rc4# movl ($dat,$YY,4),$TY#d | ||
265 | #md5# xor $b,$tmp | ||
266 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
267 | #md5# and $d,$tmp | ||
268 | #md5# add 4*`((1+5*$j)%16)`($inp),$a | ||
269 | #rc4# add $TY#b,$TX[0]#b | ||
270 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
271 | #md5# add \$$K[$i],$a | ||
272 | #md5# xor $c,$tmp | ||
273 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
274 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
275 | #md5# add $tmp,$a | ||
276 | #rc4# add $TX[1]#b,$YY#b | ||
277 | #md5# rol \$$rot1[$j%4],$a | ||
278 | #md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference | ||
279 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
280 | #md5# add $b,$a | ||
281 | ___ | ||
282 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
283 | mov $YY,$XX[1] | ||
284 | xor $YY,$YY # keyword to partial register | ||
285 | mov $XX[1]#b,$YY#b | ||
286 | lea ($dat,$XX[0],4),$XX[1] | ||
287 | ___ | ||
288 | $code.=<<___ if ($rc4 && $j==15); | ||
289 | psllq \$8,%xmm1 | ||
290 | pxor %xmm0,%xmm3 | ||
291 | pxor %xmm1,%xmm3 | ||
292 | ___ | ||
293 | } | ||
294 | sub R2 { | ||
295 | my ($i,$a,$b,$c,$d)=@_; | ||
296 | my @rot2=(4,11,16,23); | ||
297 | my $j=$i%16; | ||
298 | my $k=$i%$MOD; | ||
299 | my $xmm="%xmm".($j&1); | ||
300 | $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15); | ||
301 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
302 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
303 | $code.=<<___; | ||
304 | #rc4# movl ($dat,$YY,4),$TY#d | ||
305 | #md5# xor $c,$tmp | ||
306 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
307 | #md5# xor $b,$tmp | ||
308 | #md5# add 4*`((5+3*$j)%16)`($inp),$a | ||
309 | #rc4# add $TY#b,$TX[0]#b | ||
310 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
311 | #md5# add \$$K[$i],$a | ||
312 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
313 | #md5# add $tmp,$a | ||
314 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
315 | #rc4# add $TX[1]#b,$YY#b | ||
316 | #md5# rol \$$rot2[$j%4],$a | ||
317 | #md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference | ||
318 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
319 | #md5# add $b,$a | ||
320 | ___ | ||
321 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
322 | mov $YY,$XX[1] | ||
323 | xor $YY,$YY # keyword to partial register | ||
324 | mov $XX[1]#b,$YY#b | ||
325 | lea ($dat,$XX[0],4),$XX[1] | ||
326 | ___ | ||
327 | $code.=<<___ if ($rc4 && $j==15); | ||
328 | psllq \$8,%xmm1 | ||
329 | pxor %xmm0,%xmm4 | ||
330 | pxor %xmm1,%xmm4 | ||
331 | ___ | ||
332 | } | ||
333 | sub R3 { | ||
334 | my ($i,$a,$b,$c,$d)=@_; | ||
335 | my @rot3=(6,10,15,21); | ||
336 | my $j=$i%16; | ||
337 | my $k=$i%$MOD; | ||
338 | my $xmm="%xmm".($j&1); | ||
339 | $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15); | ||
340 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
341 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
342 | $code.=<<___; | ||
343 | #rc4# movl ($dat,$YY,4),$TY#d | ||
344 | #md5# xor $d,$tmp | ||
345 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
346 | #md5# or $b,$tmp | ||
347 | #md5# add 4*`((7*$j)%16)`($inp),$a | ||
348 | #rc4# add $TY#b,$TX[0]#b | ||
349 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
350 | #md5# add \$$K[$i],$a | ||
351 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
352 | #md5# xor $c,$tmp | ||
353 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
354 | #md5# add $tmp,$a | ||
355 | #rc4# add $TX[1]#b,$YY#b | ||
356 | #md5# rol \$$rot3[$j%4],$a | ||
357 | #md5# mov \$-1,$tmp # forward reference | ||
358 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
359 | #md5# add $b,$a | ||
360 | ___ | ||
361 | $code.=<<___ if ($rc4 && $j==15); | ||
362 | mov $XX[0],$XX[1] | ||
363 | xor $XX[0],$XX[0] # keyword to partial register | ||
364 | mov $XX[1]#b,$XX[0]#b | ||
365 | mov $YY,$XX[1] | ||
366 | xor $YY,$YY # keyword to partial register | ||
367 | mov $XX[1]#b,$YY#b | ||
368 | lea ($dat,$XX[0],4),$XX[1] | ||
369 | psllq \$8,%xmm1 | ||
370 | pxor %xmm0,%xmm5 | ||
371 | pxor %xmm1,%xmm5 | ||
372 | ___ | ||
373 | } | ||
374 | |||
375 | my $i=0; | ||
376 | for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
377 | for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
378 | for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
379 | for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
380 | |||
381 | $code.=<<___; | ||
382 | #md5# add 0*4(%rsp),$V[0] # accumulate hash value | ||
383 | #md5# add 1*4(%rsp),$V[1] | ||
384 | #md5# add 2*4(%rsp),$V[2] | ||
385 | #md5# add 3*4(%rsp),$V[3] | ||
386 | |||
387 | #rc4# movdqu %xmm2,($out,$in0) # write RC4 output | ||
388 | #rc4# movdqu %xmm3,16($out,$in0) | ||
389 | #rc4# movdqu %xmm4,32($out,$in0) | ||
390 | #rc4# movdqu %xmm5,48($out,$in0) | ||
391 | #md5# lea 64($inp),$inp | ||
392 | #rc4# lea 64($in0),$in0 | ||
393 | cmp 16(%rsp),$inp # are we done? | ||
394 | jb .Loop | ||
395 | |||
396 | #md5# mov 24(%rsp),$len # restore pointer to MD5_CTX | ||
397 | #rc4# sub $TX[0]#b,$YY#b # correct $YY | ||
398 | #md5# mov $V[0],0*4($len) # write MD5_CTX | ||
399 | #md5# mov $V[1],1*4($len) | ||
400 | #md5# mov $V[2],2*4($len) | ||
401 | #md5# mov $V[3],3*4($len) | ||
402 | ___ | ||
403 | $code.=<<___ if ($rc4 && (!$md5 || $D)); | ||
404 | mov 32(%rsp),$len # restore original $len | ||
405 | and \$63,$len # remaining bytes | ||
406 | jnz .Loop1 | ||
407 | jmp .Ldone | ||
408 | |||
409 | .align 16 | ||
410 | .Loop1: | ||
411 | add $TX[0]#b,$YY#b | ||
412 | movl ($dat,$YY,4),$TY#d | ||
413 | movl $TX[0]#d,($dat,$YY,4) | ||
414 | movl $TY#d,($dat,$XX[0],4) | ||
415 | add $TY#b,$TX[0]#b | ||
416 | inc $XX[0]#b | ||
417 | movl ($dat,$TX[0],4),$TY#d | ||
418 | movl ($dat,$XX[0],4),$TX[0]#d | ||
419 | xorb ($in0),$TY#b | ||
420 | movb $TY#b,($out,$in0) | ||
421 | lea 1($in0),$in0 | ||
422 | dec $len | ||
423 | jnz .Loop1 | ||
424 | |||
425 | .Ldone: | ||
426 | ___ | ||
427 | $code.=<<___; | ||
428 | #rc4# sub \$1,$XX[0]#b | ||
429 | #rc4# movl $XX[0]#d,-8($dat) | ||
430 | #rc4# movl $YY#d,-4($dat) | ||
431 | |||
432 | mov 40(%rsp),%r15 | ||
433 | mov 48(%rsp),%r14 | ||
434 | mov 56(%rsp),%r13 | ||
435 | mov 64(%rsp),%r12 | ||
436 | mov 72(%rsp),%rbp | ||
437 | mov 80(%rsp),%rbx | ||
438 | lea 88(%rsp),%rsp | ||
439 | .Lepilogue: | ||
440 | .Labort: | ||
441 | ret | ||
442 | .size $func,.-$func | ||
443 | ___ | ||
444 | |||
445 | if ($rc4 && $D) { # sole purpose of this section is to provide | ||
446 | # option to use the generated module as drop-in | ||
447 | # replacement for rc4-x86_64.pl for debugging | ||
448 | # and testing purposes... | ||
449 | my ($idx,$ido)=("%r8","%r9"); | ||
450 | my ($dat,$len,$inp)=("%rdi","%rsi","%rdx"); | ||
451 | |||
452 | $code.=<<___; | ||
453 | .globl RC4_set_key | ||
454 | .type RC4_set_key,\@function,3 | ||
455 | .align 16 | ||
456 | RC4_set_key: | ||
457 | lea 8($dat),$dat | ||
458 | lea ($inp,$len),$inp | ||
459 | neg $len | ||
460 | mov $len,%rcx | ||
461 | xor %eax,%eax | ||
462 | xor $ido,$ido | ||
463 | xor %r10,%r10 | ||
464 | xor %r11,%r11 | ||
465 | jmp .Lw1stloop | ||
466 | |||
467 | .align 16 | ||
468 | .Lw1stloop: | ||
469 | mov %eax,($dat,%rax,4) | ||
470 | add \$1,%al | ||
471 | jnc .Lw1stloop | ||
472 | |||
473 | xor $ido,$ido | ||
474 | xor $idx,$idx | ||
475 | .align 16 | ||
476 | .Lw2ndloop: | ||
477 | mov ($dat,$ido,4),%r10d | ||
478 | add ($inp,$len,1),$idx#b | ||
479 | add %r10b,$idx#b | ||
480 | add \$1,$len | ||
481 | mov ($dat,$idx,4),%r11d | ||
482 | cmovz %rcx,$len | ||
483 | mov %r10d,($dat,$idx,4) | ||
484 | mov %r11d,($dat,$ido,4) | ||
485 | add \$1,$ido#b | ||
486 | jnc .Lw2ndloop | ||
487 | |||
488 | xor %eax,%eax | ||
489 | mov %eax,-8($dat) | ||
490 | mov %eax,-4($dat) | ||
491 | ret | ||
492 | .size RC4_set_key,.-RC4_set_key | ||
493 | |||
494 | .globl RC4_options | ||
495 | .type RC4_options,\@abi-omnipotent | ||
496 | .align 16 | ||
497 | RC4_options: | ||
498 | lea .Lopts(%rip),%rax | ||
499 | ret | ||
500 | .align 64 | ||
501 | .Lopts: | ||
502 | .asciz "rc4(64x,int)" | ||
503 | .align 64 | ||
504 | .size RC4_options,.-RC4_options | ||
505 | ___ | ||
506 | } | ||
507 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
508 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
509 | if ($win64) { | ||
510 | my $rec="%rcx"; | ||
511 | my $frame="%rdx"; | ||
512 | my $context="%r8"; | ||
513 | my $disp="%r9"; | ||
514 | |||
515 | $code.=<<___; | ||
516 | .extern __imp_RtlVirtualUnwind | ||
517 | .type se_handler,\@abi-omnipotent | ||
518 | .align 16 | ||
519 | se_handler: | ||
520 | push %rsi | ||
521 | push %rdi | ||
522 | push %rbx | ||
523 | push %rbp | ||
524 | push %r12 | ||
525 | push %r13 | ||
526 | push %r14 | ||
527 | push %r15 | ||
528 | pushfq | ||
529 | sub \$64,%rsp | ||
530 | |||
531 | mov 120($context),%rax # pull context->Rax | ||
532 | mov 248($context),%rbx # pull context->Rip | ||
533 | |||
534 | lea .Lbody(%rip),%r10 | ||
535 | cmp %r10,%rbx # context->Rip<.Lbody | ||
536 | jb .Lin_prologue | ||
537 | |||
538 | mov 152($context),%rax # pull context->Rsp | ||
539 | |||
540 | lea .Lepilogue(%rip),%r10 | ||
541 | cmp %r10,%rbx # context->Rip>=.Lepilogue | ||
542 | jae .Lin_prologue | ||
543 | |||
544 | mov 40(%rax),%r15 | ||
545 | mov 48(%rax),%r14 | ||
546 | mov 56(%rax),%r13 | ||
547 | mov 64(%rax),%r12 | ||
548 | mov 72(%rax),%rbp | ||
549 | mov 80(%rax),%rbx | ||
550 | lea 88(%rax),%rax | ||
551 | |||
552 | mov %rbx,144($context) # restore context->Rbx | ||
553 | mov %rbp,160($context) # restore context->Rbp | ||
554 | mov %r12,216($context) # restore context->R12 | ||
555 | mov %r13,224($context) # restore context->R12 | ||
556 | mov %r14,232($context) # restore context->R14 | ||
557 | mov %r15,240($context) # restore context->R15 | ||
558 | |||
559 | .Lin_prologue: | ||
560 | mov 8(%rax),%rdi | ||
561 | mov 16(%rax),%rsi | ||
562 | mov %rax,152($context) # restore context->Rsp | ||
563 | mov %rsi,168($context) # restore context->Rsi | ||
564 | mov %rdi,176($context) # restore context->Rdi | ||
565 | |||
566 | mov 40($disp),%rdi # disp->ContextRecord | ||
567 | mov $context,%rsi # context | ||
568 | mov \$154,%ecx # sizeof(CONTEXT) | ||
569 | .long 0xa548f3fc # cld; rep movsq | ||
570 | |||
571 | mov $disp,%rsi | ||
572 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
573 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
574 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
575 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
576 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
577 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
578 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
579 | mov %r10,32(%rsp) # arg5 | ||
580 | mov %r11,40(%rsp) # arg6 | ||
581 | mov %r12,48(%rsp) # arg7 | ||
582 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
583 | call *__imp_RtlVirtualUnwind(%rip) | ||
584 | |||
585 | mov \$1,%eax # ExceptionContinueSearch | ||
586 | add \$64,%rsp | ||
587 | popfq | ||
588 | pop %r15 | ||
589 | pop %r14 | ||
590 | pop %r13 | ||
591 | pop %r12 | ||
592 | pop %rbp | ||
593 | pop %rbx | ||
594 | pop %rdi | ||
595 | pop %rsi | ||
596 | ret | ||
597 | .size se_handler,.-se_handler | ||
598 | |||
599 | .section .pdata | ||
600 | .align 4 | ||
601 | .rva .LSEH_begin_$func | ||
602 | .rva .LSEH_end_$func | ||
603 | .rva .LSEH_info_$func | ||
604 | |||
605 | .section .xdata | ||
606 | .align 8 | ||
607 | .LSEH_info_$func: | ||
608 | .byte 9,0,0,0 | ||
609 | .rva se_handler | ||
610 | ___ | ||
611 | } | ||
612 | |||
613 | sub reg_part { | ||
614 | my ($reg,$conv)=@_; | ||
615 | if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } | ||
616 | elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } | ||
617 | elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } | ||
618 | elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } | ||
619 | return $reg; | ||
620 | } | ||
621 | |||
622 | $code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; | ||
623 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
624 | $code =~ s/pinsrw\s+\$0,/movd /gm; | ||
625 | |||
626 | $code =~ s/#md5#//gm if ($md5); | ||
627 | $code =~ s/#rc4#//gm if ($rc4); | ||
628 | |||
629 | print $code; | ||
630 | |||
631 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/rc4/asm/rc4-parisc.pl b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl new file mode 100644 index 0000000000..9165067080 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl | |||
@@ -0,0 +1,313 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # RC4 for PA-RISC. | ||
11 | |||
12 | # June 2009. | ||
13 | # | ||
14 | # Performance is 33% better than gcc 3.2 generated code on PA-7100LC. | ||
15 | # For reference, [4x] unrolled loop is >40% faster than folded one. | ||
16 | # It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement | ||
17 | # is believed to be not sufficient to justify the effort... | ||
18 | # | ||
19 | # Special thanks to polarhome.com for providing HP-UX account. | ||
20 | |||
21 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
22 | |||
23 | $flavour = shift; | ||
24 | $output = shift; | ||
25 | open STDOUT,">$output"; | ||
26 | |||
27 | if ($flavour =~ /64/) { | ||
28 | $LEVEL ="2.0W"; | ||
29 | $SIZE_T =8; | ||
30 | $FRAME_MARKER =80; | ||
31 | $SAVED_RP =16; | ||
32 | $PUSH ="std"; | ||
33 | $PUSHMA ="std,ma"; | ||
34 | $POP ="ldd"; | ||
35 | $POPMB ="ldd,mb"; | ||
36 | } else { | ||
37 | $LEVEL ="1.0"; | ||
38 | $SIZE_T =4; | ||
39 | $FRAME_MARKER =48; | ||
40 | $SAVED_RP =20; | ||
41 | $PUSH ="stw"; | ||
42 | $PUSHMA ="stwm"; | ||
43 | $POP ="ldw"; | ||
44 | $POPMB ="ldwm"; | ||
45 | } | ||
46 | |||
47 | $FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker | ||
48 | # [+ argument transfer] | ||
49 | $SZ=1; # defaults to RC4_CHAR | ||
50 | if (open CONF,"<${dir}../../opensslconf.h") { | ||
51 | while(<CONF>) { | ||
52 | if (m/#\s*define\s+RC4_INT\s+(.*)/) { | ||
53 | $SZ = ($1=~/char$/) ? 1 : 4; | ||
54 | last; | ||
55 | } | ||
56 | } | ||
57 | close CONF; | ||
58 | } | ||
59 | |||
60 | if ($SZ==1) { # RC4_CHAR | ||
61 | $LD="ldb"; | ||
62 | $LDX="ldbx"; | ||
63 | $MKX="addl"; | ||
64 | $ST="stb"; | ||
65 | } else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC) | ||
66 | $LD="ldw"; | ||
67 | $LDX="ldwx,s"; | ||
68 | $MKX="sh2addl"; | ||
69 | $ST="stw"; | ||
70 | } | ||
71 | |||
72 | $key="%r26"; | ||
73 | $len="%r25"; | ||
74 | $inp="%r24"; | ||
75 | $out="%r23"; | ||
76 | |||
77 | @XX=("%r19","%r20"); | ||
78 | @TX=("%r21","%r22"); | ||
79 | $YY="%r28"; | ||
80 | $TY="%r29"; | ||
81 | |||
82 | $acc="%r1"; | ||
83 | $ix="%r2"; | ||
84 | $iy="%r3"; | ||
85 | $dat0="%r4"; | ||
86 | $dat1="%r5"; | ||
87 | $rem="%r6"; | ||
88 | $mask="%r31"; | ||
89 | |||
90 | sub unrolledloopbody { | ||
91 | for ($i=0;$i<4;$i++) { | ||
92 | $code.=<<___; | ||
93 | ldo 1($XX[0]),$XX[1] | ||
94 | `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)` | ||
95 | and $mask,$XX[1],$XX[1] | ||
96 | $LDX $YY($key),$TY | ||
97 | $MKX $YY,$key,$ix | ||
98 | $LDX $XX[1]($key),$TX[1] | ||
99 | $MKX $XX[0],$key,$iy | ||
100 | $ST $TX[0],0($ix) | ||
101 | comclr,<> $XX[1],$YY,%r0 ; conditional | ||
102 | copy $TX[0],$TX[1] ; move | ||
103 | `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)` | ||
104 | $ST $TY,0($iy) | ||
105 | addl $TX[0],$TY,$TY | ||
106 | addl $TX[1],$YY,$YY | ||
107 | and $mask,$TY,$TY | ||
108 | and $mask,$YY,$YY | ||
109 | ___ | ||
110 | push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers | ||
111 | } } | ||
112 | |||
113 | sub foldedloop { | ||
114 | my ($label,$count)=@_; | ||
115 | $code.=<<___; | ||
116 | $label | ||
117 | $MKX $YY,$key,$iy | ||
118 | $LDX $YY($key),$TY | ||
119 | $MKX $XX[0],$key,$ix | ||
120 | $ST $TX[0],0($iy) | ||
121 | ldo 1($XX[0]),$XX[0] | ||
122 | $ST $TY,0($ix) | ||
123 | addl $TX[0],$TY,$TY | ||
124 | ldbx $inp($out),$dat1 | ||
125 | and $mask,$TY,$TY | ||
126 | and $mask,$XX[0],$XX[0] | ||
127 | $LDX $TY($key),$acc | ||
128 | $LDX $XX[0]($key),$TX[0] | ||
129 | ldo 1($out),$out | ||
130 | xor $dat1,$acc,$acc | ||
131 | addl $TX[0],$YY,$YY | ||
132 | stb $acc,-1($out) | ||
133 | addib,<> -1,$count,$label ; $count is always small | ||
134 | and $mask,$YY,$YY | ||
135 | ___ | ||
136 | } | ||
137 | |||
138 | $code=<<___; | ||
139 | .LEVEL $LEVEL | ||
140 | .SPACE \$TEXT\$ | ||
141 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
142 | |||
143 | .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | ||
144 | RC4 | ||
145 | .PROC | ||
146 | .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6 | ||
147 | .ENTRY | ||
148 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
149 | $PUSHMA %r3,$FRAME(%sp) | ||
150 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
151 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
152 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
153 | |||
154 | cmpib,*= 0,$len,L\$abort | ||
155 | sub $inp,$out,$inp ; distance between $inp and $out | ||
156 | |||
157 | $LD `0*$SZ`($key),$XX[0] | ||
158 | $LD `1*$SZ`($key),$YY | ||
159 | ldo `2*$SZ`($key),$key | ||
160 | |||
161 | ldi 0xff,$mask | ||
162 | ldi 3,$dat0 | ||
163 | |||
164 | ldo 1($XX[0]),$XX[0] ; warm up loop | ||
165 | and $mask,$XX[0],$XX[0] | ||
166 | $LDX $XX[0]($key),$TX[0] | ||
167 | addl $TX[0],$YY,$YY | ||
168 | cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother? | ||
169 | and $mask,$YY,$YY | ||
170 | |||
171 | and,<> $out,$dat0,$rem ; is $out aligned? | ||
172 | b L\$alignedout | ||
173 | subi 4,$rem,$rem | ||
174 | sub $len,$rem,$len | ||
175 | ___ | ||
176 | &foldedloop("L\$alignout",$rem); # process till $out is aligned | ||
177 | |||
178 | $code.=<<___; | ||
179 | L\$alignedout ; $len is at least 4 here | ||
180 | and,<> $inp,$dat0,$acc ; is $inp aligned? | ||
181 | b L\$oop4 | ||
182 | sub $inp,$acc,$rem ; align $inp | ||
183 | |||
184 | sh3addl $acc,%r0,$acc | ||
185 | subi 32,$acc,$acc | ||
186 | mtctl $acc,%cr11 ; load %sar with vshd align factor | ||
187 | ldwx $rem($out),$dat0 | ||
188 | ldo 4($rem),$rem | ||
189 | L\$oop4misalignedinp | ||
190 | ___ | ||
191 | &unrolledloopbody(); | ||
192 | $code.=<<___; | ||
193 | $LDX $TY($key),$ix | ||
194 | ldwx $rem($out),$dat1 | ||
195 | ldo -4($len),$len | ||
196 | or $ix,$acc,$acc ; last piece, no need to dep | ||
197 | vshd $dat0,$dat1,$iy ; align data | ||
198 | copy $dat1,$dat0 | ||
199 | xor $iy,$acc,$acc | ||
200 | stw $acc,0($out) | ||
201 | cmpib,*<< 3,$len,L\$oop4misalignedinp | ||
202 | ldo 4($out),$out | ||
203 | cmpib,*= 0,$len,L\$done | ||
204 | nop | ||
205 | b L\$oop1 | ||
206 | nop | ||
207 | |||
208 | .ALIGN 8 | ||
209 | L\$oop4 | ||
210 | ___ | ||
211 | &unrolledloopbody(); | ||
212 | $code.=<<___; | ||
213 | $LDX $TY($key),$ix | ||
214 | ldwx $inp($out),$dat0 | ||
215 | ldo -4($len),$len | ||
216 | or $ix,$acc,$acc ; last piece, no need to dep | ||
217 | xor $dat0,$acc,$acc | ||
218 | stw $acc,0($out) | ||
219 | cmpib,*<< 3,$len,L\$oop4 | ||
220 | ldo 4($out),$out | ||
221 | cmpib,*= 0,$len,L\$done | ||
222 | nop | ||
223 | ___ | ||
224 | &foldedloop("L\$oop1",$len); | ||
225 | $code.=<<___; | ||
226 | L\$done | ||
227 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 | ||
228 | ldo -1($XX[0]),$XX[0] ; chill out loop | ||
229 | sub $YY,$TX[0],$YY | ||
230 | and $mask,$XX[0],$XX[0] | ||
231 | and $mask,$YY,$YY | ||
232 | $ST $XX[0],`-2*$SZ`($key) | ||
233 | $ST $YY,`-1*$SZ`($key) | ||
234 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
235 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
236 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
237 | L\$abort | ||
238 | bv (%r2) | ||
239 | .EXIT | ||
240 | $POPMB -$FRAME(%sp),%r3 | ||
241 | .PROCEND | ||
242 | ___ | ||
243 | |||
244 | $code.=<<___; | ||
245 | |||
246 | .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
247 | .ALIGN 8 | ||
248 | private_RC4_set_key | ||
249 | .PROC | ||
250 | .CALLINFO NO_CALLS | ||
251 | .ENTRY | ||
252 | $ST %r0,`0*$SZ`($key) | ||
253 | $ST %r0,`1*$SZ`($key) | ||
254 | ldo `2*$SZ`($key),$key | ||
255 | copy %r0,@XX[0] | ||
256 | L\$1st | ||
257 | $ST @XX[0],0($key) | ||
258 | ldo 1(@XX[0]),@XX[0] | ||
259 | bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256 | ||
260 | ldo $SZ($key),$key | ||
261 | |||
262 | ldo `-256*$SZ`($key),$key ; rewind $key | ||
263 | addl $len,$inp,$inp ; $inp to point at the end | ||
264 | sub %r0,$len,%r23 ; inverse index | ||
265 | copy %r0,@XX[0] | ||
266 | copy %r0,@XX[1] | ||
267 | ldi 0xff,$mask | ||
268 | |||
269 | L\$2nd | ||
270 | $LDX @XX[0]($key),@TX[0] | ||
271 | ldbx %r23($inp),@TX[1] | ||
272 | addi,nuv 1,%r23,%r23 ; increment and conditional | ||
273 | sub %r0,$len,%r23 ; inverse index | ||
274 | addl @TX[0],@XX[1],@XX[1] | ||
275 | addl @TX[1],@XX[1],@XX[1] | ||
276 | and $mask,@XX[1],@XX[1] | ||
277 | $MKX @XX[0],$key,$TY | ||
278 | $LDX @XX[1]($key),@TX[1] | ||
279 | $MKX @XX[1],$key,$YY | ||
280 | ldo 1(@XX[0]),@XX[0] | ||
281 | $ST @TX[0],0($YY) | ||
282 | bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256 | ||
283 | $ST @TX[1],0($TY) | ||
284 | |||
285 | bv,n (%r2) | ||
286 | .EXIT | ||
287 | nop | ||
288 | .PROCEND | ||
289 | |||
290 | .EXPORT RC4_options,ENTRY | ||
291 | .ALIGN 8 | ||
292 | RC4_options | ||
293 | .PROC | ||
294 | .CALLINFO NO_CALLS | ||
295 | .ENTRY | ||
296 | blr %r0,%r28 | ||
297 | ldi 3,%r1 | ||
298 | L\$pic | ||
299 | andcm %r28,%r1,%r28 | ||
300 | bv (%r2) | ||
301 | .EXIT | ||
302 | ldo L\$opts-L\$pic(%r28),%r28 | ||
303 | .PROCEND | ||
304 | .ALIGN 8 | ||
305 | L\$opts | ||
306 | .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)" | ||
307 | .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
308 | ___ | ||
309 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
310 | $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); | ||
311 | |||
312 | print $code; | ||
313 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl index 96681fa05e..7528ece13c 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl | |||
@@ -13,6 +13,29 @@ | |||
13 | # "cluster" Address Generation Interlocks, so that one pipeline stall | 13 | # "cluster" Address Generation Interlocks, so that one pipeline stall |
14 | # resolves several dependencies. | 14 | # resolves several dependencies. |
15 | 15 | ||
16 | # November 2010. | ||
17 | # | ||
18 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
19 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
20 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
21 | # application context. The feature is not specific to any particular | ||
22 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
23 | # remains z/Architecture specific. On z990 it was measured to perform | ||
24 | # 50% better than code generated by gcc 4.3. | ||
25 | |||
26 | $flavour = shift; | ||
27 | |||
28 | if ($flavour =~ /3[12]/) { | ||
29 | $SIZE_T=4; | ||
30 | $g=""; | ||
31 | } else { | ||
32 | $SIZE_T=8; | ||
33 | $g="g"; | ||
34 | } | ||
35 | |||
36 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
37 | open STDOUT,">$output"; | ||
38 | |||
16 | $rp="%r14"; | 39 | $rp="%r14"; |
17 | $sp="%r15"; | 40 | $sp="%r15"; |
18 | $code=<<___; | 41 | $code=<<___; |
@@ -39,7 +62,12 @@ $code.=<<___; | |||
39 | .type RC4,\@function | 62 | .type RC4,\@function |
40 | .align 64 | 63 | .align 64 |
41 | RC4: | 64 | RC4: |
42 | stmg %r6,%r11,48($sp) | 65 | stm${g} %r6,%r11,6*$SIZE_T($sp) |
66 | ___ | ||
67 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
68 | llgfr $len,$len | ||
69 | ___ | ||
70 | $code.=<<___; | ||
43 | llgc $XX[0],0($key) | 71 | llgc $XX[0],0($key) |
44 | llgc $YY,1($key) | 72 | llgc $YY,1($key) |
45 | la $XX[0],1($XX[0]) | 73 | la $XX[0],1($XX[0]) |
@@ -90,7 +118,7 @@ $code.=<<___; | |||
90 | xgr $acc,$TX[1] | 118 | xgr $acc,$TX[1] |
91 | stg $acc,0($out) | 119 | stg $acc,0($out) |
92 | la $out,8($out) | 120 | la $out,8($out) |
93 | brct $cnt,.Loop8 | 121 | brctg $cnt,.Loop8 |
94 | 122 | ||
95 | .Lshort: | 123 | .Lshort: |
96 | lghi $acc,7 | 124 | lghi $acc,7 |
@@ -122,7 +150,7 @@ $code.=<<___; | |||
122 | ahi $XX[0],-1 | 150 | ahi $XX[0],-1 |
123 | stc $XX[0],0($key) | 151 | stc $XX[0],0($key) |
124 | stc $YY,1($key) | 152 | stc $YY,1($key) |
125 | lmg %r6,%r11,48($sp) | 153 | lm${g} %r6,%r11,6*$SIZE_T($sp) |
126 | br $rp | 154 | br $rp |
127 | .size RC4,.-RC4 | 155 | .size RC4,.-RC4 |
128 | .string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 156 | .string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
@@ -143,11 +171,11 @@ $ikey="%r7"; | |||
143 | $iinp="%r8"; | 171 | $iinp="%r8"; |
144 | 172 | ||
145 | $code.=<<___; | 173 | $code.=<<___; |
146 | .globl RC4_set_key | 174 | .globl private_RC4_set_key |
147 | .type RC4_set_key,\@function | 175 | .type private_RC4_set_key,\@function |
148 | .align 64 | 176 | .align 64 |
149 | RC4_set_key: | 177 | private_RC4_set_key: |
150 | stmg %r6,%r8,48($sp) | 178 | stm${g} %r6,%r8,6*$SIZE_T($sp) |
151 | lhi $cnt,256 | 179 | lhi $cnt,256 |
152 | la $idx,0(%r0) | 180 | la $idx,0(%r0) |
153 | sth $idx,0($key) | 181 | sth $idx,0($key) |
@@ -180,9 +208,9 @@ RC4_set_key: | |||
180 | la $iinp,0(%r0) | 208 | la $iinp,0(%r0) |
181 | j .L2ndloop | 209 | j .L2ndloop |
182 | .Ldone: | 210 | .Ldone: |
183 | lmg %r6,%r8,48($sp) | 211 | lm${g} %r6,%r8,6*$SIZE_T($sp) |
184 | br $rp | 212 | br $rp |
185 | .size RC4_set_key,.-RC4_set_key | 213 | .size private_RC4_set_key,.-private_RC4_set_key |
186 | 214 | ||
187 | ___ | 215 | ___ |
188 | } | 216 | } |
@@ -203,3 +231,4 @@ RC4_options: | |||
203 | ___ | 231 | ___ |
204 | 232 | ||
205 | print $code; | 233 | print $code; |
234 | close STDOUT; # force flush | ||
diff --git a/src/lib/libcrypto/rsa/rsa_ameth.c b/src/lib/libcrypto/rsa/rsa_ameth.c index 8c3209885e..2460910ab2 100644 --- a/src/lib/libcrypto/rsa/rsa_ameth.c +++ b/src/lib/libcrypto/rsa/rsa_ameth.c | |||
@@ -265,6 +265,147 @@ static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent, | |||
265 | return do_rsa_print(bp, pkey->pkey.rsa, indent, 1); | 265 | return do_rsa_print(bp, pkey->pkey.rsa, indent, 1); |
266 | } | 266 | } |
267 | 267 | ||
268 | static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg, | ||
269 | X509_ALGOR **pmaskHash) | ||
270 | { | ||
271 | const unsigned char *p; | ||
272 | int plen; | ||
273 | RSA_PSS_PARAMS *pss; | ||
274 | |||
275 | *pmaskHash = NULL; | ||
276 | |||
277 | if (!alg->parameter || alg->parameter->type != V_ASN1_SEQUENCE) | ||
278 | return NULL; | ||
279 | p = alg->parameter->value.sequence->data; | ||
280 | plen = alg->parameter->value.sequence->length; | ||
281 | pss = d2i_RSA_PSS_PARAMS(NULL, &p, plen); | ||
282 | |||
283 | if (!pss) | ||
284 | return NULL; | ||
285 | |||
286 | if (pss->maskGenAlgorithm) | ||
287 | { | ||
288 | ASN1_TYPE *param = pss->maskGenAlgorithm->parameter; | ||
289 | if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) == NID_mgf1 | ||
290 | && param->type == V_ASN1_SEQUENCE) | ||
291 | { | ||
292 | p = param->value.sequence->data; | ||
293 | plen = param->value.sequence->length; | ||
294 | *pmaskHash = d2i_X509_ALGOR(NULL, &p, plen); | ||
295 | } | ||
296 | } | ||
297 | |||
298 | return pss; | ||
299 | } | ||
300 | |||
301 | static int rsa_pss_param_print(BIO *bp, RSA_PSS_PARAMS *pss, | ||
302 | X509_ALGOR *maskHash, int indent) | ||
303 | { | ||
304 | int rv = 0; | ||
305 | if (!pss) | ||
306 | { | ||
307 | if (BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") <= 0) | ||
308 | return 0; | ||
309 | return 1; | ||
310 | } | ||
311 | if (BIO_puts(bp, "\n") <= 0) | ||
312 | goto err; | ||
313 | if (!BIO_indent(bp, indent, 128)) | ||
314 | goto err; | ||
315 | if (BIO_puts(bp, "Hash Algorithm: ") <= 0) | ||
316 | goto err; | ||
317 | |||
318 | if (pss->hashAlgorithm) | ||
319 | { | ||
320 | if (i2a_ASN1_OBJECT(bp, pss->hashAlgorithm->algorithm) <= 0) | ||
321 | goto err; | ||
322 | } | ||
323 | else if (BIO_puts(bp, "sha1 (default)") <= 0) | ||
324 | goto err; | ||
325 | |||
326 | if (BIO_puts(bp, "\n") <= 0) | ||
327 | goto err; | ||
328 | |||
329 | if (!BIO_indent(bp, indent, 128)) | ||
330 | goto err; | ||
331 | |||
332 | if (BIO_puts(bp, "Mask Algorithm: ") <= 0) | ||
333 | goto err; | ||
334 | if (pss->maskGenAlgorithm) | ||
335 | { | ||
336 | if (i2a_ASN1_OBJECT(bp, pss->maskGenAlgorithm->algorithm) <= 0) | ||
337 | goto err; | ||
338 | if (BIO_puts(bp, " with ") <= 0) | ||
339 | goto err; | ||
340 | if (maskHash) | ||
341 | { | ||
342 | if (i2a_ASN1_OBJECT(bp, maskHash->algorithm) <= 0) | ||
343 | goto err; | ||
344 | } | ||
345 | else if (BIO_puts(bp, "INVALID") <= 0) | ||
346 | goto err; | ||
347 | } | ||
348 | else if (BIO_puts(bp, "mgf1 with sha1 (default)") <= 0) | ||
349 | goto err; | ||
350 | BIO_puts(bp, "\n"); | ||
351 | |||
352 | if (!BIO_indent(bp, indent, 128)) | ||
353 | goto err; | ||
354 | if (BIO_puts(bp, "Salt Length: ") <= 0) | ||
355 | goto err; | ||
356 | if (pss->saltLength) | ||
357 | { | ||
358 | if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0) | ||
359 | goto err; | ||
360 | } | ||
361 | else if (BIO_puts(bp, "20 (default)") <= 0) | ||
362 | goto err; | ||
363 | BIO_puts(bp, "\n"); | ||
364 | |||
365 | if (!BIO_indent(bp, indent, 128)) | ||
366 | goto err; | ||
367 | if (BIO_puts(bp, "Trailer Field: ") <= 0) | ||
368 | goto err; | ||
369 | if (pss->trailerField) | ||
370 | { | ||
371 | if (i2a_ASN1_INTEGER(bp, pss->trailerField) <= 0) | ||
372 | goto err; | ||
373 | } | ||
374 | else if (BIO_puts(bp, "0xbc (default)") <= 0) | ||
375 | goto err; | ||
376 | BIO_puts(bp, "\n"); | ||
377 | |||
378 | rv = 1; | ||
379 | |||
380 | err: | ||
381 | return rv; | ||
382 | |||
383 | } | ||
384 | |||
385 | static int rsa_sig_print(BIO *bp, const X509_ALGOR *sigalg, | ||
386 | const ASN1_STRING *sig, | ||
387 | int indent, ASN1_PCTX *pctx) | ||
388 | { | ||
389 | if (OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss) | ||
390 | { | ||
391 | int rv; | ||
392 | RSA_PSS_PARAMS *pss; | ||
393 | X509_ALGOR *maskHash; | ||
394 | pss = rsa_pss_decode(sigalg, &maskHash); | ||
395 | rv = rsa_pss_param_print(bp, pss, maskHash, indent); | ||
396 | if (pss) | ||
397 | RSA_PSS_PARAMS_free(pss); | ||
398 | if (maskHash) | ||
399 | X509_ALGOR_free(maskHash); | ||
400 | if (!rv) | ||
401 | return 0; | ||
402 | } | ||
403 | else if (!sig && BIO_puts(bp, "\n") <= 0) | ||
404 | return 0; | ||
405 | if (sig) | ||
406 | return X509_signature_dump(bp, sig, indent); | ||
407 | return 1; | ||
408 | } | ||
268 | 409 | ||
269 | static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) | 410 | static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) |
270 | { | 411 | { |
@@ -310,6 +451,211 @@ static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) | |||
310 | 451 | ||
311 | } | 452 | } |
312 | 453 | ||
454 | /* Customised RSA item verification routine. This is called | ||
455 | * when a signature is encountered requiring special handling. We | ||
456 | * currently only handle PSS. | ||
457 | */ | ||
458 | |||
459 | |||
460 | static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
461 | X509_ALGOR *sigalg, ASN1_BIT_STRING *sig, | ||
462 | EVP_PKEY *pkey) | ||
463 | { | ||
464 | int rv = -1; | ||
465 | int saltlen; | ||
466 | const EVP_MD *mgf1md = NULL, *md = NULL; | ||
467 | RSA_PSS_PARAMS *pss; | ||
468 | X509_ALGOR *maskHash; | ||
469 | EVP_PKEY_CTX *pkctx; | ||
470 | /* Sanity check: make sure it is PSS */ | ||
471 | if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss) | ||
472 | { | ||
473 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE); | ||
474 | return -1; | ||
475 | } | ||
476 | /* Decode PSS parameters */ | ||
477 | pss = rsa_pss_decode(sigalg, &maskHash); | ||
478 | |||
479 | if (pss == NULL) | ||
480 | { | ||
481 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_PSS_PARAMETERS); | ||
482 | goto err; | ||
483 | } | ||
484 | /* Check mask and lookup mask hash algorithm */ | ||
485 | if (pss->maskGenAlgorithm) | ||
486 | { | ||
487 | if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) != NID_mgf1) | ||
488 | { | ||
489 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_ALGORITHM); | ||
490 | goto err; | ||
491 | } | ||
492 | if (!maskHash) | ||
493 | { | ||
494 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_PARAMETER); | ||
495 | goto err; | ||
496 | } | ||
497 | mgf1md = EVP_get_digestbyobj(maskHash->algorithm); | ||
498 | if (mgf1md == NULL) | ||
499 | { | ||
500 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_MASK_DIGEST); | ||
501 | goto err; | ||
502 | } | ||
503 | } | ||
504 | else | ||
505 | mgf1md = EVP_sha1(); | ||
506 | |||
507 | if (pss->hashAlgorithm) | ||
508 | { | ||
509 | md = EVP_get_digestbyobj(pss->hashAlgorithm->algorithm); | ||
510 | if (md == NULL) | ||
511 | { | ||
512 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_PSS_DIGEST); | ||
513 | goto err; | ||
514 | } | ||
515 | } | ||
516 | else | ||
517 | md = EVP_sha1(); | ||
518 | |||
519 | if (pss->saltLength) | ||
520 | { | ||
521 | saltlen = ASN1_INTEGER_get(pss->saltLength); | ||
522 | |||
523 | /* Could perform more salt length sanity checks but the main | ||
524 | * RSA routines will trap other invalid values anyway. | ||
525 | */ | ||
526 | if (saltlen < 0) | ||
527 | { | ||
528 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_SALT_LENGTH); | ||
529 | goto err; | ||
530 | } | ||
531 | } | ||
532 | else | ||
533 | saltlen = 20; | ||
534 | |||
535 | /* low-level routines support only trailer field 0xbc (value 1) | ||
536 | * and PKCS#1 says we should reject any other value anyway. | ||
537 | */ | ||
538 | if (pss->trailerField && ASN1_INTEGER_get(pss->trailerField) != 1) | ||
539 | { | ||
540 | RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_TRAILER); | ||
541 | goto err; | ||
542 | } | ||
543 | |||
544 | /* We have all parameters now set up context */ | ||
545 | |||
546 | if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey)) | ||
547 | goto err; | ||
548 | |||
549 | if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_PSS_PADDING) <= 0) | ||
550 | goto err; | ||
551 | |||
552 | if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pkctx, saltlen) <= 0) | ||
553 | goto err; | ||
554 | |||
555 | if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0) | ||
556 | goto err; | ||
557 | /* Carry on */ | ||
558 | rv = 2; | ||
559 | |||
560 | err: | ||
561 | RSA_PSS_PARAMS_free(pss); | ||
562 | if (maskHash) | ||
563 | X509_ALGOR_free(maskHash); | ||
564 | return rv; | ||
565 | } | ||
566 | |||
567 | static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, | ||
568 | X509_ALGOR *alg1, X509_ALGOR *alg2, | ||
569 | ASN1_BIT_STRING *sig) | ||
570 | { | ||
571 | int pad_mode; | ||
572 | EVP_PKEY_CTX *pkctx = ctx->pctx; | ||
573 | if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0) | ||
574 | return 0; | ||
575 | if (pad_mode == RSA_PKCS1_PADDING) | ||
576 | return 2; | ||
577 | if (pad_mode == RSA_PKCS1_PSS_PADDING) | ||
578 | { | ||
579 | const EVP_MD *sigmd, *mgf1md; | ||
580 | RSA_PSS_PARAMS *pss = NULL; | ||
581 | X509_ALGOR *mgf1alg = NULL; | ||
582 | ASN1_STRING *os1 = NULL, *os2 = NULL; | ||
583 | EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx); | ||
584 | int saltlen, rv = 0; | ||
585 | sigmd = EVP_MD_CTX_md(ctx); | ||
586 | if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0) | ||
587 | goto err; | ||
588 | if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen)) | ||
589 | goto err; | ||
590 | if (saltlen == -1) | ||
591 | saltlen = EVP_MD_size(sigmd); | ||
592 | else if (saltlen == -2) | ||
593 | { | ||
594 | saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2; | ||
595 | if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0) | ||
596 | saltlen--; | ||
597 | } | ||
598 | pss = RSA_PSS_PARAMS_new(); | ||
599 | if (!pss) | ||
600 | goto err; | ||
601 | if (saltlen != 20) | ||
602 | { | ||
603 | pss->saltLength = ASN1_INTEGER_new(); | ||
604 | if (!pss->saltLength) | ||
605 | goto err; | ||
606 | if (!ASN1_INTEGER_set(pss->saltLength, saltlen)) | ||
607 | goto err; | ||
608 | } | ||
609 | if (EVP_MD_type(sigmd) != NID_sha1) | ||
610 | { | ||
611 | pss->hashAlgorithm = X509_ALGOR_new(); | ||
612 | if (!pss->hashAlgorithm) | ||
613 | goto err; | ||
614 | X509_ALGOR_set_md(pss->hashAlgorithm, sigmd); | ||
615 | } | ||
616 | if (EVP_MD_type(mgf1md) != NID_sha1) | ||
617 | { | ||
618 | ASN1_STRING *stmp = NULL; | ||
619 | /* need to embed algorithm ID inside another */ | ||
620 | mgf1alg = X509_ALGOR_new(); | ||
621 | X509_ALGOR_set_md(mgf1alg, mgf1md); | ||
622 | if (!ASN1_item_pack(mgf1alg, ASN1_ITEM_rptr(X509_ALGOR), | ||
623 | &stmp)) | ||
624 | goto err; | ||
625 | pss->maskGenAlgorithm = X509_ALGOR_new(); | ||
626 | if (!pss->maskGenAlgorithm) | ||
627 | goto err; | ||
628 | X509_ALGOR_set0(pss->maskGenAlgorithm, | ||
629 | OBJ_nid2obj(NID_mgf1), | ||
630 | V_ASN1_SEQUENCE, stmp); | ||
631 | } | ||
632 | /* Finally create string with pss parameter encoding. */ | ||
633 | if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os1)) | ||
634 | goto err; | ||
635 | if (alg2) | ||
636 | { | ||
637 | os2 = ASN1_STRING_dup(os1); | ||
638 | if (!os2) | ||
639 | goto err; | ||
640 | X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_rsassaPss), | ||
641 | V_ASN1_SEQUENCE, os2); | ||
642 | } | ||
643 | X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_rsassaPss), | ||
644 | V_ASN1_SEQUENCE, os1); | ||
645 | os1 = os2 = NULL; | ||
646 | rv = 3; | ||
647 | err: | ||
648 | if (mgf1alg) | ||
649 | X509_ALGOR_free(mgf1alg); | ||
650 | if (pss) | ||
651 | RSA_PSS_PARAMS_free(pss); | ||
652 | if (os1) | ||
653 | ASN1_STRING_free(os1); | ||
654 | return rv; | ||
655 | |||
656 | } | ||
657 | return 2; | ||
658 | } | ||
313 | 659 | ||
314 | const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = | 660 | const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = |
315 | { | 661 | { |
@@ -335,10 +681,13 @@ const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = | |||
335 | 681 | ||
336 | 0,0,0,0,0,0, | 682 | 0,0,0,0,0,0, |
337 | 683 | ||
684 | rsa_sig_print, | ||
338 | int_rsa_free, | 685 | int_rsa_free, |
339 | rsa_pkey_ctrl, | 686 | rsa_pkey_ctrl, |
340 | old_rsa_priv_decode, | 687 | old_rsa_priv_decode, |
341 | old_rsa_priv_encode | 688 | old_rsa_priv_encode, |
689 | rsa_item_verify, | ||
690 | rsa_item_sign | ||
342 | }, | 691 | }, |
343 | 692 | ||
344 | { | 693 | { |
diff --git a/src/lib/libcrypto/rsa/rsa_crpt.c b/src/lib/libcrypto/rsa/rsa_crpt.c new file mode 100644 index 0000000000..d3e44785dc --- /dev/null +++ b/src/lib/libcrypto/rsa/rsa_crpt.c | |||
@@ -0,0 +1,257 @@ | |||
1 | /* crypto/rsa/rsa_lib.c */ | ||
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * This package is an SSL implementation written | ||
6 | * by Eric Young (eay@cryptsoft.com). | ||
7 | * The implementation was written so as to conform with Netscapes SSL. | ||
8 | * | ||
9 | * This library is free for commercial and non-commercial use as long as | ||
10 | * the following conditions are aheared to. The following conditions | ||
11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
13 | * included with this distribution is covered by the same copyright terms | ||
14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
15 | * | ||
16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
17 | * the code are not to be removed. | ||
18 | * If this package is used in a product, Eric Young should be given attribution | ||
19 | * as the author of the parts of the library used. | ||
20 | * This can be in the form of a textual message at program startup or | ||
21 | * in documentation (online or textual) provided with the package. | ||
22 | * | ||
23 | * Redistribution and use in source and binary forms, with or without | ||
24 | * modification, are permitted provided that the following conditions | ||
25 | * are met: | ||
26 | * 1. Redistributions of source code must retain the copyright | ||
27 | * notice, this list of conditions and the following disclaimer. | ||
28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
29 | * notice, this list of conditions and the following disclaimer in the | ||
30 | * documentation and/or other materials provided with the distribution. | ||
31 | * 3. All advertising materials mentioning features or use of this software | ||
32 | * must display the following acknowledgement: | ||
33 | * "This product includes cryptographic software written by | ||
34 | * Eric Young (eay@cryptsoft.com)" | ||
35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
36 | * being used are not cryptographic related :-). | ||
37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
38 | * the apps directory (application code) you must include an acknowledgement: | ||
39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
40 | * | ||
41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
51 | * SUCH DAMAGE. | ||
52 | * | ||
53 | * The licence and distribution terms for any publically available version or | ||
54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
55 | * copied and put under another distribution licence | ||
56 | * [including the GNU Public Licence.] | ||
57 | */ | ||
58 | |||
59 | #include <stdio.h> | ||
60 | #include <openssl/crypto.h> | ||
61 | #include "cryptlib.h" | ||
62 | #include <openssl/lhash.h> | ||
63 | #include <openssl/bn.h> | ||
64 | #include <openssl/rsa.h> | ||
65 | #include <openssl/rand.h> | ||
66 | #ifndef OPENSSL_NO_ENGINE | ||
67 | #include <openssl/engine.h> | ||
68 | #endif | ||
69 | |||
70 | int RSA_size(const RSA *r) | ||
71 | { | ||
72 | return(BN_num_bytes(r->n)); | ||
73 | } | ||
74 | |||
75 | int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to, | ||
76 | RSA *rsa, int padding) | ||
77 | { | ||
78 | #ifdef OPENSSL_FIPS | ||
79 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
80 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
81 | { | ||
82 | RSAerr(RSA_F_RSA_PUBLIC_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
83 | return -1; | ||
84 | } | ||
85 | #endif | ||
86 | return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding)); | ||
87 | } | ||
88 | |||
89 | int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to, | ||
90 | RSA *rsa, int padding) | ||
91 | { | ||
92 | #ifdef OPENSSL_FIPS | ||
93 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
94 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
95 | { | ||
96 | RSAerr(RSA_F_RSA_PRIVATE_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
97 | return -1; | ||
98 | } | ||
99 | #endif | ||
100 | return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding)); | ||
101 | } | ||
102 | |||
103 | int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to, | ||
104 | RSA *rsa, int padding) | ||
105 | { | ||
106 | #ifdef OPENSSL_FIPS | ||
107 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
108 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
109 | { | ||
110 | RSAerr(RSA_F_RSA_PRIVATE_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
111 | return -1; | ||
112 | } | ||
113 | #endif | ||
114 | return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding)); | ||
115 | } | ||
116 | |||
117 | int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to, | ||
118 | RSA *rsa, int padding) | ||
119 | { | ||
120 | #ifdef OPENSSL_FIPS | ||
121 | if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) | ||
122 | && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) | ||
123 | { | ||
124 | RSAerr(RSA_F_RSA_PUBLIC_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD); | ||
125 | return -1; | ||
126 | } | ||
127 | #endif | ||
128 | return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding)); | ||
129 | } | ||
130 | |||
131 | int RSA_flags(const RSA *r) | ||
132 | { | ||
133 | return((r == NULL)?0:r->meth->flags); | ||
134 | } | ||
135 | |||
136 | void RSA_blinding_off(RSA *rsa) | ||
137 | { | ||
138 | if (rsa->blinding != NULL) | ||
139 | { | ||
140 | BN_BLINDING_free(rsa->blinding); | ||
141 | rsa->blinding=NULL; | ||
142 | } | ||
143 | rsa->flags &= ~RSA_FLAG_BLINDING; | ||
144 | rsa->flags |= RSA_FLAG_NO_BLINDING; | ||
145 | } | ||
146 | |||
147 | int RSA_blinding_on(RSA *rsa, BN_CTX *ctx) | ||
148 | { | ||
149 | int ret=0; | ||
150 | |||
151 | if (rsa->blinding != NULL) | ||
152 | RSA_blinding_off(rsa); | ||
153 | |||
154 | rsa->blinding = RSA_setup_blinding(rsa, ctx); | ||
155 | if (rsa->blinding == NULL) | ||
156 | goto err; | ||
157 | |||
158 | rsa->flags |= RSA_FLAG_BLINDING; | ||
159 | rsa->flags &= ~RSA_FLAG_NO_BLINDING; | ||
160 | ret=1; | ||
161 | err: | ||
162 | return(ret); | ||
163 | } | ||
164 | |||
165 | static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p, | ||
166 | const BIGNUM *q, BN_CTX *ctx) | ||
167 | { | ||
168 | BIGNUM *ret = NULL, *r0, *r1, *r2; | ||
169 | |||
170 | if (d == NULL || p == NULL || q == NULL) | ||
171 | return NULL; | ||
172 | |||
173 | BN_CTX_start(ctx); | ||
174 | r0 = BN_CTX_get(ctx); | ||
175 | r1 = BN_CTX_get(ctx); | ||
176 | r2 = BN_CTX_get(ctx); | ||
177 | if (r2 == NULL) | ||
178 | goto err; | ||
179 | |||
180 | if (!BN_sub(r1, p, BN_value_one())) goto err; | ||
181 | if (!BN_sub(r2, q, BN_value_one())) goto err; | ||
182 | if (!BN_mul(r0, r1, r2, ctx)) goto err; | ||
183 | |||
184 | ret = BN_mod_inverse(NULL, d, r0, ctx); | ||
185 | err: | ||
186 | BN_CTX_end(ctx); | ||
187 | return ret; | ||
188 | } | ||
189 | |||
190 | BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx) | ||
191 | { | ||
192 | BIGNUM local_n; | ||
193 | BIGNUM *e,*n; | ||
194 | BN_CTX *ctx; | ||
195 | BN_BLINDING *ret = NULL; | ||
196 | |||
197 | if (in_ctx == NULL) | ||
198 | { | ||
199 | if ((ctx = BN_CTX_new()) == NULL) return 0; | ||
200 | } | ||
201 | else | ||
202 | ctx = in_ctx; | ||
203 | |||
204 | BN_CTX_start(ctx); | ||
205 | e = BN_CTX_get(ctx); | ||
206 | if (e == NULL) | ||
207 | { | ||
208 | RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE); | ||
209 | goto err; | ||
210 | } | ||
211 | |||
212 | if (rsa->e == NULL) | ||
213 | { | ||
214 | e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx); | ||
215 | if (e == NULL) | ||
216 | { | ||
217 | RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT); | ||
218 | goto err; | ||
219 | } | ||
220 | } | ||
221 | else | ||
222 | e = rsa->e; | ||
223 | |||
224 | |||
225 | if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL) | ||
226 | { | ||
227 | /* if PRNG is not properly seeded, resort to secret | ||
228 | * exponent as unpredictable seed */ | ||
229 | RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0); | ||
230 | } | ||
231 | |||
232 | if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) | ||
233 | { | ||
234 | /* Set BN_FLG_CONSTTIME flag */ | ||
235 | n = &local_n; | ||
236 | BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME); | ||
237 | } | ||
238 | else | ||
239 | n = rsa->n; | ||
240 | |||
241 | ret = BN_BLINDING_create_param(NULL, e, n, ctx, | ||
242 | rsa->meth->bn_mod_exp, rsa->_method_mod_n); | ||
243 | if (ret == NULL) | ||
244 | { | ||
245 | RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB); | ||
246 | goto err; | ||
247 | } | ||
248 | CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret)); | ||
249 | err: | ||
250 | BN_CTX_end(ctx); | ||
251 | if (in_ctx == NULL) | ||
252 | BN_CTX_free(ctx); | ||
253 | if(rsa->e == NULL) | ||
254 | BN_free(e); | ||
255 | |||
256 | return ret; | ||
257 | } | ||
diff --git a/src/lib/libcrypto/rsa/rsa_pmeth.c b/src/lib/libcrypto/rsa/rsa_pmeth.c index c6892ecd09..5b2ecf56ad 100644 --- a/src/lib/libcrypto/rsa/rsa_pmeth.c +++ b/src/lib/libcrypto/rsa/rsa_pmeth.c | |||
@@ -63,6 +63,12 @@ | |||
63 | #include <openssl/rsa.h> | 63 | #include <openssl/rsa.h> |
64 | #include <openssl/bn.h> | 64 | #include <openssl/bn.h> |
65 | #include <openssl/evp.h> | 65 | #include <openssl/evp.h> |
66 | #ifndef OPENSSL_NO_CMS | ||
67 | #include <openssl/cms.h> | ||
68 | #endif | ||
69 | #ifdef OPENSSL_FIPS | ||
70 | #include <openssl/fips.h> | ||
71 | #endif | ||
66 | #include "evp_locl.h" | 72 | #include "evp_locl.h" |
67 | #include "rsa_locl.h" | 73 | #include "rsa_locl.h" |
68 | 74 | ||
@@ -79,6 +85,8 @@ typedef struct | |||
79 | int pad_mode; | 85 | int pad_mode; |
80 | /* message digest */ | 86 | /* message digest */ |
81 | const EVP_MD *md; | 87 | const EVP_MD *md; |
88 | /* message digest for MGF1 */ | ||
89 | const EVP_MD *mgf1md; | ||
82 | /* PSS/OAEP salt length */ | 90 | /* PSS/OAEP salt length */ |
83 | int saltlen; | 91 | int saltlen; |
84 | /* Temp buffer */ | 92 | /* Temp buffer */ |
@@ -95,6 +103,7 @@ static int pkey_rsa_init(EVP_PKEY_CTX *ctx) | |||
95 | rctx->pub_exp = NULL; | 103 | rctx->pub_exp = NULL; |
96 | rctx->pad_mode = RSA_PKCS1_PADDING; | 104 | rctx->pad_mode = RSA_PKCS1_PADDING; |
97 | rctx->md = NULL; | 105 | rctx->md = NULL; |
106 | rctx->mgf1md = NULL; | ||
98 | rctx->tbuf = NULL; | 107 | rctx->tbuf = NULL; |
99 | 108 | ||
100 | rctx->saltlen = -2; | 109 | rctx->saltlen = -2; |
@@ -147,6 +156,31 @@ static void pkey_rsa_cleanup(EVP_PKEY_CTX *ctx) | |||
147 | OPENSSL_free(rctx); | 156 | OPENSSL_free(rctx); |
148 | } | 157 | } |
149 | } | 158 | } |
159 | #ifdef OPENSSL_FIPS | ||
160 | /* FIP checker. Return value indicates status of context parameters: | ||
161 | * 1 : redirect to FIPS. | ||
162 | * 0 : don't redirect to FIPS. | ||
163 | * -1 : illegal operation in FIPS mode. | ||
164 | */ | ||
165 | |||
166 | static int pkey_fips_check_ctx(EVP_PKEY_CTX *ctx) | ||
167 | { | ||
168 | RSA_PKEY_CTX *rctx = ctx->data; | ||
169 | RSA *rsa = ctx->pkey->pkey.rsa; | ||
170 | int rv = -1; | ||
171 | if (!FIPS_mode()) | ||
172 | return 0; | ||
173 | if (rsa->flags & RSA_FLAG_NON_FIPS_ALLOW) | ||
174 | rv = 0; | ||
175 | if (!(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) && rv) | ||
176 | return -1; | ||
177 | if (rctx->md && !(rctx->md->flags & EVP_MD_FLAG_FIPS)) | ||
178 | return rv; | ||
179 | if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS)) | ||
180 | return rv; | ||
181 | return 1; | ||
182 | } | ||
183 | #endif | ||
150 | 184 | ||
151 | static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | 185 | static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, |
152 | const unsigned char *tbs, size_t tbslen) | 186 | const unsigned char *tbs, size_t tbslen) |
@@ -155,6 +189,15 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
155 | RSA_PKEY_CTX *rctx = ctx->data; | 189 | RSA_PKEY_CTX *rctx = ctx->data; |
156 | RSA *rsa = ctx->pkey->pkey.rsa; | 190 | RSA *rsa = ctx->pkey->pkey.rsa; |
157 | 191 | ||
192 | #ifdef OPENSSL_FIPS | ||
193 | ret = pkey_fips_check_ctx(ctx); | ||
194 | if (ret < 0) | ||
195 | { | ||
196 | RSAerr(RSA_F_PKEY_RSA_SIGN, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE); | ||
197 | return -1; | ||
198 | } | ||
199 | #endif | ||
200 | |||
158 | if (rctx->md) | 201 | if (rctx->md) |
159 | { | 202 | { |
160 | if (tbslen != (size_t)EVP_MD_size(rctx->md)) | 203 | if (tbslen != (size_t)EVP_MD_size(rctx->md)) |
@@ -163,7 +206,36 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
163 | RSA_R_INVALID_DIGEST_LENGTH); | 206 | RSA_R_INVALID_DIGEST_LENGTH); |
164 | return -1; | 207 | return -1; |
165 | } | 208 | } |
166 | if (rctx->pad_mode == RSA_X931_PADDING) | 209 | #ifdef OPENSSL_FIPS |
210 | if (ret > 0) | ||
211 | { | ||
212 | unsigned int slen; | ||
213 | ret = FIPS_rsa_sign_digest(rsa, tbs, tbslen, rctx->md, | ||
214 | rctx->pad_mode, | ||
215 | rctx->saltlen, | ||
216 | rctx->mgf1md, | ||
217 | sig, &slen); | ||
218 | if (ret > 0) | ||
219 | *siglen = slen; | ||
220 | else | ||
221 | *siglen = 0; | ||
222 | return ret; | ||
223 | } | ||
224 | #endif | ||
225 | |||
226 | if (EVP_MD_type(rctx->md) == NID_mdc2) | ||
227 | { | ||
228 | unsigned int sltmp; | ||
229 | if (rctx->pad_mode != RSA_PKCS1_PADDING) | ||
230 | return -1; | ||
231 | ret = RSA_sign_ASN1_OCTET_STRING(NID_mdc2, | ||
232 | tbs, tbslen, sig, &sltmp, rsa); | ||
233 | |||
234 | if (ret <= 0) | ||
235 | return ret; | ||
236 | ret = sltmp; | ||
237 | } | ||
238 | else if (rctx->pad_mode == RSA_X931_PADDING) | ||
167 | { | 239 | { |
168 | if (!setup_tbuf(rctx, ctx)) | 240 | if (!setup_tbuf(rctx, ctx)) |
169 | return -1; | 241 | return -1; |
@@ -186,8 +258,10 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, | |||
186 | { | 258 | { |
187 | if (!setup_tbuf(rctx, ctx)) | 259 | if (!setup_tbuf(rctx, ctx)) |
188 | return -1; | 260 | return -1; |
189 | if (!RSA_padding_add_PKCS1_PSS(rsa, rctx->tbuf, tbs, | 261 | if (!RSA_padding_add_PKCS1_PSS_mgf1(rsa, |
190 | rctx->md, rctx->saltlen)) | 262 | rctx->tbuf, tbs, |
263 | rctx->md, rctx->mgf1md, | ||
264 | rctx->saltlen)) | ||
191 | return -1; | 265 | return -1; |
192 | ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf, | 266 | ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf, |
193 | sig, rsa, RSA_NO_PADDING); | 267 | sig, rsa, RSA_NO_PADDING); |
@@ -269,8 +343,30 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, | |||
269 | RSA_PKEY_CTX *rctx = ctx->data; | 343 | RSA_PKEY_CTX *rctx = ctx->data; |
270 | RSA *rsa = ctx->pkey->pkey.rsa; | 344 | RSA *rsa = ctx->pkey->pkey.rsa; |
271 | size_t rslen; | 345 | size_t rslen; |
346 | #ifdef OPENSSL_FIPS | ||
347 | int rv; | ||
348 | rv = pkey_fips_check_ctx(ctx); | ||
349 | if (rv < 0) | ||
350 | { | ||
351 | RSAerr(RSA_F_PKEY_RSA_VERIFY, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE); | ||
352 | return -1; | ||
353 | } | ||
354 | #endif | ||
272 | if (rctx->md) | 355 | if (rctx->md) |
273 | { | 356 | { |
357 | #ifdef OPENSSL_FIPS | ||
358 | if (rv > 0) | ||
359 | { | ||
360 | return FIPS_rsa_verify_digest(rsa, | ||
361 | tbs, tbslen, | ||
362 | rctx->md, | ||
363 | rctx->pad_mode, | ||
364 | rctx->saltlen, | ||
365 | rctx->mgf1md, | ||
366 | sig, siglen); | ||
367 | |||
368 | } | ||
369 | #endif | ||
274 | if (rctx->pad_mode == RSA_PKCS1_PADDING) | 370 | if (rctx->pad_mode == RSA_PKCS1_PADDING) |
275 | return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen, | 371 | return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen, |
276 | sig, siglen, rsa); | 372 | sig, siglen, rsa); |
@@ -289,7 +385,8 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, | |||
289 | rsa, RSA_NO_PADDING); | 385 | rsa, RSA_NO_PADDING); |
290 | if (ret <= 0) | 386 | if (ret <= 0) |
291 | return 0; | 387 | return 0; |
292 | ret = RSA_verify_PKCS1_PSS(rsa, tbs, rctx->md, | 388 | ret = RSA_verify_PKCS1_PSS_mgf1(rsa, tbs, |
389 | rctx->md, rctx->mgf1md, | ||
293 | rctx->tbuf, rctx->saltlen); | 390 | rctx->tbuf, rctx->saltlen); |
294 | if (ret <= 0) | 391 | if (ret <= 0) |
295 | return 0; | 392 | return 0; |
@@ -403,15 +500,25 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
403 | RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); | 500 | RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); |
404 | return -2; | 501 | return -2; |
405 | 502 | ||
503 | case EVP_PKEY_CTRL_GET_RSA_PADDING: | ||
504 | *(int *)p2 = rctx->pad_mode; | ||
505 | return 1; | ||
506 | |||
406 | case EVP_PKEY_CTRL_RSA_PSS_SALTLEN: | 507 | case EVP_PKEY_CTRL_RSA_PSS_SALTLEN: |
407 | if (p1 < -2) | 508 | case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN: |
408 | return -2; | ||
409 | if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) | 509 | if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) |
410 | { | 510 | { |
411 | RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN); | 511 | RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN); |
412 | return -2; | 512 | return -2; |
413 | } | 513 | } |
414 | rctx->saltlen = p1; | 514 | if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN) |
515 | *(int *)p2 = rctx->saltlen; | ||
516 | else | ||
517 | { | ||
518 | if (p1 < -2) | ||
519 | return -2; | ||
520 | rctx->saltlen = p1; | ||
521 | } | ||
415 | return 1; | 522 | return 1; |
416 | 523 | ||
417 | case EVP_PKEY_CTRL_RSA_KEYGEN_BITS: | 524 | case EVP_PKEY_CTRL_RSA_KEYGEN_BITS: |
@@ -435,16 +542,45 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) | |||
435 | rctx->md = p2; | 542 | rctx->md = p2; |
436 | return 1; | 543 | return 1; |
437 | 544 | ||
545 | case EVP_PKEY_CTRL_RSA_MGF1_MD: | ||
546 | case EVP_PKEY_CTRL_GET_RSA_MGF1_MD: | ||
547 | if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) | ||
548 | { | ||
549 | RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_MGF1_MD); | ||
550 | return -2; | ||
551 | } | ||
552 | if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD) | ||
553 | { | ||
554 | if (rctx->mgf1md) | ||
555 | *(const EVP_MD **)p2 = rctx->mgf1md; | ||
556 | else | ||
557 | *(const EVP_MD **)p2 = rctx->md; | ||
558 | } | ||
559 | else | ||
560 | rctx->mgf1md = p2; | ||
561 | return 1; | ||
562 | |||
438 | case EVP_PKEY_CTRL_DIGESTINIT: | 563 | case EVP_PKEY_CTRL_DIGESTINIT: |
439 | case EVP_PKEY_CTRL_PKCS7_ENCRYPT: | 564 | case EVP_PKEY_CTRL_PKCS7_ENCRYPT: |
440 | case EVP_PKEY_CTRL_PKCS7_DECRYPT: | 565 | case EVP_PKEY_CTRL_PKCS7_DECRYPT: |
441 | case EVP_PKEY_CTRL_PKCS7_SIGN: | 566 | case EVP_PKEY_CTRL_PKCS7_SIGN: |
567 | return 1; | ||
442 | #ifndef OPENSSL_NO_CMS | 568 | #ifndef OPENSSL_NO_CMS |
443 | case EVP_PKEY_CTRL_CMS_ENCRYPT: | ||
444 | case EVP_PKEY_CTRL_CMS_DECRYPT: | 569 | case EVP_PKEY_CTRL_CMS_DECRYPT: |
570 | { | ||
571 | X509_ALGOR *alg = NULL; | ||
572 | ASN1_OBJECT *encalg = NULL; | ||
573 | if (p2) | ||
574 | CMS_RecipientInfo_ktri_get0_algs(p2, NULL, NULL, &alg); | ||
575 | if (alg) | ||
576 | X509_ALGOR_get0(&encalg, NULL, NULL, alg); | ||
577 | if (encalg && OBJ_obj2nid(encalg) == NID_rsaesOaep) | ||
578 | rctx->pad_mode = RSA_PKCS1_OAEP_PADDING; | ||
579 | } | ||
580 | case EVP_PKEY_CTRL_CMS_ENCRYPT: | ||
445 | case EVP_PKEY_CTRL_CMS_SIGN: | 581 | case EVP_PKEY_CTRL_CMS_SIGN: |
446 | #endif | ||
447 | return 1; | 582 | return 1; |
583 | #endif | ||
448 | case EVP_PKEY_CTRL_PEER_KEY: | 584 | case EVP_PKEY_CTRL_PEER_KEY: |
449 | RSAerr(RSA_F_PKEY_RSA_CTRL, | 585 | RSAerr(RSA_F_PKEY_RSA_CTRL, |
450 | RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); | 586 | RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); |
diff --git a/src/lib/libcrypto/rsa/rsa_pss.c b/src/lib/libcrypto/rsa/rsa_pss.c index ac211e2ffe..5f9f533d0c 100644 --- a/src/lib/libcrypto/rsa/rsa_pss.c +++ b/src/lib/libcrypto/rsa/rsa_pss.c | |||
@@ -73,6 +73,13 @@ static const unsigned char zeroes[] = {0,0,0,0,0,0,0,0}; | |||
73 | int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | 73 | int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, |
74 | const EVP_MD *Hash, const unsigned char *EM, int sLen) | 74 | const EVP_MD *Hash, const unsigned char *EM, int sLen) |
75 | { | 75 | { |
76 | return RSA_verify_PKCS1_PSS_mgf1(rsa, mHash, Hash, NULL, EM, sLen); | ||
77 | } | ||
78 | |||
79 | int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash, | ||
80 | const EVP_MD *Hash, const EVP_MD *mgf1Hash, | ||
81 | const unsigned char *EM, int sLen) | ||
82 | { | ||
76 | int i; | 83 | int i; |
77 | int ret = 0; | 84 | int ret = 0; |
78 | int hLen, maskedDBLen, MSBits, emLen; | 85 | int hLen, maskedDBLen, MSBits, emLen; |
@@ -80,6 +87,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
80 | unsigned char *DB = NULL; | 87 | unsigned char *DB = NULL; |
81 | EVP_MD_CTX ctx; | 88 | EVP_MD_CTX ctx; |
82 | unsigned char H_[EVP_MAX_MD_SIZE]; | 89 | unsigned char H_[EVP_MAX_MD_SIZE]; |
90 | EVP_MD_CTX_init(&ctx); | ||
91 | |||
92 | if (mgf1Hash == NULL) | ||
93 | mgf1Hash = Hash; | ||
83 | 94 | ||
84 | hLen = EVP_MD_size(Hash); | 95 | hLen = EVP_MD_size(Hash); |
85 | if (hLen < 0) | 96 | if (hLen < 0) |
@@ -94,7 +105,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
94 | else if (sLen == -2) sLen = -2; | 105 | else if (sLen == -2) sLen = -2; |
95 | else if (sLen < -2) | 106 | else if (sLen < -2) |
96 | { | 107 | { |
97 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); | 108 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); |
98 | goto err; | 109 | goto err; |
99 | } | 110 | } |
100 | 111 | ||
@@ -102,7 +113,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
102 | emLen = RSA_size(rsa); | 113 | emLen = RSA_size(rsa); |
103 | if (EM[0] & (0xFF << MSBits)) | 114 | if (EM[0] & (0xFF << MSBits)) |
104 | { | 115 | { |
105 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_FIRST_OCTET_INVALID); | 116 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_FIRST_OCTET_INVALID); |
106 | goto err; | 117 | goto err; |
107 | } | 118 | } |
108 | if (MSBits == 0) | 119 | if (MSBits == 0) |
@@ -112,12 +123,12 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
112 | } | 123 | } |
113 | if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */ | 124 | if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */ |
114 | { | 125 | { |
115 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_DATA_TOO_LARGE); | 126 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_DATA_TOO_LARGE); |
116 | goto err; | 127 | goto err; |
117 | } | 128 | } |
118 | if (EM[emLen - 1] != 0xbc) | 129 | if (EM[emLen - 1] != 0xbc) |
119 | { | 130 | { |
120 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_LAST_OCTET_INVALID); | 131 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_LAST_OCTET_INVALID); |
121 | goto err; | 132 | goto err; |
122 | } | 133 | } |
123 | maskedDBLen = emLen - hLen - 1; | 134 | maskedDBLen = emLen - hLen - 1; |
@@ -125,10 +136,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
125 | DB = OPENSSL_malloc(maskedDBLen); | 136 | DB = OPENSSL_malloc(maskedDBLen); |
126 | if (!DB) | 137 | if (!DB) |
127 | { | 138 | { |
128 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, ERR_R_MALLOC_FAILURE); | 139 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, ERR_R_MALLOC_FAILURE); |
129 | goto err; | 140 | goto err; |
130 | } | 141 | } |
131 | if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, Hash) < 0) | 142 | if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash) < 0) |
132 | goto err; | 143 | goto err; |
133 | for (i = 0; i < maskedDBLen; i++) | 144 | for (i = 0; i < maskedDBLen; i++) |
134 | DB[i] ^= EM[i]; | 145 | DB[i] ^= EM[i]; |
@@ -137,25 +148,28 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
137 | for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ; | 148 | for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ; |
138 | if (DB[i++] != 0x1) | 149 | if (DB[i++] != 0x1) |
139 | { | 150 | { |
140 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_RECOVERY_FAILED); | 151 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_RECOVERY_FAILED); |
141 | goto err; | 152 | goto err; |
142 | } | 153 | } |
143 | if (sLen >= 0 && (maskedDBLen - i) != sLen) | 154 | if (sLen >= 0 && (maskedDBLen - i) != sLen) |
144 | { | 155 | { |
145 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); | 156 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); |
146 | goto err; | 157 | goto err; |
147 | } | 158 | } |
148 | EVP_MD_CTX_init(&ctx); | 159 | if (!EVP_DigestInit_ex(&ctx, Hash, NULL) |
149 | EVP_DigestInit_ex(&ctx, Hash, NULL); | 160 | || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes) |
150 | EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes); | 161 | || !EVP_DigestUpdate(&ctx, mHash, hLen)) |
151 | EVP_DigestUpdate(&ctx, mHash, hLen); | 162 | goto err; |
152 | if (maskedDBLen - i) | 163 | if (maskedDBLen - i) |
153 | EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i); | 164 | { |
154 | EVP_DigestFinal(&ctx, H_, NULL); | 165 | if (!EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i)) |
155 | EVP_MD_CTX_cleanup(&ctx); | 166 | goto err; |
167 | } | ||
168 | if (!EVP_DigestFinal_ex(&ctx, H_, NULL)) | ||
169 | goto err; | ||
156 | if (memcmp(H_, H, hLen)) | 170 | if (memcmp(H_, H, hLen)) |
157 | { | 171 | { |
158 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_BAD_SIGNATURE); | 172 | RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_BAD_SIGNATURE); |
159 | ret = 0; | 173 | ret = 0; |
160 | } | 174 | } |
161 | else | 175 | else |
@@ -164,6 +178,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, | |||
164 | err: | 178 | err: |
165 | if (DB) | 179 | if (DB) |
166 | OPENSSL_free(DB); | 180 | OPENSSL_free(DB); |
181 | EVP_MD_CTX_cleanup(&ctx); | ||
167 | 182 | ||
168 | return ret; | 183 | return ret; |
169 | 184 | ||
@@ -173,12 +188,22 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
173 | const unsigned char *mHash, | 188 | const unsigned char *mHash, |
174 | const EVP_MD *Hash, int sLen) | 189 | const EVP_MD *Hash, int sLen) |
175 | { | 190 | { |
191 | return RSA_padding_add_PKCS1_PSS_mgf1(rsa, EM, mHash, Hash, NULL, sLen); | ||
192 | } | ||
193 | |||
194 | int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM, | ||
195 | const unsigned char *mHash, | ||
196 | const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen) | ||
197 | { | ||
176 | int i; | 198 | int i; |
177 | int ret = 0; | 199 | int ret = 0; |
178 | int hLen, maskedDBLen, MSBits, emLen; | 200 | int hLen, maskedDBLen, MSBits, emLen; |
179 | unsigned char *H, *salt = NULL, *p; | 201 | unsigned char *H, *salt = NULL, *p; |
180 | EVP_MD_CTX ctx; | 202 | EVP_MD_CTX ctx; |
181 | 203 | ||
204 | if (mgf1Hash == NULL) | ||
205 | mgf1Hash = Hash; | ||
206 | |||
182 | hLen = EVP_MD_size(Hash); | 207 | hLen = EVP_MD_size(Hash); |
183 | if (hLen < 0) | 208 | if (hLen < 0) |
184 | goto err; | 209 | goto err; |
@@ -192,7 +217,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
192 | else if (sLen == -2) sLen = -2; | 217 | else if (sLen == -2) sLen = -2; |
193 | else if (sLen < -2) | 218 | else if (sLen < -2) |
194 | { | 219 | { |
195 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); | 220 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); |
196 | goto err; | 221 | goto err; |
197 | } | 222 | } |
198 | 223 | ||
@@ -209,8 +234,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
209 | } | 234 | } |
210 | else if (emLen < (hLen + sLen + 2)) | 235 | else if (emLen < (hLen + sLen + 2)) |
211 | { | 236 | { |
212 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, | 237 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); |
213 | RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); | ||
214 | goto err; | 238 | goto err; |
215 | } | 239 | } |
216 | if (sLen > 0) | 240 | if (sLen > 0) |
@@ -218,8 +242,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
218 | salt = OPENSSL_malloc(sLen); | 242 | salt = OPENSSL_malloc(sLen); |
219 | if (!salt) | 243 | if (!salt) |
220 | { | 244 | { |
221 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, | 245 | RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,ERR_R_MALLOC_FAILURE); |
222 | ERR_R_MALLOC_FAILURE); | ||
223 | goto err; | 246 | goto err; |
224 | } | 247 | } |
225 | if (RAND_bytes(salt, sLen) <= 0) | 248 | if (RAND_bytes(salt, sLen) <= 0) |
@@ -228,16 +251,18 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, | |||
228 | maskedDBLen = emLen - hLen - 1; | 251 | maskedDBLen = emLen - hLen - 1; |
229 | H = EM + maskedDBLen; | 252 | H = EM + maskedDBLen; |
230 | EVP_MD_CTX_init(&ctx); | 253 | EVP_MD_CTX_init(&ctx); |
231 | EVP_DigestInit_ex(&ctx, Hash, NULL); | 254 | if (!EVP_DigestInit_ex(&ctx, Hash, NULL) |
232 | EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes); | 255 | || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes) |
233 | EVP_DigestUpdate(&ctx, mHash, hLen); | 256 | || !EVP_DigestUpdate(&ctx, mHash, hLen)) |
234 | if (sLen) | 257 | goto err; |
235 | EVP_DigestUpdate(&ctx, salt, sLen); | 258 | if (sLen && !EVP_DigestUpdate(&ctx, salt, sLen)) |
236 | EVP_DigestFinal(&ctx, H, NULL); | 259 | goto err; |
260 | if (!EVP_DigestFinal_ex(&ctx, H, NULL)) | ||
261 | goto err; | ||
237 | EVP_MD_CTX_cleanup(&ctx); | 262 | EVP_MD_CTX_cleanup(&ctx); |
238 | 263 | ||
239 | /* Generate dbMask in place then perform XOR on it */ | 264 | /* Generate dbMask in place then perform XOR on it */ |
240 | if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, Hash)) | 265 | if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash)) |
241 | goto err; | 266 | goto err; |
242 | 267 | ||
243 | p = EM; | 268 | p = EM; |
diff --git a/src/lib/libcrypto/s390xcap.c b/src/lib/libcrypto/s390xcap.c index ffbe0235f9..f2e94ef47e 100644 --- a/src/lib/libcrypto/s390xcap.c +++ b/src/lib/libcrypto/s390xcap.c | |||
@@ -4,7 +4,7 @@ | |||
4 | #include <setjmp.h> | 4 | #include <setjmp.h> |
5 | #include <signal.h> | 5 | #include <signal.h> |
6 | 6 | ||
7 | extern unsigned long OPENSSL_s390xcap_P; | 7 | extern unsigned long OPENSSL_s390xcap_P[]; |
8 | 8 | ||
9 | static sigjmp_buf ill_jmp; | 9 | static sigjmp_buf ill_jmp; |
10 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } | 10 | static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } |
@@ -16,7 +16,9 @@ void OPENSSL_cpuid_setup(void) | |||
16 | sigset_t oset; | 16 | sigset_t oset; |
17 | struct sigaction ill_act,oact; | 17 | struct sigaction ill_act,oact; |
18 | 18 | ||
19 | if (OPENSSL_s390xcap_P) return; | 19 | if (OPENSSL_s390xcap_P[0]) return; |
20 | |||
21 | OPENSSL_s390xcap_P[0] = 1UL<<(8*sizeof(unsigned long)-1); | ||
20 | 22 | ||
21 | memset(&ill_act,0,sizeof(ill_act)); | 23 | memset(&ill_act,0,sizeof(ill_act)); |
22 | ill_act.sa_handler = ill_handler; | 24 | ill_act.sa_handler = ill_handler; |
@@ -27,10 +29,8 @@ void OPENSSL_cpuid_setup(void) | |||
27 | sigaction (SIGILL,&ill_act,&oact); | 29 | sigaction (SIGILL,&ill_act,&oact); |
28 | 30 | ||
29 | /* protection against missing store-facility-list-extended */ | 31 | /* protection against missing store-facility-list-extended */ |
30 | if (sigsetjmp(ill_jmp,0) == 0) | 32 | if (sigsetjmp(ill_jmp,1) == 0) |
31 | OPENSSL_s390xcap_P = OPENSSL_s390x_facilities(); | 33 | OPENSSL_s390x_facilities(); |
32 | else | ||
33 | OPENSSL_s390xcap_P = 1UL<<63; | ||
34 | 34 | ||
35 | sigaction (SIGILL,&oact,NULL); | 35 | sigaction (SIGILL,&oact,NULL); |
36 | sigprocmask(SIG_SETMASK,&oset,NULL); | 36 | sigprocmask(SIG_SETMASK,&oset,NULL); |
diff --git a/src/lib/libcrypto/s390xcpuid.S b/src/lib/libcrypto/s390xcpuid.S index b053c6a281..06815347e6 100644 --- a/src/lib/libcrypto/s390xcpuid.S +++ b/src/lib/libcrypto/s390xcpuid.S | |||
@@ -5,10 +5,14 @@ | |||
5 | .align 16 | 5 | .align 16 |
6 | OPENSSL_s390x_facilities: | 6 | OPENSSL_s390x_facilities: |
7 | lghi %r0,0 | 7 | lghi %r0,0 |
8 | .long 0xb2b0f010 # stfle 16(%r15) | 8 | larl %r2,OPENSSL_s390xcap_P |
9 | lg %r2,16(%r15) | 9 | stg %r0,8(%r2) |
10 | larl %r1,OPENSSL_s390xcap_P | 10 | .long 0xb2b02000 # stfle 0(%r2) |
11 | stg %r2,0(%r1) | 11 | brc 8,.Ldone |
12 | lghi %r0,1 | ||
13 | .long 0xb2b02000 # stfle 0(%r2) | ||
14 | .Ldone: | ||
15 | lg %r2,0(%r2) | ||
12 | br %r14 | 16 | br %r14 |
13 | .size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities | 17 | .size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities |
14 | 18 | ||
@@ -58,6 +62,9 @@ OPENSSL_wipe_cpu: | |||
58 | .type OPENSSL_cleanse,@function | 62 | .type OPENSSL_cleanse,@function |
59 | .align 16 | 63 | .align 16 |
60 | OPENSSL_cleanse: | 64 | OPENSSL_cleanse: |
65 | #if !defined(__s390x__) && !defined(__s390x) | ||
66 | llgfr %r3,%r3 | ||
67 | #endif | ||
61 | lghi %r4,15 | 68 | lghi %r4,15 |
62 | lghi %r0,0 | 69 | lghi %r0,0 |
63 | clgr %r3,%r4 | 70 | clgr %r3,%r4 |
@@ -89,4 +96,4 @@ OPENSSL_cleanse: | |||
89 | .section .init | 96 | .section .init |
90 | brasl %r14,OPENSSL_cpuid_setup | 97 | brasl %r14,OPENSSL_cpuid_setup |
91 | 98 | ||
92 | .comm OPENSSL_s390xcap_P,8,8 | 99 | .comm OPENSSL_s390xcap_P,16,8 |
diff --git a/src/lib/libcrypto/sha/asm/sha1-alpha.pl b/src/lib/libcrypto/sha/asm/sha1-alpha.pl new file mode 100644 index 0000000000..6c4b9251fd --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-alpha.pl | |||
@@ -0,0 +1,322 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # SHA1 block procedure for Alpha. | ||
11 | |||
12 | # On 21264 performance is 33% better than code generated by vendor | ||
13 | # compiler, and 75% better than GCC [3.4], and in absolute terms is | ||
14 | # 8.7 cycles per processed byte. Implementation features vectorized | ||
15 | # byte swap, but not Xupdate. | ||
16 | |||
17 | @X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", | ||
18 | "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15"); | ||
19 | $ctx="a0"; # $16 | ||
20 | $inp="a1"; | ||
21 | $num="a2"; | ||
22 | $A="a3"; | ||
23 | $B="a4"; # 20 | ||
24 | $C="a5"; | ||
25 | $D="t8"; | ||
26 | $E="t9"; @V=($A,$B,$C,$D,$E); | ||
27 | $t0="t10"; # 24 | ||
28 | $t1="t11"; | ||
29 | $t2="ra"; | ||
30 | $t3="t12"; | ||
31 | $K="AT"; # 28 | ||
32 | |||
33 | sub BODY_00_19 { | ||
34 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
35 | my $j=$i+1; | ||
36 | $code.=<<___ if ($i==0); | ||
37 | ldq_u @X[0],0+0($inp) | ||
38 | ldq_u @X[1],0+7($inp) | ||
39 | ___ | ||
40 | $code.=<<___ if (!($i&1) && $i<14); | ||
41 | ldq_u @X[$i+2],($i+2)*4+0($inp) | ||
42 | ldq_u @X[$i+3],($i+2)*4+7($inp) | ||
43 | ___ | ||
44 | $code.=<<___ if (!($i&1) && $i<15); | ||
45 | extql @X[$i],$inp,@X[$i] | ||
46 | extqh @X[$i+1],$inp,@X[$i+1] | ||
47 | |||
48 | or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched | ||
49 | |||
50 | srl @X[$i],24,$t0 # vectorized byte swap | ||
51 | srl @X[$i],8,$t2 | ||
52 | |||
53 | sll @X[$i],8,$t3 | ||
54 | sll @X[$i],24,@X[$i] | ||
55 | zapnot $t0,0x11,$t0 | ||
56 | zapnot $t2,0x22,$t2 | ||
57 | |||
58 | zapnot @X[$i],0x88,@X[$i] | ||
59 | or $t0,$t2,$t0 | ||
60 | zapnot $t3,0x44,$t3 | ||
61 | sll $a,5,$t1 | ||
62 | |||
63 | or @X[$i],$t0,@X[$i] | ||
64 | addl $K,$e,$e | ||
65 | and $b,$c,$t2 | ||
66 | zapnot $a,0xf,$a | ||
67 | |||
68 | or @X[$i],$t3,@X[$i] | ||
69 | srl $a,27,$t0 | ||
70 | bic $d,$b,$t3 | ||
71 | sll $b,30,$b | ||
72 | |||
73 | extll @X[$i],4,@X[$i+1] # extract upper half | ||
74 | or $t2,$t3,$t2 | ||
75 | addl @X[$i],$e,$e | ||
76 | |||
77 | addl $t1,$e,$e | ||
78 | srl $b,32,$t3 | ||
79 | zapnot @X[$i],0xf,@X[$i] | ||
80 | |||
81 | addl $t0,$e,$e | ||
82 | addl $t2,$e,$e | ||
83 | or $t3,$b,$b | ||
84 | ___ | ||
85 | $code.=<<___ if (($i&1) && $i<15); | ||
86 | sll $a,5,$t1 | ||
87 | addl $K,$e,$e | ||
88 | and $b,$c,$t2 | ||
89 | zapnot $a,0xf,$a | ||
90 | |||
91 | srl $a,27,$t0 | ||
92 | addl @X[$i%16],$e,$e | ||
93 | bic $d,$b,$t3 | ||
94 | sll $b,30,$b | ||
95 | |||
96 | or $t2,$t3,$t2 | ||
97 | addl $t1,$e,$e | ||
98 | srl $b,32,$t3 | ||
99 | zapnot @X[$i],0xf,@X[$i] | ||
100 | |||
101 | addl $t0,$e,$e | ||
102 | addl $t2,$e,$e | ||
103 | or $t3,$b,$b | ||
104 | ___ | ||
105 | $code.=<<___ if ($i>=15); # with forward Xupdate | ||
106 | sll $a,5,$t1 | ||
107 | addl $K,$e,$e | ||
108 | and $b,$c,$t2 | ||
109 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
110 | |||
111 | zapnot $a,0xf,$a | ||
112 | addl @X[$i%16],$e,$e | ||
113 | bic $d,$b,$t3 | ||
114 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
115 | |||
116 | srl $a,27,$t0 | ||
117 | addl $t1,$e,$e | ||
118 | or $t2,$t3,$t2 | ||
119 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
120 | |||
121 | sll $b,30,$b | ||
122 | addl $t0,$e,$e | ||
123 | srl @X[$j%16],31,$t1 | ||
124 | |||
125 | addl $t2,$e,$e | ||
126 | srl $b,32,$t3 | ||
127 | addl @X[$j%16],@X[$j%16],@X[$j%16] | ||
128 | |||
129 | or $t3,$b,$b | ||
130 | zapnot @X[$i%16],0xf,@X[$i%16] | ||
131 | or $t1,@X[$j%16],@X[$j%16] | ||
132 | ___ | ||
133 | } | ||
134 | |||
135 | sub BODY_20_39 { | ||
136 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
137 | my $j=$i+1; | ||
138 | $code.=<<___ if ($i<79); # with forward Xupdate | ||
139 | sll $a,5,$t1 | ||
140 | addl $K,$e,$e | ||
141 | zapnot $a,0xf,$a | ||
142 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
143 | |||
144 | sll $b,30,$t3 | ||
145 | addl $t1,$e,$e | ||
146 | xor $b,$c,$t2 | ||
147 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
148 | |||
149 | srl $b,2,$b | ||
150 | addl @X[$i%16],$e,$e | ||
151 | xor $d,$t2,$t2 | ||
152 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
153 | |||
154 | srl @X[$j%16],31,$t1 | ||
155 | addl $t2,$e,$e | ||
156 | srl $a,27,$t0 | ||
157 | addl @X[$j%16],@X[$j%16],@X[$j%16] | ||
158 | |||
159 | or $t3,$b,$b | ||
160 | addl $t0,$e,$e | ||
161 | or $t1,@X[$j%16],@X[$j%16] | ||
162 | ___ | ||
163 | $code.=<<___ if ($i<77); | ||
164 | zapnot @X[$i%16],0xf,@X[$i%16] | ||
165 | ___ | ||
166 | $code.=<<___ if ($i==79); # with context fetch | ||
167 | sll $a,5,$t1 | ||
168 | addl $K,$e,$e | ||
169 | zapnot $a,0xf,$a | ||
170 | ldl @X[0],0($ctx) | ||
171 | |||
172 | sll $b,30,$t3 | ||
173 | addl $t1,$e,$e | ||
174 | xor $b,$c,$t2 | ||
175 | ldl @X[1],4($ctx) | ||
176 | |||
177 | srl $b,2,$b | ||
178 | addl @X[$i%16],$e,$e | ||
179 | xor $d,$t2,$t2 | ||
180 | ldl @X[2],8($ctx) | ||
181 | |||
182 | srl $a,27,$t0 | ||
183 | addl $t2,$e,$e | ||
184 | ldl @X[3],12($ctx) | ||
185 | |||
186 | or $t3,$b,$b | ||
187 | addl $t0,$e,$e | ||
188 | ldl @X[4],16($ctx) | ||
189 | ___ | ||
190 | } | ||
191 | |||
192 | sub BODY_40_59 { | ||
193 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
194 | my $j=$i+1; | ||
195 | $code.=<<___; # with forward Xupdate | ||
196 | sll $a,5,$t1 | ||
197 | addl $K,$e,$e | ||
198 | zapnot $a,0xf,$a | ||
199 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
200 | |||
201 | srl $a,27,$t0 | ||
202 | and $b,$c,$t2 | ||
203 | and $b,$d,$t3 | ||
204 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
205 | |||
206 | sll $b,30,$b | ||
207 | addl $t1,$e,$e | ||
208 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
209 | |||
210 | srl @X[$j%16],31,$t1 | ||
211 | addl $t0,$e,$e | ||
212 | or $t2,$t3,$t2 | ||
213 | and $c,$d,$t3 | ||
214 | |||
215 | or $t2,$t3,$t2 | ||
216 | srl $b,32,$t3 | ||
217 | addl @X[$i%16],$e,$e | ||
218 | addl @X[$j%16],@X[$j%16],@X[$j%16] | ||
219 | |||
220 | or $t3,$b,$b | ||
221 | addl $t2,$e,$e | ||
222 | or $t1,@X[$j%16],@X[$j%16] | ||
223 | zapnot @X[$i%16],0xf,@X[$i%16] | ||
224 | ___ | ||
225 | } | ||
226 | |||
227 | $code=<<___; | ||
228 | #ifdef __linux__ | ||
229 | #include <asm/regdef.h> | ||
230 | #else | ||
231 | #include <asm.h> | ||
232 | #include <regdef.h> | ||
233 | #endif | ||
234 | |||
235 | .text | ||
236 | |||
237 | .set noat | ||
238 | .set noreorder | ||
239 | .globl sha1_block_data_order | ||
240 | .align 5 | ||
241 | .ent sha1_block_data_order | ||
242 | sha1_block_data_order: | ||
243 | lda sp,-64(sp) | ||
244 | stq ra,0(sp) | ||
245 | stq s0,8(sp) | ||
246 | stq s1,16(sp) | ||
247 | stq s2,24(sp) | ||
248 | stq s3,32(sp) | ||
249 | stq s4,40(sp) | ||
250 | stq s5,48(sp) | ||
251 | stq fp,56(sp) | ||
252 | .mask 0x0400fe00,-64 | ||
253 | .frame sp,64,ra | ||
254 | .prologue 0 | ||
255 | |||
256 | ldl $A,0($ctx) | ||
257 | ldl $B,4($ctx) | ||
258 | sll $num,6,$num | ||
259 | ldl $C,8($ctx) | ||
260 | ldl $D,12($ctx) | ||
261 | ldl $E,16($ctx) | ||
262 | addq $inp,$num,$num | ||
263 | |||
264 | .Lloop: | ||
265 | .set noreorder | ||
266 | ldah $K,23170(zero) | ||
267 | zapnot $B,0xf,$B | ||
268 | lda $K,31129($K) # K_00_19 | ||
269 | ___ | ||
270 | for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | ||
271 | |||
272 | $code.=<<___; | ||
273 | ldah $K,28378(zero) | ||
274 | lda $K,-5215($K) # K_20_39 | ||
275 | ___ | ||
276 | for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
277 | |||
278 | $code.=<<___; | ||
279 | ldah $K,-28900(zero) | ||
280 | lda $K,-17188($K) # K_40_59 | ||
281 | ___ | ||
282 | for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | ||
283 | |||
284 | $code.=<<___; | ||
285 | ldah $K,-13725(zero) | ||
286 | lda $K,-15914($K) # K_60_79 | ||
287 | ___ | ||
288 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
289 | |||
290 | $code.=<<___; | ||
291 | addl @X[0],$A,$A | ||
292 | addl @X[1],$B,$B | ||
293 | addl @X[2],$C,$C | ||
294 | addl @X[3],$D,$D | ||
295 | addl @X[4],$E,$E | ||
296 | stl $A,0($ctx) | ||
297 | stl $B,4($ctx) | ||
298 | addq $inp,64,$inp | ||
299 | stl $C,8($ctx) | ||
300 | stl $D,12($ctx) | ||
301 | stl $E,16($ctx) | ||
302 | cmpult $inp,$num,$t1 | ||
303 | bne $t1,.Lloop | ||
304 | |||
305 | .set noreorder | ||
306 | ldq ra,0(sp) | ||
307 | ldq s0,8(sp) | ||
308 | ldq s1,16(sp) | ||
309 | ldq s2,24(sp) | ||
310 | ldq s3,32(sp) | ||
311 | ldq s4,40(sp) | ||
312 | ldq s5,48(sp) | ||
313 | ldq fp,56(sp) | ||
314 | lda sp,64(sp) | ||
315 | ret (ra) | ||
316 | .end sha1_block_data_order | ||
317 | .ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | ||
318 | .align 2 | ||
319 | ___ | ||
320 | $output=shift and open STDOUT,">$output"; | ||
321 | print $code; | ||
322 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl index 6e65fe3e01..fe8207f77f 100644 --- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl +++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl | |||
@@ -47,6 +47,10 @@ | |||
47 | # Cortex A8 core and in absolute terms ~870 cycles per input block | 47 | # Cortex A8 core and in absolute terms ~870 cycles per input block |
48 | # [or 13.6 cycles per byte]. | 48 | # [or 13.6 cycles per byte]. |
49 | 49 | ||
50 | # February 2011. | ||
51 | # | ||
52 | # Profiler-assisted and platform-specific optimization resulted in 10% | ||
53 | # improvement on Cortex A8 core and 12.2 cycles per byte. | ||
50 | 54 | ||
51 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 55 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
52 | open STDOUT,">$output"; | 56 | open STDOUT,">$output"; |
@@ -76,31 +80,41 @@ $code.=<<___; | |||
76 | add $e,$K,$e,ror#2 @ E+=K_xx_xx | 80 | add $e,$K,$e,ror#2 @ E+=K_xx_xx |
77 | ldr $t3,[$Xi,#2*4] | 81 | ldr $t3,[$Xi,#2*4] |
78 | eor $t0,$t0,$t1 | 82 | eor $t0,$t0,$t1 |
79 | eor $t2,$t2,$t3 | 83 | eor $t2,$t2,$t3 @ 1 cycle stall |
80 | eor $t1,$c,$d @ F_xx_xx | 84 | eor $t1,$c,$d @ F_xx_xx |
81 | mov $t0,$t0,ror#31 | 85 | mov $t0,$t0,ror#31 |
82 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) | 86 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) |
83 | eor $t0,$t0,$t2,ror#31 | 87 | eor $t0,$t0,$t2,ror#31 |
88 | str $t0,[$Xi,#-4]! | ||
84 | $opt1 @ F_xx_xx | 89 | $opt1 @ F_xx_xx |
85 | $opt2 @ F_xx_xx | 90 | $opt2 @ F_xx_xx |
86 | add $e,$e,$t0 @ E+=X[i] | 91 | add $e,$e,$t0 @ E+=X[i] |
87 | str $t0,[$Xi,#-4]! | ||
88 | ___ | 92 | ___ |
89 | } | 93 | } |
90 | 94 | ||
91 | sub BODY_00_15 { | 95 | sub BODY_00_15 { |
92 | my ($a,$b,$c,$d,$e)=@_; | 96 | my ($a,$b,$c,$d,$e)=@_; |
93 | $code.=<<___; | 97 | $code.=<<___; |
94 | ldrb $t0,[$inp],#4 | 98 | #if __ARM_ARCH__<7 |
95 | ldrb $t1,[$inp,#-1] | 99 | ldrb $t1,[$inp,#2] |
96 | ldrb $t2,[$inp,#-2] | 100 | ldrb $t0,[$inp,#3] |
101 | ldrb $t2,[$inp,#1] | ||
97 | add $e,$K,$e,ror#2 @ E+=K_00_19 | 102 | add $e,$K,$e,ror#2 @ E+=K_00_19 |
98 | ldrb $t3,[$inp,#-3] | 103 | ldrb $t3,[$inp],#4 |
104 | orr $t0,$t0,$t1,lsl#8 | ||
105 | eor $t1,$c,$d @ F_xx_xx | ||
106 | orr $t0,$t0,$t2,lsl#16 | ||
99 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) | 107 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) |
100 | orr $t0,$t1,$t0,lsl#24 | 108 | orr $t0,$t0,$t3,lsl#24 |
109 | #else | ||
110 | ldr $t0,[$inp],#4 @ handles unaligned | ||
111 | add $e,$K,$e,ror#2 @ E+=K_00_19 | ||
101 | eor $t1,$c,$d @ F_xx_xx | 112 | eor $t1,$c,$d @ F_xx_xx |
102 | orr $t0,$t0,$t2,lsl#8 | 113 | add $e,$e,$a,ror#27 @ E+=ROR(A,27) |
103 | orr $t0,$t0,$t3,lsl#16 | 114 | #ifdef __ARMEL__ |
115 | rev $t0,$t0 @ byte swap | ||
116 | #endif | ||
117 | #endif | ||
104 | and $t1,$b,$t1,ror#2 | 118 | and $t1,$b,$t1,ror#2 |
105 | add $e,$e,$t0 @ E+=X[i] | 119 | add $e,$e,$t0 @ E+=X[i] |
106 | eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) | 120 | eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) |
@@ -136,6 +150,8 @@ ___ | |||
136 | } | 150 | } |
137 | 151 | ||
138 | $code=<<___; | 152 | $code=<<___; |
153 | #include "arm_arch.h" | ||
154 | |||
139 | .text | 155 | .text |
140 | 156 | ||
141 | .global sha1_block_data_order | 157 | .global sha1_block_data_order |
@@ -209,10 +225,14 @@ $code.=<<___; | |||
209 | teq $inp,$len | 225 | teq $inp,$len |
210 | bne .Lloop @ [+18], total 1307 | 226 | bne .Lloop @ [+18], total 1307 |
211 | 227 | ||
228 | #if __ARM_ARCH__>=5 | ||
229 | ldmia sp!,{r4-r12,pc} | ||
230 | #else | ||
212 | ldmia sp!,{r4-r12,lr} | 231 | ldmia sp!,{r4-r12,lr} |
213 | tst lr,#1 | 232 | tst lr,#1 |
214 | moveq pc,lr @ be binary compatible with V4, yet | 233 | moveq pc,lr @ be binary compatible with V4, yet |
215 | bx lr @ interoperable with Thumb ISA:-) | 234 | bx lr @ interoperable with Thumb ISA:-) |
235 | #endif | ||
216 | .align 2 | 236 | .align 2 |
217 | .LK_00_19: .word 0x5a827999 | 237 | .LK_00_19: .word 0x5a827999 |
218 | .LK_20_39: .word 0x6ed9eba1 | 238 | .LK_20_39: .word 0x6ed9eba1 |
diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl index 51c4f47ecb..db28f0805a 100644 --- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl +++ b/src/lib/libcrypto/sha/asm/sha1-ia64.pl | |||
@@ -15,7 +15,7 @@ | |||
15 | # is >50% better than HP C and >2x better than gcc. | 15 | # is >50% better than HP C and >2x better than gcc. |
16 | 16 | ||
17 | $code=<<___; | 17 | $code=<<___; |
18 | .ident \"sha1-ia64.s, version 1.2\" | 18 | .ident \"sha1-ia64.s, version 1.3\" |
19 | .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" | 19 | .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" |
20 | .explicit | 20 | .explicit |
21 | 21 | ||
@@ -26,14 +26,10 @@ if ($^O eq "hpux") { | |||
26 | $ADDP="addp4"; | 26 | $ADDP="addp4"; |
27 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } | 27 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } |
28 | } else { $ADDP="add"; } | 28 | } else { $ADDP="add"; } |
29 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); | ||
30 | $big_endian=0 if (/\-DL_ENDIAN/); } | ||
31 | if (!defined($big_endian)) | ||
32 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
33 | 29 | ||
34 | #$human=1; | 30 | #$human=1; |
35 | if ($human) { # useful for visual code auditing... | 31 | if ($human) { # useful for visual code auditing... |
36 | ($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T"); | 32 | ($A,$B,$C,$D,$E) = ("A","B","C","D","E"); |
37 | ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4"); | 33 | ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4"); |
38 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = | 34 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = |
39 | ( "K_00_19","K_20_39","K_40_59","K_60_79" ); | 35 | ( "K_00_19","K_20_39","K_40_59","K_60_79" ); |
@@ -41,47 +37,50 @@ if ($human) { # useful for visual code auditing... | |||
41 | "X8", "X9","X10","X11","X12","X13","X14","X15" ); | 37 | "X8", "X9","X10","X11","X12","X13","X14","X15" ); |
42 | } | 38 | } |
43 | else { | 39 | else { |
44 | ($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5"); | 40 | ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4"); |
45 | ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10"); | 41 | ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9"); |
46 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = | 42 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = |
47 | ( "r14", "r15", "loc11", "loc12" ); | 43 | ( "r14", "r15", "loc10", "loc11" ); |
48 | @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", | 44 | @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", |
49 | "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ); | 45 | "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ); |
50 | } | 46 | } |
51 | 47 | ||
52 | sub BODY_00_15 { | 48 | sub BODY_00_15 { |
53 | local *code=shift; | 49 | local *code=shift; |
54 | local ($i,$a,$b,$c,$d,$e,$f)=@_; | 50 | my ($i,$a,$b,$c,$d,$e)=@_; |
51 | my $j=$i+1; | ||
52 | my $Xn=@X[$j%16]; | ||
55 | 53 | ||
56 | $code.=<<___ if ($i==0); | 54 | $code.=<<___ if ($i==0); |
57 | { .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB | 55 | { .mmi; ld1 $X[$i]=[inp],2 // MSB |
58 | ld1 tmp2=[tmp3],2 };; | 56 | ld1 tmp2=[tmp3],2 };; |
59 | { .mmi; ld1 tmp0=[inp],2 | 57 | { .mmi; ld1 tmp0=[inp],2 |
60 | ld1 tmp4=[tmp3],2 // LSB | 58 | ld1 tmp4=[tmp3],2 // LSB |
61 | dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };; | 59 | dep $X[$i]=$X[$i],tmp2,8,8 };; |
62 | ___ | 60 | ___ |
63 | if ($i<15) { | 61 | if ($i<15) { |
64 | $code.=<<___; | 62 | $code.=<<___; |
65 | { .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1 | 63 | { .mmi; ld1 $Xn=[inp],2 // forward Xload |
64 | nop.m 0x0 | ||
66 | dep tmp1=tmp0,tmp4,8,8 };; | 65 | dep tmp1=tmp0,tmp4,8,8 };; |
67 | { .mmi; ld1 tmp2=[tmp3],2 // +1 | 66 | { .mmi; ld1 tmp2=[tmp3],2 // forward Xload |
68 | and tmp4=$c,$b | 67 | and tmp4=$c,$b |
69 | dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; | 68 | dep $X[$i]=$X[$i],tmp1,16,16} //;; |
70 | { .mmi; andcm tmp1=$d,$b | 69 | { .mmi; add $e=$e,$K_00_19 // e+=K_00_19 |
71 | add tmp0=$e,$K_00_19 | 70 | andcm tmp1=$d,$b |
72 | dep.z tmp5=$a,5,27 };; // a<<5 | 71 | dep.z tmp5=$a,5,27 };; // a<<5 |
73 | { .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | 72 | { .mmi; add $e=$e,$X[$i] // e+=Xload |
74 | add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 | 73 | or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) |
75 | extr.u tmp1=$a,27,5 };; // a>>27 | 74 | extr.u tmp1=$a,27,5 };; // a>>27 |
76 | { .mmi; ld1 tmp0=[inp],2 // +1 | 75 | { .mmi; ld1 tmp0=[inp],2 // forward Xload |
77 | add $f=$f,tmp4 // f+=F_00_19(b,c,d) | 76 | add $e=$e,tmp4 // e+=F_00_19(b,c,d) |
78 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 77 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
79 | { .mmi; ld1 tmp4=[tmp3],2 // +1 | 78 | { .mmi; ld1 tmp4=[tmp3],2 // forward Xload |
80 | or tmp5=tmp1,tmp5 // ROTATE(a,5) | 79 | or tmp5=tmp1,tmp5 // ROTATE(a,5) |
81 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 80 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
82 | { .mii; add $f=$f,tmp5 // f+=ROTATE(a,5) | 81 | { .mii; add $e=$e,tmp5 // e+=ROTATE(a,5) |
83 | dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1 | 82 | dep $Xn=$Xn,tmp2,8,8 // forward Xload |
84 | mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;; | 83 | mux2 $X[$i]=$X[$i],0x44 } //;; |
85 | 84 | ||
86 | ___ | 85 | ___ |
87 | } | 86 | } |
@@ -89,24 +88,24 @@ else { | |||
89 | $code.=<<___; | 88 | $code.=<<___; |
90 | { .mii; and tmp3=$c,$b | 89 | { .mii; and tmp3=$c,$b |
91 | dep tmp1=tmp0,tmp4,8,8;; | 90 | dep tmp1=tmp0,tmp4,8,8;; |
92 | dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; | 91 | dep $X[$i]=$X[$i],tmp1,16,16} //;; |
93 | { .mmi; andcm tmp1=$d,$b | 92 | { .mmi; add $e=$e,$K_00_19 // e+=K_00_19 |
94 | add tmp0=$e,$K_00_19 | 93 | andcm tmp1=$d,$b |
95 | dep.z tmp5=$a,5,27 };; // a<<5 | 94 | dep.z tmp5=$a,5,27 };; // a<<5 |
96 | { .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | 95 | { .mmi; add $e=$e,$X[$i] // e+=Xupdate |
97 | add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 | 96 | or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) |
98 | extr.u tmp1=$a,27,5 } // a>>27 | 97 | extr.u tmp1=$a,27,5 } // a>>27 |
99 | { .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 98 | { .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate |
100 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 99 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate |
101 | nop.i 0 };; | 100 | nop.i 0 };; |
102 | { .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d) | 101 | { .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d) |
103 | xor tmp2=tmp2,tmp3 // +1 | 102 | xor $Xn=$Xn,tmp3 // forward Xupdate |
104 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 103 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
105 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 104 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) |
106 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 105 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
107 | { .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) | 106 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) |
108 | shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 107 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
109 | mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };; | 108 | mux2 $X[$i]=$X[$i],0x44 };; |
110 | 109 | ||
111 | ___ | 110 | ___ |
112 | } | 111 | } |
@@ -114,27 +113,28 @@ ___ | |||
114 | 113 | ||
115 | sub BODY_16_19 { | 114 | sub BODY_16_19 { |
116 | local *code=shift; | 115 | local *code=shift; |
117 | local ($i,$a,$b,$c,$d,$e,$f)=@_; | 116 | my ($i,$a,$b,$c,$d,$e)=@_; |
117 | my $j=$i+1; | ||
118 | my $Xn=@X[$j%16]; | ||
118 | 119 | ||
119 | $code.=<<___; | 120 | $code.=<<___; |
120 | { .mmi; mov $X[$i&0xf]=$f // Xupdate | 121 | { .mib; add $e=$e,$K_00_19 // e+=K_00_19 |
121 | and tmp0=$c,$b | ||
122 | dep.z tmp5=$a,5,27 } // a<<5 | 122 | dep.z tmp5=$a,5,27 } // a<<5 |
123 | { .mmi; andcm tmp1=$d,$b | 123 | { .mib; andcm tmp1=$d,$b |
124 | add tmp4=$e,$K_00_19 };; | 124 | and tmp0=$c,$b };; |
125 | { .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | 125 | { .mmi; add $e=$e,$X[$i%16] // e+=Xupdate |
126 | add $f=$f,tmp4 // f+=e+K_00_19 | 126 | or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) |
127 | extr.u tmp1=$a,27,5 } // a>>27 | 127 | extr.u tmp1=$a,27,5 } // a>>27 |
128 | { .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 128 | { .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate |
129 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 129 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate |
130 | nop.i 0 };; | 130 | nop.i 0 };; |
131 | { .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d) | 131 | { .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d) |
132 | xor tmp2=tmp2,tmp3 // +1 | 132 | xor $Xn=$Xn,tmp3 // forward Xupdate |
133 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 133 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
134 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 134 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) |
135 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 135 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
136 | { .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) | 136 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) |
137 | shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 137 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
138 | nop.i 0 };; | 138 | nop.i 0 };; |
139 | 139 | ||
140 | ___ | 140 | ___ |
@@ -142,49 +142,47 @@ ___ | |||
142 | 142 | ||
143 | sub BODY_20_39 { | 143 | sub BODY_20_39 { |
144 | local *code=shift; | 144 | local *code=shift; |
145 | local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_; | 145 | my ($i,$a,$b,$c,$d,$e,$Konst)=@_; |
146 | $Konst = $K_20_39 if (!defined($Konst)); | 146 | $Konst = $K_20_39 if (!defined($Konst)); |
147 | my $j=$i+1; | ||
148 | my $Xn=@X[$j%16]; | ||
147 | 149 | ||
148 | if ($i<79) { | 150 | if ($i<79) { |
149 | $code.=<<___; | 151 | $code.=<<___; |
150 | { .mib; mov $X[$i&0xf]=$f // Xupdate | 152 | { .mib; add $e=$e,$Konst // e+=K_XX_XX |
151 | dep.z tmp5=$a,5,27 } // a<<5 | 153 | dep.z tmp5=$a,5,27 } // a<<5 |
152 | { .mib; xor tmp0=$c,$b | 154 | { .mib; xor tmp0=$c,$b |
153 | add tmp4=$e,$Konst };; | 155 | xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate |
154 | { .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d | 156 | { .mib; add $e=$e,$X[$i%16] // e+=Xupdate |
155 | add $f=$f,tmp4 // f+=e+K_20_39 | ||
156 | extr.u tmp1=$a,27,5 } // a>>27 | 157 | extr.u tmp1=$a,27,5 } // a>>27 |
157 | { .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 158 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d |
158 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 159 | xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate |
159 | nop.i 0 };; | 160 | { .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) |
160 | { .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) | 161 | xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate |
161 | xor tmp2=tmp2,tmp3 // +1 | ||
162 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 162 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
163 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 163 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) |
164 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 164 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
165 | { .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) | 165 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) |
166 | shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 166 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
167 | nop.i 0 };; | 167 | nop.i 0 };; |
168 | 168 | ||
169 | ___ | 169 | ___ |
170 | } | 170 | } |
171 | else { | 171 | else { |
172 | $code.=<<___; | 172 | $code.=<<___; |
173 | { .mib; mov $X[$i&0xf]=$f // Xupdate | 173 | { .mib; add $e=$e,$Konst // e+=K_60_79 |
174 | dep.z tmp5=$a,5,27 } // a<<5 | 174 | dep.z tmp5=$a,5,27 } // a<<5 |
175 | { .mib; xor tmp0=$c,$b | 175 | { .mib; xor tmp0=$c,$b |
176 | add tmp4=$e,$Konst };; | ||
177 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d | ||
178 | extr.u tmp1=$a,27,5 } // a>>27 | ||
179 | { .mib; add $f=$f,tmp4 // f+=e+K_20_39 | ||
180 | add $h1=$h1,$a };; // wrap up | 176 | add $h1=$h1,$a };; // wrap up |
181 | { .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) | 177 | { .mib; add $e=$e,$X[$i%16] // e+=Xupdate |
182 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;? | 178 | extr.u tmp1=$a,27,5 } // a>>27 |
183 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 179 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d |
184 | add $h3=$h3,$c };; // wrap up | 180 | add $h3=$h3,$c };; // wrap up |
185 | { .mib; add tmp3=1,inp // used in unaligned codepath | 181 | { .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) |
186 | add $f=$f,tmp1 } // f+=ROTATE(a,5) | 182 | or tmp1=tmp1,tmp5 // ROTATE(a,5) |
187 | { .mib; add $h2=$h2,$b // wrap up | 183 | shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;? |
184 | { .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5) | ||
185 | add tmp3=1,inp // used in unaligned codepath | ||
188 | add $h4=$h4,$d };; // wrap up | 186 | add $h4=$h4,$d };; // wrap up |
189 | 187 | ||
190 | ___ | 188 | ___ |
@@ -193,29 +191,29 @@ ___ | |||
193 | 191 | ||
194 | sub BODY_40_59 { | 192 | sub BODY_40_59 { |
195 | local *code=shift; | 193 | local *code=shift; |
196 | local ($i,$a,$b,$c,$d,$e,$f)=@_; | 194 | my ($i,$a,$b,$c,$d,$e)=@_; |
195 | my $j=$i+1; | ||
196 | my $Xn=@X[$j%16]; | ||
197 | 197 | ||
198 | $code.=<<___; | 198 | $code.=<<___; |
199 | { .mmi; mov $X[$i&0xf]=$f // Xupdate | 199 | { .mib; add $e=$e,$K_40_59 // e+=K_40_59 |
200 | and tmp0=$c,$b | ||
201 | dep.z tmp5=$a,5,27 } // a<<5 | 200 | dep.z tmp5=$a,5,27 } // a<<5 |
202 | { .mmi; and tmp1=$d,$b | 201 | { .mib; and tmp1=$c,$d |
203 | add tmp4=$e,$K_40_59 };; | 202 | xor tmp0=$c,$d };; |
204 | { .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d) | 203 | { .mmi; add $e=$e,$X[$i%16] // e+=Xupdate |
205 | add $f=$f,tmp4 // f+=e+K_40_59 | 204 | add tmp5=tmp5,tmp1 // a<<5+(c&d) |
206 | extr.u tmp1=$a,27,5 } // a>>27 | 205 | extr.u tmp1=$a,27,5 } // a>>27 |
207 | { .mmi; and tmp4=$c,$d | 206 | { .mmi; and tmp0=tmp0,$b |
208 | xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 | 207 | xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate |
209 | xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 | 208 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate |
210 | };; | 209 | { .mmi; add $e=$e,tmp0 // e+=b&(c^d) |
211 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | 210 | add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d) |
212 | xor tmp2=tmp2,tmp3 // +1 | ||
213 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | 211 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) |
214 | { .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d) | 212 | { .mmi; xor $Xn=$Xn,tmp3 |
215 | mux2 tmp6=$a,0x44 };; // see b in next iteration | 213 | mux2 tmp6=$a,0x44 };; // see b in next iteration |
216 | { .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d) | 214 | { .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d) |
217 | shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) | 215 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) |
218 | add $f=$f,tmp1 };; // f+=ROTATE(a,5) | 216 | nop.i 0x0 };; |
219 | 217 | ||
220 | ___ | 218 | ___ |
221 | } | 219 | } |
@@ -237,7 +235,7 @@ inp=r33; // in1 | |||
237 | .align 32 | 235 | .align 32 |
238 | sha1_block_data_order: | 236 | sha1_block_data_order: |
239 | .prologue | 237 | .prologue |
240 | { .mmi; alloc tmp1=ar.pfs,3,15,0,0 | 238 | { .mmi; alloc tmp1=ar.pfs,3,14,0,0 |
241 | $ADDP tmp0=4,ctx | 239 | $ADDP tmp0=4,ctx |
242 | .save ar.lc,r3 | 240 | .save ar.lc,r3 |
243 | mov r3=ar.lc } | 241 | mov r3=ar.lc } |
@@ -245,8 +243,8 @@ sha1_block_data_order: | |||
245 | $ADDP inp=0,inp | 243 | $ADDP inp=0,inp |
246 | mov r2=pr };; | 244 | mov r2=pr };; |
247 | tmp4=in2; | 245 | tmp4=in2; |
248 | tmp5=loc13; | 246 | tmp5=loc12; |
249 | tmp6=loc14; | 247 | tmp6=loc13; |
250 | .body | 248 | .body |
251 | { .mlx; ld4 $h0=[ctx],8 | 249 | { .mlx; ld4 $h0=[ctx],8 |
252 | movl $K_00_19=0x5a827999 } | 250 | movl $K_00_19=0x5a827999 } |
@@ -273,7 +271,7 @@ tmp6=loc14; | |||
273 | 271 | ||
274 | ___ | 272 | ___ |
275 | 273 | ||
276 | { my $i,@V=($A,$B,$C,$D,$E,$T); | 274 | { my $i,@V=($A,$B,$C,$D,$E); |
277 | 275 | ||
278 | for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } | 276 | for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } |
279 | for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } | 277 | for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } |
@@ -281,12 +279,12 @@ ___ | |||
281 | for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } | 279 | for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } |
282 | for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } | 280 | for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } |
283 | 281 | ||
284 | (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check | 282 | (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check |
285 | } | 283 | } |
286 | 284 | ||
287 | $code.=<<___; | 285 | $code.=<<___; |
288 | { .mmb; add $h0=$h0,$E | 286 | { .mmb; add $h0=$h0,$A |
289 | nop.m 0 | 287 | add $h2=$h2,$C |
290 | br.ctop.dptk.many .Ldtop };; | 288 | br.ctop.dptk.many .Ldtop };; |
291 | .Ldend: | 289 | .Ldend: |
292 | { .mmi; add tmp0=4,ctx | 290 | { .mmi; add tmp0=4,ctx |
diff --git a/src/lib/libcrypto/sha/asm/sha1-mips.pl b/src/lib/libcrypto/sha/asm/sha1-mips.pl new file mode 100644 index 0000000000..f1a702f38f --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-mips.pl | |||
@@ -0,0 +1,354 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # SHA1 block procedure for MIPS. | ||
11 | |||
12 | # Performance improvement is 30% on unaligned input. The "secret" is | ||
13 | # to deploy lwl/lwr pair to load unaligned input. One could have | ||
14 | # vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- | ||
15 | # compatible subroutine. There is room for minor optimization on | ||
16 | # little-endian platforms... | ||
17 | |||
18 | ###################################################################### | ||
19 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
20 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
21 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
22 | # manner. Therefore let's stick to NUBI register layout: | ||
23 | # | ||
24 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
25 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
26 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
27 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
28 | # | ||
29 | # The return value is placed in $a0. Following coding rules facilitate | ||
30 | # interoperability: | ||
31 | # | ||
32 | # - never ever touch $tp, "thread pointer", former $gp; | ||
33 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
34 | # old code]; | ||
35 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
36 | # | ||
37 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
38 | # | ||
39 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
40 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
41 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
42 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
43 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
44 | # | ||
45 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
46 | |||
47 | if ($flavour =~ /64|n32/i) { | ||
48 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
49 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
50 | $REG_S="sd"; | ||
51 | $REG_L="ld"; | ||
52 | $PTR_SLL="dsll"; # incidentally works even on n32 | ||
53 | $SZREG=8; | ||
54 | } else { | ||
55 | $PTR_ADD="add"; | ||
56 | $PTR_SUB="sub"; | ||
57 | $REG_S="sw"; | ||
58 | $REG_L="lw"; | ||
59 | $PTR_SLL="sll"; | ||
60 | $SZREG=4; | ||
61 | } | ||
62 | # | ||
63 | # <appro@openssl.org> | ||
64 | # | ||
65 | ###################################################################### | ||
66 | |||
67 | $big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; | ||
68 | |||
69 | for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } | ||
70 | open STDOUT,">$output"; | ||
71 | |||
72 | if (!defined($big_endian)) | ||
73 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
74 | |||
75 | # offsets of the Most and Least Significant Bytes | ||
76 | $MSB=$big_endian?0:3; | ||
77 | $LSB=3&~$MSB; | ||
78 | |||
79 | @X=map("\$$_",(8..23)); # a4-a7,s0-s11 | ||
80 | |||
81 | $ctx=$a0; | ||
82 | $inp=$a1; | ||
83 | $num=$a2; | ||
84 | $A="\$1"; | ||
85 | $B="\$2"; | ||
86 | $C="\$3"; | ||
87 | $D="\$7"; | ||
88 | $E="\$24"; @V=($A,$B,$C,$D,$E); | ||
89 | $t0="\$25"; | ||
90 | $t1=$num; # $num is offloaded to stack | ||
91 | $t2="\$30"; # fp | ||
92 | $K="\$31"; # ra | ||
93 | |||
94 | sub BODY_00_14 { | ||
95 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
96 | my $j=$i+1; | ||
97 | $code.=<<___ if (!$big_endian); | ||
98 | srl $t0,@X[$i],24 # byte swap($i) | ||
99 | srl $t1,@X[$i],8 | ||
100 | andi $t2,@X[$i],0xFF00 | ||
101 | sll @X[$i],@X[$i],24 | ||
102 | andi $t1,0xFF00 | ||
103 | sll $t2,$t2,8 | ||
104 | or @X[$i],$t0 | ||
105 | or $t1,$t2 | ||
106 | or @X[$i],$t1 | ||
107 | ___ | ||
108 | $code.=<<___; | ||
109 | lwl @X[$j],$j*4+$MSB($inp) | ||
110 | sll $t0,$a,5 # $i | ||
111 | addu $e,$K | ||
112 | lwr @X[$j],$j*4+$LSB($inp) | ||
113 | srl $t1,$a,27 | ||
114 | addu $e,$t0 | ||
115 | xor $t0,$c,$d | ||
116 | addu $e,$t1 | ||
117 | sll $t2,$b,30 | ||
118 | and $t0,$b | ||
119 | srl $b,$b,2 | ||
120 | xor $t0,$d | ||
121 | addu $e,@X[$i] | ||
122 | or $b,$t2 | ||
123 | addu $e,$t0 | ||
124 | ___ | ||
125 | } | ||
126 | |||
127 | sub BODY_15_19 { | ||
128 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
129 | my $j=$i+1; | ||
130 | |||
131 | $code.=<<___ if (!$big_endian && $i==15); | ||
132 | srl $t0,@X[$i],24 # byte swap($i) | ||
133 | srl $t1,@X[$i],8 | ||
134 | andi $t2,@X[$i],0xFF00 | ||
135 | sll @X[$i],@X[$i],24 | ||
136 | andi $t1,0xFF00 | ||
137 | sll $t2,$t2,8 | ||
138 | or @X[$i],$t0 | ||
139 | or @X[$i],$t1 | ||
140 | or @X[$i],$t2 | ||
141 | ___ | ||
142 | $code.=<<___; | ||
143 | xor @X[$j%16],@X[($j+2)%16] | ||
144 | sll $t0,$a,5 # $i | ||
145 | addu $e,$K | ||
146 | srl $t1,$a,27 | ||
147 | addu $e,$t0 | ||
148 | xor @X[$j%16],@X[($j+8)%16] | ||
149 | xor $t0,$c,$d | ||
150 | addu $e,$t1 | ||
151 | xor @X[$j%16],@X[($j+13)%16] | ||
152 | sll $t2,$b,30 | ||
153 | and $t0,$b | ||
154 | srl $t1,@X[$j%16],31 | ||
155 | addu @X[$j%16],@X[$j%16] | ||
156 | srl $b,$b,2 | ||
157 | xor $t0,$d | ||
158 | or @X[$j%16],$t1 | ||
159 | addu $e,@X[$i%16] | ||
160 | or $b,$t2 | ||
161 | addu $e,$t0 | ||
162 | ___ | ||
163 | } | ||
164 | |||
165 | sub BODY_20_39 { | ||
166 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
167 | my $j=$i+1; | ||
168 | $code.=<<___ if ($i<79); | ||
169 | xor @X[$j%16],@X[($j+2)%16] | ||
170 | sll $t0,$a,5 # $i | ||
171 | addu $e,$K | ||
172 | srl $t1,$a,27 | ||
173 | addu $e,$t0 | ||
174 | xor @X[$j%16],@X[($j+8)%16] | ||
175 | xor $t0,$c,$d | ||
176 | addu $e,$t1 | ||
177 | xor @X[$j%16],@X[($j+13)%16] | ||
178 | sll $t2,$b,30 | ||
179 | xor $t0,$b | ||
180 | srl $t1,@X[$j%16],31 | ||
181 | addu @X[$j%16],@X[$j%16] | ||
182 | srl $b,$b,2 | ||
183 | addu $e,@X[$i%16] | ||
184 | or @X[$j%16],$t1 | ||
185 | or $b,$t2 | ||
186 | addu $e,$t0 | ||
187 | ___ | ||
188 | $code.=<<___ if ($i==79); | ||
189 | lw @X[0],0($ctx) | ||
190 | sll $t0,$a,5 # $i | ||
191 | addu $e,$K | ||
192 | lw @X[1],4($ctx) | ||
193 | srl $t1,$a,27 | ||
194 | addu $e,$t0 | ||
195 | lw @X[2],8($ctx) | ||
196 | xor $t0,$c,$d | ||
197 | addu $e,$t1 | ||
198 | lw @X[3],12($ctx) | ||
199 | sll $t2,$b,30 | ||
200 | xor $t0,$b | ||
201 | lw @X[4],16($ctx) | ||
202 | srl $b,$b,2 | ||
203 | addu $e,@X[$i%16] | ||
204 | or $b,$t2 | ||
205 | addu $e,$t0 | ||
206 | ___ | ||
207 | } | ||
208 | |||
209 | sub BODY_40_59 { | ||
210 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
211 | my $j=$i+1; | ||
212 | $code.=<<___ if ($i<79); | ||
213 | xor @X[$j%16],@X[($j+2)%16] | ||
214 | sll $t0,$a,5 # $i | ||
215 | addu $e,$K | ||
216 | srl $t1,$a,27 | ||
217 | addu $e,$t0 | ||
218 | xor @X[$j%16],@X[($j+8)%16] | ||
219 | and $t0,$c,$d | ||
220 | addu $e,$t1 | ||
221 | xor @X[$j%16],@X[($j+13)%16] | ||
222 | sll $t2,$b,30 | ||
223 | addu $e,$t0 | ||
224 | srl $t1,@X[$j%16],31 | ||
225 | xor $t0,$c,$d | ||
226 | addu @X[$j%16],@X[$j%16] | ||
227 | and $t0,$b | ||
228 | srl $b,$b,2 | ||
229 | or @X[$j%16],$t1 | ||
230 | addu $e,@X[$i%16] | ||
231 | or $b,$t2 | ||
232 | addu $e,$t0 | ||
233 | ___ | ||
234 | } | ||
235 | |||
236 | $FRAMESIZE=16; # large enough to accomodate NUBI saved registers | ||
237 | $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; | ||
238 | |||
239 | $code=<<___; | ||
240 | #ifdef OPENSSL_FIPSCANISTER | ||
241 | # include <openssl/fipssyms.h> | ||
242 | #endif | ||
243 | |||
244 | .text | ||
245 | |||
246 | .set noat | ||
247 | .set noreorder | ||
248 | .align 5 | ||
249 | .globl sha1_block_data_order | ||
250 | .ent sha1_block_data_order | ||
251 | sha1_block_data_order: | ||
252 | .frame $sp,$FRAMESIZE*$SZREG,$ra | ||
253 | .mask $SAVED_REGS_MASK,-$SZREG | ||
254 | .set noreorder | ||
255 | $PTR_SUB $sp,$FRAMESIZE*$SZREG | ||
256 | $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp) | ||
257 | $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp) | ||
258 | $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp) | ||
259 | $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp) | ||
260 | $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp) | ||
261 | $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp) | ||
262 | $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp) | ||
263 | $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp) | ||
264 | $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp) | ||
265 | $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp) | ||
266 | ___ | ||
267 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
268 | $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp) | ||
269 | $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp) | ||
270 | $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp) | ||
271 | $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp) | ||
272 | $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp) | ||
273 | ___ | ||
274 | $code.=<<___; | ||
275 | $PTR_SLL $num,6 | ||
276 | $PTR_ADD $num,$inp | ||
277 | $REG_S $num,0($sp) | ||
278 | lw $A,0($ctx) | ||
279 | lw $B,4($ctx) | ||
280 | lw $C,8($ctx) | ||
281 | lw $D,12($ctx) | ||
282 | b .Loop | ||
283 | lw $E,16($ctx) | ||
284 | .align 4 | ||
285 | .Loop: | ||
286 | .set reorder | ||
287 | lwl @X[0],$MSB($inp) | ||
288 | lui $K,0x5a82 | ||
289 | lwr @X[0],$LSB($inp) | ||
290 | ori $K,0x7999 # K_00_19 | ||
291 | ___ | ||
292 | for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); } | ||
293 | for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); } | ||
294 | $code.=<<___; | ||
295 | lui $K,0x6ed9 | ||
296 | ori $K,0xeba1 # K_20_39 | ||
297 | ___ | ||
298 | for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
299 | $code.=<<___; | ||
300 | lui $K,0x8f1b | ||
301 | ori $K,0xbcdc # K_40_59 | ||
302 | ___ | ||
303 | for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | ||
304 | $code.=<<___; | ||
305 | lui $K,0xca62 | ||
306 | ori $K,0xc1d6 # K_60_79 | ||
307 | ___ | ||
308 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
309 | $code.=<<___; | ||
310 | $PTR_ADD $inp,64 | ||
311 | $REG_L $num,0($sp) | ||
312 | |||
313 | addu $A,$X[0] | ||
314 | addu $B,$X[1] | ||
315 | sw $A,0($ctx) | ||
316 | addu $C,$X[2] | ||
317 | addu $D,$X[3] | ||
318 | sw $B,4($ctx) | ||
319 | addu $E,$X[4] | ||
320 | sw $C,8($ctx) | ||
321 | sw $D,12($ctx) | ||
322 | sw $E,16($ctx) | ||
323 | .set noreorder | ||
324 | bne $inp,$num,.Loop | ||
325 | nop | ||
326 | |||
327 | .set noreorder | ||
328 | $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp) | ||
329 | $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp) | ||
330 | $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp) | ||
331 | $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp) | ||
332 | $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp) | ||
333 | $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp) | ||
334 | $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp) | ||
335 | $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp) | ||
336 | $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp) | ||
337 | $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp) | ||
338 | ___ | ||
339 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
340 | $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp) | ||
341 | $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp) | ||
342 | $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp) | ||
343 | $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp) | ||
344 | $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp) | ||
345 | ___ | ||
346 | $code.=<<___; | ||
347 | jr $ra | ||
348 | $PTR_ADD $sp,$FRAMESIZE*$SZREG | ||
349 | .end sha1_block_data_order | ||
350 | .rdata | ||
351 | .asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>" | ||
352 | ___ | ||
353 | print $code; | ||
354 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-parisc.pl b/src/lib/libcrypto/sha/asm/sha1-parisc.pl new file mode 100644 index 0000000000..6d7bf495b2 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-parisc.pl | |||
@@ -0,0 +1,259 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # SHA1 block procedure for PA-RISC. | ||
11 | |||
12 | # June 2009. | ||
13 | # | ||
14 | # On PA-7100LC performance is >30% better than gcc 3.2 generated code | ||
15 | # for aligned input and >50% better for unaligned. Compared to vendor | ||
16 | # compiler on PA-8600 it's almost 60% faster in 64-bit build and just | ||
17 | # few percent faster in 32-bit one (this for aligned input, data for | ||
18 | # unaligned input is not available). | ||
19 | # | ||
20 | # Special thanks to polarhome.com for providing HP-UX account. | ||
21 | |||
22 | $flavour = shift; | ||
23 | $output = shift; | ||
24 | open STDOUT,">$output"; | ||
25 | |||
26 | if ($flavour =~ /64/) { | ||
27 | $LEVEL ="2.0W"; | ||
28 | $SIZE_T =8; | ||
29 | $FRAME_MARKER =80; | ||
30 | $SAVED_RP =16; | ||
31 | $PUSH ="std"; | ||
32 | $PUSHMA ="std,ma"; | ||
33 | $POP ="ldd"; | ||
34 | $POPMB ="ldd,mb"; | ||
35 | } else { | ||
36 | $LEVEL ="1.0"; | ||
37 | $SIZE_T =4; | ||
38 | $FRAME_MARKER =48; | ||
39 | $SAVED_RP =20; | ||
40 | $PUSH ="stw"; | ||
41 | $PUSHMA ="stwm"; | ||
42 | $POP ="ldw"; | ||
43 | $POPMB ="ldwm"; | ||
44 | } | ||
45 | |||
46 | $FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker | ||
47 | # [+ argument transfer] | ||
48 | $ctx="%r26"; # arg0 | ||
49 | $inp="%r25"; # arg1 | ||
50 | $num="%r24"; # arg2 | ||
51 | |||
52 | $t0="%r28"; | ||
53 | $t1="%r29"; | ||
54 | $K="%r31"; | ||
55 | |||
56 | @X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", | ||
57 | "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0); | ||
58 | |||
59 | @V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23"); | ||
60 | |||
61 | sub BODY_00_19 { | ||
62 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
63 | my $j=$i+1; | ||
64 | $code.=<<___ if ($i<15); | ||
65 | addl $K,$e,$e ; $i | ||
66 | shd $a,$a,27,$t1 | ||
67 | addl @X[$i],$e,$e | ||
68 | and $c,$b,$t0 | ||
69 | addl $t1,$e,$e | ||
70 | andcm $d,$b,$t1 | ||
71 | shd $b,$b,2,$b | ||
72 | or $t1,$t0,$t0 | ||
73 | addl $t0,$e,$e | ||
74 | ___ | ||
75 | $code.=<<___ if ($i>=15); # with forward Xupdate | ||
76 | addl $K,$e,$e ; $i | ||
77 | shd $a,$a,27,$t1 | ||
78 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
79 | addl @X[$i%16],$e,$e | ||
80 | and $c,$b,$t0 | ||
81 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
82 | addl $t1,$e,$e | ||
83 | andcm $d,$b,$t1 | ||
84 | shd $b,$b,2,$b | ||
85 | or $t1,$t0,$t0 | ||
86 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
87 | add $t0,$e,$e | ||
88 | shd @X[$j%16],@X[$j%16],31,@X[$j%16] | ||
89 | ___ | ||
90 | } | ||
91 | |||
92 | sub BODY_20_39 { | ||
93 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
94 | my $j=$i+1; | ||
95 | $code.=<<___ if ($i<79); | ||
96 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i | ||
97 | addl $K,$e,$e | ||
98 | shd $a,$a,27,$t1 | ||
99 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
100 | addl @X[$i%16],$e,$e | ||
101 | xor $b,$c,$t0 | ||
102 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
103 | addl $t1,$e,$e | ||
104 | shd $b,$b,2,$b | ||
105 | xor $d,$t0,$t0 | ||
106 | shd @X[$j%16],@X[$j%16],31,@X[$j%16] | ||
107 | addl $t0,$e,$e | ||
108 | ___ | ||
109 | $code.=<<___ if ($i==79); # with context load | ||
110 | ldw 0($ctx),@X[0] ; $i | ||
111 | addl $K,$e,$e | ||
112 | shd $a,$a,27,$t1 | ||
113 | ldw 4($ctx),@X[1] | ||
114 | addl @X[$i%16],$e,$e | ||
115 | xor $b,$c,$t0 | ||
116 | ldw 8($ctx),@X[2] | ||
117 | addl $t1,$e,$e | ||
118 | shd $b,$b,2,$b | ||
119 | xor $d,$t0,$t0 | ||
120 | ldw 12($ctx),@X[3] | ||
121 | addl $t0,$e,$e | ||
122 | ldw 16($ctx),@X[4] | ||
123 | ___ | ||
124 | } | ||
125 | |||
126 | sub BODY_40_59 { | ||
127 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
128 | my $j=$i+1; | ||
129 | $code.=<<___; | ||
130 | shd $a,$a,27,$t1 ; $i | ||
131 | addl $K,$e,$e | ||
132 | xor @X[($j+2)%16],@X[$j%16],@X[$j%16] | ||
133 | xor $d,$c,$t0 | ||
134 | addl @X[$i%16],$e,$e | ||
135 | xor @X[($j+8)%16],@X[$j%16],@X[$j%16] | ||
136 | and $b,$t0,$t0 | ||
137 | addl $t1,$e,$e | ||
138 | shd $b,$b,2,$b | ||
139 | xor @X[($j+13)%16],@X[$j%16],@X[$j%16] | ||
140 | addl $t0,$e,$e | ||
141 | and $d,$c,$t1 | ||
142 | shd @X[$j%16],@X[$j%16],31,@X[$j%16] | ||
143 | addl $t1,$e,$e | ||
144 | ___ | ||
145 | } | ||
146 | |||
147 | $code=<<___; | ||
148 | .LEVEL $LEVEL | ||
149 | .SPACE \$TEXT\$ | ||
150 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
151 | |||
152 | .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
153 | sha1_block_data_order | ||
154 | .PROC | ||
155 | .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16 | ||
156 | .ENTRY | ||
157 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
158 | $PUSHMA %r3,$FRAME(%sp) | ||
159 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
160 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
161 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
162 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
163 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
164 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
165 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
166 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
167 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
168 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
169 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
170 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
171 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
172 | |||
173 | ldw 0($ctx),$A | ||
174 | ldw 4($ctx),$B | ||
175 | ldw 8($ctx),$C | ||
176 | ldw 12($ctx),$D | ||
177 | ldw 16($ctx),$E | ||
178 | |||
179 | extru $inp,31,2,$t0 ; t0=inp&3; | ||
180 | sh3addl $t0,%r0,$t0 ; t0*=8; | ||
181 | subi 32,$t0,$t0 ; t0=32-t0; | ||
182 | mtctl $t0,%cr11 ; %sar=t0; | ||
183 | |||
184 | L\$oop | ||
185 | ldi 3,$t0 | ||
186 | andcm $inp,$t0,$t0 ; 64-bit neutral | ||
187 | ___ | ||
188 | for ($i=0;$i<15;$i++) { # load input block | ||
189 | $code.="\tldw `4*$i`($t0),@X[$i]\n"; } | ||
190 | $code.=<<___; | ||
191 | cmpb,*= $inp,$t0,L\$aligned | ||
192 | ldw 60($t0),@X[15] | ||
193 | ldw 64($t0),@X[16] | ||
194 | ___ | ||
195 | for ($i=0;$i<16;$i++) { # align input | ||
196 | $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; } | ||
197 | $code.=<<___; | ||
198 | L\$aligned | ||
199 | ldil L'0x5a827000,$K ; K_00_19 | ||
200 | ldo 0x999($K),$K | ||
201 | ___ | ||
202 | for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | ||
203 | $code.=<<___; | ||
204 | ldil L'0x6ed9e000,$K ; K_20_39 | ||
205 | ldo 0xba1($K),$K | ||
206 | ___ | ||
207 | |||
208 | for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
209 | $code.=<<___; | ||
210 | ldil L'0x8f1bb000,$K ; K_40_59 | ||
211 | ldo 0xcdc($K),$K | ||
212 | ___ | ||
213 | |||
214 | for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | ||
215 | $code.=<<___; | ||
216 | ldil L'0xca62c000,$K ; K_60_79 | ||
217 | ldo 0x1d6($K),$K | ||
218 | ___ | ||
219 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
220 | |||
221 | $code.=<<___; | ||
222 | addl @X[0],$A,$A | ||
223 | addl @X[1],$B,$B | ||
224 | addl @X[2],$C,$C | ||
225 | addl @X[3],$D,$D | ||
226 | addl @X[4],$E,$E | ||
227 | stw $A,0($ctx) | ||
228 | stw $B,4($ctx) | ||
229 | stw $C,8($ctx) | ||
230 | stw $D,12($ctx) | ||
231 | stw $E,16($ctx) | ||
232 | addib,*<> -1,$num,L\$oop | ||
233 | ldo 64($inp),$inp | ||
234 | |||
235 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
236 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
237 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
238 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
239 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
240 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
241 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
242 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
243 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
244 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
245 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
246 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
247 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
248 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
249 | bv (%r2) | ||
250 | .EXIT | ||
251 | $POPMB -$FRAME(%sp),%r3 | ||
252 | .PROCEND | ||
253 | .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
254 | ___ | ||
255 | |||
256 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
257 | $code =~ s/,\*/,/gm if ($SIZE_T==4); | ||
258 | print $code; | ||
259 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl index dcd0fcdfcf..2140dd2f8d 100755 --- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl +++ b/src/lib/libcrypto/sha/asm/sha1-ppc.pl | |||
@@ -24,12 +24,14 @@ $flavour = shift; | |||
24 | 24 | ||
25 | if ($flavour =~ /64/) { | 25 | if ($flavour =~ /64/) { |
26 | $SIZE_T =8; | 26 | $SIZE_T =8; |
27 | $LRSAVE =2*$SIZE_T; | ||
27 | $UCMP ="cmpld"; | 28 | $UCMP ="cmpld"; |
28 | $STU ="stdu"; | 29 | $STU ="stdu"; |
29 | $POP ="ld"; | 30 | $POP ="ld"; |
30 | $PUSH ="std"; | 31 | $PUSH ="std"; |
31 | } elsif ($flavour =~ /32/) { | 32 | } elsif ($flavour =~ /32/) { |
32 | $SIZE_T =4; | 33 | $SIZE_T =4; |
34 | $LRSAVE =$SIZE_T; | ||
33 | $UCMP ="cmplw"; | 35 | $UCMP ="cmplw"; |
34 | $STU ="stwu"; | 36 | $STU ="stwu"; |
35 | $POP ="lwz"; | 37 | $POP ="lwz"; |
@@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl"; | |||
43 | 45 | ||
44 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | 46 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
45 | 47 | ||
46 | $FRAME=24*$SIZE_T; | 48 | $FRAME=24*$SIZE_T+64; |
49 | $LOCALS=6*$SIZE_T; | ||
47 | 50 | ||
48 | $K ="r0"; | 51 | $K ="r0"; |
49 | $sp ="r1"; | 52 | $sp ="r1"; |
@@ -162,9 +165,8 @@ $code=<<___; | |||
162 | .globl .sha1_block_data_order | 165 | .globl .sha1_block_data_order |
163 | .align 4 | 166 | .align 4 |
164 | .sha1_block_data_order: | 167 | .sha1_block_data_order: |
168 | $STU $sp,-$FRAME($sp) | ||
165 | mflr r0 | 169 | mflr r0 |
166 | $STU $sp,`-($FRAME+64)`($sp) | ||
167 | $PUSH r0,`$FRAME-$SIZE_T*18`($sp) | ||
168 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) | 170 | $PUSH r15,`$FRAME-$SIZE_T*17`($sp) |
169 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) | 171 | $PUSH r16,`$FRAME-$SIZE_T*16`($sp) |
170 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) | 172 | $PUSH r17,`$FRAME-$SIZE_T*15`($sp) |
@@ -182,6 +184,7 @@ $code=<<___; | |||
182 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 184 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
183 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 185 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
184 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 186 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
187 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
185 | lwz $A,0($ctx) | 188 | lwz $A,0($ctx) |
186 | lwz $B,4($ctx) | 189 | lwz $B,4($ctx) |
187 | lwz $C,8($ctx) | 190 | lwz $C,8($ctx) |
@@ -192,37 +195,14 @@ $code=<<___; | |||
192 | Laligned: | 195 | Laligned: |
193 | mtctr $num | 196 | mtctr $num |
194 | bl Lsha1_block_private | 197 | bl Lsha1_block_private |
195 | Ldone: | 198 | b Ldone |
196 | $POP r0,`$FRAME-$SIZE_T*18`($sp) | ||
197 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
198 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
199 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
200 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
201 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
202 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
203 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
204 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
205 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
206 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
207 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
208 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
209 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
210 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
211 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
212 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
213 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
214 | mtlr r0 | ||
215 | addi $sp,$sp,`$FRAME+64` | ||
216 | blr | ||
217 | ___ | ||
218 | 199 | ||
219 | # PowerPC specification allows an implementation to be ill-behaved | 200 | ; PowerPC specification allows an implementation to be ill-behaved |
220 | # upon unaligned access which crosses page boundary. "Better safe | 201 | ; upon unaligned access which crosses page boundary. "Better safe |
221 | # than sorry" principle makes me treat it specially. But I don't | 202 | ; than sorry" principle makes me treat it specially. But I don't |
222 | # look for particular offending word, but rather for 64-byte input | 203 | ; look for particular offending word, but rather for 64-byte input |
223 | # block which crosses the boundary. Once found that block is aligned | 204 | ; block which crosses the boundary. Once found that block is aligned |
224 | # and hashed separately... | 205 | ; and hashed separately... |
225 | $code.=<<___; | ||
226 | .align 4 | 206 | .align 4 |
227 | Lunaligned: | 207 | Lunaligned: |
228 | subfic $t1,$inp,4096 | 208 | subfic $t1,$inp,4096 |
@@ -237,7 +217,7 @@ Lunaligned: | |||
237 | Lcross_page: | 217 | Lcross_page: |
238 | li $t1,16 | 218 | li $t1,16 |
239 | mtctr $t1 | 219 | mtctr $t1 |
240 | addi r20,$sp,$FRAME ; spot below the frame | 220 | addi r20,$sp,$LOCALS ; spot within the frame |
241 | Lmemcpy: | 221 | Lmemcpy: |
242 | lbz r16,0($inp) | 222 | lbz r16,0($inp) |
243 | lbz r17,1($inp) | 223 | lbz r17,1($inp) |
@@ -251,15 +231,40 @@ Lmemcpy: | |||
251 | addi r20,r20,4 | 231 | addi r20,r20,4 |
252 | bdnz Lmemcpy | 232 | bdnz Lmemcpy |
253 | 233 | ||
254 | $PUSH $inp,`$FRAME-$SIZE_T*19`($sp) | 234 | $PUSH $inp,`$FRAME-$SIZE_T*18`($sp) |
255 | li $t1,1 | 235 | li $t1,1 |
256 | addi $inp,$sp,$FRAME | 236 | addi $inp,$sp,$LOCALS |
257 | mtctr $t1 | 237 | mtctr $t1 |
258 | bl Lsha1_block_private | 238 | bl Lsha1_block_private |
259 | $POP $inp,`$FRAME-$SIZE_T*19`($sp) | 239 | $POP $inp,`$FRAME-$SIZE_T*18`($sp) |
260 | addic. $num,$num,-1 | 240 | addic. $num,$num,-1 |
261 | bne- Lunaligned | 241 | bne- Lunaligned |
262 | b Ldone | 242 | |
243 | Ldone: | ||
244 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
245 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
246 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
247 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
248 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
249 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
250 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
251 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
252 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
253 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
254 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
255 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
256 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
257 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
258 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
259 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
260 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
261 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
262 | mtlr r0 | ||
263 | addi $sp,$sp,$FRAME | ||
264 | blr | ||
265 | .long 0 | ||
266 | .byte 0,12,4,1,0x80,18,3,0 | ||
267 | .long 0 | ||
263 | ___ | 268 | ___ |
264 | 269 | ||
265 | # This is private block function, which uses tailored calling | 270 | # This is private block function, which uses tailored calling |
@@ -309,6 +314,8 @@ $code.=<<___; | |||
309 | addi $inp,$inp,`16*4` | 314 | addi $inp,$inp,`16*4` |
310 | bdnz- Lsha1_block_private | 315 | bdnz- Lsha1_block_private |
311 | blr | 316 | blr |
317 | .long 0 | ||
318 | .byte 0,12,0x14,0,0,0,0,0 | ||
312 | ___ | 319 | ___ |
313 | $code.=<<___; | 320 | $code.=<<___; |
314 | .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" | 321 | .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" |
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl index 4b17848287..9193dda45e 100644 --- a/src/lib/libcrypto/sha/asm/sha1-s390x.pl +++ b/src/lib/libcrypto/sha/asm/sha1-s390x.pl | |||
@@ -21,9 +21,28 @@ | |||
21 | # instructions to favour dual-issue z10 pipeline. On z10 hardware is | 21 | # instructions to favour dual-issue z10 pipeline. On z10 hardware is |
22 | # "only" ~2.3x faster than software. | 22 | # "only" ~2.3x faster than software. |
23 | 23 | ||
24 | # November 2010. | ||
25 | # | ||
26 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
27 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
28 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
29 | # application context. The feature is not specific to any particular | ||
30 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
31 | # remains z/Architecture specific. | ||
32 | |||
24 | $kimdfunc=1; # magic function code for kimd instruction | 33 | $kimdfunc=1; # magic function code for kimd instruction |
25 | 34 | ||
26 | $output=shift; | 35 | $flavour = shift; |
36 | |||
37 | if ($flavour =~ /3[12]/) { | ||
38 | $SIZE_T=4; | ||
39 | $g=""; | ||
40 | } else { | ||
41 | $SIZE_T=8; | ||
42 | $g="g"; | ||
43 | } | ||
44 | |||
45 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
27 | open STDOUT,">$output"; | 46 | open STDOUT,">$output"; |
28 | 47 | ||
29 | $K_00_39="%r0"; $K=$K_00_39; | 48 | $K_00_39="%r0"; $K=$K_00_39; |
@@ -42,13 +61,14 @@ $t1="%r11"; | |||
42 | @X=("%r12","%r13","%r14"); | 61 | @X=("%r12","%r13","%r14"); |
43 | $sp="%r15"; | 62 | $sp="%r15"; |
44 | 63 | ||
45 | $frame=160+16*4; | 64 | $stdframe=16*$SIZE_T+4*8; |
65 | $frame=$stdframe+16*4; | ||
46 | 66 | ||
47 | sub Xupdate { | 67 | sub Xupdate { |
48 | my $i=shift; | 68 | my $i=shift; |
49 | 69 | ||
50 | $code.=<<___ if ($i==15); | 70 | $code.=<<___ if ($i==15); |
51 | lg $prefetch,160($sp) ### Xupdate(16) warm-up | 71 | lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up |
52 | lr $X[0],$X[2] | 72 | lr $X[0],$X[2] |
53 | ___ | 73 | ___ |
54 | return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle | 74 | return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle |
@@ -58,8 +78,8 @@ $code.=<<___ if ($i<16); | |||
58 | ___ | 78 | ___ |
59 | $code.=<<___ if ($i>=16); | 79 | $code.=<<___ if ($i>=16); |
60 | xgr $X[0],$prefetch ### Xupdate($i) | 80 | xgr $X[0],$prefetch ### Xupdate($i) |
61 | lg $prefetch,`160+4*(($i+2)%16)`($sp) | 81 | lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp) |
62 | xg $X[0],`160+4*(($i+8)%16)`($sp) | 82 | xg $X[0],`$stdframe+4*(($i+8)%16)`($sp) |
63 | xgr $X[0],$prefetch | 83 | xgr $X[0],$prefetch |
64 | rll $X[0],$X[0],1 | 84 | rll $X[0],$X[0],1 |
65 | rllg $X[1],$X[0],32 | 85 | rllg $X[1],$X[0],32 |
@@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16); | |||
68 | lr $X[2],$X[1] # feedback | 88 | lr $X[2],$X[1] # feedback |
69 | ___ | 89 | ___ |
70 | $code.=<<___ if ($i<=70); | 90 | $code.=<<___ if ($i<=70); |
71 | stg $X[0],`160+4*($i%16)`($sp) | 91 | stg $X[0],`$stdframe+4*($i%16)`($sp) |
72 | ___ | 92 | ___ |
73 | unshift(@X,pop(@X)); | 93 | unshift(@X,pop(@X)); |
74 | } | 94 | } |
@@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc); | |||
148 | tmhl %r0,0x4000 # check for message-security assist | 168 | tmhl %r0,0x4000 # check for message-security assist |
149 | jz .Lsoftware | 169 | jz .Lsoftware |
150 | lghi %r0,0 | 170 | lghi %r0,0 |
151 | la %r1,16($sp) | 171 | la %r1,`2*$SIZE_T`($sp) |
152 | .long 0xb93e0002 # kimd %r0,%r2 | 172 | .long 0xb93e0002 # kimd %r0,%r2 |
153 | lg %r0,16($sp) | 173 | lg %r0,`2*$SIZE_T`($sp) |
154 | tmhh %r0,`0x8000>>$kimdfunc` | 174 | tmhh %r0,`0x8000>>$kimdfunc` |
155 | jz .Lsoftware | 175 | jz .Lsoftware |
156 | lghi %r0,$kimdfunc | 176 | lghi %r0,$kimdfunc |
@@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc); | |||
165 | ___ | 185 | ___ |
166 | $code.=<<___; | 186 | $code.=<<___; |
167 | lghi %r1,-$frame | 187 | lghi %r1,-$frame |
168 | stg $ctx,16($sp) | 188 | st${g} $ctx,`2*$SIZE_T`($sp) |
169 | stmg %r6,%r15,48($sp) | 189 | stm${g} %r6,%r15,`6*$SIZE_T`($sp) |
170 | lgr %r0,$sp | 190 | lgr %r0,$sp |
171 | la $sp,0(%r1,$sp) | 191 | la $sp,0(%r1,$sp) |
172 | stg %r0,0($sp) | 192 | st${g} %r0,0($sp) |
173 | 193 | ||
174 | larl $t0,Ktable | 194 | larl $t0,Ktable |
175 | llgf $A,0($ctx) | 195 | llgf $A,0($ctx) |
@@ -199,7 +219,7 @@ ___ | |||
199 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | 219 | for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
200 | $code.=<<___; | 220 | $code.=<<___; |
201 | 221 | ||
202 | lg $ctx,`$frame+16`($sp) | 222 | l${g} $ctx,`$frame+2*$SIZE_T`($sp) |
203 | la $inp,64($inp) | 223 | la $inp,64($inp) |
204 | al $A,0($ctx) | 224 | al $A,0($ctx) |
205 | al $B,4($ctx) | 225 | al $B,4($ctx) |
@@ -211,13 +231,13 @@ $code.=<<___; | |||
211 | st $C,8($ctx) | 231 | st $C,8($ctx) |
212 | st $D,12($ctx) | 232 | st $D,12($ctx) |
213 | st $E,16($ctx) | 233 | st $E,16($ctx) |
214 | brct $len,.Lloop | 234 | brct${g} $len,.Lloop |
215 | 235 | ||
216 | lmg %r6,%r15,`$frame+48`($sp) | 236 | lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) |
217 | br %r14 | 237 | br %r14 |
218 | .size sha1_block_data_order,.-sha1_block_data_order | 238 | .size sha1_block_data_order,.-sha1_block_data_order |
219 | .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 239 | .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
220 | .comm OPENSSL_s390xcap_P,8,8 | 240 | .comm OPENSSL_s390xcap_P,16,8 |
221 | ___ | 241 | ___ |
222 | 242 | ||
223 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 243 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl index 4edc5ea9ad..f27c1e3fb0 100755 --- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl +++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl | |||
@@ -16,7 +16,7 @@ | |||
16 | # There was suggestion to mechanically translate 32-bit code, but I | 16 | # There was suggestion to mechanically translate 32-bit code, but I |
17 | # dismissed it, reasoning that x86_64 offers enough register bank | 17 | # dismissed it, reasoning that x86_64 offers enough register bank |
18 | # capacity to fully utilize SHA-1 parallelism. Therefore this fresh | 18 | # capacity to fully utilize SHA-1 parallelism. Therefore this fresh |
19 | # implementation:-) However! While 64-bit code does performs better | 19 | # implementation:-) However! While 64-bit code does perform better |
20 | # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, | 20 | # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, |
21 | # x86_64 does offer larger *addressable* bank, but out-of-order core | 21 | # x86_64 does offer larger *addressable* bank, but out-of-order core |
22 | # reaches for even more registers through dynamic aliasing, and EM64T | 22 | # reaches for even more registers through dynamic aliasing, and EM64T |
@@ -29,6 +29,38 @@ | |||
29 | # Xeon P4 +65% +0% 9.9 | 29 | # Xeon P4 +65% +0% 9.9 |
30 | # Core2 +60% +10% 7.0 | 30 | # Core2 +60% +10% 7.0 |
31 | 31 | ||
32 | # August 2009. | ||
33 | # | ||
34 | # The code was revised to minimize code size and to maximize | ||
35 | # "distance" between instructions producing input to 'lea' | ||
36 | # instruction and the 'lea' instruction itself, which is essential | ||
37 | # for Intel Atom core. | ||
38 | |||
39 | # October 2010. | ||
40 | # | ||
41 | # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it | ||
42 | # is to offload message schedule denoted by Wt in NIST specification, | ||
43 | # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module | ||
44 | # for background and implementation details. The only difference from | ||
45 | # 32-bit code is that 64-bit code doesn't have to spill @X[] elements | ||
46 | # to free temporary registers. | ||
47 | |||
48 | # April 2011. | ||
49 | # | ||
50 | # Add AVX code path. See sha1-586.pl for further information. | ||
51 | |||
52 | ###################################################################### | ||
53 | # Current performance is summarized in following table. Numbers are | ||
54 | # CPU clock cycles spent to process single byte (less is better). | ||
55 | # | ||
56 | # x86_64 SSSE3 AVX | ||
57 | # P4 9.8 - | ||
58 | # Opteron 6.6 - | ||
59 | # Core2 6.7 6.1/+10% - | ||
60 | # Atom 11.0 9.7/+13% - | ||
61 | # Westmere 7.1 5.6/+27% - | ||
62 | # Sandy Bridge 7.9 6.3/+25% 5.2/+51% | ||
63 | |||
32 | $flavour = shift; | 64 | $flavour = shift; |
33 | $output = shift; | 65 | $output = shift; |
34 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | 66 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
@@ -40,6 +72,16 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |||
40 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | 72 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
41 | die "can't locate x86_64-xlate.pl"; | 73 | die "can't locate x86_64-xlate.pl"; |
42 | 74 | ||
75 | $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | ||
76 | =~ /GNU assembler version ([2-9]\.[0-9]+)/ && | ||
77 | $1>=2.19); | ||
78 | $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | ||
79 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && | ||
80 | $1>=2.09); | ||
81 | $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | ||
82 | `ml64 2>&1` =~ /Version ([0-9]+)\./ && | ||
83 | $1>=10); | ||
84 | |||
43 | open STDOUT,"| $^X $xlate $flavour $output"; | 85 | open STDOUT,"| $^X $xlate $flavour $output"; |
44 | 86 | ||
45 | $ctx="%rdi"; # 1st arg | 87 | $ctx="%rdi"; # 1st arg |
@@ -51,196 +93,994 @@ $ctx="%r8"; | |||
51 | $inp="%r9"; | 93 | $inp="%r9"; |
52 | $num="%r10"; | 94 | $num="%r10"; |
53 | 95 | ||
54 | $xi="%eax"; | 96 | $t0="%eax"; |
55 | $t0="%ebx"; | 97 | $t1="%ebx"; |
56 | $t1="%ecx"; | 98 | $t2="%ecx"; |
57 | $A="%edx"; | 99 | @xi=("%edx","%ebp"); |
58 | $B="%esi"; | 100 | $A="%esi"; |
59 | $C="%edi"; | 101 | $B="%edi"; |
60 | $D="%ebp"; | 102 | $C="%r11d"; |
61 | $E="%r11d"; | 103 | $D="%r12d"; |
62 | $T="%r12d"; | 104 | $E="%r13d"; |
63 | |||
64 | @V=($A,$B,$C,$D,$E,$T); | ||
65 | 105 | ||
66 | sub PROLOGUE { | 106 | @V=($A,$B,$C,$D,$E); |
67 | my $func=shift; | ||
68 | $code.=<<___; | ||
69 | .globl $func | ||
70 | .type $func,\@function,3 | ||
71 | .align 16 | ||
72 | $func: | ||
73 | push %rbx | ||
74 | push %rbp | ||
75 | push %r12 | ||
76 | mov %rsp,%r11 | ||
77 | mov %rdi,$ctx # reassigned argument | ||
78 | sub \$`8+16*4`,%rsp | ||
79 | mov %rsi,$inp # reassigned argument | ||
80 | and \$-64,%rsp | ||
81 | mov %rdx,$num # reassigned argument | ||
82 | mov %r11,`16*4`(%rsp) | ||
83 | .Lprologue: | ||
84 | |||
85 | mov 0($ctx),$A | ||
86 | mov 4($ctx),$B | ||
87 | mov 8($ctx),$C | ||
88 | mov 12($ctx),$D | ||
89 | mov 16($ctx),$E | ||
90 | ___ | ||
91 | } | ||
92 | |||
93 | sub EPILOGUE { | ||
94 | my $func=shift; | ||
95 | $code.=<<___; | ||
96 | mov `16*4`(%rsp),%rsi | ||
97 | mov (%rsi),%r12 | ||
98 | mov 8(%rsi),%rbp | ||
99 | mov 16(%rsi),%rbx | ||
100 | lea 24(%rsi),%rsp | ||
101 | .Lepilogue: | ||
102 | ret | ||
103 | .size $func,.-$func | ||
104 | ___ | ||
105 | } | ||
106 | 107 | ||
107 | sub BODY_00_19 { | 108 | sub BODY_00_19 { |
108 | my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; | 109 | my ($i,$a,$b,$c,$d,$e)=@_; |
109 | my $j=$i+1; | 110 | my $j=$i+1; |
110 | $code.=<<___ if ($i==0); | 111 | $code.=<<___ if ($i==0); |
111 | mov `4*$i`($inp),$xi | 112 | mov `4*$i`($inp),$xi[0] |
112 | `"bswap $xi" if(!defined($host))` | 113 | bswap $xi[0] |
113 | mov $xi,`4*$i`(%rsp) | 114 | mov $xi[0],`4*$i`(%rsp) |
114 | ___ | 115 | ___ |
115 | $code.=<<___ if ($i<15); | 116 | $code.=<<___ if ($i<15); |
116 | lea 0x5a827999($xi,$e),$f | ||
117 | mov $c,$t0 | 117 | mov $c,$t0 |
118 | mov `4*$j`($inp),$xi | 118 | mov `4*$j`($inp),$xi[1] |
119 | mov $a,$e | 119 | mov $a,$t2 |
120 | xor $d,$t0 | 120 | xor $d,$t0 |
121 | `"bswap $xi" if(!defined($host))` | 121 | bswap $xi[1] |
122 | rol \$5,$e | 122 | rol \$5,$t2 |
123 | lea 0x5a827999($xi[0],$e),$e | ||
123 | and $b,$t0 | 124 | and $b,$t0 |
124 | mov $xi,`4*$j`(%rsp) | 125 | mov $xi[1],`4*$j`(%rsp) |
125 | add $e,$f | 126 | add $t2,$e |
126 | xor $d,$t0 | 127 | xor $d,$t0 |
127 | rol \$30,$b | 128 | rol \$30,$b |
128 | add $t0,$f | 129 | add $t0,$e |
129 | ___ | 130 | ___ |
130 | $code.=<<___ if ($i>=15); | 131 | $code.=<<___ if ($i>=15); |
131 | lea 0x5a827999($xi,$e),$f | 132 | mov `4*($j%16)`(%rsp),$xi[1] |
132 | mov `4*($j%16)`(%rsp),$xi | ||
133 | mov $c,$t0 | 133 | mov $c,$t0 |
134 | mov $a,$e | 134 | mov $a,$t2 |
135 | xor `4*(($j+2)%16)`(%rsp),$xi | 135 | xor `4*(($j+2)%16)`(%rsp),$xi[1] |
136 | xor $d,$t0 | 136 | xor $d,$t0 |
137 | rol \$5,$e | 137 | rol \$5,$t2 |
138 | xor `4*(($j+8)%16)`(%rsp),$xi | 138 | xor `4*(($j+8)%16)`(%rsp),$xi[1] |
139 | and $b,$t0 | 139 | and $b,$t0 |
140 | add $e,$f | 140 | lea 0x5a827999($xi[0],$e),$e |
141 | xor `4*(($j+13)%16)`(%rsp),$xi | 141 | xor `4*(($j+13)%16)`(%rsp),$xi[1] |
142 | xor $d,$t0 | 142 | xor $d,$t0 |
143 | rol \$1,$xi[1] | ||
144 | add $t2,$e | ||
143 | rol \$30,$b | 145 | rol \$30,$b |
144 | add $t0,$f | 146 | mov $xi[1],`4*($j%16)`(%rsp) |
145 | rol \$1,$xi | 147 | add $t0,$e |
146 | mov $xi,`4*($j%16)`(%rsp) | ||
147 | ___ | 148 | ___ |
149 | unshift(@xi,pop(@xi)); | ||
148 | } | 150 | } |
149 | 151 | ||
150 | sub BODY_20_39 { | 152 | sub BODY_20_39 { |
151 | my ($i,$a,$b,$c,$d,$e,$f)=@_; | 153 | my ($i,$a,$b,$c,$d,$e)=@_; |
152 | my $j=$i+1; | 154 | my $j=$i+1; |
153 | my $K=($i<40)?0x6ed9eba1:0xca62c1d6; | 155 | my $K=($i<40)?0x6ed9eba1:0xca62c1d6; |
154 | $code.=<<___ if ($i<79); | 156 | $code.=<<___ if ($i<79); |
155 | lea $K($xi,$e),$f | 157 | mov `4*($j%16)`(%rsp),$xi[1] |
156 | mov `4*($j%16)`(%rsp),$xi | ||
157 | mov $c,$t0 | 158 | mov $c,$t0 |
158 | mov $a,$e | 159 | mov $a,$t2 |
159 | xor `4*(($j+2)%16)`(%rsp),$xi | 160 | xor `4*(($j+2)%16)`(%rsp),$xi[1] |
160 | xor $b,$t0 | 161 | xor $b,$t0 |
161 | rol \$5,$e | 162 | rol \$5,$t2 |
162 | xor `4*(($j+8)%16)`(%rsp),$xi | 163 | lea $K($xi[0],$e),$e |
164 | xor `4*(($j+8)%16)`(%rsp),$xi[1] | ||
163 | xor $d,$t0 | 165 | xor $d,$t0 |
164 | add $e,$f | 166 | add $t2,$e |
165 | xor `4*(($j+13)%16)`(%rsp),$xi | 167 | xor `4*(($j+13)%16)`(%rsp),$xi[1] |
166 | rol \$30,$b | 168 | rol \$30,$b |
167 | add $t0,$f | 169 | add $t0,$e |
168 | rol \$1,$xi | 170 | rol \$1,$xi[1] |
169 | ___ | 171 | ___ |
170 | $code.=<<___ if ($i<76); | 172 | $code.=<<___ if ($i<76); |
171 | mov $xi,`4*($j%16)`(%rsp) | 173 | mov $xi[1],`4*($j%16)`(%rsp) |
172 | ___ | 174 | ___ |
173 | $code.=<<___ if ($i==79); | 175 | $code.=<<___ if ($i==79); |
174 | lea $K($xi,$e),$f | ||
175 | mov $c,$t0 | 176 | mov $c,$t0 |
176 | mov $a,$e | 177 | mov $a,$t2 |
177 | xor $b,$t0 | 178 | xor $b,$t0 |
178 | rol \$5,$e | 179 | lea $K($xi[0],$e),$e |
180 | rol \$5,$t2 | ||
179 | xor $d,$t0 | 181 | xor $d,$t0 |
180 | add $e,$f | 182 | add $t2,$e |
181 | rol \$30,$b | 183 | rol \$30,$b |
182 | add $t0,$f | 184 | add $t0,$e |
183 | ___ | 185 | ___ |
186 | unshift(@xi,pop(@xi)); | ||
184 | } | 187 | } |
185 | 188 | ||
186 | sub BODY_40_59 { | 189 | sub BODY_40_59 { |
187 | my ($i,$a,$b,$c,$d,$e,$f)=@_; | 190 | my ($i,$a,$b,$c,$d,$e)=@_; |
188 | my $j=$i+1; | 191 | my $j=$i+1; |
189 | $code.=<<___; | 192 | $code.=<<___; |
190 | lea 0x8f1bbcdc($xi,$e),$f | 193 | mov `4*($j%16)`(%rsp),$xi[1] |
191 | mov `4*($j%16)`(%rsp),$xi | 194 | mov $c,$t0 |
192 | mov $b,$t0 | 195 | mov $c,$t1 |
193 | mov $b,$t1 | 196 | xor `4*(($j+2)%16)`(%rsp),$xi[1] |
194 | xor `4*(($j+2)%16)`(%rsp),$xi | 197 | and $d,$t0 |
195 | mov $a,$e | 198 | mov $a,$t2 |
196 | and $c,$t0 | 199 | xor `4*(($j+8)%16)`(%rsp),$xi[1] |
197 | xor `4*(($j+8)%16)`(%rsp),$xi | 200 | xor $d,$t1 |
198 | or $c,$t1 | 201 | lea 0x8f1bbcdc($xi[0],$e),$e |
199 | rol \$5,$e | 202 | rol \$5,$t2 |
200 | xor `4*(($j+13)%16)`(%rsp),$xi | 203 | xor `4*(($j+13)%16)`(%rsp),$xi[1] |
201 | and $d,$t1 | 204 | add $t0,$e |
202 | add $e,$f | 205 | and $b,$t1 |
203 | rol \$1,$xi | 206 | rol \$1,$xi[1] |
204 | or $t1,$t0 | 207 | add $t1,$e |
205 | rol \$30,$b | 208 | rol \$30,$b |
206 | mov $xi,`4*($j%16)`(%rsp) | 209 | mov $xi[1],`4*($j%16)`(%rsp) |
207 | add $t0,$f | 210 | add $t2,$e |
208 | ___ | 211 | ___ |
212 | unshift(@xi,pop(@xi)); | ||
209 | } | 213 | } |
210 | 214 | ||
211 | $code=".text\n"; | 215 | $code.=<<___; |
216 | .text | ||
217 | .extern OPENSSL_ia32cap_P | ||
212 | 218 | ||
213 | &PROLOGUE("sha1_block_data_order"); | 219 | .globl sha1_block_data_order |
214 | $code.=".align 4\n.Lloop:\n"; | 220 | .type sha1_block_data_order,\@function,3 |
221 | .align 16 | ||
222 | sha1_block_data_order: | ||
223 | mov OPENSSL_ia32cap_P+0(%rip),%r9d | ||
224 | mov OPENSSL_ia32cap_P+4(%rip),%r8d | ||
225 | test \$`1<<9`,%r8d # check SSSE3 bit | ||
226 | jz .Lialu | ||
227 | ___ | ||
228 | $code.=<<___ if ($avx); | ||
229 | and \$`1<<28`,%r8d # mask AVX bit | ||
230 | and \$`1<<30`,%r9d # mask "Intel CPU" bit | ||
231 | or %r9d,%r8d | ||
232 | cmp \$`1<<28|1<<30`,%r8d | ||
233 | je _avx_shortcut | ||
234 | ___ | ||
235 | $code.=<<___; | ||
236 | jmp _ssse3_shortcut | ||
237 | |||
238 | .align 16 | ||
239 | .Lialu: | ||
240 | push %rbx | ||
241 | push %rbp | ||
242 | push %r12 | ||
243 | push %r13 | ||
244 | mov %rsp,%r11 | ||
245 | mov %rdi,$ctx # reassigned argument | ||
246 | sub \$`8+16*4`,%rsp | ||
247 | mov %rsi,$inp # reassigned argument | ||
248 | and \$-64,%rsp | ||
249 | mov %rdx,$num # reassigned argument | ||
250 | mov %r11,`16*4`(%rsp) | ||
251 | .Lprologue: | ||
252 | |||
253 | mov 0($ctx),$A | ||
254 | mov 4($ctx),$B | ||
255 | mov 8($ctx),$C | ||
256 | mov 12($ctx),$D | ||
257 | mov 16($ctx),$E | ||
258 | jmp .Lloop | ||
259 | |||
260 | .align 16 | ||
261 | .Lloop: | ||
262 | ___ | ||
215 | for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | 263 | for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } |
216 | for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | 264 | for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
217 | for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | 265 | for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } |
218 | for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | 266 | for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
219 | $code.=<<___; | 267 | $code.=<<___; |
220 | add 0($ctx),$E | 268 | add 0($ctx),$A |
221 | add 4($ctx),$T | 269 | add 4($ctx),$B |
222 | add 8($ctx),$A | 270 | add 8($ctx),$C |
223 | add 12($ctx),$B | 271 | add 12($ctx),$D |
224 | add 16($ctx),$C | 272 | add 16($ctx),$E |
225 | mov $E,0($ctx) | 273 | mov $A,0($ctx) |
226 | mov $T,4($ctx) | 274 | mov $B,4($ctx) |
227 | mov $A,8($ctx) | 275 | mov $C,8($ctx) |
228 | mov $B,12($ctx) | 276 | mov $D,12($ctx) |
229 | mov $C,16($ctx) | 277 | mov $E,16($ctx) |
230 | 278 | ||
231 | xchg $E,$A # mov $E,$A | ||
232 | xchg $T,$B # mov $T,$B | ||
233 | xchg $E,$C # mov $A,$C | ||
234 | xchg $T,$D # mov $B,$D | ||
235 | # mov $C,$E | ||
236 | lea `16*4`($inp),$inp | ||
237 | sub \$1,$num | 279 | sub \$1,$num |
280 | lea `16*4`($inp),$inp | ||
238 | jnz .Lloop | 281 | jnz .Lloop |
282 | |||
283 | mov `16*4`(%rsp),%rsi | ||
284 | mov (%rsi),%r13 | ||
285 | mov 8(%rsi),%r12 | ||
286 | mov 16(%rsi),%rbp | ||
287 | mov 24(%rsi),%rbx | ||
288 | lea 32(%rsi),%rsp | ||
289 | .Lepilogue: | ||
290 | ret | ||
291 | .size sha1_block_data_order,.-sha1_block_data_order | ||
239 | ___ | 292 | ___ |
240 | &EPILOGUE("sha1_block_data_order"); | 293 | {{{ |
294 | my $Xi=4; | ||
295 | my @X=map("%xmm$_",(4..7,0..3)); | ||
296 | my @Tx=map("%xmm$_",(8..10)); | ||
297 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
298 | my @T=("%esi","%edi"); | ||
299 | my $j=0; | ||
300 | my $K_XX_XX="%r11"; | ||
301 | |||
302 | my $_rol=sub { &rol(@_) }; | ||
303 | my $_ror=sub { &ror(@_) }; | ||
304 | |||
305 | $code.=<<___; | ||
306 | .type sha1_block_data_order_ssse3,\@function,3 | ||
307 | .align 16 | ||
308 | sha1_block_data_order_ssse3: | ||
309 | _ssse3_shortcut: | ||
310 | push %rbx | ||
311 | push %rbp | ||
312 | push %r12 | ||
313 | lea `-64-($win64?5*16:0)`(%rsp),%rsp | ||
314 | ___ | ||
315 | $code.=<<___ if ($win64); | ||
316 | movaps %xmm6,64+0(%rsp) | ||
317 | movaps %xmm7,64+16(%rsp) | ||
318 | movaps %xmm8,64+32(%rsp) | ||
319 | movaps %xmm9,64+48(%rsp) | ||
320 | movaps %xmm10,64+64(%rsp) | ||
321 | .Lprologue_ssse3: | ||
322 | ___ | ||
323 | $code.=<<___; | ||
324 | mov %rdi,$ctx # reassigned argument | ||
325 | mov %rsi,$inp # reassigned argument | ||
326 | mov %rdx,$num # reassigned argument | ||
327 | |||
328 | shl \$6,$num | ||
329 | add $inp,$num | ||
330 | lea K_XX_XX(%rip),$K_XX_XX | ||
331 | |||
332 | mov 0($ctx),$A # load context | ||
333 | mov 4($ctx),$B | ||
334 | mov 8($ctx),$C | ||
335 | mov 12($ctx),$D | ||
336 | mov $B,@T[0] # magic seed | ||
337 | mov 16($ctx),$E | ||
338 | |||
339 | movdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
340 | movdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
341 | movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
342 | movdqu 16($inp),@X[-3&7] | ||
343 | movdqu 32($inp),@X[-2&7] | ||
344 | movdqu 48($inp),@X[-1&7] | ||
345 | pshufb @X[2],@X[-4&7] # byte swap | ||
346 | add \$64,$inp | ||
347 | pshufb @X[2],@X[-3&7] | ||
348 | pshufb @X[2],@X[-2&7] | ||
349 | pshufb @X[2],@X[-1&7] | ||
350 | paddd @Tx[1],@X[-4&7] # add K_00_19 | ||
351 | paddd @Tx[1],@X[-3&7] | ||
352 | paddd @Tx[1],@X[-2&7] | ||
353 | movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU | ||
354 | psubd @Tx[1],@X[-4&7] # restore X[] | ||
355 | movdqa @X[-3&7],16(%rsp) | ||
356 | psubd @Tx[1],@X[-3&7] | ||
357 | movdqa @X[-2&7],32(%rsp) | ||
358 | psubd @Tx[1],@X[-2&7] | ||
359 | jmp .Loop_ssse3 | ||
360 | ___ | ||
361 | |||
362 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm | ||
363 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; | ||
364 | my $arg = pop; | ||
365 | $arg = "\$$arg" if ($arg*1 eq $arg); | ||
366 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; | ||
367 | } | ||
368 | |||
369 | sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 | ||
370 | { use integer; | ||
371 | my $body = shift; | ||
372 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
373 | my ($a,$b,$c,$d,$e); | ||
374 | |||
375 | &movdqa (@X[0],@X[-3&7]); | ||
376 | eval(shift(@insns)); | ||
377 | eval(shift(@insns)); | ||
378 | &movdqa (@Tx[0],@X[-1&7]); | ||
379 | &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
380 | eval(shift(@insns)); | ||
381 | eval(shift(@insns)); | ||
382 | |||
383 | &paddd (@Tx[1],@X[-1&7]); | ||
384 | eval(shift(@insns)); | ||
385 | eval(shift(@insns)); | ||
386 | &psrldq (@Tx[0],4); # "X[-3]", 3 dwords | ||
387 | eval(shift(@insns)); | ||
388 | eval(shift(@insns)); | ||
389 | &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
390 | eval(shift(@insns)); | ||
391 | eval(shift(@insns)); | ||
392 | |||
393 | &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
394 | eval(shift(@insns)); | ||
395 | eval(shift(@insns)); | ||
396 | eval(shift(@insns)); | ||
397 | eval(shift(@insns)); | ||
398 | |||
399 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
400 | eval(shift(@insns)); | ||
401 | eval(shift(@insns)); | ||
402 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
403 | eval(shift(@insns)); | ||
404 | eval(shift(@insns)); | ||
405 | |||
406 | &movdqa (@Tx[2],@X[0]); | ||
407 | &movdqa (@Tx[0],@X[0]); | ||
408 | eval(shift(@insns)); | ||
409 | eval(shift(@insns)); | ||
410 | eval(shift(@insns)); | ||
411 | eval(shift(@insns)); | ||
412 | |||
413 | &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword | ||
414 | &paddd (@X[0],@X[0]); | ||
415 | eval(shift(@insns)); | ||
416 | eval(shift(@insns)); | ||
417 | eval(shift(@insns)); | ||
418 | eval(shift(@insns)); | ||
419 | |||
420 | &psrld (@Tx[0],31); | ||
421 | eval(shift(@insns)); | ||
422 | eval(shift(@insns)); | ||
423 | &movdqa (@Tx[1],@Tx[2]); | ||
424 | eval(shift(@insns)); | ||
425 | eval(shift(@insns)); | ||
426 | |||
427 | &psrld (@Tx[2],30); | ||
428 | &por (@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
429 | eval(shift(@insns)); | ||
430 | eval(shift(@insns)); | ||
431 | eval(shift(@insns)); | ||
432 | eval(shift(@insns)); | ||
433 | |||
434 | &pslld (@Tx[1],2); | ||
435 | &pxor (@X[0],@Tx[2]); | ||
436 | eval(shift(@insns)); | ||
437 | eval(shift(@insns)); | ||
438 | &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
439 | eval(shift(@insns)); | ||
440 | eval(shift(@insns)); | ||
441 | |||
442 | &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 | ||
443 | |||
444 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
445 | |||
446 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
447 | push(@Tx,shift(@Tx)); | ||
448 | } | ||
449 | |||
450 | sub Xupdate_ssse3_32_79() | ||
451 | { use integer; | ||
452 | my $body = shift; | ||
453 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
454 | my ($a,$b,$c,$d,$e); | ||
455 | |||
456 | &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); | ||
457 | eval(shift(@insns)); # body_20_39 | ||
458 | &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
459 | &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" | ||
460 | eval(shift(@insns)); | ||
461 | eval(shift(@insns)); | ||
462 | eval(shift(@insns)); # rol | ||
463 | |||
464 | &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
465 | eval(shift(@insns)); | ||
466 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
467 | if ($Xi%5) { | ||
468 | &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
469 | } else { # ... or load next one | ||
470 | &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
471 | } | ||
472 | &paddd (@Tx[1],@X[-1&7]); | ||
473 | eval(shift(@insns)); # ror | ||
474 | eval(shift(@insns)); | ||
475 | |||
476 | &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
477 | eval(shift(@insns)); # body_20_39 | ||
478 | eval(shift(@insns)); | ||
479 | eval(shift(@insns)); | ||
480 | eval(shift(@insns)); # rol | ||
481 | |||
482 | &movdqa (@Tx[0],@X[0]); | ||
483 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
484 | eval(shift(@insns)); | ||
485 | eval(shift(@insns)); | ||
486 | eval(shift(@insns)); # ror | ||
487 | eval(shift(@insns)); | ||
488 | |||
489 | &pslld (@X[0],2); | ||
490 | eval(shift(@insns)); # body_20_39 | ||
491 | eval(shift(@insns)); | ||
492 | &psrld (@Tx[0],30); | ||
493 | eval(shift(@insns)); | ||
494 | eval(shift(@insns)); # rol | ||
495 | eval(shift(@insns)); | ||
496 | eval(shift(@insns)); | ||
497 | eval(shift(@insns)); # ror | ||
498 | eval(shift(@insns)); | ||
499 | |||
500 | &por (@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
501 | eval(shift(@insns)); # body_20_39 | ||
502 | eval(shift(@insns)); | ||
503 | &movdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
504 | eval(shift(@insns)); | ||
505 | eval(shift(@insns)); # rol | ||
506 | eval(shift(@insns)); | ||
507 | eval(shift(@insns)); | ||
508 | eval(shift(@insns)); # rol | ||
509 | eval(shift(@insns)); | ||
510 | |||
511 | foreach (@insns) { eval; } # remaining instructions | ||
512 | |||
513 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
514 | push(@Tx,shift(@Tx)); | ||
515 | } | ||
516 | |||
517 | sub Xuplast_ssse3_80() | ||
518 | { use integer; | ||
519 | my $body = shift; | ||
520 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
521 | my ($a,$b,$c,$d,$e); | ||
522 | |||
523 | eval(shift(@insns)); | ||
524 | &paddd (@Tx[1],@X[-1&7]); | ||
525 | eval(shift(@insns)); | ||
526 | eval(shift(@insns)); | ||
527 | eval(shift(@insns)); | ||
528 | eval(shift(@insns)); | ||
529 | |||
530 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
531 | |||
532 | foreach (@insns) { eval; } # remaining instructions | ||
533 | |||
534 | &cmp ($inp,$num); | ||
535 | &je (".Ldone_ssse3"); | ||
536 | |||
537 | unshift(@Tx,pop(@Tx)); | ||
538 | |||
539 | &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask | ||
540 | &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
541 | &movdqu (@X[-4&7],"0($inp)"); # load input | ||
542 | &movdqu (@X[-3&7],"16($inp)"); | ||
543 | &movdqu (@X[-2&7],"32($inp)"); | ||
544 | &movdqu (@X[-1&7],"48($inp)"); | ||
545 | &pshufb (@X[-4&7],@X[2]); # byte swap | ||
546 | &add ($inp,64); | ||
547 | |||
548 | $Xi=0; | ||
549 | } | ||
550 | |||
551 | sub Xloop_ssse3() | ||
552 | { use integer; | ||
553 | my $body = shift; | ||
554 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
555 | my ($a,$b,$c,$d,$e); | ||
556 | |||
557 | eval(shift(@insns)); | ||
558 | eval(shift(@insns)); | ||
559 | &pshufb (@X[($Xi-3)&7],@X[2]); | ||
560 | eval(shift(@insns)); | ||
561 | eval(shift(@insns)); | ||
562 | &paddd (@X[($Xi-4)&7],@Tx[1]); | ||
563 | eval(shift(@insns)); | ||
564 | eval(shift(@insns)); | ||
565 | eval(shift(@insns)); | ||
566 | eval(shift(@insns)); | ||
567 | &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU | ||
568 | eval(shift(@insns)); | ||
569 | eval(shift(@insns)); | ||
570 | &psubd (@X[($Xi-4)&7],@Tx[1]); | ||
571 | |||
572 | foreach (@insns) { eval; } | ||
573 | $Xi++; | ||
574 | } | ||
575 | |||
576 | sub Xtail_ssse3() | ||
577 | { use integer; | ||
578 | my $body = shift; | ||
579 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
580 | my ($a,$b,$c,$d,$e); | ||
581 | |||
582 | foreach (@insns) { eval; } | ||
583 | } | ||
584 | |||
585 | sub body_00_19 () { | ||
586 | ( | ||
587 | '($a,$b,$c,$d,$e)=@V;'. | ||
588 | '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer | ||
589 | '&xor ($c,$d);', | ||
590 | '&mov (@T[1],$a);', # $b in next round | ||
591 | '&$_rol ($a,5);', | ||
592 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
593 | '&xor ($c,$d);', # restore $c | ||
594 | '&xor (@T[0],$d);', | ||
595 | '&add ($e,$a);', | ||
596 | '&$_ror ($b,$j?7:2);', # $b>>>2 | ||
597 | '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
598 | ); | ||
599 | } | ||
600 | |||
601 | sub body_20_39 () { | ||
602 | ( | ||
603 | '($a,$b,$c,$d,$e)=@V;'. | ||
604 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
605 | '&xor (@T[0],$d);', # ($b^$d) | ||
606 | '&mov (@T[1],$a);', # $b in next round | ||
607 | '&$_rol ($a,5);', | ||
608 | '&xor (@T[0],$c);', # ($b^$d^$c) | ||
609 | '&add ($e,$a);', | ||
610 | '&$_ror ($b,7);', # $b>>>2 | ||
611 | '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
612 | ); | ||
613 | } | ||
614 | |||
615 | sub body_40_59 () { | ||
616 | ( | ||
617 | '($a,$b,$c,$d,$e)=@V;'. | ||
618 | '&mov (@T[1],$c);', | ||
619 | '&xor ($c,$d);', | ||
620 | '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | ||
621 | '&and (@T[1],$d);', | ||
622 | '&and (@T[0],$c);', # ($b&($c^$d)) | ||
623 | '&$_ror ($b,7);', # $b>>>2 | ||
624 | '&add ($e,@T[1]);', | ||
625 | '&mov (@T[1],$a);', # $b in next round | ||
626 | '&$_rol ($a,5);', | ||
627 | '&add ($e,@T[0]);', | ||
628 | '&xor ($c,$d);', # restore $c | ||
629 | '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | ||
630 | ); | ||
631 | } | ||
241 | $code.=<<___; | 632 | $code.=<<___; |
242 | .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
243 | .align 16 | 633 | .align 16 |
634 | .Loop_ssse3: | ||
635 | ___ | ||
636 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
637 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
638 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
639 | &Xupdate_ssse3_16_31(\&body_00_19); | ||
640 | &Xupdate_ssse3_32_79(\&body_00_19); | ||
641 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
642 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
643 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
644 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
645 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
646 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
647 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
648 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
649 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
650 | &Xupdate_ssse3_32_79(\&body_40_59); | ||
651 | &Xupdate_ssse3_32_79(\&body_20_39); | ||
652 | &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" | ||
653 | |||
654 | $saved_j=$j; @saved_V=@V; | ||
655 | |||
656 | &Xloop_ssse3(\&body_20_39); | ||
657 | &Xloop_ssse3(\&body_20_39); | ||
658 | &Xloop_ssse3(\&body_20_39); | ||
659 | |||
660 | $code.=<<___; | ||
661 | add 0($ctx),$A # update context | ||
662 | add 4($ctx),@T[0] | ||
663 | add 8($ctx),$C | ||
664 | add 12($ctx),$D | ||
665 | mov $A,0($ctx) | ||
666 | add 16($ctx),$E | ||
667 | mov @T[0],4($ctx) | ||
668 | mov @T[0],$B # magic seed | ||
669 | mov $C,8($ctx) | ||
670 | mov $D,12($ctx) | ||
671 | mov $E,16($ctx) | ||
672 | jmp .Loop_ssse3 | ||
673 | |||
674 | .align 16 | ||
675 | .Ldone_ssse3: | ||
676 | ___ | ||
677 | $j=$saved_j; @V=@saved_V; | ||
678 | |||
679 | &Xtail_ssse3(\&body_20_39); | ||
680 | &Xtail_ssse3(\&body_20_39); | ||
681 | &Xtail_ssse3(\&body_20_39); | ||
682 | |||
683 | $code.=<<___; | ||
684 | add 0($ctx),$A # update context | ||
685 | add 4($ctx),@T[0] | ||
686 | add 8($ctx),$C | ||
687 | mov $A,0($ctx) | ||
688 | add 12($ctx),$D | ||
689 | mov @T[0],4($ctx) | ||
690 | add 16($ctx),$E | ||
691 | mov $C,8($ctx) | ||
692 | mov $D,12($ctx) | ||
693 | mov $E,16($ctx) | ||
694 | ___ | ||
695 | $code.=<<___ if ($win64); | ||
696 | movaps 64+0(%rsp),%xmm6 | ||
697 | movaps 64+16(%rsp),%xmm7 | ||
698 | movaps 64+32(%rsp),%xmm8 | ||
699 | movaps 64+48(%rsp),%xmm9 | ||
700 | movaps 64+64(%rsp),%xmm10 | ||
701 | ___ | ||
702 | $code.=<<___; | ||
703 | lea `64+($win64?5*16:0)`(%rsp),%rsi | ||
704 | mov 0(%rsi),%r12 | ||
705 | mov 8(%rsi),%rbp | ||
706 | mov 16(%rsi),%rbx | ||
707 | lea 24(%rsi),%rsp | ||
708 | .Lepilogue_ssse3: | ||
709 | ret | ||
710 | .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 | ||
711 | ___ | ||
712 | |||
713 | if ($avx) { | ||
714 | my $Xi=4; | ||
715 | my @X=map("%xmm$_",(4..7,0..3)); | ||
716 | my @Tx=map("%xmm$_",(8..10)); | ||
717 | my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | ||
718 | my @T=("%esi","%edi"); | ||
719 | my $j=0; | ||
720 | my $K_XX_XX="%r11"; | ||
721 | |||
722 | my $_rol=sub { &shld(@_[0],@_) }; | ||
723 | my $_ror=sub { &shrd(@_[0],@_) }; | ||
724 | |||
725 | $code.=<<___; | ||
726 | .type sha1_block_data_order_avx,\@function,3 | ||
727 | .align 16 | ||
728 | sha1_block_data_order_avx: | ||
729 | _avx_shortcut: | ||
730 | push %rbx | ||
731 | push %rbp | ||
732 | push %r12 | ||
733 | lea `-64-($win64?5*16:0)`(%rsp),%rsp | ||
734 | ___ | ||
735 | $code.=<<___ if ($win64); | ||
736 | movaps %xmm6,64+0(%rsp) | ||
737 | movaps %xmm7,64+16(%rsp) | ||
738 | movaps %xmm8,64+32(%rsp) | ||
739 | movaps %xmm9,64+48(%rsp) | ||
740 | movaps %xmm10,64+64(%rsp) | ||
741 | .Lprologue_avx: | ||
742 | ___ | ||
743 | $code.=<<___; | ||
744 | mov %rdi,$ctx # reassigned argument | ||
745 | mov %rsi,$inp # reassigned argument | ||
746 | mov %rdx,$num # reassigned argument | ||
747 | vzeroall | ||
748 | |||
749 | shl \$6,$num | ||
750 | add $inp,$num | ||
751 | lea K_XX_XX(%rip),$K_XX_XX | ||
752 | |||
753 | mov 0($ctx),$A # load context | ||
754 | mov 4($ctx),$B | ||
755 | mov 8($ctx),$C | ||
756 | mov 12($ctx),$D | ||
757 | mov $B,@T[0] # magic seed | ||
758 | mov 16($ctx),$E | ||
759 | |||
760 | vmovdqa 64($K_XX_XX),@X[2] # pbswap mask | ||
761 | vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 | ||
762 | vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | ||
763 | vmovdqu 16($inp),@X[-3&7] | ||
764 | vmovdqu 32($inp),@X[-2&7] | ||
765 | vmovdqu 48($inp),@X[-1&7] | ||
766 | vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap | ||
767 | add \$64,$inp | ||
768 | vpshufb @X[2],@X[-3&7],@X[-3&7] | ||
769 | vpshufb @X[2],@X[-2&7],@X[-2&7] | ||
770 | vpshufb @X[2],@X[-1&7],@X[-1&7] | ||
771 | vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 | ||
772 | vpaddd @Tx[1],@X[-3&7],@X[1] | ||
773 | vpaddd @Tx[1],@X[-2&7],@X[2] | ||
774 | vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU | ||
775 | vmovdqa @X[1],16(%rsp) | ||
776 | vmovdqa @X[2],32(%rsp) | ||
777 | jmp .Loop_avx | ||
778 | ___ | ||
779 | |||
780 | sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 | ||
781 | { use integer; | ||
782 | my $body = shift; | ||
783 | my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | ||
784 | my ($a,$b,$c,$d,$e); | ||
785 | |||
786 | eval(shift(@insns)); | ||
787 | eval(shift(@insns)); | ||
788 | &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" | ||
789 | eval(shift(@insns)); | ||
790 | eval(shift(@insns)); | ||
791 | |||
792 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
793 | eval(shift(@insns)); | ||
794 | eval(shift(@insns)); | ||
795 | &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords | ||
796 | eval(shift(@insns)); | ||
797 | eval(shift(@insns)); | ||
798 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | ||
799 | eval(shift(@insns)); | ||
800 | eval(shift(@insns)); | ||
801 | |||
802 | &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | ||
803 | eval(shift(@insns)); | ||
804 | eval(shift(@insns)); | ||
805 | eval(shift(@insns)); | ||
806 | eval(shift(@insns)); | ||
807 | |||
808 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | ||
809 | eval(shift(@insns)); | ||
810 | eval(shift(@insns)); | ||
811 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
812 | eval(shift(@insns)); | ||
813 | eval(shift(@insns)); | ||
814 | |||
815 | &vpsrld (@Tx[0],@X[0],31); | ||
816 | eval(shift(@insns)); | ||
817 | eval(shift(@insns)); | ||
818 | eval(shift(@insns)); | ||
819 | eval(shift(@insns)); | ||
820 | |||
821 | &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword | ||
822 | &vpaddd (@X[0],@X[0],@X[0]); | ||
823 | eval(shift(@insns)); | ||
824 | eval(shift(@insns)); | ||
825 | eval(shift(@insns)); | ||
826 | eval(shift(@insns)); | ||
827 | |||
828 | &vpsrld (@Tx[1],@Tx[2],30); | ||
829 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 | ||
830 | eval(shift(@insns)); | ||
831 | eval(shift(@insns)); | ||
832 | eval(shift(@insns)); | ||
833 | eval(shift(@insns)); | ||
834 | |||
835 | &vpslld (@Tx[2],@Tx[2],2); | ||
836 | &vpxor (@X[0],@X[0],@Tx[1]); | ||
837 | eval(shift(@insns)); | ||
838 | eval(shift(@insns)); | ||
839 | eval(shift(@insns)); | ||
840 | eval(shift(@insns)); | ||
841 | |||
842 | &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 | ||
843 | eval(shift(@insns)); | ||
844 | eval(shift(@insns)); | ||
845 | &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX | ||
846 | eval(shift(@insns)); | ||
847 | eval(shift(@insns)); | ||
848 | |||
849 | |||
850 | foreach (@insns) { eval; } # remaining instructions [if any] | ||
851 | |||
852 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
853 | push(@Tx,shift(@Tx)); | ||
854 | } | ||
855 | |||
856 | sub Xupdate_avx_32_79() | ||
857 | { use integer; | ||
858 | my $body = shift; | ||
859 | my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | ||
860 | my ($a,$b,$c,$d,$e); | ||
861 | |||
862 | &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" | ||
863 | &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | ||
864 | eval(shift(@insns)); # body_20_39 | ||
865 | eval(shift(@insns)); | ||
866 | eval(shift(@insns)); | ||
867 | eval(shift(@insns)); # rol | ||
868 | |||
869 | &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | ||
870 | eval(shift(@insns)); | ||
871 | eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | ||
872 | if ($Xi%5) { | ||
873 | &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | ||
874 | } else { # ... or load next one | ||
875 | &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | ||
876 | } | ||
877 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
878 | eval(shift(@insns)); # ror | ||
879 | eval(shift(@insns)); | ||
880 | |||
881 | &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" | ||
882 | eval(shift(@insns)); # body_20_39 | ||
883 | eval(shift(@insns)); | ||
884 | eval(shift(@insns)); | ||
885 | eval(shift(@insns)); # rol | ||
886 | |||
887 | &vpsrld (@Tx[0],@X[0],30); | ||
888 | &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU | ||
889 | eval(shift(@insns)); | ||
890 | eval(shift(@insns)); | ||
891 | eval(shift(@insns)); # ror | ||
892 | eval(shift(@insns)); | ||
893 | |||
894 | &vpslld (@X[0],@X[0],2); | ||
895 | eval(shift(@insns)); # body_20_39 | ||
896 | eval(shift(@insns)); | ||
897 | eval(shift(@insns)); | ||
898 | eval(shift(@insns)); # rol | ||
899 | eval(shift(@insns)); | ||
900 | eval(shift(@insns)); | ||
901 | eval(shift(@insns)); # ror | ||
902 | eval(shift(@insns)); | ||
903 | |||
904 | &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 | ||
905 | eval(shift(@insns)); # body_20_39 | ||
906 | eval(shift(@insns)); | ||
907 | &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); | ||
908 | eval(shift(@insns)); | ||
909 | eval(shift(@insns)); # rol | ||
910 | eval(shift(@insns)); | ||
911 | eval(shift(@insns)); | ||
912 | eval(shift(@insns)); # rol | ||
913 | eval(shift(@insns)); | ||
914 | |||
915 | foreach (@insns) { eval; } # remaining instructions | ||
916 | |||
917 | $Xi++; push(@X,shift(@X)); # "rotate" X[] | ||
918 | push(@Tx,shift(@Tx)); | ||
919 | } | ||
920 | |||
921 | sub Xuplast_avx_80() | ||
922 | { use integer; | ||
923 | my $body = shift; | ||
924 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
925 | my ($a,$b,$c,$d,$e); | ||
926 | |||
927 | eval(shift(@insns)); | ||
928 | &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | ||
929 | eval(shift(@insns)); | ||
930 | eval(shift(@insns)); | ||
931 | eval(shift(@insns)); | ||
932 | eval(shift(@insns)); | ||
933 | |||
934 | &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU | ||
935 | |||
936 | foreach (@insns) { eval; } # remaining instructions | ||
937 | |||
938 | &cmp ($inp,$num); | ||
939 | &je (".Ldone_avx"); | ||
940 | |||
941 | unshift(@Tx,pop(@Tx)); | ||
942 | |||
943 | &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask | ||
944 | &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 | ||
945 | &vmovdqu(@X[-4&7],"0($inp)"); # load input | ||
946 | &vmovdqu(@X[-3&7],"16($inp)"); | ||
947 | &vmovdqu(@X[-2&7],"32($inp)"); | ||
948 | &vmovdqu(@X[-1&7],"48($inp)"); | ||
949 | &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap | ||
950 | &add ($inp,64); | ||
951 | |||
952 | $Xi=0; | ||
953 | } | ||
954 | |||
955 | sub Xloop_avx() | ||
956 | { use integer; | ||
957 | my $body = shift; | ||
958 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
959 | my ($a,$b,$c,$d,$e); | ||
960 | |||
961 | eval(shift(@insns)); | ||
962 | eval(shift(@insns)); | ||
963 | &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); | ||
964 | eval(shift(@insns)); | ||
965 | eval(shift(@insns)); | ||
966 | &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); | ||
967 | eval(shift(@insns)); | ||
968 | eval(shift(@insns)); | ||
969 | eval(shift(@insns)); | ||
970 | eval(shift(@insns)); | ||
971 | &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU | ||
972 | eval(shift(@insns)); | ||
973 | eval(shift(@insns)); | ||
974 | |||
975 | foreach (@insns) { eval; } | ||
976 | $Xi++; | ||
977 | } | ||
978 | |||
979 | sub Xtail_avx() | ||
980 | { use integer; | ||
981 | my $body = shift; | ||
982 | my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | ||
983 | my ($a,$b,$c,$d,$e); | ||
984 | |||
985 | foreach (@insns) { eval; } | ||
986 | } | ||
987 | |||
988 | $code.=<<___; | ||
989 | .align 16 | ||
990 | .Loop_avx: | ||
991 | ___ | ||
992 | &Xupdate_avx_16_31(\&body_00_19); | ||
993 | &Xupdate_avx_16_31(\&body_00_19); | ||
994 | &Xupdate_avx_16_31(\&body_00_19); | ||
995 | &Xupdate_avx_16_31(\&body_00_19); | ||
996 | &Xupdate_avx_32_79(\&body_00_19); | ||
997 | &Xupdate_avx_32_79(\&body_20_39); | ||
998 | &Xupdate_avx_32_79(\&body_20_39); | ||
999 | &Xupdate_avx_32_79(\&body_20_39); | ||
1000 | &Xupdate_avx_32_79(\&body_20_39); | ||
1001 | &Xupdate_avx_32_79(\&body_20_39); | ||
1002 | &Xupdate_avx_32_79(\&body_40_59); | ||
1003 | &Xupdate_avx_32_79(\&body_40_59); | ||
1004 | &Xupdate_avx_32_79(\&body_40_59); | ||
1005 | &Xupdate_avx_32_79(\&body_40_59); | ||
1006 | &Xupdate_avx_32_79(\&body_40_59); | ||
1007 | &Xupdate_avx_32_79(\&body_20_39); | ||
1008 | &Xuplast_avx_80(\&body_20_39); # can jump to "done" | ||
1009 | |||
1010 | $saved_j=$j; @saved_V=@V; | ||
1011 | |||
1012 | &Xloop_avx(\&body_20_39); | ||
1013 | &Xloop_avx(\&body_20_39); | ||
1014 | &Xloop_avx(\&body_20_39); | ||
1015 | |||
1016 | $code.=<<___; | ||
1017 | add 0($ctx),$A # update context | ||
1018 | add 4($ctx),@T[0] | ||
1019 | add 8($ctx),$C | ||
1020 | add 12($ctx),$D | ||
1021 | mov $A,0($ctx) | ||
1022 | add 16($ctx),$E | ||
1023 | mov @T[0],4($ctx) | ||
1024 | mov @T[0],$B # magic seed | ||
1025 | mov $C,8($ctx) | ||
1026 | mov $D,12($ctx) | ||
1027 | mov $E,16($ctx) | ||
1028 | jmp .Loop_avx | ||
1029 | |||
1030 | .align 16 | ||
1031 | .Ldone_avx: | ||
1032 | ___ | ||
1033 | $j=$saved_j; @V=@saved_V; | ||
1034 | |||
1035 | &Xtail_avx(\&body_20_39); | ||
1036 | &Xtail_avx(\&body_20_39); | ||
1037 | &Xtail_avx(\&body_20_39); | ||
1038 | |||
1039 | $code.=<<___; | ||
1040 | vzeroall | ||
1041 | |||
1042 | add 0($ctx),$A # update context | ||
1043 | add 4($ctx),@T[0] | ||
1044 | add 8($ctx),$C | ||
1045 | mov $A,0($ctx) | ||
1046 | add 12($ctx),$D | ||
1047 | mov @T[0],4($ctx) | ||
1048 | add 16($ctx),$E | ||
1049 | mov $C,8($ctx) | ||
1050 | mov $D,12($ctx) | ||
1051 | mov $E,16($ctx) | ||
1052 | ___ | ||
1053 | $code.=<<___ if ($win64); | ||
1054 | movaps 64+0(%rsp),%xmm6 | ||
1055 | movaps 64+16(%rsp),%xmm7 | ||
1056 | movaps 64+32(%rsp),%xmm8 | ||
1057 | movaps 64+48(%rsp),%xmm9 | ||
1058 | movaps 64+64(%rsp),%xmm10 | ||
1059 | ___ | ||
1060 | $code.=<<___; | ||
1061 | lea `64+($win64?5*16:0)`(%rsp),%rsi | ||
1062 | mov 0(%rsi),%r12 | ||
1063 | mov 8(%rsi),%rbp | ||
1064 | mov 16(%rsi),%rbx | ||
1065 | lea 24(%rsi),%rsp | ||
1066 | .Lepilogue_avx: | ||
1067 | ret | ||
1068 | .size sha1_block_data_order_avx,.-sha1_block_data_order_avx | ||
1069 | ___ | ||
1070 | } | ||
1071 | $code.=<<___; | ||
1072 | .align 64 | ||
1073 | K_XX_XX: | ||
1074 | .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 | ||
1075 | .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 | ||
1076 | .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 | ||
1077 | .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 | ||
1078 | .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask | ||
1079 | ___ | ||
1080 | }}} | ||
1081 | $code.=<<___; | ||
1082 | .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
1083 | .align 64 | ||
244 | ___ | 1084 | ___ |
245 | 1085 | ||
246 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | 1086 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
@@ -272,25 +1112,75 @@ se_handler: | |||
272 | 1112 | ||
273 | lea .Lprologue(%rip),%r10 | 1113 | lea .Lprologue(%rip),%r10 |
274 | cmp %r10,%rbx # context->Rip<.Lprologue | 1114 | cmp %r10,%rbx # context->Rip<.Lprologue |
275 | jb .Lin_prologue | 1115 | jb .Lcommon_seh_tail |
276 | 1116 | ||
277 | mov 152($context),%rax # pull context->Rsp | 1117 | mov 152($context),%rax # pull context->Rsp |
278 | 1118 | ||
279 | lea .Lepilogue(%rip),%r10 | 1119 | lea .Lepilogue(%rip),%r10 |
280 | cmp %r10,%rbx # context->Rip>=.Lepilogue | 1120 | cmp %r10,%rbx # context->Rip>=.Lepilogue |
281 | jae .Lin_prologue | 1121 | jae .Lcommon_seh_tail |
282 | 1122 | ||
283 | mov `16*4`(%rax),%rax # pull saved stack pointer | 1123 | mov `16*4`(%rax),%rax # pull saved stack pointer |
284 | lea 24(%rax),%rax | 1124 | lea 32(%rax),%rax |
285 | 1125 | ||
286 | mov -8(%rax),%rbx | 1126 | mov -8(%rax),%rbx |
287 | mov -16(%rax),%rbp | 1127 | mov -16(%rax),%rbp |
288 | mov -24(%rax),%r12 | 1128 | mov -24(%rax),%r12 |
1129 | mov -32(%rax),%r13 | ||
289 | mov %rbx,144($context) # restore context->Rbx | 1130 | mov %rbx,144($context) # restore context->Rbx |
290 | mov %rbp,160($context) # restore context->Rbp | 1131 | mov %rbp,160($context) # restore context->Rbp |
291 | mov %r12,216($context) # restore context->R12 | 1132 | mov %r12,216($context) # restore context->R12 |
1133 | mov %r13,224($context) # restore context->R13 | ||
1134 | |||
1135 | jmp .Lcommon_seh_tail | ||
1136 | .size se_handler,.-se_handler | ||
292 | 1137 | ||
293 | .Lin_prologue: | 1138 | .type ssse3_handler,\@abi-omnipotent |
1139 | .align 16 | ||
1140 | ssse3_handler: | ||
1141 | push %rsi | ||
1142 | push %rdi | ||
1143 | push %rbx | ||
1144 | push %rbp | ||
1145 | push %r12 | ||
1146 | push %r13 | ||
1147 | push %r14 | ||
1148 | push %r15 | ||
1149 | pushfq | ||
1150 | sub \$64,%rsp | ||
1151 | |||
1152 | mov 120($context),%rax # pull context->Rax | ||
1153 | mov 248($context),%rbx # pull context->Rip | ||
1154 | |||
1155 | mov 8($disp),%rsi # disp->ImageBase | ||
1156 | mov 56($disp),%r11 # disp->HandlerData | ||
1157 | |||
1158 | mov 0(%r11),%r10d # HandlerData[0] | ||
1159 | lea (%rsi,%r10),%r10 # prologue label | ||
1160 | cmp %r10,%rbx # context->Rip<prologue label | ||
1161 | jb .Lcommon_seh_tail | ||
1162 | |||
1163 | mov 152($context),%rax # pull context->Rsp | ||
1164 | |||
1165 | mov 4(%r11),%r10d # HandlerData[1] | ||
1166 | lea (%rsi,%r10),%r10 # epilogue label | ||
1167 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
1168 | jae .Lcommon_seh_tail | ||
1169 | |||
1170 | lea 64(%rax),%rsi | ||
1171 | lea 512($context),%rdi # &context.Xmm6 | ||
1172 | mov \$10,%ecx | ||
1173 | .long 0xa548f3fc # cld; rep movsq | ||
1174 | lea `24+64+5*16`(%rax),%rax # adjust stack pointer | ||
1175 | |||
1176 | mov -8(%rax),%rbx | ||
1177 | mov -16(%rax),%rbp | ||
1178 | mov -24(%rax),%r12 | ||
1179 | mov %rbx,144($context) # restore context->Rbx | ||
1180 | mov %rbp,160($context) # restore context->Rbp | ||
1181 | mov %r12,216($context) # restore cotnext->R12 | ||
1182 | |||
1183 | .Lcommon_seh_tail: | ||
294 | mov 8(%rax),%rdi | 1184 | mov 8(%rax),%rdi |
295 | mov 16(%rax),%rsi | 1185 | mov 16(%rax),%rsi |
296 | mov %rax,152($context) # restore context->Rsp | 1186 | mov %rax,152($context) # restore context->Rsp |
@@ -328,19 +1218,38 @@ se_handler: | |||
328 | pop %rdi | 1218 | pop %rdi |
329 | pop %rsi | 1219 | pop %rsi |
330 | ret | 1220 | ret |
331 | .size se_handler,.-se_handler | 1221 | .size ssse3_handler,.-ssse3_handler |
332 | 1222 | ||
333 | .section .pdata | 1223 | .section .pdata |
334 | .align 4 | 1224 | .align 4 |
335 | .rva .LSEH_begin_sha1_block_data_order | 1225 | .rva .LSEH_begin_sha1_block_data_order |
336 | .rva .LSEH_end_sha1_block_data_order | 1226 | .rva .LSEH_end_sha1_block_data_order |
337 | .rva .LSEH_info_sha1_block_data_order | 1227 | .rva .LSEH_info_sha1_block_data_order |
338 | 1228 | .rva .LSEH_begin_sha1_block_data_order_ssse3 | |
1229 | .rva .LSEH_end_sha1_block_data_order_ssse3 | ||
1230 | .rva .LSEH_info_sha1_block_data_order_ssse3 | ||
1231 | ___ | ||
1232 | $code.=<<___ if ($avx); | ||
1233 | .rva .LSEH_begin_sha1_block_data_order_avx | ||
1234 | .rva .LSEH_end_sha1_block_data_order_avx | ||
1235 | .rva .LSEH_info_sha1_block_data_order_avx | ||
1236 | ___ | ||
1237 | $code.=<<___; | ||
339 | .section .xdata | 1238 | .section .xdata |
340 | .align 8 | 1239 | .align 8 |
341 | .LSEH_info_sha1_block_data_order: | 1240 | .LSEH_info_sha1_block_data_order: |
342 | .byte 9,0,0,0 | 1241 | .byte 9,0,0,0 |
343 | .rva se_handler | 1242 | .rva se_handler |
1243 | .LSEH_info_sha1_block_data_order_ssse3: | ||
1244 | .byte 9,0,0,0 | ||
1245 | .rva ssse3_handler | ||
1246 | .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] | ||
1247 | ___ | ||
1248 | $code.=<<___ if ($avx); | ||
1249 | .LSEH_info_sha1_block_data_order_avx: | ||
1250 | .byte 9,0,0,0 | ||
1251 | .rva ssse3_handler | ||
1252 | .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] | ||
344 | ___ | 1253 | ___ |
345 | } | 1254 | } |
346 | 1255 | ||
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl index ecc8b69c75..928ec53123 100644 --- a/src/lib/libcrypto/sha/asm/sha256-586.pl +++ b/src/lib/libcrypto/sha/asm/sha256-586.pl | |||
@@ -14,8 +14,8 @@ | |||
14 | # Pentium PIII P4 AMD K8 Core2 | 14 | # Pentium PIII P4 AMD K8 Core2 |
15 | # gcc 46 36 41 27 26 | 15 | # gcc 46 36 41 27 26 |
16 | # icc 57 33 38 25 23 | 16 | # icc 57 33 38 25 23 |
17 | # x86 asm 40 30 35 20 20 | 17 | # x86 asm 40 30 33 20 18 |
18 | # x86_64 asm(*) - - 21 15.8 16.5 | 18 | # x86_64 asm(*) - - 21 16 16 |
19 | # | 19 | # |
20 | # (*) x86_64 assembler performance is presented for reference | 20 | # (*) x86_64 assembler performance is presented for reference |
21 | # purposes. | 21 | # purposes. |
@@ -48,20 +48,19 @@ sub BODY_00_15() { | |||
48 | my $in_16_63=shift; | 48 | my $in_16_63=shift; |
49 | 49 | ||
50 | &mov ("ecx",$E); | 50 | &mov ("ecx",$E); |
51 | &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7] | 51 | &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) |
52 | &ror ("ecx",6); | 52 | &ror ("ecx",25-11); |
53 | &mov ("edi",$E); | ||
54 | &ror ("edi",11); | ||
55 | &mov ("esi",$Foff); | 53 | &mov ("esi",$Foff); |
56 | &xor ("ecx","edi"); | 54 | &xor ("ecx",$E); |
57 | &ror ("edi",25-11); | 55 | &ror ("ecx",11-6); |
58 | &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0] | 56 | &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0] |
59 | &xor ("ecx","edi"); # Sigma1(e) | 57 | &xor ("ecx",$E); |
58 | &ror ("ecx",6); # Sigma1(e) | ||
60 | &mov ("edi",$Goff); | 59 | &mov ("edi",$Goff); |
61 | &add ($T,"ecx"); # T += Sigma1(e) | 60 | &add ($T,"ecx"); # T += Sigma1(e) |
62 | &mov ($Eoff,$E); # modulo-scheduled | ||
63 | 61 | ||
64 | &xor ("esi","edi"); | 62 | &xor ("esi","edi"); |
63 | &mov ($Eoff,$E); # modulo-scheduled | ||
65 | &mov ("ecx",$A); | 64 | &mov ("ecx",$A); |
66 | &and ("esi",$E); | 65 | &and ("esi",$E); |
67 | &mov ($E,$Doff); # e becomes d, which is e in next iteration | 66 | &mov ($E,$Doff); # e becomes d, which is e in next iteration |
@@ -69,14 +68,14 @@ sub BODY_00_15() { | |||
69 | &mov ("edi",$A); | 68 | &mov ("edi",$A); |
70 | &add ($T,"esi"); # T += Ch(e,f,g) | 69 | &add ($T,"esi"); # T += Ch(e,f,g) |
71 | 70 | ||
72 | &ror ("ecx",2); | 71 | &ror ("ecx",22-13); |
73 | &add ($T,$Hoff); # T += h | 72 | &add ($T,$Hoff); # T += h |
74 | &ror ("edi",13); | 73 | &xor ("ecx",$A); |
74 | &ror ("ecx",13-2); | ||
75 | &mov ("esi",$Boff); | 75 | &mov ("esi",$Boff); |
76 | &xor ("ecx","edi"); | 76 | &xor ("ecx",$A); |
77 | &ror ("edi",22-13); | 77 | &ror ("ecx",2); # Sigma0(a) |
78 | &add ($E,$T); # d += T | 78 | &add ($E,$T); # d += T |
79 | &xor ("ecx","edi"); # Sigma0(a) | ||
80 | &mov ("edi",$Coff); | 79 | &mov ("edi",$Coff); |
81 | 80 | ||
82 | &add ($T,"ecx"); # T += Sigma0(a) | 81 | &add ($T,"ecx"); # T += Sigma0(a) |
@@ -168,23 +167,22 @@ sub BODY_00_15() { | |||
168 | &set_label("16_63",16); | 167 | &set_label("16_63",16); |
169 | &mov ("esi",$T); | 168 | &mov ("esi",$T); |
170 | &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); | 169 | &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); |
171 | &shr ($T,3); | ||
172 | &ror ("esi",7); | ||
173 | &xor ($T,"esi"); | ||
174 | &ror ("esi",18-7); | 170 | &ror ("esi",18-7); |
175 | &mov ("edi","ecx"); | 171 | &mov ("edi","ecx"); |
176 | &xor ($T,"esi"); # T = sigma0(X[-15]) | 172 | &xor ("esi",$T); |
173 | &ror ("esi",7); | ||
174 | &shr ($T,3); | ||
177 | 175 | ||
178 | &shr ("ecx",10); | ||
179 | &mov ("esi",&DWP(4*(8+15+16),"esp")); | ||
180 | &ror ("edi",17); | ||
181 | &xor ("ecx","edi"); | ||
182 | &ror ("edi",19-17); | 176 | &ror ("edi",19-17); |
183 | &add ($T,"esi"); # T += X[-16] | 177 | &xor ($T,"esi"); # T = sigma0(X[-15]) |
184 | &xor ("edi","ecx") # sigma1(X[-2]) | 178 | &xor ("edi","ecx"); |
179 | &ror ("edi",17); | ||
180 | &shr ("ecx",10); | ||
181 | &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16] | ||
182 | &xor ("edi","ecx"); # sigma1(X[-2]) | ||
185 | 183 | ||
186 | &add ($T,"edi"); # T += sigma1(X[-2]) | 184 | &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7] |
187 | # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1) | 185 | # &add ($T,"edi"); # T += sigma1(X[-2]) |
188 | # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] | 186 | # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] |
189 | 187 | ||
190 | &BODY_00_15(1); | 188 | &BODY_00_15(1); |
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl index 492cb62bc0..9c84e8d93c 100644 --- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl +++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl | |||
@@ -18,11 +18,16 @@ | |||
18 | # Rescheduling for dual-issue pipeline resulted in 22% improvement on | 18 | # Rescheduling for dual-issue pipeline resulted in 22% improvement on |
19 | # Cortex A8 core and ~20 cycles per processed byte. | 19 | # Cortex A8 core and ~20 cycles per processed byte. |
20 | 20 | ||
21 | # February 2011. | ||
22 | # | ||
23 | # Profiler-assisted and platform-specific optimization resulted in 16% | ||
24 | # improvement on Cortex A8 core and ~17 cycles per processed byte. | ||
25 | |||
21 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 26 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
22 | open STDOUT,">$output"; | 27 | open STDOUT,">$output"; |
23 | 28 | ||
24 | $ctx="r0"; $t0="r0"; | 29 | $ctx="r0"; $t0="r0"; |
25 | $inp="r1"; | 30 | $inp="r1"; $t3="r1"; |
26 | $len="r2"; $t1="r2"; | 31 | $len="r2"; $t1="r2"; |
27 | $T1="r3"; | 32 | $T1="r3"; |
28 | $A="r4"; | 33 | $A="r4"; |
@@ -46,6 +51,9 @@ sub BODY_00_15 { | |||
46 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 51 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
47 | 52 | ||
48 | $code.=<<___ if ($i<16); | 53 | $code.=<<___ if ($i<16); |
54 | #if __ARM_ARCH__>=7 | ||
55 | ldr $T1,[$inp],#4 | ||
56 | #else | ||
49 | ldrb $T1,[$inp,#3] @ $i | 57 | ldrb $T1,[$inp,#3] @ $i |
50 | ldrb $t2,[$inp,#2] | 58 | ldrb $t2,[$inp,#2] |
51 | ldrb $t1,[$inp,#1] | 59 | ldrb $t1,[$inp,#1] |
@@ -53,16 +61,24 @@ $code.=<<___ if ($i<16); | |||
53 | orr $T1,$T1,$t2,lsl#8 | 61 | orr $T1,$T1,$t2,lsl#8 |
54 | orr $T1,$T1,$t1,lsl#16 | 62 | orr $T1,$T1,$t1,lsl#16 |
55 | orr $T1,$T1,$t0,lsl#24 | 63 | orr $T1,$T1,$t0,lsl#24 |
56 | `"str $inp,[sp,#17*4]" if ($i==15)` | 64 | #endif |
57 | ___ | 65 | ___ |
58 | $code.=<<___; | 66 | $code.=<<___; |
59 | ldr $t2,[$Ktbl],#4 @ *K256++ | ||
60 | mov $t0,$e,ror#$Sigma1[0] | 67 | mov $t0,$e,ror#$Sigma1[0] |
61 | str $T1,[sp,#`$i%16`*4] | 68 | ldr $t2,[$Ktbl],#4 @ *K256++ |
62 | eor $t0,$t0,$e,ror#$Sigma1[1] | 69 | eor $t0,$t0,$e,ror#$Sigma1[1] |
63 | eor $t1,$f,$g | 70 | eor $t1,$f,$g |
71 | #if $i>=16 | ||
72 | add $T1,$T1,$t3 @ from BODY_16_xx | ||
73 | #elif __ARM_ARCH__>=7 && defined(__ARMEL__) | ||
74 | rev $T1,$T1 | ||
75 | #endif | ||
76 | #if $i==15 | ||
77 | str $inp,[sp,#17*4] @ leave room for $t3 | ||
78 | #endif | ||
64 | eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) | 79 | eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) |
65 | and $t1,$t1,$e | 80 | and $t1,$t1,$e |
81 | str $T1,[sp,#`$i%16`*4] | ||
66 | add $T1,$T1,$t0 | 82 | add $T1,$T1,$t0 |
67 | eor $t1,$t1,$g @ Ch(e,f,g) | 83 | eor $t1,$t1,$g @ Ch(e,f,g) |
68 | add $T1,$T1,$h | 84 | add $T1,$T1,$h |
@@ -71,6 +87,9 @@ $code.=<<___; | |||
71 | eor $h,$h,$a,ror#$Sigma0[1] | 87 | eor $h,$h,$a,ror#$Sigma0[1] |
72 | add $T1,$T1,$t2 | 88 | add $T1,$T1,$t2 |
73 | eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) | 89 | eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) |
90 | #if $i>=15 | ||
91 | ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx | ||
92 | #endif | ||
74 | orr $t0,$a,$b | 93 | orr $t0,$a,$b |
75 | and $t1,$a,$b | 94 | and $t1,$a,$b |
76 | and $t0,$t0,$c | 95 | and $t0,$t0,$c |
@@ -85,24 +104,26 @@ sub BODY_16_XX { | |||
85 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 104 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
86 | 105 | ||
87 | $code.=<<___; | 106 | $code.=<<___; |
88 | ldr $t1,[sp,#`($i+1)%16`*4] @ $i | 107 | @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i |
89 | ldr $t2,[sp,#`($i+14)%16`*4] | 108 | ldr $t2,[sp,#`($i+14)%16`*4] |
109 | mov $t0,$t3,ror#$sigma0[0] | ||
90 | ldr $T1,[sp,#`($i+0)%16`*4] | 110 | ldr $T1,[sp,#`($i+0)%16`*4] |
91 | mov $t0,$t1,ror#$sigma0[0] | 111 | eor $t0,$t0,$t3,ror#$sigma0[1] |
92 | ldr $inp,[sp,#`($i+9)%16`*4] | 112 | ldr $t1,[sp,#`($i+9)%16`*4] |
93 | eor $t0,$t0,$t1,ror#$sigma0[1] | 113 | eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1]) |
94 | eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) | 114 | mov $t3,$t2,ror#$sigma1[0] |
95 | mov $t1,$t2,ror#$sigma1[0] | ||
96 | add $T1,$T1,$t0 | 115 | add $T1,$T1,$t0 |
97 | eor $t1,$t1,$t2,ror#$sigma1[1] | 116 | eor $t3,$t3,$t2,ror#$sigma1[1] |
98 | add $T1,$T1,$inp | ||
99 | eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) | ||
100 | add $T1,$T1,$t1 | 117 | add $T1,$T1,$t1 |
118 | eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) | ||
119 | @ add $T1,$T1,$t3 | ||
101 | ___ | 120 | ___ |
102 | &BODY_00_15(@_); | 121 | &BODY_00_15(@_); |
103 | } | 122 | } |
104 | 123 | ||
105 | $code=<<___; | 124 | $code=<<___; |
125 | #include "arm_arch.h" | ||
126 | |||
106 | .text | 127 | .text |
107 | .code 32 | 128 | .code 32 |
108 | 129 | ||
@@ -132,7 +153,7 @@ K256: | |||
132 | sha256_block_data_order: | 153 | sha256_block_data_order: |
133 | sub r3,pc,#8 @ sha256_block_data_order | 154 | sub r3,pc,#8 @ sha256_block_data_order |
134 | add $len,$inp,$len,lsl#6 @ len to point at the end of inp | 155 | add $len,$inp,$len,lsl#6 @ len to point at the end of inp |
135 | stmdb sp!,{$ctx,$inp,$len,r4-r12,lr} | 156 | stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} |
136 | ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} | 157 | ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} |
137 | sub $Ktbl,r3,#256 @ K256 | 158 | sub $Ktbl,r3,#256 @ K256 |
138 | sub sp,sp,#16*4 @ alloca(X[16]) | 159 | sub sp,sp,#16*4 @ alloca(X[16]) |
@@ -171,10 +192,14 @@ $code.=<<___; | |||
171 | bne .Loop | 192 | bne .Loop |
172 | 193 | ||
173 | add sp,sp,#`16+3`*4 @ destroy frame | 194 | add sp,sp,#`16+3`*4 @ destroy frame |
174 | ldmia sp!,{r4-r12,lr} | 195 | #if __ARM_ARCH__>=5 |
196 | ldmia sp!,{r4-r11,pc} | ||
197 | #else | ||
198 | ldmia sp!,{r4-r11,lr} | ||
175 | tst lr,#1 | 199 | tst lr,#1 |
176 | moveq pc,lr @ be binary compatible with V4, yet | 200 | moveq pc,lr @ be binary compatible with V4, yet |
177 | bx lr @ interoperable with Thumb ISA:-) | 201 | bx lr @ interoperable with Thumb ISA:-) |
202 | #endif | ||
178 | .size sha256_block_data_order,.-sha256_block_data_order | 203 | .size sha256_block_data_order,.-sha256_block_data_order |
179 | .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | 204 | .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" |
180 | .align 2 | 205 | .align 2 |
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl index 3a35861ac6..7faf37b147 100644 --- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl +++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl | |||
@@ -18,22 +18,33 @@ | |||
18 | # Rescheduling for dual-issue pipeline resulted in 6% improvement on | 18 | # Rescheduling for dual-issue pipeline resulted in 6% improvement on |
19 | # Cortex A8 core and ~40 cycles per processed byte. | 19 | # Cortex A8 core and ~40 cycles per processed byte. |
20 | 20 | ||
21 | # February 2011. | ||
22 | # | ||
23 | # Profiler-assisted and platform-specific optimization resulted in 7% | ||
24 | # improvement on Coxtex A8 core and ~38 cycles per byte. | ||
25 | |||
26 | # March 2011. | ||
27 | # | ||
28 | # Add NEON implementation. On Cortex A8 it was measured to process | ||
29 | # one byte in 25.5 cycles or 47% faster than integer-only code. | ||
30 | |||
21 | # Byte order [in]dependence. ========================================= | 31 | # Byte order [in]dependence. ========================================= |
22 | # | 32 | # |
23 | # Caller is expected to maintain specific *dword* order in h[0-7], | 33 | # Originally caller was expected to maintain specific *dword* order in |
24 | # namely with most significant dword at *lower* address, which is | 34 | # h[0-7], namely with most significant dword at *lower* address, which |
25 | # reflected in below two parameters. *Byte* order within these dwords | 35 | # was reflected in below two parameters as 0 and 4. Now caller is |
26 | # in turn is whatever *native* byte order on current platform. | 36 | # expected to maintain native byte order for whole 64-bit values. |
27 | $hi=0; | 37 | $hi="HI"; |
28 | $lo=4; | 38 | $lo="LO"; |
29 | # ==================================================================== | 39 | # ==================================================================== |
30 | 40 | ||
31 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | 41 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
32 | open STDOUT,">$output"; | 42 | open STDOUT,">$output"; |
33 | 43 | ||
34 | $ctx="r0"; | 44 | $ctx="r0"; # parameter block |
35 | $inp="r1"; | 45 | $inp="r1"; |
36 | $len="r2"; | 46 | $len="r2"; |
47 | |||
37 | $Tlo="r3"; | 48 | $Tlo="r3"; |
38 | $Thi="r4"; | 49 | $Thi="r4"; |
39 | $Alo="r5"; | 50 | $Alo="r5"; |
@@ -61,15 +72,17 @@ $Xoff=8*8; | |||
61 | sub BODY_00_15() { | 72 | sub BODY_00_15() { |
62 | my $magic = shift; | 73 | my $magic = shift; |
63 | $code.=<<___; | 74 | $code.=<<___; |
64 | ldr $t2,[sp,#$Hoff+0] @ h.lo | ||
65 | ldr $t3,[sp,#$Hoff+4] @ h.hi | ||
66 | @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) | 75 | @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) |
67 | @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 | 76 | @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 |
68 | @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 | 77 | @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 |
69 | mov $t0,$Elo,lsr#14 | 78 | mov $t0,$Elo,lsr#14 |
79 | str $Tlo,[sp,#$Xoff+0] | ||
70 | mov $t1,$Ehi,lsr#14 | 80 | mov $t1,$Ehi,lsr#14 |
81 | str $Thi,[sp,#$Xoff+4] | ||
71 | eor $t0,$t0,$Ehi,lsl#18 | 82 | eor $t0,$t0,$Ehi,lsl#18 |
83 | ldr $t2,[sp,#$Hoff+0] @ h.lo | ||
72 | eor $t1,$t1,$Elo,lsl#18 | 84 | eor $t1,$t1,$Elo,lsl#18 |
85 | ldr $t3,[sp,#$Hoff+4] @ h.hi | ||
73 | eor $t0,$t0,$Elo,lsr#18 | 86 | eor $t0,$t0,$Elo,lsr#18 |
74 | eor $t1,$t1,$Ehi,lsr#18 | 87 | eor $t1,$t1,$Ehi,lsr#18 |
75 | eor $t0,$t0,$Ehi,lsl#14 | 88 | eor $t0,$t0,$Ehi,lsl#14 |
@@ -96,25 +109,24 @@ $code.=<<___; | |||
96 | and $t1,$t1,$Ehi | 109 | and $t1,$t1,$Ehi |
97 | str $Ahi,[sp,#$Aoff+4] | 110 | str $Ahi,[sp,#$Aoff+4] |
98 | eor $t0,$t0,$t2 | 111 | eor $t0,$t0,$t2 |
99 | ldr $t2,[$Ktbl,#4] @ K[i].lo | 112 | ldr $t2,[$Ktbl,#$lo] @ K[i].lo |
100 | eor $t1,$t1,$t3 @ Ch(e,f,g) | 113 | eor $t1,$t1,$t3 @ Ch(e,f,g) |
101 | ldr $t3,[$Ktbl,#0] @ K[i].hi | 114 | ldr $t3,[$Ktbl,#$hi] @ K[i].hi |
102 | 115 | ||
103 | adds $Tlo,$Tlo,$t0 | 116 | adds $Tlo,$Tlo,$t0 |
104 | ldr $Elo,[sp,#$Doff+0] @ d.lo | 117 | ldr $Elo,[sp,#$Doff+0] @ d.lo |
105 | adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) | 118 | adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) |
106 | ldr $Ehi,[sp,#$Doff+4] @ d.hi | 119 | ldr $Ehi,[sp,#$Doff+4] @ d.hi |
107 | adds $Tlo,$Tlo,$t2 | 120 | adds $Tlo,$Tlo,$t2 |
121 | and $t0,$t2,#0xff | ||
108 | adc $Thi,$Thi,$t3 @ T += K[i] | 122 | adc $Thi,$Thi,$t3 @ T += K[i] |
109 | adds $Elo,$Elo,$Tlo | 123 | adds $Elo,$Elo,$Tlo |
124 | ldr $t2,[sp,#$Boff+0] @ b.lo | ||
110 | adc $Ehi,$Ehi,$Thi @ d += T | 125 | adc $Ehi,$Ehi,$Thi @ d += T |
111 | |||
112 | and $t0,$t2,#0xff | ||
113 | teq $t0,#$magic | 126 | teq $t0,#$magic |
114 | orreq $Ktbl,$Ktbl,#1 | ||
115 | 127 | ||
116 | ldr $t2,[sp,#$Boff+0] @ b.lo | ||
117 | ldr $t3,[sp,#$Coff+0] @ c.lo | 128 | ldr $t3,[sp,#$Coff+0] @ c.lo |
129 | orreq $Ktbl,$Ktbl,#1 | ||
118 | @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) | 130 | @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) |
119 | @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 | 131 | @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 |
120 | @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 | 132 | @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 |
@@ -131,80 +143,100 @@ $code.=<<___; | |||
131 | eor $t0,$t0,$Alo,lsl#25 | 143 | eor $t0,$t0,$Alo,lsl#25 |
132 | eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) | 144 | eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) |
133 | adds $Tlo,$Tlo,$t0 | 145 | adds $Tlo,$Tlo,$t0 |
146 | and $t0,$Alo,$t2 | ||
134 | adc $Thi,$Thi,$t1 @ T += Sigma0(a) | 147 | adc $Thi,$Thi,$t1 @ T += Sigma0(a) |
135 | 148 | ||
136 | and $t0,$Alo,$t2 | ||
137 | orr $Alo,$Alo,$t2 | ||
138 | ldr $t1,[sp,#$Boff+4] @ b.hi | 149 | ldr $t1,[sp,#$Boff+4] @ b.hi |
150 | orr $Alo,$Alo,$t2 | ||
139 | ldr $t2,[sp,#$Coff+4] @ c.hi | 151 | ldr $t2,[sp,#$Coff+4] @ c.hi |
140 | and $Alo,$Alo,$t3 | 152 | and $Alo,$Alo,$t3 |
141 | orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo | ||
142 | and $t3,$Ahi,$t1 | 153 | and $t3,$Ahi,$t1 |
143 | orr $Ahi,$Ahi,$t1 | 154 | orr $Ahi,$Ahi,$t1 |
155 | orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo | ||
144 | and $Ahi,$Ahi,$t2 | 156 | and $Ahi,$Ahi,$t2 |
145 | orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi | ||
146 | adds $Alo,$Alo,$Tlo | 157 | adds $Alo,$Alo,$Tlo |
147 | adc $Ahi,$Ahi,$Thi @ h += T | 158 | orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi |
148 | |||
149 | sub sp,sp,#8 | 159 | sub sp,sp,#8 |
160 | adc $Ahi,$Ahi,$Thi @ h += T | ||
161 | tst $Ktbl,#1 | ||
150 | add $Ktbl,$Ktbl,#8 | 162 | add $Ktbl,$Ktbl,#8 |
151 | ___ | 163 | ___ |
152 | } | 164 | } |
153 | $code=<<___; | 165 | $code=<<___; |
166 | #include "arm_arch.h" | ||
167 | #ifdef __ARMEL__ | ||
168 | # define LO 0 | ||
169 | # define HI 4 | ||
170 | # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 | ||
171 | #else | ||
172 | # define HI 0 | ||
173 | # define LO 4 | ||
174 | # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 | ||
175 | #endif | ||
176 | |||
154 | .text | 177 | .text |
155 | .code 32 | 178 | .code 32 |
156 | .type K512,%object | 179 | .type K512,%object |
157 | .align 5 | 180 | .align 5 |
158 | K512: | 181 | K512: |
159 | .word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd | 182 | WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) |
160 | .word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc | 183 | WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) |
161 | .word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 | 184 | WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) |
162 | .word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 | 185 | WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) |
163 | .word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe | 186 | WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) |
164 | .word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 | 187 | WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) |
165 | .word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 | 188 | WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) |
166 | .word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 | 189 | WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) |
167 | .word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 | 190 | WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) |
168 | .word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 | 191 | WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) |
169 | .word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 | 192 | WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) |
170 | .word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 | 193 | WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) |
171 | .word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 | 194 | WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) |
172 | .word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 | 195 | WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) |
173 | .word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 | 196 | WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) |
174 | .word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 | 197 | WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) |
175 | .word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 | 198 | WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) |
176 | .word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df | 199 | WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) |
177 | .word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 | 200 | WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) |
178 | .word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b | 201 | WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) |
179 | .word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 | 202 | WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) |
180 | .word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 | 203 | WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) |
181 | .word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 | 204 | WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) |
182 | .word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 | 205 | WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) |
183 | .word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 | 206 | WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) |
184 | .word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 | 207 | WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) |
185 | .word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb | 208 | WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) |
186 | .word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 | 209 | WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) |
187 | .word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 | 210 | WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) |
188 | .word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec | 211 | WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) |
189 | .word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 | 212 | WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) |
190 | .word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b | 213 | WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) |
191 | .word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 | 214 | WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) |
192 | .word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 | 215 | WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) |
193 | .word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 | 216 | WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) |
194 | .word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b | 217 | WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) |
195 | .word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 | 218 | WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) |
196 | .word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c | 219 | WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) |
197 | .word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a | 220 | WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) |
198 | .word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 | 221 | WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) |
199 | .size K512,.-K512 | 222 | .size K512,.-K512 |
223 | .LOPENSSL_armcap: | ||
224 | .word OPENSSL_armcap_P-sha512_block_data_order | ||
225 | .skip 32-4 | ||
200 | 226 | ||
201 | .global sha512_block_data_order | 227 | .global sha512_block_data_order |
202 | .type sha512_block_data_order,%function | 228 | .type sha512_block_data_order,%function |
203 | sha512_block_data_order: | 229 | sha512_block_data_order: |
204 | sub r3,pc,#8 @ sha512_block_data_order | 230 | sub r3,pc,#8 @ sha512_block_data_order |
205 | add $len,$inp,$len,lsl#7 @ len to point at the end of inp | 231 | add $len,$inp,$len,lsl#7 @ len to point at the end of inp |
232 | #if __ARM_ARCH__>=7 | ||
233 | ldr r12,.LOPENSSL_armcap | ||
234 | ldr r12,[r3,r12] @ OPENSSL_armcap_P | ||
235 | tst r12,#1 | ||
236 | bne .LNEON | ||
237 | #endif | ||
206 | stmdb sp!,{r4-r12,lr} | 238 | stmdb sp!,{r4-r12,lr} |
207 | sub $Ktbl,r3,#640 @ K512 | 239 | sub $Ktbl,r3,#672 @ K512 |
208 | sub sp,sp,#9*8 | 240 | sub sp,sp,#9*8 |
209 | 241 | ||
210 | ldr $Elo,[$ctx,#$Eoff+$lo] | 242 | ldr $Elo,[$ctx,#$Eoff+$lo] |
@@ -238,6 +270,7 @@ sha512_block_data_order: | |||
238 | str $Thi,[sp,#$Foff+4] | 270 | str $Thi,[sp,#$Foff+4] |
239 | 271 | ||
240 | .L00_15: | 272 | .L00_15: |
273 | #if __ARM_ARCH__<7 | ||
241 | ldrb $Tlo,[$inp,#7] | 274 | ldrb $Tlo,[$inp,#7] |
242 | ldrb $t0, [$inp,#6] | 275 | ldrb $t0, [$inp,#6] |
243 | ldrb $t1, [$inp,#5] | 276 | ldrb $t1, [$inp,#5] |
@@ -252,26 +285,30 @@ sha512_block_data_order: | |||
252 | orr $Thi,$Thi,$t3,lsl#8 | 285 | orr $Thi,$Thi,$t3,lsl#8 |
253 | orr $Thi,$Thi,$t0,lsl#16 | 286 | orr $Thi,$Thi,$t0,lsl#16 |
254 | orr $Thi,$Thi,$t1,lsl#24 | 287 | orr $Thi,$Thi,$t1,lsl#24 |
255 | str $Tlo,[sp,#$Xoff+0] | 288 | #else |
256 | str $Thi,[sp,#$Xoff+4] | 289 | ldr $Tlo,[$inp,#4] |
290 | ldr $Thi,[$inp],#8 | ||
291 | #ifdef __ARMEL__ | ||
292 | rev $Tlo,$Tlo | ||
293 | rev $Thi,$Thi | ||
294 | #endif | ||
295 | #endif | ||
257 | ___ | 296 | ___ |
258 | &BODY_00_15(0x94); | 297 | &BODY_00_15(0x94); |
259 | $code.=<<___; | 298 | $code.=<<___; |
260 | tst $Ktbl,#1 | 299 | tst $Ktbl,#1 |
261 | beq .L00_15 | 300 | beq .L00_15 |
262 | bic $Ktbl,$Ktbl,#1 | ||
263 | |||
264 | .L16_79: | ||
265 | ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] | 301 | ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] |
266 | ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] | 302 | ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] |
267 | ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] | 303 | bic $Ktbl,$Ktbl,#1 |
268 | ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] | 304 | .L16_79: |
269 | |||
270 | @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) | 305 | @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) |
271 | @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 | 306 | @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 |
272 | @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 | 307 | @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 |
273 | mov $Tlo,$t0,lsr#1 | 308 | mov $Tlo,$t0,lsr#1 |
309 | ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] | ||
274 | mov $Thi,$t1,lsr#1 | 310 | mov $Thi,$t1,lsr#1 |
311 | ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] | ||
275 | eor $Tlo,$Tlo,$t1,lsl#31 | 312 | eor $Tlo,$Tlo,$t1,lsl#31 |
276 | eor $Thi,$Thi,$t0,lsl#31 | 313 | eor $Thi,$Thi,$t0,lsl#31 |
277 | eor $Tlo,$Tlo,$t0,lsr#8 | 314 | eor $Tlo,$Tlo,$t0,lsr#8 |
@@ -295,25 +332,24 @@ $code.=<<___; | |||
295 | eor $t1,$t1,$t3,lsl#3 | 332 | eor $t1,$t1,$t3,lsl#3 |
296 | eor $t0,$t0,$t2,lsr#6 | 333 | eor $t0,$t0,$t2,lsr#6 |
297 | eor $t1,$t1,$t3,lsr#6 | 334 | eor $t1,$t1,$t3,lsr#6 |
335 | ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] | ||
298 | eor $t0,$t0,$t3,lsl#26 | 336 | eor $t0,$t0,$t3,lsl#26 |
299 | 337 | ||
300 | ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] | ||
301 | ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] | 338 | ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] |
302 | adds $Tlo,$Tlo,$t0 | 339 | adds $Tlo,$Tlo,$t0 |
340 | ldr $t0,[sp,#`$Xoff+8*16`+0] | ||
303 | adc $Thi,$Thi,$t1 | 341 | adc $Thi,$Thi,$t1 |
304 | 342 | ||
305 | ldr $t0,[sp,#`$Xoff+8*16`+0] | ||
306 | ldr $t1,[sp,#`$Xoff+8*16`+4] | 343 | ldr $t1,[sp,#`$Xoff+8*16`+4] |
307 | adds $Tlo,$Tlo,$t2 | 344 | adds $Tlo,$Tlo,$t2 |
308 | adc $Thi,$Thi,$t3 | 345 | adc $Thi,$Thi,$t3 |
309 | adds $Tlo,$Tlo,$t0 | 346 | adds $Tlo,$Tlo,$t0 |
310 | adc $Thi,$Thi,$t1 | 347 | adc $Thi,$Thi,$t1 |
311 | str $Tlo,[sp,#$Xoff+0] | ||
312 | str $Thi,[sp,#$Xoff+4] | ||
313 | ___ | 348 | ___ |
314 | &BODY_00_15(0x17); | 349 | &BODY_00_15(0x17); |
315 | $code.=<<___; | 350 | $code.=<<___; |
316 | tst $Ktbl,#1 | 351 | ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] |
352 | ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] | ||
317 | beq .L16_79 | 353 | beq .L16_79 |
318 | bic $Ktbl,$Ktbl,#1 | 354 | bic $Ktbl,$Ktbl,#1 |
319 | 355 | ||
@@ -324,12 +360,12 @@ $code.=<<___; | |||
324 | ldr $t2, [$ctx,#$Boff+$lo] | 360 | ldr $t2, [$ctx,#$Boff+$lo] |
325 | ldr $t3, [$ctx,#$Boff+$hi] | 361 | ldr $t3, [$ctx,#$Boff+$hi] |
326 | adds $t0,$Alo,$t0 | 362 | adds $t0,$Alo,$t0 |
327 | adc $t1,$Ahi,$t1 | ||
328 | adds $t2,$Tlo,$t2 | ||
329 | adc $t3,$Thi,$t3 | ||
330 | str $t0, [$ctx,#$Aoff+$lo] | 363 | str $t0, [$ctx,#$Aoff+$lo] |
364 | adc $t1,$Ahi,$t1 | ||
331 | str $t1, [$ctx,#$Aoff+$hi] | 365 | str $t1, [$ctx,#$Aoff+$hi] |
366 | adds $t2,$Tlo,$t2 | ||
332 | str $t2, [$ctx,#$Boff+$lo] | 367 | str $t2, [$ctx,#$Boff+$lo] |
368 | adc $t3,$Thi,$t3 | ||
333 | str $t3, [$ctx,#$Boff+$hi] | 369 | str $t3, [$ctx,#$Boff+$hi] |
334 | 370 | ||
335 | ldr $Alo,[sp,#$Coff+0] | 371 | ldr $Alo,[sp,#$Coff+0] |
@@ -341,12 +377,12 @@ $code.=<<___; | |||
341 | ldr $t2, [$ctx,#$Doff+$lo] | 377 | ldr $t2, [$ctx,#$Doff+$lo] |
342 | ldr $t3, [$ctx,#$Doff+$hi] | 378 | ldr $t3, [$ctx,#$Doff+$hi] |
343 | adds $t0,$Alo,$t0 | 379 | adds $t0,$Alo,$t0 |
344 | adc $t1,$Ahi,$t1 | ||
345 | adds $t2,$Tlo,$t2 | ||
346 | adc $t3,$Thi,$t3 | ||
347 | str $t0, [$ctx,#$Coff+$lo] | 380 | str $t0, [$ctx,#$Coff+$lo] |
381 | adc $t1,$Ahi,$t1 | ||
348 | str $t1, [$ctx,#$Coff+$hi] | 382 | str $t1, [$ctx,#$Coff+$hi] |
383 | adds $t2,$Tlo,$t2 | ||
349 | str $t2, [$ctx,#$Doff+$lo] | 384 | str $t2, [$ctx,#$Doff+$lo] |
385 | adc $t3,$Thi,$t3 | ||
350 | str $t3, [$ctx,#$Doff+$hi] | 386 | str $t3, [$ctx,#$Doff+$hi] |
351 | 387 | ||
352 | ldr $Tlo,[sp,#$Foff+0] | 388 | ldr $Tlo,[sp,#$Foff+0] |
@@ -356,12 +392,12 @@ $code.=<<___; | |||
356 | ldr $t2, [$ctx,#$Foff+$lo] | 392 | ldr $t2, [$ctx,#$Foff+$lo] |
357 | ldr $t3, [$ctx,#$Foff+$hi] | 393 | ldr $t3, [$ctx,#$Foff+$hi] |
358 | adds $Elo,$Elo,$t0 | 394 | adds $Elo,$Elo,$t0 |
359 | adc $Ehi,$Ehi,$t1 | ||
360 | adds $t2,$Tlo,$t2 | ||
361 | adc $t3,$Thi,$t3 | ||
362 | str $Elo,[$ctx,#$Eoff+$lo] | 395 | str $Elo,[$ctx,#$Eoff+$lo] |
396 | adc $Ehi,$Ehi,$t1 | ||
363 | str $Ehi,[$ctx,#$Eoff+$hi] | 397 | str $Ehi,[$ctx,#$Eoff+$hi] |
398 | adds $t2,$Tlo,$t2 | ||
364 | str $t2, [$ctx,#$Foff+$lo] | 399 | str $t2, [$ctx,#$Foff+$lo] |
400 | adc $t3,$Thi,$t3 | ||
365 | str $t3, [$ctx,#$Foff+$hi] | 401 | str $t3, [$ctx,#$Foff+$hi] |
366 | 402 | ||
367 | ldr $Alo,[sp,#$Goff+0] | 403 | ldr $Alo,[sp,#$Goff+0] |
@@ -373,12 +409,12 @@ $code.=<<___; | |||
373 | ldr $t2, [$ctx,#$Hoff+$lo] | 409 | ldr $t2, [$ctx,#$Hoff+$lo] |
374 | ldr $t3, [$ctx,#$Hoff+$hi] | 410 | ldr $t3, [$ctx,#$Hoff+$hi] |
375 | adds $t0,$Alo,$t0 | 411 | adds $t0,$Alo,$t0 |
376 | adc $t1,$Ahi,$t1 | ||
377 | adds $t2,$Tlo,$t2 | ||
378 | adc $t3,$Thi,$t3 | ||
379 | str $t0, [$ctx,#$Goff+$lo] | 412 | str $t0, [$ctx,#$Goff+$lo] |
413 | adc $t1,$Ahi,$t1 | ||
380 | str $t1, [$ctx,#$Goff+$hi] | 414 | str $t1, [$ctx,#$Goff+$hi] |
415 | adds $t2,$Tlo,$t2 | ||
381 | str $t2, [$ctx,#$Hoff+$lo] | 416 | str $t2, [$ctx,#$Hoff+$lo] |
417 | adc $t3,$Thi,$t3 | ||
382 | str $t3, [$ctx,#$Hoff+$hi] | 418 | str $t3, [$ctx,#$Hoff+$hi] |
383 | 419 | ||
384 | add sp,sp,#640 | 420 | add sp,sp,#640 |
@@ -388,13 +424,156 @@ $code.=<<___; | |||
388 | bne .Loop | 424 | bne .Loop |
389 | 425 | ||
390 | add sp,sp,#8*9 @ destroy frame | 426 | add sp,sp,#8*9 @ destroy frame |
427 | #if __ARM_ARCH__>=5 | ||
428 | ldmia sp!,{r4-r12,pc} | ||
429 | #else | ||
391 | ldmia sp!,{r4-r12,lr} | 430 | ldmia sp!,{r4-r12,lr} |
392 | tst lr,#1 | 431 | tst lr,#1 |
393 | moveq pc,lr @ be binary compatible with V4, yet | 432 | moveq pc,lr @ be binary compatible with V4, yet |
394 | bx lr @ interoperable with Thumb ISA:-) | 433 | bx lr @ interoperable with Thumb ISA:-) |
395 | .size sha512_block_data_order,.-sha512_block_data_order | 434 | #endif |
396 | .asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | 435 | ___ |
436 | |||
437 | { | ||
438 | my @Sigma0=(28,34,39); | ||
439 | my @Sigma1=(14,18,41); | ||
440 | my @sigma0=(1, 8, 7); | ||
441 | my @sigma1=(19,61,6); | ||
442 | |||
443 | my $Ktbl="r3"; | ||
444 | my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch | ||
445 | |||
446 | my @X=map("d$_",(0..15)); | ||
447 | my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); | ||
448 | |||
449 | sub NEON_00_15() { | ||
450 | my $i=shift; | ||
451 | my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
452 | my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps | ||
453 | |||
454 | $code.=<<___ if ($i<16 || $i&1); | ||
455 | vshr.u64 $t0,$e,#@Sigma1[0] @ $i | ||
456 | #if $i<16 | ||
457 | vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned | ||
458 | #endif | ||
459 | vshr.u64 $t1,$e,#@Sigma1[1] | ||
460 | vshr.u64 $t2,$e,#@Sigma1[2] | ||
461 | ___ | ||
462 | $code.=<<___; | ||
463 | vld1.64 {$K},[$Ktbl,:64]! @ K[i++] | ||
464 | vsli.64 $t0,$e,#`64-@Sigma1[0]` | ||
465 | vsli.64 $t1,$e,#`64-@Sigma1[1]` | ||
466 | vsli.64 $t2,$e,#`64-@Sigma1[2]` | ||
467 | #if $i<16 && defined(__ARMEL__) | ||
468 | vrev64.8 @X[$i],@X[$i] | ||
469 | #endif | ||
470 | vadd.i64 $T1,$K,$h | ||
471 | veor $Ch,$f,$g | ||
472 | veor $t0,$t1 | ||
473 | vand $Ch,$e | ||
474 | veor $t0,$t2 @ Sigma1(e) | ||
475 | veor $Ch,$g @ Ch(e,f,g) | ||
476 | vadd.i64 $T1,$t0 | ||
477 | vshr.u64 $t0,$a,#@Sigma0[0] | ||
478 | vadd.i64 $T1,$Ch | ||
479 | vshr.u64 $t1,$a,#@Sigma0[1] | ||
480 | vshr.u64 $t2,$a,#@Sigma0[2] | ||
481 | vsli.64 $t0,$a,#`64-@Sigma0[0]` | ||
482 | vsli.64 $t1,$a,#`64-@Sigma0[1]` | ||
483 | vsli.64 $t2,$a,#`64-@Sigma0[2]` | ||
484 | vadd.i64 $T1,@X[$i%16] | ||
485 | vorr $Maj,$a,$c | ||
486 | vand $Ch,$a,$c | ||
487 | veor $h,$t0,$t1 | ||
488 | vand $Maj,$b | ||
489 | veor $h,$t2 @ Sigma0(a) | ||
490 | vorr $Maj,$Ch @ Maj(a,b,c) | ||
491 | vadd.i64 $h,$T1 | ||
492 | vadd.i64 $d,$T1 | ||
493 | vadd.i64 $h,$Maj | ||
494 | ___ | ||
495 | } | ||
496 | |||
497 | sub NEON_16_79() { | ||
498 | my $i=shift; | ||
499 | |||
500 | if ($i&1) { &NEON_00_15($i,@_); return; } | ||
501 | |||
502 | # 2x-vectorized, therefore runs every 2nd round | ||
503 | my @X=map("q$_",(0..7)); # view @X as 128-bit vector | ||
504 | my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps | ||
505 | my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 | ||
506 | my $e=@_[4]; # $e from NEON_00_15 | ||
507 | $i /= 2; | ||
508 | $code.=<<___; | ||
509 | vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] | ||
510 | vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] | ||
511 | vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] | ||
512 | vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` | ||
513 | vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] | ||
514 | vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` | ||
515 | veor $s1,$t0 | ||
516 | vshr.u64 $t0,$s0,#@sigma0[0] | ||
517 | veor $s1,$t1 @ sigma1(X[i+14]) | ||
518 | vshr.u64 $t1,$s0,#@sigma0[1] | ||
519 | vadd.i64 @X[$i%8],$s1 | ||
520 | vshr.u64 $s1,$s0,#@sigma0[2] | ||
521 | vsli.64 $t0,$s0,#`64-@sigma0[0]` | ||
522 | vsli.64 $t1,$s0,#`64-@sigma0[1]` | ||
523 | vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] | ||
524 | veor $s1,$t0 | ||
525 | vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 | ||
526 | vadd.i64 @X[$i%8],$s0 | ||
527 | vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 | ||
528 | veor $s1,$t1 @ sigma0(X[i+1]) | ||
529 | vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 | ||
530 | vadd.i64 @X[$i%8],$s1 | ||
531 | ___ | ||
532 | &NEON_00_15(2*$i,@_); | ||
533 | } | ||
534 | |||
535 | $code.=<<___; | ||
536 | #if __ARM_ARCH__>=7 | ||
537 | .fpu neon | ||
538 | |||
539 | .align 4 | ||
540 | .LNEON: | ||
541 | dmb @ errata #451034 on early Cortex A8 | ||
542 | vstmdb sp!,{d8-d15} @ ABI specification says so | ||
543 | sub $Ktbl,r3,#672 @ K512 | ||
544 | vldmia $ctx,{$A-$H} @ load context | ||
545 | .Loop_neon: | ||
546 | ___ | ||
547 | for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } | ||
548 | $code.=<<___; | ||
549 | mov $cnt,#4 | ||
550 | .L16_79_neon: | ||
551 | subs $cnt,#1 | ||
552 | ___ | ||
553 | for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } | ||
554 | $code.=<<___; | ||
555 | bne .L16_79_neon | ||
556 | |||
557 | vldmia $ctx,{d24-d31} @ load context to temp | ||
558 | vadd.i64 q8,q12 @ vectorized accumulate | ||
559 | vadd.i64 q9,q13 | ||
560 | vadd.i64 q10,q14 | ||
561 | vadd.i64 q11,q15 | ||
562 | vstmia $ctx,{$A-$H} @ save context | ||
563 | teq $inp,$len | ||
564 | sub $Ktbl,#640 @ rewind K512 | ||
565 | bne .Loop_neon | ||
566 | |||
567 | vldmia sp!,{d8-d15} @ epilogue | ||
568 | bx lr | ||
569 | #endif | ||
570 | ___ | ||
571 | } | ||
572 | $code.=<<___; | ||
573 | .size sha512_block_data_order,.-sha512_block_data_order | ||
574 | .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
397 | .align 2 | 575 | .align 2 |
576 | .comm OPENSSL_armcap_P,4,4 | ||
398 | ___ | 577 | ___ |
399 | 578 | ||
400 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 579 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/sha/asm/sha512-mips.pl b/src/lib/libcrypto/sha/asm/sha512-mips.pl new file mode 100644 index 0000000000..ba5b250890 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha512-mips.pl | |||
@@ -0,0 +1,455 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # SHA2 block procedures for MIPS. | ||
11 | |||
12 | # October 2010. | ||
13 | # | ||
14 | # SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc- | ||
15 | # generated code in o32 build and ~55% in n32/64 build. SHA512 [which | ||
16 | # for now can only be compiled for MIPS64 ISA] improvement is modest | ||
17 | # ~17%, but it comes for free, because it's same instruction sequence. | ||
18 | # Improvement coefficients are for aligned input. | ||
19 | |||
20 | ###################################################################### | ||
21 | # There is a number of MIPS ABI in use, O32 and N32/64 are most | ||
22 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
23 | # one picks the latter, it's possible to arrange code in ABI neutral | ||
24 | # manner. Therefore let's stick to NUBI register layout: | ||
25 | # | ||
26 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
27 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
28 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
29 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
30 | # | ||
31 | # The return value is placed in $a0. Following coding rules facilitate | ||
32 | # interoperability: | ||
33 | # | ||
34 | # - never ever touch $tp, "thread pointer", former $gp [o32 can be | ||
35 | # excluded from the rule, because it's specified volatile]; | ||
36 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
37 | # old code]; | ||
38 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
39 | # | ||
40 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
41 | # | ||
42 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
43 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
44 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
45 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
46 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
47 | # | ||
48 | $flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
49 | |||
50 | if ($flavour =~ /64|n32/i) { | ||
51 | $PTR_ADD="dadd"; # incidentally works even on n32 | ||
52 | $PTR_SUB="dsub"; # incidentally works even on n32 | ||
53 | $REG_S="sd"; | ||
54 | $REG_L="ld"; | ||
55 | $PTR_SLL="dsll"; # incidentally works even on n32 | ||
56 | $SZREG=8; | ||
57 | } else { | ||
58 | $PTR_ADD="add"; | ||
59 | $PTR_SUB="sub"; | ||
60 | $REG_S="sw"; | ||
61 | $REG_L="lw"; | ||
62 | $PTR_SLL="sll"; | ||
63 | $SZREG=4; | ||
64 | } | ||
65 | $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; | ||
66 | # | ||
67 | # <appro@openssl.org> | ||
68 | # | ||
69 | ###################################################################### | ||
70 | |||
71 | $big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; | ||
72 | |||
73 | for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } | ||
74 | open STDOUT,">$output"; | ||
75 | |||
76 | if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } | ||
77 | |||
78 | if ($output =~ /512/) { | ||
79 | $label="512"; | ||
80 | $SZ=8; | ||
81 | $LD="ld"; # load from memory | ||
82 | $ST="sd"; # store to memory | ||
83 | $SLL="dsll"; # shift left logical | ||
84 | $SRL="dsrl"; # shift right logical | ||
85 | $ADDU="daddu"; | ||
86 | @Sigma0=(28,34,39); | ||
87 | @Sigma1=(14,18,41); | ||
88 | @sigma0=( 7, 1, 8); # right shift first | ||
89 | @sigma1=( 6,19,61); # right shift first | ||
90 | $lastK=0x817; | ||
91 | $rounds=80; | ||
92 | } else { | ||
93 | $label="256"; | ||
94 | $SZ=4; | ||
95 | $LD="lw"; # load from memory | ||
96 | $ST="sw"; # store to memory | ||
97 | $SLL="sll"; # shift left logical | ||
98 | $SRL="srl"; # shift right logical | ||
99 | $ADDU="addu"; | ||
100 | @Sigma0=( 2,13,22); | ||
101 | @Sigma1=( 6,11,25); | ||
102 | @sigma0=( 3, 7,18); # right shift first | ||
103 | @sigma1=(10,17,19); # right shift first | ||
104 | $lastK=0x8f2; | ||
105 | $rounds=64; | ||
106 | } | ||
107 | |||
108 | $MSB = $big_endian ? 0 : ($SZ-1); | ||
109 | $LSB = ($SZ-1)&~$MSB; | ||
110 | |||
111 | @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31)); | ||
112 | @X=map("\$$_",(8..23)); | ||
113 | |||
114 | $ctx=$a0; | ||
115 | $inp=$a1; | ||
116 | $len=$a2; $Ktbl=$len; | ||
117 | |||
118 | sub BODY_00_15 { | ||
119 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
120 | my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]); | ||
121 | |||
122 | $code.=<<___ if ($i<15); | ||
123 | ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp) | ||
124 | ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp) | ||
125 | ___ | ||
126 | $code.=<<___ if (!$big_endian && $i<16 && $SZ==4); | ||
127 | srl $tmp0,@X[0],24 # byte swap($i) | ||
128 | srl $tmp1,@X[0],8 | ||
129 | andi $tmp2,@X[0],0xFF00 | ||
130 | sll @X[0],@X[0],24 | ||
131 | andi $tmp1,0xFF00 | ||
132 | sll $tmp2,$tmp2,8 | ||
133 | or @X[0],$tmp0 | ||
134 | or $tmp1,$tmp2 | ||
135 | or @X[0],$tmp1 | ||
136 | ___ | ||
137 | $code.=<<___ if (!$big_endian && $i<16 && $SZ==8); | ||
138 | ori $tmp0,$zero,0xFF | ||
139 | dsll $tmp2,$tmp0,32 | ||
140 | or $tmp0,$tmp2 # 0x000000FF000000FF | ||
141 | and $tmp1,@X[0],$tmp0 # byte swap($i) | ||
142 | dsrl $tmp2,@X[0],24 | ||
143 | dsll $tmp1,24 | ||
144 | and $tmp2,$tmp0 | ||
145 | dsll $tmp0,8 # 0x0000FF000000FF00 | ||
146 | or $tmp1,$tmp2 | ||
147 | and $tmp2,@X[0],$tmp0 | ||
148 | dsrl @X[0],8 | ||
149 | dsll $tmp2,8 | ||
150 | and @X[0],$tmp0 | ||
151 | or $tmp1,$tmp2 | ||
152 | or @X[0],$tmp1 | ||
153 | dsrl $tmp1,@X[0],32 | ||
154 | dsll @X[0],32 | ||
155 | or @X[0],$tmp1 | ||
156 | ___ | ||
157 | $code.=<<___; | ||
158 | $ADDU $T1,$X[0],$h # $i | ||
159 | $SRL $h,$e,@Sigma1[0] | ||
160 | xor $tmp2,$f,$g | ||
161 | $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]` | ||
162 | and $tmp2,$e | ||
163 | $SRL $tmp0,$e,@Sigma1[1] | ||
164 | xor $h,$tmp1 | ||
165 | $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]` | ||
166 | xor $h,$tmp0 | ||
167 | $SRL $tmp0,$e,@Sigma1[2] | ||
168 | xor $h,$tmp1 | ||
169 | $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]` | ||
170 | xor $h,$tmp0 | ||
171 | xor $tmp2,$g # Ch(e,f,g) | ||
172 | xor $tmp0,$tmp1,$h # Sigma1(e) | ||
173 | |||
174 | $SRL $h,$a,@Sigma0[0] | ||
175 | $ADDU $T1,$tmp2 | ||
176 | $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i] | ||
177 | $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]` | ||
178 | $ADDU $T1,$tmp0 | ||
179 | $SRL $tmp0,$a,@Sigma0[1] | ||
180 | xor $h,$tmp1 | ||
181 | $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]` | ||
182 | xor $h,$tmp0 | ||
183 | $SRL $tmp0,$a,@Sigma0[2] | ||
184 | xor $h,$tmp1 | ||
185 | $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]` | ||
186 | xor $h,$tmp0 | ||
187 | $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer | ||
188 | xor $h,$tmp1 # Sigma0(a) | ||
189 | |||
190 | or $tmp0,$a,$b | ||
191 | and $tmp1,$a,$b | ||
192 | and $tmp0,$c | ||
193 | or $tmp1,$tmp0 # Maj(a,b,c) | ||
194 | $ADDU $T1,$tmp2 # +=K[$i] | ||
195 | $ADDU $h,$tmp1 | ||
196 | |||
197 | $ADDU $d,$T1 | ||
198 | $ADDU $h,$T1 | ||
199 | ___ | ||
200 | $code.=<<___ if ($i>=13); | ||
201 | $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer | ||
202 | ___ | ||
203 | } | ||
204 | |||
205 | sub BODY_16_XX { | ||
206 | my $i=@_[0]; | ||
207 | my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]); | ||
208 | |||
209 | $code.=<<___; | ||
210 | $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i) | ||
211 | $ADDU @X[0],@X[9] # +=X[i+9] | ||
212 | $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]` | ||
213 | $SRL $tmp0,@X[1],@sigma0[1] | ||
214 | xor $tmp2,$tmp1 | ||
215 | $SLL $tmp1,`@sigma0[2]-@sigma0[1]` | ||
216 | xor $tmp2,$tmp0 | ||
217 | $SRL $tmp0,@X[1],@sigma0[2] | ||
218 | xor $tmp2,$tmp1 | ||
219 | |||
220 | $SRL $tmp3,@X[14],@sigma1[0] | ||
221 | xor $tmp2,$tmp0 # sigma0(X[i+1]) | ||
222 | $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]` | ||
223 | $ADDU @X[0],$tmp2 | ||
224 | $SRL $tmp0,@X[14],@sigma1[1] | ||
225 | xor $tmp3,$tmp1 | ||
226 | $SLL $tmp1,`@sigma1[2]-@sigma1[1]` | ||
227 | xor $tmp3,$tmp0 | ||
228 | $SRL $tmp0,@X[14],@sigma1[2] | ||
229 | xor $tmp3,$tmp1 | ||
230 | |||
231 | xor $tmp3,$tmp0 # sigma1(X[i+14]) | ||
232 | $ADDU @X[0],$tmp3 | ||
233 | ___ | ||
234 | &BODY_00_15(@_); | ||
235 | } | ||
236 | |||
237 | $FRAMESIZE=16*$SZ+16*$SZREG; | ||
238 | $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; | ||
239 | |||
240 | $code.=<<___; | ||
241 | #ifdef OPENSSL_FIPSCANISTER | ||
242 | # include <openssl/fipssyms.h> | ||
243 | #endif | ||
244 | |||
245 | .text | ||
246 | .set noat | ||
247 | #if !defined(__vxworks) || defined(__pic__) | ||
248 | .option pic2 | ||
249 | #endif | ||
250 | |||
251 | .align 5 | ||
252 | .globl sha${label}_block_data_order | ||
253 | .ent sha${label}_block_data_order | ||
254 | sha${label}_block_data_order: | ||
255 | .frame $sp,$FRAMESIZE,$ra | ||
256 | .mask $SAVED_REGS_MASK,-$SZREG | ||
257 | .set noreorder | ||
258 | ___ | ||
259 | $code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification | ||
260 | .cpload $pf | ||
261 | ___ | ||
262 | $code.=<<___; | ||
263 | $PTR_SUB $sp,$FRAMESIZE | ||
264 | $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) | ||
265 | $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) | ||
266 | $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) | ||
267 | $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) | ||
268 | $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) | ||
269 | $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) | ||
270 | $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) | ||
271 | $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) | ||
272 | $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) | ||
273 | $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) | ||
274 | ___ | ||
275 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
276 | $REG_S $s3,$FRAMESIZE-11*$SZREG($sp) | ||
277 | $REG_S $s2,$FRAMESIZE-12*$SZREG($sp) | ||
278 | $REG_S $s1,$FRAMESIZE-13*$SZREG($sp) | ||
279 | $REG_S $s0,$FRAMESIZE-14*$SZREG($sp) | ||
280 | $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) | ||
281 | ___ | ||
282 | $code.=<<___; | ||
283 | $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)` | ||
284 | ___ | ||
285 | $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification | ||
286 | .cplocal $Ktbl | ||
287 | .cpsetup $pf,$zero,sha${label}_block_data_order | ||
288 | ___ | ||
289 | $code.=<<___; | ||
290 | .set reorder | ||
291 | la $Ktbl,K${label} # PIC-ified 'load address' | ||
292 | |||
293 | $LD $A,0*$SZ($ctx) # load context | ||
294 | $LD $B,1*$SZ($ctx) | ||
295 | $LD $C,2*$SZ($ctx) | ||
296 | $LD $D,3*$SZ($ctx) | ||
297 | $LD $E,4*$SZ($ctx) | ||
298 | $LD $F,5*$SZ($ctx) | ||
299 | $LD $G,6*$SZ($ctx) | ||
300 | $LD $H,7*$SZ($ctx) | ||
301 | |||
302 | $PTR_ADD @X[15],$inp # pointer to the end of input | ||
303 | $REG_S @X[15],16*$SZ($sp) | ||
304 | b .Loop | ||
305 | |||
306 | .align 5 | ||
307 | .Loop: | ||
308 | ${LD}l @X[0],$MSB($inp) | ||
309 | ${LD}r @X[0],$LSB($inp) | ||
310 | ___ | ||
311 | for ($i=0;$i<16;$i++) | ||
312 | { &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } | ||
313 | $code.=<<___; | ||
314 | b .L16_xx | ||
315 | .align 4 | ||
316 | .L16_xx: | ||
317 | ___ | ||
318 | for (;$i<32;$i++) | ||
319 | { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } | ||
320 | $code.=<<___; | ||
321 | and @X[6],0xfff | ||
322 | li @X[7],$lastK | ||
323 | .set noreorder | ||
324 | bne @X[6],@X[7],.L16_xx | ||
325 | $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16 | ||
326 | |||
327 | $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input | ||
328 | $LD @X[0],0*$SZ($ctx) | ||
329 | $LD @X[1],1*$SZ($ctx) | ||
330 | $LD @X[2],2*$SZ($ctx) | ||
331 | $PTR_ADD $inp,16*$SZ | ||
332 | $LD @X[3],3*$SZ($ctx) | ||
333 | $ADDU $A,@X[0] | ||
334 | $LD @X[4],4*$SZ($ctx) | ||
335 | $ADDU $B,@X[1] | ||
336 | $LD @X[5],5*$SZ($ctx) | ||
337 | $ADDU $C,@X[2] | ||
338 | $LD @X[6],6*$SZ($ctx) | ||
339 | $ADDU $D,@X[3] | ||
340 | $LD @X[7],7*$SZ($ctx) | ||
341 | $ADDU $E,@X[4] | ||
342 | $ST $A,0*$SZ($ctx) | ||
343 | $ADDU $F,@X[5] | ||
344 | $ST $B,1*$SZ($ctx) | ||
345 | $ADDU $G,@X[6] | ||
346 | $ST $C,2*$SZ($ctx) | ||
347 | $ADDU $H,@X[7] | ||
348 | $ST $D,3*$SZ($ctx) | ||
349 | $ST $E,4*$SZ($ctx) | ||
350 | $ST $F,5*$SZ($ctx) | ||
351 | $ST $G,6*$SZ($ctx) | ||
352 | $ST $H,7*$SZ($ctx) | ||
353 | |||
354 | bnel $inp,@X[15],.Loop | ||
355 | $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl | ||
356 | |||
357 | $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) | ||
358 | $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) | ||
359 | $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) | ||
360 | $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) | ||
361 | $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) | ||
362 | $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) | ||
363 | $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) | ||
364 | $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) | ||
365 | $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) | ||
366 | $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) | ||
367 | ___ | ||
368 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
369 | $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) | ||
370 | $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) | ||
371 | $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) | ||
372 | $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) | ||
373 | $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) | ||
374 | ___ | ||
375 | $code.=<<___; | ||
376 | jr $ra | ||
377 | $PTR_ADD $sp,$FRAMESIZE | ||
378 | .end sha${label}_block_data_order | ||
379 | |||
380 | .rdata | ||
381 | .align 5 | ||
382 | K${label}: | ||
383 | ___ | ||
384 | if ($SZ==4) { | ||
385 | $code.=<<___; | ||
386 | .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 | ||
387 | .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 | ||
388 | .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 | ||
389 | .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 | ||
390 | .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc | ||
391 | .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da | ||
392 | .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 | ||
393 | .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 | ||
394 | .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 | ||
395 | .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 | ||
396 | .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 | ||
397 | .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 | ||
398 | .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 | ||
399 | .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 | ||
400 | .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 | ||
401 | .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | ||
402 | ___ | ||
403 | } else { | ||
404 | $code.=<<___; | ||
405 | .dword 0x428a2f98d728ae22, 0x7137449123ef65cd | ||
406 | .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc | ||
407 | .dword 0x3956c25bf348b538, 0x59f111f1b605d019 | ||
408 | .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 | ||
409 | .dword 0xd807aa98a3030242, 0x12835b0145706fbe | ||
410 | .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 | ||
411 | .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 | ||
412 | .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 | ||
413 | .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 | ||
414 | .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 | ||
415 | .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 | ||
416 | .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 | ||
417 | .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 | ||
418 | .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 | ||
419 | .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 | ||
420 | .dword 0x06ca6351e003826f, 0x142929670a0e6e70 | ||
421 | .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 | ||
422 | .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df | ||
423 | .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 | ||
424 | .dword 0x81c2c92e47edaee6, 0x92722c851482353b | ||
425 | .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 | ||
426 | .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 | ||
427 | .dword 0xd192e819d6ef5218, 0xd69906245565a910 | ||
428 | .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 | ||
429 | .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 | ||
430 | .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 | ||
431 | .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb | ||
432 | .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 | ||
433 | .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 | ||
434 | .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec | ||
435 | .dword 0x90befffa23631e28, 0xa4506cebde82bde9 | ||
436 | .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b | ||
437 | .dword 0xca273eceea26619c, 0xd186b8c721c0c207 | ||
438 | .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 | ||
439 | .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 | ||
440 | .dword 0x113f9804bef90dae, 0x1b710b35131c471b | ||
441 | .dword 0x28db77f523047d84, 0x32caab7b40c72493 | ||
442 | .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c | ||
443 | .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a | ||
444 | .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 | ||
445 | ___ | ||
446 | } | ||
447 | $code.=<<___; | ||
448 | .asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>" | ||
449 | .align 5 | ||
450 | |||
451 | ___ | ||
452 | |||
453 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
454 | print $code; | ||
455 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha512-parisc.pl b/src/lib/libcrypto/sha/asm/sha512-parisc.pl new file mode 100755 index 0000000000..e24ee58ae9 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha512-parisc.pl | |||
@@ -0,0 +1,791 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # SHA256/512 block procedure for PA-RISC. | ||
11 | |||
12 | # June 2009. | ||
13 | # | ||
14 | # SHA256 performance is >75% better than gcc 3.2 generated code on | ||
15 | # PA-7100LC. Compared to code generated by vendor compiler this | ||
16 | # implementation is almost 70% faster in 64-bit build, but delivers | ||
17 | # virtually same performance in 32-bit build on PA-8600. | ||
18 | # | ||
19 | # SHA512 performance is >2.9x better than gcc 3.2 generated code on | ||
20 | # PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the | ||
21 | # code is executed on PA-RISC 2.0 processor and switches to 64-bit | ||
22 | # code path delivering adequate peformance even in "blended" 32-bit | ||
23 | # build. Though 64-bit code is not any faster than code generated by | ||
24 | # vendor compiler on PA-8600... | ||
25 | # | ||
26 | # Special thanks to polarhome.com for providing HP-UX account. | ||
27 | |||
28 | $flavour = shift; | ||
29 | $output = shift; | ||
30 | open STDOUT,">$output"; | ||
31 | |||
32 | if ($flavour =~ /64/) { | ||
33 | $LEVEL ="2.0W"; | ||
34 | $SIZE_T =8; | ||
35 | $FRAME_MARKER =80; | ||
36 | $SAVED_RP =16; | ||
37 | $PUSH ="std"; | ||
38 | $PUSHMA ="std,ma"; | ||
39 | $POP ="ldd"; | ||
40 | $POPMB ="ldd,mb"; | ||
41 | } else { | ||
42 | $LEVEL ="1.0"; | ||
43 | $SIZE_T =4; | ||
44 | $FRAME_MARKER =48; | ||
45 | $SAVED_RP =20; | ||
46 | $PUSH ="stw"; | ||
47 | $PUSHMA ="stwm"; | ||
48 | $POP ="ldw"; | ||
49 | $POPMB ="ldwm"; | ||
50 | } | ||
51 | |||
52 | if ($output =~ /512/) { | ||
53 | $func="sha512_block_data_order"; | ||
54 | $SZ=8; | ||
55 | @Sigma0=(28,34,39); | ||
56 | @Sigma1=(14,18,41); | ||
57 | @sigma0=(1, 8, 7); | ||
58 | @sigma1=(19,61, 6); | ||
59 | $rounds=80; | ||
60 | $LAST10BITS=0x017; | ||
61 | $LD="ldd"; | ||
62 | $LDM="ldd,ma"; | ||
63 | $ST="std"; | ||
64 | } else { | ||
65 | $func="sha256_block_data_order"; | ||
66 | $SZ=4; | ||
67 | @Sigma0=( 2,13,22); | ||
68 | @Sigma1=( 6,11,25); | ||
69 | @sigma0=( 7,18, 3); | ||
70 | @sigma1=(17,19,10); | ||
71 | $rounds=64; | ||
72 | $LAST10BITS=0x0f2; | ||
73 | $LD="ldw"; | ||
74 | $LDM="ldwm"; | ||
75 | $ST="stw"; | ||
76 | } | ||
77 | |||
78 | $FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker | ||
79 | # [+ argument transfer] | ||
80 | $XOFF=16*$SZ+32; # local variables | ||
81 | $FRAME+=$XOFF; | ||
82 | $XOFF+=$FRAME_MARKER; # distance between %sp and local variables | ||
83 | |||
84 | $ctx="%r26"; # zapped by $a0 | ||
85 | $inp="%r25"; # zapped by $a1 | ||
86 | $num="%r24"; # zapped by $t0 | ||
87 | |||
88 | $a0 ="%r26"; | ||
89 | $a1 ="%r25"; | ||
90 | $t0 ="%r24"; | ||
91 | $t1 ="%r29"; | ||
92 | $Tbl="%r31"; | ||
93 | |||
94 | @V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28"); | ||
95 | |||
96 | @X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", | ||
97 | "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp); | ||
98 | |||
99 | sub ROUND_00_15 { | ||
100 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
101 | $code.=<<___; | ||
102 | _ror $e,$Sigma1[0],$a0 | ||
103 | and $f,$e,$t0 | ||
104 | _ror $e,$Sigma1[1],$a1 | ||
105 | addl $t1,$h,$h | ||
106 | andcm $g,$e,$t1 | ||
107 | xor $a1,$a0,$a0 | ||
108 | _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1 | ||
109 | or $t0,$t1,$t1 ; Ch(e,f,g) | ||
110 | addl @X[$i%16],$h,$h | ||
111 | xor $a0,$a1,$a1 ; Sigma1(e) | ||
112 | addl $t1,$h,$h | ||
113 | _ror $a,$Sigma0[0],$a0 | ||
114 | addl $a1,$h,$h | ||
115 | |||
116 | _ror $a,$Sigma0[1],$a1 | ||
117 | and $a,$b,$t0 | ||
118 | and $a,$c,$t1 | ||
119 | xor $a1,$a0,$a0 | ||
120 | _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1 | ||
121 | xor $t1,$t0,$t0 | ||
122 | and $b,$c,$t1 | ||
123 | xor $a0,$a1,$a1 ; Sigma0(a) | ||
124 | addl $h,$d,$d | ||
125 | xor $t1,$t0,$t0 ; Maj(a,b,c) | ||
126 | `"$LDM $SZ($Tbl),$t1" if ($i<15)` | ||
127 | addl $a1,$h,$h | ||
128 | addl $t0,$h,$h | ||
129 | |||
130 | ___ | ||
131 | } | ||
132 | |||
133 | sub ROUND_16_xx { | ||
134 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; | ||
135 | $i-=16; | ||
136 | $code.=<<___; | ||
137 | _ror @X[($i+1)%16],$sigma0[0],$a0 | ||
138 | _ror @X[($i+1)%16],$sigma0[1],$a1 | ||
139 | addl @X[($i+9)%16],@X[$i],@X[$i] | ||
140 | _ror @X[($i+14)%16],$sigma1[0],$t0 | ||
141 | _ror @X[($i+14)%16],$sigma1[1],$t1 | ||
142 | xor $a1,$a0,$a0 | ||
143 | _shr @X[($i+1)%16],$sigma0[2],$a1 | ||
144 | xor $t1,$t0,$t0 | ||
145 | _shr @X[($i+14)%16],$sigma1[2],$t1 | ||
146 | xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f]) | ||
147 | xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f]) | ||
148 | $LDM $SZ($Tbl),$t1 | ||
149 | addl $a0,@X[$i],@X[$i] | ||
150 | addl $t0,@X[$i],@X[$i] | ||
151 | ___ | ||
152 | $code.=<<___ if ($i==15); | ||
153 | extru $t1,31,10,$a1 | ||
154 | comiclr,<> $LAST10BITS,$a1,%r0 | ||
155 | ldo 1($Tbl),$Tbl ; signal end of $Tbl | ||
156 | ___ | ||
157 | &ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h); | ||
158 | } | ||
159 | |||
160 | $code=<<___; | ||
161 | .LEVEL $LEVEL | ||
162 | .SPACE \$TEXT\$ | ||
163 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
164 | |||
165 | .ALIGN 64 | ||
166 | L\$table | ||
167 | ___ | ||
168 | $code.=<<___ if ($SZ==8); | ||
169 | .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd | ||
170 | .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc | ||
171 | .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019 | ||
172 | .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118 | ||
173 | .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe | ||
174 | .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2 | ||
175 | .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1 | ||
176 | .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694 | ||
177 | .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3 | ||
178 | .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65 | ||
179 | .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483 | ||
180 | .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5 | ||
181 | .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210 | ||
182 | .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4 | ||
183 | .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725 | ||
184 | .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70 | ||
185 | .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926 | ||
186 | .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df | ||
187 | .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8 | ||
188 | .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b | ||
189 | .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001 | ||
190 | .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30 | ||
191 | .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910 | ||
192 | .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8 | ||
193 | .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53 | ||
194 | .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8 | ||
195 | .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb | ||
196 | .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3 | ||
197 | .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60 | ||
198 | .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec | ||
199 | .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9 | ||
200 | .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b | ||
201 | .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207 | ||
202 | .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178 | ||
203 | .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6 | ||
204 | .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b | ||
205 | .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493 | ||
206 | .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c | ||
207 | .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a | ||
208 | .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817 | ||
209 | ___ | ||
210 | $code.=<<___ if ($SZ==4); | ||
211 | .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
212 | .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
213 | .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
214 | .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
215 | .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
216 | .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
217 | .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
218 | .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
219 | .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
220 | .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
221 | .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
222 | .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
223 | .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
224 | .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
225 | .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
226 | .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
227 | ___ | ||
228 | $code.=<<___; | ||
229 | |||
230 | .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
231 | .ALIGN 64 | ||
232 | $func | ||
233 | .PROC | ||
234 | .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 | ||
235 | .ENTRY | ||
236 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
237 | $PUSHMA %r3,$FRAME(%sp) | ||
238 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
239 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
240 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
241 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
242 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
243 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
244 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
245 | $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) | ||
246 | $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) | ||
247 | $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) | ||
248 | $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) | ||
249 | $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) | ||
250 | $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) | ||
251 | $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) | ||
252 | $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) | ||
253 | |||
254 | _shl $num,`log(16*$SZ)/log(2)`,$num | ||
255 | addl $inp,$num,$num ; $num to point at the end of $inp | ||
256 | |||
257 | $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments | ||
258 | $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) | ||
259 | $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp) | ||
260 | |||
261 | blr %r0,$Tbl | ||
262 | ldi 3,$t1 | ||
263 | L\$pic | ||
264 | andcm $Tbl,$t1,$Tbl ; wipe privilege level | ||
265 | ldo L\$table-L\$pic($Tbl),$Tbl | ||
266 | ___ | ||
267 | $code.=<<___ if ($SZ==8 && $SIZE_T==4); | ||
268 | ldi 31,$t1 | ||
269 | mtctl $t1,%cr11 | ||
270 | extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0 | ||
271 | b L\$parisc1 | ||
272 | nop | ||
273 | ___ | ||
274 | $code.=<<___; | ||
275 | $LD `0*$SZ`($ctx),$A ; load context | ||
276 | $LD `1*$SZ`($ctx),$B | ||
277 | $LD `2*$SZ`($ctx),$C | ||
278 | $LD `3*$SZ`($ctx),$D | ||
279 | $LD `4*$SZ`($ctx),$E | ||
280 | $LD `5*$SZ`($ctx),$F | ||
281 | $LD `6*$SZ`($ctx),$G | ||
282 | $LD `7*$SZ`($ctx),$H | ||
283 | |||
284 | extru $inp,31,`log($SZ)/log(2)`,$t0 | ||
285 | sh3addl $t0,%r0,$t0 | ||
286 | subi `8*$SZ`,$t0,$t0 | ||
287 | mtctl $t0,%cr11 ; load %sar with align factor | ||
288 | |||
289 | L\$oop | ||
290 | ldi `$SZ-1`,$t0 | ||
291 | $LDM $SZ($Tbl),$t1 | ||
292 | andcm $inp,$t0,$t0 ; align $inp | ||
293 | ___ | ||
294 | for ($i=0;$i<15;$i++) { # load input block | ||
295 | $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; } | ||
296 | $code.=<<___; | ||
297 | cmpb,*= $inp,$t0,L\$aligned | ||
298 | $LD `$SZ*15`($t0),@X[15] | ||
299 | $LD `$SZ*16`($t0),@X[16] | ||
300 | ___ | ||
301 | for ($i=0;$i<16;$i++) { # align data | ||
302 | $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; } | ||
303 | $code.=<<___; | ||
304 | L\$aligned | ||
305 | nop ; otherwise /usr/ccs/bin/as is confused by below .WORD | ||
306 | ___ | ||
307 | |||
308 | for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } | ||
309 | $code.=<<___; | ||
310 | L\$rounds | ||
311 | nop ; otherwise /usr/ccs/bin/as is confused by below .WORD | ||
312 | ___ | ||
313 | for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); } | ||
314 | $code.=<<___; | ||
315 | bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled? | ||
316 | nop | ||
317 | |||
318 | $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments | ||
319 | $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp | ||
320 | $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num | ||
321 | ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl | ||
322 | |||
323 | $LD `0*$SZ`($ctx),@X[0] ; load context | ||
324 | $LD `1*$SZ`($ctx),@X[1] | ||
325 | $LD `2*$SZ`($ctx),@X[2] | ||
326 | $LD `3*$SZ`($ctx),@X[3] | ||
327 | $LD `4*$SZ`($ctx),@X[4] | ||
328 | $LD `5*$SZ`($ctx),@X[5] | ||
329 | addl @X[0],$A,$A | ||
330 | $LD `6*$SZ`($ctx),@X[6] | ||
331 | addl @X[1],$B,$B | ||
332 | $LD `7*$SZ`($ctx),@X[7] | ||
333 | ldo `16*$SZ`($inp),$inp ; advance $inp | ||
334 | |||
335 | $ST $A,`0*$SZ`($ctx) ; save context | ||
336 | addl @X[2],$C,$C | ||
337 | $ST $B,`1*$SZ`($ctx) | ||
338 | addl @X[3],$D,$D | ||
339 | $ST $C,`2*$SZ`($ctx) | ||
340 | addl @X[4],$E,$E | ||
341 | $ST $D,`3*$SZ`($ctx) | ||
342 | addl @X[5],$F,$F | ||
343 | $ST $E,`4*$SZ`($ctx) | ||
344 | addl @X[6],$G,$G | ||
345 | $ST $F,`5*$SZ`($ctx) | ||
346 | addl @X[7],$H,$H | ||
347 | $ST $G,`6*$SZ`($ctx) | ||
348 | $ST $H,`7*$SZ`($ctx) | ||
349 | |||
350 | cmpb,*<>,n $inp,$num,L\$oop | ||
351 | $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp | ||
352 | ___ | ||
353 | if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0 | ||
354 | {{ | ||
355 | $code.=<<___; | ||
356 | b L\$done | ||
357 | nop | ||
358 | |||
359 | .ALIGN 64 | ||
360 | L\$parisc1 | ||
361 | ___ | ||
362 | |||
363 | @V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo, | ||
364 | $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) = | ||
365 | ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", | ||
366 | "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16"); | ||
367 | $a0 ="%r17"; | ||
368 | $a1 ="%r18"; | ||
369 | $a2 ="%r19"; | ||
370 | $a3 ="%r20"; | ||
371 | $t0 ="%r21"; | ||
372 | $t1 ="%r22"; | ||
373 | $t2 ="%r28"; | ||
374 | $t3 ="%r29"; | ||
375 | $Tbl="%r31"; | ||
376 | |||
377 | @X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx | ||
378 | |||
379 | sub ROUND_00_15_pa1 { | ||
380 | my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo, | ||
381 | $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_; | ||
382 | my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; | ||
383 | |||
384 | $code.=<<___ if (!$flag); | ||
385 | ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi | ||
386 | ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] | ||
387 | ___ | ||
388 | $code.=<<___; | ||
389 | shd $ehi,$elo,$Sigma1[0],$t0 | ||
390 | add $Xlo,$hlo,$hlo | ||
391 | shd $elo,$ehi,$Sigma1[0],$t1 | ||
392 | addc $Xhi,$hhi,$hhi ; h += X[i] | ||
393 | shd $ehi,$elo,$Sigma1[1],$t2 | ||
394 | ldwm 8($Tbl),$Xhi | ||
395 | shd $elo,$ehi,$Sigma1[1],$t3 | ||
396 | ldw -4($Tbl),$Xlo ; load K[i] | ||
397 | xor $t2,$t0,$t0 | ||
398 | xor $t3,$t1,$t1 | ||
399 | and $flo,$elo,$a0 | ||
400 | and $fhi,$ehi,$a1 | ||
401 | shd $ehi,$elo,$Sigma1[2],$t2 | ||
402 | andcm $glo,$elo,$a2 | ||
403 | shd $elo,$ehi,$Sigma1[2],$t3 | ||
404 | andcm $ghi,$ehi,$a3 | ||
405 | xor $t2,$t0,$t0 | ||
406 | xor $t3,$t1,$t1 ; Sigma1(e) | ||
407 | add $Xlo,$hlo,$hlo | ||
408 | xor $a2,$a0,$a0 | ||
409 | addc $Xhi,$hhi,$hhi ; h += K[i] | ||
410 | xor $a3,$a1,$a1 ; Ch(e,f,g) | ||
411 | |||
412 | add $t0,$hlo,$hlo | ||
413 | shd $ahi,$alo,$Sigma0[0],$t0 | ||
414 | addc $t1,$hhi,$hhi ; h += Sigma1(e) | ||
415 | shd $alo,$ahi,$Sigma0[0],$t1 | ||
416 | add $a0,$hlo,$hlo | ||
417 | shd $ahi,$alo,$Sigma0[1],$t2 | ||
418 | addc $a1,$hhi,$hhi ; h += Ch(e,f,g) | ||
419 | shd $alo,$ahi,$Sigma0[1],$t3 | ||
420 | |||
421 | xor $t2,$t0,$t0 | ||
422 | xor $t3,$t1,$t1 | ||
423 | shd $ahi,$alo,$Sigma0[2],$t2 | ||
424 | and $alo,$blo,$a0 | ||
425 | shd $alo,$ahi,$Sigma0[2],$t3 | ||
426 | and $ahi,$bhi,$a1 | ||
427 | xor $t2,$t0,$t0 | ||
428 | xor $t3,$t1,$t1 ; Sigma0(a) | ||
429 | |||
430 | and $alo,$clo,$a2 | ||
431 | and $ahi,$chi,$a3 | ||
432 | xor $a2,$a0,$a0 | ||
433 | add $hlo,$dlo,$dlo | ||
434 | xor $a3,$a1,$a1 | ||
435 | addc $hhi,$dhi,$dhi ; d += h | ||
436 | and $blo,$clo,$a2 | ||
437 | add $t0,$hlo,$hlo | ||
438 | and $bhi,$chi,$a3 | ||
439 | addc $t1,$hhi,$hhi ; h += Sigma0(a) | ||
440 | xor $a2,$a0,$a0 | ||
441 | add $a0,$hlo,$hlo | ||
442 | xor $a3,$a1,$a1 ; Maj(a,b,c) | ||
443 | addc $a1,$hhi,$hhi ; h += Maj(a,b,c) | ||
444 | |||
445 | ___ | ||
446 | $code.=<<___ if ($i==15 && $flag); | ||
447 | extru $Xlo,31,10,$Xlo | ||
448 | comiclr,= $LAST10BITS,$Xlo,%r0 | ||
449 | b L\$rounds_pa1 | ||
450 | nop | ||
451 | ___ | ||
452 | push(@X,shift(@X)); push(@X,shift(@X)); | ||
453 | } | ||
454 | |||
455 | sub ROUND_16_xx_pa1 { | ||
456 | my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; | ||
457 | my ($i)=shift; | ||
458 | $i-=16; | ||
459 | $code.=<<___; | ||
460 | ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi | ||
461 | ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] | ||
462 | ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1 | ||
463 | ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9] | ||
464 | ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3 | ||
465 | ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14] | ||
466 | shd $Xnhi,$Xnlo,$sigma0[0],$t0 | ||
467 | shd $Xnlo,$Xnhi,$sigma0[0],$t1 | ||
468 | add $a0,$Xlo,$Xlo | ||
469 | shd $Xnhi,$Xnlo,$sigma0[1],$t2 | ||
470 | addc $a1,$Xhi,$Xhi | ||
471 | shd $Xnlo,$Xnhi,$sigma0[1],$t3 | ||
472 | xor $t2,$t0,$t0 | ||
473 | shd $Xnhi,$Xnlo,$sigma0[2],$t2 | ||
474 | xor $t3,$t1,$t1 | ||
475 | extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3 | ||
476 | xor $t2,$t0,$t0 | ||
477 | shd $a3,$a2,$sigma1[0],$a0 | ||
478 | xor $t3,$t1,$t1 ; sigma0(X[i+1)&0x0f]) | ||
479 | shd $a2,$a3,$sigma1[0],$a1 | ||
480 | add $t0,$Xlo,$Xlo | ||
481 | shd $a3,$a2,$sigma1[1],$t2 | ||
482 | addc $t1,$Xhi,$Xhi | ||
483 | shd $a2,$a3,$sigma1[1],$t3 | ||
484 | xor $t2,$a0,$a0 | ||
485 | shd $a3,$a2,$sigma1[2],$t2 | ||
486 | xor $t3,$a1,$a1 | ||
487 | extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3 | ||
488 | xor $t2,$a0,$a0 | ||
489 | xor $t3,$a1,$a1 ; sigma0(X[i+14)&0x0f]) | ||
490 | add $a0,$Xlo,$Xlo | ||
491 | addc $a1,$Xhi,$Xhi | ||
492 | |||
493 | stw $Xhi,`-$XOFF+8*($i%16)`(%sp) | ||
494 | stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp) | ||
495 | ___ | ||
496 | &ROUND_00_15_pa1($i,@_,1); | ||
497 | } | ||
498 | $code.=<<___; | ||
499 | ldw `0*4`($ctx),$Ahi ; load context | ||
500 | ldw `1*4`($ctx),$Alo | ||
501 | ldw `2*4`($ctx),$Bhi | ||
502 | ldw `3*4`($ctx),$Blo | ||
503 | ldw `4*4`($ctx),$Chi | ||
504 | ldw `5*4`($ctx),$Clo | ||
505 | ldw `6*4`($ctx),$Dhi | ||
506 | ldw `7*4`($ctx),$Dlo | ||
507 | ldw `8*4`($ctx),$Ehi | ||
508 | ldw `9*4`($ctx),$Elo | ||
509 | ldw `10*4`($ctx),$Fhi | ||
510 | ldw `11*4`($ctx),$Flo | ||
511 | ldw `12*4`($ctx),$Ghi | ||
512 | ldw `13*4`($ctx),$Glo | ||
513 | ldw `14*4`($ctx),$Hhi | ||
514 | ldw `15*4`($ctx),$Hlo | ||
515 | |||
516 | extru $inp,31,2,$t0 | ||
517 | sh3addl $t0,%r0,$t0 | ||
518 | subi 32,$t0,$t0 | ||
519 | mtctl $t0,%cr11 ; load %sar with align factor | ||
520 | |||
521 | L\$oop_pa1 | ||
522 | extru $inp,31,2,$a3 | ||
523 | comib,= 0,$a3,L\$aligned_pa1 | ||
524 | sub $inp,$a3,$inp | ||
525 | |||
526 | ldw `0*4`($inp),$X[0] | ||
527 | ldw `1*4`($inp),$X[1] | ||
528 | ldw `2*4`($inp),$t2 | ||
529 | ldw `3*4`($inp),$t3 | ||
530 | ldw `4*4`($inp),$a0 | ||
531 | ldw `5*4`($inp),$a1 | ||
532 | ldw `6*4`($inp),$a2 | ||
533 | ldw `7*4`($inp),$a3 | ||
534 | vshd $X[0],$X[1],$X[0] | ||
535 | vshd $X[1],$t2,$X[1] | ||
536 | stw $X[0],`-$XOFF+0*4`(%sp) | ||
537 | ldw `8*4`($inp),$t0 | ||
538 | vshd $t2,$t3,$t2 | ||
539 | stw $X[1],`-$XOFF+1*4`(%sp) | ||
540 | ldw `9*4`($inp),$t1 | ||
541 | vshd $t3,$a0,$t3 | ||
542 | ___ | ||
543 | { | ||
544 | my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); | ||
545 | for ($i=2;$i<=(128/4-8);$i++) { | ||
546 | $code.=<<___; | ||
547 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
548 | ldw `(8+$i)*4`($inp),$t[0] | ||
549 | vshd $t[1],$t[2],$t[1] | ||
550 | ___ | ||
551 | push(@t,shift(@t)); | ||
552 | } | ||
553 | for (;$i<(128/4-1);$i++) { | ||
554 | $code.=<<___; | ||
555 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
556 | vshd $t[1],$t[2],$t[1] | ||
557 | ___ | ||
558 | push(@t,shift(@t)); | ||
559 | } | ||
560 | $code.=<<___; | ||
561 | b L\$collected_pa1 | ||
562 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
563 | |||
564 | ___ | ||
565 | } | ||
566 | $code.=<<___; | ||
567 | L\$aligned_pa1 | ||
568 | ldw `0*4`($inp),$X[0] | ||
569 | ldw `1*4`($inp),$X[1] | ||
570 | ldw `2*4`($inp),$t2 | ||
571 | ldw `3*4`($inp),$t3 | ||
572 | ldw `4*4`($inp),$a0 | ||
573 | ldw `5*4`($inp),$a1 | ||
574 | ldw `6*4`($inp),$a2 | ||
575 | ldw `7*4`($inp),$a3 | ||
576 | stw $X[0],`-$XOFF+0*4`(%sp) | ||
577 | ldw `8*4`($inp),$t0 | ||
578 | stw $X[1],`-$XOFF+1*4`(%sp) | ||
579 | ldw `9*4`($inp),$t1 | ||
580 | ___ | ||
581 | { | ||
582 | my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); | ||
583 | for ($i=2;$i<(128/4-8);$i++) { | ||
584 | $code.=<<___; | ||
585 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
586 | ldw `(8+$i)*4`($inp),$t[0] | ||
587 | ___ | ||
588 | push(@t,shift(@t)); | ||
589 | } | ||
590 | for (;$i<128/4;$i++) { | ||
591 | $code.=<<___; | ||
592 | stw $t[0],`-$XOFF+$i*4`(%sp) | ||
593 | ___ | ||
594 | push(@t,shift(@t)); | ||
595 | } | ||
596 | $code.="L\$collected_pa1\n"; | ||
597 | } | ||
598 | |||
599 | for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } | ||
600 | $code.="L\$rounds_pa1\n"; | ||
601 | for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } | ||
602 | |||
603 | $code.=<<___; | ||
604 | $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments | ||
605 | $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp | ||
606 | $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num | ||
607 | ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl | ||
608 | |||
609 | ldw `0*4`($ctx),$t1 ; update context | ||
610 | ldw `1*4`($ctx),$t0 | ||
611 | ldw `2*4`($ctx),$t3 | ||
612 | ldw `3*4`($ctx),$t2 | ||
613 | ldw `4*4`($ctx),$a1 | ||
614 | ldw `5*4`($ctx),$a0 | ||
615 | ldw `6*4`($ctx),$a3 | ||
616 | add $t0,$Alo,$Alo | ||
617 | ldw `7*4`($ctx),$a2 | ||
618 | addc $t1,$Ahi,$Ahi | ||
619 | ldw `8*4`($ctx),$t1 | ||
620 | add $t2,$Blo,$Blo | ||
621 | ldw `9*4`($ctx),$t0 | ||
622 | addc $t3,$Bhi,$Bhi | ||
623 | ldw `10*4`($ctx),$t3 | ||
624 | add $a0,$Clo,$Clo | ||
625 | ldw `11*4`($ctx),$t2 | ||
626 | addc $a1,$Chi,$Chi | ||
627 | ldw `12*4`($ctx),$a1 | ||
628 | add $a2,$Dlo,$Dlo | ||
629 | ldw `13*4`($ctx),$a0 | ||
630 | addc $a3,$Dhi,$Dhi | ||
631 | ldw `14*4`($ctx),$a3 | ||
632 | add $t0,$Elo,$Elo | ||
633 | ldw `15*4`($ctx),$a2 | ||
634 | addc $t1,$Ehi,$Ehi | ||
635 | stw $Ahi,`0*4`($ctx) | ||
636 | add $t2,$Flo,$Flo | ||
637 | stw $Alo,`1*4`($ctx) | ||
638 | addc $t3,$Fhi,$Fhi | ||
639 | stw $Bhi,`2*4`($ctx) | ||
640 | add $a0,$Glo,$Glo | ||
641 | stw $Blo,`3*4`($ctx) | ||
642 | addc $a1,$Ghi,$Ghi | ||
643 | stw $Chi,`4*4`($ctx) | ||
644 | add $a2,$Hlo,$Hlo | ||
645 | stw $Clo,`5*4`($ctx) | ||
646 | addc $a3,$Hhi,$Hhi | ||
647 | stw $Dhi,`6*4`($ctx) | ||
648 | ldo `16*$SZ`($inp),$inp ; advance $inp | ||
649 | stw $Dlo,`7*4`($ctx) | ||
650 | stw $Ehi,`8*4`($ctx) | ||
651 | stw $Elo,`9*4`($ctx) | ||
652 | stw $Fhi,`10*4`($ctx) | ||
653 | stw $Flo,`11*4`($ctx) | ||
654 | stw $Ghi,`12*4`($ctx) | ||
655 | stw $Glo,`13*4`($ctx) | ||
656 | stw $Hhi,`14*4`($ctx) | ||
657 | comb,= $inp,$num,L\$done | ||
658 | stw $Hlo,`15*4`($ctx) | ||
659 | b L\$oop_pa1 | ||
660 | $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp | ||
661 | L\$done | ||
662 | ___ | ||
663 | }} | ||
664 | $code.=<<___; | ||
665 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
666 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
667 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
668 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
669 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
670 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
671 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
672 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
673 | $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 | ||
674 | $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 | ||
675 | $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 | ||
676 | $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 | ||
677 | $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 | ||
678 | $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 | ||
679 | $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 | ||
680 | $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 | ||
681 | bv (%r2) | ||
682 | .EXIT | ||
683 | $POPMB -$FRAME(%sp),%r3 | ||
684 | .PROCEND | ||
685 | .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
686 | ___ | ||
687 | |||
688 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
689 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
690 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
691 | # directive... | ||
692 | |||
693 | my $ldd = sub { | ||
694 | my ($mod,$args) = @_; | ||
695 | my $orig = "ldd$mod\t$args"; | ||
696 | |||
697 | if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices | ||
698 | { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1); | ||
699 | $opcode|=(1<<3) if ($mod =~ /^,m/); | ||
700 | $opcode|=(1<<2) if ($mod =~ /^,mb/); | ||
701 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
702 | } | ||
703 | else { "\t".$orig; } | ||
704 | }; | ||
705 | |||
706 | my $std = sub { | ||
707 | my ($mod,$args) = @_; | ||
708 | my $orig = "std$mod\t$args"; | ||
709 | |||
710 | if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices | ||
711 | { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); | ||
712 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
713 | } | ||
714 | else { "\t".$orig; } | ||
715 | }; | ||
716 | |||
717 | my $extrd = sub { | ||
718 | my ($mod,$args) = @_; | ||
719 | my $orig = "extrd$mod\t$args"; | ||
720 | |||
721 | # I only have ",u" completer, it's implicitly encoded... | ||
722 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 | ||
723 | { my $opcode=(0x36<<26)|($1<<21)|($4<<16); | ||
724 | my $len=32-$3; | ||
725 | $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos | ||
726 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
727 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
728 | } | ||
729 | elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 | ||
730 | { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); | ||
731 | my $len=32-$2; | ||
732 | $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len | ||
733 | $opcode |= (1<<13) if ($mod =~ /,\**=/); | ||
734 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
735 | } | ||
736 | else { "\t".$orig; } | ||
737 | }; | ||
738 | |||
739 | my $shrpd = sub { | ||
740 | my ($mod,$args) = @_; | ||
741 | my $orig = "shrpd$mod\t$args"; | ||
742 | |||
743 | if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 | ||
744 | { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; | ||
745 | my $cpos=63-$3; | ||
746 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa | ||
747 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
748 | } | ||
749 | elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 | ||
750 | { sprintf "\t.WORD\t0x%08x\t; %s", | ||
751 | (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; | ||
752 | } | ||
753 | else { "\t".$orig; } | ||
754 | }; | ||
755 | |||
756 | sub assemble { | ||
757 | my ($mnemonic,$mod,$args)=@_; | ||
758 | my $opcode = eval("\$$mnemonic"); | ||
759 | |||
760 | ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; | ||
761 | } | ||
762 | |||
763 | foreach (split("\n",$code)) { | ||
764 | s/\`([^\`]*)\`/eval $1/ge; | ||
765 | |||
766 | s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/ | ||
767 | $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32 | ||
768 | : sprintf("shd\t%$1,%$2,%d",$3)/e or | ||
769 | # translate made up instructons: _ror, _shr, _align, _shl | ||
770 | s/_ror(\s+)(%r[0-9]+),/ | ||
771 | ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or | ||
772 | |||
773 | s/_shr(\s+%r[0-9]+),([0-9]+),/ | ||
774 | $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2) | ||
775 | : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or | ||
776 | |||
777 | s/_align(\s+%r[0-9]+,%r[0-9]+),/ | ||
778 | ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or | ||
779 | |||
780 | s/_shl(\s+%r[0-9]+),([0-9]+),/ | ||
781 | $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2) | ||
782 | : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e; | ||
783 | |||
784 | s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4); | ||
785 | |||
786 | s/cmpb,\*/comb,/ if ($SIZE_T==4); | ||
787 | |||
788 | print $_,"\n"; | ||
789 | } | ||
790 | |||
791 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl index 768a6a6fad..6b44a68e59 100755 --- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl +++ b/src/lib/libcrypto/sha/asm/sha512-ppc.pl | |||
@@ -40,6 +40,7 @@ $output =shift; | |||
40 | 40 | ||
41 | if ($flavour =~ /64/) { | 41 | if ($flavour =~ /64/) { |
42 | $SIZE_T=8; | 42 | $SIZE_T=8; |
43 | $LRSAVE=2*$SIZE_T; | ||
43 | $STU="stdu"; | 44 | $STU="stdu"; |
44 | $UCMP="cmpld"; | 45 | $UCMP="cmpld"; |
45 | $SHL="sldi"; | 46 | $SHL="sldi"; |
@@ -47,6 +48,7 @@ if ($flavour =~ /64/) { | |||
47 | $PUSH="std"; | 48 | $PUSH="std"; |
48 | } elsif ($flavour =~ /32/) { | 49 | } elsif ($flavour =~ /32/) { |
49 | $SIZE_T=4; | 50 | $SIZE_T=4; |
51 | $LRSAVE=$SIZE_T; | ||
50 | $STU="stwu"; | 52 | $STU="stwu"; |
51 | $UCMP="cmplw"; | 53 | $UCMP="cmplw"; |
52 | $SHL="slwi"; | 54 | $SHL="slwi"; |
@@ -87,7 +89,8 @@ if ($output =~ /512/) { | |||
87 | $SHR="srwi"; | 89 | $SHR="srwi"; |
88 | } | 90 | } |
89 | 91 | ||
90 | $FRAME=32*$SIZE_T; | 92 | $FRAME=32*$SIZE_T+16*$SZ; |
93 | $LOCALS=6*$SIZE_T; | ||
91 | 94 | ||
92 | $sp ="r1"; | 95 | $sp ="r1"; |
93 | $toc="r2"; | 96 | $toc="r2"; |
@@ -179,13 +182,12 @@ $code=<<___; | |||
179 | .globl $func | 182 | .globl $func |
180 | .align 6 | 183 | .align 6 |
181 | $func: | 184 | $func: |
185 | $STU $sp,-$FRAME($sp) | ||
182 | mflr r0 | 186 | mflr r0 |
183 | $STU $sp,`-($FRAME+16*$SZ)`($sp) | ||
184 | $SHL $num,$num,`log(16*$SZ)/log(2)` | 187 | $SHL $num,$num,`log(16*$SZ)/log(2)` |
185 | 188 | ||
186 | $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) | 189 | $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) |
187 | 190 | ||
188 | $PUSH r0,`$FRAME-$SIZE_T*21`($sp) | ||
189 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | 191 | $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) |
190 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | 192 | $PUSH r13,`$FRAME-$SIZE_T*19`($sp) |
191 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | 193 | $PUSH r14,`$FRAME-$SIZE_T*18`($sp) |
@@ -206,6 +208,7 @@ $func: | |||
206 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | 208 | $PUSH r29,`$FRAME-$SIZE_T*3`($sp) |
207 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | 209 | $PUSH r30,`$FRAME-$SIZE_T*2`($sp) |
208 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | 210 | $PUSH r31,`$FRAME-$SIZE_T*1`($sp) |
211 | $PUSH r0,`$FRAME+$LRSAVE`($sp) | ||
209 | 212 | ||
210 | $LD $A,`0*$SZ`($ctx) | 213 | $LD $A,`0*$SZ`($ctx) |
211 | mr $inp,r4 ; incarnate $inp | 214 | mr $inp,r4 ; incarnate $inp |
@@ -217,7 +220,7 @@ $func: | |||
217 | $LD $G,`6*$SZ`($ctx) | 220 | $LD $G,`6*$SZ`($ctx) |
218 | $LD $H,`7*$SZ`($ctx) | 221 | $LD $H,`7*$SZ`($ctx) |
219 | 222 | ||
220 | b LPICmeup | 223 | bl LPICmeup |
221 | LPICedup: | 224 | LPICedup: |
222 | andi. r0,$inp,3 | 225 | andi. r0,$inp,3 |
223 | bne Lunaligned | 226 | bne Lunaligned |
@@ -226,40 +229,14 @@ Laligned: | |||
226 | $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer | 229 | $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer |
227 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer | 230 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer |
228 | bl Lsha2_block_private | 231 | bl Lsha2_block_private |
229 | Ldone: | 232 | b Ldone |
230 | $POP r0,`$FRAME-$SIZE_T*21`($sp) | ||
231 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | ||
232 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | ||
233 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | ||
234 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
235 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
236 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
237 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
238 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
239 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
240 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
241 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
242 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
243 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
244 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
245 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
246 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
247 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
248 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
249 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
250 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
251 | mtlr r0 | ||
252 | addi $sp,$sp,`$FRAME+16*$SZ` | ||
253 | blr | ||
254 | ___ | ||
255 | 233 | ||
256 | # PowerPC specification allows an implementation to be ill-behaved | 234 | ; PowerPC specification allows an implementation to be ill-behaved |
257 | # upon unaligned access which crosses page boundary. "Better safe | 235 | ; upon unaligned access which crosses page boundary. "Better safe |
258 | # than sorry" principle makes me treat it specially. But I don't | 236 | ; than sorry" principle makes me treat it specially. But I don't |
259 | # look for particular offending word, but rather for the input | 237 | ; look for particular offending word, but rather for the input |
260 | # block which crosses the boundary. Once found that block is aligned | 238 | ; block which crosses the boundary. Once found that block is aligned |
261 | # and hashed separately... | 239 | ; and hashed separately... |
262 | $code.=<<___; | ||
263 | .align 4 | 240 | .align 4 |
264 | Lunaligned: | 241 | Lunaligned: |
265 | subfic $t1,$inp,4096 | 242 | subfic $t1,$inp,4096 |
@@ -278,7 +255,7 @@ Lunaligned: | |||
278 | Lcross_page: | 255 | Lcross_page: |
279 | li $t1,`16*$SZ/4` | 256 | li $t1,`16*$SZ/4` |
280 | mtctr $t1 | 257 | mtctr $t1 |
281 | addi r20,$sp,$FRAME ; aligned spot below the frame | 258 | addi r20,$sp,$LOCALS ; aligned spot below the frame |
282 | Lmemcpy: | 259 | Lmemcpy: |
283 | lbz r16,0($inp) | 260 | lbz r16,0($inp) |
284 | lbz r17,1($inp) | 261 | lbz r17,1($inp) |
@@ -293,8 +270,8 @@ Lmemcpy: | |||
293 | bdnz Lmemcpy | 270 | bdnz Lmemcpy |
294 | 271 | ||
295 | $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp | 272 | $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp |
296 | addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer | 273 | addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer |
297 | addi $inp,$sp,$FRAME ; fictitious inp pointer | 274 | addi $inp,$sp,$LOCALS ; fictitious inp pointer |
298 | $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num | 275 | $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num |
299 | $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer | 276 | $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer |
300 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer | 277 | $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer |
@@ -303,10 +280,36 @@ Lmemcpy: | |||
303 | $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num | 280 | $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num |
304 | addic. $num,$num,`-16*$SZ` ; num-- | 281 | addic. $num,$num,`-16*$SZ` ; num-- |
305 | bne- Lunaligned | 282 | bne- Lunaligned |
306 | b Ldone | ||
307 | ___ | ||
308 | 283 | ||
309 | $code.=<<___; | 284 | Ldone: |
285 | $POP r0,`$FRAME+$LRSAVE`($sp) | ||
286 | $POP $toc,`$FRAME-$SIZE_T*20`($sp) | ||
287 | $POP r13,`$FRAME-$SIZE_T*19`($sp) | ||
288 | $POP r14,`$FRAME-$SIZE_T*18`($sp) | ||
289 | $POP r15,`$FRAME-$SIZE_T*17`($sp) | ||
290 | $POP r16,`$FRAME-$SIZE_T*16`($sp) | ||
291 | $POP r17,`$FRAME-$SIZE_T*15`($sp) | ||
292 | $POP r18,`$FRAME-$SIZE_T*14`($sp) | ||
293 | $POP r19,`$FRAME-$SIZE_T*13`($sp) | ||
294 | $POP r20,`$FRAME-$SIZE_T*12`($sp) | ||
295 | $POP r21,`$FRAME-$SIZE_T*11`($sp) | ||
296 | $POP r22,`$FRAME-$SIZE_T*10`($sp) | ||
297 | $POP r23,`$FRAME-$SIZE_T*9`($sp) | ||
298 | $POP r24,`$FRAME-$SIZE_T*8`($sp) | ||
299 | $POP r25,`$FRAME-$SIZE_T*7`($sp) | ||
300 | $POP r26,`$FRAME-$SIZE_T*6`($sp) | ||
301 | $POP r27,`$FRAME-$SIZE_T*5`($sp) | ||
302 | $POP r28,`$FRAME-$SIZE_T*4`($sp) | ||
303 | $POP r29,`$FRAME-$SIZE_T*3`($sp) | ||
304 | $POP r30,`$FRAME-$SIZE_T*2`($sp) | ||
305 | $POP r31,`$FRAME-$SIZE_T*1`($sp) | ||
306 | mtlr r0 | ||
307 | addi $sp,$sp,$FRAME | ||
308 | blr | ||
309 | .long 0 | ||
310 | .byte 0,12,4,1,0x80,18,3,0 | ||
311 | .long 0 | ||
312 | |||
310 | .align 4 | 313 | .align 4 |
311 | Lsha2_block_private: | 314 | Lsha2_block_private: |
312 | ___ | 315 | ___ |
@@ -372,6 +375,8 @@ $code.=<<___; | |||
372 | $ST $H,`7*$SZ`($ctx) | 375 | $ST $H,`7*$SZ`($ctx) |
373 | bne Lsha2_block_private | 376 | bne Lsha2_block_private |
374 | blr | 377 | blr |
378 | .long 0 | ||
379 | .byte 0,12,0x14,0,0,0,0,0 | ||
375 | ___ | 380 | ___ |
376 | 381 | ||
377 | # Ugly hack here, because PPC assembler syntax seem to vary too | 382 | # Ugly hack here, because PPC assembler syntax seem to vary too |
@@ -379,22 +384,15 @@ ___ | |||
379 | $code.=<<___; | 384 | $code.=<<___; |
380 | .align 6 | 385 | .align 6 |
381 | LPICmeup: | 386 | LPICmeup: |
382 | bl LPIC | 387 | mflr r0 |
383 | addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop | 388 | bcl 20,31,\$+4 |
384 | b LPICedup | 389 | mflr $Tbl ; vvvvvv "distance" between . and 1st data entry |
385 | nop | 390 | addi $Tbl,$Tbl,`64-8` |
386 | nop | 391 | mtlr r0 |
387 | nop | ||
388 | nop | ||
389 | nop | ||
390 | LPIC: mflr $Tbl | ||
391 | blr | 392 | blr |
392 | nop | 393 | .long 0 |
393 | nop | 394 | .byte 0,12,0x14,0,0,0,0,0 |
394 | nop | 395 | .space `64-9*4` |
395 | nop | ||
396 | nop | ||
397 | nop | ||
398 | ___ | 396 | ___ |
399 | $code.=<<___ if ($SZ==8); | 397 | $code.=<<___ if ($SZ==8); |
400 | .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd | 398 | .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd |
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl index e7ef2d5a9f..079a3fc78a 100644 --- a/src/lib/libcrypto/sha/asm/sha512-s390x.pl +++ b/src/lib/libcrypto/sha/asm/sha512-s390x.pl | |||
@@ -26,6 +26,26 @@ | |||
26 | # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster | 26 | # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster |
27 | # than software. | 27 | # than software. |
28 | 28 | ||
29 | # November 2010. | ||
30 | # | ||
31 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
32 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
33 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
34 | # application context. The feature is not specific to any particular | ||
35 | # processor, as long as it's "z-CPU". Latter implies that the code | ||
36 | # remains z/Architecture specific. On z900 SHA256 was measured to | ||
37 | # perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3. | ||
38 | |||
39 | $flavour = shift; | ||
40 | |||
41 | if ($flavour =~ /3[12]/) { | ||
42 | $SIZE_T=4; | ||
43 | $g=""; | ||
44 | } else { | ||
45 | $SIZE_T=8; | ||
46 | $g="g"; | ||
47 | } | ||
48 | |||
29 | $t0="%r0"; | 49 | $t0="%r0"; |
30 | $t1="%r1"; | 50 | $t1="%r1"; |
31 | $ctx="%r2"; $t2="%r2"; | 51 | $ctx="%r2"; $t2="%r2"; |
@@ -44,7 +64,7 @@ $tbl="%r13"; | |||
44 | $T1="%r14"; | 64 | $T1="%r14"; |
45 | $sp="%r15"; | 65 | $sp="%r15"; |
46 | 66 | ||
47 | $output=shift; | 67 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
48 | open STDOUT,">$output"; | 68 | open STDOUT,">$output"; |
49 | 69 | ||
50 | if ($output =~ /512/) { | 70 | if ($output =~ /512/) { |
@@ -78,7 +98,8 @@ if ($output =~ /512/) { | |||
78 | } | 98 | } |
79 | $Func="sha${label}_block_data_order"; | 99 | $Func="sha${label}_block_data_order"; |
80 | $Table="K${label}"; | 100 | $Table="K${label}"; |
81 | $frame=160+16*$SZ; | 101 | $stdframe=16*$SIZE_T+4*8; |
102 | $frame=$stdframe+16*$SZ; | ||
82 | 103 | ||
83 | sub BODY_00_15 { | 104 | sub BODY_00_15 { |
84 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 105 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
@@ -93,9 +114,9 @@ $code.=<<___; | |||
93 | xgr $t0,$t1 | 114 | xgr $t0,$t1 |
94 | $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` | 115 | $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` |
95 | xgr $t2,$g | 116 | xgr $t2,$g |
96 | $ST $T1,`160+$SZ*($i%16)`($sp) | 117 | $ST $T1,`$stdframe+$SZ*($i%16)`($sp) |
97 | xgr $t0,$t1 # Sigma1(e) | 118 | xgr $t0,$t1 # Sigma1(e) |
98 | la $T1,0($T1,$h) # T1+=h | 119 | algr $T1,$h # T1+=h |
99 | ngr $t2,$e | 120 | ngr $t2,$e |
100 | lgr $t1,$a | 121 | lgr $t1,$a |
101 | algr $T1,$t0 # T1+=Sigma1(e) | 122 | algr $T1,$t0 # T1+=Sigma1(e) |
@@ -113,7 +134,7 @@ $code.=<<___; | |||
113 | ngr $t2,$b | 134 | ngr $t2,$b |
114 | algr $h,$T1 # h+=T1 | 135 | algr $h,$T1 # h+=T1 |
115 | ogr $t2,$t1 # Maj(a,b,c) | 136 | ogr $t2,$t1 # Maj(a,b,c) |
116 | la $d,0($d,$T1) # d+=T1 | 137 | algr $d,$T1 # d+=T1 |
117 | algr $h,$t2 # h+=Maj(a,b,c) | 138 | algr $h,$t2 # h+=Maj(a,b,c) |
118 | ___ | 139 | ___ |
119 | } | 140 | } |
@@ -122,19 +143,19 @@ sub BODY_16_XX { | |||
122 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 143 | my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
123 | 144 | ||
124 | $code.=<<___; | 145 | $code.=<<___; |
125 | $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i | 146 | $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i |
126 | $LD $t1,`160+$SZ*(($i+14)%16)`($sp) | 147 | $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp) |
127 | $ROT $t0,$T1,$sigma0[0] | 148 | $ROT $t0,$T1,$sigma0[0] |
128 | $SHR $T1,$sigma0[2] | 149 | $SHR $T1,$sigma0[2] |
129 | $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` | 150 | $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` |
130 | xgr $T1,$t0 | 151 | xgr $T1,$t0 |
131 | $ROT $t0,$t1,$sigma1[0] | 152 | $ROT $t0,$t1,$sigma1[0] |
132 | xgr $T1,$t2 # sigma0(X[i+1]) | 153 | xgr $T1,$t2 # sigma0(X[i+1]) |
133 | $SHR $t1,$sigma1[2] | 154 | $SHR $t1,$sigma1[2] |
134 | $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i] | 155 | $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i] |
135 | xgr $t1,$t0 | 156 | xgr $t1,$t0 |
136 | $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` | 157 | $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` |
137 | $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9] | 158 | $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9] |
138 | xgr $t1,$t0 # sigma1(X[i+14]) | 159 | xgr $t1,$t0 # sigma1(X[i+14]) |
139 | algr $T1,$t1 # +=sigma1(X[i+14]) | 160 | algr $T1,$t1 # +=sigma1(X[i+14]) |
140 | ___ | 161 | ___ |
@@ -212,6 +233,7 @@ $code.=<<___; | |||
212 | .globl $Func | 233 | .globl $Func |
213 | .type $Func,\@function | 234 | .type $Func,\@function |
214 | $Func: | 235 | $Func: |
236 | sllg $len,$len,`log(16*$SZ)/log(2)` | ||
215 | ___ | 237 | ___ |
216 | $code.=<<___ if ($kimdfunc); | 238 | $code.=<<___ if ($kimdfunc); |
217 | larl %r1,OPENSSL_s390xcap_P | 239 | larl %r1,OPENSSL_s390xcap_P |
@@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc); | |||
219 | tmhl %r0,0x4000 # check for message-security assist | 241 | tmhl %r0,0x4000 # check for message-security assist |
220 | jz .Lsoftware | 242 | jz .Lsoftware |
221 | lghi %r0,0 | 243 | lghi %r0,0 |
222 | la %r1,16($sp) | 244 | la %r1,`2*$SIZE_T`($sp) |
223 | .long 0xb93e0002 # kimd %r0,%r2 | 245 | .long 0xb93e0002 # kimd %r0,%r2 |
224 | lg %r0,16($sp) | 246 | lg %r0,`2*$SIZE_T`($sp) |
225 | tmhh %r0,`0x8000>>$kimdfunc` | 247 | tmhh %r0,`0x8000>>$kimdfunc` |
226 | jz .Lsoftware | 248 | jz .Lsoftware |
227 | lghi %r0,$kimdfunc | 249 | lghi %r0,$kimdfunc |
228 | lgr %r1,$ctx | 250 | lgr %r1,$ctx |
229 | lgr %r2,$inp | 251 | lgr %r2,$inp |
230 | sllg %r3,$len,`log(16*$SZ)/log(2)` | 252 | lgr %r3,$len |
231 | .long 0xb93e0002 # kimd %r0,%r2 | 253 | .long 0xb93e0002 # kimd %r0,%r2 |
232 | brc 1,.-4 # pay attention to "partial completion" | 254 | brc 1,.-4 # pay attention to "partial completion" |
233 | br %r14 | 255 | br %r14 |
@@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc); | |||
235 | .Lsoftware: | 257 | .Lsoftware: |
236 | ___ | 258 | ___ |
237 | $code.=<<___; | 259 | $code.=<<___; |
238 | sllg $len,$len,`log(16*$SZ)/log(2)` | ||
239 | lghi %r1,-$frame | 260 | lghi %r1,-$frame |
240 | agr $len,$inp | 261 | la $len,0($len,$inp) |
241 | stmg $ctx,%r15,16($sp) | 262 | stm${g} $ctx,%r15,`2*$SIZE_T`($sp) |
242 | lgr %r0,$sp | 263 | lgr %r0,$sp |
243 | la $sp,0(%r1,$sp) | 264 | la $sp,0(%r1,$sp) |
244 | stg %r0,0($sp) | 265 | st${g} %r0,0($sp) |
245 | 266 | ||
246 | larl $tbl,$Table | 267 | larl $tbl,$Table |
247 | $LD $A,`0*$SZ`($ctx) | 268 | $LD $A,`0*$SZ`($ctx) |
@@ -265,7 +286,7 @@ $code.=<<___; | |||
265 | clgr $len,$t0 | 286 | clgr $len,$t0 |
266 | jne .Lrounds_16_xx | 287 | jne .Lrounds_16_xx |
267 | 288 | ||
268 | lg $ctx,`$frame+16`($sp) | 289 | l${g} $ctx,`$frame+2*$SIZE_T`($sp) |
269 | la $inp,`16*$SZ`($inp) | 290 | la $inp,`16*$SZ`($inp) |
270 | $ADD $A,`0*$SZ`($ctx) | 291 | $ADD $A,`0*$SZ`($ctx) |
271 | $ADD $B,`1*$SZ`($ctx) | 292 | $ADD $B,`1*$SZ`($ctx) |
@@ -283,14 +304,14 @@ $code.=<<___; | |||
283 | $ST $F,`5*$SZ`($ctx) | 304 | $ST $F,`5*$SZ`($ctx) |
284 | $ST $G,`6*$SZ`($ctx) | 305 | $ST $G,`6*$SZ`($ctx) |
285 | $ST $H,`7*$SZ`($ctx) | 306 | $ST $H,`7*$SZ`($ctx) |
286 | clg $inp,`$frame+32`($sp) | 307 | cl${g} $inp,`$frame+4*$SIZE_T`($sp) |
287 | jne .Lloop | 308 | jne .Lloop |
288 | 309 | ||
289 | lmg %r6,%r15,`$frame+48`($sp) | 310 | lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) |
290 | br %r14 | 311 | br %r14 |
291 | .size $Func,.-$Func | 312 | .size $Func,.-$Func |
292 | .string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 313 | .string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
293 | .comm OPENSSL_s390xcap_P,8,8 | 314 | .comm OPENSSL_s390xcap_P,16,8 |
294 | ___ | 315 | ___ |
295 | 316 | ||
296 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 317 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl index ec5d78135e..585740789e 100644 --- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl +++ b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl | |||
@@ -305,9 +305,9 @@ $code.=<<___; | |||
305 | srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] | 305 | srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] |
306 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) | 306 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) |
307 | srl @X[($i/2)%8],0,$tmp0 | 307 | srl @X[($i/2)%8],0,$tmp0 |
308 | add $tmp2,$tmp1,$tmp1 | ||
308 | add $xi,$T1,$T1 ! +=X[i] | 309 | add $xi,$T1,$T1 ! +=X[i] |
309 | xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] | 310 | xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] |
310 | add $tmp2,$T1,$T1 | ||
311 | add $tmp1,$T1,$T1 | 311 | add $tmp1,$T1,$T1 |
312 | 312 | ||
313 | srl $T1,0,$T1 | 313 | srl $T1,0,$T1 |
@@ -318,9 +318,9 @@ ___ | |||
318 | $code.=<<___; | 318 | $code.=<<___; |
319 | srlx @X[($i/2)%8],32,$tmp1 ! X[i] | 319 | srlx @X[($i/2)%8],32,$tmp1 ! X[i] |
320 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) | 320 | xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) |
321 | srl @X[($i/2)%8],0,@X[($i/2)%8] | ||
322 | add $xi,$T1,$T1 ! +=X[i+9] | 321 | add $xi,$T1,$T1 ! +=X[i+9] |
323 | add $tmp2,$T1,$T1 | 322 | add $tmp2,$tmp1,$tmp1 |
323 | srl @X[($i/2)%8],0,@X[($i/2)%8] | ||
324 | add $tmp1,$T1,$T1 | 324 | add $tmp1,$T1,$T1 |
325 | 325 | ||
326 | sllx $T1,32,$tmp0 | 326 | sllx $T1,32,$tmp0 |
diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl index e6643f8cf6..f611a2d898 100755 --- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl +++ b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl | |||
@@ -95,50 +95,44 @@ sub ROUND_00_15() | |||
95 | { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; | 95 | { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
96 | 96 | ||
97 | $code.=<<___; | 97 | $code.=<<___; |
98 | mov $e,$a0 | 98 | ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 |
99 | mov $e,$a1 | ||
100 | mov $f,$a2 | 99 | mov $f,$a2 |
100 | mov $T1,`$SZ*($i&0xf)`(%rsp) | ||
101 | 101 | ||
102 | ror \$$Sigma1[0],$a0 | 102 | ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 |
103 | ror \$$Sigma1[1],$a1 | 103 | xor $e,$a0 |
104 | xor $g,$a2 # f^g | 104 | xor $g,$a2 # f^g |
105 | 105 | ||
106 | xor $a1,$a0 | 106 | ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 |
107 | ror \$`$Sigma1[2]-$Sigma1[1]`,$a1 | 107 | add $h,$T1 # T1+=h |
108 | xor $a,$a1 | ||
109 | |||
110 | add ($Tbl,$round,$SZ),$T1 # T1+=K[round] | ||
108 | and $e,$a2 # (f^g)&e | 111 | and $e,$a2 # (f^g)&e |
109 | mov $T1,`$SZ*($i&0xf)`(%rsp) | 112 | mov $b,$h |
110 | 113 | ||
111 | xor $a1,$a0 # Sigma1(e) | 114 | ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 |
115 | xor $e,$a0 | ||
112 | xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g | 116 | xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g |
113 | add $h,$T1 # T1+=h | ||
114 | |||
115 | mov $a,$h | ||
116 | add $a0,$T1 # T1+=Sigma1(e) | ||
117 | 117 | ||
118 | xor $c,$h # b^c | ||
119 | xor $a,$a1 | ||
118 | add $a2,$T1 # T1+=Ch(e,f,g) | 120 | add $a2,$T1 # T1+=Ch(e,f,g) |
119 | mov $a,$a0 | 121 | mov $b,$a2 |
120 | mov $a,$a1 | ||
121 | 122 | ||
122 | ror \$$Sigma0[0],$h | 123 | ror \$$Sigma1[0],$a0 # Sigma1(e) |
123 | ror \$$Sigma0[1],$a0 | 124 | and $a,$h # h=(b^c)&a |
124 | mov $a,$a2 | 125 | and $c,$a2 # b&c |
125 | add ($Tbl,$round,$SZ),$T1 # T1+=K[round] | ||
126 | 126 | ||
127 | xor $a0,$h | 127 | ror \$$Sigma0[0],$a1 # Sigma0(a) |
128 | ror \$`$Sigma0[2]-$Sigma0[1]`,$a0 | 128 | add $a0,$T1 # T1+=Sigma1(e) |
129 | or $c,$a1 # a|c | 129 | add $a2,$h # h+=b&c (completes +=Maj(a,b,c) |
130 | 130 | ||
131 | xor $a0,$h # h=Sigma0(a) | ||
132 | and $c,$a2 # a&c | ||
133 | add $T1,$d # d+=T1 | 131 | add $T1,$d # d+=T1 |
134 | |||
135 | and $b,$a1 # (a|c)&b | ||
136 | add $T1,$h # h+=T1 | 132 | add $T1,$h # h+=T1 |
137 | |||
138 | or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c) | ||
139 | lea 1($round),$round # round++ | 133 | lea 1($round),$round # round++ |
134 | add $a1,$h # h+=Sigma0(a) | ||
140 | 135 | ||
141 | add $a1,$h # h+=Maj(a,b,c) | ||
142 | ___ | 136 | ___ |
143 | } | 137 | } |
144 | 138 | ||
@@ -147,32 +141,30 @@ sub ROUND_16_XX() | |||
147 | 141 | ||
148 | $code.=<<___; | 142 | $code.=<<___; |
149 | mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 | 143 | mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 |
150 | mov `$SZ*(($i+14)&0xf)`(%rsp),$T1 | 144 | mov `$SZ*(($i+14)&0xf)`(%rsp),$a1 |
151 | 145 | mov $a0,$T1 | |
152 | mov $a0,$a2 | 146 | mov $a1,$a2 |
153 | 147 | ||
148 | ror \$`$sigma0[1]-$sigma0[0]`,$T1 | ||
149 | xor $a0,$T1 | ||
154 | shr \$$sigma0[2],$a0 | 150 | shr \$$sigma0[2],$a0 |
155 | ror \$$sigma0[0],$a2 | ||
156 | |||
157 | xor $a2,$a0 | ||
158 | ror \$`$sigma0[1]-$sigma0[0]`,$a2 | ||
159 | 151 | ||
160 | xor $a2,$a0 # sigma0(X[(i+1)&0xf]) | 152 | ror \$$sigma0[0],$T1 |
161 | mov $T1,$a1 | 153 | xor $T1,$a0 # sigma0(X[(i+1)&0xf]) |
154 | mov `$SZ*(($i+9)&0xf)`(%rsp),$T1 | ||
162 | 155 | ||
163 | shr \$$sigma1[2],$T1 | 156 | ror \$`$sigma1[1]-$sigma1[0]`,$a2 |
164 | ror \$$sigma1[0],$a1 | 157 | xor $a1,$a2 |
165 | 158 | shr \$$sigma1[2],$a1 | |
166 | xor $a1,$T1 | ||
167 | ror \$`$sigma1[1]-$sigma1[0]`,$a1 | ||
168 | |||
169 | xor $a1,$T1 # sigma1(X[(i+14)&0xf]) | ||
170 | 159 | ||
160 | ror \$$sigma1[0],$a2 | ||
171 | add $a0,$T1 | 161 | add $a0,$T1 |
172 | 162 | xor $a2,$a1 # sigma1(X[(i+14)&0xf]) | |
173 | add `$SZ*(($i+9)&0xf)`(%rsp),$T1 | ||
174 | 163 | ||
175 | add `$SZ*($i&0xf)`(%rsp),$T1 | 164 | add `$SZ*($i&0xf)`(%rsp),$T1 |
165 | mov $e,$a0 | ||
166 | add $a1,$T1 | ||
167 | mov $a,$a1 | ||
176 | ___ | 168 | ___ |
177 | &ROUND_00_15(@_); | 169 | &ROUND_00_15(@_); |
178 | } | 170 | } |
@@ -219,6 +211,8 @@ $func: | |||
219 | ___ | 211 | ___ |
220 | for($i=0;$i<16;$i++) { | 212 | for($i=0;$i<16;$i++) { |
221 | $code.=" mov $SZ*$i($inp),$T1\n"; | 213 | $code.=" mov $SZ*$i($inp),$T1\n"; |
214 | $code.=" mov @ROT[4],$a0\n"; | ||
215 | $code.=" mov @ROT[0],$a1\n"; | ||
222 | $code.=" bswap $T1\n"; | 216 | $code.=" bswap $T1\n"; |
223 | &ROUND_00_15($i,@ROT); | 217 | &ROUND_00_15($i,@ROT); |
224 | unshift(@ROT,pop(@ROT)); | 218 | unshift(@ROT,pop(@ROT)); |
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c index 8952d87673..f88d3d6dad 100644 --- a/src/lib/libcrypto/sha/sha256.c +++ b/src/lib/libcrypto/sha/sha256.c | |||
@@ -16,7 +16,7 @@ | |||
16 | 16 | ||
17 | const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; | 17 | const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; |
18 | 18 | ||
19 | int SHA224_Init (SHA256_CTX *c) | 19 | fips_md_init_ctx(SHA224, SHA256) |
20 | { | 20 | { |
21 | memset (c,0,sizeof(*c)); | 21 | memset (c,0,sizeof(*c)); |
22 | c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; | 22 | c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; |
@@ -27,7 +27,7 @@ int SHA224_Init (SHA256_CTX *c) | |||
27 | return 1; | 27 | return 1; |
28 | } | 28 | } |
29 | 29 | ||
30 | int SHA256_Init (SHA256_CTX *c) | 30 | fips_md_init(SHA256) |
31 | { | 31 | { |
32 | memset (c,0,sizeof(*c)); | 32 | memset (c,0,sizeof(*c)); |
33 | c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; | 33 | c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; |
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c index cbc0e58c48..50dd7dc744 100644 --- a/src/lib/libcrypto/sha/sha512.c +++ b/src/lib/libcrypto/sha/sha512.c | |||
@@ -59,21 +59,8 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT; | |||
59 | #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA | 59 | #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA |
60 | #endif | 60 | #endif |
61 | 61 | ||
62 | int SHA384_Init (SHA512_CTX *c) | 62 | fips_md_init_ctx(SHA384, SHA512) |
63 | { | 63 | { |
64 | #if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) | ||
65 | /* maintain dword order required by assembler module */ | ||
66 | unsigned int *h = (unsigned int *)c->h; | ||
67 | |||
68 | h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8; | ||
69 | h[2] = 0x629a292a; h[3] = 0x367cd507; | ||
70 | h[4] = 0x9159015a; h[5] = 0x3070dd17; | ||
71 | h[6] = 0x152fecd8; h[7] = 0xf70e5939; | ||
72 | h[8] = 0x67332667; h[9] = 0xffc00b31; | ||
73 | h[10] = 0x8eb44a87; h[11] = 0x68581511; | ||
74 | h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7; | ||
75 | h[14] = 0x47b5481d; h[15] = 0xbefa4fa4; | ||
76 | #else | ||
77 | c->h[0]=U64(0xcbbb9d5dc1059ed8); | 64 | c->h[0]=U64(0xcbbb9d5dc1059ed8); |
78 | c->h[1]=U64(0x629a292a367cd507); | 65 | c->h[1]=U64(0x629a292a367cd507); |
79 | c->h[2]=U64(0x9159015a3070dd17); | 66 | c->h[2]=U64(0x9159015a3070dd17); |
@@ -82,27 +69,14 @@ int SHA384_Init (SHA512_CTX *c) | |||
82 | c->h[5]=U64(0x8eb44a8768581511); | 69 | c->h[5]=U64(0x8eb44a8768581511); |
83 | c->h[6]=U64(0xdb0c2e0d64f98fa7); | 70 | c->h[6]=U64(0xdb0c2e0d64f98fa7); |
84 | c->h[7]=U64(0x47b5481dbefa4fa4); | 71 | c->h[7]=U64(0x47b5481dbefa4fa4); |
85 | #endif | 72 | |
86 | c->Nl=0; c->Nh=0; | 73 | c->Nl=0; c->Nh=0; |
87 | c->num=0; c->md_len=SHA384_DIGEST_LENGTH; | 74 | c->num=0; c->md_len=SHA384_DIGEST_LENGTH; |
88 | return 1; | 75 | return 1; |
89 | } | 76 | } |
90 | 77 | ||
91 | int SHA512_Init (SHA512_CTX *c) | 78 | fips_md_init(SHA512) |
92 | { | 79 | { |
93 | #if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) | ||
94 | /* maintain dword order required by assembler module */ | ||
95 | unsigned int *h = (unsigned int *)c->h; | ||
96 | |||
97 | h[0] = 0x6a09e667; h[1] = 0xf3bcc908; | ||
98 | h[2] = 0xbb67ae85; h[3] = 0x84caa73b; | ||
99 | h[4] = 0x3c6ef372; h[5] = 0xfe94f82b; | ||
100 | h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1; | ||
101 | h[8] = 0x510e527f; h[9] = 0xade682d1; | ||
102 | h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f; | ||
103 | h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b; | ||
104 | h[14] = 0x5be0cd19; h[15] = 0x137e2179; | ||
105 | #else | ||
106 | c->h[0]=U64(0x6a09e667f3bcc908); | 80 | c->h[0]=U64(0x6a09e667f3bcc908); |
107 | c->h[1]=U64(0xbb67ae8584caa73b); | 81 | c->h[1]=U64(0xbb67ae8584caa73b); |
108 | c->h[2]=U64(0x3c6ef372fe94f82b); | 82 | c->h[2]=U64(0x3c6ef372fe94f82b); |
@@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c) | |||
111 | c->h[5]=U64(0x9b05688c2b3e6c1f); | 85 | c->h[5]=U64(0x9b05688c2b3e6c1f); |
112 | c->h[6]=U64(0x1f83d9abfb41bd6b); | 86 | c->h[6]=U64(0x1f83d9abfb41bd6b); |
113 | c->h[7]=U64(0x5be0cd19137e2179); | 87 | c->h[7]=U64(0x5be0cd19137e2179); |
114 | #endif | 88 | |
115 | c->Nl=0; c->Nh=0; | 89 | c->Nl=0; c->Nh=0; |
116 | c->num=0; c->md_len=SHA512_DIGEST_LENGTH; | 90 | c->num=0; c->md_len=SHA512_DIGEST_LENGTH; |
117 | return 1; | 91 | return 1; |
@@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) | |||
160 | 134 | ||
161 | if (md==0) return 0; | 135 | if (md==0) return 0; |
162 | 136 | ||
163 | #if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) | ||
164 | /* recall assembler dword order... */ | ||
165 | n = c->md_len; | ||
166 | if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH) | ||
167 | { | ||
168 | unsigned int *h = (unsigned int *)c->h, t; | ||
169 | |||
170 | for (n/=4;n;n--) | ||
171 | { | ||
172 | t = *(h++); | ||
173 | *(md++) = (unsigned char)(t>>24); | ||
174 | *(md++) = (unsigned char)(t>>16); | ||
175 | *(md++) = (unsigned char)(t>>8); | ||
176 | *(md++) = (unsigned char)(t); | ||
177 | } | ||
178 | } | ||
179 | else return 0; | ||
180 | #else | ||
181 | switch (c->md_len) | 137 | switch (c->md_len) |
182 | { | 138 | { |
183 | /* Let compiler decide if it's appropriate to unroll... */ | 139 | /* Let compiler decide if it's appropriate to unroll... */ |
@@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) | |||
214 | /* ... as well as make sure md_len is not abused. */ | 170 | /* ... as well as make sure md_len is not abused. */ |
215 | default: return 0; | 171 | default: return 0; |
216 | } | 172 | } |
217 | #endif | 173 | |
218 | return 1; | 174 | return 1; |
219 | } | 175 | } |
220 | 176 | ||
diff --git a/src/lib/libcrypto/sparcv9cap.c b/src/lib/libcrypto/sparcv9cap.c index ed195ab402..43b3ac6f81 100644 --- a/src/lib/libcrypto/sparcv9cap.c +++ b/src/lib/libcrypto/sparcv9cap.c | |||
@@ -19,7 +19,8 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U | |||
19 | int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | 19 | int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); |
20 | int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | 20 | int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); |
21 | 21 | ||
22 | if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == | 22 | if (num>=8 && !(num&1) && |
23 | (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == | ||
23 | (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) | 24 | (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) |
24 | return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); | 25 | return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); |
25 | else | 26 | else |
@@ -169,7 +170,6 @@ void OPENSSL_cpuid_setup(void) | |||
169 | char *e; | 170 | char *e; |
170 | struct sigaction common_act,ill_oact,bus_oact; | 171 | struct sigaction common_act,ill_oact,bus_oact; |
171 | sigset_t all_masked,oset; | 172 | sigset_t all_masked,oset; |
172 | int sig; | ||
173 | static int trigger=0; | 173 | static int trigger=0; |
174 | 174 | ||
175 | if (trigger) return; | 175 | if (trigger) return; |
diff --git a/src/lib/libcrypto/ts/ts.h b/src/lib/libcrypto/ts/ts.h index 190e8a1bf2..c2448e3c3b 100644 --- a/src/lib/libcrypto/ts/ts.h +++ b/src/lib/libcrypto/ts/ts.h | |||
@@ -86,9 +86,6 @@ | |||
86 | #include <openssl/dh.h> | 86 | #include <openssl/dh.h> |
87 | #endif | 87 | #endif |
88 | 88 | ||
89 | #include <openssl/evp.h> | ||
90 | |||
91 | |||
92 | #ifdef __cplusplus | 89 | #ifdef __cplusplus |
93 | extern "C" { | 90 | extern "C" { |
94 | #endif | 91 | #endif |
diff --git a/src/lib/libcrypto/whrlpool/whrlpool.h b/src/lib/libcrypto/whrlpool/whrlpool.h index 03c91da115..9e01f5b076 100644 --- a/src/lib/libcrypto/whrlpool/whrlpool.h +++ b/src/lib/libcrypto/whrlpool/whrlpool.h | |||
@@ -24,6 +24,9 @@ typedef struct { | |||
24 | } WHIRLPOOL_CTX; | 24 | } WHIRLPOOL_CTX; |
25 | 25 | ||
26 | #ifndef OPENSSL_NO_WHIRLPOOL | 26 | #ifndef OPENSSL_NO_WHIRLPOOL |
27 | #ifdef OPENSSL_FIPS | ||
28 | int private_WHIRLPOOL_Init(WHIRLPOOL_CTX *c); | ||
29 | #endif | ||
27 | int WHIRLPOOL_Init (WHIRLPOOL_CTX *c); | 30 | int WHIRLPOOL_Init (WHIRLPOOL_CTX *c); |
28 | int WHIRLPOOL_Update (WHIRLPOOL_CTX *c,const void *inp,size_t bytes); | 31 | int WHIRLPOOL_Update (WHIRLPOOL_CTX *c,const void *inp,size_t bytes); |
29 | void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits); | 32 | void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits); |
diff --git a/src/lib/libcrypto/whrlpool/wp_block.c b/src/lib/libcrypto/whrlpool/wp_block.c index 221f6cc59f..824ed1827c 100644 --- a/src/lib/libcrypto/whrlpool/wp_block.c +++ b/src/lib/libcrypto/whrlpool/wp_block.c | |||
@@ -68,9 +68,9 @@ typedef unsigned long long u64; | |||
68 | CPUs this is actually faster! */ | 68 | CPUs this is actually faster! */ |
69 | # endif | 69 | # endif |
70 | # define GO_FOR_MMX(ctx,inp,num) do { \ | 70 | # define GO_FOR_MMX(ctx,inp,num) do { \ |
71 | extern unsigned long OPENSSL_ia32cap_P; \ | 71 | extern unsigned int OPENSSL_ia32cap_P[]; \ |
72 | void whirlpool_block_mmx(void *,const void *,size_t); \ | 72 | void whirlpool_block_mmx(void *,const void *,size_t); \ |
73 | if (!(OPENSSL_ia32cap_P & (1<<23))) break; \ | 73 | if (!(OPENSSL_ia32cap_P[0] & (1<<23))) break; \ |
74 | whirlpool_block_mmx(ctx->H.c,inp,num); return; \ | 74 | whirlpool_block_mmx(ctx->H.c,inp,num); return; \ |
75 | } while (0) | 75 | } while (0) |
76 | # endif | 76 | # endif |
diff --git a/src/lib/libcrypto/whrlpool/wp_dgst.c b/src/lib/libcrypto/whrlpool/wp_dgst.c index ee5c5c1bf3..7e28bef51d 100644 --- a/src/lib/libcrypto/whrlpool/wp_dgst.c +++ b/src/lib/libcrypto/whrlpool/wp_dgst.c | |||
@@ -52,9 +52,10 @@ | |||
52 | */ | 52 | */ |
53 | 53 | ||
54 | #include "wp_locl.h" | 54 | #include "wp_locl.h" |
55 | #include <openssl/crypto.h> | ||
55 | #include <string.h> | 56 | #include <string.h> |
56 | 57 | ||
57 | int WHIRLPOOL_Init (WHIRLPOOL_CTX *c) | 58 | fips_md_init(WHIRLPOOL) |
58 | { | 59 | { |
59 | memset (c,0,sizeof(*c)); | 60 | memset (c,0,sizeof(*c)); |
60 | return(1); | 61 | return(1); |
diff --git a/src/lib/libcrypto/x86cpuid.pl b/src/lib/libcrypto/x86cpuid.pl index a7464af19b..39fd8f2293 100644 --- a/src/lib/libcrypto/x86cpuid.pl +++ b/src/lib/libcrypto/x86cpuid.pl | |||
@@ -19,9 +19,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
19 | &pushf (); | 19 | &pushf (); |
20 | &pop ("eax"); | 20 | &pop ("eax"); |
21 | &xor ("ecx","eax"); | 21 | &xor ("ecx","eax"); |
22 | &bt ("ecx",21); | ||
23 | &jnc (&label("done")); | ||
24 | &xor ("eax","eax"); | 22 | &xor ("eax","eax"); |
23 | &bt ("ecx",21); | ||
24 | &jnc (&label("nocpuid")); | ||
25 | &cpuid (); | 25 | &cpuid (); |
26 | &mov ("edi","eax"); # max value for standard query level | 26 | &mov ("edi","eax"); # max value for standard query level |
27 | 27 | ||
@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
51 | # AMD specific | 51 | # AMD specific |
52 | &mov ("eax",0x80000000); | 52 | &mov ("eax",0x80000000); |
53 | &cpuid (); | 53 | &cpuid (); |
54 | &cmp ("eax",0x80000008); | 54 | &cmp ("eax",0x80000001); |
55 | &jb (&label("intel")); | ||
56 | &mov ("esi","eax"); | ||
57 | &mov ("eax",0x80000001); | ||
58 | &cpuid (); | ||
59 | &or ("ebp","ecx"); | ||
60 | &and ("ebp",1<<11|1); # isolate XOP bit | ||
61 | &cmp ("esi",0x80000008); | ||
55 | &jb (&label("intel")); | 62 | &jb (&label("intel")); |
56 | 63 | ||
57 | &mov ("eax",0x80000008); | 64 | &mov ("eax",0x80000008); |
@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
62 | &mov ("eax",1); | 69 | &mov ("eax",1); |
63 | &cpuid (); | 70 | &cpuid (); |
64 | &bt ("edx",28); | 71 | &bt ("edx",28); |
65 | &jnc (&label("done")); | 72 | &jnc (&label("generic")); |
66 | &shr ("ebx",16); | 73 | &shr ("ebx",16); |
67 | &and ("ebx",0xff); | 74 | &and ("ebx",0xff); |
68 | &cmp ("ebx","esi"); | 75 | &cmp ("ebx","esi"); |
69 | &ja (&label("done")); | 76 | &ja (&label("generic")); |
70 | &and ("edx",0xefffffff); # clear hyper-threading bit | 77 | &and ("edx",0xefffffff); # clear hyper-threading bit |
71 | &jmp (&label("done")); | 78 | &jmp (&label("generic")); |
72 | 79 | ||
73 | &set_label("intel"); | 80 | &set_label("intel"); |
74 | &cmp ("edi",4); | 81 | &cmp ("edi",4); |
@@ -85,27 +92,51 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
85 | &set_label("nocacheinfo"); | 92 | &set_label("nocacheinfo"); |
86 | &mov ("eax",1); | 93 | &mov ("eax",1); |
87 | &cpuid (); | 94 | &cpuid (); |
95 | &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0 | ||
88 | &cmp ("ebp",0); | 96 | &cmp ("ebp",0); |
89 | &jne (&label("notP4")); | 97 | &jne (&label("notintel")); |
98 | &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs | ||
90 | &and (&HB("eax"),15); # familiy ID | 99 | &and (&HB("eax"),15); # familiy ID |
91 | &cmp (&HB("eax"),15); # P4? | 100 | &cmp (&HB("eax"),15); # P4? |
92 | &jne (&label("notP4")); | 101 | &jne (&label("notintel")); |
93 | &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR | 102 | &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR |
94 | &set_label("notP4"); | 103 | &set_label("notintel"); |
95 | &bt ("edx",28); # test hyper-threading bit | 104 | &bt ("edx",28); # test hyper-threading bit |
96 | &jnc (&label("done")); | 105 | &jnc (&label("generic")); |
97 | &and ("edx",0xefffffff); | 106 | &and ("edx",0xefffffff); |
98 | &cmp ("edi",0); | 107 | &cmp ("edi",0); |
99 | &je (&label("done")); | 108 | &je (&label("generic")); |
100 | 109 | ||
101 | &or ("edx",0x10000000); | 110 | &or ("edx",0x10000000); |
102 | &shr ("ebx",16); | 111 | &shr ("ebx",16); |
103 | &cmp (&LB("ebx"),1); | 112 | &cmp (&LB("ebx"),1); |
104 | &ja (&label("done")); | 113 | &ja (&label("generic")); |
105 | &and ("edx",0xefffffff); # clear hyper-threading bit if not | 114 | &and ("edx",0xefffffff); # clear hyper-threading bit if not |
115 | |||
116 | &set_label("generic"); | ||
117 | &and ("ebp",1<<11); # isolate AMD XOP flag | ||
118 | &and ("ecx",0xfffff7ff); # force 11th bit to 0 | ||
119 | &mov ("esi","edx"); | ||
120 | &or ("ebp","ecx"); # merge AMD XOP flag | ||
121 | |||
122 | &bt ("ecx",27); # check OSXSAVE bit | ||
123 | &jnc (&label("clear_avx")); | ||
124 | &xor ("ecx","ecx"); | ||
125 | &data_byte(0x0f,0x01,0xd0); # xgetbv | ||
126 | &and ("eax",6); | ||
127 | &cmp ("eax",6); | ||
128 | &je (&label("done")); | ||
129 | &cmp ("eax",2); | ||
130 | &je (&label("clear_avx")); | ||
131 | &set_label("clear_xmm"); | ||
132 | &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits | ||
133 | &and ("esi",0xfeffffff); # clear FXSR | ||
134 | &set_label("clear_avx"); | ||
135 | &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits | ||
106 | &set_label("done"); | 136 | &set_label("done"); |
107 | &mov ("eax","edx"); | 137 | &mov ("eax","esi"); |
108 | &mov ("edx","ecx"); | 138 | &mov ("edx","ebp"); |
139 | &set_label("nocpuid"); | ||
109 | &function_end("OPENSSL_ia32_cpuid"); | 140 | &function_end("OPENSSL_ia32_cpuid"); |
110 | 141 | ||
111 | &external_label("OPENSSL_ia32cap_P"); | 142 | &external_label("OPENSSL_ia32cap_P"); |
@@ -199,8 +230,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
199 | &bt (&DWP(0,"ecx"),1); | 230 | &bt (&DWP(0,"ecx"),1); |
200 | &jnc (&label("no_x87")); | 231 | &jnc (&label("no_x87")); |
201 | if ($sse2) { | 232 | if ($sse2) { |
202 | &bt (&DWP(0,"ecx"),26); | 233 | &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits |
203 | &jnc (&label("no_sse2")); | 234 | &cmp ("ecx",1<<26|1<<24); |
235 | &jne (&label("no_sse2")); | ||
204 | &pxor ("xmm0","xmm0"); | 236 | &pxor ("xmm0","xmm0"); |
205 | &pxor ("xmm1","xmm1"); | 237 | &pxor ("xmm1","xmm1"); |
206 | &pxor ("xmm2","xmm2"); | 238 | &pxor ("xmm2","xmm2"); |
@@ -307,6 +339,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |||
307 | &ret (); | 339 | &ret (); |
308 | &function_end_B("OPENSSL_cleanse"); | 340 | &function_end_B("OPENSSL_cleanse"); |
309 | 341 | ||
342 | &function_begin_B("OPENSSL_ia32_rdrand"); | ||
343 | &mov ("ecx",8); | ||
344 | &set_label("loop"); | ||
345 | &rdrand ("eax"); | ||
346 | &jc (&label("break")); | ||
347 | &loop (&label("loop")); | ||
348 | &set_label("break"); | ||
349 | &cmp ("eax",0); | ||
350 | &cmove ("eax","ecx"); | ||
351 | &ret (); | ||
352 | &function_end_B("OPENSSL_ia32_rdrand"); | ||
353 | |||
310 | &initseg("OPENSSL_cpuid_setup"); | 354 | &initseg("OPENSSL_cpuid_setup"); |
311 | 355 | ||
312 | &asm_finish(); | 356 | &asm_finish(); |
diff --git a/src/lib/libssl/d1_both.c b/src/lib/libssl/d1_both.c index 9f898d6997..de8bab873f 100644 --- a/src/lib/libssl/d1_both.c +++ b/src/lib/libssl/d1_both.c | |||
@@ -227,14 +227,14 @@ int dtls1_do_write(SSL *s, int type) | |||
227 | unsigned int len, frag_off, mac_size, blocksize; | 227 | unsigned int len, frag_off, mac_size, blocksize; |
228 | 228 | ||
229 | /* AHA! Figure out the MTU, and stick to the right size */ | 229 | /* AHA! Figure out the MTU, and stick to the right size */ |
230 | if ( ! (SSL_get_options(s) & SSL_OP_NO_QUERY_MTU)) | 230 | if (s->d1->mtu < dtls1_min_mtu() && !(SSL_get_options(s) & SSL_OP_NO_QUERY_MTU)) |
231 | { | 231 | { |
232 | s->d1->mtu = | 232 | s->d1->mtu = |
233 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_QUERY_MTU, 0, NULL); | 233 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_QUERY_MTU, 0, NULL); |
234 | 234 | ||
235 | /* I've seen the kernel return bogus numbers when it doesn't know | 235 | /* I've seen the kernel return bogus numbers when it doesn't know |
236 | * (initial write), so just make sure we have a reasonable number */ | 236 | * (initial write), so just make sure we have a reasonable number */ |
237 | if ( s->d1->mtu < dtls1_min_mtu()) | 237 | if (s->d1->mtu < dtls1_min_mtu()) |
238 | { | 238 | { |
239 | s->d1->mtu = 0; | 239 | s->d1->mtu = 0; |
240 | s->d1->mtu = dtls1_guess_mtu(s->d1->mtu); | 240 | s->d1->mtu = dtls1_guess_mtu(s->d1->mtu); |
@@ -1084,7 +1084,11 @@ int dtls1_read_failed(SSL *s, int code) | |||
1084 | return code; | 1084 | return code; |
1085 | } | 1085 | } |
1086 | 1086 | ||
1087 | if ( ! SSL_in_init(s)) /* done, no need to send a retransmit */ | 1087 | #ifndef OPENSSL_NO_HEARTBEATS |
1088 | if (!SSL_in_init(s) && !s->tlsext_hb_pending) /* done, no need to send a retransmit */ | ||
1089 | #else | ||
1090 | if (!SSL_in_init(s)) /* done, no need to send a retransmit */ | ||
1091 | #endif | ||
1088 | { | 1092 | { |
1089 | BIO_set_flags(SSL_get_rbio(s), BIO_FLAGS_READ); | 1093 | BIO_set_flags(SSL_get_rbio(s), BIO_FLAGS_READ); |
1090 | return code; | 1094 | return code; |
@@ -1417,3 +1421,171 @@ dtls1_get_ccs_header(unsigned char *data, struct ccs_header_st *ccs_hdr) | |||
1417 | 1421 | ||
1418 | ccs_hdr->type = *(data++); | 1422 | ccs_hdr->type = *(data++); |
1419 | } | 1423 | } |
1424 | |||
1425 | int dtls1_shutdown(SSL *s) | ||
1426 | { | ||
1427 | int ret; | ||
1428 | #ifndef OPENSSL_NO_SCTP | ||
1429 | if (BIO_dgram_is_sctp(SSL_get_wbio(s)) && | ||
1430 | !(s->shutdown & SSL_SENT_SHUTDOWN)) | ||
1431 | { | ||
1432 | ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s)); | ||
1433 | if (ret < 0) return -1; | ||
1434 | |||
1435 | if (ret == 0) | ||
1436 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 1, NULL); | ||
1437 | } | ||
1438 | #endif | ||
1439 | ret = ssl3_shutdown(s); | ||
1440 | #ifndef OPENSSL_NO_SCTP | ||
1441 | BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 0, NULL); | ||
1442 | #endif | ||
1443 | return ret; | ||
1444 | } | ||
1445 | |||
1446 | #ifndef OPENSSL_NO_HEARTBEATS | ||
1447 | int | ||
1448 | dtls1_process_heartbeat(SSL *s) | ||
1449 | { | ||
1450 | unsigned char *p = &s->s3->rrec.data[0], *pl; | ||
1451 | unsigned short hbtype; | ||
1452 | unsigned int payload; | ||
1453 | unsigned int padding = 16; /* Use minimum padding */ | ||
1454 | |||
1455 | /* Read type and payload length first */ | ||
1456 | hbtype = *p++; | ||
1457 | n2s(p, payload); | ||
1458 | pl = p; | ||
1459 | |||
1460 | if (s->msg_callback) | ||
1461 | s->msg_callback(0, s->version, TLS1_RT_HEARTBEAT, | ||
1462 | &s->s3->rrec.data[0], s->s3->rrec.length, | ||
1463 | s, s->msg_callback_arg); | ||
1464 | |||
1465 | if (hbtype == TLS1_HB_REQUEST) | ||
1466 | { | ||
1467 | unsigned char *buffer, *bp; | ||
1468 | int r; | ||
1469 | |||
1470 | /* Allocate memory for the response, size is 1 byte | ||
1471 | * message type, plus 2 bytes payload length, plus | ||
1472 | * payload, plus padding | ||
1473 | */ | ||
1474 | buffer = OPENSSL_malloc(1 + 2 + payload + padding); | ||
1475 | bp = buffer; | ||
1476 | |||
1477 | /* Enter response type, length and copy payload */ | ||
1478 | *bp++ = TLS1_HB_RESPONSE; | ||
1479 | s2n(payload, bp); | ||
1480 | memcpy(bp, pl, payload); | ||
1481 | bp += payload; | ||
1482 | /* Random padding */ | ||
1483 | RAND_pseudo_bytes(bp, padding); | ||
1484 | |||
1485 | r = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buffer, 3 + payload + padding); | ||
1486 | |||
1487 | if (r >= 0 && s->msg_callback) | ||
1488 | s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, | ||
1489 | buffer, 3 + payload + padding, | ||
1490 | s, s->msg_callback_arg); | ||
1491 | |||
1492 | OPENSSL_free(buffer); | ||
1493 | |||
1494 | if (r < 0) | ||
1495 | return r; | ||
1496 | } | ||
1497 | else if (hbtype == TLS1_HB_RESPONSE) | ||
1498 | { | ||
1499 | unsigned int seq; | ||
1500 | |||
1501 | /* We only send sequence numbers (2 bytes unsigned int), | ||
1502 | * and 16 random bytes, so we just try to read the | ||
1503 | * sequence number */ | ||
1504 | n2s(pl, seq); | ||
1505 | |||
1506 | if (payload == 18 && seq == s->tlsext_hb_seq) | ||
1507 | { | ||
1508 | dtls1_stop_timer(s); | ||
1509 | s->tlsext_hb_seq++; | ||
1510 | s->tlsext_hb_pending = 0; | ||
1511 | } | ||
1512 | } | ||
1513 | |||
1514 | return 0; | ||
1515 | } | ||
1516 | |||
1517 | int | ||
1518 | dtls1_heartbeat(SSL *s) | ||
1519 | { | ||
1520 | unsigned char *buf, *p; | ||
1521 | int ret; | ||
1522 | unsigned int payload = 18; /* Sequence number + random bytes */ | ||
1523 | unsigned int padding = 16; /* Use minimum padding */ | ||
1524 | |||
1525 | /* Only send if peer supports and accepts HB requests... */ | ||
1526 | if (!(s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) || | ||
1527 | s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_SEND_REQUESTS) | ||
1528 | { | ||
1529 | SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT); | ||
1530 | return -1; | ||
1531 | } | ||
1532 | |||
1533 | /* ...and there is none in flight yet... */ | ||
1534 | if (s->tlsext_hb_pending) | ||
1535 | { | ||
1536 | SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PENDING); | ||
1537 | return -1; | ||
1538 | } | ||
1539 | |||
1540 | /* ...and no handshake in progress. */ | ||
1541 | if (SSL_in_init(s) || s->in_handshake) | ||
1542 | { | ||
1543 | SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_UNEXPECTED_MESSAGE); | ||
1544 | return -1; | ||
1545 | } | ||
1546 | |||
1547 | /* Check if padding is too long, payload and padding | ||
1548 | * must not exceed 2^14 - 3 = 16381 bytes in total. | ||
1549 | */ | ||
1550 | OPENSSL_assert(payload + padding <= 16381); | ||
1551 | |||
1552 | /* Create HeartBeat message, we just use a sequence number | ||
1553 | * as payload to distuingish different messages and add | ||
1554 | * some random stuff. | ||
1555 | * - Message Type, 1 byte | ||
1556 | * - Payload Length, 2 bytes (unsigned int) | ||
1557 | * - Payload, the sequence number (2 bytes uint) | ||
1558 | * - Payload, random bytes (16 bytes uint) | ||
1559 | * - Padding | ||
1560 | */ | ||
1561 | buf = OPENSSL_malloc(1 + 2 + payload + padding); | ||
1562 | p = buf; | ||
1563 | /* Message Type */ | ||
1564 | *p++ = TLS1_HB_REQUEST; | ||
1565 | /* Payload length (18 bytes here) */ | ||
1566 | s2n(payload, p); | ||
1567 | /* Sequence number */ | ||
1568 | s2n(s->tlsext_hb_seq, p); | ||
1569 | /* 16 random bytes */ | ||
1570 | RAND_pseudo_bytes(p, 16); | ||
1571 | p += 16; | ||
1572 | /* Random padding */ | ||
1573 | RAND_pseudo_bytes(p, padding); | ||
1574 | |||
1575 | ret = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buf, 3 + payload + padding); | ||
1576 | if (ret >= 0) | ||
1577 | { | ||
1578 | if (s->msg_callback) | ||
1579 | s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, | ||
1580 | buf, 3 + payload + padding, | ||
1581 | s, s->msg_callback_arg); | ||
1582 | |||
1583 | dtls1_start_timer(s); | ||
1584 | s->tlsext_hb_pending = 1; | ||
1585 | } | ||
1586 | |||
1587 | OPENSSL_free(buf); | ||
1588 | |||
1589 | return ret; | ||
1590 | } | ||
1591 | #endif | ||
diff --git a/src/lib/libssl/d1_enc.c b/src/lib/libssl/d1_enc.c index becbab91c2..07a5e97ce5 100644 --- a/src/lib/libssl/d1_enc.c +++ b/src/lib/libssl/d1_enc.c | |||
@@ -260,7 +260,7 @@ int dtls1_enc(SSL *s, int send) | |||
260 | } | 260 | } |
261 | /* TLS 1.0 does not bound the number of padding bytes by the block size. | 261 | /* TLS 1.0 does not bound the number of padding bytes by the block size. |
262 | * All of them must have value 'padding_length'. */ | 262 | * All of them must have value 'padding_length'. */ |
263 | if (i > (int)rec->length) | 263 | if (i + bs > (int)rec->length) |
264 | { | 264 | { |
265 | /* Incorrect padding. SSLerr() and ssl3_alert are done | 265 | /* Incorrect padding. SSLerr() and ssl3_alert are done |
266 | * by caller: we don't want to reveal whether this is | 266 | * by caller: we don't want to reveal whether this is |
diff --git a/src/lib/libssl/d1_lib.c b/src/lib/libssl/d1_lib.c index c3b77c889b..f61f718183 100644 --- a/src/lib/libssl/d1_lib.c +++ b/src/lib/libssl/d1_lib.c | |||
@@ -82,6 +82,7 @@ SSL3_ENC_METHOD DTLSv1_enc_data={ | |||
82 | TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE, | 82 | TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE, |
83 | TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE, | 83 | TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE, |
84 | tls1_alert_code, | 84 | tls1_alert_code, |
85 | tls1_export_keying_material, | ||
85 | }; | 86 | }; |
86 | 87 | ||
87 | long dtls1_default_timeout(void) | 88 | long dtls1_default_timeout(void) |
@@ -291,6 +292,15 @@ const SSL_CIPHER *dtls1_get_cipher(unsigned int u) | |||
291 | 292 | ||
292 | void dtls1_start_timer(SSL *s) | 293 | void dtls1_start_timer(SSL *s) |
293 | { | 294 | { |
295 | #ifndef OPENSSL_NO_SCTP | ||
296 | /* Disable timer for SCTP */ | ||
297 | if (BIO_dgram_is_sctp(SSL_get_wbio(s))) | ||
298 | { | ||
299 | memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); | ||
300 | return; | ||
301 | } | ||
302 | #endif | ||
303 | |||
294 | /* If timer is not set, initialize duration with 1 second */ | 304 | /* If timer is not set, initialize duration with 1 second */ |
295 | if (s->d1->next_timeout.tv_sec == 0 && s->d1->next_timeout.tv_usec == 0) | 305 | if (s->d1->next_timeout.tv_sec == 0 && s->d1->next_timeout.tv_usec == 0) |
296 | { | 306 | { |
@@ -381,6 +391,7 @@ void dtls1_double_timeout(SSL *s) | |||
381 | void dtls1_stop_timer(SSL *s) | 391 | void dtls1_stop_timer(SSL *s) |
382 | { | 392 | { |
383 | /* Reset everything */ | 393 | /* Reset everything */ |
394 | memset(&(s->d1->timeout), 0, sizeof(struct dtls1_timeout_st)); | ||
384 | memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); | 395 | memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); |
385 | s->d1->timeout_duration = 1; | 396 | s->d1->timeout_duration = 1; |
386 | BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT, 0, &(s->d1->next_timeout)); | 397 | BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT, 0, &(s->d1->next_timeout)); |
@@ -388,10 +399,28 @@ void dtls1_stop_timer(SSL *s) | |||
388 | dtls1_clear_record_buffer(s); | 399 | dtls1_clear_record_buffer(s); |
389 | } | 400 | } |
390 | 401 | ||
391 | int dtls1_handle_timeout(SSL *s) | 402 | int dtls1_check_timeout_num(SSL *s) |
392 | { | 403 | { |
393 | DTLS1_STATE *state; | 404 | s->d1->timeout.num_alerts++; |
405 | |||
406 | /* Reduce MTU after 2 unsuccessful retransmissions */ | ||
407 | if (s->d1->timeout.num_alerts > 2) | ||
408 | { | ||
409 | s->d1->mtu = BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_GET_FALLBACK_MTU, 0, NULL); | ||
410 | } | ||
394 | 411 | ||
412 | if (s->d1->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT) | ||
413 | { | ||
414 | /* fail the connection, enough alerts have been sent */ | ||
415 | SSLerr(SSL_F_DTLS1_CHECK_TIMEOUT_NUM,SSL_R_READ_TIMEOUT_EXPIRED); | ||
416 | return -1; | ||
417 | } | ||
418 | |||
419 | return 0; | ||
420 | } | ||
421 | |||
422 | int dtls1_handle_timeout(SSL *s) | ||
423 | { | ||
395 | /* if no timer is expired, don't do anything */ | 424 | /* if no timer is expired, don't do anything */ |
396 | if (!dtls1_is_timer_expired(s)) | 425 | if (!dtls1_is_timer_expired(s)) |
397 | { | 426 | { |
@@ -399,20 +428,23 @@ int dtls1_handle_timeout(SSL *s) | |||
399 | } | 428 | } |
400 | 429 | ||
401 | dtls1_double_timeout(s); | 430 | dtls1_double_timeout(s); |
402 | state = s->d1; | 431 | |
403 | state->timeout.num_alerts++; | 432 | if (dtls1_check_timeout_num(s) < 0) |
404 | if ( state->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT) | ||
405 | { | ||
406 | /* fail the connection, enough alerts have been sent */ | ||
407 | SSLerr(SSL_F_DTLS1_HANDLE_TIMEOUT,SSL_R_READ_TIMEOUT_EXPIRED); | ||
408 | return -1; | 433 | return -1; |
434 | |||
435 | s->d1->timeout.read_timeouts++; | ||
436 | if (s->d1->timeout.read_timeouts > DTLS1_TMO_READ_COUNT) | ||
437 | { | ||
438 | s->d1->timeout.read_timeouts = 1; | ||
409 | } | 439 | } |
410 | 440 | ||
411 | state->timeout.read_timeouts++; | 441 | #ifndef OPENSSL_NO_HEARTBEATS |
412 | if ( state->timeout.read_timeouts > DTLS1_TMO_READ_COUNT) | 442 | if (s->tlsext_hb_pending) |
413 | { | 443 | { |
414 | state->timeout.read_timeouts = 1; | 444 | s->tlsext_hb_pending = 0; |
445 | return dtls1_heartbeat(s); | ||
415 | } | 446 | } |
447 | #endif | ||
416 | 448 | ||
417 | dtls1_start_timer(s); | 449 | dtls1_start_timer(s); |
418 | return dtls1_retransmit_buffered_messages(s); | 450 | return dtls1_retransmit_buffered_messages(s); |
diff --git a/src/lib/libssl/d1_srtp.c b/src/lib/libssl/d1_srtp.c new file mode 100644 index 0000000000..928935bd8b --- /dev/null +++ b/src/lib/libssl/d1_srtp.c | |||
@@ -0,0 +1,493 @@ | |||
1 | /* ssl/t1_lib.c */ | ||
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * This package is an SSL implementation written | ||
6 | * by Eric Young (eay@cryptsoft.com). | ||
7 | * The implementation was written so as to conform with Netscapes SSL. | ||
8 | * | ||
9 | * This library is free for commercial and non-commercial use as long as | ||
10 | * the following conditions are aheared to. The following conditions | ||
11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
13 | * included with this distribution is covered by the same copyright terms | ||
14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
15 | * | ||
16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
17 | * the code are not to be removed. | ||
18 | * If this package is used in a product, Eric Young should be given attribution | ||
19 | * as the author of the parts of the library used. | ||
20 | * This can be in the form of a textual message at program startup or | ||
21 | * in documentation (online or textual) provided with the package. | ||
22 | * | ||
23 | * Redistribution and use in source and binary forms, with or without | ||
24 | * modification, are permitted provided that the following conditions | ||
25 | * are met: | ||
26 | * 1. Redistributions of source code must retain the copyright | ||
27 | * notice, this list of conditions and the following disclaimer. | ||
28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
29 | * notice, this list of conditions and the following disclaimer in the | ||
30 | * documentation and/or other materials provided with the distribution. | ||
31 | * 3. All advertising materials mentioning features or use of this software | ||
32 | * must display the following acknowledgement: | ||
33 | * "This product includes cryptographic software written by | ||
34 | * Eric Young (eay@cryptsoft.com)" | ||
35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
36 | * being used are not cryptographic related :-). | ||
37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
38 | * the apps directory (application code) you must include an acknowledgement: | ||
39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
40 | * | ||
41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
51 | * SUCH DAMAGE. | ||
52 | * | ||
53 | * The licence and distribution terms for any publically available version or | ||
54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
55 | * copied and put under another distribution licence | ||
56 | * [including the GNU Public Licence.] | ||
57 | */ | ||
58 | /* ==================================================================== | ||
59 | * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. | ||
60 | * | ||
61 | * Redistribution and use in source and binary forms, with or without | ||
62 | * modification, are permitted provided that the following conditions | ||
63 | * are met: | ||
64 | * | ||
65 | * 1. Redistributions of source code must retain the above copyright | ||
66 | * notice, this list of conditions and the following disclaimer. | ||
67 | * | ||
68 | * 2. Redistributions in binary form must reproduce the above copyright | ||
69 | * notice, this list of conditions and the following disclaimer in | ||
70 | * the documentation and/or other materials provided with the | ||
71 | * distribution. | ||
72 | * | ||
73 | * 3. All advertising materials mentioning features or use of this | ||
74 | * software must display the following acknowledgment: | ||
75 | * "This product includes software developed by the OpenSSL Project | ||
76 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
77 | * | ||
78 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
79 | * endorse or promote products derived from this software without | ||
80 | * prior written permission. For written permission, please contact | ||
81 | * openssl-core@openssl.org. | ||
82 | * | ||
83 | * 5. Products derived from this software may not be called "OpenSSL" | ||
84 | * nor may "OpenSSL" appear in their names without prior written | ||
85 | * permission of the OpenSSL Project. | ||
86 | * | ||
87 | * 6. Redistributions of any form whatsoever must retain the following | ||
88 | * acknowledgment: | ||
89 | * "This product includes software developed by the OpenSSL Project | ||
90 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
91 | * | ||
92 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
93 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
94 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
95 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
96 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
97 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
98 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
99 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
100 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
101 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
102 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
103 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
104 | * ==================================================================== | ||
105 | * | ||
106 | * This product includes cryptographic software written by Eric Young | ||
107 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
108 | * Hudson (tjh@cryptsoft.com). | ||
109 | * | ||
110 | */ | ||
111 | /* | ||
112 | DTLS code by Eric Rescorla <ekr@rtfm.com> | ||
113 | |||
114 | Copyright (C) 2006, Network Resonance, Inc. | ||
115 | Copyright (C) 2011, RTFM, Inc. | ||
116 | */ | ||
117 | |||
118 | #ifndef OPENSSL_NO_SRTP | ||
119 | |||
120 | #include <stdio.h> | ||
121 | #include <openssl/objects.h> | ||
122 | #include "ssl_locl.h" | ||
123 | #include "srtp.h" | ||
124 | |||
125 | |||
126 | static SRTP_PROTECTION_PROFILE srtp_known_profiles[]= | ||
127 | { | ||
128 | { | ||
129 | "SRTP_AES128_CM_SHA1_80", | ||
130 | SRTP_AES128_CM_SHA1_80, | ||
131 | }, | ||
132 | { | ||
133 | "SRTP_AES128_CM_SHA1_32", | ||
134 | SRTP_AES128_CM_SHA1_32, | ||
135 | }, | ||
136 | #if 0 | ||
137 | { | ||
138 | "SRTP_NULL_SHA1_80", | ||
139 | SRTP_NULL_SHA1_80, | ||
140 | }, | ||
141 | { | ||
142 | "SRTP_NULL_SHA1_32", | ||
143 | SRTP_NULL_SHA1_32, | ||
144 | }, | ||
145 | #endif | ||
146 | {0} | ||
147 | }; | ||
148 | |||
149 | static int find_profile_by_name(char *profile_name, | ||
150 | SRTP_PROTECTION_PROFILE **pptr,unsigned len) | ||
151 | { | ||
152 | SRTP_PROTECTION_PROFILE *p; | ||
153 | |||
154 | p=srtp_known_profiles; | ||
155 | while(p->name) | ||
156 | { | ||
157 | if((len == strlen(p->name)) && !strncmp(p->name,profile_name, | ||
158 | len)) | ||
159 | { | ||
160 | *pptr=p; | ||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | p++; | ||
165 | } | ||
166 | |||
167 | return 1; | ||
168 | } | ||
169 | |||
170 | static int find_profile_by_num(unsigned profile_num, | ||
171 | SRTP_PROTECTION_PROFILE **pptr) | ||
172 | { | ||
173 | SRTP_PROTECTION_PROFILE *p; | ||
174 | |||
175 | p=srtp_known_profiles; | ||
176 | while(p->name) | ||
177 | { | ||
178 | if(p->id == profile_num) | ||
179 | { | ||
180 | *pptr=p; | ||
181 | return 0; | ||
182 | } | ||
183 | p++; | ||
184 | } | ||
185 | |||
186 | return 1; | ||
187 | } | ||
188 | |||
189 | static int ssl_ctx_make_profiles(const char *profiles_string,STACK_OF(SRTP_PROTECTION_PROFILE) **out) | ||
190 | { | ||
191 | STACK_OF(SRTP_PROTECTION_PROFILE) *profiles; | ||
192 | |||
193 | char *col; | ||
194 | char *ptr=(char *)profiles_string; | ||
195 | |||
196 | SRTP_PROTECTION_PROFILE *p; | ||
197 | |||
198 | if(!(profiles=sk_SRTP_PROTECTION_PROFILE_new_null())) | ||
199 | { | ||
200 | SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES, SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES); | ||
201 | return 1; | ||
202 | } | ||
203 | |||
204 | do | ||
205 | { | ||
206 | col=strchr(ptr,':'); | ||
207 | |||
208 | if(!find_profile_by_name(ptr,&p, | ||
209 | col ? col-ptr : (int)strlen(ptr))) | ||
210 | { | ||
211 | sk_SRTP_PROTECTION_PROFILE_push(profiles,p); | ||
212 | } | ||
213 | else | ||
214 | { | ||
215 | SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES,SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE); | ||
216 | return 1; | ||
217 | } | ||
218 | |||
219 | if(col) ptr=col+1; | ||
220 | } while (col); | ||
221 | |||
222 | *out=profiles; | ||
223 | |||
224 | return 0; | ||
225 | } | ||
226 | |||
227 | int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx,const char *profiles) | ||
228 | { | ||
229 | return ssl_ctx_make_profiles(profiles,&ctx->srtp_profiles); | ||
230 | } | ||
231 | |||
232 | int SSL_set_tlsext_use_srtp(SSL *s,const char *profiles) | ||
233 | { | ||
234 | return ssl_ctx_make_profiles(profiles,&s->srtp_profiles); | ||
235 | } | ||
236 | |||
237 | |||
238 | STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *s) | ||
239 | { | ||
240 | if(s != NULL) | ||
241 | { | ||
242 | if(s->srtp_profiles != NULL) | ||
243 | { | ||
244 | return s->srtp_profiles; | ||
245 | } | ||
246 | else if((s->ctx != NULL) && | ||
247 | (s->ctx->srtp_profiles != NULL)) | ||
248 | { | ||
249 | return s->ctx->srtp_profiles; | ||
250 | } | ||
251 | } | ||
252 | |||
253 | return NULL; | ||
254 | } | ||
255 | |||
256 | SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s) | ||
257 | { | ||
258 | return s->srtp_profile; | ||
259 | } | ||
260 | |||
261 | /* Note: this function returns 0 length if there are no | ||
262 | profiles specified */ | ||
263 | int ssl_add_clienthello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen) | ||
264 | { | ||
265 | int ct=0; | ||
266 | int i; | ||
267 | STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0; | ||
268 | SRTP_PROTECTION_PROFILE *prof; | ||
269 | |||
270 | clnt=SSL_get_srtp_profiles(s); | ||
271 | ct=sk_SRTP_PROTECTION_PROFILE_num(clnt); /* -1 if clnt == 0 */ | ||
272 | |||
273 | if(p) | ||
274 | { | ||
275 | if(ct==0) | ||
276 | { | ||
277 | SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST); | ||
278 | return 1; | ||
279 | } | ||
280 | |||
281 | if((2 + ct*2 + 1) > maxlen) | ||
282 | { | ||
283 | SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG); | ||
284 | return 1; | ||
285 | } | ||
286 | |||
287 | /* Add the length */ | ||
288 | s2n(ct * 2, p); | ||
289 | for(i=0;i<ct;i++) | ||
290 | { | ||
291 | prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i); | ||
292 | s2n(prof->id,p); | ||
293 | } | ||
294 | |||
295 | /* Add an empty use_mki value */ | ||
296 | *p++ = 0; | ||
297 | } | ||
298 | |||
299 | *len=2 + ct*2 + 1; | ||
300 | |||
301 | return 0; | ||
302 | } | ||
303 | |||
304 | |||
305 | int ssl_parse_clienthello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al) | ||
306 | { | ||
307 | SRTP_PROTECTION_PROFILE *cprof,*sprof; | ||
308 | STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0,*srvr; | ||
309 | int ct; | ||
310 | int mki_len; | ||
311 | int i,j; | ||
312 | int id; | ||
313 | int ret; | ||
314 | |||
315 | /* Length value + the MKI length */ | ||
316 | if(len < 3) | ||
317 | { | ||
318 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
319 | *al=SSL_AD_DECODE_ERROR; | ||
320 | return 1; | ||
321 | } | ||
322 | |||
323 | /* Pull off the length of the cipher suite list */ | ||
324 | n2s(d, ct); | ||
325 | len -= 2; | ||
326 | |||
327 | /* Check that it is even */ | ||
328 | if(ct%2) | ||
329 | { | ||
330 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
331 | *al=SSL_AD_DECODE_ERROR; | ||
332 | return 1; | ||
333 | } | ||
334 | |||
335 | /* Check that lengths are consistent */ | ||
336 | if(len < (ct + 1)) | ||
337 | { | ||
338 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
339 | *al=SSL_AD_DECODE_ERROR; | ||
340 | return 1; | ||
341 | } | ||
342 | |||
343 | |||
344 | clnt=sk_SRTP_PROTECTION_PROFILE_new_null(); | ||
345 | |||
346 | while(ct) | ||
347 | { | ||
348 | n2s(d,id); | ||
349 | ct-=2; | ||
350 | len-=2; | ||
351 | |||
352 | if(!find_profile_by_num(id,&cprof)) | ||
353 | { | ||
354 | sk_SRTP_PROTECTION_PROFILE_push(clnt,cprof); | ||
355 | } | ||
356 | else | ||
357 | { | ||
358 | ; /* Ignore */ | ||
359 | } | ||
360 | } | ||
361 | |||
362 | /* Now extract the MKI value as a sanity check, but discard it for now */ | ||
363 | mki_len = *d; | ||
364 | d++; len--; | ||
365 | |||
366 | if (mki_len != len) | ||
367 | { | ||
368 | SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE); | ||
369 | *al=SSL_AD_DECODE_ERROR; | ||
370 | return 1; | ||
371 | } | ||
372 | |||
373 | srvr=SSL_get_srtp_profiles(s); | ||
374 | |||
375 | /* Pick our most preferred profile. If no profiles have been | ||
376 | configured then the outer loop doesn't run | ||
377 | (sk_SRTP_PROTECTION_PROFILE_num() = -1) | ||
378 | and so we just return without doing anything */ | ||
379 | for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(srvr);i++) | ||
380 | { | ||
381 | sprof=sk_SRTP_PROTECTION_PROFILE_value(srvr,i); | ||
382 | |||
383 | for(j=0;j<sk_SRTP_PROTECTION_PROFILE_num(clnt);j++) | ||
384 | { | ||
385 | cprof=sk_SRTP_PROTECTION_PROFILE_value(clnt,j); | ||
386 | |||
387 | if(cprof->id==sprof->id) | ||
388 | { | ||
389 | s->srtp_profile=sprof; | ||
390 | *al=0; | ||
391 | ret=0; | ||
392 | goto done; | ||
393 | } | ||
394 | } | ||
395 | } | ||
396 | |||
397 | ret=0; | ||
398 | |||
399 | done: | ||
400 | if(clnt) sk_SRTP_PROTECTION_PROFILE_free(clnt); | ||
401 | |||
402 | return ret; | ||
403 | } | ||
404 | |||
405 | int ssl_add_serverhello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen) | ||
406 | { | ||
407 | if(p) | ||
408 | { | ||
409 | if(maxlen < 5) | ||
410 | { | ||
411 | SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG); | ||
412 | return 1; | ||
413 | } | ||
414 | |||
415 | if(s->srtp_profile==0) | ||
416 | { | ||
417 | SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_USE_SRTP_NOT_NEGOTIATED); | ||
418 | return 1; | ||
419 | } | ||
420 | s2n(2, p); | ||
421 | s2n(s->srtp_profile->id,p); | ||
422 | *p++ = 0; | ||
423 | } | ||
424 | *len=5; | ||
425 | |||
426 | return 0; | ||
427 | } | ||
428 | |||
429 | |||
430 | int ssl_parse_serverhello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al) | ||
431 | { | ||
432 | unsigned id; | ||
433 | int i; | ||
434 | int ct; | ||
435 | |||
436 | STACK_OF(SRTP_PROTECTION_PROFILE) *clnt; | ||
437 | SRTP_PROTECTION_PROFILE *prof; | ||
438 | |||
439 | if(len!=5) | ||
440 | { | ||
441 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
442 | *al=SSL_AD_DECODE_ERROR; | ||
443 | return 1; | ||
444 | } | ||
445 | |||
446 | n2s(d, ct); | ||
447 | if(ct!=2) | ||
448 | { | ||
449 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
450 | *al=SSL_AD_DECODE_ERROR; | ||
451 | return 1; | ||
452 | } | ||
453 | |||
454 | n2s(d,id); | ||
455 | if (*d) /* Must be no MKI, since we never offer one */ | ||
456 | { | ||
457 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE); | ||
458 | *al=SSL_AD_ILLEGAL_PARAMETER; | ||
459 | return 1; | ||
460 | } | ||
461 | |||
462 | clnt=SSL_get_srtp_profiles(s); | ||
463 | |||
464 | /* Throw an error if the server gave us an unsolicited extension */ | ||
465 | if (clnt == NULL) | ||
466 | { | ||
467 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_NO_SRTP_PROFILES); | ||
468 | *al=SSL_AD_DECODE_ERROR; | ||
469 | return 1; | ||
470 | } | ||
471 | |||
472 | /* Check to see if the server gave us something we support | ||
473 | (and presumably offered) | ||
474 | */ | ||
475 | for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(clnt);i++) | ||
476 | { | ||
477 | prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i); | ||
478 | |||
479 | if(prof->id == id) | ||
480 | { | ||
481 | s->srtp_profile=prof; | ||
482 | *al=0; | ||
483 | return 0; | ||
484 | } | ||
485 | } | ||
486 | |||
487 | SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); | ||
488 | *al=SSL_AD_DECODE_ERROR; | ||
489 | return 1; | ||
490 | } | ||
491 | |||
492 | |||
493 | #endif | ||
diff --git a/src/lib/libssl/srtp.h b/src/lib/libssl/srtp.h new file mode 100644 index 0000000000..c0cf33ef28 --- /dev/null +++ b/src/lib/libssl/srtp.h | |||
@@ -0,0 +1,145 @@ | |||
1 | /* ssl/tls1.h */ | ||
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | ||
3 | * All rights reserved. | ||
4 | * | ||
5 | * This package is an SSL implementation written | ||
6 | * by Eric Young (eay@cryptsoft.com). | ||
7 | * The implementation was written so as to conform with Netscapes SSL. | ||
8 | * | ||
9 | * This library is free for commercial and non-commercial use as long as | ||
10 | * the following conditions are aheared to. The following conditions | ||
11 | * apply to all code found in this distribution, be it the RC4, RSA, | ||
12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | ||
13 | * included with this distribution is covered by the same copyright terms | ||
14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | ||
15 | * | ||
16 | * Copyright remains Eric Young's, and as such any Copyright notices in | ||
17 | * the code are not to be removed. | ||
18 | * If this package is used in a product, Eric Young should be given attribution | ||
19 | * as the author of the parts of the library used. | ||
20 | * This can be in the form of a textual message at program startup or | ||
21 | * in documentation (online or textual) provided with the package. | ||
22 | * | ||
23 | * Redistribution and use in source and binary forms, with or without | ||
24 | * modification, are permitted provided that the following conditions | ||
25 | * are met: | ||
26 | * 1. Redistributions of source code must retain the copyright | ||
27 | * notice, this list of conditions and the following disclaimer. | ||
28 | * 2. Redistributions in binary form must reproduce the above copyright | ||
29 | * notice, this list of conditions and the following disclaimer in the | ||
30 | * documentation and/or other materials provided with the distribution. | ||
31 | * 3. All advertising materials mentioning features or use of this software | ||
32 | * must display the following acknowledgement: | ||
33 | * "This product includes cryptographic software written by | ||
34 | * Eric Young (eay@cryptsoft.com)" | ||
35 | * The word 'cryptographic' can be left out if the rouines from the library | ||
36 | * being used are not cryptographic related :-). | ||
37 | * 4. If you include any Windows specific code (or a derivative thereof) from | ||
38 | * the apps directory (application code) you must include an acknowledgement: | ||
39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | ||
40 | * | ||
41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND | ||
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
51 | * SUCH DAMAGE. | ||
52 | * | ||
53 | * The licence and distribution terms for any publically available version or | ||
54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | ||
55 | * copied and put under another distribution licence | ||
56 | * [including the GNU Public Licence.] | ||
57 | */ | ||
58 | /* ==================================================================== | ||
59 | * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. | ||
60 | * | ||
61 | * Redistribution and use in source and binary forms, with or without | ||
62 | * modification, are permitted provided that the following conditions | ||
63 | * are met: | ||
64 | * | ||
65 | * 1. Redistributions of source code must retain the above copyright | ||
66 | * notice, this list of conditions and the following disclaimer. | ||
67 | * | ||
68 | * 2. Redistributions in binary form must reproduce the above copyright | ||
69 | * notice, this list of conditions and the following disclaimer in | ||
70 | * the documentation and/or other materials provided with the | ||
71 | * distribution. | ||
72 | * | ||
73 | * 3. All advertising materials mentioning features or use of this | ||
74 | * software must display the following acknowledgment: | ||
75 | * "This product includes software developed by the OpenSSL Project | ||
76 | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" | ||
77 | * | ||
78 | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | ||
79 | * endorse or promote products derived from this software without | ||
80 | * prior written permission. For written permission, please contact | ||
81 | * openssl-core@openssl.org. | ||
82 | * | ||
83 | * 5. Products derived from this software may not be called "OpenSSL" | ||
84 | * nor may "OpenSSL" appear in their names without prior written | ||
85 | * permission of the OpenSSL Project. | ||
86 | * | ||
87 | * 6. Redistributions of any form whatsoever must retain the following | ||
88 | * acknowledgment: | ||
89 | * "This product includes software developed by the OpenSSL Project | ||
90 | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" | ||
91 | * | ||
92 | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | ||
93 | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
94 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
95 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | ||
96 | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
97 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
98 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
99 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
100 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
101 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
102 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
103 | * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
104 | * ==================================================================== | ||
105 | * | ||
106 | * This product includes cryptographic software written by Eric Young | ||
107 | * (eay@cryptsoft.com). This product includes software written by Tim | ||
108 | * Hudson (tjh@cryptsoft.com). | ||
109 | * | ||
110 | */ | ||
111 | /* | ||
112 | DTLS code by Eric Rescorla <ekr@rtfm.com> | ||
113 | |||
114 | Copyright (C) 2006, Network Resonance, Inc. | ||
115 | Copyright (C) 2011, RTFM, Inc. | ||
116 | */ | ||
117 | |||
118 | #ifndef HEADER_D1_SRTP_H | ||
119 | #define HEADER_D1_SRTP_H | ||
120 | |||
121 | #ifdef __cplusplus | ||
122 | extern "C" { | ||
123 | #endif | ||
124 | |||
125 | |||
126 | #define SRTP_AES128_CM_SHA1_80 0x0001 | ||
127 | #define SRTP_AES128_CM_SHA1_32 0x0002 | ||
128 | #define SRTP_AES128_F8_SHA1_80 0x0003 | ||
129 | #define SRTP_AES128_F8_SHA1_32 0x0004 | ||
130 | #define SRTP_NULL_SHA1_80 0x0005 | ||
131 | #define SRTP_NULL_SHA1_32 0x0006 | ||
132 | |||
133 | int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx, const char *profiles); | ||
134 | int SSL_set_tlsext_use_srtp(SSL *ctx, const char *profiles); | ||
135 | SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s); | ||
136 | |||
137 | STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *ssl); | ||
138 | SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s); | ||
139 | |||
140 | #ifdef __cplusplus | ||
141 | } | ||
142 | #endif | ||
143 | |||
144 | #endif | ||
145 | |||
diff --git a/src/lib/libssl/test/P1ss.cnf b/src/lib/libssl/test/P1ss.cnf index 876a0d35f8..326cce2ba8 100644 --- a/src/lib/libssl/test/P1ss.cnf +++ b/src/lib/libssl/test/P1ss.cnf | |||
@@ -7,7 +7,7 @@ RANDFILE = ./.rnd | |||
7 | 7 | ||
8 | #################################################################### | 8 | #################################################################### |
9 | [ req ] | 9 | [ req ] |
10 | default_bits = 512 | 10 | default_bits = 1024 |
11 | default_keyfile = keySS.pem | 11 | default_keyfile = keySS.pem |
12 | distinguished_name = req_distinguished_name | 12 | distinguished_name = req_distinguished_name |
13 | encrypt_rsa_key = no | 13 | encrypt_rsa_key = no |
diff --git a/src/lib/libssl/test/P2ss.cnf b/src/lib/libssl/test/P2ss.cnf index 373a87e7c2..8b502321b8 100644 --- a/src/lib/libssl/test/P2ss.cnf +++ b/src/lib/libssl/test/P2ss.cnf | |||
@@ -7,7 +7,7 @@ RANDFILE = ./.rnd | |||
7 | 7 | ||
8 | #################################################################### | 8 | #################################################################### |
9 | [ req ] | 9 | [ req ] |
10 | default_bits = 512 | 10 | default_bits = 1024 |
11 | default_keyfile = keySS.pem | 11 | default_keyfile = keySS.pem |
12 | distinguished_name = req_distinguished_name | 12 | distinguished_name = req_distinguished_name |
13 | encrypt_rsa_key = no | 13 | encrypt_rsa_key = no |
diff --git a/src/lib/libssl/test/pkits-test.pl b/src/lib/libssl/test/pkits-test.pl index 69dffa16f9..5c6b89fcdb 100644 --- a/src/lib/libssl/test/pkits-test.pl +++ b/src/lib/libssl/test/pkits-test.pl | |||
@@ -784,6 +784,15 @@ my $ossl = "ossl/apps/openssl"; | |||
784 | 784 | ||
785 | my $ossl_cmd = "$ossl_path cms -verify -verify_retcode "; | 785 | my $ossl_cmd = "$ossl_path cms -verify -verify_retcode "; |
786 | $ossl_cmd .= "-CAfile pkitsta.pem -crl_check_all -x509_strict "; | 786 | $ossl_cmd .= "-CAfile pkitsta.pem -crl_check_all -x509_strict "; |
787 | |||
788 | # Check for expiry of trust anchor | ||
789 | system "$ossl_path x509 -inform DER -in $pkitsta -checkend 0"; | ||
790 | if ($? == 256) | ||
791 | { | ||
792 | print STDERR "WARNING: using older expired data\n"; | ||
793 | $ossl_cmd .= "-attime 1291940972 "; | ||
794 | } | ||
795 | |||
787 | $ossl_cmd .= "-policy_check -extended_crl -use_deltas -out /dev/null 2>&1 "; | 796 | $ossl_cmd .= "-policy_check -extended_crl -use_deltas -out /dev/null 2>&1 "; |
788 | 797 | ||
789 | system "$ossl_path x509 -inform DER -in $pkitsta -out pkitsta.pem"; | 798 | system "$ossl_path x509 -inform DER -in $pkitsta -out pkitsta.pem"; |
diff --git a/src/lib/libssl/test/test.cnf b/src/lib/libssl/test/test.cnf index faad3914a8..10834442a1 100644 --- a/src/lib/libssl/test/test.cnf +++ b/src/lib/libssl/test/test.cnf | |||
@@ -56,7 +56,7 @@ emailAddress = optional | |||
56 | 56 | ||
57 | #################################################################### | 57 | #################################################################### |
58 | [ req ] | 58 | [ req ] |
59 | default_bits = 512 | 59 | default_bits = 1024 |
60 | default_keyfile = testkey.pem | 60 | default_keyfile = testkey.pem |
61 | distinguished_name = req_distinguished_name | 61 | distinguished_name = req_distinguished_name |
62 | encrypt_rsa_key = no | 62 | encrypt_rsa_key = no |