summary refs log tree commit diff
path: root/src/lib/libcrypto/bn/bn_asm.c
diff options
context:
space:
mode:
author jsing <> 2023-01-20 17:31:52 +0000
committer jsing <> 2023-01-20 17:31:52 +0000
commit ec907bb8e44028294d6c2a6faf9c735ce8012e48 (patch)
tree b7a2361e00d87650a48e90b0530c6a86d27e0039 /src/lib/libcrypto/bn/bn_asm.c
parent a50b434b87829ee0d12767c21ae98194684ab720 (diff)
download openbsd-ec907bb8e44028294d6c2a6faf9c735ce8012e48.tar.gz
openbsd-ec907bb8e44028294d6c2a6faf9c735ce8012e48.tar.bz2
openbsd-ec907bb8e44028294d6c2a6faf9c735ce8012e48.zip
Move bn_{mul,sqr}_comba{4,8}() from bn_asm.c to bn_mul.c/bn_sqr.c.
Wrap these in HAVE_BN_{MUL,SQR}_COMBA{4,8} defines. Add these defines to bn_arch.h where the architecture currently provides its own version. ok tb@
Diffstat (limited to 'src/lib/libcrypto/bn/bn_asm.c')
-rw-r--r-- src/lib/libcrypto/bn/bn_asm.c 300
1 files changed, 1 insertions, 299 deletions
diff --git a/src/lib/libcrypto/bn/bn_asm.c b/src/lib/libcrypto/bn/bn_asm.c
index 84063486b3..df4ddaea17 100644
--- a/src/lib/libcrypto/bn/bn_asm.c
+++ b/src/lib/libcrypto/bn/bn_asm.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_asm.c,v 1.18 2023/01/20 17:26:03 jsing Exp $ */ 1/* $OpenBSD: bn_asm.c,v 1.19 2023/01/20 17:31:52 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved. 3 * All rights reserved.
4 * 4 *
@@ -479,265 +479,6 @@ bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
479 479
480#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) 480#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
481 481
482#undef bn_mul_comba8
483#undef bn_mul_comba4
484#undef bn_sqr_comba8
485#undef bn_sqr_comba4
486
487void
488bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
489{
490 BN_ULONG c1, c2, c3;
491
492 c1 = 0;
493 c2 = 0;
494 c3 = 0;
495 mul_add_c(a[0], b[0], c1, c2, c3);
496 r[0] = c1;
497 c1 = 0;
498 mul_add_c(a[0], b[1], c2, c3, c1);
499 mul_add_c(a[1], b[0], c2, c3, c1);
500 r[1] = c2;
501 c2 = 0;
502 mul_add_c(a[2], b[0], c3, c1, c2);
503 mul_add_c(a[1], b[1], c3, c1, c2);
504 mul_add_c(a[0], b[2], c3, c1, c2);
505 r[2] = c3;
506 c3 = 0;
507 mul_add_c(a[0], b[3], c1, c2, c3);
508 mul_add_c(a[1], b[2], c1, c2, c3);
509 mul_add_c(a[2], b[1], c1, c2, c3);
510 mul_add_c(a[3], b[0], c1, c2, c3);
511 r[3] = c1;
512 c1 = 0;
513 mul_add_c(a[4], b[0], c2, c3, c1);
514 mul_add_c(a[3], b[1], c2, c3, c1);
515 mul_add_c(a[2], b[2], c2, c3, c1);
516 mul_add_c(a[1], b[3], c2, c3, c1);
517 mul_add_c(a[0], b[4], c2, c3, c1);
518 r[4] = c2;
519 c2 = 0;
520 mul_add_c(a[0], b[5], c3, c1, c2);
521 mul_add_c(a[1], b[4], c3, c1, c2);
522 mul_add_c(a[2], b[3], c3, c1, c2);
523 mul_add_c(a[3], b[2], c3, c1, c2);
524 mul_add_c(a[4], b[1], c3, c1, c2);
525 mul_add_c(a[5], b[0], c3, c1, c2);
526 r[5] = c3;
527 c3 = 0;
528 mul_add_c(a[6], b[0], c1, c2, c3);
529 mul_add_c(a[5], b[1], c1, c2, c3);
530 mul_add_c(a[4], b[2], c1, c2, c3);
531 mul_add_c(a[3], b[3], c1, c2, c3);
532 mul_add_c(a[2], b[4], c1, c2, c3);
533 mul_add_c(a[1], b[5], c1, c2, c3);
534 mul_add_c(a[0], b[6], c1, c2, c3);
535 r[6] = c1;
536 c1 = 0;
537 mul_add_c(a[0], b[7], c2, c3, c1);
538 mul_add_c(a[1], b[6], c2, c3, c1);
539 mul_add_c(a[2], b[5], c2, c3, c1);
540 mul_add_c(a[3], b[4], c2, c3, c1);
541 mul_add_c(a[4], b[3], c2, c3, c1);
542 mul_add_c(a[5], b[2], c2, c3, c1);
543 mul_add_c(a[6], b[1], c2, c3, c1);
544 mul_add_c(a[7], b[0], c2, c3, c1);
545 r[7] = c2;
546 c2 = 0;
547 mul_add_c(a[7], b[1], c3, c1, c2);
548 mul_add_c(a[6], b[2], c3, c1, c2);
549 mul_add_c(a[5], b[3], c3, c1, c2);
550 mul_add_c(a[4], b[4], c3, c1, c2);
551 mul_add_c(a[3], b[5], c3, c1, c2);
552 mul_add_c(a[2], b[6], c3, c1, c2);
553 mul_add_c(a[1], b[7], c3, c1, c2);
554 r[8] = c3;
555 c3 = 0;
556 mul_add_c(a[2], b[7], c1, c2, c3);
557 mul_add_c(a[3], b[6], c1, c2, c3);
558 mul_add_c(a[4], b[5], c1, c2, c3);
559 mul_add_c(a[5], b[4], c1, c2, c3);
560 mul_add_c(a[6], b[3], c1, c2, c3);
561 mul_add_c(a[7], b[2], c1, c2, c3);
562 r[9] = c1;
563 c1 = 0;
564 mul_add_c(a[7], b[3], c2, c3, c1);
565 mul_add_c(a[6], b[4], c2, c3, c1);
566 mul_add_c(a[5], b[5], c2, c3, c1);
567 mul_add_c(a[4], b[6], c2, c3, c1);
568 mul_add_c(a[3], b[7], c2, c3, c1);
569 r[10] = c2;
570 c2 = 0;
571 mul_add_c(a[4], b[7], c3, c1, c2);
572 mul_add_c(a[5], b[6], c3, c1, c2);
573 mul_add_c(a[6], b[5], c3, c1, c2);
574 mul_add_c(a[7], b[4], c3, c1, c2);
575 r[11] = c3;
576 c3 = 0;
577 mul_add_c(a[7], b[5], c1, c2, c3);
578 mul_add_c(a[6], b[6], c1, c2, c3);
579 mul_add_c(a[5], b[7], c1, c2, c3);
580 r[12] = c1;
581 c1 = 0;
582 mul_add_c(a[6], b[7], c2, c3, c1);
583 mul_add_c(a[7], b[6], c2, c3, c1);
584 r[13] = c2;
585 c2 = 0;
586 mul_add_c(a[7], b[7], c3, c1, c2);
587 r[14] = c3;
588 r[15] = c1;
589}
590
591void
592bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
593{
594 BN_ULONG c1, c2, c3;
595
596 c1 = 0;
597 c2 = 0;
598 c3 = 0;
599 mul_add_c(a[0], b[0], c1, c2, c3);
600 r[0] = c1;
601 c1 = 0;
602 mul_add_c(a[0], b[1], c2, c3, c1);
603 mul_add_c(a[1], b[0], c2, c3, c1);
604 r[1] = c2;
605 c2 = 0;
606 mul_add_c(a[2], b[0], c3, c1, c2);
607 mul_add_c(a[1], b[1], c3, c1, c2);
608 mul_add_c(a[0], b[2], c3, c1, c2);
609 r[2] = c3;
610 c3 = 0;
611 mul_add_c(a[0], b[3], c1, c2, c3);
612 mul_add_c(a[1], b[2], c1, c2, c3);
613 mul_add_c(a[2], b[1], c1, c2, c3);
614 mul_add_c(a[3], b[0], c1, c2, c3);
615 r[3] = c1;
616 c1 = 0;
617 mul_add_c(a[3], b[1], c2, c3, c1);
618 mul_add_c(a[2], b[2], c2, c3, c1);
619 mul_add_c(a[1], b[3], c2, c3, c1);
620 r[4] = c2;
621 c2 = 0;
622 mul_add_c(a[2], b[3], c3, c1, c2);
623 mul_add_c(a[3], b[2], c3, c1, c2);
624 r[5] = c3;
625 c3 = 0;
626 mul_add_c(a[3], b[3], c1, c2, c3);
627 r[6] = c1;
628 r[7] = c2;
629}
630
631void
632bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
633{
634 BN_ULONG c1, c2, c3;
635
636 c1 = 0;
637 c2 = 0;
638 c3 = 0;
639 sqr_add_c(a, 0, c1, c2, c3);
640 r[0] = c1;
641 c1 = 0;
642 sqr_add_c2(a, 1, 0, c2, c3, c1);
643 r[1] = c2;
644 c2 = 0;
645 sqr_add_c(a, 1, c3, c1, c2);
646 sqr_add_c2(a, 2, 0, c3, c1, c2);
647 r[2] = c3;
648 c3 = 0;
649 sqr_add_c2(a, 3, 0, c1, c2, c3);
650 sqr_add_c2(a, 2, 1, c1, c2, c3);
651 r[3] = c1;
652 c1 = 0;
653 sqr_add_c(a, 2, c2, c3, c1);
654 sqr_add_c2(a, 3, 1, c2, c3, c1);
655 sqr_add_c2(a, 4, 0, c2, c3, c1);
656 r[4] = c2;
657 c2 = 0;
658 sqr_add_c2(a, 5, 0, c3, c1, c2);
659 sqr_add_c2(a, 4, 1, c3, c1, c2);
660 sqr_add_c2(a, 3, 2, c3, c1, c2);
661 r[5] = c3;
662 c3 = 0;
663 sqr_add_c(a, 3, c1, c2, c3);
664 sqr_add_c2(a, 4, 2, c1, c2, c3);
665 sqr_add_c2(a, 5, 1, c1, c2, c3);
666 sqr_add_c2(a, 6, 0, c1, c2, c3);
667 r[6] = c1;
668 c1 = 0;
669 sqr_add_c2(a, 7, 0, c2, c3, c1);
670 sqr_add_c2(a, 6, 1, c2, c3, c1);
671 sqr_add_c2(a, 5, 2, c2, c3, c1);
672 sqr_add_c2(a, 4, 3, c2, c3, c1);
673 r[7] = c2;
674 c2 = 0;
675 sqr_add_c(a, 4, c3, c1, c2);
676 sqr_add_c2(a, 5, 3, c3, c1, c2);
677 sqr_add_c2(a, 6, 2, c3, c1, c2);
678 sqr_add_c2(a, 7, 1, c3, c1, c2);
679 r[8] = c3;
680 c3 = 0;
681 sqr_add_c2(a, 7, 2, c1, c2, c3);
682 sqr_add_c2(a, 6, 3, c1, c2, c3);
683 sqr_add_c2(a, 5, 4, c1, c2, c3);
684 r[9] = c1;
685 c1 = 0;
686 sqr_add_c(a, 5, c2, c3, c1);
687 sqr_add_c2(a, 6, 4, c2, c3, c1);
688 sqr_add_c2(a, 7, 3, c2, c3, c1);
689 r[10] = c2;
690 c2 = 0;
691 sqr_add_c2(a, 7, 4, c3, c1, c2);
692 sqr_add_c2(a, 6, 5, c3, c1, c2);
693 r[11] = c3;
694 c3 = 0;
695 sqr_add_c(a, 6, c1, c2, c3);
696 sqr_add_c2(a, 7, 5, c1, c2, c3);
697 r[12] = c1;
698 c1 = 0;
699 sqr_add_c2(a, 7, 6, c2, c3, c1);
700 r[13] = c2;
701 c2 = 0;
702 sqr_add_c(a, 7, c3, c1, c2);
703 r[14] = c3;
704 r[15] = c1;
705}
706
707void
708bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
709{
710 BN_ULONG c1, c2, c3;
711
712 c1 = 0;
713 c2 = 0;
714 c3 = 0;
715 sqr_add_c(a, 0, c1, c2, c3);
716 r[0] = c1;
717 c1 = 0;
718 sqr_add_c2(a, 1, 0, c2, c3, c1);
719 r[1] = c2;
720 c2 = 0;
721 sqr_add_c(a, 1, c3, c1, c2);
722 sqr_add_c2(a, 2, 0, c3, c1, c2);
723 r[2] = c3;
724 c3 = 0;
725 sqr_add_c2(a, 3, 0, c1, c2, c3);
726 sqr_add_c2(a, 2, 1, c1, c2, c3);
727 r[3] = c1;
728 c1 = 0;
729 sqr_add_c(a, 2, c2, c3, c1);
730 sqr_add_c2(a, 3, 1, c2, c3, c1);
731 r[4] = c2;
732 c2 = 0;
733 sqr_add_c2(a, 3, 2, c3, c1, c2);
734 r[5] = c3;
735 c3 = 0;
736 sqr_add_c(a, 3, c1, c2, c3);
737 r[6] = c1;
738 r[7] = c2;
739}
740
741#ifdef OPENSSL_NO_ASM 482#ifdef OPENSSL_NO_ASM
742#ifdef OPENSSL_BN_ASM_MONT 483#ifdef OPENSSL_BN_ASM_MONT
743/* 484/*
@@ -853,45 +594,6 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
853 594
854#else /* !BN_MUL_COMBA */ 595#else /* !BN_MUL_COMBA */
855 596
856/* hmm... is it faster just to do a multiply? */
857#undef bn_sqr_comba4
858void
859bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
860{
861 BN_ULONG t[8];
862 bn_sqr_normal(r, a, 4, t);
863}
864
865#undef bn_sqr_comba8
866void
867bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
868{
869 BN_ULONG t[16];
870 bn_sqr_normal(r, a, 8, t);
871}
872
873void
874bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
875{
876 r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
877 r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
878 r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
879 r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
880}
881
882void
883bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
884{
885 r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
886 r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
887 r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
888 r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
889 r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
890 r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
891 r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
892 r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
893}
894
895#ifdef OPENSSL_NO_ASM 597#ifdef OPENSSL_NO_ASM
896#ifdef OPENSSL_BN_ASM_MONT 598#ifdef OPENSSL_BN_ASM_MONT
897int 599int