diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2021-04-27 13:09:44 +0200 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2021-04-27 13:09:44 +0200 |
commit | 646e85629497ee364e97101de4402d7141919144 (patch) | |
tree | 86913653cb8f3552f96061fa24360b6265bdd25a | |
parent | 48a18d15dfbfa41d137802e811a3abdfef012ac8 (diff) | |
download | busybox-w32-646e85629497ee364e97101de4402d7141919144.tar.gz busybox-w32-646e85629497ee364e97101de4402d7141919144.tar.bz2 busybox-w32-646e85629497ee364e97101de4402d7141919144.zip |
tls: shrink sp_256_mod_mul_norm_10
function old new delta
sp_256_mod_mul_norm_10 1439 1405 -34
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | networking/tls_sp_c32.c | 145 | ||||
-rw-r--r-- | networking/tls_sp_c32.patch | 142 |
2 files changed, 85 insertions, 202 deletions
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c index f9c66b186..c5e887aad 100644 --- a/networking/tls_sp_c32.c +++ b/networking/tls_sp_c32.c | |||
@@ -460,51 +460,90 @@ static void sp_256_mont_inv_10(sp_digit* r, sp_digit* a) | |||
460 | static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a) | 460 | static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a) |
461 | { | 461 | { |
462 | int64_t t[8]; | 462 | int64_t t[8]; |
463 | int64_t a32[8]; | ||
464 | int64_t o; | 463 | int64_t o; |
465 | 464 | uint32_t a32; | |
466 | a32[0] = a[0]; | ||
467 | a32[0] |= a[1] << 26; | ||
468 | a32[0] &= 0xffffffff; | ||
469 | a32[1] = (sp_digit)(a[1] >> 6); | ||
470 | a32[1] |= a[2] << 20; | ||
471 | a32[1] &= 0xffffffff; | ||
472 | a32[2] = (sp_digit)(a[2] >> 12); | ||
473 | a32[2] |= a[3] << 14; | ||
474 | a32[2] &= 0xffffffff; | ||
475 | a32[3] = (sp_digit)(a[3] >> 18); | ||
476 | a32[3] |= a[4] << 8; | ||
477 | a32[3] &= 0xffffffff; | ||
478 | a32[4] = (sp_digit)(a[4] >> 24); | ||
479 | a32[4] |= a[5] << 2; | ||
480 | a32[4] |= a[6] << 28; | ||
481 | a32[4] &= 0xffffffff; | ||
482 | a32[5] = (sp_digit)(a[6] >> 4); | ||
483 | a32[5] |= a[7] << 22; | ||
484 | a32[5] &= 0xffffffff; | ||
485 | a32[6] = (sp_digit)(a[7] >> 10); | ||
486 | a32[6] |= a[8] << 16; | ||
487 | a32[6] &= 0xffffffff; | ||
488 | a32[7] = (sp_digit)(a[8] >> 16); | ||
489 | a32[7] |= a[9] << 10; | ||
490 | a32[7] &= 0xffffffff; | ||
491 | 465 | ||
492 | /* 1 1 0 -1 -1 -1 -1 0 */ | 466 | /* 1 1 0 -1 -1 -1 -1 0 */ |
493 | t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6]; | ||
494 | /* 0 1 1 0 -1 -1 -1 -1 */ | 467 | /* 0 1 1 0 -1 -1 -1 -1 */ |
495 | t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7]; | ||
496 | /* 0 0 1 1 0 -1 -1 -1 */ | 468 | /* 0 0 1 1 0 -1 -1 -1 */ |
497 | t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7]; | ||
498 | /* -1 -1 0 2 2 1 0 -1 */ | 469 | /* -1 -1 0 2 2 1 0 -1 */ |
499 | t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7]; | ||
500 | /* 0 -1 -1 0 2 2 1 0 */ | 470 | /* 0 -1 -1 0 2 2 1 0 */ |
501 | t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6]; | ||
502 | /* 0 0 -1 -1 0 2 2 1 */ | 471 | /* 0 0 -1 -1 0 2 2 1 */ |
503 | t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7]; | ||
504 | /* -1 -1 0 0 0 1 3 2 */ | 472 | /* -1 -1 0 0 0 1 3 2 */ |
505 | t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7]; | ||
506 | /* 1 0 -1 -1 -1 -1 0 3 */ | 473 | /* 1 0 -1 -1 -1 -1 0 3 */ |
507 | t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7]; | 474 | // t[] should be calculated from "a" (converted from 26-bit to 32-bit vector a32[8]) |
475 | // according to the above matrix: | ||
476 | //t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6] ; | ||
477 | //t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7] ; | ||
478 | //t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7] ; | ||
479 | //t[3] = 0 - a32[0] - a32[1] + 2*a32[3] + 2*a32[4] + a32[5] - a32[7] ; | ||
480 | //t[4] = 0 - a32[1] - a32[2] + 2*a32[4] + 2*a32[5] + a32[6] ; | ||
481 | //t[5] = 0 - a32[2] - a32[3] + 2*a32[5] + 2*a32[6] + a32[7] ; | ||
482 | //t[6] = 0 - a32[0] - a32[1] + a32[5] + 3*a32[6] + 2*a32[7]; | ||
483 | //t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3*a32[7]; | ||
484 | // We can do it "piecemeal" after each a32[i] is known, no need to store entire a32[8] vector: | ||
485 | |||
486 | #define A32 (int64_t)a32 | ||
487 | a32 = a[0] | (a[1] << 26); | ||
488 | t[0] = 0 + A32; | ||
489 | t[3] = 0 - A32; | ||
490 | t[6] = 0 - A32; | ||
491 | t[7] = 0 + A32; | ||
492 | |||
493 | a32 = (a[1] >> 6) | (a[2] << 20); | ||
494 | t[0] += A32 ; | ||
495 | t[1] = 0 + A32; | ||
496 | t[3] -= A32 ; | ||
497 | t[4] = 0 - A32; | ||
498 | t[6] -= A32 ; | ||
499 | |||
500 | a32 = (a[2] >> 12) | (a[3] << 14); | ||
501 | t[1] += A32 ; | ||
502 | t[2] = 0 + A32; | ||
503 | t[4] -= A32 ; | ||
504 | t[5] = 0 - A32; | ||
505 | t[7] -= A32 ; | ||
506 | |||
507 | a32 = (a[3] >> 18) | (a[4] << 8); | ||
508 | t[0] -= A32 ; | ||
509 | t[2] += A32 ; | ||
510 | t[3] += 2*A32; | ||
511 | t[5] -= A32 ; | ||
512 | t[7] -= A32 ; | ||
513 | |||
514 | a32 = (a[4] >> 24) | (a[5] << 2) | (a[6] << 28); | ||
515 | t[0] -= A32 ; | ||
516 | t[1] -= A32 ; | ||
517 | t[3] += 2*A32; | ||
518 | t[4] += 2*A32; | ||
519 | t[7] -= A32 ; | ||
520 | |||
521 | a32 = (a[6] >> 4) | (a[7] << 22); | ||
522 | t[0] -= A32 ; | ||
523 | t[1] -= A32 ; | ||
524 | t[2] -= A32 ; | ||
525 | t[3] += A32 ; | ||
526 | t[4] += 2*A32; | ||
527 | t[5] += 2*A32; | ||
528 | t[6] += A32 ; | ||
529 | t[7] -= A32 ; | ||
530 | |||
531 | a32 = (a[7] >> 10) | (a[8] << 16); | ||
532 | t[0] -= A32 ; | ||
533 | t[1] -= A32 ; | ||
534 | t[2] -= A32 ; | ||
535 | t[4] += A32 ; | ||
536 | t[5] += 2*A32; | ||
537 | t[6] += 3*A32; | ||
538 | |||
539 | a32 = (a[8] >> 16) | (a[9] << 10); | ||
540 | t[1] -= A32 ; | ||
541 | t[2] -= A32 ; | ||
542 | t[3] -= A32 ; | ||
543 | t[5] += A32 ; | ||
544 | t[6] += 2*A32; | ||
545 | t[7] += 3*A32; | ||
546 | #undef A32 | ||
508 | 547 | ||
509 | t[1] += t[0] >> 32; t[0] &= 0xffffffff; | 548 | t[1] += t[0] >> 32; t[0] &= 0xffffffff; |
510 | t[2] += t[1] >> 32; t[1] &= 0xffffffff; | 549 | t[2] += t[1] >> 32; t[1] &= 0xffffffff; |
@@ -526,30 +565,16 @@ static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a) | |||
526 | t[6] += t[5] >> 32; t[5] &= 0xffffffff; | 565 | t[6] += t[5] >> 32; t[5] &= 0xffffffff; |
527 | t[7] += t[6] >> 32; t[6] &= 0xffffffff; | 566 | t[7] += t[6] >> 32; t[6] &= 0xffffffff; |
528 | 567 | ||
529 | r[0] = (sp_digit)(t[0]) & 0x3ffffff; | 568 | r[0] = 0x3ffffff & ((sp_digit)(t[0])); |
530 | r[1] = (sp_digit)(t[0] >> 26); | 569 | r[1] = 0x3ffffff & ((sp_digit)(t[0] >> 26) | ((sp_digit)t[1] << 6)); |
531 | r[1] |= t[1] << 6; | 570 | r[2] = 0x3ffffff & ((sp_digit)(t[1] >> 20) | ((sp_digit)t[2] << 12)); |
532 | r[1] &= 0x3ffffff; | 571 | r[3] = 0x3ffffff & ((sp_digit)(t[2] >> 14) | ((sp_digit)t[3] << 18)); |
533 | r[2] = (sp_digit)(t[1] >> 20); | 572 | r[4] = 0x3ffffff & ((sp_digit)(t[3] >> 8) | ((sp_digit)t[4] << 24)); |
534 | r[2] |= t[2] << 12; | 573 | r[5] = 0x3ffffff & ((sp_digit)t[4] >> 2); /* small shift, ok to cast t[4] to narrower type */ |
535 | r[2] &= 0x3ffffff; | 574 | r[6] = 0x3ffffff & ((sp_digit)(t[4] >> 28) | ((sp_digit)t[5] << 4)); |
536 | r[3] = (sp_digit)(t[2] >> 14); | 575 | r[7] = 0x3ffffff & ((sp_digit)(t[5] >> 22) | ((sp_digit)t[6] << 10)); |
537 | r[3] |= t[3] << 18; | 576 | r[8] = 0x3ffffff & ((sp_digit)(t[6] >> 16) | ((sp_digit)t[7] << 16)); |
538 | r[3] &= 0x3ffffff; | 577 | r[9] = ((sp_digit)(t[7] >> 10)); |
539 | r[4] = (sp_digit)(t[3] >> 8); | ||
540 | r[4] |= t[4] << 24; | ||
541 | r[4] &= 0x3ffffff; | ||
542 | r[5] = (sp_digit)(t[4] >> 2) & 0x3ffffff; | ||
543 | r[6] = (sp_digit)(t[4] >> 28); | ||
544 | r[6] |= t[5] << 4; | ||
545 | r[6] &= 0x3ffffff; | ||
546 | r[7] = (sp_digit)(t[5] >> 22); | ||
547 | r[7] |= t[6] << 10; | ||
548 | r[7] &= 0x3ffffff; | ||
549 | r[8] = (sp_digit)(t[6] >> 16); | ||
550 | r[8] |= t[7] << 16; | ||
551 | r[8] &= 0x3ffffff; | ||
552 | r[9] = (sp_digit)(t[7] >> 10); | ||
553 | } | 578 | } |
554 | 579 | ||
555 | /* Map the Montgomery form projective co-ordinate point to an affine point. | 580 | /* Map the Montgomery form projective co-ordinate point to an affine point. |
@@ -795,7 +820,7 @@ static void sp_256_ecc_mulmod_base_10(sp_point* r, sp_digit* k /*, int map*/) | |||
795 | 0x6b,0x17,0xd1,0xf2,0xe1,0x2c,0x42,0x47,0xf8,0xbc,0xe6,0xe5,0x63,0xa4,0x40,0xf2,0x77,0x03,0x7d,0x81,0x2d,0xeb,0x33,0xa0,0xf4,0xa1,0x39,0x45,0xd8,0x98,0xc2,0x96, | 820 | 0x6b,0x17,0xd1,0xf2,0xe1,0x2c,0x42,0x47,0xf8,0xbc,0xe6,0xe5,0x63,0xa4,0x40,0xf2,0x77,0x03,0x7d,0x81,0x2d,0xeb,0x33,0xa0,0xf4,0xa1,0x39,0x45,0xd8,0x98,0xc2,0x96, |
796 | /* y */ | 821 | /* y */ |
797 | 0x4f,0xe3,0x42,0xe2,0xfe,0x1a,0x7f,0x9b,0x8e,0xe7,0xeb,0x4a,0x7c,0x0f,0x9e,0x16,0x2b,0xce,0x33,0x57,0x6b,0x31,0x5e,0xce,0xcb,0xb6,0x40,0x68,0x37,0xbf,0x51,0xf5, | 822 | 0x4f,0xe3,0x42,0xe2,0xfe,0x1a,0x7f,0x9b,0x8e,0xe7,0xeb,0x4a,0x7c,0x0f,0x9e,0x16,0x2b,0xce,0x33,0x57,0x6b,0x31,0x5e,0xce,0xcb,0xb6,0x40,0x68,0x37,0xbf,0x51,0xf5, |
798 | /* z will be set to 0, infinity flag to "false" */ | 823 | /* z will be set to 1, infinity flag to "false" */ |
799 | }; | 824 | }; |
800 | sp_point p256_base; | 825 | sp_point p256_base; |
801 | 826 | ||
diff --git a/networking/tls_sp_c32.patch b/networking/tls_sp_c32.patch deleted file mode 100644 index 7559586c9..000000000 --- a/networking/tls_sp_c32.patch +++ /dev/null | |||
@@ -1,142 +0,0 @@ | |||
1 | Somehow, gcc 6+ performs this optimization as well as or better than the | ||
2 | hand-written optimized code below (gcc seems to eliminate the a32[] array and to use 32-bit | ||
3 | registers/memory for the "lower halves" of the a32[i] elements). | ||
4 | |||
5 | But might there be architectures where gcc won't do this as well? | ||
6 | |||
7 | diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c | ||
8 | index 72a3be537..e8a011ad1 100644 | ||
9 | --- a/networking/tls_sp_c32.c | ||
10 | +++ b/networking/tls_sp_c32.c | ||
11 | @@ -228,51 +228,96 @@ static void sp_256_rshift1_10(sp_digit* r, sp_digit* a) | ||
12 | static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a) | ||
13 | { | ||
14 | int64_t t[8]; | ||
15 | - int64_t a32[8]; | ||
16 | + uint32_t a32; | ||
17 | int64_t o; | ||
18 | |||
19 | - a32[0] = a[0]; | ||
20 | - a32[0] |= a[1] << 26; | ||
21 | - a32[0] &= 0xffffffff; | ||
22 | - a32[1] = (sp_digit)(a[1] >> 6); | ||
23 | - a32[1] |= a[2] << 20; | ||
24 | - a32[1] &= 0xffffffff; | ||
25 | - a32[2] = (sp_digit)(a[2] >> 12); | ||
26 | - a32[2] |= a[3] << 14; | ||
27 | - a32[2] &= 0xffffffff; | ||
28 | - a32[3] = (sp_digit)(a[3] >> 18); | ||
29 | - a32[3] |= a[4] << 8; | ||
30 | - a32[3] &= 0xffffffff; | ||
31 | - a32[4] = (sp_digit)(a[4] >> 24); | ||
32 | - a32[4] |= a[5] << 2; | ||
33 | - a32[4] |= a[6] << 28; | ||
34 | - a32[4] &= 0xffffffff; | ||
35 | - a32[5] = (sp_digit)(a[6] >> 4); | ||
36 | - a32[5] |= a[7] << 22; | ||
37 | - a32[5] &= 0xffffffff; | ||
38 | - a32[6] = (sp_digit)(a[7] >> 10); | ||
39 | - a32[6] |= a[8] << 16; | ||
40 | - a32[6] &= 0xffffffff; | ||
41 | - a32[7] = (sp_digit)(a[8] >> 16); | ||
42 | - a32[7] |= a[9] << 10; | ||
43 | - a32[7] &= 0xffffffff; | ||
44 | - | ||
45 | /* 1 1 0 -1 -1 -1 -1 0 */ | ||
46 | - t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6]; | ||
47 | /* 0 1 1 0 -1 -1 -1 -1 */ | ||
48 | - t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7]; | ||
49 | /* 0 0 1 1 0 -1 -1 -1 */ | ||
50 | - t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7]; | ||
51 | /* -1 -1 0 2 2 1 0 -1 */ | ||
52 | - t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7]; | ||
53 | /* 0 -1 -1 0 2 2 1 0 */ | ||
54 | - t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6]; | ||
55 | /* 0 0 -1 -1 0 2 2 1 */ | ||
56 | - t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7]; | ||
57 | /* -1 -1 0 0 0 1 3 2 */ | ||
58 | - t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7]; | ||
59 | /* 1 0 -1 -1 -1 -1 0 3 */ | ||
60 | - t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7]; | ||
61 | + //t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6] ; | ||
62 | + //t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7] ; | ||
63 | + //t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7] ; | ||
64 | + //t[3] = 0 - a32[0] - a32[1] + 2*a32[3] + 2*a32[4] + a32[5] - a32[7] ; | ||
65 | + //t[4] = 0 - a32[1] - a32[2] + 2*a32[4] + 2*a32[5] + a32[6] ; | ||
66 | + //t[5] = 0 - a32[2] - a32[3] + 2*a32[5] + 2*a32[6] + a32[7] ; | ||
67 | + //t[6] = 0 - a32[0] - a32[1] + a32[5] + 3*a32[6] + 2*a32[7]; | ||
68 | + //t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3*a32[7]; | ||
69 | + | ||
70 | +#define A32 (int64_t)a32 | ||
71 | + a32 = a[0]; | ||
72 | + a32 |= a[1] << 26; | ||
73 | + t[0] = 0 + A32; | ||
74 | + t[3] = 0 - A32; | ||
75 | + t[6] = 0 - A32; | ||
76 | + t[7] = 0 + A32; | ||
77 | + | ||
78 | + a32 = (sp_digit)(a[1] >> 6); | ||
79 | + a32 |= a[2] << 20; | ||
80 | + t[0] += A32 ; | ||
81 | + t[1] = 0 + A32; | ||
82 | + t[3] -= A32 ; | ||
83 | + t[4] = 0 - A32; | ||
84 | + t[6] -= A32 ; | ||
85 | + | ||
86 | + a32 = (sp_digit)(a[2] >> 12); | ||
87 | + a32 |= a[3] << 14; | ||
88 | + t[1] += A32 ; | ||
89 | + t[2] = 0 + A32; | ||
90 | + t[4] -= A32 ; | ||
91 | + t[5] = 0 - A32; | ||
92 | + t[7] -= A32 ; | ||
93 | + | ||
94 | + a32 = (sp_digit)(a[3] >> 18); | ||
95 | + a32 |= a[4] << 8; | ||
96 | + t[0] -= A32 ; | ||
97 | + t[2] += A32 ; | ||
98 | + t[3] += 2*A32; | ||
99 | + t[5] -= A32 ; | ||
100 | + t[7] -= A32 ; | ||
101 | + | ||
102 | + a32 = (sp_digit)(a[4] >> 24); | ||
103 | + a32 |= a[5] << 2; | ||
104 | + a32 |= a[6] << 28; | ||
105 | + t[0] -= A32 ; | ||
106 | + t[1] -= A32 ; | ||
107 | + t[3] += 2*A32; | ||
108 | + t[4] += 2*A32; | ||
109 | + t[7] -= A32 ; | ||
110 | + | ||
111 | + a32 = (sp_digit)(a[6] >> 4); | ||
112 | + a32 |= a[7] << 22; | ||
113 | + t[0] -= A32 ; | ||
114 | + t[1] -= A32 ; | ||
115 | + t[2] -= A32 ; | ||
116 | + t[3] += A32 ; | ||
117 | + t[4] += 2*A32; | ||
118 | + t[5] += 2*A32; | ||
119 | + t[6] += A32 ; | ||
120 | + t[7] -= A32 ; | ||
121 | + | ||
122 | + a32 = (sp_digit)(a[7] >> 10); | ||
123 | + a32 |= a[8] << 16; | ||
124 | + t[0] -= A32 ; | ||
125 | + t[1] -= A32 ; | ||
126 | + t[2] -= A32 ; | ||
127 | + t[4] += A32 ; | ||
128 | + t[5] += 2*A32; | ||
129 | + t[6] += 3*A32; | ||
130 | + | ||
131 | + a32 = (sp_digit)(a[8] >> 16); | ||
132 | + a32 |= a[9] << 10; | ||
133 | + t[1] -= A32 ; | ||
134 | + t[2] -= A32 ; | ||
135 | + t[3] -= A32 ; | ||
136 | + t[5] += A32 ; | ||
137 | + t[6] += 2*A32; | ||
138 | + t[7] += 3*A32; | ||
139 | +#undef A32 | ||
140 | |||
141 | t[1] += t[0] >> 32; t[0] &= 0xffffffff; | ||
142 | t[2] += t[1] >> 32; t[1] &= 0xffffffff; | ||