diff options
| author | Denys Vlasenko <vda.linux@googlemail.com> | 2021-10-05 23:19:18 +0200 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2021-10-05 23:19:18 +0200 |
| commit | bbd723ebec33aa14746dde88b982b160977938b6 (patch) | |
| tree | 41c91f86f4d96a67322234bf382e3795dfe06bec | |
| parent | 3b411ebbfc749f9f12b0eb739cb5ba3ec052197e (diff) | |
| download | busybox-w32-bbd723ebec33aa14746dde88b982b160977938b6.tar.gz busybox-w32-bbd723ebec33aa14746dde88b982b160977938b6.tar.bz2 busybox-w32-bbd723ebec33aa14746dde88b982b160977938b6.zip | |
tls: optimize sp_256_mul_8 in P256
function old new delta
sp_256_mont_mul_8 151 150 -1
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | networking/tls_sp_c32.c | 84 |
1 file changed, 82 insertions, 2 deletions
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c index b99951890..e1c4cdd54 100644 --- a/networking/tls_sp_c32.c +++ b/networking/tls_sp_c32.c | |||
| @@ -294,6 +294,85 @@ static int sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b) | |||
| 294 | /* Multiply a and b into r. (r = a * b) */ | 294 | /* Multiply a and b into r. (r = a * b) */ |
| 295 | static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) | 295 | static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) |
| 296 | { | 296 | { |
| 297 | #if ALLOW_ASM && defined(__GNUC__) && defined(__i386__) | ||
| 298 | sp_digit rr[15]; /* in case r coincides with a or b */ | ||
| 299 | int k; | ||
| 300 | uint32_t accl; | ||
| 301 | uint32_t acch; | ||
| 302 | |||
| 303 | acch = accl = 0; | ||
| 304 | for (k = 0; k < 15; k++) { | ||
| 305 | int i, j; | ||
| 306 | uint32_t acc_hi; | ||
| 307 | i = k - 7; | ||
| 308 | if (i < 0) | ||
| 309 | i = 0; | ||
| 310 | j = k - i; | ||
| 311 | acc_hi = 0; | ||
| 312 | do { | ||
| 313 | //////////////////////// | ||
| 314 | // uint64_t m = ((uint64_t)a[i]) * b[j]; | ||
| 315 | // acc_hi:acch:accl += m; | ||
| 316 | asm volatile ( | ||
| 317 | // a[i] is already loaded in %%eax | ||
| 318 | "\n mull %7" | ||
| 319 | "\n addl %%eax, %0" | ||
| 320 | "\n adcl %%edx, %1" | ||
| 321 | "\n adcl $0, %2" | ||
| 322 | : "=rm" (accl), "=rm" (acch), "=rm" (acc_hi) | ||
| 323 | : "0" (accl), "1" (acch), "2" (acc_hi), "a" (a[i]), "m" (b[j]) | ||
| 324 | : "cc", "dx" | ||
| 325 | ); | ||
| 326 | //////////////////////// | ||
| 327 | j--; | ||
| 328 | i++; | ||
| 329 | } while (i != 8 && i <= k); | ||
| 330 | rr[k] = accl; | ||
| 331 | accl = acch; | ||
| 332 | acch = acc_hi; | ||
| 333 | } | ||
| 334 | r[15] = accl; | ||
| 335 | memcpy(r, rr, sizeof(rr)); | ||
| 336 | #elif 0 | ||
| 337 | //TODO: arm assembly (untested) | ||
| 338 | sp_digit tmp[16]; | ||
| 339 | |||
| 340 | asm volatile ( | ||
| 341 | "\n mov r5, #0" | ||
| 342 | "\n mov r6, #0" | ||
| 343 | "\n mov r7, #0" | ||
| 344 | "\n mov r8, #0" | ||
| 345 | "\n 1:" | ||
| 346 | "\n subs r3, r5, #28" | ||
| 347 | "\n movcc r3, #0" | ||
| 348 | "\n sub r4, r5, r3" | ||
| 349 | "\n 2:" | ||
| 350 | "\n ldr r14, [%[a], r3]" | ||
| 351 | "\n ldr r12, [%[b], r4]" | ||
| 352 | "\n umull r9, r10, r14, r12" | ||
| 353 | "\n adds r6, r6, r9" | ||
| 354 | "\n adcs r7, r7, r10" | ||
| 355 | "\n adc r8, r8, #0" | ||
| 356 | "\n add r3, r3, #4" | ||
| 357 | "\n sub r4, r4, #4" | ||
| 358 | "\n cmp r3, #32" | ||
| 359 | "\n beq 3f" | ||
| 360 | "\n cmp r3, r5" | ||
| 361 | "\n ble 2b" | ||
| 362 | "\n 3:" | ||
| 363 | "\n str r6, [%[r], r5]" | ||
| 364 | "\n mov r6, r7" | ||
| 365 | "\n mov r7, r8" | ||
| 366 | "\n mov r8, #0" | ||
| 367 | "\n add r5, r5, #4" | ||
| 368 | "\n cmp r5, #56" | ||
| 369 | "\n ble 1b" | ||
| 370 | "\n str r6, [%[r], r5]" | ||
| 371 | : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) | ||
| 372 | : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" | ||
| 373 | ); | ||
| 374 | memcpy(r, tmp, sizeof(tmp)); | ||
| 375 | #else | ||
| 297 | sp_digit rr[15]; /* in case r coincides with a or b */ | 376 | sp_digit rr[15]; /* in case r coincides with a or b */ |
| 298 | int i, j, k; | 377 | int i, j, k; |
| 299 | uint64_t acc; | 378 | uint64_t acc; |
| @@ -306,19 +385,20 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) | |||
| 306 | i = 0; | 385 | i = 0; |
| 307 | j = k - i; | 386 | j = k - i; |
| 308 | acc_hi = 0; | 387 | acc_hi = 0; |
| 309 | while (i != 8 && i <= k) { | 388 | do { |
| 310 | uint64_t m = ((uint64_t)a[i]) * b[j]; | 389 | uint64_t m = ((uint64_t)a[i]) * b[j]; |
| 311 | acc += m; | 390 | acc += m; |
| 312 | if (acc < m) | 391 | if (acc < m) |
| 313 | acc_hi++; | 392 | acc_hi++; |
| 314 | j--; | 393 | j--; |
| 315 | i++; | 394 | i++; |
| 316 | } | 395 | } while (i != 8 && i <= k); |
| 317 | rr[k] = acc; | 396 | rr[k] = acc; |
| 318 | acc = (acc >> 32) | ((uint64_t)acc_hi << 32); | 397 | acc = (acc >> 32) | ((uint64_t)acc_hi << 32); |
| 319 | } | 398 | } |
| 320 | r[15] = acc; | 399 | r[15] = acc; |
| 321 | memcpy(r, rr, sizeof(rr)); | 400 | memcpy(r, rr, sizeof(rr)); |
| 401 | #endif | ||
| 322 | } | 402 | } |
| 323 | 403 | ||
| 324 | /* Shift number right one bit. Bottom bit is lost. */ | 404 | /* Shift number right one bit. Bottom bit is lost. */ |
