From e5f4dc854c2c390526097888bf3654db21bdbb15 Mon Sep 17 00:00:00 2001 From: jsing <> Date: Tue, 12 Aug 2025 10:09:46 +0000 Subject: Bring in bignum_{mul,sqr}_6_12{,_alt}() from s2n-bignum. These provide fast multiplication and squaring of inputs with 6x words, producing a 12 word result. The non-_alt versions require the CPU to support ADX instructions, while the _alt versions do not. --- src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S | 210 ++++++++++++++++++++ .../libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S | 186 ++++++++++++++++++ src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S | 214 +++++++++++++++++++++ .../libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S | 197 +++++++++++++++++++ 4 files changed, 807 insertions(+) create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S (limited to 'src/lib') diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S new file mode 100644 index 0000000000..081b602cde --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S @@ -0,0 +1,210 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6], +// const uint64_t y[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12) + .text + +// These are actually right + +#define z rdi +#define x rsi + +// Copied in or set up + +#define y rcx + +// A zero register + +#define zero rbp +#define zeroe ebp + +// Add in x[i] * rdx to the (i,i+1) position with the register window +// Would be nice to have conditional expressions reg[i], reg[i+1] ... 
+ +.macro mulpadd arg1,arg2 + mulx rbx, rax, [x+8*\arg2] +.if ((\arg1 + \arg2) % 6 == 0) + adcx r8, rax + adox r9, rbx +.elseif ((\arg1 + \arg2) % 6 == 1) + adcx r9, rax + adox r10, rbx +.elseif ((\arg1 + \arg2) % 6 == 2) + adcx r10, rax + adox r11, rbx +.elseif ((\arg1 + \arg2) % 6 == 3) + adcx r11, rax + adox r12, rbx +.elseif ((\arg1 + \arg2) % 6 == 4) + adcx r12, rax + adox r13, rbx +.elseif ((\arg1 + \arg2) % 6 == 5) + adcx r13, rax + adox r8, rbx +.endif + +.endm + + +// Add in the whole j'th row + +.macro addrow arg1 + mov rdx, [y+8*\arg1] + xor zeroe, zeroe + + mulpadd \arg1, 0 + +.if (\arg1 % 6 == 0) + mov [z+8*\arg1],r8 +.elseif (\arg1 % 6 == 1) + mov [z+8*\arg1],r9 +.elseif (\arg1 % 6 == 2) + mov [z+8*\arg1],r10 +.elseif (\arg1 % 6 == 3) + mov [z+8*\arg1],r11 +.elseif (\arg1 % 6 == 4) + mov [z+8*\arg1],r12 +.elseif (\arg1 % 6 == 5) + mov [z+8*\arg1],r13 +.endif + + mulpadd \arg1, 1 + mulpadd \arg1, 2 + mulpadd \arg1, 3 + mulpadd \arg1, 4 + +.if (\arg1 % 6 == 0) + mulx r8, rax, [x+40] + adcx r13, rax + adox r8, zero + adcx r8, zero +.elseif (\arg1 % 6 == 1) + mulx r9, rax, [x+40] + adcx r8, rax + adox r9, zero + adcx r9, zero +.elseif (\arg1 % 6 == 2) + mulx r10, rax, [x+40] + adcx r9, rax + adox r10, zero + adcx r10, zero +.elseif (\arg1 % 6 == 3) + mulx r11, rax, [x+40] + adcx r10, rax + adox r11, zero + adcx r11, zero +.elseif (\arg1 % 6 == 4) + mulx r12, rax, [x+40] + adcx r11, rax + adox r12, zero + adcx r12, zero +.elseif (\arg1 % 6 == 5) + mulx r13, rax, [x+40] + adcx r12, rax + adox r13, zero + adcx r13, zero +.endif + +.endm + + + +S2N_BN_SYMBOL(bignum_mul_6_12): + _CET_ENDBR + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 +#endif + +// Save more registers to play with + + push rbp + push rbx + push r12 + push r13 + +// Copy y into a safe register to start with + + mov y, rdx + +// Zero a register, which also makes sure we don't get a fake carry-in + + xor zeroe, zeroe + +// Do the zeroth row, which is a bit different +// Write back the zero-zero product and then accumulate +// r8,r13,r12,r11,r10,r9 as y[0] * x from 1..6 + + mov rdx, [y] + + mulx r9, r8, [x] + mov [z], r8 + + mulx r10, rbx, [x+8] + adcx r9, rbx + + mulx r11, rbx, [x+16] + adcx r10, rbx + + mulx r12, rbx, [x+24] + adcx r11, rbx + + mulx r13, rbx, [x+32] + adcx r12, rbx + + mulx r8, rbx, [x+40] + adcx r13, rbx + adcx r8, zero + +// Now all the other rows in a uniform pattern + + addrow 1 + addrow 2 + addrow 3 + addrow 4 + addrow 5 + +// Now write back the additional columns + + mov [z+48], r8 + mov [z+56], r9 + mov [z+64], r10 + mov [z+72], r11 + mov [z+80], r12 + mov [z+88], r13 + +// Restore registers and return + + pop r13 + pop r12 + pop rbx + pop rbp + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S new file mode 100644 index 0000000000..f4ea4ed68b --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S @@ -0,0 +1,186 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12_alt(uint64_t z[static 12], +// const uint64_t x[static 6], +// const uint64_t y[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt) + .text + +// These are actually right + +#define z rdi +#define x rsi + +// This is moved from rdx to free it for muls + +#define y rcx + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 r8 +#define t1 r9 +#define t2 r10 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx + +S2N_BN_SYMBOL(bignum_mul_6_12_alt): + _CET_ENDBR + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 +#endif + +// Copy y into a safe register to start with + + mov y, rdx + +// Result term 0 + + mov rax, [x] + mul QWORD PTR [y] + + mov [z], rax + mov t0, rdx + xor t1, t1 + +// Result term 1 + + xor t2, t2 + combads(t1,t0,[x],[y+8]) + combadz(t2,t1,t0,[x+8],[y]) + mov [z+8], t0 + +// Result term 2 + + xor t0, t0 + combadz(t0,t2,t1,[x],[y+16]) + combadd(t0,t2,t1,[x+8],[y+8]) + combadd(t0,t2,t1,[x+16],[y]) + mov [z+16], t1 + +// Result term 3 + + xor t1, t1 + combadz(t1,t0,t2,[x],[y+24]) + combadd(t1,t0,t2,[x+8],[y+16]) + combadd(t1,t0,t2,[x+16],[y+8]) + combadd(t1,t0,t2,[x+24],[y]) + mov [z+24], t2 + +// Result term 4 + + xor t2, t2 + combadz(t2,t1,t0,[x],[y+32]) + combadd(t2,t1,t0,[x+8],[y+24]) + combadd(t2,t1,t0,[x+16],[y+16]) + combadd(t2,t1,t0,[x+24],[y+8]) + combadd(t2,t1,t0,[x+32],[y]) + mov [z+32], t0 + +// Result term 5 + + xor t0, t0 + combadz(t0,t2,t1,[x],[y+40]) + combadd(t0,t2,t1,[x+8],[y+32]) + combadd(t0,t2,t1,[x+16],[y+24]) + combadd(t0,t2,t1,[x+24],[y+16]) + combadd(t0,t2,t1,[x+32],[y+8]) + combadd(t0,t2,t1,[x+40],[y]) + mov [z+40], t1 + +// Result term 6 + + xor t1, t1 + combadz(t1,t0,t2,[x+8],[y+40]) + combadd(t1,t0,t2,[x+16],[y+32]) + combadd(t1,t0,t2,[x+24],[y+24]) + combadd(t1,t0,t2,[x+32],[y+16]) + combadd(t1,t0,t2,[x+40],[y+8]) + mov [z+48], t2 + +// Result term 7 + + xor t2, t2 + combadz(t2,t1,t0,[x+16],[y+40]) + combadd(t2,t1,t0,[x+24],[y+32]) + combadd(t2,t1,t0,[x+32],[y+24]) + combadd(t2,t1,t0,[x+40],[y+16]) + mov [z+56], t0 + +// Result term 8 + + xor t0, t0 + combadz(t0,t2,t1,[x+24],[y+40]) + combadd(t0,t2,t1,[x+32],[y+32]) + combadd(t0,t2,t1,[x+40],[y+24]) + mov [z+64], t1 + +// Result term 9 + + xor t1, t1 + combadz(t1,t0,t2,[x+32],[y+40]) + combadd(t1,t0,t2,[x+40],[y+32]) + mov [z+72], t2 + +// Result term 10 + + combads(t1,t0,[x+40],[y+40]) + mov [z+80], t0 + +// Result term 11 + + mov [z+88], t1 + +// Return + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && 
defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S new file mode 100644 index 0000000000..07d840b47c --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S @@ -0,0 +1,214 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12) + .text + +// These are actually right + +#define z rdi +#define x rsi + +// A zero register + +#define zero rbp +#define zeroe ebp + +// Other registers + +#define d1 r8 +#define d2 r9 +#define d3 r10 +#define d4 r11 +#define d5 r12 +#define d6 r13 +#define d7 r14 +#define d8 r15 +#define d9 rbx + +// Care is needed: re-using the zero register + +#define d10 rbp + + +S2N_BN_SYMBOL(bignum_sqr_6_12): + _CET_ENDBR + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx +#endif + +// Save more registers to play with + + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + +// Set up an initial window [d8;...d1] = [34;05;03;01] + + mov rdx, [x] + mulx d2, d1, [x+8] + mulx d4, d3, [x+24] + mulx d6, d5, [x+40] + mov rdx, [x+24] + mulx d8, d7, [x+32] + +// Clear our zero register, and also initialize the flags for the carry chain + + xor zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window +// (no carry-out possible since we add it to the top of a product) + + mov rdx, [x+16] + mulx rcx, rax, [x] + adcx d2, rax + adox d3, rcx + mulx rcx, rax, [x+8] + adcx d3, rax + adox d4, rcx + mov rdx, [x+8] + mulx rcx, rax, [x+24] + adcx d4, rax + adox d5, rcx + mulx rcx, rax, [x+32] + adcx d5, rax + adox d6, rcx + mulx rcx, rax, [x+40] + adcx d6, rax + adox d7, rcx + adcx d7, zero + adox d8, zero + adcx d8, zero + +// Again zero out the flags. Actually they are already cleared but it may +// help decouple these in the OOO engine not to wait for the chain above + + xor zeroe, zeroe + +// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms +// We are running out of registers and here our zero register is not zero! + + mov rdx, [x+32] + mulx rcx, rax, [x] + adcx d4, rax + adox d5, rcx + mov rdx, [x+16] + mulx rcx, rax, [x+24] + adcx d5, rax + adox d6, rcx + mulx rcx, rax, [x+32] + adcx d6, rax + adox d7, rcx + mulx rcx, rax, [x+40] + adcx d7, rax + adox d8, rcx + mov rdx, [x+24] + mulx d9, rax, [x+40] + adcx d8, rax + adox d9, zero + mov rdx, [x+32] + mulx d10, rax, [x+40] + adcx d9, rax + mov eax, 0 + adox d10, rax + adcx d10, rax + +// Again, just for a clear fresh start for the flags + + xor eax, eax + +// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms +// +// We could use shift-double but this seems tidier and in larger squarings +// it was actually more efficient. I haven't experimented with this small +// case to see how much that matters. Note: the writeback here is sprinkled +// into the sequence in such a way that things still work if z = x, i.e. 
if +// the output overwrites the input buffer and beyond. + + mov rdx, [x] + mulx rdx, rax, rdx + mov [z], rax + adcx d1, d1 + adox d1, rdx + mov rdx, [x+8] + mov [z+8], d1 + mulx rdx, rax, rdx + adcx d2, d2 + adox d2, rax + adcx d3, d3 + adox d3, rdx + mov rdx, [x+16] + mov [z+16], d2 + mulx rdx, rax, rdx + adcx d4, d4 + adox d4, rax + adcx d5, d5 + adox d5, rdx + mov rdx, [x+24] + mov [z+24], d3 + mulx rdx, rax, rdx + adcx d6, d6 + adox d6, rax + adcx d7, d7 + adox d7, rdx + mov rdx, [x+32] + mov [z+32], d4 + mulx rdx, rax, rdx + adcx d8, d8 + adox d8, rax + adcx d9, d9 + adox d9, rdx + mov rdx, [x+40] + mov [z+40], d5 + mulx rdx, rax, rdx + mov [z+48], d6 + adcx d10, d10 + mov [z+56], d7 + adox d10, rax + mov [z+64], d8 + mov eax, 0 + mov [z+72], d9 + adcx rdx, rax + mov [z+80], d10 + adox rdx, rax + mov [z+88], rdx + +// Restore saved registers and return + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S new file mode 100644 index 0000000000..a517e5c654 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S @@ -0,0 +1,197 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12_alt(uint64_t z[static 12], +// const uint64_t x[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt) + .text + +// Input arguments + +#define z rdi +#define x rsi + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 r8 +#define t1 r9 +#define t2 r10 + +// Additional temporaries for local windows to share doublings + +#define u0 rcx +#define u1 r11 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// Set up initial window (c,h,l) = numa * numb + +#define combaddz(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + xor c, c; \ + mov l, rax; \ + mov h, rdx + +// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l) + +#define doubladd(c,h,l,hh,ll) \ + add ll, ll; \ + adc hh, hh; \ + adc c, c; \ + add l, ll; \ + adc h, hh; \ + adc c, 0 + +// Square term incorporation (c,h,l) += numba^2 + +#define combadd1(c,h,l,numa) \ + mov rax, numa; \ + mul rax; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + mov rax, numa; \ + mul rax; \ + add l, rax; \ + adc h, rdx + +// A version doubling directly before adding, for single non-square terms + +#define combadd2(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add rax, rax; \ + adc rdx, rdx; \ + adc c, 0; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +S2N_BN_SYMBOL(bignum_sqr_6_12_alt): + _CET_ENDBR + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx +#endif + +// Result 
term 0 + + mov rax, [x] + mul rax + + mov [z], rax + mov t0, rdx + xor t1, t1 + +// Result term 1 + + xor t2, t2 + combadd2(t2,t1,t0,[x],[x+8]) + mov [z+8], t0 + +// Result term 2 + + xor t0, t0 + combadd1(t0,t2,t1,[x+8]) + combadd2(t0,t2,t1,[x],[x+16]) + mov [z+16], t1 + +// Result term 3 + + combaddz(t1,u1,u0,[x],[x+24]) + combadd(t1,u1,u0,[x+8],[x+16]) + doubladd(t1,t0,t2,u1,u0) + mov [z+24], t2 + +// Result term 4 + + combaddz(t2,u1,u0,[x],[x+32]) + combadd(t2,u1,u0,[x+8],[x+24]) + doubladd(t2,t1,t0,u1,u0) + combadd1(t2,t1,t0,[x+16]) + mov [z+32], t0 + +// Result term 5 + + combaddz(t0,u1,u0,[x],[x+40]) + combadd(t0,u1,u0,[x+8],[x+32]) + combadd(t0,u1,u0,[x+16],[x+24]) + doubladd(t0,t2,t1,u1,u0) + mov [z+40], t1 + +// Result term 6 + + combaddz(t1,u1,u0,[x+8],[x+40]) + combadd(t1,u1,u0,[x+16],[x+32]) + doubladd(t1,t0,t2,u1,u0) + combadd1(t1,t0,t2,[x+24]) + mov [z+48], t2 + +// Result term 7 + + combaddz(t2,u1,u0,[x+16],[x+40]) + combadd(t2,u1,u0,[x+24],[x+32]) + doubladd(t2,t1,t0,u1,u0) + mov [z+56], t0 + +// Result term 8 + + xor t0, t0 + combadd2(t0,t2,t1,[x+24],[x+40]) + combadd1(t0,t2,t1,[x+32]) + mov [z+64], t1 + +// Result term 9 + + xor t1, t1 + combadd2(t1,t0,t2,[x+32],[x+40]) + mov [z+72], t2 + +// Result term 10 + + combads(t1,t0,[x+40]) + mov [z+80], t0 + +// Result term 11 + + mov [z+88], t1 + +// Return + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif -- cgit v1.2.3-55-g6feb
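
As the commit message notes, the non-_alt routines depend on ADX (ADCX/ADOX), and since they are written around MULX they also need BMI2, while the _alt variants use only plain MUL/ADC. A caller therefore has to choose a variant at run time. The sketch below is illustrative only: the have_adx_bmi2() and mul_6_12() helpers are not part of this patch, and the only pieces taken from it are the two extern prototypes.

#include <cpuid.h>
#include <stdint.h>

extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
    const uint64_t y[static 6]);
extern void bignum_mul_6_12_alt(uint64_t z[static 12], const uint64_t x[static 6],
    const uint64_t y[static 6]);

/* Illustrative capability check: CPUID leaf 7, subleaf 0, EBX bit 8 = BMI2
 * (MULX), bit 19 = ADX (ADCX/ADOX). */
static int
have_adx_bmi2(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 0;
	return (ebx & (1U << 8)) != 0 && (ebx & (1U << 19)) != 0;
}

/* Hypothetical wrapper selecting the fastest usable variant. */
static void
mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
    const uint64_t y[static 6])
{
	if (have_adx_bmi2())
		bignum_mul_6_12(z, x, y);
	else
		bignum_mul_6_12_alt(z, x, y);
}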
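
Functionally, the multiplication routines compute a plain 6x6-limb schoolbook product. A portable reference like the following (hypothetical helper name, assumes a compiler with unsigned __int128) is convenient for cross-checking the assembly; it accumulates the same row-by-row partial products that the mulpadd/addrow macros chain together with ADCX/ADOX in bignum_mul_6_12.

#include <stdint.h>

/* Portable reference: z[0..11] = x[0..5] * y[0..5], 64-bit little-endian limbs. */
static void
bignum_mul_6_12_ref(uint64_t z[static 12], const uint64_t x[static 6],
    const uint64_t y[static 6])
{
	unsigned __int128 t;
	uint64_t carry;
	int i, j;

	for (i = 0; i < 12; i++)
		z[i] = 0;

	/* Row i adds x[0..5] * y[i] into z[i..i+5], carry out lands in z[i+6]. */
	for (i = 0; i < 6; i++) {
		carry = 0;
		for (j = 0; j < 6; j++) {
			t = (unsigned __int128)x[j] * y[i] + z[i + j] + carry;
			z[i + j] = (uint64_t)t;
			carry = (uint64_t)(t >> 64);
		}
		z[i + 6] = carry;
	}
}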
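
The squaring routines follow the decomposition spelled out in the bignum_sqr_6_12 comments: form each off-diagonal product x[i]*x[j] (i < j) once, double the accumulated total, then add the diagonal squares x[i]^2. A portable sketch of that same decomposition, again with a hypothetical helper name and intended only for testing against the assembly:

#include <stdint.h>

/* Portable reference: z[0..11] = x[0..5]^2, 64-bit little-endian limbs. */
static void
bignum_sqr_6_12_ref(uint64_t z[static 12], const uint64_t x[static 6])
{
	unsigned __int128 s, t;
	uint64_t carry;
	int i, j;

	for (i = 0; i < 12; i++)
		z[i] = 0;

	/* Off-diagonal products x[i] * x[j] for i < j, each summed once. */
	for (i = 0; i < 6; i++) {
		carry = 0;
		for (j = i + 1; j < 6; j++) {
			t = (unsigned __int128)x[i] * x[j] + z[i + j] + carry;
			z[i + j] = (uint64_t)t;
			carry = (uint64_t)(t >> 64);
		}
		z[i + 6] = carry;
	}

	/* Double the off-diagonal sum in place. */
	carry = 0;
	for (i = 0; i < 12; i++) {
		t = ((unsigned __int128)z[i] << 1) + carry;
		z[i] = (uint64_t)t;
		carry = (uint64_t)(t >> 64);
	}

	/* Add the diagonal squares x[i]^2 at limb positions 2i and 2i+1. */
	carry = 0;
	for (i = 0; i < 6; i++) {
		t = (unsigned __int128)x[i] * x[i];
		s = (unsigned __int128)z[2 * i] + (uint64_t)t + carry;
		z[2 * i] = (uint64_t)s;
		s = (unsigned __int128)z[2 * i + 1] + (uint64_t)(t >> 64) + (uint64_t)(s >> 64);
		z[2 * i + 1] = (uint64_t)s;
		carry = (uint64_t)(s >> 64);
	}
}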