From 9a1a39a85b5242df4ffd7c3a20eb012acdc716d7 Mon Sep 17 00:00:00 2001 From: jsing <> Date: Tue, 12 Aug 2025 10:14:24 +0000 Subject: Bring in bignum_{mul,sqr}_{4_8,8_16}() from s2n-bignum. These provide fast multiplication and squaring of inputs with 4 words or 8 words, producing an 8 or 16 word result. These versions require the CPU to support ADX instructions, while the _alt versions that have previously been imported do not. --- src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S | 174 +++++++++++++ src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S | 260 +++++++++++++++++++ src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S | 145 +++++++++++ src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S | 298 ++++++++++++++++++++++ 4 files changed, 877 insertions(+) create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S (limited to 'src/lib') diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S new file mode 100644 index 0000000000..5e9aebd685 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S @@ -0,0 +1,174 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[4], y[4]; output z[8]; needs BMI2 (mulx) and ADX (adcx/adox)
+//
+//    extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4],
+//                               const uint64_t y[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8)
+        .text
+
+// Arguments that are already in the right registers on entry (standard ABI)
+
+#define z rdi
+#define x rsi
+
+// y arrives in rdx and is copied here, since mulx takes rdx as its implicit operand
+
+#define y rcx
+
+// A register held at zero, used to propagate and absorb carries
+
+#define zero rbp
+#define zeroe ebp
+
+// mulpadd j, i: add x[i] * rdx (rdx assumed = y[j]) at word position i+j of the
+// 4-register window r8..r11, indexed mod 4. Clobbers rax, rbx; uses CF and OF.
+
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg2]   // rbx:rax = x[i] * rdx; mulx leaves flags alone
+.if ((\arg1 + \arg2) % 4 == 0)
+        adcx    r8, rax                 // low word joins the CF carry chain
+        adox    r9, rbx                 // high word joins the OF carry chain
+.elseif ((\arg1 + \arg2) % 4 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 4 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 4 == 3)
+        adcx    r11, rax
+        adox    r8, rbx                 // window wraps round from r11 back to r8
+.endif
+
+.endm
+
+
+// Add in the whole j'th row y[j] * x, writing back the completed word z[j]
+
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]        // rdx = y[j], the implicit mulx operand
+        xor     zeroe, zeroe            // zero = 0, and CF = OF = 0 to start both chains
+
+        mulpadd \arg1, 0
+
+.if (\arg1 % 4 == 0)
+        mov     [z+8*\arg1],r8          // nothing more is added at position j: store z[j]
+.elseif (\arg1 % 4 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 4 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 4 == 3)
+        mov     [z+8*\arg1],r11
+.endif
+
+        mulpadd \arg1, 1
+        mulpadd \arg1, 2
+
+.if (\arg1 % 4 == 0)
+        mulx    r8, rax, [x+24]         // reuse the stored register as the new top word
+        adcx    r11, rax                // low word of x[3] * y[j] at position j+3
+        adox    r8, zero                // fold the remaining OF carry into the top word
+        adcx    r8, zero                // fold the remaining CF carry into the top word
+.elseif (\arg1 % 4 == 1)
+        mulx    r9, rax, [x+24]
+        adcx    r8, rax
+        adox    r9, zero
+        adcx    r9, zero
+.elseif (\arg1 % 4 == 2)
+        mulx    r10, rax, [x+24]
+        adcx    r9, rax
+        adox    r10, zero
+        adcx    r10, zero
+.elseif (\arg1 % 4 == 3)
+        mulx    r11, rax, [x+24]
+        adcx    r10, rax
+        adox    r11, zero
+        adcx    r11, zero
+.endif
+
+.endm
+
+
+
+S2N_BN_SYMBOL(bignum_mul_4_8):
+        _CET_ENDBR
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+
+// Save the callee-saved registers used below (rbp as zero, rbx as scratch)
+
+        push    rbp
+        push    rbx
+
+// Copy y out of rdx, which is about to be reloaded with each y[j] for mulx
+
+        mov     y, rdx
+
+// Zero a register, which also makes sure we don't get a fake carry-in
+
+        xor     zeroe, zeroe
+
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product word z[0] and then accumulate the window
+// [r8;r11;r10;r9] (most significant first) as words 1..4 of y[0] * x
+
+        mov     rdx, [y]
+
+        mulx    r9, r8, [x]
+        mov     [z], r8
+
+        mulx    r10, rbx, [x+8]
+        adcx    r9, rbx                 // only the CF chain is used in this row
+
+        mulx    r11, rbx, [x+16]
+        adcx    r10, rbx
+
+        mulx    r8, rbx, [x+24]         // r8 was stored to z[0] above, so it is free
+        adcx    r11, rbx
+        adcx    r8, zero
+
+// Now all the other rows in a uniform pattern
+
+        addrow 1
+        addrow 2
+        addrow 3
+
+// Now write back the top four words z[4..7], still held in the window
+
+        mov     [z+32], r8
+        mov     [z+40], r9
+        mov     [z+48], r10
+        mov     [z+56], r11
+
+// Restore registers (in reverse order of the pushes) and return
+
+        pop     rbx
+        pop     rbp
+
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
new file mode 100644
index 0000000000..7bfae22a3f
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S
@@ -0,0 +1,260 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[8], y[8]; output z[16]; needs BMI2 (mulx) and ADX (adcx/adox)
+//
+//    extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8],
+//                                const uint64_t y[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16)
+        .text
+
+// Arguments that are already in the right registers on entry (standard ABI)
+
+#define z rdi
+#define x rsi
+
+// y arrives in rdx and is copied here, since mulx takes rdx as its implicit operand
+
+#define y rcx
+
+// A register held at zero, used to propagate and absorb carries
+
+#define zero rbp
+#define zeroe ebp
+
+// mulpadd i, j adds x[i] * rdx (assumed = y[j]) into window r8..r15 at (i+j) mod 8
+
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg1]   // rbx:rax = x[i] * rdx; clobbers rax and rbx
+.if ((\arg1 + \arg2) % 8 == 0)
+        adcx    r8, rax                 // low word joins the CF carry chain
+        adox    r9, rbx                 // high word joins the OF carry chain
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        adcx    r11, rax
+        adox    r12, rbx
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        adcx    r12, rax
+        adox    r13, rbx
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        adcx    r13, rax
+        adox    r14, rbx
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        adcx    r14, rax
+        adox    r15, rbx
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        adcx    r15, rax
+        adox    r8, rbx                 // window wraps round from r15 back to r8
+.endif
+
+.endm
+
+// mulpade i, j also adds x[i] * rdx (assumed = y[j]) into the window at (i+j) mod 8,
+// but overwrites the next word up with the high product plus OF, not adding to it
+
+.macro mulpade arg1,arg2
+.if ((\arg1 + \arg2) % 8 == 0)
+        mulx    r9, rax, [x+8*\arg1]    // high word written straight into the new top
+        adcx    r8, rax                 // low word joins the CF carry chain
+        adox    r9, zero                // the OF chain ends in the new top word
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        mulx    r10, rax, [x+8*\arg1]
+        adcx    r9, rax
+        adox    r10, zero
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        mulx    r11, rax, [x+8*\arg1]
+        adcx    r10, rax
+        adox    r11, zero
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        mulx    r12, rax, [x+8*\arg1]
+        adcx    r11, rax
+        adox    r12, zero
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        mulx    r13, rax, [x+8*\arg1]
+        adcx    r12, rax
+        adox    r13, zero
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        mulx    r14, rax, [x+8*\arg1]
+        adcx    r13, rax
+        adox    r14, zero
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        mulx    r15, rax, [x+8*\arg1]
+        adcx    r14, rax
+        adox    r15, zero
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        mulx    r8, rax, [x+8*\arg1]
+        adcx    r15, rax
+        adox    r8, zero
+.endif
+
+.endm
+
+// Add in the whole j'th row y[j] * x, writing back the completed word z[j]
+
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]        // rdx = y[j], the implicit mulx operand
+        xor     zeroe, zeroe            // zero = 0, and CF = OF = 0 to start both chains
+
+        mulpadd 0, \arg1
+
+.if (\arg1 % 8 == 0)
+        mov     [z+8*\arg1],r8          // nothing more is added at position j: store z[j]
+.elseif (\arg1 % 8 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 8 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 8 == 3)
+        mov     [z+8*\arg1],r11
+.elseif (\arg1 % 8 == 4)
+        mov     [z+8*\arg1],r12
+.elseif (\arg1 % 8 == 5)
+        mov     [z+8*\arg1],r13
+.elseif (\arg1 % 8 == 6)
+        mov     [z+8*\arg1],r14
+.elseif (\arg1 % 8 == 7)
+        mov     [z+8*\arg1],r15
+.endif
+
+        mulpadd 1, \arg1
+        mulpadd 2, \arg1
+        mulpadd 3, \arg1
+        mulpadd 4, \arg1
+        mulpadd 5, \arg1
+        mulpadd 6, \arg1
+        mulpade 7, \arg1                // recreates the stored register as the new top word
+
+.if (\arg1 % 8 == 0)
+        adc     r8, zero                // fold the remaining CF carry into the top word
+.elseif (\arg1 % 8 == 1)
+        adc     r9, zero
+.elseif (\arg1 % 8 == 2)
+        adc     r10, zero
+.elseif (\arg1 % 8 == 3)
+        adc     r11, zero
+.elseif (\arg1 % 8 == 4)
+        adc     r12, zero
+.elseif (\arg1 % 8 == 5)
+        adc     r13, zero
+.elseif (\arg1 % 8 == 6)
+        adc     r14, zero
+.elseif (\arg1 % 8 == 7)
+        adc     r15, zero
+.endif
+
+.endm
+
+
+S2N_BN_SYMBOL(bignum_mul_8_16):
+        _CET_ENDBR
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+
+// Save the callee-saved registers used below (zero, scratch and window r12..r15)
+
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+
+// Copy y out of rdx, which is about to be reloaded with each y[j] for mulx
+
+        mov     y, rdx
+
+// Zero a register, which also makes sure we don't get a fake carry-in
+
+        xor     zeroe, zeroe
+
+// Do the zeroth row, which is a bit different
+// Write back the zero-zero product word z[0] and then accumulate the window
+// [r8;r15;r14;r13;r12;r11;r10;r9] (most significant first) as words 1..8 of y[0] * x
+
+        mov     rdx, [y]
+
+        mulx    r9, r8, [x]
+        mov     [z], r8
+
+        mulx    r10, rbx, [x+8]
+        adc     r9, rbx                 // this row uses a single carry chain through CF
+
+        mulx    r11, rbx, [x+16]
+        adc     r10, rbx
+
+        mulx    r12, rbx, [x+24]
+        adc     r11, rbx
+
+        mulx    r13, rbx, [x+32]
+        adc     r12, rbx
+
+        mulx    r14, rbx, [x+40]
+        adc     r13, rbx
+
+        mulx    r15, rbx, [x+48]
+        adc     r14, rbx
+
+        mulx    r8, rbx, [x+56]         // r8 was stored to z[0] above, so it is free
+        adc     r15, rbx
+        adc     r8, zero
+
+// Now all the other rows in a uniform pattern
+
+        addrow 1
+        addrow 2
+        addrow 3
+        addrow 4
+        addrow 5
+        addrow 6
+        addrow 7
+
+// Now write back the top eight words z[8..15], still held in the window
+
+        mov     [z+64], r8
+        mov     [z+72], r9
+        mov     [z+80], r10
+        mov     [z+88], r11
+        mov     [z+96], r12
+        mov     [z+104], r13
+        mov     [z+112], r14
+        mov     [z+120], r15
+
+// Restore registers (in reverse order of the pushes) and return
+
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
new file mode 100644
index 0000000000..205455bd2c
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S
@@ -0,0 +1,145 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[4]; output z[8]; needs BMI2 (mulx) and ADX (adcx/adox)
+//
+//    extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8)
+        .text
+
+// Arguments that are already in the right registers on entry (standard ABI)
+
+#define z rdi
+#define x rsi
+
+// A register held at zero, used to propagate and absorb carries
+
+#define zero rbp
+#define zeroe ebp
+
+// Accumulator window: d1..d6 hold words 1..6 of the sum of cross products
+
+#define d1 r8
+#define d2 r9
+#define d3 r10
+#define d4 r11
+#define d5 r12
+#define d6 r13
+
+
+
+S2N_BN_SYMBOL(bignum_sqr_4_8):
+        _CET_ENDBR
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+
+// Save the callee-saved registers used below (rbp as zero, r12/r13 as d5/d6)
+
+        push    rbp
+        push    r12
+        push    r13
+
+// Set up an initial window [d6;...d1] = [23;03;01], where ij means x[i] * x[j]
+
+        mov     rdx, [x]
+        mulx    d2, d1, [x+8]           // d2:d1 = x[0] * x[1]
+        mulx    d4, d3, [x+24]          // d4:d3 = x[0] * x[3]
+        mov     rdx, [x+16]
+        mulx    d6, d5, [x+24]          // d6:d5 = x[2] * x[3]
+
+// Clear our zero register, and also initialize the flags for the carry chain
+
+        xor     zeroe, zeroe
+
+// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible)
+// This gives all the "heterogeneous" terms of the squaring ready to double
+
+        mulx    rcx, rax, [x]           // rdx is still x[2]: rcx:rax = x[0] * x[2]
+        adcx    d2, rax                 // low words go through the CF chain
+        adox    d3, rcx                 // high words go through the OF chain
+        mulx    rcx, rax, [x+8]         // rcx:rax = x[1] * x[2]
+        adcx    d3, rax
+        adox    d4, rcx
+        mov     rdx, [x+24]
+        mulx    rcx, rax, [x+8]         // rcx:rax = x[1] * x[3]
+        adcx    d4, rax
+        adox    d5, rcx
+        adcx    d5, zero                // run both carries up to the top of the window
+        adox    d6, zero
+        adcx    d6, zero
+
+// In principle this is otiose as CF and OF carries are absorbed at this point
+// However it seems helpful for the OOO engine to be told it's a fresh start
+
+        xor     zeroe, zeroe
+
+// Double and add to the 00 + 11 + 22 + 33 terms
+//
+// We could use shift-double but this seems tidier and in larger squarings
+// it was actually more efficient. I haven't experimented with this small
+// case to see how much that matters. Note: the writeback here is sprinkled
+// into the sequence in such a way that things still work if z = x, i.e. if
+// the output overwrites the input buffer and beyond.
+
+        mov     rdx, [x]
+        mulx    rdx, rax, rdx           // rdx:rax = x[0]^2
+        mov     [z], rax
+        adcx    d1, d1                  // CF chain doubles the cross-product words
+        adox    d1, rdx                 // OF chain adds in the square terms
+        mov     rdx, [x+8]              // each x[i] is loaded before z[i] is stored
+        mov     [z+8], d1
+        mulx    rdx, rax, rdx           // rdx:rax = x[1]^2
+        adcx    d2, d2
+        adox    d2, rax
+        adcx    d3, d3
+        adox    d3, rdx
+        mov     rdx, [x+16]
+        mov     [z+16], d2
+        mulx    rdx, rax, rdx           // rdx:rax = x[2]^2
+        adcx    d4, d4
+        adox    d4, rax
+        adcx    d5, d5
+        adox    d5, rdx
+        mov     rdx, [x+24]
+        mov     [z+24], d3
+        mulx    rdx, rax, rdx           // rdx:rax = x[3]^2
+        mov     [z+32], d4
+        adcx    d6, d6
+        mov     [z+40], d5
+        adox    d6, rax
+        mov     [z+48], d6
+        adcx    rdx, zero               // top word: high half of x[3]^2 plus both carries
+        adox    rdx, zero
+        mov     [z+56], rdx
+
+// Restore saved registers and return
+
+        pop     r13
+        pop     r12
+        pop     rbp
+
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
new file mode 100644
index 0000000000..77741c603e
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S
@@ -0,0 +1,298 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[8]; output z[16]; needs BMI2 (mulx) and ADX (adcx/adox)
+//
+//    extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16)
+        .text
+
+// Arguments that are already in the right registers on entry (standard ABI)
+
+#define z rdi
+#define x rsi
+
+// A register held at zero, used to propagate and absorb carries
+
+#define zero rbp
+#define zeroe ebp
+
+// mulpadd i, j adds rdx * x[i] (rdx assumed = x[j]) into window r8..r15 at (i+j) mod 8
+
+.macro mulpadd arg1,arg2
+        mulx    rcx, rax, [x+8*\arg1]   // rcx:rax = x[i] * rdx; clobbers rax and rcx
+.if ((\arg1 + \arg2) % 8 == 0)
+        adcx    r8, rax                 // low word joins the CF carry chain
+        adox    r9, rcx                 // high word joins the OF carry chain
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        adcx    r9, rax
+        adox    r10, rcx
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        adcx    r10, rax
+        adox    r11, rcx
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        adcx    r11, rax
+        adox    r12, rcx
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        adcx    r12, rax
+        adox    r13, rcx
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        adcx    r13, rax
+        adox    r14, rcx
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        adcx    r14, rax
+        adox    r15, rcx
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        adcx    r15, rax
+        adox    r8, rcx                 // window wraps round from r15 back to r8
+.endif
+
+.endm
+
+// mulpade i, j also adds rdx * x[i] into the window at (i+j) mod 8, but overwrites
+// the next word up with the high product plus OF, not adding to it
+
+.macro mulpade arg1,arg2
+.if ((\arg1 + \arg2) % 8 == 0)
+        mulx    r9, rax, [x+8*\arg1]    // high word written straight into the new top
+        adcx    r8, rax                 // low word joins the CF carry chain
+        adox    r9, zero                // the OF chain ends in the new top word
+.elseif ((\arg1 + \arg2) % 8 == 1)
+        mulx    r10, rax, [x+8*\arg1]
+        adcx    r9, rax
+        adox    r10, zero
+.elseif ((\arg1 + \arg2) % 8 == 2)
+        mulx    r11, rax, [x+8*\arg1]
+        adcx    r10, rax
+        adox    r11, zero
+.elseif ((\arg1 + \arg2) % 8 == 3)
+        mulx    r12, rax, [x+8*\arg1]
+        adcx    r11, rax
+        adox    r12, zero
+.elseif ((\arg1 + \arg2) % 8 == 4)
+        mulx    r13, rax, [x+8*\arg1]
+        adcx    r12, rax
+        adox    r13, zero
+.elseif ((\arg1 + \arg2) % 8 == 5)
+        mulx    r14, rax, [x+8*\arg1]
+        adcx    r13, rax
+        adox    r14, zero
+.elseif ((\arg1 + \arg2) % 8 == 6)
+        mulx    r15, rax, [x+8*\arg1]
+        adcx    r14, rax
+        adox    r15, zero
+.elseif ((\arg1 + \arg2) % 8 == 7)
+        mulx    r8, rax, [x+8*\arg1]
+        adcx    r15, rax
+        adox    r8, zero
+.endif
+
+.endm
+
+.macro diagonals
+
+        xor     zeroe, zeroe            // zero = 0, and CF = OF = 0
+
+// Initial sum 10 + 20 + ... + 70 (ij = x[i]*x[j]): words 1,2 go to z, 3..8 stay in r11..r15,r8
+
+        mov     rdx, [x]
+        mulx    rax, r9, [x+8]
+        mov     [z+8], r9               // NOTE(review): x[1] is read again later, so z presumably must not overlap x
+        mulx    rcx, r10, [x+16]
+        adcx    r10, rax
+        mov     [z+16], r10
+        mulx    rax, r11, [x+24]
+        adcx    r11, rcx
+        mulx    rcx, r12, [x+32]
+        adcx    r12, rax
+        mulx    rax, r13, [x+40]
+        adcx    r13, rcx
+        mulx    rcx, r14, [x+48]
+        adcx    r14, rax
+        mulx    r8, r15, [x+56]
+        adcx    r15, rcx
+        adcx    r8, zero
+
+// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54
+
+        xor     zeroe, zeroe
+        mov     rdx, [x+8]
+        mulpadd 2, 1
+        mov     [z+24], r11             // word 3 of the cross-product sum is complete
+        mulpadd 3, 1
+        mov     [z+32], r12             // word 4 is complete
+        mulpadd 4, 1
+        mulpadd 5, 1
+        mulpadd 6, 1
+        mulpade 7, 1                    // r9 becomes word 9
+        mov     rdx, [x+32]
+        mulpade 5, 4                    // r10 becomes word 10
+        adcx    r10, zero               // fold the remaining CF carry into the top word
+
+// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65
+
+        xor     zeroe, zeroe
+        mov     rdx, [x+16]
+        mulpadd 3, 2
+        mov     [z+40], r13             // word 5 is complete
+        mulpadd 4, 2
+        mov     [z+48], r14             // word 6 is complete
+        mulpadd 5, 2
+        mulpadd 6, 2
+        mulpadd 7, 2
+        mov     rdx, [x+48]
+        mulpade 4, 6                    // r11 becomes word 11
+        mulpade 5, 6                    // r12 becomes word 12
+        adcx    r12, zero
+
+// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76
+
+        xor     zeroe, zeroe
+        mov     rdx, [x+24]
+        mulpadd 4, 3
+        mov     [z+56], r15             // word 7 is complete
+        mulpadd 5, 3
+        mov     [z+64], r8              // word 8 is complete
+        mulpadd 6, 3
+        mulpadd 7, 3
+        mov     rdx, [x+56]
+        mulpadd 4, 7
+        mulpade 5, 7                    // r13 becomes word 13
+        mulpade 6, 7                    // r14 becomes word 14
+        adcx    r14, zero
+
+// Double and add the squares x[i]^2; use z[1]..z[8] and thereafter the registers
+// r9..r14 (words 9..14) not yet written back; r15 is reused for the top word
+
+        xor     zeroe, zeroe
+        mov     rdx, [x]
+        mulx    rcx, rax, rdx           // rcx:rax = x[0]^2
+        mov     [z], rax
+        mov     rax, [z+8]
+        adcx    rax, rax                // CF chain doubles the cross-product words
+        adox    rax, rcx                // OF chain adds in the square terms
+        mov     [z+8], rax
+
+        mov     rax, [z+16]
+        mov     rdx, [x+8]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[1]^2
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+16], rax
+        mov     rax, [z+24]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+24], rax
+
+        mov     rax, [z+32]
+        mov     rdx, [x+16]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[2]^2
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+32], rax
+        mov     rax, [z+40]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+40], rax
+
+        mov     rax, [z+48]
+        mov     rdx, [x+24]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[3]^2
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+48], rax
+        mov     rax, [z+56]
+        adcx    rax, rax
+        adox    rax, rcx
+        mov     [z+56], rax
+
+        mov     rax, [z+64]
+        mov     rdx, [x+32]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[4]^2
+        adcx    rax, rax
+        adox    rax, rdx
+        mov     [z+64], rax
+        adcx    r9, r9                  // from here on the words are still in registers
+        adox    r9, rcx
+        mov     [z+72], r9
+
+        mov     rdx, [x+40]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[5]^2
+        adcx    r10, r10
+        adox    r10, rdx
+        mov     [z+80], r10
+        adcx    r11, r11
+        adox    r11, rcx
+        mov     [z+88], r11
+
+        mov     rdx, [x+48]
+        mulx    rcx, rdx, rdx           // rcx:rdx = x[6]^2
+        adcx    r12, r12
+        adox    r12, rdx
+        mov     [z+96], r12
+        adcx    r13, r13
+        adox    r13, rcx
+        mov     [z+104], r13
+
+        mov     rdx, [x+56]
+        mulx    r15, rdx, rdx           // r15:rdx = x[7]^2
+        adcx    r14, r14
+        adox    r14, rdx
+        mov     [z+112], r14
+        adcx    r15, zero               // top word: high half of x[7]^2 plus both carries
+        adox    r15, zero
+        mov     [z+120], r15
+
+.endm
+
+
+S2N_BN_SYMBOL(bignum_sqr_8_16):
+        _CET_ENDBR
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+
+// Save the callee-saved registers used below (rbp as zero, window r12..r15)
+
+        push    rbp
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+
+// Do the squaring; the whole computation is in the diagonals macro
+
+        diagonals
+
+// Restore registers (in reverse order of the pushes) and return
+
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbp
+
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
-- 
cgit v1.2.3-55-g6feb