Bring in bignum_{mul,sqr}_6_12{,_alt}() from s2n-bignum.

These provide fast multiplication and squaring of 6-word inputs, producing a 12-word result. The non-_alt versions require the CPU to support ADX instructions, while the _alt versions do not.
author: jsing <> 2025-08-12 10:09:46 +0000
committer: jsing <> 2025-08-12 10:09:46 +0000
commit: e5f4dc854c2c390526097888bf3654db21bdbb15 (patch)
tree: 25b70579b17511a42354b46d22fd95e599a86b6c /src
parent: f111c1e299493f0bdf21f9ff08f046d429764e75 (diff)
download: openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.tar.gz
openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.tar.bz2
openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.zip
4 files changed, 807 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
new file mode 100644
index 0000000000..081b602cde
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
@@ -0,0 +1,210 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[6], y[6]; output z[12]
+//
+//    extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
+//                                const uint64_t y[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "_internal_s2n_bignum.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12)
+        .text
+// Register aliases used by the macros and the routine below.
+// z and x are already in these registers under the standard x86-64 ABI
+// (the WINDOWS_ABI prologue moves them there)
+#define z rdi
+#define x rsi
+// y arrives in rdx but is copied here, because rdx is reloaded with each
+// y[i] word as the implicit mulx source operand
+#define y rcx
+// A register kept at zero for absorbing carries; zeroe is its 32-bit view
+#define zero rbp
+#define zeroe ebp
+// mulpadd i, j: add the double-word product rdx * x[j] into the rotating
+// six-register window r8..r13 at position (i + j) mod 6.
+//   - The low product word (rax) is added to window register (i + j) mod 6
+//     with adcx, i.e. on the CF carry chain.
+//   - The high product word (rbx) is added to the next window register,
+//     wrapping from r13 back to r8, with adox, i.e. on the OF carry chain.
+// The caller must have loaded the multiplier into rdx and initialized both
+// carry chains. Clobbers rax and rbx.
+// Would be nice to have conditional expressions reg[i], reg[i+1] ...
+.macro mulpadd arg1,arg2
+        mulx    rbx, rax, [x+8*\arg2]
+.if ((\arg1 + \arg2) % 6 == 0)
+        adcx    r8, rax
+        adox    r9, rbx
+.elseif ((\arg1 + \arg2) % 6 == 1)
+        adcx    r9, rax
+        adox    r10, rbx
+.elseif ((\arg1 + \arg2) % 6 == 2)
+        adcx    r10, rax
+        adox    r11, rbx
+.elseif ((\arg1 + \arg2) % 6 == 3)
+        adcx    r11, rax
+        adox    r12, rbx
+.elseif ((\arg1 + \arg2) % 6 == 4)
+        adcx    r12, rax
+        adox    r13, rbx
+.elseif ((\arg1 + \arg2) % 6 == 5)
+        adcx    r13, rax
+        adox    r8, rbx
+.endif
+.endm
+// addrow i: add the whole row y[i] * x into the rotating window r8..r13
+// and retire one result word.
+//   - Loads rdx = y[i] and clears the zero register; the xor also clears
+//     CF and OF, so the adcx and adox chains start with no carry-in.
+//   - After the x[0] term, window register (i mod 6) is complete, so it is
+//     stored to z[i] and becomes free for reuse.
+//   - The x[5] term puts its high product word directly into that freed
+//     register, which then absorbs the last carry of both chains.
+// Clobbers rax, rbx, rdx, the zero register contents (reset to 0) and flags.
+.macro addrow arg1
+        mov     rdx, [y+8*\arg1]
+        xor     zeroe, zeroe
+        mulpadd \arg1, 0
+// Store the now-final word z[i] from window register (i mod 6)
+.if (\arg1 % 6 == 0)
+        mov     [z+8*\arg1],r8
+.elseif (\arg1 % 6 == 1)
+        mov     [z+8*\arg1],r9
+.elseif (\arg1 % 6 == 2)
+        mov     [z+8*\arg1],r10
+.elseif (\arg1 % 6 == 3)
+        mov     [z+8*\arg1],r11
+.elseif (\arg1 % 6 == 4)
+        mov     [z+8*\arg1],r12
+.elseif (\arg1 % 6 == 5)
+        mov     [z+8*\arg1],r13
+.endif
+        mulpadd \arg1, 1
+        mulpadd \arg1, 2
+        mulpadd \arg1, 3
+        mulpadd \arg1, 4
+// Top term x[5]: the high word overwrites the register retired above and
+// becomes the new top of the window; fold the OF and CF carries into it
+.if (\arg1 % 6 == 0)
+        mulx    r8, rax, [x+40]
+        adcx    r13, rax
+        adox    r8, zero
+        adcx    r8, zero
+.elseif (\arg1 % 6 == 1)
+        mulx    r9, rax, [x+40]
+        adcx    r8, rax
+        adox    r9, zero
+        adcx    r9, zero
+.elseif (\arg1 % 6 == 2)
+        mulx    r10, rax, [x+40]
+        adcx    r9, rax
+        adox    r10, zero
+        adcx    r10, zero
+.elseif (\arg1 % 6 == 3)
+        mulx    r11, rax, [x+40]
+        adcx    r10, rax
+        adox    r11, zero
+        adcx    r11, zero
+.elseif (\arg1 % 6 == 4)
+        mulx    r12, rax, [x+40]
+        adcx    r11, rax
+        adox    r12, zero
+        adcx    r12, zero
+.elseif (\arg1 % 6 == 5)
+        mulx    r13, rax, [x+40]
+        adcx    r12, rax
+        adox    r13, zero
+        adcx    r13, zero
+.endif
+.endm
+// Entry point: z[0..11] := x[0..5] * y[0..5]. The product is built one row
+// y[i] * x at a time in the six-register window r8..r13; each row stores
+// one finished word of z and the last six words are written at the end.
+S2N_BN_SYMBOL(bignum_mul_6_12):
+        _CET_ENDBR
+#if WINDOWS_ABI
+// Move the Microsoft x64 arguments (rcx, rdx, r8) into the standard ABI
+// registers, saving rdi and rsi first so they can be restored on exit
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Save more registers to play with
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+// Copy y into a safe register to start with
+        mov     y, rdx
+// Zero a register, which also makes sure we don't get a fake carry-in
+        xor     zeroe, zeroe
+// Do the zeroth row, which is a bit different: only the CF chain is needed
+// Write back the zero-zero product low word and then accumulate, so that
+// r9,r10,r11,r12,r13,r8 end up holding words 1..6 of y[0] * x
+        mov     rdx, [y]
+        mulx    r9, r8, [x]
+        mov     [z], r8
+        mulx    r10, rbx, [x+8]
+        adcx    r9, rbx
+        mulx    r11, rbx, [x+16]
+        adcx    r10, rbx
+        mulx    r12, rbx, [x+24]
+        adcx    r11, rbx
+        mulx    r13, rbx, [x+32]
+        adcx    r12, rbx
+        mulx    r8, rbx, [x+40]
+        adcx    r13, rbx
+        adcx    r8, zero
+// Now all the other rows in a uniform pattern
+        addrow  1
+        addrow  2
+        addrow  3
+        addrow  4
+        addrow  5
+// Now write back the additional columns z[6..11], left in r8..r13 in order
+        mov     [z+48], r8
+        mov     [z+56], r9
+        mov     [z+64], r10
+        mov     [z+72], r11
+        mov     [z+80], r12
+        mov     [z+88], r13
+// Restore registers and return
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
new file mode 100644
index 0000000000..f4ea4ed68b
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
@@ -0,0 +1,186 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[6], y[6]; output z[12]
+//
+//    extern void bignum_mul_6_12_alt(uint64_t z[static 12],
+//                                    const uint64_t x[static 6],
+//                                    const uint64_t y[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+#include "_internal_s2n_bignum.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt)
+        .text
+// z and x are already in these registers under the standard x86-64 ABI
+// (the WINDOWS_ABI prologue moves them there)
+#define z rdi
+#define x rsi
+// This is moved from rdx to free it for muls, which write their high
+// result word to rdx
+#define y rcx
+// Other variables used as a rotating 3-word window to add terms to:
+// the roles (carry, high, low) shift by one register per result term
+#define t0 r8
+#define t1 r9
+#define t2 r10
+// Macro for the key "multiply and add to (c,h,l)" step:
+// (c,h,l) += numa * numb as a 3-word accumulation. numa is loaded into rax
+// and numb is used as a memory operand of mul. Clobbers rax, rdx and flags.
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// A minutely shorter form for when c = 0 initially: with c zero on entry,
+// "adc c, c" leaves just the incoming carry in c, the same result that
+// "adc c, 0" would give. Clobbers rax, rdx and flags.
+#define combadz(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, c
+// A short form where we don't expect a top carry: (h,l) += numa * numb
+// with any carry out of h discarded. Clobbers rax, rdx and flags.
+#define combads(h,l,numa,numb)                  \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx
+// Entry point: z[0..11] := x[0..5] * y[0..5], computed column by column.
+// Result term k sums every product x[i] * y[j] with i + j = k into the
+// 3-word window, stores the low window word as z[k], and then reuses that
+// register (after zeroing it) as the carry word of the next term.
+// Uses only mul/add/adc and no callee-saved registers in the standard ABI.
+S2N_BN_SYMBOL(bignum_mul_6_12_alt):
+        _CET_ENDBR
+#if WINDOWS_ABI
+// Move the Microsoft x64 arguments (rcx, rdx, r8) into the standard ABI
+// registers, saving rdi and rsi first so they can be restored on exit
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+#endif
+// Copy y into a safe register to start with
+        mov     y, rdx
+// Result term 0
+        mov     rax, [x]
+        mul     QWORD PTR [y]
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+// Result term 1
+        xor     t2, t2
+        combads(t1,t0,[x],[y+8])
+        combadz(t2,t1,t0,[x+8],[y])
+        mov     [z+8], t0
+// Result term 2
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+16])
+        combadd(t0,t2,t1,[x+8],[y+8])
+        combadd(t0,t2,t1,[x+16],[y])
+        mov     [z+16], t1
+// Result term 3
+        xor     t1, t1
+        combadz(t1,t0,t2,[x],[y+24])
+        combadd(t1,t0,t2,[x+8],[y+16])
+        combadd(t1,t0,t2,[x+16],[y+8])
+        combadd(t1,t0,t2,[x+24],[y])
+        mov     [z+24], t2
+// Result term 4
+        xor     t2, t2
+        combadz(t2,t1,t0,[x],[y+32])
+        combadd(t2,t1,t0,[x+8],[y+24])
+        combadd(t2,t1,t0,[x+16],[y+16])
+        combadd(t2,t1,t0,[x+24],[y+8])
+        combadd(t2,t1,t0,[x+32],[y])
+        mov     [z+32], t0
+// Result term 5 (the longest column, six products)
+        xor     t0, t0
+        combadz(t0,t2,t1,[x],[y+40])
+        combadd(t0,t2,t1,[x+8],[y+32])
+        combadd(t0,t2,t1,[x+16],[y+24])
+        combadd(t0,t2,t1,[x+24],[y+16])
+        combadd(t0,t2,t1,[x+32],[y+8])
+        combadd(t0,t2,t1,[x+40],[y])
+        mov     [z+40], t1
+// Result term 6
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+8],[y+40])
+        combadd(t1,t0,t2,[x+16],[y+32])
+        combadd(t1,t0,t2,[x+24],[y+24])
+        combadd(t1,t0,t2,[x+32],[y+16])
+        combadd(t1,t0,t2,[x+40],[y+8])
+        mov     [z+48], t2
+// Result term 7
+        xor     t2, t2
+        combadz(t2,t1,t0,[x+16],[y+40])
+        combadd(t2,t1,t0,[x+24],[y+32])
+        combadd(t2,t1,t0,[x+32],[y+24])
+        combadd(t2,t1,t0,[x+40],[y+16])
+        mov     [z+56], t0
+// Result term 8
+        xor     t0, t0
+        combadz(t0,t2,t1,[x+24],[y+40])
+        combadd(t0,t2,t1,[x+32],[y+32])
+        combadd(t0,t2,t1,[x+40],[y+24])
+        mov     [z+64], t1
+// Result term 9
+        xor     t1, t1
+        combadz(t1,t0,t2,[x+32],[y+40])
+        combadd(t1,t0,t2,[x+40],[y+32])
+        mov     [z+72], t2
+// Result term 10 (single product; only a 2-word window is needed)
+        combads(t1,t0,[x+40],[y+40])
+        mov     [z+80], t0
+// Result term 11 (the remaining top word)
+        mov     [z+88], t1
+// Return
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
new file mode 100644
index 0000000000..07d840b47c
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
@@ -0,0 +1,214 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[6]; output z[12]
+//
+//    extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "_internal_s2n_bignum.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12)
+        .text
+// z and x are already in these registers under the standard x86-64 ABI
+// (the WINDOWS_ABI prologue moves them there)
+#define z rdi
+#define x rsi
+// A register kept at zero for absorbing carries; zeroe is its 32-bit view
+#define zero rbp
+#define zeroe ebp
+// Other registers: dk accumulates result word k and is finally stored to
+// z[k] for k = 1..10
+#define d1 r8
+#define d2 r9
+#define d3 r10
+#define d4 r11
+#define d5 r12
+#define d6 r13
+#define d7 r14
+#define d8 r15
+#define d9 rbx
+// Care is needed: re-using the zero register. Once d10 has been written,
+// "zero" no longer holds zero and must not be used as such
+#define d10 rbp
+// Entry point: z[0..11] := x[0..5]^2. First the off-diagonal products
+// x[i] * x[j] with i < j are summed into d1..d10; then that sum is doubled
+// (on the CF chain) while the squares x[i]^2 are added in (on the OF chain)
+// and the result words are written out. In the comments below, "ij" is
+// shorthand for the product x[i] * x[j].
+S2N_BN_SYMBOL(bignum_sqr_6_12):
+        _CET_ENDBR
+#if WINDOWS_ABI
+// Move the Microsoft x64 arguments (rcx, rdx) into the standard ABI
+// registers, saving rdi and rsi first so they can be restored on exit
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+// Save more registers to play with
+        push    rbp
+        push    rbx
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+// Set up an initial window [d8;...d1] = [34;05;03;01]
+// i.e. d2:d1 = 01, d4:d3 = 03, d6:d5 = 05 and d8:d7 = 34
+        mov     rdx, [x]
+        mulx    d2, d1, [x+8]
+        mulx    d4, d3, [x+24]
+        mulx    d6, d5, [x+40]
+        mov     rdx, [x+24]
+        mulx    d8, d7, [x+32]
+// Clear our zero register, and also initialize the flags for the carry chain
+        xor     zeroe, zeroe
+// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window
+// (no carry-out possible since we add it to the top of a product)
+        mov     rdx, [x+16]
+        mulx    rcx, rax, [x]
+        adcx    d2, rax
+        adox    d3, rcx
+        mulx    rcx, rax, [x+8]
+        adcx    d3, rax
+        adox    d4, rcx
+        mov     rdx, [x+8]
+        mulx    rcx, rax, [x+24]
+        adcx    d4, rax
+        adox    d5, rcx
+        mulx    rcx, rax, [x+32]
+        adcx    d5, rax
+        adox    d6, rcx
+        mulx    rcx, rax, [x+40]
+        adcx    d6, rax
+        adox    d7, rcx
+        adcx    d7, zero
+        adox    d8, zero
+        adcx    d8, zero
+// Again zero out the flags. Actually they are already cleared but it may
+// help decouple these in the OOO engine not to wait for the chain above
+        xor     zeroe, zeroe
+// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms
+// We are running out of registers and here our zero register is not zero!
+// (it is about to be overwritten as d10 by the 45 product below)
+        mov     rdx, [x+32]
+        mulx    rcx, rax, [x]
+        adcx    d4, rax
+        adox    d5, rcx
+        mov     rdx, [x+16]
+        mulx    rcx, rax, [x+24]
+        adcx    d5, rax
+        adox    d6, rcx
+        mulx    rcx, rax, [x+32]
+        adcx    d6, rax
+        adox    d7, rcx
+        mulx    rcx, rax, [x+40]
+        adcx    d7, rax
+        adox    d8, rcx
+        mov     rdx, [x+24]
+        mulx    d9, rax, [x+40]
+        adcx    d8, rax
+        adox    d9, zero
+        mov     rdx, [x+32]
+        mulx    d10, rax, [x+40]
+        adcx    d9, rax
+// Need a zero in rax while both carry chains are live: mov leaves the
+// flags untouched, whereas xor would clear CF and OF
+        mov     eax, 0
+        adox    d10, rax
+        adcx    d10, rax
+// Again, just for a clear fresh start for the flags
+        xor     eax, eax
+// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms
+//
+// We could use shift-double but this seems tidier and in larger squarings
+// it was actually more efficient. I haven't experimented with this small
+// case to see how much that matters. Note: the writeback here is sprinkled
+// into the sequence in such a way that things still work if z = x, i.e. if
+// the output overwrites the input buffer and beyond.
+// Each z[k] store is placed after the load of x[k] into rdx.
+        mov     rdx, [x]
+        mulx    rdx, rax, rdx
+        mov     [z], rax
+        adcx    d1, d1
+        adox    d1, rdx
+        mov     rdx, [x+8]
+        mov     [z+8], d1
+        mulx    rdx, rax, rdx
+        adcx    d2, d2
+        adox    d2, rax
+        adcx    d3, d3
+        adox    d3, rdx
+        mov     rdx, [x+16]
+        mov     [z+16], d2
+        mulx    rdx, rax, rdx
+        adcx    d4, d4
+        adox    d4, rax
+        adcx    d5, d5
+        adox    d5, rdx
+        mov     rdx, [x+24]
+        mov     [z+24], d3
+        mulx    rdx, rax, rdx
+        adcx    d6, d6
+        adox    d6, rax
+        adcx    d7, d7
+        adox    d7, rdx
+        mov     rdx, [x+32]
+        mov     [z+32], d4
+        mulx    rdx, rax, rdx
+        adcx    d8, d8
+        adox    d8, rax
+        adcx    d9, d9
+        adox    d9, rdx
+        mov     rdx, [x+40]
+        mov     [z+40], d5
+        mulx    rdx, rax, rdx
+        mov     [z+48], d6
+        adcx    d10, d10
+        mov     [z+56], d7
+        adox    d10, rax
+        mov     [z+64], d8
+// Flag-preserving zero again: the final carries of both chains are folded
+// into the top word held in rdx
+        mov     eax, 0
+        mov     [z+72], d9
+        adcx    rdx, rax
+        mov     [z+80], d10
+        adox    rdx, rax
+        mov     [z+88], rdx
+// Restore saved registers and return
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbx
+        pop     rbp
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
new file mode 100644
index 0000000000..a517e5c654
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
@@ -0,0 +1,197 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[6]; output z[12]
+//
+//    extern void bignum_sqr_6_12_alt(uint64_t z[static 12],
+//                                    const uint64_t x[static 6]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI:   RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+#include "_internal_s2n_bignum.h"
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt)
+        .text
+// Input arguments, already in these registers under the standard x86-64
+// ABI (the WINDOWS_ABI prologue moves them there)
+#define z rdi
+#define x rsi
+// Other variables used as a rotating 3-word window to add terms to:
+// the roles (carry, high, low) shift by one register per result term
+#define t0 r8
+#define t1 r9
+#define t2 r10
+// Additional temporaries for local windows to share doublings: several
+// cross products are summed in (u1,u0) first and then doubled once
+#define u0 rcx
+#define u1 r11
+// Macro for the key "multiply and add to (c,h,l)" step:
+// (c,h,l) += numa * numb as a 3-word accumulation. numa is loaded into rax
+// and numb is used as a memory operand of mul. Clobbers rax, rdx and flags.
+#define combadd(c,h,l,numa,numb)                \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// Set up initial window (c,h,l) = numa * numb: the product goes into (h,l)
+// and c is zeroed, discarding the previous contents of all three.
+// Clobbers rax, rdx and flags.
+#define combaddz(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        xor     c, c;                           \
+        mov     l, rax;                         \
+        mov     h, rdx
+// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
+// The doubling is done in place, so hh and ll are modified as well; the
+// bit shifted out of hh and the carry from the addition both land in c.
+#define doubladd(c,h,l,hh,ll)                   \
+        add     ll, ll;                         \
+        adc     hh, hh;                         \
+        adc     c, c;                           \
+        add     l, ll;                          \
+        adc     h, hh;                          \
+        adc     c, 0
+// Square term incorporation (c,h,l) += numa^2
+// numa is loaded into rax and multiplied by itself. Clobbers rax, rdx
+// and flags.
+#define combadd1(c,h,l,numa)                    \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// A short form where we don't expect a top carry: (h,l) += numa^2 with any
+// carry out of h discarded. Clobbers rax, rdx and flags.
+#define combads(h,l,numa)                       \
+        mov     rax, numa;                      \
+        mul     rax;                            \
+        add     l, rax;                         \
+        adc     h, rdx
+// A version doubling directly before adding, for single non-square terms:
+// (c,h,l) += 2 * numa * numb. The product in rdx:rax is doubled in place,
+// its shifted-out top bit is added to c, and then the doubled value is
+// accumulated as in combadd. Clobbers rax, rdx and flags.
+#define combadd2(c,h,l,numa,numb)               \
+        mov     rax, numa;                      \
+        mul     QWORD PTR numb;                 \
+        add     rax, rax;                       \
+        adc     rdx, rdx;                       \
+        adc     c, 0;                           \
+        add     l, rax;                         \
+        adc     h, rdx;                         \
+        adc     c, 0
+// Entry point: z[0..11] := x[0..5]^2, computed column by column. Result
+// term k collects the cross products x[i] * x[j] with i < j and i + j = k,
+// counted twice, plus the square x[k/2]^2 when k is even. Columns with one
+// cross product double it directly (combadd2); columns with several sum
+// them in (u1,u0) first and double once (doubladd). The low window word is
+// then stored as z[k]. Uses only mul/add/adc and no callee-saved registers
+// in the standard ABI.
+S2N_BN_SYMBOL(bignum_sqr_6_12_alt):
+        _CET_ENDBR
+#if WINDOWS_ABI
+// Move the Microsoft x64 arguments (rcx, rdx) into the standard ABI
+// registers, saving rdi and rsi first so they can be restored on exit
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+#endif
+// Result term 0
+        mov     rax, [x]
+        mul     rax
+        mov     [z], rax
+        mov     t0, rdx
+        xor     t1, t1
+// Result term 1
+        xor     t2, t2
+        combadd2(t2,t1,t0,[x],[x+8])
+        mov     [z+8], t0
+// Result term 2
+        xor     t0, t0
+        combadd1(t0,t2,t1,[x+8])
+        combadd2(t0,t2,t1,[x],[x+16])
+        mov     [z+16], t1
+// Result term 3 (combaddz zeroes the new carry word, so no xor is needed)
+        combaddz(t1,u1,u0,[x],[x+24])
+        combadd(t1,u1,u0,[x+8],[x+16])
+        doubladd(t1,t0,t2,u1,u0)
+        mov     [z+24], t2
+// Result term 4
+        combaddz(t2,u1,u0,[x],[x+32])
+        combadd(t2,u1,u0,[x+8],[x+24])
+        doubladd(t2,t1,t0,u1,u0)
+        combadd1(t2,t1,t0,[x+16])
+        mov     [z+32], t0
+// Result term 5
+        combaddz(t0,u1,u0,[x],[x+40])
+        combadd(t0,u1,u0,[x+8],[x+32])
+        combadd(t0,u1,u0,[x+16],[x+24])
+        doubladd(t0,t2,t1,u1,u0)
+        mov     [z+40], t1
+// Result term 6
+        combaddz(t1,u1,u0,[x+8],[x+40])
+        combadd(t1,u1,u0,[x+16],[x+32])
+        doubladd(t1,t0,t2,u1,u0)
+        combadd1(t1,t0,t2,[x+24])
+        mov     [z+48], t2
+// Result term 7
+        combaddz(t2,u1,u0,[x+16],[x+40])
+        combadd(t2,u1,u0,[x+24],[x+32])
+        doubladd(t2,t1,t0,u1,u0)
+        mov     [z+56], t0
+// Result term 8
+        xor     t0, t0
+        combadd2(t0,t2,t1,[x+24],[x+40])
+        combadd1(t0,t2,t1,[x+32])
+        mov     [z+64], t1
+// Result term 9
+        xor     t1, t1
+        combadd2(t1,t0,t2,[x+32],[x+40])
+        mov     [z+72], t2
+// Result term 10 (only the square of x[5]; a 2-word window is enough)
+        combads(t1,t0,[x+40])
+        mov     [z+80], t0
+// Result term 11 (the remaining top word)
+        mov     [z+88], t1
+// Return
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
author	jsing <>	2025-08-12 10:09:46 +0000
committer	jsing <>	2025-08-12 10:09:46 +0000
commit	e5f4dc854c2c390526097888bf3654db21bdbb15 (patch)
tree	25b70579b17511a42354b46d22fd95e599a86b6c /src
parent	f111c1e299493f0bdf21f9ff08f046d429764e75 (diff)
download	openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.tar.gz openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.tar.bz2 openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.zip

diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S new file mode 100644 index 0000000000..081b602cde --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S
@@ -0,0 +1,210 @@
	1	// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
	2	// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
	3
	4	// ----------------------------------------------------------------------------
	5	// Multiply z := x * y
	6	// Inputs x[6], y[6]; output z[12]
	7	//
	8	// extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
	9	// const uint64_t y[static 6]);
	10	//
	11	// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
	12	// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
	13	// ----------------------------------------------------------------------------
	14
	15	#include "_internal_s2n_bignum.h"
	16
	17	.intel_syntax noprefix
	18	S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12)
	19	S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12)
	20	.text
	21
	22	// These are actually right
	23
	24	#define z rdi
	25	#define x rsi
	26
	27	// Copied in or set up
	28
	29	#define y rcx
	30
	31	// A zero register
	32
	33	#define zero rbp
	34	#define zeroe ebp
	35
	36	// Add in x[i] * rdx to the (i,i+1) position with the register window
	37	// Would be nice to have conditional expressions reg[i], reg[i+1] ...
	38
	39	.macro mulpadd arg1,arg2
	40	mulx rbx, rax, [x+8*\arg2]
	41	.if ((\arg1 + \arg2) % 6 == 0)
	42	adcx r8, rax
	43	adox r9, rbx
	44	.elseif ((\arg1 + \arg2) % 6 == 1)
	45	adcx r9, rax
	46	adox r10, rbx
	47	.elseif ((\arg1 + \arg2) % 6 == 2)
	48	adcx r10, rax
	49	adox r11, rbx
	50	.elseif ((\arg1 + \arg2) % 6 == 3)
	51	adcx r11, rax
	52	adox r12, rbx
	53	.elseif ((\arg1 + \arg2) % 6 == 4)
	54	adcx r12, rax
	55	adox r13, rbx
	56	.elseif ((\arg1 + \arg2) % 6 == 5)
	57	adcx r13, rax
	58	adox r8, rbx
	59	.endif
	60
	61	.endm
	62
	63
	64	// Add in the whole j'th row
	65
	66	.macro addrow arg1
	67	mov rdx, [y+8*\arg1]
	68	xor zeroe, zeroe
	69
	70	mulpadd \arg1, 0
	71
	72	.if (\arg1 % 6 == 0)
	73	mov [z+8*\arg1],r8
	74	.elseif (\arg1 % 6 == 1)
	75	mov [z+8*\arg1],r9
	76	.elseif (\arg1 % 6 == 2)
	77	mov [z+8*\arg1],r10
	78	.elseif (\arg1 % 6 == 3)
	79	mov [z+8*\arg1],r11
	80	.elseif (\arg1 % 6 == 4)
	81	mov [z+8*\arg1],r12
	82	.elseif (\arg1 % 6 == 5)
	83	mov [z+8*\arg1],r13
	84	.endif
	85
	86	mulpadd \arg1, 1
	87	mulpadd \arg1, 2
	88	mulpadd \arg1, 3
	89	mulpadd \arg1, 4
	90
	91	.if (\arg1 % 6 == 0)
	92	mulx r8, rax, [x+40]
	93	adcx r13, rax
	94	adox r8, zero
	95	adcx r8, zero
	96	.elseif (\arg1 % 6 == 1)
	97	mulx r9, rax, [x+40]
	98	adcx r8, rax
	99	adox r9, zero
	100	adcx r9, zero
	101	.elseif (\arg1 % 6 == 2)
	102	mulx r10, rax, [x+40]
	103	adcx r9, rax
	104	adox r10, zero
	105	adcx r10, zero
	106	.elseif (\arg1 % 6 == 3)
	107	mulx r11, rax, [x+40]
	108	adcx r10, rax
	109	adox r11, zero
	110	adcx r11, zero
	111	.elseif (\arg1 % 6 == 4)
	112	mulx r12, rax, [x+40]
	113	adcx r11, rax
	114	adox r12, zero
	115	adcx r12, zero
	116	.elseif (\arg1 % 6 == 5)
	117	mulx r13, rax, [x+40]
	118	adcx r12, rax
	119	adox r13, zero
	120	adcx r13, zero
	121	.endif
	122
	123	.endm
	124
	125
	126
	127	S2N_BN_SYMBOL(bignum_mul_6_12):
	128	_CET_ENDBR
	129
	130	#if WINDOWS_ABI
	131	push rdi
	132	push rsi
	133	mov rdi, rcx
	134	mov rsi, rdx
	135	mov rdx, r8
	136	#endif
	137
	138	// Save more registers to play with
	139
	140	push rbp
	141	push rbx
	142	push r12
	143	push r13
	144
	145	// Copy y into a safe register to start with
	146
	147	mov y, rdx
	148
	149	// Zero a register, which also makes sure we don't get a fake carry-in
	150
	151	xor zeroe, zeroe
	152
	153	// Do the zeroth row, which is a bit different
	154	// Write back the zero-zero product and then accumulate
	155	// r9,r10,r11,r12,r13,r8 end up holding words 1..6 of y[0] * x
	156
	157	mov rdx, [y]
	158
	159	mulx r9, r8, [x]
	160	mov [z], r8
	161
	162	mulx r10, rbx, [x+8]
	163	adcx r9, rbx
	164
	165	mulx r11, rbx, [x+16]
	166	adcx r10, rbx
	167
	168	mulx r12, rbx, [x+24]
	169	adcx r11, rbx
	170
	171	mulx r13, rbx, [x+32]
	172	adcx r12, rbx
	173
	174	mulx r8, rbx, [x+40]
	175	adcx r13, rbx
	176	adcx r8, zero
	177
	178	// Now all the other rows in a uniform pattern
	179
	180	addrow 1
	181	addrow 2
	182	addrow 3
	183	addrow 4
	184	addrow 5
	185
	186	// Now write back the additional columns
	187
	188	mov [z+48], r8
	189	mov [z+56], r9
	190	mov [z+64], r10
	191	mov [z+72], r11
	192	mov [z+80], r12
	193	mov [z+88], r13
	194
	195	// Restore registers and return
	196
	197	pop r13
	198	pop r12
	199	pop rbx
	200	pop rbp
	201
	202	#if WINDOWS_ABI
	203	pop rsi
	204	pop rdi
	205	#endif
	206	ret
	207
	208	#if defined(__linux__) && defined(__ELF__)
	209	.section .note.GNU-stack,"",%progbits
	210	#endif


diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S new file mode 100644 index 0000000000..f4ea4ed68b --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S
@@ -0,0 +1,186 @@
	1	// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
	2	// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
	3
	4	// ----------------------------------------------------------------------------
	5	// Multiply z := x * y
	6	// Inputs x[6], y[6]; output z[12]
	7	//
	8	// extern void bignum_mul_6_12_alt(uint64_t z[static 12],
	9	// const uint64_t x[static 6],
	10	// const uint64_t y[static 6]);
	11	//
	12	// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
	13	// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
	14	// ----------------------------------------------------------------------------
	15
	16	#include "_internal_s2n_bignum.h"
	17
	18	.intel_syntax noprefix
	19	S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt)
	20	S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt)
	21	.text
	22
	23	// These are actually right
	24
	25	#define z rdi
	26	#define x rsi
	27
	28	// This is moved from rdx to free it for muls
	29
	30	#define y rcx
	31
	32	// Other variables used as a rotating 3-word window to add terms to
	33
	34	#define t0 r8
	35	#define t1 r9
	36	#define t2 r10
	37
	38	// Macro for the key "multiply and add to (c,h,l)" step
	39
	40	#define combadd(c,h,l,numa,numb) \
	41	mov rax, numa; \
	42	mul QWORD PTR numb; \
	43	add l, rax; \
	44	adc h, rdx; \
	45	adc c, 0
	46
	47	// A minutely shorter form for when c = 0 initially
	48
	49	#define combadz(c,h,l,numa,numb) \
	50	mov rax, numa; \
	51	mul QWORD PTR numb; \
	52	add l, rax; \
	53	adc h, rdx; \
	54	adc c, c
	55
	56	// A short form where we don't expect a top carry
	57
	58	#define combads(h,l,numa,numb) \
	59	mov rax, numa; \
	60	mul QWORD PTR numb; \
	61	add l, rax; \
	62	adc h, rdx
	63
	64	S2N_BN_SYMBOL(bignum_mul_6_12_alt):
	65	_CET_ENDBR
	66
	67	#if WINDOWS_ABI
	68	push rdi
	69	push rsi
	70	mov rdi, rcx
	71	mov rsi, rdx
	72	mov rdx, r8
	73	#endif
	74
	75	// Copy y into a safe register to start with
	76
	77	mov y, rdx
	78
	79	// Result term 0
	80
	81	mov rax, [x]
	82	mul QWORD PTR [y]
	83
	84	mov [z], rax
	85	mov t0, rdx
	86	xor t1, t1
	87
	88	// Result term 1
	89
	90	xor t2, t2
	91	combads(t1,t0,[x],[y+8])
	92	combadz(t2,t1,t0,[x+8],[y])
	93	mov [z+8], t0
	94
	95	// Result term 2
	96
	97	xor t0, t0
	98	combadz(t0,t2,t1,[x],[y+16])
	99	combadd(t0,t2,t1,[x+8],[y+8])
	100	combadd(t0,t2,t1,[x+16],[y])
	101	mov [z+16], t1
	102
	103	// Result term 3
	104
	105	xor t1, t1
	106	combadz(t1,t0,t2,[x],[y+24])
	107	combadd(t1,t0,t2,[x+8],[y+16])
	108	combadd(t1,t0,t2,[x+16],[y+8])
	109	combadd(t1,t0,t2,[x+24],[y])
	110	mov [z+24], t2
	111
	112	// Result term 4
	113
	114	xor t2, t2
	115	combadz(t2,t1,t0,[x],[y+32])
	116	combadd(t2,t1,t0,[x+8],[y+24])
	117	combadd(t2,t1,t0,[x+16],[y+16])
	118	combadd(t2,t1,t0,[x+24],[y+8])
	119	combadd(t2,t1,t0,[x+32],[y])
	120	mov [z+32], t0
	121
	122	// Result term 5
	123
	124	xor t0, t0
	125	combadz(t0,t2,t1,[x],[y+40])
	126	combadd(t0,t2,t1,[x+8],[y+32])
	127	combadd(t0,t2,t1,[x+16],[y+24])
	128	combadd(t0,t2,t1,[x+24],[y+16])
	129	combadd(t0,t2,t1,[x+32],[y+8])
	130	combadd(t0,t2,t1,[x+40],[y])
	131	mov [z+40], t1
	132
	133	// Result term 6
	134
	135	xor t1, t1
	136	combadz(t1,t0,t2,[x+8],[y+40])
	137	combadd(t1,t0,t2,[x+16],[y+32])
	138	combadd(t1,t0,t2,[x+24],[y+24])
	139	combadd(t1,t0,t2,[x+32],[y+16])
	140	combadd(t1,t0,t2,[x+40],[y+8])
	141	mov [z+48], t2
	142
	143	// Result term 7
	144
	145	xor t2, t2
	146	combadz(t2,t1,t0,[x+16],[y+40])
	147	combadd(t2,t1,t0,[x+24],[y+32])
	148	combadd(t2,t1,t0,[x+32],[y+24])
	149	combadd(t2,t1,t0,[x+40],[y+16])
	150	mov [z+56], t0
	151
	152	// Result term 8
	153
	154	xor t0, t0
	155	combadz(t0,t2,t1,[x+24],[y+40])
	156	combadd(t0,t2,t1,[x+32],[y+32])
	157	combadd(t0,t2,t1,[x+40],[y+24])
	158	mov [z+64], t1
	159
	160	// Result term 9
	161
	162	xor t1, t1
	163	combadz(t1,t0,t2,[x+32],[y+40])
	164	combadd(t1,t0,t2,[x+40],[y+32])
	165	mov [z+72], t2
	166
	167	// Result term 10
	168
	169	combads(t1,t0,[x+40],[y+40])
	170	mov [z+80], t0
	171
	172	// Result term 11
	173
	174	mov [z+88], t1
	175
	176	// Return
	177
	178	#if WINDOWS_ABI
	179	pop rsi
	180	pop rdi
	181	#endif
	182	ret
	183
	184	#if defined(__linux__) && defined(__ELF__)
	185	.section .note.GNU-stack,"",%progbits
	186	#endif


diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S new file mode 100644 index 0000000000..07d840b47c --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
@@ -0,0 +1,214 @@
	1	// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
	2	// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
	3
	4	// ----------------------------------------------------------------------------
	5	// Square, z := x^2
	6	// Input x[6]; output z[12]
	7	//
	8	// extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]);
	9	//
	10	// Standard x86-64 ABI: RDI = z, RSI = x
	11	// Microsoft x64 ABI: RCX = z, RDX = x
	12	// ----------------------------------------------------------------------------
	13
	14	#include "_internal_s2n_bignum.h"
	15
	16	.intel_syntax noprefix
	17	S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12)
	18	S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12)
	19	.text
	20
	21	// Argument registers: under the standard x86-64 ABI z and x already arrive
	22	// in rdi and rsi (the WINDOWS_ABI prologue copies rcx and rdx into them)
	23	#define z rdi
	24	#define x rsi
	25
	26	// A zero register (zeroe is its 32-bit name, used to clear it with xor)
	27
	28	#define zero rbp
	29	#define zeroe ebp
	30
	31	// Other registers: d<i> accumulates result word i until it is stored to z[i]
	32
	33	#define d1 r8
	34	#define d2 r9
	35	#define d3 r10
	36	#define d4 r11
	37	#define d5 r12
	38	#define d6 r13
	39	#define d7 r14
	40	#define d8 r15
	41	#define d9 rbx
	42
	43	// Care is needed: d10 re-uses the zero register (rbp), so "zero" only
	44	// reads as zero until d10 is first written
	45	#define d10 rbp
	46	// bignum_sqr_6_12: z[0..11] := x[0..5]^2 using mulx and adcx/adox (needs BMI2 and ADX).
	47	// Preserves rbp, rbx, r12-r15 (and rdi, rsi under WINDOWS_ABI); clobbers rax, rcx, rdx, r8-r11, flags.
	48	S2N_BN_SYMBOL(bignum_sqr_6_12):
	49	_CET_ENDBR
	50
	51	#if WINDOWS_ABI
	52	push rdi // rdi and rsi are callee-saved in the Microsoft x64 ABI
	53	push rsi
	54	mov rdi, rcx // z arrives in rcx
	55	mov rsi, rdx // x arrives in rdx
	56	#endif
	57
	58	// Save the callee-saved registers used below as d5..d10 and the zero register
	59
	60	push rbp
	61	push rbx
	62	push r12
	63	push r13
	64	push r14
	65	push r15
	66
	67	// Set up an initial window [d8;...d1] = [34;05;03;01]
	68	// (a label ij denotes the 128-bit product x[i] * x[j]; mulx leaves the flags alone)
	69	mov rdx, [x]
	70	mulx d2, d1, [x+8] // d2:d1 = 01
	71	mulx d4, d3, [x+24] // d4:d3 = 03
	72	mulx d6, d5, [x+40] // d6:d5 = 05
	73	mov rdx, [x+24]
	74	mulx d8, d7, [x+32] // d8:d7 = 34
	75
	76	// Clear our zero register, and also initialize the flags for the carry chain
	77	// (the xor clears both CF, consumed by adcx, and OF, consumed by adox)
	78	xor zeroe, zeroe
	79
	80	// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window
	81	// (no carry-out possible since we add it to the top of a product)
	82	// Low halves (rax) ride the CF chain via adcx, high halves (rcx) the OF chain via adox
	83	mov rdx, [x+16]
	84	mulx rcx, rax, [x] // 02
	85	adcx d2, rax
	86	adox d3, rcx
	87	mulx rcx, rax, [x+8] // 12
	88	adcx d3, rax
	89	adox d4, rcx
	90	mov rdx, [x+8]
	91	mulx rcx, rax, [x+24] // 13
	92	adcx d4, rax
	93	adox d5, rcx
	94	mulx rcx, rax, [x+32] // 14
	95	adcx d5, rax
	96	adox d6, rcx
	97	mulx rcx, rax, [x+40] // 15
	98	adcx d6, rax
	99	adox d7, rcx
	100	adcx d7, zero // absorb the pending CF into d7
	101	adox d8, zero // absorb the pending OF into d8
	102	adcx d8, zero // and any CF produced by the d7 addition
	103
	104	// Again zero out the flags. Actually they are already cleared but it may
	105	// help decouple these in the OOO engine not to wait for the chain above
	106
	107	xor zeroe, zeroe
	108
	109	// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms
	110	// We are running out of registers: d10 shares rbp with the zero register, so
	111	// "zero" stops being zero once the 45 product is written; rax is zeroed instead
	112	mov rdx, [x+32]
	113	mulx rcx, rax, [x] // 04
	114	adcx d4, rax
	115	adox d5, rcx
	116	mov rdx, [x+16]
	117	mulx rcx, rax, [x+24] // 23
	118	adcx d5, rax
	119	adox d6, rcx
	120	mulx rcx, rax, [x+32] // 24
	121	adcx d6, rax
	122	adox d7, rcx
	123	mulx rcx, rax, [x+40] // 25
	124	adcx d7, rax
	125	adox d8, rcx
	126	mov rdx, [x+24]
	127	mulx d9, rax, [x+40] // 35: the high half initializes d9 directly
	128	adcx d8, rax
	129	adox d9, zero // rbp is still zero here; this only adds the pending OF
	130	mov rdx, [x+32]
	131	mulx d10, rax, [x+40] // 45: the high half initializes d10, overwriting rbp
	132	adcx d9, rax
	133	mov eax, 0 // mov rather than xor, so CF and OF stay live
	134	adox d10, rax // absorb the pending OF
	135	adcx d10, rax // absorb the pending CF
	136
	137	// Again, just for a clear fresh start for the flags
	138
	139	xor eax, eax
	140
	141	// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms
	142	//
	143	// We could use shift-double but this seems tidier and in larger squarings
	144	// it was actually more efficient. I haven't experimented with this small
	145	// case to see how much that matters. Note: the writeback here is sprinkled
	146	// into the sequence in such a way that things still work if z = x, i.e. if
	147	// the output overwrites the input buffer and beyond.
	148	// Per word: adcx d,d doubles the cross-term sum (CF chain), adox adds the square term (OF chain)
	149	mov rdx, [x]
	150	mulx rdx, rax, rdx // rdx:rax = x[0]^2
	151	mov [z], rax
	152	adcx d1, d1
	153	adox d1, rdx
	154	mov rdx, [x+8] // x[1] is loaded before z[1] is stored
	155	mov [z+8], d1
	156	mulx rdx, rax, rdx // rdx:rax = x[1]^2
	157	adcx d2, d2
	158	adox d2, rax
	159	adcx d3, d3
	160	adox d3, rdx
	161	mov rdx, [x+16] // x[2] is loaded before z[2] is stored
	162	mov [z+16], d2
	163	mulx rdx, rax, rdx // rdx:rax = x[2]^2
	164	adcx d4, d4
	165	adox d4, rax
	166	adcx d5, d5
	167	adox d5, rdx
	168	mov rdx, [x+24] // x[3] is loaded before z[3] is stored
	169	mov [z+24], d3
	170	mulx rdx, rax, rdx // rdx:rax = x[3]^2
	171	adcx d6, d6
	172	adox d6, rax
	173	adcx d7, d7
	174	adox d7, rdx
	175	mov rdx, [x+32] // x[4] is loaded before z[4] is stored
	176	mov [z+32], d4
	177	mulx rdx, rax, rdx // rdx:rax = x[4]^2
	178	adcx d8, d8
	179	adox d8, rax
	180	adcx d9, d9
	181	adox d9, rdx
	182	mov rdx, [x+40] // x[5] is loaded before z[5] is stored
	183	mov [z+40], d5
	184	mulx rdx, rax, rdx // rdx:rax = x[5]^2
	185	mov [z+48], d6
	186	adcx d10, d10
	187	mov [z+56], d7
	188	adox d10, rax
	189	mov [z+64], d8
	190	mov eax, 0 // zero rax without disturbing CF and OF
	191	mov [z+72], d9
	192	adcx rdx, rax // top word = high half of x[5]^2 plus the final CF
	193	mov [z+80], d10
	194	adox rdx, rax // plus the final OF
	195	mov [z+88], rdx
	196
	197	// Restore saved registers (reverse order of the pushes above) and return
	198
	199	pop r15
	200	pop r14
	201	pop r13
	202	pop r12
	203	pop rbx
	204	pop rbp
	205
	206	#if WINDOWS_ABI
	207	pop rsi
	208	pop rdi
	209	#endif
	210	ret
	211
	212	#if defined(__linux__) && defined(__ELF__)
	213	.section .note.GNU-stack,"",%progbits
	214	#endif


diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S new file mode 100644 index 0000000000..a517e5c654 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S
@@ -0,0 +1,197 @@
	1	// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
	2	// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
	3
	4	// ----------------------------------------------------------------------------
	5	// Square, z := x^2
	6	// Input x[6]; output z[12]
	7	//
	8	// extern void bignum_sqr_6_12_alt(uint64_t z[static 12],
	9	// const uint64_t x[static 6]);
	10	//
	11	// Standard x86-64 ABI: RDI = z, RSI = x
	12	// Microsoft x64 ABI: RCX = z, RDX = x
	13	// ----------------------------------------------------------------------------
	14
	15	#include "_internal_s2n_bignum.h"
	16
	17	.intel_syntax noprefix
	18	S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt)
	19	S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt)
	20	.text
	21
	22	// Input arguments (under the standard x86-64 ABI they already arrive in these registers)
	23
	24	#define z rdi
	25	#define x rsi
	26
	27	// Other variables used as a rotating 3-word window to add terms to
	28	// (each result term stores the window's low word to z, then reuses that register as the next top word)
	29	#define t0 r8
	30	#define t1 r9
	31	#define t2 r10
	32
	33	// Additional temporaries: a local (u1,u0) window that sums several cross
	34	// products so that they can share a single doubling in doubladd
	35	#define u0 rcx
	36	#define u1 r11
	37
	38	// Macro for the key "multiply and add to (c,h,l)" step: (c,h,l) += numa * numb
	39	// numa and numb are memory operands; clobbers rax, rdx and the flags
	40	#define combadd(c,h,l,numa,numb) \
	41	mov rax, numa; \
	42	mul QWORD PTR numb; \
	43	add l, rax; \
	44	adc h, rdx; \
	45	adc c, 0
	46
	47	// Set up initial window (c,h,l) = numa * numb, with the top word c cleared to zero
	48	// Previous contents of c, h and l are discarded; clobbers rax, rdx and the flags
	49	#define combaddz(c,h,l,numa,numb) \
	50	mov rax, numa; \
	51	mul QWORD PTR numb; \
	52	xor c, c; \
	53	mov l, rax; \
	54	mov h, rdx
	55
	56	// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
	57	// hh and ll are doubled in place (their old values are lost); clobbers the flags
	58	#define doubladd(c,h,l,hh,ll) \
	59	add ll, ll; \
	60	adc hh, hh; \
	61	adc c, c; \
	62	add l, ll; \
	63	adc h, hh; \
	64	adc c, 0
	65
	66	// Square term incorporation (c,h,l) += numa^2
	67	// numa is a memory operand; clobbers rax, rdx and the flags
	68	#define combadd1(c,h,l,numa) \
	69	mov rax, numa; \
	70	mul rax; \
	71	add l, rax; \
	72	adc h, rdx; \
	73	adc c, 0
	74
	75	// A short form (h,l) += numa^2 where we don't expect a top carry
	76	// (any carry out of h is not propagated); clobbers rax, rdx and the flags
	77	#define combads(h,l,numa) \
	78	mov rax, numa; \
	79	mul rax; \
	80	add l, rax; \
	81	adc h, rdx
	82
	83	// A version doubling directly before adding, for single non-square terms:
	84	// (c,h,l) += 2 * numa * numb; the bit doubled out of rdx:rax is added into c first
	85	#define combadd2(c,h,l,numa,numb) \
	86	mov rax, numa; \
	87	mul QWORD PTR numb; \
	88	add rax, rax; \
	89	adc rdx, rdx; \
	90	adc c, 0; \
	91	add l, rax; \
	92	adc h, rdx; \
	93	adc c, 0
	94	// bignum_sqr_6_12_alt: z[0..11] := x[0..5]^2 using only mul/add/adc (no mulx, adcx or adox)
	95	S2N_BN_SYMBOL(bignum_sqr_6_12_alt):
	96	_CET_ENDBR
	97	// Clobbers rax, rcx, rdx, r8-r11 and the flags; saves only rdi/rsi, and only under WINDOWS_ABI
	98	#if WINDOWS_ABI
	99	push rdi // rdi and rsi are callee-saved in the Microsoft x64 ABI
	100	push rsi
	101	mov rdi, rcx // z arrives in rcx
	102	mov rsi, rdx // x arrives in rdx
	103	#endif
	104	// NOTE(review): x words are re-read after lower z words are stored (e.g. [x] in term 1), so z overlapping x looks unsupported - confirm
	105	// Result term 0: low word of x[0]^2; the high word seeds the window in t0
	106
	107	mov rax, [x]
	108	mul rax
	109
	110	mov [z], rax
	111	mov t0, rdx
	112	xor t1, t1
	113
	114	// Result term 1: 2 * x[0]*x[1]
	115
	116	xor t2, t2
	117	combadd2(t2,t1,t0,[x],[x+8])
	118	mov [z+8], t0
	119
	120	// Result term 2: x[1]^2 + 2 * x[0]*x[2]
	121
	122	xor t0, t0
	123	combadd1(t0,t2,t1,[x+8])
	124	combadd2(t0,t2,t1,[x],[x+16])
	125	mov [z+16], t1
	126
	127	// Result term 3: 2 * (x[0]*x[3] + x[1]*x[2]); combaddz clears the new top word t1
	128
	129	combaddz(t1,u1,u0,[x],[x+24])
	130	combadd(t1,u1,u0,[x+8],[x+16])
	131	doubladd(t1,t0,t2,u1,u0)
	132	mov [z+24], t2
	133
	134	// Result term 4: 2 * (x[0]*x[4] + x[1]*x[3]) + x[2]^2
	135
	136	combaddz(t2,u1,u0,[x],[x+32])
	137	combadd(t2,u1,u0,[x+8],[x+24])
	138	doubladd(t2,t1,t0,u1,u0)
	139	combadd1(t2,t1,t0,[x+16])
	140	mov [z+32], t0
	141
	142	// Result term 5: 2 * (x[0]*x[5] + x[1]*x[4] + x[2]*x[3])
	143
	144	combaddz(t0,u1,u0,[x],[x+40])
	145	combadd(t0,u1,u0,[x+8],[x+32])
	146	combadd(t0,u1,u0,[x+16],[x+24])
	147	doubladd(t0,t2,t1,u1,u0)
	148	mov [z+40], t1
	149
	150	// Result term 6: 2 * (x[1]*x[5] + x[2]*x[4]) + x[3]^2
	151
	152	combaddz(t1,u1,u0,[x+8],[x+40])
	153	combadd(t1,u1,u0,[x+16],[x+32])
	154	doubladd(t1,t0,t2,u1,u0)
	155	combadd1(t1,t0,t2,[x+24])
	156	mov [z+48], t2
	157
	158	// Result term 7: 2 * (x[2]*x[5] + x[3]*x[4])
	159
	160	combaddz(t2,u1,u0,[x+16],[x+40])
	161	combadd(t2,u1,u0,[x+24],[x+32])
	162	doubladd(t2,t1,t0,u1,u0)
	163	mov [z+56], t0
	164
	165	// Result term 8: 2 * x[3]*x[5] + x[4]^2 (top word cleared explicitly, no combaddz here)
	166
	167	xor t0, t0
	168	combadd2(t0,t2,t1,[x+24],[x+40])
	169	combadd1(t0,t2,t1,[x+32])
	170	mov [z+64], t1
	171
	172	// Result term 9: 2 * x[4]*x[5]
	173
	174	xor t1, t1
	175	combadd2(t1,t0,t2,[x+32],[x+40])
	176	mov [z+72], t2
	177
	178	// Result term 10: x[5]^2 added to the two remaining window words
	179
	180	combads(t1,t0,[x+40])
	181	mov [z+80], t0
	182
	183	// Result term 11: the remaining top word of the window
	184
	185	mov [z+88], t1
	186
	187	// Return
	188
	189	#if WINDOWS_ABI
	190	pop rsi
	191	pop rdi
	192	#endif
	193	ret
	194
	195	#if defined(__linux__) && defined(__ELF__)
	196	.section .note.GNU-stack,"",%progbits
	197	#endif