From e5f4dc854c2c390526097888bf3654db21bdbb15 Mon Sep 17 00:00:00 2001 From: jsing <> Date: Tue, 12 Aug 2025 10:09:46 +0000 Subject: Bring in bignum_{mul,sqr}_6_12{,_alt}() from s2n-bignum. These provide fast multiplication and squaring of inputs with 6x words, producing a 12 word result. The non-_alt versions require the CPU to support ADX instructions, while the _alt versions do not. --- src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S | 210 ++++++++++++++++++++ .../libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S | 186 ++++++++++++++++++ src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S | 214 +++++++++++++++++++++ .../libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S | 197 +++++++++++++++++++ 4 files changed, 807 insertions(+) create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S create mode 100644 src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S (limited to 'src/lib') diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S new file mode 100644 index 0000000000..081b602cde --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S @@ -0,0 +1,210 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6], +// const uint64_t y[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12) + .text + +// These are actually right + +#define z rdi +#define x rsi + +// Copied in or set up + +#define y rcx + +// A zero register + +#define zero rbp +#define zeroe ebp + +// Add in x[i] * rdx to the (i,i+1) position with the register window +// Would be nice to have conditional expressions reg[i], reg[i+1] ... 
+ +.macro mulpadd arg1,arg2 + mulx rbx, rax, [x+8*\arg2] +.if ((\arg1 + \arg2) % 6 == 0) + adcx r8, rax + adox r9, rbx +.elseif ((\arg1 + \arg2) % 6 == 1) + adcx r9, rax + adox r10, rbx +.elseif ((\arg1 + \arg2) % 6 == 2) + adcx r10, rax + adox r11, rbx +.elseif ((\arg1 + \arg2) % 6 == 3) + adcx r11, rax + adox r12, rbx +.elseif ((\arg1 + \arg2) % 6 == 4) + adcx r12, rax + adox r13, rbx +.elseif ((\arg1 + \arg2) % 6 == 5) + adcx r13, rax + adox r8, rbx +.endif + +.endm + + +// Add in the whole j'th row + +.macro addrow arg1 + mov rdx, [y+8*\arg1] + xor zeroe, zeroe + + mulpadd \arg1, 0 + +.if (\arg1 % 6 == 0) + mov [z+8*\arg1],r8 +.elseif (\arg1 % 6 == 1) + mov [z+8*\arg1],r9 +.elseif (\arg1 % 6 == 2) + mov [z+8*\arg1],r10 +.elseif (\arg1 % 6 == 3) + mov [z+8*\arg1],r11 +.elseif (\arg1 % 6 == 4) + mov [z+8*\arg1],r12 +.elseif (\arg1 % 6 == 5) + mov [z+8*\arg1],r13 +.endif + + mulpadd \arg1, 1 + mulpadd \arg1, 2 + mulpadd \arg1, 3 + mulpadd \arg1, 4 + +.if (\arg1 % 6 == 0) + mulx r8, rax, [x+40] + adcx r13, rax + adox r8, zero + adcx r8, zero +.elseif (\arg1 % 6 == 1) + mulx r9, rax, [x+40] + adcx r8, rax + adox r9, zero + adcx r9, zero +.elseif (\arg1 % 6 == 2) + mulx r10, rax, [x+40] + adcx r9, rax + adox r10, zero + adcx r10, zero +.elseif (\arg1 % 6 == 3) + mulx r11, rax, [x+40] + adcx r10, rax + adox r11, zero + adcx r11, zero +.elseif (\arg1 % 6 == 4) + mulx r12, rax, [x+40] + adcx r11, rax + adox r12, zero + adcx r12, zero +.elseif (\arg1 % 6 == 5) + mulx r13, rax, [x+40] + adcx r12, rax + adox r13, zero + adcx r13, zero +.endif + +.endm + + + +S2N_BN_SYMBOL(bignum_mul_6_12): + _CET_ENDBR + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 +#endif + +// Save more registers to play with + + push rbp + push rbx + push r12 + push r13 + +// Copy y into a safe register to start with + + mov y, rdx + +// Zero a register, which also makes sure we don't get a fake carry-in + + xor zeroe, zeroe + +// Do the zeroth row, which is a bit different +// Write back the zero-zero product and then accumulate +// r8,r13,r12,r11,r10,r9 as y[0] * x from 1..6 + + mov rdx, [y] + + mulx r9, r8, [x] + mov [z], r8 + + mulx r10, rbx, [x+8] + adcx r9, rbx + + mulx r11, rbx, [x+16] + adcx r10, rbx + + mulx r12, rbx, [x+24] + adcx r11, rbx + + mulx r13, rbx, [x+32] + adcx r12, rbx + + mulx r8, rbx, [x+40] + adcx r13, rbx + adcx r8, zero + +// Now all the other rows in a uniform pattern + + addrow 1 + addrow 2 + addrow 3 + addrow 4 + addrow 5 + +// Now write back the additional columns + + mov [z+48], r8 + mov [z+56], r9 + mov [z+64], r10 + mov [z+72], r11 + mov [z+80], r12 + mov [z+88], r13 + +// Restore registers and return + + pop r13 + pop r12 + pop rbx + pop rbp + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S new file mode 100644 index 0000000000..f4ea4ed68b --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S @@ -0,0 +1,186 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12_alt(uint64_t z[static 12], +// const uint64_t x[static 6], +// const uint64_t y[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt) + .text + +// These are actually right + +#define z rdi +#define x rsi + +// This is moved from rdx to free it for muls + +#define y rcx + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 r8 +#define t1 r9 +#define t2 r10 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx + +S2N_BN_SYMBOL(bignum_mul_6_12_alt): + _CET_ENDBR + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 +#endif + +// Copy y into a safe register to start with + + mov y, rdx + +// Result term 0 + + mov rax, [x] + mul QWORD PTR [y] + + mov [z], rax + mov t0, rdx + xor t1, t1 + +// Result term 1 + + xor t2, t2 + combads(t1,t0,[x],[y+8]) + combadz(t2,t1,t0,[x+8],[y]) + mov [z+8], t0 + +// Result term 2 + + xor t0, t0 + combadz(t0,t2,t1,[x],[y+16]) + combadd(t0,t2,t1,[x+8],[y+8]) + combadd(t0,t2,t1,[x+16],[y]) + mov [z+16], t1 + +// Result term 3 + + xor t1, t1 + combadz(t1,t0,t2,[x],[y+24]) + combadd(t1,t0,t2,[x+8],[y+16]) + combadd(t1,t0,t2,[x+16],[y+8]) + combadd(t1,t0,t2,[x+24],[y]) + mov [z+24], t2 + +// Result term 4 + + xor t2, t2 + combadz(t2,t1,t0,[x],[y+32]) + combadd(t2,t1,t0,[x+8],[y+24]) + combadd(t2,t1,t0,[x+16],[y+16]) + combadd(t2,t1,t0,[x+24],[y+8]) + combadd(t2,t1,t0,[x+32],[y]) + mov [z+32], t0 + +// Result term 5 + + xor t0, t0 + combadz(t0,t2,t1,[x],[y+40]) + combadd(t0,t2,t1,[x+8],[y+32]) + combadd(t0,t2,t1,[x+16],[y+24]) + combadd(t0,t2,t1,[x+24],[y+16]) + combadd(t0,t2,t1,[x+32],[y+8]) + combadd(t0,t2,t1,[x+40],[y]) + mov [z+40], t1 + +// Result term 6 + + xor t1, t1 + combadz(t1,t0,t2,[x+8],[y+40]) + combadd(t1,t0,t2,[x+16],[y+32]) + combadd(t1,t0,t2,[x+24],[y+24]) + combadd(t1,t0,t2,[x+32],[y+16]) + combadd(t1,t0,t2,[x+40],[y+8]) + mov [z+48], t2 + +// Result term 7 + + xor t2, t2 + combadz(t2,t1,t0,[x+16],[y+40]) + combadd(t2,t1,t0,[x+24],[y+32]) + combadd(t2,t1,t0,[x+32],[y+24]) + combadd(t2,t1,t0,[x+40],[y+16]) + mov [z+56], t0 + +// Result term 8 + + xor t0, t0 + combadz(t0,t2,t1,[x+24],[y+40]) + combadd(t0,t2,t1,[x+32],[y+32]) + combadd(t0,t2,t1,[x+40],[y+24]) + mov [z+64], t1 + +// Result term 9 + + xor t1, t1 + combadz(t1,t0,t2,[x+32],[y+40]) + combadd(t1,t0,t2,[x+40],[y+32]) + mov [z+72], t2 + +// Result term 10 + + combads(t1,t0,[x+40],[y+40]) + mov [z+80], t0 + +// Result term 11 + + mov [z+88], t1 + +// Return + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && 
defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S new file mode 100644 index 0000000000..07d840b47c --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S @@ -0,0 +1,214 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12) + .text + +// These are actually right + +#define z rdi +#define x rsi + +// A zero register + +#define zero rbp +#define zeroe ebp + +// Other registers + +#define d1 r8 +#define d2 r9 +#define d3 r10 +#define d4 r11 +#define d5 r12 +#define d6 r13 +#define d7 r14 +#define d8 r15 +#define d9 rbx + +// Care is needed: re-using the zero register + +#define d10 rbp + + +S2N_BN_SYMBOL(bignum_sqr_6_12): + _CET_ENDBR + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx +#endif + +// Save more registers to play with + + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + +// Set up an initial window [d8;...d1] = [34;05;03;01] + + mov rdx, [x] + mulx d2, d1, [x+8] + mulx d4, d3, [x+24] + mulx d6, d5, [x+40] + mov rdx, [x+24] + mulx d8, d7, [x+32] + +// Clear our zero register, and also initialize the flags for the carry chain + + xor zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window +// (no carry-out possible since we add it to the top of a product) + + mov rdx, [x+16] + mulx rcx, rax, [x] + adcx d2, rax + adox d3, rcx + mulx rcx, rax, [x+8] + adcx d3, rax + adox d4, rcx + mov rdx, [x+8] + mulx rcx, rax, [x+24] + adcx d4, rax + adox d5, rcx + mulx rcx, rax, [x+32] + adcx d5, rax + adox d6, rcx + mulx rcx, rax, [x+40] + adcx d6, rax + adox d7, rcx + adcx d7, zero + adox d8, zero + adcx d8, zero + +// Again zero out the flags. Actually they are already cleared but it may +// help decouple these in the OOO engine not to wait for the chain above + + xor zeroe, zeroe + +// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms +// We are running out of registers and here our zero register is not zero! + + mov rdx, [x+32] + mulx rcx, rax, [x] + adcx d4, rax + adox d5, rcx + mov rdx, [x+16] + mulx rcx, rax, [x+24] + adcx d5, rax + adox d6, rcx + mulx rcx, rax, [x+32] + adcx d6, rax + adox d7, rcx + mulx rcx, rax, [x+40] + adcx d7, rax + adox d8, rcx + mov rdx, [x+24] + mulx d9, rax, [x+40] + adcx d8, rax + adox d9, zero + mov rdx, [x+32] + mulx d10, rax, [x+40] + adcx d9, rax + mov eax, 0 + adox d10, rax + adcx d10, rax + +// Again, just for a clear fresh start for the flags + + xor eax, eax + +// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms +// +// We could use shift-double but this seems tidier and in larger squarings +// it was actually more efficient. I haven't experimented with this small +// case to see how much that matters. Note: the writeback here is sprinkled +// into the sequence in such a way that things still work if z = x, i.e. 
if +// the output overwrites the input buffer and beyond. + + mov rdx, [x] + mulx rdx, rax, rdx + mov [z], rax + adcx d1, d1 + adox d1, rdx + mov rdx, [x+8] + mov [z+8], d1 + mulx rdx, rax, rdx + adcx d2, d2 + adox d2, rax + adcx d3, d3 + adox d3, rdx + mov rdx, [x+16] + mov [z+16], d2 + mulx rdx, rax, rdx + adcx d4, d4 + adox d4, rax + adcx d5, d5 + adox d5, rdx + mov rdx, [x+24] + mov [z+24], d3 + mulx rdx, rax, rdx + adcx d6, d6 + adox d6, rax + adcx d7, d7 + adox d7, rdx + mov rdx, [x+32] + mov [z+32], d4 + mulx rdx, rax, rdx + adcx d8, d8 + adox d8, rax + adcx d9, d9 + adox d9, rdx + mov rdx, [x+40] + mov [z+40], d5 + mulx rdx, rax, rdx + mov [z+48], d6 + adcx d10, d10 + mov [z+56], d7 + adox d10, rax + mov [z+64], d8 + mov eax, 0 + mov [z+72], d9 + adcx rdx, rax + mov [z+80], d10 + adox rdx, rax + mov [z+88], rdx + +// Restore saved registers and return + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S new file mode 100644 index 0000000000..a517e5c654 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S @@ -0,0 +1,197 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12_alt(uint64_t z[static 12], +// const uint64_t x[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt) + .text + +// Input arguments + +#define z rdi +#define x rsi + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 r8 +#define t1 r9 +#define t2 r10 + +// Additional temporaries for local windows to share doublings + +#define u0 rcx +#define u1 r11 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// Set up initial window (c,h,l) = numa * numb + +#define combaddz(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + xor c, c; \ + mov l, rax; \ + mov h, rdx + +// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l) + +#define doubladd(c,h,l,hh,ll) \ + add ll, ll; \ + adc hh, hh; \ + adc c, c; \ + add l, ll; \ + adc h, hh; \ + adc c, 0 + +// Square term incorporation (c,h,l) += numba^2 + +#define combadd1(c,h,l,numa) \ + mov rax, numa; \ + mul rax; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + mov rax, numa; \ + mul rax; \ + add l, rax; \ + adc h, rdx + +// A version doubling directly before adding, for single non-square terms + +#define combadd2(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add rax, rax; \ + adc rdx, rdx; \ + adc c, 0; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +S2N_BN_SYMBOL(bignum_sqr_6_12_alt): + _CET_ENDBR + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx +#endif + +// Result 
term 0 + + mov rax, [x] + mul rax + + mov [z], rax + mov t0, rdx + xor t1, t1 + +// Result term 1 + + xor t2, t2 + combadd2(t2,t1,t0,[x],[x+8]) + mov [z+8], t0 + +// Result term 2 + + xor t0, t0 + combadd1(t0,t2,t1,[x+8]) + combadd2(t0,t2,t1,[x],[x+16]) + mov [z+16], t1 + +// Result term 3 + + combaddz(t1,u1,u0,[x],[x+24]) + combadd(t1,u1,u0,[x+8],[x+16]) + doubladd(t1,t0,t2,u1,u0) + mov [z+24], t2 + +// Result term 4 + + combaddz(t2,u1,u0,[x],[x+32]) + combadd(t2,u1,u0,[x+8],[x+24]) + doubladd(t2,t1,t0,u1,u0) + combadd1(t2,t1,t0,[x+16]) + mov [z+32], t0 + +// Result term 5 + + combaddz(t0,u1,u0,[x],[x+40]) + combadd(t0,u1,u0,[x+8],[x+32]) + combadd(t0,u1,u0,[x+16],[x+24]) + doubladd(t0,t2,t1,u1,u0) + mov [z+40], t1 + +// Result term 6 + + combaddz(t1,u1,u0,[x+8],[x+40]) + combadd(t1,u1,u0,[x+16],[x+32]) + doubladd(t1,t0,t2,u1,u0) + combadd1(t1,t0,t2,[x+24]) + mov [z+48], t2 + +// Result term 7 + + combaddz(t2,u1,u0,[x+16],[x+40]) + combadd(t2,u1,u0,[x+24],[x+32]) + doubladd(t2,t1,t0,u1,u0) + mov [z+56], t0 + +// Result term 8 + + xor t0, t0 + combadd2(t0,t2,t1,[x+24],[x+40]) + combadd1(t0,t2,t1,[x+32]) + mov [z+64], t1 + +// Result term 9 + + xor t1, t1 + combadd2(t1,t0,t2,[x+32],[x+40]) + mov [z+72], t2 + +// Result term 10 + + combads(t1,t0,[x+40]) + mov [z+80], t0 + +// Result term 11 + + mov [z+88], t1 + +// Return + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif -- cgit v1.2.3-55-g6feb
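
As the commit message notes, the non-_alt routines depend on ADX (ADCX/ADOX), and since they are written around MULX they also need BMI2, while the _alt variants use only plain MUL/ADC. A caller therefore has to choose a variant at run time. The sketch below is illustrative only: the have_adx_bmi2() and mul_6_12() helpers are not part of this patch, and the only pieces taken from it are the two extern prototypes.

#include <cpuid.h>
#include <stdint.h>

extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
    const uint64_t y[static 6]);
extern void bignum_mul_6_12_alt(uint64_t z[static 12], const uint64_t x[static 6],
    const uint64_t y[static 6]);

/* Illustrative capability check: CPUID leaf 7, subleaf 0, EBX bit 8 = BMI2
 * (MULX), bit 19 = ADX (ADCX/ADOX). */
static int
have_adx_bmi2(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 0;
	return (ebx & (1U << 8)) != 0 && (ebx & (1U << 19)) != 0;
}

/* Hypothetical wrapper selecting the fastest usable variant. */
static void
mul_6_12(uint64_t z[static 12], const uint64_t x[static 6],
    const uint64_t y[static 6])
{
	if (have_adx_bmi2())
		bignum_mul_6_12(z, x, y);
	else
		bignum_mul_6_12_alt(z, x, y);
}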
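
Functionally, the multiplication routines compute a plain 6x6-limb schoolbook product. A portable reference like the following (hypothetical helper name, assumes a compiler with unsigned __int128) is convenient for cross-checking the assembly; it accumulates the same row-by-row partial products that the mulpadd/addrow macros chain together with ADCX/ADOX in bignum_mul_6_12.

#include <stdint.h>

/* Portable reference: z[0..11] = x[0..5] * y[0..5], 64-bit little-endian limbs. */
static void
bignum_mul_6_12_ref(uint64_t z[static 12], const uint64_t x[static 6],
    const uint64_t y[static 6])
{
	unsigned __int128 t;
	uint64_t carry;
	int i, j;

	for (i = 0; i < 12; i++)
		z[i] = 0;

	/* Row i adds x[0..5] * y[i] into z[i..i+5], carry out lands in z[i+6]. */
	for (i = 0; i < 6; i++) {
		carry = 0;
		for (j = 0; j < 6; j++) {
			t = (unsigned __int128)x[j] * y[i] + z[i + j] + carry;
			z[i + j] = (uint64_t)t;
			carry = (uint64_t)(t >> 64);
		}
		z[i + 6] = carry;
	}
}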
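
The squaring routines follow the decomposition spelled out in the bignum_sqr_6_12 comments: form each off-diagonal product x[i]*x[j] (i < j) once, double the accumulated total, then add the diagonal squares x[i]^2. A portable sketch of that same decomposition, again with a hypothetical helper name and intended only for testing against the assembly:

#include <stdint.h>

/* Portable reference: z[0..11] = x[0..5]^2, 64-bit little-endian limbs. */
static void
bignum_sqr_6_12_ref(uint64_t z[static 12], const uint64_t x[static 6])
{
	unsigned __int128 s, t;
	uint64_t carry;
	int i, j;

	for (i = 0; i < 12; i++)
		z[i] = 0;

	/* Off-diagonal products x[i] * x[j] for i < j, each summed once. */
	for (i = 0; i < 6; i++) {
		carry = 0;
		for (j = i + 1; j < 6; j++) {
			t = (unsigned __int128)x[i] * x[j] + z[i + j] + carry;
			z[i + j] = (uint64_t)t;
			carry = (uint64_t)(t >> 64);
		}
		z[i + 6] = carry;
	}

	/* Double the off-diagonal sum in place. */
	carry = 0;
	for (i = 0; i < 12; i++) {
		t = ((unsigned __int128)z[i] << 1) + carry;
		z[i] = (uint64_t)t;
		carry = (uint64_t)(t >> 64);
	}

	/* Add the diagonal squares x[i]^2 at limb positions 2i and 2i+1. */
	carry = 0;
	for (i = 0; i < 6; i++) {
		t = (unsigned __int128)x[i] * x[i];
		s = (unsigned __int128)z[2 * i] + (uint64_t)t + carry;
		z[2 * i] = (uint64_t)s;
		s = (unsigned __int128)z[2 * i + 1] + (uint64_t)(t >> 64) + (uint64_t)(s >> 64);
		z[2 * i + 1] = (uint64_t)s;
		carry = (uint64_t)(s >> 64);
	}
}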