diff options
| author | jsing <> | 2025-08-12 10:09:46 +0000 |
|---|---|---|
| committer | jsing <> | 2025-08-12 10:09:46 +0000 |
| commit | e5f4dc854c2c390526097888bf3654db21bdbb15 (patch) | |
| tree | 25b70579b17511a42354b46d22fd95e599a86b6c /src | |
| parent | f111c1e299493f0bdf21f9ff08f046d429764e75 (diff) | |
| download | openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.tar.gz openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.tar.bz2 openbsd-e5f4dc854c2c390526097888bf3654db21bdbb15.zip | |
Bring in bignum_{mul,sqr}_6_12{,_alt}() from s2n-bignum.
These provide fast multiplication and squaring of 6-word inputs,
producing a 12-word result. The non-_alt versions require the CPU to
support ADX instructions (adcx/adox, used together with mulx), while the
_alt versions use only plain mul/add/adc and do not.
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S | 210 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S | 186 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S | 214 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S | 197 |
4 files changed, 807 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S new file mode 100644 index 0000000000..081b602cde --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12.S | |||
| @@ -0,0 +1,210 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Multiply z := x * y | ||
| 6 | // Inputs x[6], y[6]; output z[12] | ||
| 7 | // | ||
| 8 | // extern void bignum_mul_6_12(uint64_t z[static 12], const uint64_t x[static 6], | ||
| 9 | // const uint64_t y[static 6]); | ||
| 10 | // | ||
| 11 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
| 12 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
| 13 | // ---------------------------------------------------------------------------- | ||
| 14 | |||
| 15 | #include "_internal_s2n_bignum.h" | ||
| 16 | |||
| 17 | .intel_syntax noprefix | ||
| 18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12) | ||
| 19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12) | ||
| 20 | .text | ||
| 21 | |||
| 22 | // z and x stay in the registers they arrive in under the standard ABI (rdi, rsi) | ||
| 23 | |||
| 24 | #define z rdi | ||
| 25 | #define x rsi | ||
| 26 | |||
| 27 | // y arrives in rdx and is copied to rcx at entry, because rdx is loaded with each y[j] as the implicit mulx multiplicand | ||
| 28 | |||
| 29 | #define y rcx | ||
| 30 | |||
| 31 | // A register held at zero for absorbing carries; zeroe is its 32-bit name, used in "xor zeroe, zeroe" | ||
| 32 | |||
| 33 | #define zero rbp | ||
| 34 | #define zeroe ebp | ||
| 35 | |||
| 36 | // mulpadd arg1,arg2: multiply x[arg2] by rdx (mulx: low half to rax, high half to rbx), then add the low half with adcx | ||
| 37 | // and the high half with adox into two adjacent words of the rotating window r8..r13 (r13 wraps to r8), chosen by (arg1 + arg2) % 6. Scratch: rax, rbx | ||
| 38 | |||
| 39 | .macro mulpadd arg1,arg2 | ||
| 40 | mulx rbx, rax, [x+8*\arg2] | ||
| 41 | .if ((\arg1 + \arg2) % 6 == 0) | ||
| 42 | adcx r8, rax | ||
| 43 | adox r9, rbx | ||
| 44 | .elseif ((\arg1 + \arg2) % 6 == 1) | ||
| 45 | adcx r9, rax | ||
| 46 | adox r10, rbx | ||
| 47 | .elseif ((\arg1 + \arg2) % 6 == 2) | ||
| 48 | adcx r10, rax | ||
| 49 | adox r11, rbx | ||
| 50 | .elseif ((\arg1 + \arg2) % 6 == 3) | ||
| 51 | adcx r11, rax | ||
| 52 | adox r12, rbx | ||
| 53 | .elseif ((\arg1 + \arg2) % 6 == 4) | ||
| 54 | adcx r12, rax | ||
| 55 | adox r13, rbx | ||
| 56 | .elseif ((\arg1 + \arg2) % 6 == 5) | ||
| 57 | adcx r13, rax | ||
| 58 | adox r8, rbx | ||
| 59 | .endif | ||
| 60 | |||
| 61 | .endm | ||
| 62 | |||
| 63 | |||
| 64 | // addrow arg1: add the whole row x[0..5] * y[arg1] into the window. Loads rdx = y[arg1], zeroes the zero register (which also clears the carry flags), stores z[arg1] as soon as its word is final, and builds the new top word in the register that was just stored | ||
| 65 | |||
| 66 | .macro addrow arg1 | ||
| 67 | mov rdx, [y+8*\arg1] | ||
| 68 | xor zeroe, zeroe | ||
| 69 | |||
| 70 | mulpadd \arg1, 0 | ||
| 71 | |||
| 72 | .if (\arg1 % 6 == 0) | ||
| 73 | mov [z+8*\arg1],r8 | ||
| 74 | .elseif (\arg1 % 6 == 1) | ||
| 75 | mov [z+8*\arg1],r9 | ||
| 76 | .elseif (\arg1 % 6 == 2) | ||
| 77 | mov [z+8*\arg1],r10 | ||
| 78 | .elseif (\arg1 % 6 == 3) | ||
| 79 | mov [z+8*\arg1],r11 | ||
| 80 | .elseif (\arg1 % 6 == 4) | ||
| 81 | mov [z+8*\arg1],r12 | ||
| 82 | .elseif (\arg1 % 6 == 5) | ||
| 83 | mov [z+8*\arg1],r13 | ||
| 84 | .endif | ||
| 85 | |||
| 86 | mulpadd \arg1, 1 | ||
| 87 | mulpadd \arg1, 2 | ||
| 88 | mulpadd \arg1, 3 | ||
| 89 | mulpadd \arg1, 4 | ||
| 90 | |||
| 91 | .if (\arg1 % 6 == 0) | ||
| 92 | mulx r8, rax, [x+40] | ||
| 93 | adcx r13, rax | ||
| 94 | adox r8, zero | ||
| 95 | adcx r8, zero | ||
| 96 | .elseif (\arg1 % 6 == 1) | ||
| 97 | mulx r9, rax, [x+40] | ||
| 98 | adcx r8, rax | ||
| 99 | adox r9, zero | ||
| 100 | adcx r9, zero | ||
| 101 | .elseif (\arg1 % 6 == 2) | ||
| 102 | mulx r10, rax, [x+40] | ||
| 103 | adcx r9, rax | ||
| 104 | adox r10, zero | ||
| 105 | adcx r10, zero | ||
| 106 | .elseif (\arg1 % 6 == 3) | ||
| 107 | mulx r11, rax, [x+40] | ||
| 108 | adcx r10, rax | ||
| 109 | adox r11, zero | ||
| 110 | adcx r11, zero | ||
| 111 | .elseif (\arg1 % 6 == 4) | ||
| 112 | mulx r12, rax, [x+40] | ||
| 113 | adcx r11, rax | ||
| 114 | adox r12, zero | ||
| 115 | adcx r12, zero | ||
| 116 | .elseif (\arg1 % 6 == 5) | ||
| 117 | mulx r13, rax, [x+40] | ||
| 118 | adcx r12, rax | ||
| 119 | adox r13, zero | ||
| 120 | adcx r13, zero | ||
| 121 | .endif | ||
| 122 | |||
| 123 | .endm | ||
| 124 | |||
| 125 | |||
| 126 | |||
| 127 | S2N_BN_SYMBOL(bignum_mul_6_12): | ||
| 128 | _CET_ENDBR | ||
| 129 | |||
| 130 | #if WINDOWS_ABI | ||
| 131 | push rdi | ||
| 132 | push rsi | ||
| 133 | mov rdi, rcx | ||
| 134 | mov rsi, rdx | ||
| 135 | mov rdx, r8 | ||
| 136 | #endif | ||
| 137 | |||
| 138 | // Save the callee-saved registers used below: rbp (zero), rbx (mulpadd scratch), r12 and r13 (window words) | ||
| 139 | |||
| 140 | push rbp | ||
| 141 | push rbx | ||
| 142 | push r12 | ||
| 143 | push r13 | ||
| 144 | |||
| 145 | // Copy y out of rdx, since rdx is reloaded with y[j] for every row | ||
| 146 | |||
| 147 | mov y, rdx | ||
| 148 | |||
| 149 | // Zero a register, which also makes sure we don't get a fake carry-in | ||
| 150 | |||
| 151 | xor zeroe, zeroe | ||
| 152 | |||
| 153 | // Do the zeroth row, which is a bit different: only one carry chain is needed | ||
| 154 | // Write back the zero-zero product low word to z[0] and then accumulate | ||
| 155 | // [r8,r13,r12,r11,r10,r9] (highest word first) as words 1..6 of y[0] * x | ||
| 156 | |||
| 157 | mov rdx, [y] | ||
| 158 | |||
| 159 | mulx r9, r8, [x] | ||
| 160 | mov [z], r8 | ||
| 161 | |||
| 162 | mulx r10, rbx, [x+8] | ||
| 163 | adcx r9, rbx | ||
| 164 | |||
| 165 | mulx r11, rbx, [x+16] | ||
| 166 | adcx r10, rbx | ||
| 167 | |||
| 168 | mulx r12, rbx, [x+24] | ||
| 169 | adcx r11, rbx | ||
| 170 | |||
| 171 | mulx r13, rbx, [x+32] | ||
| 172 | adcx r12, rbx | ||
| 173 | |||
| 174 | mulx r8, rbx, [x+40] | ||
| 175 | adcx r13, rbx | ||
| 176 | adcx r8, zero | ||
| 177 | |||
| 178 | // Now rows 1..5 in a uniform pattern; each addrow also stores z[row] | ||
| 179 | |||
| 180 | addrow 1 | ||
| 181 | addrow 2 | ||
| 182 | addrow 3 | ||
| 183 | addrow 4 | ||
| 184 | addrow 5 | ||
| 185 | |||
| 186 | // Now write back the remaining window, r8 (lowest) up to r13 (highest), as z[6..11] | ||
| 187 | |||
| 188 | mov [z+48], r8 | ||
| 189 | mov [z+56], r9 | ||
| 190 | mov [z+64], r10 | ||
| 191 | mov [z+72], r11 | ||
| 192 | mov [z+80], r12 | ||
| 193 | mov [z+88], r13 | ||
| 194 | |||
| 195 | // Restore registers in reverse push order and return | ||
| 196 | |||
| 197 | pop r13 | ||
| 198 | pop r12 | ||
| 199 | pop rbx | ||
| 200 | pop rbp | ||
| 201 | |||
| 202 | #if WINDOWS_ABI | ||
| 203 | pop rsi | ||
| 204 | pop rdi | ||
| 205 | #endif | ||
| 206 | ret | ||
| 207 | |||
| 208 | #if defined(__linux__) && defined(__ELF__) | ||
| 209 | .section .note.GNU-stack,"",%progbits | ||
| 210 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S new file mode 100644 index 0000000000..f4ea4ed68b --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_6_12_alt.S | |||
| @@ -0,0 +1,186 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Multiply z := x * y | ||
| 6 | // Inputs x[6], y[6]; output z[12] | ||
| 7 | // | ||
| 8 | // extern void bignum_mul_6_12_alt(uint64_t z[static 12], | ||
| 9 | // const uint64_t x[static 6], | ||
| 10 | // const uint64_t y[static 6]); | ||
| 11 | // | ||
| 12 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
| 13 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
| 14 | // ---------------------------------------------------------------------------- | ||
| 15 | |||
| 16 | #include "_internal_s2n_bignum.h" | ||
| 17 | |||
| 18 | .intel_syntax noprefix | ||
| 19 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt) | ||
| 20 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt) | ||
| 21 | .text | ||
| 22 | |||
| 23 | // z and x stay in the registers they arrive in under the standard ABI (rdi, rsi) | ||
| 24 | |||
| 25 | #define z rdi | ||
| 26 | #define x rsi | ||
| 27 | |||
| 28 | // y is moved from rdx to rcx at entry, because every mul writes the high half of its product to rdx | ||
| 29 | |||
| 30 | #define y rcx | ||
| 31 | |||
| 32 | // Rotating 3-word window that the products for each result word are added into; the roles (carry, high, low) rotate from one result term to the next | ||
| 33 | |||
| 34 | #define t0 r8 | ||
| 35 | #define t1 r9 | ||
| 36 | #define t2 r10 | ||
| 37 | |||
| 38 | // combadd: the key "multiply and add to (c,h,l)" step, (c,h,l) += numa * numb, with numb a memory operand. Clobbers rax, rdx and the flags | ||
| 39 | |||
| 40 | #define combadd(c,h,l,numa,numb) \ | ||
| 41 | mov rax, numa; \ | ||
| 42 | mul QWORD PTR numb; \ | ||
| 43 | add l, rax; \ | ||
| 44 | adc h, rdx; \ | ||
| 45 | adc c, 0 | ||
| 46 | |||
| 47 | // combadz: a minutely shorter form of combadd for when c = 0 initially, so that "adc c, c" simply captures the carry. Clobbers rax, rdx and the flags | ||
| 48 | |||
| 49 | #define combadz(c,h,l,numa,numb) \ | ||
| 50 | mov rax, numa; \ | ||
| 51 | mul QWORD PTR numb; \ | ||
| 52 | add l, rax; \ | ||
| 53 | adc h, rdx; \ | ||
| 54 | adc c, c | ||
| 55 | |||
| 56 | // combads: a short 2-word form, (h,l) += numa * numb, for where we don't expect a top carry. Clobbers rax, rdx and the flags | ||
| 57 | |||
| 58 | #define combads(h,l,numa,numb) \ | ||
| 59 | mov rax, numa; \ | ||
| 60 | mul QWORD PTR numb; \ | ||
| 61 | add l, rax; \ | ||
| 62 | adc h, rdx | ||
| 63 | |||
| 64 | S2N_BN_SYMBOL(bignum_mul_6_12_alt): | ||
| 65 | _CET_ENDBR | ||
| 66 | |||
| 67 | #if WINDOWS_ABI | ||
| 68 | push rdi | ||
| 69 | push rsi | ||
| 70 | mov rdi, rcx | ||
| 71 | mov rsi, rdx | ||
| 72 | mov rdx, r8 | ||
| 73 | #endif | ||
| 74 | |||
| 75 | // Copy y into rcx to start with, since each mul overwrites rdx | ||
| 76 | |||
| 77 | mov y, rdx | ||
| 78 | |||
| 79 | // Result term 0: z[0] is the low half of x[0] * y[0]; the high half seeds the window | ||
| 80 | |||
| 81 | mov rax, [x] | ||
| 82 | mul QWORD PTR [y] | ||
| 83 | |||
| 84 | mov [z], rax | ||
| 85 | mov t0, rdx | ||
| 86 | xor t1, t1 | ||
| 87 | |||
| 88 | // Result term 1: add the products x[i] * y[j] with i + j = 1 | ||
| 89 | |||
| 90 | xor t2, t2 | ||
| 91 | combads(t1,t0,[x],[y+8]) | ||
| 92 | combadz(t2,t1,t0,[x+8],[y]) | ||
| 93 | mov [z+8], t0 | ||
| 94 | |||
| 95 | // Result term 2: i + j = 2; the register just stored is cleared and becomes the new carry word | ||
| 96 | |||
| 97 | xor t0, t0 | ||
| 98 | combadz(t0,t2,t1,[x],[y+16]) | ||
| 99 | combadd(t0,t2,t1,[x+8],[y+8]) | ||
| 100 | combadd(t0,t2,t1,[x+16],[y]) | ||
| 101 | mov [z+16], t1 | ||
| 102 | |||
| 103 | // Result term 3: i + j = 3 | ||
| 104 | |||
| 105 | xor t1, t1 | ||
| 106 | combadz(t1,t0,t2,[x],[y+24]) | ||
| 107 | combadd(t1,t0,t2,[x+8],[y+16]) | ||
| 108 | combadd(t1,t0,t2,[x+16],[y+8]) | ||
| 109 | combadd(t1,t0,t2,[x+24],[y]) | ||
| 110 | mov [z+24], t2 | ||
| 111 | |||
| 112 | // Result term 4: i + j = 4 | ||
| 113 | |||
| 114 | xor t2, t2 | ||
| 115 | combadz(t2,t1,t0,[x],[y+32]) | ||
| 116 | combadd(t2,t1,t0,[x+8],[y+24]) | ||
| 117 | combadd(t2,t1,t0,[x+16],[y+16]) | ||
| 118 | combadd(t2,t1,t0,[x+24],[y+8]) | ||
| 119 | combadd(t2,t1,t0,[x+32],[y]) | ||
| 120 | mov [z+32], t0 | ||
| 121 | |||
| 122 | // Result term 5: i + j = 5 (the longest column, six products) | ||
| 123 | |||
| 124 | xor t0, t0 | ||
| 125 | combadz(t0,t2,t1,[x],[y+40]) | ||
| 126 | combadd(t0,t2,t1,[x+8],[y+32]) | ||
| 127 | combadd(t0,t2,t1,[x+16],[y+24]) | ||
| 128 | combadd(t0,t2,t1,[x+24],[y+16]) | ||
| 129 | combadd(t0,t2,t1,[x+32],[y+8]) | ||
| 130 | combadd(t0,t2,t1,[x+40],[y]) | ||
| 131 | mov [z+40], t1 | ||
| 132 | |||
| 133 | // Result term 6: i + j = 6 | ||
| 134 | |||
| 135 | xor t1, t1 | ||
| 136 | combadz(t1,t0,t2,[x+8],[y+40]) | ||
| 137 | combadd(t1,t0,t2,[x+16],[y+32]) | ||
| 138 | combadd(t1,t0,t2,[x+24],[y+24]) | ||
| 139 | combadd(t1,t0,t2,[x+32],[y+16]) | ||
| 140 | combadd(t1,t0,t2,[x+40],[y+8]) | ||
| 141 | mov [z+48], t2 | ||
| 142 | |||
| 143 | // Result term 7: i + j = 7 | ||
| 144 | |||
| 145 | xor t2, t2 | ||
| 146 | combadz(t2,t1,t0,[x+16],[y+40]) | ||
| 147 | combadd(t2,t1,t0,[x+24],[y+32]) | ||
| 148 | combadd(t2,t1,t0,[x+32],[y+24]) | ||
| 149 | combadd(t2,t1,t0,[x+40],[y+16]) | ||
| 150 | mov [z+56], t0 | ||
| 151 | |||
| 152 | // Result term 8: i + j = 8 | ||
| 153 | |||
| 154 | xor t0, t0 | ||
| 155 | combadz(t0,t2,t1,[x+24],[y+40]) | ||
| 156 | combadd(t0,t2,t1,[x+32],[y+32]) | ||
| 157 | combadd(t0,t2,t1,[x+40],[y+24]) | ||
| 158 | mov [z+64], t1 | ||
| 159 | |||
| 160 | // Result term 9: i + j = 9 | ||
| 161 | |||
| 162 | xor t1, t1 | ||
| 163 | combadz(t1,t0,t2,[x+32],[y+40]) | ||
| 164 | combadd(t1,t0,t2,[x+40],[y+32]) | ||
| 165 | mov [z+72], t2 | ||
| 166 | |||
| 167 | // Result term 10: only x[5] * y[5] remains, added with the 2-word form since no top carry is expected | ||
| 168 | |||
| 169 | combads(t1,t0,[x+40],[y+40]) | ||
| 170 | mov [z+80], t0 | ||
| 171 | |||
| 172 | // Result term 11: the final high word left in the window | ||
| 173 | |||
| 174 | mov [z+88], t1 | ||
| 175 | |||
| 176 | // Return (nothing was pushed except rdi/rsi under the Windows ABI) | ||
| 177 | |||
| 178 | #if WINDOWS_ABI | ||
| 179 | pop rsi | ||
| 180 | pop rdi | ||
| 181 | #endif | ||
| 182 | ret | ||
| 183 | |||
| 184 | #if defined(__linux__) && defined(__ELF__) | ||
| 185 | .section .note.GNU-stack,"",%progbits | ||
| 186 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S new file mode 100644 index 0000000000..07d840b47c --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S | |||
| @@ -0,0 +1,214 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Square, z := x^2 | ||
| 6 | // Input x[6]; output z[12] | ||
| 7 | // | ||
| 8 | // extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]); | ||
| 9 | // | ||
| 10 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
| 11 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
| 12 | // ---------------------------------------------------------------------------- | ||
| 13 | |||
| 14 | #include "_internal_s2n_bignum.h" | ||
| 15 | |||
| 16 | .intel_syntax noprefix | ||
| 17 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12) | ||
| 18 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12) | ||
| 19 | .text | ||
| 20 | |||
| 21 | // z and x stay in the registers they arrive in under the standard ABI (rdi, rsi) | ||
| 22 | |||
| 23 | #define z rdi | ||
| 24 | #define x rsi | ||
| 25 | |||
| 26 | // A register held at zero for absorbing carries; zeroe is its 32-bit name, used in "xor zeroe, zeroe" | ||
| 27 | |||
| 28 | #define zero rbp | ||
| 29 | #define zeroe ebp | ||
| 30 | |||
| 31 | // Accumulator words: dN collects the terms for output word z[N] | ||
| 32 | |||
| 33 | #define d1 r8 | ||
| 34 | #define d2 r9 | ||
| 35 | #define d3 r10 | ||
| 36 | #define d4 r11 | ||
| 37 | #define d5 r12 | ||
| 38 | #define d6 r13 | ||
| 39 | #define d7 r14 | ||
| 40 | #define d8 r15 | ||
| 41 | #define d9 rbx | ||
| 42 | |||
| 43 | // Care is needed: d10 re-uses the zero register rbp, so "zero" must not be used once d10 is live | ||
| 44 | |||
| 45 | #define d10 rbp | ||
| 46 | |||
| 47 | |||
| 48 | S2N_BN_SYMBOL(bignum_sqr_6_12): | ||
| 49 | _CET_ENDBR | ||
| 50 | |||
| 51 | #if WINDOWS_ABI | ||
| 52 | push rdi | ||
| 53 | push rsi | ||
| 54 | mov rdi, rcx | ||
| 55 | mov rsi, rdx | ||
| 56 | #endif | ||
| 57 | |||
| 58 | // Save the callee-saved registers used as accumulators (rbp, rbx, r12-r15) | ||
| 59 | |||
| 60 | push rbp | ||
| 61 | push rbx | ||
| 62 | push r12 | ||
| 63 | push r13 | ||
| 64 | push r14 | ||
| 65 | push r15 | ||
| 66 | |||
| 67 | // Set up an initial window [d8;...d1] = [34;05;03;01], where ij denotes the product x[i] * x[j] | ||
| 68 | |||
| 69 | mov rdx, [x] | ||
| 70 | mulx d2, d1, [x+8] | ||
| 71 | mulx d4, d3, [x+24] | ||
| 72 | mulx d6, d5, [x+40] | ||
| 73 | mov rdx, [x+24] | ||
| 74 | mulx d8, d7, [x+32] | ||
| 75 | |||
| 76 | // Clear our zero register, and also initialize the flags for the carry chain | ||
| 77 | |||
| 78 | xor zeroe, zeroe | ||
| 79 | |||
| 80 | // Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window | ||
| 81 | // (no carry-out possible since we add it to the top of a product) | ||
| 82 | |||
| 83 | mov rdx, [x+16] | ||
| 84 | mulx rcx, rax, [x] | ||
| 85 | adcx d2, rax | ||
| 86 | adox d3, rcx | ||
| 87 | mulx rcx, rax, [x+8] | ||
| 88 | adcx d3, rax | ||
| 89 | adox d4, rcx | ||
| 90 | mov rdx, [x+8] | ||
| 91 | mulx rcx, rax, [x+24] | ||
| 92 | adcx d4, rax | ||
| 93 | adox d5, rcx | ||
| 94 | mulx rcx, rax, [x+32] | ||
| 95 | adcx d5, rax | ||
| 96 | adox d6, rcx | ||
| 97 | mulx rcx, rax, [x+40] | ||
| 98 | adcx d6, rax | ||
| 99 | adox d7, rcx | ||
| 100 | adcx d7, zero | ||
| 101 | adox d8, zero | ||
| 102 | adcx d8, zero | ||
| 103 | |||
| 104 | // Again zero out the flags. Actually they are already cleared but it may | ||
| 105 | // help decouple these in the OOO engine not to wait for the chain above | ||
| 106 | |||
| 107 | xor zeroe, zeroe | ||
| 108 | |||
| 109 | // Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms | ||
| 110 | // We are running out of registers: d10 overwrites rbp, so the zero register is no longer zero, and rax is zeroed with "mov eax, 0" (not xor) to leave the live carry flags untouched | ||
| 111 | |||
| 112 | mov rdx, [x+32] | ||
| 113 | mulx rcx, rax, [x] | ||
| 114 | adcx d4, rax | ||
| 115 | adox d5, rcx | ||
| 116 | mov rdx, [x+16] | ||
| 117 | mulx rcx, rax, [x+24] | ||
| 118 | adcx d5, rax | ||
| 119 | adox d6, rcx | ||
| 120 | mulx rcx, rax, [x+32] | ||
| 121 | adcx d6, rax | ||
| 122 | adox d7, rcx | ||
| 123 | mulx rcx, rax, [x+40] | ||
| 124 | adcx d7, rax | ||
| 125 | adox d8, rcx | ||
| 126 | mov rdx, [x+24] | ||
| 127 | mulx d9, rax, [x+40] | ||
| 128 | adcx d8, rax | ||
| 129 | adox d9, zero | ||
| 130 | mov rdx, [x+32] | ||
| 131 | mulx d10, rax, [x+40] | ||
| 132 | adcx d9, rax | ||
| 133 | mov eax, 0 | ||
| 134 | adox d10, rax | ||
| 135 | adcx d10, rax | ||
| 136 | |||
| 137 | // Again, just for a clear fresh start for the flags | ||
| 138 | |||
| 139 | xor eax, eax | ||
| 140 | |||
| 141 | // Double the accumulated cross products (adcx dN, dN) and add the squares 00 + 11 + 22 + 33 + 44 + 55 (adox) | ||
| 142 | // | ||
| 143 | // We could use shift-double but this seems tidier and in larger squarings | ||
| 144 | // it was actually more efficient. I haven't experimented with this small | ||
| 145 | // case to see how much that matters. Note: the writeback here is sprinkled | ||
| 146 | // into the sequence in such a way that things still work if z = x, i.e. if | ||
| 147 | // the output overwrites the input buffer and beyond (each z[k], k < 6, is stored only after the last load of x[k]). | ||
| 148 | |||
| 149 | mov rdx, [x] | ||
| 150 | mulx rdx, rax, rdx | ||
| 151 | mov [z], rax | ||
| 152 | adcx d1, d1 | ||
| 153 | adox d1, rdx | ||
| 154 | mov rdx, [x+8] | ||
| 155 | mov [z+8], d1 | ||
| 156 | mulx rdx, rax, rdx | ||
| 157 | adcx d2, d2 | ||
| 158 | adox d2, rax | ||
| 159 | adcx d3, d3 | ||
| 160 | adox d3, rdx | ||
| 161 | mov rdx, [x+16] | ||
| 162 | mov [z+16], d2 | ||
| 163 | mulx rdx, rax, rdx | ||
| 164 | adcx d4, d4 | ||
| 165 | adox d4, rax | ||
| 166 | adcx d5, d5 | ||
| 167 | adox d5, rdx | ||
| 168 | mov rdx, [x+24] | ||
| 169 | mov [z+24], d3 | ||
| 170 | mulx rdx, rax, rdx | ||
| 171 | adcx d6, d6 | ||
| 172 | adox d6, rax | ||
| 173 | adcx d7, d7 | ||
| 174 | adox d7, rdx | ||
| 175 | mov rdx, [x+32] | ||
| 176 | mov [z+32], d4 | ||
| 177 | mulx rdx, rax, rdx | ||
| 178 | adcx d8, d8 | ||
| 179 | adox d8, rax | ||
| 180 | adcx d9, d9 | ||
| 181 | adox d9, rdx | ||
| 182 | mov rdx, [x+40] | ||
| 183 | mov [z+40], d5 | ||
| 184 | mulx rdx, rax, rdx | ||
| 185 | mov [z+48], d6 | ||
| 186 | adcx d10, d10 | ||
| 187 | mov [z+56], d7 | ||
| 188 | adox d10, rax | ||
| 189 | mov [z+64], d8 | ||
| 190 | mov eax, 0 | ||
| 191 | mov [z+72], d9 | ||
| 192 | adcx rdx, rax | ||
| 193 | mov [z+80], d10 | ||
| 194 | adox rdx, rax | ||
| 195 | mov [z+88], rdx | ||
| 196 | |||
| 197 | // Restore saved registers in reverse push order and return | ||
| 198 | |||
| 199 | pop r15 | ||
| 200 | pop r14 | ||
| 201 | pop r13 | ||
| 202 | pop r12 | ||
| 203 | pop rbx | ||
| 204 | pop rbp | ||
| 205 | |||
| 206 | #if WINDOWS_ABI | ||
| 207 | pop rsi | ||
| 208 | pop rdi | ||
| 209 | #endif | ||
| 210 | ret | ||
| 211 | |||
| 212 | #if defined(__linux__) && defined(__ELF__) | ||
| 213 | .section .note.GNU-stack,"",%progbits | ||
| 214 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S new file mode 100644 index 0000000000..a517e5c654 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12_alt.S | |||
| @@ -0,0 +1,197 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Square, z := x^2 | ||
| 6 | // Input x[6]; output z[12] | ||
| 7 | // | ||
| 8 | // extern void bignum_sqr_6_12_alt(uint64_t z[static 12], | ||
| 9 | // const uint64_t x[static 6]); | ||
| 10 | // | ||
| 11 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
| 12 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
| 13 | // ---------------------------------------------------------------------------- | ||
| 14 | |||
| 15 | #include "_internal_s2n_bignum.h" | ||
| 16 | |||
| 17 | .intel_syntax noprefix | ||
| 18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt) | ||
| 19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt) | ||
| 20 | .text | ||
| 21 | |||
| 22 | // Input arguments, left in the registers they arrive in under the standard ABI | ||
| 23 | |||
| 24 | #define z rdi | ||
| 25 | #define x rsi | ||
| 26 | |||
| 27 | // Rotating 3-word window that the terms for each result word are added into; the roles (carry, high, low) rotate from one result term to the next | ||
| 28 | |||
| 29 | #define t0 r8 | ||
| 30 | #define t1 r9 | ||
| 31 | #define t2 r10 | ||
| 32 | |||
| 33 | // Additional temporaries: a local window (u1,u0) in which several cross products are summed so that they can be doubled together | ||
| 34 | |||
| 35 | #define u0 rcx | ||
| 36 | #define u1 r11 | ||
| 37 | |||
| 38 | // combadd: the key "multiply and add to (c,h,l)" step, (c,h,l) += numa * numb, with numb a memory operand. Clobbers rax, rdx and the flags | ||
| 39 | |||
| 40 | #define combadd(c,h,l,numa,numb) \ | ||
| 41 | mov rax, numa; \ | ||
| 42 | mul QWORD PTR numb; \ | ||
| 43 | add l, rax; \ | ||
| 44 | adc h, rdx; \ | ||
| 45 | adc c, 0 | ||
| 46 | |||
| 47 | // combaddz: set up an initial window (c,h,l) = numa * numb, with c cleared; no prior contents are needed. Clobbers rax, rdx and the flags | ||
| 48 | |||
| 49 | #define combaddz(c,h,l,numa,numb) \ | ||
| 50 | mov rax, numa; \ | ||
| 51 | mul QWORD PTR numb; \ | ||
| 52 | xor c, c; \ | ||
| 53 | mov l, rax; \ | ||
| 54 | mov h, rdx | ||
| 55 | |||
| 56 | // doubladd: doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l). The doubling is done in place, so hh and ll are overwritten | ||
| 57 | |||
| 58 | #define doubladd(c,h,l,hh,ll) \ | ||
| 59 | add ll, ll; \ | ||
| 60 | adc hh, hh; \ | ||
| 61 | adc c, c; \ | ||
| 62 | add l, ll; \ | ||
| 63 | adc h, hh; \ | ||
| 64 | adc c, 0 | ||
| 65 | |||
| 66 | // combadd1: square term incorporation (c,h,l) += numa^2. Clobbers rax, rdx and the flags | ||
| 67 | |||
| 68 | #define combadd1(c,h,l,numa) \ | ||
| 69 | mov rax, numa; \ | ||
| 70 | mul rax; \ | ||
| 71 | add l, rax; \ | ||
| 72 | adc h, rdx; \ | ||
| 73 | adc c, 0 | ||
| 74 | |||
| 75 | // combads: a short 2-word square form, (h,l) += numa^2, for where we don't expect a top carry. Clobbers rax, rdx and the flags | ||
| 76 | |||
| 77 | #define combads(h,l,numa) \ | ||
| 78 | mov rax, numa; \ | ||
| 79 | mul rax; \ | ||
| 80 | add l, rax; \ | ||
| 81 | adc h, rdx | ||
| 82 | |||
| 83 | // combadd2: a version doubling directly before adding, for single non-square terms: (c,h,l) += 2 * numa * numb. The product is doubled in rdx:rax, with the bit shifted out added into c | ||
| 84 | |||
| 85 | #define combadd2(c,h,l,numa,numb) \ | ||
| 86 | mov rax, numa; \ | ||
| 87 | mul QWORD PTR numb; \ | ||
| 88 | add rax, rax; \ | ||
| 89 | adc rdx, rdx; \ | ||
| 90 | adc c, 0; \ | ||
| 91 | add l, rax; \ | ||
| 92 | adc h, rdx; \ | ||
| 93 | adc c, 0 | ||
| 94 | |||
| 95 | S2N_BN_SYMBOL(bignum_sqr_6_12_alt): | ||
| 96 | _CET_ENDBR | ||
| 97 | |||
| 98 | #if WINDOWS_ABI | ||
| 99 | push rdi | ||
| 100 | push rsi | ||
| 101 | mov rdi, rcx | ||
| 102 | mov rsi, rdx | ||
| 103 | #endif | ||
| 104 | |||
| 105 | // Result term 0: z[0] is the low half of x[0]^2; the high half seeds the window | ||
| 106 | |||
| 107 | mov rax, [x] | ||
| 108 | mul rax | ||
| 109 | |||
| 110 | mov [z], rax | ||
| 111 | mov t0, rdx | ||
| 112 | xor t1, t1 | ||
| 113 | |||
| 114 | // Result term 1: 2 * x[0] * x[1] | ||
| 115 | |||
| 116 | xor t2, t2 | ||
| 117 | combadd2(t2,t1,t0,[x],[x+8]) | ||
| 118 | mov [z+8], t0 | ||
| 119 | |||
| 120 | // Result term 2: x[1]^2 + 2 * x[0] * x[2] | ||
| 121 | |||
| 122 | xor t0, t0 | ||
| 123 | combadd1(t0,t2,t1,[x+8]) | ||
| 124 | combadd2(t0,t2,t1,[x],[x+16]) | ||
| 125 | mov [z+16], t1 | ||
| 126 | |||
| 127 | // Result term 3: 2 * (x[0] * x[3] + x[1] * x[2]), summed in the local window and doubled once | ||
| 128 | |||
| 129 | combaddz(t1,u1,u0,[x],[x+24]) | ||
| 130 | combadd(t1,u1,u0,[x+8],[x+16]) | ||
| 131 | doubladd(t1,t0,t2,u1,u0) | ||
| 132 | mov [z+24], t2 | ||
| 133 | |||
| 134 | // Result term 4: 2 * (x[0] * x[4] + x[1] * x[3]) + x[2]^2 | ||
| 135 | |||
| 136 | combaddz(t2,u1,u0,[x],[x+32]) | ||
| 137 | combadd(t2,u1,u0,[x+8],[x+24]) | ||
| 138 | doubladd(t2,t1,t0,u1,u0) | ||
| 139 | combadd1(t2,t1,t0,[x+16]) | ||
| 140 | mov [z+32], t0 | ||
| 141 | |||
| 142 | // Result term 5: 2 * (x[0] * x[5] + x[1] * x[4] + x[2] * x[3]) | ||
| 143 | |||
| 144 | combaddz(t0,u1,u0,[x],[x+40]) | ||
| 145 | combadd(t0,u1,u0,[x+8],[x+32]) | ||
| 146 | combadd(t0,u1,u0,[x+16],[x+24]) | ||
| 147 | doubladd(t0,t2,t1,u1,u0) | ||
| 148 | mov [z+40], t1 | ||
| 149 | |||
| 150 | // Result term 6: 2 * (x[1] * x[5] + x[2] * x[4]) + x[3]^2 | ||
| 151 | |||
| 152 | combaddz(t1,u1,u0,[x+8],[x+40]) | ||
| 153 | combadd(t1,u1,u0,[x+16],[x+32]) | ||
| 154 | doubladd(t1,t0,t2,u1,u0) | ||
| 155 | combadd1(t1,t0,t2,[x+24]) | ||
| 156 | mov [z+48], t2 | ||
| 157 | |||
| 158 | // Result term 7: 2 * (x[2] * x[5] + x[3] * x[4]) | ||
| 159 | |||
| 160 | combaddz(t2,u1,u0,[x+16],[x+40]) | ||
| 161 | combadd(t2,u1,u0,[x+24],[x+32]) | ||
| 162 | doubladd(t2,t1,t0,u1,u0) | ||
| 163 | mov [z+56], t0 | ||
| 164 | |||
| 165 | // Result term 8: 2 * x[3] * x[5] + x[4]^2 | ||
| 166 | |||
| 167 | xor t0, t0 | ||
| 168 | combadd2(t0,t2,t1,[x+24],[x+40]) | ||
| 169 | combadd1(t0,t2,t1,[x+32]) | ||
| 170 | mov [z+64], t1 | ||
| 171 | |||
| 172 | // Result term 9: 2 * x[4] * x[5] | ||
| 173 | |||
| 174 | xor t1, t1 | ||
| 175 | combadd2(t1,t0,t2,[x+32],[x+40]) | ||
| 176 | mov [z+72], t2 | ||
| 177 | |||
| 178 | // Result term 10: x[5]^2, added with the 2-word form since no top carry is expected | ||
| 179 | |||
| 180 | combads(t1,t0,[x+40]) | ||
| 181 | mov [z+80], t0 | ||
| 182 | |||
| 183 | // Result term 11: the final high word left in the window | ||
| 184 | |||
| 185 | mov [z+88], t1 | ||
| 186 | |||
| 187 | // Return (nothing was pushed except rdi/rsi under the Windows ABI) | ||
| 188 | |||
| 189 | #if WINDOWS_ABI | ||
| 190 | pop rsi | ||
| 191 | pop rdi | ||
| 192 | #endif | ||
| 193 | ret | ||
| 194 | |||
| 195 | #if defined(__linux__) && defined(__ELF__) | ||
| 196 | .section .note.GNU-stack,"",%progbits | ||
| 197 | #endif | ||
