diff options
Diffstat (limited to '')
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S | 174 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S | 260 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S | 145 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S | 298 |
4 files changed, 877 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S new file mode 100644 index 0000000000..5e9aebd685 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8.S | |||
| @@ -0,0 +1,174 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Multiply z := x * y | ||
| 6 | // Inputs x[4], y[4]; output z[8] | ||
| 7 | // | ||
| 8 | // extern void bignum_mul_4_8(uint64_t z[static 8], const uint64_t x[static 4], | ||
| 9 | // const uint64_t y[static 4]); | ||
| 10 | // | ||
| 11 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
| 12 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
| 13 | // ---------------------------------------------------------------------------- | ||
| 14 | |||
| 15 | #include "_internal_s2n_bignum.h" | ||
| 16 | |||
| 17 | .intel_syntax noprefix | ||
| 18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8) | ||
| 19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8) | ||
| 20 | .text | ||
| 21 | |||
| 22 | // z and x are already in these registers on entry (standard ABI), or are moved there first under WINDOWS_ABI | ||
| 23 | |||
| 24 | #define z rdi | ||
| 25 | #define x rsi | ||
| 26 | |||
| 27 | // y is copied here from rdx, because mulx uses rdx as its implicit multiplicand | ||
| 28 | |||
| 29 | #define y rcx | ||
| 30 | |||
| 31 | // A zero register | ||
| 32 | |||
| 33 | #define zero rbp | ||
| 34 | #define zeroe ebp | ||
| 35 | |||
| 36 | // mulpadd j, i: multiply x[i] by rdx (addrow loads rdx = y[j]) and add the product into the rotating window r8..r11 at slot (i+j) mod 4: the low half goes in via adcx (CF chain), the high half via adox (OF chain) into the next slot up. Clobbers rax, rbx. | ||
| 37 | // The .if ladder stands in for the register indexing reg[i], reg[i+1] that the macro language lacks. | ||
| 38 | |||
| 39 | .macro mulpadd arg1,arg2 | ||
| 40 | mulx rbx, rax, [x+8*\arg2] | ||
| 41 | .if ((\arg1 + \arg2) % 4 == 0) | ||
| 42 | adcx r8, rax | ||
| 43 | adox r9, rbx | ||
| 44 | .elseif ((\arg1 + \arg2) % 4 == 1) | ||
| 45 | adcx r9, rax | ||
| 46 | adox r10, rbx | ||
| 47 | .elseif ((\arg1 + \arg2) % 4 == 2) | ||
| 48 | adcx r10, rax | ||
| 49 | adox r11, rbx | ||
| 50 | .elseif ((\arg1 + \arg2) % 4 == 3) | ||
| 51 | adcx r11, rax | ||
| 52 | adox r8, rbx | ||
| 53 | .endif | ||
| 54 | |||
| 55 | .endm | ||
| 56 | |||
| 57 | |||
| 58 | // addrow j: add y[j] * x[0..3] into the window. Clears rbp (zero) and both carry flags, stores the finished digit j to z[j], then re-creates the register it vacated as the new top digit, absorbing the final OF and CF carries. | ||
| 59 | |||
| 60 | .macro addrow arg1 | ||
| 61 | mov rdx, [y+8*\arg1] | ||
| 62 | xor zeroe, zeroe | ||
| 63 | |||
| 64 | mulpadd \arg1, 0 | ||
| 65 | |||
| 66 | .if (\arg1 % 4 == 0) | ||
| 67 | mov [z+8*\arg1],r8 | ||
| 68 | .elseif (\arg1 % 4 == 1) | ||
| 69 | mov [z+8*\arg1],r9 | ||
| 70 | .elseif (\arg1 % 4 == 2) | ||
| 71 | mov [z+8*\arg1],r10 | ||
| 72 | .elseif (\arg1 % 4 == 3) | ||
| 73 | mov [z+8*\arg1],r11 | ||
| 74 | .endif | ||
| 75 | |||
| 76 | mulpadd \arg1, 1 | ||
| 77 | mulpadd \arg1, 2 | ||
| 78 | |||
| 79 | .if (\arg1 % 4 == 0) | ||
| 80 | mulx r8, rax, [x+24] | ||
| 81 | adcx r11, rax | ||
| 82 | adox r8, zero | ||
| 83 | adcx r8, zero | ||
| 84 | .elseif (\arg1 % 4 == 1) | ||
| 85 | mulx r9, rax, [x+24] | ||
| 86 | adcx r8, rax | ||
| 87 | adox r9, zero | ||
| 88 | adcx r9, zero | ||
| 89 | .elseif (\arg1 % 4 == 2) | ||
| 90 | mulx r10, rax, [x+24] | ||
| 91 | adcx r9, rax | ||
| 92 | adox r10, zero | ||
| 93 | adcx r10, zero | ||
| 94 | .elseif (\arg1 % 4 == 3) | ||
| 95 | mulx r11, rax, [x+24] | ||
| 96 | adcx r10, rax | ||
| 97 | adox r11, zero | ||
| 98 | adcx r11, zero | ||
| 99 | .endif | ||
| 100 | |||
| 101 | .endm | ||
| 102 | |||
| 103 | |||
| 104 | |||
| 105 | S2N_BN_SYMBOL(bignum_mul_4_8): | ||
| 106 | _CET_ENDBR | ||
| 107 | |||
| 108 | #if WINDOWS_ABI | ||
| 109 | push rdi | ||
| 110 | push rsi | ||
| 111 | mov rdi, rcx | ||
| 112 | mov rsi, rdx | ||
| 113 | mov rdx, r8 | ||
| 114 | #endif | ||
| 115 | |||
| 116 | // Save the callee-saved registers used below: rbp (the zero register) and rbx (scratch) | ||
| 117 | |||
| 118 | push rbp | ||
| 119 | push rbx | ||
| 120 | |||
| 121 | // Move y out of rdx, which mulx needs as its implicit multiplicand | ||
| 122 | |||
| 123 | mov y, rdx | ||
| 124 | |||
| 125 | // Zero a register; the xor also clears CF so the adcx chain below starts with no carry-in | ||
| 126 | |||
| 127 | xor zeroe, zeroe | ||
| 128 | |||
| 129 | // Do the zeroth row, which differs because there is nothing in the window to add to yet. | ||
| 130 | // Write the low word of x[0] * y[0] straight to z[0], then accumulate | ||
| 131 | // [r8;r11;r10;r9] = digits 4..1 of y[0] * x | ||
| 132 | |||
| 133 | mov rdx, [y] | ||
| 134 | |||
| 135 | mulx r9, r8, [x] | ||
| 136 | mov [z], r8 | ||
| 137 | |||
| 138 | mulx r10, rbx, [x+8] | ||
| 139 | adcx r9, rbx | ||
| 140 | |||
| 141 | mulx r11, rbx, [x+16] | ||
| 142 | adcx r10, rbx | ||
| 143 | |||
| 144 | mulx r8, rbx, [x+24] | ||
| 145 | adcx r11, rbx | ||
| 146 | adcx r8, zero | ||
| 147 | |||
| 148 | // Now all the other rows in a uniform pattern; each one stores z[j] and rotates the window by one register | ||
| 149 | |||
| 150 | addrow 1 | ||
| 151 | addrow 2 | ||
| 152 | addrow 3 | ||
| 153 | |||
| 154 | // Write back the top four digits; after row 3 the window has rotated so that r8..r11 hold digits 4..7 | ||
| 155 | |||
| 156 | mov [z+32], r8 | ||
| 157 | mov [z+40], r9 | ||
| 158 | mov [z+48], r10 | ||
| 159 | mov [z+56], r11 | ||
| 160 | |||
| 161 | // Restore registers (reverse order of the pushes) and return | ||
| 162 | |||
| 163 | pop rbx | ||
| 164 | pop rbp | ||
| 165 | |||
| 166 | #if WINDOWS_ABI | ||
| 167 | pop rsi | ||
| 168 | pop rdi | ||
| 169 | #endif | ||
| 170 | ret | ||
| 171 | |||
| 172 | #if defined(__linux__) && defined(__ELF__) | ||
| 173 | .section .note.GNU-stack,"",%progbits | ||
| 174 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S new file mode 100644 index 0000000000..7bfae22a3f --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16.S | |||
| @@ -0,0 +1,260 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Multiply z := x * y | ||
| 6 | // Inputs x[8], y[8]; output z[16] | ||
| 7 | // | ||
| 8 | // extern void bignum_mul_8_16(uint64_t z[static 16], const uint64_t x[static 8], | ||
| 9 | // const uint64_t y[static 8]); | ||
| 10 | // | ||
| 11 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
| 12 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
| 13 | // ---------------------------------------------------------------------------- | ||
| 14 | |||
| 15 | #include "_internal_s2n_bignum.h" | ||
| 16 | |||
| 17 | .intel_syntax noprefix | ||
| 18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16) | ||
| 19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16) | ||
| 20 | .text | ||
| 21 | |||
| 22 | // z and x are already in these registers on entry (standard ABI), or are moved there first under WINDOWS_ABI | ||
| 23 | |||
| 24 | #define z rdi | ||
| 25 | #define x rsi | ||
| 26 | |||
| 27 | // y is copied here from rdx, because mulx uses rdx as its implicit multiplicand | ||
| 28 | |||
| 29 | #define y rcx | ||
| 30 | |||
| 31 | // A zero register | ||
| 32 | |||
| 33 | #define zero rbp | ||
| 34 | #define zeroe ebp | ||
| 35 | |||
| 36 | // mulpadd i, j adds x[i] * rdx (rdx assumed = y[j]) into the rotating window r8..r15 at slot (i+j) mod 8: the low half goes in via adcx (CF chain), the high half via adox (OF chain) into the next slot up. Clobbers rax, rbx. | ||
| 37 | |||
| 38 | .macro mulpadd arg1,arg2 | ||
| 39 | mulx rbx, rax, [x+8*\arg1] | ||
| 40 | .if ((\arg1 + \arg2) % 8 == 0) | ||
| 41 | adcx r8, rax | ||
| 42 | adox r9, rbx | ||
| 43 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
| 44 | adcx r9, rax | ||
| 45 | adox r10, rbx | ||
| 46 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
| 47 | adcx r10, rax | ||
| 48 | adox r11, rbx | ||
| 49 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
| 50 | adcx r11, rax | ||
| 51 | adox r12, rbx | ||
| 52 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
| 53 | adcx r12, rax | ||
| 54 | adox r13, rbx | ||
| 55 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
| 56 | adcx r13, rax | ||
| 57 | adox r14, rbx | ||
| 58 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
| 59 | adcx r14, rax | ||
| 60 | adox r15, rbx | ||
| 61 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
| 62 | adcx r15, rax | ||
| 63 | adox r8, rbx | ||
| 64 | .endif | ||
| 65 | |||
| 66 | .endm | ||
| 67 | |||
| 68 | // mulpade i, j is the row-ending variant of mulpadd: the low half of x[i] * rdx (rdx assumed = y[j]) is added at slot (i+j) mod 8 via adcx, | ||
| 69 | // while the high half is written directly into the next slot (overwriting its old value) and then only absorbs the pending OF carry. Clobbers rax. | ||
| 70 | |||
| 71 | .macro mulpade arg1,arg2 | ||
| 72 | .if ((\arg1 + \arg2) % 8 == 0) | ||
| 73 | mulx r9, rax, [x+8*\arg1] | ||
| 74 | adcx r8, rax | ||
| 75 | adox r9, zero | ||
| 76 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
| 77 | mulx r10, rax, [x+8*\arg1] | ||
| 78 | adcx r9, rax | ||
| 79 | adox r10, zero | ||
| 80 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
| 81 | mulx r11, rax, [x+8*\arg1] | ||
| 82 | adcx r10, rax | ||
| 83 | adox r11, zero | ||
| 84 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
| 85 | mulx r12, rax, [x+8*\arg1] | ||
| 86 | adcx r11, rax | ||
| 87 | adox r12, zero | ||
| 88 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
| 89 | mulx r13, rax, [x+8*\arg1] | ||
| 90 | adcx r12, rax | ||
| 91 | adox r13, zero | ||
| 92 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
| 93 | mulx r14, rax, [x+8*\arg1] | ||
| 94 | adcx r13, rax | ||
| 95 | adox r14, zero | ||
| 96 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
| 97 | mulx r15, rax, [x+8*\arg1] | ||
| 98 | adcx r14, rax | ||
| 99 | adox r15, zero | ||
| 100 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
| 101 | mulx r8, rax, [x+8*\arg1] | ||
| 102 | adcx r15, rax | ||
| 103 | adox r8, zero | ||
| 104 | .endif | ||
| 105 | |||
| 106 | .endm | ||
| 107 | |||
| 108 | // addrow j: add y[j] * x[0..7] into the window. Clears rbp (zero) and both carry flags, stores the finished digit j to z[j], re-creates that register as the new top digit via mulpade, and finally folds the remaining CF carry into it. | ||
| 109 | |||
| 110 | .macro addrow arg1 | ||
| 111 | mov rdx, [y+8*\arg1] | ||
| 112 | xor zeroe, zeroe | ||
| 113 | |||
| 114 | mulpadd 0, \arg1 | ||
| 115 | |||
| 116 | .if (\arg1 % 8 == 0) | ||
| 117 | mov [z+8*\arg1],r8 | ||
| 118 | .elseif (\arg1 % 8 == 1) | ||
| 119 | mov [z+8*\arg1],r9 | ||
| 120 | .elseif (\arg1 % 8 == 2) | ||
| 121 | mov [z+8*\arg1],r10 | ||
| 122 | .elseif (\arg1 % 8 == 3) | ||
| 123 | mov [z+8*\arg1],r11 | ||
| 124 | .elseif (\arg1 % 8 == 4) | ||
| 125 | mov [z+8*\arg1],r12 | ||
| 126 | .elseif (\arg1 % 8 == 5) | ||
| 127 | mov [z+8*\arg1],r13 | ||
| 128 | .elseif (\arg1 % 8 == 6) | ||
| 129 | mov [z+8*\arg1],r14 | ||
| 130 | .elseif (\arg1 % 8 == 7) | ||
| 131 | mov [z+8*\arg1],r15 | ||
| 132 | .endif | ||
| 133 | |||
| 134 | mulpadd 1, \arg1 | ||
| 135 | mulpadd 2, \arg1 | ||
| 136 | mulpadd 3, \arg1 | ||
| 137 | mulpadd 4, \arg1 | ||
| 138 | mulpadd 5, \arg1 | ||
| 139 | mulpadd 6, \arg1 | ||
| 140 | mulpade 7, \arg1 | ||
| 141 | |||
| 142 | .if (\arg1 % 8 == 0) | ||
| 143 | adc r8, zero | ||
| 144 | .elseif (\arg1 % 8 == 1) | ||
| 145 | adc r9, zero | ||
| 146 | .elseif (\arg1 % 8 == 2) | ||
| 147 | adc r10, zero | ||
| 148 | .elseif (\arg1 % 8 == 3) | ||
| 149 | adc r11, zero | ||
| 150 | .elseif (\arg1 % 8 == 4) | ||
| 151 | adc r12, zero | ||
| 152 | .elseif (\arg1 % 8 == 5) | ||
| 153 | adc r13, zero | ||
| 154 | .elseif (\arg1 % 8 == 6) | ||
| 155 | adc r14, zero | ||
| 156 | .elseif (\arg1 % 8 == 7) | ||
| 157 | adc r15, zero | ||
| 158 | .endif | ||
| 159 | |||
| 160 | .endm | ||
| 161 | |||
| 162 | |||
| 163 | S2N_BN_SYMBOL(bignum_mul_8_16): | ||
| 164 | _CET_ENDBR | ||
| 165 | |||
| 166 | #if WINDOWS_ABI | ||
| 167 | push rdi | ||
| 168 | push rsi | ||
| 169 | mov rdi, rcx | ||
| 170 | mov rsi, rdx | ||
| 171 | mov rdx, r8 | ||
| 172 | #endif | ||
| 173 | |||
| 174 | // Save the callee-saved registers used below: rbp (zero), rbx (scratch) and r12-r15 (window) | ||
| 175 | |||
| 176 | push rbp | ||
| 177 | push rbx | ||
| 178 | push r12 | ||
| 179 | push r13 | ||
| 180 | push r14 | ||
| 181 | push r15 | ||
| 182 | |||
| 183 | // Move y out of rdx, which mulx needs as its implicit multiplicand | ||
| 184 | |||
| 185 | mov y, rdx | ||
| 186 | |||
| 187 | // Zero a register; the xor also clears CF so the adc chain below starts with no carry-in | ||
| 188 | |||
| 189 | xor zeroe, zeroe | ||
| 190 | |||
| 191 | // Do the zeroth row, which differs because there is nothing in the window to add to yet. | ||
| 192 | // Write the low word of x[0] * y[0] straight to z[0], then accumulate | ||
| 193 | // [r8;r15;r14;r13;r12;r11;r10;r9] = digits 8..1 of y[0] * x | ||
| 194 | |||
| 195 | mov rdx, [y] | ||
| 196 | |||
| 197 | mulx r9, r8, [x] | ||
| 198 | mov [z], r8 | ||
| 199 | |||
| 200 | mulx r10, rbx, [x+8] | ||
| 201 | adc r9, rbx | ||
| 202 | |||
| 203 | mulx r11, rbx, [x+16] | ||
| 204 | adc r10, rbx | ||
| 205 | |||
| 206 | mulx r12, rbx, [x+24] | ||
| 207 | adc r11, rbx | ||
| 208 | |||
| 209 | mulx r13, rbx, [x+32] | ||
| 210 | adc r12, rbx | ||
| 211 | |||
| 212 | mulx r14, rbx, [x+40] | ||
| 213 | adc r13, rbx | ||
| 214 | |||
| 215 | mulx r15, rbx, [x+48] | ||
| 216 | adc r14, rbx | ||
| 217 | |||
| 218 | mulx r8, rbx, [x+56] | ||
| 219 | adc r15, rbx | ||
| 220 | adc r8, zero | ||
| 221 | |||
| 222 | // Now all the other rows in a uniform pattern; each one stores z[j] and rotates the window by one register | ||
| 223 | |||
| 224 | addrow 1 | ||
| 225 | addrow 2 | ||
| 226 | addrow 3 | ||
| 227 | addrow 4 | ||
| 228 | addrow 5 | ||
| 229 | addrow 6 | ||
| 230 | addrow 7 | ||
| 231 | |||
| 232 | // Write back the top eight digits; after row 7 the window has rotated so that r8..r15 hold digits 8..15 | ||
| 233 | |||
| 234 | mov [z+64], r8 | ||
| 235 | mov [z+72], r9 | ||
| 236 | mov [z+80], r10 | ||
| 237 | mov [z+88], r11 | ||
| 238 | mov [z+96], r12 | ||
| 239 | mov [z+104], r13 | ||
| 240 | mov [z+112], r14 | ||
| 241 | mov [z+120], r15 | ||
| 242 | |||
| 243 | // Restore callee-saved registers (reverse order of the pushes) and return | ||
| 244 | |||
| 245 | pop r15 | ||
| 246 | pop r14 | ||
| 247 | pop r13 | ||
| 248 | pop r12 | ||
| 249 | pop rbx | ||
| 250 | pop rbp | ||
| 251 | |||
| 252 | #if WINDOWS_ABI | ||
| 253 | pop rsi | ||
| 254 | pop rdi | ||
| 255 | #endif | ||
| 256 | ret | ||
| 257 | |||
| 258 | #if defined(__linux__) && defined(__ELF__) | ||
| 259 | .section .note.GNU-stack,"",%progbits | ||
| 260 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S new file mode 100644 index 0000000000..205455bd2c --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8.S | |||
| @@ -0,0 +1,145 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Square, z := x^2 | ||
| 6 | // Input x[4]; output z[8] | ||
| 7 | // | ||
| 8 | // extern void bignum_sqr_4_8(uint64_t z[static 8], const uint64_t x[static 4]); | ||
| 9 | // | ||
| 10 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
| 11 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
| 12 | // ---------------------------------------------------------------------------- | ||
| 13 | |||
| 14 | #include "_internal_s2n_bignum.h" | ||
| 15 | |||
| 16 | .intel_syntax noprefix | ||
| 17 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8) | ||
| 18 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8) | ||
| 19 | .text | ||
| 20 | |||
| 21 | // z and x are already in these registers on entry (standard ABI), or are moved there first under WINDOWS_ABI | ||
| 22 | |||
| 23 | #define z rdi | ||
| 24 | #define x rsi | ||
| 25 | |||
| 26 | // A zero register | ||
| 27 | |||
| 28 | #define zero rbp | ||
| 29 | #define zeroe ebp | ||
| 30 | |||
| 31 | // Other registers | ||
| 32 | |||
| 33 | #define d1 r8 | ||
| 34 | #define d2 r9 | ||
| 35 | #define d3 r10 | ||
| 36 | #define d4 r11 | ||
| 37 | #define d5 r12 | ||
| 38 | #define d6 r13 | ||
| 39 | |||
| 40 | |||
| 41 | |||
| 42 | S2N_BN_SYMBOL(bignum_sqr_4_8): | ||
| 43 | _CET_ENDBR | ||
| 44 | |||
| 45 | #if WINDOWS_ABI | ||
| 46 | push rdi | ||
| 47 | push rsi | ||
| 48 | mov rdi, rcx | ||
| 49 | mov rsi, rdx | ||
| 50 | #endif | ||
| 51 | |||
| 52 | // Save the callee-saved registers used below: rbp (zero) and r12, r13 (d5, d6) | ||
| 53 | |||
| 54 | push rbp | ||
| 55 | push r12 | ||
| 56 | push r13 | ||
| 57 | |||
| 58 | // Set up an initial window [d6;...d1] = [23;03;01], where "ij" denotes x[i] * x[j] and d1 is digit 1 of the result | ||
| 59 | |||
| 60 | mov rdx, [x] | ||
| 61 | mulx d2, d1, [x+8] | ||
| 62 | mulx d4, d3, [x+24] | ||
| 63 | mov rdx, [x+16] | ||
| 64 | mulx d6, d5, [x+24] | ||
| 65 | |||
| 66 | // Clear our zero register, and also initialize both CF and OF for the two carry chains | ||
| 67 | |||
| 68 | xor zeroe, zeroe | ||
| 69 | |||
| 70 | // Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) | ||
| 71 | // This gives all the off-diagonal ("heterogeneous") terms of the squaring, ready to double | ||
| 72 | |||
| 73 | mulx rcx, rax, [x] | ||
| 74 | adcx d2, rax | ||
| 75 | adox d3, rcx | ||
| 76 | mulx rcx, rax, [x+8] | ||
| 77 | adcx d3, rax | ||
| 78 | adox d4, rcx | ||
| 79 | mov rdx, [x+24] | ||
| 80 | mulx rcx, rax, [x+8] | ||
| 81 | adcx d4, rax | ||
| 82 | adox d5, rcx | ||
| 83 | adcx d5, zero | ||
| 84 | adox d6, zero | ||
| 85 | adcx d6, zero | ||
| 86 | |||
| 87 | // In principle this is redundant as CF and OF carries are absorbed at this point | ||
| 88 | // However it seems helpful for the OOO engine to be told it's a fresh start | ||
| 89 | |||
| 90 | xor zeroe, zeroe | ||
| 91 | |||
| 92 | // Double and add to the 00 + 11 + 22 + 33 terms: adcx d,d doubles along the CF chain, adox adds the square along the OF chain | ||
| 93 | // | ||
| 94 | // We could use shift-double but this seems tidier and in larger squarings | ||
| 95 | // it was actually more efficient. I haven't experimented with this small | ||
| 96 | // case to see how much that matters. Note: the writeback here is sprinkled | ||
| 97 | // into the sequence in such a way that things still work if z = x, i.e. if | ||
| 98 | // the output overwrites the input buffer and beyond: each z[k] store follows the last load of x[k]. | ||
| 99 | |||
| 100 | mov rdx, [x] | ||
| 101 | mulx rdx, rax, rdx | ||
| 102 | mov [z], rax | ||
| 103 | adcx d1, d1 | ||
| 104 | adox d1, rdx | ||
| 105 | mov rdx, [x+8] | ||
| 106 | mov [z+8], d1 | ||
| 107 | mulx rdx, rax, rdx | ||
| 108 | adcx d2, d2 | ||
| 109 | adox d2, rax | ||
| 110 | adcx d3, d3 | ||
| 111 | adox d3, rdx | ||
| 112 | mov rdx, [x+16] | ||
| 113 | mov [z+16], d2 | ||
| 114 | mulx rdx, rax, rdx | ||
| 115 | adcx d4, d4 | ||
| 116 | adox d4, rax | ||
| 117 | adcx d5, d5 | ||
| 118 | adox d5, rdx | ||
| 119 | mov rdx, [x+24] | ||
| 120 | mov [z+24], d3 | ||
| 121 | mulx rdx, rax, rdx | ||
| 122 | mov [z+32], d4 | ||
| 123 | adcx d6, d6 | ||
| 124 | mov [z+40], d5 | ||
| 125 | adox d6, rax | ||
| 126 | mov [z+48], d6 | ||
| 127 | adcx rdx, zero | ||
| 128 | adox rdx, zero | ||
| 129 | mov [z+56], rdx | ||
| 130 | |||
| 131 | // Restore saved registers (reverse order of the pushes) and return | ||
| 132 | |||
| 133 | pop r13 | ||
| 134 | pop r12 | ||
| 135 | pop rbp | ||
| 136 | |||
| 137 | #if WINDOWS_ABI | ||
| 138 | pop rsi | ||
| 139 | pop rdi | ||
| 140 | #endif | ||
| 141 | ret | ||
| 142 | |||
| 143 | #if defined(__linux__) && defined(__ELF__) | ||
| 144 | .section .note.GNU-stack,"",%progbits | ||
| 145 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S new file mode 100644 index 0000000000..77741c603e --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16.S | |||
| @@ -0,0 +1,298 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Square, z := x^2 | ||
| 6 | // Input x[8]; output z[16] | ||
| 7 | // | ||
| 8 | // extern void bignum_sqr_8_16(uint64_t z[static 16], const uint64_t x[static 8]); | ||
| 9 | // | ||
| 10 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
| 11 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
| 12 | // ---------------------------------------------------------------------------- | ||
| 13 | |||
| 14 | #include "_internal_s2n_bignum.h" | ||
| 15 | |||
| 16 | .intel_syntax noprefix | ||
| 17 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16) | ||
| 18 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16) | ||
| 19 | .text | ||
| 20 | |||
| 21 | // z and x are already in these registers on entry (standard ABI), or are moved there first under WINDOWS_ABI | ||
| 22 | |||
| 23 | #define z rdi | ||
| 24 | #define x rsi | ||
| 25 | |||
| 26 | // A zero register | ||
| 27 | |||
| 28 | #define zero rbp | ||
| 29 | #define zeroe ebp | ||
| 30 | |||
| 31 | // mulpadd i, j adds rdx * x[i] (callers load rdx = x[j]) into the rotating window r8..r15 at slot (i+j) mod 8: the low half goes in via adcx (CF chain), the high half via adox (OF chain) into the next slot up. Clobbers rax, rcx. | ||
| 32 | |||
| 33 | .macro mulpadd arg1,arg2 | ||
| 34 | mulx rcx, rax, [x+8*\arg1] | ||
| 35 | .if ((\arg1 + \arg2) % 8 == 0) | ||
| 36 | adcx r8, rax | ||
| 37 | adox r9, rcx | ||
| 38 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
| 39 | adcx r9, rax | ||
| 40 | adox r10, rcx | ||
| 41 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
| 42 | adcx r10, rax | ||
| 43 | adox r11, rcx | ||
| 44 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
| 45 | adcx r11, rax | ||
| 46 | adox r12, rcx | ||
| 47 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
| 48 | adcx r12, rax | ||
| 49 | adox r13, rcx | ||
| 50 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
| 51 | adcx r13, rax | ||
| 52 | adox r14, rcx | ||
| 53 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
| 54 | adcx r14, rax | ||
| 55 | adox r15, rcx | ||
| 56 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
| 57 | adcx r15, rax | ||
| 58 | adox r8, rcx | ||
| 59 | .endif | ||
| 60 | |||
| 61 | .endm | ||
| 62 | |||
| 63 | // mulpade i, j adds the low half of rdx * x[i] into the window at slot (i+j) mod 8 via adcx, | ||
| 64 | // but re-creates the next slot from the high half (overwriting its old value), which then only absorbs the pending OF carry. Clobbers rax. | ||
| 65 | |||
| 66 | .macro mulpade arg1,arg2 | ||
| 67 | .if ((\arg1 + \arg2) % 8 == 0) | ||
| 68 | mulx r9, rax, [x+8*\arg1] | ||
| 69 | adcx r8, rax | ||
| 70 | adox r9, zero | ||
| 71 | .elseif ((\arg1 + \arg2) % 8 == 1) | ||
| 72 | mulx r10, rax, [x+8*\arg1] | ||
| 73 | adcx r9, rax | ||
| 74 | adox r10, zero | ||
| 75 | .elseif ((\arg1 + \arg2) % 8 == 2) | ||
| 76 | mulx r11, rax, [x+8*\arg1] | ||
| 77 | adcx r10, rax | ||
| 78 | adox r11, zero | ||
| 79 | .elseif ((\arg1 + \arg2) % 8 == 3) | ||
| 80 | mulx r12, rax, [x+8*\arg1] | ||
| 81 | adcx r11, rax | ||
| 82 | adox r12, zero | ||
| 83 | .elseif ((\arg1 + \arg2) % 8 == 4) | ||
| 84 | mulx r13, rax, [x+8*\arg1] | ||
| 85 | adcx r12, rax | ||
| 86 | adox r13, zero | ||
| 87 | .elseif ((\arg1 + \arg2) % 8 == 5) | ||
| 88 | mulx r14, rax, [x+8*\arg1] | ||
| 89 | adcx r13, rax | ||
| 90 | adox r14, zero | ||
| 91 | .elseif ((\arg1 + \arg2) % 8 == 6) | ||
| 92 | mulx r15, rax, [x+8*\arg1] | ||
| 93 | adcx r14, rax | ||
| 94 | adox r15, zero | ||
| 95 | .elseif ((\arg1 + \arg2) % 8 == 7) | ||
| 96 | mulx r8, rax, [x+8*\arg1] | ||
| 97 | adcx r15, rax | ||
| 98 | adox r8, zero | ||
| 99 | .endif | ||
| 100 | |||
| 101 | .endm | ||
| 102 | |||
| 103 | .macro diagonals | ||
| 104 | |||
| 105 | xor zeroe, zeroe | ||
| 106 | |||
| 107 | // Initial row 10 + 20 + 30 + 40 + 50 + 60 + 70 ("ij" = x[i] * x[j]): digits 1 and 2 are written back to z[1], z[2]; digits 3..8 stay in r11..r15, r8 | ||
| 108 | |||
| 109 | mov rdx, [x] | ||
| 110 | mulx rax, r9, [x+8] | ||
| 111 | mov [z+8], r9 | ||
| 112 | mulx rcx, r10, [x+16] | ||
| 113 | adcx r10, rax | ||
| 114 | mov [z+16], r10 | ||
| 115 | mulx rax, r11, [x+24] | ||
| 116 | adcx r11, rcx | ||
| 117 | mulx rcx, r12, [x+32] | ||
| 118 | adcx r12, rax | ||
| 119 | mulx rax, r13, [x+40] | ||
| 120 | adcx r13, rcx | ||
| 121 | mulx rcx, r14, [x+48] | ||
| 122 | adcx r14, rax | ||
| 123 | mulx r8, r15, [x+56] | ||
| 124 | adcx r15, rcx | ||
| 125 | adcx r8, zero | ||
| 126 | |||
| 127 | // Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54; digits 3 and 4 become final and go to z[3], z[4] | ||
| 128 | |||
| 129 | xor zeroe, zeroe | ||
| 130 | mov rdx, [x+8] | ||
| 131 | mulpadd 2, 1 | ||
| 132 | mov [z+24], r11 | ||
| 133 | mulpadd 3, 1 | ||
| 134 | mov [z+32], r12 | ||
| 135 | mulpadd 4, 1 | ||
| 136 | mulpadd 5, 1 | ||
| 137 | mulpadd 6, 1 | ||
| 138 | mulpade 7, 1 | ||
| 139 | mov rdx, [x+32] | ||
| 140 | mulpade 5, 4 | ||
| 141 | adcx r10, zero | ||
| 142 | |||
| 143 | // And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65; digits 5 and 6 become final and go to z[5], z[6] | ||
| 144 | |||
| 145 | xor zeroe, zeroe | ||
| 146 | mov rdx, [x+16] | ||
| 147 | mulpadd 3, 2 | ||
| 148 | mov [z+40], r13 | ||
| 149 | mulpadd 4, 2 | ||
| 150 | mov [z+48], r14 | ||
| 151 | mulpadd 5, 2 | ||
| 152 | mulpadd 6, 2 | ||
| 153 | mulpadd 7, 2 | ||
| 154 | mov rdx, [x+48] | ||
| 155 | mulpade 4, 6 | ||
| 156 | mulpade 5, 6 | ||
| 157 | adcx r12, zero | ||
| 158 | |||
| 159 | // And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76; digits 7 and 8 become final and go to z[7], z[8] | ||
| 160 | |||
| 161 | xor zeroe, zeroe | ||
| 162 | mov rdx, [x+24] | ||
| 163 | mulpadd 4, 3 | ||
| 164 | mov [z+56], r15 | ||
| 165 | mulpadd 5, 3 | ||
| 166 | mov [z+64], r8 | ||
| 167 | mulpadd 6, 3 | ||
| 168 | mulpadd 7, 3 | ||
| 169 | mov rdx, [x+56] | ||
| 170 | mulpadd 4, 7 | ||
| 171 | mulpade 5, 7 | ||
| 172 | mulpade 6, 7 | ||
| 173 | adcx r14, zero | ||
| 174 | |||
| 175 | // Double the off-diagonal sum and add the squares x[i]^2: adcx doubles along the CF chain, adox adds the square along the OF chain. | ||
| 176 | // Digits 1..8 are reloaded from z[1]..z[8]; digits 9..14 are still in r9..r14, which haven't been written back yet | ||
| 177 | |||
| 178 | xor zeroe, zeroe | ||
| 179 | mov rdx, [x] | ||
| 180 | mulx rcx, rax, rdx | ||
| 181 | mov [z], rax | ||
| 182 | mov rax, [z+8] | ||
| 183 | adcx rax, rax | ||
| 184 | adox rax, rcx | ||
| 185 | mov [z+8], rax | ||
| 186 | |||
| 187 | mov rax, [z+16] | ||
| 188 | mov rdx, [x+8] | ||
| 189 | mulx rcx, rdx, rdx | ||
| 190 | adcx rax, rax | ||
| 191 | adox rax, rdx | ||
| 192 | mov [z+16], rax | ||
| 193 | mov rax, [z+24] | ||
| 194 | adcx rax, rax | ||
| 195 | adox rax, rcx | ||
| 196 | mov [z+24], rax | ||
| 197 | |||
| 198 | mov rax, [z+32] | ||
| 199 | mov rdx, [x+16] | ||
| 200 | mulx rcx, rdx, rdx | ||
| 201 | adcx rax, rax | ||
| 202 | adox rax, rdx | ||
| 203 | mov [z+32], rax | ||
| 204 | mov rax, [z+40] | ||
| 205 | adcx rax, rax | ||
| 206 | adox rax, rcx | ||
| 207 | mov [z+40], rax | ||
| 208 | |||
| 209 | mov rax, [z+48] | ||
| 210 | mov rdx, [x+24] | ||
| 211 | mulx rcx, rdx, rdx | ||
| 212 | adcx rax, rax | ||
| 213 | adox rax, rdx | ||
| 214 | mov [z+48], rax | ||
| 215 | mov rax, [z+56] | ||
| 216 | adcx rax, rax | ||
| 217 | adox rax, rcx | ||
| 218 | mov [z+56], rax | ||
| 219 | |||
| 220 | mov rax, [z+64] | ||
| 221 | mov rdx, [x+32] | ||
| 222 | mulx rcx, rdx, rdx | ||
| 223 | adcx rax, rax | ||
| 224 | adox rax, rdx | ||
| 225 | mov [z+64], rax | ||
| 226 | adcx r9, r9 | ||
| 227 | adox r9, rcx | ||
| 228 | mov [z+72], r9 | ||
| 229 | |||
| 230 | mov rdx, [x+40] | ||
| 231 | mulx rcx, rdx, rdx | ||
| 232 | adcx r10, r10 | ||
| 233 | adox r10, rdx | ||
| 234 | mov [z+80], r10 | ||
| 235 | adcx r11, r11 | ||
| 236 | adox r11, rcx | ||
| 237 | mov [z+88], r11 | ||
| 238 | |||
| 239 | mov rdx, [x+48] | ||
| 240 | mulx rcx, rdx, rdx | ||
| 241 | adcx r12, r12 | ||
| 242 | adox r12, rdx | ||
| 243 | mov [z+96], r12 | ||
| 244 | adcx r13, r13 | ||
| 245 | adox r13, rcx | ||
| 246 | mov [z+104], r13 | ||
| 247 | |||
| 248 | mov rdx, [x+56] | ||
| 249 | mulx r15, rdx, rdx | ||
| 250 | adcx r14, r14 | ||
| 251 | adox r14, rdx | ||
| 252 | mov [z+112], r14 | ||
| 253 | adcx r15, zero | ||
| 254 | adox r15, zero | ||
| 255 | mov [z+120], r15 | ||
| 256 | |||
| 257 | .endm | ||
| 258 | |||
| 259 | |||
| 260 | S2N_BN_SYMBOL(bignum_sqr_8_16): | ||
| 261 | _CET_ENDBR | ||
| 262 | |||
| 263 | #if WINDOWS_ABI | ||
| 264 | push rdi | ||
| 265 | push rsi | ||
| 266 | mov rdi, rcx | ||
| 267 | mov rsi, rdx | ||
| 268 | #endif | ||
| 269 | |||
| 270 | // Save the callee-saved registers used by the diagonals macro: rbp (zero) and r12-r15 (window) | ||
| 271 | |||
| 272 | push rbp | ||
| 273 | push r12 | ||
| 274 | push r13 | ||
| 275 | push r14 | ||
| 276 | push r15 | ||
| 277 | |||
| 278 | // Do the squaring; the whole computation, including all writes to z, is in the diagonals macro | ||
| 279 | |||
| 280 | diagonals | ||
| 281 | |||
| 282 | // Restore callee-saved registers (reverse order of the pushes) and return | ||
| 283 | |||
| 284 | pop r15 | ||
| 285 | pop r14 | ||
| 286 | pop r13 | ||
| 287 | pop r12 | ||
| 288 | pop rbp | ||
| 289 | |||
| 290 | #if WINDOWS_ABI | ||
| 291 | pop rsi | ||
| 292 | pop rdi | ||
| 293 | #endif | ||
| 294 | ret | ||
| 295 | |||
| 296 | #if defined(__linux__) && defined(__ELF__) | ||
| 297 | .section .note.GNU-stack,"",%progbits | ||
| 298 | #endif | ||
