diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_add.S | 153 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S | 143 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S | 126 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul.S | 155 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S | 145 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S | 232 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S | 133 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S | 230 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/arch/amd64/bignum_sub.S | 141 |
9 files changed, 1458 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S new file mode 100644 index 0000000000..33663916bb --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S | |||
| @@ -0,0 +1,153 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Add, z := x + y | ||
| 6 | // Inputs x[m], y[n]; outputs function return (carry-out) and z[p] | ||
| 7 | // | ||
| 8 | // extern uint64_t bignum_add | ||
| 9 | // (uint64_t p, uint64_t *z, | ||
| 10 | // uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); | ||
| 11 | // | ||
| 12 | // Does the z := x + y operation, truncating modulo p words in general and | ||
| 13 | // returning a top carry (0 or 1) in the p'th place, only adding the input | ||
| 14 | // words below p (as well as m and n respectively) to get the sum and carry. | ||
| 15 | // | ||
| 16 | // Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX | ||
| 17 | // Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX | ||
| 18 | // ---------------------------------------------------------------------------- | ||
| 19 | |||
| 20 | #include "_internal_s2n_bignum.h" | ||
| 21 | |||
| 22 | .intel_syntax noprefix | ||
| 23 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add) | ||
| 24 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add) | ||
| 25 | .text | ||
| 26 | |||
| 27 | #define p rdi | ||
| 28 | #define z rsi | ||
| 29 | #define m rdx | ||
| 30 | #define x rcx | ||
| 31 | #define n r8 | ||
| 32 | #define y r9 | ||
| 33 | #define i r10 | ||
| 34 | #define a rax | ||
| 35 | |||
| 36 | #define ashort eax | ||
| 37 | |||
| 38 | |||
| 39 | |||
| 40 | S2N_BN_SYMBOL(bignum_add): | ||
| 41 | |||
| 42 | #if WINDOWS_ABI | ||
| 43 | push rdi | ||
| 44 | push rsi | ||
| 45 | mov rdi, rcx | ||
| 46 | mov rsi, rdx | ||
| 47 | mov rdx, r8 | ||
| 48 | mov rcx, r9 | ||
| 49 | mov r8, [rsp+56] | ||
| 50 | mov r9, [rsp+64] | ||
| 51 | #endif | ||
| 52 | |||
| 53 | // Zero the main word index i, which is shared by both branches below | ||
| 54 | |||
| 55 | xor i, i | ||
| 56 | |||
| 57 | // First clamp the two input sizes m := min(p,m) and n := min(p,n) since | ||
| 58 | // we'll never need words past the p'th. Can now assume m <= p and n <= p. | ||
| 59 | // Then compare the clamped m and n, branching to ylonger when m < n | ||
| 60 | |||
| 61 | cmp p, m | ||
| 62 | cmovc m, p | ||
| 63 | cmp p, n | ||
| 64 | cmovc n, p | ||
| 65 | cmp m, n | ||
| 66 | jc ylonger | ||
| 67 | |||
| 68 | // Case x longer or equal (p >= m >= n): p := p - m is the tail length, m := m - n + 1 the top-loop count (pre-incremented for xtest); test clears CF and inc/dec preserve it for the adc chain | ||
| 69 | |||
| 70 | sub p, m | ||
| 71 | sub m, n | ||
| 72 | inc m | ||
| 73 | test n, n | ||
| 74 | jz xtest | ||
| 75 | xmainloop: | ||
| 76 | mov a, [x+8*i] | ||
| 77 | adc a, [y+8*i] | ||
| 78 | mov [z+8*i],a | ||
| 79 | inc i | ||
| 80 | dec n | ||
| 81 | jnz xmainloop | ||
| 82 | jmp xtest | ||
| 83 | xtoploop: | ||
| 84 | mov a, [x+8*i] | ||
| 85 | adc a, 0 | ||
| 86 | mov [z+8*i],a | ||
| 87 | inc i | ||
| 88 | xtest: | ||
| 89 | dec m | ||
| 90 | jnz xtoploop | ||
| 91 | mov ashort, 0 | ||
| 92 | adc a, 0 | ||
| 93 | test p, p | ||
| 94 | jnz tails | ||
| 95 | #if WINDOWS_ABI | ||
| 96 | pop rsi | ||
| 97 | pop rdi | ||
| 98 | #endif | ||
| 99 | ret | ||
| 100 | |||
| 101 | // Case y longer (p >= n > m): p := p - n is the tail length, n := n - m the top-loop count (nonzero since n > m); test clears CF before the first adc | ||
| 102 | |||
| 103 | ylonger: | ||
| 104 | |||
| 105 | sub p, n | ||
| 106 | sub n, m | ||
| 107 | test m, m | ||
| 108 | jz ytoploop | ||
| 109 | ymainloop: | ||
| 110 | mov a, [x+8*i] | ||
| 111 | adc a, [y+8*i] | ||
| 112 | mov [z+8*i],a | ||
| 113 | inc i | ||
| 114 | dec m | ||
| 115 | jnz ymainloop | ||
| 116 | ytoploop: | ||
| 117 | mov a, [y+8*i] | ||
| 118 | adc a, 0 | ||
| 119 | mov [z+8*i],a | ||
| 120 | inc i | ||
| 121 | dec n | ||
| 122 | jnz ytoploop | ||
| 123 | mov ashort, 0 | ||
| 124 | adc a, 0 | ||
| 125 | test p, p | ||
| 126 | jnz tails | ||
| 127 | #if WINDOWS_ABI | ||
| 128 | pop rsi | ||
| 129 | pop rdi | ||
| 130 | #endif | ||
| 131 | ret | ||
| 132 | |||
| 133 | // Adding a non-trivial tail, when p > max(m,n): store the carry word at z[i], zero-fill the remaining p - 1 words, and return 0 in rax | ||
| 134 | |||
| 135 | tails: | ||
| 136 | mov [z+8*i],a | ||
| 137 | xor a, a | ||
| 138 | jmp tail | ||
| 139 | tailloop: | ||
| 140 | mov [z+8*i],a | ||
| 141 | tail: | ||
| 142 | inc i | ||
| 143 | dec p | ||
| 144 | jnz tailloop | ||
| 145 | #if WINDOWS_ABI | ||
| 146 | pop rsi | ||
| 147 | pop rdi | ||
| 148 | #endif | ||
| 149 | ret | ||
| 150 | |||
| 151 | #if defined(__linux__) && defined(__ELF__) | ||
| 152 | .section .note.GNU-stack,"",%progbits | ||
| 153 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S new file mode 100644 index 0000000000..33f9be2fa0 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S | |||
| @@ -0,0 +1,143 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Multiply-add with single-word multiplier, z := z + c * y | ||
| 6 | // Inputs c, y[n]; outputs function return (carry-out) and z[k] | ||
| 7 | // | ||
| 8 | // extern uint64_t bignum_cmadd | ||
| 9 | // (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); | ||
| 10 | // | ||
| 11 | // Does the "z := z + c * y" operation where y is n digits, result z is p digits (p is the size argument k). | ||
| 12 | // Truncates the result in general. | ||
| 13 | // | ||
| 14 | // The return value is a high/carry word that is meaningful when p = n + 1, or | ||
| 15 | // more generally when n <= p and the result fits in p + 1 digits. In these | ||
| 16 | // cases it gives the top digit of the (p + 1)-digit result. | ||
| 17 | // | ||
| 18 | // Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX | ||
| 19 | // Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX | ||
| 20 | // ---------------------------------------------------------------------------- | ||
| 21 | |||
| 22 | #include "_internal_s2n_bignum.h" | ||
| 23 | |||
| 24 | .intel_syntax noprefix | ||
| 25 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd) | ||
| 26 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd) | ||
| 27 | .text | ||
| 28 | |||
| 29 | #define p rdi | ||
| 30 | #define z rsi | ||
| 31 | #define c r9 | ||
| 32 | #define n rcx | ||
| 33 | #define x r8 | ||
| 34 | |||
| 35 | #define i r10 | ||
| 36 | #define h r11 | ||
| 37 | |||
| 38 | #define r rbx | ||
| 39 | |||
| 40 | #define hshort r11d | ||
| 41 | #define ishort r10d | ||
| 42 | |||
| 43 | |||
| 44 | |||
| 45 | S2N_BN_SYMBOL(bignum_cmadd): | ||
| 46 | |||
| 47 | #if WINDOWS_ABI | ||
| 48 | push rdi | ||
| 49 | push rsi | ||
| 50 | mov rdi, rcx | ||
| 51 | mov rsi, rdx | ||
| 52 | mov rdx, r8 | ||
| 53 | mov rcx, r9 | ||
| 54 | mov r8, [rsp+56] | ||
| 55 | #endif | ||
| 56 | |||
| 57 | // Seems hard to avoid one more register: preserve rbx, which is used below as r | ||
| 58 | |||
| 59 | push rbx | ||
| 60 | |||
| 61 | // First clamp the input size n := min(p,n) since we can never need to read | ||
| 62 | // past the p'th term of the input to generate p-digit output. | ||
| 63 | // Subtract p := p - min(n,p) so it holds the size of the extra tail needed | ||
| 64 | |||
| 65 | cmp p, n | ||
| 66 | cmovc n, p | ||
| 67 | sub p, n | ||
| 68 | |||
| 69 | // Initialize high part h = 0; if n = 0 do nothing but return that zero | ||
| 70 | |||
| 71 | xor h, h | ||
| 72 | test n, n | ||
| 73 | jz end | ||
| 74 | |||
| 75 | // Move c into a safer register as multiplies overwrite rdx | ||
| 76 | |||
| 77 | mov c, rdx | ||
| 78 | |||
| 79 | // Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0 (x is the register alias for the y argument) | ||
| 80 | |||
| 81 | mov rax, [x] | ||
| 82 | mul c | ||
| 83 | add [z], rax | ||
| 84 | mov h, rdx | ||
| 85 | mov ishort, 1 | ||
| 86 | dec n | ||
| 87 | jz hightail | ||
| 88 | |||
| 89 | // Main loop, where we always have CF + previous high part h to add in; sbb r, r saves the adc carry as 0 or -1 and sub rdx, r folds it into the high product word | ||
| 90 | |||
| 91 | loop: | ||
| 92 | adc h, [z+8*i] | ||
| 93 | sbb r, r | ||
| 94 | mov rax, [x+8*i] | ||
| 95 | mul c | ||
| 96 | sub rdx, r | ||
| 97 | add rax, h | ||
| 98 | mov [z+8*i], rax | ||
| 99 | mov h, rdx | ||
| 100 | inc i | ||
| 101 | dec n | ||
| 102 | jnz loop | ||
| 103 | |||
| 104 | hightail: | ||
| 105 | adc h, 0 | ||
| 106 | |||
| 107 | // Propagate the carry all the way to the end with h as extra carry word (h is added into the first tail word, then zeroed so only CF ripples onward) | ||
| 108 | |||
| 109 | tail: | ||
| 110 | test p, p | ||
| 111 | jz end | ||
| 112 | |||
| 113 | add [z+8*i], h | ||
| 114 | mov hshort, 0 | ||
| 115 | inc i | ||
| 116 | dec p | ||
| 117 | jz highend | ||
| 118 | |||
| 119 | tloop: | ||
| 120 | adc [z+8*i], h | ||
| 121 | inc i | ||
| 122 | dec p | ||
| 123 | jnz tloop | ||
| 124 | |||
| 125 | highend: | ||
| 126 | |||
| 127 | adc h, 0 | ||
| 128 | |||
| 129 | // Return the high/carry word in rax, then restore the saved registers | ||
| 130 | |||
| 131 | end: | ||
| 132 | mov rax, h | ||
| 133 | |||
| 134 | pop rbx | ||
| 135 | #if WINDOWS_ABI | ||
| 136 | pop rsi | ||
| 137 | pop rdi | ||
| 138 | #endif | ||
| 139 | ret | ||
| 140 | |||
| 141 | #if defined(__linux__) && defined(__ELF__) | ||
| 142 | .section .note.GNU-stack,"",%progbits | ||
| 143 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S new file mode 100644 index 0000000000..6d184e3f39 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S | |||
| @@ -0,0 +1,126 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Multiply by a single word, z := c * y | ||
| 6 | // Inputs c, y[n]; outputs function return (carry-out) and z[k] | ||
| 7 | // | ||
| 8 | // extern uint64_t bignum_cmul | ||
| 9 | // (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); | ||
| 10 | // | ||
| 11 | // Does the "z := c * y" operation where y is n digits, result z is p digits (p is the size argument k). | ||
| 12 | // Truncates the result in general unless p >= n + 1. | ||
| 13 | // | ||
| 14 | // The return value is a high/carry word that is meaningful when p >= n as | ||
| 15 | // giving the high part of the result. Since this is always zero if p > n, | ||
| 16 | // it is mainly of interest in the special case p = n, i.e. where the source | ||
| 17 | // and destination have the same nominal size, when it gives the extra word | ||
| 18 | // of the full result. | ||
| 19 | // | ||
| 20 | // Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX | ||
| 21 | // Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX | ||
| 22 | // ---------------------------------------------------------------------------- | ||
| 23 | |||
| 24 | #include "_internal_s2n_bignum.h" | ||
| 25 | |||
| 26 | .intel_syntax noprefix | ||
| 27 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul) | ||
| 28 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul) | ||
| 29 | .text | ||
| 30 | |||
| 31 | #define p rdi | ||
| 32 | #define z rsi | ||
| 33 | #define c r9 | ||
| 34 | #define n rcx | ||
| 35 | #define x r8 | ||
| 36 | |||
| 37 | #define i r10 | ||
| 38 | #define h r11 | ||
| 39 | |||
| 40 | |||
| 41 | |||
| 42 | S2N_BN_SYMBOL(bignum_cmul): | ||
| 43 | |||
| 44 | #if WINDOWS_ABI | ||
| 45 | push rdi | ||
| 46 | push rsi | ||
| 47 | mov rdi, rcx | ||
| 48 | mov rsi, rdx | ||
| 49 | mov rdx, r8 | ||
| 50 | mov rcx, r9 | ||
| 51 | mov r8, [rsp+56] | ||
| 52 | #endif | ||
| 53 | |||
| 54 | // First clamp the input size n := min(p,n) since we can never need to read | ||
| 55 | // past the p'th term of the input to generate p-digit output. Now we can | ||
| 56 | // assume that n <= p | ||
| 57 | |||
| 58 | cmp p, n | ||
| 59 | cmovc n, p | ||
| 60 | |||
| 61 | // Initialize current input/output pointer offset i and high part h. | ||
| 62 | // But then if n = 0 skip the multiplication and go to the tail part | ||
| 63 | |||
| 64 | xor h, h | ||
| 65 | xor i, i | ||
| 66 | test n, n | ||
| 67 | jz tail | ||
| 68 | |||
| 69 | // Move c into a safer register as multiplies overwrite rdx | ||
| 70 | |||
| 71 | mov c, rdx | ||
| 72 | |||
| 73 | // Initialization of the loop: [h,l] = c * x_0 (x is the register alias for the y argument; the low word l is stored directly to z[0]) | ||
| 74 | |||
| 75 | mov rax, [x] | ||
| 76 | mul c | ||
| 77 | mov [z], rax | ||
| 78 | mov h, rdx | ||
| 79 | inc i | ||
| 80 | cmp i, n | ||
| 81 | jz tail | ||
| 82 | |||
| 83 | // Main loop doing the multiplications: z[i] = low(c * x_i) + h, with the new h = high(c * x_i) + carry | ||
| 84 | |||
| 85 | loop: | ||
| 86 | mov rax, [x+8*i] | ||
| 87 | mul c | ||
| 88 | add rax, h | ||
| 89 | adc rdx, 0 | ||
| 90 | mov [z+8*i], rax | ||
| 91 | mov h, rdx | ||
| 92 | inc i | ||
| 93 | cmp i, n | ||
| 94 | jc loop | ||
| 95 | |||
| 96 | // Add a tail when the destination is longer: store h at z[i], then zero-fill up to p, leaving h = 0 to be returned | ||
| 97 | |||
| 98 | tail: | ||
| 99 | cmp i, p | ||
| 100 | jnc end | ||
| 101 | mov [z+8*i], h | ||
| 102 | xor h, h | ||
| 103 | inc i | ||
| 104 | cmp i, p | ||
| 105 | jnc end | ||
| 106 | |||
| 107 | tloop: | ||
| 108 | mov [z+8*i], h | ||
| 109 | inc i | ||
| 110 | cmp i, p | ||
| 111 | jc tloop | ||
| 112 | |||
| 113 | // Return the high/carry word in rax | ||
| 114 | |||
| 115 | end: | ||
| 116 | mov rax, h | ||
| 117 | |||
| 118 | #if WINDOWS_ABI | ||
| 119 | pop rsi | ||
| 120 | pop rdi | ||
| 121 | #endif | ||
| 122 | ret | ||
| 123 | |||
| 124 | #if defined(__linux__) && defined(__ELF__) | ||
| 125 | .section .note.GNU-stack,"",%progbits | ||
| 126 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S new file mode 100644 index 0000000000..20c3f70202 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S | |||
| @@ -0,0 +1,155 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Multiply z := x * y | ||
| 6 | // Inputs x[m], y[n]; output z[k] | ||
| 7 | // | ||
| 8 | // extern void bignum_mul | ||
| 9 | // (uint64_t k, uint64_t *z, | ||
| 10 | // uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); | ||
| 11 | // | ||
| 12 | // Does the "z := x * y" operation where x is m digits, y is n, result z is k. | ||
| 13 | // Truncates the result in general unless k >= m + n | ||
| 14 | // | ||
| 15 | // Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y | ||
| 16 | // Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y | ||
| 17 | // ---------------------------------------------------------------------------- | ||
| 18 | |||
| 19 | #include "_internal_s2n_bignum.h" | ||
| 20 | |||
| 21 | .intel_syntax noprefix | ||
| 22 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul) | ||
| 23 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul) | ||
| 24 | .text | ||
| 25 | |||
| 26 | // These names match the registers the arguments arrive in (standard ABI) | ||
| 27 | |||
| 28 | #define p rdi | ||
| 29 | #define z rsi | ||
| 30 | #define n r8 | ||
| 31 | |||
| 32 | // These are working registers, not the registers the arguments arrive in | ||
| 33 | |||
| 34 | #define c r15 | ||
| 35 | #define h r14 | ||
| 36 | #define l r13 | ||
| 37 | #define x r12 | ||
| 38 | #define y r11 | ||
| 39 | #define i rbx | ||
| 40 | #define k r10 | ||
| 41 | #define m rbp | ||
| 42 | |||
| 43 | // These are always local scratch since multiplier result is in these | ||
| 44 | |||
| 45 | #define a rax | ||
| 46 | #define d rdx | ||
| 47 | |||
| 48 | |||
| 49 | |||
| 50 | S2N_BN_SYMBOL(bignum_mul): | ||
| 51 | |||
| 52 | #if WINDOWS_ABI | ||
| 53 | push rdi | ||
| 54 | push rsi | ||
| 55 | mov rdi, rcx | ||
| 56 | mov rsi, rdx | ||
| 57 | mov rdx, r8 | ||
| 58 | mov rcx, r9 | ||
| 59 | mov r8, [rsp+56] | ||
| 60 | mov r9, [rsp+64] | ||
| 61 | #endif | ||
| 62 | |||
| 63 | // We use too many registers, and also we need rax:rdx for multiplications, so save rbx, rbp, r12-r15 and move m out of rdx | ||
| 64 | |||
| 65 | push rbx | ||
| 66 | push rbp | ||
| 67 | push r12 | ||
| 68 | push r13 | ||
| 69 | push r14 | ||
| 70 | push r15 | ||
| 71 | mov m, rdx | ||
| 72 | |||
| 73 | // If the result size is zero, do nothing | ||
| 74 | // Note that even if either or both inputs has size zero, we can't | ||
| 75 | // just give up because we at least need to zero the output array | ||
| 76 | // If we did a multiply-add variant, however, then we could | ||
| 77 | |||
| 78 | test p, p | ||
| 79 | jz end | ||
| 80 | |||
| 81 | // Set initial 2-part sum to zero (we zero c inside the body) | ||
| 82 | |||
| 83 | xor h,h | ||
| 84 | xor l,l | ||
| 85 | |||
| 86 | // Otherwise do outer loop k = 0 ... k = p - 1, one output word z[k] per iteration | ||
| 87 | |||
| 88 | xor k, k | ||
| 89 | |||
| 90 | outerloop: | ||
| 91 | |||
| 92 | // Zero our carry term first; we eventually want it and a zero is useful now | ||
| 93 | // Set a = max 0 (k + 1 - n), i = min (k + 1) m | ||
| 94 | // This defines the range a <= j < i for the inner summation | ||
| 95 | // Note that since k < p < 2^64 we can assume k + 1 doesn't overflow | ||
| 96 | // And since we want to increment it anyway, we might as well do it now | ||
| 97 | |||
| 98 | xor c, c // c = 0 | ||
| 99 | inc k // k = k + 1 | ||
| 100 | |||
| 101 | mov a, k // a = k + 1 | ||
| 102 | sub a, n // a = k + 1 - n | ||
| 103 | cmovc a, c // a = max 0 (k + 1 - n) | ||
| 104 | |||
| 105 | mov i, m // i = m | ||
| 106 | cmp k, m // CF <=> k + 1 < m | ||
| 107 | cmovc i, k // i = min (k + 1) m | ||
| 108 | |||
| 109 | // Turn i into a loop count, and skip things if it's <= 0 | ||
| 110 | // Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a] (y is biased so that [y+8*i] starts there and steps down as i counts down) | ||
| 111 | // and then launch into the main inner loop, postdecrementing i | ||
| 112 | |||
| 113 | mov d, k | ||
| 114 | sub d, i | ||
| 115 | sub i, a | ||
| 116 | jbe innerend | ||
| 117 | lea x,[rcx+8*a] | ||
| 118 | lea y,[r9+8*d-8] | ||
| 119 | |||
| 120 | innerloop: | ||
| 121 | mov rax, [y+8*i] | ||
| 122 | mul QWORD PTR [x] | ||
| 123 | add x, 8 | ||
| 124 | add l, rax | ||
| 125 | adc h, rdx | ||
| 126 | adc c, 0 | ||
| 127 | dec i | ||
| 128 | jnz innerloop | ||
| 129 | |||
| 130 | innerend: | ||
| 131 | |||
| 132 | mov [z], l | ||
| 133 | mov l, h | ||
| 134 | mov h, c | ||
| 135 | add z, 8 | ||
| 136 | |||
| 137 | cmp k, p | ||
| 138 | jc outerloop | ||
| 139 | |||
| 140 | end: | ||
| 141 | pop r15 | ||
| 142 | pop r14 | ||
| 143 | pop r13 | ||
| 144 | pop r12 | ||
| 145 | pop rbp | ||
| 146 | pop rbx | ||
| 147 | #if WINDOWS_ABI | ||
| 148 | pop rsi | ||
| 149 | pop rdi | ||
| 150 | #endif | ||
| 151 | ret | ||
| 152 | |||
| 153 | #if defined(__linux__) && defined(__ELF__) | ||
| 154 | .section .note.GNU-stack,"",%progbits | ||
| 155 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S new file mode 100644 index 0000000000..e70ed116da --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S | |||
| @@ -0,0 +1,145 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Multiply z := x * y | ||
| 6 | // Inputs x[4], y[4]; output z[8] | ||
| 7 | // | ||
| 8 | // extern void bignum_mul_4_8_alt | ||
| 9 | // (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); | ||
| 10 | // | ||
| 11 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
| 12 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
| 13 | // ---------------------------------------------------------------------------- | ||
| 14 | |||
| 15 | #include "_internal_s2n_bignum.h" | ||
| 16 | |||
| 17 | .intel_syntax noprefix | ||
| 18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt) | ||
| 19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt) | ||
| 20 | .text | ||
| 21 | |||
| 22 | // These are actually right | ||
| 23 | |||
| 24 | #define z rdi | ||
| 25 | #define x rsi | ||
| 26 | |||
| 27 | // This is moved from rdx to free it for muls | ||
| 28 | |||
| 29 | #define y rcx | ||
| 30 | |||
| 31 | // Other variables used as a rotating 3-word window to add terms to | ||
| 32 | |||
| 33 | #define t0 r8 | ||
| 34 | #define t1 r9 | ||
| 35 | #define t2 r10 | ||
| 36 | |||
| 37 | // Macro for the key "multiply and add to (c,h,l)" step | ||
| 38 | |||
| 39 | #define combadd(c,h,l,numa,numb) \ | ||
| 40 | mov rax, numa; \ | ||
| 41 | mul QWORD PTR numb; \ | ||
| 42 | add l, rax; \ | ||
| 43 | adc h, rdx; \ | ||
| 44 | adc c, 0 | ||
| 45 | |||
| 46 | // A minutely shorter form for when c = 0 initially | ||
| 47 | |||
| 48 | #define combadz(c,h,l,numa,numb) \ | ||
| 49 | mov rax, numa; \ | ||
| 50 | mul QWORD PTR numb; \ | ||
| 51 | add l, rax; \ | ||
| 52 | adc h, rdx; \ | ||
| 53 | adc c, c | ||
| 54 | |||
| 55 | // A short form where we don't expect a top carry | ||
| 56 | |||
| 57 | #define combads(h,l,numa,numb) \ | ||
| 58 | mov rax, numa; \ | ||
| 59 | mul QWORD PTR numb; \ | ||
| 60 | add l, rax; \ | ||
| 61 | adc h, rdx | ||
| 62 | |||
| 63 | S2N_BN_SYMBOL(bignum_mul_4_8_alt): | ||
| 64 | |||
| 65 | #if WINDOWS_ABI | ||
| 66 | push rdi | ||
| 67 | push rsi | ||
| 68 | mov rdi, rcx | ||
| 69 | mov rsi, rdx | ||
| 70 | mov rdx, r8 | ||
| 71 | #endif | ||
| 72 | |||
| 73 | // Copy y into a safe register to start with, since mul overwrites rdx | ||
| 74 | |||
| 75 | mov y, rdx | ||
| 76 | |||
| 77 | // Result term 0: x[0] * y[0]; low word goes to z[0], high word seeds the window | ||
| 78 | |||
| 79 | mov rax, [x] | ||
| 80 | mul QWORD PTR [y] | ||
| 81 | |||
| 82 | mov [z], rax | ||
| 83 | mov t0, rdx | ||
| 84 | xor t1, t1 | ||
| 85 | |||
| 86 | // Result term 1 | ||
| 87 | |||
| 88 | xor t2, t2 | ||
| 89 | combads(t1,t0,[x],[y+8]) | ||
| 90 | combadz(t2,t1,t0,[x+8],[y]) | ||
| 91 | mov [z+8], t0 | ||
| 92 | |||
| 93 | // Result term 2 | ||
| 94 | |||
| 95 | xor t0, t0 | ||
| 96 | combadz(t0,t2,t1,[x],[y+16]) | ||
| 97 | combadd(t0,t2,t1,[x+8],[y+8]) | ||
| 98 | combadd(t0,t2,t1,[x+16],[y]) | ||
| 99 | mov [z+16], t1 | ||
| 100 | |||
| 101 | // Result term 3 | ||
| 102 | |||
| 103 | xor t1, t1 | ||
| 104 | combadz(t1,t0,t2,[x],[y+24]) | ||
| 105 | combadd(t1,t0,t2,[x+8],[y+16]) | ||
| 106 | combadd(t1,t0,t2,[x+16],[y+8]) | ||
| 107 | combadd(t1,t0,t2,[x+24],[y]) | ||
| 108 | mov [z+24], t2 | ||
| 109 | |||
| 110 | // Result term 4 | ||
| 111 | |||
| 112 | xor t2, t2 | ||
| 113 | combadz(t2,t1,t0,[x+8],[y+24]) | ||
| 114 | combadd(t2,t1,t0,[x+16],[y+16]) | ||
| 115 | combadd(t2,t1,t0,[x+24],[y+8]) | ||
| 116 | mov [z+32], t0 | ||
| 117 | |||
| 118 | // Result term 5 | ||
| 119 | |||
| 120 | xor t0, t0 | ||
| 121 | combadz(t0,t2,t1,[x+16],[y+24]) | ||
| 122 | combadd(t0,t2,t1,[x+24],[y+16]) | ||
| 123 | mov [z+40], t1 | ||
| 124 | |||
| 125 | // Result term 6 (t1 is not read again after the store above, so the xor of t1 here does not affect the result) | ||
| 126 | |||
| 127 | xor t1, t1 | ||
| 128 | combads(t0,t2,[x+24],[y+24]) | ||
| 129 | mov [z+48], t2 | ||
| 130 | |||
| 131 | // Result term 7: the remaining high word in t0 | ||
| 132 | |||
| 133 | mov [z+56], t0 | ||
| 134 | |||
| 135 | // Return | ||
| 136 | |||
| 137 | #if WINDOWS_ABI | ||
| 138 | pop rsi | ||
| 139 | pop rdi | ||
| 140 | #endif | ||
| 141 | ret | ||
| 142 | |||
| 143 | #if defined(__linux__) && defined(__ELF__) | ||
| 144 | .section .note.GNU-stack,"",%progbits | ||
| 145 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S new file mode 100644 index 0000000000..43c6c48667 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S | |||
| @@ -0,0 +1,232 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Multiply z := x * y | ||
| 6 | // Inputs x[8], y[8]; output z[16] | ||
| 7 | // | ||
| 8 | // extern void bignum_mul_8_16_alt | ||
| 9 | // (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); | ||
| 10 | // | ||
| 11 | // Standard x86-64 ABI: RDI = z, RSI = x, RDX = y | ||
| 12 | // Microsoft x64 ABI: RCX = z, RDX = x, R8 = y | ||
| 13 | // ---------------------------------------------------------------------------- | ||
| 14 | |||
| 15 | #include "_internal_s2n_bignum.h" | ||
| 16 | |||
| 17 | .intel_syntax noprefix | ||
| 18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt) | ||
| 19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt) | ||
| 20 | .text | ||
| 21 | |||
| 22 | // These are actually right | ||
| 23 | |||
| 24 | #define z rdi | ||
| 25 | #define x rsi | ||
| 26 | |||
| 27 | // This is moved from rdx to free it for muls | ||
| 28 | |||
| 29 | #define y rcx | ||
| 30 | |||
| 31 | // Other variables used as a rotating 3-word window to add terms to | ||
| 32 | |||
| 33 | #define t0 r8 | ||
| 34 | #define t1 r9 | ||
| 35 | #define t2 r10 | ||
| 36 | |||
| 37 | // Macro for the key "multiply and add to (c,h,l)" step | ||
| 38 | |||
| 39 | #define combadd(c,h,l,numa,numb) \ | ||
| 40 | mov rax, numa; \ | ||
| 41 | mul QWORD PTR numb; \ | ||
| 42 | add l, rax; \ | ||
| 43 | adc h, rdx; \ | ||
| 44 | adc c, 0 | ||
| 45 | |||
| 46 | // A minutely shorter form for when c = 0 initially | ||
| 47 | |||
| 48 | #define combadz(c,h,l,numa,numb) \ | ||
| 49 | mov rax, numa; \ | ||
| 50 | mul QWORD PTR numb; \ | ||
| 51 | add l, rax; \ | ||
| 52 | adc h, rdx; \ | ||
| 53 | adc c, c | ||
| 54 | |||
| 55 | // A short form where we don't expect a top carry | ||
| 56 | |||
| 57 | #define combads(h,l,numa,numb) \ | ||
| 58 | mov rax, numa; \ | ||
| 59 | mul QWORD PTR numb; \ | ||
| 60 | add l, rax; \ | ||
| 61 | adc h, rdx | ||
| 62 | |||
| 63 | S2N_BN_SYMBOL(bignum_mul_8_16_alt): | ||
| 64 | |||
| 65 | #if WINDOWS_ABI | ||
| 66 | push rdi | ||
| 67 | push rsi | ||
| 68 | mov rdi, rcx | ||
| 69 | mov rsi, rdx | ||
| 70 | mov rdx, r8 | ||
| 71 | #endif | ||
| 72 | |||
| 73 | // Copy y into a safe register to start with, since mul overwrites rdx | ||
| 74 | |||
| 75 | mov y, rdx | ||
| 76 | |||
| 77 | // Result term 0: x[0] * y[0]; low word goes to z[0], high word seeds the window | ||
| 78 | |||
| 79 | mov rax, [x] | ||
| 80 | mul QWORD PTR [y] | ||
| 81 | |||
| 82 | mov [z], rax | ||
| 83 | mov t0, rdx | ||
| 84 | xor t1, t1 | ||
| 85 | |||
| 86 | // Result term 1 | ||
| 87 | |||
| 88 | xor t2, t2 | ||
| 89 | combads(t1,t0,[x],[y+8]) | ||
| 90 | combadz(t2,t1,t0,[x+8],[y]) | ||
| 91 | mov [z+8], t0 | ||
| 92 | |||
| 93 | // Result term 2 | ||
| 94 | |||
| 95 | xor t0, t0 | ||
| 96 | combadz(t0,t2,t1,[x],[y+16]) | ||
| 97 | combadd(t0,t2,t1,[x+8],[y+8]) | ||
| 98 | combadd(t0,t2,t1,[x+16],[y]) | ||
| 99 | mov [z+16], t1 | ||
| 100 | |||
| 101 | // Result term 3 | ||
| 102 | |||
| 103 | xor t1, t1 | ||
| 104 | combadz(t1,t0,t2,[x],[y+24]) | ||
| 105 | combadd(t1,t0,t2,[x+8],[y+16]) | ||
| 106 | combadd(t1,t0,t2,[x+16],[y+8]) | ||
| 107 | combadd(t1,t0,t2,[x+24],[y]) | ||
| 108 | mov [z+24], t2 | ||
| 109 | |||
| 110 | // Result term 4 | ||
| 111 | |||
| 112 | xor t2, t2 | ||
| 113 | combadz(t2,t1,t0,[x],[y+32]) | ||
| 114 | combadd(t2,t1,t0,[x+8],[y+24]) | ||
| 115 | combadd(t2,t1,t0,[x+16],[y+16]) | ||
| 116 | combadd(t2,t1,t0,[x+24],[y+8]) | ||
| 117 | combadd(t2,t1,t0,[x+32],[y]) | ||
| 118 | mov [z+32], t0 | ||
| 119 | |||
| 120 | // Result term 5 | ||
| 121 | |||
| 122 | xor t0, t0 | ||
| 123 | combadz(t0,t2,t1,[x],[y+40]) | ||
| 124 | combadd(t0,t2,t1,[x+8],[y+32]) | ||
| 125 | combadd(t0,t2,t1,[x+16],[y+24]) | ||
| 126 | combadd(t0,t2,t1,[x+24],[y+16]) | ||
| 127 | combadd(t0,t2,t1,[x+32],[y+8]) | ||
| 128 | combadd(t0,t2,t1,[x+40],[y]) | ||
| 129 | mov [z+40], t1 | ||
| 130 | |||
| 131 | // Result term 6 | ||
| 132 | |||
| 133 | xor t1, t1 | ||
| 134 | combadz(t1,t0,t2,[x],[y+48]) | ||
| 135 | combadd(t1,t0,t2,[x+8],[y+40]) | ||
| 136 | combadd(t1,t0,t2,[x+16],[y+32]) | ||
| 137 | combadd(t1,t0,t2,[x+24],[y+24]) | ||
| 138 | combadd(t1,t0,t2,[x+32],[y+16]) | ||
| 139 | combadd(t1,t0,t2,[x+40],[y+8]) | ||
| 140 | combadd(t1,t0,t2,[x+48],[y]) | ||
| 141 | mov [z+48], t2 | ||
| 142 | |||
| 143 | // Result term 7 | ||
| 144 | |||
| 145 | xor t2, t2 | ||
| 146 | combadz(t2,t1,t0,[x],[y+56]) | ||
| 147 | combadd(t2,t1,t0,[x+8],[y+48]) | ||
| 148 | combadd(t2,t1,t0,[x+16],[y+40]) | ||
| 149 | combadd(t2,t1,t0,[x+24],[y+32]) | ||
| 150 | combadd(t2,t1,t0,[x+32],[y+24]) | ||
| 151 | combadd(t2,t1,t0,[x+40],[y+16]) | ||
| 152 | combadd(t2,t1,t0,[x+48],[y+8]) | ||
| 153 | combadd(t2,t1,t0,[x+56],[y]) | ||
| 154 | mov [z+56], t0 | ||
| 155 | |||
| 156 | // Result term 8 | ||
| 157 | |||
| 158 | xor t0, t0 | ||
| 159 | combadz(t0,t2,t1,[x+8],[y+56]) | ||
| 160 | combadd(t0,t2,t1,[x+16],[y+48]) | ||
| 161 | combadd(t0,t2,t1,[x+24],[y+40]) | ||
| 162 | combadd(t0,t2,t1,[x+32],[y+32]) | ||
| 163 | combadd(t0,t2,t1,[x+40],[y+24]) | ||
| 164 | combadd(t0,t2,t1,[x+48],[y+16]) | ||
| 165 | combadd(t0,t2,t1,[x+56],[y+8]) | ||
| 166 | mov [z+64], t1 | ||
| 167 | |||
| 168 | // Result term 9 | ||
| 169 | |||
| 170 | xor t1, t1 | ||
| 171 | combadz(t1,t0,t2,[x+16],[y+56]) | ||
| 172 | combadd(t1,t0,t2,[x+24],[y+48]) | ||
| 173 | combadd(t1,t0,t2,[x+32],[y+40]) | ||
| 174 | combadd(t1,t0,t2,[x+40],[y+32]) | ||
| 175 | combadd(t1,t0,t2,[x+48],[y+24]) | ||
| 176 | combadd(t1,t0,t2,[x+56],[y+16]) | ||
| 177 | mov [z+72], t2 | ||
| 178 | |||
| 179 | // Result term 10 | ||
| 180 | |||
| 181 | xor t2, t2 | ||
| 182 | combadz(t2,t1,t0,[x+24],[y+56]) | ||
| 183 | combadd(t2,t1,t0,[x+32],[y+48]) | ||
| 184 | combadd(t2,t1,t0,[x+40],[y+40]) | ||
| 185 | combadd(t2,t1,t0,[x+48],[y+32]) | ||
| 186 | combadd(t2,t1,t0,[x+56],[y+24]) | ||
| 187 | mov [z+80], t0 | ||
| 188 | |||
| 189 | // Result term 11 | ||
| 190 | |||
| 191 | xor t0, t0 | ||
| 192 | combadz(t0,t2,t1,[x+32],[y+56]) | ||
| 193 | combadd(t0,t2,t1,[x+40],[y+48]) | ||
| 194 | combadd(t0,t2,t1,[x+48],[y+40]) | ||
| 195 | combadd(t0,t2,t1,[x+56],[y+32]) | ||
| 196 | mov [z+88], t1 | ||
| 197 | |||
| 198 | // Result term 12 | ||
| 199 | |||
| 200 | xor t1, t1 | ||
| 201 | combadz(t1,t0,t2,[x+40],[y+56]) | ||
| 202 | combadd(t1,t0,t2,[x+48],[y+48]) | ||
| 203 | combadd(t1,t0,t2,[x+56],[y+40]) | ||
| 204 | mov [z+96], t2 | ||
| 205 | |||
| 206 | // Result term 13 | ||
| 207 | |||
| 208 | xor t2, t2 | ||
| 209 | combadz(t2,t1,t0,[x+48],[y+56]) | ||
| 210 | combadd(t2,t1,t0,[x+56],[y+48]) | ||
| 211 | mov [z+104], t0 | ||
| 212 | |||
| 213 | // Result term 14 (single product x[7] * y[7]; no third window word is needed) | ||
| 214 | |||
| 215 | combads(t2,t1,[x+56],[y+56]) | ||
| 216 | mov [z+112], t1 | ||
| 217 | |||
| 218 | // Result term 15: the remaining high word in t2 | ||
| 219 | |||
| 220 | mov [z+120], t2 | ||
| 221 | |||
| 222 | // Return | ||
| 223 | |||
| 224 | #if WINDOWS_ABI | ||
| 225 | pop rsi | ||
| 226 | pop rdi | ||
| 227 | #endif | ||
| 228 | ret | ||
| 229 | |||
| 230 | #if defined(__linux__) && defined(__ELF__) | ||
| 231 | .section .note.GNU-stack,"",%progbits | ||
| 232 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S new file mode 100644 index 0000000000..db483a5723 --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S | |||
| @@ -0,0 +1,133 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Square, z := x^2 | ||
| 6 | // Input x[4]; output z[8] | ||
| 7 | // | ||
| 8 | // extern void bignum_sqr_4_8_alt | ||
| 9 | // (uint64_t z[static 8], uint64_t x[static 4]); | ||
| 10 | // | ||
| 11 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
| 12 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
| 13 | // ---------------------------------------------------------------------------- | ||
| 14 | |||
| 15 | #include "_internal_s2n_bignum.h" | ||
| 16 | |||
| 17 | .intel_syntax noprefix | ||
| 18 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt) | ||
| 19 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt) | ||
| 20 | .text | ||
| 21 | |||
| 22 | // Input arguments | ||
| 23 | |||
| 24 | #define z rdi | ||
| 25 | #define x rsi | ||
| 26 | |||
| 27 | // Other variables used as a rotating 3-word window to add terms to | ||
| 28 | |||
| 29 | #define t0 rcx | ||
| 30 | #define t1 r8 | ||
| 31 | #define t2 r9 | ||
| 32 | |||
| 33 | // Square-term step on the 3-word window: (c,h,l) += numa^2; overwrites rax and rdx with the product and changes flags | ||
| 34 | |||
| 35 | #define combadd1(c,h,l,numa) \ | ||
| 36 | mov rax, numa; \ | ||
| 37 | mul rax; \ | ||
| 38 | add l, rax; \ | ||
| 39 | adc h, rdx; \ | ||
| 40 | adc c, 0 | ||
| 41 | |||
| 42 | // Short form of the square-term step, (h,l) += numa^2, for places where no carry out of h is expected (none is recorded) | ||
| 43 | |||
| 44 | #define combads(h,l,numa) \ | ||
| 45 | mov rax, numa; \ | ||
| 46 | mul rax; \ | ||
| 47 | add l, rax; \ | ||
| 48 | adc h, rdx | ||
| 49 | |||
| 50 | // Doubled cross-term step for non-square terms: (c,h,l) += 2 * numa * numb; the bit shifted out when doubling rdx:rax is added into c | ||
| 51 | |||
| 52 | #define combadd2(c,h,l,numa,numb) \ | ||
| 53 | mov rax, numa; \ | ||
| 54 | mul QWORD PTR numb; \ | ||
| 55 | add rax, rax; \ | ||
| 56 | adc rdx, rdx; \ | ||
| 57 | adc c, 0; \ | ||
| 58 | add l, rax; \ | ||
| 59 | adc h, rdx; \ | ||
| 60 | adc c, 0 | ||
| 61 | |||
| 62 | S2N_BN_SYMBOL(bignum_sqr_4_8_alt): | ||
| 63 | |||
| 64 | #if WINDOWS_ABI | ||
| 65 | push rdi | ||
| 66 | push rsi | ||
| 67 | mov rdi, rcx | ||
| 68 | mov rsi, rdx | ||
| 69 | #endif | ||
| 70 | |||
| 71 | // Result term 0: z[0] = low word of x[0]^2; the high word starts the window in t0, with t1 cleared | ||
| 72 | |||
| 73 | mov rax, [x] | ||
| 74 | mul rax | ||
| 75 | |||
| 76 | mov [z], rax | ||
| 77 | mov t0, rdx | ||
| 78 | xor t1, t1 | ||
| 79 | |||
| 80 | // Result term 1: 2*x[0]*x[1]; each term clears a new top word, stores the lowest window word, and the t0/t1/t2 roles rotate | ||
| 81 | |||
| 82 | xor t2, t2 | ||
| 83 | combadd2(t2,t1,t0,[x],[x+8]) | ||
| 84 | mov [z+8], t0 | ||
| 85 | |||
| 86 | // Result term 2: x[1]^2 + 2*x[0]*x[2] | ||
| 87 | |||
| 88 | xor t0, t0 | ||
| 89 | combadd1(t0,t2,t1,[x+8]) | ||
| 90 | combadd2(t0,t2,t1,[x],[x+16]) | ||
| 91 | mov [z+16], t1 | ||
| 92 | |||
| 93 | // Result term 3: 2*x[0]*x[3] + 2*x[1]*x[2] | ||
| 94 | |||
| 95 | xor t1, t1 | ||
| 96 | combadd2(t1,t0,t2,[x],[x+24]) | ||
| 97 | combadd2(t1,t0,t2,[x+8],[x+16]) | ||
| 98 | mov [z+24], t2 | ||
| 99 | |||
| 100 | // Result term 4: 2*x[1]*x[3] + x[2]^2 | ||
| 101 | |||
| 102 | xor t2, t2 | ||
| 103 | combadd2(t2,t1,t0,[x+8],[x+24]) | ||
| 104 | combadd1(t2,t1,t0,[x+16]) | ||
| 105 | mov [z+32], t0 | ||
| 106 | |||
| 107 | // Result term 5: 2*x[2]*x[3] | ||
| 108 | |||
| 109 | xor t0, t0 | ||
| 110 | combadd2(t0,t2,t1,[x+16],[x+24]) | ||
| 111 | mov [z+40], t1 | ||
| 112 | |||
| 113 | // Result term 6: x[3]^2 added to (t0,t2) with the short form; NOTE(review): t1 cleared below is not read again in this function | ||
| 114 | |||
| 115 | xor t1, t1 | ||
| 116 | combads(t0,t2,[x+24]) | ||
| 117 | mov [z+48], t2 | ||
| 118 | |||
| 119 | // Result term 7: the remaining top word of the window | ||
| 120 | |||
| 121 | mov [z+56], t0 | ||
| 122 | |||
| 123 | // Return, first restoring the rdi and rsi values pushed on entry when built for the Windows ABI | ||
| 124 | |||
| 125 | #if WINDOWS_ABI | ||
| 126 | pop rsi | ||
| 127 | pop rdi | ||
| 128 | #endif | ||
| 129 | ret | ||
| 130 | |||
| 131 | #if defined(__linux__) && defined(__ELF__) | ||
| 132 | .section .note.GNU-stack,"",%progbits | ||
| 133 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S new file mode 100644 index 0000000000..dcf3c92e5b --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S | |||
| @@ -0,0 +1,230 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Square, z := x^2 | ||
| 6 | // Input x[8]; output z[16] | ||
| 7 | // | ||
| 8 | // extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]); | ||
| 9 | // | ||
| 10 | // Standard x86-64 ABI: RDI = z, RSI = x | ||
| 11 | // Microsoft x64 ABI: RCX = z, RDX = x | ||
| 12 | // ---------------------------------------------------------------------------- | ||
| 13 | |||
| 14 | #include "_internal_s2n_bignum.h" | ||
| 15 | |||
| 16 | .intel_syntax noprefix | ||
| 17 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt) | ||
| 18 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt) | ||
| 19 | .text | ||
| 20 | |||
| 21 | // Input arguments | ||
| 22 | |||
| 23 | #define z rdi | ||
| 24 | #define x rsi | ||
| 25 | |||
| 26 | // Other variables used as a rotating 3-word window to add terms to | ||
| 27 | |||
| 28 | #define t0 r8 | ||
| 29 | #define t1 r9 | ||
| 30 | #define t2 r10 | ||
| 31 | |||
| 32 | // Additional temporaries for local windows to share doublings | ||
| 33 | |||
| 34 | #define u0 rcx | ||
| 35 | #define u1 r11 | ||
| 36 | |||
| 37 | // Product step on a 3-word window: (c,h,l) += numa * numb; overwrites rax and rdx with the product and changes flags | ||
| 38 | |||
| 39 | #define combadd(c,h,l,numa,numb) \ | ||
| 40 | mov rax, numa; \ | ||
| 41 | mul QWORD PTR numb; \ | ||
| 42 | add l, rax; \ | ||
| 43 | adc h, rdx; \ | ||
| 44 | adc c, 0 | ||
| 45 | |||
| 46 | // Start a fresh window (c,h,l) = numa * numb with top word c = 0; c is cleared after the mul, so it must not be rax or rdx | ||
| 47 | |||
| 48 | #define combaddz(c,h,l,numa,numb) \ | ||
| 49 | mov rax, numa; \ | ||
| 50 | mul QWORD PTR numb; \ | ||
| 51 | xor c, c; \ | ||
| 52 | mov l, rax; \ | ||
| 53 | mov h, rdx | ||
| 54 | |||
| 55 | // Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l); c is the top word of both windows, and hh, ll are left holding their doubled values | ||
| 56 | |||
| 57 | #define doubladd(c,h,l,hh,ll) \ | ||
| 58 | add ll, ll; \ | ||
| 59 | adc hh, hh; \ | ||
| 60 | adc c, c; \ | ||
| 61 | add l, ll; \ | ||
| 62 | adc h, hh; \ | ||
| 63 | adc c, 0 | ||
| 64 | |||
| 65 | // Square term incorporation (c,h,l) += numa^2; overwrites rax and rdx with the product and changes flags | ||
| 66 | |||
| 67 | #define combadd1(c,h,l,numa) \ | ||
| 68 | mov rax, numa; \ | ||
| 69 | mul rax; \ | ||
| 70 | add l, rax; \ | ||
| 71 | adc h, rdx; \ | ||
| 72 | adc c, 0 | ||
| 73 | |||
| 74 | // Short form of the square-term step, (h,l) += numa^2, for places where no carry out of h is expected (none is recorded) | ||
| 75 | |||
| 76 | #define combads(h,l,numa) \ | ||
| 77 | mov rax, numa; \ | ||
| 78 | mul rax; \ | ||
| 79 | add l, rax; \ | ||
| 80 | adc h, rdx | ||
| 81 | |||
| 82 | // Doubles one product directly before adding, for a lone non-square term: (c,h,l) += 2 * numa * numb, with the doubling carry added into c | ||
| 83 | |||
| 84 | #define combadd2(c,h,l,numa,numb) \ | ||
| 85 | mov rax, numa; \ | ||
| 86 | mul QWORD PTR numb; \ | ||
| 87 | add rax, rax; \ | ||
| 88 | adc rdx, rdx; \ | ||
| 89 | adc c, 0; \ | ||
| 90 | add l, rax; \ | ||
| 91 | adc h, rdx; \ | ||
| 92 | adc c, 0 | ||
| 93 | |||
| 94 | S2N_BN_SYMBOL(bignum_sqr_8_16_alt): | ||
| 95 | |||
| 96 | #if WINDOWS_ABI | ||
| 97 | push rdi | ||
| 98 | push rsi | ||
| 99 | mov rdi, rcx | ||
| 100 | mov rsi, rdx | ||
| 101 | #endif | ||
| 102 | |||
| 103 | // Result term 0: z[0] = low word of x[0]^2; the high word starts the window in t0, with t1 cleared | ||
| 104 | |||
| 105 | mov rax, [x] | ||
| 106 | mul rax | ||
| 107 | |||
| 108 | mov [z], rax | ||
| 109 | mov t0, rdx | ||
| 110 | xor t1, t1 | ||
| 111 | |||
| 112 | // Result term 1: 2*x[0]*x[1]; each term stores the lowest window word and the t0/t1/t2 roles rotate | ||
| 113 | |||
| 114 | xor t2, t2 | ||
| 115 | combadd2(t2,t1,t0,[x],[x+8]) | ||
| 116 | mov [z+8], t0 | ||
| 117 | |||
| 118 | // Result term 2: x[1]^2 + 2*x[0]*x[2] | ||
| 119 | |||
| 120 | xor t0, t0 | ||
| 121 | combadd1(t0,t2,t1,[x+8]) | ||
| 122 | combadd2(t0,t2,t1,[x],[x+16]) | ||
| 123 | mov [z+16], t1 | ||
| 124 | |||
| 125 | // Result term 3: 2*(x[0]*x[3] + x[1]*x[2]); cross products are summed in the side window (t1,u1,u0) and doubled once by doubladd | ||
| 126 | |||
| 127 | combaddz(t1,u1,u0,[x],[x+24]) | ||
| 128 | combadd(t1,u1,u0,[x+8],[x+16]) | ||
| 129 | doubladd(t1,t0,t2,u1,u0) | ||
| 130 | mov [z+24], t2 | ||
| 131 | |||
| 132 | // Result term 4: 2*(x[0]*x[4] + x[1]*x[3]) + x[2]^2 | ||
| 133 | |||
| 134 | combaddz(t2,u1,u0,[x],[x+32]) | ||
| 135 | combadd(t2,u1,u0,[x+8],[x+24]) | ||
| 136 | doubladd(t2,t1,t0,u1,u0) | ||
| 137 | combadd1(t2,t1,t0,[x+16]) | ||
| 138 | mov [z+32], t0 | ||
| 139 | |||
| 140 | // Result term 5: 2*(x[0]*x[5] + x[1]*x[4] + x[2]*x[3]) | ||
| 141 | |||
| 142 | combaddz(t0,u1,u0,[x],[x+40]) | ||
| 143 | combadd(t0,u1,u0,[x+8],[x+32]) | ||
| 144 | combadd(t0,u1,u0,[x+16],[x+24]) | ||
| 145 | doubladd(t0,t2,t1,u1,u0) | ||
| 146 | mov [z+40], t1 | ||
| 147 | |||
| 148 | // Result term 6: 2*(x[0]*x[6] + x[1]*x[5] + x[2]*x[4]) + x[3]^2 | ||
| 149 | |||
| 150 | combaddz(t1,u1,u0,[x],[x+48]) | ||
| 151 | combadd(t1,u1,u0,[x+8],[x+40]) | ||
| 152 | combadd(t1,u1,u0,[x+16],[x+32]) | ||
| 153 | doubladd(t1,t0,t2,u1,u0) | ||
| 154 | combadd1(t1,t0,t2,[x+24]) | ||
| 155 | mov [z+48], t2 | ||
| 156 | |||
| 157 | // Result term 7: 2*(x[0]*x[7] + x[1]*x[6] + x[2]*x[5] + x[3]*x[4]) | ||
| 158 | |||
| 159 | combaddz(t2,u1,u0,[x],[x+56]) | ||
| 160 | combadd(t2,u1,u0,[x+8],[x+48]) | ||
| 161 | combadd(t2,u1,u0,[x+16],[x+40]) | ||
| 162 | combadd(t2,u1,u0,[x+24],[x+32]) | ||
| 163 | doubladd(t2,t1,t0,u1,u0) | ||
| 164 | mov [z+56], t0 | ||
| 165 | |||
| 166 | // Result term 8: 2*(x[1]*x[7] + x[2]*x[6] + x[3]*x[5]) + x[4]^2 | ||
| 167 | |||
| 168 | combaddz(t0,u1,u0,[x+8],[x+56]) | ||
| 169 | combadd(t0,u1,u0,[x+16],[x+48]) | ||
| 170 | combadd(t0,u1,u0,[x+24],[x+40]) | ||
| 171 | doubladd(t0,t2,t1,u1,u0) | ||
| 172 | combadd1(t0,t2,t1,[x+32]) | ||
| 173 | mov [z+64], t1 | ||
| 174 | |||
| 175 | // Result term 9: 2*(x[2]*x[7] + x[3]*x[6] + x[4]*x[5]) | ||
| 176 | |||
| 177 | combaddz(t1,u1,u0,[x+16],[x+56]) | ||
| 178 | combadd(t1,u1,u0,[x+24],[x+48]) | ||
| 179 | combadd(t1,u1,u0,[x+32],[x+40]) | ||
| 180 | doubladd(t1,t0,t2,u1,u0) | ||
| 181 | mov [z+72], t2 | ||
| 182 | |||
| 183 | // Result term 10: 2*(x[3]*x[7] + x[4]*x[6]) + x[5]^2 | ||
| 184 | |||
| 185 | combaddz(t2,u1,u0,[x+24],[x+56]) | ||
| 186 | combadd(t2,u1,u0,[x+32],[x+48]) | ||
| 187 | doubladd(t2,t1,t0,u1,u0) | ||
| 188 | combadd1(t2,t1,t0,[x+40]) | ||
| 189 | mov [z+80], t0 | ||
| 190 | |||
| 191 | // Result term 11: 2*(x[4]*x[7] + x[5]*x[6]) | ||
| 192 | |||
| 193 | combaddz(t0,u1,u0,[x+32],[x+56]) | ||
| 194 | combadd(t0,u1,u0,[x+40],[x+48]) | ||
| 195 | doubladd(t0,t2,t1,u1,u0) | ||
| 196 | mov [z+88], t1 | ||
| 197 | |||
| 198 | // Result term 12: 2*x[5]*x[7] + x[6]^2; only one cross product, so it is doubled in place with combadd2 | ||
| 199 | |||
| 200 | xor t1, t1 | ||
| 201 | combadd2(t1,t0,t2,[x+40],[x+56]) | ||
| 202 | combadd1(t1,t0,t2,[x+48]) | ||
| 203 | mov [z+96], t2 | ||
| 204 | |||
| 205 | // Result term 13: 2*x[6]*x[7] | ||
| 206 | |||
| 207 | xor t2, t2 | ||
| 208 | combadd2(t2,t1,t0,[x+48],[x+56]) | ||
| 209 | mov [z+104], t0 | ||
| 210 | |||
| 211 | // Result term 14: x[7]^2 added to (t2,t1) with the short form that records no carry out | ||
| 212 | |||
| 213 | combads(t2,t1,[x+56]) | ||
| 214 | mov [z+112], t1 | ||
| 215 | |||
| 216 | // Result term 15: the remaining top word of the window | ||
| 217 | |||
| 218 | mov [z+120], t2 | ||
| 219 | |||
| 220 | // Return, first restoring the rdi and rsi values pushed on entry when built for the Windows ABI | ||
| 221 | |||
| 222 | #if WINDOWS_ABI | ||
| 223 | pop rsi | ||
| 224 | pop rdi | ||
| 225 | #endif | ||
| 226 | ret | ||
| 227 | |||
| 228 | #if defined(__linux__) && defined(__ELF__) | ||
| 229 | .section .note.GNU-stack,"",%progbits | ||
| 230 | #endif | ||
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S new file mode 100644 index 0000000000..42eb6a82ef --- /dev/null +++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S | |||
| @@ -0,0 +1,141 @@ | |||
| 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
| 2 | // SPDX-License-Identifier: Apache-2.0 OR ISC | ||
| 3 | |||
| 4 | // ---------------------------------------------------------------------------- | ||
| 5 | // Subtract, z := x - y | ||
| 6 | // Inputs x[m], y[n]; outputs function return (carry-out) and z[p] | ||
| 7 | // | ||
| 8 | // extern uint64_t bignum_sub | ||
| 9 | // (uint64_t p, uint64_t *z, | ||
| 10 | // uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); | ||
| 11 | // | ||
| 12 | // Does the z := x - y operation, truncating modulo p words in general and | ||
| 13 | // returning a top borrow (0 or 1) in the p'th place, only subtracting input | ||
| 14 | // words below p (as well as m and n respectively) to get the diff and borrow. | ||
| 15 | // | ||
| 16 | // Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX | ||
| 17 | // Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX | ||
| 18 | // ---------------------------------------------------------------------------- | ||
| 19 | |||
| 20 | #include "_internal_s2n_bignum.h" | ||
| 21 | |||
| 22 | .intel_syntax noprefix | ||
| 23 | S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub) | ||
| 24 | S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub) | ||
| 25 | .text | ||
| 26 | |||
| 27 | #define p rdi | ||
| 28 | #define z rsi | ||
| 29 | #define m rdx | ||
| 30 | #define x rcx | ||
| 31 | #define n r8 | ||
| 32 | #define y r9 | ||
| 33 | #define i r10 | ||
| 34 | #define a rax | ||
| 35 | |||
| 36 | #define ashort eax | ||
| 37 | |||
| 38 | |||
| 39 | |||
| 40 | S2N_BN_SYMBOL(bignum_sub): | ||
| 41 | |||
| 42 | #if WINDOWS_ABI | ||
| 43 | push rdi | ||
| 44 | push rsi | ||
| 45 | mov rdi, rcx | ||
| 46 | mov rsi, rdx | ||
| 47 | mov rdx, r8 | ||
| 48 | mov rcx, r9 | ||
| 49 | mov r8, [rsp+56] | ||
| 50 | mov r9, [rsp+64] | ||
| 51 | #endif | ||
| 52 | |||
| 53 | // Zero the word index i, which both the x-longer and y-longer paths and the shared tail loop use | ||
| 54 | |||
| 55 | xor i, i | ||
| 56 | |||
| 57 | // Clamp the two input sizes, m := min(p,m) and n := min(p,n), since words | ||
| 58 | // at or above index p are never needed; afterwards m <= p and n <= p hold. | ||
| 59 | // Then compare the clamped m and n: a borrow (m < n) selects the ylonger path | ||
| 60 | |||
| 61 | cmp p, m | ||
| 62 | cmovc m, p | ||
| 63 | cmp p, n | ||
| 64 | cmovc n, p | ||
| 65 | cmp m, n | ||
| 66 | jc ylonger | ||
| 67 | |||
| 68 | // x is longer or the same size (p >= m >= n): p becomes the tail count p - m and m becomes m - n + 1 because xtest decrements before testing; test clears CF before the first sbb and inc/dec leave CF alone, so the borrow chains through the loops; the tail is filled with the borrow mask and the borrow is returned as 0 or 1 | ||
| 69 | |||
| 70 | sub p, m | ||
| 71 | sub m, n | ||
| 72 | inc m | ||
| 73 | test n, n | ||
| 74 | jz xtest | ||
| 75 | xmainloop: | ||
| 76 | mov a, [x+8*i] | ||
| 77 | sbb a, [y+8*i] | ||
| 78 | mov [z+8*i],a | ||
| 79 | inc i | ||
| 80 | dec n | ||
| 81 | jnz xmainloop | ||
| 82 | jmp xtest | ||
| 83 | xtoploop: | ||
| 84 | mov a, [x+8*i] | ||
| 85 | sbb a, 0 | ||
| 86 | mov [z+8*i],a | ||
| 87 | inc i | ||
| 88 | xtest: | ||
| 89 | dec m | ||
| 90 | jnz xtoploop | ||
| 91 | sbb a, a | ||
| 92 | test p, p | ||
| 93 | jz tailskip | ||
| 94 | tailloop: | ||
| 95 | mov [z+8*i],a | ||
| 96 | inc i | ||
| 97 | dec p | ||
| 98 | jnz tailloop | ||
| 99 | tailskip: | ||
| 100 | neg a | ||
| 101 | #if WINDOWS_ABI | ||
| 102 | pop rsi | ||
| 103 | pop rdi | ||
| 104 | #endif | ||
| 105 | ret | ||
| 106 | |||
| 107 | // y is longer (p >= n > m): subtract the m common words, then store 0 - y[i] - borrow for the n - m remaining y words (the accumulator is zeroed with mov, which leaves CF intact), then join the shared tail loop if p - n tail words remain | ||
| 108 | |||
| 109 | ylonger: | ||
| 110 | |||
| 111 | sub p, n | ||
| 112 | sub n, m | ||
| 113 | test m, m | ||
| 114 | jz ytoploop | ||
| 115 | ymainloop: | ||
| 116 | mov a, [x+8*i] | ||
| 117 | sbb a, [y+8*i] | ||
| 118 | mov [z+8*i],a | ||
| 119 | inc i | ||
| 120 | dec m | ||
| 121 | jnz ymainloop | ||
| 122 | ytoploop: | ||
| 123 | mov ashort, 0 | ||
| 124 | sbb a, [y+8*i] | ||
| 125 | mov [z+8*i],a | ||
| 126 | inc i | ||
| 127 | dec n | ||
| 128 | jnz ytoploop | ||
| 129 | sbb a, a | ||
| 130 | test p, p | ||
| 131 | jnz tailloop | ||
| 132 | neg a | ||
| 133 | #if WINDOWS_ABI | ||
| 134 | pop rsi | ||
| 135 | pop rdi | ||
| 136 | #endif | ||
| 137 | ret | ||
| 138 | |||
| 139 | #if defined(__linux__) && defined(__ELF__) | ||
| 140 | .section .note.GNU-stack,"",%progbits | ||
| 141 | #endif | ||
