summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_add.S153
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S143
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S126
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul.S155
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S145
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S232
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S133
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S230
-rw-r--r--src/lib/libcrypto/bn/arch/amd64/bignum_sub.S141
9 files changed, 1458 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_add.S b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
new file mode 100644
index 0000000000..33663916bb
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_add.S
@@ -0,0 +1,153 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC
3
4// ----------------------------------------------------------------------------
5// Add, z := x + y
6// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
7//
8// extern uint64_t bignum_add
9// (uint64_t p, uint64_t *z,
10// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
11//
12// Does the z := x + y operation, truncating modulo p words in general and
13// returning a top carry (0 or 1) in the p'th place, only adding the input
14// words below p (as well as m and n respectively) to get the sum and carry.
15//
16// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX
17// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX
18// ----------------------------------------------------------------------------
19
20#include "_internal_s2n_bignum.h"
21
22 .intel_syntax noprefix
23 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add)
24 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add)
25 .text
26
27#define p rdi // output length in words; later reused as the tail word count
28#define z rsi // output pointer
29#define m rdx // length of x in words
30#define x rcx // pointer to x
31#define n r8 // length of y in words
32#define y r9 // pointer to y
33#define i r10 // word index shared by all the loops
34#define a rax // scratch word; also holds the return value
35
36#define ashort eax // 32-bit view of a (writing it zero-extends into rax)
37
38
39
40S2N_BN_SYMBOL(bignum_add): // z[0..p) := x + y, carry-out (0 or 1) returned in rax
41
42#if WINDOWS_ABI
43 push rdi // rdi and rsi are callee-saved under the Microsoft x64 ABI
44 push rsi
45 mov rdi, rcx // move the Microsoft argument registers into the
46 mov rsi, rdx // standard ones so the body below is ABI-independent
47 mov rdx, r8
48 mov rcx, r9
49 mov r8, [rsp+56] // n: [rsp+40] at entry, plus 16 for the two pushes
50 mov r9, [rsp+64] // y: [rsp+48] at entry, plus 16 for the two pushes
51#endif
52
53// Zero the main index counter for both branches
54
55 xor i, i
56
57// First clamp the two input sizes m := min(p,m) and n := min(p,n) since
58// we'll never need words past the p'th. Can now assume m <= p and n <= p.
59// Then compare the modified m and n and branch accordingly
60
61 cmp p, m // CF <=> p < m
62 cmovc m, p // m := min(p,m)
63 cmp p, n // CF <=> p < n
64 cmovc n, p // n := min(p,n)
65 cmp m, n // CF <=> m < n
66 jc ylonger
67
68// The case where x is longer or of the same size (p >= m >= n)
69
70 sub p, m // p := number of output words above the top of x
71 sub m, n
72 inc m // m := (m - n) + 1, pre-biased for the "dec m" at xtest
73 test n, n // also clears CF, so the first adc below has no carry-in
74 jz xtest
75xmainloop:
76 mov a, [x+8*i]
77 adc a, [y+8*i]
78 mov [z+8*i],a
79 inc i // inc and dec leave CF alone, keeping the carry chain intact
80 dec n
81 jnz xmainloop
82 jmp xtest
83xtoploop:
84 mov a, [x+8*i] // words of x above the top of y: only propagate the carry
85 adc a, 0
86 mov [z+8*i],a
87 inc i
88xtest:
89 dec m
90 jnz xtoploop
91 mov ashort, 0 // zero a with mov rather than xor so that CF survives
92 adc a, 0 // a := carry out of the top input word (0 or 1)
93 test p, p
94 jnz tails // output is longer than both inputs
95#if WINDOWS_ABI
96 pop rsi
97 pop rdi
98#endif
99 ret
100
101// The case where y is longer (p >= n > m)
102
103ylonger:
104
105 sub p, n // p := number of output words above the top of y
106 sub n, m // n := number of y words above the top of x (nonzero here)
107 test m, m // also clears CF ahead of the first adc
108 jz ytoploop
109ymainloop:
110 mov a, [x+8*i]
111 adc a, [y+8*i]
112 mov [z+8*i],a
113 inc i // inc and dec leave CF alone, keeping the carry chain intact
114 dec m
115 jnz ymainloop
116ytoploop:
117 mov a, [y+8*i] // words of y above the top of x: only propagate the carry
118 adc a, 0
119 mov [z+8*i],a
120 inc i
121 dec n
122 jnz ytoploop
123 mov ashort, 0 // zero a with mov rather than xor so that CF survives
124 adc a, 0 // a := carry out of the top input word (0 or 1)
125 test p, p
126 jnz tails // output is longer than both inputs
127#if WINDOWS_ABI
128 pop rsi
129 pop rdi
130#endif
131 ret
132
133// Adding a non-trivial tail, when p > max(m,n)
134
135tails:
136 mov [z+8*i],a // the carry becomes the next output word ...
137 xor a, a // ... so the value returned in rax is zero on this path
138 jmp tail
139tailloop:
140 mov [z+8*i],a // zero-fill the remaining output words
141tail:
142 inc i
143 dec p
144 jnz tailloop
145#if WINDOWS_ABI
146 pop rsi
147 pop rdi
148#endif
149 ret
150
151#if defined(__linux__) && defined(__ELF__)
152.section .note.GNU-stack,"",%progbits
153#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
new file mode 100644
index 0000000000..33f9be2fa0
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S
@@ -0,0 +1,143 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC
3
4// ----------------------------------------------------------------------------
5// Multiply-add with single-word multiplier, z := z + c * y
6// Inputs c, y[n]; outputs function return (carry-out) and z[k]
7//
8// extern uint64_t bignum_cmadd
9// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
10//
11// Does the "z := z + c * y" operation where y is n digits and z is p = k digits.
12// Truncates the result in general.
13//
14// The return value is a high/carry word that is meaningful when p = n + 1, or
15// more generally when n <= p and the result fits in p + 1 digits. In these
16// cases it gives the top digit of the (p + 1)-digit result.
17//
18// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
19// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
20// ----------------------------------------------------------------------------
21
22#include "_internal_s2n_bignum.h"
23
24 .intel_syntax noprefix
25 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd)
26 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd)
27 .text
28
29#define p rdi // output size (k in the prototype); becomes the tail length
30#define z rsi // output pointer
31#define c r9 // multiplier, copied here from rdx before the first mul
32#define n rcx // input size in words
33#define x r8 // input pointer (y in the prototype)
34
35#define i r10 // word index
36#define h r11 // running high word, later the carry word that is returned
37
38#define r rbx // 0 or -1 mask capturing an intermediate carry
39
40#define hshort r11d // 32-bit view of h (writing it zero-extends into r11)
41#define ishort r10d // 32-bit view of i (writing it zero-extends into r10)
42
43
44
45S2N_BN_SYMBOL(bignum_cmadd): // z := z + c * y, high/carry word returned in rax
46
47#if WINDOWS_ABI
48 push rdi // rdi and rsi are callee-saved under the Microsoft x64 ABI
49 push rsi
50 mov rdi, rcx // move the Microsoft argument registers into the
51 mov rsi, rdx // standard ones so the body below is ABI-independent
52 mov rdx, r8
53 mov rcx, r9
54 mov r8, [rsp+56] // y: [rsp+40] at entry, plus 16 for the two pushes
55#endif
56
57// Seems hard to avoid one more register
58
59 push rbx // rbx (alias r) is callee-saved, so preserve it
60
61// First clamp the input size n := min(p,n) since we can never need to read
62// past the p'th term of the input to generate p-digit output.
63// Subtract p := p - min(n,p) so it holds the size of the extra tail needed
64
65 cmp p, n // CF <=> p < n
66 cmovc n, p
67 sub p, n
68
69// Initialize high part h = 0; if n = 0 do nothing but return that zero
70
71 xor h, h
72 test n, n
73 jz end
74
75// Move c into a safer register as multiplies overwrite rdx
76
77 mov c, rdx
78
79// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0
80
81 mov rax, [x]
82 mul c
83 add [z], rax
84 mov h, rdx
85 mov ishort, 1 // i := 1; mov leaves the CF produced by the add untouched
86 dec n // dec also preserves CF
87 jz hightail
88
89// Main loop, where we always have CF + previous high part h to add in
90
91loop:
92 adc h, [z+8*i] // h := h + z_i + CF
93 sbb r, r // r := -CF, remembering the carry out of that sum
94 mov rax, [x+8*i]
95 mul c
96 sub rdx, r // fold the remembered carry into the high product word
97 add rax, h // CF from this add is consumed by the next adc
98 mov [z+8*i], rax
99 mov h, rdx
100 inc i // inc and dec leave CF alone
101 dec n
102 jnz loop
103
104hightail:
105 adc h, 0 // absorb the last pending carry into the high word
106
107// Propagate the carry all the way to the end with h as extra carry word
108
109tail:
110 test p, p
111 jz end // no tail words: h itself is the return value
112
113 add [z+8*i], h
114 mov hshort, 0 // h := 0 without disturbing CF
115 inc i
116 dec p
117 jz highend
118
119tloop:
120 adc [z+8*i], h // h is zero here, so this only ripples the carry upward
121 inc i
122 dec p
123 jnz tloop
124
125highend:
126
127 adc h, 0 // h := carry out of the top output word
128
129// Return the high/carry word
130
131end:
132 mov rax, h
133
134 pop rbx
135#if WINDOWS_ABI
136 pop rsi
137 pop rdi
138#endif
139 ret
140
141#if defined(__linux__) && defined(__ELF__)
142.section .note.GNU-stack,"",%progbits
143#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
new file mode 100644
index 0000000000..6d184e3f39
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
@@ -0,0 +1,126 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC
3
4// ----------------------------------------------------------------------------
5// Multiply by a single word, z := c * y
6// Inputs c, y[n]; outputs function return (carry-out) and z[k]
7//
8// extern uint64_t bignum_cmul
9// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
10//
11// Does the "z := c * y" operation where y is n digits and z is p = k digits.
12// Truncates the result in general unless p >= n + 1.
13//
14// The return value is a high/carry word that is meaningful when p >= n as
15// giving the high part of the result. Since this is always zero if p > n,
16// it is mainly of interest in the special case p = n, i.e. where the source
17// and destination have the same nominal size, when it gives the extra word
18// of the full result.
19//
20// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
21// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
22// ----------------------------------------------------------------------------
23
24#include "_internal_s2n_bignum.h"
25
26 .intel_syntax noprefix
27 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul)
28 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul)
29 .text
30
31#define p rdi // output size in words (k in the prototype)
32#define z rsi // output pointer
33#define c r9 // multiplier, copied here from rdx before the first mul
34#define n rcx // input size in words
35#define x r8 // input pointer (y in the prototype)
36
37#define i r10 // word index
38#define h r11 // high word of the previous product; also the return value
39
40
41
42S2N_BN_SYMBOL(bignum_cmul): // z := c * y, high/carry word returned in rax
43
44#if WINDOWS_ABI
45 push rdi // rdi and rsi are callee-saved under the Microsoft x64 ABI
46 push rsi
47 mov rdi, rcx // move the Microsoft argument registers into the
48 mov rsi, rdx // standard ones so the body below is ABI-independent
49 mov rdx, r8
50 mov rcx, r9
51 mov r8, [rsp+56] // y: [rsp+40] at entry, plus 16 for the two pushes
52#endif
53
54// First clamp the input size n := min(p,n) since we can never need to read
55// past the p'th term of the input to generate p-digit output. Now we can
56// assume that n <= p
57
58 cmp p, n // CF <=> p < n
59 cmovc n, p
60
61// Initialize current input/output pointer offset i and high part h.
62// But then if n = 0 skip the multiplication and go to the tail part
63
64 xor h, h
65 xor i, i
66 test n, n
67 jz tail
68
69// Move c into a safer register as multiplies overwrite rdx
70
71 mov c, rdx
72
73// Initialization of the loop: [h,l] = c * x_0
74
75 mov rax, [x]
76 mul c
77 mov [z], rax
78 mov h, rdx
79 inc i
80 cmp i, n
81 jz tail
82
83// Main loop doing the multiplications
84
85loop:
86 mov rax, [x+8*i]
87 mul c
88 add rax, h // low product word plus the previous high word
89 adc rdx, 0 // carry from that sum goes into the new high word
90 mov [z+8*i], rax
91 mov h, rdx
92 inc i
93 cmp i, n
94 jc loop // continue while i < n
95
96// Add a tail when the destination is longer
97
98tail:
99 cmp i, p
100 jnc end // i >= p: no room for h, so h is returned as the carry word
101 mov [z+8*i], h // otherwise h becomes the next output word ...
102 xor h, h // ... and the returned word is zero
103 inc i
104 cmp i, p
105 jnc end
106
107tloop:
108 mov [z+8*i], h // zero-fill the remaining output words
109 inc i
110 cmp i, p
111 jc tloop
112
113// Return the high/carry word
114
115end:
116 mov rax, h
117
118#if WINDOWS_ABI
119 pop rsi
120 pop rdi
121#endif
122 ret
123
124#if defined(__linux__) && defined(__ELF__)
125.section .note.GNU-stack,"",%progbits
126#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
new file mode 100644
index 0000000000..20c3f70202
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul.S
@@ -0,0 +1,155 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC
3
4// ----------------------------------------------------------------------------
5// Multiply z := x * y
6// Inputs x[m], y[n]; output z[k]
7//
8// extern void bignum_mul
9// (uint64_t k, uint64_t *z,
10// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
11//
12// Does the "z := x * y" operation where x is m digits, y is n, result z is k.
13// Truncates the result in general unless k >= m + n
14//
15// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y
16// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y
17// ----------------------------------------------------------------------------
18
19#include "_internal_s2n_bignum.h"
20
21 .intel_syntax noprefix
22 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul)
23 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul)
24 .text
25
26// These aliases name registers that already hold the argument on entry
27
28#define p rdi // output size in words (k in the prototype)
29#define z rsi // output pointer, advanced by one word per outer iteration
30#define n r8 // size of y in words
31
32// These do not hold arguments on entry; they are working registers set up below
33
34#define c r15 // top word of the 3-word column accumulator (c,h,l)
35#define h r14 // middle word of the accumulator
36#define l r13 // low word of the accumulator
37#define x r12 // running pointer into x for the inner loop
38#define y r11 // biased pointer into y for the inner loop
39#define i rbx // inner loop bound, then inner loop count
40#define k r10 // output column counter (not the prototype's k, which is p here)
41#define m rbp // size of x in words, copied from rdx on entry
42
43// These are always local scratch since multiplier result is in these
44
45#define a rax
46#define d rdx
47
48
49
50S2N_BN_SYMBOL(bignum_mul): // z[0..p) := x * y, truncated to p words
51
52#if WINDOWS_ABI
53 push rdi // rdi and rsi are callee-saved under the Microsoft x64 ABI
54 push rsi
55 mov rdi, rcx // move the Microsoft argument registers into the
56 mov rsi, rdx // standard ones so the body below is ABI-independent
57 mov rdx, r8
58 mov rcx, r9
59 mov r8, [rsp+56] // n: [rsp+40] at entry, plus 16 for the two pushes
60 mov r9, [rsp+64] // y: [rsp+48] at entry, plus 16 for the two pushes
61#endif
62
63// We use too many registers, and also we need rax:rdx for multiplications
64
65 push rbx // all six registers pushed here are callee-saved
66 push rbp
67 push r12
68 push r13
69 push r14
70 push r15
71 mov m, rdx // rdx is about to be clobbered by mul, so keep m in rbp
72
73// If the result size is zero, do nothing
74// Note that even if either or both inputs has size zero, we can't
75// just give up because we at least need to zero the output array
76// If we did a multiply-add variant, however, then we could
77
78 test p, p
79 jz end
80
81// Set initial 2-part sum to zero (we zero c inside the body)
82
83 xor h,h
84 xor l,l
85
86// Otherwise do outer loop k = 0 ... k = p - 1
87
88 xor k, k
89
90outerloop:
91
92// Zero our carry term first; we eventually want it and a zero is useful now
93// Set a = max 0 (k + 1 - n), i = min (k + 1) m
94// This defines the range a <= j < i for the inner summation
95// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow
96// And since we want to increment it anyway, we might as well do it now
97
98 xor c, c // c = 0
99 inc k // k = k + 1
100
101 mov a, k // a = k + 1
102 sub a, n // a = k + 1 - n
103 cmovc a, c // a = max 0 (k + 1 - n)
104
105 mov i, m // i = m
106 cmp k, m // CF <=> k + 1 < m
107 cmovc i, k // i = min (k + 1) m
108
109// Turn i into a loop count, and skip things if it's <= 0
110// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a]
111// and then launch into the main inner loop, postdecrementing i
112
113 mov d, k
114 sub d, i // d = (k + 1) - min (k + 1) m
115 sub i, a // i = number of products in this column
116 jbe innerend // no products: just emit the accumulated word
117 lea x,[rcx+8*a] // rcx still holds the x argument
118 lea y,[r9+8*d-8] // r9 still holds the y argument; biased so [y+8*i] pairs with [x]
119
120innerloop:
121 mov rax, [y+8*i]
122 mul QWORD PTR [x]
123 add x, 8
124 add l, rax // (c,h,l) += rdx:rax
125 adc h, rdx
126 adc c, 0
127 dec i
128 jnz innerloop
129
130innerend:
131
132 mov [z], l // emit the finished column word
133 mov l, h // and shift the accumulator down one word
134 mov h, c
135 add z, 8
136
137 cmp k, p // k was already incremented, so loop while k < p
138 jc outerloop
139
140end:
141 pop r15
142 pop r14
143 pop r13
144 pop r12
145 pop rbp
146 pop rbx
147#if WINDOWS_ABI
148 pop rsi
149 pop rdi
150#endif
151 ret
152
153#if defined(__linux__) && defined(__ELF__)
154.section .note.GNU-stack,"",%progbits
155#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
new file mode 100644
index 0000000000..e70ed116da
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S
@@ -0,0 +1,145 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC
3
4// ----------------------------------------------------------------------------
5// Multiply z := x * y
6// Inputs x[4], y[4]; output z[8]
7//
8// extern void bignum_mul_4_8_alt
9// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
10//
11// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
12// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
13// ----------------------------------------------------------------------------
14
15#include "_internal_s2n_bignum.h"
16
17 .intel_syntax noprefix
18 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt)
19 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt)
20 .text
21
22// These aliases name registers that already hold the argument on entry
23
24#define z rdi
25#define x rsi
26
27// This is moved from rdx to free it for muls
28
29#define y rcx
30
31// Other variables used as a rotating 3-word window to add terms to
32
33#define t0 r8
34#define t1 r9
35#define t2 r10
36
37// Macro for the key "multiply and add to (c,h,l)" step:
38// (c,h,l) += numa * numb, clobbering rax and rdx
39#define combadd(c,h,l,numa,numb) \
40 mov rax, numa; \
41 mul QWORD PTR numb; \
42 add l, rax; \
43 adc h, rdx; \
44 adc c, 0
45
46// A minutely shorter form for when c = 0 initially
47// (with c = 0, "adc c, c" leaves just the carry flag in c)
48#define combadz(c,h,l,numa,numb) \
49 mov rax, numa; \
50 mul QWORD PTR numb; \
51 add l, rax; \
52 adc h, rdx; \
53 adc c, c
54
55// A short form where we don't expect a top carry
56// (h,l) += numa * numb, clobbering rax and rdx
57#define combads(h,l,numa,numb) \
58 mov rax, numa; \
59 mul QWORD PTR numb; \
60 add l, rax; \
61 adc h, rdx
62
63S2N_BN_SYMBOL(bignum_mul_4_8_alt): // z[0..8) := x[0..4) * y[0..4)
64
65#if WINDOWS_ABI
66 push rdi // rdi and rsi are callee-saved under the Microsoft x64 ABI
67 push rsi
68 mov rdi, rcx // move the Microsoft argument registers into the
69 mov rsi, rdx // standard ones so the body below is ABI-independent
70 mov rdx, r8
71#endif
72
73// Copy y into a safe register to start with
74
75 mov y, rdx
76
77// Result term 0
78// (result term k sums every product x[i] * y[j] with i + j = k)
79 mov rax, [x]
80 mul QWORD PTR [y]
81
82 mov [z], rax
83 mov t0, rdx
84 xor t1, t1
85
86// Result term 1
87
88 xor t2, t2
89 combads(t1,t0,[x],[y+8])
90 combadz(t2,t1,t0,[x+8],[y])
91 mov [z+8], t0
92
93// Result term 2
94
95 xor t0, t0 // the register just stored becomes the new top word of the window
96 combadz(t0,t2,t1,[x],[y+16])
97 combadd(t0,t2,t1,[x+8],[y+8])
98 combadd(t0,t2,t1,[x+16],[y])
99 mov [z+16], t1
100
101// Result term 3
102
103 xor t1, t1
104 combadz(t1,t0,t2,[x],[y+24])
105 combadd(t1,t0,t2,[x+8],[y+16])
106 combadd(t1,t0,t2,[x+16],[y+8])
107 combadd(t1,t0,t2,[x+24],[y])
108 mov [z+24], t2
109
110// Result term 4
111
112 xor t2, t2
113 combadz(t2,t1,t0,[x+8],[y+24])
114 combadd(t2,t1,t0,[x+16],[y+16])
115 combadd(t2,t1,t0,[x+24],[y+8])
116 mov [z+32], t0
117
118// Result term 5
119
120 xor t0, t0
121 combadz(t0,t2,t1,[x+16],[y+24])
122 combadd(t0,t2,t1,[x+24],[y+16])
123 mov [z+40], t1
124
125// Result term 6
126
127 xor t1, t1 // NOTE(review): t1 is never read after this point; looks redundant
128 combads(t0,t2,[x+24],[y+24])
129 mov [z+48], t2
130
131// Result term 7
132
133 mov [z+56], t0
134
135// Return
136
137#if WINDOWS_ABI
138 pop rsi
139 pop rdi
140#endif
141 ret
142
143#if defined(__linux__) && defined(__ELF__)
144.section .note.GNU-stack,"",%progbits
145#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
new file mode 100644
index 0000000000..43c6c48667
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S
@@ -0,0 +1,232 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC
3
4// ----------------------------------------------------------------------------
5// Multiply z := x * y
6// Inputs x[8], y[8]; output z[16]
7//
8// extern void bignum_mul_8_16_alt
9// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
10//
11// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
12// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
13// ----------------------------------------------------------------------------
14
15#include "_internal_s2n_bignum.h"
16
17 .intel_syntax noprefix
18 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt)
19 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt)
20 .text
21
22// These aliases name registers that already hold the argument on entry
23
24#define z rdi
25#define x rsi
26
27// This is moved from rdx to free it for muls
28
29#define y rcx
30
31// Other variables used as a rotating 3-word window to add terms to
32
33#define t0 r8
34#define t1 r9
35#define t2 r10
36
37// Macro for the key "multiply and add to (c,h,l)" step:
38// (c,h,l) += numa * numb, clobbering rax and rdx
39#define combadd(c,h,l,numa,numb) \
40 mov rax, numa; \
41 mul QWORD PTR numb; \
42 add l, rax; \
43 adc h, rdx; \
44 adc c, 0
45
46// A minutely shorter form for when c = 0 initially
47// (with c = 0, "adc c, c" leaves just the carry flag in c)
48#define combadz(c,h,l,numa,numb) \
49 mov rax, numa; \
50 mul QWORD PTR numb; \
51 add l, rax; \
52 adc h, rdx; \
53 adc c, c
54
55// A short form where we don't expect a top carry
56// (h,l) += numa * numb, clobbering rax and rdx
57#define combads(h,l,numa,numb) \
58 mov rax, numa; \
59 mul QWORD PTR numb; \
60 add l, rax; \
61 adc h, rdx
62
63S2N_BN_SYMBOL(bignum_mul_8_16_alt): // z[0..16) := x[0..8) * y[0..8)
64
65#if WINDOWS_ABI
66 push rdi // rdi and rsi are callee-saved under the Microsoft x64 ABI
67 push rsi
68 mov rdi, rcx // move the Microsoft argument registers into the
69 mov rsi, rdx // standard ones so the body below is ABI-independent
70 mov rdx, r8
71#endif
72
73// Copy y into a safe register to start with
74
75 mov y, rdx
76
77// Result term 0
78// (result term k sums every product x[i] * y[j] with i + j = k)
79 mov rax, [x]
80 mul QWORD PTR [y]
81
82 mov [z], rax
83 mov t0, rdx
84 xor t1, t1
85
86// Result term 1
87
88 xor t2, t2
89 combads(t1,t0,[x],[y+8])
90 combadz(t2,t1,t0,[x+8],[y])
91 mov [z+8], t0
92
93// Result term 2
94
95 xor t0, t0 // the register just stored becomes the new top word of the window
96 combadz(t0,t2,t1,[x],[y+16])
97 combadd(t0,t2,t1,[x+8],[y+8])
98 combadd(t0,t2,t1,[x+16],[y])
99 mov [z+16], t1
100
101// Result term 3
102
103 xor t1, t1
104 combadz(t1,t0,t2,[x],[y+24])
105 combadd(t1,t0,t2,[x+8],[y+16])
106 combadd(t1,t0,t2,[x+16],[y+8])
107 combadd(t1,t0,t2,[x+24],[y])
108 mov [z+24], t2
109
110// Result term 4
111
112 xor t2, t2
113 combadz(t2,t1,t0,[x],[y+32])
114 combadd(t2,t1,t0,[x+8],[y+24])
115 combadd(t2,t1,t0,[x+16],[y+16])
116 combadd(t2,t1,t0,[x+24],[y+8])
117 combadd(t2,t1,t0,[x+32],[y])
118 mov [z+32], t0
119
120// Result term 5
121
122 xor t0, t0
123 combadz(t0,t2,t1,[x],[y+40])
124 combadd(t0,t2,t1,[x+8],[y+32])
125 combadd(t0,t2,t1,[x+16],[y+24])
126 combadd(t0,t2,t1,[x+24],[y+16])
127 combadd(t0,t2,t1,[x+32],[y+8])
128 combadd(t0,t2,t1,[x+40],[y])
129 mov [z+40], t1
130
131// Result term 6
132
133 xor t1, t1
134 combadz(t1,t0,t2,[x],[y+48])
135 combadd(t1,t0,t2,[x+8],[y+40])
136 combadd(t1,t0,t2,[x+16],[y+32])
137 combadd(t1,t0,t2,[x+24],[y+24])
138 combadd(t1,t0,t2,[x+32],[y+16])
139 combadd(t1,t0,t2,[x+40],[y+8])
140 combadd(t1,t0,t2,[x+48],[y])
141 mov [z+48], t2
142
143// Result term 7
144
145 xor t2, t2
146 combadz(t2,t1,t0,[x],[y+56])
147 combadd(t2,t1,t0,[x+8],[y+48])
148 combadd(t2,t1,t0,[x+16],[y+40])
149 combadd(t2,t1,t0,[x+24],[y+32])
150 combadd(t2,t1,t0,[x+32],[y+24])
151 combadd(t2,t1,t0,[x+40],[y+16])
152 combadd(t2,t1,t0,[x+48],[y+8])
153 combadd(t2,t1,t0,[x+56],[y])
154 mov [z+56], t0
155
156// Result term 8
157
158 xor t0, t0
159 combadz(t0,t2,t1,[x+8],[y+56])
160 combadd(t0,t2,t1,[x+16],[y+48])
161 combadd(t0,t2,t1,[x+24],[y+40])
162 combadd(t0,t2,t1,[x+32],[y+32])
163 combadd(t0,t2,t1,[x+40],[y+24])
164 combadd(t0,t2,t1,[x+48],[y+16])
165 combadd(t0,t2,t1,[x+56],[y+8])
166 mov [z+64], t1
167
168// Result term 9
169
170 xor t1, t1
171 combadz(t1,t0,t2,[x+16],[y+56])
172 combadd(t1,t0,t2,[x+24],[y+48])
173 combadd(t1,t0,t2,[x+32],[y+40])
174 combadd(t1,t0,t2,[x+40],[y+32])
175 combadd(t1,t0,t2,[x+48],[y+24])
176 combadd(t1,t0,t2,[x+56],[y+16])
177 mov [z+72], t2
178
179// Result term 10
180
181 xor t2, t2
182 combadz(t2,t1,t0,[x+24],[y+56])
183 combadd(t2,t1,t0,[x+32],[y+48])
184 combadd(t2,t1,t0,[x+40],[y+40])
185 combadd(t2,t1,t0,[x+48],[y+32])
186 combadd(t2,t1,t0,[x+56],[y+24])
187 mov [z+80], t0
188
189// Result term 11
190
191 xor t0, t0
192 combadz(t0,t2,t1,[x+32],[y+56])
193 combadd(t0,t2,t1,[x+40],[y+48])
194 combadd(t0,t2,t1,[x+48],[y+40])
195 combadd(t0,t2,t1,[x+56],[y+32])
196 mov [z+88], t1
197
198// Result term 12
199
200 xor t1, t1
201 combadz(t1,t0,t2,[x+40],[y+56])
202 combadd(t1,t0,t2,[x+48],[y+48])
203 combadd(t1,t0,t2,[x+56],[y+40])
204 mov [z+96], t2
205
206// Result term 13
207
208 xor t2, t2
209 combadz(t2,t1,t0,[x+48],[y+56])
210 combadd(t2,t1,t0,[x+56],[y+48])
211 mov [z+104], t0
212
213// Result term 14
214
215 combads(t2,t1,[x+56],[y+56])
216 mov [z+112], t1
217
218// Result term 15
219
220 mov [z+120], t2
221
222// Return
223
224#if WINDOWS_ABI
225 pop rsi
226 pop rdi
227#endif
228 ret
229
230#if defined(__linux__) && defined(__ELF__)
231.section .note.GNU-stack,"",%progbits
232#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
new file mode 100644
index 0000000000..db483a5723
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S
@@ -0,0 +1,133 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC
3
4// ----------------------------------------------------------------------------
5// Square, z := x^2
6// Input x[4]; output z[8]
7//
8// extern void bignum_sqr_4_8_alt
9// (uint64_t z[static 8], uint64_t x[static 4]);
10//
11// Standard x86-64 ABI: RDI = z, RSI = x
12// Microsoft x64 ABI: RCX = z, RDX = x
13// ----------------------------------------------------------------------------
14
15#include "_internal_s2n_bignum.h"
16
17 .intel_syntax noprefix
18 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt)
19 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt)
20 .text
21
22// Input arguments
23
24#define z rdi
25#define x rsi
26
27// Other variables used as a rotating 3-word window to add terms to
28
29#define t0 rcx
30#define t1 r8
31#define t2 r9
32
33// Macro for the key "multiply and add to (c,h,l)" step, for square term:
34// (c,h,l) += numa^2, clobbering rax and rdx
35#define combadd1(c,h,l,numa) \
36 mov rax, numa; \
37 mul rax; \
38 add l, rax; \
39 adc h, rdx; \
40 adc c, 0
41
42// A short form where we don't expect a top carry
43// (h,l) += numa^2, clobbering rax and rdx
44#define combads(h,l,numa) \
45 mov rax, numa; \
46 mul rax; \
47 add l, rax; \
48 adc h, rdx
49
50// A version doubling before adding, for non-square terms
51// (c,h,l) += 2 * numa * numb; the first "adc c, 0" catches the bit doubled out of rdx
52#define combadd2(c,h,l,numa,numb) \
53 mov rax, numa; \
54 mul QWORD PTR numb; \
55 add rax, rax; \
56 adc rdx, rdx; \
57 adc c, 0; \
58 add l, rax; \
59 adc h, rdx; \
60 adc c, 0
61
62S2N_BN_SYMBOL(bignum_sqr_4_8_alt): // z[0..8) := x[0..4) squared
63
64#if WINDOWS_ABI
65 push rdi // rdi and rsi are callee-saved under the Microsoft x64 ABI
66 push rsi
67 mov rdi, rcx // move the Microsoft argument registers into the
68 mov rsi, rdx // standard ones so the body below is ABI-independent
69#endif
70
71// Result term 0
72// (term k sums x[i] * x[j] over i + j = k; products with i != j are doubled)
73 mov rax, [x]
74 mul rax
75
76 mov [z], rax
77 mov t0, rdx
78 xor t1, t1
79
80// Result term 1
81
82 xor t2, t2
83 combadd2(t2,t1,t0,[x],[x+8])
84 mov [z+8], t0
85
86// Result term 2
87
88 xor t0, t0 // the register just stored becomes the new top word of the window
89 combadd1(t0,t2,t1,[x+8])
90 combadd2(t0,t2,t1,[x],[x+16])
91 mov [z+16], t1
92
93// Result term 3
94
95 xor t1, t1
96 combadd2(t1,t0,t2,[x],[x+24])
97 combadd2(t1,t0,t2,[x+8],[x+16])
98 mov [z+24], t2
99
100// Result term 4
101
102 xor t2, t2
103 combadd2(t2,t1,t0,[x+8],[x+24])
104 combadd1(t2,t1,t0,[x+16])
105 mov [z+32], t0
106
107// Result term 5
108
109 xor t0, t0
110 combadd2(t0,t2,t1,[x+16],[x+24])
111 mov [z+40], t1
112
113// Result term 6
114
115 xor t1, t1 // NOTE(review): t1 is never read after this point; looks redundant
116 combads(t0,t2,[x+24])
117 mov [z+48], t2
118
119// Result term 7
120
121 mov [z+56], t0
122
123// Return
124
125#if WINDOWS_ABI
126 pop rsi
127 pop rdi
128#endif
129 ret
130
131#if defined(__linux__) && defined(__ELF__)
132.section .note.GNU-stack,"",%progbits
133#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
new file mode 100644
index 0000000000..dcf3c92e5b
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S
@@ -0,0 +1,230 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC
3
4// ----------------------------------------------------------------------------
5// Square, z := x^2
6// Input x[8]; output z[16]
7//
8// extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
9//
10// Standard x86-64 ABI: RDI = z, RSI = x
11// Microsoft x64 ABI: RCX = z, RDX = x
12// ----------------------------------------------------------------------------
13
14#include "_internal_s2n_bignum.h"
15
16 .intel_syntax noprefix
17 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt)
18 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt)
19 .text
20
21// Input arguments
22
23#define z rdi
24#define x rsi
25
26// Other variables used as a rotating 3-word window to add terms to
27
28#define t0 r8
29#define t1 r9
30#define t2 r10
31
32// Additional temporaries for local windows to share doublings
33
34#define u0 rcx
35#define u1 r11
36
37// Macro for the key "multiply and add to (c,h,l)" step:
38// (c,h,l) += numa * numb, clobbering rax and rdx
39#define combadd(c,h,l,numa,numb) \
40 mov rax, numa; \
41 mul QWORD PTR numb; \
42 add l, rax; \
43 adc h, rdx; \
44 adc c, 0
45
46// Set up initial window (c,h,l) = numa * numb
47
48#define combaddz(c,h,l,numa,numb) \
49 mov rax, numa; \
50 mul QWORD PTR numb; \
51 xor c, c; \
52 mov l, rax; \
53 mov h, rdx
54
55// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l); hh and ll are overwritten
56
57#define doubladd(c,h,l,hh,ll) \
58 add ll, ll; \
59 adc hh, hh; \
60 adc c, c; \
61 add l, ll; \
62 adc h, hh; \
63 adc c, 0
64
65// Square term incorporation (c,h,l) += numa^2
66
67#define combadd1(c,h,l,numa) \
68 mov rax, numa; \
69 mul rax; \
70 add l, rax; \
71 adc h, rdx; \
72 adc c, 0
73
74// A short form where we don't expect a top carry
75// (h,l) += numa^2, clobbering rax and rdx
76#define combads(h,l,numa) \
77 mov rax, numa; \
78 mul rax; \
79 add l, rax; \
80 adc h, rdx
81
82// A version doubling directly before adding, for single non-square terms
83// (c,h,l) += 2 * numa * numb; the first "adc c, 0" catches the bit doubled out of rdx
84#define combadd2(c,h,l,numa,numb) \
85 mov rax, numa; \
86 mul QWORD PTR numb; \
87 add rax, rax; \
88 adc rdx, rdx; \
89 adc c, 0; \
90 add l, rax; \
91 adc h, rdx; \
92 adc c, 0
93
94S2N_BN_SYMBOL(bignum_sqr_8_16_alt): // z[0..16) := x[0..8) squared
95
96#if WINDOWS_ABI
97 push rdi // rdi and rsi are callee-saved under the Microsoft x64 ABI
98 push rsi
99 mov rdi, rcx // move the Microsoft argument registers into the
100 mov rsi, rdx // standard ones so the body below is ABI-independent
101#endif
102
103// Result term 0
104// (term k sums x[i] * x[j] over i + j = k; products with i != j are doubled)
105 mov rax, [x]
106 mul rax
107
108 mov [z], rax
109 mov t0, rdx
110 xor t1, t1
111
112// Result term 1
113
114 xor t2, t2
115 combadd2(t2,t1,t0,[x],[x+8])
116 mov [z+8], t0
117
118// Result term 2
119
120 xor t0, t0
121 combadd1(t0,t2,t1,[x+8])
122 combadd2(t0,t2,t1,[x],[x+16])
123 mov [z+16], t1
124
125// Result term 3
126// (cross products are summed in (t1,u1,u0) and doubled once by doubladd)
127 combaddz(t1,u1,u0,[x],[x+24])
128 combadd(t1,u1,u0,[x+8],[x+16])
129 doubladd(t1,t0,t2,u1,u0)
130 mov [z+24], t2
131
132// Result term 4
133
134 combaddz(t2,u1,u0,[x],[x+32])
135 combadd(t2,u1,u0,[x+8],[x+24])
136 doubladd(t2,t1,t0,u1,u0)
137 combadd1(t2,t1,t0,[x+16])
138 mov [z+32], t0
139
140// Result term 5
141
142 combaddz(t0,u1,u0,[x],[x+40])
143 combadd(t0,u1,u0,[x+8],[x+32])
144 combadd(t0,u1,u0,[x+16],[x+24])
145 doubladd(t0,t2,t1,u1,u0)
146 mov [z+40], t1
147
148// Result term 6
149
150 combaddz(t1,u1,u0,[x],[x+48])
151 combadd(t1,u1,u0,[x+8],[x+40])
152 combadd(t1,u1,u0,[x+16],[x+32])
153 doubladd(t1,t0,t2,u1,u0)
154 combadd1(t1,t0,t2,[x+24])
155 mov [z+48], t2
156
157// Result term 7
158
159 combaddz(t2,u1,u0,[x],[x+56])
160 combadd(t2,u1,u0,[x+8],[x+48])
161 combadd(t2,u1,u0,[x+16],[x+40])
162 combadd(t2,u1,u0,[x+24],[x+32])
163 doubladd(t2,t1,t0,u1,u0)
164 mov [z+56], t0
165
166// Result term 8
167
168 combaddz(t0,u1,u0,[x+8],[x+56])
169 combadd(t0,u1,u0,[x+16],[x+48])
170 combadd(t0,u1,u0,[x+24],[x+40])
171 doubladd(t0,t2,t1,u1,u0)
172 combadd1(t0,t2,t1,[x+32])
173 mov [z+64], t1
174
175// Result term 9
176
177 combaddz(t1,u1,u0,[x+16],[x+56])
178 combadd(t1,u1,u0,[x+24],[x+48])
179 combadd(t1,u1,u0,[x+32],[x+40])
180 doubladd(t1,t0,t2,u1,u0)
181 mov [z+72], t2
182
183// Result term 10
184
185 combaddz(t2,u1,u0,[x+24],[x+56])
186 combadd(t2,u1,u0,[x+32],[x+48])
187 doubladd(t2,t1,t0,u1,u0)
188 combadd1(t2,t1,t0,[x+40])
189 mov [z+80], t0
190
191// Result term 11
192
193 combaddz(t0,u1,u0,[x+32],[x+56])
194 combadd(t0,u1,u0,[x+40],[x+48])
195 doubladd(t0,t2,t1,u1,u0)
196 mov [z+88], t1
197
198// Result term 12
199
200 xor t1, t1
201 combadd2(t1,t0,t2,[x+40],[x+56])
202 combadd1(t1,t0,t2,[x+48])
203 mov [z+96], t2
204
205// Result term 13
206
207 xor t2, t2
208 combadd2(t2,t1,t0,[x+48],[x+56])
209 mov [z+104], t0
210
211// Result term 14
212
213 combads(t2,t1,[x+56])
214 mov [z+112], t1
215
216// Result term 15
217
218 mov [z+120], t2
219
220// Return
221
222#if WINDOWS_ABI
223 pop rsi
224 pop rdi
225#endif
226 ret
227
228#if defined(__linux__) && defined(__ELF__)
229.section .note.GNU-stack,"",%progbits
230#endif
diff --git a/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
new file mode 100644
index 0000000000..42eb6a82ef
--- /dev/null
+++ b/src/lib/libcrypto/bn/arch/amd64/bignum_sub.S
@@ -0,0 +1,141 @@
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0 OR ISC
3
4// ----------------------------------------------------------------------------
5// Subtract, z := x - y
6// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
7//
8// extern uint64_t bignum_sub
9// (uint64_t p, uint64_t *z,
10// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
11//
12// Does the z := x - y operation, truncating modulo p words in general and
13// returning a top borrow (0 or 1) in the p'th place, only subtracting input
14// words below p (as well as m and n respectively) to get the diff and borrow.
15//
16// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX
17// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX
18// ----------------------------------------------------------------------------
19
20#include "_internal_s2n_bignum.h"
21
22 .intel_syntax noprefix
23 S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub)
24 S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub)
25 .text
26
27#define p rdi // output length in words; later reused as the tail word count
28#define z rsi // output pointer
29#define m rdx // length of x in words
30#define x rcx // pointer to x
31#define n r8 // length of y in words
32#define y r9 // pointer to y
33#define i r10 // word index shared by all the loops
34#define a rax // scratch word; also holds the return value
35
36#define ashort eax // 32-bit view of a (writing it zero-extends into rax)
37
38
39
40S2N_BN_SYMBOL(bignum_sub): // z[0..p) := x - y, borrow-out (0 or 1) returned in rax
41
42#if WINDOWS_ABI
43 push rdi // rdi and rsi are callee-saved under the Microsoft x64 ABI
44 push rsi
45 mov rdi, rcx // move the Microsoft argument registers into the
46 mov rsi, rdx // standard ones so the body below is ABI-independent
47 mov rdx, r8
48 mov rcx, r9
49 mov r8, [rsp+56] // n: [rsp+40] at entry, plus 16 for the two pushes
50 mov r9, [rsp+64] // y: [rsp+48] at entry, plus 16 for the two pushes
51#endif
52
53// Zero the main index counter for both branches
54
55 xor i, i
56
57// First clamp the two input sizes m := min(p,m) and n := min(p,n) since
58// we'll never need words past the p'th. Can now assume m <= p and n <= p.
59// Then compare the modified m and n and branch accordingly
60
61 cmp p, m // CF <=> p < m
62 cmovc m, p // m := min(p,m)
63 cmp p, n // CF <=> p < n
64 cmovc n, p // n := min(p,n)
65 cmp m, n // CF <=> m < n
66 jc ylonger
67
68// The case where x is longer or of the same size (p >= m >= n)
69
70 sub p, m // p := number of output words above the top of x
71 sub m, n
72 inc m // m := (m - n) + 1, pre-biased for the "dec m" at xtest
73 test n, n // also clears CF, so the first sbb below has no borrow-in
74 jz xtest
75xmainloop:
76 mov a, [x+8*i]
77 sbb a, [y+8*i]
78 mov [z+8*i],a
79 inc i // inc and dec leave CF alone, keeping the borrow chain intact
80 dec n
81 jnz xmainloop
82 jmp xtest
83xtoploop:
84 mov a, [x+8*i] // words of x above the top of y: only propagate the borrow
85 sbb a, 0
86 mov [z+8*i],a
87 inc i
88xtest:
89 dec m
90 jnz xtoploop
91 sbb a, a // a := 0 if no borrow, all ones if there was a borrow
92 test p, p
93 jz tailskip
94tailloop:
95 mov [z+8*i],a // fill the words above both inputs with the borrow mask
96 inc i
97 dec p
98 jnz tailloop
99tailskip:
100 neg a // turn the 0 / all-ones mask into a 0 / 1 return value
101#if WINDOWS_ABI
102 pop rsi
103 pop rdi
104#endif
105 ret
106
107// The case where y is longer (p >= n > m)
108
109ylonger:
110
111 sub p, n // p := number of output words above the top of y
112 sub n, m // n := number of y words above the top of x (nonzero here)
113 test m, m // also clears CF ahead of the first sbb
114 jz ytoploop
115ymainloop:
116 mov a, [x+8*i]
117 sbb a, [y+8*i]
118 mov [z+8*i],a
119 inc i // inc and dec leave CF alone, keeping the borrow chain intact
120 dec m
121 jnz ymainloop
122ytoploop:
123 mov ashort, 0 // x has run out: use 0 as the minuend (mov preserves CF)
124 sbb a, [y+8*i]
125 mov [z+8*i],a
126 inc i
127 dec n
128 jnz ytoploop
129 sbb a, a // a := 0 if no borrow, all ones if there was a borrow
130 test p, p
131 jnz tailloop // shares the tail fill and return path of the x-longer case
132 neg a // turn the 0 / all-ones mask into a 0 / 1 return value
133#if WINDOWS_ABI
134 pop rsi
135 pop rdi
136#endif
137 ret
138
139#if defined(__linux__) && defined(__ELF__)
140.section .note.GNU-stack,"",%progbits
141#endif