summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/arch/amd64/bignum_cmul.S
blob: 6d184e3f393ad3c247b30fc9ffbc4fad4dc7a54b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Multiply by a single word, z := c * y
// Inputs c, y[n]; outputs function return (carry-out) and z[k]
//
//    extern uint64_t bignum_cmul
//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
//
// Does the "z := c * y" operation where y is n digits and the result z is
// p digits, p being the output size called k in the prototype above.
// Truncates the result in general unless p >= n + 1.
//
// The return value is a high/carry word that is meaningful when p >= n as
// giving the high part of the result. Since this is always zero if p > n,
// it is mainly of interest in the special case p = n, i.e. where the source
// and destination have the same nominal size, when it gives the extra word
// of the full result.
//
// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

// Assemble the instructions below as Intel syntax (destination operand
// first) with unprefixed register names.

        .intel_syntax noprefix

// Symbol attribute directives for bignum_cmul. Both macros come from
// _internal_s2n_bignum.h, which is not shown here; presumably they set the
// visibility and linkage of the symbol -- confirm against that header.

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul)
        .text

// Register names for the arguments, in their standard x86-64 ABI positions
// (the Windows entry code moves its arguments into these registers first):
// output size (k in the prototype), output pointer, multiplier, input size
// and input pointer (y in the prototype). The multiplier arrives in rdx and
// is copied to r9 before the first multiply, since mul overwrites rdx.

#define p rdi
#define z rsi
#define c r9
#define n rcx
#define x r8

// Working registers: the current digit index and the running high word
// carried from one digit product into the next.

#define i r10
#define h r11



S2N_BN_SYMBOL(bignum_cmul):

// Under the Microsoft x64 ABI, preserve rdi and rsi (callee-saved there)
// and move the incoming arguments into the standard x86-64 ABI registers,
// so that everything after this shim is ABI-independent. The fifth argument
// is at [rsp+40] on entry, hence [rsp+56] once the two pushes have run.

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
        mov     rcx, r9
        mov     r8, [rsp+56]
#endif

// Clamp the input size, n := min(p,n). Input digits at index p and above
// cannot influence a p-digit result, so from here on n <= p holds.

        cmp     p, n
        cmovb   n, p

// Zero the digit index i and the high word h. With an empty input (n = 0)
// there are no products to form, so go straight to filling the output.

        xor     i, i
        xor     h, h
        test    n, n
        je      bignum_cmul_tail

// The multiplier arrives in rdx, which every mul overwrites with the high
// half of its product, so keep it in c instead.

        mov     c, rdx

// Digit 0 is peeled out of the loop because there is no incoming high word
// to add yet: [h,z_0] = c * x_0

        mov     rax, [x]
        mul     c
        mov     h, rdx
        mov     [z], rax
        inc     i
        cmp     i, n
        je      bignum_cmul_tail

// Remaining digits 1 <= i < n: [h,z_i] = c * x_i + h. The adc cannot wrap
// rdx, since the high half of a 64x64-bit product is at most 2^64 - 2.

bignum_cmul_loop:
        mov     rax, [x+8*i]
        mul     c
        add     rax, h
        adc     rdx, 0
        mov     h, rdx
        mov     [z+8*i], rax
        inc     i
        cmp     i, n
        jb      bignum_cmul_loop

// Tail for a destination longer than the (clamped) input, i.e. i < p:
// the first extra digit receives the pending high word, which is then
// cleared so that the remaining digits and the return value are all zero.

bignum_cmul_tail:
        cmp     i, p
        jae     bignum_cmul_end
        mov     [z+8*i], h
        xor     h, h
        inc     i
        cmp     i, p
        jae     bignum_cmul_end

bignum_cmul_tloop:
        mov     [z+8*i], h
        inc     i
        cmp     i, p
        jb      bignum_cmul_tloop

// Return h: the top word of the product when the output had no room for
// it, or zero when the tail above already stored it.

bignum_cmul_end:
        mov     rax, h

#if WINDOWS_ABI
        pop     rsi
        pop     rdi
#endif
        ret

// On Linux ELF targets, emit an empty .note.GNU-stack section (no flags),
// the marker GNU toolchains use to record that this object does not need
// an executable stack.

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif