diff options
author | jsing <> | 2024-11-16 15:31:36 +0000 |
---|---|---|
committer | jsing <> | 2024-11-16 15:31:36 +0000 |
commit | 23d9a213525c4cdbc5fba2b8cf4109f087fe41d3 (patch) | |
tree | ca867c18349e0ee9b5d1d2d76bade5e031c5ad2a | |
parent | d87a513f1f66019c84942357247dbcdc0c318f2e (diff) | |
download | openbsd-23d9a213525c4cdbc5fba2b8cf4109f087fe41d3.tar.gz openbsd-23d9a213525c4cdbc5fba2b8cf4109f087fe41d3.tar.bz2 openbsd-23d9a213525c4cdbc5fba2b8cf4109f087fe41d3.zip |
Provide a SHA-256 assembly implementation for amd64 using SHA-NI.
This provides a SHA-256 assembly implementation for amd64, which uses
the Intel SHA Extensions (aka SHA New Instructions or SHA-NI). This
provides a 3-5x performance gain on some Intel CPUs and many AMD CPUs.
ok tb@
-rw-r--r-- | src/lib/libcrypto/arch/amd64/Makefile.inc | 3 | ||||
-rw-r--r-- | src/lib/libcrypto/sha/sha256_amd64.c | 10 | ||||
-rw-r--r-- | src/lib/libcrypto/sha/sha256_amd64_shani.S | 209 |
3 files changed, 220 insertions, 2 deletions
diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc index 9ba5634f87..fe22385633 100644 --- a/src/lib/libcrypto/arch/amd64/Makefile.inc +++ b/src/lib/libcrypto/arch/amd64/Makefile.inc | |||
@@ -1,4 +1,4 @@ | |||
1 | # $OpenBSD: Makefile.inc,v 1.32 2024/11/16 14:56:39 jsing Exp $ | 1 | # $OpenBSD: Makefile.inc,v 1.33 2024/11/16 15:31:36 jsing Exp $ |
2 | 2 | ||
3 | # amd64-specific libcrypto build rules | 3 | # amd64-specific libcrypto build rules |
4 | 4 | ||
@@ -53,6 +53,7 @@ SSLASM+= sha sha1-x86_64 | |||
53 | CFLAGS+= -DSHA256_ASM | 53 | CFLAGS+= -DSHA256_ASM |
54 | SRCS+= sha256_amd64.c | 54 | SRCS+= sha256_amd64.c |
55 | SRCS+= sha256_amd64_generic.S | 55 | SRCS+= sha256_amd64_generic.S |
56 | SRCS+= sha256_amd64_shani.S | ||
56 | CFLAGS+= -DSHA512_ASM | 57 | CFLAGS+= -DSHA512_ASM |
57 | SRCS+= sha512_amd64.c | 58 | SRCS+= sha512_amd64.c |
58 | SRCS+= sha512_amd64_generic.S | 59 | SRCS+= sha512_amd64_generic.S |
diff --git a/src/lib/libcrypto/sha/sha256_amd64.c b/src/lib/libcrypto/sha/sha256_amd64.c index f7531b340f..6c5d3e897f 100644 --- a/src/lib/libcrypto/sha/sha256_amd64.c +++ b/src/lib/libcrypto/sha/sha256_amd64.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* $OpenBSD: sha256_amd64.c,v 1.1 2024/11/08 15:09:48 jsing Exp $ */ | 1 | /* $OpenBSD: sha256_amd64.c,v 1.2 2024/11/16 15:31:36 jsing Exp $ */ |
2 | /* | 2 | /* |
3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> | 3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> |
4 | * | 4 | * |
@@ -17,10 +17,18 @@ | |||
17 | 17 | ||
18 | #include <openssl/sha.h> | 18 | #include <openssl/sha.h> |
19 | 19 | ||
20 | #include "crypto_arch.h" | ||
21 | |||
20 | void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num); | 22 | void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num); |
23 | void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num); | ||
21 | 24 | ||
22 | void | 25 | void |
23 | sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num) | 26 | sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num) |
24 | { | 27 | { |
28 | if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_SHA) != 0) { | ||
29 | sha256_block_shani(ctx, in, num); | ||
30 | return; | ||
31 | } | ||
32 | /* CRYPTO_CPU_CAPS_AMD64_SHA is not set: fall back to sha256_block_generic(). */|||
25 | sha256_block_generic(ctx, in, num); | 33 | sha256_block_generic(ctx, in, num); |
26 | } | 34 | } |
diff --git a/src/lib/libcrypto/sha/sha256_amd64_shani.S b/src/lib/libcrypto/sha/sha256_amd64_shani.S new file mode 100644 index 0000000000..df3a796b45 --- /dev/null +++ b/src/lib/libcrypto/sha/sha256_amd64_shani.S | |||
@@ -0,0 +1,209 @@ | |||
1 | /* $OpenBSD: sha256_amd64_shani.S,v 1.1 2024/11/16 15:31:36 jsing Exp $ */ | ||
2 | /* | ||
3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> | ||
4 | * | ||
5 | * Permission to use, copy, modify, and distribute this software for any | ||
6 | * purpose with or without fee is hereby granted, provided that the above | ||
7 | * copyright notice and this permission notice appear in all copies. | ||
8 | * | ||
9 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | */ | ||
17 | |||
18 | #ifdef __CET__ | ||
19 | #include <cet.h> | ||
20 | #else | ||
21 | #define _CET_ENDBR | ||
22 | #endif | ||
23 | |||
24 | /* | ||
25 | * SHA-256 implementation using the Intel SHA extensions: | ||
26 | * | ||
27 | * https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html | ||
28 | */ | ||
29 | /* Register aliases: ctx, in and num are the three argument registers; end and k256 live in %rbx and %rbp, which the function saves and restores. */|||
30 | #define ctx %rdi | ||
31 | #define in %rsi | ||
32 | #define num %rdx | ||
33 | |||
34 | #define end %rbx | ||
35 | |||
36 | #define k256 %rbp | ||
37 | /* xmsg has to be %xmm0: sha256rnds2 takes its message/constant operand implicitly from %xmm0. */|||
38 | #define xmsg %xmm0 | ||
39 | /* Working hash state: xhs0 holds abef and xhs1 holds cdgh inside the block loop. */|||
40 | #define xhs0 %xmm1 | ||
41 | #define xhs1 %xmm2 | ||
42 | /* Copy of the state taken at the start of each block, added back after the rounds. */|||
43 | #define xabef %xmm3 | ||
44 | #define xcdgh %xmm4 | ||
45 | /* Message schedule: xmsgtmp0-3 hold four 32-bit schedule words each, xmsgtmp4 is scratch for the schedule update. */|||
46 | #define xmsgtmp0 %xmm6 | ||
47 | #define xmsgtmp1 %xmm7 | ||
48 | #define xmsgtmp2 %xmm8 | ||
49 | #define xmsgtmp3 %xmm9 | ||
50 | #define xmsgtmp4 %xmm10 | ||
51 | |||
52 | #define xshufmask %xmm11 | ||
53 | |||
54 | #define xtmp0 %xmm12 | ||
55 | /* Load 16 message bytes from offset idx*16 of m into xmsg (unaligned load), byte swap each 32-bit word with xshufmask and keep a copy in xmsgtmp. */|||
56 | #define sha256_message_schedule_load(idx, m, xmsgtmp) \ | ||
57 | movdqu (idx*16)(m), xmsg; \ | ||
58 | pshufb xshufmask, xmsg; \ | ||
59 | movdqa xmsg, xmsgtmp; | ||
60 | /* Replace xmt0 with the next four message schedule words, derived from xmt0..xmt3 (oldest to newest) via sha256msg1/sha256msg2; clobbers xmsgtmp4. */|||
61 | #define sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3) \ | ||
62 | sha256msg1 xmt1, xmt0; \ | ||
63 | movdqa xmt3, xmsgtmp4; \ | ||
64 | palignr $4, xmt2, xmsgtmp4; \ | ||
65 | paddd xmsgtmp4, xmt0; \ | ||
66 | sha256msg2 xmt3, xmt0; | ||
67 | /* Four rounds: add the constants at k256 + idx*16 to the schedule words in xmsg, then issue sha256rnds2 twice (two rounds each); pshufd $0x0e moves the upper two words of xmsg down for the second pair. Destroys xmsg. */|||
68 | #define sha256_shani_round(idx) \ | ||
69 | paddd (idx*16)(k256), xmsg; \ | ||
70 | sha256rnds2 xmsg, xhs0, xhs1; \ | ||
71 | pshufd $0x0e, xmsg, xmsg; \ | ||
72 | sha256rnds2 xmsg, xhs1, xhs0; | ||
73 | /* Four rounds whose schedule words come straight from the message: load group idx from m (copy kept in xmsgtmp), then run sha256_shani_round(idx). */|||
74 | #define sha256_shani_round_load(idx, m, xmsgtmp) \ | ||
75 | sha256_message_schedule_load(idx, m, xmsgtmp); \ | ||
76 | sha256_shani_round(idx); | ||
77 | /* Four rounds with derived schedule words: update xmt0 in place, copy it to xmsg (the round macro destroys xmsg), then run sha256_shani_round(idx). */|||
78 | #define sha256_shani_round_update(idx, xmt0, xmt1, xmt2, xmt3) \ | ||
79 | sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3); \ | ||
80 | movdqa xmt0, xmsg; \ | ||
81 | sha256_shani_round(idx); | ||
82 | |||
83 | .text | ||
84 | |||
85 | /* | ||
86 | * void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num); | ||
87 | * | ||
88 | * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num (number of 64 byte blocks) | ||
89 | */ | ||
90 | .align 16 | ||
91 | .globl sha256_block_shani | ||
92 | .type sha256_block_shani,@function | ||
93 | sha256_block_shani: | ||
94 | _CET_ENDBR | ||
95 | |||
96 | /* Save callee save registers (%rbx and %rbp are used for end and k256). */ | ||
97 | pushq %rbx | ||
98 | pushq %rbp | ||
99 | |||
100 | /* Compute end of message: end = in + num * 64. */ | ||
101 | shlq $6, num | ||
102 | leaq (in, num, 1), end | ||
103 | |||
104 | /* Address of SHA-256 constants. */ | ||
105 | leaq K256(%rip), k256 | ||
106 | |||
107 | /* Load endian shuffle mask (aligned load; shufmask is 16 byte aligned). */ | ||
108 | movdqa shufmask(%rip), xshufmask | ||
109 | |||
110 | /* Load current hash state from the first 32 bytes of the context. */ | ||
111 | movdqu (0*16)(ctx), xhs0 /* dcba */ | ||
112 | movdqu (1*16)(ctx), xhs1 /* hgfe */ | ||
113 | |||
114 | /* Rearrange words to construct abef/cdgh, the state layout sha256rnds2 operates on. */ | ||
115 | pshufd $0xb1, xhs0, xhs0 /* cdab */ | ||
116 | pshufd $0x1b, xhs1, xhs1 /* efgh */ | ||
117 | movdqa xhs0, xtmp0 | ||
118 | palignr $8, xhs1, xhs0 /* abef */ | ||
119 | pblendw $0xf0, xtmp0, xhs1 /* cdgh */ | ||
120 | /* Jump over any padding emitted by the loop alignment below. */|||
121 | jmp .Lshani_block_loop | ||
122 | |||
123 | .align 16 | ||
124 | .Lshani_block_loop: | ||
125 | /* Save the state at the start of this block; it is added back after the 64 rounds. */ | ||
126 | movdqa xhs0, xabef | ||
127 | movdqa xhs1, xcdgh | ||
128 | |||
129 | /* Rounds 0 through 15 (four rounds at a time), using the byte swapped message words directly. */ | ||
130 | sha256_shani_round_load(0, in, xmsgtmp0) | ||
131 | sha256_shani_round_load(1, in, xmsgtmp1) | ||
132 | sha256_shani_round_load(2, in, xmsgtmp2) | ||
133 | sha256_shani_round_load(3, in, xmsgtmp3) | ||
134 | |||
135 | /* Rounds 16 through 63 (four rounds at a time); the xmsgtmp0-3 arguments rotate so the oldest schedule words are overwritten. */ | ||
136 | sha256_shani_round_update(4, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3) | ||
137 | sha256_shani_round_update(5, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0) | ||
138 | sha256_shani_round_update(6, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1) | ||
139 | sha256_shani_round_update(7, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2) | ||
140 | |||
141 | sha256_shani_round_update(8, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3) | ||
142 | sha256_shani_round_update(9, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0) | ||
143 | sha256_shani_round_update(10, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1) | ||
144 | sha256_shani_round_update(11, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2) | ||
145 | |||
146 | sha256_shani_round_update(12, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3) | ||
147 | sha256_shani_round_update(13, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0) | ||
148 | sha256_shani_round_update(14, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1) | ||
149 | sha256_shani_round_update(15, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2) | ||
150 | |||
151 | /* Accumulate hash state: add the saved block start state to the round output. */ | ||
152 | paddd xabef, xhs0 | ||
153 | paddd xcdgh, xhs1 | ||
154 | /* Advance to the next block and loop while in < end (unsigned); the test is at the bottom, so one block is always processed. */|||
155 | addq $64, in | ||
156 | cmpq end, in | ||
157 | jb .Lshani_block_loop | ||
158 | |||
159 | /* Rearrange words to construct dcba/hgfe, the order in which the state was loaded from the context. */ | ||
160 | pshufd $0x1b, xhs0, xhs0 /* feba */ | ||
161 | pshufd $0xb1, xhs1, xhs1 /* dchg */ | ||
162 | movdqa xhs0, xtmp0 | ||
163 | pblendw $0xf0, xhs1, xhs0 /* dcba */ | ||
164 | palignr $8, xtmp0, xhs1 /* hgfe */ | ||
165 | |||
166 | /* Update stored hash context. */ | ||
167 | movdqu xhs0, (0*16)(ctx) | ||
168 | movdqu xhs1, (1*16)(ctx) | ||
169 | |||
170 | /* Restore callee save registers (reverse order of the pushes). */ | ||
171 | popq %rbp | ||
172 | popq %rbx | ||
173 | |||
174 | ret | ||
175 | |||
176 | .rodata | ||
177 | |||
178 | /* | ||
179 | * Shuffle mask for pshufb - reverses the bytes within each 32-bit word (little endian to big endian word conversion). | ||
180 | */ | ||
181 | .align 16 | ||
182 | .type shufmask,@object | ||
183 | shufmask: | ||
184 | .octa 0x0c0d0e0f08090a0b0405060700010203 | ||
185 | .size shufmask,.-shufmask | ||
186 | |||
187 | /* | ||
188 | * SHA-256 constants - see FIPS 180-4 section 4.2.2. Sixty-four 32-bit words, read four at a time (one row per sha256_shani_round index). | ||
189 | */ | ||
190 | .align 64 | ||
191 | .type K256,@object | ||
192 | K256: | ||
193 | .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 | ||
194 | .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 | ||
195 | .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 | ||
196 | .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 | ||
197 | .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc | ||
198 | .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da | ||
199 | .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 | ||
200 | .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 | ||
201 | .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 | ||
202 | .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 | ||
203 | .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 | ||
204 | .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 | ||
205 | .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 | ||
206 | .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 | ||
207 | .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 | ||
208 | .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | ||
209 | .size K256,.-K256 | ||