diff options
| author | jsing <> | 2024-11-16 15:31:36 +0000 |
|---|---|---|
| committer | jsing <> | 2024-11-16 15:31:36 +0000 |
| commit | a06b7340f2af374d4581bec7db2775ae686ce1ab (patch) | |
| tree | ca867c18349e0ee9b5d1d2d76bade5e031c5ad2a /src/lib/libc | |
| parent | 9c7148f8d90423eeba4266d426611516acab5bc3 (diff) | |
| download | openbsd-a06b7340f2af374d4581bec7db2775ae686ce1ab.tar.gz openbsd-a06b7340f2af374d4581bec7db2775ae686ce1ab.tar.bz2 openbsd-a06b7340f2af374d4581bec7db2775ae686ce1ab.zip | |
Provide a SHA-256 assembly implementation for amd64 using SHA-NI.

This provides a SHA-256 assembly implementation for amd64, which uses
the Intel SHA Extensions (aka SHA New Instructions or SHA-NI). This
provides a 3-5x performance gain on some Intel CPUs and many AMD CPUs.

ok tb@
Diffstat (limited to '')
| -rw-r--r-- | src/lib/libcrypto/arch/amd64/Makefile.inc | 3 | ||||
| -rw-r--r-- | src/lib/libcrypto/sha/sha256_amd64.c | 10 | ||||
| -rw-r--r-- | src/lib/libcrypto/sha/sha256_amd64_shani.S | 209 |
3 files changed, 220 insertions, 2 deletions
diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc index 9ba5634f87..fe22385633 100644 --- a/src/lib/libcrypto/arch/amd64/Makefile.inc +++ b/src/lib/libcrypto/arch/amd64/Makefile.inc | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | # $OpenBSD: Makefile.inc,v 1.32 2024/11/16 14:56:39 jsing Exp $ | 1 | # $OpenBSD: Makefile.inc,v 1.33 2024/11/16 15:31:36 jsing Exp $ |
| 2 | 2 | ||
| 3 | # amd64-specific libcrypto build rules | 3 | # amd64-specific libcrypto build rules |
| 4 | 4 | ||
| @@ -53,6 +53,7 @@ SSLASM+= sha sha1-x86_64 | |||
| 53 | CFLAGS+= -DSHA256_ASM | 53 | CFLAGS+= -DSHA256_ASM |
| 54 | SRCS+= sha256_amd64.c | 54 | SRCS+= sha256_amd64.c |
| 55 | SRCS+= sha256_amd64_generic.S | 55 | SRCS+= sha256_amd64_generic.S |
| 56 | SRCS+= sha256_amd64_shani.S | ||
| 56 | CFLAGS+= -DSHA512_ASM | 57 | CFLAGS+= -DSHA512_ASM |
| 57 | SRCS+= sha512_amd64.c | 58 | SRCS+= sha512_amd64.c |
| 58 | SRCS+= sha512_amd64_generic.S | 59 | SRCS+= sha512_amd64_generic.S |
diff --git a/src/lib/libcrypto/sha/sha256_amd64.c b/src/lib/libcrypto/sha/sha256_amd64.c index f7531b340f..6c5d3e897f 100644 --- a/src/lib/libcrypto/sha/sha256_amd64.c +++ b/src/lib/libcrypto/sha/sha256_amd64.c | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | /* $OpenBSD: sha256_amd64.c,v 1.1 2024/11/08 15:09:48 jsing Exp $ */ | 1 | /* $OpenBSD: sha256_amd64.c,v 1.2 2024/11/16 15:31:36 jsing Exp $ */ |
| 2 | /* | 2 | /* |
| 3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> | 3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> |
| 4 | * | 4 | * |
| @@ -17,10 +17,18 @@ | |||
| 17 | 17 | ||
| 18 | #include <openssl/sha.h> | 18 | #include <openssl/sha.h> |
| 19 | 19 | ||
| 20 | #include "crypto_arch.h" | ||
| 21 | |||
| 20 | void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num); | 22 | void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num); |
| 23 | void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num); | ||
| 21 | 24 | ||
| 22 | void | 25 | void |
| 23 | sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num) | 26 | sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num) |
| 24 | { | 27 | { |
| 28 | if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_SHA) != 0) { | ||
| 29 | sha256_block_shani(ctx, in, num); | ||
| 30 | return; | ||
| 31 | } | ||
| 32 | |||
| 25 | sha256_block_generic(ctx, in, num); | 33 | sha256_block_generic(ctx, in, num); |
| 26 | } | 34 | } |
diff --git a/src/lib/libcrypto/sha/sha256_amd64_shani.S b/src/lib/libcrypto/sha/sha256_amd64_shani.S new file mode 100644 index 0000000000..df3a796b45 --- /dev/null +++ b/src/lib/libcrypto/sha/sha256_amd64_shani.S | |||
| @@ -0,0 +1,209 @@ | |||
| 1 | /* $OpenBSD: sha256_amd64_shani.S,v 1.1 2024/11/16 15:31:36 jsing Exp $ */ | ||
| 2 | /* | ||
| 3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> | ||
| 4 | * | ||
| 5 | * Permission to use, copy, modify, and distribute this software for any | ||
| 6 | * purpose with or without fee is hereby granted, provided that the above | ||
| 7 | * copyright notice and this permission notice appear in all copies. | ||
| 8 | * | ||
| 9 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
| 10 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
| 11 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
| 12 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
| 13 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
| 14 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
| 15 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
| 16 | */ | ||
| 17 | |||
| 18 | #ifdef __CET__ | ||
| 19 | #include <cet.h> | ||
| 20 | #else | ||
| 21 | #define _CET_ENDBR | ||
| 22 | #endif | ||
| 23 | |||
| 24 | /* | ||
| 25 | * SHA-256 implementation using the Intel SHA extensions: | ||
| 26 | * | ||
| 27 | * https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html | ||
| 28 | */ | ||
| 29 | |||
| 30 | #define ctx %rdi | ||
| 31 | #define in %rsi | ||
| 32 | #define num %rdx | ||
| 33 | |||
| 34 | #define end %rbx | ||
| 35 | |||
| 36 | #define k256 %rbp | ||
| 37 | |||
| 38 | #define xmsg %xmm0 | ||
| 39 | |||
| 40 | #define xhs0 %xmm1 | ||
| 41 | #define xhs1 %xmm2 | ||
| 42 | |||
| 43 | #define xabef %xmm3 | ||
| 44 | #define xcdgh %xmm4 | ||
| 45 | |||
| 46 | #define xmsgtmp0 %xmm6 | ||
| 47 | #define xmsgtmp1 %xmm7 | ||
| 48 | #define xmsgtmp2 %xmm8 | ||
| 49 | #define xmsgtmp3 %xmm9 | ||
| 50 | #define xmsgtmp4 %xmm10 | ||
| 51 | |||
| 52 | #define xshufmask %xmm11 | ||
| 53 | |||
| 54 | #define xtmp0 %xmm12 | ||
| 55 | |||
| 56 | #define sha256_message_schedule_load(idx, m, xmsgtmp) \ | ||
| 57 | movdqu (idx*16)(m), xmsg; \ | ||
| 58 | pshufb xshufmask, xmsg; \ | ||
| 59 | movdqa xmsg, xmsgtmp; | ||
| 60 | |||
| 61 | #define sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3) \ | ||
| 62 | sha256msg1 xmt1, xmt0; \ | ||
| 63 | movdqa xmt3, xmsgtmp4; \ | ||
| 64 | palignr $4, xmt2, xmsgtmp4; \ | ||
| 65 | paddd xmsgtmp4, xmt0; \ | ||
| 66 | sha256msg2 xmt3, xmt0; | ||
| 67 | |||
| 68 | #define sha256_shani_round(idx) \ | ||
| 69 | paddd (idx*16)(k256), xmsg; \ | ||
| 70 | sha256rnds2 xmsg, xhs0, xhs1; \ | ||
| 71 | pshufd $0x0e, xmsg, xmsg; \ | ||
| 72 | sha256rnds2 xmsg, xhs1, xhs0; | ||
| 73 | |||
| 74 | #define sha256_shani_round_load(idx, m, xmsgtmp) \ | ||
| 75 | sha256_message_schedule_load(idx, m, xmsgtmp); \ | ||
| 76 | sha256_shani_round(idx); | ||
| 77 | |||
| 78 | #define sha256_shani_round_update(idx, xmt0, xmt1, xmt2, xmt3) \ | ||
| 79 | sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3); \ | ||
| 80 | movdqa xmt0, xmsg; \ | ||
| 81 | sha256_shani_round(idx); | ||
| 82 | |||
| 83 | .text | ||
| 84 | |||
| 85 | /* | ||
| 86 | * void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num); | ||
| 87 | * | ||
| 88 | * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num | ||
| 89 | */ | ||
| 90 | .align 16 | ||
| 91 | .globl sha256_block_shani | ||
| 92 | .type sha256_block_shani,@function | ||
| 93 | sha256_block_shani: | ||
| 94 | _CET_ENDBR | ||
| 95 | |||
| 96 | /* Save callee save registers. */ | ||
| 97 | pushq %rbx | ||
| 98 | pushq %rbp | ||
| 99 | |||
| 100 | /* Compute end of message. */ | ||
| 101 | shlq $6, num | ||
| 102 | leaq (in, num, 1), end | ||
| 103 | |||
| 104 | /* Address of SHA-256 constants. */ | ||
| 105 | leaq K256(%rip), k256 | ||
| 106 | |||
| 107 | /* Load endian shuffle mask. */ | ||
| 108 | movdqa shufmask(%rip), xshufmask | ||
| 109 | |||
| 110 | /* Load current hash state from context. */ | ||
| 111 | movdqu (0*16)(ctx), xhs0 /* dcba */ | ||
| 112 | movdqu (1*16)(ctx), xhs1 /* hgfe */ | ||
| 113 | |||
| 114 | /* Rearrange words to construct abef/cdgh. */ | ||
| 115 | pshufd $0xb1, xhs0, xhs0 /* cdab */ | ||
| 116 | pshufd $0x1b, xhs1, xhs1 /* efgh */ | ||
| 117 | movdqa xhs0, xtmp0 | ||
| 118 | palignr $8, xhs1, xhs0 /* abef */ | ||
| 119 | pblendw $0xf0, xtmp0, xhs1 /* cdgh */ | ||
| 120 | |||
| 121 | jmp .Lshani_block_loop | ||
| 122 | |||
| 123 | .align 16 | ||
| 124 | .Lshani_block_loop: | ||
| 125 | /* Save state for accumulation. */ | ||
| 126 | movdqa xhs0, xabef | ||
| 127 | movdqa xhs1, xcdgh | ||
| 128 | |||
| 129 | /* Rounds 0 through 15 (four rounds at a time). */ | ||
| 130 | sha256_shani_round_load(0, in, xmsgtmp0) | ||
| 131 | sha256_shani_round_load(1, in, xmsgtmp1) | ||
| 132 | sha256_shani_round_load(2, in, xmsgtmp2) | ||
| 133 | sha256_shani_round_load(3, in, xmsgtmp3) | ||
| 134 | |||
| 135 | /* Rounds 16 through 63 (four rounds at a time). */ | ||
| 136 | sha256_shani_round_update(4, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3) | ||
| 137 | sha256_shani_round_update(5, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0) | ||
| 138 | sha256_shani_round_update(6, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1) | ||
| 139 | sha256_shani_round_update(7, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2) | ||
| 140 | |||
| 141 | sha256_shani_round_update(8, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3) | ||
| 142 | sha256_shani_round_update(9, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0) | ||
| 143 | sha256_shani_round_update(10, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1) | ||
| 144 | sha256_shani_round_update(11, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2) | ||
| 145 | |||
| 146 | sha256_shani_round_update(12, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3) | ||
| 147 | sha256_shani_round_update(13, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0) | ||
| 148 | sha256_shani_round_update(14, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1) | ||
| 149 | sha256_shani_round_update(15, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2) | ||
| 150 | |||
| 151 | /* Accumulate hash state. */ | ||
| 152 | paddd xabef, xhs0 | ||
| 153 | paddd xcdgh, xhs1 | ||
| 154 | |||
| 155 | addq $64, in | ||
| 156 | cmpq end, in | ||
| 157 | jb .Lshani_block_loop | ||
| 158 | |||
| 159 | /* Rearrange words to construct dcba/hgfe. */ | ||
| 160 | pshufd $0x1b, xhs0, xhs0 /* feba */ | ||
| 161 | pshufd $0xb1, xhs1, xhs1 /* dchg */ | ||
| 162 | movdqa xhs0, xtmp0 | ||
| 163 | pblendw $0xf0, xhs1, xhs0 /* dcba */ | ||
| 164 | palignr $8, xtmp0, xhs1 /* hgfe */ | ||
| 165 | |||
| 166 | /* Update stored hash context. */ | ||
| 167 | movdqu xhs0, (0*16)(ctx) | ||
| 168 | movdqu xhs1, (1*16)(ctx) | ||
| 169 | |||
| 170 | /* Restore callee save registers. */ | ||
| 171 | popq %rbp | ||
| 172 | popq %rbx | ||
| 173 | |||
| 174 | ret | ||
| 175 | |||
| 176 | .rodata | ||
| 177 | |||
| 178 | /* | ||
| 179 | * Shuffle mask - little endian to big endian word conversion. | ||
| 180 | */ | ||
| 181 | .align 16 | ||
| 182 | .type shufmask,@object | ||
| 183 | shufmask: | ||
| 184 | .octa 0x0c0d0e0f08090a0b0405060700010203 | ||
| 185 | .size shufmask,.-shufmask | ||
| 186 | |||
| 187 | /* | ||
| 188 | * SHA-256 constants - see FIPS 180-4 section 4.2.2. | ||
| 189 | */ | ||
| 190 | .align 64 | ||
| 191 | .type K256,@object | ||
| 192 | K256: | ||
| 193 | .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 | ||
| 194 | .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 | ||
| 195 | .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 | ||
| 196 | .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 | ||
| 197 | .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc | ||
| 198 | .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da | ||
| 199 | .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 | ||
| 200 | .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 | ||
| 201 | .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 | ||
| 202 | .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 | ||
| 203 | .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 | ||
| 204 | .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 | ||
| 205 | .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 | ||
| 206 | .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 | ||
| 207 | .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 | ||
| 208 | .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | ||
| 209 | .size K256,.-K256 | ||
