From 1c3ce6cc8e538cecc33ed58f89d969af28952dea Mon Sep 17 00:00:00 2001 From: jsing <> Date: Wed, 4 Dec 2024 13:13:33 +0000 Subject: Provide a replacement assembly implementation for SHA-1 on amd64. As already done for SHA-256 and SHA-512, replace the perlasm generated SHA-1 assembly implementation with one that is actually readable. Call the assembly implementation from a C wrapper that can, in the future, dispatch to alternate implementations. On a modern CPU the performance is around 5% faster than the base implementation generated by sha1-x86_64.pl, however it is around 15% slower than the excessively complex SSSE2/AVX version that is also generated by the same script (a SHA-NI version will greatly outperform this and is much cleaner/simpler). ok tb@ --- src/lib/libcrypto/arch/amd64/Makefile.inc | 5 +- src/lib/libcrypto/sha/sha1_amd64.c | 28 +++ src/lib/libcrypto/sha/sha1_amd64_generic.S | 314 +++++++++++++++++++++++++++++ 3 files changed, 345 insertions(+), 2 deletions(-) create mode 100644 src/lib/libcrypto/sha/sha1_amd64.c create mode 100644 src/lib/libcrypto/sha/sha1_amd64_generic.S (limited to 'src') diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc index fe22385633..33c7dbba26 100644 --- a/src/lib/libcrypto/arch/amd64/Makefile.inc +++ b/src/lib/libcrypto/arch/amd64/Makefile.inc @@ -1,4 +1,4 @@ -# $OpenBSD: Makefile.inc,v 1.33 2024/11/16 15:31:36 jsing Exp $ +# $OpenBSD: Makefile.inc,v 1.34 2024/12/04 13:13:33 jsing Exp $ # amd64-specific libcrypto build rules @@ -49,7 +49,8 @@ SSLASM+= rc4 rc4-x86_64 # ripemd # sha CFLAGS+= -DSHA1_ASM -SSLASM+= sha sha1-x86_64 +SRCS+= sha1_amd64.c +SRCS+= sha1_amd64_generic.S CFLAGS+= -DSHA256_ASM SRCS+= sha256_amd64.c SRCS+= sha256_amd64_generic.S diff --git a/src/lib/libcrypto/sha/sha1_amd64.c b/src/lib/libcrypto/sha/sha1_amd64.c new file mode 100644 index 0000000000..b3d4ab1263 --- /dev/null +++ b/src/lib/libcrypto/sha/sha1_amd64.c @@ -0,0 +1,28 @@ +/* $OpenBSD: sha1_amd64.c,v 1.1 2024/12/04 13:13:33 jsing Exp $ */ +/* + * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <openssl/sha.h> + +#include "crypto_arch.h" + +void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num); + +void +sha1_block_data_order(SHA_CTX *ctx, const void *in, size_t num) +{ + sha1_block_generic(ctx, in, num); +} diff --git a/src/lib/libcrypto/sha/sha1_amd64_generic.S b/src/lib/libcrypto/sha/sha1_amd64_generic.S new file mode 100644 index 0000000000..d3e184dbca --- /dev/null +++ b/src/lib/libcrypto/sha/sha1_amd64_generic.S @@ -0,0 +1,314 @@ +/* $OpenBSD: sha1_amd64_generic.S,v 1.1 2024/12/04 13:13:33 jsing Exp $ */ +/* + * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifdef __CET__ +#include <cet.h> +#else +#define _CET_ENDBR +#endif + +#define ctx %rdi +#define in %rsi +#define num %rdx + +#define end %rbp + +#define hs0 %r8d +#define hs1 %r9d +#define hs2 %r10d +#define hs3 %r11d +#define hs4 %r12d + +#define tmp0 %eax +#define tmp1 %ebx +#define tmp2 %ecx +#define tmp3 %edx + +/* + * Load message into wt, storing a copy in the message schedule: + * + * Wt = Mt + */ +#define sha1_message_schedule_load(idx, m, w, wt) \ + movl ((idx&0xf)*4)(m), wt; \ + bswapl wt; \ + movl wt, ((idx&0xf)*4)(w); + +/* + * Update message schedule and return current value in wt: + * + * W0 = rol(W13 ^ W8 ^ W2 ^ W0, 1) + */ +#define sha1_message_schedule_update(idx, w, wt) \ + movl (((idx-3)&0xf)*4)(w), wt; /* W13 */ \ + xorl (((idx-8)&0xf)*4)(w), wt; /* W8 */ \ + xorl (((idx-14)&0xf)*4)(w), wt; /* W2 */ \ + xorl (((idx)&0xf)*4)(w), wt; /* W0 */ \ + roll $1, wt; \ + \ + movl wt, ((idx&0xf)*4)(w); + +/* + * Compute a SHA-1 round without logic function: + * + * T = rol(a, 5) + e + Kt + Wt + * + * The caller is required to compute the appropriate logic function + * (Ch, Maj, Parity) and add it to e. + * + * Upon completion b = rol(b, 30), e = T, pending rotation. + */ +#define sha1_round(a, b, c, d, e, kt, wt) \ + leal kt(wt, e, 1), e; /* Kt + Wt */ \ + \ + movl a, tmp1; /* rol(a, 5) */ \ + roll $5, tmp1; \ + addl tmp1, e; \ + \ + roll $30, b; /* rol(b, 30) */ + +/* + * Compute a SHA-1 round with Ch: + * + * T = rol(a, 5) + Ch(b, c, d) + e + Kt + Wt + * + * Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z + * + * Upon completion b = rol(b, 30), e = T, pending rotation. + */ +#define sha1_round_ch(a, b, c, d, e, kt, wt) \ + movl c, tmp2; /* Ch */ \ + xorl d, tmp2; /* Ch */ \ + andl b, tmp2; /* Ch */ \ + xorl d, tmp2; /* Ch */ \ + addl tmp2, e; /* Ch */ \ + \ + sha1_round(a, b, c, d, e, kt, wt); + +/* + * Compute a SHA-1 round with Parity: + * + * T = rol(a, 5) + Parity(b, c, d) + e + Kt + Wt + * + * Parity(x, y, z) = x ^ y ^ z + * + * Upon completion b = rol(b, 30), e = T, pending rotation. + */ +#define sha1_round_parity(a, b, c, d, e, kt, wt) \ + movl b, tmp2; /* Parity */ \ + xorl c, tmp2; /* Parity */ \ + xorl d, tmp2; /* Parity */ \ + addl tmp2, e; /* Parity */ \ + \ + sha1_round(a, b, c, d, e, kt, wt); + +/* + * Compute a SHA-1 round with Maj: + * + * T = rol(a, 5) + Maj(b, c, d) + e + Kt + Wt + * + * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z) + * + * Upon completion b = rol(b, 30), e = T, pending rotation. + */ +#define sha1_round_maj(a, b, c, d, e, kt, wt) \ + movl c, tmp2; /* Maj */ \ + xorl d, tmp2; /* Maj */ \ + andl b, tmp2; /* Maj */ \ + movl c, tmp3; /* Maj */ \ + andl d, tmp3; /* Maj */ \ + xorl tmp2, tmp3; /* Maj */ \ + addl tmp3, e; /* Maj */ \ + \ + sha1_round(a, b, c, d, e, kt, wt); + +#define sha1_round1_load(idx, a, b, c, d, e) \ + sha1_message_schedule_load(idx, in, %rsp, tmp0) \ + sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0) + +#define sha1_round1_update(idx, a, b, c, d, e) \ + sha1_message_schedule_update(idx, %rsp, tmp0) \ + sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0) + +#define sha1_round2_update(idx, a, b, c, d, e) \ + sha1_message_schedule_update(idx, %rsp, tmp0) \ + sha1_round_parity(a, b, c, d, e, 0x6ed9eba1, tmp0) + +#define sha1_round3_update(idx, a, b, c, d, e) \ + sha1_message_schedule_update(idx, %rsp, tmp0) \ + sha1_round_maj(a, b, c, d, e, 0x8f1bbcdc, tmp0) + +#define sha1_round4_update(idx, a, b, c, d, e) \ + sha1_message_schedule_update(idx, %rsp, tmp0) \ + sha1_round_parity(a, b, c, d, e, 0xca62c1d6, tmp0) + +.text + +/* + * void sha1_block_generic(SHA1_CTX *ctx, const void *in, size_t num); + * + * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num + */ +.align 16 +.globl sha1_block_generic +.type sha1_block_generic,@function +sha1_block_generic: + _CET_ENDBR + + /* Save callee save registers. */ + pushq %rbx + pushq %rbp + pushq %r12 + + /* Allocate space for message schedule. */ + movq %rsp, %rax + subq $(64+1*8), %rsp + andq $~63, %rsp + movq %rax, (64+0*8)(%rsp) + + /* Compute and store end of message. */ + shlq $6, num + leaq (in, num, 1), %rbp + + /* Load current hash state from context. */ + movl (0*4)(ctx), hs0 + movl (1*4)(ctx), hs1 + movl (2*4)(ctx), hs2 + movl (3*4)(ctx), hs3 + movl (4*4)(ctx), hs4 + + jmp .Lblock_loop + +.align 16 +.Lblock_loop: + + /* Round 0 through 15. */ + sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4) + sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3) + sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2) + sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1) + sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0) + sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4) + sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3) + sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2) + sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1) + sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0) + sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4) + sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3) + sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2) + sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1) + sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0) + sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4) + + /* Round 16 through 31. */ + sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3); + sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2); + sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1); + sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0); + sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4); + sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3); + sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2); + sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1); + sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0); + sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4); + sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3); + sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2); + sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1); + sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0); + sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4); + sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3); + + /* Round 32 through 47. */ + sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2); + sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1); + sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0); + sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4); + sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3); + sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2); + sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1); + sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0); + sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4); + sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3); + sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2); + sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1); + sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0); + sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4); + sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3); + sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2); + + /* Round 48 through 63. */ + sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1); + sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0); + sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4); + sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3); + sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2); + sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1); + sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0); + sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4); + sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3); + sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2); + sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1); + sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0); + sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4); + sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3); + sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2); + sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1); + + /* Round 64 through 79. */ + sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0); + sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4); + sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3); + sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2); + sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1); + sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0); + sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4); + sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3); + sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2); + sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1); + sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0); + sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4); + sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3); + sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2); + sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1); + sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0); + + /* Add intermediate state to hash state. */ + addl (0*4)(ctx), hs0 + addl (1*4)(ctx), hs1 + addl (2*4)(ctx), hs2 + addl (3*4)(ctx), hs3 + addl (4*4)(ctx), hs4 + + /* Store new hash state to context. */ + movl hs0, (0*4)(ctx) + movl hs1, (1*4)(ctx) + movl hs2, (2*4)(ctx) + movl hs3, (3*4)(ctx) + movl hs4, (4*4)(ctx) + + addq $64, in + cmpq end, in + jb .Lblock_loop + + movq (64+0*8)(%rsp), %rsp + + /* Restore callee save registers. */ + popq %r12 + popq %rbp + popq %rbx + + ret -- cgit v1.2.3-55-g6feb