/* $OpenBSD: sha1_aarch64_ce.S,v 1.1 2025/06/28 12:51:08 jsing Exp $ */
/*
 * Copyright (c) 2023,2025 Joel Sing
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * SHA-1 implementation using the ARM Cryptographic Extension (CE).
 *
 * There are six instructions for hardware acceleration of SHA-1 - the
 * documentation for these instructions is woefully inadequate:
 *
 *  sha1c:	hash update (choose)
 *  sha1h:	fixed rotate
 *  sha1m:	hash update (majority)
 *  sha1p:	hash update (parity)
 *  sha1su0:	message schedule update with sigma0 for four rounds
 *  sha1su1:	message schedule update with sigma1 for four rounds
 */

/* Function arguments (see the prototype above sha1_block_ce). */
#define ctx		x0
#define in		x1
#define num		x2

/* Note: the lower 64 bits of v8 through v15 are callee saved. */

/* Current hash state carried across blocks: hc0 = a:b:c:d, hc1 = e. */
#define hc0		v16
#define hc1		v17
#define hc1s		s17

/* Working copy of the hash state, updated by the rounds of one block. */
#define hs0		v18
#define hs1		v19
#define hs1s		s19

/* Message schedule, four 32 bit words per register. */
#define w0		v20
#define w1		v21
#define w2		v22
#define w3		v23

/* Round constants, each replicated across all four lanes. */
#define k0		v24
#define k1		v25
#define k2		v26
#define k3		v27

/* Scratch: tmp0 holds Wt + Kt, tmp1 holds the pre-update value of a. */
#define tmp0		v28
#define tmp1		s29

/* General purpose scratch used to build the round constants. */
#define tmp2		w11

/*
 * Update message schedule for m0 (W0:W1:W2:W3), using m1 (W4:W5:W6:W7),
 * m2 (W8:W9:W10:W11) and m3 (W12:W13:W14:W15). The sha1su0 instruction
 * computes W0 = W8 ^ W2 ^ W0, while sha1su1 computes rol(W0 ^ W13, 1).
 */
#define sha1_message_schedule_update(m0, m1, m2, m3) \
	sha1su0	m0.4s, m1.4s, m2.4s;					\
	sha1su1	m0.4s, m3.4s;

/*
 * Compute four SHA-1 rounds by adding W0:W1:W2:W3 + K0:K1:K2:K3, then
 * computing the remainder of each round (including the shuffle) via
 * sha1{c,p,m}/sha1h.
 *
 * In each macro the first lane of h0 (a) is saved in tmp1 before the hash
 * update instruction overwrites h0, so that sha1h can apply its fixed rotate
 * to the old value and place the result in h1 (e).
 *
 * The four variants differ only in the hash update instruction:
 * round1 uses sha1c (choose), round2 and round4 use sha1p (parity) and
 * round3 uses sha1m (majority).
 *
 * Clobbers tmp0 and tmp1.
 */

#define sha1_round1(h0, h1, w, k) \
	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
	mov	tmp1, h0.s[0];						\
	sha1c	h0, h1, tmp0.4s;					\
	sha1h	h1, tmp1;

#define sha1_round2(h0, h1, w, k) \
	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
	mov	tmp1, h0.s[0];						\
	sha1p	h0, h1, tmp0.4s;					\
	sha1h	h1, tmp1;

#define sha1_round3(h0, h1, w, k) \
	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
	mov	tmp1, h0.s[0];						\
	sha1m	h0, h1, tmp0.4s;					\
	sha1h	h1, tmp1;

#define sha1_round4(h0, h1, w, k) \
	add	tmp0.4s, w.4s, k.4s;		/* Tt = Wt + Kt */	\
	mov	tmp1, h0.s[0];						\
	sha1p	h0, h1, tmp0.4s;					\
	sha1h	h1, tmp1;

.arch	armv8-a+sha2

.text

/*
 * void sha1_block_ce(SHA_CTX *ctx, const void *in, size_t num);
 *
 * NOTE(review): this comment previously named the context type SHA256_CTX,
 * which looks like a copy/paste from the SHA-256 code - confirm the type
 * name against the C prototype.
 *
 * Standard ARM ABI: x0 = ctx, x1 = in, x2 = num
 *
 * Processes num consecutive 64 byte blocks starting at in. The hash state is
 * read from, and written back to, the first five 32 bit words of ctx
 * (a:b:c:d at offset 0, e at offset 16). If num is zero the function
 * returns without touching ctx or reading from in.
 *
 * On return in has been advanced past the consumed blocks and num is zero.
 *
 * Clobbers: x1, x2, w11 (x11), v16 through v29. No stack is used and no
 * callee saved registers are touched.
 */
.globl	sha1_block_ce
.type	sha1_block_ce,@function
sha1_block_ce:

	/*
	 * Nothing to do for zero blocks. Without this check the loop below
	 * would process one block, decrement num from 0 to 2^64 - 1 and keep
	 * running well past the end of the input.
	 */
	cbz	num, .Ldone

	/*
	 * Load SHA-1 round constants.
	 */

	/* Round 1 - 0x5a827999 */
	movz	tmp2, #0x5a82, lsl #16
	movk	tmp2, #0x7999
	dup	k0.4s, tmp2

	/* Round 2 - 0x6ed9eba1 */
	movz	tmp2, #0x6ed9, lsl #16
	movk	tmp2, #0xeba1
	dup	k1.4s, tmp2

	/* Round 3 - 0x8f1bbcdc */
	movz	tmp2, #0x8f1b, lsl #16
	movk	tmp2, #0xbcdc
	dup	k2.4s, tmp2

	/* Round 4 - 0xca62c1d6 */
	movz	tmp2, #0xca62, lsl #16
	movk	tmp2, #0xc1d6
	dup	k3.4s, tmp2

	/* Load current hash state from context (hc0 = a:b:c:d, hc1 = e). */
	ld1	{hc0.4s}, [ctx]
	ldr	hc1s, [ctx, #(4*4)]

	/* Invariant at the top of the loop: num = blocks remaining, > 0. */
.Lblock_loop:

	/* Copy current hash state. */
	mov	hs0.4s, hc0.4s
	mov	hs1s, hc1.s[0]

	/* Load and byte swap message schedule (in advances by 64 bytes). */
	ld1	{w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
	rev32	w0.16b, w0.16b
	rev32	w1.16b, w1.16b
	rev32	w2.16b, w2.16b
	rev32	w3.16b, w3.16b

	/* Rounds 0 through 15 (four rounds at a time). */
	sha1_round1(hs0, hs1s, w0, k0)
	sha1_round1(hs0, hs1s, w1, k0)
	sha1_round1(hs0, hs1s, w2, k0)
	sha1_round1(hs0, hs1s, w3, k0)

	/* Rounds 16 through 31 (four rounds at a time). */
	sha1_message_schedule_update(w0, w1, w2, w3)
	sha1_message_schedule_update(w1, w2, w3, w0)
	sha1_message_schedule_update(w2, w3, w0, w1)
	sha1_message_schedule_update(w3, w0, w1, w2)

	sha1_round1(hs0, hs1s, w0, k0)
	sha1_round2(hs0, hs1s, w1, k1)
	sha1_round2(hs0, hs1s, w2, k1)
	sha1_round2(hs0, hs1s, w3, k1)

	/* Rounds 32 through 47 (four rounds at a time). */
	sha1_message_schedule_update(w0, w1, w2, w3)
	sha1_message_schedule_update(w1, w2, w3, w0)
	sha1_message_schedule_update(w2, w3, w0, w1)
	sha1_message_schedule_update(w3, w0, w1, w2)

	sha1_round2(hs0, hs1s, w0, k1)
	sha1_round2(hs0, hs1s, w1, k1)
	sha1_round3(hs0, hs1s, w2, k2)
	sha1_round3(hs0, hs1s, w3, k2)

	/* Rounds 48 through 63 (four rounds at a time). */
	sha1_message_schedule_update(w0, w1, w2, w3)
	sha1_message_schedule_update(w1, w2, w3, w0)
	sha1_message_schedule_update(w2, w3, w0, w1)
	sha1_message_schedule_update(w3, w0, w1, w2)

	sha1_round3(hs0, hs1s, w0, k2)
	sha1_round3(hs0, hs1s, w1, k2)
	sha1_round3(hs0, hs1s, w2, k2)
	sha1_round4(hs0, hs1s, w3, k3)

	/* Rounds 64 through 79 (four rounds at a time). */
	sha1_message_schedule_update(w0, w1, w2, w3)
	sha1_message_schedule_update(w1, w2, w3, w0)
	sha1_message_schedule_update(w2, w3, w0, w1)
	sha1_message_schedule_update(w3, w0, w1, w2)

	sha1_round4(hs0, hs1s, w0, k3)
	sha1_round4(hs0, hs1s, w1, k3)
	sha1_round4(hs0, hs1s, w2, k3)
	sha1_round4(hs0, hs1s, w3, k3)

	/* Add intermediate state to hash state. */
	add	hc0.4s, hc0.4s, hs0.4s
	add	hc1.4s, hc1.4s, hs1.4s

	sub	num, num, #1
	cbnz	num, .Lblock_loop

	/* Store hash state to context. */
	st1	{hc0.4s}, [ctx]
	str	hc1s, [ctx, #(4*4)]

.Ldone:
	ret
.size	sha1_block_ce, .-sha1_block_ce