From c6ef8fc8d3285a55c002f52df2bb2df42b7734c0 Mon Sep 17 00:00:00 2001
From: jsing <>
Date: Sun, 25 Jan 2026 08:22:17 +0000
Subject: Make SHA aarch64 assembly build with gcc.

gcc is extremely fussy about register naming and insists on q and s
naming for the ARM CE SHA instructions, even though they're referring to
the same register (while LLVM just figures it out). Work around this by
mapping registers to their required variant at usage and defining a
handful of mappings between v registers and alternate names/views.

This is still somewhat ugly, but seems to be one of the cleaner options
that will allow portable to enable SHA assembly on platforms that use
gcc.

ok kenjiro@ tb@
---
 src/lib/libcrypto/sha/sha1_aarch64_ce.S   | 61 ++++++++++++++++----------
 src/lib/libcrypto/sha/sha256_aarch64_ce.S | 27 +++++++++-----
 src/lib/libcrypto/sha/sha512_aarch64_ce.S | 39 ++++++++++++--------
 3 files changed, 72 insertions(+), 55 deletions(-)

diff --git a/src/lib/libcrypto/sha/sha1_aarch64_ce.S b/src/lib/libcrypto/sha/sha1_aarch64_ce.S
index 853d467641..641500a1e5 100644
--- a/src/lib/libcrypto/sha/sha1_aarch64_ce.S
+++ b/src/lib/libcrypto/sha/sha1_aarch64_ce.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: sha1_aarch64_ce.S,v 1.4 2026/01/24 14:20:52 jsing Exp $ */
+/* $OpenBSD: sha1_aarch64_ce.S,v 1.5 2026/01/25 08:22:17 jsing Exp $ */
 /*
  * Copyright (c) 2023,2025 Joel Sing
  *
@@ -40,6 +40,7 @@
 #define hc1s	s17
 
 #define hs0	v18
+#define hs0q	q18
 #define hs1	v19
 #define hs1s	s19
 
@@ -76,26 +77,26 @@
 #define sha1_round1(h0, h1, w, k) \
 	add	tmp0.4s, w.4s, k.4s;	/* Tt = Wt + Kt */ \
 	mov	tmp1, h0.s[0]; \
-	sha1c	h0, h1, tmp0.4s; \
-	sha1h	h1, tmp1
+	sha1c	h0##q, h1##s, tmp0.4s; \
+	sha1h	h1##s, tmp1
 
 #define sha1_round2(h0, h1, w, k) \
 	add	tmp0.4s, w.4s, k.4s;	/* Tt = Wt + Kt */ \
 	mov	tmp1, h0.s[0]; \
-	sha1p	h0, h1, tmp0.4s; \
-	sha1h	h1, tmp1
+	sha1p	h0##q, h1##s, tmp0.4s; \
+	sha1h	h1##s, tmp1
 
 #define sha1_round3(h0, h1, w, k) \
 	add	tmp0.4s, w.4s, k.4s;	/* Tt = Wt + Kt */ \
 	mov	tmp1, h0.s[0]; \
-	sha1m	h0, h1, tmp0.4s; \
-	sha1h	h1, tmp1
+	sha1m	h0##q, h1##s, tmp0.4s; \
+	sha1h	h1##s, tmp1
 
 #define sha1_round4(h0, h1, w, k) \
 	add	tmp0.4s, w.4s, k.4s;	/* Tt = Wt + Kt */ \
 	mov	tmp1, h0.s[0]; \
-	sha1p	h0, h1, tmp0.4s; \
-	sha1h	h1, tmp1
+	sha1p	h0##q, h1##s, tmp0.4s; \
+	sha1h	h1##s, tmp1
 
 .arch	armv8-a+sha2
 
@@ -140,7 +141,7 @@ sha1_block_ce:
 
 .Lblock_loop:
 	/* Copy current hash state. */
-	mov	hs0.4s, hc0.4s
+	mov	hs0.16b, hc0.16b
 	mov	hs1s, hc1.s[0]
 
 	/* Load and byte swap message schedule. */
@@ -151,10 +152,10 @@ sha1_block_ce:
 	rev32	w3.16b, w3.16b
 
 	/* Rounds 0 through 15 (four rounds at a time). */
-	sha1_round1(hs0, hs1s, w0, k0)
-	sha1_round1(hs0, hs1s, w1, k0)
-	sha1_round1(hs0, hs1s, w2, k0)
-	sha1_round1(hs0, hs1s, w3, k0)
+	sha1_round1(hs0, hs1, w0, k0)
+	sha1_round1(hs0, hs1, w1, k0)
+	sha1_round1(hs0, hs1, w2, k0)
+	sha1_round1(hs0, hs1, w3, k0)
 
 	/* Rounds 16 through 31 (four rounds at a time). */
 	sha1_message_schedule_update(w0, w1, w2, w3)
@@ -162,10 +163,10 @@ sha1_block_ce:
 	sha1_message_schedule_update(w2, w3, w0, w1)
 	sha1_message_schedule_update(w3, w0, w1, w2)
 
-	sha1_round1(hs0, hs1s, w0, k0)
-	sha1_round2(hs0, hs1s, w1, k1)
-	sha1_round2(hs0, hs1s, w2, k1)
-	sha1_round2(hs0, hs1s, w3, k1)
+	sha1_round1(hs0, hs1, w0, k0)
+	sha1_round2(hs0, hs1, w1, k1)
+	sha1_round2(hs0, hs1, w2, k1)
+	sha1_round2(hs0, hs1, w3, k1)
 
 	/* Rounds 32 through 47 (four rounds at a time). */
 	sha1_message_schedule_update(w0, w1, w2, w3)
@@ -173,10 +174,10 @@ sha1_block_ce:
 	sha1_message_schedule_update(w2, w3, w0, w1)
 	sha1_message_schedule_update(w3, w0, w1, w2)
 
-	sha1_round2(hs0, hs1s, w0, k1)
-	sha1_round2(hs0, hs1s, w1, k1)
-	sha1_round3(hs0, hs1s, w2, k2)
-	sha1_round3(hs0, hs1s, w3, k2)
+	sha1_round2(hs0, hs1, w0, k1)
+	sha1_round2(hs0, hs1, w1, k1)
+	sha1_round3(hs0, hs1, w2, k2)
+	sha1_round3(hs0, hs1, w3, k2)
 
 	/* Rounds 48 through 63 (four rounds at a time). */
 	sha1_message_schedule_update(w0, w1, w2, w3)
@@ -184,10 +185,10 @@ sha1_block_ce:
 	sha1_message_schedule_update(w2, w3, w0, w1)
 	sha1_message_schedule_update(w3, w0, w1, w2)
 
-	sha1_round3(hs0, hs1s, w0, k2)
-	sha1_round3(hs0, hs1s, w1, k2)
-	sha1_round3(hs0, hs1s, w2, k2)
-	sha1_round4(hs0, hs1s, w3, k3)
+	sha1_round3(hs0, hs1, w0, k2)
+	sha1_round3(hs0, hs1, w1, k2)
+	sha1_round3(hs0, hs1, w2, k2)
+	sha1_round4(hs0, hs1, w3, k3)
 
 	/* Rounds 64 through 79 (four rounds at a time). */
 	sha1_message_schedule_update(w0, w1, w2, w3)
@@ -195,10 +196,10 @@ sha1_block_ce:
 	sha1_message_schedule_update(w2, w3, w0, w1)
 	sha1_message_schedule_update(w3, w0, w1, w2)
 
-	sha1_round4(hs0, hs1s, w0, k3)
-	sha1_round4(hs0, hs1s, w1, k3)
-	sha1_round4(hs0, hs1s, w2, k3)
-	sha1_round4(hs0, hs1s, w3, k3)
+	sha1_round4(hs0, hs1, w0, k3)
+	sha1_round4(hs0, hs1, w1, k3)
+	sha1_round4(hs0, hs1, w2, k3)
+	sha1_round4(hs0, hs1, w3, k3)
 
 	/* Add intermediate state to hash state. */
 	add	hc0.4s, hc0.4s, hs0.4s
diff --git a/src/lib/libcrypto/sha/sha256_aarch64_ce.S b/src/lib/libcrypto/sha/sha256_aarch64_ce.S
index 343f338390..8a26f91b06 100644
--- a/src/lib/libcrypto/sha/sha256_aarch64_ce.S
+++ b/src/lib/libcrypto/sha/sha256_aarch64_ce.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: sha256_aarch64_ce.S,v 1.5 2026/01/24 14:20:52 jsing Exp $ */
+/* $OpenBSD: sha256_aarch64_ce.S,v 1.6 2026/01/25 08:22:17 jsing Exp $ */
 /*
  * Copyright (c) 2023,2025 Joel Sing
  *
@@ -54,6 +54,10 @@
 
 #define tmp0	v28
 #define tmp1	v29
+#define tmp1q	q29
+
+#define v18q	q18
+#define v19q	q19
 
 /*
  * Update message schedule for m0 (W0:W1:W2:W3), using m1 (W4:W5:W6:W7),
@@ -74,9 +78,12 @@
  */
 #define sha256_round(h0, h1, w, k) \
 	add	tmp0.4s, w.4s, k.4s;	/* Tt = Wt + Kt */ \
-	mov	tmp1.4s, h0.4s; \
-	sha256h	h0, h1, tmp0.4s; \
-	sha256h2 h1, tmp1, tmp0.4s
+	mov	tmp1.16b, h0.16b; \
+	sha256h	h0##q, h1##q, tmp0.4s; \
+	sha256h2 h1##q, tmp1##q, tmp0.4s
+
+#define sha256_round_initial(h0, h1, w, k) \
+	sha256_round(h0, h1, w, k)
 
 #define sha256_round_update(h0, h1, m0, m1, m2, m3, k) \
 	sha256_message_schedule_update(m0, m1, m2, m3); \
@@ -109,8 +116,8 @@ sha256_block_ce:
 	mov	k256, k256_base
 
 	/* Copy current hash state. */
-	mov	hs0.4s, hc0.4s
-	mov	hs1.4s, hc1.4s
+	mov	hs0.16b, hc0.16b
+	mov	hs1.16b, hc1.16b
 
 	/* Load and byte swap message schedule. */
 	ld1	{w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
@@ -122,10 +129,10 @@
 	/* Rounds 0 through 15 (four rounds at a time). */
 	ld1	{k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64
 
-	sha256_round(hs0, hs1, w0, k0)
-	sha256_round(hs0, hs1, w1, k1)
-	sha256_round(hs0, hs1, w2, k2)
-	sha256_round(hs0, hs1, w3, k3)
+	sha256_round_initial(hs0, hs1, w0, k0)
+	sha256_round_initial(hs0, hs1, w1, k1)
+	sha256_round_initial(hs0, hs1, w2, k2)
+	sha256_round_initial(hs0, hs1, w3, k3)
 
 	/* Rounds 16 through 31 (four rounds at a time). */
 	ld1	{k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64
diff --git a/src/lib/libcrypto/sha/sha512_aarch64_ce.S b/src/lib/libcrypto/sha/sha512_aarch64_ce.S
index bec56a49e5..6efe775ff5 100644
--- a/src/lib/libcrypto/sha/sha512_aarch64_ce.S
+++ b/src/lib/libcrypto/sha/sha512_aarch64_ce.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: sha512_aarch64_ce.S,v 1.3 2026/01/17 06:31:45 jsing Exp $ */
+/* $OpenBSD: sha512_aarch64_ce.S,v 1.4 2026/01/25 08:22:17 jsing Exp $ */
 /*
  * Copyright (c) 2023,2025 Joel Sing
  *
@@ -83,9 +83,15 @@
 #define k7	v27
 
 #define tmp0	v8
+#define tmp0q	q8
 #define tmp1	v9
 #define tmp2	v18
 
+#define v0q	q0
+#define v1q	q1
+#define v4q	q4
+#define v6q	q6
+
 /*
  * Update message schedule for m0 (W0:W1), using m1 (W2:W3), m4 (W8:W9),
  * m5 (W10:W11) and m7 (W14:W15). The sha512su0 instruction computes the sigma0
@@ -141,9 +147,12 @@
 	add	h4.2d, h4.2d, h3.2d;		/* W1:W0 += g:h */ \
 	ext	tmp0.16b, h2.16b, h3.16b, #8;	/* f:g */ \
 	ext	tmp1.16b, h1.16b, h2.16b, #8;	/* d:e */ \
-	sha512h	h4, tmp0, tmp1.2d;		/* T1 */ \
+	sha512h	h4##q, tmp0##q, tmp1.2d;	/* T1 */ \
 	add	h5.2d, h1.2d, h4.2d;		/* c:d + T1 */ \
-	sha512h2 h4, h1, h0.2d;			/* T1 + T2 */
+	sha512h2 h4##q, h1##q, h0.2d;		/* T1 + T2 */
+
+#define sha512_round_initial(h0, h1, h2, h3, h4, h5, w, k) \
+	sha512_round(h0, h1, h2, h3, h4, h5, w, k)
 
 #define sha512_round_update(h0, h1, h2, h3, h4, h5, m0, m1, m2, m3, m4, k) \
 	sha512_message_schedule_update(m0, m1, m2, m3, m4) \
@@ -181,10 +190,10 @@ sha512_block_ce:
 	mov	k512, k512_base
 
 	/* Copy current hash state. */
-	mov	hs0.2d, hc0.2d
-	mov	hs1.2d, hc1.2d
-	mov	hs2.2d, hc2.2d
-	mov	hs3.2d, hc3.2d
+	mov	hs0.16b, hc0.16b
+	mov	hs1.16b, hc1.16b
+	mov	hs2.16b, hc2.16b
+	mov	hs3.16b, hc3.16b
 
 	/* Load and byte swap message schedule. */
 	ld1	{w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
@@ -203,14 +212,14 @@
 	ld1	{k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64
 	ld1	{k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64
 
-	sha512_round(hs0, hs1, hs2, hs3, hs4, hs5, w0, k0)
-	sha512_round(hs4, hs0, hs5, hs2, hs6, hs7, w1, k1)
-	sha512_round(hs6, hs4, hs7, hs5, hs1, hs3, w2, k2)
-	sha512_round(hs1, hs6, hs3, hs7, hs0, hs2, w3, k3)
-	sha512_round(hs0, hs1, hs2, hs3, hs4, hs5, w4, k4)
-	sha512_round(hs4, hs0, hs5, hs2, hs6, hs7, w5, k5)
-	sha512_round(hs6, hs4, hs7, hs5, hs1, hs3, w6, k6)
-	sha512_round(hs1, hs6, hs3, hs7, hs0, hs2, w7, k7)
+	sha512_round_initial(hs0, hs1, hs2, hs3, hs4, hs5, w0, k0)
+	sha512_round_initial(hs4, hs0, hs5, hs2, hs6, hs7, w1, k1)
+	sha512_round_initial(hs6, hs4, hs7, hs5, hs1, hs3, w2, k2)
+	sha512_round_initial(hs1, hs6, hs3, hs7, hs0, hs2, w3, k3)
+	sha512_round_initial(hs0, hs1, hs2, hs3, hs4, hs5, w4, k4)
+	sha512_round_initial(hs4, hs0, hs5, hs2, hs6, hs7, w5, k5)
+	sha512_round_initial(hs6, hs4, hs7, hs5, hs1, hs3, w6, k6)
+	sha512_round_initial(hs1, hs6, hs3, hs7, hs0, hs2, w7, k7)
 
 	/* Rounds 16 through 31 (two rounds at a time). */
 	ld1	{k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64
--
cgit v1.2.3-55-g6feb
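
A condensed sketch of the workaround, for illustration only (hypothetical
register assignments, not lines from the diff above): the round macros paste
a suffix onto the register argument with the preprocessor's ## operator, and
a companion #define resolves the pasted name to the q (128-bit) or s (32-bit)
view of the same v register, which is the spelling gcc's assembler demands.

	.arch	armv8-a+sha2

	/* Symbolic names for v registers, plus q views of the same registers. */
	#define hs0	v18
	#define hs0q	q18
	#define hs1	v19
	#define hs1q	q19
	#define tmp0	v28
	#define w0	v0
	#define k0	v4

	/*
	 * h0##q pastes "q" onto the literal argument: hs0 becomes hs0q,
	 * which then expands to q18. gcc requires the q spelling for
	 * sha256h; clang accepts either.
	 */
	#define sha256_round(h0, h1, w, k) \
		add	tmp0.4s, w.4s, k.4s; \
		sha256h	h0##q, h1##q, tmp0.4s

	sha256_round(hs0, hs1, w0, k0)	/* sha256h q18, q19, v28.4s */

Since ## suppresses macro expansion of its operands, pasting operates on the
argument as written (hs0 -> hs0q). Passing the arguments through a wrapper
first, as the sha256_round_initial()/sha512_round_initial() macros above do,
expands hs0 to its v register before the paste, so the pasted name becomes
v18q and a single alias per v register covers every call site.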