diff options
| author | jsing <> | 2025-08-17 08:04:25 +0000 |
|---|---|---|
| committer | jsing <> | 2025-08-17 08:04:25 +0000 |
| commit | c9f6ed45b268d1c6b5608ff56379e3b051625806 (patch) | |
| tree | c4d739fc2ff0a7cc1a149a22a0dee426bf9cdcb4 | |
| parent | 13656f0db6ccf444b794aea868c6e5bdfc241a24 (diff) | |
| download | openbsd-c9f6ed45b268d1c6b5608ff56379e3b051625806.tar.gz openbsd-c9f6ed45b268d1c6b5608ff56379e3b051625806.tar.bz2 openbsd-c9f6ed45b268d1c6b5608ff56379e3b051625806.zip | |
Further simplify the rc4 implementation.
The RC4_CHUNK-related code is intended to process native word-sized
chunks if the input and output are naturally aligned. However, RC4_CHUNK
is currently a mess of machine-dependent defines.
Replace this with uint64_t on all architectures - 64-bit architectures will
be happy with this and on 32-bit architectures the compiler can decompose
this into multiple 32-bit operations. Provide separate rc4_chunk()
implementations for big and little endian, since not all architectures
have a byte-swap instruction that would make this a cheap conversion.
Thanks to gkoehler@ and tb@ for testing on big endian.
ok tb@
| -rw-r--r-- | src/lib/libcrypto/rc4/rc4.c | 162 |
1 files changed, 45 insertions, 117 deletions
diff --git a/src/lib/libcrypto/rc4/rc4.c b/src/lib/libcrypto/rc4/rc4.c index 9c0a61162d..69b7d0a815 100644 --- a/src/lib/libcrypto/rc4/rc4.c +++ b/src/lib/libcrypto/rc4/rc4.c | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | /* $OpenBSD: rc4.c,v 1.14 2025/08/14 14:55:43 jsing Exp $ */ | 1 | /* $OpenBSD: rc4.c,v 1.15 2025/08/17 08:04:25 jsing Exp $ */ |
| 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) |
| 3 | * All rights reserved. | 3 | * All rights reserved. |
| 4 | * | 4 | * |
| @@ -57,22 +57,15 @@ | |||
| 57 | */ | 57 | */ |
| 58 | 58 | ||
| 59 | #include <endian.h> | 59 | #include <endian.h> |
| 60 | #include <stdint.h> | ||
| 60 | 61 | ||
| 61 | #include <openssl/rc4.h> | 62 | #include <openssl/rc4.h> |
| 62 | 63 | ||
| 63 | #include "crypto_arch.h" | 64 | #include "crypto_arch.h" |
| 64 | 65 | ||
| 65 | /* RC4 as implemented from a posting from | ||
| 66 | * Newsgroups: sci.crypt | ||
| 67 | * From: sterndark@netcom.com (David Sterndark) | ||
| 68 | * Subject: RC4 Algorithm revealed. | ||
| 69 | * Message-ID: <sternCvKL4B.Hyy@netcom.com> | ||
| 70 | * Date: Wed, 14 Sep 1994 06:35:31 GMT | ||
| 71 | */ | ||
| 72 | |||
| 73 | #ifdef HAVE_RC4_INTERNAL | 66 | #ifdef HAVE_RC4_INTERNAL |
| 74 | void rc4_internal(RC4_KEY *key, size_t len, const unsigned char *indata, | 67 | void rc4_internal(RC4_KEY *key, size_t len, const uint8_t *in, |
| 75 | unsigned char *outdata); | 68 | uint8_t *out); |
| 76 | 69 | ||
| 77 | #else | 70 | #else |
| 78 | static inline RC4_INT | 71 | static inline RC4_INT |
| @@ -89,9 +82,35 @@ rc4_step(RC4_INT *d, RC4_INT *x, RC4_INT *y) | |||
| 89 | return d[(tx + ty) & 0xff]; | 82 | return d[(tx + ty) & 0xff]; |
| 90 | } | 83 | } |
| 91 | 84 | ||
| 85 | #if BYTE_ORDER == BIG_ENDIAN | ||
| 86 | static inline uint64_t | ||
| 87 | rc4_chunk(RC4_INT *d, RC4_INT *x, RC4_INT *y) | ||
| 88 | { | ||
| 89 | uint64_t chunk = 0; | ||
| 90 | size_t i; | ||
| 91 | |||
| 92 | for (i = 0; i < 8; i++) | ||
| 93 | chunk = chunk << 8 | (uint64_t)rc4_step(d, x, y); | ||
| 94 | |||
| 95 | return chunk; | ||
| 96 | } | ||
| 97 | |||
| 98 | #else | ||
| 99 | static inline uint64_t | ||
| 100 | rc4_chunk(RC4_INT *d, RC4_INT *x, RC4_INT *y) | ||
| 101 | { | ||
| 102 | uint64_t chunk = 0; | ||
| 103 | size_t i; | ||
| 104 | |||
| 105 | for (i = 0; i < 8; i++) | ||
| 106 | chunk |= (uint64_t)rc4_step(d, x, y) << (i * 8); | ||
| 107 | |||
| 108 | return chunk; | ||
| 109 | } | ||
| 110 | #endif | ||
| 111 | |||
| 92 | static void | 112 | static void |
| 93 | rc4_internal(RC4_KEY *key, size_t len, const unsigned char *indata, | 113 | rc4_internal(RC4_KEY *key, size_t len, const uint8_t *in, uint8_t *out) |
| 94 | unsigned char *outdata) | ||
| 95 | { | 114 | { |
| 96 | RC4_INT *d, x, y; | 115 | RC4_INT *d, x, y; |
| 97 | size_t i; | 116 | size_t i; |
| @@ -100,118 +119,27 @@ rc4_internal(RC4_KEY *key, size_t len, const unsigned char *indata, | |||
| 100 | y = key->y; | 119 | y = key->y; |
| 101 | d = key->data; | 120 | d = key->data; |
| 102 | 121 | ||
| 103 | #if defined(RC4_CHUNK) | 122 | /* Process uint64_t chunks if 8 byte aligned. */ |
| 104 | /* | 123 | if ((((size_t)in | (size_t)out) % 8) == 0) { |
| 105 | * The original reason for implementing this(*) was the fact that | 124 | while (len >= 8) { |
| 106 | * pre-21164a Alpha CPUs don't have byte load/store instructions | 125 | *(uint64_t *)out = *(const uint64_t *)in ^ rc4_chunk(d, &x, &y); |
| 107 | * and e.g. a byte store has to be done with 64-bit load, shift, | ||
| 108 | * and, or and finally 64-bit store. Peaking data and operating | ||
| 109 | * at natural word size made it possible to reduce amount of | ||
| 110 | * instructions as well as to perform early read-ahead without | ||
| 111 | * suffering from RAW (read-after-write) hazard. This resulted | ||
| 112 | * in ~40%(**) performance improvement on 21064 box with gcc. | ||
| 113 | * But it's not only Alpha users who win here:-) Thanks to the | ||
| 114 | * early-n-wide read-ahead this implementation also exhibits | ||
| 115 | * >40% speed-up on SPARC and 20-30% on 64-bit MIPS (depending | ||
| 116 | * on sizeof(RC4_INT)). | ||
| 117 | * | ||
| 118 | * (*) "this" means code which recognizes the case when input | ||
| 119 | * and output pointers appear to be aligned at natural CPU | ||
| 120 | * word boundary | ||
| 121 | * (**) i.e. according to 'apps/openssl speed rc4' benchmark, | ||
| 122 | * crypto/rc4/rc4speed.c exhibits almost 70% speed-up... | ||
| 123 | * | ||
| 124 | * Caveats. | ||
| 125 | * | ||
| 126 | * - RC4_CHUNK="unsigned long long" should be a #1 choice for | ||
| 127 | * UltraSPARC. Unfortunately gcc generates very slow code | ||
| 128 | * (2.5-3 times slower than one generated by Sun's WorkShop | ||
| 129 | * C) and therefore gcc (at least 2.95 and earlier) should | ||
| 130 | * always be told that RC4_CHUNK="unsigned long". | ||
| 131 | * | ||
| 132 | * <appro@fy.chalmers.se> | ||
| 133 | */ | ||
| 134 | |||
| 135 | # define RC4_STEP ((RC4_CHUNK)rc4_step(d, &x, &y)) | ||
| 136 | 126 | ||
| 137 | if ((((size_t)indata & (sizeof(RC4_CHUNK) - 1)) | | 127 | in += 8; |
| 138 | ((size_t)outdata & (sizeof(RC4_CHUNK) - 1))) == 0 ) { | 128 | out += 8; |
| 139 | RC4_CHUNK ichunk, otp; | 129 | len -= 8; |
| 140 | |||
| 141 | /* | ||
| 142 | * I reckon we can afford to implement both endian | ||
| 143 | * cases and to decide which way to take at run-time | ||
| 144 | * because the machine code appears to be very compact | ||
| 145 | * and redundant 1-2KB is perfectly tolerable (i.e. | ||
| 146 | * in case the compiler fails to eliminate it:-). By | ||
| 147 | * suggestion from Terrel Larson <terr@terralogic.net>. | ||
| 148 | * | ||
| 149 | * Special notes. | ||
| 150 | * | ||
| 151 | * - compilers (those I've tried) don't seem to have | ||
| 152 | * problems eliminating either the operators guarded | ||
| 153 | * by "if (sizeof(RC4_CHUNK)==8)" or the condition | ||
| 154 | * expressions themselves so I've got 'em to replace | ||
| 155 | * corresponding #ifdefs from the previous version; | ||
| 156 | * - I chose to let the redundant switch cases when | ||
| 157 | * sizeof(RC4_CHUNK)!=8 be (were also #ifdefed | ||
| 158 | * before); | ||
| 159 | * - in case you wonder "&(sizeof(RC4_CHUNK)*8-1)" in | ||
| 160 | * [LB]ESHFT guards against "shift is out of range" | ||
| 161 | * warnings when sizeof(RC4_CHUNK)!=8 | ||
| 162 | * | ||
| 163 | * <appro@fy.chalmers.se> | ||
| 164 | */ | ||
| 165 | #if BYTE_ORDER == BIG_ENDIAN | ||
| 166 | # define BESHFT(c) (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1)) | ||
| 167 | for (; len & (0 - sizeof(RC4_CHUNK)); len -= sizeof(RC4_CHUNK)) { | ||
| 168 | ichunk = *(RC4_CHUNK *)indata; | ||
| 169 | otp = RC4_STEP << BESHFT(0); | ||
| 170 | otp |= RC4_STEP << BESHFT(1); | ||
| 171 | otp |= RC4_STEP << BESHFT(2); | ||
| 172 | otp |= RC4_STEP << BESHFT(3); | ||
| 173 | if (sizeof(RC4_CHUNK) == 8) { | ||
| 174 | otp |= RC4_STEP << BESHFT(4); | ||
| 175 | otp |= RC4_STEP << BESHFT(5); | ||
| 176 | otp |= RC4_STEP << BESHFT(6); | ||
| 177 | otp |= RC4_STEP << BESHFT(7); | ||
| 178 | } | ||
| 179 | *(RC4_CHUNK *)outdata = otp^ichunk; | ||
| 180 | indata += sizeof(RC4_CHUNK); | ||
| 181 | outdata += sizeof(RC4_CHUNK); | ||
| 182 | } | ||
| 183 | #else | ||
| 184 | # define LESHFT(c) (((c)*8)&(sizeof(RC4_CHUNK)*8-1)) | ||
| 185 | for (; len & (0 - sizeof(RC4_CHUNK)); len -= sizeof(RC4_CHUNK)) { | ||
| 186 | ichunk = *(RC4_CHUNK *)indata; | ||
| 187 | otp = RC4_STEP; | ||
| 188 | otp |= RC4_STEP << 8; | ||
| 189 | otp |= RC4_STEP << 16; | ||
| 190 | otp |= RC4_STEP << 24; | ||
| 191 | if (sizeof(RC4_CHUNK) == 8) { | ||
| 192 | otp |= RC4_STEP << LESHFT(4); | ||
| 193 | otp |= RC4_STEP << LESHFT(5); | ||
| 194 | otp |= RC4_STEP << LESHFT(6); | ||
| 195 | otp |= RC4_STEP << LESHFT(7); | ||
| 196 | } | ||
| 197 | *(RC4_CHUNK *)outdata = otp ^ ichunk; | ||
| 198 | indata += sizeof(RC4_CHUNK); | ||
| 199 | outdata += sizeof(RC4_CHUNK); | ||
| 200 | } | 130 | } |
| 201 | #endif | ||
| 202 | } | 131 | } |
| 203 | #endif | ||
| 204 | 132 | ||
| 205 | while (len >= 8) { | 133 | while (len >= 8) { |
| 206 | for (i = 0; i < 8; i++) | 134 | for (i = 0; i < 8; i++) |
| 207 | outdata[i] = rc4_step(d, &x, &y) ^ indata[i]; | 135 | out[i] = rc4_step(d, &x, &y) ^ in[i]; |
| 208 | 136 | ||
| 209 | indata += 8; | 137 | in += 8; |
| 210 | outdata += 8; | 138 | out += 8; |
| 211 | len -= 8; | 139 | len -= 8; |
| 212 | } | 140 | } |
| 213 | for (i = 0; i < len; i++) | 141 | for (i = 0; i < len; i++) |
| 214 | outdata[i] = rc4_step(d, &x, &y) ^ indata[i]; | 142 | out[i] = rc4_step(d, &x, &y) ^ in[i]; |
| 215 | 143 | ||
| 216 | key->x = x; | 144 | key->x = x; |
| 217 | key->y = y; | 145 | key->y = y; |
| @@ -219,11 +147,11 @@ rc4_internal(RC4_KEY *key, size_t len, const unsigned char *indata, | |||
| 219 | #endif | 147 | #endif |
| 220 | 148 | ||
| 221 | #ifdef HAVE_RC4_SET_KEY_INTERNAL | 149 | #ifdef HAVE_RC4_SET_KEY_INTERNAL |
| 222 | void rc4_set_key_internal(RC4_KEY *key, int len, const unsigned char *data); | 150 | void rc4_set_key_internal(RC4_KEY *key, int len, const uint8_t *data); |
| 223 | 151 | ||
| 224 | #else | 152 | #else |
| 225 | static inline void | 153 | static inline void |
| 226 | rc4_set_key_internal(RC4_KEY *key, int len, const unsigned char *data) | 154 | rc4_set_key_internal(RC4_KEY *key, int len, const uint8_t *data) |
| 227 | { | 155 | { |
| 228 | RC4_INT *d, tmp; | 156 | RC4_INT *d, tmp; |
| 229 | int idx1, idx2; | 157 | int idx1, idx2; |
