sha3: make size/speed optimization decision configurable

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
author: Denys Vlasenko <vda.linux@googlemail.com> 2013-01-15 01:12:26 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2013-01-15 01:12:26 +0100
commit: 30a8652fbf16884490cee4a624f039a9ab587269 (patch)
tree: cf6235392344386a06849e04fd5deb4f364f0a71 /libbb
parent: 60cb48ca50fcff24aa6c3927f51e4a508fa118f4 (diff)
download: busybox-w32-30a8652fbf16884490cee4a624f039a9ab587269.tar.gz
busybox-w32-30a8652fbf16884490cee4a624f039a9ab587269.tar.bz2
busybox-w32-30a8652fbf16884490cee4a624f039a9ab587269.zip
2 files changed, 68 insertions, 19 deletions
diff --git a/libbb/Config.src b/libbb/Config.src
index ee1b66a45..19021fed1 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -28,6 +28,16 @@ config MD5_SMALL
          2                   3.0                5088
          3 (smallest)        5.1                4912
+config SHA3_SMALL
+        int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
+        default 1
+        range 0 1
+        help
+          Trade binary size versus speed for the sha3sum algorithm.
+          SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
+          64-bit x86: +270 bytes of code, 45% faster
+          32-bit x86: +450 bytes of code, 75% faster
 config FEATURE_FAST_TOP
        bool "Faster /proc scanning code (+100 bytes)"
        default y
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 06a2400b5..643cf205f 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -918,6 +918,16 @@ void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
 * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
 */
+#if CONFIG_SHA3_SMALL < 0
+# define SHA3_SMALL 0
+#elif CONFIG_SHA3_SMALL > 1
+# define SHA3_SMALL 1
+#else
+# define SHA3_SMALL CONFIG_SHA3_SMALL
+#endif
+#define ARCH_IS_64BIT (sizeof(long) >= sizeof(uint64_t))
 enum {
        cKeccakR_SizeInBytes = 576 / 8,
        cKeccakNumberOfRounds = 24,
@@ -967,8 +977,6 @@ static const uint8_t KeccakF_Mod5[10] = {
 static void KeccakF(uint64_t *state)
 {
        uint8_t x, y;
-        uint64_t temp;
-        uint64_t BC[5];
        int round;
        if (BB_BIG_ENDIAN) {
@@ -979,30 +987,61 @@ static void KeccakF(uint64_t *state)
        for (round = 0; round < cKeccakNumberOfRounds; ++round) {
                /* Theta */
-                for (x = 0; x < 5; ++x) {
+                {
-                        BC[x] = state[x] ^ state[5 + x] ^ state[10 + x] ^
+                        uint64_t BC[5];
-                                state[15 + x] ^ state[20 + x];
+                        for (x = 0; x < 5; ++x) {
-                }
+                                BC[x] = state[x] ^ state[5 + x] ^ state[10 + x] ^
-                for (x = 0; x < 5; ++x) {
+                                        state[15 + x] ^ state[20 + x];
-                        temp = BC[KeccakF_Mod5[x + 4]] ^
+                        }
-                                rotl64(BC[KeccakF_Mod5[x + 1]], 1);
+                        for (x = 0; x < 5; ++x) {
+                                uint64_t temp = BC[KeccakF_Mod5[x + 4]] ^
-                        for (y = 0; y <= 20; y += 5) {
+                                        rotl64(BC[KeccakF_Mod5[x + 1]], 1);
-                                state[y + x] ^= temp;
+                                if (SHA3_SMALL && !ARCH_IS_64BIT) {
+                                        for (y = 0; y <= 20; y += 5)
+                                                state[y + x] ^= temp;
+                                } else {
+                                        /* on 64-bit arch, this is actually smaller too */
+                                        state[0 + x] ^= temp;
+                                        state[5 + x] ^= temp;
+                                        state[10 + x] ^= temp;
+                                        state[15 + x] ^= temp;
+                                        state[20 + x] ^= temp;
+                                }
                        }
                }
                /* Rho Pi */
-                temp = state[1];
+                if (SHA3_SMALL) {
-                for (x = 0; x < 24; ++x) {
+                        uint64_t t1 = state[1];
-                        BC[0] = state[KeccakF_PiLane[x]];
+                        for (x = 0; x < 24; ++x) {
-                        state[KeccakF_PiLane[x]] =
+                                uint64_t t0 = state[KeccakF_PiLane[x]];
-                            rotl64(temp, KeccakF_RotationConstants[x]);
+                                state[KeccakF_PiLane[x]] = rotl64(t1, KeccakF_RotationConstants[x]);
-                        temp = BC[0];
+                                t1 = t0;
+                        }
+                } else {
+                        /* Especially large benefit for 32-bit arch:
+                         * 64-bit rotations by non-constant usually are SLOW on those.
+                         * We resort to unrolling here.
+                         * This optimizes out KeccakF_PiLane[] and KeccakF_RotationConstants[],
+                         * but generates 300-500 more bytes of code.
+                         */
+                        uint64_t t0;
+                        uint64_t t1 = state[1];
+#define RhoPi_twice(x) \
+                        t0 = state[KeccakF_PiLane[x  ]]; state[KeccakF_PiLane[x  ]] = rotl64(t1, KeccakF_RotationConstants[x  ]); \
+                        t1 = state[KeccakF_PiLane[x+1]]; state[KeccakF_PiLane[x+1]] = rotl64(t0, KeccakF_RotationConstants[x+1]);
+                        RhoPi_twice(0); RhoPi_twice(2);
+                        RhoPi_twice(4); RhoPi_twice(6);
+                        RhoPi_twice(8); RhoPi_twice(10);
+                        RhoPi_twice(12); RhoPi_twice(14);
+                        RhoPi_twice(16); RhoPi_twice(18);
+                        RhoPi_twice(20); RhoPi_twice(22);
+#undef RhoPi_twice
                }
                /* Chi */
-                for (y = 0; y < 25; y += 5) {
+                for (y = 0; y <= 20; y += 5) {
+                        uint64_t BC[5];
                        BC[0] = state[y + 0];
                        BC[1] = state[y + 1];
                        BC[2] = state[y + 2];
author	Denys Vlasenko <vda.linux@googlemail.com>	2013-01-15 01:12:26 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2013-01-15 01:12:26 +0100
commit	30a8652fbf16884490cee4a624f039a9ab587269 (patch)
tree	cf6235392344386a06849e04fd5deb4f364f0a71 /libbb
parent	60cb48ca50fcff24aa6c3927f51e4a508fa118f4 (diff)
download	busybox-w32-30a8652fbf16884490cee4a624f039a9ab587269.tar.gz busybox-w32-30a8652fbf16884490cee4a624f039a9ab587269.tar.bz2 busybox-w32-30a8652fbf16884490cee4a624f039a9ab587269.zip

diff --git a/libbb/Config.src b/libbb/Config.src index ee1b66a45..19021fed1 100644 --- a/libbb/Config.src +++ b/libbb/Config.src
@@ -28,6 +28,16 @@ config MD5_SMALL
28	2 3.0 5088	28	2 3.0 5088
29	3 (smallest) 5.1 4912	29	3 (smallest) 5.1 4912
30		30
		31	config SHA3_SMALL
		32	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
		33	default 1
		34	range 0 1
		35	help
		36	Trade binary size versus speed for the sha3sum algorithm.
		37	SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
		38	64-bit x86: +270 bytes of code, 45% faster
		39	32-bit x86: +450 bytes of code, 75% faster
		40
31	config FEATURE_FAST_TOP	41	config FEATURE_FAST_TOP
32	bool "Faster /proc scanning code (+100 bytes)"	42	bool "Faster /proc scanning code (+100 bytes)"
33	default y	43	default y


diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index 06a2400b5..643cf205f 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c
@@ -918,6 +918,16 @@ void FAST_FUNC sha512_end(sha512_ctx_t ctx, void resbuf)
918	* Busybox modifications (C) Lauri Kasanen, under the GPLv2.	918	* Busybox modifications (C) Lauri Kasanen, under the GPLv2.
919	*/	919	*/
920		920
		921	#if CONFIG_SHA3_SMALL < 0
		922	# define SHA3_SMALL 0
		923	#elif CONFIG_SHA3_SMALL > 1
		924	# define SHA3_SMALL 1
		925	#else
		926	# define SHA3_SMALL CONFIG_SHA3_SMALL
		927	#endif
		928
		929	#define ARCH_IS_64BIT (sizeof(long) >= sizeof(uint64_t))
		930
921	enum {	931	enum {
922	cKeccakR_SizeInBytes = 576 / 8,	932	cKeccakR_SizeInBytes = 576 / 8,
923	cKeccakNumberOfRounds = 24,	933	cKeccakNumberOfRounds = 24,
@@ -967,8 +977,6 @@ static const uint8_t KeccakF_Mod5[10] = {
967	static void KeccakF(uint64_t *state)	977	static void KeccakF(uint64_t *state)
968	{	978	{
969	uint8_t x, y;	979	uint8_t x, y;
970	uint64_t temp;
971	uint64_t BC[5];
972	int round;	980	int round;
973		981
974	if (BB_BIG_ENDIAN) {	982	if (BB_BIG_ENDIAN) {
@@ -979,30 +987,61 @@ static void KeccakF(uint64_t *state)
979		987
980	for (round = 0; round < cKeccakNumberOfRounds; ++round) {	988	for (round = 0; round < cKeccakNumberOfRounds; ++round) {
981	/* Theta */	989	/* Theta */
982	for (x = 0; x < 5; ++x) {	990	{
983	BC[x] = state[x] ^ state[5 + x] ^ state[10 + x] ^	991	uint64_t BC[5];
984	state[15 + x] ^ state[20 + x];	992	for (x = 0; x < 5; ++x) {
985	}	993	BC[x] = state[x] ^ state[5 + x] ^ state[10 + x] ^
986	for (x = 0; x < 5; ++x) {	994	state[15 + x] ^ state[20 + x];
987	temp = BC[KeccakF_Mod5[x + 4]] ^	995	}
988	rotl64(BC[KeccakF_Mod5[x + 1]], 1);	996	for (x = 0; x < 5; ++x) {
989		997	uint64_t temp = BC[KeccakF_Mod5[x + 4]] ^
990	for (y = 0; y <= 20; y += 5) {	998	rotl64(BC[KeccakF_Mod5[x + 1]], 1);
991	state[y + x] ^= temp;	999	if (SHA3_SMALL && !ARCH_IS_64BIT) {
		1000	for (y = 0; y <= 20; y += 5)
		1001	state[y + x] ^= temp;
		1002	} else {
		1003	/* on 64-bit arch, this is actually smaller too */
		1004	state[0 + x] ^= temp;
		1005	state[5 + x] ^= temp;
		1006	state[10 + x] ^= temp;
		1007	state[15 + x] ^= temp;
		1008	state[20 + x] ^= temp;
		1009	}
992	}	1010	}
993	}	1011	}
994		1012
995	/* Rho Pi */	1013	/* Rho Pi */
996	temp = state[1];	1014	if (SHA3_SMALL) {
997	for (x = 0; x < 24; ++x) {	1015	uint64_t t1 = state[1];
998	BC[0] = state[KeccakF_PiLane[x]];	1016	for (x = 0; x < 24; ++x) {
999	state[KeccakF_PiLane[x]] =	1017	uint64_t t0 = state[KeccakF_PiLane[x]];
1000	rotl64(temp, KeccakF_RotationConstants[x]);	1018	state[KeccakF_PiLane[x]] = rotl64(t1, KeccakF_RotationConstants[x]);
1001	temp = BC[0];	1019	t1 = t0;
		1020	}
		1021	} else {
		1022	/* Especially large benefit for 32-bit arch:
		1023	* 64-bit rotations by non-constant usually are SLOW on those.
		1024	* We resort to unrolling here.
		1025	* This optimizes out KeccakF_PiLane[] and KeccakF_RotationConstants[],
		1026	* but generates 300-500 more bytes of code.
		1027	*/
		1028	uint64_t t0;
		1029	uint64_t t1 = state[1];
		1030	#define RhoPi_twice(x) \
		1031	t0 = state[KeccakF_PiLane[x ]]; state[KeccakF_PiLane[x ]] = rotl64(t1, KeccakF_RotationConstants[x ]); \
		1032	t1 = state[KeccakF_PiLane[x+1]]; state[KeccakF_PiLane[x+1]] = rotl64(t0, KeccakF_RotationConstants[x+1]);
		1033	RhoPi_twice(0); RhoPi_twice(2);
		1034	RhoPi_twice(4); RhoPi_twice(6);
		1035	RhoPi_twice(8); RhoPi_twice(10);
		1036	RhoPi_twice(12); RhoPi_twice(14);
		1037	RhoPi_twice(16); RhoPi_twice(18);
		1038	RhoPi_twice(20); RhoPi_twice(22);
		1039	#undef RhoPi_twice
1002	}	1040	}
1003		1041
1004	/* Chi */	1042	/* Chi */
1005	for (y = 0; y < 25; y += 5) {	1043	for (y = 0; y <= 20; y += 5) {
		1044	uint64_t BC[5];
1006	BC[0] = state[y + 0];	1045	BC[0] = state[y + 0];
1007	BC[1] = state[y + 1];	1046	BC[1] = state[y + 1];
1008	BC[2] = state[y + 2];	1047	BC[2] = state[y + 2];