| author | Denys Vlasenko <vda.linux@googlemail.com> | 2014-07-25 17:24:13 +0200 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2014-07-25 17:24:13 +0200 |
| commit | 2a563ea49a16589f47ed6afe7b22eebef4e3a6d1 | |
| tree | 212bbad503c49c6ae0aabaeaef0ab023ca584f39 | |
| parent | a4d564ad7c24df2da1d8e03a7dd016f0a3d5edbd | |
sha3: add 32-bit optimized bit-sliced implementation
It is an interesting trick, but so far I have only managed to make it
work correctly, not to make it faster and/or smaller.
The code is #ifdef'ed out for now.
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | libbb/hash_md5_sha.c | 256 |
1 file changed, 242 insertions(+), 14 deletions(-)
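Before the diff itself, a note on the trick. In the bit-sliced form each 64-bit Keccak lane is stored as two 32-bit words, one holding the even-numbered bits and one holding the odd-numbered bits; a 64-bit rotation then becomes two 32-bit rotations, with the two halves swapping roles when the rotation amount is odd. The sketch below is not from the busybox tree (`rotl32`, `rotl64`, `slice` and `unslice` are illustrative helpers, and the plain bit loops stand in for the Hacker's Delight shuffles the patch uses); it merely checks that identity for all 64 rotation amounts, which is exactly what the `RhoPi` macro in the patch relies on.

```c
/* Standalone sketch, not part of the patch: why bit-slicing turns one
 * 64-bit rotation into two 32-bit rotations. All names are illustrative. */
#include <stdint.h>
#include <stdio.h>

static uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << (n & 31)) | (x >> ((32 - n) & 31));
}

static uint64_t rotl64(uint64_t x, unsigned n)
{
	return (x << (n & 63)) | (x >> ((64 - n) & 63));
}

/* Split a 64-bit lane: bit 2i goes to bit i of *even, bit 2i+1 to bit i of *odd */
static void slice(uint64_t lane, uint32_t *even, uint32_t *odd)
{
	int i;
	*even = *odd = 0;
	for (i = 0; i < 32; i++) {
		*even |= (uint32_t)((lane >> (2 * i)) & 1) << i;
		*odd  |= (uint32_t)((lane >> (2 * i + 1)) & 1) << i;
	}
}

/* The inverse of slice() */
static uint64_t unslice(uint32_t even, uint32_t odd)
{
	uint64_t lane = 0;
	int i;
	for (i = 0; i < 32; i++) {
		lane |= (uint64_t)((even >> i) & 1) << (2 * i);
		lane |= (uint64_t)((odd  >> i) & 1) << (2 * i + 1);
	}
	return lane;
}

int main(void)
{
	uint64_t lane = 0x0123456789abcdefULL;
	unsigned n;

	for (n = 0; n < 64; n++) {
		uint32_t e, o, e2, o2;
		slice(lane, &e, &o);
		if (n & 1) {
			/* Odd amount: halves swap roles and the rotate counts
			 * differ by one - cf. the RhoPi macro in the patch. */
			e2 = rotl32(o, (n + 1) / 2);
			o2 = rotl32(e, n / 2);
		} else {
			e2 = rotl32(e, n / 2);
			o2 = rotl32(o, n / 2);
		}
		if (unslice(e2, o2) != rotl64(lane, n)) {
			printf("mismatch for n=%u\n", n);
			return 1;
		}
	}
	printf("64-bit rotations by 0..63 all match their 32-bit forms\n");
	return 0;
}
```

Compiled with any C99 compiler, it should report that all 64 rotation amounts match.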
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 3f743ac75..dff583ad1 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -926,10 +926,81 @@ void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
 # define SHA3_SMALL CONFIG_SHA3_SMALL
 #endif
 
+#define OPTIMIZE_SHA3_FOR_32 0
+/*
+ * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
+ * every 64-bit word of state[] can be split into two 32-bit words
+ * by even/odd bits. In this form, all rotations of sha3 round
+ * are 32-bit - and there are lots of them.
+ * However, it requires either splitting/combining state words
+ * before/after sha3 round (code does this now)
+ * or shuffling bits before xor'ing them into state and in sha3_end.
+ * Without shuffling, bit-slicing results in -130 bytes of code
+ * and marginal speedup (but of course it gives wrong result).
+ * With shuffling it works, but +260 code bytes, and slower.
+ * Disabled for now:
+ */
+#if 0 /* LONG_MAX == 0x7fffffff */
+# undef OPTIMIZE_SHA3_FOR_32
+# define OPTIMIZE_SHA3_FOR_32 1
+#endif
+
 enum {
 	SHA3_IBLK_BYTES = 72, /* 576 bits / 8 */
 };
 
+#if OPTIMIZE_SHA3_FOR_32
+/* This splits every 64-bit word into a pair of 32-bit words,
+ * even bits go into first word, odd bits go to second one.
+ * The conversion is done in-place.
+ */
+static void split_halves(uint64_t *state)
+{
+	/* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+	uint32_t *s32 = (uint32_t*)state;
+	uint32_t t, x0, x1;
+	int i;
+	for (i = 24; i >= 0; --i) {
+		x0 = s32[0];
+		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
+		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
+		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
+		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
+		x1 = s32[1];
+		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
+		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
+		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
+		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
+		*s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
+		*s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
+	}
+}
+/* The reverse operation */
+static void combine_halves(uint64_t *state)
+{
+	uint32_t *s32 = (uint32_t*)state;
+	uint32_t t, x0, x1;
+	int i;
+	for (i = 24; i >= 0; --i) {
+		x0 = s32[0];
+		x1 = s32[1];
+		t = (x0 & 0x0000FFFF) | (x1 << 16);
+		x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
+		x0 = t;
+		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
+		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
+		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
+		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
+		*s32++ = x0;
+		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
+		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
+		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
+		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
+		*s32++ = x1;
+	}
+}
+#endif
+
 /*
  * In the crypto literature this function is usually called Keccak-f().
  */
@@ -937,6 +1008,164 @@ static void sha3_process_block72(uint64_t *state)
 {
 	enum { NROUNDS = 24 };
 
+#if OPTIMIZE_SHA3_FOR_32
+	/*
+	static const uint32_t IOTA_CONST_0[NROUNDS] = {
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+	};
+	** bits are in lsb: 0101 0000 1111 0100 1111 0001
+	*/
+	uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
+	static const uint32_t IOTA_CONST_1[NROUNDS] = {
+		0x00000000UL,
+		0x00000089UL,
+		0x8000008bUL,
+		0x80008080UL,
+		0x0000008bUL,
+		0x00008000UL,
+		0x80008088UL,
+		0x80000082UL,
+		0x0000000bUL,
+		0x0000000aUL,
+		0x00008082UL,
+		0x00008003UL,
+		0x0000808bUL,
+		0x8000000bUL,
+		0x8000008aUL,
+		0x80000081UL,
+		0x80000081UL,
+		0x80000008UL,
+		0x00000083UL,
+		0x80008003UL,
+		0x80008088UL,
+		0x80000088UL,
+		0x00008000UL,
+		0x80008082UL,
+	};
+
+	uint32_t *const s32 = (uint32_t*)state;
+	unsigned round;
+
+	split_halves(state);
+
+	for (round = 0; round < NROUNDS; round++) {
+		unsigned x;
+
+		/* Theta */
+		{
+			uint32_t BC[20];
+			for (x = 0; x < 10; ++x) {
+				BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
+			}
+			for (x = 0; x < 10; x += 2) {
+				uint32_t ta, tb;
+				ta = BC[x+8] ^ rotl32(BC[x+3], 1);
+				tb = BC[x+9] ^ BC[x+2];
+				s32[x+0] ^= ta;
+				s32[x+1] ^= tb;
+				s32[x+10] ^= ta;
+				s32[x+11] ^= tb;
+				s32[x+20] ^= ta;
+				s32[x+21] ^= tb;
+				s32[x+30] ^= ta;
+				s32[x+31] ^= tb;
+				s32[x+40] ^= ta;
+				s32[x+41] ^= tb;
+			}
+		}
+		/* RhoPi */
+		{
+			uint32_t t0a,t0b, t1a,t1b;
+			t1a = s32[1*2+0];
+			t1b = s32[1*2+1];
+
+#define RhoPi(PI_LANE, ROT_CONST) \
+			t0a = s32[PI_LANE*2+0];\
+			t0b = s32[PI_LANE*2+1];\
+			if (ROT_CONST & 1) {\
+				s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
+				s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
+			} else {\
+				s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
+				s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
+			}\
+			t1a = t0a; t1b = t0b;
+
+			RhoPi(10, 1)
+			RhoPi( 7, 3)
+			RhoPi(11, 6)
+			RhoPi(17,10)
+			RhoPi(18,15)
+			RhoPi( 3,21)
+			RhoPi( 5,28)
+			RhoPi(16,36)
+			RhoPi( 8,45)
+			RhoPi(21,55)
+			RhoPi(24, 2)
+			RhoPi( 4,14)
+			RhoPi(15,27)
+			RhoPi(23,41)
+			RhoPi(19,56)
+			RhoPi(13, 8)
+			RhoPi(12,25)
+			RhoPi( 2,43)
+			RhoPi(20,62)
+			RhoPi(14,18)
+			RhoPi(22,39)
+			RhoPi( 9,61)
+			RhoPi( 6,20)
+			RhoPi( 1,44)
+#undef RhoPi
+		}
+		/* Chi */
+		for (x = 0; x <= 20; x += 5) {
+			/*
+			 * Can write this in terms of uint32 too,
+			 * but why? compiler does it automatically.
+			 */
+			uint64_t BC0, BC1, BC2, BC3, BC4;
+			BC0 = state[x + 0];
+			BC1 = state[x + 1];
+			BC2 = state[x + 2];
+			state[x + 0] = BC0 ^ ((~BC1) & BC2);
+			BC3 = state[x + 3];
+			state[x + 1] = BC1 ^ ((~BC2) & BC3);
+			BC4 = state[x + 4];
+			state[x + 2] = BC2 ^ ((~BC3) & BC4);
+			state[x + 3] = BC3 ^ ((~BC4) & BC0);
+			state[x + 4] = BC4 ^ ((~BC0) & BC1);
+		}
+		/* Iota */
+		s32[0] ^= IOTA_CONST_0bits & 1;
+		IOTA_CONST_0bits >>= 1;
+		s32[1] ^= IOTA_CONST_1[round];
+	}
+
+	combine_halves(state);
+#else
 	/* Elements should be 64-bit, but top half is always zero or 0x80000000.
 	 * We encode 63rd bits in a separate word below.
 	 * Same is true for 31th bits, which lets us use 16-bit table instead of 64-bit.
@@ -983,7 +1212,7 @@ static void sha3_process_block72(uint64_t *state)
 	};
 	/*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
 
-	unsigned x, y;
+	unsigned x;
 	unsigned round;
 
 	if (BB_BIG_ENDIAN) {
@@ -1045,22 +1274,20 @@ static void sha3_process_block72(uint64_t *state)
 		RhoPi_twice(20); RhoPi_twice(22);
 #undef RhoPi_twice
 	}
-
 	/* Chi */
-	for (y = 0; y <= 20; y += 5) {
+	for (x = 0; x <= 20; x += 5) {
 		uint64_t BC0, BC1, BC2, BC3, BC4;
-		BC0 = state[y + 0];
-		BC1 = state[y + 1];
-		BC2 = state[y + 2];
-		state[y + 0] = BC0 ^ ((~BC1) & BC2);
-		BC3 = state[y + 3];
-		state[y + 1] = BC1 ^ ((~BC2) & BC3);
-		BC4 = state[y + 4];
-		state[y + 2] = BC2 ^ ((~BC3) & BC4);
-		state[y + 3] = BC3 ^ ((~BC4) & BC0);
-		state[y + 4] = BC4 ^ ((~BC0) & BC1);
+		BC0 = state[x + 0];
+		BC1 = state[x + 1];
+		BC2 = state[x + 2];
+		state[x + 0] = BC0 ^ ((~BC1) & BC2);
+		BC3 = state[x + 3];
+		state[x + 1] = BC1 ^ ((~BC2) & BC3);
+		BC4 = state[x + 4];
+		state[x + 2] = BC2 ^ ((~BC3) & BC4);
+		state[x + 3] = BC3 ^ ((~BC4) & BC0);
+		state[x + 4] = BC4 ^ ((~BC0) & BC1);
 	}
-
 	/* Iota */
 	state[0] ^= IOTA_CONST[round]
 		| (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
@@ -1072,6 +1299,7 @@ static void sha3_process_block72(uint64_t *state)
 			state[x] = SWAP_LE64(state[x]);
 		}
 	}
+#endif
 }
 
 void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
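A second sketch, also not part of the patch, shows where IOTA_CONST_0bits and IOTA_CONST_1 come from: they are the even/odd bit-split of the Keccak-f[1600] round constants, which the reference path in this file keeps only in a compressed form. The 64-bit KECCAK_RC table below is the standard published constant list (an assumption, since it does not appear verbatim in the hunk); the expected values are copied from the patch. Every even-bit half comes out as 0 or 1, which is why the patch can pack all 24 of them into the single word 0x0050f4f1 and shift one bit out per round instead of carrying a second 24-entry table.

```c
/* Standalone sketch: re-derive the bit-sliced Iota constants used in the
 * patch from the standard Keccak-f[1600] round constants (assumed below). */
#include <stdint.h>
#include <stdio.h>

static const uint64_t KECCAK_RC[24] = {
	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
	0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
	0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
	0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
};

/* Expected odd-bit halves, copied from IOTA_CONST_1 in the patch above */
static const uint32_t IOTA_CONST_1[24] = {
	0x00000000, 0x00000089, 0x8000008b, 0x80008080, 0x0000008b, 0x00008000,
	0x80008088, 0x80000082, 0x0000000b, 0x0000000a, 0x00008082, 0x00008003,
	0x0000808b, 0x8000000b, 0x8000008a, 0x80000081, 0x80000081, 0x80000008,
	0x00000083, 0x80008003, 0x80008088, 0x80000088, 0x00008000, 0x80008082,
};

int main(void)
{
	uint32_t even_bits_packed = 0;
	int bad = 0;
	int r, i;

	for (r = 0; r < 24; r++) {
		uint32_t even = 0, odd = 0;
		for (i = 0; i < 32; i++) {
			even |= (uint32_t)((KECCAK_RC[r] >> (2 * i)) & 1) << i;
			odd  |= (uint32_t)((KECCAK_RC[r] >> (2 * i + 1)) & 1) << i;
		}
		/* Each even-bit half is just 0 or 1, so the patch packs those
		 * 24 bits into one word (round r -> bit r). */
		even_bits_packed |= (even & 1) << r;
		if (even > 1 || odd != IOTA_CONST_1[r]) {
			printf("round %d: even=%08x odd=%08x (expected odd=%08x)\n",
				r, (unsigned)even, (unsigned)odd, (unsigned)IOTA_CONST_1[r]);
			bad = 1;
		}
	}
	printf("packed even bits: 0x%08x (the patch uses 0x0050f4f1)\n",
		(unsigned)even_bits_packed);
	return bad || even_bits_packed != 0x0050f4f1;
}
```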
