Diffstat (limited to 'libbb/hash_md5_sha.c')
-rw-r--r--  libbb/hash_md5_sha.c | 317
1 file changed, 298 insertions(+), 19 deletions(-)
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 3f743ac75..1f63ccdee 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -926,10 +926,81 @@ void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
 # define SHA3_SMALL CONFIG_SHA3_SMALL
 #endif
 
+#define OPTIMIZE_SHA3_FOR_32 0
+/*
+ * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
+ * every 64-bit word of state[] can be split into two 32-bit words
+ * by even/odd bits. In this form, all rotations of the sha3 round
+ * are 32-bit - and there are lots of them.
+ * However, it requires either splitting/combining state words
+ * before/after the sha3 round (the code below does this)
+ * or shuffling bits before xor'ing them into state and in sha3_end.
+ * Without shuffling, bit-slicing saves 130 bytes of code
+ * with a marginal speedup (but of course it gives the wrong result).
+ * With shuffling it works, but costs +260 code bytes, and is slower.
+ * Disabled for now:
+ */
+#if 0 /* LONG_MAX == 0x7fffffff */
+# undef OPTIMIZE_SHA3_FOR_32
+# define OPTIMIZE_SHA3_FOR_32 1
+#endif
+
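As a sanity check on the even/odd split described in the comment above, it can be written naively, one bit at a time. This is an illustrative sketch, not part of the patch: bit 2j of w lands in bit j of the "even" word, bit 2j+1 in bit j of the "odd" word. The payoff is that a 64-bit rotation by an even amount becomes two independent 32-bit rotations, and an odd amount additionally swaps the two words - which is why every rotation in the bit-sliced round below is 32-bit.

/* Reference sketch (hypothetical, not in the patch) */
static void split_bits_ref(uint64_t w, uint32_t *even, uint32_t *odd)
{
	unsigned j;
	*even = *odd = 0;
	for (j = 0; j < 32; j++) {
		*even |= (uint32_t)((w >> (2*j)) & 1) << j;
		*odd  |= (uint32_t)((w >> (2*j + 1)) & 1) << j;
	}
}
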
 enum {
 	SHA3_IBLK_BYTES = 72, /* 576 bits / 8 */
 };
 
+#if OPTIMIZE_SHA3_FOR_32
+/* This splits every 64-bit word into a pair of 32-bit words:
+ * even bits go into the first word, odd bits into the second.
+ * The conversion is done in-place.
+ */
+static void split_halves(uint64_t *state)
+{
+	/* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+	uint32_t *s32 = (uint32_t*)state;
+	uint32_t t, x0, x1;
+	int i;
+	for (i = 24; i >= 0; --i) {
+		x0 = s32[0];
+		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
+		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
+		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
+		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
+		x1 = s32[1];
+		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
+		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
+		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
+		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
+		*s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
+		*s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
+	}
+}
+/* The reverse operation */
+static void combine_halves(uint64_t *state)
+{
+	uint32_t *s32 = (uint32_t*)state;
+	uint32_t t, x0, x1;
+	int i;
+	for (i = 24; i >= 0; --i) {
+		x0 = s32[0];
+		x1 = s32[1];
+		t = (x0 & 0x0000FFFF) | (x1 << 16);
+		x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
+		x0 = t;
+		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
+		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
+		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
+		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
+		*s32++ = x0;
+		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
+		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
+		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
+		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
+		*s32++ = x1;
+	}
+}
+#endif
+
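Since the conversion is in-place and the two functions are exact inverses, they are easy to self-test. A hypothetical sketch (not in the patch; assumes the file is built with OPTIMIZE_SHA3_FOR_32 enabled so split_halves()/combine_halves() exist):

#include <assert.h>
static void split_combine_selftest(void)
{
	uint64_t st[25], ref[25];
	unsigned i;
	for (i = 0; i < 25; i++)
		ref[i] = st[i] = 0x0123456789abcdefULL + i * 0x1111111111111111ULL;
	split_halves(st);   /* even bits, then odd bits, per 64-bit lane */
	combine_halves(st); /* inverse: must restore the original state */
	for (i = 0; i < 25; i++)
		assert(st[i] == ref[i]);
}
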
 /*
  * In the crypto literature this function is usually called Keccak-f().
  */
@@ -937,12 +1008,179 @@ static void sha3_process_block72(uint64_t *state)
 {
 	enum { NROUNDS = 24 };
 
-	/* Elements should be 64-bit, but top half is always zero or 0x80000000.
-	 * We encode 63rd bits in a separate word below.
-	 * Same is true for 31th bits, which lets us use 16-bit table instead of 64-bit.
-	 * The speed penalty is lost in the noise.
-	 */
+#if OPTIMIZE_SHA3_FOR_32
+	/*
+	static const uint32_t IOTA_CONST_0[NROUNDS] = {
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+	};
+	** bits are in lsb: 0101 0000 1111 0100 1111 0001
+	*/
+	uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
+	static const uint32_t IOTA_CONST_1[NROUNDS] = {
+		0x00000000UL,
+		0x00000089UL,
+		0x8000008bUL,
+		0x80008080UL,
+		0x0000008bUL,
+		0x00008000UL,
+		0x80008088UL,
+		0x80000082UL,
+		0x0000000bUL,
+		0x0000000aUL,
+		0x00008082UL,
+		0x00008003UL,
+		0x0000808bUL,
+		0x8000000bUL,
+		0x8000008aUL,
+		0x80000081UL,
+		0x80000081UL,
+		0x80000008UL,
+		0x00000083UL,
+		0x80008003UL,
+		0x80008088UL,
+		0x80000088UL,
+		0x00008000UL,
+		0x80008082UL,
+	};
+
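The 24 entries of the commented-out IOTA_CONST_0[] table are all 0 or 1 (they are the even-bit halves of the round constants), so the patch packs them LSB-first into the single literal 0x0050f4f1 - set bits at rounds 0, 4-7, 10, 12-15, 20 and 22 - and shifts one bit out per round in the Iota step further down. A hedged sketch of how that literal can be derived from the table:

/* Sketch (not in the patch): pack the 24 one-bit entries LSB-first */
static uint32_t pack_iota0(const uint32_t tbl[24])
{
	uint32_t bits = 0;
	int r;
	for (r = 23; r >= 0; r--)
		bits = (bits << 1) | (tbl[r] & 1);
	return bits; /* == 0x0050f4f1 for the table above */
}
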
+	uint32_t *const s32 = (uint32_t*)state;
+	unsigned round;
+
+	split_halves(state);
+
+	for (round = 0; round < NROUNDS; round++) {
+		unsigned x;
+
+		/* Theta */
+		{
+			uint32_t BC[20];
+			for (x = 0; x < 10; ++x) {
+				BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
+			}
+			for (x = 0; x < 10; x += 2) {
+				uint32_t ta, tb;
+				ta = BC[x+8] ^ rotl32(BC[x+3], 1);
+				tb = BC[x+9] ^ BC[x+2];
+				s32[x+0] ^= ta;
+				s32[x+1] ^= tb;
+				s32[x+10] ^= ta;
+				s32[x+11] ^= tb;
+				s32[x+20] ^= ta;
+				s32[x+21] ^= tb;
+				s32[x+30] ^= ta;
+				s32[x+31] ^= tb;
+				s32[x+40] ^= ta;
+				s32[x+41] ^= tb;
+			}
+		}
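In the usual 64-bit formulation, Theta computes D[x] = C[x-1] ^ rotl64(C[x+1], 1). After the even/odd split, a 1-bit lane rotation moves the odd half into the even positions (rotated by one) and the even half into the odd positions (unrotated), which is exactly the ta/tb pair above: BC[x+8]/BC[x+9] are lane x+4 (i.e. x-1 mod 5) and BC[x+2]/BC[x+3] are lane x+1, courtesy of the doubled BC[] array that avoids mod-5 indexing. A hedged sketch of the identity, with C split into Ce/Co halves (rotl32 is assumed to be busybox's rotate helper):

/* Sketch (not in the patch): one Theta D-lane in interleaved form */
static void theta_d(const uint32_t Ce[5], const uint32_t Co[5],
		unsigned x, uint32_t *De, uint32_t *Do)
{
	*De = Ce[(x + 4) % 5] ^ rotl32(Co[(x + 1) % 5], 1);
	*Do = Co[(x + 4) % 5] ^ Ce[(x + 1) % 5];
}
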
+		/* RhoPi */
+		{
+			uint32_t t0a,t0b, t1a,t1b;
+			t1a = s32[1*2+0];
+			t1b = s32[1*2+1];
+
+#define RhoPi(PI_LANE, ROT_CONST) \
+	t0a = s32[PI_LANE*2+0];\
+	t0b = s32[PI_LANE*2+1];\
+	if (ROT_CONST & 1) {\
+		s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
+		s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
+	} else {\
+		s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
+		s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
+	}\
+	t1a = t0a; t1b = t0b;
+
+			RhoPi(10, 1)
+			RhoPi( 7, 3)
+			RhoPi(11, 6)
+			RhoPi(17,10)
+			RhoPi(18,15)
+			RhoPi( 3,21)
+			RhoPi( 5,28)
+			RhoPi(16,36)
+			RhoPi( 8,45)
+			RhoPi(21,55)
+			RhoPi(24, 2)
+			RhoPi( 4,14)
+			RhoPi(15,27)
+			RhoPi(23,41)
+			RhoPi(19,56)
+			RhoPi(13, 8)
+			RhoPi(12,25)
+			RhoPi( 2,43)
+			RhoPi(20,62)
+			RhoPi(14,18)
+			RhoPi(22,39)
+			RhoPi( 9,61)
+			RhoPi( 6,20)
+			RhoPi( 1,44)
+#undef RhoPi
+		}
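The ROT_CONST & 1 branch in the macro is the general rule for rotating an interleaved lane: an even rotation by 2k rotates both halves by k in place, while an odd rotation by 2k+1 swaps the halves' roles, rotating the new even half by k+1 and the new odd half by k. The ROT_CONST == 1 special case avoids a rotate-by-zero, which a shift-based rotl32 macro would make undefined. As a standalone sketch (not in the patch; assumes rotl32 handles the nonzero amounts used here):

/* Sketch: rotl64 on a lane stored as (even, odd) 32-bit halves */
static void rotl64_interleaved(uint32_t *e, uint32_t *o, unsigned n)
{
	uint32_t x0 = *e, x1 = *o;
	if (n & 1) {
		*e = rotl32(x1, n/2 + 1);
		*o = (n == 1) ? x0 : rotl32(x0, n/2);
	} else {
		*e = rotl32(x0, n/2);
		*o = rotl32(x1, n/2);
	}
}
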
+		/* Chi */
+		for (x = 0; x <= 40;) {
+			uint32_t BC0, BC1, BC2, BC3, BC4;
+			BC0 = s32[x + 0*2];
+			BC1 = s32[x + 1*2];
+			BC2 = s32[x + 2*2];
+			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+			BC3 = s32[x + 3*2];
+			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+			BC4 = s32[x + 4*2];
+			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+			x++;
+			BC0 = s32[x + 0*2];
+			BC1 = s32[x + 1*2];
+			BC2 = s32[x + 2*2];
+			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+			BC3 = s32[x + 3*2];
+			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+			BC4 = s32[x + 4*2];
+			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+			x += 9;
+		}
+		/* Iota */
+		s32[0] ^= IOTA_CONST_0bits & 1;
+		IOTA_CONST_0bits >>= 1;
+		s32[1] ^= IOTA_CONST_1[round];
+	}
+
+	combine_halves(state);
+#else
+	/* Native 64-bit algorithm */
 	static const uint16_t IOTA_CONST[NROUNDS] = {
+	/* Elements should be 64-bit, but top half is always zero
+	 * or 0x80000000. We encode 63rd bits in a separate word below.
+	 * Same is true for 31st bits, which lets us use a 16-bit table
+	 * instead of a 64-bit one. The speed penalty is lost in the noise.
+	 */
 		0x0001,
 		0x8082,
 		0x808a,
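Each 64-bit Keccak round constant has nonzero bits only in its low 16 bits plus, possibly, bits 31 and 63, which is what lets a uint16_t table suffice. The two high bits are kept as one-bit-per-round masks, stored with round r's bit at position 31-r so that a left shift by round exposes it at bit 31 (IOTA_CONST_bit31 appears in the Iota step of this diff; a matching IOTA_CONST_bit63 is implied by the comment above and assumed here). A hedged reconstruction sketch:

/* Sketch (not in the patch): rebuilding the full 64-bit constant.
 * IOTA_CONST_bit63 is assumed to mirror IOTA_CONST_bit31.
 */
static uint64_t iota_const64(unsigned round)
{
	return IOTA_CONST[round]
		| (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
		| (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
}
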
@@ -983,7 +1221,7 @@ static void sha3_process_block72(uint64_t *state)
 	};
 	/*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
 
-	unsigned x, y;
+	unsigned x;
 	unsigned round;
 
 	if (BB_BIG_ENDIAN) {
@@ -1045,22 +1283,62 @@ static void sha3_process_block72(uint64_t *state)
 			RhoPi_twice(20); RhoPi_twice(22);
 #undef RhoPi_twice
 		}
-
 		/* Chi */
-		for (y = 0; y <= 20; y += 5) {
+# if LONG_MAX > 0x7fffffff
+		for (x = 0; x <= 20; x += 5) {
 			uint64_t BC0, BC1, BC2, BC3, BC4;
-			BC0 = state[y + 0];
-			BC1 = state[y + 1];
-			BC2 = state[y + 2];
-			state[y + 0] = BC0 ^ ((~BC1) & BC2);
-			BC3 = state[y + 3];
-			state[y + 1] = BC1 ^ ((~BC2) & BC3);
-			BC4 = state[y + 4];
-			state[y + 2] = BC2 ^ ((~BC3) & BC4);
-			state[y + 3] = BC3 ^ ((~BC4) & BC0);
-			state[y + 4] = BC4 ^ ((~BC0) & BC1);
+			BC0 = state[x + 0];
+			BC1 = state[x + 1];
+			BC2 = state[x + 2];
+			state[x + 0] = BC0 ^ ((~BC1) & BC2);
+			BC3 = state[x + 3];
+			state[x + 1] = BC1 ^ ((~BC2) & BC3);
+			BC4 = state[x + 4];
+			state[x + 2] = BC2 ^ ((~BC3) & BC4);
+			state[x + 3] = BC3 ^ ((~BC4) & BC0);
+			state[x + 4] = BC4 ^ ((~BC0) & BC1);
 		}
-
+# else
+		/* Reduced register pressure version
+		 * for register-starved 32-bit arches
+		 * (i386: -95 bytes, and it is _faster_)
+		 */
+		for (x = 0; x <= 40;) {
+			uint32_t BC0, BC1, BC2, BC3, BC4;
+			uint32_t *const s32 = (uint32_t*)state;
+# if SHA3_SMALL
+ do_half:
+# endif
+			BC0 = s32[x + 0*2];
+			BC1 = s32[x + 1*2];
+			BC2 = s32[x + 2*2];
+			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+			BC3 = s32[x + 3*2];
+			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+			BC4 = s32[x + 4*2];
+			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+			x++;
+# if SHA3_SMALL
+			if (x & 1)
+				goto do_half;
+			x += 8;
+# else
+			BC0 = s32[x + 0*2];
+			BC1 = s32[x + 1*2];
+			BC2 = s32[x + 2*2];
+			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+			BC3 = s32[x + 3*2];
+			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+			BC4 = s32[x + 4*2];
+			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+			x += 9;
+# endif
+		}
+# endif /* long is 32-bit */
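Chi is the round's only nonlinear step, but it is purely bitwise - chi(a,b,c) = a ^ (~b & c), with no rotations - so it acts on every bit position independently, and a 64-bit lane can be processed as two unrelated 32-bit words. That is what both the s32[] loop above and the bit-sliced variant earlier exploit; the SHA3_SMALL path re-runs the same body via goto to roughly halve the code. A sketch of the invariant (not in the patch):

/* Sketch: chi distributes over any split of a word into fields */
static uint32_t chi32(uint32_t a, uint32_t b, uint32_t c)
{
	return a ^ (~b & c);
}
/* For any 64-bit lanes a,b,c:
 *   (uint32_t)chi64(a,b,c) == chi32((uint32_t)a, (uint32_t)b, (uint32_t)c)
 *   (chi64(a,b,c) >> 32)   == chi32(a >> 32, b >> 32, c >> 32)
 */
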
 		/* Iota */
 		state[0] ^= IOTA_CONST[round]
 			| (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
@@ -1072,6 +1350,7 @@ static void sha3_process_block72(uint64_t *state)
 			state[x] = SWAP_LE64(state[x]);
 		}
 	}
+#endif
 }
 
 void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)