aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2014-07-25 17:24:13 +0200
committerDenys Vlasenko <vda.linux@googlemail.com>2014-07-25 17:24:13 +0200
commit2a563ea49a16589f47ed6afe7b22eebef4e3a6d1 (patch)
tree212bbad503c49c6ae0aabaeaef0ab023ca584f39
parenta4d564ad7c24df2da1d8e03a7dd016f0a3d5edbd (diff)
downloadbusybox-w32-2a563ea49a16589f47ed6afe7b22eebef4e3a6d1.tar.gz
busybox-w32-2a563ea49a16589f47ed6afe7b22eebef4e3a6d1.tar.bz2
busybox-w32-2a563ea49a16589f47ed6afe7b22eebef4e3a6d1.zip
sha3: add 32-bit optimized bit-sliced implementation
It is an interesting trick, but so far I only managed to make it work correctly, not to make it faster and/or smaller. The code is ifdefed out for now.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--libbb/hash_md5_sha.c256
1 file changed, 242 insertions(+), 14 deletions(-)
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 3f743ac75..dff583ad1 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -926,10 +926,81 @@ void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
926# define SHA3_SMALL CONFIG_SHA3_SMALL 926# define SHA3_SMALL CONFIG_SHA3_SMALL
927#endif 927#endif
928 928
929#define OPTIMIZE_SHA3_FOR_32 0
930/*
931 * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
932 * every 64-bit word of state[] can be split into two 32-bit words
933 * by even/odd bits. In this form, all rotations of sha3 round
934 * are 32-bit - and there are lots of them.
935 * However, it requires either splitting/combining state words
936 * before/after sha3 round (code does this now)
937 * or shuffling bits before xor'ing them into state and in sha3_end.
938 * Without shuffling, bit-slicing results in -130 bytes of code
939 * and marginal speedup (but of course it gives wrong result).
940 * With shuffling it works, but +260 code bytes, and slower.
941 * Disabled for now:
942 */
943#if 0 /* LONG_MAX == 0x7fffffff */
944# undef OPTIMIZE_SHA3_FOR_32
945# define OPTIMIZE_SHA3_FOR_32 1
946#endif
947
929enum { 948enum {
930 SHA3_IBLK_BYTES = 72, /* 576 bits / 8 */ 949 SHA3_IBLK_BYTES = 72, /* 576 bits / 8 */
931}; 950};
932 951
952#if OPTIMIZE_SHA3_FOR_32
953/* This splits every 64-bit word into a pair of 32-bit words,
954 * even bits go into first word, odd bits go to second one.
955 * The conversion is done in-place.
956 */
957static void split_halves(uint64_t *state)
958{
959 /* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
960 uint32_t *s32 = (uint32_t*)state;
961 uint32_t t, x0, x1;
962 int i;
963 for (i = 24; i >= 0; --i) {
964 x0 = s32[0];
965 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
966 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
967 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
968 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
969 x1 = s32[1];
970 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
971 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
972 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
973 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
974 *s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
975 *s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
976 }
977}
978/* The reverse operation */
979static void combine_halves(uint64_t *state)
980{
981 uint32_t *s32 = (uint32_t*)state;
982 uint32_t t, x0, x1;
983 int i;
984 for (i = 24; i >= 0; --i) {
985 x0 = s32[0];
986 x1 = s32[1];
987 t = (x0 & 0x0000FFFF) | (x1 << 16);
988 x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
989 x0 = t;
990 t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
991 t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
992 t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
993 t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
994 *s32++ = x0;
995 t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
996 t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
997 t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
998 t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
999 *s32++ = x1;
1000 }
1001}
1002#endif
1003
933/* 1004/*
934 * In the crypto literature this function is usually called Keccak-f(). 1005 * In the crypto literature this function is usually called Keccak-f().
935 */ 1006 */
@@ -937,6 +1008,164 @@ static void sha3_process_block72(uint64_t *state)
937{ 1008{
938 enum { NROUNDS = 24 }; 1009 enum { NROUNDS = 24 };
939 1010
1011#if OPTIMIZE_SHA3_FOR_32
1012 /*
1013 static const uint32_t IOTA_CONST_0[NROUNDS] = {
1014 0x00000001UL,
1015 0x00000000UL,
1016 0x00000000UL,
1017 0x00000000UL,
1018 0x00000001UL,
1019 0x00000001UL,
1020 0x00000001UL,
1021 0x00000001UL,
1022 0x00000000UL,
1023 0x00000000UL,
1024 0x00000001UL,
1025 0x00000000UL,
1026 0x00000001UL,
1027 0x00000001UL,
1028 0x00000001UL,
1029 0x00000001UL,
1030 0x00000000UL,
1031 0x00000000UL,
1032 0x00000000UL,
1033 0x00000000UL,
1034 0x00000001UL,
1035 0x00000000UL,
1036 0x00000001UL,
1037 0x00000000UL,
1038 };
1039 ** bits are in lsb: 0101 0000 1111 0100 1111 0001
1040 */
1041 uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
1042 static const uint32_t IOTA_CONST_1[NROUNDS] = {
1043 0x00000000UL,
1044 0x00000089UL,
1045 0x8000008bUL,
1046 0x80008080UL,
1047 0x0000008bUL,
1048 0x00008000UL,
1049 0x80008088UL,
1050 0x80000082UL,
1051 0x0000000bUL,
1052 0x0000000aUL,
1053 0x00008082UL,
1054 0x00008003UL,
1055 0x0000808bUL,
1056 0x8000000bUL,
1057 0x8000008aUL,
1058 0x80000081UL,
1059 0x80000081UL,
1060 0x80000008UL,
1061 0x00000083UL,
1062 0x80008003UL,
1063 0x80008088UL,
1064 0x80000088UL,
1065 0x00008000UL,
1066 0x80008082UL,
1067 };
1068
1069 uint32_t *const s32 = (uint32_t*)state;
1070 unsigned round;
1071
1072 split_halves(state);
1073
1074 for (round = 0; round < NROUNDS; round++) {
1075 unsigned x;
1076
1077 /* Theta */
1078 {
1079 uint32_t BC[20];
1080 for (x = 0; x < 10; ++x) {
1081 BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
1082 }
1083 for (x = 0; x < 10; x += 2) {
1084 uint32_t ta, tb;
1085 ta = BC[x+8] ^ rotl32(BC[x+3], 1);
1086 tb = BC[x+9] ^ BC[x+2];
1087 s32[x+0] ^= ta;
1088 s32[x+1] ^= tb;
1089 s32[x+10] ^= ta;
1090 s32[x+11] ^= tb;
1091 s32[x+20] ^= ta;
1092 s32[x+21] ^= tb;
1093 s32[x+30] ^= ta;
1094 s32[x+31] ^= tb;
1095 s32[x+40] ^= ta;
1096 s32[x+41] ^= tb;
1097 }
1098 }
1099 /* RhoPi */
1100 {
1101 uint32_t t0a,t0b, t1a,t1b;
1102 t1a = s32[1*2+0];
1103 t1b = s32[1*2+1];
1104
1105#define RhoPi(PI_LANE, ROT_CONST) \
1106 t0a = s32[PI_LANE*2+0];\
1107 t0b = s32[PI_LANE*2+1];\
1108 if (ROT_CONST & 1) {\
1109 s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
1110 s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
1111 } else {\
1112 s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
1113 s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
1114 }\
1115 t1a = t0a; t1b = t0b;
1116
1117 RhoPi(10, 1)
1118 RhoPi( 7, 3)
1119 RhoPi(11, 6)
1120 RhoPi(17,10)
1121 RhoPi(18,15)
1122 RhoPi( 3,21)
1123 RhoPi( 5,28)
1124 RhoPi(16,36)
1125 RhoPi( 8,45)
1126 RhoPi(21,55)
1127 RhoPi(24, 2)
1128 RhoPi( 4,14)
1129 RhoPi(15,27)
1130 RhoPi(23,41)
1131 RhoPi(19,56)
1132 RhoPi(13, 8)
1133 RhoPi(12,25)
1134 RhoPi( 2,43)
1135 RhoPi(20,62)
1136 RhoPi(14,18)
1137 RhoPi(22,39)
1138 RhoPi( 9,61)
1139 RhoPi( 6,20)
1140 RhoPi( 1,44)
1141#undef RhoPi
1142 }
1143 /* Chi */
1144 for (x = 0; x <= 20; x += 5) {
1145 /*
1146 * Can write this in terms of uint32 too,
1147 * but why? compiler does it automatically.
1148 */
1149 uint64_t BC0, BC1, BC2, BC3, BC4;
1150 BC0 = state[x + 0];
1151 BC1 = state[x + 1];
1152 BC2 = state[x + 2];
1153 state[x + 0] = BC0 ^ ((~BC1) & BC2);
1154 BC3 = state[x + 3];
1155 state[x + 1] = BC1 ^ ((~BC2) & BC3);
1156 BC4 = state[x + 4];
1157 state[x + 2] = BC2 ^ ((~BC3) & BC4);
1158 state[x + 3] = BC3 ^ ((~BC4) & BC0);
1159 state[x + 4] = BC4 ^ ((~BC0) & BC1);
1160 }
1161 /* Iota */
1162 s32[0] ^= IOTA_CONST_0bits & 1;
1163 IOTA_CONST_0bits >>= 1;
1164 s32[1] ^= IOTA_CONST_1[round];
1165 }
1166
1167 combine_halves(state);
1168#else
940 /* Elements should be 64-bit, but top half is always zero or 0x80000000. 1169 /* Elements should be 64-bit, but top half is always zero or 0x80000000.
941 * We encode 63rd bits in a separate word below. 1170 * We encode 63rd bits in a separate word below.
942 * Same is true for 31th bits, which lets us use 16-bit table instead of 64-bit. 1171 * Same is true for 31th bits, which lets us use 16-bit table instead of 64-bit.
@@ -983,7 +1212,7 @@ static void sha3_process_block72(uint64_t *state)
983 }; 1212 };
984 /*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/ 1213 /*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
985 1214
986 unsigned x, y; 1215 unsigned x;
987 unsigned round; 1216 unsigned round;
988 1217
989 if (BB_BIG_ENDIAN) { 1218 if (BB_BIG_ENDIAN) {
@@ -1045,22 +1274,20 @@ static void sha3_process_block72(uint64_t *state)
1045 RhoPi_twice(20); RhoPi_twice(22); 1274 RhoPi_twice(20); RhoPi_twice(22);
1046#undef RhoPi_twice 1275#undef RhoPi_twice
1047 } 1276 }
1048
1049 /* Chi */ 1277 /* Chi */
1050 for (y = 0; y <= 20; y += 5) { 1278 for (x = 0; x <= 20; x += 5) {
1051 uint64_t BC0, BC1, BC2, BC3, BC4; 1279 uint64_t BC0, BC1, BC2, BC3, BC4;
1052 BC0 = state[y + 0]; 1280 BC0 = state[x + 0];
1053 BC1 = state[y + 1]; 1281 BC1 = state[x + 1];
1054 BC2 = state[y + 2]; 1282 BC2 = state[x + 2];
1055 state[y + 0] = BC0 ^ ((~BC1) & BC2); 1283 state[x + 0] = BC0 ^ ((~BC1) & BC2);
1056 BC3 = state[y + 3]; 1284 BC3 = state[x + 3];
1057 state[y + 1] = BC1 ^ ((~BC2) & BC3); 1285 state[x + 1] = BC1 ^ ((~BC2) & BC3);
1058 BC4 = state[y + 4]; 1286 BC4 = state[x + 4];
1059 state[y + 2] = BC2 ^ ((~BC3) & BC4); 1287 state[x + 2] = BC2 ^ ((~BC3) & BC4);
1060 state[y + 3] = BC3 ^ ((~BC4) & BC0); 1288 state[x + 3] = BC3 ^ ((~BC4) & BC0);
1061 state[y + 4] = BC4 ^ ((~BC0) & BC1); 1289 state[x + 4] = BC4 ^ ((~BC0) & BC1);
1062 } 1290 }
1063
1064 /* Iota */ 1291 /* Iota */
1065 state[0] ^= IOTA_CONST[round] 1292 state[0] ^= IOTA_CONST[round]
1066 | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000) 1293 | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
@@ -1072,6 +1299,7 @@ static void sha3_process_block72(uint64_t *state)
1072 state[x] = SWAP_LE64(state[x]); 1299 state[x] = SWAP_LE64(state[x]);
1073 } 1300 }
1074 } 1301 }
1302#endif
1075} 1303}
1076 1304
1077void FAST_FUNC sha3_begin(sha3_ctx_t *ctx) 1305void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)