diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2014-07-25 17:24:13 +0200 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2014-07-25 17:24:13 +0200 |
commit | 2a563ea49a16589f47ed6afe7b22eebef4e3a6d1 (patch) | |
tree | 212bbad503c49c6ae0aabaeaef0ab023ca584f39 | |
parent | a4d564ad7c24df2da1d8e03a7dd016f0a3d5edbd (diff) | |
download | busybox-w32-2a563ea49a16589f47ed6afe7b22eebef4e3a6d1.tar.gz busybox-w32-2a563ea49a16589f47ed6afe7b22eebef4e3a6d1.tar.bz2 busybox-w32-2a563ea49a16589f47ed6afe7b22eebef4e3a6d1.zip |
sha3: add 32-bit optimized bit-sliced implementation
It is an interesting trick, but so far I only managed to make it work
correctly, not to make it faster and/or smaller.
The code is ifdefed out for now.
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | libbb/hash_md5_sha.c | 256 |
1 files changed, 242 insertions, 14 deletions
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index 3f743ac75..dff583ad1 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c | |||
@@ -926,10 +926,81 @@ void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf) | |||
926 | # define SHA3_SMALL CONFIG_SHA3_SMALL | 926 | # define SHA3_SMALL CONFIG_SHA3_SMALL |
927 | #endif | 927 | #endif |
928 | 928 | ||
929 | #define OPTIMIZE_SHA3_FOR_32 0 | ||
930 | /* | ||
931 | * SHA3 can be optimized for 32-bit CPUs with bit-slicing: | ||
932 | * every 64-bit word of state[] can be split into two 32-bit words | ||
933 | * by even/odd bits. In this form, all rotations of sha3 round | ||
934 | * are 32-bit - and there are lots of them. | ||
935 | * However, it requires either splitting/combining state words | ||
936 | * before/after sha3 round (code does this now) | ||
937 | * or shuffling bits before xor'ing them into state and in sha3_end. | ||
938 | * Without shuffling, bit-slicing results in -130 bytes of code | ||
939 | * and marginal speedup (but of course it gives wrong result). | ||
940 | * With shuffling it works, but +260 code bytes, and slower. | ||
941 | * Disabled for now: | ||
942 | */ | ||
943 | #if 0 /* LONG_MAX == 0x7fffffff */ | ||
944 | # undef OPTIMIZE_SHA3_FOR_32 | ||
945 | # define OPTIMIZE_SHA3_FOR_32 1 | ||
946 | #endif | ||
947 | |||
929 | enum { | 948 | enum { |
930 | SHA3_IBLK_BYTES = 72, /* 576 bits / 8 */ | 949 | SHA3_IBLK_BYTES = 72, /* 576 bits / 8 */ |
931 | }; | 950 | }; |
932 | 951 | ||
952 | #if OPTIMIZE_SHA3_FOR_32 | ||
953 | /* This splits every 64-bit word into a pair of 32-bit words, | ||
954 | * even bits go into first word, odd bits go to second one. | ||
955 | * The conversion is done in-place. | ||
956 | */ | ||
957 | static void split_halves(uint64_t *state) | ||
958 | { | ||
959 | /* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */ | ||
960 | uint32_t *s32 = (uint32_t*)state; | ||
961 | uint32_t t, x0, x1; | ||
962 | int i; | ||
963 | for (i = 24; i >= 0; --i) { | ||
964 | x0 = s32[0]; | ||
965 | t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1); | ||
966 | t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2); | ||
967 | t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4); | ||
968 | t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8); | ||
969 | x1 = s32[1]; | ||
970 | t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1); | ||
971 | t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2); | ||
972 | t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4); | ||
973 | t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8); | ||
974 | *s32++ = (x0 & 0x0000FFFF) | (x1 << 16); | ||
975 | *s32++ = (x0 >> 16) | (x1 & 0xFFFF0000); | ||
976 | } | ||
977 | } | ||
978 | /* The reverse operation */ | ||
979 | static void combine_halves(uint64_t *state) | ||
980 | { | ||
981 | uint32_t *s32 = (uint32_t*)state; | ||
982 | uint32_t t, x0, x1; | ||
983 | int i; | ||
984 | for (i = 24; i >= 0; --i) { | ||
985 | x0 = s32[0]; | ||
986 | x1 = s32[1]; | ||
987 | t = (x0 & 0x0000FFFF) | (x1 << 16); | ||
988 | x1 = (x0 >> 16) | (x1 & 0xFFFF0000); | ||
989 | x0 = t; | ||
990 | t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8); | ||
991 | t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4); | ||
992 | t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2); | ||
993 | t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1); | ||
994 | *s32++ = x0; | ||
995 | t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8); | ||
996 | t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4); | ||
997 | t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2); | ||
998 | t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1); | ||
999 | *s32++ = x1; | ||
1000 | } | ||
1001 | } | ||
1002 | #endif | ||
1003 | |||
933 | /* | 1004 | /* |
934 | * In the crypto literature this function is usually called Keccak-f(). | 1005 | * In the crypto literature this function is usually called Keccak-f(). |
935 | */ | 1006 | */ |
@@ -937,6 +1008,164 @@ static void sha3_process_block72(uint64_t *state) | |||
937 | { | 1008 | { |
938 | enum { NROUNDS = 24 }; | 1009 | enum { NROUNDS = 24 }; |
939 | 1010 | ||
1011 | #if OPTIMIZE_SHA3_FOR_32 | ||
1012 | /* | ||
1013 | static const uint32_t IOTA_CONST_0[NROUNDS] = { | ||
1014 | 0x00000001UL, | ||
1015 | 0x00000000UL, | ||
1016 | 0x00000000UL, | ||
1017 | 0x00000000UL, | ||
1018 | 0x00000001UL, | ||
1019 | 0x00000001UL, | ||
1020 | 0x00000001UL, | ||
1021 | 0x00000001UL, | ||
1022 | 0x00000000UL, | ||
1023 | 0x00000000UL, | ||
1024 | 0x00000001UL, | ||
1025 | 0x00000000UL, | ||
1026 | 0x00000001UL, | ||
1027 | 0x00000001UL, | ||
1028 | 0x00000001UL, | ||
1029 | 0x00000001UL, | ||
1030 | 0x00000000UL, | ||
1031 | 0x00000000UL, | ||
1032 | 0x00000000UL, | ||
1033 | 0x00000000UL, | ||
1034 | 0x00000001UL, | ||
1035 | 0x00000000UL, | ||
1036 | 0x00000001UL, | ||
1037 | 0x00000000UL, | ||
1038 | }; | ||
1039 | ** bits are in lsb: 0101 0000 1111 0100 1111 0001 | ||
1040 | */ | ||
1041 | uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1); | ||
1042 | static const uint32_t IOTA_CONST_1[NROUNDS] = { | ||
1043 | 0x00000000UL, | ||
1044 | 0x00000089UL, | ||
1045 | 0x8000008bUL, | ||
1046 | 0x80008080UL, | ||
1047 | 0x0000008bUL, | ||
1048 | 0x00008000UL, | ||
1049 | 0x80008088UL, | ||
1050 | 0x80000082UL, | ||
1051 | 0x0000000bUL, | ||
1052 | 0x0000000aUL, | ||
1053 | 0x00008082UL, | ||
1054 | 0x00008003UL, | ||
1055 | 0x0000808bUL, | ||
1056 | 0x8000000bUL, | ||
1057 | 0x8000008aUL, | ||
1058 | 0x80000081UL, | ||
1059 | 0x80000081UL, | ||
1060 | 0x80000008UL, | ||
1061 | 0x00000083UL, | ||
1062 | 0x80008003UL, | ||
1063 | 0x80008088UL, | ||
1064 | 0x80000088UL, | ||
1065 | 0x00008000UL, | ||
1066 | 0x80008082UL, | ||
1067 | }; | ||
1068 | |||
1069 | uint32_t *const s32 = (uint32_t*)state; | ||
1070 | unsigned round; | ||
1071 | |||
1072 | split_halves(state); | ||
1073 | |||
1074 | for (round = 0; round < NROUNDS; round++) { | ||
1075 | unsigned x; | ||
1076 | |||
1077 | /* Theta */ | ||
1078 | { | ||
1079 | uint32_t BC[20]; | ||
1080 | for (x = 0; x < 10; ++x) { | ||
1081 | BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40]; | ||
1082 | } | ||
1083 | for (x = 0; x < 10; x += 2) { | ||
1084 | uint32_t ta, tb; | ||
1085 | ta = BC[x+8] ^ rotl32(BC[x+3], 1); | ||
1086 | tb = BC[x+9] ^ BC[x+2]; | ||
1087 | s32[x+0] ^= ta; | ||
1088 | s32[x+1] ^= tb; | ||
1089 | s32[x+10] ^= ta; | ||
1090 | s32[x+11] ^= tb; | ||
1091 | s32[x+20] ^= ta; | ||
1092 | s32[x+21] ^= tb; | ||
1093 | s32[x+30] ^= ta; | ||
1094 | s32[x+31] ^= tb; | ||
1095 | s32[x+40] ^= ta; | ||
1096 | s32[x+41] ^= tb; | ||
1097 | } | ||
1098 | } | ||
1099 | /* RhoPi */ | ||
1100 | { | ||
1101 | uint32_t t0a,t0b, t1a,t1b; | ||
1102 | t1a = s32[1*2+0]; | ||
1103 | t1b = s32[1*2+1]; | ||
1104 | |||
1105 | #define RhoPi(PI_LANE, ROT_CONST) \ | ||
1106 | t0a = s32[PI_LANE*2+0];\ | ||
1107 | t0b = s32[PI_LANE*2+1];\ | ||
1108 | if (ROT_CONST & 1) {\ | ||
1109 | s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\ | ||
1110 | s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\ | ||
1111 | } else {\ | ||
1112 | s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\ | ||
1113 | s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\ | ||
1114 | }\ | ||
1115 | t1a = t0a; t1b = t0b; | ||
1116 | |||
1117 | RhoPi(10, 1) | ||
1118 | RhoPi( 7, 3) | ||
1119 | RhoPi(11, 6) | ||
1120 | RhoPi(17,10) | ||
1121 | RhoPi(18,15) | ||
1122 | RhoPi( 3,21) | ||
1123 | RhoPi( 5,28) | ||
1124 | RhoPi(16,36) | ||
1125 | RhoPi( 8,45) | ||
1126 | RhoPi(21,55) | ||
1127 | RhoPi(24, 2) | ||
1128 | RhoPi( 4,14) | ||
1129 | RhoPi(15,27) | ||
1130 | RhoPi(23,41) | ||
1131 | RhoPi(19,56) | ||
1132 | RhoPi(13, 8) | ||
1133 | RhoPi(12,25) | ||
1134 | RhoPi( 2,43) | ||
1135 | RhoPi(20,62) | ||
1136 | RhoPi(14,18) | ||
1137 | RhoPi(22,39) | ||
1138 | RhoPi( 9,61) | ||
1139 | RhoPi( 6,20) | ||
1140 | RhoPi( 1,44) | ||
1141 | #undef RhoPi | ||
1142 | } | ||
1143 | /* Chi */ | ||
1144 | for (x = 0; x <= 20; x += 5) { | ||
1145 | /* | ||
1146 | * Can write this in terms of uint32 too, | ||
1147 | * but why? compiler does it automatically. | ||
1148 | */ | ||
1149 | uint64_t BC0, BC1, BC2, BC3, BC4; | ||
1150 | BC0 = state[x + 0]; | ||
1151 | BC1 = state[x + 1]; | ||
1152 | BC2 = state[x + 2]; | ||
1153 | state[x + 0] = BC0 ^ ((~BC1) & BC2); | ||
1154 | BC3 = state[x + 3]; | ||
1155 | state[x + 1] = BC1 ^ ((~BC2) & BC3); | ||
1156 | BC4 = state[x + 4]; | ||
1157 | state[x + 2] = BC2 ^ ((~BC3) & BC4); | ||
1158 | state[x + 3] = BC3 ^ ((~BC4) & BC0); | ||
1159 | state[x + 4] = BC4 ^ ((~BC0) & BC1); | ||
1160 | } | ||
1161 | /* Iota */ | ||
1162 | s32[0] ^= IOTA_CONST_0bits & 1; | ||
1163 | IOTA_CONST_0bits >>= 1; | ||
1164 | s32[1] ^= IOTA_CONST_1[round]; | ||
1165 | } | ||
1166 | |||
1167 | combine_halves(state); | ||
1168 | #else | ||
940 | /* Elements should be 64-bit, but top half is always zero or 0x80000000. | 1169 | /* Elements should be 64-bit, but top half is always zero or 0x80000000. |
941 | * We encode 63rd bits in a separate word below. | 1170 | * We encode 63rd bits in a separate word below. |
942 | * Same is true for 31th bits, which lets us use 16-bit table instead of 64-bit. | 1171 | * Same is true for 31th bits, which lets us use 16-bit table instead of 64-bit. |
@@ -983,7 +1212,7 @@ static void sha3_process_block72(uint64_t *state) | |||
983 | }; | 1212 | }; |
984 | /*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/ | 1213 | /*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/ |
985 | 1214 | ||
986 | unsigned x, y; | 1215 | unsigned x; |
987 | unsigned round; | 1216 | unsigned round; |
988 | 1217 | ||
989 | if (BB_BIG_ENDIAN) { | 1218 | if (BB_BIG_ENDIAN) { |
@@ -1045,22 +1274,20 @@ static void sha3_process_block72(uint64_t *state) | |||
1045 | RhoPi_twice(20); RhoPi_twice(22); | 1274 | RhoPi_twice(20); RhoPi_twice(22); |
1046 | #undef RhoPi_twice | 1275 | #undef RhoPi_twice |
1047 | } | 1276 | } |
1048 | |||
1049 | /* Chi */ | 1277 | /* Chi */ |
1050 | for (y = 0; y <= 20; y += 5) { | 1278 | for (x = 0; x <= 20; x += 5) { |
1051 | uint64_t BC0, BC1, BC2, BC3, BC4; | 1279 | uint64_t BC0, BC1, BC2, BC3, BC4; |
1052 | BC0 = state[y + 0]; | 1280 | BC0 = state[x + 0]; |
1053 | BC1 = state[y + 1]; | 1281 | BC1 = state[x + 1]; |
1054 | BC2 = state[y + 2]; | 1282 | BC2 = state[x + 2]; |
1055 | state[y + 0] = BC0 ^ ((~BC1) & BC2); | 1283 | state[x + 0] = BC0 ^ ((~BC1) & BC2); |
1056 | BC3 = state[y + 3]; | 1284 | BC3 = state[x + 3]; |
1057 | state[y + 1] = BC1 ^ ((~BC2) & BC3); | 1285 | state[x + 1] = BC1 ^ ((~BC2) & BC3); |
1058 | BC4 = state[y + 4]; | 1286 | BC4 = state[x + 4]; |
1059 | state[y + 2] = BC2 ^ ((~BC3) & BC4); | 1287 | state[x + 2] = BC2 ^ ((~BC3) & BC4); |
1060 | state[y + 3] = BC3 ^ ((~BC4) & BC0); | 1288 | state[x + 3] = BC3 ^ ((~BC4) & BC0); |
1061 | state[y + 4] = BC4 ^ ((~BC0) & BC1); | 1289 | state[x + 4] = BC4 ^ ((~BC0) & BC1); |
1062 | } | 1290 | } |
1063 | |||
1064 | /* Iota */ | 1291 | /* Iota */ |
1065 | state[0] ^= IOTA_CONST[round] | 1292 | state[0] ^= IOTA_CONST[round] |
1066 | | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000) | 1293 | | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000) |
@@ -1072,6 +1299,7 @@ static void sha3_process_block72(uint64_t *state) | |||
1072 | state[x] = SWAP_LE64(state[x]); | 1299 | state[x] = SWAP_LE64(state[x]); |
1073 | } | 1300 | } |
1074 | } | 1301 | } |
1302 | #endif | ||
1075 | } | 1303 | } |
1076 | 1304 | ||
1077 | void FAST_FUNC sha3_begin(sha3_ctx_t *ctx) | 1305 | void FAST_FUNC sha3_begin(sha3_ctx_t *ctx) |