| author | Denys Vlasenko <vda.linux@googlemail.com> | 2014-07-25 17:24:13 +0200 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2014-07-25 17:24:13 +0200 |
| commit | 2a563ea49a16589f47ed6afe7b22eebef4e3a6d1 | |
| tree | 212bbad503c49c6ae0aabaeaef0ab023ca584f39 | |
| parent | a4d564ad7c24df2da1d8e03a7dd016f0a3d5edbd | |
sha3: add 32-bit optimized bit-sliced implementation
It is an interesting trick, but so far I have only managed to make it
work correctly, not to make it faster and/or smaller.
The code is #ifdef'ed out for now.
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | libbb/hash_md5_sha.c | 256 |
1 file changed, 242 insertions(+), 14 deletions(-)
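Before the diff itself, a note on the trick. In the bit-sliced form each 64-bit Keccak lane is stored as two 32-bit words, one holding the even-numbered bits and one holding the odd-numbered bits; a 64-bit rotation then becomes two 32-bit rotations, with the two halves swapping roles when the rotation amount is odd. The sketch below is not from the busybox tree (`rotl32`, `rotl64`, `slice` and `unslice` are illustrative helpers, and the plain bit loops stand in for the Hacker's Delight shuffles the patch uses); it merely checks that identity for all 64 rotation amounts, which is exactly what the `RhoPi` macro in the patch relies on.

```c
/* Standalone sketch, not part of the patch: why bit-slicing turns one
 * 64-bit rotation into two 32-bit rotations. All names are illustrative. */
#include <stdint.h>
#include <stdio.h>

static uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << (n & 31)) | (x >> ((32 - n) & 31));
}

static uint64_t rotl64(uint64_t x, unsigned n)
{
	return (x << (n & 63)) | (x >> ((64 - n) & 63));
}

/* Split a 64-bit lane: bit 2i goes to bit i of *even, bit 2i+1 to bit i of *odd */
static void slice(uint64_t lane, uint32_t *even, uint32_t *odd)
{
	int i;
	*even = *odd = 0;
	for (i = 0; i < 32; i++) {
		*even |= (uint32_t)((lane >> (2 * i)) & 1) << i;
		*odd  |= (uint32_t)((lane >> (2 * i + 1)) & 1) << i;
	}
}

/* The inverse of slice() */
static uint64_t unslice(uint32_t even, uint32_t odd)
{
	uint64_t lane = 0;
	int i;
	for (i = 0; i < 32; i++) {
		lane |= (uint64_t)((even >> i) & 1) << (2 * i);
		lane |= (uint64_t)((odd  >> i) & 1) << (2 * i + 1);
	}
	return lane;
}

int main(void)
{
	uint64_t lane = 0x0123456789abcdefULL;
	unsigned n;

	for (n = 0; n < 64; n++) {
		uint32_t e, o, e2, o2;
		slice(lane, &e, &o);
		if (n & 1) {
			/* Odd amount: halves swap roles and the rotate counts
			 * differ by one - cf. the RhoPi macro in the patch. */
			e2 = rotl32(o, (n + 1) / 2);
			o2 = rotl32(e, n / 2);
		} else {
			e2 = rotl32(e, n / 2);
			o2 = rotl32(o, n / 2);
		}
		if (unslice(e2, o2) != rotl64(lane, n)) {
			printf("mismatch for n=%u\n", n);
			return 1;
		}
	}
	printf("64-bit rotations by 0..63 all match their 32-bit forms\n");
	return 0;
}
```

Compiled with any C99 compiler, it should report that all 64 rotation amounts match.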
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 3f743ac75..dff583ad1 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -926,10 +926,81 @@ void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
 # define SHA3_SMALL CONFIG_SHA3_SMALL
 #endif
 
+#define OPTIMIZE_SHA3_FOR_32 0
+/*
+ * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
+ * every 64-bit word of state[] can be split into two 32-bit words
+ * by even/odd bits. In this form, all rotations of sha3 round
+ * are 32-bit - and there are lots of them.
+ * However, it requires either splitting/combining state words
+ * before/after sha3 round (code does this now)
+ * or shuffling bits before xor'ing them into state and in sha3_end.
+ * Without shuffling, bit-slicing results in -130 bytes of code
+ * and marginal speedup (but of course it gives wrong result).
+ * With shuffling it works, but +260 code bytes, and slower.
+ * Disabled for now:
+ */
+#if 0 /* LONG_MAX == 0x7fffffff */
+# undef OPTIMIZE_SHA3_FOR_32
+# define OPTIMIZE_SHA3_FOR_32 1
+#endif
+
 enum {
 	SHA3_IBLK_BYTES = 72, /* 576 bits / 8 */
 };
 
+#if OPTIMIZE_SHA3_FOR_32
+/* This splits every 64-bit word into a pair of 32-bit words,
+ * even bits go into first word, odd bits go to second one.
+ * The conversion is done in-place.
+ */
+static void split_halves(uint64_t *state)
+{
+	/* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+	uint32_t *s32 = (uint32_t*)state;
+	uint32_t t, x0, x1;
+	int i;
+	for (i = 24; i >= 0; --i) {
+		x0 = s32[0];
+		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
+		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
+		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
+		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
+		x1 = s32[1];
+		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
+		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
+		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
+		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
+		*s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
+		*s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
+	}
+}
+/* The reverse operation */
+static void combine_halves(uint64_t *state)
+{
+	uint32_t *s32 = (uint32_t*)state;
+	uint32_t t, x0, x1;
+	int i;
+	for (i = 24; i >= 0; --i) {
+		x0 = s32[0];
+		x1 = s32[1];
+		t = (x0 & 0x0000FFFF) | (x1 << 16);
+		x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
+		x0 = t;
+		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
+		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
+		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
+		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
+		*s32++ = x0;
+		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
+		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
+		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
+		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
+		*s32++ = x1;
+	}
+}
+#endif
+
 /*
  * In the crypto literature this function is usually called Keccak-f().
  */
@@ -937,6 +1008,164 @@ static void sha3_process_block72(uint64_t *state)
 {
 	enum { NROUNDS = 24 };
 
+#if OPTIMIZE_SHA3_FOR_32
+	/*
+	static const uint32_t IOTA_CONST_0[NROUNDS] = {
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+	};
+	** bits are in lsb: 0101 0000 1111 0100 1111 0001
+	*/
+	uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
+	static const uint32_t IOTA_CONST_1[NROUNDS] = {
+		0x00000000UL,
+		0x00000089UL,
+		0x8000008bUL,
+		0x80008080UL,
+		0x0000008bUL,
+		0x00008000UL,
+		0x80008088UL,
+		0x80000082UL,
+		0x0000000bUL,
+		0x0000000aUL,
+		0x00008082UL,
+		0x00008003UL,
+		0x0000808bUL,
+		0x8000000bUL,
+		0x8000008aUL,
+		0x80000081UL,
+		0x80000081UL,
+		0x80000008UL,
+		0x00000083UL,
+		0x80008003UL,
+		0x80008088UL,
+		0x80000088UL,
+		0x00008000UL,
+		0x80008082UL,
+	};
+
+	uint32_t *const s32 = (uint32_t*)state;
+	unsigned round;
+
+	split_halves(state);
+
+	for (round = 0; round < NROUNDS; round++) {
+		unsigned x;
+
+		/* Theta */
+		{
+			uint32_t BC[20];
+			for (x = 0; x < 10; ++x) {
+				BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
+			}
+			for (x = 0; x < 10; x += 2) {
+				uint32_t ta, tb;
+				ta = BC[x+8] ^ rotl32(BC[x+3], 1);
+				tb = BC[x+9] ^ BC[x+2];
+				s32[x+0] ^= ta;
+				s32[x+1] ^= tb;
+				s32[x+10] ^= ta;
+				s32[x+11] ^= tb;
+				s32[x+20] ^= ta;
+				s32[x+21] ^= tb;
+				s32[x+30] ^= ta;
+				s32[x+31] ^= tb;
+				s32[x+40] ^= ta;
+				s32[x+41] ^= tb;
+			}
+		}
+		/* RhoPi */
+		{
+			uint32_t t0a,t0b, t1a,t1b;
+			t1a = s32[1*2+0];
+			t1b = s32[1*2+1];
+
+#define RhoPi(PI_LANE, ROT_CONST) \
+			t0a = s32[PI_LANE*2+0];\
+			t0b = s32[PI_LANE*2+1];\
+			if (ROT_CONST & 1) {\
+				s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
+				s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
+			} else {\
+				s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
+				s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
+			}\
+			t1a = t0a; t1b = t0b;
+
+			RhoPi(10, 1)
+			RhoPi( 7, 3)
+			RhoPi(11, 6)
+			RhoPi(17,10)
+			RhoPi(18,15)
+			RhoPi( 3,21)
+			RhoPi( 5,28)
+			RhoPi(16,36)
+			RhoPi( 8,45)
+			RhoPi(21,55)
+			RhoPi(24, 2)
+			RhoPi( 4,14)
+			RhoPi(15,27)
+			RhoPi(23,41)
+			RhoPi(19,56)
+			RhoPi(13, 8)
+			RhoPi(12,25)
+			RhoPi( 2,43)
+			RhoPi(20,62)
+			RhoPi(14,18)
+			RhoPi(22,39)
+			RhoPi( 9,61)
+			RhoPi( 6,20)
+			RhoPi( 1,44)
+#undef RhoPi
+		}
+		/* Chi */
+		for (x = 0; x <= 20; x += 5) {
+			/*
+			 * Can write this in terms of uint32 too,
+			 * but why? compiler does it automatically.
+			 */
+			uint64_t BC0, BC1, BC2, BC3, BC4;
+			BC0 = state[x + 0];
+			BC1 = state[x + 1];
+			BC2 = state[x + 2];
+			state[x + 0] = BC0 ^ ((~BC1) & BC2);
+			BC3 = state[x + 3];
+			state[x + 1] = BC1 ^ ((~BC2) & BC3);
+			BC4 = state[x + 4];
+			state[x + 2] = BC2 ^ ((~BC3) & BC4);
+			state[x + 3] = BC3 ^ ((~BC4) & BC0);
+			state[x + 4] = BC4 ^ ((~BC0) & BC1);
+		}
+		/* Iota */
+		s32[0] ^= IOTA_CONST_0bits & 1;
+		IOTA_CONST_0bits >>= 1;
+		s32[1] ^= IOTA_CONST_1[round];
+	}
+
+	combine_halves(state);
+#else
 	/* Elements should be 64-bit, but top half is always zero or 0x80000000.
 	 * We encode 63rd bits in a separate word below.
 	 * Same is true for 31th bits, which lets us use 16-bit table instead of 64-bit.
@@ -983,7 +1212,7 @@ static void sha3_process_block72(uint64_t *state)
 	};
 	/*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
 
-	unsigned x, y;
+	unsigned x;
 	unsigned round;
 
 	if (BB_BIG_ENDIAN) {
@@ -1045,22 +1274,20 @@ static void sha3_process_block72(uint64_t *state)
 		RhoPi_twice(20); RhoPi_twice(22);
 #undef RhoPi_twice
 	}
-
 	/* Chi */
-	for (y = 0; y <= 20; y += 5) {
+	for (x = 0; x <= 20; x += 5) {
 		uint64_t BC0, BC1, BC2, BC3, BC4;
-		BC0 = state[y + 0];
-		BC1 = state[y + 1];
-		BC2 = state[y + 2];
-		state[y + 0] = BC0 ^ ((~BC1) & BC2);
-		BC3 = state[y + 3];
-		state[y + 1] = BC1 ^ ((~BC2) & BC3);
-		BC4 = state[y + 4];
-		state[y + 2] = BC2 ^ ((~BC3) & BC4);
-		state[y + 3] = BC3 ^ ((~BC4) & BC0);
-		state[y + 4] = BC4 ^ ((~BC0) & BC1);
+		BC0 = state[x + 0];
+		BC1 = state[x + 1];
+		BC2 = state[x + 2];
+		state[x + 0] = BC0 ^ ((~BC1) & BC2);
+		BC3 = state[x + 3];
+		state[x + 1] = BC1 ^ ((~BC2) & BC3);
+		BC4 = state[x + 4];
+		state[x + 2] = BC2 ^ ((~BC3) & BC4);
+		state[x + 3] = BC3 ^ ((~BC4) & BC0);
+		state[x + 4] = BC4 ^ ((~BC0) & BC1);
 	}
-
 	/* Iota */
 	state[0] ^= IOTA_CONST[round]
 		| (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
@@ -1072,6 +1299,7 @@ static void sha3_process_block72(uint64_t *state)
 			state[x] = SWAP_LE64(state[x]);
 		}
 	}
+#endif
 }
 
 void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
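A second sketch, also not part of the patch, shows where IOTA_CONST_0bits and IOTA_CONST_1 come from: they are the even/odd bit-split of the Keccak-f[1600] round constants, which the reference path in this file keeps only in a compressed form. The 64-bit KECCAK_RC table below is the standard published constant list (an assumption, since it does not appear verbatim in the hunk); the expected values are copied from the patch. Every even-bit half comes out as 0 or 1, which is why the patch can pack all 24 of them into the single word 0x0050f4f1 and shift one bit out per round instead of carrying a second 24-entry table.

```c
/* Standalone sketch: re-derive the bit-sliced Iota constants used in the
 * patch from the standard Keccak-f[1600] round constants (assumed below). */
#include <stdint.h>
#include <stdio.h>

static const uint64_t KECCAK_RC[24] = {
	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
	0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
	0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
	0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
};

/* Expected odd-bit halves, copied from IOTA_CONST_1 in the patch above */
static const uint32_t IOTA_CONST_1[24] = {
	0x00000000, 0x00000089, 0x8000008b, 0x80008080, 0x0000008b, 0x00008000,
	0x80008088, 0x80000082, 0x0000000b, 0x0000000a, 0x00008082, 0x00008003,
	0x0000808b, 0x8000000b, 0x8000008a, 0x80000081, 0x80000081, 0x80000008,
	0x00000083, 0x80008003, 0x80008088, 0x80000088, 0x00008000, 0x80008082,
};

int main(void)
{
	uint32_t even_bits_packed = 0;
	int bad = 0;
	int r, i;

	for (r = 0; r < 24; r++) {
		uint32_t even = 0, odd = 0;
		for (i = 0; i < 32; i++) {
			even |= (uint32_t)((KECCAK_RC[r] >> (2 * i)) & 1) << i;
			odd  |= (uint32_t)((KECCAK_RC[r] >> (2 * i + 1)) & 1) << i;
		}
		/* Each even-bit half is just 0 or 1, so the patch packs those
		 * 24 bits into one word (round r -> bit r). */
		even_bits_packed |= (even & 1) << r;
		if (even > 1 || odd != IOTA_CONST_1[r]) {
			printf("round %d: even=%08x odd=%08x (expected odd=%08x)\n",
				r, (unsigned)even, (unsigned)odd, (unsigned)IOTA_CONST_1[r]);
			bad = 1;
		}
	}
	printf("packed even bits: 0x%08x (the patch uses 0x0050f4f1)\n",
		(unsigned)even_bits_packed);
	return bad || even_bits_packed != 0x0050f4f1;
}
```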
