Diffstat (limited to 'libbb/hash_md5_sha.c')
-rw-r--r--  libbb/hash_md5_sha.c | 317
1 file changed, 298 insertions(+), 19 deletions(-)
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 3f743ac75..1f63ccdee 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -926,10 +926,81 @@ void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
 # define SHA3_SMALL CONFIG_SHA3_SMALL
 #endif
 
+#define OPTIMIZE_SHA3_FOR_32 0
+/*
+ * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
+ * every 64-bit word of state[] can be split into two 32-bit words
+ * by even/odd bits. In this form, all rotations of the sha3 round
+ * are 32-bit - and there are lots of them.
+ * However, it requires either splitting/combining state words
+ * before/after the sha3 round (the code below does this)
+ * or shuffling bits before xor'ing them into state and in sha3_end.
+ * Without shuffling, bit-slicing saves 130 bytes of code
+ * with a marginal speedup (but of course it gives the wrong result).
+ * With shuffling it works, but costs +260 code bytes, and is slower.
+ * Disabled for now:
+ */
+#if 0 /* LONG_MAX == 0x7fffffff */
+# undef OPTIMIZE_SHA3_FOR_32
+# define OPTIMIZE_SHA3_FOR_32 1
+#endif
+
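As a sanity check on the even/odd split described in the comment above, it can be written naively, one bit at a time. This is an illustrative sketch, not part of the patch: bit 2j of w lands in bit j of the "even" word, bit 2j+1 in bit j of the "odd" word. The payoff is that a 64-bit rotation by an even amount becomes two independent 32-bit rotations, and an odd amount additionally swaps the two words - which is why every rotation in the bit-sliced round below is 32-bit.

/* Reference sketch (hypothetical, not in the patch) */
static void split_bits_ref(uint64_t w, uint32_t *even, uint32_t *odd)
{
	unsigned j;
	*even = *odd = 0;
	for (j = 0; j < 32; j++) {
		*even |= (uint32_t)((w >> (2*j)) & 1) << j;
		*odd  |= (uint32_t)((w >> (2*j + 1)) & 1) << j;
	}
}
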
 enum {
 	SHA3_IBLK_BYTES = 72, /* 576 bits / 8 */
 };
 
+#if OPTIMIZE_SHA3_FOR_32
+/* This splits every 64-bit word into a pair of 32-bit words:
+ * even bits go into the first word, odd bits into the second.
+ * The conversion is done in-place.
+ */
+static void split_halves(uint64_t *state)
+{
+	/* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+	uint32_t *s32 = (uint32_t*)state;
+	uint32_t t, x0, x1;
+	int i;
+	for (i = 24; i >= 0; --i) {
+		x0 = s32[0];
+		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
+		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
+		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
+		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
+		x1 = s32[1];
+		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
+		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
+		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
+		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
+		*s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
+		*s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
+	}
+}
+/* The reverse operation */
+static void combine_halves(uint64_t *state)
+{
+	uint32_t *s32 = (uint32_t*)state;
+	uint32_t t, x0, x1;
+	int i;
+	for (i = 24; i >= 0; --i) {
+		x0 = s32[0];
+		x1 = s32[1];
+		t = (x0 & 0x0000FFFF) | (x1 << 16);
+		x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
+		x0 = t;
+		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
+		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
+		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
+		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
+		*s32++ = x0;
+		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
+		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
+		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
+		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
+		*s32++ = x1;
+	}
+}
+#endif
+
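Since the conversion is in-place and the two functions are exact inverses, they are easy to self-test. A hypothetical sketch (not in the patch; assumes the file is built with OPTIMIZE_SHA3_FOR_32 enabled so split_halves()/combine_halves() exist):

#include <assert.h>
static void split_combine_selftest(void)
{
	uint64_t st[25], ref[25];
	unsigned i;
	for (i = 0; i < 25; i++)
		ref[i] = st[i] = 0x0123456789abcdefULL + i * 0x1111111111111111ULL;
	split_halves(st);   /* even bits, then odd bits, per 64-bit lane */
	combine_halves(st); /* inverse: must restore the original state */
	for (i = 0; i < 25; i++)
		assert(st[i] == ref[i]);
}
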
 /*
  * In the crypto literature this function is usually called Keccak-f().
  */
@@ -937,12 +1008,179 @@ static void sha3_process_block72(uint64_t *state)
 {
 	enum { NROUNDS = 24 };
 
-	/* Elements should be 64-bit, but top half is always zero or 0x80000000.
-	 * We encode 63rd bits in a separate word below.
-	 * Same is true for 31th bits, which lets us use 16-bit table instead of 64-bit.
-	 * The speed penalty is lost in the noise.
-	 */
+#if OPTIMIZE_SHA3_FOR_32
+	/*
+	static const uint32_t IOTA_CONST_0[NROUNDS] = {
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+		0x00000001UL,
+		0x00000000UL,
+	};
+	** bits are in lsb: 0101 0000 1111 0100 1111 0001
+	*/
+	uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
+	static const uint32_t IOTA_CONST_1[NROUNDS] = {
+		0x00000000UL,
+		0x00000089UL,
+		0x8000008bUL,
+		0x80008080UL,
+		0x0000008bUL,
+		0x00008000UL,
+		0x80008088UL,
+		0x80000082UL,
+		0x0000000bUL,
+		0x0000000aUL,
+		0x00008082UL,
+		0x00008003UL,
+		0x0000808bUL,
+		0x8000000bUL,
+		0x8000008aUL,
+		0x80000081UL,
+		0x80000081UL,
+		0x80000008UL,
+		0x00000083UL,
+		0x80008003UL,
+		0x80008088UL,
+		0x80000088UL,
+		0x00008000UL,
+		0x80008082UL,
+	};
+
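The 24 entries of the commented-out IOTA_CONST_0[] table are all 0 or 1 (they are the even-bit halves of the round constants), so the patch packs them LSB-first into the single literal 0x0050f4f1 - set bits at rounds 0, 4-7, 10, 12-15, 20 and 22 - and shifts one bit out per round in the Iota step further down. A hedged sketch of how that literal can be derived from the table:

/* Sketch (not in the patch): pack the 24 one-bit entries LSB-first */
static uint32_t pack_iota0(const uint32_t tbl[24])
{
	uint32_t bits = 0;
	int r;
	for (r = 23; r >= 0; r--)
		bits = (bits << 1) | (tbl[r] & 1);
	return bits; /* == 0x0050f4f1 for the table above */
}
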
+	uint32_t *const s32 = (uint32_t*)state;
+	unsigned round;
+
+	split_halves(state);
+
+	for (round = 0; round < NROUNDS; round++) {
+		unsigned x;
+
+		/* Theta */
+		{
+			uint32_t BC[20];
+			for (x = 0; x < 10; ++x) {
+				BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
+			}
+			for (x = 0; x < 10; x += 2) {
+				uint32_t ta, tb;
+				ta = BC[x+8] ^ rotl32(BC[x+3], 1);
+				tb = BC[x+9] ^ BC[x+2];
+				s32[x+0] ^= ta;
+				s32[x+1] ^= tb;
+				s32[x+10] ^= ta;
+				s32[x+11] ^= tb;
+				s32[x+20] ^= ta;
+				s32[x+21] ^= tb;
+				s32[x+30] ^= ta;
+				s32[x+31] ^= tb;
+				s32[x+40] ^= ta;
+				s32[x+41] ^= tb;
+			}
+		}
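In the usual 64-bit formulation, Theta computes D[x] = C[x-1] ^ rotl64(C[x+1], 1). After the even/odd split, a 1-bit lane rotation moves the odd half into the even positions (rotated by one) and the even half into the odd positions (unrotated), which is exactly the ta/tb pair above: BC[x+8]/BC[x+9] are lane x+4 (i.e. x-1 mod 5) and BC[x+2]/BC[x+3] are lane x+1, courtesy of the doubled BC[] array that avoids mod-5 indexing. A hedged sketch of the identity, with C split into Ce/Co halves (rotl32 is assumed to be busybox's rotate helper):

/* Sketch (not in the patch): one Theta D-lane in interleaved form */
static void theta_d(const uint32_t Ce[5], const uint32_t Co[5],
		unsigned x, uint32_t *De, uint32_t *Do)
{
	*De = Ce[(x + 4) % 5] ^ rotl32(Co[(x + 1) % 5], 1);
	*Do = Co[(x + 4) % 5] ^ Ce[(x + 1) % 5];
}
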
+		/* RhoPi */
+		{
+			uint32_t t0a,t0b, t1a,t1b;
+			t1a = s32[1*2+0];
+			t1b = s32[1*2+1];
+
+#define RhoPi(PI_LANE, ROT_CONST) \
+	t0a = s32[PI_LANE*2+0];\
+	t0b = s32[PI_LANE*2+1];\
+	if (ROT_CONST & 1) {\
+		s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
+		s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
+	} else {\
+		s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
+		s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
+	}\
+	t1a = t0a; t1b = t0b;
+
+			RhoPi(10, 1)
+			RhoPi( 7, 3)
+			RhoPi(11, 6)
+			RhoPi(17,10)
+			RhoPi(18,15)
+			RhoPi( 3,21)
+			RhoPi( 5,28)
+			RhoPi(16,36)
+			RhoPi( 8,45)
+			RhoPi(21,55)
+			RhoPi(24, 2)
+			RhoPi( 4,14)
+			RhoPi(15,27)
+			RhoPi(23,41)
+			RhoPi(19,56)
+			RhoPi(13, 8)
+			RhoPi(12,25)
+			RhoPi( 2,43)
+			RhoPi(20,62)
+			RhoPi(14,18)
+			RhoPi(22,39)
+			RhoPi( 9,61)
+			RhoPi( 6,20)
+			RhoPi( 1,44)
+#undef RhoPi
+		}
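The ROT_CONST & 1 branch in the macro is the general rule for rotating an interleaved lane: an even rotation by 2k rotates both halves by k in place, while an odd rotation by 2k+1 swaps the halves' roles, rotating the new even half by k+1 and the new odd half by k. The ROT_CONST == 1 special case avoids a rotate-by-zero, which a shift-based rotl32 macro would make undefined. As a standalone sketch (not in the patch; assumes rotl32 handles the nonzero amounts used here):

/* Sketch: rotl64 on a lane stored as (even, odd) 32-bit halves */
static void rotl64_interleaved(uint32_t *e, uint32_t *o, unsigned n)
{
	uint32_t x0 = *e, x1 = *o;
	if (n & 1) {
		*e = rotl32(x1, n/2 + 1);
		*o = (n == 1) ? x0 : rotl32(x0, n/2);
	} else {
		*e = rotl32(x0, n/2);
		*o = rotl32(x1, n/2);
	}
}
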
+		/* Chi */
+		for (x = 0; x <= 40;) {
+			uint32_t BC0, BC1, BC2, BC3, BC4;
+			BC0 = s32[x + 0*2];
+			BC1 = s32[x + 1*2];
+			BC2 = s32[x + 2*2];
+			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+			BC3 = s32[x + 3*2];
+			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+			BC4 = s32[x + 4*2];
+			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+			x++;
+			BC0 = s32[x + 0*2];
+			BC1 = s32[x + 1*2];
+			BC2 = s32[x + 2*2];
+			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+			BC3 = s32[x + 3*2];
+			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+			BC4 = s32[x + 4*2];
+			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+			x += 9;
+		}
+		/* Iota */
+		s32[0] ^= IOTA_CONST_0bits & 1;
+		IOTA_CONST_0bits >>= 1;
+		s32[1] ^= IOTA_CONST_1[round];
+	}
+
+	combine_halves(state);
+#else
+	/* Native 64-bit algorithm */
 	static const uint16_t IOTA_CONST[NROUNDS] = {
+	/* Elements should be 64-bit, but top half is always zero
+	 * or 0x80000000. We encode 63rd bits in a separate word below.
+	 * Same is true for 31st bits, which lets us use a 16-bit table
+	 * instead of a 64-bit one. The speed penalty is lost in the noise.
+	 */
 		0x0001,
 		0x8082,
 		0x808a,
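Each 64-bit Keccak round constant has nonzero bits only in its low 16 bits plus, possibly, bits 31 and 63, which is what lets a uint16_t table suffice. The two high bits are kept as one-bit-per-round masks, stored with round r's bit at position 31-r so that a left shift by round exposes it at bit 31 (IOTA_CONST_bit31 appears in the Iota step of this diff; a matching IOTA_CONST_bit63 is implied by the comment above and assumed here). A hedged reconstruction sketch:

/* Sketch (not in the patch): rebuilding the full 64-bit constant.
 * IOTA_CONST_bit63 is assumed to mirror IOTA_CONST_bit31.
 */
static uint64_t iota_const64(unsigned round)
{
	return IOTA_CONST[round]
		| (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
		| (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
}
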
@@ -983,7 +1221,7 @@ static void sha3_process_block72(uint64_t *state)
 	};
 	/*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
 
-	unsigned x, y;
+	unsigned x;
 	unsigned round;
 
 	if (BB_BIG_ENDIAN) {
@@ -1045,22 +1283,62 @@ static void sha3_process_block72(uint64_t *state)
 			RhoPi_twice(20); RhoPi_twice(22);
 #undef RhoPi_twice
 		}
-
 		/* Chi */
-		for (y = 0; y <= 20; y += 5) {
+# if LONG_MAX > 0x7fffffff
+		for (x = 0; x <= 20; x += 5) {
 			uint64_t BC0, BC1, BC2, BC3, BC4;
-			BC0 = state[y + 0];
-			BC1 = state[y + 1];
-			BC2 = state[y + 2];
-			state[y + 0] = BC0 ^ ((~BC1) & BC2);
-			BC3 = state[y + 3];
-			state[y + 1] = BC1 ^ ((~BC2) & BC3);
-			BC4 = state[y + 4];
-			state[y + 2] = BC2 ^ ((~BC3) & BC4);
-			state[y + 3] = BC3 ^ ((~BC4) & BC0);
-			state[y + 4] = BC4 ^ ((~BC0) & BC1);
+			BC0 = state[x + 0];
+			BC1 = state[x + 1];
+			BC2 = state[x + 2];
+			state[x + 0] = BC0 ^ ((~BC1) & BC2);
+			BC3 = state[x + 3];
+			state[x + 1] = BC1 ^ ((~BC2) & BC3);
+			BC4 = state[x + 4];
+			state[x + 2] = BC2 ^ ((~BC3) & BC4);
+			state[x + 3] = BC3 ^ ((~BC4) & BC0);
+			state[x + 4] = BC4 ^ ((~BC0) & BC1);
 		}
-
+# else
+		/* Reduced register pressure version
+		 * for register-starved 32-bit arches
+		 * (i386: -95 bytes, and it is _faster_)
+		 */
+		for (x = 0; x <= 40;) {
+			uint32_t BC0, BC1, BC2, BC3, BC4;
+			uint32_t *const s32 = (uint32_t*)state;
+# if SHA3_SMALL
+ do_half:
+# endif
+			BC0 = s32[x + 0*2];
+			BC1 = s32[x + 1*2];
+			BC2 = s32[x + 2*2];
+			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+			BC3 = s32[x + 3*2];
+			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+			BC4 = s32[x + 4*2];
+			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+			x++;
+# if SHA3_SMALL
+			if (x & 1)
+				goto do_half;
+			x += 8;
+# else
+			BC0 = s32[x + 0*2];
+			BC1 = s32[x + 1*2];
+			BC2 = s32[x + 2*2];
+			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
+			BC3 = s32[x + 3*2];
+			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
+			BC4 = s32[x + 4*2];
+			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
+			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
+			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
+			x += 9;
+# endif
+		}
+# endif /* long is 32-bit */
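Chi is the round's only nonlinear step, but it is purely bitwise - chi(a,b,c) = a ^ (~b & c), with no rotations - so it acts on every bit position independently, and a 64-bit lane can be processed as two unrelated 32-bit words. That is what both the s32[] loop above and the bit-sliced variant earlier exploit; the SHA3_SMALL path re-runs the same body via goto to roughly halve the code. A sketch of the invariant (not in the patch):

/* Sketch: chi distributes over any split of a word into fields */
static uint32_t chi32(uint32_t a, uint32_t b, uint32_t c)
{
	return a ^ (~b & c);
}
/* For any 64-bit lanes a,b,c:
 *   (uint32_t)chi64(a,b,c) == chi32((uint32_t)a, (uint32_t)b, (uint32_t)c)
 *   (chi64(a,b,c) >> 32)   == chi32(a >> 32, b >> 32, c >> 32)
 */
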
 		/* Iota */
 		state[0] ^= IOTA_CONST[round]
 			| (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
@@ -1072,6 +1350,7 @@ static void sha3_process_block72(uint64_t *state)
 			state[x] = SWAP_LE64(state[x]);
 		}
 	}
+#endif
 }
 
 void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)