aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAvi Halachmi (:avih) <avihpit@yahoo.com>2023-06-25 01:42:52 +0300
committerAvi Halachmi (:avih) <avihpit@yahoo.com>2023-06-28 18:08:00 +0300
commit1602a45b797908025dc71e6a07149a39fdb12a48 (patch)
treeb381b12bacfe1fe5acf154e16d6050a34083f540
parentebe80f3e5c9b612f4d1b6e444c9badc10f9f2745 (diff)
downloadbusybox-w32-1602a45b797908025dc71e6a07149a39fdb12a48.tar.gz
busybox-w32-1602a45b797908025dc71e6a07149a39fdb12a48.tar.bz2
busybox-w32-1602a45b797908025dc71e6a07149a39fdb12a48.zip
win32: the great UTF8 ReadConsoleInput hack
Since commit 597d31ee (EURO_INPUT), ReadConsoleInputA is the default. The main problem with that is that if the console codepage is UTF8, e.g. after "chcp 65001", then typing or pasting can result in a crash of the console itself (the Windows Terminal or cmd.exe window closes). Additionally, and regardless of this crash, ReadConsoleInputA is apparently also buggy with a UTF8 CP in other ways. For instance, on Windows 7 only ASCII values work - others become '?'. Or sometimes in Windows 10 (cmd.exe console but not Windows Terminal) only key-up events arrive for some non-ASCII codepoints (without a prior key-down), and more. So this commit implements readConsoleInput_utf8, which delivers UTF8 regardless of the CP, including surrogate pairs, and works on Windows 7/10. Other than fixing the crash and working much better with a UTF8 console CP, it also allows a build with the UTF8 manifest to correctly capture arbitrary Unicode input which is typed or pasted into the console, regardless of the console CP. However, it doesn't look OK unless the console CP is set to UTF8 (which we don't do automatically, but the user can chcp 65001), and editing is still lacking due to missing screen-length awareness. To reproduce the crash: start a new console window, 'chcp 65001', run this program (or busybox sh), and paste "ಀ" or "😀" (U+0C80, U+1F600): #include <windows.h> int main() { HANDLE h = GetStdHandle(STD_INPUT_HANDLE); INPUT_RECORD r; DWORD n; while (ReadConsoleInputA(h, &r, 1, &n)) /* NOP */; return 0; }
-rw-r--r--include/mingw.h3
-rw-r--r--win32/termios.c5
-rw-r--r--win32/winansi.c160
3 files changed, 166 insertions, 2 deletions
diff --git a/include/mingw.h b/include/mingw.h
index a826d7eaa..34c8bfa3f 100644
--- a/include/mingw.h
+++ b/include/mingw.h
@@ -153,6 +153,9 @@ IMPL(setlinebuf, void, ,FILE *fd UNUSED_PARAM)
153BOOL conToCharBuffA(LPSTR d, DWORD len); 153BOOL conToCharBuffA(LPSTR d, DWORD len);
154BOOL conToCharA(LPSTR d); 154BOOL conToCharA(LPSTR d);
155 155
156// same as ReadConsoleInputA, but delivers UTF8 regardless of console CP
157BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got);
158
156void set_title(const char *str); 159void set_title(const char *str);
157void move_cursor_row(int n); 160void move_cursor_row(int n);
158void reset_screen(void); 161void reset_screen(void);
diff --git a/win32/termios.c b/win32/termios.c
index 11c24c8b3..d70c1e685 100644
--- a/win32/termios.c
+++ b/win32/termios.c
@@ -57,7 +57,10 @@ int64_t FAST_FUNC windows_read_key(int fd, char *buf UNUSED_PARAM, int timeout)
57#if ENABLE_FEATURE_EURO_INPUT 57#if ENABLE_FEATURE_EURO_INPUT
58 if (!ReadConsoleInputW(cin, &record, 1, &nevent_out)) 58 if (!ReadConsoleInputW(cin, &record, 1, &nevent_out))
59#else 59#else
60 if (!ReadConsoleInput(cin, &record, 1, &nevent_out)) 60 // if ACP is UTF8 then we read UTF8 regardless of console (in) CP
61 if (GetConsoleCP() == CP_UTF8 || GetACP() == CP_UTF8
62 ? !readConsoleInput_utf8(cin, &record, 1, &nevent_out)
63 : !ReadConsoleInput(cin, &record, 1, &nevent_out))
61#endif 64#endif
62 goto done; 65 goto done;
63 66
diff --git a/win32/winansi.c b/win32/winansi.c
index 552ccc6ca..f823d14be 100644
--- a/win32/winansi.c
+++ b/win32/winansi.c
@@ -758,7 +758,8 @@ BOOL conToCharBuffA(LPSTR s, DWORD len)
758 CPINFO acp_info, con_info; 758 CPINFO acp_info, con_info;
759 WCHAR *buf; 759 WCHAR *buf;
760 760
761 if (acp == conicp) 761 // if acp is UTF8 then we got UTF8 via readConsoleInput_utf8
762 if (acp == conicp || acp == CP_UTF8)
762 return TRUE; 763 return TRUE;
763 764
764 if (!s || !GetCPInfo(acp, &acp_info) || !GetCPInfo(conicp, &con_info) || 765 if (!s || !GetCPInfo(acp, &acp_info) || !GetCPInfo(conicp, &con_info) ||
@@ -1189,3 +1190,160 @@ int mingw_isatty(int fd)
1189 1190
1190 return result; 1191 return result;
1191} 1192}
1193
// Encode the value cp as UTF8 into buf, and return the number of bytes
// written (1 to 4). buf must have room for 4 bytes.
// Surrogate halves (U+D800..U+DFFF) are intentionally not rejected: they
// are encoded as-is using the 3-byte form.
// A value above U+10FFFF is stored as a single 0 byte (length 1). That
// works in our context because the byte is delivered as a key event,
// where 0 values are typically ignored by the caller.
static int toutf8(DWORD cp, unsigned char *buf)
{
	// lead-byte marker, indexed by the encoded length
	static const unsigned char lead[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
	int len, i;

	if (cp > 0x10ffff) {
		buf[0] = 0;
		return 1;
	}

	len = cp <= 0x7f ? 1
	    : cp <= 0x7ff ? 2
	    : cp <= 0xffff ? 3
	    : 4;

	// fill the continuation bytes from the tail, 6 payload bits each
	for (i = len - 1; i > 0; --i) {
		buf[i] = (unsigned char)(0x80 | (cp & 0x3f));
		cp >>= 6;
	}

	// whatever bits remain belong to the lead byte
	buf[0] = (unsigned char)(lead[len] | cp);
	return len;
}
1223
// Peek into the console input queue and try to find the key-up event of
// a surrogate 2nd (low) half. If found, eat the console events up to and
// including that one, and combine the pair into a single codepoint.
//
// h:   console input handle.
// ph1: on entry, the 1st (high) surrogate half (0xD800..0xDBFF) which the
//      caller already read. On success it is replaced with the combined
//      codepoint (U+10000..U+10FFFF). Otherwise it is left unchanged and
//      no events are consumed.
static void maybeEatUpto2ndHalfUp(HANDLE h, DWORD *ph1)
{
	// Peek into the queue arbitrary 16 records deep
	INPUT_RECORD r[16];
	DWORD got, i;

	if (!PeekConsoleInputW(h, r, sizeof(r) / sizeof(r[0]), &got))
		return;

	// we're conservative, and abort the search on anything which
	// seems out of place, like non-key event, non-2nd-half, etc.
	for (i = 0; i < got; ++i) {
		DWORD h2;
		int is2nd, isdown;

		if (r[i].EventType != KEY_EVENT)
			return;

		isdown = r[i].Event.KeyEvent.bKeyDown;
		h2 = r[i].Event.KeyEvent.uChar.UnicodeChar;
		is2nd = h2 >= 0xDC00 && h2 <= 0xDFFF;

		// skip 0 values, keyup of 1st half, and keydown of a 2nd half, if any
		if (!h2 || (h2 == *ph1 && !isdown) || (is2nd && isdown))
			continue;

		if (!is2nd)
			return;

		// got 2nd-half-up. eat the events up to and including this one.
		// if that fails, leave *ph1 alone so the lone half is delivered.
		if (!ReadConsoleInputW(h, r, i + 1, &got))
			return;

		// Each half carries 10 payload bits. The 0x10000 offset must be
		// ADDED, not OR-ed: the shifted high bits reach bit 16 whenever
		// the 1st half is 0xD840 or above, and OR would then lose the
		// offset (e.g. D840 DC00 must become U+20000, not U+10000).
		*ph1 = 0x10000 + (((*ph1 & 0x3FF) << 10) | (h2 & 0x3FF));
		return;
	}
}
1263
1264
/*
 * readConsoleInput_utf8 behaves similarly enough to ReadConsoleInputA when
 * the console (input) CP is UTF8, but addresses two issues with it:
 * - ReadConsoleInputA depends on the console CP, while this function
 *   doesn't, because it uses ReadConsoleInputW internally.
 * - ReadConsoleInputA with a console CP of UTF8 (65001) is buggy:
 *   - Doesn't work on Windows 7 (reads 0 or '?' for non-ASCII codepoints).
 *   - When used at the cmd.exe console - but not Windows Terminal:
 *     sometimes only key-up events arrive without the expected prior key-down.
 *     Seems to depend both on the console CP and the entered/pasted codepoint.
 *   - If reading one record at a time (which is how we use it), then input
 *     codepoints of U+0800 or higher crash the console/terminal window.
 *     (tested on Windows 10.0.19045.3086: console and Windows Terminal 1.17)
 *     Example: U+0C80 (UTF8: 0xE0 0xB2 0x80): "ಀ"
 *     Example: U+1F600 (UTF8: 0xF0 0x9F 0x98 0x80): "😀"
 *   - If reading more than one record at a time:
 *     - Unknown whether it can still crash in some cases (was not observed).
 *     - Codepoints above U+FFFF are broken, and arrive as
 *       U+FFFD REPLACEMENT CHARACTER "�"
 *   - A few more codepoints to test the issues above (and below):
 *     - U+0500 (UTF8: 0xD4, 0x80): "Ԁ" (OK in UTF8 CP, else maybe no key-down)
 *     - U+07C0 (UTF8: 0xDF, 0x80): "߀" (might exhibit missing key-down)
 *
 * So this function uses ReadConsoleInputW and then delivers the input as
 * UTF8, one byte per returned key event record:
 * - Works with any console CP, in Windows Terminal and Windows 7/10 console.
 * - Surrogate pairs are combined and delivered as a single UTF8 codepoint.
 *   - Occasional intermediate control events between the halves are ignored.
 *   - If we can't find the 2nd half, or if for some reason we get a 2nd half
 *     without the 1st, the half we got is delivered as UTF8 (a-la WTF8).
 * - The "sometimes key-down is missing" issue at the cmd.exe console happens
 *   also when using ReadConsoleInputW (for U+0080 or higher), so handle it.
 *   This can also happen with surrogate pairs.
 * - Up to 4 bytes of state are kept: the UTF8 form of a single codepoint.
 *
 * Returns FALSE if len is not 1 or if ReadConsoleInputW fails, else TRUE.
 * Non-key events are passed through unmodified.
 *
 * Gotchas (could be solved, but currently there's no need):
 * - We support reading one record at a time, else fail - to make it obvious.
 * - We have a state which is hidden from PeekConsoleInput - so not in sync.
 * - We don't deliver key-up events in some cases: when working around
 *   the "missing key-down" issue, and with a combined surrogate pair value.
 */
BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got)
{
	static unsigned char pending[4];	// UTF8 bytes of one codepoint
	static int next_byte = 0, n_bytes = 0;	// delivery cursor, byte count
	static INPUT_RECORD key_rec;	// record used for every delivered byte
	static wchar_t prev_down = 0;	// UnicodeChar of the last key-down seen

	if (len != 1)
		return FALSE;

	// all bytes of the previous codepoint were delivered: fetch a new event
	if (next_byte == n_bytes) {
		KEY_EVENT_RECORD *key;
		DWORD cp;

		if (!ReadConsoleInputW(h, r, 1, got))
			return FALSE;
		if (*got == 0 || r->EventType != KEY_EVENT)
			return TRUE;

		key_rec = *r;
		key = &key_rec.Event.KeyEvent;
		cp = key->uChar.UnicodeChar;

		// At the cmd.exe console (but not Windows Terminal) we sometimes
		// get key-up without the expected prior key-down event, sometimes
		// with a UnicodeChar of 0 in place of the key-down event. Work
		// around it by presenting such a non-ASCII key-up as a key-down.
		if (cp != 0 && key->bKeyDown)
			prev_down = cp;
		else if (cp > 127 && cp != prev_down)
			key->bKeyDown = TRUE;

		// for a 1st (high) surrogate half, try to eat up to and including
		// the 2nd (low) half, and merge both into cp.
		if (cp >= 0xD800 && cp <= 0xDBFF)
			maybeEatUpto2ndHalfUp(h, &cp);

		n_bytes = toutf8(cp, pending);
		next_byte = 0;
	}

	// hand out the next pending byte inside a copy of the saved key event
	*r = key_rec;
	r->Event.KeyEvent.uChar.AsciiChar = (char)pending[next_byte++];
	*got = 1;
	return TRUE;
}