diff options
| author | Avi Halachmi (:avih) <avihpit@yahoo.com> | 2023-06-25 01:42:52 +0300 |
|---|---|---|
| committer | Avi Halachmi (:avih) <avihpit@yahoo.com> | 2023-06-28 18:08:00 +0300 |
| commit | 1602a45b797908025dc71e6a07149a39fdb12a48 (patch) | |
| tree | b381b12bacfe1fe5acf154e16d6050a34083f540 | |
| parent | ebe80f3e5c9b612f4d1b6e444c9badc10f9f2745 (diff) | |
| download | busybox-w32-1602a45b797908025dc71e6a07149a39fdb12a48.tar.gz busybox-w32-1602a45b797908025dc71e6a07149a39fdb12a48.tar.bz2 busybox-w32-1602a45b797908025dc71e6a07149a39fdb12a48.zip | |
win32: the great UTF8 ReadConsoleInput hack
Since commit 597d31ee (EURO_INPUT), ReadConsoleInputA is the default.
The main problem with that is that if the console codepage is UTF8,
e.g. after "chcp 65001", then typing or pasting can result in a crash
of the console itself (the Windows Terminal or cmd.exe window closes).
Additionally and regardless of this crash, ReadConsoleInputA is
apparently buggy with UTF8 CP also otherwise.
For instance, on Windows 7 only ASCII values work - others become '?'.
Or sometimes in Windows 10 (cmd.exe console but not Windows terminal)
only key-up events arrive for some non-ASCII codepoints (without
a prior key-down), and more.
So this commit implements readConsoleInput_utf8 which delivers UTF8
Regardless of CP, including of surrogate pairs, and works on win 7/10.
Other than fixing the crash and working much better with UTF8 console
CP, it also allows a build with the UTF8 manifest to capture correctly
arbitrary unicode inputs which are typed or pasted into the console
regardless of the console CP.
However, it doesn't look OK unless the console CP is set to UTF8
(which we don't do automatically, but the user can chcp 65001),
and editing is still lacking due to missing screen-length awareness.
To reproduce the crash: start a new console window, 'chcp 65001', run
this program (or busybox sh), and paste "ಀ" or "😀" (U+0C80, U+1F600)
#include <windows.h>
/*
 * Minimal reproducer: fetch console input records one at a time with
 * ReadConsoleInputA and discard them, until the call fails.
 */
int main()
{
	INPUT_RECORD rec;
	DWORD count;
	HANDLE in = GetStdHandle(STD_INPUT_HANDLE);

	for (;;) {
		/* exactly one record per call; the record itself is ignored */
		if (!ReadConsoleInputA(in, &rec, 1, &count))
			break;
	}
	return 0;
}
| -rw-r--r-- | include/mingw.h | 3 | ||||
| -rw-r--r-- | win32/termios.c | 5 | ||||
| -rw-r--r-- | win32/winansi.c | 160 |
3 files changed, 166 insertions, 2 deletions
diff --git a/include/mingw.h b/include/mingw.h index a826d7eaa..34c8bfa3f 100644 --- a/include/mingw.h +++ b/include/mingw.h | |||
| @@ -153,6 +153,9 @@ IMPL(setlinebuf, void, ,FILE *fd UNUSED_PARAM) | |||
| 153 | BOOL conToCharBuffA(LPSTR d, DWORD len); | 153 | BOOL conToCharBuffA(LPSTR d, DWORD len); |
| 154 | BOOL conToCharA(LPSTR d); | 154 | BOOL conToCharA(LPSTR d); |
| 155 | 155 | ||
| 156 | // same as ReadConsoleInputA, but delivers UTF8 regardless of console CP | ||
| 157 | BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got); | ||
| 158 | |||
| 156 | void set_title(const char *str); | 159 | void set_title(const char *str); |
| 157 | void move_cursor_row(int n); | 160 | void move_cursor_row(int n); |
| 158 | void reset_screen(void); | 161 | void reset_screen(void); |
diff --git a/win32/termios.c b/win32/termios.c index 11c24c8b3..d70c1e685 100644 --- a/win32/termios.c +++ b/win32/termios.c | |||
| @@ -57,7 +57,10 @@ int64_t FAST_FUNC windows_read_key(int fd, char *buf UNUSED_PARAM, int timeout) | |||
| 57 | #if ENABLE_FEATURE_EURO_INPUT | 57 | #if ENABLE_FEATURE_EURO_INPUT |
| 58 | if (!ReadConsoleInputW(cin, &record, 1, &nevent_out)) | 58 | if (!ReadConsoleInputW(cin, &record, 1, &nevent_out)) |
| 59 | #else | 59 | #else |
| 60 | if (!ReadConsoleInput(cin, &record, 1, &nevent_out)) | 60 | // if ACP is UTF8 then we read UTF8 regardless of console (in) CP |
| 61 | if (GetConsoleCP() == CP_UTF8 || GetACP() == CP_UTF8 | ||
| 62 | ? !readConsoleInput_utf8(cin, &record, 1, &nevent_out) | ||
| 63 | : !ReadConsoleInput(cin, &record, 1, &nevent_out)) | ||
| 61 | #endif | 64 | #endif |
| 62 | goto done; | 65 | goto done; |
| 63 | 66 | ||
diff --git a/win32/winansi.c b/win32/winansi.c index 552ccc6ca..f823d14be 100644 --- a/win32/winansi.c +++ b/win32/winansi.c | |||
| @@ -758,7 +758,8 @@ BOOL conToCharBuffA(LPSTR s, DWORD len) | |||
| 758 | CPINFO acp_info, con_info; | 758 | CPINFO acp_info, con_info; |
| 759 | WCHAR *buf; | 759 | WCHAR *buf; |
| 760 | 760 | ||
| 761 | if (acp == conicp) | 761 | // if acp is UTF8 then we got UTF8 via readConsoleInput_utf8 |
| 762 | if (acp == conicp || acp == CP_UTF8) | ||
| 762 | return TRUE; | 763 | return TRUE; |
| 763 | 764 | ||
| 764 | if (!s || !GetCPInfo(acp, &acp_info) || !GetCPInfo(conicp, &con_info) || | 765 | if (!s || !GetCPInfo(acp, &acp_info) || !GetCPInfo(conicp, &con_info) || |
| @@ -1189,3 +1190,160 @@ int mingw_isatty(int fd) | |||
| 1189 | 1190 | ||
| 1190 | return result; | 1191 | return result; |
| 1191 | } | 1192 | } |
| 1193 | |||
| 1194 | // intentionally also converts invalid values (surrogate halfs, too big) | ||
| 1195 | static int toutf8(DWORD cp, unsigned char *buf) { | ||
| 1196 | if (cp <= 0x7f) { | ||
| 1197 | *buf = cp; | ||
| 1198 | return 1; | ||
| 1199 | } | ||
| 1200 | if (cp <= 0x7ff) { | ||
| 1201 | *buf++ = 0xc0 | (cp >> 6); | ||
| 1202 | *buf = 0x80 | (cp & 0x3f); | ||
| 1203 | return 2; | ||
| 1204 | } | ||
| 1205 | if (cp <= 0xffff) { | ||
| 1206 | *buf++ = 0xe0 | (cp >> 12); | ||
| 1207 | *buf++ = 0x80 | ((cp >> 6) & 0x3f); | ||
| 1208 | *buf = 0x80 | (cp & 0x3f); | ||
| 1209 | return 3; | ||
| 1210 | } | ||
| 1211 | if (cp <= 0x10ffff) { | ||
| 1212 | *buf++ = 0xf0 | (cp >> 18); | ||
| 1213 | *buf++ = 0x80 | ((cp >> 12) & 0x3f); | ||
| 1214 | *buf++ = 0x80 | ((cp >> 6) & 0x3f); | ||
| 1215 | *buf = 0x80 | (cp & 0x3f); | ||
| 1216 | return 4; | ||
| 1217 | } | ||
| 1218 | // invalid. returning 0 works in our context because it's delivered | ||
| 1219 | // as a key event, where 0 values are typically ignored by the caller | ||
| 1220 | *buf = 0; | ||
| 1221 | return 1; | ||
| 1222 | } | ||
| 1223 | |||
| 1224 | // peek into the console input queue and try to find a key-up event of | ||
| 1225 | // a surrugate-2nd-half, at which case eat the console events up to this | ||
| 1226 | // one, and combine the pair values into *ph1 | ||
| 1227 | static void maybeEatUpto2ndHalfUp(HANDLE h, DWORD *ph1) | ||
| 1228 | { | ||
| 1229 | // Peek into the queue arbitrary 16 records deep | ||
| 1230 | INPUT_RECORD r[16]; | ||
| 1231 | DWORD got; | ||
| 1232 | int i; | ||
| 1233 | |||
| 1234 | if (!PeekConsoleInputW(h, r, 16, &got)) | ||
| 1235 | return; | ||
| 1236 | |||
| 1237 | // we're conservative, and abort the search on anything which | ||
| 1238 | // seems out of place, like non-key event, non-2nd-half, etc. | ||
| 1239 | for (i = 0; i < got; ++i) { | ||
| 1240 | DWORD h2; | ||
| 1241 | int is2nd, isdown; | ||
| 1242 | |||
| 1243 | if (r[i].EventType != KEY_EVENT) | ||
| 1244 | return; | ||
| 1245 | |||
| 1246 | isdown = r[i].Event.KeyEvent.bKeyDown; | ||
| 1247 | h2 = r[i].Event.KeyEvent.uChar.UnicodeChar; | ||
| 1248 | is2nd = h2 >= 0xDC00 && h2 <= 0xDFFF; | ||
| 1249 | |||
| 1250 | // skip 0 values, keyup of 1st half, and keydown of a 2nd half, if any | ||
| 1251 | if (!h2 || (h2 == *ph1 && !isdown) || (is2nd && isdown)) | ||
| 1252 | continue; | ||
| 1253 | |||
| 1254 | if (!is2nd) | ||
| 1255 | return; | ||
| 1256 | |||
| 1257 | // got 2nd-half-up. eat the events up to this, combine the values | ||
| 1258 | ReadConsoleInputW(h, r, i + 1, &got); | ||
| 1259 | *ph1 = 0x10000 | ((*ph1 & ~0xD800) << 10) | (h2 & ~0xDC00); | ||
| 1260 | return; | ||
| 1261 | } | ||
| 1262 | } | ||
| 1263 | |||
| 1264 | |||
| 1265 | /* | ||
| 1266 | * readConsoleInput_utf8 behaves similar enough to ReadConsoleInputA when | ||
| 1267 | * the console (input) CP is UTF8, but addressed two issues: | ||
| 1268 | * - It depend on the console CP, while we use ReadConsoleInputW internally. | ||
| 1269 | * - ReadConsoleInputA with Console CP of UTF8 (65001) is buggy: | ||
| 1270 | * - Doesn't work on Windows 7 (reads 0 or '?' for non-ASCII codepoints). | ||
| 1271 | * - When used at the cmd.exe console - but not Windows Terminal: | ||
| 1272 | * sometimes only key-up events arrive without the expected prior key-down. | ||
| 1273 | * Seems to depend both on the console CP and the entered/pasted codepoint. | ||
| 1274 | * - If reading one record at a time (which is how we use it), then input | ||
| 1275 | * codepoints of U+0800 or higher crash the console/terminal window. | ||
| 1276 | * (tested on Windows 10.0.19045.3086: console and Windows Terminal 1.17) | ||
| 1277 | * Example: U+0C80 (UTF8: 0xE0 0xB2 0x80): "ಀ" | ||
| 1278 | * Example: U+1F600 (UTF8: 0xF0 0x9F 0x98 0x80): "😀" | ||
| 1279 | * - If reading more than one record at a time: | ||
| 1280 | * - Unknown whether it can still crash in some cases (was not observed). | ||
| 1281 | * - Codepoints above U+FFFF are broken, and arrive as | ||
| 1282 | * U+FFFD REPLACEMENT CHARACTER "�" | ||
| 1283 | * - Few more codepoints to test the issues above (and below): | ||
| 1284 | * - U+0500 (UTF8: 0xD4, 0x80): "Ԁ" (OK in UTF8 CP, else maybe no key-down) | ||
| 1285 | * - U+07C0 (UTF8: 0xDF, 0x80): "߀" (might exhibit missing key-down) | ||
| 1286 | * | ||
| 1287 | * So this function uses ReadConsoleInputW and then delivers it as UTF8: | ||
| 1288 | * - Works with any console CP, in Windows terminal and Windows 7/10 console. | ||
| 1289 | * - Surrogate pairs are combined and delivered as a single UTF8 codepoint. | ||
| 1290 | * - Ignore occasional intermediate control events between the halfs. | ||
| 1291 | * - If we can't find the 2nd half, or if for some reason we get a 2nd half | ||
| 1292 | * wiithout the 1st, deliver the half we got as UTF8 (a-la WTF8). | ||
| 1293 | * - The "sometimes key-down is missing" issue at the cmd.exe console happens | ||
| 1294 | * also when using ReadConsoleInputW (for U+0080 or higher), so handle it. | ||
| 1295 | * This can also happen with surrogate pairs. | ||
| 1296 | * - Up to 4-bytes state is maintained for a single UTF8 codepoint buffer. | ||
| 1297 | * | ||
| 1298 | * Gotchas (could be solved, but currently there's no need): | ||
| 1299 | * - We support reading one record at a time, else fail - to make it obvious. | ||
| 1300 | * - We have a state which is hidden from PeekConsoleInput - so not in sync. | ||
| 1301 | * - We don't deliver key-up events in some cases: when working around | ||
| 1302 | * the "missing key-down" issue, and with combined surrogate halfs value. | ||
| 1303 | */ | ||
| 1304 | BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got) | ||
| 1305 | { | ||
| 1306 | static unsigned char u8buf[4]; // any single codepoint in UTF8 | ||
| 1307 | static int u8pos = 0, u8len = 0; | ||
| 1308 | static INPUT_RECORD srec; | ||
| 1309 | |||
| 1310 | if (len != 1) | ||
| 1311 | return FALSE; | ||
| 1312 | |||
| 1313 | if (u8pos == u8len) { | ||
| 1314 | DWORD codepoint; | ||
| 1315 | |||
| 1316 | if (!ReadConsoleInputW(h, r, 1, got)) | ||
| 1317 | return FALSE; | ||
| 1318 | if (*got == 0 || r->EventType != KEY_EVENT) | ||
| 1319 | return TRUE; | ||
| 1320 | |||
| 1321 | srec = *r; | ||
| 1322 | codepoint = srec.Event.KeyEvent.uChar.UnicodeChar; | ||
| 1323 | |||
| 1324 | // At the cmd.exe console (but not windows terminal) we sometimes | ||
| 1325 | // get key-up without the prior expected key-down event, sometimes | ||
| 1326 | // with UnicodeChar of 0 instead the key-down event. work around it. | ||
| 1327 | if (codepoint) { | ||
| 1328 | static wchar_t last_down = 0; | ||
| 1329 | |||
| 1330 | if (srec.Event.KeyEvent.bKeyDown) | ||
| 1331 | last_down = codepoint; | ||
| 1332 | else if (codepoint > 127 && codepoint != last_down) | ||
| 1333 | srec.Event.KeyEvent.bKeyDown = TRUE; | ||
| 1334 | } | ||
| 1335 | |||
| 1336 | // if it's a 1st (high) surrogate pair half, try to eat upto and | ||
| 1337 | // including the 2nd (low) half, and combine them into codepoint. | ||
| 1338 | if (codepoint >= 0xD800 && codepoint <= 0xDBFF) | ||
| 1339 | maybeEatUpto2ndHalfUp(h, &codepoint); | ||
| 1340 | |||
| 1341 | u8len = toutf8(codepoint, u8buf); | ||
| 1342 | u8pos = 0; | ||
| 1343 | } | ||
| 1344 | |||
| 1345 | *r = srec; | ||
| 1346 | r->Event.KeyEvent.uChar.AsciiChar = (char)u8buf[u8pos++]; | ||
| 1347 | *got = 1; | ||
| 1348 | return TRUE; | ||
| 1349 | } | ||
