aboutsummaryrefslogtreecommitdiff
path: root/win32
diff options
context:
space:
mode:
Diffstat (limited to 'win32')
-rw-r--r--win32/termios.c5
-rw-r--r--win32/winansi.c160
2 files changed, 163 insertions, 2 deletions
diff --git a/win32/termios.c b/win32/termios.c
index 11c24c8b3..d70c1e685 100644
--- a/win32/termios.c
+++ b/win32/termios.c
@@ -57,7 +57,10 @@ int64_t FAST_FUNC windows_read_key(int fd, char *buf UNUSED_PARAM, int timeout)
57#if ENABLE_FEATURE_EURO_INPUT 57#if ENABLE_FEATURE_EURO_INPUT
58 if (!ReadConsoleInputW(cin, &record, 1, &nevent_out)) 58 if (!ReadConsoleInputW(cin, &record, 1, &nevent_out))
59#else 59#else
60 if (!ReadConsoleInput(cin, &record, 1, &nevent_out)) 60 // if ACP is UTF8 then we read UTF8 regardless of console (in) CP
61 if (GetConsoleCP() == CP_UTF8 || GetACP() == CP_UTF8
62 ? !readConsoleInput_utf8(cin, &record, 1, &nevent_out)
63 : !ReadConsoleInput(cin, &record, 1, &nevent_out))
61#endif 64#endif
62 goto done; 65 goto done;
63 66
diff --git a/win32/winansi.c b/win32/winansi.c
index 552ccc6ca..f823d14be 100644
--- a/win32/winansi.c
+++ b/win32/winansi.c
@@ -758,7 +758,8 @@ BOOL conToCharBuffA(LPSTR s, DWORD len)
758 CPINFO acp_info, con_info; 758 CPINFO acp_info, con_info;
759 WCHAR *buf; 759 WCHAR *buf;
760 760
761 if (acp == conicp) 761 // if acp is UTF8 then we got UTF8 via readConsoleInput_utf8
762 if (acp == conicp || acp == CP_UTF8)
762 return TRUE; 763 return TRUE;
763 764
764 if (!s || !GetCPInfo(acp, &acp_info) || !GetCPInfo(conicp, &con_info) || 765 if (!s || !GetCPInfo(acp, &acp_info) || !GetCPInfo(conicp, &con_info) ||
@@ -1189,3 +1190,160 @@ int mingw_isatty(int fd)
1189 1190
1190 return result; 1191 return result;
1191} 1192}
1193
1194// intentionally also converts invalid values (surrogate halfs, too big)
1195static int toutf8(DWORD cp, unsigned char *buf) {
1196 if (cp <= 0x7f) {
1197 *buf = cp;
1198 return 1;
1199 }
1200 if (cp <= 0x7ff) {
1201 *buf++ = 0xc0 | (cp >> 6);
1202 *buf = 0x80 | (cp & 0x3f);
1203 return 2;
1204 }
1205 if (cp <= 0xffff) {
1206 *buf++ = 0xe0 | (cp >> 12);
1207 *buf++ = 0x80 | ((cp >> 6) & 0x3f);
1208 *buf = 0x80 | (cp & 0x3f);
1209 return 3;
1210 }
1211 if (cp <= 0x10ffff) {
1212 *buf++ = 0xf0 | (cp >> 18);
1213 *buf++ = 0x80 | ((cp >> 12) & 0x3f);
1214 *buf++ = 0x80 | ((cp >> 6) & 0x3f);
1215 *buf = 0x80 | (cp & 0x3f);
1216 return 4;
1217 }
1218 // invalid. returning 0 works in our context because it's delivered
1219 // as a key event, where 0 values are typically ignored by the caller
1220 *buf = 0;
1221 return 1;
1222}
1223
1224// peek into the console input queue and try to find a key-up event of
1225// a surrugate-2nd-half, at which case eat the console events up to this
1226// one, and combine the pair values into *ph1
1227static void maybeEatUpto2ndHalfUp(HANDLE h, DWORD *ph1)
1228{
1229 // Peek into the queue arbitrary 16 records deep
1230 INPUT_RECORD r[16];
1231 DWORD got;
1232 int i;
1233
1234 if (!PeekConsoleInputW(h, r, 16, &got))
1235 return;
1236
1237 // we're conservative, and abort the search on anything which
1238 // seems out of place, like non-key event, non-2nd-half, etc.
1239 for (i = 0; i < got; ++i) {
1240 DWORD h2;
1241 int is2nd, isdown;
1242
1243 if (r[i].EventType != KEY_EVENT)
1244 return;
1245
1246 isdown = r[i].Event.KeyEvent.bKeyDown;
1247 h2 = r[i].Event.KeyEvent.uChar.UnicodeChar;
1248 is2nd = h2 >= 0xDC00 && h2 <= 0xDFFF;
1249
1250 // skip 0 values, keyup of 1st half, and keydown of a 2nd half, if any
1251 if (!h2 || (h2 == *ph1 && !isdown) || (is2nd && isdown))
1252 continue;
1253
1254 if (!is2nd)
1255 return;
1256
1257 // got 2nd-half-up. eat the events up to this, combine the values
1258 ReadConsoleInputW(h, r, i + 1, &got);
1259 *ph1 = 0x10000 | ((*ph1 & ~0xD800) << 10) | (h2 & ~0xDC00);
1260 return;
1261 }
1262}
1263
1264
1265/*
1266 * readConsoleInput_utf8 behaves similar enough to ReadConsoleInputA when
1267 * the console (input) CP is UTF8, but addressed two issues:
1268 * - It depend on the console CP, while we use ReadConsoleInputW internally.
1269 * - ReadConsoleInputA with Console CP of UTF8 (65001) is buggy:
1270 * - Doesn't work on Windows 7 (reads 0 or '?' for non-ASCII codepoints).
1271 * - When used at the cmd.exe console - but not Windows Terminal:
1272 * sometimes only key-up events arrive without the expected prior key-down.
1273 * Seems to depend both on the console CP and the entered/pasted codepoint.
1274 * - If reading one record at a time (which is how we use it), then input
1275 * codepoints of U+0800 or higher crash the console/terminal window.
1276 * (tested on Windows 10.0.19045.3086: console and Windows Terminal 1.17)
1277 * Example: U+0C80 (UTF8: 0xE0 0xB2 0x80): "ಀ"
1278 * Example: U+1F600 (UTF8: 0xF0 0x9F 0x98 0x80): "😀"
1279 * - If reading more than one record at a time:
1280 * - Unknown whether it can still crash in some cases (was not observed).
1281 * - Codepoints above U+FFFF are broken, and arrive as
1282 * U+FFFD REPLACEMENT CHARACTER "�"
1283 * - Few more codepoints to test the issues above (and below):
1284 * - U+0500 (UTF8: 0xD4, 0x80): "Ԁ" (OK in UTF8 CP, else maybe no key-down)
1285 * - U+07C0 (UTF8: 0xDF, 0x80): "߀" (might exhibit missing key-down)
1286 *
1287 * So this function uses ReadConsoleInputW and then delivers it as UTF8:
1288 * - Works with any console CP, in Windows terminal and Windows 7/10 console.
1289 * - Surrogate pairs are combined and delivered as a single UTF8 codepoint.
1290 * - Ignore occasional intermediate control events between the halfs.
1291 * - If we can't find the 2nd half, or if for some reason we get a 2nd half
1292 * wiithout the 1st, deliver the half we got as UTF8 (a-la WTF8).
1293 * - The "sometimes key-down is missing" issue at the cmd.exe console happens
1294 * also when using ReadConsoleInputW (for U+0080 or higher), so handle it.
1295 * This can also happen with surrogate pairs.
1296 * - Up to 4-bytes state is maintained for a single UTF8 codepoint buffer.
1297 *
1298 * Gotchas (could be solved, but currently there's no need):
1299 * - We support reading one record at a time, else fail - to make it obvious.
1300 * - We have a state which is hidden from PeekConsoleInput - so not in sync.
1301 * - We don't deliver key-up events in some cases: when working around
1302 * the "missing key-down" issue, and with combined surrogate halfs value.
1303 */
1304BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got)
1305{
1306 static unsigned char u8buf[4]; // any single codepoint in UTF8
1307 static int u8pos = 0, u8len = 0;
1308 static INPUT_RECORD srec;
1309
1310 if (len != 1)
1311 return FALSE;
1312
1313 if (u8pos == u8len) {
1314 DWORD codepoint;
1315
1316 if (!ReadConsoleInputW(h, r, 1, got))
1317 return FALSE;
1318 if (*got == 0 || r->EventType != KEY_EVENT)
1319 return TRUE;
1320
1321 srec = *r;
1322 codepoint = srec.Event.KeyEvent.uChar.UnicodeChar;
1323
1324 // At the cmd.exe console (but not windows terminal) we sometimes
1325 // get key-up without the prior expected key-down event, sometimes
1326 // with UnicodeChar of 0 instead the key-down event. work around it.
1327 if (codepoint) {
1328 static wchar_t last_down = 0;
1329
1330 if (srec.Event.KeyEvent.bKeyDown)
1331 last_down = codepoint;
1332 else if (codepoint > 127 && codepoint != last_down)
1333 srec.Event.KeyEvent.bKeyDown = TRUE;
1334 }
1335
1336 // if it's a 1st (high) surrogate pair half, try to eat upto and
1337 // including the 2nd (low) half, and combine them into codepoint.
1338 if (codepoint >= 0xD800 && codepoint <= 0xDBFF)
1339 maybeEatUpto2ndHalfUp(h, &codepoint);
1340
1341 u8len = toutf8(codepoint, u8buf);
1342 u8pos = 0;
1343 }
1344
1345 *r = srec;
1346 r->Event.KeyEvent.uChar.AsciiChar = (char)u8buf[u8pos++];
1347 *got = 1;
1348 return TRUE;
1349}