aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAvi Halachmi (:avih) <avihpit@yahoo.com>2024-01-30 18:44:52 +0200
committerRon Yorston <rmy@pobox.com>2024-01-31 08:40:21 +0000
commite960b0d69d3f954d50e814a6bc4d6e206bde7f66 (patch)
tree4be87ed9e57f78e2d4b1914ba7a5eef9e218d128
parenta750640a87ff0bad6e59b534264dddeaf8c6923b (diff)
downloadbusybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.tar.gz
busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.tar.bz2
busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.zip
win32: UTF8_OUTPUT: recover quicker from bad byte
When an unexpected value is detected in UTF-8, we should print the placeholder codepoint, and then recover whenever we detect a value which is valid for starting a new UTF-8 codepoint (including ASCII7). However, previously, we only tested recovery at the bytes following the unexpected one, and so if the first unexpected value was also valid for starting a new codepoint, then we didn't recover from it. Now we check for recovery starting from the first unexpected byte, which, if recoverable, requires both placeholder printout and recovery, so the recovery "unwinding" is modified a bit to allow the placeholder. Example of a sequence which now recovers quicker than before (where UTF-8 for U+1F600 "😀" is: 0xF0 0x9F 0x98 0x80): printf "\xF0\xF0\x9F\x98\x80A" Previously: ?A Now: ?😀A
-rw-r--r--win32/winansi.c31
1 files changed, 19 insertions, 12 deletions
diff --git a/win32/winansi.c b/win32/winansi.c
index 6142a244c..66b040b31 100644
--- a/win32/winansi.c
+++ b/win32/winansi.c
@@ -1477,7 +1477,6 @@ static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz)
1477 while (c & (0x80 >> topbits)) 1477 while (c & (0x80 >> topbits))
1478 ++topbits; 1478 ++topbits;
1479 1479
1480 process_byte:
1481 if (state == 0 && topbits == 0) { 1480 if (state == 0 && topbits == 0) {
1482 // valid ASCII7, state remains 0 1481 // valid ASCII7, state remains 0
1483 codepoint = c; 1482 codepoint = c;
@@ -1494,18 +1493,26 @@ static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz)
1494 state = topbits - 1; // remaining bytes after lead 1493 state = topbits - 1; // remaining bytes after lead
1495 continue; 1494 continue;
1496 1495
1497 } else if (state >= 0) {
1498 // invalid byte at state 0/1/2/3, add placeholder once
1499 codepoint = CONFIG_SUBST_WCHAR;
1500 state = -1;
1501
1502 } else { 1496 } else {
1503 // inside bad sequence (placeholder char already added) 1497 // already bad (state<0), or unexpected c at state 0-3.
1504 if (topbits == 1 || topbits > 4) 1498 // placeholder is added only at the 1st (state>=0).
1505 continue; // still bad 1499 // regardless, c may be valid to reprocess as state 0
1506 // c is valid for state 0, process it with clean slate 1500 // (even when it's the 1st unexpected in state 1/2/3)
1507 state = 0; 1501 int placeholder_done = state < 0;
1508 goto process_byte; 1502
1503 if (topbits < 5 && topbits != 1) {
1504 --u8buf; // valid for state 0, reprocess
1505 ++u8siz;
1506 state = 0;
1507 } else {
1508 state = -1; // set/keep bad state
1509 }
1510
1511 if (placeholder_done)
1512 continue;
1513
1514 // 1st unexpected char, add placeholder
1515 codepoint = CONFIG_SUBST_WCHAR;
1509 } 1516 }
1510 1517
1511 // codepoint is complete 1518 // codepoint is complete