diff options
author | Avi Halachmi (:avih) <avihpit@yahoo.com> | 2024-01-30 18:44:52 +0200 |
---|---|---|
committer | Ron Yorston <rmy@pobox.com> | 2024-01-31 08:40:21 +0000 |
commit | e960b0d69d3f954d50e814a6bc4d6e206bde7f66 (patch) | |
tree | 4be87ed9e57f78e2d4b1914ba7a5eef9e218d128 | |
parent | a750640a87ff0bad6e59b534264dddeaf8c6923b (diff) | |
download | busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.tar.gz busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.tar.bz2 busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.zip |
win32: UTF8_OUTPUT: recover quicker from bad byte
When an unexpected value is detected in UTF-8, we should print the
placeholder codepoint, and then recover whenever we detect a value
which is valid for starting a new UTF-8 codepoint (including ASCII7).
However, previously, we only tested recovery at the bytes following
the unexpected one, and so if the first unexpected value was also
valid for a new codepoint, then we didn't recover from it.
Now we check for recovery from the first unexpected byte, which,
if recoverable, requires both placeholder printout and recovery,
so the recovery "unwinding" is modified a bit to allow placeholder.
Example of a sequence which now recovers quicker than before:
(where UTF-8 for U+1F600 "😀" is: 0xF0 0x9F 0x98 0x80)
printf "\xF0\xF0\x9F\x98\x80A"
Previously: ?A
Now: ?😀A
-rw-r--r-- | win32/winansi.c | 31 |
1 files changed, 19 insertions, 12 deletions
diff --git a/win32/winansi.c b/win32/winansi.c index 6142a244c..66b040b31 100644 --- a/win32/winansi.c +++ b/win32/winansi.c | |||
@@ -1477,7 +1477,6 @@ static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz) | |||
1477 | while (c & (0x80 >> topbits)) | 1477 | while (c & (0x80 >> topbits)) |
1478 | ++topbits; | 1478 | ++topbits; |
1479 | 1479 | ||
1480 | process_byte: | ||
1481 | if (state == 0 && topbits == 0) { | 1480 | if (state == 0 && topbits == 0) { |
1482 | // valid ASCII7, state remains 0 | 1481 | // valid ASCII7, state remains 0 |
1483 | codepoint = c; | 1482 | codepoint = c; |
@@ -1494,18 +1493,26 @@ static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz) | |||
1494 | state = topbits - 1; // remaining bytes after lead | 1493 | state = topbits - 1; // remaining bytes after lead |
1495 | continue; | 1494 | continue; |
1496 | 1495 | ||
1497 | } else if (state >= 0) { | ||
1498 | // invalid byte at state 0/1/2/3, add placeholder once | ||
1499 | codepoint = CONFIG_SUBST_WCHAR; | ||
1500 | state = -1; | ||
1501 | |||
1502 | } else { | 1496 | } else { |
1503 | // inside bad sequence (placeholder char already added) | 1497 | // already bad (state<0), or unexpected c at state 0-3. |
1504 | if (topbits == 1 || topbits > 4) | 1498 | // placeholder is added only at the 1st (state>=0). |
1505 | continue; // still bad | 1499 | // regardless, c may be valid to reprocess as state 0 |
1506 | // c is valid for state 0, process it with clean slate | 1500 | // (even when it's the 1st unexpected in state 1/2/3) |
1507 | state = 0; | 1501 | int placeholder_done = state < 0; |
1508 | goto process_byte; | 1502 | |
1503 | if (topbits < 5 && topbits != 1) { | ||
1504 | --u8buf; // valid for state 0, reprocess | ||
1505 | ++u8siz; | ||
1506 | state = 0; | ||
1507 | } else { | ||
1508 | state = -1; // set/keep bad state | ||
1509 | } | ||
1510 | |||
1511 | if (placeholder_done) | ||
1512 | continue; | ||
1513 | |||
1514 | // 1st unexpected char, add placeholder | ||
1515 | codepoint = CONFIG_SUBST_WCHAR; | ||
1509 | } | 1516 | } |
1510 | 1517 | ||
1511 | // codepoint is complete | 1518 | // codepoint is complete |