win32: UTF8_OUTPUT: recover quicker from bad byte

When an unexpected value is detected in UTF-8, we should print the placeholder codepoint, and then recover whenever we detect a value which is valid for starting a new UTF-8 codepoint (including ASCII7). However, previously, we only tested recovery at the bytes following the unexpected one, and so if the first unexpected value was also valid for starting a new codepoint, then we didn't recover at it. Now we check for recovery from the first unexpected byte, which, if recoverable, requires both placeholder printout and recovery, so the recovery "unwinding" is modified a bit to allow the placeholder. Example of a sequence which now recovers quicker than before (where UTF-8 for U+1F600 "😀" is: 0xF0 0x9F 0x98 0x80): printf "\xF0\xF0\x9F\x98\x80A" Previously: ?A Now: ?😀A
author: Avi Halachmi (:avih) <avihpit@yahoo.com> 2024-01-30 18:44:52 +0200
committer: Ron Yorston <rmy@pobox.com> 2024-01-31 08:40:21 +0000
commit: e960b0d69d3f954d50e814a6bc4d6e206bde7f66 (patch)
tree: 4be87ed9e57f78e2d4b1914ba7a5eef9e218d128
parent: a750640a87ff0bad6e59b534264dddeaf8c6923b (diff)
download: busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.tar.gz
busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.tar.bz2
busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.zip
1 files changed, 19 insertions, 12 deletions
diff --git a/win32/winansi.c b/win32/winansi.c
index 6142a244c..66b040b31 100644
--- a/win32/winansi.c
+++ b/win32/winansi.c
@@ -1477,7 +1477,6 @@ static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz)
                while (c & (0x80 >> topbits))
                        ++topbits;
-        process_byte:
                if (state == 0 && topbits == 0) {
                        // valid ASCII7, state remains 0
                        codepoint = c;
@@ -1494,18 +1493,26 @@ static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz)
                        state = topbits - 1;  // remaining bytes after lead
                        continue;
-                } else if (state >= 0) {
-                        // invalid byte at state 0/1/2/3, add placeholder once
-                        codepoint = CONFIG_SUBST_WCHAR;
-                        state = -1;
                } else {
-                        // inside bad sequence (placeholder char already added)
+                        // already bad (state<0), or unexpected c at state 0-3.
-                        if (topbits == 1 || topbits > 4)
+                        // placeholder is added only at the 1st (state>=0).
-                                continue;  // still bad
+                        // regardless, c may be valid to reprocess as state 0
-                        // c is valid for state 0, process it with clean slate
+                        // (even when it's the 1st unexpected in state 1/2/3)
-                        state = 0;
+                        int placeholder_done = state < 0;
-                        goto process_byte;
+                        if (topbits < 5 && topbits != 1) {
+                                --u8buf;  // valid for state 0, reprocess
+                                ++u8siz;
+                                state = 0;
+                        } else {
+                                state = -1;  // set/keep bad state
+                        }
+                        if (placeholder_done)
+                                continue;
+                        // 1st unexpected char, add placeholder
+                        codepoint = CONFIG_SUBST_WCHAR;
                }
                // codepoint is complete
author	Avi Halachmi (:avih) <avihpit@yahoo.com>	2024-01-30 18:44:52 +0200
committer	Ron Yorston <rmy@pobox.com>	2024-01-31 08:40:21 +0000
commit	e960b0d69d3f954d50e814a6bc4d6e206bde7f66 (patch)
tree	4be87ed9e57f78e2d4b1914ba7a5eef9e218d128
parent	a750640a87ff0bad6e59b534264dddeaf8c6923b (diff)
download	busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.tar.gz busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.tar.bz2 busybox-w32-e960b0d69d3f954d50e814a6bc4d6e206bde7f66.zip

diff --git a/win32/winansi.c b/win32/winansi.c index 6142a244c..66b040b31 100644 --- a/win32/winansi.c +++ b/win32/winansi.c
@@ -1477,7 +1477,6 @@ static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz)
1477	while (c & (0x80 >> topbits))	1477	while (c & (0x80 >> topbits))
1478	++topbits;	1478	++topbits;
1479		1479
1480	process_byte:
1481	if (state == 0 && topbits == 0) {	1480	if (state == 0 && topbits == 0) {
1482	// valid ASCII7, state remains 0	1481	// valid ASCII7, state remains 0
1483	codepoint = c;	1482	codepoint = c;
@@ -1494,18 +1493,26 @@ static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz)
1494	state = topbits - 1; // remaining bytes after lead	1493	state = topbits - 1; // remaining bytes after lead
1495	continue;	1494	continue;
1496		1495
1497	} else if (state >= 0) {
1498	// invalid byte at state 0/1/2/3, add placeholder once
1499	codepoint = CONFIG_SUBST_WCHAR;
1500	state = -1;
1501
1502	} else {	1496	} else {
1503	// inside bad sequence (placeholder char already added)	1497	// already bad (state<0), or unexpected c at state 0-3.
1504	if (topbits == 1 \|\| topbits > 4)	1498	// placeholder is added only at the 1st (state>=0).
1505	continue; // still bad	1499	// regardless, c may be valid to reprocess as state 0
1506	// c is valid for state 0, process it with clean slate	1500	// (even when it's the 1st unexpected in state 1/2/3)
1507	state = 0;	1501	int placeholder_done = state < 0;
1508	goto process_byte;	1502
		1503	if (topbits < 5 && topbits != 1) {
		1504	--u8buf; // valid for state 0, reprocess
		1505	++u8siz;
		1506	state = 0;
		1507	} else {
		1508	state = -1; // set/keep bad state
		1509	}
		1510
		1511	if (placeholder_done)
		1512	continue;
		1513
		1514	// 1st unexpected char, add placeholder
		1515	codepoint = CONFIG_SUBST_WCHAR;
1509	}	1516	}
1510		1517
1511	// codepoint is complete	1518	// codepoint is complete