lineedit: invalid unicode characters are replaced with CONFIG_SUBST_WCHAR

function old new delta read_key_ungets - 50 +50 lineedit_read_key 223 252 +29 Signed-off-by: Tomas Heinrich <heinrich.tomas@gmail.com> Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
author: Tomas Heinrich <heinrich.tomas@gmail.com> 2010-03-09 14:09:24 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2010-03-09 14:09:24 +0100
commit: d2b04050c0a9a15e29e15cbf9c487db93d07c46e (patch)
tree: 19929bed97b4e6ddc028465f9fd0a7b3a5d28b5f
parent: f15620c3774c164ee6c1e2fbf9dd481b606a95a1 (diff)
download: busybox-w32-d2b04050c0a9a15e29e15cbf9c487db93d07c46e.tar.gz
busybox-w32-d2b04050c0a9a15e29e15cbf9c487db93d07c46e.tar.bz2
busybox-w32-d2b04050c0a9a15e29e15cbf9c487db93d07c46e.zip
4 files changed, 73 insertions, 5 deletions
diff --git a/include/libbb.h b/include/libbb.h
index ead1020dd..fccc816cb 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -1277,6 +1277,7 @@ enum {
 * on first call.
 */
 int64_t read_key(int fd, char *buffer) FAST_FUNC;
+void read_key_ungets(char *buffer, const char *str, unsigned len) FAST_FUNC;
 #if ENABLE_FEATURE_EDITING
diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index c50b31d67..8e339da53 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -1700,18 +1700,34 @@ static int lineedit_read_key(char *read_key_buffer)
 #endif
 #if ENABLE_FEATURE_ASSUME_UNICODE
-                {
+                if (unicode_status == UNICODE_ON) {
                        wchar_t wc;
                        if ((int32_t)ic < 0) /* KEYCODE_xxx */
                                return ic;
+                        // TODO: imagine sequence like: 0xff, <left-arrow>: we are currently losing 0xff...
                        unicode_buf[unicode_idx++] = ic;
                        unicode_buf[unicode_idx] = '\0';
-                        if (mbstowcs(&wc, unicode_buf, 1) != 1 && unicode_idx < MB_CUR_MAX) {
+                        if (mbstowcs(&wc, unicode_buf, 1) != 1) {
-                                delay = 50;
+                                /* Not (yet?) a valid unicode char */
-                                goto poll_again;
+                                if (unicode_idx < MB_CUR_MAX) {
+                                        delay = 50;
+                                        goto poll_again;
+                                }
+                                /* Invalid sequence. Save all "bad bytes" except first */
+                                read_key_ungets(read_key_buffer, unicode_buf + 1, MB_CUR_MAX - 1);
+                                /*
+                                 * ic = unicode_buf[0] sounds even better, but currently
+                                 * this does not work: wchar_t[] -> char[] conversion
+                                 * when lineedit finishes mangles such "raw bytes"
+                                 * (by misinterpreting them as unicode chars):
+                                 */
+                                ic = CONFIG_SUBST_WCHAR;
+                        } else {
+                                /* Valid unicode char, return its code */
+                                ic = wc;
                        }
-                        ic = wc;
                }
 #endif
        } while (errno == EAGAIN);
diff --git a/libbb/read_key.c b/libbb/read_key.c
index a2253ce3e..98b3131de 100644
--- a/libbb/read_key.c
+++ b/libbb/read_key.c
@@ -246,3 +246,12 @@ int64_t FAST_FUNC read_key(int fd, char *buffer)
        buffer[-1] = 0;
        goto start_over;
 }
+/*
+ * Append up to len bytes from str to the bytes already pending in buffer.
+ *
+ * buffer[0] is treated as an unsigned count of pending bytes and the
+ * pending bytes themselves start at buffer[1]. The new bytes are stored
+ * right after the existing ones. If they do not all fit, len is silently
+ * clamped so that the count never exceeds KEYCODE_BUFFER_SIZE-1.
+ *
+ * NOTE(review): assumes buffer[0] <= KEYCODE_BUFFER_SIZE-1 on entry;
+ * otherwise the unsigned subtraction below wraps - confirm with callers.
+ */
+void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len)
+{
+        unsigned cur_len = (unsigned char)buffer[0];
+        if (len > KEYCODE_BUFFER_SIZE-1 - cur_len)
+                len = KEYCODE_BUFFER_SIZE-1 - cur_len;
+        memcpy(buffer + 1 + cur_len, str, len);
+        /* buffer[0] already holds cur_len: account only for the new bytes */
+        buffer[0] += len;
+}
diff --git a/testsuite/ash.tests b/testsuite/ash.tests
new file mode 100755
index 000000000..4b6efe42c
--- /dev/null
+++ b/testsuite/ash.tests
@@ -0,0 +1,42 @@
+#!/bin/sh
+#
+# These are not ash tests, we use ash as a way to test lineedit!
+#
+# Copyright 2010 by Denys Vlasenko
+# Licensed under GPL v2, see file LICENSE for details.
+. ./testing.sh
+# testing "test name" "options" "expected result" "file input" "stdin"
+testing "One byte which is not valid unicode char followed by valid input" \
+        "script -q -c 'ash' /dev/null >/dev/null; cat output; rm output" \
+        "\
+00000000  3f 2d 0a                                          |?-.|
+00000003
+" \
+        "" \
+        "echo \xff- | hexdump -C >output; exit; exit; exit; exit\n" \
+testing "30 bytes which are not valid unicode chars followed by valid input" \
+        "script -q -c 'ash' /dev/null >/dev/null; cat output; rm output" \
+        "\
+00000000  3f 3f 3f 3f 3f 3f 3f 3f  3f 3f 3f 3f 3f 3f 3f 3f  |????????????????|
+00000010  3f 3f 3f 3f 3f 3f 3f 3f  3f 3f 3f 3f 3f 3f 2d 0a  |??????????????-.|
+00000020
+" \
+        "" \
+        "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >output; exit; exit; exit; exit\n" \
+# Not sure this behavior is perfect: we lose all invalid input which precedes
+# arrow keys and such. In this example, \xff\xff are lost
+testing "2 bytes which are not valid unicode chars followed by left arrow key" \
+        "script -q -c 'ash' /dev/null >/dev/null; cat output; rm output" \
+        "\
+00000000  3d 2d 0a                                          |=-.|
+00000003
+" \
+        "" \
+        "echo =+\xff\xff\x1b\x5b\x44- | hexdump -C >output; exit; exit; exit; exit\n" \
+exit $FAILCOUNT
author	Tomas Heinrich <heinrich.tomas@gmail.com>	2010-03-09 14:09:24 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2010-03-09 14:09:24 +0100
commit	d2b04050c0a9a15e29e15cbf9c487db93d07c46e (patch)
tree	19929bed97b4e6ddc028465f9fd0a7b3a5d28b5f
parent	f15620c3774c164ee6c1e2fbf9dd481b606a95a1 (diff)
download	busybox-w32-d2b04050c0a9a15e29e15cbf9c487db93d07c46e.tar.gz busybox-w32-d2b04050c0a9a15e29e15cbf9c487db93d07c46e.tar.bz2 busybox-w32-d2b04050c0a9a15e29e15cbf9c487db93d07c46e.zip

diff --git a/include/libbb.h b/include/libbb.h index ead1020dd..fccc816cb 100644 --- a/include/libbb.h +++ b/include/libbb.h
@@ -1277,6 +1277,7 @@ enum {
1277	* on first call.	1277	* on first call.
1278	*/	1278	*/
1279	int64_t read_key(int fd, char *buffer) FAST_FUNC;	1279	int64_t read_key(int fd, char *buffer) FAST_FUNC;
		1280	void read_key_ungets(char *buffer, const char *str, unsigned len) FAST_FUNC;
1280		1281
1281		1282
1282	#if ENABLE_FEATURE_EDITING	1283	#if ENABLE_FEATURE_EDITING


diff --git a/libbb/lineedit.c b/libbb/lineedit.c index c50b31d67..8e339da53 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c
@@ -1700,18 +1700,34 @@ static int lineedit_read_key(char *read_key_buffer)
1700	#endif	1700	#endif
1701		1701
1702	#if ENABLE_FEATURE_ASSUME_UNICODE	1702	#if ENABLE_FEATURE_ASSUME_UNICODE
1703	{	1703	if (unicode_status == UNICODE_ON) {
1704	wchar_t wc;	1704	wchar_t wc;
1705		1705
1706	if ((int32_t)ic < 0) /* KEYCODE_xxx */	1706	if ((int32_t)ic < 0) /* KEYCODE_xxx */
1707	return ic;	1707	return ic;
		1708	// TODO: imagine sequence like: 0xff, <left-arrow>: we are currently losing 0xff...
		1709
1708	unicode_buf[unicode_idx++] = ic;	1710	unicode_buf[unicode_idx++] = ic;
1709	unicode_buf[unicode_idx] = '\0';	1711	unicode_buf[unicode_idx] = '\0';
1710	if (mbstowcs(&wc, unicode_buf, 1) != 1 && unicode_idx < MB_CUR_MAX) {	1712	if (mbstowcs(&wc, unicode_buf, 1) != 1) {
1711	delay = 50;	1713	/* Not (yet?) a valid unicode char */
1712	goto poll_again;	1714	if (unicode_idx < MB_CUR_MAX) {
		1715	delay = 50;
		1716	goto poll_again;
		1717	}
		1718	/* Invalid sequence. Save all "bad bytes" except first */
		1719	read_key_ungets(read_key_buffer, unicode_buf + 1, MB_CUR_MAX - 1);
		1720	/*
		1721	* ic = unicode_buf[0] sounds even better, but currently
		1722	* this does not work: wchar_t[] -> char[] conversion
		1723	* when lineedit finishes mangles such "raw bytes"
		1724	* (by misinterpreting them as unicode chars):
		1725	*/
		1726	ic = CONFIG_SUBST_WCHAR;
		1727	} else {
		1728	/* Valid unicode char, return its code */
		1729	ic = wc;
1713	}	1730	}
1714	ic = wc;
1715	}	1731	}
1716	#endif	1732	#endif
1717	} while (errno == EAGAIN);	1733	} while (errno == EAGAIN);


diff --git a/libbb/read_key.c b/libbb/read_key.c index a2253ce3e..98b3131de 100644 --- a/libbb/read_key.c +++ b/libbb/read_key.c
@@ -246,3 +246,12 @@ int64_t FAST_FUNC read_key(int fd, char *buffer)
246	buffer[-1] = 0;	246	buffer[-1] = 0;
247	goto start_over;	247	goto start_over;
248	}	248	}
		249
		250	void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len)
		251	{
		252	unsigned cur_len = (unsigned char)buffer[0];
		253	if (len > KEYCODE_BUFFER_SIZE-1 - cur_len)
		254	len = KEYCODE_BUFFER_SIZE-1 - cur_len;
		255	memcpy(buffer + 1 + cur_len, str, len);
		256	buffer[0] += cur_len + len;
		257	}


diff --git a/testsuite/ash.tests b/testsuite/ash.tests new file mode 100755 index 000000000..4b6efe42c --- /dev/null +++ b/testsuite/ash.tests
@@ -0,0 +1,42 @@
		1	#!/bin/sh
		2	#
		3	# These are not ash tests, we use ash as a way to test lineedit!
		4	#
		5	# Copyright 2010 by Denys Vlasenko
		6	# Licensed under GPL v2, see file LICENSE for details.
		7
		8	. ./testing.sh
		9
		10	# testing "test name" "options" "expected result" "file input" "stdin"
		11
		12	testing "One byte which is not valid unicode char followed by valid input" \
		13	"script -q -c 'ash' /dev/null >/dev/null; cat output; rm output" \
		14	"\
		15	00000000  3f 2d 0a                                          |?-.|
		16	00000003
		17	" \
		18	"" \
		19	"echo \xff- | hexdump -C >output; exit; exit; exit; exit\n" \
		20
		21	testing "30 bytes which are not valid unicode chars followed by valid input" \
		22	"script -q -c 'ash' /dev/null >/dev/null; cat output; rm output" \
		23	"\
		24	00000000  3f 3f 3f 3f 3f 3f 3f 3f  3f 3f 3f 3f 3f 3f 3f 3f  |????????????????|
		25	00000010  3f 3f 3f 3f 3f 3f 3f 3f  3f 3f 3f 3f 3f 3f 2d 0a  |??????????????-.|
		26	00000020
		27	" \
		28	"" \
		29	"echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >output; exit; exit; exit; exit\n" \
		30
		31	# Not sure this behavior is perfect: we lose all invalid input which precedes
		32	# arrow keys and such. In this example, \xff\xff are lost
		33	testing "2 bytes which are not valid unicode chars followed by left arrow key" \
		34	"script -q -c 'ash' /dev/null >/dev/null; cat output; rm output" \
		35	"\
		36	00000000  3d 2d 0a                                          |=-.|
		37	00000003
		38	" \
		39	"" \
		40	"echo =+\xff\xff\x1b\x5b\x44- | hexdump -C >output; exit; exit; exit; exit\n" \
		41
		42	exit $FAILCOUNT