about summary refs log tree commit diff
diff options
context:
space:
mode:
author Tomas Heinrich <heinrich.tomas@gmail.com> 2010-04-29 13:43:39 +0200
committer Denys Vlasenko <vda.linux@googlemail.com> 2010-04-29 13:43:39 +0200
commit a659b81dfa435aa19130a8c7dd1bfe8fa9a22131 (patch)
tree 6e42922ad325142290898143818bcd819f799c27
parent 25b10d97e66a74d4e5a5571afb1b8369c31eefca (diff)
download busybox-w32-a659b81dfa435aa19130a8c7dd1bfe8fa9a22131.tar.gz
busybox-w32-a659b81dfa435aa19130a8c7dd1bfe8fa9a22131.tar.bz2
busybox-w32-a659b81dfa435aa19130a8c7dd1bfe8fa9a22131.zip
libbb/lineedit: add support for preserving "broken" (non-unicode) chars
Signed-off-by: Tomas Heinrich <heinrich.tomas@gmail.com>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- Config.in 11
-rw-r--r-- libbb/lineedit.c 62
-rw-r--r-- libbb/unicode.c 12
-rwxr-xr-x testsuite/ash.tests 24
4 files changed, 89 insertions, 20 deletions
diff --git a/Config.in b/Config.in
index 40af9115d..a5d20038a 100644
--- a/Config.in
+++ b/Config.in
@@ -223,6 +223,17 @@ config UNICODE_NEUTRAL_TABLE
223 With this option on, more extensive (and bigger) table 223 With this option on, more extensive (and bigger) table
224 of neutral chars will be used. 224 of neutral chars will be used.
225 225
226config UNICODE_PRESERVE_BROKEN
227 bool "Make it possible to enter sequences of chars which are not Unicode"
228 default n
229 depends on UNICODE_SUPPORT
230 help
231 With this option on, invalid UTF-8 bytes are not substituted
232 with the selected substitution character.
233 For example, this means that entering 'l', 's', ' ', 0xff, [Enter]
234 at shell prompt will list file named 0xff (single char name
235 with char value 255), not file named '?'.
236
226config LONG_OPTS 237config LONG_OPTS
227 bool "Support for --long-options" 238 bool "Support for --long-options"
228 default y 239 default y
diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index dc90846f9..622f9ddfc 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -68,7 +68,7 @@
68 68
69#undef CHAR_T 69#undef CHAR_T
70#if ENABLE_UNICODE_SUPPORT 70#if ENABLE_UNICODE_SUPPORT
71# define BB_NUL L'\0' 71# define BB_NUL ((wchar_t)0)
72# define CHAR_T wchar_t 72# define CHAR_T wchar_t
73static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); } 73static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); }
74# if ENABLE_FEATURE_EDITING_VI 74# if ENABLE_FEATURE_EDITING_VI
@@ -92,6 +92,14 @@ static bool BB_ispunct(CHAR_T c) { return ((unsigned)c < 256 && ispunct(c)); }
92#endif 92#endif
93 93
94 94
95# if ENABLE_UNICODE_PRESERVE_BROKEN
96# define unicode_mark_inv_wchar(wc) ((wc) | 0x20000000)
97# define unicode_is_inv_wchar(wc) ((wc) & 0x20000000)
98# else
99# define unicode_is_inv_wchar(wc) 0
100# endif
101
102
95enum { 103enum {
96 /* We use int16_t for positions, need to limit line len */ 104 /* We use int16_t for positions, need to limit line len */
97 MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0 105 MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0
@@ -208,24 +216,58 @@ static size_t load_string(const char *src, int maxsize)
208 ssize_t len = mbstowcs(command_ps, src, maxsize - 1); 216 ssize_t len = mbstowcs(command_ps, src, maxsize - 1);
209 if (len < 0) 217 if (len < 0)
210 len = 0; 218 len = 0;
211 command_ps[len] = L'\0'; 219 command_ps[len] = 0;
212 return len; 220 return len;
213} 221}
214static size_t save_string(char *dst, int maxsize) 222static unsigned save_string(char *dst, unsigned maxsize)
215{ 223{
224#if !ENABLE_UNICODE_PRESERVE_BROKEN
216 ssize_t len = wcstombs(dst, command_ps, maxsize - 1); 225 ssize_t len = wcstombs(dst, command_ps, maxsize - 1);
217 if (len < 0) 226 if (len < 0)
218 len = 0; 227 len = 0;
219 dst[len] = '\0'; 228 dst[len] = '\0';
220 return len; 229 return len;
230#else
231 unsigned dstpos = 0;
232 unsigned srcpos = 0;
233
234 maxsize--;
235 while (dstpos < maxsize) {
236 wchar_t wc;
237 int n = srcpos;
238 while ((wc = command_ps[srcpos]) != 0
239 && !unicode_is_inv_wchar(wc)
240 ) {
241 srcpos++;
242 }
243 command_ps[srcpos] = 0;
244 n = wcstombs(dst + dstpos, command_ps + n, maxsize - dstpos);
245 if (n < 0) /* should not happen */
246 break;
247 dstpos += n;
248 if (wc == 0) /* usually is */
249 break;
250 /* We do have invalid byte here! */
251 command_ps[srcpos] = wc; /* restore it */
252 srcpos++;
253 if (dstpos == maxsize)
254 break;
255 dst[dstpos++] = (char) wc;
256 }
257 dst[dstpos] = '\0';
258 return dstpos;
259#endif
221} 260}
222/* I thought just fputwc(c, stdout) would work. But no... */ 261/* I thought just fputwc(c, stdout) would work. But no... */
223static void BB_PUTCHAR(wchar_t c) 262static void BB_PUTCHAR(wchar_t c)
224{ 263{
225 char buf[MB_CUR_MAX + 1]; 264 char buf[MB_CUR_MAX + 1];
226 mbstate_t mbst = { 0 }; 265 mbstate_t mbst = { 0 };
227 ssize_t len = wcrtomb(buf, c, &mbst); 266 ssize_t len;
228 267
268 if (unicode_is_inv_wchar(c))
269 c = CONFIG_SUBST_WCHAR;
270 len = wcrtomb(buf, c, &mbst);
229 if (len > 0) { 271 if (len > 0) {
230 buf[len] = '\0'; 272 buf[len] = '\0';
231 fputs(buf, stdout); 273 fputs(buf, stdout);
@@ -238,7 +280,7 @@ static size_t load_string(const char *src, int maxsize)
238 return strlen(command_ps); 280 return strlen(command_ps);
239} 281}
240# if ENABLE_FEATURE_TAB_COMPLETION 282# if ENABLE_FEATURE_TAB_COMPLETION
241static void save_string(char *dst, int maxsize) 283static void save_string(char *dst, unsigned maxsize)
242{ 284{
243 safe_strncpy(dst, command_ps, maxsize); 285 safe_strncpy(dst, command_ps, maxsize);
244} 286}
@@ -1719,13 +1761,11 @@ static int lineedit_read_key(char *read_key_buffer)
1719 pushback: 1761 pushback:
1720 /* Invalid sequence. Save all "bad bytes" except first */ 1762 /* Invalid sequence. Save all "bad bytes" except first */
1721 read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1); 1763 read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1);
1722 /* 1764# if !ENABLE_UNICODE_PRESERVE_BROKEN
1723 * ic = unicode_buf[0] sounds even better, but currently
1724 * this does not work: wchar_t[] -> char[] conversion
1725 * when lineedit finishes mangles such "raw bytes"
1726 * (by misinterpreting them as unicode chars):
1727 */
1728 ic = CONFIG_SUBST_WCHAR; 1765 ic = CONFIG_SUBST_WCHAR;
1766# else
1767 ic = unicode_mark_inv_wchar(unicode_buf[0]);
1768# endif
1729 } else { 1769 } else {
1730 /* Valid unicode char, return its code */ 1770 /* Valid unicode char, return its code */
1731 ic = wc; 1771 ic = wc;
diff --git a/libbb/unicode.c b/libbb/unicode.c
index 83e70b412..d1c6167c7 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -423,7 +423,6 @@ static int wcwidth(unsigned ucs)
423# if LAST_SUPPORTED_WCHAR >= 0x300 423# if LAST_SUPPORTED_WCHAR >= 0x300
424 /* sorted list of non-overlapping intervals of non-spacing characters */ 424 /* sorted list of non-overlapping intervals of non-spacing characters */
425 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ 425 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
426 static const struct interval combining[] = {
427# define BIG_(a,b) { a, b }, 426# define BIG_(a,b) { a, b },
428# define PAIR(a,b) 427# define PAIR(a,b)
429# define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \ 428# define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \
@@ -557,10 +556,9 @@ static int wcwidth(unsigned ucs)
557 BIG_(0xFE20, 0xFE23) \ 556 BIG_(0xFE20, 0xFE23) \
558 BIG_(0xFEFF, 0xFEFF) \ 557 BIG_(0xFEFF, 0xFEFF) \
559 BIG_(0xFFF9, 0xFFFB) 558 BIG_(0xFFF9, 0xFFFB)
560 ARRAY 559 static const struct interval combining[] = { ARRAY };
561# undef BIG_ 560# undef BIG_
562# undef PAIR 561# undef PAIR
563 };
564# define BIG_(a,b) 562# define BIG_(a,b)
565# define PAIR(a,b) (a << 2) | (b-a), 563# define PAIR(a,b) (a << 2) | (b-a),
566 static const uint16_t combining1[] = { ARRAY }; 564 static const uint16_t combining1[] = { ARRAY };
@@ -668,7 +666,6 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
668 * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt 666 * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt
669 * Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter 667 * Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter
670 */ 668 */
671 static const struct interval rtl_b[] = {
672# define BIG_(a,b) { a, b }, 669# define BIG_(a,b) { a, b },
673# define PAIR(a,b) 670# define PAIR(a,b)
674# define ARRAY \ 671# define ARRAY \
@@ -723,10 +720,9 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
723 {0x10E7F, 0x10FFF}, 720 {0x10E7F, 0x10FFF},
724 {0x1E800, 0x1EFFF} 721 {0x1E800, 0x1EFFF}
725 */ 722 */
726 ARRAY 723 static const struct interval rtl_b[] = { ARRAY };
727# undef BIG_ 724# undef BIG_
728# undef PAIR 725# undef PAIR
729 };
730# define BIG_(a,b) 726# define BIG_(a,b)
731# define PAIR(a,b) (a << 2) | (b-a), 727# define PAIR(a,b) (a << 2) | (b-a),
732 static const uint16_t rtl_p[] = { ARRAY }; 728 static const uint16_t rtl_p[] = { ARRAY };
@@ -755,7 +751,6 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)
755 * White_Space, Other_Neutral, European_Number, European_Separator, 751 * White_Space, Other_Neutral, European_Number, European_Separator,
756 * European_Terminator, Arabic_Number, Common_Separator 752 * European_Terminator, Arabic_Number, Common_Separator
757 */ 753 */
758 static const struct interval neutral_b[] = {
759# define BIG_(a,b) { a, b }, 754# define BIG_(a,b) { a, b },
760# define PAIR(a,b) 755# define PAIR(a,b)
761# define ARRAY \ 756# define ARRAY \
@@ -929,10 +924,9 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)
929 {0x1F030, 0x1F093}, 924 {0x1F030, 0x1F093},
930 {0x1F100, 0x1F10A} 925 {0x1F100, 0x1F10A}
931 */ 926 */
932 ARRAY 927 static const struct interval neutral_b[] = { ARRAY };
933# undef BIG_ 928# undef BIG_
934# undef PAIR 929# undef PAIR
935 };
936# define BIG_(a,b) 930# define BIG_(a,b)
937# define PAIR(a,b) (a << 2) | (b-a), 931# define PAIR(a,b) (a << 2) | (b-a),
938 static const uint16_t neutral_p[] = { ARRAY }; 932 static const uint16_t neutral_p[] = { ARRAY };
diff --git a/testsuite/ash.tests b/testsuite/ash.tests
index 6b2caf316..ce585beb1 100755
--- a/testsuite/ash.tests
+++ b/testsuite/ash.tests
@@ -7,8 +7,30 @@
7 7
8. ./testing.sh 8. ./testing.sh
9 9
10test -f "$bindir/.config" && . "$bindir/.config"
11
10# testing "test name" "options" "expected result" "file input" "stdin" 12# testing "test name" "options" "expected result" "file input" "stdin"
11 13
14if test x"$CONFIG_UNICODE_PRESERVE_BROKEN" = x"y"; then
15testing "One byte which is not valid unicode char followed by valid input" \
16 "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
17 "\
1800000000 ff 2d 0a |.-.|
1900000003
20" \
21 "" \
22 "echo \xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
23
24testing "30 bytes which are not valid unicode chars followed by valid input" \
25 "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
26 "\
2700000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
2800000010 ff ff ff ff ff ff ff ff ff ff ff ff ff ff 2d 0a |..............-.|
2900000020
30" \
31 "" \
32 "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
33else
12testing "One byte which is not valid unicode char followed by valid input" \ 34testing "One byte which is not valid unicode char followed by valid input" \
13 "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ 35 "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
14 "\ 36 "\
@@ -27,6 +49,8 @@ testing "30 bytes which are not valid unicode chars followed by valid input" \
27" \ 49" \
28 "" \ 50 "" \
29 "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" 51 "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
52fi
53
30 54
31# Not sure this behavior is perfect: we lose all invalid input which precedes 55# Not sure this behavior is perfect: we lose all invalid input which precedes
32# arrow keys and such. In this example, \xff\xff are lost 56# arrow keys and such. In this example, \xff\xff are lost