diff options
author | Tomas Heinrich <heinrich.tomas@gmail.com> | 2010-04-29 13:43:39 +0200 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2010-04-29 13:43:39 +0200 |
commit | a659b81dfa435aa19130a8c7dd1bfe8fa9a22131 (patch) | |
tree | 6e42922ad325142290898143818bcd819f799c27 | |
parent | 25b10d97e66a74d4e5a5571afb1b8369c31eefca (diff) | |
download | busybox-w32-a659b81dfa435aa19130a8c7dd1bfe8fa9a22131.tar.gz busybox-w32-a659b81dfa435aa19130a8c7dd1bfe8fa9a22131.tar.bz2 busybox-w32-a659b81dfa435aa19130a8c7dd1bfe8fa9a22131.zip |
libbb/lineedit: add support for preserving "broken" (non-unicode) chars
Signed-off-by: Tomas Heinrich <heinrich.tomas@gmail.com>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | Config.in | 11 | ||||
-rw-r--r-- | libbb/lineedit.c | 62 | ||||
-rw-r--r-- | libbb/unicode.c | 12 | ||||
-rwxr-xr-x | testsuite/ash.tests | 24 |
4 files changed, 89 insertions, 20 deletions
@@ -223,6 +223,17 @@ config UNICODE_NEUTRAL_TABLE | |||
223 | With this option on, more extensive (and bigger) table | 223 | With this option on, more extensive (and bigger) table |
224 | of neutral chars will be used. | 224 | of neutral chars will be used. |
225 | 225 | ||
226 | config UNICODE_PRESERVE_BROKEN | ||
227 | bool "Make it possible to enter sequences of chars which are not Unicode" | ||
228 | default n | ||
229 | depends on UNICODE_SUPPORT | ||
230 | help | ||
231 | With this option on, invalid UTF-8 bytes are not substituted | ||
232 | with the selected substitution character. | ||
233 | For example, this means that entering 'l', 's', ' ', 0xff, [Enter] | ||
234 | at shell prompt will list file named 0xff (single char name | ||
235 | with char value 255), not file named '?'. | ||
236 | |||
226 | config LONG_OPTS | 237 | config LONG_OPTS |
227 | bool "Support for --long-options" | 238 | bool "Support for --long-options" |
228 | default y | 239 | default y |
diff --git a/libbb/lineedit.c b/libbb/lineedit.c index dc90846f9..622f9ddfc 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c | |||
@@ -68,7 +68,7 @@ | |||
68 | 68 | ||
69 | #undef CHAR_T | 69 | #undef CHAR_T |
70 | #if ENABLE_UNICODE_SUPPORT | 70 | #if ENABLE_UNICODE_SUPPORT |
71 | # define BB_NUL L'\0' | 71 | # define BB_NUL ((wchar_t)0) |
72 | # define CHAR_T wchar_t | 72 | # define CHAR_T wchar_t |
73 | static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); } | 73 | static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); } |
74 | # if ENABLE_FEATURE_EDITING_VI | 74 | # if ENABLE_FEATURE_EDITING_VI |
@@ -92,6 +92,14 @@ static bool BB_ispunct(CHAR_T c) { return ((unsigned)c < 256 && ispunct(c)); } | |||
92 | #endif | 92 | #endif |
93 | 93 | ||
94 | 94 | ||
95 | # if ENABLE_UNICODE_PRESERVE_BROKEN | ||
96 | # define unicode_mark_inv_wchar(wc) ((wc) | 0x20000000) | ||
97 | # define unicode_is_inv_wchar(wc) ((wc) & 0x20000000) | ||
98 | # else | ||
99 | # define unicode_is_inv_wchar(wc) 0 | ||
100 | # endif | ||
101 | |||
102 | |||
95 | enum { | 103 | enum { |
96 | /* We use int16_t for positions, need to limit line len */ | 104 | /* We use int16_t for positions, need to limit line len */ |
97 | MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0 | 105 | MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0 |
@@ -208,24 +216,58 @@ static size_t load_string(const char *src, int maxsize) | |||
208 | ssize_t len = mbstowcs(command_ps, src, maxsize - 1); | 216 | ssize_t len = mbstowcs(command_ps, src, maxsize - 1); |
209 | if (len < 0) | 217 | if (len < 0) |
210 | len = 0; | 218 | len = 0; |
211 | command_ps[len] = L'\0'; | 219 | command_ps[len] = 0; |
212 | return len; | 220 | return len; |
213 | } | 221 | } |
214 | static size_t save_string(char *dst, int maxsize) | 222 | static unsigned save_string(char *dst, unsigned maxsize) |
215 | { | 223 | { |
224 | #if !ENABLE_UNICODE_PRESERVE_BROKEN | ||
216 | ssize_t len = wcstombs(dst, command_ps, maxsize - 1); | 225 | ssize_t len = wcstombs(dst, command_ps, maxsize - 1); |
217 | if (len < 0) | 226 | if (len < 0) |
218 | len = 0; | 227 | len = 0; |
219 | dst[len] = '\0'; | 228 | dst[len] = '\0'; |
220 | return len; | 229 | return len; |
230 | #else | ||
231 | unsigned dstpos = 0; | ||
232 | unsigned srcpos = 0; | ||
233 | |||
234 | maxsize--; | ||
235 | while (dstpos < maxsize) { | ||
236 | wchar_t wc; | ||
237 | int n = srcpos; | ||
238 | while ((wc = command_ps[srcpos]) != 0 | ||
239 | && !unicode_is_inv_wchar(wc) | ||
240 | ) { | ||
241 | srcpos++; | ||
242 | } | ||
243 | command_ps[srcpos] = 0; | ||
244 | n = wcstombs(dst + dstpos, command_ps + n, maxsize - dstpos); | ||
245 | if (n < 0) /* should not happen */ | ||
246 | break; | ||
247 | dstpos += n; | ||
248 | if (wc == 0) /* usually is */ | ||
249 | break; | ||
250 | /* We do have invalid byte here! */ | ||
251 | command_ps[srcpos] = wc; /* restore it */ | ||
252 | srcpos++; | ||
253 | if (dstpos == maxsize) | ||
254 | break; | ||
255 | dst[dstpos++] = (char) wc; | ||
256 | } | ||
257 | dst[dstpos] = '\0'; | ||
258 | return dstpos; | ||
259 | #endif | ||
221 | } | 260 | } |
222 | /* I thought just fputwc(c, stdout) would work. But no... */ | 261 | /* I thought just fputwc(c, stdout) would work. But no... */ |
223 | static void BB_PUTCHAR(wchar_t c) | 262 | static void BB_PUTCHAR(wchar_t c) |
224 | { | 263 | { |
225 | char buf[MB_CUR_MAX + 1]; | 264 | char buf[MB_CUR_MAX + 1]; |
226 | mbstate_t mbst = { 0 }; | 265 | mbstate_t mbst = { 0 }; |
227 | ssize_t len = wcrtomb(buf, c, &mbst); | 266 | ssize_t len; |
228 | 267 | ||
268 | if (unicode_is_inv_wchar(c)) | ||
269 | c = CONFIG_SUBST_WCHAR; | ||
270 | len = wcrtomb(buf, c, &mbst); | ||
229 | if (len > 0) { | 271 | if (len > 0) { |
230 | buf[len] = '\0'; | 272 | buf[len] = '\0'; |
231 | fputs(buf, stdout); | 273 | fputs(buf, stdout); |
@@ -238,7 +280,7 @@ static size_t load_string(const char *src, int maxsize) | |||
238 | return strlen(command_ps); | 280 | return strlen(command_ps); |
239 | } | 281 | } |
240 | # if ENABLE_FEATURE_TAB_COMPLETION | 282 | # if ENABLE_FEATURE_TAB_COMPLETION |
241 | static void save_string(char *dst, int maxsize) | 283 | static void save_string(char *dst, unsigned maxsize) |
242 | { | 284 | { |
243 | safe_strncpy(dst, command_ps, maxsize); | 285 | safe_strncpy(dst, command_ps, maxsize); |
244 | } | 286 | } |
@@ -1719,13 +1761,11 @@ static int lineedit_read_key(char *read_key_buffer) | |||
1719 | pushback: | 1761 | pushback: |
1720 | /* Invalid sequence. Save all "bad bytes" except first */ | 1762 | /* Invalid sequence. Save all "bad bytes" except first */ |
1721 | read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1); | 1763 | read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1); |
1722 | /* | 1764 | # if !ENABLE_UNICODE_PRESERVE_BROKEN |
1723 | * ic = unicode_buf[0] sounds even better, but currently | ||
1724 | * this does not work: wchar_t[] -> char[] conversion | ||
1725 | * when lineedit finishes mangles such "raw bytes" | ||
1726 | * (by misinterpreting them as unicode chars): | ||
1727 | */ | ||
1728 | ic = CONFIG_SUBST_WCHAR; | 1765 | ic = CONFIG_SUBST_WCHAR; |
1766 | # else | ||
1767 | ic = unicode_mark_inv_wchar(unicode_buf[0]); | ||
1768 | # endif | ||
1729 | } else { | 1769 | } else { |
1730 | /* Valid unicode char, return its code */ | 1770 | /* Valid unicode char, return its code */ |
1731 | ic = wc; | 1771 | ic = wc; |
diff --git a/libbb/unicode.c b/libbb/unicode.c index 83e70b412..d1c6167c7 100644 --- a/libbb/unicode.c +++ b/libbb/unicode.c | |||
@@ -423,7 +423,6 @@ static int wcwidth(unsigned ucs) | |||
423 | # if LAST_SUPPORTED_WCHAR >= 0x300 | 423 | # if LAST_SUPPORTED_WCHAR >= 0x300 |
424 | /* sorted list of non-overlapping intervals of non-spacing characters */ | 424 | /* sorted list of non-overlapping intervals of non-spacing characters */ |
425 | /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ | 425 | /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ |
426 | static const struct interval combining[] = { | ||
427 | # define BIG_(a,b) { a, b }, | 426 | # define BIG_(a,b) { a, b }, |
428 | # define PAIR(a,b) | 427 | # define PAIR(a,b) |
429 | # define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \ | 428 | # define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \ |
@@ -557,10 +556,9 @@ static int wcwidth(unsigned ucs) | |||
557 | BIG_(0xFE20, 0xFE23) \ | 556 | BIG_(0xFE20, 0xFE23) \ |
558 | BIG_(0xFEFF, 0xFEFF) \ | 557 | BIG_(0xFEFF, 0xFEFF) \ |
559 | BIG_(0xFFF9, 0xFFFB) | 558 | BIG_(0xFFF9, 0xFFFB) |
560 | ARRAY | 559 | static const struct interval combining[] = { ARRAY }; |
561 | # undef BIG_ | 560 | # undef BIG_ |
562 | # undef PAIR | 561 | # undef PAIR |
563 | }; | ||
564 | # define BIG_(a,b) | 562 | # define BIG_(a,b) |
565 | # define PAIR(a,b) (a << 2) | (b-a), | 563 | # define PAIR(a,b) (a << 2) | (b-a), |
566 | static const uint16_t combining1[] = { ARRAY }; | 564 | static const uint16_t combining1[] = { ARRAY }; |
@@ -668,7 +666,6 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc) | |||
668 | * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt | 666 | * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt |
669 | * Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter | 667 | * Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter |
670 | */ | 668 | */ |
671 | static const struct interval rtl_b[] = { | ||
672 | # define BIG_(a,b) { a, b }, | 669 | # define BIG_(a,b) { a, b }, |
673 | # define PAIR(a,b) | 670 | # define PAIR(a,b) |
674 | # define ARRAY \ | 671 | # define ARRAY \ |
@@ -723,10 +720,9 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc) | |||
723 | {0x10E7F, 0x10FFF}, | 720 | {0x10E7F, 0x10FFF}, |
724 | {0x1E800, 0x1EFFF} | 721 | {0x1E800, 0x1EFFF} |
725 | */ | 722 | */ |
726 | ARRAY | 723 | static const struct interval rtl_b[] = { ARRAY }; |
727 | # undef BIG_ | 724 | # undef BIG_ |
728 | # undef PAIR | 725 | # undef PAIR |
729 | }; | ||
730 | # define BIG_(a,b) | 726 | # define BIG_(a,b) |
731 | # define PAIR(a,b) (a << 2) | (b-a), | 727 | # define PAIR(a,b) (a << 2) | (b-a), |
732 | static const uint16_t rtl_p[] = { ARRAY }; | 728 | static const uint16_t rtl_p[] = { ARRAY }; |
@@ -755,7 +751,6 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc) | |||
755 | * White_Space, Other_Neutral, European_Number, European_Separator, | 751 | * White_Space, Other_Neutral, European_Number, European_Separator, |
756 | * European_Terminator, Arabic_Number, Common_Separator | 752 | * European_Terminator, Arabic_Number, Common_Separator |
757 | */ | 753 | */ |
758 | static const struct interval neutral_b[] = { | ||
759 | # define BIG_(a,b) { a, b }, | 754 | # define BIG_(a,b) { a, b }, |
760 | # define PAIR(a,b) | 755 | # define PAIR(a,b) |
761 | # define ARRAY \ | 756 | # define ARRAY \ |
@@ -929,10 +924,9 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc) | |||
929 | {0x1F030, 0x1F093}, | 924 | {0x1F030, 0x1F093}, |
930 | {0x1F100, 0x1F10A} | 925 | {0x1F100, 0x1F10A} |
931 | */ | 926 | */ |
932 | ARRAY | 927 | static const struct interval neutral_b[] = { ARRAY }; |
933 | # undef BIG_ | 928 | # undef BIG_ |
934 | # undef PAIR | 929 | # undef PAIR |
935 | }; | ||
936 | # define BIG_(a,b) | 930 | # define BIG_(a,b) |
937 | # define PAIR(a,b) (a << 2) | (b-a), | 931 | # define PAIR(a,b) (a << 2) | (b-a), |
938 | static const uint16_t neutral_p[] = { ARRAY }; | 932 | static const uint16_t neutral_p[] = { ARRAY }; |
diff --git a/testsuite/ash.tests b/testsuite/ash.tests index 6b2caf316..ce585beb1 100755 --- a/testsuite/ash.tests +++ b/testsuite/ash.tests | |||
@@ -7,8 +7,30 @@ | |||
7 | 7 | ||
8 | . ./testing.sh | 8 | . ./testing.sh |
9 | 9 | ||
10 | test -f "$bindir/.config" && . "$bindir/.config" | ||
11 | |||
10 | # testing "test name" "options" "expected result" "file input" "stdin" | 12 | # testing "test name" "options" "expected result" "file input" "stdin" |
11 | 13 | ||
14 | if test x"$CONFIG_UNICODE_PRESERVE_BROKEN" = x"y"; then | ||
15 | testing "One byte which is not valid unicode char followed by valid input" \ | ||
16 | "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ | ||
17 | "\ | ||
18 | 00000000 ff 2d 0a |.-.| | ||
19 | 00000003 | ||
20 | " \ | ||
21 | "" \ | ||
22 | "echo \xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" | ||
23 | |||
24 | testing "30 bytes which are not valid unicode chars followed by valid input" \ | ||
25 | "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ | ||
26 | "\ | ||
27 | 00000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................| | ||
28 | 00000010 ff ff ff ff ff ff ff ff ff ff ff ff ff ff 2d 0a |..............-.| | ||
29 | 00000020 | ||
30 | " \ | ||
31 | "" \ | ||
32 | "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" | ||
33 | else | ||
12 | testing "One byte which is not valid unicode char followed by valid input" \ | 34 | testing "One byte which is not valid unicode char followed by valid input" \ |
13 | "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ | 35 | "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ |
14 | "\ | 36 | "\ |
@@ -27,6 +49,8 @@ testing "30 bytes which are not valid unicode chars followed by valid input" \ | |||
27 | " \ | 49 | " \ |
28 | "" \ | 50 | "" \ |
29 | "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" | 51 | "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" |
52 | fi | ||
53 | |||
30 | 54 | ||
31 | # Not sure this behavior is perfect: we lose all invalid input which precedes | 55 | # Not sure this behavior is perfect: we lose all invalid input which precedes |
32 | # arrow keys and such. In this example, \xff\xff are lost | 56 | # arrow keys and such. In this example, \xff\xff are lost |