From 0efc74740ebc0d98af79ba4a5dfa73bfb5db3df0 Mon Sep 17 00:00:00 2001 From: "Avi Halachmi (:avih)" Date: Tue, 27 Jun 2023 14:41:47 +0300 Subject: win32: support build with FEATURE_UNICODE_SUPPORT FEATURE_UTF8_MANIFEST enables Unicode args and filenames on Win 10+. FEATURE_UTF8_INPUT allows the shell prompt to correctly digest Unicode strings (as UTF8) which are typed or pasted. This commit adds support for building with FEATURE_UNICODE_SUPPORT (mostly by supporting 32 bit wchar_t which busybox expects): - Unicode-aware line-edit - for the most part cursor movement/del being (UTF8) codepoint-aware rather than assuming that one-byte equals one-char-on-screen. - Codepoint-aware operations in some other utils, like rev or wc -c. - When UNICODE_COMBINING_WCHARS and UNICODE_WIDE_WCHARS are enabled, some screen-width-aware operations, like with fold, ls, expand, etc. The busybox Unicode support is incomplete, and even less so with the builtin libc replacement functions, like wcwidth, which are active when UNICODE_USING_LOCALE is unset (mingw lacks those functions). FEATURE_CHECK_UNICODE_IN_ENV should be set so that Unicode is not hardcoded but rather depends on the ANSI codepage and some env vars: LC_ALL=C disables Unicode support, else it's enabled if ACP is UTF8. There's at least one known issue where the tab-completion-prefix-case is not updated correctly, e.g. ~/desk completes to ~/desktop/ instead of ~/Desktop/, because the code which handles it exists only in the non-unicode code paths, but that's not very critical. That seems to be the only case where mingw-specific code is disabled when Unicode is enabled, but there could be other unknown issues. None of the Unicode options is enabled by default, and the next commit will make it easier to create a build which supports Unicode. 
--- include/mingw.h | 12 ++++++++++++ include/unicode.h | 15 +++++++++++++++ libbb/lineedit.c | 28 +++++++++++++++++++++++++++- libbb/unicode.c | 6 ++++++ win32/mingw.c | 14 ++++++++++++++ 5 files changed, 74 insertions(+), 1 deletion(-) diff --git a/include/mingw.h b/include/mingw.h index 232ffadd7..97db2f6a9 100644 --- a/include/mingw.h +++ b/include/mingw.h @@ -586,6 +586,18 @@ char *alloc_ext_space(const char *path); int add_win32_extension(char *p); char *file_is_win32_exe(const char *name); +#if ENABLE_UNICODE_SUPPORT +/* + * windows wchar_t is 16 bit, while linux (and busybox expectation) is 32. + * so when (busybox) unicode.h is included, wchar_t is 32 bit. + * Without unicode.h, MINGW_BB_WCHAR_T is busybox wide char (32), + * and wchar_t is Windows wide char (16). + */ +#define MINGW_BB_WCHAR_T uint32_t /* keep in sync with unicode.h */ + +MINGW_BB_WCHAR_T *bs_to_slash_u(MINGW_BB_WCHAR_T *p) FAST_FUNC; +#endif + char *bs_to_slash(char *p) FAST_FUNC; void slash_to_bs(char *p) FAST_FUNC; size_t remove_cr(char *p, size_t len) FAST_FUNC; diff --git a/include/unicode.h b/include/unicode.h index 0317a2151..e894f7148 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -87,6 +87,21 @@ void reinit_unicode(const char *LANG) FAST_FUNC; # undef MB_CUR_MAX # define MB_CUR_MAX 6 +#if ENABLE_PLATFORM_MINGW32 + #undef wint_t + #undef mbstate_t + #undef mbstowcs + #undef wcstombs + #undef wcrtomb + #undef iswspace + #undef iswalnum + #undef iswpunct + #undef wcwidth + + #undef wchar_t + #define wchar_t uint32_t /* keep in sync with MINGW_BB_WCHAR_T */ +#endif + /* Prevent name collisions */ # define wint_t bb_wint_t # define mbstate_t bb_mbstate_t diff --git a/libbb/lineedit.c b/libbb/lineedit.c index a6884c7e0..1fb8919bb 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c @@ -726,8 +726,19 @@ static void input_forward(void) #if !ENABLE_PLATFORM_MINGW32 put_cur_glyph_and_inc_cursor(); #else + /* + * inc_cursor improves forward cursor movement appearance on + * win 
7/8 console, but it's broken with unicode wide-glyphs, + * e.g. paste and move forward over: echo 开开心心过每一天 + * so disable inc_cursor when unicode is active (which is only + * windows 10+, where inc_cursor is not needed anyway). + * + * FIXME: the VT_INPUT condition is not required, because other + * than the wide-glyphs issue, inc_cursor works correctly + * regardless of the VT mode. + */ { - if (terminal_mode(FALSE) & VT_INPUT) + if (terminal_mode(FALSE) & VT_INPUT || unicode_status == UNICODE_ON) put_cur_glyph_and_inc_cursor(); else inc_cursor(); @@ -770,6 +781,11 @@ static void add_match(char *matched, int sensitive) || (!ENABLE_UNICODE_SUPPORT && *p >= 0x7f) || (ENABLE_UNICODE_SUPPORT && *p == 0x7f) # else + /* + * on Windows, *p > 0x7f is never control: + * without unicode active: these are normal codepage chars. + * with unicode active: these are UTF8 continuation bytes. + */ || *p == 0x7f # endif ) { @@ -1318,6 +1334,12 @@ static NOINLINE void input_tab(smallint *lastWasTab) # if ENABLE_PLATFORM_MINGW32 int chosen_index = 0; int chosen_sens = FALSE; + /* + * FIXME: the next three vars are unused with ENABLE_UNICODE_SUPPORT + * because the mingw code which uses them to update a tab-completion + * prefix to the correct case (e.g. ~/desk to ~/Desktop/) is + * not compiled, and so e.g. ~/desk completes to ~/desktop/ . 
+ */ unsigned orig_pfx_len; char *target; const char *source; @@ -2803,7 +2825,11 @@ int FAST_FUNC read_line_input(line_input_t *st, const char *prompt, char *comman #if ENABLE_PLATFORM_MINGW32 case CTRL('Z'): command_ps[command_len] = '\0'; + #if ENABLE_UNICODE_SUPPORT + bs_to_slash_u(command_ps); + #else bs_to_slash(command_ps); + #endif redraw(cmdedit_y, 0); break; #endif diff --git a/libbb/unicode.c b/libbb/unicode.c index e98cbbf35..638c3b7c3 100644 --- a/libbb/unicode.c +++ b/libbb/unicode.c @@ -69,8 +69,14 @@ void FAST_FUNC init_unicode(void) void FAST_FUNC reinit_unicode(const char *LANG) { unicode_status = UNICODE_OFF; +#if ENABLE_PLATFORM_MINGW32 + /* enable unicode only when ACP is UTF8 and the env var is not 'C' */ + if (GetACP() != CP_UTF8 || (LANG && LANG[0] == 'C' && LANG[1] == 0)) + return; +#else if (!LANG || !(strstr(LANG, ".utf") || strstr(LANG, ".UTF"))) return; +#endif unicode_status = UNICODE_ON; } diff --git a/win32/mingw.c b/win32/mingw.c index 5e9c71226..dabb2a2e7 100644 --- a/win32/mingw.c +++ b/win32/mingw.c @@ -2119,6 +2119,20 @@ char * FAST_FUNC bs_to_slash(char *str) return str; } +#if ENABLE_UNICODE_SUPPORT +MINGW_BB_WCHAR_T * FAST_FUNC bs_to_slash_u(MINGW_BB_WCHAR_T *str) +{ + MINGW_BB_WCHAR_T *p; + + for (p=str; *p; ++p) { + if ( *p == '\\' ) { + *p = '/'; + } + } + return str; +} +#endif + void FAST_FUNC slash_to_bs(char *p) { for (; *p; ++p) { -- cgit v1.2.3-55-g6feb