From a42cc92778545d21429f9267540330991185f1f1 Mon Sep 17 00:00:00 2001 From: "Avi Halachmi (:avih)" Date: Thu, 3 Aug 2023 11:57:01 +0300 Subject: win32: unify 'convert and write to console' (no-op) Use one call to do both charToCon and then write it to the console. Technically, this commit only reduces boilerplate code slightly, but it also makes it easier for future modifications to make changes to this sequence in one place. --- win32/winansi.c | 61 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/win32/winansi.c b/win32/winansi.c index f280177e6..e6528926e 100644 --- a/win32/winansi.c +++ b/win32/winansi.c @@ -10,6 +10,9 @@ static BOOL charToConBuffA(LPSTR s, DWORD len); static BOOL charToConA(LPSTR s); +static int conv_fwriteCon(FILE *stream, char *buf, size_t siz); +static int conv_writeCon(int fd, char *buf, size_t siz); + /* Functions to be wrapped: */ @@ -814,9 +817,7 @@ static int ansi_emulate(const char *s, FILE *stream) size_t len = pos - str; if (len) { - *pos = '\0'; /* NB, '\033' has been overwritten */ - charToConA(str); - if (fputs(str, stream) == EOF) + if (conv_fwriteCon(stream, str, len) == EOF) return EOF; rv += len; } @@ -837,9 +838,9 @@ static int ansi_emulate(const char *s, FILE *stream) return EOF; } else { - rv += strlen(str); - charToConA(str); - return fputs(str, stream) == EOF ? EOF : rv; + size_t len = strlen(str); + rv += len; + return conv_fwriteCon(stream, str, len) == EOF ? EOF : rv; } } return rv; @@ -853,8 +854,7 @@ int winansi_putchar(int c) if (!is_console(STDOUT_FILENO)) return putchar(c); - charToConBuffA(s, 1); - return putchar(t) == EOF ? EOF : (unsigned char)c; + return conv_fwriteCon(stdout, s, 1) == EOF ? EOF : (unsigned char)c; } int winansi_puts(const char *s) @@ -952,8 +952,7 @@ int winansi_fputc(int c, FILE *stream) return ret; } - charToConBuffA(s, 1); - return fputc(t, stream) == EOF ? 
EOF : (unsigned char )c; + return conv_fwriteCon(stream, s, 1) == EOF ? EOF : (unsigned char )c; } #if !defined(__USE_MINGW_ANSI_STDIO) || !__USE_MINGW_ANSI_STDIO @@ -1083,8 +1082,7 @@ static int ansi_emulate_write(int fd, const void *buf, size_t count) len = pos - str; if (len) { - charToConBuffA(str, len); - out_len = write(fd, str, len); + out_len = conv_writeCon(fd, str, len); if (out_len == -1) return -1; rv += out_len; @@ -1100,8 +1098,7 @@ static int ansi_emulate_write(int fd, const void *buf, size_t count) pos = str; } else { len = strlen(str); - charToConA(str); - out_len = write(fd, str, len); + out_len = conv_writeCon(fd, str, len); return (out_len == -1) ? -1 : rv+out_len; } } @@ -1446,9 +1443,39 @@ void console_write(const char *str, int len) { char *buf = xmemdup(str, len); int fd = _open("CONOUT$", _O_WRONLY); - HANDLE fh = (HANDLE)_get_osfhandle(fd); - charToConBuffA(buf, len); - WriteConsole(fh, buf, len, NULL, NULL); + conv_writeCon(fd, buf, len); close(fd); free(buf); } + +// TODO: improvements: +// +// 1. currently conv_[f]writeCon modify buf inplace, which means the caller +// typically has to make a writable copy first just for this. +// Sometimes it allocates a big copy once, and calls us with substrings. +// Instead, we could make a writable copy here - it's not used later anyway. +// To avoid the performance hit of many small allocations, we could use +// a local buffer for short strings, and allocate only if it doesn't fit +// (or maybe just reuse the local buffer with substring iterations). +// +// 2. Instead of converting from ACP to the console out CP - which guarantees +// potential data-loss if they differ, we could convert it to wchar_t and +// write it using WriteConsoleW. This should prevent all output data-loss. +// care should be taken with DBCS codepages (e.g. 936) or other multi-byte +// because then converting on arbitrary substring boundaries can fail. 
+ +// convert buf inplace from ACP to console out CP and write it to stream +// returns EOF on error, 0 on success +static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) +{ + charToConBuffA(buf, siz); + return fwrite(buf, 1, siz, stream) < siz ? EOF : 0; +} + +// similar to above, but using lower level write +// returns -1 on error, actually-written bytes on suceess +static int conv_writeCon(int fd, char *buf, size_t siz) +{ + charToConBuffA(buf, siz); + return write(fd, buf, siz); +} -- cgit v1.2.3-55-g6feb From bf70275f786716546ec474caade0f16ec28da541 Mon Sep 17 00:00:00 2001 From: "Avi Halachmi (:avih)" Date: Thu, 3 Aug 2023 18:23:23 +0300 Subject: win32: add FEATURE_UTF8_OUTPUT (enabled with unicode) Previously, the unicode build required console (out) codepage of UTF8 in order for unicode output to be printed correctly - e.g. at the shell command prompt or the output of `ls` for unicode file names. This is inconvenient, because by default it's not UTF8, and so unless the user invoked 'chcp 65001' - by default unicode output didn't work. This feature (which is now enabled for the unicode build) makes it print unicode output correctly regardless of the console CP, by using a new stream-conversion function from UTF8 chars to wchar_t, and writing those using WriteConsoleW. If the console CP happens to be UTF8 - this conversion is disabled. We could have instead changed the console CP to UTF8, but that's a slippery slope, and some old programs which expect the default CP might get broken, so achieving the same result without touching the console CP is hopefully better. 
--- Config.in | 13 +++++- configs/mingw32_defconfig | 1 + configs/mingw64_defconfig | 1 + configs/mingw64u_defconfig | 1 + scripts/mk_mingw64u_defconfig | 1 + win32/winansi.c | 96 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 112 insertions(+), 1 deletion(-) diff --git a/Config.in b/Config.in index 33f90fa42..548eaf20c 100644 --- a/Config.in +++ b/Config.in @@ -438,7 +438,8 @@ config FEATURE_UTF8_MANIFEST depends on FEATURE_RESOURCES help Include a manifest which sets the process code page to UTF-8. - Users who enable this may also wish to enable FEATURE_UTF8_INPUT. + Users who enable this may also wish to enable FEATURE_UTF8_INPUT + and/or FEATURE_UTF8_OUTPUT. config FEATURE_ICON bool "Include application icon in binary" @@ -483,6 +484,16 @@ config FEATURE_UTF8_INPUT This may be useful in conjunction with the UTF8 manifest which is supported in Window 10 and 11. +config FEATURE_UTF8_OUTPUT + bool "Allow UTF8 console output" + default n + depends on PLATFORM_MINGW32 && FEATURE_UTF8_MANIFEST + help + Print UTF8 output correctly even if the console (output) codepage + is not UTF8. + This may be useful in conjunction with the UTF8 manifest which + is supported in Window 10 and 11. 
+ config TERMINAL_MODE int "Default setting for terminal mode" default 5 diff --git a/configs/mingw32_defconfig b/configs/mingw32_defconfig index 88b974bd1..d3f99f222 100644 --- a/configs/mingw32_defconfig +++ b/configs/mingw32_defconfig @@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y CONFIG_FEATURE_ICON_ALL=y CONFIG_FEATURE_EURO=y # CONFIG_FEATURE_UTF8_INPUT is not set +# CONFIG_FEATURE_UTF8_OUTPUT is not set CONFIG_TERMINAL_MODE=5 CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y CONFIG_FEATURE_EXTRA_FILE_DATA=y diff --git a/configs/mingw64_defconfig b/configs/mingw64_defconfig index d52df9c48..19b66046f 100644 --- a/configs/mingw64_defconfig +++ b/configs/mingw64_defconfig @@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y CONFIG_FEATURE_ICON_ALL=y CONFIG_FEATURE_EURO=y # CONFIG_FEATURE_UTF8_INPUT is not set +# CONFIG_FEATURE_UTF8_OUTPUT is not set CONFIG_TERMINAL_MODE=5 CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y CONFIG_FEATURE_EXTRA_FILE_DATA=y diff --git a/configs/mingw64u_defconfig b/configs/mingw64u_defconfig index 95261b149..876d7162b 100644 --- a/configs/mingw64u_defconfig +++ b/configs/mingw64u_defconfig @@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y CONFIG_FEATURE_ICON_ALL=y CONFIG_FEATURE_EURO=y CONFIG_FEATURE_UTF8_INPUT=y +CONFIG_FEATURE_UTF8_OUTPUT=y CONFIG_TERMINAL_MODE=5 CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y CONFIG_FEATURE_EXTRA_FILE_DATA=y diff --git a/scripts/mk_mingw64u_defconfig b/scripts/mk_mingw64u_defconfig index 3cca78e5b..760c55a00 100755 --- a/scripts/mk_mingw64u_defconfig +++ b/scripts/mk_mingw64u_defconfig @@ -25,6 +25,7 @@ set_build_opts() { set_build_opts \ CONFIG_FEATURE_UTF8_MANIFEST=y \ CONFIG_FEATURE_UTF8_INPUT=y \ + CONFIG_FEATURE_UTF8_OUTPUT=y \ CONFIG_UNICODE_SUPPORT=y \ CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \ CONFIG_SUBST_WCHAR=63 \ diff --git a/win32/winansi.c b/win32/winansi.c index e6528926e..aaaa2fa50 100644 --- a/win32/winansi.c +++ b/win32/winansi.c @@ -1439,6 +1439,92 @@ BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got) } 
#endif +#if ENABLE_FEATURE_UTF8_OUTPUT +// Write u8buf as if the console output CP is UTF8 - regardless of the CP. +// fd should be associated with a console output. +// Return: 0 on successful write[s], else -1 (e.g. if fd is not a console). +// +// Up to 3 bytes of an incomplete codepoint may be buffered from prior call[s]. +// All the completed codepoints in one call are written using WriteConsoleW. +// Bad sequence of any length (till ASCII7 or UTF8 lead) prints 1 subst wchar. +// +// note: one console is assumed, and the (3 bytes) buffer is shared regardless +// of the original output stream (stdout/err), or even if the handle is +// of a different console. This can result in invalid codepoints output +// if streams are multiplexed mid-codepoint (same as elsewhere?) +static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz) +{ + static int state = 0; // -1: bad, 0-3: remaining cp bytes (0: done/new) + static uint32_t codepoint = 0; // accumulated from up to 4 UTF8 bytes + + HANDLE h = (HANDLE)_get_osfhandle(fd); + wchar_t wbuf[256]; + int wlen = 0; + + // ASCII7 uses least logic, then UTF8 continuations, UTF8 lead, errors + while (u8siz--) { + unsigned char c = *u8buf++; + int topbits = 0; + + while (c & (0x80 >> topbits)) + ++topbits; + + process_byte: + if (state == 0 && topbits == 0) { + // valid ASCII7, state remains 0 + codepoint = c; + + } else if (state > 0 && topbits == 1) { + // valid continuation byte + codepoint = (codepoint << 6) | (c & 0x3f); + if (--state) + continue; + + } else if (state == 0 && topbits >= 2 && topbits <= 4) { + // valid UTF8 lead of 2/3/4 bytes codepoint + codepoint = c & (0x7f >> topbits); + state = topbits - 1; // remaining bytes after lead + continue; + + } else if (state >= 0) { + // invalid byte at state 0/1/2/3, add placeholder once + codepoint = CONFIG_SUBST_WCHAR; + state = -1; + + } else { + // inside bad sequence (placeholder char already added) + if (topbits == 1 || topbits > 4) + continue; // still bad + // 
c is valid for state 0, process it with clean slate + state = 0; + goto process_byte; + } + + // codepoint is complete + // we don't reject surrogate halves, reserved, etc + if (codepoint < 0x10000) { + wbuf[wlen++] = codepoint; + } else { + // generate a surrogates pair (wbuf has room for 2+) + codepoint -= 0x10000; + wbuf[wlen++] = 0xd800 | (codepoint >> 10); + wbuf[wlen++] = 0xdc00 | (codepoint & 0x3ff); + } + + // flush if we have less than two empty spaces + if (wlen > ARRAY_SIZE(wbuf) - 2) { + if (!WriteConsoleW(h, wbuf, wlen, 0, 0)) + return -1; + wlen = 0; + } + } + + if (wlen && !WriteConsoleW(h, wbuf, wlen, 0, 0)) + return -1; + return 0; +} +#endif + void console_write(const char *str, int len) { char *buf = xmemdup(str, len); @@ -1468,7 +1554,12 @@ void console_write(const char *str, int len) // returns EOF on error, 0 on success static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) { +#if ENABLE_FEATURE_UTF8_OUTPUT + if (GetConsoleOutputCP() != CP_UTF8) + return writeCon_utf8(fileno(stream), buf, siz) ? EOF : 0; +#else charToConBuffA(buf, siz); +#endif return fwrite(buf, 1, siz, stream) < siz ? EOF : 0; } @@ -1476,6 +1567,11 @@ static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) // returns -1 on error, actually-written bytes on suceess static int conv_writeCon(int fd, char *buf, size_t siz) { +#if ENABLE_FEATURE_UTF8_OUTPUT + if (GetConsoleOutputCP() != CP_UTF8) + return writeCon_utf8(fd, buf, siz) ? -1 : siz; +#else charToConBuffA(buf, siz); +#endif return write(fd, buf, siz); } -- cgit v1.2.3-55-g6feb From b8fff6b345d4b7e3f16227f65eecca1a0c88ab41 Mon Sep 17 00:00:00 2001 From: "Avi Halachmi (:avih)" Date: Thu, 3 Aug 2023 19:19:47 +0300 Subject: win32: disable console output conversion with LC_ALL=C Previously, when writing to the console, the non-unicode build always assumed the source data is in the ANSI codepage, and used charToCon to convert it unconditionally to the console CP. 
Similarly, the unicode build made the same assumption (where ANSI CP is UTF8), and always tried to convert it so that it's printed correctly (at least when FEATURE_UTF8_OUTPUT is enabled - which it is by default in the unicode build). However, there could be cases where this assumption is incorrect, for instance if the data comes from a file encoded for some codepage X, and the user, after also changing the console CP to X, runs 'cat file.X'. This commit allows disabling this conversion, using the same env vars which can be used to disable the locale/unicode elsewhere (LANG, LC_CTYPE, LC_ALL as "C") e.g. 'LC_ALL=C cat file.X' now doesn't convert, and the console renders it according to its own codepage. --- win32/winansi.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/win32/winansi.c b/win32/winansi.c index aaaa2fa50..c88c096d2 100644 --- a/win32/winansi.c +++ b/win32/winansi.c @@ -1534,6 +1534,25 @@ void console_write(const char *str, int len) free(buf); } +// LC_ALL=C disables console output conversion, so that the source +// data is interpreted only by the console according to its output CP. +static int conout_conv_enabled(void) +{ + static int enabled, tested; /* = 0 */ + + if (!tested) { + // keep in sync with [re]init_unicode at libbb/unicode.c + char *s = getenv("LC_ALL"); + if (!s) s = getenv("LC_CTYPE"); + if (!s) s = getenv("LANG"); + + enabled = !(s && s[0] == 'C' && s[1] == 0); + tested = 1; + } + + return enabled; +} + // TODO: improvements: // // 1. currently conv_[f]writeCon modify buf inplace, which means the caller @@ -1554,12 +1573,14 @@ void console_write(const char *str, int len) // returns EOF on error, 0 on success static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) { + if (conout_conv_enabled()) { #if ENABLE_FEATURE_UTF8_OUTPUT - if (GetConsoleOutputCP() != CP_UTF8) - return writeCon_utf8(fileno(stream), buf, siz) ? 
EOF : 0; + if (GetConsoleOutputCP() != CP_UTF8) + return writeCon_utf8(fileno(stream), buf, siz) ? EOF : 0; #else - charToConBuffA(buf, siz); + charToConBuffA(buf, siz); #endif + } return fwrite(buf, 1, siz, stream) < siz ? EOF : 0; } @@ -1567,11 +1588,13 @@ static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) // returns -1 on error, actually-written bytes on suceess static int conv_writeCon(int fd, char *buf, size_t siz) { + if (conout_conv_enabled()) { #if ENABLE_FEATURE_UTF8_OUTPUT - if (GetConsoleOutputCP() != CP_UTF8) - return writeCon_utf8(fd, buf, siz) ? -1 : siz; + if (GetConsoleOutputCP() != CP_UTF8) + return writeCon_utf8(fd, buf, siz) ? -1 : siz; #else - charToConBuffA(buf, siz); + charToConBuffA(buf, siz); #endif + } return write(fd, buf, siz); } -- cgit v1.2.3-55-g6feb