From bf70275f786716546ec474caade0f16ec28da541 Mon Sep 17 00:00:00 2001 From: "Avi Halachmi (:avih)" Date: Thu, 3 Aug 2023 18:23:23 +0300 Subject: win32: add FEATURE_UTF8_OUTPUT (enabled with unicode) Previously, the unicode build required console (out) codepage of UTF8 in order for unicode output to be printed correctly - e.g. at the shell command prompt or the output of `ls` for unicode file names. This is inconvenient, because by default it's not UTF8, and so unless the user invoked 'chcp 65001' - by default unicode output didn't work. This feature (which is now enabled for the unicode build) makes it print unicode output correctly regardless of the console CP, by using a new stream-conversion function from UTF8 chars to wchar_t, and writing those using WriteConsoleW. If the console CP happens to be UTF8 - this conversion is disabled. We could have instead changed the console CP to UTF8, but that's a slippery slope, and some old programs which expect the default CP might get broken, so achieving the same result without touching the console CP is hopefully better. --- Config.in | 13 +++++- configs/mingw32_defconfig | 1 + configs/mingw64_defconfig | 1 + configs/mingw64u_defconfig | 1 + scripts/mk_mingw64u_defconfig | 1 + win32/winansi.c | 96 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 112 insertions(+), 1 deletion(-) diff --git a/Config.in b/Config.in index 33f90fa42..548eaf20c 100644 --- a/Config.in +++ b/Config.in @@ -438,7 +438,8 @@ config FEATURE_UTF8_MANIFEST depends on FEATURE_RESOURCES help Include a manifest which sets the process code page to UTF-8. - Users who enable this may also wish to enable FEATURE_UTF8_INPUT. + Users who enable this may also wish to enable FEATURE_UTF8_INPUT + and/or FEATURE_UTF8_OUTPUT. config FEATURE_ICON bool "Include application icon in binary" @@ -483,6 +484,16 @@ config FEATURE_UTF8_INPUT This may be useful in conjunction with the UTF8 manifest which is supported in Window 10 and 11. 
+config FEATURE_UTF8_OUTPUT + bool "Allow UTF8 console output" + default n + depends on PLATFORM_MINGW32 && FEATURE_UTF8_MANIFEST + help + Print UTF8 output correctly even if the console (output) codepage + is not UTF8. + This may be useful in conjunction with the UTF8 manifest which + is supported in Windows 10 and 11. + config TERMINAL_MODE int "Default setting for terminal mode" default 5 diff --git a/configs/mingw32_defconfig b/configs/mingw32_defconfig index 88b974bd1..d3f99f222 100644 --- a/configs/mingw32_defconfig +++ b/configs/mingw32_defconfig @@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y CONFIG_FEATURE_ICON_ALL=y CONFIG_FEATURE_EURO=y # CONFIG_FEATURE_UTF8_INPUT is not set +# CONFIG_FEATURE_UTF8_OUTPUT is not set CONFIG_TERMINAL_MODE=5 CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y CONFIG_FEATURE_EXTRA_FILE_DATA=y diff --git a/configs/mingw64_defconfig b/configs/mingw64_defconfig index d52df9c48..19b66046f 100644 --- a/configs/mingw64_defconfig +++ b/configs/mingw64_defconfig @@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y CONFIG_FEATURE_ICON_ALL=y CONFIG_FEATURE_EURO=y # CONFIG_FEATURE_UTF8_INPUT is not set +# CONFIG_FEATURE_UTF8_OUTPUT is not set CONFIG_TERMINAL_MODE=5 CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y CONFIG_FEATURE_EXTRA_FILE_DATA=y diff --git a/configs/mingw64u_defconfig b/configs/mingw64u_defconfig index 95261b149..876d7162b 100644 --- a/configs/mingw64u_defconfig +++ b/configs/mingw64u_defconfig @@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y CONFIG_FEATURE_ICON_ALL=y CONFIG_FEATURE_EURO=y CONFIG_FEATURE_UTF8_INPUT=y +CONFIG_FEATURE_UTF8_OUTPUT=y CONFIG_TERMINAL_MODE=5 CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y CONFIG_FEATURE_EXTRA_FILE_DATA=y diff --git a/scripts/mk_mingw64u_defconfig b/scripts/mk_mingw64u_defconfig index 3cca78e5b..760c55a00 100755 --- a/scripts/mk_mingw64u_defconfig +++ b/scripts/mk_mingw64u_defconfig @@ -25,6 +25,7 @@ set_build_opts() { set_build_opts \ CONFIG_FEATURE_UTF8_MANIFEST=y \ CONFIG_FEATURE_UTF8_INPUT=y \ + CONFIG_FEATURE_UTF8_OUTPUT=y \ 
CONFIG_UNICODE_SUPPORT=y \ CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \ CONFIG_SUBST_WCHAR=63 \ diff --git a/win32/winansi.c b/win32/winansi.c index e6528926e..aaaa2fa50 100644 --- a/win32/winansi.c +++ b/win32/winansi.c @@ -1439,6 +1439,92 @@ BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got) } #endif +#if ENABLE_FEATURE_UTF8_OUTPUT +// Write u8buf as if the console output CP is UTF8 - regardless of the CP. +// fd should be associated with a console output. +// Return: 0 on successful write[s], else -1 (e.g. if fd is not a console). +// +// Up to 3 bytes of an incomplete codepoint may be buffered from prior call[s]. +// All the completed codepoints in one call are written using WriteConsoleW. +// Bad sequence of any length (till ASCII7 or UTF8 lead) prints 1 subst wchar. +// +// note: one console is assumed, and the (3 bytes) buffer is shared regardless +// of the original output stream (stdout/err), or even if the handle is +// of a different console. This can result in invalid codepoints output +// if streams are multiplexed mid-codepoint (same as elsewhere?) 
+static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz) +{ + static int state = 0; // -1: bad, 0-3: remaining cp bytes (0: done/new) + static uint32_t codepoint = 0; // accumulated from up to 4 UTF8 bytes + + HANDLE h = (HANDLE)_get_osfhandle(fd); + wchar_t wbuf[256]; + int wlen = 0; + + // ASCII7 uses least logic, then UTF8 continuations, UTF8 lead, errors + while (u8siz--) { + unsigned char c = *u8buf++; + int topbits = 0; + + while (c & (0x80 >> topbits)) + ++topbits; + + process_byte: + if (state == 0 && topbits == 0) { + // valid ASCII7, state remains 0 + codepoint = c; + + } else if (state > 0 && topbits == 1) { + // valid continuation byte + codepoint = (codepoint << 6) | (c & 0x3f); + if (--state) + continue; + + } else if (state == 0 && topbits >= 2 && topbits <= 4) { + // valid UTF8 lead of 2/3/4 bytes codepoint + codepoint = c & (0x7f >> topbits); + state = topbits - 1; // remaining bytes after lead + continue; + + } else if (state >= 0) { + // invalid byte at state 0/1/2/3, add placeholder once + codepoint = CONFIG_SUBST_WCHAR; + state = -1; + + } else { + // inside bad sequence (placeholder char already added) + if (topbits == 1 || topbits > 4) + continue; // still bad + // c is valid for state 0, process it with clean slate + state = 0; + goto process_byte; + } + + // codepoint is complete + // we don't reject surrogate halves, reserved, etc + if (codepoint < 0x10000) { + wbuf[wlen++] = codepoint; + } else { + // generate a surrogates pair (wbuf has room for 2+) + codepoint -= 0x10000; + wbuf[wlen++] = 0xd800 | (codepoint >> 10); + wbuf[wlen++] = 0xdc00 | (codepoint & 0x3ff); + } + + // flush if we have less than two empty spaces + if (wlen > ARRAY_SIZE(wbuf) - 2) { + if (!WriteConsoleW(h, wbuf, wlen, 0, 0)) + return -1; + wlen = 0; + } + } + + if (wlen && !WriteConsoleW(h, wbuf, wlen, 0, 0)) + return -1; + return 0; +} +#endif + void console_write(const char *str, int len) { char *buf = xmemdup(str, len); @@ -1468,7 +1554,12 @@ void 
console_write(const char *str, int len) // returns EOF on error, 0 on success static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) { +#if ENABLE_FEATURE_UTF8_OUTPUT + if (GetConsoleOutputCP() != CP_UTF8) + return writeCon_utf8(fileno(stream), buf, siz) ? EOF : 0; +#else charToConBuffA(buf, siz); +#endif return fwrite(buf, 1, siz, stream) < siz ? EOF : 0; } @@ -1476,6 +1567,11 @@ static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) // returns -1 on error, actually-written bytes on suceess static int conv_writeCon(int fd, char *buf, size_t siz) { +#if ENABLE_FEATURE_UTF8_OUTPUT + if (GetConsoleOutputCP() != CP_UTF8) + return writeCon_utf8(fd, buf, siz) ? -1 : siz; +#else charToConBuffA(buf, siz); +#endif return write(fd, buf, siz); } -- cgit v1.2.3-55-g6feb