aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRon Yorston <rmy@pobox.com>2023-08-04 12:09:13 +0000
committerGitHub <noreply@github.com>2023-08-04 12:09:13 +0000
commitcf513a82223315c92a229d81b2710fda22da1659 (patch)
treedd8ea10581be75ddee328aeb6aff88f02e9542f6
parent9e2c3594ccbd3ef45beab57cba6796f97f06906c (diff)
parentb8fff6b345d4b7e3f16227f65eecca1a0c88ab41 (diff)
downloadbusybox-w32-cf513a82223315c92a229d81b2710fda22da1659.tar.gz
busybox-w32-cf513a82223315c92a229d81b2710fda22da1659.tar.bz2
busybox-w32-cf513a82223315c92a229d81b2710fda22da1659.zip
Merge pull request #349 from avih/win32-utf8-output
Win32: make unicode print correctly regardless of console CP
-rw-r--r--Config.in13
-rw-r--r--configs/mingw32_defconfig1
-rw-r--r--configs/mingw64_defconfig1
-rw-r--r--configs/mingw64u_defconfig1
-rwxr-xr-xscripts/mk_mingw64u_defconfig1
-rw-r--r--win32/winansi.c180
6 files changed, 179 insertions, 18 deletions
diff --git a/Config.in b/Config.in
index 33f90fa42..548eaf20c 100644
--- a/Config.in
+++ b/Config.in
@@ -438,7 +438,8 @@ config FEATURE_UTF8_MANIFEST
438 depends on FEATURE_RESOURCES 438 depends on FEATURE_RESOURCES
439 help 439 help
440 Include a manifest which sets the process code page to UTF-8. 440 Include a manifest which sets the process code page to UTF-8.
441 Users who enable this may also wish to enable FEATURE_UTF8_INPUT. 441 Users who enable this may also wish to enable FEATURE_UTF8_INPUT
442 and/or FEATURE_UTF8_OUTPUT.
442 443
443config FEATURE_ICON 444config FEATURE_ICON
444 bool "Include application icon in binary" 445 bool "Include application icon in binary"
@@ -483,6 +484,16 @@ config FEATURE_UTF8_INPUT
483 This may be useful in conjunction with the UTF8 manifest which 484 This may be useful in conjunction with the UTF8 manifest which
484 is supported in Windows 10 and 11. 485 is supported in Windows 10 and 11.
485 486
487config FEATURE_UTF8_OUTPUT
488 bool "Allow UTF8 console output"
489 default n
490 depends on PLATFORM_MINGW32 && FEATURE_UTF8_MANIFEST
491 help
492 Print UTF8 output correctly even if the console (output) codepage
493 is not UTF8.
494 This may be useful in conjunction with the UTF8 manifest which
495 is supported in Windows 10 and 11.
496
486config TERMINAL_MODE 497config TERMINAL_MODE
487 int "Default setting for terminal mode" 498 int "Default setting for terminal mode"
488 default 5 499 default 5
diff --git a/configs/mingw32_defconfig b/configs/mingw32_defconfig
index 88b974bd1..d3f99f222 100644
--- a/configs/mingw32_defconfig
+++ b/configs/mingw32_defconfig
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y
55CONFIG_FEATURE_ICON_ALL=y 55CONFIG_FEATURE_ICON_ALL=y
56CONFIG_FEATURE_EURO=y 56CONFIG_FEATURE_EURO=y
57# CONFIG_FEATURE_UTF8_INPUT is not set 57# CONFIG_FEATURE_UTF8_INPUT is not set
58# CONFIG_FEATURE_UTF8_OUTPUT is not set
58CONFIG_TERMINAL_MODE=5 59CONFIG_TERMINAL_MODE=5
59CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y 60CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y
60CONFIG_FEATURE_EXTRA_FILE_DATA=y 61CONFIG_FEATURE_EXTRA_FILE_DATA=y
diff --git a/configs/mingw64_defconfig b/configs/mingw64_defconfig
index d52df9c48..19b66046f 100644
--- a/configs/mingw64_defconfig
+++ b/configs/mingw64_defconfig
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y
55CONFIG_FEATURE_ICON_ALL=y 55CONFIG_FEATURE_ICON_ALL=y
56CONFIG_FEATURE_EURO=y 56CONFIG_FEATURE_EURO=y
57# CONFIG_FEATURE_UTF8_INPUT is not set 57# CONFIG_FEATURE_UTF8_INPUT is not set
58# CONFIG_FEATURE_UTF8_OUTPUT is not set
58CONFIG_TERMINAL_MODE=5 59CONFIG_TERMINAL_MODE=5
59CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y 60CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y
60CONFIG_FEATURE_EXTRA_FILE_DATA=y 61CONFIG_FEATURE_EXTRA_FILE_DATA=y
diff --git a/configs/mingw64u_defconfig b/configs/mingw64u_defconfig
index 95261b149..876d7162b 100644
--- a/configs/mingw64u_defconfig
+++ b/configs/mingw64u_defconfig
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y
55CONFIG_FEATURE_ICON_ALL=y 55CONFIG_FEATURE_ICON_ALL=y
56CONFIG_FEATURE_EURO=y 56CONFIG_FEATURE_EURO=y
57CONFIG_FEATURE_UTF8_INPUT=y 57CONFIG_FEATURE_UTF8_INPUT=y
58CONFIG_FEATURE_UTF8_OUTPUT=y
58CONFIG_TERMINAL_MODE=5 59CONFIG_TERMINAL_MODE=5
59CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y 60CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y
60CONFIG_FEATURE_EXTRA_FILE_DATA=y 61CONFIG_FEATURE_EXTRA_FILE_DATA=y
diff --git a/scripts/mk_mingw64u_defconfig b/scripts/mk_mingw64u_defconfig
index 3cca78e5b..760c55a00 100755
--- a/scripts/mk_mingw64u_defconfig
+++ b/scripts/mk_mingw64u_defconfig
@@ -25,6 +25,7 @@ set_build_opts() {
25set_build_opts \ 25set_build_opts \
26 CONFIG_FEATURE_UTF8_MANIFEST=y \ 26 CONFIG_FEATURE_UTF8_MANIFEST=y \
27 CONFIG_FEATURE_UTF8_INPUT=y \ 27 CONFIG_FEATURE_UTF8_INPUT=y \
28 CONFIG_FEATURE_UTF8_OUTPUT=y \
28 CONFIG_UNICODE_SUPPORT=y \ 29 CONFIG_UNICODE_SUPPORT=y \
29 CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \ 30 CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \
30 CONFIG_SUBST_WCHAR=63 \ 31 CONFIG_SUBST_WCHAR=63 \
diff --git a/win32/winansi.c b/win32/winansi.c
index f280177e6..c88c096d2 100644
--- a/win32/winansi.c
+++ b/win32/winansi.c
@@ -10,6 +10,9 @@
10static BOOL charToConBuffA(LPSTR s, DWORD len); 10static BOOL charToConBuffA(LPSTR s, DWORD len);
11static BOOL charToConA(LPSTR s); 11static BOOL charToConA(LPSTR s);
12 12
13static int conv_fwriteCon(FILE *stream, char *buf, size_t siz);
14static int conv_writeCon(int fd, char *buf, size_t siz);
15
13/* 16/*
14 Functions to be wrapped: 17 Functions to be wrapped:
15*/ 18*/
@@ -814,9 +817,7 @@ static int ansi_emulate(const char *s, FILE *stream)
814 size_t len = pos - str; 817 size_t len = pos - str;
815 818
816 if (len) { 819 if (len) {
817 *pos = '\0'; /* NB, '\033' has been overwritten */ 820 if (conv_fwriteCon(stream, str, len) == EOF)
818 charToConA(str);
819 if (fputs(str, stream) == EOF)
820 return EOF; 821 return EOF;
821 rv += len; 822 rv += len;
822 } 823 }
@@ -837,9 +838,9 @@ static int ansi_emulate(const char *s, FILE *stream)
837 return EOF; 838 return EOF;
838 839
839 } else { 840 } else {
840 rv += strlen(str); 841 size_t len = strlen(str);
841 charToConA(str); 842 rv += len;
842 return fputs(str, stream) == EOF ? EOF : rv; 843 return conv_fwriteCon(stream, str, len) == EOF ? EOF : rv;
843 } 844 }
844 } 845 }
845 return rv; 846 return rv;
@@ -853,8 +854,7 @@ int winansi_putchar(int c)
853 if (!is_console(STDOUT_FILENO)) 854 if (!is_console(STDOUT_FILENO))
854 return putchar(c); 855 return putchar(c);
855 856
856 charToConBuffA(s, 1); 857 return conv_fwriteCon(stdout, s, 1) == EOF ? EOF : (unsigned char)c;
857 return putchar(t) == EOF ? EOF : (unsigned char)c;
858} 858}
859 859
860int winansi_puts(const char *s) 860int winansi_puts(const char *s)
@@ -952,8 +952,7 @@ int winansi_fputc(int c, FILE *stream)
952 return ret; 952 return ret;
953 } 953 }
954 954
955 charToConBuffA(s, 1); 955 return conv_fwriteCon(stream, s, 1) == EOF ? EOF : (unsigned char )c;
956 return fputc(t, stream) == EOF ? EOF : (unsigned char )c;
957} 956}
958 957
959#if !defined(__USE_MINGW_ANSI_STDIO) || !__USE_MINGW_ANSI_STDIO 958#if !defined(__USE_MINGW_ANSI_STDIO) || !__USE_MINGW_ANSI_STDIO
@@ -1083,8 +1082,7 @@ static int ansi_emulate_write(int fd, const void *buf, size_t count)
1083 len = pos - str; 1082 len = pos - str;
1084 1083
1085 if (len) { 1084 if (len) {
1086 charToConBuffA(str, len); 1085 out_len = conv_writeCon(fd, str, len);
1087 out_len = write(fd, str, len);
1088 if (out_len == -1) 1086 if (out_len == -1)
1089 return -1; 1087 return -1;
1090 rv += out_len; 1088 rv += out_len;
@@ -1100,8 +1098,7 @@ static int ansi_emulate_write(int fd, const void *buf, size_t count)
1100 pos = str; 1098 pos = str;
1101 } else { 1099 } else {
1102 len = strlen(str); 1100 len = strlen(str);
1103 charToConA(str); 1101 out_len = conv_writeCon(fd, str, len);
1104 out_len = write(fd, str, len);
1105 return (out_len == -1) ? -1 : rv+out_len; 1102 return (out_len == -1) ? -1 : rv+out_len;
1106 } 1103 }
1107 } 1104 }
@@ -1442,13 +1439,162 @@ BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got)
1442} 1439}
1443#endif 1440#endif
1444 1441
1442#if ENABLE_FEATURE_UTF8_OUTPUT
1443// Write u8buf as if the console output CP is UTF8 - regardless of the CP.
1444// fd should be associated with a console output.
1445// Return: 0 on successful write[s], else -1 (e.g. if fd is not a console).
1446//
1447// Up to 3 bytes of an incomplete codepoint may be buffered from prior call[s].
1448// All the completed codepoints in one call are written using WriteConsoleW.
1449// Bad sequence of any length (till ASCII7 or UTF8 lead) prints 1 subst wchar.
1450//
1451// note: one console is assumed, and the (3 bytes) buffer is shared regardless
1452// of the original output stream (stdout/err), or even if the handle is
1453// of a different console. This can result in invalid codepoints output
1454// if streams are multiplexed mid-codepoint (same as elsewhere?)
1455static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz)
1456{
1457 static int state = 0; // -1: bad, 0-3: remaining cp bytes (0: done/new)
1458 static uint32_t codepoint = 0; // accumulated from up to 4 UTF8 bytes
1459
1460 HANDLE h = (HANDLE)_get_osfhandle(fd);
1461 wchar_t wbuf[256];
1462 int wlen = 0;
1463
1464 // ASCII7 uses least logic, then UTF8 continuations, UTF8 lead, errors
1465 while (u8siz--) {
1466 unsigned char c = *u8buf++;
1467 int topbits = 0;
1468
1469 while (c & (0x80 >> topbits))
1470 ++topbits;
1471
1472 process_byte:
1473 if (state == 0 && topbits == 0) {
1474 // valid ASCII7, state remains 0
1475 codepoint = c;
1476
1477 } else if (state > 0 && topbits == 1) {
1478 // valid continuation byte
1479 codepoint = (codepoint << 6) | (c & 0x3f);
1480 if (--state)
1481 continue;
1482
1483 } else if (state == 0 && topbits >= 2 && topbits <= 4) {
1484 // valid UTF8 lead of 2/3/4 bytes codepoint
1485 codepoint = c & (0x7f >> topbits);
1486 state = topbits - 1; // remaining bytes after lead
1487 continue;
1488
1489 } else if (state >= 0) {
1490 // invalid byte at state 0/1/2/3, add placeholder once
1491 codepoint = CONFIG_SUBST_WCHAR;
1492 state = -1;
1493
1494 } else {
1495 // inside bad sequence (placeholder char already added)
1496 if (topbits == 1 || topbits > 4)
1497 continue; // still bad
1498 // c is valid for state 0, process it with clean slate
1499 state = 0;
1500 goto process_byte;
1501 }
1502
1503 // codepoint is complete
1504 // we don't reject surrogate halves, reserved, etc
1505 if (codepoint < 0x10000) {
1506 wbuf[wlen++] = codepoint;
1507 } else {
1508 // generate a surrogates pair (wbuf has room for 2+)
1509 codepoint -= 0x10000;
1510 wbuf[wlen++] = 0xd800 | (codepoint >> 10);
1511 wbuf[wlen++] = 0xdc00 | (codepoint & 0x3ff);
1512 }
1513
1514 // flush if we have less than two empty spaces
1515 if (wlen > ARRAY_SIZE(wbuf) - 2) {
1516 if (!WriteConsoleW(h, wbuf, wlen, 0, 0))
1517 return -1;
1518 wlen = 0;
1519 }
1520 }
1521
1522 if (wlen && !WriteConsoleW(h, wbuf, wlen, 0, 0))
1523 return -1;
1524 return 0;
1525}
1526#endif
1527
1445void console_write(const char *str, int len) 1528void console_write(const char *str, int len)
1446{ 1529{
1447 char *buf = xmemdup(str, len); 1530 char *buf = xmemdup(str, len);
1448 int fd = _open("CONOUT$", _O_WRONLY); 1531 int fd = _open("CONOUT$", _O_WRONLY);
1449 HANDLE fh = (HANDLE)_get_osfhandle(fd); 1532 conv_writeCon(fd, buf, len);
1450 charToConBuffA(buf, len);
1451 WriteConsole(fh, buf, len, NULL, NULL);
1452 close(fd); 1533 close(fd);
1453 free(buf); 1534 free(buf);
1454} 1535}
1536
1537// LC_ALL=C disables console output conversion, so that the source
1538// data is interpreted only by the console according to its output CP.
1539static int conout_conv_enabled(void)
1540{
1541 static int enabled, tested; /* = 0 */
1542
1543 if (!tested) {
1544 // keep in sync with [re]init_unicode at libbb/unicode.c
1545 char *s = getenv("LC_ALL");
1546 if (!s) s = getenv("LC_CTYPE");
1547 if (!s) s = getenv("LANG");
1548
1549 enabled = !(s && s[0] == 'C' && s[1] == 0);
1550 tested = 1;
1551 }
1552
1553 return enabled;
1554}
1555
1556// TODO: improvements:
1557//
1558// 1. currently conv_[f]writeCon modify buf inplace, which means the caller
1559// typically has to make a writable copy first just for this.
1560// Sometimes it allocates a big copy once, and calls us with substrings.
1561// Instead, we could make a writable copy here - it's not used later anyway.
1562// To avoid the performance hit of many small allocations, we could use
1563// a local buffer for short strings, and allocate only if it doesn't fit
1564// (or maybe just reuse the local buffer with substring iterations).
1565//
1566// 2. Instead of converting from ACP to the console out CP - which risks
1567// data loss whenever they differ - we could convert it to wchar_t and
1568// write it using WriteConsoleW. This should prevent all output data-loss.
1569// Care should be taken with DBCS codepages (e.g. 936) or other multi-byte
1570// codepages, because converting on arbitrary substring boundaries can fail.
1571
1572// convert buf inplace from ACP to console out CP and write it to stream
1573// returns EOF on error, 0 on success
1574static int conv_fwriteCon(FILE *stream, char *buf, size_t siz)
1575{
1576 if (conout_conv_enabled()) {
1577#if ENABLE_FEATURE_UTF8_OUTPUT
1578 if (GetConsoleOutputCP() != CP_UTF8)
1579 return writeCon_utf8(fileno(stream), buf, siz) ? EOF : 0;
1580#else
1581 charToConBuffA(buf, siz);
1582#endif
1583 }
1584 return fwrite(buf, 1, siz, stream) < siz ? EOF : 0;
1585}
1586
1587// similar to above, but using lower level write
1588// returns -1 on error, actually-written bytes on success
1589static int conv_writeCon(int fd, char *buf, size_t siz)
1590{
1591 if (conout_conv_enabled()) {
1592#if ENABLE_FEATURE_UTF8_OUTPUT
1593 if (GetConsoleOutputCP() != CP_UTF8)
1594 return writeCon_utf8(fd, buf, siz) ? -1 : siz;
1595#else
1596 charToConBuffA(buf, siz);
1597#endif
1598 }
1599 return write(fd, buf, siz);
1600}