diff options
author | Ron Yorston <rmy@pobox.com> | 2023-08-04 12:09:13 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-08-04 12:09:13 +0000 |
commit | cf513a82223315c92a229d81b2710fda22da1659 (patch) | |
tree | dd8ea10581be75ddee328aeb6aff88f02e9542f6 | |
parent | 9e2c3594ccbd3ef45beab57cba6796f97f06906c (diff) | |
parent | b8fff6b345d4b7e3f16227f65eecca1a0c88ab41 (diff) | |
download | busybox-w32-cf513a82223315c92a229d81b2710fda22da1659.tar.gz busybox-w32-cf513a82223315c92a229d81b2710fda22da1659.tar.bz2 busybox-w32-cf513a82223315c92a229d81b2710fda22da1659.zip |
Merge pull request #349 from avih/win32-utf8-output
Win32: make unicode print correctly regardless of console CP
-rw-r--r-- | Config.in | 13 | ||||
-rw-r--r-- | configs/mingw32_defconfig | 1 | ||||
-rw-r--r-- | configs/mingw64_defconfig | 1 | ||||
-rw-r--r-- | configs/mingw64u_defconfig | 1 | ||||
-rwxr-xr-x | scripts/mk_mingw64u_defconfig | 1 | ||||
-rw-r--r-- | win32/winansi.c | 180 |
6 files changed, 179 insertions, 18 deletions
@@ -438,7 +438,8 @@ config FEATURE_UTF8_MANIFEST | |||
438 | depends on FEATURE_RESOURCES | 438 | depends on FEATURE_RESOURCES |
439 | help | 439 | help |
440 | Include a manifest which sets the process code page to UTF-8. | 440 | Include a manifest which sets the process code page to UTF-8. |
441 | Users who enable this may also wish to enable FEATURE_UTF8_INPUT. | 441 | Users who enable this may also wish to enable FEATURE_UTF8_INPUT |
442 | and/or FEATURE_UTF8_OUTPUT. | ||
442 | 443 | ||
443 | config FEATURE_ICON | 444 | config FEATURE_ICON |
444 | bool "Include application icon in binary" | 445 | bool "Include application icon in binary" |
@@ -483,6 +484,16 @@ config FEATURE_UTF8_INPUT | |||
483 | This may be useful in conjunction with the UTF8 manifest which | 484 | This may be useful in conjunction with the UTF8 manifest which |
484 | is supported in Windows 10 and 11. | 485 | is supported in Windows 10 and 11. |
485 | 486 | ||
487 | config FEATURE_UTF8_OUTPUT | ||
488 | bool "Allow UTF8 console output" | ||
489 | default n | ||
490 | depends on PLATFORM_MINGW32 && FEATURE_UTF8_MANIFEST | ||
491 | help | ||
492 | Print UTF8 output correctly even if the console (output) codepage | ||
493 | is not UTF8. | ||
494 | This may be useful in conjunction with the UTF8 manifest which | ||
495 | is supported in Windows 10 and 11. | ||
496 | |||
486 | config TERMINAL_MODE | 497 | config TERMINAL_MODE |
487 | int "Default setting for terminal mode" | 498 | int "Default setting for terminal mode" |
488 | default 5 | 499 | default 5 |
diff --git a/configs/mingw32_defconfig b/configs/mingw32_defconfig index 88b974bd1..d3f99f222 100644 --- a/configs/mingw32_defconfig +++ b/configs/mingw32_defconfig | |||
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y | |||
55 | CONFIG_FEATURE_ICON_ALL=y | 55 | CONFIG_FEATURE_ICON_ALL=y |
56 | CONFIG_FEATURE_EURO=y | 56 | CONFIG_FEATURE_EURO=y |
57 | # CONFIG_FEATURE_UTF8_INPUT is not set | 57 | # CONFIG_FEATURE_UTF8_INPUT is not set |
58 | # CONFIG_FEATURE_UTF8_OUTPUT is not set | ||
58 | CONFIG_TERMINAL_MODE=5 | 59 | CONFIG_TERMINAL_MODE=5 |
59 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y | 60 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y |
60 | CONFIG_FEATURE_EXTRA_FILE_DATA=y | 61 | CONFIG_FEATURE_EXTRA_FILE_DATA=y |
diff --git a/configs/mingw64_defconfig b/configs/mingw64_defconfig index d52df9c48..19b66046f 100644 --- a/configs/mingw64_defconfig +++ b/configs/mingw64_defconfig | |||
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y | |||
55 | CONFIG_FEATURE_ICON_ALL=y | 55 | CONFIG_FEATURE_ICON_ALL=y |
56 | CONFIG_FEATURE_EURO=y | 56 | CONFIG_FEATURE_EURO=y |
57 | # CONFIG_FEATURE_UTF8_INPUT is not set | 57 | # CONFIG_FEATURE_UTF8_INPUT is not set |
58 | # CONFIG_FEATURE_UTF8_OUTPUT is not set | ||
58 | CONFIG_TERMINAL_MODE=5 | 59 | CONFIG_TERMINAL_MODE=5 |
59 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y | 60 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y |
60 | CONFIG_FEATURE_EXTRA_FILE_DATA=y | 61 | CONFIG_FEATURE_EXTRA_FILE_DATA=y |
diff --git a/configs/mingw64u_defconfig b/configs/mingw64u_defconfig index 95261b149..876d7162b 100644 --- a/configs/mingw64u_defconfig +++ b/configs/mingw64u_defconfig | |||
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y | |||
55 | CONFIG_FEATURE_ICON_ALL=y | 55 | CONFIG_FEATURE_ICON_ALL=y |
56 | CONFIG_FEATURE_EURO=y | 56 | CONFIG_FEATURE_EURO=y |
57 | CONFIG_FEATURE_UTF8_INPUT=y | 57 | CONFIG_FEATURE_UTF8_INPUT=y |
58 | CONFIG_FEATURE_UTF8_OUTPUT=y | ||
58 | CONFIG_TERMINAL_MODE=5 | 59 | CONFIG_TERMINAL_MODE=5 |
59 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y | 60 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y |
60 | CONFIG_FEATURE_EXTRA_FILE_DATA=y | 61 | CONFIG_FEATURE_EXTRA_FILE_DATA=y |
diff --git a/scripts/mk_mingw64u_defconfig b/scripts/mk_mingw64u_defconfig index 3cca78e5b..760c55a00 100755 --- a/scripts/mk_mingw64u_defconfig +++ b/scripts/mk_mingw64u_defconfig | |||
@@ -25,6 +25,7 @@ set_build_opts() { | |||
25 | set_build_opts \ | 25 | set_build_opts \ |
26 | CONFIG_FEATURE_UTF8_MANIFEST=y \ | 26 | CONFIG_FEATURE_UTF8_MANIFEST=y \ |
27 | CONFIG_FEATURE_UTF8_INPUT=y \ | 27 | CONFIG_FEATURE_UTF8_INPUT=y \ |
28 | CONFIG_FEATURE_UTF8_OUTPUT=y \ | ||
28 | CONFIG_UNICODE_SUPPORT=y \ | 29 | CONFIG_UNICODE_SUPPORT=y \ |
29 | CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \ | 30 | CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \ |
30 | CONFIG_SUBST_WCHAR=63 \ | 31 | CONFIG_SUBST_WCHAR=63 \ |
diff --git a/win32/winansi.c b/win32/winansi.c index f280177e6..c88c096d2 100644 --- a/win32/winansi.c +++ b/win32/winansi.c | |||
@@ -10,6 +10,9 @@ | |||
10 | static BOOL charToConBuffA(LPSTR s, DWORD len); | 10 | static BOOL charToConBuffA(LPSTR s, DWORD len); |
11 | static BOOL charToConA(LPSTR s); | 11 | static BOOL charToConA(LPSTR s); |
12 | 12 | ||
13 | static int conv_fwriteCon(FILE *stream, char *buf, size_t siz); | ||
14 | static int conv_writeCon(int fd, char *buf, size_t siz); | ||
15 | |||
13 | /* | 16 | /* |
14 | Functions to be wrapped: | 17 | Functions to be wrapped: |
15 | */ | 18 | */ |
@@ -814,9 +817,7 @@ static int ansi_emulate(const char *s, FILE *stream) | |||
814 | size_t len = pos - str; | 817 | size_t len = pos - str; |
815 | 818 | ||
816 | if (len) { | 819 | if (len) { |
817 | *pos = '\0'; /* NB, '\033' has been overwritten */ | 820 | if (conv_fwriteCon(stream, str, len) == EOF) |
818 | charToConA(str); | ||
819 | if (fputs(str, stream) == EOF) | ||
820 | return EOF; | 821 | return EOF; |
821 | rv += len; | 822 | rv += len; |
822 | } | 823 | } |
@@ -837,9 +838,9 @@ static int ansi_emulate(const char *s, FILE *stream) | |||
837 | return EOF; | 838 | return EOF; |
838 | 839 | ||
839 | } else { | 840 | } else { |
840 | rv += strlen(str); | 841 | size_t len = strlen(str); |
841 | charToConA(str); | 842 | rv += len; |
842 | return fputs(str, stream) == EOF ? EOF : rv; | 843 | return conv_fwriteCon(stream, str, len) == EOF ? EOF : rv; |
843 | } | 844 | } |
844 | } | 845 | } |
845 | return rv; | 846 | return rv; |
@@ -853,8 +854,7 @@ int winansi_putchar(int c) | |||
853 | if (!is_console(STDOUT_FILENO)) | 854 | if (!is_console(STDOUT_FILENO)) |
854 | return putchar(c); | 855 | return putchar(c); |
855 | 856 | ||
856 | charToConBuffA(s, 1); | 857 | return conv_fwriteCon(stdout, s, 1) == EOF ? EOF : (unsigned char)c; |
857 | return putchar(t) == EOF ? EOF : (unsigned char)c; | ||
858 | } | 858 | } |
859 | 859 | ||
860 | int winansi_puts(const char *s) | 860 | int winansi_puts(const char *s) |
@@ -952,8 +952,7 @@ int winansi_fputc(int c, FILE *stream) | |||
952 | return ret; | 952 | return ret; |
953 | } | 953 | } |
954 | 954 | ||
955 | charToConBuffA(s, 1); | 955 | return conv_fwriteCon(stream, s, 1) == EOF ? EOF : (unsigned char )c; |
956 | return fputc(t, stream) == EOF ? EOF : (unsigned char )c; | ||
957 | } | 956 | } |
958 | 957 | ||
959 | #if !defined(__USE_MINGW_ANSI_STDIO) || !__USE_MINGW_ANSI_STDIO | 958 | #if !defined(__USE_MINGW_ANSI_STDIO) || !__USE_MINGW_ANSI_STDIO |
@@ -1083,8 +1082,7 @@ static int ansi_emulate_write(int fd, const void *buf, size_t count) | |||
1083 | len = pos - str; | 1082 | len = pos - str; |
1084 | 1083 | ||
1085 | if (len) { | 1084 | if (len) { |
1086 | charToConBuffA(str, len); | 1085 | out_len = conv_writeCon(fd, str, len); |
1087 | out_len = write(fd, str, len); | ||
1088 | if (out_len == -1) | 1086 | if (out_len == -1) |
1089 | return -1; | 1087 | return -1; |
1090 | rv += out_len; | 1088 | rv += out_len; |
@@ -1100,8 +1098,7 @@ static int ansi_emulate_write(int fd, const void *buf, size_t count) | |||
1100 | pos = str; | 1098 | pos = str; |
1101 | } else { | 1099 | } else { |
1102 | len = strlen(str); | 1100 | len = strlen(str); |
1103 | charToConA(str); | 1101 | out_len = conv_writeCon(fd, str, len); |
1104 | out_len = write(fd, str, len); | ||
1105 | return (out_len == -1) ? -1 : rv+out_len; | 1102 | return (out_len == -1) ? -1 : rv+out_len; |
1106 | } | 1103 | } |
1107 | } | 1104 | } |
@@ -1442,13 +1439,162 @@ BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got) | |||
1442 | } | 1439 | } |
1443 | #endif | 1440 | #endif |
1444 | 1441 | ||
1442 | #if ENABLE_FEATURE_UTF8_OUTPUT | ||
1443 | // Write u8buf as if the console output CP is UTF8 - regardless of the CP. | ||
1444 | // fd should be associated with a console output. | ||
1445 | // Return: 0 on successful write[s], else -1 (e.g. if fd is not a console). | ||
1446 | // | ||
1447 | // Up to 3 bytes of an incomplete codepoint may be buffered from prior call[s]. | ||
1448 | // All the completed codepoints in one call are written using WriteConsoleW. | ||
1449 | // Bad sequence of any length (till ASCII7 or UTF8 lead) prints 1 subst wchar. | ||
1450 | // | ||
1451 | // note: one console is assumed, and the (3 bytes) buffer is shared regardless | ||
1452 | // of the original output stream (stdout/err), or even if the handle is | ||
1453 | // of a different console. This can result in invalid codepoints output | ||
1454 | // if streams are multiplexed mid-codepoint (same as elsewhere?) | ||
1455 | static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz) | ||
1456 | { | ||
1457 | static int state = 0; // -1: bad, 0-3: remaining cp bytes (0: done/new) | ||
1458 | static uint32_t codepoint = 0; // accumulated from up to 4 UTF8 bytes | ||
1459 | |||
1460 | HANDLE h = (HANDLE)_get_osfhandle(fd); | ||
1461 | wchar_t wbuf[256]; | ||
1462 | int wlen = 0; | ||
1463 | |||
1464 | // ASCII7 uses least logic, then UTF8 continuations, UTF8 lead, errors | ||
1465 | while (u8siz--) { | ||
1466 | unsigned char c = *u8buf++; | ||
1467 | int topbits = 0; | ||
1468 | |||
1469 | while (c & (0x80 >> topbits)) | ||
1470 | ++topbits; | ||
1471 | |||
1472 | process_byte: | ||
1473 | if (state == 0 && topbits == 0) { | ||
1474 | // valid ASCII7, state remains 0 | ||
1475 | codepoint = c; | ||
1476 | |||
1477 | } else if (state > 0 && topbits == 1) { | ||
1478 | // valid continuation byte | ||
1479 | codepoint = (codepoint << 6) | (c & 0x3f); | ||
1480 | if (--state) | ||
1481 | continue; | ||
1482 | |||
1483 | } else if (state == 0 && topbits >= 2 && topbits <= 4) { | ||
1484 | // valid UTF8 lead of 2/3/4 bytes codepoint | ||
1485 | codepoint = c & (0x7f >> topbits); | ||
1486 | state = topbits - 1; // remaining bytes after lead | ||
1487 | continue; | ||
1488 | |||
1489 | } else if (state >= 0) { | ||
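1490 | // invalid byte at state 0/1/2/3, add placeholder once (NB: the byte itself is consumed, so an ASCII7 or UTF8 lead byte which interrupts an incomplete sequence is not printed) | ||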
1491 | codepoint = CONFIG_SUBST_WCHAR; | ||
1492 | state = -1; | ||
1493 | |||
1494 | } else { | ||
1495 | // inside bad sequence (placeholder char already added) | ||
1496 | if (topbits == 1 || topbits > 4) | ||
1497 | continue; // still bad | ||
1498 | // c is valid for state 0, process it with clean slate | ||
1499 | state = 0; | ||
1500 | goto process_byte; | ||
1501 | } | ||
1502 | |||
1503 | // codepoint is complete | ||
1504 | // we don't reject surrogate halves, reserved, etc | ||
1505 | if (codepoint < 0x10000) { | ||
1506 | wbuf[wlen++] = codepoint; | ||
1507 | } else { | ||
1508 | // generate a surrogates pair (wbuf has room for 2+) | ||
1509 | codepoint -= 0x10000; | ||
1510 | wbuf[wlen++] = 0xd800 | (codepoint >> 10); | ||
1511 | wbuf[wlen++] = 0xdc00 | (codepoint & 0x3ff); | ||
1512 | } | ||
1513 | |||
1514 | // flush if we have less than two empty spaces | ||
1515 | if (wlen > ARRAY_SIZE(wbuf) - 2) { | ||
1516 | if (!WriteConsoleW(h, wbuf, wlen, 0, 0)) | ||
1517 | return -1; | ||
1518 | wlen = 0; | ||
1519 | } | ||
1520 | } | ||
1521 | |||
1522 | if (wlen && !WriteConsoleW(h, wbuf, wlen, 0, 0)) | ||
1523 | return -1; | ||
1524 | return 0; | ||
1525 | } | ||
1526 | #endif | ||
1527 | |||
1445 | void console_write(const char *str, int len) | 1528 | void console_write(const char *str, int len) |
1446 | { | 1529 | { |
1447 | char *buf = xmemdup(str, len); | 1530 | char *buf = xmemdup(str, len); |
1448 | int fd = _open("CONOUT$", _O_WRONLY); | 1531 | int fd = _open("CONOUT$", _O_WRONLY); |
1449 | HANDLE fh = (HANDLE)_get_osfhandle(fd); | 1532 | conv_writeCon(fd, buf, len); |
1450 | charToConBuffA(buf, len); | ||
1451 | WriteConsole(fh, buf, len, NULL, NULL); | ||
1452 | close(fd); | 1533 | close(fd); |
1453 | free(buf); | 1534 | free(buf); |
1454 | } | 1535 | } |
1536 | |||
1537 | // LC_ALL=C disables console output conversion, so that the source | ||
1538 | // data is interpreted only by the console according to its output CP. | ||
1539 | static int conout_conv_enabled(void) | ||
1540 | { | ||
1541 | static int enabled, tested; /* = 0 */ | ||
1542 | |||
1543 | if (!tested) { | ||
1544 | // keep in sync with [re]init_unicode at libbb/unicode.c | ||
1545 | char *s = getenv("LC_ALL"); | ||
1546 | if (!s) s = getenv("LC_CTYPE"); | ||
1547 | if (!s) s = getenv("LANG"); | ||
1548 | |||
1549 | enabled = !(s && s[0] == 'C' && s[1] == 0); | ||
1550 | tested = 1; | ||
1551 | } | ||
1552 | |||
1553 | return enabled; | ||
1554 | } | ||
1555 | |||
1556 | // TODO: improvements: | ||
1557 | // | ||
1558 | // 1. currently conv_[f]writeCon modify buf inplace, which means the caller | ||
1559 | // typically has to make a writable copy first just for this. | ||
1560 | // Sometimes it allocates a big copy once, and calls us with substrings. | ||
1561 | // Instead, we could make a writable copy here - it's not used later anyway. | ||
1562 | // To avoid the performance hit of many small allocations, we could use | ||
1563 | // a local buffer for short strings, and allocate only if it doesn't fit | ||
1564 | // (or maybe just reuse the local buffer with substring iterations). | ||
1565 | // | ||
1566 | // 2. Instead of converting from ACP to the console out CP - which can | ||
1567 | // lose data if they differ, we could convert it to wchar_t and | ||
1568 | // write it using WriteConsoleW. This should prevent all output data-loss. | ||
1569 | // care should be taken with DBCS codepages (e.g. 936) or other multi-byte | ||
1570 | // because then converting on arbitrary substring boundaries can fail. | ||
1571 | |||
1572 | // convert buf inplace from ACP to console out CP and write it to stream | ||
1573 | // returns EOF on error, 0 on success | ||
1574 | static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) | ||
1575 | { | ||
1576 | if (conout_conv_enabled()) { | ||
1577 | #if ENABLE_FEATURE_UTF8_OUTPUT | ||
1578 | if (GetConsoleOutputCP() != CP_UTF8) | ||
1579 | return writeCon_utf8(fileno(stream), buf, siz) ? EOF : 0; | ||
1580 | #else | ||
1581 | charToConBuffA(buf, siz); | ||
1582 | #endif | ||
1583 | } | ||
1584 | return fwrite(buf, 1, siz, stream) < siz ? EOF : 0; | ||
1585 | } | ||
1586 | |||
1587 | // similar to above, but using lower level write | ||
1588 | // returns -1 on error, actually-written bytes on success | ||
1589 | static int conv_writeCon(int fd, char *buf, size_t siz) | ||
1590 | { | ||
1591 | if (conout_conv_enabled()) { | ||
1592 | #if ENABLE_FEATURE_UTF8_OUTPUT | ||
1593 | if (GetConsoleOutputCP() != CP_UTF8) | ||
1594 | return writeCon_utf8(fd, buf, siz) ? -1 : siz; | ||
1595 | #else | ||
1596 | charToConBuffA(buf, siz); | ||
1597 | #endif | ||
1598 | } | ||
1599 | return write(fd, buf, siz); | ||
1600 | } | ||