diff options
author | Avi Halachmi (:avih) <avihpit@yahoo.com> | 2023-08-03 18:23:23 +0300 |
---|---|---|
committer | Avi Halachmi (:avih) <avihpit@yahoo.com> | 2023-08-03 20:57:53 +0300 |
commit | bf70275f786716546ec474caade0f16ec28da541 (patch) | |
tree | c92cc94e1fd5508b6a0cc7745f40cd7750b3e160 | |
parent | a42cc92778545d21429f9267540330991185f1f1 (diff) | |
download | busybox-w32-bf70275f786716546ec474caade0f16ec28da541.tar.gz busybox-w32-bf70275f786716546ec474caade0f16ec28da541.tar.bz2 busybox-w32-bf70275f786716546ec474caade0f16ec28da541.zip |
win32: add FEATURE_UTF8_OUTPUT (enabled with unicode)
Previously, the unicode build required a console (output) codepage of UTF8
in order for unicode output to be printed correctly - e.g. at the
shell command prompt or the output of `ls` for unicode file names.
This is inconvenient, because by default it's not UTF8, and so unicode
output didn't work unless the user first invoked 'chcp 65001'.
This feature (which is now enabled for the unicode build) makes it
print unicode output correctly regardless of the console CP, by
using a new stream-conversion function from UTF8 chars to wchar_t,
and writing those using WriteConsoleW.
If the console CP happens to be UTF8 - this conversion is disabled.
We could have instead changed the console CP to UTF8, but that's
a slippery slope, and some old programs which expect the default CP
might get broken, so achieving the same result without touching
the console CP is hopefully better.
-rw-r--r-- | Config.in | 13 | ||||
-rw-r--r-- | configs/mingw32_defconfig | 1 | ||||
-rw-r--r-- | configs/mingw64_defconfig | 1 | ||||
-rw-r--r-- | configs/mingw64u_defconfig | 1 | ||||
-rwxr-xr-x | scripts/mk_mingw64u_defconfig | 1 | ||||
-rw-r--r-- | win32/winansi.c | 96 |
6 files changed, 112 insertions, 1 deletions
@@ -438,7 +438,8 @@ config FEATURE_UTF8_MANIFEST | |||
438 | depends on FEATURE_RESOURCES | 438 | depends on FEATURE_RESOURCES |
439 | help | 439 | help |
440 | Include a manifest which sets the process code page to UTF-8. | 440 | Include a manifest which sets the process code page to UTF-8. |
441 | Users who enable this may also wish to enable FEATURE_UTF8_INPUT. | 441 | Users who enable this may also wish to enable FEATURE_UTF8_INPUT |
442 | and/or FEATURE_UTF8_OUTPUT. | ||
442 | 443 | ||
443 | config FEATURE_ICON | 444 | config FEATURE_ICON |
444 | bool "Include application icon in binary" | 445 | bool "Include application icon in binary" |
@@ -483,6 +484,16 @@ config FEATURE_UTF8_INPUT | |||
483 | This may be useful in conjunction with the UTF8 manifest which | 484 | This may be useful in conjunction with the UTF8 manifest which |
484 | is supported in Window 10 and 11. | 485 | is supported in Window 10 and 11. |
485 | 486 | ||
487 | config FEATURE_UTF8_OUTPUT | ||
488 | bool "Allow UTF8 console output" | ||
489 | default n | ||
490 | depends on PLATFORM_MINGW32 && FEATURE_UTF8_MANIFEST | ||
491 | help | ||
492 | Print UTF8 output correctly even if the console (output) codepage | ||
493 | is not UTF8. | ||
494 | This may be useful in conjunction with the UTF8 manifest which | ||
495 | is supported in Window 10 and 11. | ||
496 | |||
486 | config TERMINAL_MODE | 497 | config TERMINAL_MODE |
487 | int "Default setting for terminal mode" | 498 | int "Default setting for terminal mode" |
488 | default 5 | 499 | default 5 |
diff --git a/configs/mingw32_defconfig b/configs/mingw32_defconfig index 88b974bd1..d3f99f222 100644 --- a/configs/mingw32_defconfig +++ b/configs/mingw32_defconfig | |||
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y | |||
55 | CONFIG_FEATURE_ICON_ALL=y | 55 | CONFIG_FEATURE_ICON_ALL=y |
56 | CONFIG_FEATURE_EURO=y | 56 | CONFIG_FEATURE_EURO=y |
57 | # CONFIG_FEATURE_UTF8_INPUT is not set | 57 | # CONFIG_FEATURE_UTF8_INPUT is not set |
58 | # CONFIG_FEATURE_UTF8_OUTPUT is not set | ||
58 | CONFIG_TERMINAL_MODE=5 | 59 | CONFIG_TERMINAL_MODE=5 |
59 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y | 60 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y |
60 | CONFIG_FEATURE_EXTRA_FILE_DATA=y | 61 | CONFIG_FEATURE_EXTRA_FILE_DATA=y |
diff --git a/configs/mingw64_defconfig b/configs/mingw64_defconfig index d52df9c48..19b66046f 100644 --- a/configs/mingw64_defconfig +++ b/configs/mingw64_defconfig | |||
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y | |||
55 | CONFIG_FEATURE_ICON_ALL=y | 55 | CONFIG_FEATURE_ICON_ALL=y |
56 | CONFIG_FEATURE_EURO=y | 56 | CONFIG_FEATURE_EURO=y |
57 | # CONFIG_FEATURE_UTF8_INPUT is not set | 57 | # CONFIG_FEATURE_UTF8_INPUT is not set |
58 | # CONFIG_FEATURE_UTF8_OUTPUT is not set | ||
58 | CONFIG_TERMINAL_MODE=5 | 59 | CONFIG_TERMINAL_MODE=5 |
59 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y | 60 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y |
60 | CONFIG_FEATURE_EXTRA_FILE_DATA=y | 61 | CONFIG_FEATURE_EXTRA_FILE_DATA=y |
diff --git a/configs/mingw64u_defconfig b/configs/mingw64u_defconfig index 95261b149..876d7162b 100644 --- a/configs/mingw64u_defconfig +++ b/configs/mingw64u_defconfig | |||
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y | |||
55 | CONFIG_FEATURE_ICON_ALL=y | 55 | CONFIG_FEATURE_ICON_ALL=y |
56 | CONFIG_FEATURE_EURO=y | 56 | CONFIG_FEATURE_EURO=y |
57 | CONFIG_FEATURE_UTF8_INPUT=y | 57 | CONFIG_FEATURE_UTF8_INPUT=y |
58 | CONFIG_FEATURE_UTF8_OUTPUT=y | ||
58 | CONFIG_TERMINAL_MODE=5 | 59 | CONFIG_TERMINAL_MODE=5 |
59 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y | 60 | CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y |
60 | CONFIG_FEATURE_EXTRA_FILE_DATA=y | 61 | CONFIG_FEATURE_EXTRA_FILE_DATA=y |
diff --git a/scripts/mk_mingw64u_defconfig b/scripts/mk_mingw64u_defconfig index 3cca78e5b..760c55a00 100755 --- a/scripts/mk_mingw64u_defconfig +++ b/scripts/mk_mingw64u_defconfig | |||
@@ -25,6 +25,7 @@ set_build_opts() { | |||
25 | set_build_opts \ | 25 | set_build_opts \ |
26 | CONFIG_FEATURE_UTF8_MANIFEST=y \ | 26 | CONFIG_FEATURE_UTF8_MANIFEST=y \ |
27 | CONFIG_FEATURE_UTF8_INPUT=y \ | 27 | CONFIG_FEATURE_UTF8_INPUT=y \ |
28 | CONFIG_FEATURE_UTF8_OUTPUT=y \ | ||
28 | CONFIG_UNICODE_SUPPORT=y \ | 29 | CONFIG_UNICODE_SUPPORT=y \ |
29 | CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \ | 30 | CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \ |
30 | CONFIG_SUBST_WCHAR=63 \ | 31 | CONFIG_SUBST_WCHAR=63 \ |
diff --git a/win32/winansi.c b/win32/winansi.c index e6528926e..aaaa2fa50 100644 --- a/win32/winansi.c +++ b/win32/winansi.c | |||
@@ -1439,6 +1439,92 @@ BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got) | |||
1439 | } | 1439 | } |
1440 | #endif | 1440 | #endif |
1441 | 1441 | ||
1442 | #if ENABLE_FEATURE_UTF8_OUTPUT | ||
1443 | // Write u8buf as if the console output CP is UTF8 - regardless of the CP. | ||
1444 | // fd should be associated with a console output. | ||
1445 | // Return: 0 on successful write[s], else -1 (e.g. if fd is not a console). | ||
1446 | // | ||
1447 | // Up to 3 bytes of an incomplete codepoint may be buffered from prior call[s]. | ||
1448 | // All the completed codepoints in one call are written using WriteConsoleW. | ||
1449 | // Bad sequence of any length (till ASCII7 or UTF8 lead) prints 1 subst wchar. | ||
1450 | // | ||
1451 | // note: one console is assumed, and the (3 bytes) buffer is shared regardless | ||
1452 | // of the original output stream (stdout/err), or even if the handle is | ||
1453 | // of a different console. This can result in invalid codepoints output | ||
1454 | // if streams are multiplexed mid-codepoint (same as elsewhere?) | ||
1455 | static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz) | ||
1456 | { | ||
1457 | static int state = 0; // -1: bad, 0-3: remaining cp bytes (0: done/new) | ||
1458 | static uint32_t codepoint = 0; // accumulated from up to 4 UTF8 bytes | ||
1459 | |||
1460 | HANDLE h = (HANDLE)_get_osfhandle(fd); | ||
1461 | wchar_t wbuf[256]; | ||
1462 | int wlen = 0; | ||
1463 | |||
1464 | // ASCII7 uses least logic, then UTF8 continuations, UTF8 lead, errors | ||
1465 | while (u8siz--) { | ||
1466 | unsigned char c = *u8buf++; | ||
1467 | int topbits = 0; | ||
1468 | |||
1469 | while (c & (0x80 >> topbits)) | ||
1470 | ++topbits; | ||
1471 | |||
1472 | process_byte: | ||
1473 | if (state == 0 && topbits == 0) { | ||
1474 | // valid ASCII7, state remains 0 | ||
1475 | codepoint = c; | ||
1476 | |||
1477 | } else if (state > 0 && topbits == 1) { | ||
1478 | // valid continuation byte | ||
1479 | codepoint = (codepoint << 6) | (c & 0x3f); | ||
1480 | if (--state) | ||
1481 | continue; | ||
1482 | |||
1483 | } else if (state == 0 && topbits >= 2 && topbits <= 4) { | ||
1484 | // valid UTF8 lead of 2/3/4 bytes codepoint | ||
1485 | codepoint = c & (0x7f >> topbits); | ||
1486 | state = topbits - 1; // remaining bytes after lead | ||
1487 | continue; | ||
1488 | |||
1489 | } else if (state >= 0) { | ||
1490 | // invalid byte at state 0/1/2/3, add placeholder once | ||
1491 | codepoint = CONFIG_SUBST_WCHAR; | ||
1492 | state = -1; | ||
1493 | |||
1494 | } else { | ||
1495 | // inside bad sequence (placeholder char already added) | ||
1496 | if (topbits == 1 || topbits > 4) | ||
1497 | continue; // still bad | ||
1498 | // c is valid for state 0, process it with clean slate | ||
1499 | state = 0; | ||
1500 | goto process_byte; | ||
1501 | } | ||
1502 | |||
1503 | // codepoint is complete | ||
1504 | // we don't reject surrogate halves, reserved, etc | ||
1505 | if (codepoint < 0x10000) { | ||
1506 | wbuf[wlen++] = codepoint; | ||
1507 | } else { | ||
1508 | // generate a surrogates pair (wbuf has room for 2+) | ||
1509 | codepoint -= 0x10000; | ||
1510 | wbuf[wlen++] = 0xd800 | (codepoint >> 10); | ||
1511 | wbuf[wlen++] = 0xdc00 | (codepoint & 0x3ff); | ||
1512 | } | ||
1513 | |||
1514 | // flush if we have less than two empty spaces | ||
1515 | if (wlen > ARRAY_SIZE(wbuf) - 2) { | ||
1516 | if (!WriteConsoleW(h, wbuf, wlen, 0, 0)) | ||
1517 | return -1; | ||
1518 | wlen = 0; | ||
1519 | } | ||
1520 | } | ||
1521 | |||
1522 | if (wlen && !WriteConsoleW(h, wbuf, wlen, 0, 0)) | ||
1523 | return -1; | ||
1524 | return 0; | ||
1525 | } | ||
1526 | #endif | ||
1527 | |||
1442 | void console_write(const char *str, int len) | 1528 | void console_write(const char *str, int len) |
1443 | { | 1529 | { |
1444 | char *buf = xmemdup(str, len); | 1530 | char *buf = xmemdup(str, len); |
@@ -1468,7 +1554,12 @@ void console_write(const char *str, int len) | |||
1468 | // returns EOF on error, 0 on success | 1554 | // returns EOF on error, 0 on success |
1469 | static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) | 1555 | static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) |
1470 | { | 1556 | { |
1557 | #if ENABLE_FEATURE_UTF8_OUTPUT | ||
1558 | if (GetConsoleOutputCP() != CP_UTF8) | ||
1559 | return writeCon_utf8(fileno(stream), buf, siz) ? EOF : 0; | ||
1560 | #else | ||
1471 | charToConBuffA(buf, siz); | 1561 | charToConBuffA(buf, siz); |
1562 | #endif | ||
1472 | return fwrite(buf, 1, siz, stream) < siz ? EOF : 0; | 1563 | return fwrite(buf, 1, siz, stream) < siz ? EOF : 0; |
1473 | } | 1564 | } |
1474 | 1565 | ||
@@ -1476,6 +1567,11 @@ static int conv_fwriteCon(FILE *stream, char *buf, size_t siz) | |||
1476 | // returns -1 on error, actually-written bytes on suceess | 1567 | // returns -1 on error, actually-written bytes on suceess |
1477 | static int conv_writeCon(int fd, char *buf, size_t siz) | 1568 | static int conv_writeCon(int fd, char *buf, size_t siz) |
1478 | { | 1569 | { |
1570 | #if ENABLE_FEATURE_UTF8_OUTPUT | ||
1571 | if (GetConsoleOutputCP() != CP_UTF8) | ||
1572 | return writeCon_utf8(fd, buf, siz) ? -1 : siz; | ||
1573 | #else | ||
1479 | charToConBuffA(buf, siz); | 1574 | charToConBuffA(buf, siz); |
1575 | #endif | ||
1480 | return write(fd, buf, siz); | 1576 | return write(fd, buf, siz); |
1481 | } | 1577 | } |