aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAvi Halachmi (:avih) <avihpit@yahoo.com>2023-08-03 18:23:23 +0300
committerAvi Halachmi (:avih) <avihpit@yahoo.com>2023-08-03 20:57:53 +0300
commitbf70275f786716546ec474caade0f16ec28da541 (patch)
treec92cc94e1fd5508b6a0cc7745f40cd7750b3e160
parenta42cc92778545d21429f9267540330991185f1f1 (diff)
downloadbusybox-w32-bf70275f786716546ec474caade0f16ec28da541.tar.gz
busybox-w32-bf70275f786716546ec474caade0f16ec28da541.tar.bz2
busybox-w32-bf70275f786716546ec474caade0f16ec28da541.zip
win32: add FEATURE_UTF8_OUTPUT (enabled with unicode)
Previously, the unicode build required the console (output) codepage to be UTF8 in order for unicode output to be printed correctly - e.g. at the shell command prompt or in the output of `ls` for unicode file names. This is inconvenient, because the console codepage is not UTF8 by default, so unless the user invoked 'chcp 65001', unicode output didn't work. This feature (which is now enabled for the unicode build) makes it print unicode output correctly regardless of the console CP, by using a new stream-conversion function from UTF8 chars to wchar_t, and writing those using WriteConsoleW. If the console CP happens to be UTF8, this conversion is disabled. We could have instead changed the console CP to UTF8, but that's a slippery slope, and some old programs which expect the default CP might break, so achieving the same result without touching the console CP is hopefully better.
-rw-r--r--Config.in13
-rw-r--r--configs/mingw32_defconfig1
-rw-r--r--configs/mingw64_defconfig1
-rw-r--r--configs/mingw64u_defconfig1
-rwxr-xr-xscripts/mk_mingw64u_defconfig1
-rw-r--r--win32/winansi.c96
6 files changed, 112 insertions, 1 deletions
diff --git a/Config.in b/Config.in
index 33f90fa42..548eaf20c 100644
--- a/Config.in
+++ b/Config.in
@@ -438,7 +438,8 @@ config FEATURE_UTF8_MANIFEST
438 depends on FEATURE_RESOURCES 438 depends on FEATURE_RESOURCES
439 help 439 help
440 Include a manifest which sets the process code page to UTF-8. 440 Include a manifest which sets the process code page to UTF-8.
441 Users who enable this may also wish to enable FEATURE_UTF8_INPUT. 441 Users who enable this may also wish to enable FEATURE_UTF8_INPUT
442 and/or FEATURE_UTF8_OUTPUT.
442 443
443config FEATURE_ICON 444config FEATURE_ICON
444 bool "Include application icon in binary" 445 bool "Include application icon in binary"
@@ -483,6 +484,16 @@ config FEATURE_UTF8_INPUT
483 This may be useful in conjunction with the UTF8 manifest which 484 This may be useful in conjunction with the UTF8 manifest which
484 is supported in Window 10 and 11. 485 is supported in Window 10 and 11.
485 486
487config FEATURE_UTF8_OUTPUT
488 bool "Allow UTF8 console output"
489 default n
490 depends on PLATFORM_MINGW32 && FEATURE_UTF8_MANIFEST
491 help
492 Print UTF8 output correctly even if the console (output) codepage
493 is not UTF8.
494 This may be useful in conjunction with the UTF8 manifest which
495 is supported in Windows 10 and 11.
496
486config TERMINAL_MODE 497config TERMINAL_MODE
487 int "Default setting for terminal mode" 498 int "Default setting for terminal mode"
488 default 5 499 default 5
diff --git a/configs/mingw32_defconfig b/configs/mingw32_defconfig
index 88b974bd1..d3f99f222 100644
--- a/configs/mingw32_defconfig
+++ b/configs/mingw32_defconfig
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y
55CONFIG_FEATURE_ICON_ALL=y 55CONFIG_FEATURE_ICON_ALL=y
56CONFIG_FEATURE_EURO=y 56CONFIG_FEATURE_EURO=y
57# CONFIG_FEATURE_UTF8_INPUT is not set 57# CONFIG_FEATURE_UTF8_INPUT is not set
58# CONFIG_FEATURE_UTF8_OUTPUT is not set
58CONFIG_TERMINAL_MODE=5 59CONFIG_TERMINAL_MODE=5
59CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y 60CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y
60CONFIG_FEATURE_EXTRA_FILE_DATA=y 61CONFIG_FEATURE_EXTRA_FILE_DATA=y
diff --git a/configs/mingw64_defconfig b/configs/mingw64_defconfig
index d52df9c48..19b66046f 100644
--- a/configs/mingw64_defconfig
+++ b/configs/mingw64_defconfig
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y
55CONFIG_FEATURE_ICON_ALL=y 55CONFIG_FEATURE_ICON_ALL=y
56CONFIG_FEATURE_EURO=y 56CONFIG_FEATURE_EURO=y
57# CONFIG_FEATURE_UTF8_INPUT is not set 57# CONFIG_FEATURE_UTF8_INPUT is not set
58# CONFIG_FEATURE_UTF8_OUTPUT is not set
58CONFIG_TERMINAL_MODE=5 59CONFIG_TERMINAL_MODE=5
59CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y 60CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y
60CONFIG_FEATURE_EXTRA_FILE_DATA=y 61CONFIG_FEATURE_EXTRA_FILE_DATA=y
diff --git a/configs/mingw64u_defconfig b/configs/mingw64u_defconfig
index 95261b149..876d7162b 100644
--- a/configs/mingw64u_defconfig
+++ b/configs/mingw64u_defconfig
@@ -55,6 +55,7 @@ CONFIG_FEATURE_ICON=y
55CONFIG_FEATURE_ICON_ALL=y 55CONFIG_FEATURE_ICON_ALL=y
56CONFIG_FEATURE_EURO=y 56CONFIG_FEATURE_EURO=y
57CONFIG_FEATURE_UTF8_INPUT=y 57CONFIG_FEATURE_UTF8_INPUT=y
58CONFIG_FEATURE_UTF8_OUTPUT=y
58CONFIG_TERMINAL_MODE=5 59CONFIG_TERMINAL_MODE=5
59CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y 60CONFIG_FEATURE_IMPROVED_COLOUR_MAPPING=y
60CONFIG_FEATURE_EXTRA_FILE_DATA=y 61CONFIG_FEATURE_EXTRA_FILE_DATA=y
diff --git a/scripts/mk_mingw64u_defconfig b/scripts/mk_mingw64u_defconfig
index 3cca78e5b..760c55a00 100755
--- a/scripts/mk_mingw64u_defconfig
+++ b/scripts/mk_mingw64u_defconfig
@@ -25,6 +25,7 @@ set_build_opts() {
25set_build_opts \ 25set_build_opts \
26 CONFIG_FEATURE_UTF8_MANIFEST=y \ 26 CONFIG_FEATURE_UTF8_MANIFEST=y \
27 CONFIG_FEATURE_UTF8_INPUT=y \ 27 CONFIG_FEATURE_UTF8_INPUT=y \
28 CONFIG_FEATURE_UTF8_OUTPUT=y \
28 CONFIG_UNICODE_SUPPORT=y \ 29 CONFIG_UNICODE_SUPPORT=y \
29 CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \ 30 CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \
30 CONFIG_SUBST_WCHAR=63 \ 31 CONFIG_SUBST_WCHAR=63 \
diff --git a/win32/winansi.c b/win32/winansi.c
index e6528926e..aaaa2fa50 100644
--- a/win32/winansi.c
+++ b/win32/winansi.c
@@ -1439,6 +1439,92 @@ BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got)
1439} 1439}
1440#endif 1440#endif
1441 1441
#if ENABLE_FEATURE_UTF8_OUTPUT
// Write u8buf as if the console output CP is UTF8 - regardless of the CP.
// fd should be associated with a console output.
// Return: 0 on successful write[s], else -1 (e.g. if fd is not a console).
//
// Up to 3 bytes of an incomplete codepoint may be buffered from prior call[s].
// All the completed codepoints in one call are written using WriteConsoleW.
// Bad sequence of any length (till ASCII7 or UTF8 lead) prints 1 subst wchar.
// The ASCII7 byte or UTF8 lead which ends a bad sequence is itself processed
// normally - also when it cuts an incomplete codepoint short (it is not
// swallowed together with the truncated codepoint).
//
// note: one console is assumed, and the (3 bytes) buffer is shared regardless
//       of the original output stream (stdout/err), or even if the handle is
//       of a different console. This can result in invalid codepoints output
//       if streams are multiplexed mid-codepoint (same as elsewhere?)
static int writeCon_utf8(int fd, const char *u8buf, size_t u8siz)
{
	// decoder state persists between calls so that a codepoint which is
	// split across two writes is still decoded correctly
	static int state = 0;  // -1: bad, 0-3: remaining cp bytes (0: done/new)
	static uint32_t codepoint = 0;  // accumulated from up to 4 UTF8 bytes

	HANDLE h = (HANDLE)_get_osfhandle(fd);
	wchar_t wbuf[256];
	int wlen = 0;

	// ASCII7 uses least logic, then UTF8 continuations, UTF8 lead, errors
	while (u8siz--) {
		unsigned char c = *u8buf++;
		int topbits = 0;  // number of leading 1-bits in c
		int reprocess = 0;  // c truncated a codepoint and is a valid start

		while (c & (0x80 >> topbits))
			++topbits;

 process_byte:
		if (state == 0 && topbits == 0) {
			// valid ASCII7, state remains 0
			codepoint = c;

		} else if (state > 0 && topbits == 1) {
			// valid continuation byte
			codepoint = (codepoint << 6) | (c & 0x3f);
			if (--state)
				continue;

		} else if (state == 0 && topbits >= 2 && topbits <= 4) {
			// valid UTF8 lead of 2/3/4 bytes codepoint
			codepoint = c & (0x7f >> topbits);
			state = topbits - 1;  // remaining bytes after lead
			continue;

		} else if (state >= 0) {
			// invalid byte at state 0/1/2/3, add placeholder once.
			// at state > 0 continuations were handled above, so c is
			// ASCII7, a lead, or an invalid lead (topbits > 4). the
			// first two are valid at state 0 and must not be lost:
			// emit the placeholder for the truncated codepoint, then
			// process c again with clean slate.
			reprocess = state > 0 && topbits <= 4;
			codepoint = CONFIG_SUBST_WCHAR;
			state = -1;

		} else {
			// inside bad sequence (placeholder char already added)
			if (topbits == 1 || topbits > 4)
				continue;  // still bad
			// c is valid for state 0, process it with clean slate
			state = 0;
			goto process_byte;
		}

		// codepoint is complete
		// we don't reject surrogate halves, reserved, etc
		if (codepoint < 0x10000) {
			wbuf[wlen++] = codepoint;
		} else {
			// generate a surrogates pair (wbuf has room for 2+)
			codepoint -= 0x10000;
			wbuf[wlen++] = 0xd800 | (codepoint >> 10);
			wbuf[wlen++] = 0xdc00 | (codepoint & 0x3ff);
		}

		// flush if we have less than two empty spaces
		if (wlen > ARRAY_SIZE(wbuf) - 2) {
			if (!WriteConsoleW(h, wbuf, wlen, NULL, NULL))
				return -1;
			wlen = 0;
		}

		if (reprocess) {
			// placeholder is out (wbuf has room again), now handle c
			reprocess = 0;
			state = 0;
			goto process_byte;
		}
	}

	if (wlen && !WriteConsoleW(h, wbuf, wlen, NULL, NULL))
		return -1;
	return 0;
}
#endif
1527
1442void console_write(const char *str, int len) 1528void console_write(const char *str, int len)
1443{ 1529{
1444 char *buf = xmemdup(str, len); 1530 char *buf = xmemdup(str, len);
@@ -1468,7 +1554,12 @@ void console_write(const char *str, int len)
// Write siz bytes of buf to a console stream.
// With FEATURE_UTF8_OUTPUT and a non-UTF8 console output codepage, the bytes
// go through writeCon_utf8; in that build, a UTF8 console gets buf unmodified
// via fwrite. Without the feature, buf is first converted in place by
// charToConBuffA and then written via fwrite.
// returns 0 on success, EOF on error
static int conv_fwriteCon(FILE *stream, char *buf, size_t siz)
{
#if ENABLE_FEATURE_UTF8_OUTPUT
	if (GetConsoleOutputCP() != CP_UTF8) {
		if (writeCon_utf8(fileno(stream), buf, siz))
			return EOF;
		return 0;
	}
#else
	charToConBuffA(buf, siz);
#endif
	if (fwrite(buf, 1, siz, stream) < siz)
		return EOF;
	return 0;
}
1474 1565
@@ -1476,6 +1567,11 @@ static int conv_fwriteCon(FILE *stream, char *buf, size_t siz)
// Write siz bytes of buf to a console file descriptor.
// With FEATURE_UTF8_OUTPUT and a non-UTF8 console output codepage, the bytes
// go through writeCon_utf8 and siz is reported as written; in that build, a
// UTF8 console gets buf unmodified via write(). Without the feature, buf is
// first converted in place by charToConBuffA and then written via write().
// returns actually-written bytes on success, -1 on error
static int conv_writeCon(int fd, char *buf, size_t siz)
{
#if ENABLE_FEATURE_UTF8_OUTPUT
	if (GetConsoleOutputCP() != CP_UTF8) {
		if (writeCon_utf8(fd, buf, siz))
			return -1;
		return siz;
	}
#else
	charToConBuffA(buf, siz);
#endif
	return write(fd, buf, siz);
}