From 4fe7e7cdd0441e9455cc93c17b40a7a96704e61f Mon Sep 17 00:00:00 2001
From: "Avi Halachmi (:avih)" <avihpit@yahoo.com>
Date: Thu, 20 Jul 2023 22:48:41 +0300
Subject: win32: UTF8 input: improve missing-key-down hack

The UTF8 input code works around an issue when pasting at the
Windows console (but not Windows Terminal), where we sometimes get a
key-up without a prior matching key-down - in which case it generates
a key-down.

However, previously it detected this by comparing an up-event to the
last down-event, which could result in a false positive in cases like:
  X-down Y-down X-up Y-up (e.g. when typing quickly).

Now it remembers the last 8 key-down events when searching a prior
matching key-down, which fixes an issue of incorrect repeated keys
(in the example above Y-up was incorrectly changed to Y-down).
---
 win32/winansi.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 11 deletions(-)

diff --git a/win32/winansi.c b/win32/winansi.c
index bc3e69163..f280177e6 100644
--- a/win32/winansi.c
+++ b/win32/winansi.c
@@ -1284,6 +1284,44 @@ static void maybeEatUpto2ndHalfUp(HANDLE h, DWORD *ph1)
 	}
 }
 
+// if the codepoint is a key-down event, remember it, else if
+// it's a key-up event with matching prior down - forget the down,
+// else (up without matching prior key-down) - change it to down.
+// We remember few prior key-down events so that a sequence
+// like X-down Y-down X-up Y-up won't trigger this hack for Y-up.
+// When up is changed into down there won't be further key-up event,
+// but that's OK because the caller ignores key-up events anyway.
+static void maybe_change_up_to_down(wchar_t key, BOOL *isdown)
+{
+	#define DOWN_BUF_SIZ 8
+	static wchar_t downbuf[DOWN_BUF_SIZ] = {0};
+	static int pos = 0;
+
+	if (*isdown) {
+		downbuf[pos++] = key;
+		pos = pos % DOWN_BUF_SIZ;
+		return;
+	}
+
+	// the missing-key-down issue was only observed with unicode values,
+	// so limit this hack to non-ASCII-7 values.
+	// also, launching a new shell/read process from CLI captures
+	// an ENTER-up event without prior down at this new process, which
+	// would otherwise change it to down - creating a wrong ENTER keypress.
+	if (key <= 127)
+		return;
+
+	// key up, try to match a prior down
+	for (int i = 0; i < DOWN_BUF_SIZ; ++i) {
+		if (downbuf[i] == key) {
+			downbuf[i] = 0;  // "forget" this down
+			return;
+		}
+	}
+
+	// no prior key-down - replace the up with down
+	*isdown = TRUE;
+}
 
 /*
  * readConsoleInput_utf8 behaves similar enough to ReadConsoleInputA when
@@ -1355,20 +1393,18 @@ BOOL readConsoleInput_utf8(HANDLE h, INPUT_RECORD *r, DWORD len, DWORD *got)
 		srec = *r;
 		codepoint = srec.Event.KeyEvent.uChar.UnicodeChar;
 
-		// At the cmd.exe console (but not windows terminal) we sometimes
-		// get key-up without the prior expected key-down event, sometimes
-		// with UnicodeChar of 0 instead the key-down event. work around it.
-		if (codepoint) {
-			static wchar_t last_down = 0;
-
-			if (srec.Event.KeyEvent.bKeyDown)
-				last_down = codepoint;
-			else if (codepoint > 127 && codepoint != last_down)
-				srec.Event.KeyEvent.bKeyDown = TRUE;
-		}
+		// Observed when pasting unicode at cmd.exe console (but not
+		// windows terminal), we sometimes get key-up event without
+		// a prior matching key-down (or with key-down codepoint 0),
+		// so this call would change the up into down in such case.
+		// E.g. pastes fixed by this hack: U+1F600 "😀", or U+0C80 "ಀ"
+		if (codepoint)
+			maybe_change_up_to_down(codepoint, &srec.Event.KeyEvent.bKeyDown);
 
 		// if it's a 1st (high) surrogate pair half, try to eat upto and
 		// excluding the 2nd (low) half, and combine them into codepoint.
+		// this does not interfere with the missing-key-down workaround
+		// (no issue if the down-buffer has 1st-half-down without up).
 		if (codepoint >= 0xD800 && codepoint <= 0xDBFF)
 			maybeEatUpto2ndHalfUp(h, &codepoint);
 
-- 
cgit v1.2.3-55-g6feb


From 0efc74740ebc0d98af79ba4a5dfa73bfb5db3df0 Mon Sep 17 00:00:00 2001
From: "Avi Halachmi (:avih)" <avihpit@yahoo.com>
Date: Tue, 27 Jun 2023 14:41:47 +0300
Subject: win32: support build with FEATURE_UNICODE_SUPPORT

FEATURE_UTF8_MANIFEST enables Unicode args and filenames on Win 10+.

FEATURE_UTF8_INPUT allows the shell prompt to digest correctly
Unicode strings (as UTF8) which are typed or pasted.

This commit adds support for building with FEATURE_UNICODE_SUPPORT
(mostly by supporting 32 bit wchar_t which busybox expects):

- Unicode-aware line-edit - for the most part cursor movement/del
  being (UTF8) codepoint-aware rather than assuming that one-byte
  equals one-char-on-screen.

- Codepoint-aware operations in some other utils, like rev or wc -c.

- When UNICODE_COMBINING_WCHARS and UNICODE_WIDE_WCHARS are enabled,
  some screen-width-aware operations, like with fold, ls, expand, etc.

The busybox Unicode support is incomplete, and even less so with the
builtin libc replacement functions, like wcwidth, which are active
when UNICODE_USING_LOCALE is unset (mingw lacks those functions).

FEATURE_CHECK_UNICODE_IN_ENV should be set so that Unicode is not
hardcoded but rather depends on the ANSI codepage and some env vars:
LC_ALL=C disables Unicode support, else it's enabled if ACP is UTF8.

There's at least one known issue where the tab-completion-prefix-case
is not updated correctly, e.g. ~/desk<tab> completes to ~/desktop/
instead of ~/Desktop/, because the code which handles it exists
only at the non-unicode code paths, but that's not very critical.

That seems to be the only case where mingw-specific code is disabled
when Unicode is enabled, but there could be other unknown issues.

None of the Unicode options is enabled by default, and the next
commit will make it easier to create a build which supports Unicode.
---
 include/mingw.h   | 12 ++++++++++++
 include/unicode.h | 15 +++++++++++++++
 libbb/lineedit.c  | 28 +++++++++++++++++++++++++++-
 libbb/unicode.c   |  6 ++++++
 win32/mingw.c     | 14 ++++++++++++++
 5 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/include/mingw.h b/include/mingw.h
index 232ffadd7..97db2f6a9 100644
--- a/include/mingw.h
+++ b/include/mingw.h
@@ -586,6 +586,18 @@ char *alloc_ext_space(const char *path);
 int add_win32_extension(char *p);
 char *file_is_win32_exe(const char *name);
 
+#if ENABLE_UNICODE_SUPPORT
+/*
+ * windows wchar_t is 16 bit, while linux (and busybox expectation) is 32.
+ * so when (busybox) unicode.h is included, wchar_t is 32 bit.
+ * Without unicode.h, MINGW_BB_WCHAR_T is busybox wide char (32),
+ * and wchar_t is Windows wide char (16).
+ */
+#define MINGW_BB_WCHAR_T uint32_t  /* keep in sync with unicode.h */
+
+MINGW_BB_WCHAR_T *bs_to_slash_u(MINGW_BB_WCHAR_T *p) FAST_FUNC;
+#endif
+
 char *bs_to_slash(char *p) FAST_FUNC;
 void slash_to_bs(char *p) FAST_FUNC;
 size_t remove_cr(char *p, size_t len) FAST_FUNC;
diff --git a/include/unicode.h b/include/unicode.h
index 0317a2151..e894f7148 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -87,6 +87,21 @@ void reinit_unicode(const char *LANG) FAST_FUNC;
 #  undef MB_CUR_MAX
 #  define MB_CUR_MAX 6
 
+#if ENABLE_PLATFORM_MINGW32
+  #undef wint_t
+  #undef mbstate_t
+  #undef mbstowcs
+  #undef wcstombs
+  #undef wcrtomb
+  #undef iswspace
+  #undef iswalnum
+  #undef iswpunct
+  #undef wcwidth
+
+  #undef wchar_t
+  #define wchar_t uint32_t  /* keep in sync with MINGW_BB_WCHAR_T */
+#endif
+
 /* Prevent name collisions */
 #  define wint_t    bb_wint_t
 #  define mbstate_t bb_mbstate_t
diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index a6884c7e0..1fb8919bb 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -726,8 +726,19 @@ static void input_forward(void)
 #if !ENABLE_PLATFORM_MINGW32
 		put_cur_glyph_and_inc_cursor();
 #else
+	/*
+	 * inc_cursor improves forward cursor movement appearance on
+	 * win 7/8 console, but it's broken with unicode wide-glyphs,
+	 * e.g. paste and move forward over: echo 开开心心过每一天
+	 * so disable inc_corsor when unicode is active (which is only
+	 * windows 10+, where inc_cursor is not needed anyway).
+	 *
+	 * FIXME: the VT_INPUT condition is not required, because other
+	 * than the wide-glyphs issue, inc_cursor works correctly
+	 * regardless of the VT mode.
+	 */
 	{
-		if (terminal_mode(FALSE) & VT_INPUT)
+		if (terminal_mode(FALSE) & VT_INPUT || unicode_status == UNICODE_ON)
 			put_cur_glyph_and_inc_cursor();
 		else
 			inc_cursor();
@@ -770,6 +781,11 @@ static void add_match(char *matched, int sensitive)
 		 || (!ENABLE_UNICODE_SUPPORT && *p >= 0x7f)
 		 || (ENABLE_UNICODE_SUPPORT && *p == 0x7f)
 # else
+		/*
+		 * on Windows, *p > 0x7f is never control:
+		 * without unicode active: these are normal codepage chars.
+		 * with unicode active: these are UTF8 continuation bytes.
+		 */
 		 || *p == 0x7f
 # endif
 		) {
@@ -1318,6 +1334,12 @@ static NOINLINE void input_tab(smallint *lastWasTab)
 # if ENABLE_PLATFORM_MINGW32
 	int chosen_index = 0;
 	int chosen_sens = FALSE;
+	/*
+	 * FIXME: the next three vars are unused with ENABLE_UNICODE_SUPPORT
+	 * because the mingw code which uses them to update a tab-completion
+	 * prefix to the correct case (e.g. ~/desk<tab> to ~/Desktop/) is
+	 * not compiled, and so e.g. ~/desk<tab> completes to ~/desktop/ .
+	 */
 	unsigned orig_pfx_len;
 	char *target;
 	const char *source;
@@ -2803,7 +2825,11 @@ int FAST_FUNC read_line_input(line_input_t *st, const char *prompt, char *comman
 #if ENABLE_PLATFORM_MINGW32
 		case CTRL('Z'):
 			command_ps[command_len] = '\0';
+		#if ENABLE_UNICODE_SUPPORT
+			bs_to_slash_u(command_ps);
+		#else
 			bs_to_slash(command_ps);
+		#endif
 			redraw(cmdedit_y, 0);
 			break;
 #endif
diff --git a/libbb/unicode.c b/libbb/unicode.c
index e98cbbf35..638c3b7c3 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -69,8 +69,14 @@ void FAST_FUNC init_unicode(void)
 void FAST_FUNC reinit_unicode(const char *LANG)
 {
 	unicode_status = UNICODE_OFF;
+#if ENABLE_PLATFORM_MINGW32
+	/* enable unicode only when ACP is UTF8 and the env var is not 'C' */
+	if (GetACP() != CP_UTF8 || (LANG && LANG[0] == 'C' && LANG[1] == 0))
+		return;
+#else
 	if (!LANG || !(strstr(LANG, ".utf") || strstr(LANG, ".UTF")))
 		return;
+#endif
 	unicode_status = UNICODE_ON;
 }
 
diff --git a/win32/mingw.c b/win32/mingw.c
index 5e9c71226..dabb2a2e7 100644
--- a/win32/mingw.c
+++ b/win32/mingw.c
@@ -2119,6 +2119,20 @@ char * FAST_FUNC bs_to_slash(char *str)
 	return str;
 }
 
+#if ENABLE_UNICODE_SUPPORT
+MINGW_BB_WCHAR_T * FAST_FUNC bs_to_slash_u(MINGW_BB_WCHAR_T *str)
+{
+	MINGW_BB_WCHAR_T *p;
+
+	for (p=str; *p; ++p) {
+		if ( *p == '\\' ) {
+			*p = '/';
+		}
+	}
+	return str;
+}
+#endif
+
 void FAST_FUNC slash_to_bs(char *p)
 {
 	for (; *p; ++p) {
-- 
cgit v1.2.3-55-g6feb


From 7aa64d90542cfc8f01f52ece38bf751c7b843875 Mon Sep 17 00:00:00 2001
From: "Avi Halachmi (:avih)" <avihpit@yahoo.com>
Date: Sun, 16 Jul 2023 21:29:08 +0300
Subject: win32: add script to create mingw unicode config

Run ./scripts/mk_mingw64u_defconfig to create (or update)
configs/mingw64u_defconfig from configs/mingw64_defconfig while
enabling UTF8 manifest, UTF8 input, and unicode editing.
---
 scripts/mk_mingw64u_defconfig | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100755 scripts/mk_mingw64u_defconfig

diff --git a/scripts/mk_mingw64u_defconfig b/scripts/mk_mingw64u_defconfig
new file mode 100755
index 000000000..3cca78e5b
--- /dev/null
+++ b/scripts/mk_mingw64u_defconfig
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+configs=$(dirname -- "$0")/../configs
+
+# replace each FOO=bar argument with -e 's/.*FOO.*/FOO=bar/', then sed "$@"
+set_build_opts() {
+    for v; do
+        set -- "$@" -e "s/.*${v%%=*}.*/$v/"
+        shift
+    done
+    sed "$@"
+}
+
+
+# Create unicode configs/mingw64u_defconfig from configs/mingw64_defconfig
+# by flipping some build options to enable:
+# - UTF8 manifest to support unicode on win 10 (filenames, etc).
+# - UTF8 terminal input (shell prompt, read).
+# - UTF8 editing - codepoint awareness (prompt, read):
+#   - Builtin libc unicode functions (mbstowcs etc - no UNICODE_USING_LOCALE).
+#   - Dynamic unicode based on ANSI codepage and ENV (CHECK_UNICODE_IN_ENV).
+#   - Screen-width awareness (COMBINING_WCHARS, WIDE_WCHARS)
+#   - Full unicode range (U+10FFFF - LAST_SUPPORTED_WCHAR=1114111)
+
+set_build_opts \
+    CONFIG_FEATURE_UTF8_MANIFEST=y \
+    CONFIG_FEATURE_UTF8_INPUT=y \
+    CONFIG_UNICODE_SUPPORT=y \
+    CONFIG_FEATURE_CHECK_UNICODE_IN_ENV=y \
+    CONFIG_SUBST_WCHAR=63 \
+    CONFIG_LAST_SUPPORTED_WCHAR=1114111 \
+    CONFIG_UNICODE_COMBINING_WCHARS=y \
+    CONFIG_UNICODE_WIDE_WCHARS=y \
+    < "$configs"/mingw64_defconfig \
+    > "$configs"/mingw64u_defconfig
-- 
cgit v1.2.3-55-g6feb


From 9c0a4d8cd6d4abee680909ca5db7575012a6815b Mon Sep 17 00:00:00 2001
From: "Avi Halachmi (:avih)" <avihpit@yahoo.com>
Date: Thu, 20 Jul 2023 23:15:43 +0300
Subject: win32: use inc_cursor regardless of VT mode

Commit 8ade494 added VT input support and, among other things,
disabled inc_cursor (in favor of the upstream busybox code) when
the terminal has VT input enabled.

However, inc_cursor works correctly regardless of the VT mode,
and that condition was not required.

Revert this condition (but still disable inc_cursor with unicode
because it handles wide-glyphs incorrectly).
---
 libbb/lineedit.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index 1fb8919bb..54f0edef0 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -730,15 +730,11 @@ static void input_forward(void)
 	 * inc_cursor improves forward cursor movement appearance on
 	 * win 7/8 console, but it's broken with unicode wide-glyphs,
 	 * e.g. paste and move forward over: echo 开开心心过每一天
-	 * so disable inc_corsor when unicode is active (which is only
+	 * so disable inc_cursor when unicode is active (which is only
 	 * windows 10+, where inc_cursor is not needed anyway).
-	 *
-	 * FIXME: the VT_INPUT condition is not required, because other
-	 * than the wide-glyphs issue, inc_cursor works correctly
-	 * regardless of the VT mode.
 	 */
 	{
-		if (terminal_mode(FALSE) & VT_INPUT || unicode_status == UNICODE_ON)
+		if (unicode_status == UNICODE_ON)
 			put_cur_glyph_and_inc_cursor();
 		else
 			inc_cursor();
-- 
cgit v1.2.3-55-g6feb


From 878b3cd27fe83f2b0ff476b884c34d165be0072c Mon Sep 17 00:00:00 2001
From: "Avi Halachmi (:avih)" <avihpit@yahoo.com>
Date: Tue, 27 Jun 2023 16:42:33 +0300
Subject: unicode: identify emoji width and modifiers

This adds the Emoticons block U+1F600..U+1F64F as double-width
codepoints, and the skin tone modifiers range U+1F3FB..U+1F3FF
as combining codepoints.

The Emoticons variant modifiers U+FE0E and U+FE0F were already in.

It's unclear how to test UNICODE_COMBINING_WCHARS and
UNICODE_WIDE_WCHARS in general and also here specifically,
but at least the data on emoji widths and combining modifiers
now exists.
---
 libbb/unicode.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/libbb/unicode.c b/libbb/unicode.c
index 638c3b7c3..206ec0dcb 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -659,6 +659,9 @@ int FAST_FUNC wcwidth(unsigned ucs)
 			{ 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
 			{ 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
 			{ 0xD242, 0xD244 }
+#if ENABLE_PLATFORM_MINGW32
+			, { 0xF3FB, 0xF3FF }
+#endif
 		};
 		/* Binary search in table of non-spacing characters in Supplementary Multilingual Plane */
 		if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
@@ -695,6 +698,11 @@ int FAST_FUNC wcwidth(unsigned ucs)
 		|| (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
 		|| (ucs >= 0xffe0 && ucs <= 0xffe6)
 #   endif
+#if ENABLE_PLATFORM_MINGW32
+#   if CONFIG_LAST_SUPPORTED_WCHAR >= 0x10000
+		|| (ucs >= 0x1f600 && ucs <= 0x1f64f) /* Emoticons */
+#   endif
+#endif
 #   if CONFIG_LAST_SUPPORTED_WCHAR >= 0x20000
 		|| ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */
 #   endif
-- 
cgit v1.2.3-55-g6feb