feat(terminal): also accept codepoint integers for width check (branch: feat/unicode-width)

Lua's utf8 functions return codepoints, so it makes sense to accept those directly, instead of having to convert to a UTF-8 string and back again.
author: Thijs Schreijer <thijs@thijsschreijer.nl> 2025-02-06 16:52:08 +0100
committer: Thijs <thijs@thijsschreijer.nl> 2025-02-06 21:10:21 +0100
commit: a1d933aae61420364685051dbc5b318a527fef1d (patch)
tree: 771c9ea7e694f6c18d76e76e5a7160f371453db1
parent: 3c1fdbcc844a55f94dde41591f487ded73eab012 (diff)
download: luasystem-feat/unicode-width.tar.gz
luasystem-feat/unicode-width.tar.bz2
luasystem-feat/unicode-width.zip
3 files changed, 59 insertions, 15 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ae7189..25114c3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,8 @@ The scope of what is covered by the version number excludes:
 ### unreleased
+- Feat: when detecting character display width, also accept unicode codepoints (integers),
+  since the Lua utf8 library returns codepoints, not strings
 - Fix: NetBSD fix compilation, undeclared directives
 ### version 0.4.5, released 18-Dec-2024
diff --git a/spec/04-term_spec.lua b/spec/04-term_spec.lua
index 813947a..907f903 100644
--- a/spec/04-term_spec.lua
+++ b/spec/04-term_spec.lua
@@ -511,11 +511,18 @@ describe("Terminal:", function()
  describe("utf8cwidth()", function()
+    -- utf-8 strings
    local ch1 = string.char(226, 130, 172)       -- "€"   single
    local ch2 = string.char(240, 159, 154, 128)  -- "🚀"  double
    local ch3 = string.char(228, 189, 160)       -- "你"  double
    local ch4 = string.char(229, 165, 189)       -- "好"  double
+    -- unicode codepoints
+    local cp1 = 8364    -- "€"   single
+    local cp2 = 128640  -- "🚀"  double
+    local cp3 = 20320   -- "你"  double
+    local cp4 = 22909   -- "好"  double
    it("handles zero width characters", function()
      assert.same({0}, {system.utf8cwidth("")}) -- empty string returns 0-size
      assert.same({nil, 'Character width determination failed'}, {system.utf8cwidth("\a")})  -- bell character
@@ -539,6 +546,24 @@ describe("Terminal:", function()
      assert.same({2}, {system.utf8cwidth(ch2 .. ch3 .. ch4)})
    end)
+    it("handles integer codepoints", function()
+      assert.same({1}, {system.utf8cwidth(cp1)})
+      assert.same({2}, {system.utf8cwidth(cp2)})
+      assert.same({2}, {system.utf8cwidth(cp3)})
+      assert.same({2}, {system.utf8cwidth(cp4)})
+    end)
+    it("returns an error on bad argument", function()
+      assert.has.error(function()
+        system.utf8cwidth(true)
+      end, "bad argument #1 to 'utf8cwidth' (Expected UTF-8-string or codepoint-integer as first argument)")
+    end)
+    it("returns an error on bad unicode values", function()
+      assert.same({nil, "Invalid Unicode codepoint"}, {system.utf8cwidth(-10)})
+      assert.same({nil, "Invalid Unicode codepoint"}, {system.utf8cwidth(999999999999)})
+    end)
  end)
diff --git a/src/term.c b/src/term.c
index 8c2b87a..2375080 100644
--- a/src/term.c
+++ b/src/term.c
@@ -953,30 +953,47 @@ int utf8_to_wchar(const char *utf8, size_t len, mk_wchar_t *codepoint) {
 /***
 Get the width of a utf8 character for terminal display.
 @function utf8cwidth
-@tparam string utf8_char the utf8 character to check, only the width of the first character will be returned
+@tparam string|int utf8_char the utf8 character, or unicode codepoint, to check, only the width of the first character will be returned
 @treturn[1] int the display width in columns of the first character in the string (0 for an empty string)
 @treturn[2] nil
 @treturn[2] string error message
 */
 int lst_utf8cwidth(lua_State *L) {
-    const char *utf8_char;
-    size_t utf8_len;
-    utf8_char = luaL_checklstring(L, 1, &utf8_len);
    int width = 0;
    mk_wchar_t wc;
-    if (utf8_len == 0) {
+    if (lua_type(L, 1) == LUA_TSTRING) {
-        lua_pushinteger(L, 0);
+        // Handle UTF8 as string input
-        return 1;
+        const char *utf8_char;
-    }
+        size_t utf8_len;
+        utf8_char = luaL_checklstring(L, 1, &utf8_len);
-    // Convert the UTF-8 string to a wide character
+        if (utf8_len == 0) {
-    int bytes_processed = utf8_to_wchar(utf8_char, utf8_len, &wc);
+            lua_pushinteger(L, 0);
-    if (bytes_processed == -1) {
+            return 1;
-        lua_pushnil(L);
+        }
-        lua_pushstring(L, "Invalid UTF-8 character");
-        return 2;
+        // Convert the UTF-8 string to a wide character
+        int bytes_processed = utf8_to_wchar(utf8_char, utf8_len, &wc);
+        if (bytes_processed == -1) {
+            lua_pushnil(L);
+            lua_pushstring(L, "Invalid UTF-8 character");
+            return 2;
+        }
+    } else if (lua_type(L, 1) == LUA_TNUMBER) {
+        // Handle codepoint input
+        int codepoint = luaL_checkinteger(L, 1);
+        if (codepoint < 0 || codepoint > 0x10FFFF) {
+            lua_pushnil(L);
+            lua_pushstring(L, "Invalid Unicode codepoint");
+            return 2;
+        }
+        wc = (mk_wchar_t)codepoint;
+    } else {
+        return luaL_argerror(L, 1, "Expected UTF-8-string or codepoint-integer as first argument");
    }
    // Get the width of the wide character
author	Thijs Schreijer <thijs@thijsschreijer.nl>	2025-02-06 16:52:08 +0100
committer	Thijs <thijs@thijsschreijer.nl>	2025-02-06 21:10:21 +0100
commit	a1d933aae61420364685051dbc5b318a527fef1d (patch)
tree	771c9ea7e694f6c18d76e76e5a7160f371453db1
parent	3c1fdbcc844a55f94dde41591f487ded73eab012 (diff)
download	luasystem-feat/unicode-width.tar.gz luasystem-feat/unicode-width.tar.bz2 luasystem-feat/unicode-width.zip

diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ae7189..25114c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md
@@ -30,6 +30,8 @@ The scope of what is covered by the version number excludes:
30		30
31	### unreleased	31	### unreleased
32		32
		33	- Feat: when detecting character display width, also accept unicode codepoints (integers),
		34	since the Lua utf8 library returns codepoints, not strings
33	- Fix: NetBSD fix compilation, undeclared directives	35	- Fix: NetBSD fix compilation, undeclared directives
34		36
35	### version 0.4.5, released 18-Dec-2024	37	### version 0.4.5, released 18-Dec-2024


diff --git a/spec/04-term_spec.lua b/spec/04-term_spec.lua index 813947a..907f903 100644 --- a/spec/04-term_spec.lua +++ b/spec/04-term_spec.lua
@@ -511,11 +511,18 @@ describe("Terminal:", function()
511		511
512	describe("utf8cwidth()", function()	512	describe("utf8cwidth()", function()
513		513
		514	-- utf-8 strings
514	local ch1 = string.char(226, 130, 172) -- "€" single	515	local ch1 = string.char(226, 130, 172) -- "€" single
515	local ch2 = string.char(240, 159, 154, 128) -- "🚀" double	516	local ch2 = string.char(240, 159, 154, 128) -- "🚀" double
516	local ch3 = string.char(228, 189, 160) -- "你" double	517	local ch3 = string.char(228, 189, 160) -- "你" double
517	local ch4 = string.char(229, 165, 189) -- "好" double	518	local ch4 = string.char(229, 165, 189) -- "好" double
518		519
		520	-- unicode codepoints
		521	local cp1 = 8364 -- "€" single
		522	local cp2 = 128640 -- "🚀" double
		523	local cp3 = 20320 -- "你" double
		524	local cp4 = 22909 -- "好" double
		525
519	it("handles zero width characters", function()	526	it("handles zero width characters", function()
520	assert.same({0}, {system.utf8cwidth("")}) -- empty string returns 0-size	527	assert.same({0}, {system.utf8cwidth("")}) -- empty string returns 0-size
521	assert.same({nil, 'Character width determination failed'}, {system.utf8cwidth("\a")}) -- bell character	528	assert.same({nil, 'Character width determination failed'}, {system.utf8cwidth("\a")}) -- bell character
@@ -539,6 +546,24 @@ describe("Terminal:", function()
539	assert.same({2}, {system.utf8cwidth(ch2 .. ch3 .. ch4)})	546	assert.same({2}, {system.utf8cwidth(ch2 .. ch3 .. ch4)})
540	end)	547	end)
541		548
		549	it("handles integer codepoints", function()
		550	assert.same({1}, {system.utf8cwidth(cp1)})
		551	assert.same({2}, {system.utf8cwidth(cp2)})
		552	assert.same({2}, {system.utf8cwidth(cp3)})
		553	assert.same({2}, {system.utf8cwidth(cp4)})
		554	end)
		555
		556	it("returns an error on bad argument", function()
		557	assert.has.error(function()
		558	system.utf8cwidth(true)
		559	end, "bad argument #1 to 'utf8cwidth' (Expected UTF-8-string or codepoint-integer as first argument)")
		560	end)
		561
		562	it("returns an error on bad unicode values", function()
		563	assert.same({nil, "Invalid Unicode codepoint"}, {system.utf8cwidth(-10)})
		564	assert.same({nil, "Invalid Unicode codepoint"}, {system.utf8cwidth(999999999999)})
		565	end)
		566
542	end)	567	end)
543		568
544		569


diff --git a/src/term.c b/src/term.c index 8c2b87a..2375080 100644 --- a/src/term.c +++ b/src/term.c
@@ -953,30 +953,47 @@ int utf8_to_wchar(const char *utf8, size_t len, mk_wchar_t *codepoint) {
953	/***	953	/***
954	Get the width of a utf8 character for terminal display.	954	Get the width of a utf8 character for terminal display.
955	@function utf8cwidth	955	@function utf8cwidth
956	@tparam string utf8_char the utf8 character to check, only the width of the first character will be returned	956	@tparam string|int utf8_char the utf8 character, or unicode codepoint, to check, only the width of the first character will be returned
957	@treturn[1] int the display width in columns of the first character in the string (0 for an empty string)	957	@treturn[1] int the display width in columns of the first character in the string (0 for an empty string)
958	@treturn[2] nil	958	@treturn[2] nil
959	@treturn[2] string error message	959	@treturn[2] string error message
960	*/	960	*/
961	int lst_utf8cwidth(lua_State *L) {	961	int lst_utf8cwidth(lua_State *L) {
962	const char *utf8_char;
963	size_t utf8_len;
964	utf8_char = luaL_checklstring(L, 1, &utf8_len);
965	int width = 0;	962	int width = 0;
966
967	mk_wchar_t wc;	963	mk_wchar_t wc;
968		964
969	if (utf8_len == 0) {	965	if (lua_type(L, 1) == LUA_TSTRING) {
970	lua_pushinteger(L, 0);	966	// Handle UTF8 as string input
971	return 1;	967	const char *utf8_char;
972	}	968	size_t utf8_len;
		969	utf8_char = luaL_checklstring(L, 1, &utf8_len);
973		970
974	// Convert the UTF-8 string to a wide character	971	if (utf8_len == 0) {
975	int bytes_processed = utf8_to_wchar(utf8_char, utf8_len, &wc);	972	lua_pushinteger(L, 0);
976	if (bytes_processed == -1) {	973	return 1;
977	lua_pushnil(L);	974	}
978	lua_pushstring(L, "Invalid UTF-8 character");	975
979	return 2;	976	// Convert the UTF-8 string to a wide character
		977	int bytes_processed = utf8_to_wchar(utf8_char, utf8_len, &wc);
		978	if (bytes_processed == -1) {
		979	lua_pushnil(L);
		980	lua_pushstring(L, "Invalid UTF-8 character");
		981	return 2;
		982	}
		983
		984	} else if (lua_type(L, 1) == LUA_TNUMBER) {
		985	// Handle codepoint input
		986	int codepoint = luaL_checkinteger(L, 1);
		987
		988	if (codepoint < 0 || codepoint > 0x10FFFF) {
		989	lua_pushnil(L);
		990	lua_pushstring(L, "Invalid Unicode codepoint");
		991	return 2;
		992	}
		993	wc = (mk_wchar_t)codepoint;
		994
		995	} else {
		996	return luaL_argerror(L, 1, "Expected UTF-8-string or codepoint-integer as first argument");
980	}	997	}
981		998
982	// Get the width of the wide character	999	// Get the width of the wide character