From 1159329b247d6532fecb375e7008aca979261eaa Mon Sep 17 00:00:00 2001 From: Thijs Schreijer Date: Thu, 29 Jan 2026 11:02:33 +0100 Subject: fix(wcwidth): add a generator for width ranges The generator script will parse official unicode data to create the actual ranges for 0, double, and ambiguous width characters. --- src/term.c | 15 +- src/wcwidth.c | 245 +++---------------------- src/wcwidth.h | 7 +- src/wcwidth_ambiguous_width.c | 64 +++++++ src/wcwidth_double_width.c | 45 +++++ src/wcwidth_update.lua | 404 ++++++++++++++++++++++++++++++++++++++++++ src/wcwidth_zero_width.c | 128 +++++++++++++ 7 files changed, 684 insertions(+), 224 deletions(-) create mode 100644 src/wcwidth_ambiguous_width.c create mode 100644 src/wcwidth_double_width.c create mode 100755 src/wcwidth_update.lua create mode 100644 src/wcwidth_zero_width.c (limited to 'src') diff --git a/src/term.c b/src/term.c index a389e06..80998b0 100644 --- a/src/term.c +++ b/src/term.c @@ -1085,6 +1085,7 @@ int utf8_to_wchar(const char *utf8, size_t len, mk_wchar_t *codepoint) { Get the width of a utf8 character for terminal display. @function utf8cwidth @tparam string|int utf8_char the utf8 character, or unicode codepoint, to check, only the width of the first character will be returned +@tparam[opt=1] int ambiguous_width the width to return for ambiguous width characters (usually 1 or 2) @treturn[1] int the display width in columns of the first character in the string (0 for an empty string) @treturn[2] nil @treturn[2] string error message @@ -1093,6 +1094,7 @@ Get the width of a utf8 character for terminal display. 
int lst_utf8cwidth(lua_State *L) { int width = 0; mk_wchar_t wc; + int ambiguous_width = luaL_optinteger(L, 2, 1); if (lua_type(L, 1) == LUA_TSTRING) { // Handle UTF8 as string input @@ -1129,10 +1131,10 @@ int lst_utf8cwidth(lua_State *L) { } // Get the width of the wide character - width = mk_wcwidth(wc); + width = mk_wcwidth(wc, ambiguous_width); if (width == -1) { lua_pushnil(L); - lua_pushstring(L, "Character width determination failed"); + lua_pushstring(L, "Control characters have no width"); return 2; } @@ -1147,6 +1149,7 @@ int lst_utf8cwidth(lua_State *L) { Get the width of a utf8 string for terminal display. @function utf8swidth @tparam string utf8_string the utf8 string to check +@tparam[opt=1] int ambiguous_width the width to return for ambiguous width characters (1 or 2) @treturn[1] int the display width of the string in columns (0 for an empty string) @treturn[2] nil @treturn[2] string error message @@ -1156,6 +1159,10 @@ int lst_utf8swidth(lua_State *L) { const char *utf8_str; size_t utf8_len; utf8_str = luaL_checklstring(L, 1, &utf8_len); + int ambiguous_width = luaL_optinteger(L, 2, 1); + if (ambiguous_width != 1 && ambiguous_width != 2) { + return luaL_argerror(L, 2, "Ambiguous width must be 1 or 2"); + } int total_width = 0; if (utf8_len == 0) { @@ -1175,10 +1182,10 @@ int lst_utf8swidth(lua_State *L) { return 2; } - int width = mk_wcwidth(wc); + int width = mk_wcwidth(wc, ambiguous_width); if (width == -1) { lua_pushnil(L); - lua_pushstring(L, "Character width determination failed"); + lua_pushstring(L, "Control characters have no width"); return 2; } diff --git a/src/wcwidth.c b/src/wcwidth.c index 6032158..ea293c9 100644 --- a/src/wcwidth.c +++ b/src/wcwidth.c @@ -1,57 +1,6 @@ -// This file was modified from the original versions, check "modified:" comments for details -// Character range updates (both the table and the +1 check) were generated using ChatGPT. 
+// This file was modified from the original version by Markus Kuhn -/* - * This is an implementation of wcwidth() and wcswidth() (defined in - * IEEE Std 1002.1-2001) for Unicode. - * - * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html - * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html - * - * In fixed-width output devices, Latin characters all occupy a single - * "cell" position of equal width, whereas ideographic CJK characters - * occupy two such cells. Interoperability between terminal-line - * applications and (teletype-style) character terminals using the - * UTF-8 encoding requires agreement on which character should advance - * the cursor by how many cell positions. No established formal - * standards exist at present on which Unicode character shall occupy - * how many cell positions on character terminals. These routines are - * a first attempt of defining such behavior based on simple rules - * applied to data provided by the Unicode Consortium. - * - * For some graphical characters, the Unicode standard explicitly - * defines a character-cell width via the definition of the East Asian - * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. - * In all these cases, there is no ambiguity about which width a - * terminal shall use. For characters in the East Asian Ambiguous (A) - * class, the width choice depends purely on a preference of backward - * compatibility with either historic CJK or Western practice. - * Choosing single-width for these characters is easy to justify as - * the appropriate long-term solution, as the CJK practice of - * displaying these characters as double-width comes from historic - * implementation simplicity (8-bit encoded characters were displayed - * single-width and 16-bit ones double-width, even for Greek, - * Cyrillic, etc.) and not any typographic considerations. - * - * Much less clear is the choice of width for the Not East Asian - * (Neutral) class. 
Existing practice does not dictate a width for any - * of these characters. It would nevertheless make sense - * typographically to allocate two character cells to characters such - * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be - * represented adequately with a single-width glyph. The following - * routines at present merely assign a single-cell width to all - * neutral characters, in the interest of simplicity. This is not - * entirely satisfactory and should be reconsidered before - * establishing a formal standard in this area. At the moment, the - * decision which Not East Asian (Neutral) characters should be - * represented by double-width glyphs cannot yet be answered by - * applying a simple rule from the Unicode database content. Setting - * up a proper standard for the behavior of UTF-8 character terminals - * will require a careful analysis not only of each Unicode character, - * but also of each presentation form, something the author of these - * routines has avoided to do so far. - * - * http://www.unicode.org/unicode/reports/tr11/ +/* Original copyrights: * * Markus Kuhn -- 2007-05-26 (Unicode 5.0) * @@ -62,7 +11,7 @@ * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c */ -#include "wcwidth.h" // modified: used to define mk_wchar_t +#include "wcwidth.h" struct interval { int first; @@ -70,7 +19,7 @@ struct interval { }; /* auxiliary function for binary search in interval table */ -static int bisearch(mk_wchar_t ucs, const struct interval *table, int max) { // modified: use mk_wchar_t +static int bisearch(mk_wchar_t ucs, const struct interval *table, int max) { int min = 0; int mid; @@ -91,150 +40,23 @@ static int bisearch(mk_wchar_t ucs, const struct interval *table, int max) { // /* The following two functions define the column width of an ISO 10646 - * character as follows: - * - * - The null character (U+0000) has a column width of 0. - * - * - Other C0/C1 control characters and DEL will lead to a return - * value of -1. 
- * - * - Non-spacing and enclosing combining characters (general - * category code Mn or Me in the Unicode database) have a - * column width of 0. + * characters. * - * - SOFT HYPHEN (U+00AD) has a column width of 1. - * - * - Other format characters (general category code Cf in the Unicode - * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. - * - * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) - * have a column width of 0. - * - * - Spacing characters in the East Asian Wide (W) or East Asian - * Full-width (F) category as defined in Unicode Technical - * Report #11 have a column width of 2. - * - * - All remaining characters (including all printable - * ISO 8859-1 and WGL4 characters, Unicode control characters, - * etc.) have a column width of 1. - * - * This implementation assumes that mk_wchar_t characters are encoded - * in ISO 10646. + * @param ucs the Unicode code point to check + * @param ambiguous_width the width to return for ambiguous width characters (1 or 2) + * @return the width of the character, or -1 if the character is a control character */ -int mk_wcwidth(mk_wchar_t ucs) // modified: use mk_wchar_t +int mk_wcwidth(mk_wchar_t ucs, int ambiguous_width) { - /* sorted list of non-overlapping intervals of non-spacing characters */ - /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ - static const struct interval combining[] = { // modified: added new ranges to the list - { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD }, - { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 }, - { 0x05C7, 0x05C7 }, { 0x0600, 0x0605 }, { 0x0610, 0x061A }, - { 0x061C, 0x061C }, { 0x064B, 0x065F }, { 0x0670, 0x0670 }, - { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 }, { 0x06E7, 0x06E8 }, - { 0x06EA, 0x06ED }, { 0x0711, 0x0711 }, { 0x0730, 0x074A }, - { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x07FD, 0x07FD }, - { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 }, - { 0x0829, 0x082D }, { 0x0859, 
0x085B }, { 0x08D3, 0x08E1 }, - { 0x08E3, 0x0903 }, { 0x093A, 0x093C }, { 0x093E, 0x094F }, - { 0x0951, 0x0957 }, { 0x0962, 0x0963 }, { 0x0981, 0x0983 }, - { 0x09BC, 0x09BC }, { 0x09BE, 0x09C4 }, { 0x09C7, 0x09C8 }, - { 0x09CB, 0x09CD }, { 0x09D7, 0x09D7 }, { 0x09E2, 0x09E3 }, - { 0x09FE, 0x09FE }, { 0x0A01, 0x0A03 }, { 0x0A3C, 0x0A3C }, - { 0x0A3E, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, - { 0x0A51, 0x0A51 }, { 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, - { 0x0A81, 0x0A83 }, { 0x0ABC, 0x0ABC }, { 0x0ABE, 0x0AC5 }, - { 0x0AC7, 0x0AC9 }, { 0x0ACB, 0x0ACD }, { 0x0AE2, 0x0AE3 }, - { 0x0AFA, 0x0AFF }, { 0x0B01, 0x0B03 }, { 0x0B3C, 0x0B3C }, - { 0x0B3E, 0x0B44 }, { 0x0B47, 0x0B48 }, { 0x0B4B, 0x0B4D }, - { 0x0B55, 0x0B57 }, { 0x0B62, 0x0B63 }, { 0x0B82, 0x0B82 }, - { 0x0BBE, 0x0BC2 }, { 0x0BC6, 0x0BC8 }, { 0x0BCA, 0x0BCD }, - { 0x0BD7, 0x0BD7 }, { 0x0C00, 0x0C04 }, { 0x0C3E, 0x0C44 }, - { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, - { 0x0C62, 0x0C63 }, { 0x0C81, 0x0C83 }, { 0x0CBC, 0x0CBC }, - { 0x0CBE, 0x0CC4 }, { 0x0CC6, 0x0CC8 }, { 0x0CCA, 0x0CCD }, - { 0x0CD5, 0x0CD6 }, { 0x0CE2, 0x0CE3 }, { 0x0D00, 0x0D03 }, - { 0x0D3B, 0x0D3C }, { 0x0D3E, 0x0D44 }, { 0x0D46, 0x0D48 }, - { 0x0D4A, 0x0D4D }, { 0x0D57, 0x0D57 }, { 0x0D62, 0x0D63 }, - { 0x0D82, 0x0D83 }, { 0x0DCF, 0x0DD4 }, { 0x0DD6, 0x0DD6 }, - { 0x0DD8, 0x0DDF }, { 0x0DF2, 0x0DF3 }, { 0x0E31, 0x0E31 }, - { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, - { 0x0EB4, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, - { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, - { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, - { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, - { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A }, - { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 }, - { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 }, - { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x135D, 0x135F }, - { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 
0x1752, 0x1753 }, - { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD }, - { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD }, - { 0x180B, 0x180E }, { 0x1885, 0x1886 }, { 0x18A9, 0x18A9 }, - { 0x1920, 0x1922 }, { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, - { 0x1939, 0x193B }, { 0x1A17, 0x1A18 }, { 0x1A1B, 0x1A1B }, - { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E }, { 0x1A60, 0x1A60 }, - { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C }, { 0x1A73, 0x1A7C }, - { 0x1A7F, 0x1A7F }, { 0x1AB0, 0x1ACE }, { 0x1B00, 0x1B03 }, - { 0x1B34, 0x1B34 }, { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, - { 0x1B42, 0x1B42 }, { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B82 }, - { 0x1BA1, 0x1BA1 }, { 0x1BA6, 0x1BA7 }, { 0x1BAA, 0x1BAA }, - { 0x1BAB, 0x1BAD }, { 0x1BE6, 0x1BE6 }, { 0x1BE8, 0x1BE9 }, - { 0x1BED, 0x1BED }, { 0x1BEF, 0x1BF1 }, { 0x1C2C, 0x1C33 }, - { 0x1C36, 0x1C37 }, { 0x1CD0, 0x1CD2 }, { 0x1CD4, 0x1CE8 }, - { 0x1CED, 0x1CED }, { 0x1CF4, 0x1CF4 }, { 0x1CF8, 0x1CF9 }, - { 0x1DC0, 0x1DF9 }, { 0x1DFB, 0x1DFF }, { 0x20D0, 0x20DC }, - { 0x20E1, 0x20E1 }, { 0x20E5, 0x20F0 }, { 0x2CEF, 0x2CF1 }, - { 0x2D7F, 0x2D7F }, { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D }, - { 0x3099, 0x309A }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D }, - { 0xA69E, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 }, - { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 }, - { 0xA82C, 0xA82C }, { 0xA8C4, 0xA8C5 }, { 0xA8E0, 0xA8F1 }, - { 0xA8FF, 0xA8FF }, { 0xA926, 0xA92D }, { 0xA947, 0xA951 }, - { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 }, { 0xA9B6, 0xA9B9 }, - { 0xA9BC, 0xA9BD }, { 0xA9E5, 0xA9E5 }, { 0xAA29, 0xAA2E }, - { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 }, - { 0xAA4C, 0xAA4C }, { 0xAA7C, 0xAA7C }, { 0xAAB0, 0xAAB0 }, - { 0xAAB2, 0xAAB4 }, { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF }, - { 0xAAC1, 0xAAC1 }, { 0xAAEB, 0xAAEB }, { 0xAAEE, 0xAAEF }, - { 0xAAF5, 0xAAF6 }, { 0xABE3, 0xABE4 }, { 0xABE6, 0xABE7 }, - { 0xABE9, 0xABEA }, { 0xABEC, 0xABED }, { 0xFB1E, 0xFB1E }, - { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE2F }, { 0x101FD, 
0x101FD }, - { 0x102E0, 0x102E0 }, { 0x10376, 0x1037A }, { 0x10A01, 0x10A03 }, - { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F }, { 0x10A38, 0x10A3A }, - { 0x10A3F, 0x10A3F }, { 0x10AE5, 0x10AE6 }, { 0x10D24, 0x10D27 }, - { 0x10EAB, 0x10EAC }, { 0x10F46, 0x10F50 }, { 0x10F82, 0x10F85 }, - { 0x11000, 0x11002 }, { 0x11038, 0x11046 }, { 0x1107F, 0x11082 }, - { 0x110B0, 0x110BA }, { 0x11100, 0x11102 }, { 0x11127, 0x11134 }, - { 0x11145, 0x11146 }, { 0x11173, 0x11173 }, { 0x11180, 0x11182 }, - { 0x111B3, 0x111C0 }, { 0x111C9, 0x111CC }, { 0x1122C, 0x11237 }, - { 0x1123E, 0x1123E }, { 0x112DF, 0x112EA }, { 0x11300, 0x11303 }, - { 0x1133B, 0x1133C }, { 0x1133E, 0x11344 }, { 0x11347, 0x11348 }, - { 0x1134B, 0x1134D }, { 0x11357, 0x11357 }, { 0x11362, 0x11363 }, - { 0x11435, 0x11446 }, { 0x1145E, 0x1145E }, { 0x114B0, 0x114C3 }, - { 0x115AF, 0x115B5 }, { 0x115B8, 0x115C0 }, { 0x115DC, 0x115DD }, - { 0x11630, 0x11640 }, { 0x116AB, 0x116B7 }, { 0x1171D, 0x1172B }, - { 0x1182C, 0x1183A }, { 0x11930, 0x11935 }, { 0x11937, 0x11938 }, - { 0x1193B, 0x1193E }, { 0x11940, 0x11940 }, { 0x11942, 0x11942 }, - { 0x119D1, 0x119D7 }, { 0x119DA, 0x119E0 }, { 0x11A01, 0x11A0A }, - { 0x11A33, 0x11A39 }, { 0x11A3B, 0x11A3E }, { 0x11A47, 0x11A47 }, - { 0x11A51, 0x11A5B }, { 0x11A8A, 0x11A96 }, { 0x11A98, 0x11A99 }, - { 0x11C30, 0x11C36 }, { 0x11C38, 0x11C3D }, { 0x11C3F, 0x11C3F }, - { 0x11C92, 0x11CA7 }, { 0x11CAA, 0x11CB0 }, { 0x11CB2, 0x11CB3 }, - { 0x11CB5, 0x11CB6 }, { 0x11D31, 0x11D36 }, { 0x11D3A, 0x11D3A }, - { 0x11D3C, 0x11D3D }, { 0x11D3F, 0x11D45 }, { 0x11D47, 0x11D47 }, - { 0x11D90, 0x11D91 }, { 0x11D95, 0x11D95 }, { 0x11D97, 0x11D97 }, - { 0x11EF3, 0x11EF4 }, { 0x13430, 0x13438 }, { 0x16AF0, 0x16AF4 }, - { 0x16B30, 0x16B36 }, { 0x16F4F, 0x16F4F }, { 0x16F8F, 0x16F92 }, - { 0x1BC9D, 0x1BC9E }, { 0x1BCA0, 0x1BCA3 }, { 0x1D167, 0x1D169 }, - { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD }, - { 0x1D242, 0x1D244 }, { 0x1DA00, 0x1DA36 }, { 0x1DA3B, 0x1DA6C }, - { 0x1DA75, 
0x1DA75 }, { 0x1DA84, 0x1DA84 }, { 0x1DA9B, 0x1DA9F }, - { 0x1DAA1, 0x1DAAF }, { 0x1E000, 0x1E006 }, { 0x1E008, 0x1E018 }, - { 0x1E01B, 0x1E021 }, { 0x1E023, 0x1E024 }, { 0x1E026, 0x1E02A }, - { 0x1E130, 0x1E136 }, { 0x1E2AE, 0x1E2AE }, { 0x1E2EC, 0x1E2EF }, - { 0x1E4EC, 0x1E4EF }, { 0x1E8D0, 0x1E8D6 }, { 0x1E944, 0x1E94A }, - { 0x1E947, 0x1E94A }, { 0xE0100, 0xE01EF } + static const struct interval zero_width_ranges[] = { + #include "wcwidth_zero_width.c" + }; + static const struct interval ambiguous_width_ranges[] = { + #include "wcwidth_ambiguous_width.c" + }; + static const struct interval double_width_ranges[] = { + #include "wcwidth_double_width.c" }; /* test for 8-bit control characters */ @@ -244,38 +66,27 @@ int mk_wcwidth(mk_wchar_t ucs) // modified: use mk_wchar_t return -1; /* binary search in table of non-spacing characters */ - if (bisearch(ucs, combining, - sizeof(combining) / sizeof(struct interval) - 1)) + if (bisearch(ucs, zero_width_ranges, + sizeof(zero_width_ranges) / sizeof(struct interval) - 1)) return 0; - /* if we arrive here, ucs is not a combining or C0/C1 control character */ + /* binary search in table of ambiguous width characters */ + if (bisearch(ucs, ambiguous_width_ranges, + sizeof(ambiguous_width_ranges) / sizeof(struct interval) - 1)) + return ambiguous_width; - return 1 + - (ucs >= 0x1100 && - (ucs <= 0x115f || /* Hangul Jamo init. consonants */ - ucs == 0x2329 || ucs == 0x232a || - (ucs >= 0x2e80 && ucs <= 0xa4cf && - ucs != 0x303f) || /* CJK ... 
Yi */ - (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */ - (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */ - (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */ - (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */ - (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */ - (ucs >= 0xffe0 && ucs <= 0xffe6) || - (ucs >= 0x1f300 && ucs <= 0x1f64f) || /* modified: added Emoticons */ - (ucs >= 0x1f680 && ucs <= 0x1f6ff) || /* modified: added Transport and Map Symbols */ - (ucs >= 0x1f900 && ucs <= 0x1f9ff) || /* modified: added Supplemental Symbols and Pictographs */ - (ucs >= 0x20000 && ucs <= 0x2fffd) || - (ucs >= 0x30000 && ucs <= 0x3fffd))); + /* binary search in table of double width characters, default to 1 width */ + return 1 + (bisearch(ucs, double_width_ranges, + sizeof(double_width_ranges) / sizeof(struct interval) - 1)); } -int mk_wcswidth(const mk_wchar_t *pwcs, size_t n) // modified: use mk_wchar_t +int mk_wcswidth(const mk_wchar_t *pwcs, size_t n, int ambiguous_width) { int w, width = 0; for (;*pwcs && n-- > 0; pwcs++) - if ((w = mk_wcwidth(*pwcs)) < 0) + if ((w = mk_wcwidth(*pwcs, ambiguous_width)) < 0) return -1; else width += w; diff --git a/src/wcwidth.h b/src/wcwidth.h index 6cb6f6d..9d345f9 100644 --- a/src/wcwidth.h +++ b/src/wcwidth.h @@ -1,7 +1,8 @@ // wcwidth.h // Windows does not have a wcwidth function, so we use compatibilty code from -// http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c by Markus Kuhn +// http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c by Markus Kuhn, this is +// however heavily modified. 
#ifndef MK_WCWIDTH_H #define MK_WCWIDTH_H @@ -16,7 +17,7 @@ typedef uint32_t mk_wchar_t; // Windows wchar_t can be 16-bit, we need 32-bit typedef wchar_t mk_wchar_t; // Posix wchar_t is 32-bit so just use that #endif -int mk_wcwidth(mk_wchar_t ucs); -int mk_wcswidth(const mk_wchar_t *pwcs, size_t n); +int mk_wcwidth(mk_wchar_t ucs, int ambiguous_width); +int mk_wcswidth(const mk_wchar_t *pwcs, size_t n, int ambiguous_width); #endif // MK_WCWIDTH_H diff --git a/src/wcwidth_ambiguous_width.c b/src/wcwidth_ambiguous_width.c new file mode 100644 index 0000000..264258e --- /dev/null +++ b/src/wcwidth_ambiguous_width.c @@ -0,0 +1,64 @@ + // Do not modify this file directly, it is generated by the wcwidth_update.lua script + // Contains unicode character-ranges handled as ambiguous (either 1 or 2 width) + // Generated from Unicode 17.0.0 + // Generated on 2026-01-29 + { 0x00A1, 0x00A1 }, { 0x00A4, 0x00A4 }, { 0x00A7, 0x00A8 }, + { 0x00AA, 0x00AA }, { 0x00AD, 0x00AE }, { 0x00B0, 0x00B4 }, + { 0x00B6, 0x00BA }, { 0x00BC, 0x00BF }, { 0x00C6, 0x00C6 }, + { 0x00D0, 0x00D0 }, { 0x00D7, 0x00D8 }, { 0x00DE, 0x00E1 }, + { 0x00E6, 0x00E6 }, { 0x00E8, 0x00EA }, { 0x00EC, 0x00ED }, + { 0x00F0, 0x00F0 }, { 0x00F2, 0x00F3 }, { 0x00F7, 0x00FA }, + { 0x00FC, 0x00FC }, { 0x00FE, 0x00FE }, { 0x0101, 0x0101 }, + { 0x0111, 0x0111 }, { 0x0113, 0x0113 }, { 0x011B, 0x011B }, + { 0x0126, 0x0127 }, { 0x012B, 0x012B }, { 0x0131, 0x0133 }, + { 0x0138, 0x0138 }, { 0x013F, 0x0142 }, { 0x0144, 0x0144 }, + { 0x0148, 0x014B }, { 0x014D, 0x014D }, { 0x0152, 0x0153 }, + { 0x0166, 0x0167 }, { 0x016B, 0x016B }, { 0x01CE, 0x01CE }, + { 0x01D0, 0x01D0 }, { 0x01D2, 0x01D2 }, { 0x01D4, 0x01D4 }, + { 0x01D6, 0x01D6 }, { 0x01D8, 0x01D8 }, { 0x01DA, 0x01DA }, + { 0x01DC, 0x01DC }, { 0x0251, 0x0251 }, { 0x0261, 0x0261 }, + { 0x02C4, 0x02C4 }, { 0x02C7, 0x02C7 }, { 0x02C9, 0x02CB }, + { 0x02CD, 0x02CD }, { 0x02D0, 0x02D0 }, { 0x02D8, 0x02DB }, + { 0x02DD, 0x02DD }, { 0x02DF, 0x02DF }, { 0x0300, 0x036F }, + { 0x0391, 
0x03A1 }, { 0x03A3, 0x03A9 }, { 0x03B1, 0x03C1 }, + { 0x03C3, 0x03C9 }, { 0x0401, 0x0401 }, { 0x0410, 0x044F }, + { 0x0451, 0x0451 }, { 0x2010, 0x2010 }, { 0x2013, 0x2016 }, + { 0x2018, 0x2019 }, { 0x201C, 0x201D }, { 0x2020, 0x2022 }, + { 0x2024, 0x2027 }, { 0x2030, 0x2030 }, { 0x2032, 0x2033 }, + { 0x2035, 0x2035 }, { 0x203B, 0x203B }, { 0x203E, 0x203E }, + { 0x2074, 0x2074 }, { 0x207F, 0x207F }, { 0x2081, 0x2084 }, + { 0x20AC, 0x20AC }, { 0x2103, 0x2103 }, { 0x2105, 0x2105 }, + { 0x2109, 0x2109 }, { 0x2113, 0x2113 }, { 0x2116, 0x2116 }, + { 0x2121, 0x2122 }, { 0x2126, 0x2126 }, { 0x212B, 0x212B }, + { 0x2153, 0x2154 }, { 0x215B, 0x215E }, { 0x2160, 0x216B }, + { 0x2170, 0x2179 }, { 0x2189, 0x2189 }, { 0x2190, 0x2199 }, + { 0x21B8, 0x21B9 }, { 0x21D2, 0x21D2 }, { 0x21D4, 0x21D4 }, + { 0x21E7, 0x21E7 }, { 0x2200, 0x2200 }, { 0x2202, 0x2203 }, + { 0x2207, 0x2208 }, { 0x220B, 0x220B }, { 0x220F, 0x220F }, + { 0x2211, 0x2211 }, { 0x2215, 0x2215 }, { 0x221A, 0x221A }, + { 0x221D, 0x2220 }, { 0x2223, 0x2223 }, { 0x2225, 0x2225 }, + { 0x2227, 0x222C }, { 0x222E, 0x222E }, { 0x2234, 0x2237 }, + { 0x223C, 0x223D }, { 0x2248, 0x2248 }, { 0x224C, 0x224C }, + { 0x2252, 0x2252 }, { 0x2260, 0x2261 }, { 0x2264, 0x2267 }, + { 0x226A, 0x226B }, { 0x226E, 0x226F }, { 0x2282, 0x2283 }, + { 0x2286, 0x2287 }, { 0x2295, 0x2295 }, { 0x2299, 0x2299 }, + { 0x22A5, 0x22A5 }, { 0x22BF, 0x22BF }, { 0x2312, 0x2312 }, + { 0x2460, 0x24E9 }, { 0x24EB, 0x254B }, { 0x2550, 0x2573 }, + { 0x2580, 0x258F }, { 0x2592, 0x2595 }, { 0x25A0, 0x25A1 }, + { 0x25A3, 0x25A9 }, { 0x25B2, 0x25B3 }, { 0x25B6, 0x25B7 }, + { 0x25BC, 0x25BD }, { 0x25C0, 0x25C1 }, { 0x25C6, 0x25C8 }, + { 0x25CB, 0x25CB }, { 0x25CE, 0x25D1 }, { 0x25E2, 0x25E5 }, + { 0x25EF, 0x25EF }, { 0x2605, 0x2606 }, { 0x2609, 0x2609 }, + { 0x260E, 0x260F }, { 0x261C, 0x261C }, { 0x261E, 0x261E }, + { 0x2640, 0x2640 }, { 0x2642, 0x2642 }, { 0x2660, 0x2661 }, + { 0x2663, 0x2665 }, { 0x2667, 0x266A }, { 0x266C, 0x266D }, + { 0x266F, 0x266F }, { 
0x269E, 0x269F }, { 0x26BF, 0x26BF }, + { 0x26C6, 0x26CD }, { 0x26CF, 0x26D3 }, { 0x26D5, 0x26E1 }, + { 0x26E3, 0x26E3 }, { 0x26E8, 0x26E9 }, { 0x26EB, 0x26F1 }, + { 0x26F4, 0x26F4 }, { 0x26F6, 0x26F9 }, { 0x26FB, 0x26FC }, + { 0x26FE, 0x26FF }, { 0x273D, 0x273D }, { 0x2776, 0x277F }, + { 0x2B56, 0x2B59 }, { 0x3248, 0x324F }, { 0xE000, 0xF8FF }, + { 0xFE00, 0xFE0F }, { 0xFFFD, 0xFFFD }, { 0x1F100, 0x1F10A }, + { 0x1F110, 0x1F12D }, { 0x1F130, 0x1F169 }, { 0x1F170, 0x1F18D }, + { 0x1F18F, 0x1F190 }, { 0x1F19B, 0x1F1AC }, { 0xE0100, 0xE01EF }, + { 0xF0000, 0xFFFFD }, { 0x100000, 0x10FFFD } diff --git a/src/wcwidth_double_width.c b/src/wcwidth_double_width.c new file mode 100644 index 0000000..a0c1b65 --- /dev/null +++ b/src/wcwidth_double_width.c @@ -0,0 +1,45 @@ + // Do not modify this file directly, it is generated by the wcwidth_update.lua script + // Contains unicode character-ranges handled as double width + // Generated from Unicode 17.0.0 + // Generated on 2026-01-29 + { 0x1100, 0x115F }, { 0x231A, 0x231B }, { 0x2329, 0x232A }, + { 0x23E9, 0x23EC }, { 0x23F0, 0x23F0 }, { 0x23F3, 0x23F3 }, + { 0x25FD, 0x25FE }, { 0x2614, 0x2615 }, { 0x2630, 0x2637 }, + { 0x2648, 0x2653 }, { 0x267F, 0x267F }, { 0x268A, 0x268F }, + { 0x2693, 0x2693 }, { 0x26A1, 0x26A1 }, { 0x26AA, 0x26AB }, + { 0x26BD, 0x26BE }, { 0x26C4, 0x26C5 }, { 0x26CE, 0x26CE }, + { 0x26D4, 0x26D4 }, { 0x26EA, 0x26EA }, { 0x26F2, 0x26F3 }, + { 0x26F5, 0x26F5 }, { 0x26FA, 0x26FA }, { 0x26FD, 0x26FD }, + { 0x2705, 0x2705 }, { 0x270A, 0x270B }, { 0x2728, 0x2728 }, + { 0x274C, 0x274C }, { 0x274E, 0x274E }, { 0x2753, 0x2755 }, + { 0x2757, 0x2757 }, { 0x2795, 0x2797 }, { 0x27B0, 0x27B0 }, + { 0x27BF, 0x27BF }, { 0x2B1B, 0x2B1C }, { 0x2B50, 0x2B50 }, + { 0x2B55, 0x2B55 }, { 0x2E80, 0x2E99 }, { 0x2E9B, 0x2EF3 }, + { 0x2F00, 0x2FD5 }, { 0x2FF0, 0x303E }, { 0x3041, 0x3096 }, + { 0x3099, 0x30FF }, { 0x3105, 0x312F }, { 0x3131, 0x318E }, + { 0x3190, 0x31E5 }, { 0x31EF, 0x321E }, { 0x3220, 0x3247 }, + { 0x3250, 0xA48C 
}, { 0xA490, 0xA4C6 }, { 0xA960, 0xA97C }, + { 0xAC00, 0xD7A3 }, { 0xF900, 0xFAFF }, { 0xFE10, 0xFE19 }, + { 0xFE30, 0xFE52 }, { 0xFE54, 0xFE66 }, { 0xFE68, 0xFE6B }, + { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 }, { 0x16FE0, 0x16FE4 }, + { 0x16FF0, 0x16FF6 }, { 0x17000, 0x18CD5 }, { 0x18CFF, 0x18D1E }, + { 0x18D80, 0x18DF2 }, { 0x1AFF0, 0x1AFF3 }, { 0x1AFF5, 0x1AFFB }, + { 0x1AFFD, 0x1AFFE }, { 0x1B000, 0x1B122 }, { 0x1B132, 0x1B132 }, + { 0x1B150, 0x1B152 }, { 0x1B155, 0x1B155 }, { 0x1B164, 0x1B167 }, + { 0x1B170, 0x1B2FB }, { 0x1D300, 0x1D356 }, { 0x1D360, 0x1D376 }, + { 0x1F004, 0x1F004 }, { 0x1F0CF, 0x1F0CF }, { 0x1F18E, 0x1F18E }, + { 0x1F191, 0x1F19A }, { 0x1F1E6, 0x1F202 }, { 0x1F210, 0x1F23B }, + { 0x1F240, 0x1F248 }, { 0x1F250, 0x1F251 }, { 0x1F260, 0x1F265 }, + { 0x1F300, 0x1F320 }, { 0x1F32D, 0x1F335 }, { 0x1F337, 0x1F37C }, + { 0x1F37E, 0x1F393 }, { 0x1F3A0, 0x1F3CA }, { 0x1F3CF, 0x1F3D3 }, + { 0x1F3E0, 0x1F3F0 }, { 0x1F3F4, 0x1F3F4 }, { 0x1F3F8, 0x1F43E }, + { 0x1F440, 0x1F440 }, { 0x1F442, 0x1F4FC }, { 0x1F4FF, 0x1F53D }, + { 0x1F54B, 0x1F54E }, { 0x1F550, 0x1F567 }, { 0x1F57A, 0x1F57A }, + { 0x1F595, 0x1F596 }, { 0x1F5A4, 0x1F5A4 }, { 0x1F5FB, 0x1F64F }, + { 0x1F680, 0x1F6C5 }, { 0x1F6CC, 0x1F6CC }, { 0x1F6D0, 0x1F6D2 }, + { 0x1F6D5, 0x1F6D8 }, { 0x1F6DC, 0x1F6DF }, { 0x1F6EB, 0x1F6EC }, + { 0x1F6F4, 0x1F6FC }, { 0x1F7E0, 0x1F7EB }, { 0x1F7F0, 0x1F7F0 }, + { 0x1F90C, 0x1F93A }, { 0x1F93C, 0x1F945 }, { 0x1F947, 0x1F9FF }, + { 0x1FA70, 0x1FA7C }, { 0x1FA80, 0x1FA8A }, { 0x1FA8E, 0x1FAC6 }, + { 0x1FAC8, 0x1FAC8 }, { 0x1FACD, 0x1FADC }, { 0x1FADF, 0x1FAEA }, + { 0x1FAEF, 0x1FAF8 }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD } diff --git a/src/wcwidth_update.lua b/src/wcwidth_update.lua new file mode 100755 index 0000000..37f18c3 --- /dev/null +++ b/src/wcwidth_update.lua @@ -0,0 +1,404 @@ +#!/usr/bin/env lua + +-- This file downloads and parses unicode standard files and updates the wcwidth code +-- based on that data. 
+ +local VERSION="17.0.0" -- the unicode standard version to download + + + +-- test if curl is available, and Penlight +do + local ok, _, ec = os.execute("curl --version > /dev/null 2>&1") -- Lua 5.1/LuaJIT: exit status number; Lua 5.2+: true|nil, "exit"|"signal", code + if ok ~= true and ok ~= 0 then + error("curl is not available in the path; exitcode " .. tostring(ec or ok)) + end + + local ok, utils = pcall(require, "pl.utils") + if not ok then + error("Penlight is not available, please install via `luarocks install penlight`") + end + + local content = utils.readfile("./wcwidth.c") -- nil on failure; only read to verify the working directory + if not content then + error("failed to read './wcwidth.c', run this script from within the `./src/` directory") + end +end + +-- files to download from the unicode site +local FN_DERIVED_GENERAL_CATEGORY = 1 +local FN_EAST_ASIAN_WIDTH = 2 +local FN_DERIVED_CORE_PROPERTIES = 3 +local FN_EMOJI_DATA = 4 + +local download_file_list = { + [FN_DERIVED_GENERAL_CATEGORY] = "extracted/DerivedGeneralCategory.txt", + [FN_EAST_ASIAN_WIDTH] = "EastAsianWidth.txt", + [FN_DERIVED_CORE_PROPERTIES] = "DerivedCoreProperties.txt", + [FN_EMOJI_DATA] = "emoji/emoji-data.txt", +} +local target_path = "./unicode_data/" + + + +do + local base_url = "https://www.unicode.org/Public/" .. VERSION .. "/ucd/" -- must include trailing slash + + + -- removes a file, and then downloads a new copy from the unicode site + local function download_file(filename, target_filename) + print("Downloading " .. filename .. " to " .. target_filename) + os.remove(target_filename) + local cmd = "curl --fail -s -o " .. target_filename .. " " .. base_url .. filename + local ok, _, ec = os.execute(cmd) -- Lua 5.1/LuaJIT: exit status number; Lua 5.2+: true|nil, "exit"|"signal", code + if ok ~= true and ok ~= 0 then + error("Failed to execute: " .. cmd .. "; exitcode " .. tostring(ec or ok)) + end + end + + + -- Downloads all unicode files we need + local function download_files() + os.execute("mkdir -p " .. target_path .. "extracted") + os.execute("mkdir -p " .. target_path .. "emoji") + for _, filename in ipairs(download_file_list) do + download_file(filename, target_path .. 
filename) + end + end + + + download_files() +end + + + +-- set up the 3 lists of data (everything else is single-width) +local zero_width = {} +local double_width = {} +local ambiguous_width = {} + + + +local readlines do + local utils = require("pl.utils") + + function readlines(filename) + print("Parsing " .. filename) + local lines = assert(utils.readlines(filename)) + + -- drop lines starting with "#" being comments, or empty lines (whitespace only) + for i = #lines, 1, -1 do -- reverse, since we're deleting items + if lines[i]:match("^%s*#") or lines[i]:match("^%s*$") then + table.remove(lines, i) + end + end + + return lines + end +end + + + + +-- parse DerivedGeneralCategory.txt +-- Purpose: zero-width combining marks +-- Extract: +-- Mn — Nonspacing Mark → width = 0 +-- Me — Enclosing Mark → width = 0 +-- Why: +-- These characters overlay the previous glyph +-- This replaces Markus Kuhn’s combining[] table +-- Ignore all other categories in this file. +do + local lines = readlines(target_path .. download_file_list[FN_DERIVED_GENERAL_CATEGORY]) + local zw_start = #zero_width + + -- parse the lines + for _, line in ipairs(lines) do + local range, category = line:match("^([%x%.]+)%s*;%s*(%a+)") + if not range then + error("Failed to parse line: " .. line) + end + + if not range:find("..", 1, true) then -- single code point, make range + range = range .. ".." .. range + end + + if category == "Mn" or category == "Me" then + zero_width[#zero_width + 1] = range + end + end + + print(" found " .. (#zero_width - zw_start) .. 
" zero-width character-ranges")
end



-- parse DerivedCoreProperties.txt
-- Purpose: zero-width format / ignorable characters
-- Extract:
--   Default_Ignorable_Code_Point → width = 0
--
-- Includes (important examples):
--   U+200D ZERO WIDTH JOINER
--   U+200C ZERO WIDTH NON-JOINER
--   U+FE00..U+FE0F (variation selectors)
--   Bidi and other format controls
--
-- Why:
--   Not Mn/Me, but terminals treat them as zero-width
--   Required for emoji correctness and modern text
do
  local lines = readlines(target_path .. download_file_list[FN_DERIVED_CORE_PROPERTIES])
  local zw_start = #zero_width

  -- parse the lines
  -- NOTE(review): every line must match the pattern, so readlines() presumably
  -- drops comment and blank lines already; confirm against its definition.
  for _, line in ipairs(lines) do
    local range, category = line:match("^([%x%.]+)%s*;%s*([%a_]+)")
    if not range then
      error("Failed to parse line: " .. line)
    end

    if not range:find("..", 1, true) then -- single code point, make range
      range = range .. ".." .. range
    end

    if category == "Default_Ignorable_Code_Point" then
      zero_width[#zero_width + 1] = range
    end
  end

  print(" found " .. (#zero_width - zw_start) .. " zero-width character-ranges")
end



-- parse EastAsianWidth.txt
-- Purpose: determine double-width and ambiguous-width characters
-- Extract:
--   W (Wide)      → width = 2
--   F (Fullwidth) → width = 2
--   A (Ambiguous) → width = 1 or 2 (caller's choice; usually 1 unless CJK mode)
-- Everything else:
--   H, Na, N → width = 1
-- Why:
--   - This is the only Unicode-sanctioned width-related property
--   - Core of all wcwidth() implementations
do
  local lines = readlines(target_path .. download_file_list[FN_EAST_ASIAN_WIDTH])
  local dw_start = #double_width
  local aw_start = #ambiguous_width

  -- parse the lines
  for _, line in ipairs(lines) do
    local range, width_type = line:match("^([%x%.]+)%s*;%s*(%a+)")
    if not range then
      error("Failed to parse line: " .. line)
    end

    if not range:find("..", 1, true) then -- single code point, make range
      range = range .. ".." .. range
    end

    if width_type == "W" or width_type == "F" then
      double_width[#double_width + 1] = range
    elseif width_type == "A" then
      ambiguous_width[#ambiguous_width + 1] = range
    end
  end

  print(" found " .. (#double_width - dw_start) .. " double-width character-ranges")
  print(" found " .. (#ambiguous_width - aw_start) .. " ambiguous-width character-ranges")
end



-- parse emoji-data.txt
-- Purpose: emoji presentation width
-- Extract:
--   Emoji_Presentation=Yes → width = 2
--   (Optionally) Extended_Pictographic → emoji sequences (not extracted here)
-- Why:
--   Emoji are not reliably covered by EastAsianWidth
--   Modern terminals render these as double-width
--   Required for correct emoji column alignment
do
  local lines = readlines(target_path .. download_file_list[FN_EMOJI_DATA])
  local dw_start = #double_width

  -- parse the lines
  for _, line in ipairs(lines) do
    local range, property = line:match("^([%x%.]+)%s*;%s*([%a_]+)")
    if not range then
      error("Failed to parse line: " .. line)
    end

    if not range:find("..", 1, true) then -- single code point, make range
      range = range .. ".." .. range
    end

    -- exact comparison (the capture is the whole property name), consistent
    -- with the other parsers; a substring match could accept unrelated names
    if property == "Emoji_Presentation" then
      double_width[#double_width + 1] = range
    end
  end

  -- counted before de-duplication against the EastAsianWidth entries
  print(" found " .. (#double_width - dw_start) .. " double-width character-ranges")
end



-- returns the start and end of a range, numerically, and hex strings
-- Raises an error if the ".." separator is missing, if either side is not a
-- valid hexadecimal number, or if the end lies before the start.
-- @tparam string range the range to parse, format "<hex>..<hex>"
-- @treturn number sr the start of the range
-- @treturn number er the end of the range
-- @treturn string sh the start of the range as a hex string
-- @treturn string eh the end of the range as a hex string
local parse_range do
  function parse_range(range)
    local s = range:find("..", 1, true)
    if not s then
      error("Failed to parse range: " .. range)
    end
    local sh = range:sub(1, s - 1)
    local eh = range:sub(s + 2, -1)
    local sr = tonumber(sh, 16)
    local er = tonumber(eh, 16)
    if not sr or not er then
      -- tonumber returns nil for non-hex input; report it instead of failing
      -- on the comparison below with an unrelated message
      error("Failed to parse range: " .. range .. " (invalid hex value)")
    end
    if er < sr then
      error("Failed to parse range: " .. range .. " (end < start)")
    end
    return sr, er, sh, eh
  end

  -- some inline tests for parse_range
  do
    local sr, er = parse_range("25FD..25FE")
    assert(sr == 9725)
    assert(er == 9726)
  end
  do
    local sr, er = parse_range("105C0..105F3")
    assert(sr == 67008)
    assert(er == 67059)
  end
  assert(not pcall(parse_range, "25FD"))        -- no separator
  assert(not pcall(parse_range, "XYZ..25FE"))   -- invalid hex
  assert(not pcall(parse_range, "25FE..25FD"))  -- end < start
end



-- sorts the ranges in-place, ascending by the start of each range
-- @tparam table ranges array of range strings
-- @treturn table the same table, to allow chaining
local function sort_ranges(ranges)
  table.sort(ranges, function(a, b)
    -- only the first return value (numeric start) is relevant for ordering
    local start_a = parse_range(a)
    local start_b = parse_range(b)
    return start_a < start_b
  end)
  return ranges
end



-- combines adjacent and overlapping ranges in-place
-- The input must already be sorted by range start (see sort_ranges).
-- @tparam table ranges array of range strings
local combine_ranges do
  function combine_ranges(ranges)
    local last_idx = 1  -- index of the last range kept in the result
    for i = 2, #ranges do
      local last_s, last_e, last_sh, last_eh = parse_range(ranges[last_idx])
      local current_s, current_e, _, current_eh = parse_range(ranges[i])
      if current_s >= last_s and current_s <= (last_e + 1) then
        -- ranges are adjacent or overlapping, combine them
        local sh = last_sh
        local eh = current_eh
        if last_e > current_e then
          -- current range lies fully within the previous one
          eh = last_eh
        end
        ranges[last_idx] = sh .. ".." .. eh
      else
        last_idx = last_idx + 1
        ranges[last_idx] = ranges[i]
      end
    end
    -- clear left-overs beyond last entry
    for i = last_idx + 1, #ranges do
      ranges[i] = nil
    end
  end

  -- some inline tests for combine_ranges
  local ranges = {
    "25FD..25FE",
    "25FD..25FE",   -- duplicate range, should be removed
    "105C0..105F3",
    "105D0..105E0", -- range fully within previous range, should be combined
    "10F00..10F10",
    "10F11..10F20", -- adjacent to previous, should be combined
    "11000..11100",
    "11101..11110", -- adjacent + extending previous, should be combined
    "12000..12010",
    "12011..12020", -- multiple: adjacent, should be combined
    "12015..12030", -- multiple: overlap + extending previous, should be combined
    "12031..12040", -- multiple: adjacent to the combined range, should be combined
  }
  combine_ranges(ranges)
  assert(#ranges == 5)
  assert(ranges[1] == "25FD..25FE")
  assert(ranges[2] == "105C0..105F3")
  assert(ranges[3] == "10F00..10F20")
  assert(ranges[4] == "11000..11110")
  assert(ranges[5] == "12000..12040")

  -- an empty list must stay empty
  local empty = {}
  combine_ranges(empty)
  assert(#empty == 0)
end



combine_ranges(sort_ranges(zero_width))
combine_ranges(sort_ranges(double_width))
combine_ranges(sort_ranges(ambiguous_width))



-- convert ranges into c-source-code ranges (in-place)
-- format: "{ 0x0829, 0x082D }"
-- @tparam table ranges array of range strings, replaced by C initializers
local function convert_c_ranges(ranges)
  for i = 1, #ranges do
    local _, _, sh, eh = parse_range(ranges[i])
    ranges[i] = "{ 0x" .. sh .. ", 0x" .. eh .. " }"
  end
end

convert_c_ranges(zero_width)
convert_c_ranges(double_width)
convert_c_ranges(ambiguous_width)



-- NOTE(review): runs of spaces inside string literals may have been collapsed
-- in the reviewed copy; confirm the intended indent width.
local SOURCE_INDENT = " "


-- write c source, as triplet; 3 ranges on 1 line
-- @tparam table ranges array of C initializer strings
-- @treturn table array of source lines; empty if there are no ranges
local function triplet_lines(ranges)
  local lines = {}
  for i = 1, #ranges, 3 do
    lines[#lines+1] = SOURCE_INDENT .. table.concat(ranges, ", ", i, math.min(i + 2, #ranges)) .. ","
  end
  -- drop trailing comma from last line (there is no last line for empty input)
  if #lines > 0 then
    lines[#lines] = lines[#lines]:sub(1, -2)
  end
  return lines
end


-- create file-contents
-- @tparam table ranges array of C initializer strings
-- @tparam string contains description of the contents, for the file header
-- @treturn string the complete contents of the generated file
local function create_file_contents(ranges, contains)
  return SOURCE_INDENT .. "// Do not modify this file directly, it is generated by the wcwidth_update.lua script\n" ..
         SOURCE_INDENT .. "// Contains " .. contains .. "\n" ..
         SOURCE_INDENT .. "// Generated from Unicode " .. VERSION .. "\n" ..
         SOURCE_INDENT .. "// Generated on " .. os.date("%Y-%m-%d") .. "\n" ..
         table.concat(triplet_lines(ranges), "\n") .. "\n"
end




local writefile = require("pl.utils").writefile

-- the files are written relative to the current working directory
print("writing source files...")
print(" zero-width: ./wcwidth_zero_width.c")
assert(writefile("./wcwidth_zero_width.c", create_file_contents(zero_width, "unicode character-ranges handled as 0 width")))

print(" double-width: ./wcwidth_double_width.c")
assert(writefile("./wcwidth_double_width.c", create_file_contents(double_width, "unicode character-ranges handled as double width")))

print(" ambiguous-width: ./wcwidth_ambiguous_width.c")
assert(writefile("./wcwidth_ambiguous_width.c", create_file_contents(ambiguous_width, "unicode character-ranges handled as ambiguous (either 1 or 2 width)")))
diff --git a/src/wcwidth_zero_width.c b/src/wcwidth_zero_width.c new file mode 100644 index 0000000..579ca5f --- /dev/null +++ b/src/wcwidth_zero_width.c @@ -0,0 +1,128 @@ + // Do not modify this file directly, it is generated by the wcwidth_update.lua script + // Contains unicode character-ranges handled as 0 width + // Generated from Unicode 17.0.0 + // Generated on 2026-01-29 + { 0x00AD, 0x00AD }, { 0x0300, 0x036F }, { 0x0483, 0x0489 }, + { 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, + { 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0610, 0x061A }, + { 0x061C, 0x061C }, { 0x064B, 0x065F }, { 0x0670, 0x0670 }, + { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 }, { 0x06E7, 0x06E8 }, + { 0x06EA, 0x06ED }, { 0x0711, 0x0711 }, { 0x0730, 0x074A }, +
{ 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x07FD, 0x07FD }, + { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 }, + { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x0897, 0x089F }, + { 0x08CA, 0x08E1 }, { 0x08E3, 0x0902 }, { 0x093A, 0x093A }, + { 0x093C, 0x093C }, { 0x0941, 0x0948 }, { 0x094D, 0x094D }, + { 0x0951, 0x0957 }, { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, + { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, + { 0x09E2, 0x09E3 }, { 0x09FE, 0x09FE }, { 0x0A01, 0x0A02 }, + { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 }, + { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 }, { 0x0A70, 0x0A71 }, + { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC }, + { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, { 0x0ACD, 0x0ACD }, + { 0x0AE2, 0x0AE3 }, { 0x0AFA, 0x0AFF }, { 0x0B01, 0x0B01 }, + { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 }, + { 0x0B4D, 0x0B4D }, { 0x0B55, 0x0B56 }, { 0x0B62, 0x0B63 }, + { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD }, + { 0x0C00, 0x0C00 }, { 0x0C04, 0x0C04 }, { 0x0C3C, 0x0C3C }, + { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, + { 0x0C55, 0x0C56 }, { 0x0C62, 0x0C63 }, { 0x0C81, 0x0C81 }, + { 0x0CBC, 0x0CBC }, { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, + { 0x0CCC, 0x0CCD }, { 0x0CE2, 0x0CE3 }, { 0x0D00, 0x0D01 }, + { 0x0D3B, 0x0D3C }, { 0x0D41, 0x0D44 }, { 0x0D4D, 0x0D4D }, + { 0x0D62, 0x0D63 }, { 0x0D81, 0x0D81 }, { 0x0DCA, 0x0DCA }, + { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 }, + { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, + { 0x0EB4, 0x0EBC }, { 0x0EC8, 0x0ECE }, { 0x0F18, 0x0F19 }, + { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, + { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, + { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, + { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A }, + { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 }, + { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 }, + { 0x108D, 
0x108D }, { 0x109D, 0x109D }, { 0x115F, 0x1160 }, + { 0x135D, 0x135F }, { 0x1712, 0x1714 }, { 0x1732, 0x1733 }, + { 0x1752, 0x1753 }, { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, + { 0x17B7, 0x17BD }, { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, + { 0x17DD, 0x17DD }, { 0x180B, 0x180F }, { 0x1885, 0x1886 }, + { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 }, { 0x1927, 0x1928 }, + { 0x1932, 0x1932 }, { 0x1939, 0x193B }, { 0x1A17, 0x1A18 }, + { 0x1A1B, 0x1A1B }, { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E }, + { 0x1A60, 0x1A60 }, { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C }, + { 0x1A73, 0x1A7C }, { 0x1A7F, 0x1A7F }, { 0x1AB0, 0x1ADD }, + { 0x1AE0, 0x1AEB }, { 0x1B00, 0x1B03 }, { 0x1B34, 0x1B34 }, + { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, { 0x1B42, 0x1B42 }, + { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B81 }, { 0x1BA2, 0x1BA5 }, + { 0x1BA8, 0x1BA9 }, { 0x1BAB, 0x1BAD }, { 0x1BE6, 0x1BE6 }, + { 0x1BE8, 0x1BE9 }, { 0x1BED, 0x1BED }, { 0x1BEF, 0x1BF1 }, + { 0x1C2C, 0x1C33 }, { 0x1C36, 0x1C37 }, { 0x1CD0, 0x1CD2 }, + { 0x1CD4, 0x1CE0 }, { 0x1CE2, 0x1CE8 }, { 0x1CED, 0x1CED }, + { 0x1CF4, 0x1CF4 }, { 0x1CF8, 0x1CF9 }, { 0x1DC0, 0x1DFF }, + { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x206F }, + { 0x20D0, 0x20F0 }, { 0x2CEF, 0x2CF1 }, { 0x2D7F, 0x2D7F }, + { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D }, { 0x3099, 0x309A }, + { 0x3164, 0x3164 }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D }, + { 0xA69E, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 }, + { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 }, + { 0xA82C, 0xA82C }, { 0xA8C4, 0xA8C5 }, { 0xA8E0, 0xA8F1 }, + { 0xA8FF, 0xA8FF }, { 0xA926, 0xA92D }, { 0xA947, 0xA951 }, + { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 }, { 0xA9B6, 0xA9B9 }, + { 0xA9BC, 0xA9BD }, { 0xA9E5, 0xA9E5 }, { 0xAA29, 0xAA2E }, + { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 }, + { 0xAA4C, 0xAA4C }, { 0xAA7C, 0xAA7C }, { 0xAAB0, 0xAAB0 }, + { 0xAAB2, 0xAAB4 }, { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF }, + { 0xAAC1, 0xAAC1 }, { 0xAAEC, 0xAAED }, { 0xAAF6, 0xAAF6 }, + { 0xABE5, 0xABE5 }, { 
0xABE8, 0xABE8 }, { 0xABED, 0xABED }, + { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE2F }, + { 0xFEFF, 0xFEFF }, { 0xFFA0, 0xFFA0 }, { 0xFFF0, 0xFFF8 }, + { 0x101FD, 0x101FD }, { 0x102E0, 0x102E0 }, { 0x10376, 0x1037A }, + { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F }, + { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x10AE5, 0x10AE6 }, + { 0x10D24, 0x10D27 }, { 0x10D69, 0x10D6D }, { 0x10EAB, 0x10EAC }, + { 0x10EFA, 0x10EFF }, { 0x10F46, 0x10F50 }, { 0x10F82, 0x10F85 }, + { 0x11001, 0x11001 }, { 0x11038, 0x11046 }, { 0x11070, 0x11070 }, + { 0x11073, 0x11074 }, { 0x1107F, 0x11081 }, { 0x110B3, 0x110B6 }, + { 0x110B9, 0x110BA }, { 0x110C2, 0x110C2 }, { 0x11100, 0x11102 }, + { 0x11127, 0x1112B }, { 0x1112D, 0x11134 }, { 0x11173, 0x11173 }, + { 0x11180, 0x11181 }, { 0x111B6, 0x111BE }, { 0x111C9, 0x111CC }, + { 0x111CF, 0x111CF }, { 0x1122F, 0x11231 }, { 0x11234, 0x11234 }, + { 0x11236, 0x11237 }, { 0x1123E, 0x1123E }, { 0x11241, 0x11241 }, + { 0x112DF, 0x112DF }, { 0x112E3, 0x112EA }, { 0x11300, 0x11301 }, + { 0x1133B, 0x1133C }, { 0x11340, 0x11340 }, { 0x11366, 0x1136C }, + { 0x11370, 0x11374 }, { 0x113BB, 0x113C0 }, { 0x113CE, 0x113CE }, + { 0x113D0, 0x113D0 }, { 0x113D2, 0x113D2 }, { 0x113E1, 0x113E2 }, + { 0x11438, 0x1143F }, { 0x11442, 0x11444 }, { 0x11446, 0x11446 }, + { 0x1145E, 0x1145E }, { 0x114B3, 0x114B8 }, { 0x114BA, 0x114BA }, + { 0x114BF, 0x114C0 }, { 0x114C2, 0x114C3 }, { 0x115B2, 0x115B5 }, + { 0x115BC, 0x115BD }, { 0x115BF, 0x115C0 }, { 0x115DC, 0x115DD }, + { 0x11633, 0x1163A }, { 0x1163D, 0x1163D }, { 0x1163F, 0x11640 }, + { 0x116AB, 0x116AB }, { 0x116AD, 0x116AD }, { 0x116B0, 0x116B5 }, + { 0x116B7, 0x116B7 }, { 0x1171D, 0x1171D }, { 0x1171F, 0x1171F }, + { 0x11722, 0x11725 }, { 0x11727, 0x1172B }, { 0x1182F, 0x11837 }, + { 0x11839, 0x1183A }, { 0x1193B, 0x1193C }, { 0x1193E, 0x1193E }, + { 0x11943, 0x11943 }, { 0x119D4, 0x119D7 }, { 0x119DA, 0x119DB }, + { 0x119E0, 0x119E0 }, { 0x11A01, 0x11A0A }, { 0x11A33, 0x11A38 }, + 
{ 0x11A3B, 0x11A3E }, { 0x11A47, 0x11A47 }, { 0x11A51, 0x11A56 }, + { 0x11A59, 0x11A5B }, { 0x11A8A, 0x11A96 }, { 0x11A98, 0x11A99 }, + { 0x11B60, 0x11B60 }, { 0x11B62, 0x11B64 }, { 0x11B66, 0x11B66 }, + { 0x11C30, 0x11C36 }, { 0x11C38, 0x11C3D }, { 0x11C3F, 0x11C3F }, + { 0x11C92, 0x11CA7 }, { 0x11CAA, 0x11CB0 }, { 0x11CB2, 0x11CB3 }, + { 0x11CB5, 0x11CB6 }, { 0x11D31, 0x11D36 }, { 0x11D3A, 0x11D3A }, + { 0x11D3C, 0x11D3D }, { 0x11D3F, 0x11D45 }, { 0x11D47, 0x11D47 }, + { 0x11D90, 0x11D91 }, { 0x11D95, 0x11D95 }, { 0x11D97, 0x11D97 }, + { 0x11EF3, 0x11EF4 }, { 0x11F00, 0x11F01 }, { 0x11F36, 0x11F3A }, + { 0x11F40, 0x11F40 }, { 0x11F42, 0x11F42 }, { 0x11F5A, 0x11F5A }, + { 0x13440, 0x13440 }, { 0x13447, 0x13455 }, { 0x1611E, 0x16129 }, + { 0x1612D, 0x1612F }, { 0x16AF0, 0x16AF4 }, { 0x16B30, 0x16B36 }, + { 0x16F4F, 0x16F4F }, { 0x16F8F, 0x16F92 }, { 0x16FE4, 0x16FE4 }, + { 0x1BC9D, 0x1BC9E }, { 0x1BCA0, 0x1BCA3 }, { 0x1CF00, 0x1CF2D }, + { 0x1CF30, 0x1CF46 }, { 0x1D167, 0x1D169 }, { 0x1D173, 0x1D182 }, + { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD }, { 0x1D242, 0x1D244 }, + { 0x1DA00, 0x1DA36 }, { 0x1DA3B, 0x1DA6C }, { 0x1DA75, 0x1DA75 }, + { 0x1DA84, 0x1DA84 }, { 0x1DA9B, 0x1DA9F }, { 0x1DAA1, 0x1DAAF }, + { 0x1E000, 0x1E006 }, { 0x1E008, 0x1E018 }, { 0x1E01B, 0x1E021 }, + { 0x1E023, 0x1E024 }, { 0x1E026, 0x1E02A }, { 0x1E08F, 0x1E08F }, + { 0x1E130, 0x1E136 }, { 0x1E2AE, 0x1E2AE }, { 0x1E2EC, 0x1E2EF }, + { 0x1E4EC, 0x1E4EF }, { 0x1E5EE, 0x1E5EF }, { 0x1E6E3, 0x1E6E3 }, + { 0x1E6E6, 0x1E6E6 }, { 0x1E6EE, 0x1E6EF }, { 0x1E6F5, 0x1E6F5 }, + { 0x1E8D0, 0x1E8D6 }, { 0x1E944, 0x1E94A }, { 0xE0000, 0xE0FFF } -- cgit v1.2.3-55-g6feb