aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/term.c15
-rw-r--r--src/wcwidth.c245
-rw-r--r--src/wcwidth.h7
-rw-r--r--src/wcwidth_ambiguous_width.c64
-rw-r--r--src/wcwidth_double_width.c45
-rwxr-xr-xsrc/wcwidth_update.lua404
-rw-r--r--src/wcwidth_zero_width.c128
7 files changed, 684 insertions, 224 deletions
diff --git a/src/term.c b/src/term.c
index a389e06..80998b0 100644
--- a/src/term.c
+++ b/src/term.c
@@ -1085,6 +1085,7 @@ int utf8_to_wchar(const char *utf8, size_t len, mk_wchar_t *codepoint) {
1085Get the width of a utf8 character for terminal display. 1085Get the width of a utf8 character for terminal display.
1086@function utf8cwidth 1086@function utf8cwidth
1087@tparam string|int utf8_char the utf8 character, or unicode codepoint, to check, only the width of the first character will be returned 1087@tparam string|int utf8_char the utf8 character, or unicode codepoint, to check, only the width of the first character will be returned
1088@tparam[opt=1] int ambiguous_width the width to return for ambiguous width characters (usually 1 or 2)
1088@treturn[1] int the display width in columns of the first character in the string (0 for an empty string) 1089@treturn[1] int the display width in columns of the first character in the string (0 for an empty string)
1089@treturn[2] nil 1090@treturn[2] nil
1090@treturn[2] string error message 1091@treturn[2] string error message
@@ -1093,6 +1094,7 @@ Get the width of a utf8 character for terminal display.
1093int lst_utf8cwidth(lua_State *L) { 1094int lst_utf8cwidth(lua_State *L) {
1094 int width = 0; 1095 int width = 0;
1095 mk_wchar_t wc; 1096 mk_wchar_t wc;
1097 int ambiguous_width = luaL_optinteger(L, 2, 1);
1096 1098
1097 if (lua_type(L, 1) == LUA_TSTRING) { 1099 if (lua_type(L, 1) == LUA_TSTRING) {
1098 // Handle UTF8 as string input 1100 // Handle UTF8 as string input
@@ -1129,10 +1131,10 @@ int lst_utf8cwidth(lua_State *L) {
1129 } 1131 }
1130 1132
1131 // Get the width of the wide character 1133 // Get the width of the wide character
1132 width = mk_wcwidth(wc); 1134 width = mk_wcwidth(wc, ambiguous_width);
1133 if (width == -1) { 1135 if (width == -1) {
1134 lua_pushnil(L); 1136 lua_pushnil(L);
1135 lua_pushstring(L, "Character width determination failed"); 1137 lua_pushstring(L, "Control characters have no width");
1136 return 2; 1138 return 2;
1137 } 1139 }
1138 1140
@@ -1147,6 +1149,7 @@ int lst_utf8cwidth(lua_State *L) {
1147Get the width of a utf8 string for terminal display. 1149Get the width of a utf8 string for terminal display.
1148@function utf8swidth 1150@function utf8swidth
1149@tparam string utf8_string the utf8 string to check 1151@tparam string utf8_string the utf8 string to check
1152@tparam[opt=1] int ambiguous_width the width to return for ambiguous width characters (1 or 2)
1150@treturn[1] int the display width of the string in columns (0 for an empty string) 1153@treturn[1] int the display width of the string in columns (0 for an empty string)
1151@treturn[2] nil 1154@treturn[2] nil
1152@treturn[2] string error message 1155@treturn[2] string error message
@@ -1156,6 +1159,10 @@ int lst_utf8swidth(lua_State *L) {
1156 const char *utf8_str; 1159 const char *utf8_str;
1157 size_t utf8_len; 1160 size_t utf8_len;
1158 utf8_str = luaL_checklstring(L, 1, &utf8_len); 1161 utf8_str = luaL_checklstring(L, 1, &utf8_len);
1162 int ambiguous_width = luaL_optinteger(L, 2, 1);
1163 if (ambiguous_width != 1 && ambiguous_width != 2) {
1164 return luaL_argerror(L, 2, "Ambiguous width must be 1 or 2");
1165 }
1159 int total_width = 0; 1166 int total_width = 0;
1160 1167
1161 if (utf8_len == 0) { 1168 if (utf8_len == 0) {
@@ -1175,10 +1182,10 @@ int lst_utf8swidth(lua_State *L) {
1175 return 2; 1182 return 2;
1176 } 1183 }
1177 1184
1178 int width = mk_wcwidth(wc); 1185 int width = mk_wcwidth(wc, ambiguous_width);
1179 if (width == -1) { 1186 if (width == -1) {
1180 lua_pushnil(L); 1187 lua_pushnil(L);
1181 lua_pushstring(L, "Character width determination failed"); 1188 lua_pushstring(L, "Control characters have no width");
1182 return 2; 1189 return 2;
1183 } 1190 }
1184 1191
diff --git a/src/wcwidth.c b/src/wcwidth.c
index 6032158..ea293c9 100644
--- a/src/wcwidth.c
+++ b/src/wcwidth.c
@@ -1,57 +1,6 @@
1// This file was modified from the original versions, check "modified:" comments for details 1// This file was modified from the original version by Markus Kuhn
2// Character range updates (both the table and the +1 check) were generated using ChatGPT.
3 2
4/* 3/* Original copyrights:
5 * This is an implementation of wcwidth() and wcswidth() (defined in
6 * IEEE Std 1002.1-2001) for Unicode.
7 *
8 * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
9 * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
10 *
11 * In fixed-width output devices, Latin characters all occupy a single
12 * "cell" position of equal width, whereas ideographic CJK characters
13 * occupy two such cells. Interoperability between terminal-line
14 * applications and (teletype-style) character terminals using the
15 * UTF-8 encoding requires agreement on which character should advance
16 * the cursor by how many cell positions. No established formal
17 * standards exist at present on which Unicode character shall occupy
18 * how many cell positions on character terminals. These routines are
19 * a first attempt of defining such behavior based on simple rules
20 * applied to data provided by the Unicode Consortium.
21 *
22 * For some graphical characters, the Unicode standard explicitly
23 * defines a character-cell width via the definition of the East Asian
24 * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
25 * In all these cases, there is no ambiguity about which width a
26 * terminal shall use. For characters in the East Asian Ambiguous (A)
27 * class, the width choice depends purely on a preference of backward
28 * compatibility with either historic CJK or Western practice.
29 * Choosing single-width for these characters is easy to justify as
30 * the appropriate long-term solution, as the CJK practice of
31 * displaying these characters as double-width comes from historic
32 * implementation simplicity (8-bit encoded characters were displayed
33 * single-width and 16-bit ones double-width, even for Greek,
34 * Cyrillic, etc.) and not any typographic considerations.
35 *
36 * Much less clear is the choice of width for the Not East Asian
37 * (Neutral) class. Existing practice does not dictate a width for any
38 * of these characters. It would nevertheless make sense
39 * typographically to allocate two character cells to characters such
40 * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
41 * represented adequately with a single-width glyph. The following
42 * routines at present merely assign a single-cell width to all
43 * neutral characters, in the interest of simplicity. This is not
44 * entirely satisfactory and should be reconsidered before
45 * establishing a formal standard in this area. At the moment, the
46 * decision which Not East Asian (Neutral) characters should be
47 * represented by double-width glyphs cannot yet be answered by
48 * applying a simple rule from the Unicode database content. Setting
49 * up a proper standard for the behavior of UTF-8 character terminals
50 * will require a careful analysis not only of each Unicode character,
51 * but also of each presentation form, something the author of these
52 * routines has avoided to do so far.
53 *
54 * http://www.unicode.org/unicode/reports/tr11/
55 * 4 *
56 * Markus Kuhn -- 2007-05-26 (Unicode 5.0) 5 * Markus Kuhn -- 2007-05-26 (Unicode 5.0)
57 * 6 *
@@ -62,7 +11,7 @@
62 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c 11 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
63 */ 12 */
64 13
65#include "wcwidth.h" // modified: used to define mk_wchar_t 14#include "wcwidth.h"
66 15
67struct interval { 16struct interval {
68 int first; 17 int first;
@@ -70,7 +19,7 @@ struct interval {
70}; 19};
71 20
72/* auxiliary function for binary search in interval table */ 21/* auxiliary function for binary search in interval table */
73static int bisearch(mk_wchar_t ucs, const struct interval *table, int max) { // modified: use mk_wchar_t 22static int bisearch(mk_wchar_t ucs, const struct interval *table, int max) {
74 int min = 0; 23 int min = 0;
75 int mid; 24 int mid;
76 25
@@ -91,150 +40,23 @@ static int bisearch(mk_wchar_t ucs, const struct interval *table, int max) { //
91 40
92 41
93/* The following two functions define the column width of an ISO 10646 42/* The following two functions define the column width of an ISO 10646
94 * character as follows: 43 * characters.
95 *
96 * - The null character (U+0000) has a column width of 0.
97 *
98 * - Other C0/C1 control characters and DEL will lead to a return
99 * value of -1.
100 *
101 * - Non-spacing and enclosing combining characters (general
102 * category code Mn or Me in the Unicode database) have a
103 * column width of 0.
104 * 44 *
105 * - SOFT HYPHEN (U+00AD) has a column width of 1. 45 * @param ucs the Unicode code point to check
106 * 46 * @param ambiguous_width the width to return for ambiguous width characters (1 or 2)
107 * - Other format characters (general category code Cf in the Unicode 47 * @return the width of the character, or -1 if the character is a control character
108 * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
109 *
110 * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
111 * have a column width of 0.
112 *
113 * - Spacing characters in the East Asian Wide (W) or East Asian
114 * Full-width (F) category as defined in Unicode Technical
115 * Report #11 have a column width of 2.
116 *
117 * - All remaining characters (including all printable
118 * ISO 8859-1 and WGL4 characters, Unicode control characters,
119 * etc.) have a column width of 1.
120 *
121 * This implementation assumes that mk_wchar_t characters are encoded
122 * in ISO 10646.
123 */ 48 */
124 49
125int mk_wcwidth(mk_wchar_t ucs) // modified: use mk_wchar_t 50int mk_wcwidth(mk_wchar_t ucs, int ambiguous_width)
126{ 51{
127 /* sorted list of non-overlapping intervals of non-spacing characters */ 52 static const struct interval zero_width_ranges[] = {
128 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ 53 #include "wcwidth_zero_width.c"
129 static const struct interval combining[] = { // modified: added new ranges to the list 54 };
130 { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD }, 55 static const struct interval ambiguous_width_ranges[] = {
131 { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 }, 56 #include "wcwidth_ambiguous_width.c"
132 { 0x05C7, 0x05C7 }, { 0x0600, 0x0605 }, { 0x0610, 0x061A }, 57 };
133 { 0x061C, 0x061C }, { 0x064B, 0x065F }, { 0x0670, 0x0670 }, 58 static const struct interval double_width_ranges[] = {
134 { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 }, { 0x06E7, 0x06E8 }, 59 #include "wcwidth_double_width.c"
135 { 0x06EA, 0x06ED }, { 0x0711, 0x0711 }, { 0x0730, 0x074A },
136 { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x07FD, 0x07FD },
137 { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 },
138 { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x08D3, 0x08E1 },
139 { 0x08E3, 0x0903 }, { 0x093A, 0x093C }, { 0x093E, 0x094F },
140 { 0x0951, 0x0957 }, { 0x0962, 0x0963 }, { 0x0981, 0x0983 },
141 { 0x09BC, 0x09BC }, { 0x09BE, 0x09C4 }, { 0x09C7, 0x09C8 },
142 { 0x09CB, 0x09CD }, { 0x09D7, 0x09D7 }, { 0x09E2, 0x09E3 },
143 { 0x09FE, 0x09FE }, { 0x0A01, 0x0A03 }, { 0x0A3C, 0x0A3C },
144 { 0x0A3E, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D },
145 { 0x0A51, 0x0A51 }, { 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 },
146 { 0x0A81, 0x0A83 }, { 0x0ABC, 0x0ABC }, { 0x0ABE, 0x0AC5 },
147 { 0x0AC7, 0x0AC9 }, { 0x0ACB, 0x0ACD }, { 0x0AE2, 0x0AE3 },
148 { 0x0AFA, 0x0AFF }, { 0x0B01, 0x0B03 }, { 0x0B3C, 0x0B3C },
149 { 0x0B3E, 0x0B44 }, { 0x0B47, 0x0B48 }, { 0x0B4B, 0x0B4D },
150 { 0x0B55, 0x0B57 }, { 0x0B62, 0x0B63 }, { 0x0B82, 0x0B82 },
151 { 0x0BBE, 0x0BC2 }, { 0x0BC6, 0x0BC8 }, { 0x0BCA, 0x0BCD },
152 { 0x0BD7, 0x0BD7 }, { 0x0C00, 0x0C04 }, { 0x0C3E, 0x0C44 },
153 { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 },
154 { 0x0C62, 0x0C63 }, { 0x0C81, 0x0C83 }, { 0x0CBC, 0x0CBC },
155 { 0x0CBE, 0x0CC4 }, { 0x0CC6, 0x0CC8 }, { 0x0CCA, 0x0CCD },
156 { 0x0CD5, 0x0CD6 }, { 0x0CE2, 0x0CE3 }, { 0x0D00, 0x0D03 },
157 { 0x0D3B, 0x0D3C }, { 0x0D3E, 0x0D44 }, { 0x0D46, 0x0D48 },
158 { 0x0D4A, 0x0D4D }, { 0x0D57, 0x0D57 }, { 0x0D62, 0x0D63 },
159 { 0x0D82, 0x0D83 }, { 0x0DCF, 0x0DD4 }, { 0x0DD6, 0x0DD6 },
160 { 0x0DD8, 0x0DDF }, { 0x0DF2, 0x0DF3 }, { 0x0E31, 0x0E31 },
161 { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 },
162 { 0x0EB4, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 },
163 { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 },
164 { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
165 { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 },
166 { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A },
167 { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 },
168 { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 },
169 { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x135D, 0x135F },
170 { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
171 { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
172 { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
173 { 0x180B, 0x180E }, { 0x1885, 0x1886 }, { 0x18A9, 0x18A9 },
174 { 0x1920, 0x1922 }, { 0x1927, 0x1928 }, { 0x1932, 0x1932 },
175 { 0x1939, 0x193B }, { 0x1A17, 0x1A18 }, { 0x1A1B, 0x1A1B },
176 { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E }, { 0x1A60, 0x1A60 },
177 { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C }, { 0x1A73, 0x1A7C },
178 { 0x1A7F, 0x1A7F }, { 0x1AB0, 0x1ACE }, { 0x1B00, 0x1B03 },
179 { 0x1B34, 0x1B34 }, { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C },
180 { 0x1B42, 0x1B42 }, { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B82 },
181 { 0x1BA1, 0x1BA1 }, { 0x1BA6, 0x1BA7 }, { 0x1BAA, 0x1BAA },
182 { 0x1BAB, 0x1BAD }, { 0x1BE6, 0x1BE6 }, { 0x1BE8, 0x1BE9 },
183 { 0x1BED, 0x1BED }, { 0x1BEF, 0x1BF1 }, { 0x1C2C, 0x1C33 },
184 { 0x1C36, 0x1C37 }, { 0x1CD0, 0x1CD2 }, { 0x1CD4, 0x1CE8 },
185 { 0x1CED, 0x1CED }, { 0x1CF4, 0x1CF4 }, { 0x1CF8, 0x1CF9 },
186 { 0x1DC0, 0x1DF9 }, { 0x1DFB, 0x1DFF }, { 0x20D0, 0x20DC },
187 { 0x20E1, 0x20E1 }, { 0x20E5, 0x20F0 }, { 0x2CEF, 0x2CF1 },
188 { 0x2D7F, 0x2D7F }, { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D },
189 { 0x3099, 0x309A }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D },
190 { 0xA69E, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 },
191 { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 },
192 { 0xA82C, 0xA82C }, { 0xA8C4, 0xA8C5 }, { 0xA8E0, 0xA8F1 },
193 { 0xA8FF, 0xA8FF }, { 0xA926, 0xA92D }, { 0xA947, 0xA951 },
194 { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 }, { 0xA9B6, 0xA9B9 },
195 { 0xA9BC, 0xA9BD }, { 0xA9E5, 0xA9E5 }, { 0xAA29, 0xAA2E },
196 { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 },
197 { 0xAA4C, 0xAA4C }, { 0xAA7C, 0xAA7C }, { 0xAAB0, 0xAAB0 },
198 { 0xAAB2, 0xAAB4 }, { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF },
199 { 0xAAC1, 0xAAC1 }, { 0xAAEB, 0xAAEB }, { 0xAAEE, 0xAAEF },
200 { 0xAAF5, 0xAAF6 }, { 0xABE3, 0xABE4 }, { 0xABE6, 0xABE7 },
201 { 0xABE9, 0xABEA }, { 0xABEC, 0xABED }, { 0xFB1E, 0xFB1E },
202 { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE2F }, { 0x101FD, 0x101FD },
203 { 0x102E0, 0x102E0 }, { 0x10376, 0x1037A }, { 0x10A01, 0x10A03 },
204 { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F }, { 0x10A38, 0x10A3A },
205 { 0x10A3F, 0x10A3F }, { 0x10AE5, 0x10AE6 }, { 0x10D24, 0x10D27 },
206 { 0x10EAB, 0x10EAC }, { 0x10F46, 0x10F50 }, { 0x10F82, 0x10F85 },
207 { 0x11000, 0x11002 }, { 0x11038, 0x11046 }, { 0x1107F, 0x11082 },
208 { 0x110B0, 0x110BA }, { 0x11100, 0x11102 }, { 0x11127, 0x11134 },
209 { 0x11145, 0x11146 }, { 0x11173, 0x11173 }, { 0x11180, 0x11182 },
210 { 0x111B3, 0x111C0 }, { 0x111C9, 0x111CC }, { 0x1122C, 0x11237 },
211 { 0x1123E, 0x1123E }, { 0x112DF, 0x112EA }, { 0x11300, 0x11303 },
212 { 0x1133B, 0x1133C }, { 0x1133E, 0x11344 }, { 0x11347, 0x11348 },
213 { 0x1134B, 0x1134D }, { 0x11357, 0x11357 }, { 0x11362, 0x11363 },
214 { 0x11435, 0x11446 }, { 0x1145E, 0x1145E }, { 0x114B0, 0x114C3 },
215 { 0x115AF, 0x115B5 }, { 0x115B8, 0x115C0 }, { 0x115DC, 0x115DD },
216 { 0x11630, 0x11640 }, { 0x116AB, 0x116B7 }, { 0x1171D, 0x1172B },
217 { 0x1182C, 0x1183A }, { 0x11930, 0x11935 }, { 0x11937, 0x11938 },
218 { 0x1193B, 0x1193E }, { 0x11940, 0x11940 }, { 0x11942, 0x11942 },
219 { 0x119D1, 0x119D7 }, { 0x119DA, 0x119E0 }, { 0x11A01, 0x11A0A },
220 { 0x11A33, 0x11A39 }, { 0x11A3B, 0x11A3E }, { 0x11A47, 0x11A47 },
221 { 0x11A51, 0x11A5B }, { 0x11A8A, 0x11A96 }, { 0x11A98, 0x11A99 },
222 { 0x11C30, 0x11C36 }, { 0x11C38, 0x11C3D }, { 0x11C3F, 0x11C3F },
223 { 0x11C92, 0x11CA7 }, { 0x11CAA, 0x11CB0 }, { 0x11CB2, 0x11CB3 },
224 { 0x11CB5, 0x11CB6 }, { 0x11D31, 0x11D36 }, { 0x11D3A, 0x11D3A },
225 { 0x11D3C, 0x11D3D }, { 0x11D3F, 0x11D45 }, { 0x11D47, 0x11D47 },
226 { 0x11D90, 0x11D91 }, { 0x11D95, 0x11D95 }, { 0x11D97, 0x11D97 },
227 { 0x11EF3, 0x11EF4 }, { 0x13430, 0x13438 }, { 0x16AF0, 0x16AF4 },
228 { 0x16B30, 0x16B36 }, { 0x16F4F, 0x16F4F }, { 0x16F8F, 0x16F92 },
229 { 0x1BC9D, 0x1BC9E }, { 0x1BCA0, 0x1BCA3 }, { 0x1D167, 0x1D169 },
230 { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
231 { 0x1D242, 0x1D244 }, { 0x1DA00, 0x1DA36 }, { 0x1DA3B, 0x1DA6C },
232 { 0x1DA75, 0x1DA75 }, { 0x1DA84, 0x1DA84 }, { 0x1DA9B, 0x1DA9F },
233 { 0x1DAA1, 0x1DAAF }, { 0x1E000, 0x1E006 }, { 0x1E008, 0x1E018 },
234 { 0x1E01B, 0x1E021 }, { 0x1E023, 0x1E024 }, { 0x1E026, 0x1E02A },
235 { 0x1E130, 0x1E136 }, { 0x1E2AE, 0x1E2AE }, { 0x1E2EC, 0x1E2EF },
236 { 0x1E4EC, 0x1E4EF }, { 0x1E8D0, 0x1E8D6 }, { 0x1E944, 0x1E94A },
237 { 0x1E947, 0x1E94A }, { 0xE0100, 0xE01EF }
238 }; 60 };
239 61
240 /* test for 8-bit control characters */ 62 /* test for 8-bit control characters */
@@ -244,38 +66,27 @@ int mk_wcwidth(mk_wchar_t ucs) // modified: use mk_wchar_t
244 return -1; 66 return -1;
245 67
246 /* binary search in table of non-spacing characters */ 68 /* binary search in table of non-spacing characters */
247 if (bisearch(ucs, combining, 69 if (bisearch(ucs, zero_width_ranges,
248 sizeof(combining) / sizeof(struct interval) - 1)) 70 sizeof(zero_width_ranges) / sizeof(struct interval) - 1))
249 return 0; 71 return 0;
250 72
251 /* if we arrive here, ucs is not a combining or C0/C1 control character */ 73 /* binary search in table of ambiguous width characters */
74 if (bisearch(ucs, ambiguous_width_ranges,
75 sizeof(ambiguous_width_ranges) / sizeof(struct interval) - 1))
76 return ambiguous_width;
252 77
253 return 1 + 78 /* binary search in table of double width characters, default to 1 width */
254 (ucs >= 0x1100 && 79 return 1 + (bisearch(ucs, double_width_ranges,
255 (ucs <= 0x115f || /* Hangul Jamo init. consonants */ 80 sizeof(double_width_ranges) / sizeof(struct interval) - 1));
256 ucs == 0x2329 || ucs == 0x232a ||
257 (ucs >= 0x2e80 && ucs <= 0xa4cf &&
258 ucs != 0x303f) || /* CJK ... Yi */
259 (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
260 (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */
261 (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */
262 (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
263 (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */
264 (ucs >= 0xffe0 && ucs <= 0xffe6) ||
265 (ucs >= 0x1f300 && ucs <= 0x1f64f) || /* modified: added Emoticons */
266 (ucs >= 0x1f680 && ucs <= 0x1f6ff) || /* modified: added Transport and Map Symbols */
267 (ucs >= 0x1f900 && ucs <= 0x1f9ff) || /* modified: added Supplemental Symbols and Pictographs */
268 (ucs >= 0x20000 && ucs <= 0x2fffd) ||
269 (ucs >= 0x30000 && ucs <= 0x3fffd)));
270} 81}
271 82
272 83
273int mk_wcswidth(const mk_wchar_t *pwcs, size_t n) // modified: use mk_wchar_t 84int mk_wcswidth(const mk_wchar_t *pwcs, size_t n, int ambiguous_width)
274{ 85{
275 int w, width = 0; 86 int w, width = 0;
276 87
277 for (;*pwcs && n-- > 0; pwcs++) 88 for (;*pwcs && n-- > 0; pwcs++)
278 if ((w = mk_wcwidth(*pwcs)) < 0) 89 if ((w = mk_wcwidth(*pwcs, ambiguous_width)) < 0)
279 return -1; 90 return -1;
280 else 91 else
281 width += w; 92 width += w;
diff --git a/src/wcwidth.h b/src/wcwidth.h
index 6cb6f6d..9d345f9 100644
--- a/src/wcwidth.h
+++ b/src/wcwidth.h
@@ -1,7 +1,8 @@
1// wcwidth.h 1// wcwidth.h
2 2
3// Windows does not have a wcwidth function, so we use compatibilty code from 3// Windows does not have a wcwidth function, so we use compatibilty code from
4// http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c by Markus Kuhn 4// http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c by Markus Kuhn, this is
5// however heavily modified.
5 6
6#ifndef MK_WCWIDTH_H 7#ifndef MK_WCWIDTH_H
7#define MK_WCWIDTH_H 8#define MK_WCWIDTH_H
@@ -16,7 +17,7 @@ typedef uint32_t mk_wchar_t; // Windows wchar_t can be 16-bit, we need 32-bit
16typedef wchar_t mk_wchar_t; // Posix wchar_t is 32-bit so just use that 17typedef wchar_t mk_wchar_t; // Posix wchar_t is 32-bit so just use that
17#endif 18#endif
18 19
19int mk_wcwidth(mk_wchar_t ucs); 20int mk_wcwidth(mk_wchar_t ucs, int ambiguous_width);
20int mk_wcswidth(const mk_wchar_t *pwcs, size_t n); 21int mk_wcswidth(const mk_wchar_t *pwcs, size_t n, int ambiguous_width);
21 22
22#endif // MK_WCWIDTH_H 23#endif // MK_WCWIDTH_H
diff --git a/src/wcwidth_ambiguous_width.c b/src/wcwidth_ambiguous_width.c
new file mode 100644
index 0000000..264258e
--- /dev/null
+++ b/src/wcwidth_ambiguous_width.c
@@ -0,0 +1,64 @@
1 // Do not modify this file directly, it is generated by the wcwidth_update.lua script
2 // Contains unicode character-ranges handled as ambiguous (either 1 or 2 width)
3 // Generated from Unicode 17.0.0
4 // Generated on 2026-01-29
5 { 0x00A1, 0x00A1 }, { 0x00A4, 0x00A4 }, { 0x00A7, 0x00A8 },
6 { 0x00AA, 0x00AA }, { 0x00AD, 0x00AE }, { 0x00B0, 0x00B4 },
7 { 0x00B6, 0x00BA }, { 0x00BC, 0x00BF }, { 0x00C6, 0x00C6 },
8 { 0x00D0, 0x00D0 }, { 0x00D7, 0x00D8 }, { 0x00DE, 0x00E1 },
9 { 0x00E6, 0x00E6 }, { 0x00E8, 0x00EA }, { 0x00EC, 0x00ED },
10 { 0x00F0, 0x00F0 }, { 0x00F2, 0x00F3 }, { 0x00F7, 0x00FA },
11 { 0x00FC, 0x00FC }, { 0x00FE, 0x00FE }, { 0x0101, 0x0101 },
12 { 0x0111, 0x0111 }, { 0x0113, 0x0113 }, { 0x011B, 0x011B },
13 { 0x0126, 0x0127 }, { 0x012B, 0x012B }, { 0x0131, 0x0133 },
14 { 0x0138, 0x0138 }, { 0x013F, 0x0142 }, { 0x0144, 0x0144 },
15 { 0x0148, 0x014B }, { 0x014D, 0x014D }, { 0x0152, 0x0153 },
16 { 0x0166, 0x0167 }, { 0x016B, 0x016B }, { 0x01CE, 0x01CE },
17 { 0x01D0, 0x01D0 }, { 0x01D2, 0x01D2 }, { 0x01D4, 0x01D4 },
18 { 0x01D6, 0x01D6 }, { 0x01D8, 0x01D8 }, { 0x01DA, 0x01DA },
19 { 0x01DC, 0x01DC }, { 0x0251, 0x0251 }, { 0x0261, 0x0261 },
20 { 0x02C4, 0x02C4 }, { 0x02C7, 0x02C7 }, { 0x02C9, 0x02CB },
21 { 0x02CD, 0x02CD }, { 0x02D0, 0x02D0 }, { 0x02D8, 0x02DB },
22 { 0x02DD, 0x02DD }, { 0x02DF, 0x02DF }, { 0x0300, 0x036F },
23 { 0x0391, 0x03A1 }, { 0x03A3, 0x03A9 }, { 0x03B1, 0x03C1 },
24 { 0x03C3, 0x03C9 }, { 0x0401, 0x0401 }, { 0x0410, 0x044F },
25 { 0x0451, 0x0451 }, { 0x2010, 0x2010 }, { 0x2013, 0x2016 },
26 { 0x2018, 0x2019 }, { 0x201C, 0x201D }, { 0x2020, 0x2022 },
27 { 0x2024, 0x2027 }, { 0x2030, 0x2030 }, { 0x2032, 0x2033 },
28 { 0x2035, 0x2035 }, { 0x203B, 0x203B }, { 0x203E, 0x203E },
29 { 0x2074, 0x2074 }, { 0x207F, 0x207F }, { 0x2081, 0x2084 },
30 { 0x20AC, 0x20AC }, { 0x2103, 0x2103 }, { 0x2105, 0x2105 },
31 { 0x2109, 0x2109 }, { 0x2113, 0x2113 }, { 0x2116, 0x2116 },
32 { 0x2121, 0x2122 }, { 0x2126, 0x2126 }, { 0x212B, 0x212B },
33 { 0x2153, 0x2154 }, { 0x215B, 0x215E }, { 0x2160, 0x216B },
34 { 0x2170, 0x2179 }, { 0x2189, 0x2189 }, { 0x2190, 0x2199 },
35 { 0x21B8, 0x21B9 }, { 0x21D2, 0x21D2 }, { 0x21D4, 0x21D4 },
36 { 0x21E7, 0x21E7 }, { 0x2200, 0x2200 }, { 0x2202, 0x2203 },
37 { 0x2207, 0x2208 }, { 0x220B, 0x220B }, { 0x220F, 0x220F },
38 { 0x2211, 0x2211 }, { 0x2215, 0x2215 }, { 0x221A, 0x221A },
39 { 0x221D, 0x2220 }, { 0x2223, 0x2223 }, { 0x2225, 0x2225 },
40 { 0x2227, 0x222C }, { 0x222E, 0x222E }, { 0x2234, 0x2237 },
41 { 0x223C, 0x223D }, { 0x2248, 0x2248 }, { 0x224C, 0x224C },
42 { 0x2252, 0x2252 }, { 0x2260, 0x2261 }, { 0x2264, 0x2267 },
43 { 0x226A, 0x226B }, { 0x226E, 0x226F }, { 0x2282, 0x2283 },
44 { 0x2286, 0x2287 }, { 0x2295, 0x2295 }, { 0x2299, 0x2299 },
45 { 0x22A5, 0x22A5 }, { 0x22BF, 0x22BF }, { 0x2312, 0x2312 },
46 { 0x2460, 0x24E9 }, { 0x24EB, 0x254B }, { 0x2550, 0x2573 },
47 { 0x2580, 0x258F }, { 0x2592, 0x2595 }, { 0x25A0, 0x25A1 },
48 { 0x25A3, 0x25A9 }, { 0x25B2, 0x25B3 }, { 0x25B6, 0x25B7 },
49 { 0x25BC, 0x25BD }, { 0x25C0, 0x25C1 }, { 0x25C6, 0x25C8 },
50 { 0x25CB, 0x25CB }, { 0x25CE, 0x25D1 }, { 0x25E2, 0x25E5 },
51 { 0x25EF, 0x25EF }, { 0x2605, 0x2606 }, { 0x2609, 0x2609 },
52 { 0x260E, 0x260F }, { 0x261C, 0x261C }, { 0x261E, 0x261E },
53 { 0x2640, 0x2640 }, { 0x2642, 0x2642 }, { 0x2660, 0x2661 },
54 { 0x2663, 0x2665 }, { 0x2667, 0x266A }, { 0x266C, 0x266D },
55 { 0x266F, 0x266F }, { 0x269E, 0x269F }, { 0x26BF, 0x26BF },
56 { 0x26C6, 0x26CD }, { 0x26CF, 0x26D3 }, { 0x26D5, 0x26E1 },
57 { 0x26E3, 0x26E3 }, { 0x26E8, 0x26E9 }, { 0x26EB, 0x26F1 },
58 { 0x26F4, 0x26F4 }, { 0x26F6, 0x26F9 }, { 0x26FB, 0x26FC },
59 { 0x26FE, 0x26FF }, { 0x273D, 0x273D }, { 0x2776, 0x277F },
60 { 0x2B56, 0x2B59 }, { 0x3248, 0x324F }, { 0xE000, 0xF8FF },
61 { 0xFE00, 0xFE0F }, { 0xFFFD, 0xFFFD }, { 0x1F100, 0x1F10A },
62 { 0x1F110, 0x1F12D }, { 0x1F130, 0x1F169 }, { 0x1F170, 0x1F18D },
63 { 0x1F18F, 0x1F190 }, { 0x1F19B, 0x1F1AC }, { 0xE0100, 0xE01EF },
64 { 0xF0000, 0xFFFFD }, { 0x100000, 0x10FFFD }
diff --git a/src/wcwidth_double_width.c b/src/wcwidth_double_width.c
new file mode 100644
index 0000000..a0c1b65
--- /dev/null
+++ b/src/wcwidth_double_width.c
@@ -0,0 +1,45 @@
1 // Do not modify this file directly, it is generated by the wcwidth_update.lua script
2 // Contains unicode character-ranges handled as double width
3 // Generated from Unicode 17.0.0
4 // Generated on 2026-01-29
5 { 0x1100, 0x115F }, { 0x231A, 0x231B }, { 0x2329, 0x232A },
6 { 0x23E9, 0x23EC }, { 0x23F0, 0x23F0 }, { 0x23F3, 0x23F3 },
7 { 0x25FD, 0x25FE }, { 0x2614, 0x2615 }, { 0x2630, 0x2637 },
8 { 0x2648, 0x2653 }, { 0x267F, 0x267F }, { 0x268A, 0x268F },
9 { 0x2693, 0x2693 }, { 0x26A1, 0x26A1 }, { 0x26AA, 0x26AB },
10 { 0x26BD, 0x26BE }, { 0x26C4, 0x26C5 }, { 0x26CE, 0x26CE },
11 { 0x26D4, 0x26D4 }, { 0x26EA, 0x26EA }, { 0x26F2, 0x26F3 },
12 { 0x26F5, 0x26F5 }, { 0x26FA, 0x26FA }, { 0x26FD, 0x26FD },
13 { 0x2705, 0x2705 }, { 0x270A, 0x270B }, { 0x2728, 0x2728 },
14 { 0x274C, 0x274C }, { 0x274E, 0x274E }, { 0x2753, 0x2755 },
15 { 0x2757, 0x2757 }, { 0x2795, 0x2797 }, { 0x27B0, 0x27B0 },
16 { 0x27BF, 0x27BF }, { 0x2B1B, 0x2B1C }, { 0x2B50, 0x2B50 },
17 { 0x2B55, 0x2B55 }, { 0x2E80, 0x2E99 }, { 0x2E9B, 0x2EF3 },
18 { 0x2F00, 0x2FD5 }, { 0x2FF0, 0x303E }, { 0x3041, 0x3096 },
19 { 0x3099, 0x30FF }, { 0x3105, 0x312F }, { 0x3131, 0x318E },
20 { 0x3190, 0x31E5 }, { 0x31EF, 0x321E }, { 0x3220, 0x3247 },
21 { 0x3250, 0xA48C }, { 0xA490, 0xA4C6 }, { 0xA960, 0xA97C },
22 { 0xAC00, 0xD7A3 }, { 0xF900, 0xFAFF }, { 0xFE10, 0xFE19 },
23 { 0xFE30, 0xFE52 }, { 0xFE54, 0xFE66 }, { 0xFE68, 0xFE6B },
24 { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 }, { 0x16FE0, 0x16FE4 },
25 { 0x16FF0, 0x16FF6 }, { 0x17000, 0x18CD5 }, { 0x18CFF, 0x18D1E },
26 { 0x18D80, 0x18DF2 }, { 0x1AFF0, 0x1AFF3 }, { 0x1AFF5, 0x1AFFB },
27 { 0x1AFFD, 0x1AFFE }, { 0x1B000, 0x1B122 }, { 0x1B132, 0x1B132 },
28 { 0x1B150, 0x1B152 }, { 0x1B155, 0x1B155 }, { 0x1B164, 0x1B167 },
29 { 0x1B170, 0x1B2FB }, { 0x1D300, 0x1D356 }, { 0x1D360, 0x1D376 },
30 { 0x1F004, 0x1F004 }, { 0x1F0CF, 0x1F0CF }, { 0x1F18E, 0x1F18E },
31 { 0x1F191, 0x1F19A }, { 0x1F1E6, 0x1F202 }, { 0x1F210, 0x1F23B },
32 { 0x1F240, 0x1F248 }, { 0x1F250, 0x1F251 }, { 0x1F260, 0x1F265 },
33 { 0x1F300, 0x1F320 }, { 0x1F32D, 0x1F335 }, { 0x1F337, 0x1F37C },
34 { 0x1F37E, 0x1F393 }, { 0x1F3A0, 0x1F3CA }, { 0x1F3CF, 0x1F3D3 },
35 { 0x1F3E0, 0x1F3F0 }, { 0x1F3F4, 0x1F3F4 }, { 0x1F3F8, 0x1F43E },
36 { 0x1F440, 0x1F440 }, { 0x1F442, 0x1F4FC }, { 0x1F4FF, 0x1F53D },
37 { 0x1F54B, 0x1F54E }, { 0x1F550, 0x1F567 }, { 0x1F57A, 0x1F57A },
38 { 0x1F595, 0x1F596 }, { 0x1F5A4, 0x1F5A4 }, { 0x1F5FB, 0x1F64F },
39 { 0x1F680, 0x1F6C5 }, { 0x1F6CC, 0x1F6CC }, { 0x1F6D0, 0x1F6D2 },
40 { 0x1F6D5, 0x1F6D8 }, { 0x1F6DC, 0x1F6DF }, { 0x1F6EB, 0x1F6EC },
41 { 0x1F6F4, 0x1F6FC }, { 0x1F7E0, 0x1F7EB }, { 0x1F7F0, 0x1F7F0 },
42 { 0x1F90C, 0x1F93A }, { 0x1F93C, 0x1F945 }, { 0x1F947, 0x1F9FF },
43 { 0x1FA70, 0x1FA7C }, { 0x1FA80, 0x1FA8A }, { 0x1FA8E, 0x1FAC6 },
44 { 0x1FAC8, 0x1FAC8 }, { 0x1FACD, 0x1FADC }, { 0x1FADF, 0x1FAEA },
45 { 0x1FAEF, 0x1FAF8 }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD }
diff --git a/src/wcwidth_update.lua b/src/wcwidth_update.lua
new file mode 100755
index 0000000..37f18c3
--- /dev/null
+++ b/src/wcwidth_update.lua
@@ -0,0 +1,404 @@
1#!/usr/bin/env lua
2
3-- This file downloads and parses unicode standard files and updates the wcwidth code
4-- based on that data.
5
6local VERSION="17.0.0" -- the unicode standard version to download
7
8
9
-- Verify the prerequisites before doing any work:
--   * `curl` can be executed from the PATH
--   * Penlight (`pl.utils`) can be loaded
--   * `./wcwidth.c` is readable, i.e. the script is run from within `./src/`
do
  -- Runs a shell command and returns `success, status`.
  -- Lua 5.1 returns only a numeric status from `os.execute`, whereas
  -- Lua 5.2+ returns `true|nil, "exit"|"signal", code`; both shapes are handled.
  local function run(cmd)
    local ok, _, code = os.execute(cmd)
    if type(ok) == "number" then
      return ok == 0, ok
    end
    return ok == true, code
  end

  local curl_ok, ec = run("curl --version > /dev/null 2>&1")
  if not curl_ok then
    error("curl is not available in the path; exitcode " .. tostring(ec))
  end

  local pl_ok, utils = pcall(require, "pl.utils")
  if not pl_ok then
    error("Penlight is not available, please install via `luarocks install penlight`")
  end

  -- test the result of the read itself (not the stale result of the pcall above)
  local contents = utils.readfile("./wcwidth.c")
  if not contents then
    error("failed to read './wcwidth.c', run this script from within the `./src/` directory")
  end
end
27
-- Indices identifying each of the unicode data files this script needs.
local FN_DERIVED_GENERAL_CATEGORY, FN_EAST_ASIAN_WIDTH = 1, 2
local FN_DERIVED_CORE_PROPERTIES, FN_EMOJI_DATA = 3, 4

-- The files to download from the unicode site, as paths relative to the
-- download base url; the same relative paths are used below `target_path`.
local download_file_list = {}
download_file_list[FN_DERIVED_GENERAL_CATEGORY] = "extracted/DerivedGeneralCategory.txt"
download_file_list[FN_EAST_ASIAN_WIDTH] = "EastAsianWidth.txt"
download_file_list[FN_DERIVED_CORE_PROPERTIES] = "DerivedCoreProperties.txt"
download_file_list[FN_EMOJI_DATA] = "emoji/emoji-data.txt"

-- Local directory receiving the downloaded files (must include trailing slash).
local target_path = "./unicode_data/"
41
42
43
-- Download a fresh copy of every file in `download_file_list` into `target_path`.
do
  local base_url = "https://www.unicode.org/Public/" .. VERSION .. "/ucd/" -- must include trailing slash


  -- Runs a shell command and raises an error if it fails.
  -- Lua 5.1 returns only a numeric status from `os.execute`, whereas
  -- Lua 5.2+ returns `true|nil, "exit"|"signal", code`; both shapes are handled,
  -- so a failure is detected (and its status reported) on either version.
  local function run(cmd)
    local ok, _, code = os.execute(cmd)
    if type(ok) == "number" then
      ok, code = (ok == 0), ok
    end
    if not ok then
      error("Failed to execute: " .. cmd .. "; exitcode " .. tostring(code))
    end
  end


  -- removes a file, and then downloads a new copy from the unicode site
  -- @tparam string filename path of the file, relative to `base_url`
  -- @tparam string target_filename local path to write the download to
  local function download_file(filename, target_filename)
    print("Downloading " .. filename .. " to " .. target_filename)
    os.remove(target_filename)
    run("curl --fail -s -o " .. target_filename .. " " .. base_url .. filename)
  end


  -- Downloads all unicode files we need, creating the target directories first
  local function download_files()
    run("mkdir -p " .. target_path .. "extracted")
    run("mkdir -p " .. target_path .. "emoji")
    for _, filename in ipairs(download_file_list) do
      download_file(filename, target_path .. filename)
    end
  end


  download_files()
end
72
73
74
-- The 3 lists of collected ranges, each entry a "START..END" hex string
-- (every code point not listed in any of them is single-width).
local zero_width, double_width, ambiguous_width = {}, {}, {}
79
80
81
local readlines do
  local pl_utils = require("pl.utils")

  -- true for comment lines (first non-blank character is "#") and for
  -- lines that are empty or contain whitespace only
  local function is_skippable(text)
    if text:match("^%s*#") then
      return true
    end
    return text:match("^%s*$") ~= nil
  end

  -- Reads a file and returns its lines, without the comment and blank lines.
  -- @tparam string filename the file to read
  -- @treturn table list of the remaining lines, in file order
  function readlines(filename)
    print("Parsing " .. filename)
    local raw = assert(pl_utils.readlines(filename))

    local kept = {}
    for _, text in ipairs(raw) do
      if not is_skippable(text) then
        kept[#kept + 1] = text
      end
    end

    return kept
  end
end
99
100
101
102
-- parse DerivedGeneralCategory.txt
-- Purpose: zero-width combining marks
-- Collected into `zero_width`:
--   Mn (Nonspacing Mark)
--   Me (Enclosing Mark)
-- Why:
--   These characters overlay the previous glyph
--   This replaces Markus Kuhn's combining[] table
-- Every other category in this file is ignored.
do
  local entries = readlines(target_path .. download_file_list[FN_DERIVED_GENERAL_CATEGORY])
  local count_before = #zero_width

  for _, entry in ipairs(entries) do
    -- entry format: "<codepoint>[..<codepoint>] ; <category>"
    local codepoints, general_category = entry:match("^([%x%.]+)%s*;%s*(%a+)")
    if codepoints == nil then
      error("Failed to parse line: " .. entry)
    end

    if general_category == "Mn" or general_category == "Me" then
      -- a single code point is stored as a range of one
      if not codepoints:find("..", 1, true) then
        codepoints = codepoints .. ".." .. codepoints
      end
      table.insert(zero_width, codepoints)
    end
  end

  print(" found " .. (#zero_width - count_before) .. " zero-width character-ranges")
end
134
135
136
-- parse DerivedCoreProperties.txt
-- Purpose: zero-width format / ignorable characters
-- Collected into `zero_width`:
--   Default_Ignorable_Code_Point
-- Includes (important examples):
--   U+200D ZERO WIDTH JOINER
--   U+200C ZERO WIDTH NON-JOINER
--   U+FE00..U+FE0F (variation selectors)
--   Bidi and other format controls
-- Why:
--   Not Mn/Me, but terminals treat them as zero-width
--   Required for emoji correctness and modern text
do
  local entries = readlines(target_path .. download_file_list[FN_DERIVED_CORE_PROPERTIES])
  local count_before = #zero_width

  for _, entry in ipairs(entries) do
    -- entry format: "<codepoint>[..<codepoint>] ; <property>"
    local codepoints, property = entry:match("^([%x%.]+)%s*;%s*([%a_]+)")
    if codepoints == nil then
      error("Failed to parse line: " .. entry)
    end

    if property == "Default_Ignorable_Code_Point" then
      -- a single code point is stored as a range of one
      if not codepoints:find("..", 1, true) then
        codepoints = codepoints .. ".." .. codepoints
      end
      table.insert(zero_width, codepoints)
    end
  end

  print(" found " .. (#zero_width - count_before) .. " zero-width character-ranges")
end
173
174
175
-- parse EastAsianWidth.txt
-- Purpose: determine double-width and ambiguous-width characters
-- Collected:
--   W (Wide)      -> `double_width`
--   F (Fullwidth) -> `double_width`
--   A (Ambiguous) -> `ambiguous_width` (1 or 2 columns, decided by the consumer)
-- Everything else (H, Na, N) is single-width and not recorded.
-- Why:
--   - This is the only Unicode-sanctioned width-related property
--   - Core of all wcwidth() implementations
do
  local entries = readlines(target_path .. download_file_list[FN_EAST_ASIAN_WIDTH])
  local double_before = #double_width
  local ambiguous_before = #ambiguous_width

  -- maps the width-type to the list that collects it
  local list_for = {
    W = double_width,
    F = double_width,
    A = ambiguous_width,
  }

  for _, entry in ipairs(entries) do
    -- entry format: "<codepoint>[..<codepoint>] ; <width-type>"
    local codepoints, width_type = entry:match("^([%x%.]+)%s*;%s*(%a+)")
    if codepoints == nil then
      error("Failed to parse line: " .. entry)
    end

    local list = list_for[width_type]
    if list then
      -- a single code point is stored as a range of one
      if not codepoints:find("..", 1, true) then
        codepoints = codepoints .. ".." .. codepoints
      end
      list[#list + 1] = codepoints
    end
  end

  print(" found " .. (#double_width - double_before) .. " double-width character-ranges")
  print(" found " .. (#ambiguous_width - ambiguous_before) .. " ambiguous-width character-ranges")
end
213
214
215
-- parse emoji-data.txt
-- Purpose: emoji presentation width
-- Collected into `double_width`:
--   entries whose property contains "Emoji_Presentation"
-- Not collected: Extended_Pictographic (optional, relevant for emoji sequences)
-- Why:
--   Emoji are not reliably covered by EastAsianWidth
--   Modern terminals render these as double-width
--   Required for correct emoji column alignment
do
  local entries = readlines(target_path .. download_file_list[FN_EMOJI_DATA])
  local count_before = #double_width

  for _, entry in ipairs(entries) do
    -- entry format: "<codepoint>[..<codepoint>] ; <property>"
    local codepoints, property = entry:match("^([%x%.]+)%s*;%s*([%a_]+)")
    if codepoints == nil then
      error("Failed to parse line: " .. entry)
    end

    -- plain substring search; the name holds no pattern-magic characters
    if property:find("Emoji_Presentation", 1, true) then
      -- a single code point is stored as a range of one
      if not codepoints:find("..", 1, true) then
        codepoints = codepoints .. ".." .. codepoints
      end
      table.insert(double_width, codepoints)
    end
  end

  print(" found " .. (#double_width - count_before) .. " double-width character-ranges")
end
247
248
249
-- returns the start and end of a range, both numerically and as hex strings
-- The range must have the exact format "<hex>..<hex>"; anything else
-- (missing "..", empty or non-hex parts, end before start) raises an error
-- starting with "Failed to parse range: ".
-- @tparam string range the range to parse, e.g. "25FD..25FE"
-- @treturn number sr the start of the range
-- @treturn number er the end of the range
-- @treturn string sh the start of the range as a hex string
-- @treturn string eh the end of the range as a hex string
local parse_range do
  function parse_range(range)
    -- anchored match; rejects malformed input up front, instead of failing
    -- later on comparing the nil that `tonumber` returns for bad hex
    local sh, eh = range:match("^(%x+)%.%.(%x+)$")
    local sr = sh and tonumber(sh, 16)
    local er = eh and tonumber(eh, 16)
    if not (sr and er) then
      error("Failed to parse range: " .. range)
    end
    if er < sr then
      error("Failed to parse range: " .. range .. " (end < start)")
    end
    return sr, er, sh, eh
  end

  -- some inline tests for parse_range
  local sr1, er1, sh1, eh1 = parse_range("25FD..25FE")
  assert(sr1 == 9725)
  assert(er1 == 9726)
  assert(sh1 == "25FD")
  assert(eh1 == "25FE")
  local sr2, er2 = parse_range("105C0..105F3")
  assert(sr2 == 67008)
  assert(er2 == 67059)
  -- malformed input must be rejected
  assert(not pcall(parse_range, "25FD"))
  assert(not pcall(parse_range, "25FD..XYZ"))
  assert(not pcall(parse_range, "25FE..25FD"))
end
280
281
282
-- Sorts the ranges in-place, ascending by the numeric start of each range.
-- @tparam table ranges list of "START..END" hex range strings
-- @treturn table the same (now sorted) list
local function sort_ranges(ranges)
  local function starts_before(lhs, rhs)
    local lhs_start = parse_range(lhs)
    local rhs_start = parse_range(rhs)
    return lhs_start < rhs_start
  end

  table.sort(ranges, starts_before)
  return ranges
end
290
291
292
-- Merges adjacent and overlapping ranges in-place.
-- Each range is only compared against the merged range directly before it,
-- so the list is expected to be ordered by range start already.
-- @tparam table ranges list of "START..END" hex range strings
local combine_ranges do
  function combine_ranges(ranges)
    local total = #ranges
    local write = 1 -- index of the merged range that is currently being extended

    for read = 2, total do
      local prev_lo, prev_hi, prev_lo_hex, prev_hi_hex = parse_range(ranges[write])
      local cur_lo, cur_hi, _, cur_hi_hex = parse_range(ranges[read])

      local mergeable = cur_lo >= prev_lo and cur_lo <= (prev_hi + 1)
      if not mergeable then
        -- gap between the ranges; start a new merged range
        write = write + 1
        ranges[write] = ranges[read]
      else
        -- adjacent or overlapping; keep whichever end reaches furthest
        local hi_hex = cur_hi_hex
        if prev_hi > cur_hi then
          hi_hex = prev_hi_hex
        end
        ranges[write] = prev_lo_hex .. ".." .. hi_hex
      end
    end

    -- drop the left-over entries beyond the last merged range
    for idx = total, write + 1, -1 do
      ranges[idx] = nil
    end
  end

  -- some inline tests for combine_ranges
  local sample = {
    "25FD..25FE",
    "25FD..25FE", -- duplicate range, should be removed
    "105C0..105F3",
    "105D0..105E0", -- range fully within previous range, should be combined
    "10F00..10F10",
    "10F11..10F20", -- adjacent to previous, should be combined
    "11000..11100",
    "11101..11110", -- adjacent + extending the previous, should be combined
    "12000..12010",
    "12011..12020", -- multiple: adjacent, should be combined
    "12015..12030", -- multiple: overlap + extending the previous, should be combined
    "12031..12040", -- multiple: adjacent, should be combined
  }
  combine_ranges(sample)
  assert(#sample == 5)
  assert(sample[1] == "25FD..25FE")
  assert(sample[2] == "105C0..105F3")
  assert(sample[3] == "10F00..10F20")
  assert(sample[4] == "11000..11110")
  assert(sample[5] == "12000..12040")
end
342
343
344
-- sort each list by range start, then merge its adjacent/overlapping ranges
for _, list in ipairs({ zero_width, double_width, ambiguous_width }) do
  combine_ranges(sort_ranges(list))
end
348
349
350
-- Rewrites every "START..END" entry, in-place, as a c-source-code range.
-- format: "{ 0x0829, 0x082D }"
-- @tparam table ranges list of "START..END" hex range strings
local function convert_c_ranges(ranges)
  for idx, range in ipairs(ranges) do
    local _, _, first_hex, last_hex = parse_range(range)
    ranges[idx] = string.format("{ 0x%s, 0x%s }", first_hex, last_hex)
  end
end

for _, list in ipairs({ zero_width, double_width, ambiguous_width }) do
  convert_c_ranges(list)
end
363
364
365
366local SOURCE_INDENT = " "
367
368
-- Formats the ranges as c source lines, as triplets; 3 ranges on 1 line.
-- Every line is prefixed with SOURCE_INDENT and ends with a comma, except
-- for the last line. An empty list of ranges yields an empty list of lines.
-- @tparam table ranges list of already formatted c ranges
-- @treturn table list of source lines
local function triplet_lines(ranges)
  local lines = {}
  for i = 1, #ranges, 3 do
    local last = math.min(i + 2, #ranges)
    lines[#lines + 1] = SOURCE_INDENT .. table.concat(ranges, ", ", i, last) .. ","
  end
  -- drop trailing comma from last line (there is no last line without ranges)
  if #lines > 0 then
    lines[#lines] = lines[#lines]:sub(1, -2)
  end
  return lines
end
379
380
-- Creates the contents of a generated source file; 4 comment lines (warning,
-- description, unicode version, generation date) followed by the ranges.
-- @tparam table ranges list of already formatted c ranges
-- @tparam string contains description of what the ranges represent
-- @treturn string the file contents, ending with a newline
local function create_file_contents(ranges, contains)
  local header = {
    "// Do not modify this file directly, it is generated by the wcwidth_update.lua script",
    "// Contains " .. contains,
    "// Generated from Unicode " .. VERSION,
    "// Generated on " .. os.date("%Y-%m-%d"),
  }

  local output = {}
  for _, text in ipairs(header) do
    output[#output + 1] = SOURCE_INDENT .. text
  end
  for _, text in ipairs(triplet_lines(ranges)) do
    output[#output + 1] = text
  end

  return table.concat(output, "\n") .. "\n"
end
390
391
392
393
local writefile = require("pl.utils").writefile

-- the generated files, in the order in which they are written
local outputs = {
  {
    banner = " zero-width: ./wcwidth_zero_width.c",
    path = "./wcwidth_zero_width.c",
    ranges = zero_width,
    contains = "unicode character-ranges handled as 0 width",
  },
  {
    banner = " double-width: ./wcwidth_double_width.c",
    path = "./wcwidth_double_width.c",
    ranges = double_width,
    contains = "unicode character-ranges handled as double width",
  },
  {
    banner = " ambiguous-width: ./wcwidth_ambiguous_width.c",
    path = "./wcwidth_ambiguous_width.c",
    ranges = ambiguous_width,
    contains = "unicode character-ranges handled as ambiguous (either 1 or 2 width)",
  },
}

print("writing source files...")
for _, output in ipairs(outputs) do
  print(output.banner)
  -- a failed write aborts the script
  assert(writefile(output.path, create_file_contents(output.ranges, output.contains)))
end
diff --git a/src/wcwidth_zero_width.c b/src/wcwidth_zero_width.c
new file mode 100644
index 0000000..579ca5f
--- /dev/null
+++ b/src/wcwidth_zero_width.c
@@ -0,0 +1,128 @@
1 // Do not modify this file directly, it is generated by the wcwidth_update.lua script
2 // Contains unicode character-ranges handled as 0 width
3 // Generated from Unicode 17.0.0
4 // Generated on 2026-01-29
5 { 0x00AD, 0x00AD }, { 0x0300, 0x036F }, { 0x0483, 0x0489 },
6 { 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 },
7 { 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0610, 0x061A },
8 { 0x061C, 0x061C }, { 0x064B, 0x065F }, { 0x0670, 0x0670 },
9 { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 }, { 0x06E7, 0x06E8 },
10 { 0x06EA, 0x06ED }, { 0x0711, 0x0711 }, { 0x0730, 0x074A },
11 { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x07FD, 0x07FD },
12 { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 },
13 { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x0897, 0x089F },
14 { 0x08CA, 0x08E1 }, { 0x08E3, 0x0902 }, { 0x093A, 0x093A },
15 { 0x093C, 0x093C }, { 0x0941, 0x0948 }, { 0x094D, 0x094D },
16 { 0x0951, 0x0957 }, { 0x0962, 0x0963 }, { 0x0981, 0x0981 },
17 { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD },
18 { 0x09E2, 0x09E3 }, { 0x09FE, 0x09FE }, { 0x0A01, 0x0A02 },
19 { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 },
20 { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 }, { 0x0A70, 0x0A71 },
21 { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC },
22 { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, { 0x0ACD, 0x0ACD },
23 { 0x0AE2, 0x0AE3 }, { 0x0AFA, 0x0AFF }, { 0x0B01, 0x0B01 },
24 { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 },
25 { 0x0B4D, 0x0B4D }, { 0x0B55, 0x0B56 }, { 0x0B62, 0x0B63 },
26 { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD },
27 { 0x0C00, 0x0C00 }, { 0x0C04, 0x0C04 }, { 0x0C3C, 0x0C3C },
28 { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D },
29 { 0x0C55, 0x0C56 }, { 0x0C62, 0x0C63 }, { 0x0C81, 0x0C81 },
30 { 0x0CBC, 0x0CBC }, { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 },
31 { 0x0CCC, 0x0CCD }, { 0x0CE2, 0x0CE3 }, { 0x0D00, 0x0D01 },
32 { 0x0D3B, 0x0D3C }, { 0x0D41, 0x0D44 }, { 0x0D4D, 0x0D4D },
33 { 0x0D62, 0x0D63 }, { 0x0D81, 0x0D81 }, { 0x0DCA, 0x0DCA },
34 { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 },
35 { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 },
36 { 0x0EB4, 0x0EBC }, { 0x0EC8, 0x0ECE }, { 0x0F18, 0x0F19 },
37 { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 },
38 { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
39 { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 },
40 { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A },
41 { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 },
42 { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 },
43 { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x115F, 0x1160 },
44 { 0x135D, 0x135F }, { 0x1712, 0x1714 }, { 0x1732, 0x1733 },
45 { 0x1752, 0x1753 }, { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 },
46 { 0x17B7, 0x17BD }, { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 },
47 { 0x17DD, 0x17DD }, { 0x180B, 0x180F }, { 0x1885, 0x1886 },
48 { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 }, { 0x1927, 0x1928 },
49 { 0x1932, 0x1932 }, { 0x1939, 0x193B }, { 0x1A17, 0x1A18 },
50 { 0x1A1B, 0x1A1B }, { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E },
51 { 0x1A60, 0x1A60 }, { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C },
52 { 0x1A73, 0x1A7C }, { 0x1A7F, 0x1A7F }, { 0x1AB0, 0x1ADD },
53 { 0x1AE0, 0x1AEB }, { 0x1B00, 0x1B03 }, { 0x1B34, 0x1B34 },
54 { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, { 0x1B42, 0x1B42 },
55 { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B81 }, { 0x1BA2, 0x1BA5 },
56 { 0x1BA8, 0x1BA9 }, { 0x1BAB, 0x1BAD }, { 0x1BE6, 0x1BE6 },
57 { 0x1BE8, 0x1BE9 }, { 0x1BED, 0x1BED }, { 0x1BEF, 0x1BF1 },
58 { 0x1C2C, 0x1C33 }, { 0x1C36, 0x1C37 }, { 0x1CD0, 0x1CD2 },
59 { 0x1CD4, 0x1CE0 }, { 0x1CE2, 0x1CE8 }, { 0x1CED, 0x1CED },
60 { 0x1CF4, 0x1CF4 }, { 0x1CF8, 0x1CF9 }, { 0x1DC0, 0x1DFF },
61 { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x206F },
62 { 0x20D0, 0x20F0 }, { 0x2CEF, 0x2CF1 }, { 0x2D7F, 0x2D7F },
63 { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D }, { 0x3099, 0x309A },
64 { 0x3164, 0x3164 }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D },
65 { 0xA69E, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 },
66 { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 },
67 { 0xA82C, 0xA82C }, { 0xA8C4, 0xA8C5 }, { 0xA8E0, 0xA8F1 },
68 { 0xA8FF, 0xA8FF }, { 0xA926, 0xA92D }, { 0xA947, 0xA951 },
69 { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 }, { 0xA9B6, 0xA9B9 },
70 { 0xA9BC, 0xA9BD }, { 0xA9E5, 0xA9E5 }, { 0xAA29, 0xAA2E },
71 { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 },
72 { 0xAA4C, 0xAA4C }, { 0xAA7C, 0xAA7C }, { 0xAAB0, 0xAAB0 },
73 { 0xAAB2, 0xAAB4 }, { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF },
74 { 0xAAC1, 0xAAC1 }, { 0xAAEC, 0xAAED }, { 0xAAF6, 0xAAF6 },
75 { 0xABE5, 0xABE5 }, { 0xABE8, 0xABE8 }, { 0xABED, 0xABED },
76 { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE2F },
77 { 0xFEFF, 0xFEFF }, { 0xFFA0, 0xFFA0 }, { 0xFFF0, 0xFFF8 },
78 { 0x101FD, 0x101FD }, { 0x102E0, 0x102E0 }, { 0x10376, 0x1037A },
79 { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
80 { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x10AE5, 0x10AE6 },
81 { 0x10D24, 0x10D27 }, { 0x10D69, 0x10D6D }, { 0x10EAB, 0x10EAC },
82 { 0x10EFA, 0x10EFF }, { 0x10F46, 0x10F50 }, { 0x10F82, 0x10F85 },
83 { 0x11001, 0x11001 }, { 0x11038, 0x11046 }, { 0x11070, 0x11070 },
84 { 0x11073, 0x11074 }, { 0x1107F, 0x11081 }, { 0x110B3, 0x110B6 },
85 { 0x110B9, 0x110BA }, { 0x110C2, 0x110C2 }, { 0x11100, 0x11102 },
86 { 0x11127, 0x1112B }, { 0x1112D, 0x11134 }, { 0x11173, 0x11173 },
87 { 0x11180, 0x11181 }, { 0x111B6, 0x111BE }, { 0x111C9, 0x111CC },
88 { 0x111CF, 0x111CF }, { 0x1122F, 0x11231 }, { 0x11234, 0x11234 },
89 { 0x11236, 0x11237 }, { 0x1123E, 0x1123E }, { 0x11241, 0x11241 },
90 { 0x112DF, 0x112DF }, { 0x112E3, 0x112EA }, { 0x11300, 0x11301 },
91 { 0x1133B, 0x1133C }, { 0x11340, 0x11340 }, { 0x11366, 0x1136C },
92 { 0x11370, 0x11374 }, { 0x113BB, 0x113C0 }, { 0x113CE, 0x113CE },
93 { 0x113D0, 0x113D0 }, { 0x113D2, 0x113D2 }, { 0x113E1, 0x113E2 },
94 { 0x11438, 0x1143F }, { 0x11442, 0x11444 }, { 0x11446, 0x11446 },
95 { 0x1145E, 0x1145E }, { 0x114B3, 0x114B8 }, { 0x114BA, 0x114BA },
96 { 0x114BF, 0x114C0 }, { 0x114C2, 0x114C3 }, { 0x115B2, 0x115B5 },
97 { 0x115BC, 0x115BD }, { 0x115BF, 0x115C0 }, { 0x115DC, 0x115DD },
98 { 0x11633, 0x1163A }, { 0x1163D, 0x1163D }, { 0x1163F, 0x11640 },
99 { 0x116AB, 0x116AB }, { 0x116AD, 0x116AD }, { 0x116B0, 0x116B5 },
100 { 0x116B7, 0x116B7 }, { 0x1171D, 0x1171D }, { 0x1171F, 0x1171F },
101 { 0x11722, 0x11725 }, { 0x11727, 0x1172B }, { 0x1182F, 0x11837 },
102 { 0x11839, 0x1183A }, { 0x1193B, 0x1193C }, { 0x1193E, 0x1193E },
103 { 0x11943, 0x11943 }, { 0x119D4, 0x119D7 }, { 0x119DA, 0x119DB },
104 { 0x119E0, 0x119E0 }, { 0x11A01, 0x11A0A }, { 0x11A33, 0x11A38 },
105 { 0x11A3B, 0x11A3E }, { 0x11A47, 0x11A47 }, { 0x11A51, 0x11A56 },
106 { 0x11A59, 0x11A5B }, { 0x11A8A, 0x11A96 }, { 0x11A98, 0x11A99 },
107 { 0x11B60, 0x11B60 }, { 0x11B62, 0x11B64 }, { 0x11B66, 0x11B66 },
108 { 0x11C30, 0x11C36 }, { 0x11C38, 0x11C3D }, { 0x11C3F, 0x11C3F },
109 { 0x11C92, 0x11CA7 }, { 0x11CAA, 0x11CB0 }, { 0x11CB2, 0x11CB3 },
110 { 0x11CB5, 0x11CB6 }, { 0x11D31, 0x11D36 }, { 0x11D3A, 0x11D3A },
111 { 0x11D3C, 0x11D3D }, { 0x11D3F, 0x11D45 }, { 0x11D47, 0x11D47 },
112 { 0x11D90, 0x11D91 }, { 0x11D95, 0x11D95 }, { 0x11D97, 0x11D97 },
113 { 0x11EF3, 0x11EF4 }, { 0x11F00, 0x11F01 }, { 0x11F36, 0x11F3A },
114 { 0x11F40, 0x11F40 }, { 0x11F42, 0x11F42 }, { 0x11F5A, 0x11F5A },
115 { 0x13440, 0x13440 }, { 0x13447, 0x13455 }, { 0x1611E, 0x16129 },
116 { 0x1612D, 0x1612F }, { 0x16AF0, 0x16AF4 }, { 0x16B30, 0x16B36 },
117 { 0x16F4F, 0x16F4F }, { 0x16F8F, 0x16F92 }, { 0x16FE4, 0x16FE4 },
118 { 0x1BC9D, 0x1BC9E }, { 0x1BCA0, 0x1BCA3 }, { 0x1CF00, 0x1CF2D },
119 { 0x1CF30, 0x1CF46 }, { 0x1D167, 0x1D169 }, { 0x1D173, 0x1D182 },
120 { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD }, { 0x1D242, 0x1D244 },
121 { 0x1DA00, 0x1DA36 }, { 0x1DA3B, 0x1DA6C }, { 0x1DA75, 0x1DA75 },
122 { 0x1DA84, 0x1DA84 }, { 0x1DA9B, 0x1DA9F }, { 0x1DAA1, 0x1DAAF },
123 { 0x1E000, 0x1E006 }, { 0x1E008, 0x1E018 }, { 0x1E01B, 0x1E021 },
124 { 0x1E023, 0x1E024 }, { 0x1E026, 0x1E02A }, { 0x1E08F, 0x1E08F },
125 { 0x1E130, 0x1E136 }, { 0x1E2AE, 0x1E2AE }, { 0x1E2EC, 0x1E2EF },
126 { 0x1E4EC, 0x1E4EF }, { 0x1E5EE, 0x1E5EF }, { 0x1E6E3, 0x1E6E3 },
127 { 0x1E6E6, 0x1E6E6 }, { 0x1E6EE, 0x1E6EF }, { 0x1E6F5, 0x1E6F5 },
128 { 0x1E8D0, 0x1E8D6 }, { 0x1E944, 0x1E94A }, { 0xE0000, 0xE0FFF }