aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAvi Halachmi (:avih) <avihpit@yahoo.com>2023-07-24 11:01:00 +0300
committerAvi Halachmi (:avih) <avihpit@yahoo.com>2024-03-29 17:26:13 +0300
commitc188a345a4d8fe453c4a06796e38d036106fc161 (patch)
tree7163015f86810439e67b03f728bf122af44b67a1
parent9e2482e93913a7de1f667720a7ac619fe6fdd723 (diff)
downloadbusybox-w32-c188a345a4d8fe453c4a06796e38d036106fc161.tar.gz
busybox-w32-c188a345a4d8fe453c4a06796e38d036106fc161.tar.bz2
busybox-w32-c188a345a4d8fe453c4a06796e38d036106fc161.zip
win32: unicode: use newer wcwidth by default
This commit adds a new wcwidth implementation at libbb/wcwidth_alt.c, and uses it instead of the existing implementation when compiling for Windows and CONFIG_LAST_SUPPORTED_WCHAR >= 0x30000 - which is the case with the Unicode configs/mingw64u_defconfig. The Windows-target condition keeps non-Windows builds unmodified, and the last supported wchar threshold is a semi-hack to allow switching between implementations without adding a new config option (the old code supports codepoints up to 0x2ffff). The new file wcwidth_alt.c was generated by a new script, scripts/mkwcwidth, which prints a wcwidth implementation using the latest Unicode data from a local clone of https://github.com/jquast/wcwidth . This repo is the main Python wcwidth implementation, and is maintained and up to date. Functional differences from the existing implementation: - Unicode 15.1.0 (latest) with the new version (about 450 ranges of wide and zero-width codepoints), compared to roughly Unicode 5.0 with the existing code (a nearly 20-year-old spec, about 150 ranges). The new spec includes, among others, various wide icons and emojis, which can now be edited correctly at the shell prompt, have correct alignment in 'ls', etc. - The old implementation returns -1 (non-printable) for surrogates, while the new code returns 1, though this is inconsequential, and POSIX doesn't care. Also, libc implementations vary in this regard. Technical differences: - The old version compiles less code/data when the last supported wchar is smaller, while the new version doesn't. This doesn't matter because the new version is enabled only for the full range. - The new version is smaller and relatively straightforward, and fully automated (generated), so updating to a newer spec is trivial. The old version mixes data, ad-hoc code (tailored to the data), and preprocessor checks, which makes updates hard to automate.
The old version has various forms of 32- and 16-bit data ranges, in several arrays, while the new version uses a single data array with a unified form of 32 bits per range, with two rules: - A data range can't span Unicode planes (enforced, but unlikely to be required, and if it is, code to split ranges would be simple). - A range can't hold more than 32768 codepoints, so bigger ranges are split automatically (currently there are 2 such ranges). Performance-wise, the new version should be faster, even with three times the data ranges. Both versions effectively do at most one binary search in the data of one Unicode plane, but the new version finds both zero-width and wide-width results in this one search, while the old version only finds zero-width, and to detect wide-width it does an additional linear series of manual range tests; since most results are width 1, this sequence is performed in most (non-ASCII) calls. In a cursory comparison of the new wcwidth with glibc and musl-libc (both use O(1) lookup tables), with a few bodies of text, we're in the same ballpark, with a typical speed of 60% or better. Bloat-wise, the new version is about 180 bytes of code and 1800 bytes of data. If it had a similar number of data ranges as the old code (150), the new version would be about 200 bytes smaller, but because the new version has 450 data ranges, it's about 1K bigger.
-rw-r--r--include/unicode.h4
-rw-r--r--libbb/unicode.c4
-rw-r--r--libbb/wcwidth_alt.c506
-rwxr-xr-xscripts/mkwcwidth169
4 files changed, 683 insertions, 0 deletions
diff --git a/include/unicode.h b/include/unicode.h
index e894f7148..cdf35acb7 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -33,7 +33,11 @@ enum {
33 33
34# if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR >= 0x30000 34# if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR >= 0x30000
35# undef CONFIG_LAST_SUPPORTED_WCHAR 35# undef CONFIG_LAST_SUPPORTED_WCHAR
36# if ENABLE_PLATFORM_MINGW32
37# define CONFIG_LAST_SUPPORTED_WCHAR 0x10ffff /* full unicode range */
38# else
36# define CONFIG_LAST_SUPPORTED_WCHAR 0x2ffff 39# define CONFIG_LAST_SUPPORTED_WCHAR 0x2ffff
40# endif
37# endif 41# endif
38 42
39# if CONFIG_LAST_SUPPORTED_WCHAR < 0x300 43# if CONFIG_LAST_SUPPORTED_WCHAR < 0x300
diff --git a/libbb/unicode.c b/libbb/unicode.c
index 206ec0dcb..a0b2db625 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -276,6 +276,7 @@ int FAST_FUNC iswpunct(wint_t wc)
276 return (unsigned)wc <= 0x7f && ispunct(wc); 276 return (unsigned)wc <= 0x7f && ispunct(wc);
277} 277}
278 278
279# if !ENABLE_PLATFORM_MINGW32 || CONFIG_LAST_SUPPORTED_WCHAR < 0x30000
279 280
280# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300 281# if CONFIG_LAST_SUPPORTED_WCHAR >= 0x300
281struct interval { 282struct interval {
@@ -711,6 +712,9 @@ int FAST_FUNC wcwidth(unsigned ucs)
711# endif /* >= 0x300 */ 712# endif /* >= 0x300 */
712} 713}
713 714
715# else /* ENABLE_PLATFORM_MINGW32 && CONFIG_LAST_SUPPORTED_WCHAR >= 0x30000 */
716# include "wcwidth_alt.c" /* simpler and more up-to-date implementation */
717# endif
714 718
715# if ENABLE_UNICODE_BIDI_SUPPORT 719# if ENABLE_UNICODE_BIDI_SUPPORT
716int FAST_FUNC unicode_bidi_isrtl(wint_t wc) 720int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
diff --git a/libbb/wcwidth_alt.c b/libbb/wcwidth_alt.c
new file mode 100644
index 000000000..9a45ab0e9
--- /dev/null
+++ b/libbb/wcwidth_alt.c
@@ -0,0 +1,506 @@
1/* wcwidth - Unicode 15.1.0, generated by scripts/mkwcwidth.
2 * Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
3 * License: MIT
4 *
5 * Data imported on 2024-03-29 from https://github.com/jquast/wcwidth
6 * commit 0.2.13-3-g056ee4b (2024-02-14 15:05:06 -0500)
7 */
8int FAST_FUNC wcwidth(uint32_t ucs)
9{
10 /* sorted ranges, "first" is clipped to 16 bit, and its high bits
11 * (plane) are deduced from the "planes" array below.
12 * (imported from table_zero.py and table_wide.py)
13 */
14 static const struct range {
15 uint16_t first;
16 uint16_t iswide: 1; /* bitfield order empirically faster */
17 uint16_t difflast: 15;
18 } ranges[] = {
19 #define R(first, last, width) {first & 0xffff, width/2, last-first}
20 R(0x000000, 0x000000, 0), /* nil */
21 R(0x0000ad, 0x0000ad, 0), /* Soft Hyphen */
22 R(0x000300, 0x00036f, 0), /* Combining Grave Accent ..Combining Latin Small Le */
23 R(0x000483, 0x000489, 0), /* Combining Cyrillic Titlo..Combining Cyrillic Milli */
24 R(0x000591, 0x0005bd, 0), /* Hebrew Accent Etnahta ..Hebrew Point Meteg */
25 R(0x0005bf, 0x0005bf, 0), /* Hebrew Point Rafe */
26 R(0x0005c1, 0x0005c2, 0), /* Hebrew Point Shin Dot ..Hebrew Point Sin Dot */
27 R(0x0005c4, 0x0005c5, 0), /* Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot */
28 R(0x0005c7, 0x0005c7, 0), /* Hebrew Point Qamats Qatan */
29 R(0x000600, 0x000605, 0), /* Arabic Number Sign ..Arabic Number Mark Above */
30 R(0x000610, 0x00061a, 0), /* Arabic Sign Sallallahou ..Arabic Small Kasra */
31 R(0x00061c, 0x00061c, 0), /* Arabic Letter Mark */
32 R(0x00064b, 0x00065f, 0), /* Arabic Fathatan ..Arabic Wavy Hamza Below */
33 R(0x000670, 0x000670, 0), /* Arabic Letter Superscript Alef */
34 R(0x0006d6, 0x0006dd, 0), /* Arabic Small High Ligatu..Arabic End Of Ayah */
35 R(0x0006df, 0x0006e4, 0), /* Arabic Small High Rounde..Arabic Small High Madda */
36 R(0x0006e7, 0x0006e8, 0), /* Arabic Small High Yeh ..Arabic Small High Noon */
37 R(0x0006ea, 0x0006ed, 0), /* Arabic Empty Centre Low ..Arabic Small Low Meem */
38 R(0x00070f, 0x00070f, 0), /* Syriac Abbreviation Mark */
39 R(0x000711, 0x000711, 0), /* Syriac Letter Superscript Alaph */
40 R(0x000730, 0x00074a, 0), /* Syriac Pthaha Above ..Syriac Barrekh */
41 R(0x0007a6, 0x0007b0, 0), /* Thaana Abafili ..Thaana Sukun */
42 R(0x0007eb, 0x0007f3, 0), /* Nko Combining Short High..Nko Combining Double Dot */
43 R(0x0007fd, 0x0007fd, 0), /* Nko Dantayalan */
44 R(0x000816, 0x000819, 0), /* Samaritan Mark In ..Samaritan Mark Dagesh */
45 R(0x00081b, 0x000823, 0), /* Samaritan Mark Epentheti..Samaritan Vowel Sign A */
46 R(0x000825, 0x000827, 0), /* Samaritan Vowel Sign Sho..Samaritan Vowel Sign U */
47 R(0x000829, 0x00082d, 0), /* Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa */
48 R(0x000859, 0x00085b, 0), /* Mandaic Affrication Mark..Mandaic Gemination Mark */
49 R(0x000890, 0x000891, 0), /* Arabic Pound Mark Above ..Arabic Piastre Mark Abov */
50 R(0x000898, 0x00089f, 0), /* Arabic Small High Word A..Arabic Half Madda Over M */
51 R(0x0008ca, 0x000903, 0), /* Arabic Small High Farsi ..Devanagari Sign Visarga */
52 R(0x00093a, 0x00093c, 0), /* Devanagari Vowel Sign Oe..Devanagari Sign Nukta */
53 R(0x00093e, 0x00094f, 0), /* Devanagari Vowel Sign Aa..Devanagari Vowel Sign Aw */
54 R(0x000951, 0x000957, 0), /* Devanagari Stress Sign U..Devanagari Vowel Sign Uu */
55 R(0x000962, 0x000963, 0), /* Devanagari Vowel Sign Vo..Devanagari Vowel Sign Vo */
56 R(0x000981, 0x000983, 0), /* Bengali Sign Candrabindu..Bengali Sign Visarga */
57 R(0x0009bc, 0x0009bc, 0), /* Bengali Sign Nukta */
58 R(0x0009be, 0x0009c4, 0), /* Bengali Vowel Sign Aa ..Bengali Vowel Sign Vocal */
59 R(0x0009c7, 0x0009c8, 0), /* Bengali Vowel Sign E ..Bengali Vowel Sign Ai */
60 R(0x0009cb, 0x0009cd, 0), /* Bengali Vowel Sign O ..Bengali Sign Virama */
61 R(0x0009d7, 0x0009d7, 0), /* Bengali Au Length Mark */
62 R(0x0009e2, 0x0009e3, 0), /* Bengali Vowel Sign Vocal..Bengali Vowel Sign Vocal */
63 R(0x0009fe, 0x0009fe, 0), /* Bengali Sandhi Mark */
64 R(0x000a01, 0x000a03, 0), /* Gurmukhi Sign Adak Bindi..Gurmukhi Sign Visarga */
65 R(0x000a3c, 0x000a3c, 0), /* Gurmukhi Sign Nukta */
66 R(0x000a3e, 0x000a42, 0), /* Gurmukhi Vowel Sign Aa ..Gurmukhi Vowel Sign Uu */
67 R(0x000a47, 0x000a48, 0), /* Gurmukhi Vowel Sign Ee ..Gurmukhi Vowel Sign Ai */
68 R(0x000a4b, 0x000a4d, 0), /* Gurmukhi Vowel Sign Oo ..Gurmukhi Sign Virama */
69 R(0x000a51, 0x000a51, 0), /* Gurmukhi Sign Udaat */
70 R(0x000a70, 0x000a71, 0), /* Gurmukhi Tippi ..Gurmukhi Addak */
71 R(0x000a75, 0x000a75, 0), /* Gurmukhi Sign Yakash */
72 R(0x000a81, 0x000a83, 0), /* Gujarati Sign Candrabind..Gujarati Sign Visarga */
73 R(0x000abc, 0x000abc, 0), /* Gujarati Sign Nukta */
74 R(0x000abe, 0x000ac5, 0), /* Gujarati Vowel Sign Aa ..Gujarati Vowel Sign Cand */
75 R(0x000ac7, 0x000ac9, 0), /* Gujarati Vowel Sign E ..Gujarati Vowel Sign Cand */
76 R(0x000acb, 0x000acd, 0), /* Gujarati Vowel Sign O ..Gujarati Sign Virama */
77 R(0x000ae2, 0x000ae3, 0), /* Gujarati Vowel Sign Voca..Gujarati Vowel Sign Voca */
78 R(0x000afa, 0x000aff, 0), /* Gujarati Sign Sukun ..Gujarati Sign Two-circle */
79 R(0x000b01, 0x000b03, 0), /* Oriya Sign Candrabindu ..Oriya Sign Visarga */
80 R(0x000b3c, 0x000b3c, 0), /* Oriya Sign Nukta */
81 R(0x000b3e, 0x000b44, 0), /* Oriya Vowel Sign Aa ..Oriya Vowel Sign Vocalic */
82 R(0x000b47, 0x000b48, 0), /* Oriya Vowel Sign E ..Oriya Vowel Sign Ai */
83 R(0x000b4b, 0x000b4d, 0), /* Oriya Vowel Sign O ..Oriya Sign Virama */
84 R(0x000b55, 0x000b57, 0), /* Oriya Sign Overline ..Oriya Au Length Mark */
85 R(0x000b62, 0x000b63, 0), /* Oriya Vowel Sign Vocalic..Oriya Vowel Sign Vocalic */
86 R(0x000b82, 0x000b82, 0), /* Tamil Sign Anusvara */
87 R(0x000bbe, 0x000bc2, 0), /* Tamil Vowel Sign Aa ..Tamil Vowel Sign Uu */
88 R(0x000bc6, 0x000bc8, 0), /* Tamil Vowel Sign E ..Tamil Vowel Sign Ai */
89 R(0x000bca, 0x000bcd, 0), /* Tamil Vowel Sign O ..Tamil Sign Virama */
90 R(0x000bd7, 0x000bd7, 0), /* Tamil Au Length Mark */
91 R(0x000c00, 0x000c04, 0), /* Telugu Sign Combining Ca..Telugu Sign Combining An */
92 R(0x000c3c, 0x000c3c, 0), /* Telugu Sign Nukta */
93 R(0x000c3e, 0x000c44, 0), /* Telugu Vowel Sign Aa ..Telugu Vowel Sign Vocali */
94 R(0x000c46, 0x000c48, 0), /* Telugu Vowel Sign E ..Telugu Vowel Sign Ai */
95 R(0x000c4a, 0x000c4d, 0), /* Telugu Vowel Sign O ..Telugu Sign Virama */
96 R(0x000c55, 0x000c56, 0), /* Telugu Length Mark ..Telugu Ai Length Mark */
97 R(0x000c62, 0x000c63, 0), /* Telugu Vowel Sign Vocali..Telugu Vowel Sign Vocali */
98 R(0x000c81, 0x000c83, 0), /* Kannada Sign Candrabindu..Kannada Sign Visarga */
99 R(0x000cbc, 0x000cbc, 0), /* Kannada Sign Nukta */
100 R(0x000cbe, 0x000cc4, 0), /* Kannada Vowel Sign Aa ..Kannada Vowel Sign Vocal */
101 R(0x000cc6, 0x000cc8, 0), /* Kannada Vowel Sign E ..Kannada Vowel Sign Ai */
102 R(0x000cca, 0x000ccd, 0), /* Kannada Vowel Sign O ..Kannada Sign Virama */
103 R(0x000cd5, 0x000cd6, 0), /* Kannada Length Mark ..Kannada Ai Length Mark */
104 R(0x000ce2, 0x000ce3, 0), /* Kannada Vowel Sign Vocal..Kannada Vowel Sign Vocal */
105 R(0x000cf3, 0x000cf3, 0), /* Kannada Sign Combining Anusvara Above Right */
106 R(0x000d00, 0x000d03, 0), /* Malayalam Sign Combining..Malayalam Sign Visarga */
107 R(0x000d3b, 0x000d3c, 0), /* Malayalam Sign Vertical ..Malayalam Sign Circular */
108 R(0x000d3e, 0x000d44, 0), /* Malayalam Vowel Sign Aa ..Malayalam Vowel Sign Voc */
109 R(0x000d46, 0x000d48, 0), /* Malayalam Vowel Sign E ..Malayalam Vowel Sign Ai */
110 R(0x000d4a, 0x000d4d, 0), /* Malayalam Vowel Sign O ..Malayalam Sign Virama */
111 R(0x000d57, 0x000d57, 0), /* Malayalam Au Length Mark */
112 R(0x000d62, 0x000d63, 0), /* Malayalam Vowel Sign Voc..Malayalam Vowel Sign Voc */
113 R(0x000d81, 0x000d83, 0), /* Sinhala Sign Candrabindu..Sinhala Sign Visargaya */
114 R(0x000dca, 0x000dca, 0), /* Sinhala Sign Al-lakuna */
115 R(0x000dcf, 0x000dd4, 0), /* Sinhala Vowel Sign Aela-..Sinhala Vowel Sign Ketti */
116 R(0x000dd6, 0x000dd6, 0), /* Sinhala Vowel Sign Diga Paa-pilla */
117 R(0x000dd8, 0x000ddf, 0), /* Sinhala Vowel Sign Gaett..Sinhala Vowel Sign Gayan */
118 R(0x000df2, 0x000df3, 0), /* Sinhala Vowel Sign Diga ..Sinhala Vowel Sign Diga */
119 R(0x000e31, 0x000e31, 0), /* Thai Character Mai Han-akat */
120 R(0x000e34, 0x000e3a, 0), /* Thai Character Sara I ..Thai Character Phinthu */
121 R(0x000e47, 0x000e4e, 0), /* Thai Character Maitaikhu..Thai Character Yamakkan */
122 R(0x000eb1, 0x000eb1, 0), /* Lao Vowel Sign Mai Kan */
123 R(0x000eb4, 0x000ebc, 0), /* Lao Vowel Sign I ..Lao Semivowel Sign Lo */
124 R(0x000ec8, 0x000ece, 0), /* Lao Tone Mai Ek ..Lao Yamakkan */
125 R(0x000f18, 0x000f19, 0), /* Tibetan Astrological Sig..Tibetan Astrological Sig */
126 R(0x000f35, 0x000f35, 0), /* Tibetan Mark Ngas Bzung Nyi Zla */
127 R(0x000f37, 0x000f37, 0), /* Tibetan Mark Ngas Bzung Sgor Rtags */
128 R(0x000f39, 0x000f39, 0), /* Tibetan Mark Tsa -phru */
129 R(0x000f3e, 0x000f3f, 0), /* Tibetan Sign Yar Tshes ..Tibetan Sign Mar Tshes */
130 R(0x000f71, 0x000f84, 0), /* Tibetan Vowel Sign Aa ..Tibetan Mark Halanta */
131 R(0x000f86, 0x000f87, 0), /* Tibetan Sign Lci Rtags ..Tibetan Sign Yang Rtags */
132 R(0x000f8d, 0x000f97, 0), /* Tibetan Subjoined Sign L..Tibetan Subjoined Letter */
133 R(0x000f99, 0x000fbc, 0), /* Tibetan Subjoined Letter..Tibetan Subjoined Letter */
134 R(0x000fc6, 0x000fc6, 0), /* Tibetan Symbol Padma Gdan */
135 R(0x00102b, 0x00103e, 0), /* Myanmar Vowel Sign Tall ..Myanmar Consonant Sign M */
136 R(0x001056, 0x001059, 0), /* Myanmar Vowel Sign Vocal..Myanmar Vowel Sign Vocal */
137 R(0x00105e, 0x001060, 0), /* Myanmar Consonant Sign M..Myanmar Consonant Sign M */
138 R(0x001062, 0x001064, 0), /* Myanmar Vowel Sign Sgaw ..Myanmar Tone Mark Sgaw K */
139 R(0x001067, 0x00106d, 0), /* Myanmar Vowel Sign Weste..Myanmar Sign Western Pwo */
140 R(0x001071, 0x001074, 0), /* Myanmar Vowel Sign Geba ..Myanmar Vowel Sign Kayah */
141 R(0x001082, 0x00108d, 0), /* Myanmar Consonant Sign S..Myanmar Sign Shan Counci */
142 R(0x00108f, 0x00108f, 0), /* Myanmar Sign Rumai Palaung Tone-5 */
143 R(0x00109a, 0x00109d, 0), /* Myanmar Sign Khamti Tone..Myanmar Vowel Sign Aiton */
144 R(0x001100, 0x00115f, 2), /* Hangul Choseong Kiyeok ..Hangul Choseong Filler */
145 R(0x001160, 0x0011ff, 0), /* Hangul Jungseong Filler ..Hangul Jongseong Ssangni */
146 R(0x00135d, 0x00135f, 0), /* Ethiopic Combining Gemin..Ethiopic Combining Gemin */
147 R(0x001712, 0x001715, 0), /* Tagalog Vowel Sign I ..Tagalog Sign Pamudpod */
148 R(0x001732, 0x001734, 0), /* Hanunoo Vowel Sign I ..Hanunoo Sign Pamudpod */
149 R(0x001752, 0x001753, 0), /* Buhid Vowel Sign I ..Buhid Vowel Sign U */
150 R(0x001772, 0x001773, 0), /* Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U */
151 R(0x0017b4, 0x0017d3, 0), /* Khmer Vowel Inherent Aq ..Khmer Sign Bathamasat */
152 R(0x0017dd, 0x0017dd, 0), /* Khmer Sign Atthacan */
153 R(0x00180b, 0x00180f, 0), /* Mongolian Free Variation..Mongolian Free Variation */
154 R(0x001885, 0x001886, 0), /* Mongolian Letter Ali Gal..Mongolian Letter Ali Gal */
155 R(0x0018a9, 0x0018a9, 0), /* Mongolian Letter Ali Gali Dagalga */
156 R(0x001920, 0x00192b, 0), /* Limbu Vowel Sign A ..Limbu Subjoined Letter W */
157 R(0x001930, 0x00193b, 0), /* Limbu Small Letter Ka ..Limbu Sign Sa-i */
158 R(0x001a17, 0x001a1b, 0), /* Buginese Vowel Sign I ..Buginese Vowel Sign Ae */
159 R(0x001a55, 0x001a5e, 0), /* Tai Tham Consonant Sign ..Tai Tham Consonant Sign */
160 R(0x001a60, 0x001a7c, 0), /* Tai Tham Sign Sakot ..Tai Tham Sign Khuen-lue */
161 R(0x001a7f, 0x001a7f, 0), /* Tai Tham Combining Cryptogrammic Dot */
162 R(0x001ab0, 0x001ace, 0), /* Combining Doubled Circum..Combining Latin Small Le */
163 R(0x001b00, 0x001b04, 0), /* Balinese Sign Ulu Ricem ..Balinese Sign Bisah */
164 R(0x001b34, 0x001b44, 0), /* Balinese Sign Rerekan ..Balinese Adeg Adeg */
165 R(0x001b6b, 0x001b73, 0), /* Balinese Musical Symbol ..Balinese Musical Symbol */
166 R(0x001b80, 0x001b82, 0), /* Sundanese Sign Panyecek ..Sundanese Sign Pangwisad */
167 R(0x001ba1, 0x001bad, 0), /* Sundanese Consonant Sign..Sundanese Consonant Sign */
168 R(0x001be6, 0x001bf3, 0), /* Batak Sign Tompi ..Batak Panongonan */
169 R(0x001c24, 0x001c37, 0), /* Lepcha Subjoined Letter ..Lepcha Sign Nukta */
170 R(0x001cd0, 0x001cd2, 0), /* Vedic Tone Karshana ..Vedic Tone Prenkha */
171 R(0x001cd4, 0x001ce8, 0), /* Vedic Sign Yajurvedic Mi..Vedic Sign Visarga Anuda */
172 R(0x001ced, 0x001ced, 0), /* Vedic Sign Tiryak */
173 R(0x001cf4, 0x001cf4, 0), /* Vedic Tone Candra Above */
174 R(0x001cf7, 0x001cf9, 0), /* Vedic Sign Atikrama ..Vedic Tone Double Ring A */
175 R(0x001dc0, 0x001dff, 0), /* Combining Dotted Grave A..Combining Right Arrowhea */
176 R(0x00200b, 0x00200f, 0), /* Zero Width Space ..Right-to-left Mark */
177 R(0x002028, 0x00202e, 0), /* Line Separator ..Right-to-left Override */
178 R(0x002060, 0x002064, 0), /* Word Joiner ..Invisible Plus */
179 R(0x002066, 0x00206f, 0), /* Left-to-right Isolate ..Nominal Digit Shapes */
180 R(0x0020d0, 0x0020f0, 0), /* Combining Left Harpoon A..Combining Asterisk Above */
181 R(0x00231a, 0x00231b, 2), /* Watch ..Hourglass */
182 R(0x002329, 0x00232a, 2), /* Left-pointing Angle Brac..Right-pointing Angle Bra */
183 R(0x0023e9, 0x0023ec, 2), /* Black Right-pointing Dou..Black Down-pointing Doub */
184 R(0x0023f0, 0x0023f0, 2), /* Alarm Clock */
185 R(0x0023f3, 0x0023f3, 2), /* Hourglass With Flowing Sand */
186 R(0x0025fd, 0x0025fe, 2), /* White Medium Small Squar..Black Medium Small Squar */
187 R(0x002614, 0x002615, 2), /* Umbrella With Rain Drops..Hot Beverage */
188 R(0x002648, 0x002653, 2), /* Aries ..Pisces */
189 R(0x00267f, 0x00267f, 2), /* Wheelchair Symbol */
190 R(0x002693, 0x002693, 2), /* Anchor */
191 R(0x0026a1, 0x0026a1, 2), /* High Voltage Sign */
192 R(0x0026aa, 0x0026ab, 2), /* Medium White Circle ..Medium Black Circle */
193 R(0x0026bd, 0x0026be, 2), /* Soccer Ball ..Baseball */
194 R(0x0026c4, 0x0026c5, 2), /* Snowman Without Snow ..Sun Behind Cloud */
195 R(0x0026ce, 0x0026ce, 2), /* Ophiuchus */
196 R(0x0026d4, 0x0026d4, 2), /* No Entry */
197 R(0x0026ea, 0x0026ea, 2), /* Church */
198 R(0x0026f2, 0x0026f3, 2), /* Fountain ..Flag In Hole */
199 R(0x0026f5, 0x0026f5, 2), /* Sailboat */
200 R(0x0026fa, 0x0026fa, 2), /* Tent */
201 R(0x0026fd, 0x0026fd, 2), /* Fuel Pump */
202 R(0x002705, 0x002705, 2), /* White Heavy Check Mark */
203 R(0x00270a, 0x00270b, 2), /* Raised Fist ..Raised Hand */
204 R(0x002728, 0x002728, 2), /* Sparkles */
205 R(0x00274c, 0x00274c, 2), /* Cross Mark */
206 R(0x00274e, 0x00274e, 2), /* Negative Squared Cross Mark */
207 R(0x002753, 0x002755, 2), /* Black Question Mark Orna..White Exclamation Mark O */
208 R(0x002757, 0x002757, 2), /* Heavy Exclamation Mark Symbol */
209 R(0x002795, 0x002797, 2), /* Heavy Plus Sign ..Heavy Division Sign */
210 R(0x0027b0, 0x0027b0, 2), /* Curly Loop */
211 R(0x0027bf, 0x0027bf, 2), /* Double Curly Loop */
212 R(0x002b1b, 0x002b1c, 2), /* Black Large Square ..White Large Square */
213 R(0x002b50, 0x002b50, 2), /* White Medium Star */
214 R(0x002b55, 0x002b55, 2), /* Heavy Large Circle */
215 R(0x002cef, 0x002cf1, 0), /* Coptic Combining Ni Abov..Coptic Combining Spiritu */
216 R(0x002d7f, 0x002d7f, 0), /* Tifinagh Consonant Joiner */
217 R(0x002de0, 0x002dff, 0), /* Combining Cyrillic Lette..Combining Cyrillic Lette */
218 R(0x002e80, 0x002e99, 2), /* Cjk Radical Repeat ..Cjk Radical Rap */
219 R(0x002e9b, 0x002ef3, 2), /* Cjk Radical Choke ..Cjk Radical C-simplified */
220 R(0x002f00, 0x002fd5, 2), /* Kangxi Radical One ..Kangxi Radical Flute */
221 R(0x002ff0, 0x003029, 2), /* Ideographic Description ..Hangzhou Numeral Nine */
222 R(0x00302a, 0x00302f, 0), /* Ideographic Level Tone M..Hangul Double Dot Tone M */
223 R(0x003030, 0x00303e, 2), /* Wavy Dash ..Ideographic Variation In */
224 R(0x003041, 0x003096, 2), /* Hiragana Letter Small A ..Hiragana Letter Small Ke */
225 R(0x003099, 0x00309a, 0), /* Combining Katakana-hirag..Combining Katakana-hirag */
226 R(0x00309b, 0x0030ff, 2), /* Katakana-hiragana Voiced..Katakana Digraph Koto */
227 R(0x003105, 0x00312f, 2), /* Bopomofo Letter B ..Bopomofo Letter Nn */
228 R(0x003131, 0x00318e, 2), /* Hangul Letter Kiyeok ..Hangul Letter Araeae */
229 R(0x003190, 0x0031e3, 2), /* Ideographic Annotation L..Cjk Stroke Q */
230 R(0x0031ef, 0x00321e, 2), /* nil ..Parenthesized Korean Cha */
231 R(0x003220, 0x003247, 2), /* Parenthesized Ideograph ..Circled Ideograph Koto */
232 R(0x003250, 0x004dbf, 2), /* Partnership Sign ..Cjk Unified Ideograph-4d */
233 R(0x004e00, 0x00a48c, 2), /* Cjk Unified Ideograph-4e..Yi Syllable Yyr */
234 R(0x00a490, 0x00a4c6, 2), /* Yi Radical Qot ..Yi Radical Ke */
235 R(0x00a66f, 0x00a672, 0), /* Combining Cyrillic Vzmet..Combining Cyrillic Thous */
236 R(0x00a674, 0x00a67d, 0), /* Combining Cyrillic Lette..Combining Cyrillic Payer */
237 R(0x00a69e, 0x00a69f, 0), /* Combining Cyrillic Lette..Combining Cyrillic Lette */
238 R(0x00a6f0, 0x00a6f1, 0), /* Bamum Combining Mark Koq..Bamum Combining Mark Tuk */
239 R(0x00a802, 0x00a802, 0), /* Syloti Nagri Sign Dvisvara */
240 R(0x00a806, 0x00a806, 0), /* Syloti Nagri Sign Hasanta */
241 R(0x00a80b, 0x00a80b, 0), /* Syloti Nagri Sign Anusvara */
242 R(0x00a823, 0x00a827, 0), /* Syloti Nagri Vowel Sign ..Syloti Nagri Vowel Sign */
243 R(0x00a82c, 0x00a82c, 0), /* Syloti Nagri Sign Alternate Hasanta */
244 R(0x00a880, 0x00a881, 0), /* Saurashtra Sign Anusvara..Saurashtra Sign Visarga */
245 R(0x00a8b4, 0x00a8c5, 0), /* Saurashtra Consonant Sig..Saurashtra Sign Candrabi */
246 R(0x00a8e0, 0x00a8f1, 0), /* Combining Devanagari Dig..Combining Devanagari Sig */
247 R(0x00a8ff, 0x00a8ff, 0), /* Devanagari Vowel Sign Ay */
248 R(0x00a926, 0x00a92d, 0), /* Kayah Li Vowel Ue ..Kayah Li Tone Calya Plop */
249 R(0x00a947, 0x00a953, 0), /* Rejang Vowel Sign I ..Rejang Virama */
250 R(0x00a960, 0x00a97c, 2), /* Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo */
251 R(0x00a980, 0x00a983, 0), /* Javanese Sign Panyangga ..Javanese Sign Wignyan */
252 R(0x00a9b3, 0x00a9c0, 0), /* Javanese Sign Cecak Telu..Javanese Pangkon */
253 R(0x00a9e5, 0x00a9e5, 0), /* Myanmar Sign Shan Saw */
254 R(0x00aa29, 0x00aa36, 0), /* Cham Vowel Sign Aa ..Cham Consonant Sign Wa */
255 R(0x00aa43, 0x00aa43, 0), /* Cham Consonant Sign Final Ng */
256 R(0x00aa4c, 0x00aa4d, 0), /* Cham Consonant Sign Fina..Cham Consonant Sign Fina */
257 R(0x00aa7b, 0x00aa7d, 0), /* Myanmar Sign Pao Karen T..Myanmar Sign Tai Laing T */
258 R(0x00aab0, 0x00aab0, 0), /* Tai Viet Mai Kang */
259 R(0x00aab2, 0x00aab4, 0), /* Tai Viet Vowel I ..Tai Viet Vowel U */
260 R(0x00aab7, 0x00aab8, 0), /* Tai Viet Mai Khit ..Tai Viet Vowel Ia */
261 R(0x00aabe, 0x00aabf, 0), /* Tai Viet Vowel Am ..Tai Viet Tone Mai Ek */
262 R(0x00aac1, 0x00aac1, 0), /* Tai Viet Tone Mai Tho */
263 R(0x00aaeb, 0x00aaef, 0), /* Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign */
264 R(0x00aaf5, 0x00aaf6, 0), /* Meetei Mayek Vowel Sign ..Meetei Mayek Virama */
265 R(0x00abe3, 0x00abea, 0), /* Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign */
266 R(0x00abec, 0x00abed, 0), /* Meetei Mayek Lum Iyek ..Meetei Mayek Apun Iyek */
267 R(0x00ac00, 0x00d7a3, 2), /* Hangul Syllable Ga ..Hangul Syllable Hih */
268 R(0x00d7b0, 0x00d7ff, 0), /* Hangul Jungseong O-yeo .. nil */
269 R(0x00f900, 0x00faff, 2), /* Cjk Compatibility Ideogr.. nil */
270 R(0x00fb1e, 0x00fb1e, 0), /* Hebrew Point Judeo-spanish Varika */
271 R(0x00fe00, 0x00fe0f, 0), /* Variation Selector-1 ..Variation Selector-16 */
272 R(0x00fe10, 0x00fe19, 2), /* Presentation Form For Ve..Presentation Form For Ve */
273 R(0x00fe20, 0x00fe2f, 0), /* Combining Ligature Left ..Combining Cyrillic Titlo */
274 R(0x00fe30, 0x00fe52, 2), /* Presentation Form For Ve..Small Full Stop */
275 R(0x00fe54, 0x00fe66, 2), /* Small Semicolon ..Small Equals Sign */
276 R(0x00fe68, 0x00fe6b, 2), /* Small Reverse Solidus ..Small Commercial At */
277 R(0x00feff, 0x00feff, 0), /* Zero Width No-break Space */
278 R(0x00ff01, 0x00ff60, 2), /* Fullwidth Exclamation Ma..Fullwidth Right White Pa */
279 R(0x00ffe0, 0x00ffe6, 2), /* Fullwidth Cent Sign ..Fullwidth Won Sign */
280 R(0x00fff9, 0x00fffb, 0), /* Interlinear Annotation A..Interlinear Annotation T */
281 R(0x0101fd, 0x0101fd, 0), /* Phaistos Disc Sign Combining Oblique Stroke */
282 R(0x0102e0, 0x0102e0, 0), /* Coptic Epact Thousands Mark */
283 R(0x010376, 0x01037a, 0), /* Combining Old Permic Let..Combining Old Permic Let */
284 R(0x010a01, 0x010a03, 0), /* Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo */
285 R(0x010a05, 0x010a06, 0), /* Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O */
286 R(0x010a0c, 0x010a0f, 0), /* Kharoshthi Vowel Length ..Kharoshthi Sign Visarga */
287 R(0x010a38, 0x010a3a, 0), /* Kharoshthi Sign Bar Abov..Kharoshthi Sign Dot Belo */
288 R(0x010a3f, 0x010a3f, 0), /* Kharoshthi Virama */
289 R(0x010ae5, 0x010ae6, 0), /* Manichaean Abbreviation ..Manichaean Abbreviation */
290 R(0x010d24, 0x010d27, 0), /* Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas */
291 R(0x010eab, 0x010eac, 0), /* Yezidi Combining Hamza M..Yezidi Combining Madda M */
292 R(0x010efd, 0x010eff, 0), /* Arabic Small Low Word Sa..Arabic Small Low Word Ma */
293 R(0x010f46, 0x010f50, 0), /* Sogdian Combining Dot Be..Sogdian Combining Stroke */
294 R(0x010f82, 0x010f85, 0), /* Old Uyghur Combining Dot..Old Uyghur Combining Two */
295 R(0x011000, 0x011002, 0), /* Brahmi Sign Candrabindu ..Brahmi Sign Visarga */
296 R(0x011038, 0x011046, 0), /* Brahmi Vowel Sign Aa ..Brahmi Virama */
297 R(0x011070, 0x011070, 0), /* Brahmi Sign Old Tamil Virama */
298 R(0x011073, 0x011074, 0), /* Brahmi Vowel Sign Old Ta..Brahmi Vowel Sign Old Ta */
299 R(0x01107f, 0x011082, 0), /* Brahmi Number Joiner ..Kaithi Sign Visarga */
300 R(0x0110b0, 0x0110ba, 0), /* Kaithi Vowel Sign Aa ..Kaithi Sign Nukta */
301 R(0x0110bd, 0x0110bd, 0), /* Kaithi Number Sign */
302 R(0x0110c2, 0x0110c2, 0), /* Kaithi Vowel Sign Vocalic R */
303 R(0x0110cd, 0x0110cd, 0), /* Kaithi Number Sign Above */
304 R(0x011100, 0x011102, 0), /* Chakma Sign Candrabindu ..Chakma Sign Visarga */
305 R(0x011127, 0x011134, 0), /* Chakma Vowel Sign A ..Chakma Maayyaa */
306 R(0x011145, 0x011146, 0), /* Chakma Vowel Sign Aa ..Chakma Vowel Sign Ei */
307 R(0x011173, 0x011173, 0), /* Mahajani Sign Nukta */
308 R(0x011180, 0x011182, 0), /* Sharada Sign Candrabindu..Sharada Sign Visarga */
309 R(0x0111b3, 0x0111c0, 0), /* Sharada Vowel Sign Aa ..Sharada Sign Virama */
310 R(0x0111c9, 0x0111cc, 0), /* Sharada Sandhi Mark ..Sharada Extra Short Vowe */
311 R(0x0111ce, 0x0111cf, 0), /* Sharada Vowel Sign Prish..Sharada Sign Inverted Ca */
312 R(0x01122c, 0x011237, 0), /* Khojki Vowel Sign Aa ..Khojki Sign Shadda */
313 R(0x01123e, 0x01123e, 0), /* Khojki Sign Sukun */
314 R(0x011241, 0x011241, 0), /* Khojki Vowel Sign Vocalic R */
315 R(0x0112df, 0x0112ea, 0), /* Khudawadi Sign Anusvara ..Khudawadi Sign Virama */
316 R(0x011300, 0x011303, 0), /* Grantha Sign Combining A..Grantha Sign Visarga */
317 R(0x01133b, 0x01133c, 0), /* Combining Bindu Below ..Grantha Sign Nukta */
318 R(0x01133e, 0x011344, 0), /* Grantha Vowel Sign Aa ..Grantha Vowel Sign Vocal */
319 R(0x011347, 0x011348, 0), /* Grantha Vowel Sign Ee ..Grantha Vowel Sign Ai */
320 R(0x01134b, 0x01134d, 0), /* Grantha Vowel Sign Oo ..Grantha Sign Virama */
321 R(0x011357, 0x011357, 0), /* Grantha Au Length Mark */
322 R(0x011362, 0x011363, 0), /* Grantha Vowel Sign Vocal..Grantha Vowel Sign Vocal */
323 R(0x011366, 0x01136c, 0), /* Combining Grantha Digit ..Combining Grantha Digit */
324 R(0x011370, 0x011374, 0), /* Combining Grantha Letter..Combining Grantha Letter */
325 R(0x011435, 0x011446, 0), /* Newa Vowel Sign Aa ..Newa Sign Nukta */
326 R(0x01145e, 0x01145e, 0), /* Newa Sandhi Mark */
327 R(0x0114b0, 0x0114c3, 0), /* Tirhuta Vowel Sign Aa ..Tirhuta Sign Nukta */
328 R(0x0115af, 0x0115b5, 0), /* Siddham Vowel Sign Aa ..Siddham Vowel Sign Vocal */
329 R(0x0115b8, 0x0115c0, 0), /* Siddham Vowel Sign E ..Siddham Sign Nukta */
330 R(0x0115dc, 0x0115dd, 0), /* Siddham Vowel Sign Alter..Siddham Vowel Sign Alter */
331 R(0x011630, 0x011640, 0), /* Modi Vowel Sign Aa ..Modi Sign Ardhacandra */
332 R(0x0116ab, 0x0116b7, 0), /* Takri Sign Anusvara ..Takri Sign Nukta */
333 R(0x01171d, 0x01172b, 0), /* Ahom Consonant Sign Medi..Ahom Sign Killer */
334 R(0x01182c, 0x01183a, 0), /* Dogra Vowel Sign Aa ..Dogra Sign Nukta */
335 R(0x011930, 0x011935, 0), /* Dives Akuru Vowel Sign A..Dives Akuru Vowel Sign E */
336 R(0x011937, 0x011938, 0), /* Dives Akuru Vowel Sign A..Dives Akuru Vowel Sign O */
337 R(0x01193b, 0x01193e, 0), /* Dives Akuru Sign Anusvar..Dives Akuru Virama */
338 R(0x011940, 0x011940, 0), /* Dives Akuru Medial Ya */
339 R(0x011942, 0x011943, 0), /* Dives Akuru Medial Ra ..Dives Akuru Sign Nukta */
340 R(0x0119d1, 0x0119d7, 0), /* Nandinagari Vowel Sign A..Nandinagari Vowel Sign V */
341 R(0x0119da, 0x0119e0, 0), /* Nandinagari Vowel Sign E..Nandinagari Sign Virama */
342 R(0x0119e4, 0x0119e4, 0), /* Nandinagari Vowel Sign Prishthamatra E */
343 R(0x011a01, 0x011a0a, 0), /* Zanabazar Square Vowel S..Zanabazar Square Vowel L */
344 R(0x011a33, 0x011a39, 0), /* Zanabazar Square Final C..Zanabazar Square Sign Vi */
345 R(0x011a3b, 0x011a3e, 0), /* Zanabazar Square Cluster..Zanabazar Square Cluster */
346 R(0x011a47, 0x011a47, 0), /* Zanabazar Square Subjoiner */
347 R(0x011a51, 0x011a5b, 0), /* Soyombo Vowel Sign I ..Soyombo Vowel Length Mar */
348 R(0x011a8a, 0x011a99, 0), /* Soyombo Final Consonant ..Soyombo Subjoiner */
349 R(0x011c2f, 0x011c36, 0), /* Bhaiksuki Vowel Sign Aa ..Bhaiksuki Vowel Sign Voc */
350 R(0x011c38, 0x011c3f, 0), /* Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Virama */
351 R(0x011c92, 0x011ca7, 0), /* Marchen Subjoined Letter..Marchen Subjoined Letter */
352 R(0x011ca9, 0x011cb6, 0), /* Marchen Subjoined Letter..Marchen Sign Candrabindu */
353 R(0x011d31, 0x011d36, 0), /* Masaram Gondi Vowel Sign..Masaram Gondi Vowel Sign */
354 R(0x011d3a, 0x011d3a, 0), /* Masaram Gondi Vowel Sign E */
355 R(0x011d3c, 0x011d3d, 0), /* Masaram Gondi Vowel Sign..Masaram Gondi Vowel Sign */
356 R(0x011d3f, 0x011d45, 0), /* Masaram Gondi Vowel Sign..Masaram Gondi Virama */
357 R(0x011d47, 0x011d47, 0), /* Masaram Gondi Ra-kara */
358 R(0x011d8a, 0x011d8e, 0), /* Gunjala Gondi Vowel Sign..Gunjala Gondi Vowel Sign */
359 R(0x011d90, 0x011d91, 0), /* Gunjala Gondi Vowel Sign..Gunjala Gondi Vowel Sign */
360 R(0x011d93, 0x011d97, 0), /* Gunjala Gondi Vowel Sign..Gunjala Gondi Virama */
361 R(0x011ef3, 0x011ef6, 0), /* Makasar Vowel Sign I ..Makasar Vowel Sign O */
362 R(0x011f00, 0x011f01, 0), /* Kawi Sign Candrabindu ..Kawi Sign Anusvara */
363 R(0x011f03, 0x011f03, 0), /* Kawi Sign Visarga */
364 R(0x011f34, 0x011f3a, 0), /* Kawi Vowel Sign Aa ..Kawi Vowel Sign Vocalic */
365 R(0x011f3e, 0x011f42, 0), /* Kawi Vowel Sign E ..Kawi Conjoiner */
366 R(0x013430, 0x013440, 0), /* Egyptian Hieroglyph Vert..Egyptian Hieroglyph Mirr */
367 R(0x013447, 0x013455, 0), /* Egyptian Hieroglyph Modi..Egyptian Hieroglyph Modi */
368 R(0x016af0, 0x016af4, 0), /* Bassa Vah Combining High..Bassa Vah Combining High */
369 R(0x016b30, 0x016b36, 0), /* Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta */
370 R(0x016f4f, 0x016f4f, 0), /* Miao Sign Consonant Modifier Bar */
371 R(0x016f51, 0x016f87, 0), /* Miao Sign Aspiration ..Miao Vowel Sign Ui */
372 R(0x016f8f, 0x016f92, 0), /* Miao Tone Right ..Miao Tone Below */
373 R(0x016fe0, 0x016fe3, 2), /* Tangut Iteration Mark ..Old Chinese Iteration Ma */
374 R(0x016fe4, 0x016fe4, 0), /* Khitan Small Script Filler */
375 R(0x016ff0, 0x016ff1, 0), /* Vietnamese Alternate Rea..Vietnamese Alternate Rea */
376 R(0x017000, 0x0187f7, 2), /* nil */
377 R(0x018800, 0x018cd5, 2), /* Tangut Component-001 ..Khitan Small Script Char */
378 R(0x018d00, 0x018d08, 2), /* nil */
379 R(0x01aff0, 0x01aff3, 2), /* Katakana Letter Minnan T..Katakana Letter Minnan T */
380 R(0x01aff5, 0x01affb, 2), /* Katakana Letter Minnan T..Katakana Letter Minnan N */
381 R(0x01affd, 0x01affe, 2), /* Katakana Letter Minnan N..Katakana Letter Minnan N */
382 R(0x01b000, 0x01b122, 2), /* Katakana Letter Archaic ..Katakana Letter Archaic */
383 R(0x01b132, 0x01b132, 2), /* Hiragana Letter Small Ko */
384 R(0x01b150, 0x01b152, 2), /* Hiragana Letter Small Wi..Hiragana Letter Small Wo */
385 R(0x01b155, 0x01b155, 2), /* Katakana Letter Small Ko */
386 R(0x01b164, 0x01b167, 2), /* Katakana Letter Small Wi..Katakana Letter Small N */
387 R(0x01b170, 0x01b2fb, 2), /* Nushu Character-1b170 ..Nushu Character-1b2fb */
388 R(0x01bc9d, 0x01bc9e, 0), /* Duployan Thick Letter Se..Duployan Double Mark */
389 R(0x01bca0, 0x01bca3, 0), /* Shorthand Format Letter ..Shorthand Format Up Step */
390 R(0x01cf00, 0x01cf2d, 0), /* Znamenny Combining Mark ..Znamenny Combining Mark */
391 R(0x01cf30, 0x01cf46, 0), /* Znamenny Combining Tonal..Znamenny Priznak Modifie */
392 R(0x01d165, 0x01d169, 0), /* Musical Symbol Combining..Musical Symbol Combining */
393 R(0x01d16d, 0x01d182, 0), /* Musical Symbol Combining..Musical Symbol Combining */
394 R(0x01d185, 0x01d18b, 0), /* Musical Symbol Combining..Musical Symbol Combining */
395 R(0x01d1aa, 0x01d1ad, 0), /* Musical Symbol Combining..Musical Symbol Combining */
396 R(0x01d242, 0x01d244, 0), /* Combining Greek Musical ..Combining Greek Musical */
397 R(0x01da00, 0x01da36, 0), /* Signwriting Head Rim ..Signwriting Air Sucking */
398 R(0x01da3b, 0x01da6c, 0), /* Signwriting Mouth Closed..Signwriting Excitement */
399 R(0x01da75, 0x01da75, 0), /* Signwriting Upper Body Tilting From Hip Joints */
400 R(0x01da84, 0x01da84, 0), /* Signwriting Location Head Neck */
401 R(0x01da9b, 0x01da9f, 0), /* Signwriting Fill Modifie..Signwriting Fill Modifie */
402 R(0x01daa1, 0x01daaf, 0), /* Signwriting Rotation Mod..Signwriting Rotation Mod */
403 R(0x01e000, 0x01e006, 0), /* Combining Glagolitic Let..Combining Glagolitic Let */
404 R(0x01e008, 0x01e018, 0), /* Combining Glagolitic Let..Combining Glagolitic Let */
405 R(0x01e01b, 0x01e021, 0), /* Combining Glagolitic Let..Combining Glagolitic Let */
406 R(0x01e023, 0x01e024, 0), /* Combining Glagolitic Let..Combining Glagolitic Let */
407 R(0x01e026, 0x01e02a, 0), /* Combining Glagolitic Let..Combining Glagolitic Let */
408 R(0x01e08f, 0x01e08f, 0), /* Combining Cyrillic Small Letter Byelorussian-ukr */
409 R(0x01e130, 0x01e136, 0), /* Nyiakeng Puachue Hmong T..Nyiakeng Puachue Hmong T */
410 R(0x01e2ae, 0x01e2ae, 0), /* Toto Sign Rising Tone */
411 R(0x01e2ec, 0x01e2ef, 0), /* Wancho Tone Tup ..Wancho Tone Koini */
412 R(0x01e4ec, 0x01e4ef, 0), /* Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh */
413 R(0x01e8d0, 0x01e8d6, 0), /* Mende Kikakui Combining ..Mende Kikakui Combining */
414 R(0x01e944, 0x01e94a, 0), /* Adlam Alif Lengthener ..Adlam Nukta */
415 R(0x01f004, 0x01f004, 2), /* Mahjong Tile Red Dragon */
416 R(0x01f0cf, 0x01f0cf, 2), /* Playing Card Black Joker */
417 R(0x01f18e, 0x01f18e, 2), /* Negative Squared Ab */
418 R(0x01f191, 0x01f19a, 2), /* Squared Cl ..Squared Vs */
419 R(0x01f200, 0x01f202, 2), /* Square Hiragana Hoka ..Squared Katakana Sa */
420 R(0x01f210, 0x01f23b, 2), /* Squared Cjk Unified Ideo..Squared Cjk Unified Ideo */
421 R(0x01f240, 0x01f248, 2), /* Tortoise Shell Bracketed..Tortoise Shell Bracketed */
422 R(0x01f250, 0x01f251, 2), /* Circled Ideograph Advant..Circled Ideograph Accept */
423 R(0x01f260, 0x01f265, 2), /* Rounded Symbol For Fu ..Rounded Symbol For Cai */
424 R(0x01f300, 0x01f320, 2), /* Cyclone ..Shooting Star */
425 R(0x01f32d, 0x01f335, 2), /* Hot Dog ..Cactus */
426 R(0x01f337, 0x01f37c, 2), /* Tulip ..Baby Bottle */
427 R(0x01f37e, 0x01f393, 2), /* Bottle With Popping Cork..Graduation Cap */
428 R(0x01f3a0, 0x01f3ca, 2), /* Carousel Horse ..Swimmer */
429 R(0x01f3cf, 0x01f3d3, 2), /* Cricket Bat And Ball ..Table Tennis Paddle And */
430 R(0x01f3e0, 0x01f3f0, 2), /* House Building ..European Castle */
431 R(0x01f3f4, 0x01f3f4, 2), /* Waving Black Flag */
432 R(0x01f3f8, 0x01f3fa, 2), /* Badminton Racquet And Sh..Amphora */
433 R(0x01f3fb, 0x01f3ff, 0), /* Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri */
434 R(0x01f400, 0x01f43e, 2), /* Rat ..Paw Prints */
435 R(0x01f440, 0x01f440, 2), /* Eyes */
436 R(0x01f442, 0x01f4fc, 2), /* Ear ..Videocassette */
437 R(0x01f4ff, 0x01f53d, 2), /* Prayer Beads ..Down-pointing Small Red */
438 R(0x01f54b, 0x01f54e, 2), /* Kaaba ..Menorah With Nine Branch */
439 R(0x01f550, 0x01f567, 2), /* Clock Face One Oclock ..Clock Face Twelve-thirty */
440 R(0x01f57a, 0x01f57a, 2), /* Man Dancing */
441 R(0x01f595, 0x01f596, 2), /* Reversed Hand With Middl..Raised Hand With Part Be */
442 R(0x01f5a4, 0x01f5a4, 2), /* Black Heart */
443 R(0x01f5fb, 0x01f64f, 2), /* Mount Fuji ..Person With Folded Hands */
444 R(0x01f680, 0x01f6c5, 2), /* Rocket ..Left Luggage */
445 R(0x01f6cc, 0x01f6cc, 2), /* Sleeping Accommodation */
446 R(0x01f6d0, 0x01f6d2, 2), /* Place Of Worship ..Shopping Trolley */
447 R(0x01f6d5, 0x01f6d7, 2), /* Hindu Temple ..Elevator */
448 R(0x01f6dc, 0x01f6df, 2), /* Wireless ..Ring Buoy */
449 R(0x01f6eb, 0x01f6ec, 2), /* Airplane Departure ..Airplane Arriving */
450 R(0x01f6f4, 0x01f6fc, 2), /* Scooter ..Roller Skate */
451 R(0x01f7e0, 0x01f7eb, 2), /* Large Orange Circle ..Large Brown Square */
452 R(0x01f7f0, 0x01f7f0, 2), /* Heavy Equals Sign */
453 R(0x01f90c, 0x01f93a, 2), /* Pinched Fingers ..Fencer */
454 R(0x01f93c, 0x01f945, 2), /* Wrestlers ..Goal Net */
455 R(0x01f947, 0x01f9ff, 2), /* First Place Medal ..Nazar Amulet */
456 R(0x01fa70, 0x01fa7c, 2), /* Ballet Shoes ..Crutch */
457 R(0x01fa80, 0x01fa88, 2), /* Yo-yo ..Flute */
458 R(0x01fa90, 0x01fabd, 2), /* Ringed Planet ..Wing */
459 R(0x01fabf, 0x01fac5, 2), /* Goose ..Person With Crown */
460 R(0x01face, 0x01fadb, 2), /* Moose ..Pea Pod */
461 R(0x01fae0, 0x01fae8, 2), /* Melting Face ..Shaking Face */
462 R(0x01faf0, 0x01faf8, 2), /* Hand With Index Finger A..Rightwards Pushing Hand */
463 R(0x020000, 0x027fff, 2), /* Cjk Unified Ideograph-20.. nil */
464 R(0x028000, 0x02fffd, 2), /* (continued...) */
465 R(0x030000, 0x037fff, 2), /* Cjk Unified Ideograph-30.. nil */
466 R(0x038000, 0x03fffd, 2), /* (continued...) */
467 R(0x0e0001, 0x0e0001, 0), /* Language Tag */
468 R(0x0e0020, 0x0e007f, 0), /* Tag Space ..Cancel Tag */
469 R(0x0e0100, 0x0e01ef, 0), /* Variation Selector-17 ..Variation Selector-256 */
470 #undef R
471 };
472
473 /* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
474 static const uint16_t planes[/* 18 */] = {
475 0, 261, 443, 445, 447, 447, 447, 447, 447, 447, 447, 447,
476 447, 447, 447, 450, 450, 450,
477 };
478
479 /******* END OF STATIC DATA *******/
480
481 uint32_t p, bot, top;
482
483 /* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */
484 if (ucs < 160)
485 return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0;
486
487 /* out of range for "planes" (and non-unicode), non-characters. */
488 /* (some also test surrogate halves, but not required by POSIX) */
489 if (ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
490 return -1;
491
492 p = ucs >> 16;
493 ucs &= 0xffff;
494
495 for (bot = planes[p], top = planes[p+1]; bot < top; ) {
496 uint32_t mid = (bot + top) / 2;
497 if (ucs < ranges[mid].first)
498 top = mid;
499 else if (ucs > ranges[mid].first + ranges[mid].difflast)
500 bot = mid + 1;
501 else
502 return 2 * ranges[mid].iswide;
503 }
504
505 return 1;
506} /* wcwidth - Unicode 15.1.0 */
diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth
new file mode 100755
index 000000000..792045a29
--- /dev/null
+++ b/scripts/mkwcwidth
@@ -0,0 +1,169 @@
1#!/bin/sh
2#
3# Generate a C implementation of wcwidth, with latest unicode data
4# from a local clone of https://github.com/jquast/wcwidth
5#
6# The MIT License (MIT)
7#
8# Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
9#
10# Permission is hereby granted, free of charge, to any person obtaining a copy
11# of this software and associated documentation files (the "Software"), to deal
12# in the Software without restriction, including without limitation the rights
13# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14# copies of the Software, and to permit persons to whom the Software is
15# furnished to do so, subject to the following conditions:
16#
17# The above copyright notice and this permission notice shall be included in all
18# copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26# SOFTWARE.
27
# Force the C locale for every tool run below.
LC_ALL=C
export LC_ALL

# Script name without its directory, used as the prefix of messages.
self=${0##*/}

# C types used in the generated code. Bigger types work but waste memory,
# and the uintN_t names need <stdint.h>.
u32=uint32_t  # "unsigned" is also typically 32 bit
u16=uint16_t  # "unsigned short" is also typically 16 bits
FUNC_ATTR=FAST_FUNC  # delete this line if not generating a busybox function
35
36
# Print "<script name>: <message>" on stderr and exit with status 1.
# All arguments are joined into the message.
err() {
	printf '%s\n' "$self: $*" >&2
	exit 1
}
38
# -h / --help: print the usage text and stop successfully.
# The explicit exit matters: without it the script carries on, takes the
# help flag as the repository path and ends with a "missing ..." error.
case ${1-} in -h | --help)
	echo "Usage: $self [path/to/python-wcwidth] (default path is '.')"
	echo "Prints a wcwidth C implementation, with latest Unicode data"
	echo "imported from a local https://github.com/jquast/wcwidth repo."
	echo "Assumptions about table_zero.py and table_wide.py at the repo:"
	echo "- Each range is in one Unicode plane (a>>16 == b>>16) (enforced)."
	echo "- Commit 04d6d90c (2023-10-30) or later, where table_zero.py"
	echo " includes zero-width Cf chars (else need to add manual tests)."
	exit 0
esac
48
# Skip a leading "--" argument if present.
if [ "${1-}" = -- ]; then
	shift
fi

# Root of the local python-wcwidth clone; the current directory by default.
pwc_root=${1:-.}

# Run git with the given arguments inside the python-wcwidth clone.
pwc_git() {
	git -C "$pwc_root" "$@"
}

# The two python tables the C data is imported from.
zerowidth_py=$pwc_root/wcwidth/table_zero.py
widewidth_py=$pwc_root/wcwidth/table_wide.py

# Both tables must be readable, else there is nothing to generate from.
if [ ! -r "$zerowidth_py" ] || [ ! -r "$widewidth_py" ]; then
	err "missing $zerowidth_py or $widewidth_py. abort."
fi
59
# latest unicode version from table_wide.py (e.g. from " '10.0.0': (").
# The last line that starts with a quoted digit names the newest table.
# [[:space:]] is used instead of \s, which is a GNU extension to POSIX BRE.
ver=$(grep "^[[:space:]]*'[0-9]" < "$widewidth_py" | tail -n1 | sed "s/.*'\(.*\)'.*/\1/")
62
# stdin -> stdout: extract the data of the last table (latest spec) from
# wcwidth/table_{wide,zero}.py (from https://github.com/jquast/wcwidth)
#
# A line starting with a quoted digit opens a new table and discards what
# was collected so far; lines starting with "(0x" are collected as ranges.
# Only the ranges of the last table are printed, in their input order.
# [[:space:]] is used instead of \s, which POSIX awk does not define.
last_table() {
	awk "/^[[:space:]]*'[0-9]/ { i = 0 }          # new table -> reset
	     /^[[:space:]]*\(0x/ { arr[++i] = \$0 }   # range (first, last)
	     END { for (j = 1; j <= i; ++j) print arr[j] }"
}
70
# stdin -> stdout, $1 is the range's (wc)width (0 or 2), e.g.
#   from: (0x0123a, 0x0123c,), # comment
#   to  : R(0x00123a, 0x00123c, 2), /* comment */
# Ranges bigger than half-plane (32769+ codepoints) are split to two, so
# that last-first of each printed range fits in 15 bits.
# A range whose ends are in different planes is an error: err is called,
# which ends the loop with a non-zero status for the caller to check.
py_data_to_c() {
	sed -e 's/[(),]/ /g' -e 's|#\(.*\)|/*\1 */|' |
	while read -r a b c; do  # -r: keep any backslash in the comment as-is
		# to support cross-plane ranges, we'd need to split them here,
		# but unlikely required, as all planes end in non-characters.
		[ "$(($a >> 16))" = "$(($b >> 16))" ] || err "not same plane -- $a $b"

		a=$(($a)) b=$(($b))  # some shells want decimal vars in $(())
		if [ "$((b - a))" -ge 32768 ]; then  # split to 15 bit ranges
			printf 'R(0x%06x, 0x%06x, %s), %s\n' "$a" "$((a + 32767))" "$1" "$c"
			a=$((a + 32768)) c="/* (continued...) */"
		fi
		# the width is passed as data, not spliced into the format string
		printf 'R(0x%06x, 0x%06x, %s), %s\n' "$a" "$b" "$1" "$c"
	done
}
89
# Convert the latest zero-width table, then the latest wide table, to C
# initializer lines. A failing conversion aborts the script.
data=$(
	last_table < "$zerowidth_py" | py_data_to_c 0 &&
	last_table < "$widewidth_py" | py_data_to_c 2
) || err abort

# Merge both sets into one sorted list (lexicographic here is also numeric).
data=$(printf '%s\n' "$data" | sort)
93
# Print the contents of $data, one line per range:
# sorted hex ranges and their (wc)width: R(first, last, {0|2}),[ /* ... */]
data() {
	printf '%s\n' "$data"
}
96
# repeat STRING COUNT: print STRING COUNT times, with no separator and no
# trailing newline. Prints nothing when COUNT is 0 or negative.
repeat() {
	R=$2
	while [ "$R" -gt 0 ]; do
		printf %s "$1"
		R=$((R - 1))
	done
}
98
# data -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
#
# Reads the sorted range lines on stdin. The running line index is printed
# once for every plane boundary crossed at a line, so a plane with no ranges
# gets the same index as the next plane. After the last line the total count
# fills the remaining entries, up to and including index 17.
mkplanes() {
	i=0
	lastp=-1
	while read a b c; do
		last=${b%?}         # second field minus its final character
		p=$(($last >> 16))  # plane (last >> 16)
		repeat "$i, " "$((p - lastp))"
		lastp=$p
		i=$((i + 1))
	done
	repeat "$i, " "$((17 - lastp))"
}
109
# stdin -> stdout: prefix every line with two tabs, and also trim trailing
# spaces. [[:space:]] is used instead of \s, which is a GNU sed extension.
indent() {
	sed -e 's/^/\t\t/' -e 's/[[:space:]]*$//'
}
111
# Print the generated C function. The here-document is unquoted, so the
# shell expands $ver, $u32, $u16, ${FUNC_ATTR-} and the $(...) commands.
# - struct range uses $u16, like the planes array, so that changing the
#   type at the top of this script affects all the generated 16 bit fields.
# - the import date uses an explicit format, as "date -I" is not POSIX.
# - the arguments of the R() macro are parenthesized.
cat << CFUNCTION
/* wcwidth - Unicode $ver, generated by $0.
 * Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
 * License: MIT
 *
 * Data imported on $(date -u +%Y-%m-%d) from https://github.com/jquast/wcwidth
 * commit $(pwc_git describe --tags) ($(pwc_git show --no-patch --format=%ci))
 */
int ${FUNC_ATTR-} wcwidth($u32 ucs)
{
	/* sorted ranges, "first" is clipped to 16 bit, and its high bits
	 * (plane) are deduced from the "planes" array below.
	 * (imported from ${zerowidth_py##*/} and ${widewidth_py##*/})
	 */
	static const struct range {
		$u16 first;
		$u16 iswide: 1;  /* bitfield order empirically faster */
		$u16 difflast: 15;
	} ranges[] = {
		#define R(first, last, width) {(first) & 0xffff, (width) / 2, (last) - (first)}
$(data | indent)
		#undef R
	};

	/* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
	static const $u16 planes[/* 18 */] = {
$(data | mkplanes | fold -s -w 60 | indent)
	};

	/******* END OF STATIC DATA *******/

	$u32 p, bot, top;

	/* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */
	if (ucs < 160)
		return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0;

	/* out of range for "planes" (and non-unicode), non-characters. */
	/* (some also test surrogate halves, but not required by POSIX) */
	if (ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
		return -1;

	p = ucs >> 16;
	ucs &= 0xffff;

	for (bot = planes[p], top = planes[p+1]; bot < top; ) {
		$u32 mid = (bot + top) / 2;
		if (ucs < ranges[mid].first)
			top = mid;
		else if (ucs > ranges[mid].first + ranges[mid].difflast)
			bot = mid + 1;
		else
			return 2 * ranges[mid].iswide;
	}

	return 1;
} /* wcwidth - Unicode $ver */
CFUNCTION