aboutsummaryrefslogtreecommitdiff
path: root/scripts/mkwcwidth
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/mkwcwidth')
-rwxr-xr-xscripts/mkwcwidth360
1 files changed, 360 insertions, 0 deletions
diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth
new file mode 100755
index 000000000..515128abb
--- /dev/null
+++ b/scripts/mkwcwidth
@@ -0,0 +1,360 @@
1#!/bin/sh
2#
3# Generate a wcwidth C implementation from Unicode data (tested v7 - v17)
4#
5# The MIT License (MIT)
6#
7# Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
8#
9# Permission is hereby granted, free of charge, to any person obtaining a copy
10# of this software and associated documentation files (the "Software"), to deal
11# in the Software without restriction, including without limitation the rights
12# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the Software is
14# furnished to do so, subject to the following conditions:
15#
16# The above copyright notice and this permission notice shall be included in all
17# copies or substantial portions of the Software.
18#
19# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25# SOFTWARE.
26
27
28# Latest Unicode data source files:
29# https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
30# https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
31#
32# License and term of use in either URL:
33# http://www.unicode.org/copyright.html
34# http://www.unicode.org/terms_of_use.html
35
36
37export LC_ALL=C
38self=${0##*/}
39awk=${AWK:-awk} sed=${SED:-sed}
40
41URL_base=https://www.unicode.org/Public/UCD/latest/ucd
42URL_verbase=https://www.unicode.org/Public/%s/ucd
43ud_file=UnicodeData.txt
44eaw_file=EastAsianWidth.txt
45
46# int by default (int is 32 bit and short is 16 on unix/linux/osx/windows, and
47# doesn't require stdint.h - https://en.cppreference.com/w/cpp/language/types)
48u32=${U32:-unsigned} # or uint32_t
49u16=${U16:-unsigned short} # or uint16_t
50
51err() { >&2 printf %s\\n "$self: $*"; exit 1; }
52
53case ${1-} in -h | --help)
54 echo "Usage: $self [DL=VERSION] [FATTR]"
55 echo "Print a wcwidth C implementation to stdout."
56 echo
57 echo "Uses the files ./$ud_file and ./$eaw_file ."
58 echo "If the files are missing, the latest will be downloaded."
59 echo
60 echo "If DL=XXX is provided, force-download + overwrite, where XXX is:"
61 echo "- 'latest' -> $URL_base/..."
62 echo "- 'draft', '16.0.0' etc -> $(printf "$URL_verbase" XXX)/..."
63 echo
64 echo "If given, FATTR will be inserted as 'int FATTR wcwidth(...) {...}'."
65 echo "Optional env vars:"
66 echo ' $FN Function name to generate. Default: wcwidth.'
67 echo ' $U32, $U16 unsigned codepoint, u16 C-types. Default: int/short.'
68 echo ' $AWK, $SED used programs (word-split, e.g. AWK="busybox awk").'
69 echo
70 echo 'Requires: sh, curl/wget, sed, awk, and few more POSIX utilities.'
71 exit
72esac
73[ "${1-}" = -- ] && shift
74
75DL=
76case ${1-} in DL=*) DL=${1#DL=}; shift; esac
77[ "${DL-}" ] && [ "$DL" != latest ] && URL_base=$(printf "$URL_verbase" "$DL")
78
79FUNC_ATTR=${1:+$1 }
80
81url2file() { # wget errors on 404-not-found etc, curl requires -f for that
82 >&2 echo "[$1 -> $2]"
83 rm -f -- "$2.tmp"
84 { { [ "$(which wget)" ] && wget -O "$2.tmp" "$1"; } ||
85 { [ "$(which curl)" ] && curl -f -o "$2.tmp" "$1"; }
86 } && mv -- "$2.tmp" "$2"
87}
88
89[ -r "$ud_file" ] && [ -r "$eaw_file" ] && [ -z "$DL" ] || {
90 url2file "$URL_base/$ud_file" "$ud_file" &&
91 url2file "$URL_base/$eaw_file" "$eaw_file" ||
92 err "can't download $ud_file and/or $eaw_file. abort."
93}
94
95
96# extractors of zero-width and wide-width ranges from the Unicode data files.
97#
98# At https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - the mother of most current
99# wcwidth implementations, by Markus Kuhn (Unicode 5.0) - the width is:
100# - 0 for codepoint 0.
101# - -1 for control ranges C0/C1, and DEL (1-31, 128-159, 127)
102# - 0 if U+200B, cat is Me|Mn|Cf but not U+00AD, Hangul Jamo U+1160..U+11FF.
103# - else 2 if East_Asian_Width property is F|W (fullwidth, wide).
104# - else 1.
105#
106# We add as 0-width:
107# - Category Mc (combining, diacritics[-like])
108# - Hangul Jamo Extended-B U+D7B0..U+D7FF (combining, wasn't in Unicode 5.0)
109# - Emoji modifiers U+1F3FB..U+1F3FF (wasn't in Unicode 5.0)
110#
111# Hangul Jamo, Emoji mods are identified at the data automatically by name.
112# U+200B is category Cf since at least 5.0.0 - no need for manual override.
113#
114# Python wcwidth https://github.com/jquast/wcwidth is like us, plus 0-width:
115# - Cat Zl,Zp (2 codepoints: line/para. sep.), but terminals disagree on them.
116# - Few unassigned codepoints at Hangul Jamo Extended-B - it adds the range
117# manually (includes unassigned), we get (assigned) codepoints at the data.
118# - (U+00AD was 0, fixed to 1 after https://github.com/jquast/wcwidth/issues/8)
119#
120#
121# UnicodeData.txt is lines of:
122# CODEPOINT;NAME;CATEGORY;(...more) where CODEPOINT is 4-6 hex digits
123# or pairs of:
124# CODEPOINT;<XNAME, First>;CATEGORY;(...more) where XNAME is group-ish name
125# CODEPOINT;<XNAME, Last>;CATEGORY;(...more)
126#
127# EastAsianWidth.txt is lines of:
128# #... (comment)
129# or
130# CODEPOINT[..LAST];EAW # <stuff> (4-6 hex digits, maybe spaces around ";")
131# where EAW is East_Asian_Width property ("F", "W", "A", "N", etc)
132
133# replace First/Last pairs in UnicodeData with one line of FIRST..LAST range.
134# assume that First/Last are at consecutive lines, and XNAME/CAT don't change.
135# (no-op for us at Unicode v16 - doesn't add new wide/zero width codepoints)
136ud_file_ranges() {
137 $awk -F\; '$2 ~ /, First>$/ { printf $1 ".."; next }; 1' < "$ud_file"
138}
139
140# output unsorted/overlapping lines of hex "CODEPOINT[..LAST] WIDTH" (0/1/2)
141raw_ranges() {
142 # wide ranges according to the EAW property
143 $sed -e '/^#/d' -e 's/;/ /' < "$eaw_file" | # comment lines, ';'
144 $awk '$2 ~ /^[FW]$/ { print $1, 2 }'
145
146 # wide by name (older Unicode had EAW==N for some FULLWIDTH names)
147 ud_file_ranges | $awk -F\; '$2 ~ /FULLWIDTH/ { print $1, 2 }'
148
149 # zero-width ranges (override wide-width)
150 ud_file_ranges | $awk -F\; '
151 $3 ~ /^(Me|Mn|Mc|Cf)$/ ||
152 $2 ~ /EMOJI MODIFIER|HANGUL J[OU]NGSEONG/ { print $1, 0 }'
153
154 # override soft-hyphen as width 1
155 echo 00AD 1
156}
157
158# lines of hex "CODEPOINT[..LAST] WIDTH" -> decimal "FIRST LAST WIDTH"
159as_decimal_ranges() {
160 while read range width; do
161 echo $((0x${range%%.*})) $((0x${range##*.})) $width
162 done
163}
164
165# to generate the final ranges list, we need to sort the raw ranges list,
166# ensure zero-width overrides wide-width, join adjacent/overlap ranges,
167# and split big ranges - our C structure uses 15 bits to store LAST-FIRST of
168# each range, and ranges are not allowed to cross plane boundary.
169#
170# we use simple (and slow) approach: codepoint-bucket-store all the individual
171# widths at the input ranges as they come, then scan the whole Unicode range
172# (~ 1M buckets) and output continuous ranges of width 0/2. Therefore, later
173# inputs of codepoint X override earlier, so width 2 should arrive before W 0.
174#
175# there's ~3K input ranges of ~200K codepoints, and the output is 450+ ranges.
176# depending on sys/awk performance, it's 200ms - 2s, up to ~7s on low-end ARM.
177# MERGE_RANGES_C (bottom of this file) is drop-in replacement, runs in few ms.
178
179# sorted lines of "FIRST LAST WIDTH" (FIRST/LAST: hex digits, WIDTH: 0 or 2)
180ranges=$(raw_ranges | as_decimal_ranges | $awk '
181
182 # The final ~ 1M codepoints scan can take few secs, up to 30+s on low end.
183 # so use a simple optimization: divide the unicode range to segments which
184 # do not cross plane boundary (some 2^X, tuned for speed), mark segments
185 # where an input range can change the width at the final scan (its egdes),
186 # then skip unmarked segments at the final output scan, as these segs are
187 # guaranteed to not have any width change. this is effective (x5 speedup)
188 # because most segments are empty or dense. If the input was spread evenly
189 # then it would be more effective to store, sort and iterate exactly these
190 # edges. Currently though, both are same speed, and mark/skip is less code.
191
192 BEGIN { SEG = 128 } # 128 or 256 seem fastest
193 function mark(i) { segs[int(i/SEG)] = 1 }
194 function skip(i) { return !(int(i/SEG) in segs) }
195
196 # input range $1..$2: width may differ at $1 vs $1-1, and at $2+1 vs $2
197 { mark($1); mark($2+1) }
198
199 # store/overwrite width-1 (1 -> 0) for every codepoint in this range
200 { v=$3-1; for (i=$1; i<=$2; w[i++]=v); }
201
202 function min(a,b) { return a<b ? a : b }
203 END {
204 # scan the full unicode range, print continuous ranges of width 0/2
205 # the SEG skip is SEG-aligned. to disable: use unconditional i++
206 for (i = 0; i <= 1114112; skip(i) ? i+=SEG : i++) { # 0x110000
207 if (w[i] == wprev && i%65536) # no change of width or plane
208 continue
209 if (wprev) # range i0..i-1 is width 0 or 2 - print
210 for (; i0 < i; i0 += 32768) # split (once) if too big
211 printf "%06x %06x %d\n", i0, min(i0+32767, i-1), 1+wprev
212 i0 = i
213 wprev = w[i]
214 }
215 }
216')
217
218ranges() { printf %s\\n "$ranges"; }
219
220verify_ranges() {(
221 errv() { err "verify: bad range: $* -- (0x$ha, 0x$hb, $w)"; }
222 ishex() { case $1 in '' | *[!0-9A-Fa-f]* ) false; esac; } # LC_ALL=C
223
224 pa=-2 pb=-2 pw=x # previous range values
225 while read ha hb w; do
226 [ "$w" = 0 ] || [ "$w" = 2 ] || errv WIDTH is not 0 or 2
227 ishex "$ha" && ishex "$hb" || errv not hex values
228
229 a=$((0x$ha)) b=$((0x$hb)) # decimal
230 [ $a -le $b ] || errv "FIRST > LAST"
231 [ $b -le 1114111 ] || errv out of bounds
232 [ $((a>>16)) = $((b>>16)) ] || errv not same plane
233 [ $((b-a)) -lt 32768 ] || errv more than 0x8000 values
234 [ $a -gt $pb ] || errv overlap or not sorted
235
236 # detect adjacent ranges which combined are <= 0x8000 values.
237 # not detecting 3 adjacent 0x5000 ranges, but shouldn't happen
238 [ $w != $pw ] || [ $a != $((pb+1)) ] || [ $((b-pa)) -ge 32768 ] ||
239 errv small same-width adjacent ranges not combined
240
241 pa=$a pb=$b pw=$w
242 done
243)}
244
245
246as_c_ranges() {
247 while read a b w; do
248 echo "R(0x$a, 0x$b, $w),"
249 done
250}
251
252# ranges -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
253# (inclusive, exclussive. arr of 18 items - last plane (16) is a[16]..a[17])
254extract_plane_indices() {
255 # for each plane (0..16) print the 1st index at ranges where it appears
256 # or, if missing, index of the next found plane (ranges are same-plane)
257 i=0 pdone=-1 # index at ranges, plane whose index was last printed
258 while read a dummy; do # a: ranges[i].first (hex, plane(a) == a>>16)
259 while [ $pdone -lt $((0x$a >> 16)) ]; do
260 printf "$i, "
261 pdone=$((pdone+1))
262 done # pdone == plane(a)
263 i=$((i+1))
264 done
265 printf $i # one past last range. the rest are implicit-init as 0
266}
267
268indent() { $sed -e '2,$s/^/\t\t/' -e 's/\s*$//'; } # also trim trailing spaces
269
270# only EastAsianWidth.txt has version/date
271ver=$(head -n 1 < "$eaw_file" | $sed -e 's/.*-//' -e 's/.txt//')
272udate=$(head -n 2 < "$eaw_file" | tail -n 1 | $sed -e 's/.*Date: //')
273
274ranges | verify_ranges || exit
275
276# at the C code, p/bot/top/mid can be $idx_t, but faster as 32 bit types
277[ "$(ranges | wc -l)" -lt 65536 ] && idx_t=$u16 || idx_t=$u32
278
279cat << CFUNCTION
280/* wcwidth - Unicode $ver
281 * Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
282 * License: MIT
283 *
284 * Generated by $self on $(date -u -I) using the Unicode files
285 * $ud_file and $eaw_file ($udate)
286 */
287int ${FUNC_ATTR}${FN:-wcwidth}($u32 ucs)
288{
289 /* sorted ranges, "first" is clipped to 16 bit, and its high bits
290 * (plane) are deduced from the "planes" array below.
291 */
292 static const struct range { /* bitfield order empirically fast */
293 $u16 first: 16;
294 $u16 iswide: 1;
295 $u16 delta: 15;
296 } ranges[] = {
297 #define R(first, last, width) {first & 0xffff, width/2, last-first}
298 $(ranges | as_c_ranges | indent)
299 #undef R
300 };
301
302 /* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
303 static const $idx_t planes[18] = {
304 $(ranges | extract_plane_indices | fold -s -w 60 | indent)
305 };
306
307 /******* END OF STATIC DATA *******/
308
309 $u32 p, bot, top;
310
311 /* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */
312 if (ucs < 160)
313 return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0;
314
315 /* out of range for "planes" (and non-unicode), non-characters. */
316 /* (some also test surrogate halves, but not required by POSIX) */
317 if (ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
318 return -1;
319
320 p = ucs >> 16;
321 ucs &= 0xffff;
322
323 for (bot = planes[p], top = planes[p+1]; bot < top; ) {
324 $u32 mid = (bot + top) / 2;
325 if (ucs < ranges[mid].first)
326 top = mid;
327 else if (ucs > ranges[mid].first + ranges[mid].delta)
328 bot = mid + 1;
329 else
330 return 2 * ranges[mid].iswide;
331 }
332
333 return 1;
334} /* wcwidth - Unicode $ver */
335CFUNCTION
336
337
338# C drop-in replacement to the awk script which outputs the final ranges list
339: << \MERGE_RANGES_C
340#include <stdio.h>
341int main(void)
342{
343 static char w[0x110000 + 1]; /* = {0} which is width==1 */
344 unsigned a, b, c, i, first; /* assume 32bit (need 21) */
345
346 while (3 == scanf("%u%u%u", &a, &b, &c)) /* FIRST LAST WIDTH (W:0/1/2) */
347 if (b < 0x110000 && c < 3) while (a <= b) w[a++] = (char)c + 1;
348
349 /* HEXFIRST HEXLAST WIDTH (W: 0/2, not cross-plane, LAST-FIRST<0x8000) */
350 for (first = 0, i = 1; i <= 0x110000; ++i) {
351 if (w[i] == w[i-1] && i % 0x10000 && i - first < 0x8000)
352 continue;
353 if (w[i-1] & 1) /* 1 or 3 -> width is 0 or 2 for first..i-1 */
354 printf("%06x %06x %d\n", first, i-1, w[i-1]-1);
355 first = i;
356 }
357
358 return 0;
359}
360MERGE_RANGES_C