aboutsummaryrefslogtreecommitdiff
path: root/scripts/mkwcwidth
diff options
context:
space:
mode:
authorAvi Halachmi (:avih) <avihpit@yahoo.com>2025-12-26 00:02:45 +0200
committerRon Yorston <rmy@pobox.com>2025-12-29 13:57:05 +0000
commit6f847c296bf94401c0c147bf3c5aa1e8531b9dbf (patch)
tree0348f7468392ce5f413515796566451577d2ce7c /scripts/mkwcwidth
parentaa552f0883ab1320c2a65679ee8cd683629af7ec (diff)
downloadbusybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.tar.gz
busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.tar.bz2
busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.zip
win32: wcwidth_alt.c: new generator script, re-generate code
Previously, scripts/mkwcwidth generated a wcwidth implementation based on input from the python wcwidth: https://github.com/jquast/wcwidth . However, that project is not updated frequently, and the less dependencies - the better. This new script instead prints an implementation based directly on the original Unicode files - which it can also download. It generates the same code as before (same data structures, etc). It was then used to re-generate libbb/wcwidth_alt.c, as follows: ./scripts/mkwcwidth DL=15.1.0 FAST_FUNC > libbb/wcwidth_alt.c This is the same Unicode version as the previous file, just to make the codepoint differences visible easily. These differences are very small, and the script itself explains where and why it differs from https://github.com/jquast/wcwidth . Next commits will update the tables to the latest unicode version.
Diffstat (limited to 'scripts/mkwcwidth')
-rwxr-xr-xscripts/mkwcwidth339
1 files changed, 265 insertions, 74 deletions
diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth
index 792045a29..515128abb 100755
--- a/scripts/mkwcwidth
+++ b/scripts/mkwcwidth
@@ -1,11 +1,10 @@
1#!/bin/sh 1#!/bin/sh
2# 2#
3# Generate a C implementation of wcwidth, with latest unicode data 3# Generate a wcwidth C implementation from Unicode data (tested v7 - v17)
4# from a local clone of https://github.com/jquast/wcwidth
5# 4#
6# The MIT License (MIT) 5# The MIT License (MIT)
7# 6#
8# Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com> 7# Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
9# 8#
10# Permission is hereby granted, free of charge, to any person obtaining a copy 9# Permission is hereby granted, free of charge, to any person obtaining a copy
11# of this software and associated documentation files (the "Software"), to deal 10# of this software and associated documentation files (the "Software"), to deal
@@ -25,117 +24,284 @@
25# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26# SOFTWARE. 25# SOFTWARE.
27 26
27
28# Latest Unicode data source files:
29# https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
30# https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
31#
32# License and term of use in either URL:
33# http://www.unicode.org/copyright.html
34# http://www.unicode.org/terms_of_use.html
35
36
28export LC_ALL=C 37export LC_ALL=C
29self=${0##*/} 38self=${0##*/}
39awk=${AWK:-awk} sed=${SED:-sed}
30 40
31# c-types (bigger types work but waste memory. uintN_t need <stdint.h>) 41URL_base=https://www.unicode.org/Public/UCD/latest/ucd
32u32=uint32_t # "unsigned" is also typically 32 bit 42URL_verbase=https://www.unicode.org/Public/%s/ucd
33u16=uint16_t # "unsigned short" is also typically 16 bits 43ud_file=UnicodeData.txt
34FUNC_ATTR=FAST_FUNC # delete this line if not generating a busybox function 44eaw_file=EastAsianWidth.txt
35 45
46# int by default (int is 32 bit and short is 16 on unix/linux/osx/windows, and
47# doesn't require stdint.h - https://en.cppreference.com/w/cpp/language/types)
48u32=${U32:-unsigned} # or uint32_t
49u16=${U16:-unsigned short} # or uint16_t
36 50
37err() { >&2 printf %s\\n "$self: $*"; exit 1; } 51err() { >&2 printf %s\\n "$self: $*"; exit 1; }
38 52
39case ${1-} in -h | --help) 53case ${1-} in -h | --help)
40 echo "Usage: $self [path/to/python-wcwidth] (default path is '.')" 54 echo "Usage: $self [DL=VERSION] [FATTR]"
41 echo "Prints a wcwidth C implementation, with latest Unicode data" 55 echo "Print a wcwidth C implementation to stdout."
42 echo "imported from a local https://github.com/jquast/wcwidth repo." 56 echo
43 echo "Assumptions about table_zero.py and table_wide.py at the repo:" 57 echo "Uses the files ./$ud_file and ./$eaw_file ."
44 echo "- Each range is in one Unicode plane (a>>16 == b>>16) (enforced)." 58 echo "If the files are missing, the latest will be downloaded."
45 echo "- Commit 04d6d90c (2023-10-30) or later, where table_zero.py" 59 echo
46 echo " includes zero-width Cf chars (else need to add manual tests)." 60 echo "If DL=XXX is provided, force-download + overwrite, where XXX is:"
61 echo "- 'latest' -> $URL_base/..."
62 echo "- 'draft', '16.0.0' etc -> $(printf "$URL_verbase" XXX)/..."
63 echo
64 echo "If given, FATTR will be inserted as 'int FATTR wcwidth(...) {...}'."
65 echo "Optional env vars:"
66 echo ' $FN Function name to generate. Default: wcwidth.'
67 echo ' $U32, $U16 unsigned codepoint, u16 C-types. Default: int/short.'
68 echo ' $AWK, $SED used programs (word-split, e.g. AWK="busybox awk").'
69 echo
70 echo 'Requires: sh, curl/wget, sed, awk, and few more POSIX utilities.'
71 exit
47esac 72esac
73[ "${1-}" = -- ] && shift
48 74
49[ "${1-}" != -- ] || shift 75DL=
76case ${1-} in DL=*) DL=${1#DL=}; shift; esac
77[ "${DL-}" ] && [ "$DL" != latest ] && URL_base=$(printf "$URL_verbase" "$DL")
50 78
51pwc_root=${1:-.} 79FUNC_ATTR=${1:+$1 }
52pwc_git() { git -C "$pwc_root" "$@"; }
53 80
54zerowidth_py=$pwc_root/wcwidth/table_zero.py 81url2file() { # wget errors on 404-not-found etc, curl requires -f for that
55widewidth_py=$pwc_root/wcwidth/table_wide.py 82 >&2 echo "[$1 -> $2]"
83 rm -f -- "$2.tmp"
84 { { [ "$(which wget)" ] && wget -O "$2.tmp" "$1"; } ||
85 { [ "$(which curl)" ] && curl -f -o "$2.tmp" "$1"; }
86 } && mv -- "$2.tmp" "$2"
87}
56 88
57[ -r "$zerowidth_py" ] && [ -r "$widewidth_py" ] \ 89[ -r "$ud_file" ] && [ -r "$eaw_file" ] && [ -z "$DL" ] || {
58 || err "missing $zerowidth_py or $widewidth_py. abort." 90 url2file "$URL_base/$ud_file" "$ud_file" &&
91 url2file "$URL_base/$eaw_file" "$eaw_file" ||
92 err "can't download $ud_file and/or $eaw_file. abort."
93}
59 94
60# latest unicode version from table_wide.py (e.g. from " '10.0.0': (")
61ver=$(grep "^\s*'[0-9]" < "$widewidth_py" | tail -n1 | sed "s/.*'\(.*\)'.*/\1/")
62 95
63# stdin -> stdout: extract the data of the last table (latest spec) from 96# extractors of zero-width and wide-width ranges from the Unicode data files.
64# wcwidth/table_{wide,zero}.py (from https://github.com/jquast/wcwidth) 97#
65last_table() { 98# At https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - the mother of most current
66 awk "/^\s*'[0-9]/ { i=0 } # new table -> reset 99# wcwidth implementations, by Markus Kuhn (Unicode 5.0) - the width is:
67 /^\s*\(0x/ { arr[++i] = \$0 } # range (first, last) 100# - 0 for codepoint 0.
68 END { for (j=1; j <= i; ++j) print arr[j] }" 101# - -1 for control ranges C0/C1, and DEL (1-31, 128-159, 127)
102# - 0 if U+200B, cat is Me|Mn|Cf but not U+00AD, Hangul Jamo U+1160..U+11FF.
103# - else 2 if East_Asian_Width property is F|W (fullwidth, wide).
104# - else 1.
105#
106# We add as 0-width:
107# - Category Mc (combining, diacritics[-like])
108# - Hangul Jamo Extended-B U+D7B0..U+D7FF (combining, wasn't in Unicode 5.0)
109# - Emoji modifiers U+1F3FB..U+1F3FF (wasn't in Unicode 5.0)
110#
111# Hangul Jamo, Emoji mods are identified at the data automatically by name.
112# U+200B is category Cf since at least 5.0.0 - no need for manual override.
113#
114# Python wcwidth https://github.com/jquast/wcwidth is like us, plus 0-width:
115# - Cat Zl,Zp (2 codepoints: line/para. sep.), but terminals disagree on them.
116# - Few unassigned codepoints at Hangul Jamo Extended-B - it adds the range
117# manually (includes unassigned), we get (assigned) codepoints at the data.
118# - (U+00AD was 0, fixed to 1 after https://github.com/jquast/wcwidth/issues/8)
119#
120#
121# UnicodeData.txt is lines of:
122# CODEPOINT;NAME;CATEGORY;(...more) where CODEPOINT is 4-6 hex digits
123# or pairs of:
124# CODEPOINT;<XNAME, First>;CATEGORY;(...more) where XNAME is group-ish name
125# CODEPOINT;<XNAME, Last>;CATEGORY;(...more)
126#
127# EastAsianWidth.txt is lines of:
128# #... (comment)
129# or
130# CODEPOINT[..LAST];EAW # <stuff> (4-6 hex digits, maybe spaces around ";")
131# where EAW is East_Asian_Width property ("F", "W", "A", "N", etc)
132
133# replace First/Last pairs in UnicodeData with one line of FIRST..LAST range.
134# assume that First/Last are at consecutive lines, and XNAME/CAT don't change.
135# (no-op for us at Unicode v16 - doesn't add new wide/zero width codepoints)
136ud_file_ranges() {
137 $awk -F\; '$2 ~ /, First>$/ { printf $1 ".."; next }; 1' < "$ud_file"
69} 138}
70 139
71# stdin -> stdout, $1 is the range's (wc)width (0 or 2), e.g. 140# output unsorted/overlapping lines of hex "CODEPOINT[..LAST] WIDTH" (0/1/2)
72# from: (0x0123a, 0x0123c,), # comment 141raw_ranges() {
73# to : R(0x00123a, 0x00123c, 2), /* comment */ 142 # wide ranges according to the EAW property
74# ranges bigger than half-plane (32769+ codepoints) are split to two. 143 $sed -e '/^#/d' -e 's/;/ /' < "$eaw_file" | # comment lines, ';'
75py_data_to_c() { 144 $awk '$2 ~ /^[FW]$/ { print $1, 2 }'
76 sed -e 's/[(),]/ /g' -e 's|#\(.*\)|/*\1 */|' | while read a b c; do 145
77 # to support cross-plane ranges, we'd need to split them here, 146 # wide by name (older Unicode had EAW==N for some FULLWIDTH names)
78 # but unlikely required, as all planes end in non-characters. 147 ud_file_ranges | $awk -F\; '$2 ~ /FULLWIDTH/ { print $1, 2 }'
79 [ $(($a>>16)) = $(($b>>16)) ] || err "not same plane -- $a $b" 148
80 149 # zero-width ranges (override wide-width)
81 a=$(($a)) b=$(($b)) # some shells want decimal vars in $(()) 150 ud_file_ranges | $awk -F\; '
82 if [ "$((b-a))" -ge 32768 ]; then # split to 15 bit ranges 151 $3 ~ /^(Me|Mn|Mc|Cf)$/ ||
83 printf "R(0x%06x, 0x%06x, $1), %s\n" $a $((a+32767)) "$c" 152 $2 ~ /EMOJI MODIFIER|HANGUL J[OU]NGSEONG/ { print $1, 0 }'
84 a=$((a+32768)) c="/* (continued...) */" 153
85 fi 154 # override soft-hyphen as width 1
86 printf "R(0x%06x, 0x%06x, $1), %s\n" $a $b "$c" 155 echo 00AD 1
156}
157
158# lines of hex "CODEPOINT[..LAST] WIDTH" -> decimal "FIRST LAST WIDTH"
159as_decimal_ranges() {
160 while read range width; do
161 echo $((0x${range%%.*})) $((0x${range##*.})) $width
87 done 162 done
88} 163}
89 164
90data=$(last_table < "$zerowidth_py" | py_data_to_c 0 && 165# to generate the final ranges list, we need to sort the raw ranges list,
91 last_table < "$widewidth_py" | py_data_to_c 2) || err abort 166# ensure zero-width overrides wide-width, join adjacent/overlap ranges,
92data=$(printf %s\\n "$data" | sort) # lexicographic here is also numeric 167# and split big ranges - our C structure uses 15 bits to store LAST-FIRST of
168# each range, and ranges are not allowed to cross plane boundary.
169#
170# we use simple (and slow) approach: codepoint-bucket-store all the individual
171# widths at the input ranges as they come, then scan the whole Unicode range
172# (~ 1M buckets) and output continuous ranges of width 0/2. Therefore, later
173# inputs of codepoint X override earlier, so width 2 should arrive before W 0.
174#
175# there's ~3K input ranges of ~200K codepoints, and the output is 450+ ranges.
176# depending on sys/awk performance, it's 200ms - 2s, up to ~7s on low-end ARM.
177# MERGE_RANGES_C (bottom of this file) is drop-in replacement, runs in few ms.
178
179# sorted lines of "FIRST LAST WIDTH" (FIRST/LAST: hex digits, WIDTH: 0 or 2)
180ranges=$(raw_ranges | as_decimal_ranges | $awk '
181
182 # The final ~ 1M codepoints scan can take few secs, up to 30+s on low end.
183 # so use a simple optimization: divide the unicode range to segments which
184 # do not cross plane boundary (some 2^X, tuned for speed), mark segments
185 # where an input range can change the width at the final scan (its egdes),
186 # then skip unmarked segments at the final output scan, as these segs are
187 # guaranteed to not have any width change. this is effective (x5 speedup)
188 # because most segments are empty or dense. If the input was spread evenly
189 # then it would be more effective to store, sort and iterate exactly these
190 # edges. Currently though, both are same speed, and mark/skip is less code.
93 191
94# sorted hex ranges and their (wc)width: R(first, last, {0|2}),[ /* ... */] 192 BEGIN { SEG = 128 } # 128 or 256 seem fastest
95data() { printf %s\\n "$data"; } 193 function mark(i) { segs[int(i/SEG)] = 1 }
194 function skip(i) { return !(int(i/SEG) in segs) }
96 195
97repeat() { R=$2; while [ "$R" -gt 0 ]; do printf %s "$1"; R=$((R-1)); done; } 196 # input range $1..$2: width may differ at $1 vs $1-1, and at $2+1 vs $2
197 { mark($1); mark($2+1) }
98 198
99# data -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data 199 # store/overwrite width-1 (1 -> 0) for every codepoint in this range
100mkplanes() { 200 { v=$3-1; for (i=$1; i<=$2; w[i++]=v); }
101 i=0 lastp=-1 201
102 while read a b c; do 202 function min(a,b) { return a<b ? a : b }
103 p=$((${b%?} >> 16)) # plane (last >> 16) 203 END {
104 repeat "$i, " $((p-lastp)) 204 # scan the full unicode range, print continuous ranges of width 0/2
105 i=$((i+1)) lastp=$p 205 # the SEG skip is SEG-aligned. to disable: use unconditional i++
206 for (i = 0; i <= 1114112; skip(i) ? i+=SEG : i++) { # 0x110000
207 if (w[i] == wprev && i%65536) # no change of width or plane
208 continue
209 if (wprev) # range i0..i-1 is width 0 or 2 - print
210 for (; i0 < i; i0 += 32768) # split (once) if too big
211 printf "%06x %06x %d\n", i0, min(i0+32767, i-1), 1+wprev
212 i0 = i
213 wprev = w[i]
214 }
215 }
216')
217
218ranges() { printf %s\\n "$ranges"; }
219
220verify_ranges() {(
221 errv() { err "verify: bad range: $* -- (0x$ha, 0x$hb, $w)"; }
222 ishex() { case $1 in '' | *[!0-9A-Fa-f]* ) false; esac; } # LC_ALL=C
223
224 pa=-2 pb=-2 pw=x # previous range values
225 while read ha hb w; do
226 [ "$w" = 0 ] || [ "$w" = 2 ] || errv WIDTH is not 0 or 2
227 ishex "$ha" && ishex "$hb" || errv not hex values
228
229 a=$((0x$ha)) b=$((0x$hb)) # decimal
230 [ $a -le $b ] || errv "FIRST > LAST"
231 [ $b -le 1114111 ] || errv out of bounds
232 [ $((a>>16)) = $((b>>16)) ] || errv not same plane
233 [ $((b-a)) -lt 32768 ] || errv more than 0x8000 values
234 [ $a -gt $pb ] || errv overlap or not sorted
235
236 # detect adjacent ranges which combined are <= 0x8000 values.
237 # not detecting 3 adjacent 0x5000 ranges, but shouldn't happen
238 [ $w != $pw ] || [ $a != $((pb+1)) ] || [ $((b-pa)) -ge 32768 ] ||
239 errv small same-width adjacent ranges not combined
240
241 pa=$a pb=$b pw=$w
242 done
243)}
244
245
246as_c_ranges() {
247 while read a b w; do
248 echo "R(0x$a, 0x$b, $w),"
106 done 249 done
107 repeat "$i, " $((17-lastp))
108} 250}
109 251
110indent() { sed -e 's/^/\t\t/' -e 's/\s*$//'; } # also trim trailing spaces 252# ranges -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
253# (inclusive, exclussive. arr of 18 items - last plane (16) is a[16]..a[17])
254extract_plane_indices() {
255 # for each plane (0..16) print the 1st index at ranges where it appears
256 # or, if missing, index of the next found plane (ranges are same-plane)
257 i=0 pdone=-1 # index at ranges, plane whose index was last printed
258 while read a dummy; do # a: ranges[i].first (hex, plane(a) == a>>16)
259 while [ $pdone -lt $((0x$a >> 16)) ]; do
260 printf "$i, "
261 pdone=$((pdone+1))
262 done # pdone == plane(a)
263 i=$((i+1))
264 done
265 printf $i # one past last range. the rest are implicit-init as 0
266}
267
268indent() { $sed -e '2,$s/^/\t\t/' -e 's/\s*$//'; } # also trim trailing spaces
269
270# only EastAsianWidth.txt has version/date
271ver=$(head -n 1 < "$eaw_file" | $sed -e 's/.*-//' -e 's/.txt//')
272udate=$(head -n 2 < "$eaw_file" | tail -n 1 | $sed -e 's/.*Date: //')
273
274ranges | verify_ranges || exit
275
276# at the C code, p/bot/top/mid can be $idx_t, but faster as 32 bit types
277[ "$(ranges | wc -l)" -lt 65536 ] && idx_t=$u16 || idx_t=$u32
111 278
112cat << CFUNCTION 279cat << CFUNCTION
113/* wcwidth - Unicode $ver, generated by $0. 280/* wcwidth - Unicode $ver
114 * Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com> 281 * Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
115 * License: MIT 282 * License: MIT
116 * 283 *
117 * Data imported on $(date -u -I) from https://github.com/jquast/wcwidth 284 * Generated by $self on $(date -u -I) using the Unicode files
118 * commit $(pwc_git describe --tags) ($(pwc_git show --no-patch --format=%ci)) 285 * $ud_file and $eaw_file ($udate)
119 */ 286 */
120int ${FUNC_ATTR-} wcwidth($u32 ucs) 287int ${FUNC_ATTR}${FN:-wcwidth}($u32 ucs)
121{ 288{
122 /* sorted ranges, "first" is clipped to 16 bit, and its high bits 289 /* sorted ranges, "first" is clipped to 16 bit, and its high bits
123 * (plane) are deduced from the "planes" array below. 290 * (plane) are deduced from the "planes" array below.
124 * (imported from ${zerowidth_py##*/} and ${widewidth_py##*/})
125 */ 291 */
126 static const struct range { 292 static const struct range { /* bitfield order empirically fast */
127 uint16_t first; 293 $u16 first: 16;
128 uint16_t iswide: 1; /* bitfield order empirically faster */ 294 $u16 iswide: 1;
129 uint16_t difflast: 15; 295 $u16 delta: 15;
130 } ranges[] = { 296 } ranges[] = {
131 #define R(first, last, width) {first & 0xffff, width/2, last-first} 297 #define R(first, last, width) {first & 0xffff, width/2, last-first}
132$(data | indent) 298 $(ranges | as_c_ranges | indent)
133 #undef R 299 #undef R
134 }; 300 };
135 301
136 /* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */ 302 /* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
137 static const $u16 planes[/* 18 */] = { 303 static const $idx_t planes[18] = {
138$(data | mkplanes | fold -s -w 60 | indent) 304 $(ranges | extract_plane_indices | fold -s -w 60 | indent)
139 }; 305 };
140 306
141 /******* END OF STATIC DATA *******/ 307 /******* END OF STATIC DATA *******/
@@ -158,7 +324,7 @@ $(data | mkplanes | fold -s -w 60 | indent)
158 $u32 mid = (bot + top) / 2; 324 $u32 mid = (bot + top) / 2;
159 if (ucs < ranges[mid].first) 325 if (ucs < ranges[mid].first)
160 top = mid; 326 top = mid;
161 else if (ucs > ranges[mid].first + ranges[mid].difflast) 327 else if (ucs > ranges[mid].first + ranges[mid].delta)
162 bot = mid + 1; 328 bot = mid + 1;
163 else 329 else
164 return 2 * ranges[mid].iswide; 330 return 2 * ranges[mid].iswide;
@@ -167,3 +333,28 @@ $(data | mkplanes | fold -s -w 60 | indent)
167 return 1; 333 return 1;
168} /* wcwidth - Unicode $ver */ 334} /* wcwidth - Unicode $ver */
169CFUNCTION 335CFUNCTION
336
337
338# C drop-in replacement to the awk script which outputs the final ranges list
339: << \MERGE_RANGES_C
340#include <stdio.h>
341int main(void)
342{
343 static char w[0x110000 + 1]; /* = {0} which is width==1 */
344 unsigned a, b, c, i, first; /* assume 32bit (need 21) */
345
346 while (3 == scanf("%u%u%u", &a, &b, &c)) /* FIRST LAST WIDTH (W:0/1/2) */
347 if (b < 0x110000 && c < 3) while (a <= b) w[a++] = (char)c + 1;
348
349 /* HEXFIRST HEXLAST WIDTH (W: 0/2, not cross-plane, LAST-FIRST<0x8000) */
350 for (first = 0, i = 1; i <= 0x110000; ++i) {
351 if (w[i] == w[i-1] && i % 0x10000 && i - first < 0x8000)
352 continue;
353 if (w[i-1] & 1) /* 1 or 3 -> width is 0 or 2 for first..i-1 */
354 printf("%06x %06x %d\n", first, i-1, w[i-1]-1);
355 first = i;
356 }
357
358 return 0;
359}
360MERGE_RANGES_C