win32: wcwidth_alt.c: new generator script, re-generate code

Previously, scripts/mkwcwidth generated a wcwidth implementation based on input from the python wcwidth: https://github.com/jquast/wcwidth . However, that project is not updated frequently, and the less dependencies - the better. This new script instead prints an implementation based directly on the original Unicode files - which it can also download. It generates the same code as before (same data structures, etc). It was then used to re-generate libbb/wcwidth_alt.c, as follows: ./scripts/mkwcwidth DL=15.1.0 FAST_FUNC > libbb/wcwidth_alt.c This is the same Unicode version as the previous file, just to make the codepoint differences visible easily. These differences are very small, and the script itself explains where and why it differs from https://github.com/jquast/wcwidth . Next commits will update the tables to the latest unicode version.
author: Avi Halachmi (:avih) <avihpit@yahoo.com> 2025-12-26 00:02:45 +0200
committer: Ron Yorston <rmy@pobox.com> 2025-12-29 13:57:05 +0000
commit: 6f847c296bf94401c0c147bf3c5aa1e8531b9dbf (patch)
tree: 0348f7468392ce5f413515796566451577d2ce7c /scripts/mkwcwidth
parent: aa552f0883ab1320c2a65679ee8cd683629af7ec (diff)
download: busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.tar.gz
busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.tar.bz2
busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.zip
1 files changed, 265 insertions, 74 deletions
diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth
index 792045a29..515128abb 100755
--- a/scripts/mkwcwidth
+++ b/scripts/mkwcwidth
@@ -1,11 +1,10 @@
 #!/bin/sh
 #
-# Generate a C implementation of wcwidth, with latest unicode data
+# Generate a wcwidth C implementation from Unicode data (tested v7 - v17)
-# from a local clone of https://github.com/jquast/wcwidth
 #
 # The MIT License (MIT)
 #
-# Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
+# Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -25,117 +24,284 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
+# Latest Unicode data source files:
+#   https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
+#   https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
+#
+# License and term of use in either URL:
+#   http://www.unicode.org/copyright.html
+#   http://www.unicode.org/terms_of_use.html
 export LC_ALL=C
 self=${0##*/}
+awk=${AWK:-awk} sed=${SED:-sed}
-# c-types (bigger types work but waste memory. uintN_t need <stdint.h>)
+URL_base=https://www.unicode.org/Public/UCD/latest/ucd
-u32=uint32_t  # "unsigned" is also typically 32 bit
+URL_verbase=https://www.unicode.org/Public/%s/ucd
-u16=uint16_t  # "unsigned short" is also typically 16 bits
+ud_file=UnicodeData.txt
-FUNC_ATTR=FAST_FUNC # delete this line if not generating a busybox function
+eaw_file=EastAsianWidth.txt
+# int by default (int is 32 bit and short is 16 on unix/linux/osx/windows, and
+# doesn't require stdint.h - https://en.cppreference.com/w/cpp/language/types)
+u32=${U32:-unsigned}        # or uint32_t
+u16=${U16:-unsigned short}  # or uint16_t
 err() { >&2 printf %s\\n "$self: $*"; exit 1; }
 case ${1-} in -h | --help)
-        echo "Usage: $self [path/to/python-wcwidth]   (default path is '.')"
+        echo "Usage: $self [DL=VERSION] [FATTR]"
-        echo "Prints a wcwidth C implementation, with latest Unicode data"
+        echo "Print a wcwidth C implementation to stdout."
-        echo "imported from a local https://github.com/jquast/wcwidth repo."
+        echo
-        echo "Assumptions about table_zero.py and table_wide.py at the repo:"
+        echo "Uses the files ./$ud_file and ./$eaw_file ."
-        echo "- Each range is in one Unicode plane (a>>16 == b>>16) (enforced)."
+        echo "If the files are missing, the latest will be downloaded."
-        echo "- Commit 04d6d90c (2023-10-30) or later, where table_zero.py"
+        echo
-        echo "  includes zero-width Cf chars (else need to add manual tests)."
+        echo "If DL=XXX is provided, force-download + overwrite, where XXX is:"
+        echo "- 'latest' -> $URL_base/..."
+        echo "- 'draft', '16.0.0' etc -> $(printf "$URL_verbase" XXX)/..."
+        echo
+        echo "If given, FATTR will be inserted as 'int FATTR wcwidth(...) {...}'."
+        echo "Optional env vars:"
+        echo '  $FN           Function name to generate. Default: wcwidth.'
+        echo '  $U32, $U16    unsigned codepoint, u16 C-types. Default: int/short.'
+        echo '  $AWK, $SED    used programs (word-split, e.g. AWK="busybox awk").'
+        echo
+        echo 'Requires: sh, curl/wget, sed, awk, and few more POSIX utilities.'
+        exit
 esac
+[ "${1-}" = -- ] && shift
-[ "${1-}" != -- ] || shift
+DL=
+case ${1-} in DL=*) DL=${1#DL=}; shift; esac
+[ "${DL-}" ] && [ "$DL" != latest ] && URL_base=$(printf "$URL_verbase" "$DL")
-pwc_root=${1:-.}
+FUNC_ATTR=${1:+$1 }
-pwc_git() { git -C "$pwc_root" "$@"; }
-zerowidth_py=$pwc_root/wcwidth/table_zero.py
+url2file() {  # wget errors on 404-not-found etc, curl requires -f for that
-widewidth_py=$pwc_root/wcwidth/table_wide.py
+        >&2 echo "[$1  ->  $2]"
+        rm -f -- "$2.tmp"
+        { { [ "$(which wget)" ] && wget    -O "$2.tmp" "$1"; } ||
+          { [ "$(which curl)" ] && curl -f -o "$2.tmp" "$1"; }
+        } && mv -- "$2.tmp" "$2"
+}
-[ -r "$zerowidth_py" ] && [ -r "$widewidth_py" ] \
+[ -r "$ud_file" ] && [ -r "$eaw_file" ] && [ -z "$DL" ] || {
-        || err "missing $zerowidth_py or $widewidth_py. abort."
+        url2file "$URL_base/$ud_file"  "$ud_file"  &&
+        url2file "$URL_base/$eaw_file" "$eaw_file" ||
+                err "can't download $ud_file and/or $eaw_file. abort."
+}
-# latest unicode version from table_wide.py (e.g. from "    '10.0.0': (")
-ver=$(grep "^\s*'[0-9]" < "$widewidth_py" | tail -n1 | sed "s/.*'\(.*\)'.*/\1/")
-# stdin -> stdout: extract the data of the last table (latest spec) from
+# extractors of zero-width and wide-width ranges from the Unicode data files.
-# wcwidth/table_{wide,zero}.py (from https://github.com/jquast/wcwidth)
+#
-last_table() {
+# At https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - the mother of most current
-        awk "/^\s*'[0-9]/ { i=0 }             # new table -> reset
+# wcwidth implementations, by Markus Kuhn (Unicode 5.0) - the width is:
-             /^\s*\(0x/   { arr[++i] = \$0 }  # range (first, last)
+# - 0 for codepoint 0.
-             END          { for (j=1; j <= i; ++j) print arr[j] }"
+# - -1 for control ranges C0/C1, and DEL (1-31, 128-159, 127)
+# - 0 if U+200B, cat is Me|Mn|Cf but not U+00AD, Hangul Jamo U+1160..U+11FF.
+# - else 2 if East_Asian_Width property is F|W (fullwidth, wide).
+# - else 1.
+#
+# We add as 0-width:
+# - Category Mc (combining, diacritics[-like])
+# - Hangul Jamo Extended-B U+D7B0..U+D7FF (combining, wasn't in Unicode 5.0)
+# - Emoji modifiers U+1F3FB..U+1F3FF (wasn't in Unicode 5.0)
+#
+# Hangul Jamo, Emoji mods are identified at the data automatically by name.
+# U+200B is category Cf since at least 5.0.0 - no need for manual override.
+#
+# Python wcwidth https://github.com/jquast/wcwidth is like us, plus 0-width:
+# - Cat Zl,Zp (2 codepoints: line/para. sep.), but terminals disagree on them.
+# - Few unassigned codepoints at Hangul Jamo Extended-B - it adds the range
+#   manually (includes unassigned), we get (assigned) codepoints at the data.
+# - (U+00AD was 0, fixed to 1 after https://github.com/jquast/wcwidth/issues/8)
+#
+#
+# UnicodeData.txt is lines of:
+#   CODEPOINT;NAME;CATEGORY;(...more) where CODEPOINT is 4-6 hex digits
+# or pairs of:
+#   CODEPOINT;<XNAME, First>;CATEGORY;(...more)  where XNAME is group-ish name
+#   CODEPOINT;<XNAME, Last>;CATEGORY;(...more)
+#
+# EastAsianWidth.txt is lines of:
+#   #... (comment)
+# or
+#   CODEPOINT[..LAST];EAW # <stuff>  (4-6 hex digits, maybe spaces around ";")
+# where EAW is East_Asian_Width property ("F", "W", "A", "N", etc)
+# replace First/Last pairs in UnicodeData with one line of FIRST..LAST range.
+# assume that First/Last are at consecutive lines, and XNAME/CAT don't change.
+# (no-op for us at Unicode v16 - doesn't add new wide/zero width codepoints)
+ud_file_ranges() {
+        $awk -F\; '$2 ~ /, First>$/ { printf $1 ".."; next }; 1' < "$ud_file"
 }
-# stdin -> stdout, $1 is the range's (wc)width (0 or 2), e.g.
+# output unsorted/overlapping lines of hex "CODEPOINT[..LAST] WIDTH" (0/1/2)
-#   from: (0x0123a, 0x0123c,), # comment
+raw_ranges() {
-#   to  : R(0x00123a, 0x00123c, 2),  /* comment */
+        # wide ranges according to the EAW property
-# ranges bigger than half-plane (32769+ codepoints) are split to two.
+        $sed -e '/^#/d' -e 's/;/ /' < "$eaw_file" |  # comment lines, ';'
-py_data_to_c() {
+        $awk '$2 ~ /^[FW]$/ { print $1, 2 }'
-        sed -e 's/[(),]/ /g' -e 's|#\(.*\)|/*\1 */|' | while read a b c; do
-                # to support cross-plane ranges, we'd need to split them here,
+        # wide by name (older Unicode had EAW==N for some FULLWIDTH names)
-                # but unlikely required, as all planes end in non-characters.
+        ud_file_ranges | $awk -F\; '$2 ~ /FULLWIDTH/ { print $1, 2 }'
-                [ $(($a>>16)) = $(($b>>16)) ] || err "not same plane -- $a $b"
+        # zero-width ranges (override wide-width)
-                a=$(($a)) b=$(($b))  # some shells want decimal vars in $(())
+        ud_file_ranges | $awk -F\; '
-                if [ "$((b-a))" -ge 32768 ]; then  # split to 15 bit ranges
+                $3 ~ /^(Me|Mn|Mc|Cf)$/ ||
-                        printf "R(0x%06x, 0x%06x, $1),  %s\n" $a $((a+32767)) "$c"
+                $2 ~ /EMOJI MODIFIER|HANGUL J[OU]NGSEONG/ { print $1, 0 }'
-                        a=$((a+32768)) c="/* (continued...) */"
-                fi
+        # override soft-hyphen as width 1
-                printf "R(0x%06x, 0x%06x, $1),  %s\n" $a $b "$c"
+        echo 00AD 1
+}
+# lines of hex "CODEPOINT[..LAST] WIDTH"  ->  decimal "FIRST LAST WIDTH"
+as_decimal_ranges() {
+        while read range width; do
+                echo $((0x${range%%.*})) $((0x${range##*.})) $width
        done
 }
-data=$(last_table < "$zerowidth_py" | py_data_to_c 0 &&
+# to generate the final ranges list, we need to sort the raw ranges list,
-       last_table < "$widewidth_py" | py_data_to_c 2) || err abort
+# ensure zero-width overrides wide-width, join adjacent/overlap ranges,
-data=$(printf %s\\n "$data" | sort)  # lexicographic here is also numeric
+# and split big ranges - our C structure uses 15 bits to store LAST-FIRST of
+# each range, and ranges are not allowed to cross plane boundary.
+#
+# we use simple (and slow) approach: codepoint-bucket-store all the individual
+# widths at the input ranges as they come, then scan the whole Unicode range
+# (~ 1M buckets) and output continuous ranges of width 0/2. Therefore, later
+# inputs of codepoint X override earlier, so width 2 should arrive before W 0.
+#
+# there's ~3K input ranges of ~200K codepoints, and the output is 450+ ranges.
+# depending on sys/awk performance, it's 200ms - 2s, up to ~7s on low-end ARM.
+# MERGE_RANGES_C (bottom of this file) is drop-in replacement, runs in few ms.
+# sorted lines of "FIRST LAST WIDTH" (FIRST/LAST: hex digits, WIDTH: 0 or 2)
+ranges=$(raw_ranges | as_decimal_ranges | $awk '
+    # The final ~ 1M codepoints scan can take few secs, up to 30+s on low end.
+    # so use a simple optimization: divide the unicode range to segments which
+    # do not cross plane boundary (some 2^X, tuned for speed), mark segments
+    # where an input range can change the width at the final scan (its egdes),
+    # then skip unmarked segments at the final output scan, as these segs are
+    # guaranteed to not have any width change. this is effective (x5 speedup)
+    # because most segments are empty or dense. If the input was spread evenly
+    # then it would be more effective to store, sort and iterate exactly these
+    # edges. Currently though, both are same speed, and mark/skip is less code.
-# sorted hex ranges and their (wc)width: R(first, last, {0|2}),[  /* ... */]
+    BEGIN { SEG = 128 }  # 128 or 256 seem fastest
-data() { printf %s\\n "$data"; }
+    function mark(i) { segs[int(i/SEG)] = 1 }
+    function skip(i) { return !(int(i/SEG) in segs) }
-repeat() { R=$2; while [ "$R" -gt 0 ]; do printf %s "$1"; R=$((R-1)); done; }
+    # input range $1..$2: width may differ at $1 vs $1-1, and at $2+1 vs $2
+    { mark($1); mark($2+1) }
-# data -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
+    # store/overwrite width-1 (1 -> 0) for every codepoint in this range
-mkplanes() {
+    { v=$3-1; for (i=$1; i<=$2; w[i++]=v); }
-        i=0 lastp=-1
-        while read a b c; do
+    function min(a,b) { return a<b ? a : b }
-                p=$((${b%?} >> 16))  # plane (last >> 16)
+    END {
-                repeat "$i, " $((p-lastp))
+        # scan the full unicode range, print continuous ranges of width 0/2
-                i=$((i+1)) lastp=$p
+        # the SEG skip is SEG-aligned. to disable: use unconditional i++
+        for (i = 0; i <= 1114112; skip(i) ? i+=SEG : i++) {  # 0x110000
+            if (w[i] == wprev && i%65536)  # no change of width or plane
+                continue
+            if (wprev)  # range i0..i-1 is width 0 or 2 - print
+                for (; i0 < i; i0 += 32768)  # split (once) if too big
+                    printf "%06x %06x %d\n", i0, min(i0+32767, i-1), 1+wprev
+            i0 = i
+            wprev = w[i]
+        }
+    }
+')
+ranges() { printf %s\\n "$ranges"; }
+verify_ranges() {(
+        errv()  { err "verify: bad range: $* -- (0x$ha, 0x$hb, $w)"; }
+        ishex() { case $1 in '' | *[!0-9A-Fa-f]* ) false; esac; }  # LC_ALL=C
+        pa=-2 pb=-2 pw=x  # previous range values
+        while read ha hb w; do
+                [ "$w" = 0 ] || [ "$w" = 2 ]  || errv WIDTH is not 0 or 2
+                ishex "$ha" && ishex "$hb"    || errv not hex values
+                a=$((0x$ha)) b=$((0x$hb))     #  decimal
+                [ $a -le $b ]                 || errv "FIRST > LAST"
+                [ $b -le 1114111 ]            || errv out of bounds
+                [ $((a>>16)) = $((b>>16)) ]   || errv not same plane
+                [ $((b-a)) -lt 32768 ]        || errv more than 0x8000 values
+                [ $a -gt $pb ]                || errv overlap or not sorted
+                # detect adjacent ranges which combined are <= 0x8000 values.
+                # not detecting 3 adjacent 0x5000 ranges, but shouldn't happen
+                [ $w != $pw ] || [ $a != $((pb+1)) ] || [ $((b-pa)) -ge 32768 ] ||
+                     errv small same-width adjacent ranges not combined
+                pa=$a pb=$b pw=$w
+        done
+)}
+as_c_ranges() {
+        while read a b w; do
+                echo "R(0x$a, 0x$b, $w),"
        done
-        repeat "$i, " $((17-lastp))
 }
-indent() { sed -e 's/^/\t\t/' -e 's/\s*$//'; }  # also trim trailing spaces
+# ranges -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
+# (inclusive, exclussive. arr of 18 items - last plane (16) is a[16]..a[17])
+extract_plane_indices() {
+        # for each plane (0..16) print the 1st index at ranges where it appears
+        # or, if missing, index of the next found plane (ranges are same-plane)
+        i=0 pdone=-1  # index at ranges, plane whose index was last printed
+        while read a dummy; do  # a: ranges[i].first (hex, plane(a) == a>>16)
+                while [ $pdone -lt $((0x$a >> 16)) ]; do
+                        printf "$i, "
+                        pdone=$((pdone+1))
+                done  # pdone == plane(a)
+                i=$((i+1))
+        done
+        printf $i  # one past last range. the rest are implicit-init as 0
+}
+indent() { $sed -e '2,$s/^/\t\t/' -e 's/\s*$//'; }  # also trim trailing spaces
+# only EastAsianWidth.txt has version/date
+ver=$(head -n 1 < "$eaw_file" | $sed -e 's/.*-//' -e 's/.txt//')
+udate=$(head -n 2 < "$eaw_file" | tail -n 1 | $sed -e 's/.*Date: //')
+ranges | verify_ranges || exit
+# at the C code, p/bot/top/mid can be $idx_t, but faster as 32 bit types
+[ "$(ranges | wc -l)" -lt 65536 ] && idx_t=$u16 || idx_t=$u32
 cat << CFUNCTION
-/* wcwidth - Unicode $ver, generated by $0.
+/* wcwidth - Unicode $ver
- * Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
+ * Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
 * License: MIT
 *
- * Data imported on $(date -u -I) from https://github.com/jquast/wcwidth
+ * Generated by $self on $(date -u -I) using the Unicode files
- * commit $(pwc_git describe --tags) ($(pwc_git show --no-patch --format=%ci))
+ * $ud_file and $eaw_file ($udate)
 */
-int ${FUNC_ATTR-} wcwidth($u32 ucs)
+int ${FUNC_ATTR}${FN:-wcwidth}($u32 ucs)
 {
        /* sorted ranges, "first" is clipped to 16 bit, and its high bits
         * (plane) are deduced from the "planes" array below.
-         * (imported from ${zerowidth_py##*/} and ${widewidth_py##*/})
         */
-        static const struct range {
+        static const struct range {  /* bitfield order empirically fast */
-                uint16_t first;
+                $u16 first:  16;
-                uint16_t iswide: 1;  /* bitfield order empirically faster */
+                $u16 iswide:  1;
-                uint16_t difflast: 15;
+                $u16 delta:  15;
        } ranges[] = {
        #define R(first, last, width) {first & 0xffff, width/2, last-first}
-$(data | indent)
+                $(ranges | as_c_ranges | indent)
        #undef  R
        };
        /* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
-        static const $u16 planes[/* 18 */] = {
+        static const $idx_t planes[18] = {
-$(data | mkplanes | fold -s -w 60 | indent)
+                $(ranges | extract_plane_indices | fold -s -w 60 | indent)
        };
        /******* END OF STATIC DATA *******/
@@ -158,7 +324,7 @@ $(data | mkplanes | fold -s -w 60 | indent)
                $u32 mid = (bot + top) / 2;
                if (ucs < ranges[mid].first)
                        top = mid;
-                else if (ucs > ranges[mid].first + ranges[mid].difflast)
+                else if (ucs > ranges[mid].first + ranges[mid].delta)
                        bot = mid + 1;
                else
                        return 2 * ranges[mid].iswide;
@@ -167,3 +333,28 @@ $(data | mkplanes | fold -s -w 60 | indent)
        return 1;
 }  /* wcwidth - Unicode $ver */
 CFUNCTION
+# C drop-in replacement to the awk script which outputs the final ranges list
+: << \MERGE_RANGES_C
+#include <stdio.h>
+int main(void)
+{
+    static char w[0x110000 + 1]; /* = {0} which is width==1 */
+    unsigned a, b, c, i, first;  /* assume 32bit (need 21) */
+    while (3 == scanf("%u%u%u", &a, &b, &c))  /* FIRST LAST WIDTH (W:0/1/2) */
+        if (b < 0x110000 && c < 3) while (a <= b) w[a++] = (char)c + 1;
+    /* HEXFIRST HEXLAST WIDTH  (W: 0/2, not cross-plane, LAST-FIRST<0x8000) */
+    for (first = 0, i = 1; i <= 0x110000; ++i) {
+        if (w[i] == w[i-1] && i % 0x10000 && i - first < 0x8000)
+            continue;
+        if (w[i-1] & 1)  /* 1 or 3 -> width is 0 or 2 for first..i-1 */
+            printf("%06x %06x %d\n", first, i-1, w[i-1]-1);
+        first = i;
+    }
+    return 0;
+}
+MERGE_RANGES_C
author	Avi Halachmi (:avih) <avihpit@yahoo.com>	2025-12-26 00:02:45 +0200
committer	Ron Yorston <rmy@pobox.com>	2025-12-29 13:57:05 +0000
commit	6f847c296bf94401c0c147bf3c5aa1e8531b9dbf (patch)
tree	0348f7468392ce5f413515796566451577d2ce7c /scripts/mkwcwidth
parent	aa552f0883ab1320c2a65679ee8cd683629af7ec (diff)
download	busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.tar.gz busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.tar.bz2 busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.zip

diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth index 792045a29..515128abb 100755 --- a/scripts/mkwcwidth +++ b/scripts/mkwcwidth
@@ -1,11 +1,10 @@
1	#!/bin/sh	1	#!/bin/sh
2	#	2	#
3	# Generate a C implementation of wcwidth, with latest unicode data	3	# Generate a wcwidth C implementation from Unicode data (tested v7 - v17)
4	# from a local clone of https://github.com/jquast/wcwidth
5	#	4	#
6	# The MIT License (MIT)	5	# The MIT License (MIT)
7	#	6	#
8	# Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>	7	# Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
9	#	8	#
10	# Permission is hereby granted, free of charge, to any person obtaining a copy	9	# Permission is hereby granted, free of charge, to any person obtaining a copy
11	# of this software and associated documentation files (the "Software"), to deal	10	# of this software and associated documentation files (the "Software"), to deal
@@ -25,117 +24,284 @@
25	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE	24	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26	# SOFTWARE.	25	# SOFTWARE.
27		26
		27
		28	# Latest Unicode data source files:
		29	# https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
		30	# https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
		31	#
		32	# License and term of use in either URL:
		33	# http://www.unicode.org/copyright.html
		34	# http://www.unicode.org/terms_of_use.html
		35
		36
28	export LC_ALL=C	37	export LC_ALL=C
29	self=${0##*/}	38	self=${0##*/}
		39	awk=${AWK:-awk} sed=${SED:-sed}
30		40
31	# c-types (bigger types work but waste memory. uintN_t need <stdint.h>)	41	URL_base=https://www.unicode.org/Public/UCD/latest/ucd
32	u32=uint32_t # "unsigned" is also typically 32 bit	42	URL_verbase=https://www.unicode.org/Public/%s/ucd
33	u16=uint16_t # "unsigned short" is also typically 16 bits	43	ud_file=UnicodeData.txt
34	FUNC_ATTR=FAST_FUNC # delete this line if not generating a busybox function	44	eaw_file=EastAsianWidth.txt
35		45
		46	# int by default (int is 32 bit and short is 16 on unix/linux/osx/windows, and
		47	# doesn't require stdint.h - https://en.cppreference.com/w/cpp/language/types)
		48	u32=${U32:-unsigned} # or uint32_t
		49	u16=${U16:-unsigned short} # or uint16_t
36		50
37	err() { >&2 printf %s\\n "$self: $*"; exit 1; }	51	err() { >&2 printf %s\\n "$self: $*"; exit 1; }
38		52
39	case ${1-} in -h \| --help)	53	case ${1-} in -h \| --help)
40	echo "Usage: $self [path/to/python-wcwidth] (default path is '.')"	54	echo "Usage: $self [DL=VERSION] [FATTR]"
41	echo "Prints a wcwidth C implementation, with latest Unicode data"	55	echo "Print a wcwidth C implementation to stdout."
42	echo "imported from a local https://github.com/jquast/wcwidth repo."	56	echo
43	echo "Assumptions about table_zero.py and table_wide.py at the repo:"	57	echo "Uses the files ./$ud_file and ./$eaw_file ."
44	echo "- Each range is in one Unicode plane (a>>16 == b>>16) (enforced)."	58	echo "If the files are missing, the latest will be downloaded."
45	echo "- Commit 04d6d90c (2023-10-30) or later, where table_zero.py"	59	echo
46	echo " includes zero-width Cf chars (else need to add manual tests)."	60	echo "If DL=XXX is provided, force-download + overwrite, where XXX is:"
		61	echo "- 'latest' -> $URL_base/..."
		62	echo "- 'draft', '16.0.0' etc -> $(printf "$URL_verbase" XXX)/..."
		63	echo
		64	echo "If given, FATTR will be inserted as 'int FATTR wcwidth(...) {...}'."
		65	echo "Optional env vars:"
		66	echo ' $FN Function name to generate. Default: wcwidth.'
		67	echo ' $U32, $U16 unsigned codepoint, u16 C-types. Default: int/short.'
		68	echo ' $AWK, $SED used programs (word-split, e.g. AWK="busybox awk").'
		69	echo
		70	echo 'Requires: sh, curl/wget, sed, awk, and few more POSIX utilities.'
		71	exit
47	esac	72	esac
		73	[ "${1-}" = -- ] && shift
48		74
49	[ "${1-}" != -- ] \|\| shift	75	DL=
		76	case ${1-} in DL=*) DL=${1#DL=}; shift; esac
		77	[ "${DL-}" ] && [ "$DL" != latest ] && URL_base=$(printf "$URL_verbase" "$DL")
50		78
51	pwc_root=${1:-.}	79	FUNC_ATTR=${1:+$1 }
52	pwc_git() { git -C "$pwc_root" "$@"; }
53		80
54	zerowidth_py=$pwc_root/wcwidth/table_zero.py	81	url2file() { # wget errors on 404-not-found etc, curl requires -f for that
55	widewidth_py=$pwc_root/wcwidth/table_wide.py	82	>&2 echo "[$1 -> $2]"
		83	rm -f -- "$2.tmp"
		84	{ { [ "$(which wget)" ] && wget -O "$2.tmp" "$1"; } \|\|
		85	{ [ "$(which curl)" ] && curl -f -o "$2.tmp" "$1"; }
		86	} && mv -- "$2.tmp" "$2"
		87	}
56		88
57	[ -r "$zerowidth_py" ] && [ -r "$widewidth_py" ] \	89	[ -r "$ud_file" ] && [ -r "$eaw_file" ] && [ -z "$DL" ] \|\| {
58	\|\| err "missing $zerowidth_py or $widewidth_py. abort."	90	url2file "$URL_base/$ud_file" "$ud_file" &&
		91	url2file "$URL_base/$eaw_file" "$eaw_file" \|\|
		92	err "can't download $ud_file and/or $eaw_file. abort."
		93	}
59		94
60	# latest unicode version from table_wide.py (e.g. from " '10.0.0': (")
61	ver=$(grep "^\s'[0-9]" < "$widewidth_py" \| tail -n1 \| sed "s/.'\(.\)'./\1/")
62		95
63	# stdin -> stdout: extract the data of the last table (latest spec) from	96	# extractors of zero-width and wide-width ranges from the Unicode data files.
64	# wcwidth/table_{wide,zero}.py (from https://github.com/jquast/wcwidth)	97	#
65	last_table() {	98	# At https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - the mother of most current
66	awk "/^\s*'[0-9]/ { i=0 } # new table -> reset	99	# wcwidth implementations, by Markus Kuhn (Unicode 5.0) - the width is:
67	/^\s*\(0x/ { arr[++i] = \$0 } # range (first, last)	100	# - 0 for codepoint 0.
68	END { for (j=1; j <= i; ++j) print arr[j] }"	101	# - -1 for control ranges C0/C1, and DEL (1-31, 128-159, 127)
		102	# - 0 if U+200B, cat is Me\|Mn\|Cf but not U+00AD, Hangul Jamo U+1160..U+11FF.
		103	# - else 2 if East_Asian_Width property is F\|W (fullwidth, wide).
		104	# - else 1.
		105	#
		106	# We add as 0-width:
		107	# - Category Mc (combining, diacritics[-like])
		108	# - Hangul Jamo Extended-B U+D7B0..U+D7FF (combining, wasn't in Unicode 5.0)
		109	# - Emoji modifiers U+1F3FB..U+1F3FF (wasn't in Unicode 5.0)
		110	#
		111	# Hangul Jamo, Emoji mods are identified at the data automatically by name.
		112	# U+200B is category Cf since at least 5.0.0 - no need for manual override.
		113	#
		114	# Python wcwidth https://github.com/jquast/wcwidth is like us, plus 0-width:
		115	# - Cat Zl,Zp (2 codepoints: line/para. sep.), but terminals disagree on them.
		116	# - Few unassigned codepoints at Hangul Jamo Extended-B - it adds the range
		117	# manually (includes unassigned), we get (assigned) codepoints at the data.
		118	# - (U+00AD was 0, fixed to 1 after https://github.com/jquast/wcwidth/issues/8)
		119	#
		120	#
		121	# UnicodeData.txt is lines of:
		122	# CODEPOINT;NAME;CATEGORY;(...more) where CODEPOINT is 4-6 hex digits
		123	# or pairs of:
		124	# CODEPOINT;<XNAME, First>;CATEGORY;(...more) where XNAME is group-ish name
		125	# CODEPOINT;<XNAME, Last>;CATEGORY;(...more)
		126	#
		127	# EastAsianWidth.txt is lines of:
		128	# #... (comment)
		129	# or
		130	# CODEPOINT[..LAST];EAW # <stuff> (4-6 hex digits, maybe spaces around ";")
		131	# where EAW is East_Asian_Width property ("F", "W", "A", "N", etc)
		132
		133	# replace First/Last pairs in UnicodeData with one line of FIRST..LAST range.
		134	# assume that First/Last are at consecutive lines, and XNAME/CAT don't change.
		135	# (no-op for us at Unicode v16 - doesn't add new wide/zero width codepoints)
		136	ud_file_ranges() {
		137	$awk -F\; '$2 ~ /, First>$/ { printf $1 ".."; next }; 1' < "$ud_file"
69	}	138	}
70		139
71	# stdin -> stdout, $1 is the range's (wc)width (0 or 2), e.g.	140	# output unsorted/overlapping lines of hex "CODEPOINT[..LAST] WIDTH" (0/1/2)
72	# from: (0x0123a, 0x0123c,), # comment	141	raw_ranges() {
73	# to : R(0x00123a, 0x00123c, 2), /* comment */	142	# wide ranges according to the EAW property
74	# ranges bigger than half-plane (32769+ codepoints) are split to two.	143	$sed -e '/^#/d' -e 's/;/ /' < "$eaw_file" \| # comment lines, ';'
75	py_data_to_c() {	144	$awk '$2 ~ /^[FW]$/ { print $1, 2 }'
76	sed -e 's/[(),]/ /g' -e 's\|#\(.\)\|/\1 */\|' \| while read a b c; do	145
77	# to support cross-plane ranges, we'd need to split them here,	146	# wide by name (older Unicode had EAW==N for some FULLWIDTH names)
78	# but unlikely required, as all planes end in non-characters.	147	ud_file_ranges \| $awk -F\; '$2 ~ /FULLWIDTH/ { print $1, 2 }'
79	[ $(($a>>16)) = $(($b>>16)) ] \|\| err "not same plane -- $a $b"	148
80		149	# zero-width ranges (override wide-width)
81	a=$(($a)) b=$(($b)) # some shells want decimal vars in $(())	150	ud_file_ranges \| $awk -F\; '
82	if [ "$((b-a))" -ge 32768 ]; then # split to 15 bit ranges	151	$3 ~ /^(Me\|Mn\|Mc\|Cf)$/ \|\|
83	printf "R(0x%06x, 0x%06x, $1), %s\n" $a $((a+32767)) "$c"	152	$2 ~ /EMOJI MODIFIER\|HANGUL J[OU]NGSEONG/ { print $1, 0 }'
84	a=$((a+32768)) c="/* (continued...) */"	153
85	fi	154	# override soft-hyphen as width 1
86	printf "R(0x%06x, 0x%06x, $1), %s\n" $a $b "$c"	155	echo 00AD 1
		156	}
		157
		158	# lines of hex "CODEPOINT[..LAST] WIDTH" -> decimal "FIRST LAST WIDTH"
		159	as_decimal_ranges() {
		160	while read range width; do
		161	echo $((0x${range%%.})) $((0x${range##.})) $width
87	done	162	done
88	}	163	}
89		164
90	data=$(last_table < "$zerowidth_py" \| py_data_to_c 0 &&	165	# to generate the final ranges list, we need to sort the raw ranges list,
91	last_table < "$widewidth_py" \| py_data_to_c 2) \|\| err abort	166	# ensure zero-width overrides wide-width, join adjacent/overlap ranges,
92	data=$(printf %s\\n "$data" \| sort) # lexicographic here is also numeric	167	# and split big ranges - our C structure uses 15 bits to store LAST-FIRST of
		168	# each range, and ranges are not allowed to cross plane boundary.
		169	#
		170	# we use simple (and slow) approach: codepoint-bucket-store all the individual
		171	# widths at the input ranges as they come, then scan the whole Unicode range
		172	# (~ 1M buckets) and output continuous ranges of width 0/2. Therefore, later
		173	# inputs of codepoint X override earlier, so width 2 should arrive before W 0.
		174	#
		175	# there's ~3K input ranges of ~200K codepoints, and the output is 450+ ranges.
		176	# depending on sys/awk performance, it's 200ms - 2s, up to ~7s on low-end ARM.
		177	# MERGE_RANGES_C (bottom of this file) is drop-in replacement, runs in few ms.
		178
		179	# sorted lines of "FIRST LAST WIDTH" (FIRST/LAST: hex digits, WIDTH: 0 or 2)
		180	ranges=$(raw_ranges \| as_decimal_ranges \| $awk '
		181
		182	# The final ~ 1M codepoints scan can take few secs, up to 30+s on low end.
		183	# so use a simple optimization: divide the unicode range to segments which
		184	# do not cross plane boundary (some 2^X, tuned for speed), mark segments
		185	# where an input range can change the width at the final scan (its egdes),
		186	# then skip unmarked segments at the final output scan, as these segs are
		187	# guaranteed to not have any width change. this is effective (x5 speedup)
		188	# because most segments are empty or dense. If the input was spread evenly
		189	# then it would be more effective to store, sort and iterate exactly these
		190	# edges. Currently though, both are same speed, and mark/skip is less code.
93		191
94	# sorted hex ranges and their (wc)width: R(first, last, {0\|2}),[ /* ... */]	192	BEGIN { SEG = 128 } # 128 or 256 seem fastest
95	data() { printf %s\\n "$data"; }	193	function mark(i) { segs[int(i/SEG)] = 1 }
		194	function skip(i) { return !(int(i/SEG) in segs) }
96		195
97	repeat() { R=$2; while [ "$R" -gt 0 ]; do printf %s "$1"; R=$((R-1)); done; }	196	# input range $1..$2: width may differ at $1 vs $1-1, and at $2+1 vs $2
		197	{ mark($1); mark($2+1) }
98		198
99	# data -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data	199	# store/overwrite width-1 (1 -> 0) for every codepoint in this range
100	mkplanes() {	200	{ v=$3-1; for (i=$1; i<=$2; w[i++]=v); }
101	i=0 lastp=-1	201
102	while read a b c; do	202	function min(a,b) { return a<b ? a : b }
103	p=$((${b%?} >> 16)) # plane (last >> 16)	203	END {
104	repeat "$i, " $((p-lastp))	204	# scan the full unicode range, print continuous ranges of width 0/2
105	i=$((i+1)) lastp=$p	205	# the SEG skip is SEG-aligned. to disable: use unconditional i++
		206	for (i = 0; i <= 1114112; skip(i) ? i+=SEG : i++) { # 0x110000
		207	if (w[i] == wprev && i%65536) # no change of width or plane
		208	continue
		209	if (wprev) # range i0..i-1 is width 0 or 2 - print
		210	for (; i0 < i; i0 += 32768) # split (once) if too big
		211	printf "%06x %06x %d\n", i0, min(i0+32767, i-1), 1+wprev
		212	i0 = i
		213	wprev = w[i]
		214	}
		215	}
		216	')
		217
		218	ranges() { printf %s\\n "$ranges"; }
		219
		220	verify_ranges() {(
		221	errv() { err "verify: bad range: $* -- (0x$ha, 0x$hb, $w)"; }
		222	ishex() { case $1 in '' \| [!0-9A-Fa-f] ) false; esac; } # LC_ALL=C
		223
		224	pa=-2 pb=-2 pw=x # previous range values
		225	while read ha hb w; do
		226	[ "$w" = 0 ] \|\| [ "$w" = 2 ] \|\| errv WIDTH is not 0 or 2
		227	ishex "$ha" && ishex "$hb" \|\| errv not hex values
		228
		229	a=$((0x$ha)) b=$((0x$hb)) # decimal
		230	[ $a -le $b ] \|\| errv "FIRST > LAST"
		231	[ $b -le 1114111 ] \|\| errv out of bounds
		232	[ $((a>>16)) = $((b>>16)) ] \|\| errv not same plane
		233	[ $((b-a)) -lt 32768 ] \|\| errv more than 0x8000 values
		234	[ $a -gt $pb ] \|\| errv overlap or not sorted
		235
		236	# detect adjacent ranges which combined are <= 0x8000 values.
		237	# not detecting 3 adjacent 0x5000 ranges, but shouldn't happen
		238	[ $w != $pw ] \|\| [ $a != $((pb+1)) ] \|\| [ $((b-pa)) -ge 32768 ] \|\|
		239	errv small same-width adjacent ranges not combined
		240
		241	pa=$a pb=$b pw=$w
		242	done
		243	)}
		244
		245
		246	as_c_ranges() {
		247	while read a b w; do
		248	echo "R(0x$a, 0x$b, $w),"
106	done	249	done
107	repeat "$i, " $((17-lastp))
108	}	250	}
109		251
110	indent() { sed -e 's/^/\t\t/' -e 's/\s*$//'; } # also trim trailing spaces	252	# ranges -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
		253	# (inclusive, exclussive. arr of 18 items - last plane (16) is a[16]..a[17])
		254	extract_plane_indices() {
		255	# for each plane (0..16) print the 1st index at ranges where it appears
		256	# or, if missing, index of the next found plane (ranges are same-plane)
		257	i=0 pdone=-1 # index at ranges, plane whose index was last printed
		258	while read a dummy; do # a: ranges[i].first (hex, plane(a) == a>>16)
		259	while [ $pdone -lt $((0x$a >> 16)) ]; do
		260	printf "$i, "
		261	pdone=$((pdone+1))
		262	done # pdone == plane(a)
		263	i=$((i+1))
		264	done
		265	printf $i # one past last range. the rest are implicit-init as 0
		266	}
		267
		268	indent() { $sed -e '2,$s/^/\t\t/' -e 's/\s*$//'; } # also trim trailing spaces
		269
		270	# only EastAsianWidth.txt has version/date
		271	ver=$(head -n 1 < "$eaw_file" \| $sed -e 's/.*-//' -e 's/.txt//')
		272	udate=$(head -n 2 < "$eaw_file" \| tail -n 1 \| $sed -e 's/.*Date: //')
		273
		274	ranges \| verify_ranges \|\| exit
		275
		276	# at the C code, p/bot/top/mid can be $idx_t, but faster as 32 bit types
		277	[ "$(ranges \| wc -l)" -lt 65536 ] && idx_t=$u16 \|\| idx_t=$u32
111		278
112	cat << CFUNCTION	279	cat << CFUNCTION
113	/* wcwidth - Unicode $ver, generated by $0.	280	/* wcwidth - Unicode $ver
114	* Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>	281	* Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
115	* License: MIT	282	* License: MIT
116	*	283	*
117	* Data imported on $(date -u -I) from https://github.com/jquast/wcwidth	284	* Generated by $self on $(date -u -I) using the Unicode files
118	* commit $(pwc_git describe --tags) ($(pwc_git show --no-patch --format=%ci))	285	* $ud_file and $eaw_file ($udate)
119	*/	286	*/
120	int ${FUNC_ATTR-} wcwidth($u32 ucs)	287	int ${FUNC_ATTR}${FN:-wcwidth}($u32 ucs)
121	{	288	{
122	/* sorted ranges, "first" is clipped to 16 bit, and its high bits	289	/* sorted ranges, "first" is clipped to 16 bit, and its high bits
123	* (plane) are deduced from the "planes" array below.	290	* (plane) are deduced from the "planes" array below.
124	* (imported from ${zerowidth_py##/} and ${widewidth_py##/})
125	*/	291	*/
126	static const struct range {	292	static const struct range { /* bitfield order empirically fast */
127	uint16_t first;	293	$u16 first: 16;
128	uint16_t iswide: 1; /* bitfield order empirically faster */	294	$u16 iswide: 1;
129	uint16_t difflast: 15;	295	$u16 delta: 15;
130	} ranges[] = {	296	} ranges[] = {
131	#define R(first, last, width) {first & 0xffff, width/2, last-first}	297	#define R(first, last, width) {first & 0xffff, width/2, last-first}
132	$(data \| indent)	298	$(ranges \| as_c_ranges \| indent)
133	#undef R	299	#undef R
134	};	300	};
135		301
136	/* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */	302	/* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
137	static const $u16 planes[/* 18 */] = {	303	static const $idx_t planes[18] = {
138	$(data \| mkplanes \| fold -s -w 60 \| indent)	304	$(ranges \| extract_plane_indices \| fold -s -w 60 \| indent)
139	};	305	};
140		306
141	/***** END OF STATIC DATA *****/	307	/***** END OF STATIC DATA *****/
@@ -158,7 +324,7 @@ $(data \| mkplanes \| fold -s -w 60 \| indent)
158	$u32 mid = (bot + top) / 2;	324	$u32 mid = (bot + top) / 2;
159	if (ucs < ranges[mid].first)	325	if (ucs < ranges[mid].first)
160	top = mid;	326	top = mid;
161	else if (ucs > ranges[mid].first + ranges[mid].difflast)	327	else if (ucs > ranges[mid].first + ranges[mid].delta)
162	bot = mid + 1;	328	bot = mid + 1;
163	else	329	else
164	return 2 * ranges[mid].iswide;	330	return 2 * ranges[mid].iswide;
@@ -167,3 +333,28 @@ $(data \| mkplanes \| fold -s -w 60 \| indent)
167	return 1;	333	return 1;
168	} /* wcwidth - Unicode $ver */	334	} /* wcwidth - Unicode $ver */
169	CFUNCTION	335	CFUNCTION
		336
		337
		338	# C drop-in replacement to the awk script which outputs the final ranges list
		339	: << \MERGE_RANGES_C
		340	#include <stdio.h>
		341	int main(void)
		342	{
		343	static char w[0x110000 + 1]; /* = {0} which is width==1 */
		344	unsigned a, b, c, i, first; /* assume 32bit (need 21) */
		345
		346	while (3 == scanf("%u%u%u", &a, &b, &c)) /* FIRST LAST WIDTH (W:0/1/2) */
		347	if (b < 0x110000 && c < 3) while (a <= b) w[a++] = (char)c + 1;
		348
		349	/* HEXFIRST HEXLAST WIDTH (W: 0/2, not cross-plane, LAST-FIRST<0x8000) */
		350	for (first = 0, i = 1; i <= 0x110000; ++i) {
		351	if (w[i] == w[i-1] && i % 0x10000 && i - first < 0x8000)
		352	continue;
		353	if (w[i-1] & 1) /* 1 or 3 -> width is 0 or 2 for first..i-1 */
		354	printf("%06x %06x %d\n", first, i-1, w[i-1]-1);
		355	first = i;
		356	}
		357
		358	return 0;
		359	}
		360	MERGE_RANGES_C