1 files changed, 360 insertions, 0 deletions
diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth
new file mode 100755
index 000000000..515128abb
--- /dev/null
+++ b/scripts/mkwcwidth
@@ -0,0 +1,360 @@
+#!/bin/sh
+#
+# Generate a wcwidth C implementation from Unicode data (tested v7 - v17)
+#
+# The MIT License (MIT)
+#
+# Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# Latest Unicode data source files:
+#   https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
+#   https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
+#
+# License and term of use in either URL:
+#   http://www.unicode.org/copyright.html
+#   http://www.unicode.org/terms_of_use.html
+export LC_ALL=C
+self=${0##*/}
+awk=${AWK:-awk} sed=${SED:-sed}
+URL_base=https://www.unicode.org/Public/UCD/latest/ucd
+URL_verbase=https://www.unicode.org/Public/%s/ucd
+ud_file=UnicodeData.txt
+eaw_file=EastAsianWidth.txt
+# int by default (int is 32 bit and short is 16 on unix/linux/osx/windows, and
+# doesn't require stdint.h - https://en.cppreference.com/w/cpp/language/types)
+u32=${U32:-unsigned}        # or uint32_t
+u16=${U16:-unsigned short}  # or uint16_t
+err() { >&2 printf %s\\n "$self: $*"; exit 1; }
+case ${1-} in -h | --help)
+        echo "Usage: $self [DL=VERSION] [FATTR]"
+        echo "Print a wcwidth C implementation to stdout."
+        echo
+        echo "Uses the files ./$ud_file and ./$eaw_file ."
+        echo "If the files are missing, the latest will be downloaded."
+        echo
+        echo "If DL=XXX is provided, force-download + overwrite, where XXX is:"
+        echo "- 'latest' -> $URL_base/..."
+        echo "- 'draft', '16.0.0' etc -> $(printf "$URL_verbase" XXX)/..."
+        echo
+        echo "If given, FATTR will be inserted as 'int FATTR wcwidth(...) {...}'."
+        echo "Optional env vars:"
+        echo '  $FN           Function name to generate. Default: wcwidth.'
+        echo '  $U32, $U16    unsigned codepoint, u16 C-types. Default: int/short.'
+        echo '  $AWK, $SED    used programs (word-split, e.g. AWK="busybox awk").'
+        echo
+        echo 'Requires: sh, curl/wget, sed, awk, and few more POSIX utilities.'
+        exit
+esac
+[ "${1-}" = -- ] && shift
+DL=
+case ${1-} in DL=*) DL=${1#DL=}; shift; esac
+[ "${DL-}" ] && [ "$DL" != latest ] && URL_base=$(printf "$URL_verbase" "$DL")
+FUNC_ATTR=${1:+$1 }
+url2file() {  # wget errors on 404-not-found etc, curl requires -f for that
+        >&2 echo "[$1  ->  $2]"
+        rm -f -- "$2.tmp"
+        { { [ "$(which wget)" ] && wget    -O "$2.tmp" "$1"; } ||
+          { [ "$(which curl)" ] && curl -f -o "$2.tmp" "$1"; }
+        } && mv -- "$2.tmp" "$2"
+}
+[ -r "$ud_file" ] && [ -r "$eaw_file" ] && [ -z "$DL" ] || {
+        url2file "$URL_base/$ud_file"  "$ud_file"  &&
+        url2file "$URL_base/$eaw_file" "$eaw_file" ||
+                err "can't download $ud_file and/or $eaw_file. abort."
+}
+# extractors of zero-width and wide-width ranges from the Unicode data files.
+#
+# At https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - the mother of most current
+# wcwidth implementations, by Markus Kuhn (Unicode 5.0) - the width is:
+# - 0 for codepoint 0.
+# - -1 for control ranges C0/C1, and DEL (1-31, 128-159, 127)
+# - 0 if U+200B, cat is Me|Mn|Cf but not U+00AD, Hangul Jamo U+1160..U+11FF.
+# - else 2 if East_Asian_Width property is F|W (fullwidth, wide).
+# - else 1.
+#
+# We add as 0-width:
+# - Category Mc (combining, diacritics[-like])
+# - Hangul Jamo Extended-B U+D7B0..U+D7FF (combining, wasn't in Unicode 5.0)
+# - Emoji modifiers U+1F3FB..U+1F3FF (wasn't in Unicode 5.0)
+#
+# Hangul Jamo, Emoji mods are identified at the data automatically by name.
+# U+200B is category Cf since at least 5.0.0 - no need for manual override.
+#
+# Python wcwidth https://github.com/jquast/wcwidth is like us, plus 0-width:
+# - Cat Zl,Zp (2 codepoints: line/para. sep.), but terminals disagree on them.
+# - Few unassigned codepoints at Hangul Jamo Extended-B - it adds the range
+#   manually (includes unassigned), we get (assigned) codepoints at the data.
+# - (U+00AD was 0, fixed to 1 after https://github.com/jquast/wcwidth/issues/8)
+#
+#
+# UnicodeData.txt is lines of:
+#   CODEPOINT;NAME;CATEGORY;(...more) where CODEPOINT is 4-6 hex digits
+# or pairs of:
+#   CODEPOINT;<XNAME, First>;CATEGORY;(...more)  where XNAME is group-ish name
+#   CODEPOINT;<XNAME, Last>;CATEGORY;(...more)
+#
+# EastAsianWidth.txt is lines of:
+#   #... (comment)
+# or
+#   CODEPOINT[..LAST];EAW # <stuff>  (4-6 hex digits, maybe spaces around ";")
+# where EAW is East_Asian_Width property ("F", "W", "A", "N", etc)
+# replace First/Last pairs in UnicodeData with one line of FIRST..LAST range.
+# assume that First/Last are at consecutive lines, and XNAME/CAT don't change.
+# (no-op for us at Unicode v16 - doesn't add new wide/zero width codepoints)
+ud_file_ranges() {
+        $awk -F\; '$2 ~ /, First>$/ { printf $1 ".."; next }; 1' < "$ud_file"
+}
+# output unsorted/overlapping lines of hex "CODEPOINT[..LAST] WIDTH" (0/1/2)
+raw_ranges() {
+        # wide ranges according to the EAW property
+        $sed -e '/^#/d' -e 's/;/ /' < "$eaw_file" |  # comment lines, ';'
+        $awk '$2 ~ /^[FW]$/ { print $1, 2 }'
+        # wide by name (older Unicode had EAW==N for some FULLWIDTH names)
+        ud_file_ranges | $awk -F\; '$2 ~ /FULLWIDTH/ { print $1, 2 }'
+        # zero-width ranges (override wide-width)
+        ud_file_ranges | $awk -F\; '
+                $3 ~ /^(Me|Mn|Mc|Cf)$/ ||
+                $2 ~ /EMOJI MODIFIER|HANGUL J[OU]NGSEONG/ { print $1, 0 }'
+        # override soft-hyphen as width 1
+        echo 00AD 1
+}
+# lines of hex "CODEPOINT[..LAST] WIDTH"  ->  decimal "FIRST LAST WIDTH"
+as_decimal_ranges() {
+        while read range width; do
+                echo $((0x${range%%.*})) $((0x${range##*.})) $width
+        done
+}
+# to generate the final ranges list, we need to sort the raw ranges list,
+# ensure zero-width overrides wide-width, join adjacent/overlap ranges,
+# and split big ranges - our C structure uses 15 bits to store LAST-FIRST of
+# each range, and ranges are not allowed to cross plane boundary.
+#
+# we use simple (and slow) approach: codepoint-bucket-store all the individual
+# widths at the input ranges as they come, then scan the whole Unicode range
+# (~ 1M buckets) and output continuous ranges of width 0/2. Therefore, later
+# inputs of codepoint X override earlier, so width 2 should arrive before W 0.
+#
+# there's ~3K input ranges of ~200K codepoints, and the output is 450+ ranges.
+# depending on sys/awk performance, it's 200ms - 2s, up to ~7s on low-end ARM.
+# MERGE_RANGES_C (bottom of this file) is drop-in replacement, runs in few ms.
+# sorted lines of "FIRST LAST WIDTH" (FIRST/LAST: hex digits, WIDTH: 0 or 2)
+ranges=$(raw_ranges | as_decimal_ranges | $awk '
+    # The final ~ 1M codepoints scan can take few secs, up to 30+s on low end.
+    # so use a simple optimization: divide the unicode range to segments which
+    # do not cross plane boundary (some 2^X, tuned for speed), mark segments
+    # where an input range can change the width at the final scan (its egdes),
+    # then skip unmarked segments at the final output scan, as these segs are
+    # guaranteed to not have any width change. this is effective (x5 speedup)
+    # because most segments are empty or dense. If the input was spread evenly
+    # then it would be more effective to store, sort and iterate exactly these
+    # edges. Currently though, both are same speed, and mark/skip is less code.
+    BEGIN { SEG = 128 }  # 128 or 256 seem fastest
+    function mark(i) { segs[int(i/SEG)] = 1 }
+    function skip(i) { return !(int(i/SEG) in segs) }
+    # input range $1..$2: width may differ at $1 vs $1-1, and at $2+1 vs $2
+    { mark($1); mark($2+1) }
+    # store/overwrite width-1 (1 -> 0) for every codepoint in this range
+    { v=$3-1; for (i=$1; i<=$2; w[i++]=v); }
+    function min(a,b) { return a<b ? a : b }
+    END {
+        # scan the full unicode range, print continuous ranges of width 0/2
+        # the SEG skip is SEG-aligned. to disable: use unconditional i++
+        for (i = 0; i <= 1114112; skip(i) ? i+=SEG : i++) {  # 0x110000
+            if (w[i] == wprev && i%65536)  # no change of width or plane
+                continue
+            if (wprev)  # range i0..i-1 is width 0 or 2 - print
+                for (; i0 < i; i0 += 32768)  # split (once) if too big
+                    printf "%06x %06x %d\n", i0, min(i0+32767, i-1), 1+wprev
+            i0 = i
+            wprev = w[i]
+        }
+    }
+')
+ranges() { printf %s\\n "$ranges"; }
+verify_ranges() {(
+        errv()  { err "verify: bad range: $* -- (0x$ha, 0x$hb, $w)"; }
+        ishex() { case $1 in '' | *[!0-9A-Fa-f]* ) false; esac; }  # LC_ALL=C
+        pa=-2 pb=-2 pw=x  # previous range values
+        while read ha hb w; do
+                [ "$w" = 0 ] || [ "$w" = 2 ]  || errv WIDTH is not 0 or 2
+                ishex "$ha" && ishex "$hb"    || errv not hex values
+                a=$((0x$ha)) b=$((0x$hb))     #  decimal
+                [ $a -le $b ]                 || errv "FIRST > LAST"
+                [ $b -le 1114111 ]            || errv out of bounds
+                [ $((a>>16)) = $((b>>16)) ]   || errv not same plane
+                [ $((b-a)) -lt 32768 ]        || errv more than 0x8000 values
+                [ $a -gt $pb ]                || errv overlap or not sorted
+                # detect adjacent ranges which combined are <= 0x8000 values.
+                # not detecting 3 adjacent 0x5000 ranges, but shouldn't happen
+                [ $w != $pw ] || [ $a != $((pb+1)) ] || [ $((b-pa)) -ge 32768 ] ||
+                     errv small same-width adjacent ranges not combined
+                pa=$a pb=$b pw=$w
+        done
+)}
+as_c_ranges() {
+        while read a b w; do
+                echo "R(0x$a, 0x$b, $w),"
+        done
+}
+# ranges -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
+# (inclusive, exclussive. arr of 18 items - last plane (16) is a[16]..a[17])
+extract_plane_indices() {
+        # for each plane (0..16) print the 1st index at ranges where it appears
+        # or, if missing, index of the next found plane (ranges are same-plane)
+        i=0 pdone=-1  # index at ranges, plane whose index was last printed
+        while read a dummy; do  # a: ranges[i].first (hex, plane(a) == a>>16)
+                while [ $pdone -lt $((0x$a >> 16)) ]; do
+                        printf "$i, "
+                        pdone=$((pdone+1))
+                done  # pdone == plane(a)
+                i=$((i+1))
+        done
+        printf $i  # one past last range. the rest are implicit-init as 0
+}
+indent() { $sed -e '2,$s/^/\t\t/' -e 's/\s*$//'; }  # also trim trailing spaces
+# only EastAsianWidth.txt has version/date
+ver=$(head -n 1 < "$eaw_file" | $sed -e 's/.*-//' -e 's/.txt//')
+udate=$(head -n 2 < "$eaw_file" | tail -n 1 | $sed -e 's/.*Date: //')
+ranges | verify_ranges || exit
+# at the C code, p/bot/top/mid can be $idx_t, but faster as 32 bit types
+[ "$(ranges | wc -l)" -lt 65536 ] && idx_t=$u16 || idx_t=$u32
+cat << CFUNCTION
+/* wcwidth - Unicode $ver
+ * Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
+ * License: MIT
+ *
+ * Generated by $self on $(date -u -I) using the Unicode files
+ * $ud_file and $eaw_file ($udate)
+ */
+int ${FUNC_ATTR}${FN:-wcwidth}($u32 ucs)
+{
+        /* sorted ranges, "first" is clipped to 16 bit, and its high bits
+         * (plane) are deduced from the "planes" array below.
+         */
+        static const struct range {  /* bitfield order empirically fast */
+                $u16 first:  16;
+                $u16 iswide:  1;
+                $u16 delta:  15;
+        } ranges[] = {
+        #define R(first, last, width) {first & 0xffff, width/2, last-first}
+                $(ranges | as_c_ranges | indent)
+        #undef  R
+        };
+        /* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
+        static const $idx_t planes[18] = {
+                $(ranges | extract_plane_indices | fold -s -w 60 | indent)
+        };
+        /******* END OF STATIC DATA *******/
+        $u32 p, bot, top;
+        /* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */
+        if (ucs < 160)
+                return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0;
+        /* out of range for "planes" (and non-unicode), non-characters. */
+        /* (some also test surrogate halves, but not required by POSIX) */
+        if (ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
+                return -1;
+        p = ucs >> 16;
+        ucs &= 0xffff;
+        for (bot = planes[p], top = planes[p+1]; bot < top; ) {
+                $u32 mid = (bot + top) / 2;
+                if (ucs < ranges[mid].first)
+                        top = mid;
+                else if (ucs > ranges[mid].first + ranges[mid].delta)
+                        bot = mid + 1;
+                else
+                        return 2 * ranges[mid].iswide;
+        }
+        return 1;
+}  /* wcwidth - Unicode $ver */
+CFUNCTION
+# C drop-in replacement to the awk script which outputs the final ranges list
+: << \MERGE_RANGES_C
+#include <stdio.h>
+int main(void)
+{
+    static char w[0x110000 + 1]; /* = {0} which is width==1 */
+    unsigned a, b, c, i, first;  /* assume 32bit (need 21) */
+    while (3 == scanf("%u%u%u", &a, &b, &c))  /* FIRST LAST WIDTH (W:0/1/2) */
+        if (b < 0x110000 && c < 3) while (a <= b) w[a++] = (char)c + 1;
+    /* HEXFIRST HEXLAST WIDTH  (W: 0/2, not cross-plane, LAST-FIRST<0x8000) */
+    for (first = 0, i = 1; i <= 0x110000; ++i) {
+        if (w[i] == w[i-1] && i % 0x10000 && i - first < 0x8000)
+            continue;
+        if (w[i-1] & 1)  /* 1 or 3 -> width is 0 or 2 for first..i-1 */
+            printf("%06x %06x %d\n", first, i-1, w[i-1]-1);
+        first = i;
+    }
+    return 0;
+}
+MERGE_RANGES_C

diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth new file mode 100755 index 000000000..515128abb --- /dev/null +++ b/scripts/mkwcwidth
@@ -0,0 +1,360 @@
	1	#!/bin/sh
	2	#
	3	# Generate a wcwidth C implementation from Unicode data (tested v7 - v17)
	4	#
	5	# The MIT License (MIT)
	6	#
	7	# Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
	8	#
	9	# Permission is hereby granted, free of charge, to any person obtaining a copy
	10	# of this software and associated documentation files (the "Software"), to deal
	11	# in the Software without restriction, including without limitation the rights
	12	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	13	# copies of the Software, and to permit persons to whom the Software is
	14	# furnished to do so, subject to the following conditions:
	15	#
	16	# The above copyright notice and this permission notice shall be included in all
	17	# copies or substantial portions of the Software.
	18	#
	19	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	20	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	21	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	22	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	23	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	24	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	25	# SOFTWARE.
	26
	27
	28	# Latest Unicode data source files:
	29	# https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
	30	# https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
	31	#
	32	# License and term of use in either URL:
	33	# http://www.unicode.org/copyright.html
	34	# http://www.unicode.org/terms_of_use.html
	35
	36
	37	export LC_ALL=C
	38	self=${0##*/}
	39	awk=${AWK:-awk} sed=${SED:-sed}
	40
	41	URL_base=https://www.unicode.org/Public/UCD/latest/ucd
	42	URL_verbase=https://www.unicode.org/Public/%s/ucd
	43	ud_file=UnicodeData.txt
	44	eaw_file=EastAsianWidth.txt
	45
	46	# int by default (int is 32 bit and short is 16 on unix/linux/osx/windows, and
	47	# doesn't require stdint.h - https://en.cppreference.com/w/cpp/language/types)
	48	u32=${U32:-unsigned} # or uint32_t
	49	u16=${U16:-unsigned short} # or uint16_t
	50
	51	err() { >&2 printf %s\\n "$self: $*"; exit 1; }
	52
	53	case ${1-} in -h \| --help)
	54	echo "Usage: $self [DL=VERSION] [FATTR]"
	55	echo "Print a wcwidth C implementation to stdout."
	56	echo
	57	echo "Uses the files ./$ud_file and ./$eaw_file ."
	58	echo "If the files are missing, the latest will be downloaded."
	59	echo
	60	echo "If DL=XXX is provided, force-download + overwrite, where XXX is:"
	61	echo "- 'latest' -> $URL_base/..."
	62	echo "- 'draft', '16.0.0' etc -> $(printf "$URL_verbase" XXX)/..."
	63	echo
	64	echo "If given, FATTR will be inserted as 'int FATTR wcwidth(...) {...}'."
	65	echo "Optional env vars:"
	66	echo ' $FN Function name to generate. Default: wcwidth.'
	67	echo ' $U32, $U16 unsigned codepoint, u16 C-types. Default: int/short.'
	68	echo ' $AWK, $SED used programs (word-split, e.g. AWK="busybox awk").'
	69	echo
	70	echo 'Requires: sh, curl/wget, sed, awk, and few more POSIX utilities.'
	71	exit
	72	esac
	73	[ "${1-}" = -- ] && shift
	74
	75	DL=
	76	case ${1-} in DL=*) DL=${1#DL=}; shift; esac
	77	[ "${DL-}" ] && [ "$DL" != latest ] && URL_base=$(printf "$URL_verbase" "$DL")
	78
	79	FUNC_ATTR=${1:+$1 }
	80
	81	url2file() { # wget errors on 404-not-found etc, curl requires -f for that
	82	>&2 echo "[$1 -> $2]"
	83	rm -f -- "$2.tmp"
	84	{ { [ "$(which wget)" ] && wget -O "$2.tmp" "$1"; } \|\|
	85	{ [ "$(which curl)" ] && curl -f -o "$2.tmp" "$1"; }
	86	} && mv -- "$2.tmp" "$2"
	87	}
	88
	89	[ -r "$ud_file" ] && [ -r "$eaw_file" ] && [ -z "$DL" ] \|\| {
	90	url2file "$URL_base/$ud_file" "$ud_file" &&
	91	url2file "$URL_base/$eaw_file" "$eaw_file" \|\|
	92	err "can't download $ud_file and/or $eaw_file. abort."
	93	}
	94
	95
	96	# extractors of zero-width and wide-width ranges from the Unicode data files.
	97	#
	98	# At https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - the mother of most current
	99	# wcwidth implementations, by Markus Kuhn (Unicode 5.0) - the width is:
	100	# - 0 for codepoint 0.
	101	# - -1 for control ranges C0/C1, and DEL (1-31, 128-159, 127)
	102	# - 0 if U+200B, cat is Me\|Mn\|Cf but not U+00AD, Hangul Jamo U+1160..U+11FF.
	103	# - else 2 if East_Asian_Width property is F\|W (fullwidth, wide).
	104	# - else 1.
	105	#
	106	# We add as 0-width:
	107	# - Category Mc (combining, diacritics[-like])
	108	# - Hangul Jamo Extended-B U+D7B0..U+D7FF (combining, wasn't in Unicode 5.0)
	109	# - Emoji modifiers U+1F3FB..U+1F3FF (wasn't in Unicode 5.0)
	110	#
	111	# Hangul Jamo, Emoji mods are identified at the data automatically by name.
	112	# U+200B is category Cf since at least 5.0.0 - no need for manual override.
	113	#
	114	# Python wcwidth https://github.com/jquast/wcwidth is like us, plus 0-width:
	115	# - Cat Zl,Zp (2 codepoints: line/para. sep.), but terminals disagree on them.
	116	# - Few unassigned codepoints at Hangul Jamo Extended-B - it adds the range
	117	# manually (includes unassigned), we get (assigned) codepoints at the data.
	118	# - (U+00AD was 0, fixed to 1 after https://github.com/jquast/wcwidth/issues/8)
	119	#
	120	#
	121	# UnicodeData.txt is lines of:
	122	# CODEPOINT;NAME;CATEGORY;(...more) where CODEPOINT is 4-6 hex digits
	123	# or pairs of:
	124	# CODEPOINT;<XNAME, First>;CATEGORY;(...more) where XNAME is group-ish name
	125	# CODEPOINT;<XNAME, Last>;CATEGORY;(...more)
	126	#
	127	# EastAsianWidth.txt is lines of:
	128	# #... (comment)
	129	# or
	130	# CODEPOINT[..LAST];EAW # <stuff> (4-6 hex digits, maybe spaces around ";")
	131	# where EAW is East_Asian_Width property ("F", "W", "A", "N", etc)
	132
	133	# replace First/Last pairs in UnicodeData with one line of FIRST..LAST range.
	134	# assume that First/Last are at consecutive lines, and XNAME/CAT don't change.
	135	# (no-op for us at Unicode v16 - doesn't add new wide/zero width codepoints)
	136	ud_file_ranges() {
	137	$awk -F\; '$2 ~ /, First>$/ { printf $1 ".."; next }; 1' < "$ud_file"
	138	}
	139
	140	# output unsorted/overlapping lines of hex "CODEPOINT[..LAST] WIDTH" (0/1/2)
	141	raw_ranges() {
	142	# wide ranges according to the EAW property
	143	$sed -e '/^#/d' -e 's/;/ /' < "$eaw_file" \| # comment lines, ';'
	144	$awk '$2 ~ /^[FW]$/ { print $1, 2 }'
	145
	146	# wide by name (older Unicode had EAW==N for some FULLWIDTH names)
	147	ud_file_ranges \| $awk -F\; '$2 ~ /FULLWIDTH/ { print $1, 2 }'
	148
	149	# zero-width ranges (override wide-width)
	150	ud_file_ranges \| $awk -F\; '
	151	$3 ~ /^(Me\|Mn\|Mc\|Cf)$/ \|\|
	152	$2 ~ /EMOJI MODIFIER\|HANGUL J[OU]NGSEONG/ { print $1, 0 }'
	153
	154	# override soft-hyphen as width 1
	155	echo 00AD 1
	156	}
	157
	158	# lines of hex "CODEPOINT[..LAST] WIDTH" -> decimal "FIRST LAST WIDTH"
	159	as_decimal_ranges() {
	160	while read range width; do
	161	echo $((0x${range%%.})) $((0x${range##.})) $width
	162	done
	163	}
	164
	165	# to generate the final ranges list, we need to sort the raw ranges list,
	166	# ensure zero-width overrides wide-width, join adjacent/overlap ranges,
	167	# and split big ranges - our C structure uses 15 bits to store LAST-FIRST of
	168	# each range, and ranges are not allowed to cross plane boundary.
	169	#
	170	# we use simple (and slow) approach: codepoint-bucket-store all the individual
	171	# widths at the input ranges as they come, then scan the whole Unicode range
	172	# (~ 1M buckets) and output continuous ranges of width 0/2. Therefore, later
	173	# inputs of codepoint X override earlier, so width 2 should arrive before W 0.
	174	#
	175	# there's ~3K input ranges of ~200K codepoints, and the output is 450+ ranges.
	176	# depending on sys/awk performance, it's 200ms - 2s, up to ~7s on low-end ARM.
	177	# MERGE_RANGES_C (bottom of this file) is drop-in replacement, runs in few ms.
	178
	179	# sorted lines of "FIRST LAST WIDTH" (FIRST/LAST: hex digits, WIDTH: 0 or 2)
	180	ranges=$(raw_ranges \| as_decimal_ranges \| $awk '
	181
	182	# The final ~ 1M codepoints scan can take few secs, up to 30+s on low end.
	183	# so use a simple optimization: divide the unicode range to segments which
	184	# do not cross plane boundary (some 2^X, tuned for speed), mark segments
	185	# where an input range can change the width at the final scan (its egdes),
	186	# then skip unmarked segments at the final output scan, as these segs are
	187	# guaranteed to not have any width change. this is effective (x5 speedup)
	188	# because most segments are empty or dense. If the input was spread evenly
	189	# then it would be more effective to store, sort and iterate exactly these
	190	# edges. Currently though, both are same speed, and mark/skip is less code.
	191
	192	BEGIN { SEG = 128 } # 128 or 256 seem fastest
	193	function mark(i) { segs[int(i/SEG)] = 1 }
	194	function skip(i) { return !(int(i/SEG) in segs) }
	195
	196	# input range $1..$2: width may differ at $1 vs $1-1, and at $2+1 vs $2
	197	{ mark($1); mark($2+1) }
	198
	199	# store/overwrite width-1 (1 -> 0) for every codepoint in this range
	200	{ v=$3-1; for (i=$1; i<=$2; w[i++]=v); }
	201
	202	function min(a,b) { return a<b ? a : b }
	203	END {
	204	# scan the full unicode range, print continuous ranges of width 0/2
	205	# the SEG skip is SEG-aligned. to disable: use unconditional i++
	206	for (i = 0; i <= 1114112; skip(i) ? i+=SEG : i++) { # 0x110000
	207	if (w[i] == wprev && i%65536) # no change of width or plane
	208	continue
	209	if (wprev) # range i0..i-1 is width 0 or 2 - print
	210	for (; i0 < i; i0 += 32768) # split (once) if too big
	211	printf "%06x %06x %d\n", i0, min(i0+32767, i-1), 1+wprev
	212	i0 = i
	213	wprev = w[i]
	214	}
	215	}
	216	')
	217
	218	ranges() { printf %s\\n "$ranges"; }
	219
	220	verify_ranges() {(
	221	errv() { err "verify: bad range: $* -- (0x$ha, 0x$hb, $w)"; }
	222	ishex() { case $1 in '' \| [!0-9A-Fa-f] ) false; esac; } # LC_ALL=C
	223
	224	pa=-2 pb=-2 pw=x # previous range values
	225	while read ha hb w; do
	226	[ "$w" = 0 ] \|\| [ "$w" = 2 ] \|\| errv WIDTH is not 0 or 2
	227	ishex "$ha" && ishex "$hb" \|\| errv not hex values
	228
	229	a=$((0x$ha)) b=$((0x$hb)) # decimal
	230	[ $a -le $b ] \|\| errv "FIRST > LAST"
	231	[ $b -le 1114111 ] \|\| errv out of bounds
	232	[ $((a>>16)) = $((b>>16)) ] \|\| errv not same plane
	233	[ $((b-a)) -lt 32768 ] \|\| errv more than 0x8000 values
	234	[ $a -gt $pb ] \|\| errv overlap or not sorted
	235
	236	# detect adjacent ranges which combined are <= 0x8000 values.
	237	# not detecting 3 adjacent 0x5000 ranges, but shouldn't happen
	238	[ $w != $pw ] \|\| [ $a != $((pb+1)) ] \|\| [ $((b-pa)) -ge 32768 ] \|\|
	239	errv small same-width adjacent ranges not combined
	240
	241	pa=$a pb=$b pw=$w
	242	done
	243	)}
	244
	245
	246	as_c_ranges() {
	247	while read a b w; do
	248	echo "R(0x$a, 0x$b, $w),"
	249	done
	250	}
	251
	252	# ranges -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
	253	# (inclusive, exclussive. arr of 18 items - last plane (16) is a[16]..a[17])
	254	extract_plane_indices() {
	255	# for each plane (0..16) print the 1st index at ranges where it appears
	256	# or, if missing, index of the next found plane (ranges are same-plane)
	257	i=0 pdone=-1 # index at ranges, plane whose index was last printed
	258	while read a dummy; do # a: ranges[i].first (hex, plane(a) == a>>16)
	259	while [ $pdone -lt $((0x$a >> 16)) ]; do
	260	printf "$i, "
	261	pdone=$((pdone+1))
	262	done # pdone == plane(a)
	263	i=$((i+1))
	264	done
	265	printf $i # one past last range. the rest are implicit-init as 0
	266	}
	267
	268	indent() { $sed -e '2,$s/^/\t\t/' -e 's/\s*$//'; } # also trim trailing spaces
	269
	270	# only EastAsianWidth.txt has version/date
	271	ver=$(head -n 1 < "$eaw_file" \| $sed -e 's/.*-//' -e 's/.txt//')
	272	udate=$(head -n 2 < "$eaw_file" \| tail -n 1 \| $sed -e 's/.*Date: //')
	273
	274	ranges \| verify_ranges \|\| exit
	275
	276	# at the C code, p/bot/top/mid can be $idx_t, but faster as 32 bit types
	277	[ "$(ranges \| wc -l)" -lt 65536 ] && idx_t=$u16 \|\| idx_t=$u32
	278
	279	cat << CFUNCTION
	280	/* wcwidth - Unicode $ver
	281	* Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com>
	282	* License: MIT
	283	*
	284	* Generated by $self on $(date -u -I) using the Unicode files
	285	* $ud_file and $eaw_file ($udate)
	286	*/
	287	int ${FUNC_ATTR}${FN:-wcwidth}($u32 ucs)
	288	{
	289	/* sorted ranges, "first" is clipped to 16 bit, and its high bits
	290	* (plane) are deduced from the "planes" array below.
	291	*/
	292	static const struct range { /* bitfield order empirically fast */
	293	$u16 first: 16;
	294	$u16 iswide: 1;
	295	$u16 delta: 15;
	296	} ranges[] = {
	297	#define R(first, last, width) {first & 0xffff, width/2, last-first}
	298	$(ranges \| as_c_ranges \| indent)
	299	#undef R
	300	};
	301
	302	/* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
	303	static const $idx_t planes[18] = {
	304	$(ranges \| extract_plane_indices \| fold -s -w 60 \| indent)
	305	};
	306
	307	/***** END OF STATIC DATA *****/
	308
	309	$u32 p, bot, top;
	310
	311	/* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */
	312	if (ucs < 160)
	313	return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0;
	314
	315	/* out of range for "planes" (and non-unicode), non-characters. */
	316	/* (some also test surrogate halves, but not required by POSIX) */
	317	if (ucs > 0x10ffff \|\| (ucs & 0xfffe) == 0xfffe)
	318	return -1;
	319
	320	p = ucs >> 16;
	321	ucs &= 0xffff;
	322
	323	for (bot = planes[p], top = planes[p+1]; bot < top; ) {
	324	$u32 mid = (bot + top) / 2;
	325	if (ucs < ranges[mid].first)
	326	top = mid;
	327	else if (ucs > ranges[mid].first + ranges[mid].delta)
	328	bot = mid + 1;
	329	else
	330	return 2 * ranges[mid].iswide;
	331	}
	332
	333	return 1;
	334	} /* wcwidth - Unicode $ver */
	335	CFUNCTION
	336
	337
	338	# C drop-in replacement to the awk script which outputs the final ranges list
	339	: << \MERGE_RANGES_C
	340	#include <stdio.h>
	341	int main(void)
	342	{
	343	static char w[0x110000 + 1]; /* = {0} which is width==1 */
	344	unsigned a, b, c, i, first; /* assume 32bit (need 21) */
	345
	346	while (3 == scanf("%u%u%u", &a, &b, &c)) /* FIRST LAST WIDTH (W:0/1/2) */
	347	if (b < 0x110000 && c < 3) while (a <= b) w[a++] = (char)c + 1;
	348
	349	/* HEXFIRST HEXLAST WIDTH (W: 0/2, not cross-plane, LAST-FIRST<0x8000) */
	350	for (first = 0, i = 1; i <= 0x110000; ++i) {
	351	if (w[i] == w[i-1] && i % 0x10000 && i - first < 0x8000)
	352	continue;
	353	if (w[i-1] & 1) /* 1 or 3 -> width is 0 or 2 for first..i-1 */
	354	printf("%06x %06x %d\n", first, i-1, w[i-1]-1);
	355	first = i;
	356	}
	357
	358	return 0;
	359	}
	360	MERGE_RANGES_C