#!/bin/sh
#
# Generate a wcwidth C implementation from Unicode data (tested v7 - v17)
#
# The MIT License (MIT)
#
# Copyright (C) 2025 Avi Halachmi
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Latest Unicode data source files:
# https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
# https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
#
# License and term of use in either URL:
# http://www.unicode.org/copyright.html
# http://www.unicode.org/terms_of_use.html

export LC_ALL=C  # stable, locale-independent text processing

self=${0##*/}
awk=${AWK:-awk}
sed=${SED:-sed}

URL_base=https://www.unicode.org/Public/UCD/latest/ucd
URL_verbase=https://www.unicode.org/Public/%s/ucd
ud_file=UnicodeData.txt
eaw_file=EastAsianWidth.txt

# int by default (int is 32 bit and short is 16 on unix/linux/osx/windows, and
# doesn't require stdint.h - https://en.cppreference.com/w/cpp/language/types)
u32=${U32:-unsigned}        # or uint32_t
u16=${U16:-unsigned short}  # or uint16_t

# print an error message to stderr and abort
err() { >&2 printf %s\\n "$self: $*"; exit 1; }

case ${1-} in -h | --help)
	echo "Usage: $self [DL=VERSION] [FATTR]"
	echo "Print a wcwidth C implementation to stdout."
	echo
	echo "Uses the files ./$ud_file and ./$eaw_file ."
	echo "If the files are missing, the latest will be downloaded."
	echo
	echo "If DL=XXX is provided, force-download + overwrite, where XXX is:"
	echo "- 'latest' -> $URL_base/..."
	echo "- 'draft', '16.0.0' etc -> $(printf "$URL_verbase" XXX)/..."
	echo
	echo "If given, FATTR will be inserted as 'int FATTR wcwidth(...) {...}'."
	echo "Optional env vars:"
	echo ' $FN Function name to generate. Default: wcwidth.'
	echo ' $U32, $U16 unsigned codepoint, u16 C-types. Default: int/short.'
	echo ' $AWK, $SED used programs (word-split, e.g. AWK="busybox awk").'
	echo
	echo 'Requires: sh, curl/wget, sed, awk, and few more POSIX utilities.'
	exit
esac

[ "${1-}" = -- ] && shift

# optional DL=VERSION first argument selects the download source
DL=
case ${1-} in DL=*) DL=${1#DL=}; shift; esac
[ "${DL-}" ] && [ "$DL" != latest ] && URL_base=$(printf "$URL_verbase" "$DL")

# remaining argument, if any, is inserted verbatim as a function attribute
FUNC_ATTR=${1:+$1 }

url2file() {  # wget errors on 404-not-found etc, curl requires -f for that
	>&2 echo "[$1 -> $2]"
	rm -f -- "$2.tmp"
	# use "command -v" (POSIX) rather than "which" to detect the tool
	{ { [ "$(command -v wget)" ] && wget -O "$2.tmp" "$1"; } ||
	  { [ "$(command -v curl)" ] && curl -f -o "$2.tmp" "$1"; }
	} && mv -- "$2.tmp" "$2"
}

[ -r "$ud_file" ] && [ -r "$eaw_file" ] && [ -z "$DL" ] || {
	url2file "$URL_base/$ud_file" "$ud_file" &&
	url2file "$URL_base/$eaw_file" "$eaw_file" ||
		err "can't download $ud_file and/or $eaw_file. abort."
}

# extractors of zero-width and wide-width ranges from the Unicode data files.
#
# At https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - the mother of most current
# wcwidth implementations, by Markus Kuhn (Unicode 5.0) - the width is:
# - 0 for codepoint 0.
# - -1 for control ranges C0/C1, and DEL (1-31, 128-159, 127)
# - 0 if U+200B, cat is Me|Mn|Cf but not U+00AD, Hangul Jamo U+1160..U+11FF.
# - else 2 if East_Asian_Width property is F|W (fullwidth, wide).
# - else 1.
#
# We add as 0-width:
# - Category Mc (combining, diacritics[-like])
# - Hangul Jamo Extended-B U+D7B0..U+D7FF (combining, wasn't in Unicode 5.0)
# - Emoji modifiers U+1F3FB..U+1F3FF (wasn't in Unicode 5.0)
#
# Hangul Jamo, Emoji mods are identified at the data automatically by name.
# U+200B is category Cf since at least 5.0.0 - no need for manual override.
#
# Python wcwidth https://github.com/jquast/wcwidth is like us, plus 0-width:
# - Cat Zl,Zp (2 codepoints: line/para. sep.), but terminals disagree on them.
# - Few unassigned codepoints at Hangul Jamo Extended-B - it adds the range
#   manually (includes unassigned), we get (assigned) codepoints at the data.
# - (U+00AD was 0, fixed to 1 after https://github.com/jquast/wcwidth/issues/8)
#
#
# UnicodeData.txt is lines of:
#   CODEPOINT;NAME;CATEGORY;(...more)  where CODEPOINT is 4-6 hex digits
# or pairs of:
#   CODEPOINT;<XNAME, First>;CATEGORY;(...more)  where XNAME is group-ish name
#   CODEPOINT;<XNAME, Last>;CATEGORY;(...more)
#
# EastAsianWidth.txt is lines of:
#   #... (comment)
# or
#   CODEPOINT[..LAST];EAW  (4-6 hex digits, maybe spaces around ";")
# where EAW is East_Asian_Width property ("F", "W", "A", "N", etc)

# replace First/Last pairs in UnicodeData with one line of FIRST..LAST range.
# assume that First/Last are at consecutive lines, and XNAME/CAT don't change.
# (no-op for us at Unicode v16 - doesn't add new wide/zero width codepoints)
ud_file_ranges() {
	$awk -F\; '$2 ~ /, First>$/ { printf $1 ".."; next }; 1' < "$ud_file"
}

# output unsorted/overlapping lines of hex "CODEPOINT[..LAST] WIDTH" (0/1/2)
raw_ranges() {
	# wide ranges according to the EAW property
	$sed -e '/^#/d' -e 's/;/ /' < "$eaw_file" |  # comment lines, ';'
		$awk '$2 ~ /^[FW]$/ { print $1, 2 }'

	# wide by name (older Unicode had EAW==N for some FULLWIDTH names)
	ud_file_ranges | $awk -F\; '$2 ~ /FULLWIDTH/ { print $1, 2 }'

	# zero-width ranges (override wide-width)
	ud_file_ranges | $awk -F\; '
		$3 ~ /^(Me|Mn|Mc|Cf)$/ ||
		$2 ~ /EMOJI MODIFIER|HANGUL J[OU]NGSEONG/ { print $1, 0 }'

	# override soft-hyphen as width 1
	echo 00AD 1
}

# lines of hex "CODEPOINT[..LAST] WIDTH" -> decimal "FIRST LAST WIDTH"
as_decimal_ranges() {
	# -r: no backslash mangling (input is hex digits, dots and a width digit)
	while read -r range width; do
		echo $((0x${range%%.*})) $((0x${range##*.})) $width
	done
}

# to generate the final ranges list, we need to sort the raw ranges list,
# ensure zero-width overrides wide-width, join adjacent/overlap ranges,
# and split big ranges - our C structure uses 15 bits to store LAST-FIRST of
# each range, and ranges are not allowed to cross plane boundary.
# # we use simple (and slow) approach: codepoint-bucket-store all the individual # widths at the input ranges as they come, then scan the whole Unicode range # (~ 1M buckets) and output continuous ranges of width 0/2. Therefore, later # inputs of codepoint X override earlier, so width 2 should arrive before W 0. # # there's ~3K input ranges of ~200K codepoints, and the output is 450+ ranges. # depending on sys/awk performance, it's 200ms - 2s, up to ~7s on low-end ARM. # MERGE_RANGES_C (bottom of this file) is drop-in replacement, runs in few ms. # sorted lines of "FIRST LAST WIDTH" (FIRST/LAST: hex digits, WIDTH: 0 or 2) ranges=$(raw_ranges | as_decimal_ranges | $awk ' # The final ~ 1M codepoints scan can take few secs, up to 30+s on low end. # so use a simple optimization: divide the unicode range to segments which # do not cross plane boundary (some 2^X, tuned for speed), mark segments # where an input range can change the width at the final scan (its egdes), # then skip unmarked segments at the final output scan, as these segs are # guaranteed to not have any width change. this is effective (x5 speedup) # because most segments are empty or dense. If the input was spread evenly # then it would be more effective to store, sort and iterate exactly these # edges. Currently though, both are same speed, and mark/skip is less code. BEGIN { SEG = 128 } # 128 or 256 seem fastest function mark(i) { segs[int(i/SEG)] = 1 } function skip(i) { return !(int(i/SEG) in segs) } # input range $1..$2: width may differ at $1 vs $1-1, and at $2+1 vs $2 { mark($1); mark($2+1) } # store/overwrite width-1 (1 -> 0) for every codepoint in this range { v=$3-1; for (i=$1; i<=$2; w[i++]=v); } function min(a,b) { return a LAST" [ $b -le 1114111 ] || errv out of bounds [ $((a>>16)) = $((b>>16)) ] || errv not same plane [ $((b-a)) -lt 32768 ] || errv more than 0x8000 values [ $a -gt $pb ] || errv overlap or not sorted # detect adjacent ranges which combined are <= 0x8000 values. 
# not detecting 3 adjacent 0x5000 ranges, but shouldn't happen [ $w != $pw ] || [ $a != $((pb+1)) ] || [ $((b-pa)) -ge 32768 ] || errv small same-width adjacent ranges not combined pa=$a pb=$b pw=$w done )} as_c_ranges() { while read a b w; do echo "R(0x$a, 0x$b, $w)," done } # ranges -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data # (inclusive, exclussive. arr of 18 items - last plane (16) is a[16]..a[17]) extract_plane_indices() { # for each plane (0..16) print the 1st index at ranges where it appears # or, if missing, index of the next found plane (ranges are same-plane) i=0 pdone=-1 # index at ranges, plane whose index was last printed while read a dummy; do # a: ranges[i].first (hex, plane(a) == a>>16) while [ $pdone -lt $((0x$a >> 16)) ]; do printf "$i, " pdone=$((pdone+1)) done # pdone == plane(a) i=$((i+1)) done printf $i # one past last range. the rest are implicit-init as 0 } indent() { $sed -e '2,$s/^/\t\t/' -e 's/\s*$//'; } # also trim trailing spaces # only EastAsianWidth.txt has version/date ver=$(head -n 1 < "$eaw_file" | $sed -e 's/.*-//' -e 's/.txt//') udate=$(head -n 2 < "$eaw_file" | tail -n 1 | $sed -e 's/.*Date: //') ranges | verify_ranges || exit # at the C code, p/bot/top/mid can be $idx_t, but faster as 32 bit types [ "$(ranges | wc -l)" -lt 65536 ] && idx_t=$u16 || idx_t=$u32 cat << CFUNCTION /* wcwidth - Unicode $ver * Copyright (C) 2025 Avi Halachmi * License: MIT * * Generated by $self on $(date -u -I) using the Unicode files * $ud_file and $eaw_file ($udate) */ int ${FUNC_ATTR}${FN:-wcwidth}($u32 ucs) { /* sorted ranges, "first" is clipped to 16 bit, and its high bits * (plane) are deduced from the "planes" array below. 
*/ static const struct range { /* bitfield order empirically fast */ $u16 first: 16; $u16 iswide: 1; $u16 delta: 15; } ranges[] = { #define R(first, last, width) {first & 0xffff, width/2, last-first} $(ranges | as_c_ranges | indent) #undef R }; /* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */ static const $idx_t planes[18] = { $(ranges | extract_plane_indices | fold -s -w 60 | indent) }; /******* END OF STATIC DATA *******/ $u32 p, bot, top; /* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */ if (ucs < 160) return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0; /* out of range for "planes" (and non-unicode), non-characters. */ /* (some also test surrogate halves, but not required by POSIX) */ if (ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe) return -1; p = ucs >> 16; ucs &= 0xffff; for (bot = planes[p], top = planes[p+1]; bot < top; ) { $u32 mid = (bot + top) / 2; if (ucs < ranges[mid].first) top = mid; else if (ucs > ranges[mid].first + ranges[mid].delta) bot = mid + 1; else return 2 * ranges[mid].iswide; } return 1; } /* wcwidth - Unicode $ver */ CFUNCTION # C drop-in replacement to the awk script which outputs the final ranges list : << \MERGE_RANGES_C #include int main(void) { static char w[0x110000 + 1]; /* = {0} which is width==1 */ unsigned a, b, c, i, first; /* assume 32bit (need 21) */ while (3 == scanf("%u%u%u", &a, &b, &c)) /* FIRST LAST WIDTH (W:0/1/2) */ if (b < 0x110000 && c < 3) while (a <= b) w[a++] = (char)c + 1; /* HEXFIRST HEXLAST WIDTH (W: 0/2, not cross-plane, LAST-FIRST<0x8000) */ for (first = 0, i = 1; i <= 0x110000; ++i) { if (w[i] == w[i-1] && i % 0x10000 && i - first < 0x8000) continue; if (w[i-1] & 1) /* 1 or 3 -> width is 0 or 2 for first..i-1 */ printf("%06x %06x %d\n", first, i-1, w[i-1]-1); first = i; } return 0; } MERGE_RANGES_C