#!/bin/sh
#
# Generate a C implementation of wcwidth, with latest unicode data
# from a local clone of https://github.com/jquast/wcwidth
#
# Usage: see the -h | --help text below. The generated C code is written
# to stdout, diagnostics go to stderr, and any failure exits with status 1.
#
# The MIT License (MIT)
#
# Copyright (C) 2024 Avi Halachmi
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# byte-wise locale: the "sort" of the hex ranges below relies on it
export LC_ALL=C

self=${0##*/}

# c-types (bigger types work but waste memory. uintN_t need <stdint.h>)
u32=uint32_t  # "unsigned" is also typically 32 bit
u16=uint16_t  # "unsigned short" is also typically 16 bits
FUNC_ATTR=FAST_FUNC  # delete this line if not generating a busybox function

# err MSG...: print "<script name>: MSG" to stderr and exit with status 1.
# Inside a pipeline stage this only exits that stage's subshell, so callers
# of such pipelines must check the pipeline status themselves.
err() { >&2 printf %s\\n "$self: $*"; exit 1; }

case ${1-} in
-h | --help)
	echo "Usage: $self [path/to/python-wcwidth] (default path is '.')"
	echo "Prints a wcwidth C implementation, with latest Unicode data"
	echo "imported from a local https://github.com/jquast/wcwidth repo."
	echo "Assumptions about table_zero.py and table_wide.py at the repo:"
	echo "- Each range is in one Unicode plane (a>>16 == b>>16) (enforced)."
	echo "- Commit 04d6d90c (2023-10-30) or later, where table_zero.py"
	echo " includes zero-width Cf chars (else need to add manual tests)."
	# stop here: otherwise "-h" would be taken as the repo path below and
	# the help text would be followed by a "missing ..." error.
	exit 0
	;;
esac

# "--" allows a repo path which would otherwise look like an option
[ "${1-}" != -- ] || shift
pwc_root=${1:-.}

# run git with the given arguments at the python-wcwidth repo
pwc_git() { git -C "$pwc_root" "$@"; }

zerowidth_py=$pwc_root/wcwidth/table_zero.py
widewidth_py=$pwc_root/wcwidth/table_wide.py

[ -r "$zerowidth_py" ] && [ -r "$widewidth_py" ] \
	|| err "missing $zerowidth_py or $widewidth_py. abort."

# latest unicode version from table_wide.py (e.g. from " '10.0.0': (")
# i.e. the text between the quotes at the last line which begins a table.
ver=$(grep "^\s*'[0-9]" < "$widewidth_py" | tail -n1 |
	sed "s/.*'\(.*\)'.*/\1/")
[ -n "$ver" ] || err "cannot find a Unicode version at $widewidth_py. abort."

# stdin -> stdout: extract the data of the last table (latest spec) from
# wcwidth/table_{wide,zero}.py (from https://github.com/jquast/wcwidth)
# A line which starts with a quoted version begins a new table and discards
# the ranges collected so far, so only the ranges of the last table remain.
last_table() {
	awk "/^\s*'[0-9]/ { i=0 }             # new table -> reset
	     /^\s*\(0x/   { arr[++i] = \$0 }  # range (first, last)
	     END { for (j=1; j <= i; ++j) print arr[j] }"
}

# stdin -> stdout, $1 is the range's (wc)width (0 or 2), e.g.
# from: (0x0123a, 0x0123c,), # comment
# to  : R(0x00123a, 0x00123c, 2), /* comment */
# ranges bigger than half-plane (32769+ codepoints) are split to two.
# The loop is the last pipeline stage, so its "err" sets the exit status of
# this function without terminating the script - the caller must check it.
py_data_to_c() {
	sed -e 's/[(),]/ /g' -e 's|#\(.*\)|/*\1 */|' |
	while read -r a b c; do
		# to support cross-plane ranges, we'd need to split them here,
		# but unlikely required, as all planes end in non-characters.
		[ "$(($a>>16))" = "$(($b>>16))" ] || err "not same plane -- $a $b"

		a=$(($a)) b=$(($b))  # some shells want decimal vars in $(())
		if [ "$((b-a))" -ge 32768 ]; then  # split to 15 bit ranges
			printf "R(0x%06x, 0x%06x, $1), %s\n" "$a" "$((a+32767))" "$c"
			a=$((a+32768)) c="/* (continued...) */"
		fi
		printf "R(0x%06x, 0x%06x, $1), %s\n" "$a" "$b" "$c"
	done
}

data=$(last_table < "$zerowidth_py" | py_data_to_c 0 &&
       last_table < "$widewidth_py" | py_data_to_c 2) || err abort
# without ranges, mkplanes below would fail in the middle of the C output
[ -n "$data" ] || err "no ranges at $zerowidth_py and $widewidth_py. abort."
data=$(printf %s\\n "$data" | sort)  # lexicographic here is also numeric

# sorted hex ranges and their (wc)width: R(first, last, {0|2}),[ /* ... */]
data() { printf %s\\n "$data"; }

# repeat STR N: print STR N times (nothing if N <= 0), no trailing newline
repeat() { R=$2; while [ "$R" -gt 0 ]; do printf %s "$1"; R=$((R-1)); done; }

# data -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
# Prints 18 "N, " items: the index of the first range of each of the planes
# 0..16 (planes without ranges get the index of the next range), and then
# the total number of ranges.
mkplanes() {
	i=0 lastp=-1
	while read -r first last rest; do
		p=$((${last%?} >> 16))  # plane (last >> 16), sans trailing comma
		repeat "$i, " "$((p-lastp))"  # planes after lastp up to p begin at i
		i=$((i+1)) lastp=$p
	done
	repeat "$i, " "$((17-lastp))"  # remaining planes, and the end index
}

indent() { sed -e 's/^/\t\t/' -e 's/\s*$//'; }  # also trim trailing spaces

# The here-document delimiter is unquoted on purpose: the shell variables
# and the command substitutions in the C template below are expanded.
cat << CFUNCTION
/* wcwidth - Unicode $ver, generated by $0.
 * Copyright (C) 2024 Avi Halachmi
 * License: MIT
 *
 * Data imported on $(date -u +%Y-%m-%d) from https://github.com/jquast/wcwidth
 * commit $(pwc_git describe --tags) ($(pwc_git show --no-patch --format=%ci))
 */
int ${FUNC_ATTR-} wcwidth($u32 ucs)
{
	/* sorted ranges, "first" is clipped to 16 bit, and its high bits
	 * (plane) are deduced from the "planes" array below.
	 * (imported from ${zerowidth_py##*/} and ${widewidth_py##*/})
	 */
	static const struct range {
		$u16 first;
		$u16 iswide: 1;  /* bitfield order empirically faster */
		$u16 difflast: 15;
	} ranges[] = {
#define R(first, last, width) {first & 0xffff, width/2, last-first}
$(data | indent)
#undef R
	};
	/* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
	static const $u16 planes[/* 18 */] = {
$(data | mkplanes | fold -s -w 60 | indent)
	};

	/******* END OF STATIC DATA *******/

	$u32 p, bot, top;

	/* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */
	if (ucs < 160)
		return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0;

	/* out of range for "planes" (and non-unicode), non-characters. */
	/* (some also test surrogate halves, but not required by POSIX) */
	if (ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
		return -1;

	p = ucs >> 16;
	ucs &= 0xffff;
	for (bot = planes[p], top = planes[p+1]; bot < top; ) {
		$u32 mid = (bot + top) / 2;
		if (ucs < ranges[mid].first)
			top = mid;
		else if (ucs > ranges[mid].first + ranges[mid].difflast)
			bot = mid + 1;
		else
			return 2 * ranges[mid].iswide;
	}

	return 1;
} /* wcwidth - Unicode $ver */
CFUNCTION