diff options
Diffstat (limited to 'scripts/mkwcwidth')
| -rwxr-xr-x | scripts/mkwcwidth | 169 |
1 files changed, 169 insertions, 0 deletions
diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth new file mode 100755 index 000000000..792045a29 --- /dev/null +++ b/scripts/mkwcwidth | |||
| @@ -0,0 +1,169 @@ | |||
| 1 | #!/bin/sh | ||
| 2 | # | ||
| 3 | # Generate a C implementation of wcwidth, with latest unicode data | ||
| 4 | # from a local clone of https://github.com/jquast/wcwidth | ||
| 5 | # | ||
| 6 | # The MIT License (MIT) | ||
| 7 | # | ||
| 8 | # Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com> | ||
| 9 | # | ||
| 10 | # Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| 11 | # of this software and associated documentation files (the "Software"), to deal | ||
| 12 | # in the Software without restriction, including without limitation the rights | ||
| 13 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 14 | # copies of the Software, and to permit persons to whom the Software is | ||
| 15 | # furnished to do so, subject to the following conditions: | ||
| 16 | # | ||
| 17 | # The above copyright notice and this permission notice shall be included in all | ||
| 18 | # copies or substantial portions of the Software. | ||
| 19 | # | ||
| 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 21 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 22 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| 23 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| 24 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| 25 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 26 | # SOFTWARE. | ||
| 27 | |||
# Force the C locale so sort(1) and the text tools behave byte-wise.
LC_ALL=C
export LC_ALL

# Script name without its directory part, used as prefix in messages.
self=${0##*/}

# C types used by the generated code.  Wider types also work but waste
# memory; the uintN_t names require <stdint.h> in the consuming C file.
u32=uint32_t  # "unsigned" is also typically 32 bit
u16=uint16_t  # "unsigned short" is also typically 16 bits

# Function attribute placed between the return type and the function name.
# Delete this line if not generating a busybox function.
FUNC_ATTR=FAST_FUNC
| 35 | |||
| 36 | |||
# err MESSAGE...
# Write "<script name>: MESSAGE" to stderr and terminate with status 1.
err() {
	printf '%s\n' "$self: $*" >&2
	exit 1
}
| 38 | |||
# -h / --help: print the usage text and stop successfully.
# Without the explicit exit the script would fall through and treat the
# option itself as the repository path, ending in a "missing ..." error.
case ${1-} in -h | --help)
	echo "Usage: $self [path/to/python-wcwidth] (default path is '.')"
	echo "Prints a wcwidth C implementation, with latest Unicode data"
	echo "imported from a local https://github.com/jquast/wcwidth repo."
	echo "Assumptions about table_zero.py and table_wide.py at the repo:"
	echo "- Each range is in one Unicode plane (a>>16 == b>>16) (enforced)."
	echo "- Commit 04d6d90c (2023-10-30) or later, where table_zero.py"
	echo " includes zero-width Cf chars (else need to add manual tests)."
	exit 0
esac
| 48 | |||
# Accept an optional "--" end-of-options marker before the path.
if [ "${1-}" = -- ]; then
	shift
fi

# Root of the local python-wcwidth clone (defaults to the current dir).
pwc_root=${1:-.}

# Run git inside the python-wcwidth clone.
pwc_git() {
	git -C "$pwc_root" "$@"
}

# The two python tables the data is imported from.
zerowidth_py=$pwc_root/wcwidth/table_zero.py
widewidth_py=$pwc_root/wcwidth/table_wide.py

# Both tables must be readable, otherwise there is nothing to generate.
if ! [ -r "$zerowidth_py" ] || ! [ -r "$widewidth_py" ]; then
	err "missing $zerowidth_py or $widewidth_py. abort."
fi
| 59 | |||
# Latest unicode version from table_wide.py: the last line which starts
# with optional blanks and a quoted number (e.g. from "    '10.0.0': (").
# [[:space:]] is used instead of "\s", which is not a POSIX regex escape.
ver=$(grep "^[[:space:]]*'[0-9]" < "$widewidth_py" | tail -n1 | sed "s/.*'\(.*\)'.*/\1/")
| 62 | |||
# stdin -> stdout: extract the data of the last table (latest spec) from
# wcwidth/table_{wide,zero}.py (from https://github.com/jquast/wcwidth)
#
# A line starting with a quoted version number opens a new table, so the
# collected ranges are dropped; at EOF only the last table's ranges remain
# and are printed unmodified, in input order.
# The leading blanks are matched with [ \t]* rather than "\s", which is a
# GNU extension that not every awk implementation understands.
last_table() {
	awk "/^[ \t]*'[0-9]/ { i = 0 }           # new table -> reset
	     /^[ \t]*\(0x/   { arr[++i] = \$0 }  # range (first, last)
	     END { for (j = 1; j <= i; ++j) print arr[j] }"
}
| 70 | |||
# stdin -> stdout, $1 is the range's (wc)width (0 or 2), e.g.
# from:      (0x0123a, 0x0123c,),  # comment
# to  : R(0x00123a, 0x00123c, 2), /* comment */
# ranges bigger than half-plane (32769+ codepoints) are split to two.
#
# The sed stage blanks out parentheses and commas and turns a trailing
# python comment into a C comment; the loop then reads "first last comment".
# A range which crosses a plane boundary aborts via err (exit status 1 of
# the loop's subshell, which is also the status of this function).
py_data_to_c() {
	# read -r: keep backslashes in the comment text verbatim.
	sed -e 's/[(),]/ /g' -e 's|#\(.*\)|/*\1 */|' | while read -r a b c; do
		# to support cross-plane ranges, we'd need to split them here,
		# but unlikely required, as all planes end in non-characters.
		[ "$(($a>>16))" = "$(($b>>16))" ] || err "not same plane -- $a $b"

		a=$(($a)) b=$(($b))  # some shells want decimal vars in $(())
		if [ "$((b-a))" -ge 32768 ]; then  # split to 15 bit ranges
			printf 'R(0x%06x, 0x%06x, %s), %s\n' "$a" "$((a+32767))" "$1" "$c"
			a=$((a+32768)) c="/* (continued...) */"
		fi
		printf 'R(0x%06x, 0x%06x, %s), %s\n' "$a" "$b" "$1" "$c"
	done
}
| 89 | |||
# Collect the zero-width ranges (width 0) followed by the wide ranges
# (width 2) as R(...) lines.  If either conversion fails, give up.
data=$(
	last_table < "$zerowidth_py" | py_data_to_c 0 &&
	last_table < "$widewidth_py" | py_data_to_c 2
) || err abort

# Merge both sets into one ordered list.  The hex values are fixed-width,
# so the lexicographic order here is also the numeric order.
data=$(printf '%s\n' "$data" | sort)

# sorted hex ranges and their (wc)width: R(first, last, {0|2}),[ /* ... */]
data() {
	printf '%s\n' "$data"
}
| 96 | |||
# repeat STRING COUNT
# Print STRING COUNT times back to back, without any separator or newline.
# Nothing is printed when COUNT is zero or negative.
repeat() {
	remaining=$2
	while [ "$remaining" -gt 0 ]; do
		printf %s "$1"
		remaining=$((remaining - 1))
	done
}
| 98 | |||
# data -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
#
# Reads the sorted R(first, last, width), lines from stdin and prints 18
# comma-separated indices into that list.  Whenever a range belongs to a
# higher plane than the previous one, its index is emitted once for every
# plane which starts at it; the total count closes the remaining planes.
mkplanes() {
	idx=0
	prev_plane=-1
	while read first last rest; do
		# "last" still carries its trailing comma; strip it, then
		# the plane is (last >> 16)
		plane=$((${last%?} >> 16))
		repeat "$idx, " $((plane - prev_plane))
		idx=$((idx + 1))
		prev_plane=$plane
	done
	repeat "$idx, " $((17 - prev_plane))
}
| 109 | |||
# stdin -> stdout: prefix every line with two tabs, then trim trailing
# whitespace (so an empty input line stays empty).
# The tabs are produced by printf and the trailing blanks are matched with
# [[:space:]], because "\t" in a sed replacement and "\s" in a sed regex
# are not POSIX and are not understood by every sed.
indent() {
	sed -e "s/^/$(printf '\t\t')/" -e 's/[[:space:]]*$//'
}
| 111 | |||
# Emit the generated C function on stdout.
# The heredoc is unquoted on purpose: $ver, $u32, $u16, ${FUNC_ATTR-} and
# the $(...) substitutions (date, git info, range data, planes array) are
# expanded by the shell, everything else is copied verbatim.
# - The struct members use $u16 like the planes array does, so changing the
#   u16 setting really affects all the 16 bit storage of the output.
# - The R() macro parenthesizes its arguments.
# - The date uses an explicit +format (same YYYY-MM-DD output as -I, but
#   specified by POSIX).
cat << CFUNCTION
/* wcwidth - Unicode $ver, generated by $0.
 * Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
 * License: MIT
 *
 * Data imported on $(date -u +%Y-%m-%d) from https://github.com/jquast/wcwidth
 * commit $(pwc_git describe --tags) ($(pwc_git show --no-patch --format=%ci))
 */
int ${FUNC_ATTR-} wcwidth($u32 ucs)
{
	/* sorted ranges, "first" is clipped to 16 bit, and its high bits
	 * (plane) are deduced from the "planes" array below.
	 * (imported from ${zerowidth_py##*/} and ${widewidth_py##*/})
	 */
	static const struct range {
		$u16 first;
		$u16 iswide: 1;  /* bitfield order empirically faster */
		$u16 difflast: 15;
	} ranges[] = {
#define R(first, last, width) {(first) & 0xffff, (width) / 2, (last) - (first)}
$(data | indent)
#undef R
	};

	/* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
	static const $u16 planes[/* 18 */] = {
$(data | mkplanes | fold -s -w 60 | indent)
	};

	/******* END OF STATIC DATA *******/

	$u32 p, bot, top;

	/* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */
	if (ucs < 160)
		return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0;

	/* out of range for "planes" (and non-unicode), non-characters. */
	/* (some also test surrogate halves, but not required by POSIX) */
	if (ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
		return -1;

	p = ucs >> 16;
	ucs &= 0xffff;

	for (bot = planes[p], top = planes[p+1]; bot < top; ) {
		$u32 mid = (bot + top) / 2;
		if (ucs < ranges[mid].first)
			top = mid;
		else if (ucs > ranges[mid].first + ranges[mid].difflast)
			bot = mid + 1;
		else
			return 2 * ranges[mid].iswide;
	}

	return 1;
} /* wcwidth - Unicode $ver */
CFUNCTION
