diff options
Diffstat (limited to 'scripts/mkwcwidth')
-rwxr-xr-x | scripts/mkwcwidth | 169 |
1 files changed, 169 insertions, 0 deletions
diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth new file mode 100755 index 000000000..792045a29 --- /dev/null +++ b/scripts/mkwcwidth | |||
@@ -0,0 +1,169 @@ | |||
1 | #!/bin/sh | ||
2 | # | ||
3 | # Generate a C implementation of wcwidth, with latest unicode data | ||
4 | # from a local clone of https://github.com/jquast/wcwidth | ||
5 | # | ||
6 | # The MIT License (MIT) | ||
7 | # | ||
8 | # Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com> | ||
9 | # | ||
10 | # Permission is hereby granted, free of charge, to any person obtaining a copy | ||
11 | # of this software and associated documentation files (the "Software"), to deal | ||
12 | # in the Software without restriction, including without limitation the rights | ||
13 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
14 | # copies of the Software, and to permit persons to whom the Software is | ||
15 | # furnished to do so, subject to the following conditions: | ||
16 | # | ||
17 | # The above copyright notice and this permission notice shall be included in all | ||
18 | # copies or substantial portions of the Software. | ||
19 | # | ||
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
21 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
22 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
23 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
24 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
25 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
26 | # SOFTWARE. | ||
27 | |||
# Run every tool invoked below in the C locale.
LC_ALL=C
export LC_ALL

# Basename of this script; used as the prefix of diagnostics and in the usage.
self=${0##*/}

# C types used in the generated code (bigger types work but waste memory;
# the uintN_t names need <stdint.h> wherever the output is compiled).
u32=uint32_t         # "unsigned" is also typically 32 bit
u16=uint16_t         # "unsigned short" is also typically 16 bits
FUNC_ATTR=FAST_FUNC  # delete this line if not generating a busybox function
35 | |||
36 | |||
# err MSG...: print "<script name>: MSG..." on stderr and exit with status 1.
err() {
	printf '%s\n' "$self: $*" >&2
	exit 1
}
38 | |||
# -h / --help: print the usage and the assumptions made about the imported
# tables, then exit successfully. Without the explicit exit the script would
# fall through, take the option as the repository path and end with a
# "missing ..." error and status 1.
case ${1-} in -h | --help)
	echo "Usage: $self [path/to/python-wcwidth] (default path is '.')"
	echo "Prints a wcwidth C implementation, with latest Unicode data"
	echo "imported from a local https://github.com/jquast/wcwidth repo."
	echo "Assumptions about table_zero.py and table_wide.py at the repo:"
	echo "- Each range is in one Unicode plane (a>>16 == b>>16) (enforced)."
	echo "- Commit 04d6d90c (2023-10-30) or later, where table_zero.py"
	echo " includes zero-width Cf chars (else need to add manual tests)."
	exit 0
	;;
esac
48 | |||
# Accept (and drop) an explicit end-of-options marker before the path.
if [ "${1-}" = -- ]; then
	shift
fi

# Root of the local python-wcwidth clone; the current directory by default.
pwc_root=${1:-.}

# pwc_git ARGS...: run git with ARGS against the clone at $pwc_root.
pwc_git() {
	git -C "$pwc_root" "$@"
}

# The two data files imported from the clone.
zerowidth_py=$pwc_root/wcwidth/table_zero.py
widewidth_py=$pwc_root/wcwidth/table_wide.py

# Both tables must be readable, else there is nothing to generate from.
if ! [ -r "$zerowidth_py" ] || ! [ -r "$widewidth_py" ]; then
	err "missing $zerowidth_py or $widewidth_py. abort."
fi
59 | |||
# Latest Unicode version in table_wide.py: taken from the last line which
# opens a table, e.g. from "    '10.0.0': (" we get 10.0.0 .
# [[:space:]] is used rather than \s, which is a GNU extension and is not
# part of POSIX basic regular expressions.
ver=$(grep "^[[:space:]]*'[0-9]" < "$widewidth_py" | tail -n1 | sed "s/.*'\(.*\)'.*/\1/")
62 | |||
# stdin -> stdout: extract the data of the last table (latest spec) from
# wcwidth/table_{wide,zero}.py (from https://github.com/jquast/wcwidth).
# A line which begins with a quoted version ('<digit>...) opens a new table
# and discards the ranges collected so far; a line which begins with "(0x"
# is a range and is kept verbatim. Leading blanks are matched with [ \t]
# rather than \s, because \s is a gawk extension: an awk which does not
# treat it as whitespace never resets on an indented table header, and the
# ranges of every Unicode version would be printed.
last_table() {
	awk "
		/^[ \t]*'[0-9]/ { n = 0 }             # new table -> reset
		/^[ \t]*\(0x/   { range[++n] = \$0 }  # range (first, last)
		END { for (k = 1; k <= n; ++k) print range[k] }
	"
}
70 | |||
# stdin -> stdout, $1 is the range's (wc)width (0 or 2), e.g.
#   from:   (0x0123a, 0x0123c,),  # comment
#   to  : R(0x00123a, 0x00123c, 2), /* comment */
# Ranges bigger than half-plane (32769+ codepoints) are split to two, so
# that (last - first) of every emitted range fits in 15 bits.
# A range which crosses a plane boundary is rejected via err; as the loop
# is the last part of a pipeline, that may only terminate a subshell, so
# callers should check the exit status of this function.
# "read -r" keeps backslashes of the imported comments intact, and the width
# is passed as a printf argument rather than spliced into the format.
py_data_to_c() {
	sed -e 's/[(),]/ /g' -e 's|#\(.*\)|/*\1 */|' |
	while read -r first last note; do
		# to support cross-plane ranges, we'd need to split them here,
		# but unlikely required, as all planes end in non-characters.
		[ "$(($first >> 16))" = "$(($last >> 16))" ] ||
			err "not same plane -- $first $last"

		# some shells want decimal vars in $(())
		first=$(($first)) last=$(($last))
		if [ "$((last - first))" -ge 32768 ]; then  # split to 15 bit ranges
			printf 'R(0x%06x, 0x%06x, %s), %s\n' \
				"$first" "$((first + 32767))" "$1" "$note"
			first=$((first + 32768)) note="/* (continued...) */"
		fi
		printf 'R(0x%06x, 0x%06x, %s), %s\n' "$first" "$last" "$1" "$note"
	done
}
89 | |||
# Convert the zero-width table (width 0) and then the wide table (width 2)
# to R(...) lines; if either conversion fails, abort the script.
data=$(
	last_table < "$zerowidth_py" | py_data_to_c 0 &&
	last_table < "$widewidth_py" | py_data_to_c 2
) || err abort

# Merge both tables into one ordered list (lexicographic here is also numeric).
data=$(printf '%s\n' "$data" | sort)

# stdout: the sorted hex ranges and their (wc)width, one range per line:
#   R(first, last, {0|2}),[ /* ... */]
data() {
	printf '%s\n' "$data"
}
96 | |||
# repeat STR N: print STR N times, with no separator and no trailing newline
# (prints nothing when N is 0 or negative). Uses the global R as its counter.
repeat() {
	R=$2
	until [ "$R" -le 0 ]; do
		printf %s "$1"
		R=$((R-1))
	done
}
98 | |||
# data -> stdout: the initializer of an array such that a[p], a[p+1] are the
# [from, to) indices of the ranges which belong to plane p.
# stdin is the sorted "R(first, last, width), ..." lines. For each line, the
# line's index is printed once per plane which starts at that line (including
# planes with no ranges before it); finally the total is printed for every
# remaining plane and for the end marker, i.e. 18 values overall.
mkplanes() {
	i=0
	lastp=-1
	while read a b c; do
		last=${b%?}             # "0x......," without its trailing comma
		p=$(($last >> 16))      # plane (last >> 16)
		repeat "$i, " "$((p - lastp))"
		lastp=$p
		i=$((i + 1))
	done
	repeat "$i, " "$((17 - lastp))"
}
109 | |||
# stdin -> stdout: prefix every line with two tabs, then trim trailing
# whitespace (so empty lines stay empty).
# A literal tab and [[:space:]] are used because "\t" in a sed replacement
# and "\s" in a sed pattern are GNU extensions; other sed implementations
# may insert a literal "t" or fail to trim.
indent() {
	tab=$(printf '\t')
	sed -e "s/^/$tab$tab/" -e 's/[[:space:]]*$//'
}
111 | |||
# Print the generated C function on stdout. The here-document delimiter is
# unquoted, so $ver, $0, $u32, $u16, ${FUNC_ATTR-}, ${..._py##*/} and the
# $(...) command substitutions are expanded by the shell, and the rest is
# copied as C source.
# The struct members use $u16 like the "planes" array does, so that changing
# the type at the top of this script affects all the generated data, and the
# date uses an explicit format because "date -I" is not specified by POSIX.
cat << CFUNCTION
/* wcwidth - Unicode $ver, generated by $0.
 * Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
 * License: MIT
 *
 * Data imported on $(date -u +%Y-%m-%d) from https://github.com/jquast/wcwidth
 * commit $(pwc_git describe --tags) ($(pwc_git show --no-patch --format=%ci))
 */
int ${FUNC_ATTR-} wcwidth($u32 ucs)
{
	/* sorted ranges, "first" is clipped to 16 bit, and its high bits
	 * (plane) are deduced from the "planes" array below.
	 * (imported from ${zerowidth_py##*/} and ${widewidth_py##*/})
	 */
	static const struct range {
		$u16 first;
		$u16 iswide: 1; /* bitfield order empirically faster */
		$u16 difflast: 15;
	} ranges[] = {
		#define R(first, last, width) {first & 0xffff, width/2, last-first}
$(data | indent)
		#undef R
	};

	/* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
	static const $u16 planes[/* 18 */] = {
$(data | mkplanes | fold -s -w 60 | indent)
	};

	/******* END OF STATIC DATA *******/

	$u32 p, bot, top;

	/* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */
	if (ucs < 160)
		return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0;

	/* out of range for "planes" (and non-unicode), non-characters. */
	/* (some also test surrogate halves, but not required by POSIX) */
	if (ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
		return -1;

	p = ucs >> 16;
	ucs &= 0xffff;

	for (bot = planes[p], top = planes[p+1]; bot < top; ) {
		$u32 mid = (bot + top) / 2;
		if (ucs < ranges[mid].first)
			top = mid;
		else if (ucs > ranges[mid].first + ranges[mid].difflast)
			bot = mid + 1;
		else
			return 2 * ranges[mid].iswide;
	}

	return 1;
} /* wcwidth - Unicode $ver */
CFUNCTION