aboutsummaryrefslogtreecommitdiff
path: root/scripts/mkwcwidth
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/mkwcwidth')
-rwxr-xr-xscripts/mkwcwidth169
1 files changed, 169 insertions, 0 deletions
diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth
new file mode 100755
index 000000000..792045a29
--- /dev/null
+++ b/scripts/mkwcwidth
@@ -0,0 +1,169 @@
1#!/bin/sh
2#
3# Generate a C implementation of wcwidth, with latest unicode data
4# from a local clone of https://github.com/jquast/wcwidth
5#
6# The MIT License (MIT)
7#
8# Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
9#
10# Permission is hereby granted, free of charge, to any person obtaining a copy
11# of this software and associated documentation files (the "Software"), to deal
12# in the Software without restriction, including without limitation the rights
13# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14# copies of the Software, and to permit persons to whom the Software is
15# furnished to do so, subject to the following conditions:
16#
17# The above copyright notice and this permission notice shall be included in all
18# copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26# SOFTWARE.
27
# Run this script and every tool it starts in the C locale.
LC_ALL=C
export LC_ALL

# Basename of this script; prefixes diagnostics and appears in the usage text.
self=${0##*/}

# C type names substituted into the generated code. Bigger types work too
# but waste memory; the uintN_t names need <stdint.h> where the output is
# compiled.
u32=uint32_t # "unsigned" is also typically 32 bit
u16=uint16_t # "unsigned short" is also typically 16 bits

# Placed between "int" and "wcwidth" in the generated function.
# Delete this assignment if not generating a busybox function.
FUNC_ATTR=FAST_FUNC
35
36
# err MESSAGE...
# Write "<script name>: MESSAGE" (arguments joined by spaces) to stderr,
# then terminate the current shell with status 1.
err() {
	printf '%s\n' "$self: $*" >&2
	exit 1
}
38
# -h / --help: print the usage text on stdout and stop successfully.
# Without the explicit exit the script would carry on, take "-h" as the
# repository path, and fail with a "missing ..." error.
case ${1-} in -h | --help)
	echo "Usage: $self [path/to/python-wcwidth] (default path is '.')"
	echo "Prints a wcwidth C implementation, with latest Unicode data"
	echo "imported from a local https://github.com/jquast/wcwidth repo."
	echo "Assumptions about table_zero.py and table_wide.py at the repo:"
	echo "- Each range is in one Unicode plane (a>>16 == b>>16) (enforced)."
	echo "- Commit 04d6d90c (2023-10-30) or later, where table_zero.py"
	echo " includes zero-width Cf chars (else need to add manual tests)."
	exit 0
esac
48
# A leading "--" argument is dropped.
if [ "${1-}" = -- ]; then
	shift
fi

# Root of the local python-wcwidth clone: first argument, "." by default.
pwc_root=${1:-.}

# pwc_git ARGS...: run git with ARGS inside that clone.
pwc_git() {
	git -C "$pwc_root" "$@"
}

# The two python tables the data is taken from.
zerowidth_py=$pwc_root/wcwidth/table_zero.py
widewidth_py=$pwc_root/wcwidth/table_wide.py

# Both tables must be readable before anything is generated.
if [ ! -r "$zerowidth_py" ] || [ ! -r "$widewidth_py" ]; then
	err "missing $zerowidth_py or $widewidth_py. abort."
fi
59
# Latest Unicode version in table_wide.py. Version keys look like
# " '10.0.0': (" and the last such line in the file is used.
# [[:space:]] replaces "\s", which is a GNU extension that POSIX grep
# does not define.
ver=$(grep "^[[:space:]]*'[0-9]" < "$widewidth_py" | tail -n 1 | sed "s/.*'\(.*\)'.*/\1/")
62
# stdin -> stdout: extract the data of the last table (latest spec) from
# wcwidth/table_{wide,zero}.py (from https://github.com/jquast/wcwidth).
# A line starting with a quoted digit opens a new table and discards what
# was collected so far; lines starting with "(0x" are range rows and are
# printed unchanged once the input ends.
# [[:space:]] is used instead of "\s" because the latter is not defined
# for POSIX awk regular expressions.
last_table() {
	awk "
		/^[[:space:]]*'[0-9]/ { n = 0 }           # new table -> reset
		/^[[:space:]]*\(0x/   { rows[++n] = \$0 } # range (first, last)
		END { for (k = 1; k <= n; ++k) print rows[k] }
	"
}
70
# stdin -> stdout, $1 is the range's (wc)width (0 or 2), e.g.
# from: (0x0123a, 0x0123c,), # comment
# to : R(0x00123a, 0x00123c, 2), /* comment */
# ranges bigger than half-plane (32769+ codepoints) are split to two,
# because the generated table stores "last - first" in 15 bits.
# Fails (non-zero status, message via err) if a range spans two planes.
py_data_to_c() {
	# sed turns "(", ")" and "," into blanks and the "# ..." tail into a
	# C comment, leaving "first last /* comment */" for read to split.
	# read -r keeps backslashes in the comment text intact.
	sed -e 's/[(),]/ /g' -e 's|#\(.*\)|/*\1 */|' | while read -r a b c; do
		# to support cross-plane ranges, we'd need to split them here,
		# but unlikely required, as all planes end in non-characters.
		[ "$(($a>>16))" = "$(($b>>16))" ] || err "not same plane -- $a $b"

		a=$(($a)) b=$(($b)) # some shells want decimal vars in $(())
		if [ "$((b-a))" -ge 32768 ]; then # split to 15 bit ranges
			printf 'R(0x%06x, 0x%06x, %s), %s\n' "$a" "$((a+32767))" "$1" "$c"
			a=$((a+32768)) c="/* (continued...) */"
		fi
		# the width is passed as data, not spliced into the format
		printf 'R(0x%06x, 0x%06x, %s), %s\n' "$a" "$b" "$1" "$c"
	done
}
89
# Convert the latest zero-width table (width 0) and the latest wide table
# (width 2) to R(...) lines; give up if either conversion fails.
data=$(
	last_table < "$zerowidth_py" | py_data_to_c 0 &&
	last_table < "$widewidth_py" | py_data_to_c 2
) || err abort

# Merge both sets into one ordered list (lexicographic here is also numeric).
data=$(printf '%s\n' "$data" | sort)

# sorted hex ranges and their (wc)width: R(first, last, {0|2}),[ /* ... */]
data() {
	printf '%s\n' "$data"
}
96
# repeat STRING COUNT
# Print STRING COUNT times back to back, with no separator and no trailing
# newline. Prints nothing when COUNT is zero or negative.
repeat() {
	R=$2
	while [ "$R" -gt 0 ]; do
		printf '%s' "$1"
		R=$((R-1))
	done
}
98
# data -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
# Input is the sorted "R(first, last, width), ..." lines; output is 18
# comma-separated indices on one line (planes 0..16 plus the end marker).
mkplanes() {
	i=0 lastp=-1 # i: ranges consumed so far, lastp: plane of previous range
	# read -r: a backslash in a trailing comment must never join two input
	# lines, since every line has to count as exactly one range.
	while read -r a b c; do
		p=$((${b%?} >> 16)) # plane (last >> 16), after dropping the ","
		# first range of a new plane: emit its index, once for this plane
		# and once for every empty plane that was skipped before it.
		repeat "$i, " $((p-lastp))
		i=$((i+1)) lastp=$p
	done
	# close the last plane and fill the remaining entries up to index 17
	repeat "$i, " $((17-lastp))
}
109
# stdin -> stdout: prefix every line with two tabs and trim trailing
# whitespace (so empty lines stay empty).
# The tabs come from printf and the trim uses [[:space:]], because "\t" in
# a sed replacement and "\s" in a sed pattern are GNU extensions.
indent() {
	sed -e "s/^/$(printf '\t\t')/" -e 's/[[:space:]]*$//'
}
111
# Print the generated C source on stdout. The here-document is unquoted,
# so $ver, $u32, $u16, ${FUNC_ATTR-} and the $(...) substitutions (date,
# git description of the clone, the range table and the planes table) are
# expanded by the shell while the text is written.
# The struct fields use $u16 like the planes array does, so overriding u16
# (e.g. to avoid <stdint.h>) affects the whole generated function. The R
# macro parenthesizes its arguments.
cat << CFUNCTION
/* wcwidth - Unicode $ver, generated by $0.
 * Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
 * License: MIT
 *
 * Data imported on $(date -u -I) from https://github.com/jquast/wcwidth
 * commit $(pwc_git describe --tags) ($(pwc_git show --no-patch --format=%ci))
 */
int ${FUNC_ATTR-} wcwidth($u32 ucs)
{
	/* sorted ranges, "first" is clipped to 16 bit, and its high bits
	 * (plane) are deduced from the "planes" array below.
	 * (imported from ${zerowidth_py##*/} and ${widewidth_py##*/})
	 */
	static const struct range {
		$u16 first;
		$u16 iswide: 1; /* bitfield order empirically faster */
		$u16 difflast: 15;
	} ranges[] = {
		#define R(first, last, width) {(first) & 0xffff, (width) / 2, (last) - (first)}
$(data | indent)
		#undef R
	};

	/* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
	static const $u16 planes[/* 18 */] = {
$(data | mkplanes | fold -s -w 60 | indent)
	};

	/******* END OF STATIC DATA *******/

	$u32 p, bot, top;

	/* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */
	if (ucs < 160)
		return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0;

	/* out of range for "planes" (and non-unicode), non-characters. */
	/* (some also test surrogate halves, but not required by POSIX) */
	if (ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
		return -1;

	p = ucs >> 16;
	ucs &= 0xffff;

	for (bot = planes[p], top = planes[p+1]; bot < top; ) {
		$u32 mid = (bot + top) / 2;
		if (ucs < ranges[mid].first)
			top = mid;
		else if (ucs > ranges[mid].first + ranges[mid].difflast)
			bot = mid + 1;
		else
			return 2 * ranges[mid].iswide;
	}

	return 1;
} /* wcwidth - Unicode $ver */
CFUNCTION