diff options
| author | Avi Halachmi (:avih) <avihpit@yahoo.com> | 2025-12-26 00:02:45 +0200 |
|---|---|---|
| committer | Ron Yorston <rmy@pobox.com> | 2025-12-29 13:57:05 +0000 |
| commit | 6f847c296bf94401c0c147bf3c5aa1e8531b9dbf (patch) | |
| tree | 0348f7468392ce5f413515796566451577d2ce7c /scripts/mkwcwidth | |
| parent | aa552f0883ab1320c2a65679ee8cd683629af7ec (diff) | |
| download | busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.tar.gz busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.tar.bz2 busybox-w32-6f847c296bf94401c0c147bf3c5aa1e8531b9dbf.zip | |
win32: wcwidth_alt.c: new generator script, re-generate code
Previously, scripts/mkwcwidth generated a wcwidth implementation based
on input from the python wcwidth: https://github.com/jquast/wcwidth .
However, that project is not updated frequently, and the less
dependencies - the better.
This new script instead prints an implementation based directly on the
original Unicode files - which it can also download.
It generates the same code as before (same data structures, etc).
It was then used to re-generate libbb/wcwidth_alt.c, as follows:
./scripts/mkwcwidth DL=15.1.0 FAST_FUNC > libbb/wcwidth_alt.c
This is the same Unicode version as the previous file, just to make
the codepoint differences visible easily.
These differences are very small, and the script itself explains where
and why it differs from https://github.com/jquast/wcwidth .
Next commits will update the tables to the latest unicode version.
Diffstat (limited to 'scripts/mkwcwidth')
| -rwxr-xr-x | scripts/mkwcwidth | 339 |
1 files changed, 265 insertions, 74 deletions
diff --git a/scripts/mkwcwidth b/scripts/mkwcwidth index 792045a29..515128abb 100755 --- a/scripts/mkwcwidth +++ b/scripts/mkwcwidth | |||
| @@ -1,11 +1,10 @@ | |||
| 1 | #!/bin/sh | 1 | #!/bin/sh |
| 2 | # | 2 | # |
| 3 | # Generate a C implementation of wcwidth, with latest unicode data | 3 | # Generate a wcwidth C implementation from Unicode data (tested v7 - v17) |
| 4 | # from a local clone of https://github.com/jquast/wcwidth | ||
| 5 | # | 4 | # |
| 6 | # The MIT License (MIT) | 5 | # The MIT License (MIT) |
| 7 | # | 6 | # |
| 8 | # Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com> | 7 | # Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com> |
| 9 | # | 8 | # |
| 10 | # Permission is hereby granted, free of charge, to any person obtaining a copy | 9 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
| 11 | # of this software and associated documentation files (the "Software"), to deal | 10 | # of this software and associated documentation files (the "Software"), to deal |
| @@ -25,117 +24,284 @@ | |||
| 25 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | 24 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 26 | # SOFTWARE. | 25 | # SOFTWARE. |
| 27 | 26 | ||
| 27 | |||
| 28 | # Latest Unicode data source files: | ||
| 29 | # https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | ||
| 30 | # https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt | ||
| 31 | # | ||
| 32 | # License and term of use in either URL: | ||
| 33 | # http://www.unicode.org/copyright.html | ||
| 34 | # http://www.unicode.org/terms_of_use.html | ||
| 35 | |||
| 36 | |||
| 28 | export LC_ALL=C | 37 | export LC_ALL=C |
| 29 | self=${0##*/} | 38 | self=${0##*/} |
| 39 | awk=${AWK:-awk} sed=${SED:-sed} | ||
| 30 | 40 | ||
| 31 | # c-types (bigger types work but waste memory. uintN_t need <stdint.h>) | 41 | URL_base=https://www.unicode.org/Public/UCD/latest/ucd |
| 32 | u32=uint32_t # "unsigned" is also typically 32 bit | 42 | URL_verbase=https://www.unicode.org/Public/%s/ucd |
| 33 | u16=uint16_t # "unsigned short" is also typically 16 bits | 43 | ud_file=UnicodeData.txt |
| 34 | FUNC_ATTR=FAST_FUNC # delete this line if not generating a busybox function | 44 | eaw_file=EastAsianWidth.txt |
| 35 | 45 | ||
| 46 | # int by default (int is 32 bit and short is 16 on unix/linux/osx/windows, and | ||
| 47 | # doesn't require stdint.h - https://en.cppreference.com/w/cpp/language/types) | ||
| 48 | u32=${U32:-unsigned} # or uint32_t | ||
| 49 | u16=${U16:-unsigned short} # or uint16_t | ||
| 36 | 50 | ||
| 37 | err() { >&2 printf %s\\n "$self: $*"; exit 1; } | 51 | err() { >&2 printf %s\\n "$self: $*"; exit 1; } |
| 38 | 52 | ||
| 39 | case ${1-} in -h | --help) | 53 | case ${1-} in -h | --help) |
| 40 | echo "Usage: $self [path/to/python-wcwidth] (default path is '.')" | 54 | echo "Usage: $self [DL=VERSION] [FATTR]" |
| 41 | echo "Prints a wcwidth C implementation, with latest Unicode data" | 55 | echo "Print a wcwidth C implementation to stdout." |
| 42 | echo "imported from a local https://github.com/jquast/wcwidth repo." | 56 | echo |
| 43 | echo "Assumptions about table_zero.py and table_wide.py at the repo:" | 57 | echo "Uses the files ./$ud_file and ./$eaw_file ." |
| 44 | echo "- Each range is in one Unicode plane (a>>16 == b>>16) (enforced)." | 58 | echo "If the files are missing, the latest will be downloaded." |
| 45 | echo "- Commit 04d6d90c (2023-10-30) or later, where table_zero.py" | 59 | echo |
| 46 | echo " includes zero-width Cf chars (else need to add manual tests)." | 60 | echo "If DL=XXX is provided, force-download + overwrite, where XXX is:" |
| 61 | echo "- 'latest' -> $URL_base/..." | ||
| 62 | echo "- 'draft', '16.0.0' etc -> $(printf "$URL_verbase" XXX)/..." | ||
| 63 | echo | ||
| 64 | echo "If given, FATTR will be inserted as 'int FATTR wcwidth(...) {...}'." | ||
| 65 | echo "Optional env vars:" | ||
| 66 | echo ' $FN Function name to generate. Default: wcwidth.' | ||
| 67 | echo ' $U32, $U16 unsigned codepoint, u16 C-types. Default: int/short.' | ||
| 68 | echo ' $AWK, $SED used programs (word-split, e.g. AWK="busybox awk").' | ||
| 69 | echo | ||
| 70 | echo 'Requires: sh, curl/wget, sed, awk, and few more POSIX utilities.' | ||
| 71 | exit | ||
| 47 | esac | 72 | esac |
| 73 | [ "${1-}" = -- ] && shift | ||
| 48 | 74 | ||
| 49 | [ "${1-}" != -- ] || shift | 75 | DL= |
| 76 | case ${1-} in DL=*) DL=${1#DL=}; shift; esac | ||
| 77 | [ "${DL-}" ] && [ "$DL" != latest ] && URL_base=$(printf "$URL_verbase" "$DL") | ||
| 50 | 78 | ||
| 51 | pwc_root=${1:-.} | 79 | FUNC_ATTR=${1:+$1 } |
| 52 | pwc_git() { git -C "$pwc_root" "$@"; } | ||
| 53 | 80 | ||
| 54 | zerowidth_py=$pwc_root/wcwidth/table_zero.py | 81 | url2file() { # wget errors on 404-not-found etc, curl requires -f for that |
| 55 | widewidth_py=$pwc_root/wcwidth/table_wide.py | 82 | >&2 echo "[$1 -> $2]" |
| 83 | rm -f -- "$2.tmp" | ||
| 84 | { { [ "$(which wget)" ] && wget -O "$2.tmp" "$1"; } || | ||
| 85 | { [ "$(which curl)" ] && curl -f -o "$2.tmp" "$1"; } | ||
| 86 | } && mv -- "$2.tmp" "$2" | ||
| 87 | } | ||
| 56 | 88 | ||
| 57 | [ -r "$zerowidth_py" ] && [ -r "$widewidth_py" ] \ | 89 | [ -r "$ud_file" ] && [ -r "$eaw_file" ] && [ -z "$DL" ] || { |
| 58 | || err "missing $zerowidth_py or $widewidth_py. abort." | 90 | url2file "$URL_base/$ud_file" "$ud_file" && |
| 91 | url2file "$URL_base/$eaw_file" "$eaw_file" || | ||
| 92 | err "can't download $ud_file and/or $eaw_file. abort." | ||
| 93 | } | ||
| 59 | 94 | ||
| 60 | # latest unicode version from table_wide.py (e.g. from " '10.0.0': (") | ||
| 61 | ver=$(grep "^\s*'[0-9]" < "$widewidth_py" | tail -n1 | sed "s/.*'\(.*\)'.*/\1/") | ||
| 62 | 95 | ||
| 63 | # stdin -> stdout: extract the data of the last table (latest spec) from | 96 | # extractors of zero-width and wide-width ranges from the Unicode data files. |
| 64 | # wcwidth/table_{wide,zero}.py (from https://github.com/jquast/wcwidth) | 97 | # |
| 65 | last_table() { | 98 | # At https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - the mother of most current |
| 66 | awk "/^\s*'[0-9]/ { i=0 } # new table -> reset | 99 | # wcwidth implementations, by Markus Kuhn (Unicode 5.0) - the width is: |
| 67 | /^\s*\(0x/ { arr[++i] = \$0 } # range (first, last) | 100 | # - 0 for codepoint 0. |
| 68 | END { for (j=1; j <= i; ++j) print arr[j] }" | 101 | # - -1 for control ranges C0/C1, and DEL (1-31, 128-159, 127) |
| 102 | # - 0 if U+200B, cat is Me|Mn|Cf but not U+00AD, Hangul Jamo U+1160..U+11FF. | ||
| 103 | # - else 2 if East_Asian_Width property is F|W (fullwidth, wide). | ||
| 104 | # - else 1. | ||
| 105 | # | ||
| 106 | # We add as 0-width: | ||
| 107 | # - Category Mc (combining, diacritics[-like]) | ||
| 108 | # - Hangul Jamo Extended-B U+D7B0..U+D7FF (combining, wasn't in Unicode 5.0) | ||
| 109 | # - Emoji modifiers U+1F3FB..U+1F3FF (wasn't in Unicode 5.0) | ||
| 110 | # | ||
| 111 | # Hangul Jamo, Emoji mods are identified at the data automatically by name. | ||
| 112 | # U+200B is category Cf since at least 5.0.0 - no need for manual override. | ||
| 113 | # | ||
| 114 | # Python wcwidth https://github.com/jquast/wcwidth is like us, plus 0-width: | ||
| 115 | # - Cat Zl,Zp (2 codepoints: line/para. sep.), but terminals disagree on them. | ||
| 116 | # - Few unassigned codepoints at Hangul Jamo Extended-B - it adds the range | ||
| 117 | # manually (includes unassigned), we get (assigned) codepoints at the data. | ||
| 118 | # - (U+00AD was 0, fixed to 1 after https://github.com/jquast/wcwidth/issues/8) | ||
| 119 | # | ||
| 120 | # | ||
| 121 | # UnicodeData.txt is lines of: | ||
| 122 | # CODEPOINT;NAME;CATEGORY;(...more) where CODEPOINT is 4-6 hex digits | ||
| 123 | # or pairs of: | ||
| 124 | # CODEPOINT;<XNAME, First>;CATEGORY;(...more) where XNAME is group-ish name | ||
| 125 | # CODEPOINT;<XNAME, Last>;CATEGORY;(...more) | ||
| 126 | # | ||
| 127 | # EastAsianWidth.txt is lines of: | ||
| 128 | # #... (comment) | ||
| 129 | # or | ||
| 130 | # CODEPOINT[..LAST];EAW # <stuff> (4-6 hex digits, maybe spaces around ";") | ||
| 131 | # where EAW is East_Asian_Width property ("F", "W", "A", "N", etc) | ||
| 132 | |||
| 133 | # replace First/Last pairs in UnicodeData with one line of FIRST..LAST range. | ||
| 134 | # assume that First/Last are at consecutive lines, and XNAME/CAT don't change. | ||
| 135 | # (no-op for us at Unicode v16 - doesn't add new wide/zero width codepoints) | ||
| 136 | ud_file_ranges() { | ||
| 137 | $awk -F\; '$2 ~ /, First>$/ { printf $1 ".."; next }; 1' < "$ud_file" | ||
| 69 | } | 138 | } |
| 70 | 139 | ||
| 71 | # stdin -> stdout, $1 is the range's (wc)width (0 or 2), e.g. | 140 | # output unsorted/overlapping lines of hex "CODEPOINT[..LAST] WIDTH" (0/1/2) |
| 72 | # from: (0x0123a, 0x0123c,), # comment | 141 | raw_ranges() { |
| 73 | # to : R(0x00123a, 0x00123c, 2), /* comment */ | 142 | # wide ranges according to the EAW property |
| 74 | # ranges bigger than half-plane (32769+ codepoints) are split to two. | 143 | $sed -e '/^#/d' -e 's/;/ /' < "$eaw_file" | # comment lines, ';' |
| 75 | py_data_to_c() { | 144 | $awk '$2 ~ /^[FW]$/ { print $1, 2 }' |
| 76 | sed -e 's/[(),]/ /g' -e 's|#\(.*\)|/*\1 */|' | while read a b c; do | 145 | |
| 77 | # to support cross-plane ranges, we'd need to split them here, | 146 | # wide by name (older Unicode had EAW==N for some FULLWIDTH names) |
| 78 | # but unlikely required, as all planes end in non-characters. | 147 | ud_file_ranges | $awk -F\; '$2 ~ /FULLWIDTH/ { print $1, 2 }' |
| 79 | [ $(($a>>16)) = $(($b>>16)) ] || err "not same plane -- $a $b" | 148 | |
| 80 | 149 | # zero-width ranges (override wide-width) | |
| 81 | a=$(($a)) b=$(($b)) # some shells want decimal vars in $(()) | 150 | ud_file_ranges | $awk -F\; ' |
| 82 | if [ "$((b-a))" -ge 32768 ]; then # split to 15 bit ranges | 151 | $3 ~ /^(Me|Mn|Mc|Cf)$/ || |
| 83 | printf "R(0x%06x, 0x%06x, $1), %s\n" $a $((a+32767)) "$c" | 152 | $2 ~ /EMOJI MODIFIER|HANGUL J[OU]NGSEONG/ { print $1, 0 }' |
| 84 | a=$((a+32768)) c="/* (continued...) */" | 153 | |
| 85 | fi | 154 | # override soft-hyphen as width 1 |
| 86 | printf "R(0x%06x, 0x%06x, $1), %s\n" $a $b "$c" | 155 | echo 00AD 1 |
| 156 | } | ||
| 157 | |||
| 158 | # lines of hex "CODEPOINT[..LAST] WIDTH" -> decimal "FIRST LAST WIDTH" | ||
| 159 | as_decimal_ranges() { | ||
| 160 | while read range width; do | ||
| 161 | echo $((0x${range%%.*})) $((0x${range##*.})) $width | ||
| 87 | done | 162 | done |
| 88 | } | 163 | } |
| 89 | 164 | ||
| 90 | data=$(last_table < "$zerowidth_py" | py_data_to_c 0 && | 165 | # to generate the final ranges list, we need to sort the raw ranges list, |
| 91 | last_table < "$widewidth_py" | py_data_to_c 2) || err abort | 166 | # ensure zero-width overrides wide-width, join adjacent/overlap ranges, |
| 92 | data=$(printf %s\\n "$data" | sort) # lexicographic here is also numeric | 167 | # and split big ranges - our C structure uses 15 bits to store LAST-FIRST of |
| 168 | # each range, and ranges are not allowed to cross plane boundary. | ||
| 169 | # | ||
| 170 | # we use simple (and slow) approach: codepoint-bucket-store all the individual | ||
| 171 | # widths at the input ranges as they come, then scan the whole Unicode range | ||
| 172 | # (~ 1M buckets) and output continuous ranges of width 0/2. Therefore, later | ||
| 173 | # inputs of codepoint X override earlier, so width 2 should arrive before W 0. | ||
| 174 | # | ||
| 175 | # there's ~3K input ranges of ~200K codepoints, and the output is 450+ ranges. | ||
| 176 | # depending on sys/awk performance, it's 200ms - 2s, up to ~7s on low-end ARM. | ||
| 177 | # MERGE_RANGES_C (bottom of this file) is drop-in replacement, runs in few ms. | ||
| 178 | |||
| 179 | # sorted lines of "FIRST LAST WIDTH" (FIRST/LAST: hex digits, WIDTH: 0 or 2) | ||
| 180 | ranges=$(raw_ranges | as_decimal_ranges | $awk ' | ||
| 181 | |||
| 182 | # The final ~ 1M codepoints scan can take few secs, up to 30+s on low end. | ||
| 183 | # so use a simple optimization: divide the unicode range to segments which | ||
| 184 | # do not cross plane boundary (some 2^X, tuned for speed), mark segments | ||
| 185 | # where an input range can change the width at the final scan (its egdes), | ||
| 186 | # then skip unmarked segments at the final output scan, as these segs are | ||
| 187 | # guaranteed to not have any width change. this is effective (x5 speedup) | ||
| 188 | # because most segments are empty or dense. If the input was spread evenly | ||
| 189 | # then it would be more effective to store, sort and iterate exactly these | ||
| 190 | # edges. Currently though, both are same speed, and mark/skip is less code. | ||
| 93 | 191 | ||
| 94 | # sorted hex ranges and their (wc)width: R(first, last, {0|2}),[ /* ... */] | 192 | BEGIN { SEG = 128 } # 128 or 256 seem fastest |
| 95 | data() { printf %s\\n "$data"; } | 193 | function mark(i) { segs[int(i/SEG)] = 1 } |
| 194 | function skip(i) { return !(int(i/SEG) in segs) } | ||
| 96 | 195 | ||
| 97 | repeat() { R=$2; while [ "$R" -gt 0 ]; do printf %s "$1"; R=$((R-1)); done; } | 196 | # input range $1..$2: width may differ at $1 vs $1-1, and at $2+1 vs $2 |
| 197 | { mark($1); mark($2+1) } | ||
| 98 | 198 | ||
| 99 | # data -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data | 199 | # store/overwrite width-1 (1 -> 0) for every codepoint in this range |
| 100 | mkplanes() { | 200 | { v=$3-1; for (i=$1; i<=$2; w[i++]=v); } |
| 101 | i=0 lastp=-1 | 201 | |
| 102 | while read a b c; do | 202 | function min(a,b) { return a<b ? a : b } |
| 103 | p=$((${b%?} >> 16)) # plane (last >> 16) | 203 | END { |
| 104 | repeat "$i, " $((p-lastp)) | 204 | # scan the full unicode range, print continuous ranges of width 0/2 |
| 105 | i=$((i+1)) lastp=$p | 205 | # the SEG skip is SEG-aligned. to disable: use unconditional i++ |
| 206 | for (i = 0; i <= 1114112; skip(i) ? i+=SEG : i++) { # 0x110000 | ||
| 207 | if (w[i] == wprev && i%65536) # no change of width or plane | ||
| 208 | continue | ||
| 209 | if (wprev) # range i0..i-1 is width 0 or 2 - print | ||
| 210 | for (; i0 < i; i0 += 32768) # split (once) if too big | ||
| 211 | printf "%06x %06x %d\n", i0, min(i0+32767, i-1), 1+wprev | ||
| 212 | i0 = i | ||
| 213 | wprev = w[i] | ||
| 214 | } | ||
| 215 | } | ||
| 216 | ') | ||
| 217 | |||
| 218 | ranges() { printf %s\\n "$ranges"; } | ||
| 219 | |||
| 220 | verify_ranges() {( | ||
| 221 | errv() { err "verify: bad range: $* -- (0x$ha, 0x$hb, $w)"; } | ||
| 222 | ishex() { case $1 in '' | *[!0-9A-Fa-f]* ) false; esac; } # LC_ALL=C | ||
| 223 | |||
| 224 | pa=-2 pb=-2 pw=x # previous range values | ||
| 225 | while read ha hb w; do | ||
| 226 | [ "$w" = 0 ] || [ "$w" = 2 ] || errv WIDTH is not 0 or 2 | ||
| 227 | ishex "$ha" && ishex "$hb" || errv not hex values | ||
| 228 | |||
| 229 | a=$((0x$ha)) b=$((0x$hb)) # decimal | ||
| 230 | [ $a -le $b ] || errv "FIRST > LAST" | ||
| 231 | [ $b -le 1114111 ] || errv out of bounds | ||
| 232 | [ $((a>>16)) = $((b>>16)) ] || errv not same plane | ||
| 233 | [ $((b-a)) -lt 32768 ] || errv more than 0x8000 values | ||
| 234 | [ $a -gt $pb ] || errv overlap or not sorted | ||
| 235 | |||
| 236 | # detect adjacent ranges which combined are <= 0x8000 values. | ||
| 237 | # not detecting 3 adjacent 0x5000 ranges, but shouldn't happen | ||
| 238 | [ $w != $pw ] || [ $a != $((pb+1)) ] || [ $((b-pa)) -ge 32768 ] || | ||
| 239 | errv small same-width adjacent ranges not combined | ||
| 240 | |||
| 241 | pa=$a pb=$b pw=$w | ||
| 242 | done | ||
| 243 | )} | ||
| 244 | |||
| 245 | |||
| 246 | as_c_ranges() { | ||
| 247 | while read a b w; do | ||
| 248 | echo "R(0x$a, 0x$b, $w)," | ||
| 106 | done | 249 | done |
| 107 | repeat "$i, " $((17-lastp)) | ||
| 108 | } | 250 | } |
| 109 | 251 | ||
| 110 | indent() { sed -e 's/^/\t\t/' -e 's/\s*$//'; } # also trim trailing spaces | 252 | # ranges -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data |
| 253 | # (inclusive, exclussive. arr of 18 items - last plane (16) is a[16]..a[17]) | ||
| 254 | extract_plane_indices() { | ||
| 255 | # for each plane (0..16) print the 1st index at ranges where it appears | ||
| 256 | # or, if missing, index of the next found plane (ranges are same-plane) | ||
| 257 | i=0 pdone=-1 # index at ranges, plane whose index was last printed | ||
| 258 | while read a dummy; do # a: ranges[i].first (hex, plane(a) == a>>16) | ||
| 259 | while [ $pdone -lt $((0x$a >> 16)) ]; do | ||
| 260 | printf "$i, " | ||
| 261 | pdone=$((pdone+1)) | ||
| 262 | done # pdone == plane(a) | ||
| 263 | i=$((i+1)) | ||
| 264 | done | ||
| 265 | printf $i # one past last range. the rest are implicit-init as 0 | ||
| 266 | } | ||
| 267 | |||
| 268 | indent() { $sed -e '2,$s/^/\t\t/' -e 's/\s*$//'; } # also trim trailing spaces | ||
| 269 | |||
| 270 | # only EastAsianWidth.txt has version/date | ||
| 271 | ver=$(head -n 1 < "$eaw_file" | $sed -e 's/.*-//' -e 's/.txt//') | ||
| 272 | udate=$(head -n 2 < "$eaw_file" | tail -n 1 | $sed -e 's/.*Date: //') | ||
| 273 | |||
| 274 | ranges | verify_ranges || exit | ||
| 275 | |||
| 276 | # at the C code, p/bot/top/mid can be $idx_t, but faster as 32 bit types | ||
| 277 | [ "$(ranges | wc -l)" -lt 65536 ] && idx_t=$u16 || idx_t=$u32 | ||
| 111 | 278 | ||
| 112 | cat << CFUNCTION | 279 | cat << CFUNCTION |
| 113 | /* wcwidth - Unicode $ver, generated by $0. | 280 | /* wcwidth - Unicode $ver |
| 114 | * Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com> | 281 | * Copyright (C) 2025 Avi Halachmi <avihpit at yahoo.com> |
| 115 | * License: MIT | 282 | * License: MIT |
| 116 | * | 283 | * |
| 117 | * Data imported on $(date -u -I) from https://github.com/jquast/wcwidth | 284 | * Generated by $self on $(date -u -I) using the Unicode files |
| 118 | * commit $(pwc_git describe --tags) ($(pwc_git show --no-patch --format=%ci)) | 285 | * $ud_file and $eaw_file ($udate) |
| 119 | */ | 286 | */ |
| 120 | int ${FUNC_ATTR-} wcwidth($u32 ucs) | 287 | int ${FUNC_ATTR}${FN:-wcwidth}($u32 ucs) |
| 121 | { | 288 | { |
| 122 | /* sorted ranges, "first" is clipped to 16 bit, and its high bits | 289 | /* sorted ranges, "first" is clipped to 16 bit, and its high bits |
| 123 | * (plane) are deduced from the "planes" array below. | 290 | * (plane) are deduced from the "planes" array below. |
| 124 | * (imported from ${zerowidth_py##*/} and ${widewidth_py##*/}) | ||
| 125 | */ | 291 | */ |
| 126 | static const struct range { | 292 | static const struct range { /* bitfield order empirically fast */ |
| 127 | uint16_t first; | 293 | $u16 first: 16; |
| 128 | uint16_t iswide: 1; /* bitfield order empirically faster */ | 294 | $u16 iswide: 1; |
| 129 | uint16_t difflast: 15; | 295 | $u16 delta: 15; |
| 130 | } ranges[] = { | 296 | } ranges[] = { |
| 131 | #define R(first, last, width) {first & 0xffff, width/2, last-first} | 297 | #define R(first, last, width) {first & 0xffff, width/2, last-first} |
| 132 | $(data | indent) | 298 | $(ranges | as_c_ranges | indent) |
| 133 | #undef R | 299 | #undef R |
| 134 | }; | 300 | }; |
| 135 | 301 | ||
| 136 | /* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */ | 302 | /* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */ |
| 137 | static const $u16 planes[/* 18 */] = { | 303 | static const $idx_t planes[18] = { |
| 138 | $(data | mkplanes | fold -s -w 60 | indent) | 304 | $(ranges | extract_plane_indices | fold -s -w 60 | indent) |
| 139 | }; | 305 | }; |
| 140 | 306 | ||
| 141 | /******* END OF STATIC DATA *******/ | 307 | /******* END OF STATIC DATA *******/ |
| @@ -158,7 +324,7 @@ $(data | mkplanes | fold -s -w 60 | indent) | |||
| 158 | $u32 mid = (bot + top) / 2; | 324 | $u32 mid = (bot + top) / 2; |
| 159 | if (ucs < ranges[mid].first) | 325 | if (ucs < ranges[mid].first) |
| 160 | top = mid; | 326 | top = mid; |
| 161 | else if (ucs > ranges[mid].first + ranges[mid].difflast) | 327 | else if (ucs > ranges[mid].first + ranges[mid].delta) |
| 162 | bot = mid + 1; | 328 | bot = mid + 1; |
| 163 | else | 329 | else |
| 164 | return 2 * ranges[mid].iswide; | 330 | return 2 * ranges[mid].iswide; |
| @@ -167,3 +333,28 @@ $(data | mkplanes | fold -s -w 60 | indent) | |||
| 167 | return 1; | 333 | return 1; |
| 168 | } /* wcwidth - Unicode $ver */ | 334 | } /* wcwidth - Unicode $ver */ |
| 169 | CFUNCTION | 335 | CFUNCTION |
| 336 | |||
| 337 | |||
| 338 | # C drop-in replacement to the awk script which outputs the final ranges list | ||
| 339 | : << \MERGE_RANGES_C | ||
| 340 | #include <stdio.h> | ||
| 341 | int main(void) | ||
| 342 | { | ||
| 343 | static char w[0x110000 + 1]; /* = {0} which is width==1 */ | ||
| 344 | unsigned a, b, c, i, first; /* assume 32bit (need 21) */ | ||
| 345 | |||
| 346 | while (3 == scanf("%u%u%u", &a, &b, &c)) /* FIRST LAST WIDTH (W:0/1/2) */ | ||
| 347 | if (b < 0x110000 && c < 3) while (a <= b) w[a++] = (char)c + 1; | ||
| 348 | |||
| 349 | /* HEXFIRST HEXLAST WIDTH (W: 0/2, not cross-plane, LAST-FIRST<0x8000) */ | ||
| 350 | for (first = 0, i = 1; i <= 0x110000; ++i) { | ||
| 351 | if (w[i] == w[i-1] && i % 0x10000 && i - first < 0x8000) | ||
| 352 | continue; | ||
| 353 | if (w[i-1] & 1) /* 1 or 3 -> width is 0 or 2 for first..i-1 */ | ||
| 354 | printf("%06x %06x %d\n", first, i-1, w[i-1]-1); | ||
| 355 | first = i; | ||
| 356 | } | ||
| 357 | |||
| 358 | return 0; | ||
| 359 | } | ||
| 360 | MERGE_RANGES_C | ||
