aboutsummaryrefslogtreecommitdiff
path: root/src/wcwidth_update.lua
diff options
context:
space:
mode:
Diffstat (limited to 'src/wcwidth_update.lua')
-rwxr-xr-xsrc/wcwidth_update.lua404
1 files changed, 404 insertions, 0 deletions
diff --git a/src/wcwidth_update.lua b/src/wcwidth_update.lua
new file mode 100755
index 0000000..37f18c3
--- /dev/null
+++ b/src/wcwidth_update.lua
@@ -0,0 +1,404 @@
1#!/usr/bin/env lua
2
3-- This file downloads and parses unicode standard files and updates the wcwidth code
4-- based on that data.
5
6local VERSION="17.0.0" -- the unicode standard version to download
7
8
9
-- Verify the prerequisites before doing any work: curl must be on the PATH,
-- Penlight must be installed, and the script must be run from within `./src/`
-- (detected by being able to read `./wcwidth.c`).
do
  -- Runs a shell command and normalizes the result of os.execute.
  -- Lua 5.1/LuaJIT return only the numeric status (0 on success), while
  -- Lua 5.2+ return `true|nil, "exit"|"signal", code`.
  -- @tparam string cmd the command line to run
  -- @treturn boolean true when the command succeeded
  -- @treturn number the status/exit code reported by os.execute
  local function run(cmd)
    local ok, _, code = os.execute(cmd)
    if type(ok) == "number" then -- Lua 5.1 / LuaJIT style result
      return ok == 0, ok
    end
    return ok == true, code
  end

  local ok, code = run("curl --version > /dev/null 2>&1")
  if not ok then
    error("curl is not available in the path; exitcode " .. tostring(code))
  end

  local loaded, utils = pcall(require, "pl.utils")
  if not loaded then
    error("Penlight is not available, please install via `luarocks install penlight`")
  end

  -- test the value returned by readfile (nil on failure), not the pcall status
  local contents = utils.readfile("./wcwidth.c")
  if not contents then
    error("failed to read './wcwidth.c', run this script from within the `./src/` directory")
  end
end
27
-- Indexes identifying each of the data files fetched from the unicode site
local FN_DERIVED_GENERAL_CATEGORY = 1
local FN_EAST_ASIAN_WIDTH = 2
local FN_DERIVED_CORE_PROPERTIES = 3
local FN_EMOJI_DATA = 4

-- Paths of the data files, relative to the versioned `ucd/` directory on the
-- unicode site; the same relative paths are used below `target_path` locally.
local download_file_list = {}
download_file_list[FN_DERIVED_GENERAL_CATEGORY] = "extracted/DerivedGeneralCategory.txt"
download_file_list[FN_EAST_ASIAN_WIDTH] = "EastAsianWidth.txt"
download_file_list[FN_DERIVED_CORE_PROPERTIES] = "DerivedCoreProperties.txt"
download_file_list[FN_EMOJI_DATA] = "emoji/emoji-data.txt"

-- local directory the downloaded files are stored in
local target_path = "./unicode_data/"
41
42
43
-- Download fresh copies of all required unicode data files into `target_path`.
do
  local base_url = "https://www.unicode.org/Public/" .. VERSION .. "/ucd/" -- must include trailing slash


  -- Runs a shell command and normalizes the result of os.execute.
  -- Lua 5.1/LuaJIT return only the numeric status (0 on success), while
  -- Lua 5.2+ return `true|nil, "exit"|"signal", code`.
  -- @tparam string cmd the command line to run
  -- @treturn boolean true when the command succeeded
  -- @treturn number the status/exit code reported by os.execute
  local function run(cmd)
    local ok, _, code = os.execute(cmd)
    if type(ok) == "number" then -- Lua 5.1 / LuaJIT style result
      return ok == 0, ok
    end
    return ok == true, code
  end


  -- Runs a shell command and raises an error when it fails.
  -- @tparam string cmd the command line to run
  local function must_run(cmd)
    local ok, code = run(cmd)
    if not ok then
      error("Failed to execute: " .. cmd .. "; exitcode " .. tostring(code))
    end
  end


  -- removes a file, and then downloads a new copy from the unicode site
  -- @tparam string filename path of the file relative to `base_url`
  -- @tparam string target_filename local path to store the download at
  local function download_file(filename, target_filename)
    print("Downloading " .. filename .. " to " .. target_filename)
    os.remove(target_filename) -- best effort; the file may not exist yet
    must_run("curl --fail -s -o " .. target_filename .. " " .. base_url .. filename)
  end


  -- Downloads all unicode files we need
  local function download_files()
    -- fail early if the target directories cannot be created, instead of
    -- letting every subsequent curl call fail on a missing directory
    must_run("mkdir -p " .. target_path .. "extracted")
    must_run("mkdir -p " .. target_path .. "emoji")
    for _, filename in ipairs(download_file_list) do
      download_file(filename, target_path .. filename)
    end
  end


  download_files()
end
72
73
74
-- The three collections of character-ranges gathered from the data files;
-- any code point that ends up in none of them is treated as single-width.
local zero_width, double_width, ambiguous_width = {}, {}, {}
79
80
81
local readlines do
  local utils = require("pl.utils")

  --- Reads a data file and returns only its data lines.
  -- Comment lines (first non-whitespace character is "#") and lines that are
  -- empty or whitespace-only are dropped.
  -- Raises an error (via assert) when the file cannot be read.
  -- @tparam string filename path of the file to read
  -- @treturn table array of the remaining lines, in file order
  function readlines(filename)
    print("Parsing " .. filename)
    local lines = assert(utils.readlines(filename))

    -- Compact the array in place in a single pass; removing entries one by one
    -- with table.remove shifts the tail every time, which is quadratic.
    local total = #lines
    local kept = 0
    for i = 1, total do
      local line = lines[i]
      if not (line:match("^%s*#") or line:match("^%s*$")) then
        kept = kept + 1
        lines[kept] = line
      end
    end

    -- clear the left-over tail, from the end so the table stays a sequence
    for i = total, kept + 1, -1 do
      lines[i] = nil
    end

    return lines
  end
end
99
100
101
102
103-- parse DerivedGeneralCategory.txt
104-- Purpose: zero-width combining marks
105-- Extract:
106-- Mn — Nonspacing Mark → width = 0
107-- Me — Enclosing Mark → width = 0
108-- Why:
109-- These characters overlay the previous glyph
110-- This replaces Markus Kuhn’s combining[] table
111-- Ignore all other categories in this file.
do
  local entries = readlines(target_path .. download_file_list[FN_DERIVED_GENERAL_CATEGORY])
  local count_before = #zero_width

  -- the general categories that are collected as zero-width
  local is_zero_width = { Mn = true, Me = true }

  for _, entry in ipairs(entries) do
    -- every data line is "<code point or range> ; <category>"
    local codepoints, general_category = entry:match("^([%x%.]+)%s*;%s*(%a+)")
    if codepoints == nil then
      error("Failed to parse line: " .. entry)
    end

    if is_zero_width[general_category] then
      -- a single code point is stored as a range of one
      local has_separator = codepoints:find("..", 1, true)
      if not has_separator then
        codepoints = codepoints .. ".." .. codepoints
      end
      zero_width[#zero_width + 1] = codepoints
    end
  end

  local added = #zero_width - count_before
  print(" found " .. added .. " zero-width character-ranges")
end
134
135
136
137-- parse DerivedCoreProperties.txt
138-- Purpose: zero-width format / ignorable characters
139-- Extract:
140-- Default_Ignorable_Code_Point → width = 0
141
142-- Includes (important examples):
143-- U+200D ZERO WIDTH JOINER
144-- U+200C ZERO WIDTH NON-JOINER
145-- U+FE00..U+FE0F (variation selectors)
146-- Bidi and other format controls
147
148-- Why:
149-- Not Mn/Me, but terminals treat them as zero-width
150-- Required for emoji correctness and modern text
do
  local entries = readlines(target_path .. download_file_list[FN_DERIVED_CORE_PROPERTIES])
  local count_before = #zero_width

  for index = 1, #entries do
    local entry = entries[index]

    -- every data line is "<code point or range> ; <property>"
    local codepoints, property = entry:match("^([%x%.]+)%s*;%s*([%a_]+)")
    if codepoints == nil then
      error("Failed to parse line: " .. entry)
    end

    -- only the Default_Ignorable_Code_Point property is collected
    if property == "Default_Ignorable_Code_Point" then
      -- a single code point is stored as a range of one
      if codepoints:find("..", 1, true) == nil then
        codepoints = codepoints .. ".." .. codepoints
      end
      zero_width[#zero_width + 1] = codepoints
    end
  end

  local added = #zero_width - count_before
  print(" found " .. added .. " zero-width character-ranges")
end
173
174
175
176-- parse EastAsianWidth.txt
177-- Purpose: determine double-width and ambiguous-width characters
178-- Extract:
179-- W (Wide) → width = 2
180-- F (Fullwidth) → width = 2
181-- A (Ambiguous) → width = 1 or 2 (your choice; usually 1 unless CJK mode)
182-- Everything else:
183-- H, Na, N → width = 1
184-- Why:
185-- - This is the only Unicode-sanctioned width-related property
186-- - Core of all wcwidth() implementations
do
  local entries = readlines(target_path .. download_file_list[FN_EAST_ASIAN_WIDTH])
  local double_before = #double_width
  local ambiguous_before = #ambiguous_width

  -- maps the width types that are collected to the list they are stored in;
  -- all other width types are skipped
  local destination = {
    W = double_width,
    F = double_width,
    A = ambiguous_width,
  }

  for _, entry in ipairs(entries) do
    -- every data line is "<code point or range> ; <width type>"
    local codepoints, width_type = entry:match("^([%x%.]+)%s*;%s*(%a+)")
    if codepoints == nil then
      error("Failed to parse line: " .. entry)
    end

    local list = destination[width_type]
    if list then
      -- a single code point is stored as a range of one
      if not codepoints:find("..", 1, true) then
        codepoints = codepoints .. ".." .. codepoints
      end
      list[#list + 1] = codepoints
    end
  end

  print(" found " .. (#double_width - double_before) .. " double-width character-ranges")
  print(" found " .. (#ambiguous_width - ambiguous_before) .. " ambiguous-width character-ranges")
end
213
214
215
216-- parse emoji-data.txt
217-- Purpose: emoji presentation width
218-- Extract:
219-- Emoji_Presentation=Yes → width = 2
220-- (Optionally) Extended_Pictographic → emoji sequences
221-- Why:
222-- Emoji are not reliably covered by EastAsianWidth
223-- Modern terminals render these as double-width
224-- Required for correct emoji column alignment
do
  local entries = readlines(target_path .. download_file_list[FN_EMOJI_DATA])
  local count_before = #double_width

  for _, entry in ipairs(entries) do
    -- every data line is "<code point or range> ; <property>"
    local codepoints, property = entry:match("^([%x%.]+)%s*;%s*([%a_]+)")
    if codepoints == nil then
      error("Failed to parse line: " .. entry)
    end

    -- collect entries whose property name contains "Emoji_Presentation"
    local is_presentation = property:find("Emoji_Presentation", 1, true) ~= nil
    if is_presentation then
      -- a single code point is stored as a range of one
      if not codepoints:find("..", 1, true) then
        codepoints = codepoints .. ".." .. codepoints
      end
      double_width[#double_width + 1] = codepoints
    end
  end

  local added = #double_width - count_before
  print(" found " .. added .. " double-width character-ranges")
end
247
248
249
-- returns the start and end of a range, numerically, and hex strings
-- Raises an error when the range has no ".." separator, when either side is
-- not a valid hexadecimal number, or when the end lies before the start.
-- @tparam string range the range to parse, formatted as "<hex>..<hex>"
-- @treturn number sr the start of the range
-- @treturn number er the end of the range
-- @treturn string sh the start of the range as a hex string
-- @treturn string eh the end of the range as a hex string
local parse_range do
  function parse_range(range)
    local s = range:find("..", 1, true)
    if not s then
      error("Failed to parse range: " .. range)
    end
    local sh = range:sub(1, s - 1)
    local eh = range:sub(s + 2, -1)
    local sr = tonumber(sh, 16)
    local er = tonumber(eh, 16)
    -- tonumber returns nil for anything that is not valid hex (e.g. "", "ZZ",
    -- or "2..3"); report that explicitly instead of failing on the comparison
    if not sr or not er then
      error("Failed to parse range: " .. range .. " (invalid hexadecimal code point)")
    end
    if er < sr then
      error("Failed to parse range: " .. range .. " (end < start)")
    end
    return sr, er, sh, eh
  end

  -- some inline tests for parse_range
  local sr, er = parse_range("25FD..25FE")
  assert(sr == 9725)
  assert(er == 9726)
  local sr, er = parse_range("105C0..105F3")
  assert(sr == 67008)
  assert(er == 67059)
  assert(not pcall(parse_range, "ZZ..10"))
  assert(not pcall(parse_range, "10.."))
end
280
281
282
-- Orders the ranges in-place, ascending by the numeric start of each range.
-- @tparam table ranges array of range strings
-- @treturn table the same `ranges` table, to allow chaining
local function sort_ranges(ranges)
  -- strict "less than" on the range starts (first return value of parse_range)
  local function starts_before(lhs, rhs)
    local lhs_start = parse_range(lhs)
    local rhs_start = parse_range(rhs)
    return lhs_start < rhs_start
  end

  table.sort(ranges, starts_before)
  return ranges
end
290
291
292
-- Merges adjacent and overlapping ranges in-place.
-- The comparisons below only merge a range with the previously kept one when
-- it does not start before it, so the input is expected to be sorted by start.
-- @tparam table ranges array of range strings; shrinks to the merged result
local combine_ranges do
  function combine_ranges(ranges)
    local count = #ranges
    if count < 2 then
      return -- nothing to merge
    end

    -- `write` is the slot of the range currently being grown
    local write = 1
    local prev_s, prev_e, prev_sh, prev_eh = parse_range(ranges[1])

    for read = 2, count do
      local cur_s, cur_e, cur_sh, cur_eh = parse_range(ranges[read])
      local touches = cur_s >= prev_s and cur_s <= (prev_e + 1)

      if touches then
        -- adjacent or overlapping: keep the start, extend the end if needed
        if not (prev_e > cur_e) then
          prev_e, prev_eh = cur_e, cur_eh
        end
        ranges[write] = prev_sh .. ".." .. prev_eh
      else
        -- gap: start growing a new range in the next slot
        write = write + 1
        ranges[write] = ranges[read]
        prev_s, prev_e, prev_sh, prev_eh = cur_s, cur_e, cur_sh, cur_eh
      end
    end

    -- drop the entries left behind the last merged range
    for i = count, write + 1, -1 do
      ranges[i] = nil
    end
  end

  -- some inline tests for combine_ranges
  local ranges = {
    "25FD..25FE",
    "25FD..25FE", -- duplicate range, should be removed
    "105C0..105F3",
    "105D0..105E0", -- range fully within previous range, should be combined
    "10F00..10F10",
    "10F11..10F20", -- adjacent to previous, should be combined
    "11000..11100",
    "11101..11110", -- adjacent + extending the previous, should be combined
    "12000..12010",
    "12011..12020", -- multiple: adjacent should be combined
    "12015..12030", -- multiple: overlap + extending the previous, should be combined
    "12031..12040", -- multiple: adjacent, should be combined
  }
  local expected = {
    "25FD..25FE",
    "105C0..105F3",
    "10F00..10F20",
    "11000..11110",
    "12000..12040",
  }
  combine_ranges(ranges)
  assert(#ranges == 5)
  for i = 1, #expected do
    assert(ranges[i] == expected[i])
  end
end
342
343
344
-- sort each collected list and merge its adjacent/overlapping ranges
for _, ranges in ipairs({ zero_width, double_width, ambiguous_width }) do
  combine_ranges(sort_ranges(ranges))
end
348
349
350
-- Rewrites every range string in-place as a C initializer.
-- Example result: "{ 0x0829, 0x082D }"
-- @tparam table ranges array of range strings; entries are replaced
local function convert_c_ranges(ranges)
  for index, range in ipairs(ranges) do
    -- only the hex strings (3rd and 4th return values) are needed here
    local _, _, first_hex, last_hex = parse_range(range)
    ranges[index] = string.format("{ 0x%s, 0x%s }", first_hex, last_hex)
  end
end

for _, ranges in ipairs({ zero_width, double_width, ambiguous_width }) do
  convert_c_ranges(ranges)
end
363
364
365
-- indentation prefixed to every line of generated c source
local SOURCE_INDENT = " "


-- Formats ranges as lines of c source, as triplets; 3 ranges on 1 line.
-- Every line ends with a comma, except the last one.
-- @tparam table ranges array of already formatted range strings
-- @treturn table array of source lines; empty when `ranges` is empty
local function triplet_lines(ranges)
  local lines = {}
  for i = 1, #ranges, 3 do
    lines[#lines+1] = SOURCE_INDENT .. table.concat(ranges, ", ", i, math.min(i + 2, #ranges)) .. ","
  end
  -- drop trailing comma from last line; with no ranges there is no last line
  local last = #lines
  if last > 0 then
    lines[last] = lines[last]:sub(1, -2)
  end
  return lines
end
379
380
-- Builds the complete text of one generated source file: four comment lines
-- (notice, description, unicode version, generation date) followed by the
-- ranges formatted by triplet_lines.
-- @tparam table ranges array of already formatted range strings
-- @tparam string contains description of what the ranges represent
-- @treturn string the file contents, ending in a newline
local function create_file_contents(ranges, contains)
  local header = {
    "// Do not modify this file directly, it is generated by the wcwidth_update.lua script",
    "// Contains " .. contains,
    "// Generated from Unicode " .. VERSION,
    "// Generated on " .. os.date("%Y-%m-%d"),
  }

  local parts = {}
  for index, text in ipairs(header) do
    parts[index] = SOURCE_INDENT .. text .. "\n"
  end
  parts[#parts + 1] = table.concat(triplet_lines(ranges), "\n") .. "\n"

  return table.concat(parts)
end
390
391
392
393
local writefile = require("pl.utils").writefile

print("writing source files...")

-- Write the three generated files in order; each entry holds the progress
-- message, the target path, the ranges and their description.
do
  local outputs = {
    {
      message = " zero-width: ./wcwidth_zero_width.c",
      path = "./wcwidth_zero_width.c",
      ranges = zero_width,
      contains = "unicode character-ranges handled as 0 width",
    },
    {
      message = " double-width: ./wcwidth_double_width.c",
      path = "./wcwidth_double_width.c",
      ranges = double_width,
      contains = "unicode character-ranges handled as double width",
    },
    {
      message = " ambiguous-width: ./wcwidth_ambiguous_width.c",
      path = "./wcwidth_ambiguous_width.c",
      ranges = ambiguous_width,
      contains = "unicode character-ranges handled as ambiguous (either 1 or 2 width)",
    },
  }

  for _, output in ipairs(outputs) do
    print(output.message)
    -- writefile reports failure through its return values; assert turns that into an error
    assert(writefile(output.path, create_file_contents(output.ranges, output.contains)))
  end
end