From 1159329b247d6532fecb375e7008aca979261eaa Mon Sep 17 00:00:00 2001
From: Thijs Schreijer
Date: Thu, 29 Jan 2026 11:02:33 +0100
Subject: fix(wcwidth): add a generator for width ranges

The generator script parses the official Unicode data files to create the
actual ranges for zero-width, double-width, and ambiguous-width characters.
---
 src/wcwidth_update.lua | 404 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 404 insertions(+)
 create mode 100755 src/wcwidth_update.lua

diff --git a/src/wcwidth_update.lua b/src/wcwidth_update.lua
new file mode 100755
index 0000000..37f18c3
--- /dev/null
+++ b/src/wcwidth_update.lua
@@ -0,0 +1,404 @@
+#!/usr/bin/env lua
+
+-- This file downloads and parses Unicode standard data files and updates the
+-- wcwidth code based on that data.
+
+local VERSION = "17.0.0" -- the Unicode standard version to download
+
+
+-- check that curl and Penlight are available, and that we run from `./src/`
+do
+  local ok, _, ec = os.execute("curl --version > /dev/null 2>&1")
+  if not ok then
+    error("curl is not available in the path; exitcode " .. ec)
+  end
+
+  local ok, utils = pcall(require, "pl.utils")
+  if not ok then
+    error("Penlight is not available, please install via `luarocks install penlight`")
+  end
+
+  ok = utils.readfile("./wcwidth.c")
+  if not ok then
+    error("failed to read './wcwidth.c', run this script from within the `./src/` directory")
+  end
+end
+
+
+-- files to download from the unicode site
+local FN_DERIVED_GENERAL_CATEGORY = 1
+local FN_EAST_ASIAN_WIDTH = 2
+local FN_DERIVED_CORE_PROPERTIES = 3
+local FN_EMOJI_DATA = 4
+
+local download_file_list = {
+  [FN_DERIVED_GENERAL_CATEGORY] = "extracted/DerivedGeneralCategory.txt",
+  [FN_EAST_ASIAN_WIDTH] = "EastAsianWidth.txt",
+  [FN_DERIVED_CORE_PROPERTIES] = "DerivedCoreProperties.txt",
+  [FN_EMOJI_DATA] = "emoji/emoji-data.txt",
+}
+local target_path = "./unicode_data/"
+
+
+do
+  local base_url = "https://www.unicode.org/Public/" .. VERSION .. "/ucd/" -- must include trailing slash
+
+
+  -- removes a file, and then downloads a new copy from the unicode site
+  local function download_file(filename, target_filename)
+    print("Downloading " .. filename .. " to " .. target_filename)
+    os.remove(target_filename)
+    local cmd = "curl --fail -s -o " .. target_filename .. " " .. base_url .. filename
+    local ok, _, ec = os.execute(cmd)
+    if not ok then
+      error("Failed to execute: " .. cmd .. "; exitcode " .. ec)
+    end
+  end
+
+
+  -- Downloads all unicode files we need
+  local function download_files()
+    os.execute("mkdir -p " .. target_path .. "extracted")
+    os.execute("mkdir -p " .. target_path .. "emoji")
+    for _, filename in ipairs(download_file_list) do
+      download_file(filename, target_path .. filename)
+    end
+  end
+
+
+  download_files()
+end
+
+
+-- set up the 3 lists of data (everything else is single-width)
+local zero_width = {}
+local double_width = {}
+local ambiguous_width = {}
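+
+-- Note: each entry in these lists is a hex range string such as "25FD..25FE";
+-- a single code point such as "00AD" is stored as the degenerate range
+-- "00AD..00AD". The lists are sorted and merged by sort_ranges()/combine_ranges()
+-- below, and finally turned into C initializers like "{ 0x25FD, 0x25FE }" by
+-- convert_c_ranges().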
+
+
+local readlines do
+  local utils = require("pl.utils")
+
+  function readlines(filename)
+    print("Parsing " .. filename)
+    local lines = assert(utils.readlines(filename))
+
+    -- drop comment lines (starting with "#") and empty (whitespace-only) lines
+    for i = #lines, 1, -1 do -- reverse, since we're deleting items
+      if lines[i]:match("^%s*#") or lines[i]:match("^%s*$") then
+        table.remove(lines, i)
+      end
+    end
+
+    return lines
+  end
+end
+
+
+-- parse DerivedGeneralCategory.txt
+-- Purpose: zero-width combining marks
+-- Extract:
+--   Mn — Nonspacing Mark → width = 0
+--   Me — Enclosing Mark → width = 0
+-- Why:
+--   These characters overlay the previous glyph
+--   This replaces Markus Kuhn’s combining[] table
+-- Ignore all other categories in this file.
+do
+  local lines = readlines(target_path .. download_file_list[FN_DERIVED_GENERAL_CATEGORY])
+  local zw_start = #zero_width
+
+  -- parse the lines
+  for _, line in ipairs(lines) do
+    local range, category = line:match("^([%x%.]+)%s*;%s*(%a+)")
+    if not range then
+      error("Failed to parse line: " .. line)
+    end
+
+    if not range:find("..", 1, true) then -- single code point, make it a range
+      range = range .. ".." .. range
+    end
+
+    if category == "Mn" or category == "Me" then
+      zero_width[#zero_width + 1] = range
+    end
+  end
+
+  print(" found " .. (#zero_width - zw_start) .. " zero-width character-ranges")
+end
+
+
+-- parse DerivedCoreProperties.txt
+-- Purpose: zero-width format / ignorable characters
+-- Extract:
+--   Default_Ignorable_Code_Point → width = 0
+
+-- Includes (important examples):
+--   U+200D ZERO WIDTH JOINER
+--   U+200C ZERO WIDTH NON-JOINER
+--   U+FE00..U+FE0F (variation selectors)
+--   Bidi and other format controls
+
+-- Why:
+--   Not Mn/Me, but terminals treat them as zero-width
+--   Required for emoji correctness and modern text
+do
+  local lines = readlines(target_path .. download_file_list[FN_DERIVED_CORE_PROPERTIES])
+  local zw_start = #zero_width
+
+  -- parse the lines
+  for _, line in ipairs(lines) do
+    local range, category = line:match("^([%x%.]+)%s*;%s*([%a_]+)")
+    if not range then
+      error("Failed to parse line: " .. line)
+    end
+
+    if not range:find("..", 1, true) then -- single code point, make it a range
+      range = range .. ".." .. range
+    end
+
+    if category == "Default_Ignorable_Code_Point" then
+      zero_width[#zero_width + 1] = range
+    end
+  end
+
+  print(" found " .. (#zero_width - zw_start) .. " zero-width character-ranges")
+end
+
+
+-- parse EastAsianWidth.txt
+-- Purpose: determine double-width and ambiguous-width characters
+-- Extract:
+--   W (Wide) → width = 2
+--   F (Fullwidth) → width = 2
+--   A (Ambiguous) → width = 1 or 2 (implementation choice; usually 1 outside CJK mode)
+-- Everything else:
+--   H, Na, N → width = 1
+-- Why:
+--   - This is the only Unicode-sanctioned width-related property
+--   - Core of all wcwidth() implementations
+do
+  local lines = readlines(target_path .. download_file_list[FN_EAST_ASIAN_WIDTH])
+  local dw_start = #double_width
+  local aw_start = #ambiguous_width
+
+  -- parse the lines
+  for _, line in ipairs(lines) do
+    local range, width_type = line:match("^([%x%.]+)%s*;%s*(%a+)")
+    if not range then
+      error("Failed to parse line: " .. line)
+    end
+
+    if not range:find("..", 1, true) then -- single code point, make it a range
+      range = range .. ".." .. range
+    end
+
+    if width_type == "W" or width_type == "F" then
+      double_width[#double_width + 1] = range
+    elseif width_type == "A" then
+      ambiguous_width[#ambiguous_width + 1] = range
+    end
+  end
+
+  print(" found " .. (#double_width - dw_start) .. " double-width character-ranges")
+  print(" found " .. (#ambiguous_width - aw_start) .. " ambiguous-width character-ranges")
+end
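+
+-- Note: the parse loops above and below all rely on the same
+-- "<code point or range> ; <property> # comment" line format, using near-identical
+-- Lua patterns; an illustrative (not real-data) example:
+--   ("0020..007E ; Na # printable ASCII"):match("^([%x%.]+)%s*;%s*(%a+)")
+--       --> "0020..007E", "Na"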
" ambiguous-width character-ranges") +end + + + +-- parse emoji-data.txt +-- Purpose: emoji presentation width +-- Extract: +-- Emoji_Presentation=Yes → width = 2 +-- (Optionally) Extended_Pictographic → emoji sequences +-- Why: +-- Emoji are not reliably covered by EastAsianWidth +-- Modern terminals render these as double-width +-- Required for correct emoji column alignment +do + local lines = readlines(target_path .. download_file_list[FN_EMOJI_DATA]) + local dw_start = #double_width + + -- parse the lines + for _, line in ipairs(lines) do + local range, properties = line:match("^([%x%.]+)%s*;%s*([%a_]+)") + if not range then + error("Failed to parse line: " .. line) + end + + if not range:find("..", 1, true) then -- single code point, make range + range = range .. ".." .. range + end + + if properties:match("Emoji_Presentation") then + double_width[#double_width + 1] = range + end + end + + print(" found " .. (#double_width - dw_start) .. " double-width character-ranges") +end + + + +-- returns the start and end of a range, numerically, and hex strings +-- @tparam string range the range to parse +-- @treturn number sr the start of the range +-- @treturn number er the end of the range +-- @treturn string sh the start of the range as a hex string +-- @treturn string eh the end of the range as a hex string +local parse_range do + function parse_range(range) + local s = range:find("..", 1, true) + if not s then + error("Failed to parse range: " .. range) + end + local sh = range:sub(1, s - 1) + local eh = range:sub(s + 2, -1) + local sr = tonumber(sh, 16) + local er = tonumber(eh, 16) + if er < sr then + error("Failed to parse range: " .. range .. " (end < start)") + end + return sr, er, sh, eh + end + + -- some inline tests for parse_range + local sr, er = parse_range("25FD..25FE") + assert(sr == 9725) + assert(er == 9726) + local sr, er = parse_range("105C0..105F3") + assert(sr == 67008) + assert(er == 67059) +end + + + +-- sorts the ranges in-place +local function sort_ranges(ranges) + table.sort(ranges, function(a, b) + return parse_range(a) < parse_range(b) + end) + return ranges +end + + + +-- combines adjacent ranges in-place +local combine_ranges do + function combine_ranges(ranges) + local last_idx = 1 + for i = 2, #ranges do + local last_s, last_e, last_sh, last_eh = parse_range(ranges[last_idx]) + local current_s, current_e, _, current_eh = parse_range(ranges[i]) + if current_s >= last_s and current_s <= (last_e + 1) then + -- ranges are adjacent or overlapping, combine them + local sh = last_sh + local eh = current_eh + if last_e > current_e then + eh = last_eh + end + ranges[last_idx] = sh .. ".." .. 
+
+
+combine_ranges(sort_ranges(zero_width))
+combine_ranges(sort_ranges(double_width))
+combine_ranges(sort_ranges(ambiguous_width))
+
+
+-- convert ranges into C source code ranges (in-place)
+-- format: "{ 0x0829, 0x082D }"
+local function convert_c_ranges(ranges)
+  for i = 1, #ranges do
+    local _, _, sh, eh = parse_range(ranges[i])
+    ranges[i] = "{ 0x" .. sh .. ", 0x" .. eh .. " }"
+  end
+end
+
+convert_c_ranges(zero_width)
+convert_c_ranges(double_width)
+convert_c_ranges(ambiguous_width)
+
+
+local SOURCE_INDENT = "  "
+
+
+-- write the C source as triplets; 3 ranges on 1 line
+local function triplet_lines(ranges)
+  local lines = {}
+  for i = 1, #ranges, 3 do
+    lines[#lines+1] = SOURCE_INDENT .. table.concat(ranges, ", ", i, math.min(i + 2, #ranges)) .. ","
+  end
+  -- drop the trailing comma from the last line
+  lines[#lines] = lines[#lines]:sub(1, -2)
+  return lines
+end
+
+
+-- create the file-contents for one generated C file
+local function create_file_contents(ranges, contains)
+  return
+    SOURCE_INDENT .. "// Do not modify this file directly, it is generated by the wcwidth_update.lua script\n" ..
+    SOURCE_INDENT .. "// Contains " .. contains .. "\n" ..
+    SOURCE_INDENT .. "// Generated from Unicode " .. VERSION .. "\n" ..
+    SOURCE_INDENT .. "// Generated on " .. os.date("%Y-%m-%d") .. "\n" ..
+    table.concat(triplet_lines(ranges), "\n") .. "\n"
+end
+
+
+local writefile = require("pl.utils").writefile
+
+print("writing source files...")
+print(" zero-width: ./wcwidth_zero_width.c")
+assert(writefile("./wcwidth_zero_width.c", create_file_contents(zero_width, "unicode character-ranges handled as 0 width")))
+
+print(" double-width: ./wcwidth_double_width.c")
+assert(writefile("./wcwidth_double_width.c", create_file_contents(double_width, "unicode character-ranges handled as double width")))
+
+print(" ambiguous-width: ./wcwidth_ambiguous_width.c")
+assert(writefile("./wcwidth_ambiguous_width.c", create_file_contents(ambiguous_width, "unicode character-ranges handled as ambiguous (either 1 or 2 width)")))
--
cgit v1.2.3-55-g6feb