From 1159329b247d6532fecb375e7008aca979261eaa Mon Sep 17 00:00:00 2001
From: Thijs Schreijer
Date: Thu, 29 Jan 2026 11:02:33 +0100
Subject: fix(wcwidth): add a generator for width ranges

The generator script parses the official Unicode data files to create the
actual ranges for zero-width, double-width, and ambiguous-width characters.
---
 src/wcwidth_update.lua | 404 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 404 insertions(+)
 create mode 100755 src/wcwidth_update.lua

diff --git a/src/wcwidth_update.lua b/src/wcwidth_update.lua
new file mode 100755
index 0000000..37f18c3
--- /dev/null
+++ b/src/wcwidth_update.lua
@@ -0,0 +1,404 @@
+#!/usr/bin/env lua
+
+-- This file downloads and parses Unicode standard data files and updates the
+-- wcwidth code based on that data.
+
+local VERSION = "17.0.0" -- the Unicode standard version to download
+
+
+-- check that curl and Penlight are available, and that we run from `./src/`
+do
+  local ok, _, ec = os.execute("curl --version > /dev/null 2>&1")
+  if not ok then
+    error("curl is not available in the path; exitcode " .. ec)
+  end
+
+  local ok, utils = pcall(require, "pl.utils")
+  if not ok then
+    error("Penlight is not available, please install via `luarocks install penlight`")
+  end
+
+  ok = utils.readfile("./wcwidth.c")
+  if not ok then
+    error("failed to read './wcwidth.c', run this script from within the `./src/` directory")
+  end
+end
+
+
+-- files to download from the unicode site
+local FN_DERIVED_GENERAL_CATEGORY = 1
+local FN_EAST_ASIAN_WIDTH = 2
+local FN_DERIVED_CORE_PROPERTIES = 3
+local FN_EMOJI_DATA = 4
+
+local download_file_list = {
+  [FN_DERIVED_GENERAL_CATEGORY] = "extracted/DerivedGeneralCategory.txt",
+  [FN_EAST_ASIAN_WIDTH] = "EastAsianWidth.txt",
+  [FN_DERIVED_CORE_PROPERTIES] = "DerivedCoreProperties.txt",
+  [FN_EMOJI_DATA] = "emoji/emoji-data.txt",
+}
+local target_path = "./unicode_data/"
+
+
+do
+  local base_url = "https://www.unicode.org/Public/" .. VERSION .. "/ucd/" -- must include trailing slash
+
+
+  -- removes a file, and then downloads a new copy from the unicode site
+  local function download_file(filename, target_filename)
+    print("Downloading " .. filename .. " to " .. target_filename)
+    os.remove(target_filename)
+    local cmd = "curl --fail -s -o " .. target_filename .. " " .. base_url .. filename
+    local ok, _, ec = os.execute(cmd)
+    if not ok then
+      error("Failed to execute: " .. cmd .. "; exitcode " .. ec)
+    end
+  end
+
+
+  -- Downloads all unicode files we need
+  local function download_files()
+    os.execute("mkdir -p " .. target_path .. "extracted")
+    os.execute("mkdir -p " .. target_path .. "emoji")
+    for _, filename in ipairs(download_file_list) do
+      download_file(filename, target_path .. filename)
+    end
+  end
+
+
+  download_files()
+end
+
+
+-- set up the 3 lists of data (everything else is single-width)
+local zero_width = {}
+local double_width = {}
+local ambiguous_width = {}
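+
+-- Note: each entry in these lists is a hex range string such as "25FD..25FE";
+-- a single code point such as "00AD" is stored as the degenerate range
+-- "00AD..00AD". The lists are sorted and merged by sort_ranges()/combine_ranges()
+-- below, and finally turned into C initializers like "{ 0x25FD, 0x25FE }" by
+-- convert_c_ranges().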
+
+
+local readlines do
+  local utils = require("pl.utils")
+
+  function readlines(filename)
+    print("Parsing " .. filename)
+    local lines = assert(utils.readlines(filename))
+
+    -- drop comment lines (starting with "#") and empty (whitespace-only) lines
+    for i = #lines, 1, -1 do -- reverse, since we're deleting items
+      if lines[i]:match("^%s*#") or lines[i]:match("^%s*$") then
+        table.remove(lines, i)
+      end
+    end
+
+    return lines
+  end
+end
+
+
+-- parse DerivedGeneralCategory.txt
+-- Purpose: zero-width combining marks
+-- Extract:
+--   Mn — Nonspacing Mark → width = 0
+--   Me — Enclosing Mark → width = 0
+-- Why:
+--   These characters overlay the previous glyph
+--   This replaces Markus Kuhn’s combining[] table
+-- Ignore all other categories in this file.
+do
+  local lines = readlines(target_path .. download_file_list[FN_DERIVED_GENERAL_CATEGORY])
+  local zw_start = #zero_width
+
+  -- parse the lines
+  for _, line in ipairs(lines) do
+    local range, category = line:match("^([%x%.]+)%s*;%s*(%a+)")
+    if not range then
+      error("Failed to parse line: " .. line)
+    end
+
+    if not range:find("..", 1, true) then -- single code point, make it a range
+      range = range .. ".." .. range
+    end
+
+    if category == "Mn" or category == "Me" then
+      zero_width[#zero_width + 1] = range
+    end
+  end
+
+  print(" found " .. (#zero_width - zw_start) .. " zero-width character-ranges")
+end
+
+
+-- parse DerivedCoreProperties.txt
+-- Purpose: zero-width format / ignorable characters
+-- Extract:
+--   Default_Ignorable_Code_Point → width = 0
+
+-- Includes (important examples):
+--   U+200D ZERO WIDTH JOINER
+--   U+200C ZERO WIDTH NON-JOINER
+--   U+FE00..U+FE0F (variation selectors)
+--   Bidi and other format controls
+
+-- Why:
+--   Not Mn/Me, but terminals treat them as zero-width
+--   Required for emoji correctness and modern text
+do
+  local lines = readlines(target_path .. download_file_list[FN_DERIVED_CORE_PROPERTIES])
+  local zw_start = #zero_width
+
+  -- parse the lines
+  for _, line in ipairs(lines) do
+    local range, category = line:match("^([%x%.]+)%s*;%s*([%a_]+)")
+    if not range then
+      error("Failed to parse line: " .. line)
+    end
+
+    if not range:find("..", 1, true) then -- single code point, make it a range
+      range = range .. ".." .. range
+    end
+
+    if category == "Default_Ignorable_Code_Point" then
+      zero_width[#zero_width + 1] = range
+    end
+  end
+
+  print(" found " .. (#zero_width - zw_start) .. " zero-width character-ranges")
+end
+
+
+-- parse EastAsianWidth.txt
+-- Purpose: determine double-width and ambiguous-width characters
+-- Extract:
+--   W (Wide) → width = 2
+--   F (Fullwidth) → width = 2
+--   A (Ambiguous) → width = 1 or 2 (implementation choice; usually 1 outside CJK mode)
+-- Everything else:
+--   H, Na, N → width = 1
+-- Why:
+--   - This is the only Unicode-sanctioned width-related property
+--   - Core of all wcwidth() implementations
+do
+  local lines = readlines(target_path .. download_file_list[FN_EAST_ASIAN_WIDTH])
+  local dw_start = #double_width
+  local aw_start = #ambiguous_width
+
+  -- parse the lines
+  for _, line in ipairs(lines) do
+    local range, width_type = line:match("^([%x%.]+)%s*;%s*(%a+)")
+    if not range then
+      error("Failed to parse line: " .. line)
+    end
+
+    if not range:find("..", 1, true) then -- single code point, make it a range
+      range = range .. ".." .. range
+    end
+
+    if width_type == "W" or width_type == "F" then
+      double_width[#double_width + 1] = range
+    elseif width_type == "A" then
+      ambiguous_width[#ambiguous_width + 1] = range
+    end
+  end
+
+  print(" found " .. (#double_width - dw_start) .. " double-width character-ranges")
+  print(" found " .. (#ambiguous_width - aw_start) .. " ambiguous-width character-ranges")
+end
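+
+-- Note: the parse loops above and below all rely on the same
+-- "<code point or range> ; <property> # comment" line format, using near-identical
+-- Lua patterns; an illustrative (not real-data) example:
+--   ("0020..007E ; Na # printable ASCII"):match("^([%x%.]+)%s*;%s*(%a+)")
+--       --> "0020..007E", "Na"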
" ambiguous-width character-ranges") +end + + + +-- parse emoji-data.txt +-- Purpose: emoji presentation width +-- Extract: +-- Emoji_Presentation=Yes → width = 2 +-- (Optionally) Extended_Pictographic → emoji sequences +-- Why: +-- Emoji are not reliably covered by EastAsianWidth +-- Modern terminals render these as double-width +-- Required for correct emoji column alignment +do + local lines = readlines(target_path .. download_file_list[FN_EMOJI_DATA]) + local dw_start = #double_width + + -- parse the lines + for _, line in ipairs(lines) do + local range, properties = line:match("^([%x%.]+)%s*;%s*([%a_]+)") + if not range then + error("Failed to parse line: " .. line) + end + + if not range:find("..", 1, true) then -- single code point, make range + range = range .. ".." .. range + end + + if properties:match("Emoji_Presentation") then + double_width[#double_width + 1] = range + end + end + + print(" found " .. (#double_width - dw_start) .. " double-width character-ranges") +end + + + +-- returns the start and end of a range, numerically, and hex strings +-- @tparam string range the range to parse +-- @treturn number sr the start of the range +-- @treturn number er the end of the range +-- @treturn string sh the start of the range as a hex string +-- @treturn string eh the end of the range as a hex string +local parse_range do + function parse_range(range) + local s = range:find("..", 1, true) + if not s then + error("Failed to parse range: " .. range) + end + local sh = range:sub(1, s - 1) + local eh = range:sub(s + 2, -1) + local sr = tonumber(sh, 16) + local er = tonumber(eh, 16) + if er < sr then + error("Failed to parse range: " .. range .. " (end < start)") + end + return sr, er, sh, eh + end + + -- some inline tests for parse_range + local sr, er = parse_range("25FD..25FE") + assert(sr == 9725) + assert(er == 9726) + local sr, er = parse_range("105C0..105F3") + assert(sr == 67008) + assert(er == 67059) +end + + + +-- sorts the ranges in-place +local function sort_ranges(ranges) + table.sort(ranges, function(a, b) + return parse_range(a) < parse_range(b) + end) + return ranges +end + + + +-- combines adjacent ranges in-place +local combine_ranges do + function combine_ranges(ranges) + local last_idx = 1 + for i = 2, #ranges do + local last_s, last_e, last_sh, last_eh = parse_range(ranges[last_idx]) + local current_s, current_e, _, current_eh = parse_range(ranges[i]) + if current_s >= last_s and current_s <= (last_e + 1) then + -- ranges are adjacent or overlapping, combine them + local sh = last_sh + local eh = current_eh + if last_e > current_e then + eh = last_eh + end + ranges[last_idx] = sh .. ".." .. 
+
+
+combine_ranges(sort_ranges(zero_width))
+combine_ranges(sort_ranges(double_width))
+combine_ranges(sort_ranges(ambiguous_width))
+
+
+-- convert ranges into C source code ranges (in-place)
+-- format: "{ 0x0829, 0x082D }"
+local function convert_c_ranges(ranges)
+  for i = 1, #ranges do
+    local _, _, sh, eh = parse_range(ranges[i])
+    ranges[i] = "{ 0x" .. sh .. ", 0x" .. eh .. " }"
+  end
+end
+
+convert_c_ranges(zero_width)
+convert_c_ranges(double_width)
+convert_c_ranges(ambiguous_width)
+
+
+local SOURCE_INDENT = "  "
+
+
+-- write the C source as triplets; 3 ranges on 1 line
+local function triplet_lines(ranges)
+  local lines = {}
+  for i = 1, #ranges, 3 do
+    lines[#lines+1] = SOURCE_INDENT .. table.concat(ranges, ", ", i, math.min(i + 2, #ranges)) .. ","
+  end
+  -- drop the trailing comma from the last line
+  lines[#lines] = lines[#lines]:sub(1, -2)
+  return lines
+end
+
+
+-- create the file-contents for one generated C file
+local function create_file_contents(ranges, contains)
+  return
+    SOURCE_INDENT .. "// Do not modify this file directly, it is generated by the wcwidth_update.lua script\n" ..
+    SOURCE_INDENT .. "// Contains " .. contains .. "\n" ..
+    SOURCE_INDENT .. "// Generated from Unicode " .. VERSION .. "\n" ..
+    SOURCE_INDENT .. "// Generated on " .. os.date("%Y-%m-%d") .. "\n" ..
+    table.concat(triplet_lines(ranges), "\n") .. "\n"
+end
+
+
+local writefile = require("pl.utils").writefile
+
+print("writing source files...")
+print(" zero-width: ./wcwidth_zero_width.c")
+assert(writefile("./wcwidth_zero_width.c", create_file_contents(zero_width, "unicode character-ranges handled as 0 width")))
+
+print(" double-width: ./wcwidth_double_width.c")
+assert(writefile("./wcwidth_double_width.c", create_file_contents(double_width, "unicode character-ranges handled as double width")))
+
+print(" ambiguous-width: ./wcwidth_ambiguous_width.c")
+assert(writefile("./wcwidth_ambiguous_width.c", create_file_contents(ambiguous_width, "unicode character-ranges handled as ambiguous (either 1 or 2 width)")))
--
cgit v1.2.3-55-g6feb