fix(wcwidth): add a generator for width ranges

The generator script will parse official unicode data to create the actual ranges for 0, double, and ambiguous width characters.
author: Thijs Schreijer <thijs@thijsschreijer.nl> 2026-01-29 11:02:33 +0100
committer: Thijs Schreijer <thijs@thijsschreijer.nl> 2026-01-29 13:55:39 +0100
commit: 1159329b247d6532fecb375e7008aca979261eaa (patch)
tree: 0b56caf9a03ba47c2c77d5313662ea27f198752d /src/wcwidth_update.lua
parent: dfd0d4b8ca3607ae39b1d2cbad4e3a7180dd6754 (diff)
download: luasystem-1159329b247d6532fecb375e7008aca979261eaa.tar.gz
luasystem-1159329b247d6532fecb375e7008aca979261eaa.tar.bz2
luasystem-1159329b247d6532fecb375e7008aca979261eaa.zip
1 files changed, 404 insertions, 0 deletions
diff --git a/src/wcwidth_update.lua b/src/wcwidth_update.lua
new file mode 100755
index 0000000..37f18c3
--- /dev/null
+++ b/src/wcwidth_update.lua
@@ -0,0 +1,404 @@
+#!/usr/bin/env lua
+-- This file downloads and parses unicode standard files and updates the wcwidth code
+-- based on that data.
+local VERSION="17.0.0"   -- the unicode standard version to download
+-- Pre-flight checks: curl and Penlight must be available, and the script must
+-- be started from the `./src/` directory (detected by reading `./wcwidth.c`).
+do
+  -- os.execute returns a single numeric status on Lua 5.1, and
+  -- `true|nil, "exit"|"signal", code` on Lua 5.2+. Handle both, so a failure is
+  -- detected on 5.1 and the reported exit code is a number (not "exit") on 5.2+.
+  local status, _, code = os.execute("curl --version > /dev/null 2>&1")
+  local curl_ok
+  if type(status) == "number" then
+    curl_ok, code = (status == 0), status
+  else
+    curl_ok = (status == true)
+  end
+  if not curl_ok then
+    error("curl is not available in the path; exitcode " .. tostring(code))
+  end
+  local has_penlight, utils = pcall(require, "pl.utils")
+  if not has_penlight then
+    error("Penlight is not available, please install via `luarocks install penlight`")
+  end
+  -- test the result of the read itself; testing the `ok` of the pcall above
+  -- (as was done before) can never fail at this point
+  local contents = utils.readfile("./wcwidth.c")
+  if not contents then
+    error("failed to read './wcwidth.c', run this script from within the `./src/` directory")
+  end
+end
+-- Unicode data files to fetch; the FN_* constants are the indices into the
+-- list, so the files can be looked up by name further down.
+local FN_DERIVED_GENERAL_CATEGORY, FN_EAST_ASIAN_WIDTH = 1, 2
+local FN_DERIVED_CORE_PROPERTIES, FN_EMOJI_DATA = 3, 4
+local download_file_list = {}
+download_file_list[FN_DERIVED_GENERAL_CATEGORY] = "extracted/DerivedGeneralCategory.txt"
+download_file_list[FN_EAST_ASIAN_WIDTH] = "EastAsianWidth.txt"
+download_file_list[FN_DERIVED_CORE_PROPERTIES] = "DerivedCoreProperties.txt"
+download_file_list[FN_EMOJI_DATA] = "emoji/emoji-data.txt"
+-- local directory the files are stored in (must include trailing slash)
+local target_path = "./unicode_data/"
+-- Download fresh copies of all files in `download_file_list` into `target_path`.
+do
+  local base_url = "https://www.unicode.org/Public/" .. VERSION .. "/ucd/"  -- must include trailing slash
+  -- Runs a shell command.
+  -- Normalizes the os.execute results: a numeric status on Lua 5.1, versus
+  -- `true|nil, "exit"|"signal", code` on Lua 5.2+.
+  -- @tparam string cmd the command to run
+  -- @treturn boolean true if the command succeeded
+  -- @treturn number the status (5.1) or exit/signal code (5.2+)
+  local function run(cmd)
+    local status, _, code = os.execute(cmd)
+    if type(status) == "number" then
+      return status == 0, status
+    end
+    return status == true, code
+  end
+  -- removes a file, and then downloads a new copy from the unicode site
+  local function download_file(filename, target_filename)
+    print("Downloading " .. filename .. " to " .. target_filename)
+    os.remove(target_filename)
+    local cmd = "curl --fail -s -o " .. target_filename .. " " .. base_url .. filename
+    local ok, ec = run(cmd)
+    if not ok then
+      error("Failed to execute: " .. cmd .. "; exitcode " .. tostring(ec))
+    end
+  end
+  -- Downloads all unicode files we need
+  local function download_files()
+    -- create the sub-directories first; fail early instead of letting curl
+    -- fail later on a missing directory
+    for _, subdir in ipairs({ "extracted", "emoji" }) do
+      local cmd = "mkdir -p " .. target_path .. subdir
+      local ok, ec = run(cmd)
+      if not ok then
+        error("Failed to execute: " .. cmd .. "; exitcode " .. tostring(ec))
+      end
+    end
+    for _, filename in ipairs(download_file_list) do
+      download_file(filename, target_path .. filename)
+    end
+  end
+  download_files()
+end
+-- The 3 result lists; each collects range strings ("XXXX..YYYY").
+-- Any code point not ending up in one of these is single-width.
+local zero_width, double_width, ambiguous_width = {}, {}, {}
+local readlines do
+  local utils = require("pl.utils")
+  -- Reads a file and returns its lines, without comments and empty lines.
+  -- Raises an error if the file cannot be read.
+  -- @tparam string filename the file to read
+  -- @treturn table list of the remaining lines, in file order
+  function readlines(filename)
+    print("Parsing " .. filename)
+    local raw = assert(utils.readlines(filename))
+    -- keep everything except lines starting with "#" (comments) and empty
+    -- lines (whitespace only); a single filtering pass avoids shifting the
+    -- whole tail of the list for every removed line
+    local kept = {}
+    for i = 1, #raw do
+      local line = raw[i]
+      if not (line:match("^%s*#") or line:match("^%s*$")) then
+        kept[#kept + 1] = line
+      end
+    end
+    return kept
+  end
+end
+-- Shared line parser for the downloaded Unicode data files.
+-- Each (non-comment) line is expected to look like:
+--   "<code point or range> ; <property value> ..."
+-- The value is matched as letters and underscores, which covers both the short
+-- values ("Mn", "W") and the long property names ("Default_Ignorable_Code_Point").
+-- Raises an error on a line that does not match.
+-- @tparam number file_id index into download_file_list of the file to parse
+-- @tparam function classify called as classify(range, value) for every line,
+--   where range is always in "XXXX..YYYY" form
+local function parse_ucd_file(file_id, classify)
+  local lines = readlines(target_path .. download_file_list[file_id])
+  for _, line in ipairs(lines) do
+    local range, value = line:match("^([%x%.]+)%s*;%s*([%a_]+)")
+    if not range then
+      error("Failed to parse line: " .. line)
+    end
+    if not range:find("..", 1, true) then -- single code point, make range
+      range = range .. ".." .. range
+    end
+    classify(range, value)
+  end
+end
+-- parse DerivedGeneralCategory.txt
+-- Purpose: zero-width combining marks
+-- Extract:
+--   Mn — Nonspacing Mark → width = 0
+--   Me — Enclosing Mark → width = 0
+-- Why:
+--   These characters overlay the previous glyph
+--   This replaces Markus Kuhn’s combining[] table
+-- Ignore all other categories in this file.
+do
+  local zw_start = #zero_width
+  parse_ucd_file(FN_DERIVED_GENERAL_CATEGORY, function(range, category)
+    if category == "Mn" or category == "Me" then
+      zero_width[#zero_width + 1] = range
+    end
+  end)
+  print("  found " .. (#zero_width - zw_start) .. " zero-width character-ranges")
+end
+-- parse DerivedCoreProperties.txt
+-- Purpose: zero-width format / ignorable characters
+-- Extract:
+--   Default_Ignorable_Code_Point → width = 0
+-- Includes (important examples):
+--   U+200D ZERO WIDTH JOINER
+--   U+200C ZERO WIDTH NON-JOINER
+--   U+FE00..U+FE0F (variation selectors)
+--   Bidi and other format controls
+-- Why:
+--   Not Mn/Me, but terminals treat them as zero-width
+--   Required for emoji correctness and modern text
+do
+  local zw_start = #zero_width
+  parse_ucd_file(FN_DERIVED_CORE_PROPERTIES, function(range, property)
+    if property == "Default_Ignorable_Code_Point" then
+      zero_width[#zero_width + 1] = range
+    end
+  end)
+  print("  found " .. (#zero_width - zw_start) .. " zero-width character-ranges")
+end
+-- parse EastAsianWidth.txt
+-- Purpose: determine double-width and ambiguous-width characters
+-- Extract:
+--   W (Wide) → width = 2
+--   F (Fullwidth) → width = 2
+--   A (Ambiguous) → width = 1 or 2 (your choice; usually 1 unless CJK mode)
+-- Everything else:
+--   H, Na, N → width = 1
+-- Why:
+--   - This is the only Unicode-sanctioned width-related property
+--   - Core of all wcwidth() implementations
+do
+  local dw_start = #double_width
+  local aw_start = #ambiguous_width
+  parse_ucd_file(FN_EAST_ASIAN_WIDTH, function(range, width_type)
+    if width_type == "W" or width_type == "F" then
+      double_width[#double_width + 1] = range
+    elseif width_type == "A" then
+      ambiguous_width[#ambiguous_width + 1] = range
+    end
+  end)
+  print("  found " .. (#double_width - dw_start) .. " double-width character-ranges")
+  print("  found " .. (#ambiguous_width - aw_start) .. " ambiguous-width character-ranges")
+end
+-- parse emoji-data.txt
+-- Purpose: emoji presentation width
+-- Extract:
+--   Emoji_Presentation=Yes → width = 2
+--   (Optionally) Extended_Pictographic → emoji sequences
+-- Why:
+--   Emoji are not reliably covered by EastAsianWidth
+--   Modern terminals render these as double-width
+--   Required for correct emoji column alignment
+do
+  local dw_start = #double_width
+  parse_ucd_file(FN_EMOJI_DATA, function(range, property)
+    -- exact comparison (like the other files), so a property name that merely
+    -- contains "Emoji_Presentation" can never be picked up by accident
+    if property == "Emoji_Presentation" then
+      double_width[#double_width + 1] = range
+    end
+  end)
+  print("  found " .. (#double_width - dw_start) .. " double-width character-ranges")
+end
+-- returns the start and end of a range, numerically, and as hex strings.
+-- Raises an error if the range has no "..", if either side is not a valid
+-- hex number, or if the end lies before the start.
+-- @tparam string range the range to parse, formatted as "XXXX..YYYY" (hex)
+-- @treturn number sr the start of the range
+-- @treturn number er the end of the range
+-- @treturn string sh the start of the range as a hex string
+-- @treturn string eh the end of the range as a hex string
+local parse_range do
+  function parse_range(range)
+    local s = range:find("..", 1, true)
+    if not s then
+      error("Failed to parse range: " .. range)
+    end
+    local sh = range:sub(1, s - 1)
+    local eh = range:sub(s + 2, -1)
+    local sr = tonumber(sh, 16)
+    local er = tonumber(eh, 16)
+    -- tonumber returns nil for an empty or non-hex side (e.g. "25FD.."); report
+    -- that as a parse failure instead of failing on the comparison below
+    if not sr or not er then
+      error("Failed to parse range: " .. range .. " (invalid hex value)")
+    end
+    if er < sr then
+      error("Failed to parse range: " .. range .. " (end < start)")
+    end
+    return sr, er, sh, eh
+  end
+  -- some inline tests for parse_range
+  local sr, er = parse_range("25FD..25FE")
+  assert(sr == 9725)
+  assert(er == 9726)
+  local sr, er = parse_range("105C0..105F3")
+  assert(sr == 67008)
+  assert(er == 67059)
+  assert(not pcall(parse_range, "25FD"))        -- no range separator
+  assert(not pcall(parse_range, "25FD.."))      -- missing end
+  assert(not pcall(parse_range, "25FE..25FD"))  -- end < start
+end
+-- Sorts a list of range strings in-place, ascending by the numeric start of
+-- each range.
+-- @tparam table ranges list of "XXXX..YYYY" strings
+-- @treturn table the same (now sorted) list
+local function sort_ranges(ranges)
+  local function starts_before(left, right)
+    local left_start = parse_range(left)
+    local right_start = parse_range(right)
+    return left_start < right_start
+  end
+  table.sort(ranges, starts_before)
+  return ranges
+end
+-- Merges adjacent and overlapping ranges, in-place.
+-- Expects the list to be ordered by range-start already (see the test-data
+-- below); the merged entries are compacted to the front of the list.
+local combine_ranges do
+  function combine_ranges(ranges)
+    local keep = 1  -- index of the entry we're currently merging into
+    for idx = 2, #ranges do
+      local prev_s, prev_e, prev_sh, prev_eh = parse_range(ranges[keep])
+      local cur_s, cur_e, _, cur_eh = parse_range(ranges[idx])
+      local mergeable = cur_s >= prev_s and cur_s <= (prev_e + 1)
+      if not mergeable then
+        -- gap between the two; the current range becomes the next kept entry
+        keep = keep + 1
+        ranges[keep] = ranges[idx]
+      else
+        -- adjacent or overlapping; keep whichever end reaches furthest
+        local end_hex = (prev_e > cur_e) and prev_eh or cur_eh
+        ranges[keep] = prev_sh .. ".." .. end_hex
+      end
+    end
+    -- drop the stale entries beyond the last kept one
+    for idx = #ranges, keep + 1, -1 do
+      ranges[idx] = nil
+    end
+  end
+  -- some inline tests for combine_ranges
+  local ranges = {
+    "25FD..25FE",
+    "25FD..25FE",    -- exact duplicate, must collapse into one
+    "105C0..105F3",
+    "105D0..105E0",  -- fully contained in the previous range
+    "10F00..10F10",
+    "10F11..10F20",  -- adjacent to the previous range
+    "11000..11100",
+    "11101..11110",  -- adjacent, extends the previous range
+    "12000..12010",
+    "12011..12020",  -- chain: adjacent
+    "12015..12030",  -- chain: overlaps and extends
+    "12031..12040",  -- chain: adjacent to the merged result
+  }
+  combine_ranges(ranges)
+  assert(#ranges == 5)
+  assert(ranges[1] == "25FD..25FE")
+  assert(ranges[2] == "105C0..105F3")
+  assert(ranges[3] == "10F00..10F20")
+  assert(ranges[4] == "11000..11110")
+  assert(ranges[5] == "12000..12040")
+end
+-- sort and merge each of the 3 lists
+for _, list in ipairs({ zero_width, double_width, ambiguous_width }) do
+  combine_ranges(sort_ranges(list))
+end
+-- Rewrites every range string into a c-source initializer, in-place.
+-- Example result: "{ 0x0829, 0x082D }"
+-- @tparam table ranges list of "XXXX..YYYY" strings
+local function convert_c_ranges(ranges)
+  for idx, range in ipairs(ranges) do
+    local _, _, start_hex, end_hex = parse_range(range)
+    ranges[idx] = string.format("{ 0x%s, 0x%s }", start_hex, end_hex)
+  end
+end
+for _, list in ipairs({ zero_width, double_width, ambiguous_width }) do
+  convert_c_ranges(list)
+end
+-- indentation used for every line of the generated c-source
+local SOURCE_INDENT = "    "
+-- Formats the c-source ranges as indented lines; 3 ranges on 1 line.
+-- @tparam table ranges list of strings (the c-source ranges)
+-- @treturn table list of lines; every line except the last one ends with a
+--   comma. An empty input list yields an empty list.
+local function triplet_lines(ranges)
+  local lines = {}
+  for i = 1, #ranges, 3 do
+    lines[#lines+1] = SOURCE_INDENT .. table.concat(ranges, ", ", i, math.min(i + 2, #ranges)) .. ","
+  end
+  -- drop trailing comma from last line; with no ranges there is no last line
+  -- (indexing lines[0] would raise an error)
+  if #lines > 0 then
+    lines[#lines] = lines[#lines]:sub(1, -2)
+  end
+  return lines
+end
+-- Builds the complete contents of one generated source file: a 4-line comment
+-- header, followed by the ranges (3 per line), ending with a newline.
+-- @tparam table ranges list of c-source ranges
+-- @tparam string contains description of the ranges, used in the header
+-- @treturn string the file contents
+local function create_file_contents(ranges, contains)
+  local header = {
+    "// Do not modify this file directly, it is generated by the wcwidth_update.lua script\n",
+    "// Contains " .. contains .. "\n",
+    "// Generated from Unicode " .. VERSION .. "\n",
+    "// Generated on " .. os.date("%Y-%m-%d") .. "\n",
+  }
+  local parts = {}
+  for _, text in ipairs(header) do
+    parts[#parts + 1] = SOURCE_INDENT .. text
+  end
+  parts[#parts + 1] = table.concat(triplet_lines(ranges), "\n") .. "\n"
+  return table.concat(parts)
+end
+-- Write the 3 generated source files into the current directory.
+local writefile = require("pl.utils").writefile
+print("writing source files...")
+-- per file: progress message, target filename, ranges, header description
+local outputs = {
+  { "  zero-width: ./wcwidth_zero_width.c", "./wcwidth_zero_width.c",
+    zero_width, "unicode character-ranges handled as 0 width" },
+  { "  double-width: ./wcwidth_double_width.c", "./wcwidth_double_width.c",
+    double_width, "unicode character-ranges handled as double width" },
+  { "  ambiguous-width: ./wcwidth_ambiguous_width.c", "./wcwidth_ambiguous_width.c",
+    ambiguous_width, "unicode character-ranges handled as ambiguous (either 1 or 2 width)" },
+}
+for _, entry in ipairs(outputs) do
+  local message, path, ranges, contains = entry[1], entry[2], entry[3], entry[4]
+  print(message)
+  assert(writefile(path, create_file_contents(ranges, contains)))
+end
author	Thijs Schreijer <thijs@thijsschreijer.nl>	2026-01-29 11:02:33 +0100
committer	Thijs Schreijer <thijs@thijsschreijer.nl>	2026-01-29 13:55:39 +0100
commit	1159329b247d6532fecb375e7008aca979261eaa (patch)
tree	0b56caf9a03ba47c2c77d5313662ea27f198752d /src/wcwidth_update.lua
parent	dfd0d4b8ca3607ae39b1d2cbad4e3a7180dd6754 (diff)
download	luasystem-1159329b247d6532fecb375e7008aca979261eaa.tar.gz luasystem-1159329b247d6532fecb375e7008aca979261eaa.tar.bz2 luasystem-1159329b247d6532fecb375e7008aca979261eaa.zip

diff --git a/src/wcwidth_update.lua b/src/wcwidth_update.lua new file mode 100755 index 0000000..37f18c3 --- /dev/null +++ b/src/wcwidth_update.lua
@@ -0,0 +1,404 @@
	1	#!/usr/bin/env lua
	2
	3	-- This file downloads and parses unicode standard files and updates the wcwidth code
	4	-- based on that data.
	5
	6	local VERSION="17.0.0" -- the unicode standard version to download
	7
	8
	9
	10	-- test if curl is available, and Penlight
	11	do
	12	local ok, ec = os.execute("curl --version > /dev/null 2>&1")
	13	if not ok then
	14	error("curl is not available in the path; exitcode " .. ec)
	15	end
	16
	17	local ok, utils = pcall(require, "pl.utils")
	18	if not ok then
	19	error("Penlight is not available, please install via `luarocks install penlight`")
	20	end
	21
	22	utils.readfile("./wcwidth.c")
	23	if not ok then
	24	error("failed to read './wcwidth.c', run this script from within the `./src/` directory")
	25	end
	26	end
	27
	28	-- files to download from the unicode site
	29	local FN_DERIVED_GENERAL_CATEGORY = 1
	30	local FN_EAST_ASIAN_WIDTH = 2
	31	local FN_DERIVED_CORE_PROPERTIES = 3
	32	local FN_EMOJI_DATA = 4
	33
	34	local download_file_list = {
	35	[FN_DERIVED_GENERAL_CATEGORY] = "extracted/DerivedGeneralCategory.txt",
	36	[FN_EAST_ASIAN_WIDTH] = "EastAsianWidth.txt",
	37	[FN_DERIVED_CORE_PROPERTIES] = "DerivedCoreProperties.txt",
	38	[FN_EMOJI_DATA] = "emoji/emoji-data.txt",
	39	}
	40	local target_path = "./unicode_data/"
	41
	42
	43
	44	do
	45	local base_url = "https://www.unicode.org/Public/" .. VERSION .. "/ucd/" -- must include trailing slash
	46
	47
	48	-- removes a file, and then downloads a new copy from the unicode site
	49	local function download_file(filename, target_filename)
	50	print("Downloading " .. filename .. " to " .. target_filename)
	51	os.remove(target_filename)
	52	local cmd = "curl --fail -s -o " .. target_filename .. " " .. base_url .. filename
	53	local ok, ec = os.execute(cmd)
	54	if not ok then
	55	error("Failed to execute: " .. cmd .. "; exitcode " .. ec)
	56	end
	57	end
	58
	59
	60	-- Downloads all unicode files we need
	61	local function download_files()
	62	os.execute("mkdir -p " .. target_path .. "extracted")
	63	os.execute("mkdir -p " .. target_path .. "emoji")
	64	for _, filename in ipairs(download_file_list) do
	65	download_file(filename, target_path .. filename)
	66	end
	67	end
	68
	69
	70	download_files()
	71	end
	72
	73
	74
	75	-- set up the 3 lists of data (everything else is single-width)
	76	local zero_width = {}
	77	local double_width = {}
	78	local ambiguous_width = {}
	79
	80
	81
	82	local readlines do
	83	local utils = require("pl.utils")
	84
	85	function readlines(filename)
	86	print("Parsing " .. filename)
	87	local lines = assert(utils.readlines(filename))
	88
	89	-- drop lines starting with "#" being comments, or empty lines (whitespace only)
	90	for i = #lines, 1, -1 do -- reverse, since we're deleting items
	91	if lines[i]:match("^%s*#") or lines[i]:match("^%s*$") then
	92	table.remove(lines, i)
	93	end
	94	end
	95
	96	return lines
	97	end
	98	end
	99
	100
	101
	102
	103	-- parse DerivedGeneralCategory.txt
	104	-- Purpose: zero-width combining marks
	105	-- Extract:
	106	-- Mn — Nonspacing Mark → width = 0
	107	-- Me — Enclosing Mark → width = 0
	108	-- Why:
	109	-- These characters overlay the previous glyph
	110	-- This replaces Markus Kuhn’s combining[] table
	111	-- Ignore all other categories in this file.
	112	do
	113	local lines = readlines(target_path .. download_file_list[FN_DERIVED_GENERAL_CATEGORY])
	114	local zw_start = #zero_width
	115
	116	-- parse the lines
	117	for _, line in ipairs(lines) do
	118	local range, category = line:match("^([%x%.]+)%s*;%s*(%a+)")
	119	if not range then
	120	error("Failed to parse line: " .. line)
	121	end
	122
	123	if not range:find("..", 1, true) then -- single code point, make range
	124	range = range .. ".." .. range
	125	end
	126
	127	if category == "Mn" or category == "Me" then
	128	zero_width[#zero_width + 1] = range
	129	end
	130	end
	131
	132	print(" found " .. (#zero_width - zw_start) .. " zero-width character-ranges")
	133	end
	134
	135
	136
	137	-- parse DerivedCoreProperties.txt
	138	-- Purpose: zero-width format / ignorable characters
	139	-- Extract:
	140	-- Default_Ignorable_Code_Point → width = 0
	141
	142	-- Includes (important examples):
	143	-- U+200D ZERO WIDTH JOINER
	144	-- U+200C ZERO WIDTH NON-JOINER
	145	-- U+FE00..U+FE0F (variation selectors)
	146	-- Bidi and other format controls
	147
	148	-- Why:
	149	-- Not Mn/Me, but terminals treat them as zero-width
	150	-- Required for emoji correctness and modern text
	151	do
	152	local lines = readlines(target_path .. download_file_list[FN_DERIVED_CORE_PROPERTIES])
	153	local zw_start = #zero_width
	154
	155	-- parse the lines
	156	for _, line in ipairs(lines) do
	157	local range, category = line:match("^([%x%.]+)%s*;%s*([%a_]+)")
	158	if not range then
	159	error("Failed to parse line: " .. line)
	160	end
	161
	162	if not range:find("..", 1, true) then -- single code point, make range
	163	range = range .. ".." .. range
	164	end
	165
	166	if category == "Default_Ignorable_Code_Point" then
	167	zero_width[#zero_width + 1] = range
	168	end
	169	end
	170
	171	print(" found " .. (#zero_width - zw_start) .. " zero-width character-ranges")
	172	end
	173
	174
	175
	176	-- parse EastAsianWidth.txt
	177	-- Purpose: determine double-width and ambiguous-width characters
	178	-- Extract:
	179	-- W (Wide) → width = 2
	180	-- F (Fullwidth) → width = 2
	181	-- A (Ambiguous) → width = 1 or 2 (your choice; usually 1 unless CJK mode)
	182	-- Everything else:
	183	-- H, Na, N → width = 1
	184	-- Why:
	185	-- - This is the only Unicode-sanctioned width-related property
	186	-- - Core of all wcwidth() implementations
	187	do
	188	local lines = readlines(target_path .. download_file_list[FN_EAST_ASIAN_WIDTH])
	189	local dw_start = #double_width
	190	local aw_start = #ambiguous_width
	191
	192	-- parse the lines
	193	for _, line in ipairs(lines) do
	194	local range, width_type = line:match("^([%x%.]+)%s*;%s*(%a+)")
	195	if not range then
	196	error("Failed to parse line: " .. line)
	197	end
	198
	199	if not range:find("..", 1, true) then -- single code point, make range
	200	range = range .. ".." .. range
	201	end
	202
	203	if width_type == "W" or width_type == "F" then
	204	double_width[#double_width + 1] = range
	205	elseif width_type == "A" then
	206	ambiguous_width[#ambiguous_width + 1] = range
	207	end
	208	end
	209
	210	print(" found " .. (#double_width - dw_start) .. " double-width character-ranges")
	211	print(" found " .. (#ambiguous_width - aw_start) .. " ambiguous-width character-ranges")
	212	end
	213
	214
	215
	216	-- parse emoji-data.txt
	217	-- Purpose: emoji presentation width
	218	-- Extract:
	219	-- Emoji_Presentation=Yes → width = 2
	220	-- (Optionally) Extended_Pictographic → emoji sequences
	221	-- Why:
	222	-- Emoji are not reliably covered by EastAsianWidth
	223	-- Modern terminals render these as double-width
	224	-- Required for correct emoji column alignment
	225	do
	226	local lines = readlines(target_path .. download_file_list[FN_EMOJI_DATA])
	227	local dw_start = #double_width
	228
	229	-- parse the lines
	230	for _, line in ipairs(lines) do
	231	local range, properties = line:match("^([%x%.]+)%s*;%s*([%a_]+)")
	232	if not range then
	233	error("Failed to parse line: " .. line)
	234	end
	235
	236	if not range:find("..", 1, true) then -- single code point, make range
	237	range = range .. ".." .. range
	238	end
	239
	240	if properties:match("Emoji_Presentation") then
	241	double_width[#double_width + 1] = range
	242	end
	243	end
	244
	245	print(" found " .. (#double_width - dw_start) .. " double-width character-ranges")
	246	end
	247
	248
	249
	250	-- returns the start and end of a range, numerically, and hex strings
	251	-- @tparam string range the range to parse
	252	-- @treturn number sr the start of the range
	253	-- @treturn number er the end of the range
	254	-- @treturn string sh the start of the range as a hex string
	255	-- @treturn string eh the end of the range as a hex string
	256	local parse_range do
	257	function parse_range(range)
	258	local s = range:find("..", 1, true)
	259	if not s then
	260	error("Failed to parse range: " .. range)
	261	end
	262	local sh = range:sub(1, s - 1)
	263	local eh = range:sub(s + 2, -1)
	264	local sr = tonumber(sh, 16)
	265	local er = tonumber(eh, 16)
	266	if er < sr then
	267	error("Failed to parse range: " .. range .. " (end < start)")
	268	end
	269	return sr, er, sh, eh
	270	end
	271
	272	-- some inline tests for parse_range
	273	local sr, er = parse_range("25FD..25FE")
	274	assert(sr == 9725)
	275	assert(er == 9726)
	276	local sr, er = parse_range("105C0..105F3")
	277	assert(sr == 67008)
	278	assert(er == 67059)
	279	end
	280
	281
	282
	283	-- sorts the ranges in-place
	284	local function sort_ranges(ranges)
	285	table.sort(ranges, function(a, b)
	286	return parse_range(a) < parse_range(b)
	287	end)
	288	return ranges
	289	end
	290
	291
	292
	293	-- combines adjacent ranges in-place
	294	local combine_ranges do
	295	function combine_ranges(ranges)
	296	local last_idx = 1
	297	for i = 2, #ranges do
	298	local last_s, last_e, last_sh, last_eh = parse_range(ranges[last_idx])
	299	local current_s, current_e, _, current_eh = parse_range(ranges[i])
	300	if current_s >= last_s and current_s <= (last_e + 1) then
	301	-- ranges are adjacent or overlapping, combine them
	302	local sh = last_sh
	303	local eh = current_eh
	304	if last_e > current_e then
	305	eh = last_eh
	306	end
	307	ranges[last_idx] = sh .. ".." .. eh
	308	else
	309	last_idx = last_idx + 1
	310	ranges[last_idx] = ranges[i]
	311	end
	312	end
	313	-- clear left-overs beyond last entry
	314	for i = last_idx + 1, #ranges do
	315	ranges[i] = nil
	316	end
	317	end
	318
	319	-- some inline tests for combine_ranges
	320	local ranges = {
	321	"25FD..25FE",
	322	"25FD..25FE", -- duplicate range, should be removed
	323	"105C0..105F3",
	324	"105D0..105E0", -- range fully within previous range, should be combined
	325	"10F00..10F10",
	326	"10F11..10F20", -- adjacent or previous, should be combined
	327	"11000..11100",
	328	"11101..11110", -- adjacent + extending to previous, should be combined
	329	"12000..12010",
	330	"12011..12020", -- multiple: adjacent should be combined
	331	"12015..12030", -- multiple: overlap + extending to previous, should be combined
	332	"12031..12040", -- multiple: overlapping, should be combined
	333	}
	334	combine_ranges(ranges)
	335	assert(#ranges == 5)
	336	assert(ranges[1] == "25FD..25FE")
	337	assert(ranges[2] == "105C0..105F3")
	338	assert(ranges[3] == "10F00..10F20")
	339	assert(ranges[4] == "11000..11110")
	340	assert(ranges[5] == "12000..12040")
	341	end
	342
	343
	344
	345	combine_ranges(sort_ranges(zero_width))
	346	combine_ranges(sort_ranges(double_width))
	347	combine_ranges(sort_ranges(ambiguous_width))
	348
	349
	350
	351	-- convert ranges into c-source-code ranges (in-place)
	352	-- format: "{ 0x0829, 0x082D }"
	353	local function convert_c_ranges(ranges)
	354	for i = 1, #ranges do
	355	local _, _, sh, eh = parse_range(ranges[i])
	356	ranges[i] = "{ 0x" .. sh .. ", 0x" .. eh .. " }"
	357	end
	358	end
	359
	360	convert_c_ranges(zero_width)
	361	convert_c_ranges(double_width)
	362	convert_c_ranges(ambiguous_width)
	363
	364
	365
	366	local SOURCE_INDENT = "    "
	367
	368
	369	-- write c source, as triplet; 3 ranges on 1 line
	370	local function triplet_lines(ranges)
	371	local lines = {}
	372	for i = 1, #ranges, 3 do
	373	lines[#lines+1] = SOURCE_INDENT .. table.concat(ranges, ", ", i, math.min(i + 2, #ranges)) .. ","
	374	end
	375	-- drop trailing comma from last line
	376	lines[#lines] = lines[#lines]:sub(1, -2)
	377	return lines
	378	end
	379
	380
	381	-- create file-contents
	382	local function create_file_contents(ranges, contains)
	383	return
	384	SOURCE_INDENT .. "// Do not modify this file directly, it is generated by the wcwidth_update.lua script\n" ..
	385	SOURCE_INDENT .. "// Contains " .. contains .. "\n" ..
	386	SOURCE_INDENT .. "// Generated from Unicode " .. VERSION .. "\n" ..
	387	SOURCE_INDENT .. "// Generated on " .. os.date("%Y-%m-%d") .. "\n" ..
	388	table.concat(triplet_lines(ranges), "\n") .. "\n"
	389	end
	390
	391
	392
	393
	394	local writefile = require("pl.utils").writefile
	395
	396	print("writing source files...")
	397	print(" zero-width: ./wcwidth_zero_width.c")
	398	assert(writefile("./wcwidth_zero_width.c", create_file_contents(zero_width, "unicode character-ranges handled as 0 width")))
	399
	400	print(" double-width: ./wcwidth_double_width.c")
	401	assert(writefile("./wcwidth_double_width.c", create_file_contents(double_width, "unicode character-ranges handled as double width")))
	402
	403	print(" ambiguous-width: ./wcwidth_ambiguous_width.c")
	404	assert(writefile("./wcwidth_ambiguous_width.c", create_file_contents(ambiguous_width, "unicode character-ranges handled as ambiguous (either 1 or 2 width)")))