#!/usr/bin/env lua

-- This script downloads and parses data files of the Unicode standard and regenerates
-- the wcwidth range tables (the `wcwidth_*_width.c` files) from that data.

local VERSION="17.0.0"   -- the unicode standard version to download


-- Verify the prerequisites up front: curl, Penlight, and the working directory.
do
  -- Runs a shell command and returns `success, exitcode`.
  -- os.execute returns a bare status number on Lua 5.1 (and LuaJIT without
  -- 5.2 compatibility), but `true|nil, "exit"|"signal", code` on Lua 5.2+,
  -- so both shapes are normalized here.
  local function run(cmd)
    local first, _, code = os.execute(cmd)
    if type(first) == "number" then
      return first == 0, first
    end
    return first == true, code
  end

  local curl_ok, ec = run("curl --version > /dev/null 2>&1")
  if not curl_ok then
    error("curl is not available in the path; exitcode " .. tostring(ec))
  end

  local pl_ok, utils = pcall(require, "pl.utils")
  if not pl_ok then
    error("Penlight is not available, please install via `luarocks install penlight`")
  end

  -- The result of readfile must be checked itself; Penlight's readfile
  -- returns nil (plus a message) when the file cannot be read.
  local contents = utils.readfile("./wcwidth.c")
  if not contents then
    error("failed to read './wcwidth.c', run this script from within the `./src/` directory")
  end
end

-- Files to download from the unicode site.
-- The FN_* constants are the indices into `download_file_list`; they must stay
-- contiguous starting at 1, because the download loop walks the list with `ipairs`.
local FN_DERIVED_GENERAL_CATEGORY = 1
local FN_EAST_ASIAN_WIDTH = 2
local FN_DERIVED_CORE_PROPERTIES = 3
local FN_EMOJI_DATA = 4

-- Paths are relative to the download base URL, and are re-used as the relative
-- path below `target_path` for the local copy.
local download_file_list = {
  [FN_DERIVED_GENERAL_CATEGORY] = "extracted/DerivedGeneralCategory.txt",
  [FN_EAST_ASIAN_WIDTH]         = "EastAsianWidth.txt",
  [FN_DERIVED_CORE_PROPERTIES]  = "DerivedCoreProperties.txt",
  [FN_EMOJI_DATA]               = "emoji/emoji-data.txt",
}
-- Local directory receiving the downloads; must include the trailing slash,
-- since filenames are appended to it directly.
local target_path = "./unicode_data/"


-- Download a fresh copy of every file in `download_file_list` into `target_path`.
do
  local base_url = "https://www.unicode.org/Public/" .. VERSION .. "/ucd/"  -- must include trailing slash


  -- Runs a shell command and returns `success, exitcode`.
  -- os.execute returns a bare status number on Lua 5.1 (and LuaJIT without
  -- 5.2 compatibility), but `true|nil, "exit"|"signal", code` on Lua 5.2+,
  -- so both shapes are normalized here.
  local function run(cmd)
    local first, _, code = os.execute(cmd)
    if type(first) == "number" then
      return first == 0, first
    end
    return first == true, code
  end


  -- Runs a shell command and raises an error (including the exit code) if it fails.
  local function run_or_fail(cmd)
    local ok, ec = run(cmd)
    if not ok then
      error("Failed to execute: " .. cmd .. "; exitcode " .. tostring(ec))
    end
  end


  -- removes a file, and then downloads a new copy from the unicode site
  -- @tparam string filename path of the file, relative to `base_url`
  -- @tparam string target_filename local path to store the download at
  local function download_file(filename, target_filename)
    print("Downloading " .. filename .. " to " .. target_filename)
    -- result deliberately ignored: the file does not exist on a first run
    os.remove(target_filename)
    run_or_fail("curl --fail -s -o " .. target_filename .. " " .. base_url .. filename)
  end


  -- Downloads all unicode files we need
  local function download_files()
    -- the sub-directories must exist before curl can write into them
    run_or_fail("mkdir -p " .. target_path .. "extracted")
    run_or_fail("mkdir -p " .. target_path .. "emoji")
    for _, filename in ipairs(download_file_list) do
      download_file(filename, target_path .. filename)
    end
  end


  download_files()
end


-- The three lists of character ranges collected by this script; any code point
-- that ends up in none of them is single-width.
local zero_width, double_width, ambiguous_width = {}, {}, {}


local readlines do
  local utils = require("pl.utils")

  --- Reads a data file and returns only its data lines.
  -- Comment lines (first non-blank character is "#") and empty or
  -- whitespace-only lines are dropped; all other lines are returned unchanged
  -- and in file order. Raises an error if the file cannot be read.
  -- @tparam string filename path of the file to read
  -- @treturn table array of the remaining lines
  function readlines(filename)
    print("Parsing " .. filename)
    local raw = assert(utils.readlines(filename))

    -- Collect the lines to keep in a single pass; removing entries from the
    -- original array one by one shifts the tail on every removal.
    local data = {}
    for i = 1, #raw do
      local line = raw[i]
      if not (line:match("^%s*#") or line:match("^%s*$")) then
        data[#data + 1] = line
      end
    end

    return data
  end
end


-- parse DerivedGeneralCategory.txt
-- Purpose: zero-width combining marks
-- Extract:
--   Mn — Nonspacing Mark → width = 0
--   Me — Enclosing Mark → width = 0
-- Why:
--   These characters overlay the previous glyph
--   This replaces Markus Kuhn’s combining[] table
-- Ignore all other categories in this file.
do
  local entries = readlines(target_path .. download_file_list[FN_DERIVED_GENERAL_CATEGORY])
  local added = 0

  -- the general categories that are collected as zero-width
  local wanted = { Mn = true, Me = true }

  for idx = 1, #entries do
    local entry = entries[idx]
    -- each data line reads "<code point or range> ; <category> ..."
    local span, general_category = string.match(entry, "^([%x%.]+)%s*;%s*(%a+)")
    if span == nil then
      error("Failed to parse line: " .. entry)
    end

    if wanted[general_category] then
      -- a single code point is stored as a range covering only itself
      if string.find(span, "..", 1, true) == nil then
        span = span .. ".." .. span
      end
      table.insert(zero_width, span)
      added = added + 1
    end
  end

  print("  found " .. added .. " zero-width character-ranges")
end


-- parse DerivedCoreProperties.txt
-- Purpose: zero-width format / ignorable characters
-- Extract:
--   Default_Ignorable_Code_Point → width = 0

-- Includes (important examples):
--   U+200D ZERO WIDTH JOINER
--   U+200C ZERO WIDTH NON-JOINER
--   U+FE00..U+FE0F (variation selectors)
--   Bidi and other format controls

-- Why:
--   Not Mn/Me, but terminals treat them as zero-width
--   Required for emoji correctness and modern text
do
  local entries = readlines(target_path .. download_file_list[FN_DERIVED_CORE_PROPERTIES])
  local added = 0

  for idx = 1, #entries do
    local entry = entries[idx]
    -- each data line reads "<code point or range> ; <property> ..."
    local span, property = string.match(entry, "^([%x%.]+)%s*;%s*([%a_]+)")
    if span == nil then
      error("Failed to parse line: " .. entry)
    end

    if property == "Default_Ignorable_Code_Point" then
      -- a single code point is stored as a range covering only itself
      if string.find(span, "..", 1, true) == nil then
        span = span .. ".." .. span
      end
      table.insert(zero_width, span)
      added = added + 1
    end
  end

  print("  found " .. added .. " zero-width character-ranges")
end


-- parse EastAsianWidth.txt
-- Purpose: determine double-width and ambiguous-width characters
-- Extract:
--   W (Wide) → width = 2
--   F (Fullwidth) → width = 2
--   A (Ambiguous) → width = 1 or 2 (your choice; usually 1 unless CJK mode)
-- Everything else:
--   H, Na, N → width = 1
-- Why:
--   - This is the only Unicode-sanctioned width-related property
--   - Core of all wcwidth() implementations
do
  local entries = readlines(target_path .. download_file_list[FN_EAST_ASIAN_WIDTH])
  local wide_before, ambiguous_before = #double_width, #ambiguous_width

  -- width type -> list collecting its ranges; types not listed here are skipped
  local destination = {
    W = double_width,
    F = double_width,
    A = ambiguous_width,
  }

  for idx = 1, #entries do
    local entry = entries[idx]
    -- each data line reads "<code point or range> ; <width type> ..."
    local span, width_type = entry:match("^([%x%.]+)%s*;%s*(%a+)")
    if span == nil then
      error("Failed to parse line: " .. entry)
    end

    local list = destination[width_type]
    if list ~= nil then
      -- a single code point is stored as a range covering only itself
      if span:find("..", 1, true) == nil then
        span = span .. ".." .. span
      end
      list[#list + 1] = span
    end
  end

  print("  found " .. (#double_width - wide_before) .. " double-width character-ranges")
  print("  found " .. (#ambiguous_width - ambiguous_before) .. " ambiguous-width character-ranges")
end


-- parse emoji-data.txt
-- Purpose: emoji presentation width
-- Extract:
--   Emoji_Presentation=Yes → width = 2
--   (Optionally) Extended_Pictographic → emoji sequences
-- Why:
--   Emoji are not reliably covered by EastAsianWidth
--   Modern terminals render these as double-width
--   Required for correct emoji column alignment
do
  local entries = readlines(target_path .. download_file_list[FN_EMOJI_DATA])
  local count_before = #double_width

  for idx = 1, #entries do
    local entry = entries[idx]
    -- each data line reads "<code point or range> ; <property> ..."
    local span, property = string.match(entry, "^([%x%.]+)%s*;%s*([%a_]+)")
    if span == nil then
      error("Failed to parse line: " .. entry)
    end

    -- plain-text search: true whenever the property name contains "Emoji_Presentation"
    if string.find(property, "Emoji_Presentation", 1, true) then
      -- a single code point is stored as a range covering only itself
      if string.find(span, "..", 1, true) == nil then
        span = span .. ".." .. span
      end
      double_width[#double_width + 1] = span
    end
  end

  print("  found " .. (#double_width - count_before) .. " double-width character-ranges")
end


-- returns the start and end of a range, numerically, and as hex strings.
-- Raises an error if the ".." separator is missing, if either bound is not a
-- valid hexadecimal number, or if the end lies before the start.
-- @tparam string range the range to parse, formatted as "START..END" (hexadecimal)
-- @treturn number sr the start of the range
-- @treturn number er the end of the range
-- @treturn string sh the start of the range as a hex string
-- @treturn string eh the end of the range as a hex string
local parse_range do
  function parse_range(range)
    local s = range:find("..", 1, true)  -- plain search; "." is not a pattern here
    if not s then
      error("Failed to parse range: " .. range)
    end
    local sh = range:sub(1, s - 1)
    local eh = range:sub(s + 2, -1)
    local sr = tonumber(sh, 16)
    local er = tonumber(eh, 16)
    -- tonumber returns nil for anything that is not a hex number; report that
    -- as a parse failure instead of failing on the comparison below
    if not sr or not er then
      error("Failed to parse range: " .. range .. " (invalid hexadecimal code point)")
    end
    if er < sr then
      error("Failed to parse range: " .. range .. " (end < start)")
    end
    return sr, er, sh, eh
  end

  -- some inline tests for parse_range
  do
    local sr, er = parse_range("25FD..25FE")
    assert(sr == 9725)
    assert(er == 9726)
  end
  do
    local sr, er = parse_range("105C0..105F3")
    assert(sr == 67008)
    assert(er == 67059)
  end
end


-- Orders the ranges in-place by ascending start code point.
-- @tparam table ranges array of "START..END" range strings
-- @treturn table the same `ranges` table, to allow chaining
local function sort_ranges(ranges)
  local function starts_before(left, right)
    -- only the first return value (the numeric start) takes part in the ordering
    local left_start = parse_range(left)
    local right_start = parse_range(right)
    return left_start < right_start
  end

  table.sort(ranges, starts_before)
  return ranges
end


-- Merges overlapping and directly adjacent ranges in-place.
-- Expects `ranges` to be ordered by start code point already; the array is
-- shortened to the merged entries.
-- @tparam table ranges array of "START..END" range strings
local combine_ranges do
  function combine_ranges(ranges)
    local count = #ranges
    local keep = 1  -- index of the merged range that is currently being extended

    for idx = 2, count do
      local prev_lo, prev_hi, prev_lo_hex, prev_hi_hex = parse_range(ranges[keep])
      local lo, hi, _, hi_hex = parse_range(ranges[idx])

      local touches = lo >= prev_lo and lo <= (prev_hi + 1)
      if not touches then
        -- there is a gap: start a new merged range at the next free slot
        keep = keep + 1
        ranges[keep] = ranges[idx]
      else
        -- overlapping or adjacent: keep the start, take whichever end reaches further
        local end_hex = hi_hex
        if prev_hi > hi then
          end_hex = prev_hi_hex
        end
        ranges[keep] = prev_lo_hex .. ".." .. end_hex
      end
    end

    -- drop the stale entries behind the last merged range
    for idx = count, keep + 1, -1 do
      ranges[idx] = nil
    end
  end

  -- some inline tests for combine_ranges
  local input = {
    "25FD..25FE",
    "25FD..25FE",  -- duplicate range, should be removed
    "105C0..105F3",
    "105D0..105E0",  -- range fully within previous range, should be combined
    "10F00..10F10",
    "10F11..10F20",  -- adjacent to previous, should be combined
    "11000..11100",
    "11101..11110",  -- adjacent + extending the previous, should be combined
    "12000..12010",
    "12011..12020",  -- multiple: adjacent, should be combined
    "12015..12030",  -- multiple: overlap + extending the previous, should be combined
    "12031..12040",  -- multiple: adjacent to the merged range, should be combined
  }
  local expected = {
    "25FD..25FE",
    "105C0..105F3",
    "10F00..10F20",
    "11000..11110",
    "12000..12040",
  }
  combine_ranges(input)
  assert(#input == #expected)
  for idx = 1, #expected do
    assert(input[idx] == expected[idx])
  end
end


-- sort, then merge, each of the three collected lists (in-place)
for _, list in ipairs({ zero_width, double_width, ambiguous_width }) do
  combine_ranges(sort_ranges(list))
end


-- Rewrites every "START..END" entry in-place as a C initializer pair,
-- for example: "{ 0x0829, 0x082D }"
-- @tparam table ranges array of "START..END" range strings
local function convert_c_ranges(ranges)
  for idx, entry in ipairs(ranges) do
    -- only the hex strings (3rd and 4th return value) are needed here
    local start_hex, end_hex = select(3, parse_range(entry))
    ranges[idx] = table.concat({ "{ 0x", start_hex, ", 0x", end_hex, " }" })
  end
end

-- turn the entries of all three lists into C initializer pairs (in-place)
for _, list in ipairs({ zero_width, double_width, ambiguous_width }) do
  convert_c_ranges(list)
end


-- indentation put in front of every generated source line
local SOURCE_INDENT = "    "


-- write c source, as triplet; 3 ranges on 1 line
-- Every line is indented with SOURCE_INDENT and ends in a comma, except the
-- last one. An empty `ranges` yields an empty list of lines.
-- @tparam table ranges array of already formatted range strings
-- @treturn table array of source lines
local function triplet_lines(ranges)
  local lines = {}
  local total = #ranges
  for i = 1, total, 3 do
    lines[#lines+1] = SOURCE_INDENT .. table.concat(ranges, ", ", i, math.min(i + 2, total)) .. ","
  end
  -- drop trailing comma from last line; without any line there is nothing to strip
  if #lines > 0 then
    lines[#lines] = lines[#lines]:sub(1, -2)
  end
  return lines
end


-- Builds the complete text of one generated source file: four comment lines
-- (notice, description, Unicode version, generation date) followed by the
-- ranges, three per line.
-- @tparam table ranges array of already formatted range strings
-- @tparam string contains description of the ranges, used in the comment header
-- @treturn string the file contents, ending in a newline
local function create_file_contents(ranges, contains)
  local header = {
    "// Do not modify this file directly, it is generated by the wcwidth_update.lua script",
    "// Contains " .. contains,
    "// Generated from Unicode " .. VERSION,
    "// Generated on " .. os.date("%Y-%m-%d"),
  }

  local parts = {}
  for idx, text in ipairs(header) do
    parts[idx] = SOURCE_INDENT .. text .. "\n"
  end
  parts[#parts + 1] = table.concat(triplet_lines(ranges), "\n") .. "\n"

  return table.concat(parts)
end


local writefile = require("pl.utils").writefile

-- Prints the progress line, then renders and writes one generated file;
-- the assert aborts the script if writing fails.
local function write_ranges_file(announcement, path, ranges, contains)
  print(announcement)
  assert(writefile(path, create_file_contents(ranges, contains)))
end

print("writing source files...")
write_ranges_file("  zero-width: ./wcwidth_zero_width.c", "./wcwidth_zero_width.c",
  zero_width, "unicode character-ranges handled as 0 width")

write_ranges_file("  double-width: ./wcwidth_double_width.c", "./wcwidth_double_width.c",
  double_width, "unicode character-ranges handled as double width")

write_ranges_file("  ambiguous-width: ./wcwidth_ambiguous_width.c", "./wcwidth_ambiguous_width.c",
  ambiguous_width, "unicode character-ranges handled as ambiguous (either 1 or 2 width)")