diff options
| author | Thijs Schreijer <thijs@thijsschreijer.nl> | 2025-01-26 20:15:39 +0100 |
|---|---|---|
| committer | Thijs Schreijer <thijs@thijsschreijer.nl> | 2025-01-26 20:15:39 +0100 |
| commit | f697ea7e7603e916d5ee13327fcdaf9c811a00aa (patch) | |
| tree | e044dec90ee1cc497ac0cf1d9892aee11089a3e5 | |
| parent | 85ad15fbd8c81807a1a662f5b6060641fa3a6357 (diff) | |
| download | luasystem-unicode/ambiguous.tar.gz luasystem-unicode/ambiguous.tar.bz2 luasystem-unicode/ambiguous.zip | |
feat(terminal): check ambiguous unicode width (branch: unicode/ambiguous)
| -rw-r--r-- | luasystem-scm-0.rockspec | 1 | ||||
| -rw-r--r-- | spec/04-term_spec.lua | 6 | ||||
| -rw-r--r-- | src/term.c | 11 | ||||
| -rw-r--r-- | src/wcwidtha.c | 259 | ||||
| -rw-r--r-- | src/wcwidtha.h | 12 |
5 files changed, 288 insertions, 1 deletions
diff --git a/luasystem-scm-0.rockspec b/luasystem-scm-0.rockspec index 00a442c..ab83080 100644 --- a/luasystem-scm-0.rockspec +++ b/luasystem-scm-0.rockspec | |||
| @@ -61,6 +61,7 @@ local function make_platform(plat) | |||
| 61 | 'src/term.c', | 61 | 'src/term.c', |
| 62 | 'src/bitflags.c', | 62 | 'src/bitflags.c', |
| 63 | 'src/wcwidth.c', | 63 | 'src/wcwidth.c', |
| 64 | 'src/wcwidtha.c', | ||
| 64 | }, | 65 | }, |
| 65 | defines = defines[plat], | 66 | defines = defines[plat], |
| 66 | libraries = libraries[plat], | 67 | libraries = libraries[plat], |
diff --git a/spec/04-term_spec.lua b/spec/04-term_spec.lua index 813947a..711059b 100644 --- a/spec/04-term_spec.lua +++ b/spec/04-term_spec.lua | |||
| @@ -539,6 +539,12 @@ describe("Terminal:", function() | |||
| 539 | assert.same({2}, {system.utf8cwidth(ch2 .. ch3 .. ch4)}) | 539 | assert.same({2}, {system.utf8cwidth(ch2 .. ch3 .. ch4)}) |
| 540 | end) | 540 | end) |
| 541 | 541 | ||
| 542 | it("returns 2nd ambigious boolean value only if requested", function() | ||
| 543 | assert.same({1}, {system.utf8cwidth("¡", false)}) | ||
| 544 | assert.same({1, true}, {system.utf8cwidth("¡", true)}) | ||
| 545 | assert.same({1, false}, {system.utf8cwidth("a", true)}) | ||
| 546 | end) | ||
| 547 | |||
| 542 | end) | 548 | end) |
| 543 | 549 | ||
| 544 | 550 | ||
diff --git a/src/term.c b/src/term.c --- a/src/term.c +++ b/src/term.c | |||
| @@ -36,6 +36,7 @@ | |||
| 36 | // Windows does not have a wcwidth function, so we use compatibility code from | 36 | // Windows does not have a wcwidth function, so we use compatibility code from |
| 37 | // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c by Markus Kuhn | 37 | // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c by Markus Kuhn |
| 38 | #include "wcwidth.h" | 38 | #include "wcwidth.h" |
| 39 | #include "wcwidtha.h" // ambiguous width checks for East Asian characters | ||
| 39 | 40 | ||
| 40 | 41 | ||
| 41 | #ifdef _WIN32 | 42 | #ifdef _WIN32 |
| @@ -950,14 +951,16 @@ int utf8_to_wchar(const char *utf8, size_t len, mk_wchar_t *codepoint) { | |||
| 950 | Get the width of a utf8 character for terminal display. | 951 | Get the width of a utf8 character for terminal display. |
| 951 | @function utf8cwidth | 952 | @function utf8cwidth |
| 952 | @tparam string utf8_char the utf8 character to check, only the width of the first character will be returned | 953 | @tparam string utf8_char the utf8 character to check, only the width of the first character will be returned |
| 954 | @tparam bool ambiguous if `true`, a second value is returned: a boolean indicating whether the character's width is ambiguous | ||
| 953 | @treturn[1] int the display width in columns of the first character in the string (0 for an empty string) | 955 | @treturn[1] int the display width in columns of the first character in the string (0 for an empty string) |
| 954 | @treturn[2] nil | 956 | @treturn[2] nil|bool if `ambiguous` is `true`, a boolean indicating if the character is ambiguous |
| 955 | @treturn[2] string error message | 957 | @treturn[2] string error message |
| 956 | */ | 958 | */ |
| 957 | int lst_utf8cwidth(lua_State *L) { | 959 | int lst_utf8cwidth(lua_State *L) { |
| 958 | const char *utf8_char; | 960 | const char *utf8_char; |
| 959 | size_t utf8_len; | 961 | size_t utf8_len; |
| 960 | utf8_char = luaL_checklstring(L, 1, &utf8_len); | 962 | utf8_char = luaL_checklstring(L, 1, &utf8_len); |
| 963 | int ambiguous = lua_toboolean(L, 2); | ||
| 961 | int width = 0; | 964 | int width = 0; |
| 962 | 965 | ||
| 963 | mk_wchar_t wc; | 966 | mk_wchar_t wc; |
| @@ -984,6 +987,12 @@ int lst_utf8cwidth(lua_State *L) { | |||
| 984 | } | 987 | } |
| 985 | 988 | ||
| 986 | lua_pushinteger(L, width); | 989 | lua_pushinteger(L, width); |
| 990 | |||
| 991 | if (ambiguous) { | ||
| 992 | // also check if the width is ambiguous | ||
| 993 | lua_pushboolean(L, mk_wcwidth_a(wc)); | ||
| 994 | return 2; | ||
| 995 | } | ||
| 987 | return 1; | 996 | return 1; |
| 988 | } | 997 | } |
| 989 | 998 | ||
diff --git a/src/wcwidtha.c b/src/wcwidtha.c new file mode 100644 index 0000000..2936ee0 --- /dev/null +++ b/src/wcwidtha.c | |||
| @@ -0,0 +1,259 @@ | |||
| 1 | // To update this file to the latest version of the Unicode standard | ||
| 2 | // save the Lua script below to a file named 'getranges.lua' | ||
| 3 | // execute as: | ||
| 4 | // curl -s https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt | lua getranges.lua | ||
| 5 | |||
| 6 | /* the script: | ||
| 7 | local function singleline(line) | ||
| 8 | if line:find("; A", 1, true) then -- handle ambiguous characters only | ||
| 9 | local s,e = line:match("^([0-9a-fA-F]+)%.?%.?([0-9a-fA-F]*)") | ||
| 10 | e = (e == "" and s) or e -- single char, so end-range == start-range | ||
| 11 | local cmmnt = "// "..line:match("(; A.*)$") | ||
| 12 | local range = " {0x"..s..", 0x"..e.."}," | ||
| 13 | print(range..(" "):rep(30-#range)..cmmnt) -- print formatted output line | ||
| 14 | end | ||
| 15 | end | ||
| 16 | |||
| 17 | -- read all lines from stdin and iterate over them | ||
| 18 | local t = {} | ||
| 19 | for line in io.lines() do | ||
| 20 | line = line:match("^%s*(.-)%s*$") -- strip whitespace | ||
| 21 | if line ~= "" and line:sub(1,1) ~= "#" then -- skip comments and empty lines | ||
| 22 | singleline(line) | ||
| 23 | end | ||
| 24 | end | ||
| 25 | */ | ||
| 26 | |||
| 27 | |||
| 28 | #include "wcwidtha.h" | ||
| 29 | |||
| 30 | struct interval { | ||
| 31 | mk_wchar_t start; | ||
| 32 | mk_wchar_t end; | ||
| 33 | }; | ||
| 34 | |||
| 35 | |||
| 36 | // Takes a Unicode code point and returns 1 if it is in the list of | ||
| 37 | // ambiguous-width characters, or 0 otherwise. | ||
| 38 | int mk_wcwidth_a(mk_wchar_t ucs) | ||
| 39 | { | ||
| 40 | /* sorted list of ambiguous width characters in East Asian displays */ | ||
| 41 | /* generated by script in the comments above */ | ||
| 42 | static const struct interval ranges[] = { | ||
| 43 | {0x00A1, 0x00A1}, // ; A # Po INVERTED EXCLAMATION MARK | ||
| 44 | {0x00A4, 0x00A4}, // ; A # Sc CURRENCY SIGN | ||
| 45 | {0x00A7, 0x00A7}, // ; A # Po SECTION SIGN | ||
| 46 | {0x00A8, 0x00A8}, // ; A # Sk DIAERESIS | ||
| 47 | {0x00AA, 0x00AA}, // ; A # Lo FEMININE ORDINAL INDICATOR | ||
| 48 | {0x00AD, 0x00AD}, // ; A # Cf SOFT HYPHEN | ||
| 49 | {0x00AE, 0x00AE}, // ; A # So REGISTERED SIGN | ||
| 50 | {0x00B0, 0x00B0}, // ; A # So DEGREE SIGN | ||
| 51 | {0x00B1, 0x00B1}, // ; A # Sm PLUS-MINUS SIGN | ||
| 52 | {0x00B2, 0x00B3}, // ; A # No [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE | ||
| 53 | {0x00B4, 0x00B4}, // ; A # Sk ACUTE ACCENT | ||
| 54 | {0x00B6, 0x00B7}, // ; A # Po [2] PILCROW SIGN..MIDDLE DOT | ||
| 55 | {0x00B8, 0x00B8}, // ; A # Sk CEDILLA | ||
| 56 | {0x00B9, 0x00B9}, // ; A # No SUPERSCRIPT ONE | ||
| 57 | {0x00BA, 0x00BA}, // ; A # Lo MASCULINE ORDINAL INDICATOR | ||
| 58 | {0x00BC, 0x00BE}, // ; A # No [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS | ||
| 59 | {0x00BF, 0x00BF}, // ; A # Po INVERTED QUESTION MARK | ||
| 60 | {0x00C6, 0x00C6}, // ; A # Lu LATIN CAPITAL LETTER AE | ||
| 61 | {0x00D0, 0x00D0}, // ; A # Lu LATIN CAPITAL LETTER ETH | ||
| 62 | {0x00D7, 0x00D7}, // ; A # Sm MULTIPLICATION SIGN | ||
| 63 | {0x00D8, 0x00D8}, // ; A # Lu LATIN CAPITAL LETTER O WITH STROKE | ||
| 64 | {0x00DE, 0x00E1}, // ; A # L& [4] LATIN CAPITAL LETTER THORN..LATIN SMALL LETTER A WITH ACUTE | ||
| 65 | {0x00E6, 0x00E6}, // ; A # Ll LATIN SMALL LETTER AE | ||
| 66 | {0x00E8, 0x00EA}, // ; A # Ll [3] LATIN SMALL LETTER E WITH GRAVE..LATIN SMALL LETTER E WITH CIRCUMFLEX | ||
| 67 | {0x00EC, 0x00ED}, // ; A # Ll [2] LATIN SMALL LETTER I WITH GRAVE..LATIN SMALL LETTER I WITH ACUTE | ||
| 68 | {0x00F0, 0x00F0}, // ; A # Ll LATIN SMALL LETTER ETH | ||
| 69 | {0x00F2, 0x00F3}, // ; A # Ll [2] LATIN SMALL LETTER O WITH GRAVE..LATIN SMALL LETTER O WITH ACUTE | ||
| 70 | {0x00F7, 0x00F7}, // ; A # Sm DIVISION SIGN | ||
| 71 | {0x00F8, 0x00FA}, // ; A # Ll [3] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER U WITH ACUTE | ||
| 72 | {0x00FC, 0x00FC}, // ; A # Ll LATIN SMALL LETTER U WITH DIAERESIS | ||
| 73 | {0x00FE, 0x00FE}, // ; A # Ll LATIN SMALL LETTER THORN | ||
| 74 | {0x0101, 0x0101}, // ; A # Ll LATIN SMALL LETTER A WITH MACRON | ||
| 75 | {0x0111, 0x0111}, // ; A # Ll LATIN SMALL LETTER D WITH STROKE | ||
| 76 | {0x0113, 0x0113}, // ; A # Ll LATIN SMALL LETTER E WITH MACRON | ||
| 77 | {0x011B, 0x011B}, // ; A # Ll LATIN SMALL LETTER E WITH CARON | ||
| 78 | {0x0126, 0x0127}, // ; A # L& [2] LATIN CAPITAL LETTER H WITH STROKE..LATIN SMALL LETTER H WITH STROKE | ||
| 79 | {0x012B, 0x012B}, // ; A # Ll LATIN SMALL LETTER I WITH MACRON | ||
| 80 | {0x0131, 0x0133}, // ; A # L& [3] LATIN SMALL LETTER DOTLESS I..LATIN SMALL LIGATURE IJ | ||
| 81 | {0x0138, 0x0138}, // ; A # Ll LATIN SMALL LETTER KRA | ||
| 82 | {0x013F, 0x0142}, // ; A # L& [4] LATIN CAPITAL LETTER L WITH MIDDLE DOT..LATIN SMALL LETTER L WITH STROKE | ||
| 83 | {0x0144, 0x0144}, // ; A # Ll LATIN SMALL LETTER N WITH ACUTE | ||
| 84 | {0x0148, 0x014B}, // ; A # L& [4] LATIN SMALL LETTER N WITH CARON..LATIN SMALL LETTER ENG | ||
| 85 | {0x014D, 0x014D}, // ; A # Ll LATIN SMALL LETTER O WITH MACRON | ||
| 86 | {0x0152, 0x0153}, // ; A # L& [2] LATIN CAPITAL LIGATURE OE..LATIN SMALL LIGATURE OE | ||
| 87 | {0x0166, 0x0167}, // ; A # L& [2] LATIN CAPITAL LETTER T WITH STROKE..LATIN SMALL LETTER T WITH STROKE | ||
| 88 | {0x016B, 0x016B}, // ; A # Ll LATIN SMALL LETTER U WITH MACRON | ||
| 89 | {0x01CE, 0x01CE}, // ; A # Ll LATIN SMALL LETTER A WITH CARON | ||
| 90 | {0x01D0, 0x01D0}, // ; A # Ll LATIN SMALL LETTER I WITH CARON | ||
| 91 | {0x01D2, 0x01D2}, // ; A # Ll LATIN SMALL LETTER O WITH CARON | ||
| 92 | {0x01D4, 0x01D4}, // ; A # Ll LATIN SMALL LETTER U WITH CARON | ||
| 93 | {0x01D6, 0x01D6}, // ; A # Ll LATIN SMALL LETTER U WITH DIAERESIS AND MACRON | ||
| 94 | {0x01D8, 0x01D8}, // ; A # Ll LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE | ||
| 95 | {0x01DA, 0x01DA}, // ; A # Ll LATIN SMALL LETTER U WITH DIAERESIS AND CARON | ||
| 96 | {0x01DC, 0x01DC}, // ; A # Ll LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE | ||
| 97 | {0x0251, 0x0251}, // ; A # Ll LATIN SMALL LETTER ALPHA | ||
| 98 | {0x0261, 0x0261}, // ; A # Ll LATIN SMALL LETTER SCRIPT G | ||
| 99 | {0x02C4, 0x02C4}, // ; A # Sk MODIFIER LETTER UP ARROWHEAD | ||
| 100 | {0x02C7, 0x02C7}, // ; A # Lm CARON | ||
| 101 | {0x02C9, 0x02CB}, // ; A # Lm [3] MODIFIER LETTER MACRON..MODIFIER LETTER GRAVE ACCENT | ||
| 102 | {0x02CD, 0x02CD}, // ; A # Lm MODIFIER LETTER LOW MACRON | ||
| 103 | {0x02D0, 0x02D0}, // ; A # Lm MODIFIER LETTER TRIANGULAR COLON | ||
| 104 | {0x02D8, 0x02DB}, // ; A # Sk [4] BREVE..OGONEK | ||
| 105 | {0x02DD, 0x02DD}, // ; A # Sk DOUBLE ACUTE ACCENT | ||
| 106 | {0x02DF, 0x02DF}, // ; A # Sk MODIFIER LETTER CROSS ACCENT | ||
| 107 | {0x0300, 0x036F}, // ; A # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X | ||
| 108 | {0x0391, 0x03A1}, // ; A # Lu [17] GREEK CAPITAL LETTER ALPHA..GREEK CAPITAL LETTER RHO | ||
| 109 | {0x03A3, 0x03A9}, // ; A # Lu [7] GREEK CAPITAL LETTER SIGMA..GREEK CAPITAL LETTER OMEGA | ||
| 110 | {0x03B1, 0x03C1}, // ; A # Ll [17] GREEK SMALL LETTER ALPHA..GREEK SMALL LETTER RHO | ||
| 111 | {0x03C3, 0x03C9}, // ; A # Ll [7] GREEK SMALL LETTER SIGMA..GREEK SMALL LETTER OMEGA | ||
| 112 | {0x0401, 0x0401}, // ; A # Lu CYRILLIC CAPITAL LETTER IO | ||
| 113 | {0x0410, 0x044F}, // ; A # L& [64] CYRILLIC CAPITAL LETTER A..CYRILLIC SMALL LETTER YA | ||
| 114 | {0x0451, 0x0451}, // ; A # Ll CYRILLIC SMALL LETTER IO | ||
| 115 | {0x2010, 0x2010}, // ; A # Pd HYPHEN | ||
| 116 | {0x2013, 0x2015}, // ; A # Pd [3] EN DASH..HORIZONTAL BAR | ||
| 117 | {0x2016, 0x2016}, // ; A # Po DOUBLE VERTICAL LINE | ||
| 118 | {0x2018, 0x2018}, // ; A # Pi LEFT SINGLE QUOTATION MARK | ||
| 119 | {0x2019, 0x2019}, // ; A # Pf RIGHT SINGLE QUOTATION MARK | ||
| 120 | {0x201C, 0x201C}, // ; A # Pi LEFT DOUBLE QUOTATION MARK | ||
| 121 | {0x201D, 0x201D}, // ; A # Pf RIGHT DOUBLE QUOTATION MARK | ||
| 122 | {0x2020, 0x2022}, // ; A # Po [3] DAGGER..BULLET | ||
| 123 | {0x2024, 0x2027}, // ; A # Po [4] ONE DOT LEADER..HYPHENATION POINT | ||
| 124 | {0x2030, 0x2030}, // ; A # Po PER MILLE SIGN | ||
| 125 | {0x2032, 0x2033}, // ; A # Po [2] PRIME..DOUBLE PRIME | ||
| 126 | {0x2035, 0x2035}, // ; A # Po REVERSED PRIME | ||
| 127 | {0x203B, 0x203B}, // ; A # Po REFERENCE MARK | ||
| 128 | {0x203E, 0x203E}, // ; A # Po OVERLINE | ||
| 129 | {0x2074, 0x2074}, // ; A # No SUPERSCRIPT FOUR | ||
| 130 | {0x207F, 0x207F}, // ; A # Lm SUPERSCRIPT LATIN SMALL LETTER N | ||
| 131 | {0x2081, 0x2084}, // ; A # No [4] SUBSCRIPT ONE..SUBSCRIPT FOUR | ||
| 132 | {0x20AC, 0x20AC}, // ; A # Sc EURO SIGN | ||
| 133 | {0x2103, 0x2103}, // ; A # So DEGREE CELSIUS | ||
| 134 | {0x2105, 0x2105}, // ; A # So CARE OF | ||
| 135 | {0x2109, 0x2109}, // ; A # So DEGREE FAHRENHEIT | ||
| 136 | {0x2113, 0x2113}, // ; A # Ll SCRIPT SMALL L | ||
| 137 | {0x2116, 0x2116}, // ; A # So NUMERO SIGN | ||
| 138 | {0x2121, 0x2122}, // ; A # So [2] TELEPHONE SIGN..TRADE MARK SIGN | ||
| 139 | {0x2126, 0x2126}, // ; A # Lu OHM SIGN | ||
| 140 | {0x212B, 0x212B}, // ; A # Lu ANGSTROM SIGN | ||
| 141 | {0x2153, 0x2154}, // ; A # No [2] VULGAR FRACTION ONE THIRD..VULGAR FRACTION TWO THIRDS | ||
| 142 | {0x215B, 0x215E}, // ; A # No [4] VULGAR FRACTION ONE EIGHTH..VULGAR FRACTION SEVEN EIGHTHS | ||
| 143 | {0x2160, 0x216B}, // ; A # Nl [12] ROMAN NUMERAL ONE..ROMAN NUMERAL TWELVE | ||
| 144 | {0x2170, 0x2179}, // ; A # Nl [10] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL TEN | ||
| 145 | {0x2189, 0x2189}, // ; A # No VULGAR FRACTION ZERO THIRDS | ||
| 146 | {0x2190, 0x2194}, // ; A # Sm [5] LEFTWARDS ARROW..LEFT RIGHT ARROW | ||
| 147 | {0x2195, 0x2199}, // ; A # So [5] UP DOWN ARROW..SOUTH WEST ARROW | ||
| 148 | {0x21B8, 0x21B9}, // ; A # So [2] NORTH WEST ARROW TO LONG BAR..LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR | ||
| 149 | {0x21D2, 0x21D2}, // ; A # Sm RIGHTWARDS DOUBLE ARROW | ||
| 150 | {0x21D4, 0x21D4}, // ; A # Sm LEFT RIGHT DOUBLE ARROW | ||
| 151 | {0x21E7, 0x21E7}, // ; A # So UPWARDS WHITE ARROW | ||
| 152 | {0x2200, 0x2200}, // ; A # Sm FOR ALL | ||
| 153 | {0x2202, 0x2203}, // ; A # Sm [2] PARTIAL DIFFERENTIAL..THERE EXISTS | ||
| 154 | {0x2207, 0x2208}, // ; A # Sm [2] NABLA..ELEMENT OF | ||
| 155 | {0x220B, 0x220B}, // ; A # Sm CONTAINS AS MEMBER | ||
| 156 | {0x220F, 0x220F}, // ; A # Sm N-ARY PRODUCT | ||
| 157 | {0x2211, 0x2211}, // ; A # Sm N-ARY SUMMATION | ||
| 158 | {0x2215, 0x2215}, // ; A # Sm DIVISION SLASH | ||
| 159 | {0x221A, 0x221A}, // ; A # Sm SQUARE ROOT | ||
| 160 | {0x221D, 0x2220}, // ; A # Sm [4] PROPORTIONAL TO..ANGLE | ||
| 161 | {0x2223, 0x2223}, // ; A # Sm DIVIDES | ||
| 162 | {0x2225, 0x2225}, // ; A # Sm PARALLEL TO | ||
| 163 | {0x2227, 0x222C}, // ; A # Sm [6] LOGICAL AND..DOUBLE INTEGRAL | ||
| 164 | {0x222E, 0x222E}, // ; A # Sm CONTOUR INTEGRAL | ||
| 165 | {0x2234, 0x2237}, // ; A # Sm [4] THEREFORE..PROPORTION | ||
| 166 | {0x223C, 0x223D}, // ; A # Sm [2] TILDE OPERATOR..REVERSED TILDE | ||
| 167 | {0x2248, 0x2248}, // ; A # Sm ALMOST EQUAL TO | ||
| 168 | {0x224C, 0x224C}, // ; A # Sm ALL EQUAL TO | ||
| 169 | {0x2252, 0x2252}, // ; A # Sm APPROXIMATELY EQUAL TO OR THE IMAGE OF | ||
| 170 | {0x2260, 0x2261}, // ; A # Sm [2] NOT EQUAL TO..IDENTICAL TO | ||
| 171 | {0x2264, 0x2267}, // ; A # Sm [4] LESS-THAN OR EQUAL TO..GREATER-THAN OVER EQUAL TO | ||
| 172 | {0x226A, 0x226B}, // ; A # Sm [2] MUCH LESS-THAN..MUCH GREATER-THAN | ||
| 173 | {0x226E, 0x226F}, // ; A # Sm [2] NOT LESS-THAN..NOT GREATER-THAN | ||
| 174 | {0x2282, 0x2283}, // ; A # Sm [2] SUBSET OF..SUPERSET OF | ||
| 175 | {0x2286, 0x2287}, // ; A # Sm [2] SUBSET OF OR EQUAL TO..SUPERSET OF OR EQUAL TO | ||
| 176 | {0x2295, 0x2295}, // ; A # Sm CIRCLED PLUS | ||
| 177 | {0x2299, 0x2299}, // ; A # Sm CIRCLED DOT OPERATOR | ||
| 178 | {0x22A5, 0x22A5}, // ; A # Sm UP TACK | ||
| 179 | {0x22BF, 0x22BF}, // ; A # Sm RIGHT TRIANGLE | ||
| 180 | {0x2312, 0x2312}, // ; A # So ARC | ||
| 181 | {0x2460, 0x249B}, // ; A # No [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP | ||
| 182 | {0x249C, 0x24E9}, // ; A # So [78] PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z | ||
| 183 | {0x24EB, 0x24FF}, // ; A # No [21] NEGATIVE CIRCLED NUMBER ELEVEN..NEGATIVE CIRCLED DIGIT ZERO | ||
| 184 | {0x2500, 0x254B}, // ; A # So [76] BOX DRAWINGS LIGHT HORIZONTAL..BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL | ||
| 185 | {0x2550, 0x2573}, // ; A # So [36] BOX DRAWINGS DOUBLE HORIZONTAL..BOX DRAWINGS LIGHT DIAGONAL CROSS | ||
| 186 | {0x2580, 0x258F}, // ; A # So [16] UPPER HALF BLOCK..LEFT ONE EIGHTH BLOCK | ||
| 187 | {0x2592, 0x2595}, // ; A # So [4] MEDIUM SHADE..RIGHT ONE EIGHTH BLOCK | ||
| 188 | {0x25A0, 0x25A1}, // ; A # So [2] BLACK SQUARE..WHITE SQUARE | ||
| 189 | {0x25A3, 0x25A9}, // ; A # So [7] WHITE SQUARE CONTAINING BLACK SMALL SQUARE..SQUARE WITH DIAGONAL CROSSHATCH FILL | ||
| 190 | {0x25B2, 0x25B3}, // ; A # So [2] BLACK UP-POINTING TRIANGLE..WHITE UP-POINTING TRIANGLE | ||
| 191 | {0x25B6, 0x25B6}, // ; A # So BLACK RIGHT-POINTING TRIANGLE | ||
| 192 | {0x25B7, 0x25B7}, // ; A # Sm WHITE RIGHT-POINTING TRIANGLE | ||
| 193 | {0x25BC, 0x25BD}, // ; A # So [2] BLACK DOWN-POINTING TRIANGLE..WHITE DOWN-POINTING TRIANGLE | ||
| 194 | {0x25C0, 0x25C0}, // ; A # So BLACK LEFT-POINTING TRIANGLE | ||
| 195 | {0x25C1, 0x25C1}, // ; A # Sm WHITE LEFT-POINTING TRIANGLE | ||
| 196 | {0x25C6, 0x25C8}, // ; A # So [3] BLACK DIAMOND..WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND | ||
| 197 | {0x25CB, 0x25CB}, // ; A # So WHITE CIRCLE | ||
| 198 | {0x25CE, 0x25D1}, // ; A # So [4] BULLSEYE..CIRCLE WITH RIGHT HALF BLACK | ||
| 199 | {0x25E2, 0x25E5}, // ; A # So [4] BLACK LOWER RIGHT TRIANGLE..BLACK UPPER RIGHT TRIANGLE | ||
| 200 | {0x25EF, 0x25EF}, // ; A # So LARGE CIRCLE | ||
| 201 | {0x2605, 0x2606}, // ; A # So [2] BLACK STAR..WHITE STAR | ||
| 202 | {0x2609, 0x2609}, // ; A # So SUN | ||
| 203 | {0x260E, 0x260F}, // ; A # So [2] BLACK TELEPHONE..WHITE TELEPHONE | ||
| 204 | {0x261C, 0x261C}, // ; A # So WHITE LEFT POINTING INDEX | ||
| 205 | {0x261E, 0x261E}, // ; A # So WHITE RIGHT POINTING INDEX | ||
| 206 | {0x2640, 0x2640}, // ; A # So FEMALE SIGN | ||
| 207 | {0x2642, 0x2642}, // ; A # So MALE SIGN | ||
| 208 | {0x2660, 0x2661}, // ; A # So [2] BLACK SPADE SUIT..WHITE HEART SUIT | ||
| 209 | {0x2663, 0x2665}, // ; A # So [3] BLACK CLUB SUIT..BLACK HEART SUIT | ||
| 210 | {0x2667, 0x266A}, // ; A # So [4] WHITE CLUB SUIT..EIGHTH NOTE | ||
| 211 | {0x266C, 0x266D}, // ; A # So [2] BEAMED SIXTEENTH NOTES..MUSIC FLAT SIGN | ||
| 212 | {0x266F, 0x266F}, // ; A # Sm MUSIC SHARP SIGN | ||
| 213 | {0x269E, 0x269F}, // ; A # So [2] THREE LINES CONVERGING RIGHT..THREE LINES CONVERGING LEFT | ||
| 214 | {0x26BF, 0x26BF}, // ; A # So SQUARED KEY | ||
| 215 | {0x26C6, 0x26CD}, // ; A # So [8] RAIN..DISABLED CAR | ||
| 216 | {0x26CF, 0x26D3}, // ; A # So [5] PICK..CHAINS | ||
| 217 | {0x26D5, 0x26E1}, // ; A # So [13] ALTERNATE ONE-WAY LEFT WAY TRAFFIC..RESTRICTED LEFT ENTRY-2 | ||
| 218 | {0x26E3, 0x26E3}, // ; A # So HEAVY CIRCLE WITH STROKE AND TWO DOTS ABOVE | ||
| 219 | {0x26E8, 0x26E9}, // ; A # So [2] BLACK CROSS ON SHIELD..SHINTO SHRINE | ||
| 220 | {0x26EB, 0x26F1}, // ; A # So [7] CASTLE..UMBRELLA ON GROUND | ||
| 221 | {0x26F4, 0x26F4}, // ; A # So FERRY | ||
| 222 | {0x26F6, 0x26F9}, // ; A # So [4] SQUARE FOUR CORNERS..PERSON WITH BALL | ||
| 223 | {0x26FB, 0x26FC}, // ; A # So [2] JAPANESE BANK SYMBOL..HEADSTONE GRAVEYARD SYMBOL | ||
| 224 | {0x26FE, 0x26FF}, // ; A # So [2] CUP ON BLACK SQUARE..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE | ||
| 225 | {0x273D, 0x273D}, // ; A # So HEAVY TEARDROP-SPOKED ASTERISK | ||
| 226 | {0x2776, 0x277F}, // ; A # No [10] DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED NUMBER TEN | ||
| 227 | {0x2B56, 0x2B59}, // ; A # So [4] HEAVY OVAL WITH OVAL INSIDE..HEAVY CIRCLED SALTIRE | ||
| 228 | {0x3248, 0x324F}, // ; A # No [8] CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE | ||
| 229 | {0xE000, 0xF8FF}, // ; A # Co [6400] <private-use-E000>..<private-use-F8FF> | ||
| 230 | {0xFE00, 0xFE0F}, // ; A # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 | ||
| 231 | {0xFFFD, 0xFFFD}, // ; A # So REPLACEMENT CHARACTER | ||
| 232 | {0x1F100, 0x1F10A}, // ; A # No [11] DIGIT ZERO FULL STOP..DIGIT NINE COMMA | ||
| 233 | {0x1F110, 0x1F12D}, // ; A # So [30] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED CD | ||
| 234 | {0x1F130, 0x1F169}, // ; A # So [58] SQUARED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z | ||
| 235 | {0x1F170, 0x1F18D}, // ; A # So [30] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED SA | ||
| 236 | {0x1F18F, 0x1F190}, // ; A # So [2] NEGATIVE SQUARED WC..SQUARE DJ | ||
| 237 | {0x1F19B, 0x1F1AC}, // ; A # So [18] SQUARED THREE D..SQUARED VOD | ||
| 238 | {0xE0100, 0xE01EF}, // ; A # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 | ||
| 239 | {0xF0000, 0xFFFFD}, // ; A # Co [65534] <private-use-F0000>..<private-use-FFFFD> | ||
| 240 | {0x100000, 0x10FFFD} // ; A # Co [65534] <private-use-100000>..<private-use-10FFFD> | ||
| 241 | }; | ||
| 242 | const size_t num_ranges = sizeof(ranges) / sizeof(ranges[0]); | ||
| 243 | |||
| 244 | int left = 0, right = num_ranges - 1; | ||
| 245 | |||
| 246 | while (left <= right) { | ||
| 247 | int mid = left + (right - left) / 2; | ||
| 248 | |||
| 249 | if (ucs >= ranges[mid].start && ucs <= ranges[mid].end) { | ||
| 250 | return 1; // Character is in the range | ||
| 251 | } else if (ucs < ranges[mid].start) { | ||
| 252 | right = mid - 1; | ||
| 253 | } else { | ||
| 254 | left = mid + 1; | ||
| 255 | } | ||
| 256 | } | ||
| 257 | return 0; // Character is not in any of the ranges | ||
| 258 | } | ||
| 259 | |||
diff --git a/src/wcwidtha.h b/src/wcwidtha.h new file mode 100644 index 0000000..9931b01 --- /dev/null +++ b/src/wcwidtha.h | |||
| @@ -0,0 +1,12 @@ | |||
| 1 | // wcwidtha.h | ||
| 2 | |||
| 3 | #ifndef MK_WCWIDTHA_H | ||
| 4 | #define MK_WCWIDTHA_H | ||
| 5 | |||
| 6 | |||
| 7 | #include "wcwidth.h" | ||
| 8 | |||
| 9 | // Returns non-zero if the character is in the list of ambiguous-width characters (for East Asian display), 0 otherwise | ||
| 10 | int mk_wcwidth_a(mk_wchar_t ucs); | ||
| 11 | |||
| 12 | #endif // MK_WCWIDTHA_H | ||
