diff options
author | Thijs Schreijer <thijs@thijsschreijer.nl> | 2024-05-06 11:44:47 +0200 |
---|---|---|
committer | Thijs Schreijer <thijs@thijsschreijer.nl> | 2024-05-20 12:43:55 +0200 |
commit | dcd5d62501e61e0f6901d4d4687ab56430a4b8a7 (patch) | |
tree | 4501938052c0f62279eaae66c34811d4b5232fa2 /src | |
parent | 1d64b5790f26760cb830336ccca9d51474b73ae8 (diff) | |
download | luasystem-dcd5d62501e61e0f6901d4d4687ab56430a4b8a7.tar.gz luasystem-dcd5d62501e61e0f6901d4d4687ab56430a4b8a7.tar.bz2 luasystem-dcd5d62501e61e0f6901d4d4687ab56430a4b8a7.zip |
add example for reading a line from the terminal, non-blocking
Handles utf8, and character width
Diffstat (limited to 'src')
-rw-r--r-- | src/term.c | 330 | ||||
-rw-r--r-- | src/wcwidth.c | 285 | ||||
-rw-r--r-- | src/wcwidth.h | 21 |
3 files changed, 619 insertions, 17 deletions
@@ -15,6 +15,7 @@ | |||
15 | 15 | ||
16 | #ifdef _WIN32 | 16 | #ifdef _WIN32 |
17 | # include <windows.h> | 17 | # include <windows.h> |
18 | # include <locale.h> | ||
18 | #else | 19 | #else |
19 | # include <termios.h> | 20 | # include <termios.h> |
20 | # include <string.h> | 21 | # include <string.h> |
@@ -22,8 +23,16 @@ | |||
22 | # include <fcntl.h> | 23 | # include <fcntl.h> |
23 | # include <sys/ioctl.h> | 24 | # include <sys/ioctl.h> |
24 | # include <unistd.h> | 25 | # include <unistd.h> |
26 | # include <wchar.h> | ||
27 | # include <locale.h> | ||
25 | #endif | 28 | #endif |
26 | 29 | ||
30 | |||
31 | // Windows does not have a wcwidth function, so we use compatibility code from | ||
32 | // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c by Markus Kuhn | ||
33 | #include "wcwidth.h" | ||
34 | |||
35 | |||
27 | #ifdef _WIN32 | 36 | #ifdef _WIN32 |
28 | // after an error is returned, GetLastError() result can be passed to this function to get a string | 37 | // after an error is returned, GetLastError() result can be passed to this function to get a string |
29 | // representation of the error on the stack. | 38 | // representation of the error on the stack. |
@@ -423,7 +432,7 @@ static int lst_getconsoleflags(lua_State *L) | |||
423 | // see https://github.com/luaposix/luaposix | 432 | // see https://github.com/luaposix/luaposix |
424 | 433 | ||
425 | /*** | 434 | /*** |
426 | Get termios state. | 435 | Get termios state (Posix). |
427 | The terminal attributes is a table with the following fields: | 436 | The terminal attributes is a table with the following fields: |
428 | 437 | ||
429 | - `iflag` input flags | 438 | - `iflag` input flags |
@@ -511,7 +520,7 @@ static int lst_tcgetattr(lua_State *L) | |||
511 | 520 | ||
512 | 521 | ||
513 | /*** | 522 | /*** |
514 | Set termios state. | 523 | Set termios state (Posix). |
515 | This function will set the flags as given. | 524 | This function will set the flags as given. |
516 | 525 | ||
517 | The `I_`, `O_`, and `L_` constants are available on the module table. They are the respective | 526 | The `I_`, `O_`, and `L_` constants are available on the module table. They are the respective |
@@ -689,13 +698,28 @@ static int lst_getnonblock(lua_State *L) | |||
689 | * Reading keyboard input | 698 | * Reading keyboard input |
690 | *-------------------------------------------------------------------------*/ | 699 | *-------------------------------------------------------------------------*/ |
691 | 700 | ||
701 | #ifdef _WIN32 | ||
// Static buffer for the UTF-8 encoding of the last character read (Windows only).
// lst_readkey fills it through WideCharToMultiByte and then returns its
// contents one byte per call.
static char utf8_buffer[4];         // encoded bytes of the current character
static int utf8_buffer_len = 0;     // number of buffered bytes not yet returned
static int utf8_buffer_index = 0;   // index of the next byte to return
706 | #endif | ||
707 | |||
708 | |||
692 | /*** | 709 | /*** |
693 | Reads a key from the console non-blocking. | 710 | Reads a key from the console non-blocking. This function should not be called |
711 | directly, but through the `system.readkey` or `system.readansi` functions. It | ||
712 | will return the next byte from the input stream, or `nil` if no key was pressed. | ||
713 | |||
694 | On Posix, `io.stdin` must be set to non-blocking mode using `setnonblock` | 714 | On Posix, `io.stdin` must be set to non-blocking mode using `setnonblock` |
695 | before calling this function. Otherwise it will block. | 715 | before calling this function. Otherwise it will block. No conversions are |
716 | done on Posix, so the byte read is returned as-is. | ||
696 | 717 | ||
697 | @function readkey | 718 | On Windows this reads a wide character and converts it to UTF-8. Multi-byte |
698 | @treturn[1] integer the key code of the key that was pressed | 719 | sequences will be buffered internally and returned one byte at a time. |
720 | |||
721 | @function _readkey | ||
722 | @treturn[1] integer the byte read from the input stream | ||
699 | @treturn[2] nil if no key was pressed | 723 | @treturn[2] nil if no key was pressed |
700 | @treturn[3] nil on error | 724 | @treturn[3] nil on error |
701 | @treturn[3] string error message | 725 | @treturn[3] string error message |
@@ -703,20 +727,87 @@ before calling this function. Otherwise it will block. | |||
703 | */ | 727 | */ |
704 | static int lst_readkey(lua_State *L) { | 728 | static int lst_readkey(lua_State *L) { |
705 | #ifdef _WIN32 | 729 | #ifdef _WIN32 |
706 | if (_kbhit()) { | 730 | if (utf8_buffer_len > 0) { |
707 | int ch = _getch(); | 731 | // Buffer not empty, return the next byte |
708 | if (ch == EOF) { | 732 | lua_pushinteger(L, (unsigned char)utf8_buffer[utf8_buffer_index]); |
709 | // Error handling for end-of-file or read error | 733 | utf8_buffer_index++; |
710 | lua_pushnil(L); | 734 | utf8_buffer_len--; |
711 | lua_pushliteral(L, "_getch error"); | 735 | // printf("returning from buffer: %d\n", luaL_checkinteger(L, -1)); |
712 | return 2; | 736 | if (utf8_buffer_len == 0) { |
737 | utf8_buffer_index = 0; | ||
713 | } | 738 | } |
714 | lua_pushinteger(L, (unsigned char)ch); | ||
715 | return 1; | 739 | return 1; |
716 | } | 740 | } |
717 | return 0; | 741 | |
742 | if (!_kbhit()) { | ||
743 | return 0; | ||
744 | } | ||
745 | |||
746 | wchar_t wc = _getwch(); | ||
747 | // printf("----\nread wchar_t: %x\n", wc); | ||
748 | if (wc == WEOF) { | ||
749 | lua_pushnil(L); | ||
750 | lua_pushliteral(L, "read error"); | ||
751 | return 2; | ||
752 | } | ||
753 | |||
754 | if (sizeof(wchar_t) == 2) { | ||
755 | // printf("2-byte wchar_t\n"); | ||
756 | // only 2 bytes wide, not 4 | ||
757 | if (wc >= 0xD800 && wc <= 0xDBFF) { | ||
758 | // printf("2-byte wchar_t, received high, getting low...\n"); | ||
759 | |||
760 | // we got a high surrogate, so we need to read the next one as the low surrogate | ||
761 | if (!_kbhit()) { | ||
762 | lua_pushnil(L); | ||
763 | lua_pushliteral(L, "incomplete surrogate pair"); | ||
764 | return 2; | ||
765 | } | ||
766 | |||
767 | wchar_t wc2 = _getwch(); | ||
768 | // printf("read wchar_t 2: %x\n", wc2); | ||
769 | if (wc2 == WEOF) { | ||
770 | lua_pushnil(L); | ||
771 | lua_pushliteral(L, "read error"); | ||
772 | return 2; | ||
773 | } | ||
774 | |||
775 | if (wc2 < 0xDC00 || wc2 > 0xDFFF) { | ||
776 | lua_pushnil(L); | ||
777 | lua_pushliteral(L, "invalid surrogate pair"); | ||
778 | return 2; | ||
779 | } | ||
780 | // printf("2-byte pair complete now\n"); | ||
781 | wchar_t wch_pair[2] = { wc, wc2 }; | ||
782 | utf8_buffer_len = WideCharToMultiByte(CP_UTF8, 0, wch_pair, 2, utf8_buffer, sizeof(utf8_buffer), NULL, NULL); | ||
783 | |||
784 | } else { | ||
785 | // printf("2-byte wchar_t, no surrogate pair\n"); | ||
786 | // not a high surrogate, so we can handle just the 2 bytes directly | ||
787 | utf8_buffer_len = WideCharToMultiByte(CP_UTF8, 0, &wc, 1, utf8_buffer, sizeof(utf8_buffer), NULL, NULL); | ||
788 | } | ||
789 | |||
790 | } else { | ||
791 | // printf("4-byte wchar_t\n"); | ||
792 | // 4 bytes wide, so handle as UTF-32 directly | ||
793 | utf8_buffer_len = WideCharToMultiByte(CP_UTF8, 0, &wc, 1, utf8_buffer, sizeof(utf8_buffer), NULL, NULL); | ||
794 | } | ||
795 | // printf("utf8_buffer_len: %d\n", utf8_buffer_len); | ||
796 | utf8_buffer_index = 0; | ||
797 | if (utf8_buffer_len <= 0) { | ||
798 | lua_pushnil(L); | ||
799 | lua_pushliteral(L, "UTF-8 conversion error"); | ||
800 | return 2; | ||
801 | } | ||
802 | |||
803 | lua_pushinteger(L, (unsigned char)utf8_buffer[utf8_buffer_index]); | ||
804 | utf8_buffer_index++; | ||
805 | utf8_buffer_len--; | ||
806 | // printf("returning from buffer: %x\n", luaL_checkinteger(L, -1)); | ||
807 | return 1; | ||
718 | 808 | ||
719 | #else | 809 | #else |
810 | // Posix implementation | ||
720 | char ch; | 811 | char ch; |
721 | ssize_t bytes_read = read(STDIN_FILENO, &ch, 1); | 812 | ssize_t bytes_read = read(STDIN_FILENO, &ch, 1); |
722 | if (bytes_read > 0) { | 813 | if (bytes_read > 0) { |
@@ -782,6 +873,205 @@ static int lst_termsize(lua_State *L) { | |||
782 | 873 | ||
783 | 874 | ||
784 | /*------------------------------------------------------------------------- | 875 | /*------------------------------------------------------------------------- |
876 | * utf8 conversion and support | ||
877 | *-------------------------------------------------------------------------*/ | ||
878 | |||
879 | // Function to convert a single UTF-8 character to a Unicode code point (uint32_t) | ||
880 | // To prevent having to do codepage/locale changes, we use a custom implementation | ||
881 | int utf8_to_wchar(const char *utf8, size_t len, mk_wchar_t *codepoint) { | ||
882 | if (len == 0) { | ||
883 | return -1; // No input provided | ||
884 | } | ||
885 | |||
886 | unsigned char c = (unsigned char)utf8[0]; | ||
887 | if (c <= 0x7F) { | ||
888 | *codepoint = c; | ||
889 | return 1; | ||
890 | } else if ((c & 0xE0) == 0xC0) { | ||
891 | if (len < 2) return -1; // Not enough bytes | ||
892 | *codepoint = ((utf8[0] & 0x1F) << 6) | (utf8[1] & 0x3F); | ||
893 | return 2; | ||
894 | } else if ((c & 0xF0) == 0xE0) { | ||
895 | if (len < 3) return -1; // Not enough bytes | ||
896 | *codepoint = ((utf8[0] & 0x0F) << 12) | ((utf8[1] & 0x3F) << 6) | (utf8[2] & 0x3F); | ||
897 | return 3; | ||
898 | } else if ((c & 0xF8) == 0xF0) { | ||
899 | if (len < 4) return -1; // Not enough bytes | ||
900 | *codepoint = ((utf8[0] & 0x07) << 18) | ((utf8[1] & 0x3F) << 12) | ((utf8[2] & 0x3F) << 6) | (utf8[3] & 0x3F); | ||
901 | return 4; | ||
902 | } else { | ||
903 | // Invalid UTF-8 character | ||
904 | return -1; | ||
905 | } | ||
906 | } | ||
907 | |||
908 | |||
909 | /*** | ||
910 | Get the width of a utf8 character for terminal display. | ||
911 | @function utf8cwidth | ||
912 | @tparam string utf8_char the utf8 character to check, only the width of the first character will be returned | ||
913 | @treturn[1] int the display width in columns of the first character in the string (0 for an empty string) | ||
914 | @treturn[2] nil | ||
915 | @treturn[2] string error message | ||
916 | */ | ||
917 | int lst_utf8cwidth(lua_State *L) { | ||
918 | const char *utf8_char; | ||
919 | size_t utf8_len; | ||
920 | utf8_char = luaL_checklstring(L, 1, &utf8_len); | ||
921 | int width = 0; | ||
922 | |||
923 | mk_wchar_t wc; | ||
924 | |||
925 | if (utf8_len == 0) { | ||
926 | lua_pushinteger(L, 0); | ||
927 | return 1; | ||
928 | } | ||
929 | |||
930 | // Convert the UTF-8 string to a wide character | ||
931 | int bytes_processed = utf8_to_wchar(utf8_char, utf8_len, &wc); | ||
932 | if (bytes_processed == -1) { | ||
933 | lua_pushnil(L); | ||
934 | lua_pushstring(L, "Invalid UTF-8 character"); | ||
935 | return 2; | ||
936 | } | ||
937 | |||
938 | // Get the width of the wide character | ||
939 | width = mk_wcwidth(wc); | ||
940 | if (width == -1) { | ||
941 | lua_pushnil(L); | ||
942 | lua_pushstring(L, "Character width determination failed"); | ||
943 | return 2; | ||
944 | } | ||
945 | |||
946 | lua_pushinteger(L, width); | ||
947 | return 1; | ||
948 | } | ||
949 | |||
950 | |||
951 | |||
952 | |||
953 | /*** | ||
954 | Get the width of a utf8 string for terminal display. | ||
955 | @function utf8swidth | ||
956 | @tparam string utf8_string the utf8 string to check | ||
957 | @treturn[1] int the display width of the string in columns (0 for an empty string) | ||
958 | @treturn[2] nil | ||
959 | @treturn[2] string error message | ||
960 | */ | ||
961 | int lst_utf8swidth(lua_State *L) { | ||
962 | const char *utf8_str; | ||
963 | size_t utf8_len; | ||
964 | utf8_str = luaL_checklstring(L, 1, &utf8_len); | ||
965 | int total_width = 0; | ||
966 | |||
967 | if (utf8_len == 0) { | ||
968 | lua_pushinteger(L, 0); | ||
969 | return 1; | ||
970 | } | ||
971 | |||
972 | int bytes_processed = 0; | ||
973 | size_t i = 0; | ||
974 | mk_wchar_t wc; | ||
975 | |||
976 | while (i < utf8_len) { | ||
977 | bytes_processed = utf8_to_wchar(utf8_str + i, utf8_len - i, &wc); | ||
978 | if (bytes_processed == -1) { | ||
979 | lua_pushnil(L); | ||
980 | lua_pushstring(L, "Invalid UTF-8 character"); | ||
981 | return 2; | ||
982 | } | ||
983 | |||
984 | int width = mk_wcwidth(wc); | ||
985 | if (width == -1) { | ||
986 | lua_pushnil(L); | ||
987 | lua_pushstring(L, "Character width determination failed"); | ||
988 | return 2; | ||
989 | } | ||
990 | |||
991 | total_width += width; | ||
992 | i += bytes_processed; | ||
993 | } | ||
994 | |||
995 | lua_pushinteger(L, total_width); | ||
996 | return 1; | ||
997 | } | ||
998 | |||
999 | |||
1000 | |||
1001 | /*------------------------------------------------------------------------- | ||
1002 | * Windows codepage functions | ||
1003 | *-------------------------------------------------------------------------*/ | ||
1004 | |||
1005 | |||
1006 | /*** | ||
1007 | Gets the current console code page (Windows). | ||
1008 | @function getconsolecp | ||
1009 | @treturn[1] int the current code page (always 65001 on Posix systems) | ||
1010 | */ | ||
1011 | static int lst_getconsolecp(lua_State *L) { | ||
1012 | unsigned int cp = 65001; | ||
1013 | #ifdef _WIN32 | ||
1014 | cp = GetConsoleCP(); | ||
1015 | #endif | ||
1016 | lua_pushinteger(L, cp); | ||
1017 | return 1; | ||
1018 | } | ||
1019 | |||
1020 | |||
1021 | |||
1022 | /*** | ||
1023 | Sets the current console code page (Windows). | ||
1024 | @function setconsolecp | ||
1025 | @tparam int cp the code page to set, use 65001 for UTF-8 | ||
1026 | @treturn[1] bool `true` on success (always `true` on Posix systems) | ||
1027 | */ | ||
1028 | static int lst_setconsolecp(lua_State *L) { | ||
1029 | unsigned int cp = (unsigned int)luaL_checkinteger(L, 1); | ||
1030 | int success = TRUE; | ||
1031 | #ifdef _WIN32 | ||
1032 | SetConsoleCP(cp); | ||
1033 | #endif | ||
1034 | lua_pushboolean(L, success); | ||
1035 | return 1; | ||
1036 | } | ||
1037 | |||
1038 | |||
1039 | |||
1040 | /*** | ||
1041 | Gets the current console output code page (Windows). | ||
1042 | @function getconsoleoutputcp | ||
1043 | @treturn[1] int the current code page (always 65001 on Posix systems) | ||
1044 | */ | ||
1045 | static int lst_getconsoleoutputcp(lua_State *L) { | ||
1046 | unsigned int cp = 65001; | ||
1047 | #ifdef _WIN32 | ||
1048 | cp = GetConsoleOutputCP(); | ||
1049 | #endif | ||
1050 | lua_pushinteger(L, cp); | ||
1051 | return 1; | ||
1052 | } | ||
1053 | |||
1054 | |||
1055 | |||
1056 | /*** | ||
1057 | Sets the current console output code page (Windows). | ||
1058 | @function setconsoleoutputcp | ||
1059 | @tparam int cp the code page to set, use 65001 for UTF-8 | ||
1060 | @treturn[1] bool `true` on success (always `true` on Posix systems) | ||
1061 | */ | ||
1062 | static int lst_setconsoleoutputcp(lua_State *L) { | ||
1063 | unsigned int cp = (unsigned int)luaL_checkinteger(L, 1); | ||
1064 | int success = TRUE; | ||
1065 | #ifdef _WIN32 | ||
1066 | SetConsoleOutputCP(cp); | ||
1067 | #endif | ||
1068 | lua_pushboolean(L, success); | ||
1069 | return 1; | ||
1070 | } | ||
1071 | |||
1072 | |||
1073 | |||
1074 | /*------------------------------------------------------------------------- | ||
785 | * Initializes module | 1075 | * Initializes module |
786 | *-------------------------------------------------------------------------*/ | 1076 | *-------------------------------------------------------------------------*/ |
787 | 1077 | ||
@@ -791,10 +1081,16 @@ static luaL_Reg func[] = { | |||
791 | { "setconsoleflags", lst_setconsoleflags }, | 1081 | { "setconsoleflags", lst_setconsoleflags }, |
792 | { "tcgetattr", lst_tcgetattr }, | 1082 | { "tcgetattr", lst_tcgetattr }, |
793 | { "tcsetattr", lst_tcsetattr }, | 1083 | { "tcsetattr", lst_tcsetattr }, |
794 | { "getnonblock", lst_setnonblock }, | 1084 | { "getnonblock", lst_getnonblock }, |
795 | { "setnonblock", lst_setnonblock }, | 1085 | { "setnonblock", lst_setnonblock }, |
796 | { "readkey", lst_readkey }, | 1086 | { "_readkey", lst_readkey }, |
797 | { "termsize", lst_termsize }, | 1087 | { "termsize", lst_termsize }, |
1088 | { "utf8cwidth", lst_utf8cwidth }, | ||
1089 | { "utf8swidth", lst_utf8swidth }, | ||
1090 | { "getconsolecp", lst_getconsolecp }, | ||
1091 | { "setconsolecp", lst_setconsolecp }, | ||
1092 | { "getconsoleoutputcp", lst_getconsoleoutputcp }, | ||
1093 | { "setconsoleoutputcp", lst_setconsoleoutputcp }, | ||
798 | { NULL, NULL } | 1094 | { NULL, NULL } |
799 | }; | 1095 | }; |
800 | 1096 | ||
diff --git a/src/wcwidth.c b/src/wcwidth.c new file mode 100644 index 0000000..6032158 --- /dev/null +++ b/src/wcwidth.c | |||
@@ -0,0 +1,285 @@ | |||
1 | // This file was modified from the original versions, check "modified:" comments for details | ||
2 | // Character range updates (both the table and the +1 check) were generated using ChatGPT. | ||
3 | |||
4 | /* | ||
5 | * This is an implementation of wcwidth() and wcswidth() (defined in | ||
6 | * IEEE Std 1002.1-2001) for Unicode. | ||
7 | * | ||
8 | * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html | ||
9 | * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html | ||
10 | * | ||
11 | * In fixed-width output devices, Latin characters all occupy a single | ||
12 | * "cell" position of equal width, whereas ideographic CJK characters | ||
13 | * occupy two such cells. Interoperability between terminal-line | ||
14 | * applications and (teletype-style) character terminals using the | ||
15 | * UTF-8 encoding requires agreement on which character should advance | ||
16 | * the cursor by how many cell positions. No established formal | ||
17 | * standards exist at present on which Unicode character shall occupy | ||
18 | * how many cell positions on character terminals. These routines are | ||
19 | * a first attempt of defining such behavior based on simple rules | ||
20 | * applied to data provided by the Unicode Consortium. | ||
21 | * | ||
22 | * For some graphical characters, the Unicode standard explicitly | ||
23 | * defines a character-cell width via the definition of the East Asian | ||
24 | * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. | ||
25 | * In all these cases, there is no ambiguity about which width a | ||
26 | * terminal shall use. For characters in the East Asian Ambiguous (A) | ||
27 | * class, the width choice depends purely on a preference of backward | ||
28 | * compatibility with either historic CJK or Western practice. | ||
29 | * Choosing single-width for these characters is easy to justify as | ||
30 | * the appropriate long-term solution, as the CJK practice of | ||
31 | * displaying these characters as double-width comes from historic | ||
32 | * implementation simplicity (8-bit encoded characters were displayed | ||
33 | * single-width and 16-bit ones double-width, even for Greek, | ||
34 | * Cyrillic, etc.) and not any typographic considerations. | ||
35 | * | ||
36 | * Much less clear is the choice of width for the Not East Asian | ||
37 | * (Neutral) class. Existing practice does not dictate a width for any | ||
38 | * of these characters. It would nevertheless make sense | ||
39 | * typographically to allocate two character cells to characters such | ||
40 | * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be | ||
41 | * represented adequately with a single-width glyph. The following | ||
42 | * routines at present merely assign a single-cell width to all | ||
43 | * neutral characters, in the interest of simplicity. This is not | ||
44 | * entirely satisfactory and should be reconsidered before | ||
45 | * establishing a formal standard in this area. At the moment, the | ||
46 | * decision which Not East Asian (Neutral) characters should be | ||
47 | * represented by double-width glyphs cannot yet be answered by | ||
48 | * applying a simple rule from the Unicode database content. Setting | ||
49 | * up a proper standard for the behavior of UTF-8 character terminals | ||
50 | * will require a careful analysis not only of each Unicode character, | ||
51 | * but also of each presentation form, something the author of these | ||
52 | * routines has avoided to do so far. | ||
53 | * | ||
54 | * http://www.unicode.org/unicode/reports/tr11/ | ||
55 | * | ||
56 | * Markus Kuhn -- 2007-05-26 (Unicode 5.0) | ||
57 | * | ||
58 | * Permission to use, copy, modify, and distribute this software | ||
59 | * for any purpose and without fee is hereby granted. The author | ||
60 | * disclaims all warranties with regard to this software. | ||
61 | * | ||
62 | * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c | ||
63 | */ | ||
64 | |||
65 | #include "wcwidth.h" // modified: used to define mk_wchar_t | ||
66 | |||
/* A closed range [first, last] of code points; bisearch() searches an array of these. */
struct interval {
  int first;  /* lowest code point in the range (inclusive) */
  int last;   /* highest code point in the range (inclusive) */
};
71 | |||
72 | /* auxiliary function for binary search in interval table */ | ||
73 | static int bisearch(mk_wchar_t ucs, const struct interval *table, int max) { // modified: use mk_wchar_t | ||
74 | int min = 0; | ||
75 | int mid; | ||
76 | |||
77 | if (ucs < table[0].first || ucs > table[max].last) | ||
78 | return 0; | ||
79 | while (max >= min) { | ||
80 | mid = (min + max) / 2; | ||
81 | if (ucs > table[mid].last) | ||
82 | min = mid + 1; | ||
83 | else if (ucs < table[mid].first) | ||
84 | max = mid - 1; | ||
85 | else | ||
86 | return 1; | ||
87 | } | ||
88 | |||
89 | return 0; | ||
90 | } | ||
91 | |||
92 | |||
93 | /* The following two functions define the column width of an ISO 10646 | ||
94 | * character as follows: | ||
95 | * | ||
96 | * - The null character (U+0000) has a column width of 0. | ||
97 | * | ||
98 | * - Other C0/C1 control characters and DEL will lead to a return | ||
99 | * value of -1. | ||
100 | * | ||
101 | * - Non-spacing and enclosing combining characters (general | ||
102 | * category code Mn or Me in the Unicode database) have a | ||
103 | * column width of 0. | ||
104 | * | ||
105 | * - SOFT HYPHEN (U+00AD) has a column width of 1. | ||
106 | * | ||
107 | * - Other format characters (general category code Cf in the Unicode | ||
108 | * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. | ||
109 | * | ||
110 | * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) | ||
111 | * have a column width of 0. | ||
112 | * | ||
113 | * - Spacing characters in the East Asian Wide (W) or East Asian | ||
114 | * Full-width (F) category as defined in Unicode Technical | ||
115 | * Report #11 have a column width of 2. | ||
116 | * | ||
117 | * - All remaining characters (including all printable | ||
118 | * ISO 8859-1 and WGL4 characters, Unicode control characters, | ||
119 | * etc.) have a column width of 1. | ||
120 | * | ||
121 | * This implementation assumes that mk_wchar_t characters are encoded | ||
122 | * in ISO 10646. | ||
123 | */ | ||
124 | |||
125 | int mk_wcwidth(mk_wchar_t ucs) // modified: use mk_wchar_t | ||
126 | { | ||
127 | /* sorted list of non-overlapping intervals of non-spacing characters */ | ||
128 | /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ | ||
129 | static const struct interval combining[] = { // modified: added new ranges to the list | ||
130 | { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD }, | ||
131 | { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 }, | ||
132 | { 0x05C7, 0x05C7 }, { 0x0600, 0x0605 }, { 0x0610, 0x061A }, | ||
133 | { 0x061C, 0x061C }, { 0x064B, 0x065F }, { 0x0670, 0x0670 }, | ||
134 | { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 }, { 0x06E7, 0x06E8 }, | ||
135 | { 0x06EA, 0x06ED }, { 0x0711, 0x0711 }, { 0x0730, 0x074A }, | ||
136 | { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x07FD, 0x07FD }, | ||
137 | { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 }, | ||
138 | { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x08D3, 0x08E1 }, | ||
139 | { 0x08E3, 0x0903 }, { 0x093A, 0x093C }, { 0x093E, 0x094F }, | ||
140 | { 0x0951, 0x0957 }, { 0x0962, 0x0963 }, { 0x0981, 0x0983 }, | ||
141 | { 0x09BC, 0x09BC }, { 0x09BE, 0x09C4 }, { 0x09C7, 0x09C8 }, | ||
142 | { 0x09CB, 0x09CD }, { 0x09D7, 0x09D7 }, { 0x09E2, 0x09E3 }, | ||
143 | { 0x09FE, 0x09FE }, { 0x0A01, 0x0A03 }, { 0x0A3C, 0x0A3C }, | ||
144 | { 0x0A3E, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, | ||
145 | { 0x0A51, 0x0A51 }, { 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, | ||
146 | { 0x0A81, 0x0A83 }, { 0x0ABC, 0x0ABC }, { 0x0ABE, 0x0AC5 }, | ||
147 | { 0x0AC7, 0x0AC9 }, { 0x0ACB, 0x0ACD }, { 0x0AE2, 0x0AE3 }, | ||
148 | { 0x0AFA, 0x0AFF }, { 0x0B01, 0x0B03 }, { 0x0B3C, 0x0B3C }, | ||
149 | { 0x0B3E, 0x0B44 }, { 0x0B47, 0x0B48 }, { 0x0B4B, 0x0B4D }, | ||
150 | { 0x0B55, 0x0B57 }, { 0x0B62, 0x0B63 }, { 0x0B82, 0x0B82 }, | ||
151 | { 0x0BBE, 0x0BC2 }, { 0x0BC6, 0x0BC8 }, { 0x0BCA, 0x0BCD }, | ||
152 | { 0x0BD7, 0x0BD7 }, { 0x0C00, 0x0C04 }, { 0x0C3E, 0x0C44 }, | ||
153 | { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, | ||
154 | { 0x0C62, 0x0C63 }, { 0x0C81, 0x0C83 }, { 0x0CBC, 0x0CBC }, | ||
155 | { 0x0CBE, 0x0CC4 }, { 0x0CC6, 0x0CC8 }, { 0x0CCA, 0x0CCD }, | ||
156 | { 0x0CD5, 0x0CD6 }, { 0x0CE2, 0x0CE3 }, { 0x0D00, 0x0D03 }, | ||
157 | { 0x0D3B, 0x0D3C }, { 0x0D3E, 0x0D44 }, { 0x0D46, 0x0D48 }, | ||
158 | { 0x0D4A, 0x0D4D }, { 0x0D57, 0x0D57 }, { 0x0D62, 0x0D63 }, | ||
159 | { 0x0D82, 0x0D83 }, { 0x0DCF, 0x0DD4 }, { 0x0DD6, 0x0DD6 }, | ||
160 | { 0x0DD8, 0x0DDF }, { 0x0DF2, 0x0DF3 }, { 0x0E31, 0x0E31 }, | ||
161 | { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, | ||
162 | { 0x0EB4, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, | ||
163 | { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, | ||
164 | { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, | ||
165 | { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, | ||
166 | { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A }, | ||
167 | { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 }, | ||
168 | { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 }, | ||
169 | { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x135D, 0x135F }, | ||
170 | { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 }, | ||
171 | { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD }, | ||
172 | { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD }, | ||
173 | { 0x180B, 0x180E }, { 0x1885, 0x1886 }, { 0x18A9, 0x18A9 }, | ||
174 | { 0x1920, 0x1922 }, { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, | ||
175 | { 0x1939, 0x193B }, { 0x1A17, 0x1A18 }, { 0x1A1B, 0x1A1B }, | ||
176 | { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E }, { 0x1A60, 0x1A60 }, | ||
177 | { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C }, { 0x1A73, 0x1A7C }, | ||
178 | { 0x1A7F, 0x1A7F }, { 0x1AB0, 0x1ACE }, { 0x1B00, 0x1B03 }, | ||
179 | { 0x1B34, 0x1B34 }, { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, | ||
180 | { 0x1B42, 0x1B42 }, { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B82 }, | ||
181 | { 0x1BA1, 0x1BA1 }, { 0x1BA6, 0x1BA7 }, { 0x1BAA, 0x1BAA }, | ||
182 | { 0x1BAB, 0x1BAD }, { 0x1BE6, 0x1BE6 }, { 0x1BE8, 0x1BE9 }, | ||
183 | { 0x1BED, 0x1BED }, { 0x1BEF, 0x1BF1 }, { 0x1C2C, 0x1C33 }, | ||
184 | { 0x1C36, 0x1C37 }, { 0x1CD0, 0x1CD2 }, { 0x1CD4, 0x1CE8 }, | ||
185 | { 0x1CED, 0x1CED }, { 0x1CF4, 0x1CF4 }, { 0x1CF8, 0x1CF9 }, | ||
186 | { 0x1DC0, 0x1DF9 }, { 0x1DFB, 0x1DFF }, { 0x20D0, 0x20DC }, | ||
187 | { 0x20E1, 0x20E1 }, { 0x20E5, 0x20F0 }, { 0x2CEF, 0x2CF1 }, | ||
188 | { 0x2D7F, 0x2D7F }, { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D }, | ||
189 | { 0x3099, 0x309A }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D }, | ||
190 | { 0xA69E, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 }, | ||
191 | { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 }, | ||
192 | { 0xA82C, 0xA82C }, { 0xA8C4, 0xA8C5 }, { 0xA8E0, 0xA8F1 }, | ||
193 | { 0xA8FF, 0xA8FF }, { 0xA926, 0xA92D }, { 0xA947, 0xA951 }, | ||
194 | { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 }, { 0xA9B6, 0xA9B9 }, | ||
195 | { 0xA9BC, 0xA9BD }, { 0xA9E5, 0xA9E5 }, { 0xAA29, 0xAA2E }, | ||
196 | { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 }, | ||
197 | { 0xAA4C, 0xAA4C }, { 0xAA7C, 0xAA7C }, { 0xAAB0, 0xAAB0 }, | ||
198 | { 0xAAB2, 0xAAB4 }, { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF }, | ||
199 | { 0xAAC1, 0xAAC1 }, { 0xAAEB, 0xAAEB }, { 0xAAEE, 0xAAEF }, | ||
200 | { 0xAAF5, 0xAAF6 }, { 0xABE3, 0xABE4 }, { 0xABE6, 0xABE7 }, | ||
201 | { 0xABE9, 0xABEA }, { 0xABEC, 0xABED }, { 0xFB1E, 0xFB1E }, | ||
202 | { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE2F }, { 0x101FD, 0x101FD }, | ||
203 | { 0x102E0, 0x102E0 }, { 0x10376, 0x1037A }, { 0x10A01, 0x10A03 }, | ||
204 | { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F }, { 0x10A38, 0x10A3A }, | ||
205 | { 0x10A3F, 0x10A3F }, { 0x10AE5, 0x10AE6 }, { 0x10D24, 0x10D27 }, | ||
206 | { 0x10EAB, 0x10EAC }, { 0x10F46, 0x10F50 }, { 0x10F82, 0x10F85 }, | ||
207 | { 0x11000, 0x11002 }, { 0x11038, 0x11046 }, { 0x1107F, 0x11082 }, | ||
208 | { 0x110B0, 0x110BA }, { 0x11100, 0x11102 }, { 0x11127, 0x11134 }, | ||
209 | { 0x11145, 0x11146 }, { 0x11173, 0x11173 }, { 0x11180, 0x11182 }, | ||
210 | { 0x111B3, 0x111C0 }, { 0x111C9, 0x111CC }, { 0x1122C, 0x11237 }, | ||
211 | { 0x1123E, 0x1123E }, { 0x112DF, 0x112EA }, { 0x11300, 0x11303 }, | ||
212 | { 0x1133B, 0x1133C }, { 0x1133E, 0x11344 }, { 0x11347, 0x11348 }, | ||
213 | { 0x1134B, 0x1134D }, { 0x11357, 0x11357 }, { 0x11362, 0x11363 }, | ||
214 | { 0x11435, 0x11446 }, { 0x1145E, 0x1145E }, { 0x114B0, 0x114C3 }, | ||
215 | { 0x115AF, 0x115B5 }, { 0x115B8, 0x115C0 }, { 0x115DC, 0x115DD }, | ||
216 | { 0x11630, 0x11640 }, { 0x116AB, 0x116B7 }, { 0x1171D, 0x1172B }, | ||
217 | { 0x1182C, 0x1183A }, { 0x11930, 0x11935 }, { 0x11937, 0x11938 }, | ||
218 | { 0x1193B, 0x1193E }, { 0x11940, 0x11940 }, { 0x11942, 0x11942 }, | ||
219 | { 0x119D1, 0x119D7 }, { 0x119DA, 0x119E0 }, { 0x11A01, 0x11A0A }, | ||
220 | { 0x11A33, 0x11A39 }, { 0x11A3B, 0x11A3E }, { 0x11A47, 0x11A47 }, | ||
221 | { 0x11A51, 0x11A5B }, { 0x11A8A, 0x11A96 }, { 0x11A98, 0x11A99 }, | ||
222 | { 0x11C30, 0x11C36 }, { 0x11C38, 0x11C3D }, { 0x11C3F, 0x11C3F }, | ||
223 | { 0x11C92, 0x11CA7 }, { 0x11CAA, 0x11CB0 }, { 0x11CB2, 0x11CB3 }, | ||
224 | { 0x11CB5, 0x11CB6 }, { 0x11D31, 0x11D36 }, { 0x11D3A, 0x11D3A }, | ||
225 | { 0x11D3C, 0x11D3D }, { 0x11D3F, 0x11D45 }, { 0x11D47, 0x11D47 }, | ||
226 | { 0x11D90, 0x11D91 }, { 0x11D95, 0x11D95 }, { 0x11D97, 0x11D97 }, | ||
227 | { 0x11EF3, 0x11EF4 }, { 0x13430, 0x13438 }, { 0x16AF0, 0x16AF4 }, | ||
228 | { 0x16B30, 0x16B36 }, { 0x16F4F, 0x16F4F }, { 0x16F8F, 0x16F92 }, | ||
229 | { 0x1BC9D, 0x1BC9E }, { 0x1BCA0, 0x1BCA3 }, { 0x1D167, 0x1D169 }, | ||
230 | { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD }, | ||
231 | { 0x1D242, 0x1D244 }, { 0x1DA00, 0x1DA36 }, { 0x1DA3B, 0x1DA6C }, | ||
232 | { 0x1DA75, 0x1DA75 }, { 0x1DA84, 0x1DA84 }, { 0x1DA9B, 0x1DA9F }, | ||
233 | { 0x1DAA1, 0x1DAAF }, { 0x1E000, 0x1E006 }, { 0x1E008, 0x1E018 }, | ||
234 | { 0x1E01B, 0x1E021 }, { 0x1E023, 0x1E024 }, { 0x1E026, 0x1E02A }, | ||
235 | { 0x1E130, 0x1E136 }, { 0x1E2AE, 0x1E2AE }, { 0x1E2EC, 0x1E2EF }, | ||
236 | { 0x1E4EC, 0x1E4EF }, { 0x1E8D0, 0x1E8D6 }, { 0x1E944, 0x1E94A }, | ||
237 | { 0x1E947, 0x1E94A }, { 0xE0100, 0xE01EF } | ||
238 | }; | ||
239 | |||
240 | /* test for 8-bit control characters */ | ||
241 | if (ucs == 0) | ||
242 | return 0; | ||
243 | if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) | ||
244 | return -1; | ||
245 | |||
246 | /* binary search in table of non-spacing characters */ | ||
247 | if (bisearch(ucs, combining, | ||
248 | sizeof(combining) / sizeof(struct interval) - 1)) | ||
249 | return 0; | ||
250 | |||
251 | /* if we arrive here, ucs is not a combining or C0/C1 control character */ | ||
252 | |||
253 | return 1 + | ||
254 | (ucs >= 0x1100 && | ||
255 | (ucs <= 0x115f || /* Hangul Jamo init. consonants */ | ||
256 | ucs == 0x2329 || ucs == 0x232a || | ||
257 | (ucs >= 0x2e80 && ucs <= 0xa4cf && | ||
258 | ucs != 0x303f) || /* CJK ... Yi */ | ||
259 | (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */ | ||
260 | (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */ | ||
261 | (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */ | ||
262 | (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */ | ||
263 | (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */ | ||
264 | (ucs >= 0xffe0 && ucs <= 0xffe6) || | ||
265 | (ucs >= 0x1f300 && ucs <= 0x1f64f) || /* modified: added Emoticons */ | ||
266 | (ucs >= 0x1f680 && ucs <= 0x1f6ff) || /* modified: added Transport and Map Symbols */ | ||
267 | (ucs >= 0x1f900 && ucs <= 0x1f9ff) || /* modified: added Supplemental Symbols and Pictographs */ | ||
268 | (ucs >= 0x20000 && ucs <= 0x2fffd) || | ||
269 | (ucs >= 0x30000 && ucs <= 0x3fffd))); | ||
270 | } | ||
271 | |||
272 | |||
273 | int mk_wcswidth(const mk_wchar_t *pwcs, size_t n) // modified: use mk_wchar_t | ||
274 | { | ||
275 | int w, width = 0; | ||
276 | |||
277 | for (;*pwcs && n-- > 0; pwcs++) | ||
278 | if ((w = mk_wcwidth(*pwcs)) < 0) | ||
279 | return -1; | ||
280 | else | ||
281 | width += w; | ||
282 | |||
283 | return width; | ||
284 | } | ||
285 | |||
diff --git a/src/wcwidth.h b/src/wcwidth.h new file mode 100644 index 0000000..f2fee11 --- /dev/null +++ b/src/wcwidth.h | |||
@@ -0,0 +1,21 @@ | |||
1 | // wcwidth.h | ||
2 | |||
3 | // Windows does not have a wcwidth function, so we use compatibility code from | ||
4 | // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c by Markus Kuhn | ||
5 | |||
#ifndef MK_WCWIDTH_H
#define MK_WCWIDTH_H

#include <stddef.h>     // size_t, used by mk_wcswidth(); <stdint.h> alone does not provide it

#ifdef _WIN32
#include <stdint.h>
typedef uint32_t mk_wchar_t;    // Windows wchar_t can be 16-bit, we need 32-bit
#else
#include <wchar.h>
typedef wchar_t mk_wchar_t;     // Posix wchar_t is 32-bit so just use that
#endif

// Column width of a single code point: 0 for U+0000 and non-spacing characters,
// 2 for wide characters, 1 otherwise, and -1 for control characters.
int mk_wcwidth(mk_wchar_t ucs);

// Summed column width of at most `n` characters of `pwcs` (stops at a null
// character); -1 if any character has no defined width.
int mk_wcswidth(const mk_wchar_t *pwcs, size_t n);

#endif // MK_WCWIDTH_H