diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2024-05-27 11:29:39 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2024-05-27 11:29:39 -0300 |
commit | 814213b65fa4ab2b1a7216d06f68a6f3df89efcd (patch) | |
tree | 899a187277f8645f9bc0b48ae55be4c31a61ae39 | |
parent | cbdf4969ec425f1df1ade358425c0bf0bf811d83 (diff) | |
download | lua-814213b65fa4ab2b1a7216d06f68a6f3df89efcd.tar.gz lua-814213b65fa4ab2b1a7216d06f68a6f3df89efcd.tar.bz2 lua-814213b65fa4ab2b1a7216d06f68a6f3df89efcd.zip |
utf8.offset returns also final position of character
'utf8.offset' returns two values: the initial and the final position
of the given character.
Diffstat (limited to '')
-rw-r--r-- | lutf8lib.c | 20 | ||||
-rw-r--r-- | manual/manual.of | 22 | ||||
-rw-r--r-- | testes/utf8.lua | 44 |
3 files changed, 55 insertions, 31 deletions
@@ -181,8 +181,8 @@ static int utfchar (lua_State *L) { | |||
181 | 181 | ||
182 | 182 | ||
183 | /* | 183 | /* |
184 | ** offset(s, n, [i]) -> index where n-th character counting from | 184 | ** offset(s, n, [i]) -> indices where n-th character counting from |
185 | ** position 'i' starts; 0 means character at 'i'. | 185 | ** position 'i' starts and ends; 0 means character at 'i'. |
186 | */ | 186 | */ |
187 | static int byteoffset (lua_State *L) { | 187 | static int byteoffset (lua_State *L) { |
188 | size_t len; | 188 | size_t len; |
@@ -217,11 +217,19 @@ static int byteoffset (lua_State *L) { | |||
217 | } | 217 | } |
218 | } | 218 | } |
219 | } | 219 | } |
220 | if (n == 0) /* did it find given character? */ | 220 | if (n != 0) { /* did not find given character? */ |
221 | lua_pushinteger(L, posi + 1); | ||
222 | else /* no such character */ | ||
223 | luaL_pushfail(L); | 221 | luaL_pushfail(L); |
224 | return 1; | 222 | return 1; |
223 | } | ||
224 | lua_pushinteger(L, posi + 1); /* initial position */ | ||
225 | if ((s[posi] & 0x80) != 0) { /* multi-byte character? */ | ||
226 | do { | ||
227 | posi++; | ||
228 | } while (iscontp(s + posi + 1)); /* skip to final byte */ | ||
229 | } | ||
230 | /* else one-byte character: final position is the initial one */ | ||
231 | lua_pushinteger(L, posi + 1); /* 'posi' now is the final position */ | ||
232 | return 2; | ||
225 | } | 233 | } |
226 | 234 | ||
227 | 235 | ||
diff --git a/manual/manual.of b/manual/manual.of index f830b01c..359bd166 100644 --- a/manual/manual.of +++ b/manual/manual.of | |||
@@ -7958,21 +7958,27 @@ returns @fail plus the position of the first invalid byte. | |||
7958 | 7958 | ||
7959 | @LibEntry{utf8.offset (s, n [, i])| | 7959 | @LibEntry{utf8.offset (s, n [, i])| |
7960 | 7960 | ||
7961 | Returns the position (in bytes) where the encoding of the | 7961 | Returns the position of the @id{n}-th character of @id{s} |
7962 | @id{n}-th character of @id{s} | 7962 | (counting from byte position @id{i}) as two integers: |
7963 | (counting from position @id{i}) starts. | 7963 | The index (in bytes) where its encoding starts and the |
7964 | index (in bytes) where it ends. | ||
7965 | |||
7966 | If the specified character is right after the end of @id{s}, | ||
7967 | the function behaves as if there was a @Char{\0} there. | ||
7968 | If the specified character is neither in the subject | ||
7969 | nor right after its end, | ||
7970 | the function returns @fail. | ||
7971 | |||
7964 | A negative @id{n} gets characters before position @id{i}. | 7972 | A negative @id{n} gets characters before position @id{i}. |
7965 | The default for @id{i} is 1 when @id{n} is non-negative | 7973 | The default for @id{i} is 1 when @id{n} is non-negative |
7966 | and @T{#s + 1} otherwise, | 7974 | and @T{#s + 1} otherwise, |
7967 | so that @T{utf8.offset(s, -n)} gets the offset of the | 7975 | so that @T{utf8.offset(s, -n)} gets the offset of the |
7968 | @id{n}-th character from the end of the string. | 7976 | @id{n}-th character from the end of the string. |
7969 | If the specified character is neither in the subject | ||
7970 | nor right after its end, | ||
7971 | the function returns @fail. | ||
7972 | 7977 | ||
7973 | As a special case, | 7978 | As a special case, |
7974 | when @id{n} is 0 the function returns the start of the encoding | 7979 | when @id{n} is 0 the function returns the start and end |
7975 | of the character that contains the @id{i}-th byte of @id{s}. | 7980 | of the encoding of the character that contains the |
7981 | @id{i}-th byte of @id{s}. | ||
7976 | 7982 | ||
7977 | This function assumes that @id{s} is a valid UTF-8 string. | 7983 | This function assumes that @id{s} is a valid UTF-8 string. |
7978 | 7984 | ||
diff --git a/testes/utf8.lua b/testes/utf8.lua index efadbd5c..dc0f2f09 100644 --- a/testes/utf8.lua +++ b/testes/utf8.lua | |||
@@ -52,25 +52,35 @@ local function check (s, t, nonstrict) | |||
52 | for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1' | 52 | for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1' |
53 | 53 | ||
54 | for i = 1, l do -- for all codepoints | 54 | for i = 1, l do -- for all codepoints |
55 | local pi = utf8.offset(s, i) -- position of i-th char | 55 | local pi, pie = utf8.offset(s, i) -- position of i-th char |
56 | local pi1 = utf8.offset(s, 2, pi) -- position of next char | 56 | local pi1 = utf8.offset(s, 2, pi) -- position of next char |
57 | assert(pi1 == pie + 1) | ||
57 | assert(string.find(string.sub(s, pi, pi1 - 1), justone)) | 58 | assert(string.find(string.sub(s, pi, pi1 - 1), justone)) |
58 | assert(utf8.offset(s, -1, pi1) == pi) | 59 | assert(utf8.offset(s, -1, pi1) == pi) |
59 | assert(utf8.offset(s, i - l - 1) == pi) | 60 | assert(utf8.offset(s, i - l - 1) == pi) |
60 | assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict))) | 61 | assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict))) |
61 | for j = pi, pi1 - 1 do | 62 | for j = pi, pi1 - 1 do |
62 | assert(utf8.offset(s, 0, j) == pi) | 63 | local off1, off2 = utf8.offset(s, 0, j) |
64 | assert(off1 == pi and off2 == pi1 - 1) | ||
63 | end | 65 | end |
64 | for j = pi + 1, pi1 - 1 do | 66 | for j = pi + 1, pi1 - 1 do |
65 | assert(not utf8.len(s, j)) | 67 | assert(not utf8.len(s, j)) |
66 | end | 68 | end |
67 | assert(utf8.len(s, pi, pi, nonstrict) == 1) | 69 | assert(utf8.len(s, pi, pi, nonstrict) == 1) |
68 | assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1) | 70 | assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1) |
69 | assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1) | 71 | assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1) |
70 | assert(utf8.len(s, pi1, -1, nonstrict) == l - i) | 72 | assert(utf8.len(s, pi1, -1, nonstrict) == l - i) |
71 | assert(utf8.len(s, 1, pi, nonstrict) == i) | 73 | assert(utf8.len(s, 1, pi, nonstrict) == i) |
72 | end | 74 | end |
73 | 75 | ||
76 | local expected = 1 -- expected position of "current" character | ||
77 | for i = 1, l + 1 do | ||
78 | local p, e = utf8.offset(s, i) | ||
79 | assert(p == expected) | ||
80 | expected = e + 1 | ||
81 | end | ||
82 | assert(expected - 1 == #s + 1) | ||
83 | |||
74 | local i = 0 | 84 | local i = 0 |
75 | for p, c in utf8.codes(s, nonstrict) do | 85 | for p, c in utf8.codes(s, nonstrict) do |
76 | i = i + 1 | 86 | i = i + 1 |
@@ -94,20 +104,20 @@ end | |||
94 | 104 | ||
95 | 105 | ||
96 | do -- error indication in utf8.len | 106 | do -- error indication in utf8.len |
97 | local function check (s, p) | 107 | local function checklen (s, p) |
98 | local a, b = utf8.len(s) | 108 | local a, b = utf8.len(s) |
99 | assert(not a and b == p) | 109 | assert(not a and b == p) |
100 | end | 110 | end |
101 | check("abc\xE3def", 4) | 111 | checklen("abc\xE3def", 4) |
102 | check("\xF4\x9F\xBF", 1) | 112 | checklen("\xF4\x9F\xBF", 1) |
103 | check("\xF4\x9F\xBF\xBF", 1) | 113 | checklen("\xF4\x9F\xBF\xBF", 1) |
104 | -- spurious continuation bytes | 114 | -- spurious continuation bytes |
105 | check("汉字\x80", #("汉字") + 1) | 115 | checklen("汉字\x80", #("汉字") + 1) |
106 | check("\x80hello", 1) | 116 | checklen("\x80hello", 1) |
107 | check("hel\x80lo", 4) | 117 | checklen("hel\x80lo", 4) |
108 | check("汉字\xBF", #("汉字") + 1) | 118 | checklen("汉字\xBF", #("汉字") + 1) |
109 | check("\xBFhello", 1) | 119 | checklen("\xBFhello", 1) |
110 | check("hel\xBFlo", 4) | 120 | checklen("hel\xBFlo", 4) |
111 | end | 121 | end |
112 | 122 | ||
113 | -- errors in utf8.codes | 123 | -- errors in utf8.codes |