aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoberto Ierusalimschy <roberto@inf.puc-rio.br>2024-05-27 11:29:39 -0300
committerRoberto Ierusalimschy <roberto@inf.puc-rio.br>2024-05-27 11:29:39 -0300
commit814213b65fa4ab2b1a7216d06f68a6f3df89efcd (patch)
tree899a187277f8645f9bc0b48ae55be4c31a61ae39
parentcbdf4969ec425f1df1ade358425c0bf0bf811d83 (diff)
downloadlua-814213b65fa4ab2b1a7216d06f68a6f3df89efcd.tar.gz
lua-814213b65fa4ab2b1a7216d06f68a6f3df89efcd.tar.bz2
lua-814213b65fa4ab2b1a7216d06f68a6f3df89efcd.zip
utf8.offset returns also final position of character
'utf8.offset' returns two values: the initial and the final position of the given character.
Diffstat (limited to '')
-rw-r--r--lutf8lib.c20
-rw-r--r--manual/manual.of22
-rw-r--r--testes/utf8.lua44
3 files changed, 55 insertions, 31 deletions
diff --git a/lutf8lib.c b/lutf8lib.c
index 3a5b9bc3..7b747937 100644
--- a/lutf8lib.c
+++ b/lutf8lib.c
@@ -181,8 +181,8 @@ static int utfchar (lua_State *L) {
181 181
182 182
183/* 183/*
184** offset(s, n, [i]) -> index where n-th character counting from 184** offset(s, n, [i]) -> indices where n-th character counting from
185** position 'i' starts; 0 means character at 'i'. 185** position 'i' starts and ends; 0 means character at 'i'.
186*/ 186*/
187static int byteoffset (lua_State *L) { 187static int byteoffset (lua_State *L) {
188 size_t len; 188 size_t len;
@@ -217,11 +217,19 @@ static int byteoffset (lua_State *L) {
217 } 217 }
218 } 218 }
219 } 219 }
220 if (n == 0) /* did it find given character? */ 220 if (n != 0) { /* did not find given character? */
221 lua_pushinteger(L, posi + 1);
222 else /* no such character */
223 luaL_pushfail(L); 221 luaL_pushfail(L);
224 return 1; 222 return 1;
223 }
224 lua_pushinteger(L, posi + 1); /* initial position */
225 if ((s[posi] & 0x80) != 0) { /* multi-byte character? */
226 do {
227 posi++;
228 } while (iscontp(s + posi + 1)); /* skip to final byte */
229 }
230 /* else one-byte character: final position is the initial one */
231 lua_pushinteger(L, posi + 1); /* 'posi' now is the final position */
232 return 2;
225} 233}
226 234
227 235
diff --git a/manual/manual.of b/manual/manual.of
index f830b01c..359bd166 100644
--- a/manual/manual.of
+++ b/manual/manual.of
@@ -7958,21 +7958,27 @@ returns @fail plus the position of the first invalid byte.
7958 7958
7959@LibEntry{utf8.offset (s, n [, i])| 7959@LibEntry{utf8.offset (s, n [, i])|
7960 7960
7961Returns the position (in bytes) where the encoding of the 7961Returns the position of the @id{n}-th character of @id{s}
7962@id{n}-th character of @id{s} 7962(counting from byte position @id{i}) as two integers:
7963(counting from position @id{i}) starts. 7963the index (in bytes) where its encoding starts and the
7964index (in bytes) where it ends.
7965
7966If the specified character is right after the end of @id{s},
7967the function behaves as if there was a @Char{\0} there.
7968If the specified character is neither in the subject
7969nor right after its end,
7970the function returns @fail.
7971
7964A negative @id{n} gets characters before position @id{i}. 7972A negative @id{n} gets characters before position @id{i}.
7965The default for @id{i} is 1 when @id{n} is non-negative 7973The default for @id{i} is 1 when @id{n} is non-negative
7966and @T{#s + 1} otherwise, 7974and @T{#s + 1} otherwise,
7967so that @T{utf8.offset(s, -n)} gets the offset of the 7975so that @T{utf8.offset(s, -n)} gets the offset of the
7968@id{n}-th character from the end of the string. 7976@id{n}-th character from the end of the string.
7969If the specified character is neither in the subject
7970nor right after its end,
7971the function returns @fail.
7972 7977
7973As a special case, 7978As a special case,
7974when @id{n} is 0 the function returns the start of the encoding 7979when @id{n} is 0 the function returns the start and end
7975of the character that contains the @id{i}-th byte of @id{s}. 7980of the encoding of the character that contains the
7981@id{i}-th byte of @id{s}.
7976 7982
7977This function assumes that @id{s} is a valid UTF-8 string. 7983This function assumes that @id{s} is a valid UTF-8 string.
7978 7984
diff --git a/testes/utf8.lua b/testes/utf8.lua
index efadbd5c..dc0f2f09 100644
--- a/testes/utf8.lua
+++ b/testes/utf8.lua
@@ -52,25 +52,35 @@ local function check (s, t, nonstrict)
52 for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1' 52 for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1'
53 53
54 for i = 1, l do -- for all codepoints 54 for i = 1, l do -- for all codepoints
55 local pi = utf8.offset(s, i) -- position of i-th char 55 local pi, pie = utf8.offset(s, i) -- position of i-th char
56 local pi1 = utf8.offset(s, 2, pi) -- position of next char 56 local pi1 = utf8.offset(s, 2, pi) -- position of next char
57 assert(pi1 == pie + 1)
57 assert(string.find(string.sub(s, pi, pi1 - 1), justone)) 58 assert(string.find(string.sub(s, pi, pi1 - 1), justone))
58 assert(utf8.offset(s, -1, pi1) == pi) 59 assert(utf8.offset(s, -1, pi1) == pi)
59 assert(utf8.offset(s, i - l - 1) == pi) 60 assert(utf8.offset(s, i - l - 1) == pi)
60 assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict))) 61 assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
61 for j = pi, pi1 - 1 do 62 for j = pi, pi1 - 1 do
62 assert(utf8.offset(s, 0, j) == pi) 63 local off1, off2 = utf8.offset(s, 0, j)
64 assert(off1 == pi and off2 == pi1 - 1)
63 end 65 end
64 for j = pi + 1, pi1 - 1 do 66 for j = pi + 1, pi1 - 1 do
65 assert(not utf8.len(s, j)) 67 assert(not utf8.len(s, j))
66 end 68 end
67 assert(utf8.len(s, pi, pi, nonstrict) == 1) 69 assert(utf8.len(s, pi, pi, nonstrict) == 1)
68 assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1) 70 assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
69 assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1) 71 assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
70 assert(utf8.len(s, pi1, -1, nonstrict) == l - i) 72 assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
71 assert(utf8.len(s, 1, pi, nonstrict) == i) 73 assert(utf8.len(s, 1, pi, nonstrict) == i)
72 end 74 end
73 75
76 local expected = 1 -- expected position of "current" character
77 for i = 1, l + 1 do
78 local p, e = utf8.offset(s, i)
79 assert(p == expected)
80 expected = e + 1
81 end
82 assert(expected - 1 == #s + 1)
83
74 local i = 0 84 local i = 0
75 for p, c in utf8.codes(s, nonstrict) do 85 for p, c in utf8.codes(s, nonstrict) do
76 i = i + 1 86 i = i + 1
@@ -94,20 +104,20 @@ end
94 104
95 105
96do -- error indication in utf8.len 106do -- error indication in utf8.len
97 local function check (s, p) 107 local function checklen (s, p)
98 local a, b = utf8.len(s) 108 local a, b = utf8.len(s)
99 assert(not a and b == p) 109 assert(not a and b == p)
100 end 110 end
101 check("abc\xE3def", 4) 111 checklen("abc\xE3def", 4)
102 check("\xF4\x9F\xBF", 1) 112 checklen("\xF4\x9F\xBF", 1)
103 check("\xF4\x9F\xBF\xBF", 1) 113 checklen("\xF4\x9F\xBF\xBF", 1)
104 -- spurious continuation bytes 114 -- spurious continuation bytes
105 check("汉字\x80", #("汉字") + 1) 115 checklen("汉字\x80", #("汉字") + 1)
106 check("\x80hello", 1) 116 checklen("\x80hello", 1)
107 check("hel\x80lo", 4) 117 checklen("hel\x80lo", 4)
108 check("汉字\xBF", #("汉字") + 1) 118 checklen("汉字\xBF", #("汉字") + 1)
109 check("\xBFhello", 1) 119 checklen("\xBFhello", 1)
110 check("hel\xBFlo", 4) 120 checklen("hel\xBFlo", 4)
111end 121end
112 122
113-- errors in utf8.codes 123-- errors in utf8.codes