From ccb8b307f11c7497e61f617b12f3a7f0a697256c Mon Sep 17 00:00:00 2001 From: Roberto Ierusalimschy Date: Fri, 18 Jul 2025 16:10:28 -0300 Subject: Correction in utf8.offset Wrong utf-8 character may have no continuation bytes. --- lutf8lib.c | 7 ++++--- testes/utf8.lua | 9 +++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lutf8lib.c b/lutf8lib.c index be016135..df49c901 100644 --- a/lutf8lib.c +++ b/lutf8lib.c @@ -215,9 +215,10 @@ static int byteoffset (lua_State *L) { } lua_pushinteger(L, posi + 1); /* initial position */ if ((s[posi] & 0x80) != 0) { /* multi-byte character? */ - do { - posi++; - } while (iscontp(s + posi + 1)); /* skip to final byte */ + if (iscont(s[posi])) + return luaL_error(L, "initial position is a continuation byte"); + while (iscontp(s + posi + 1)) + posi++; /* skip to last continuation byte */ } /* else one-byte character: final position is the initial one */ lua_pushinteger(L, posi + 1); /* 'posi' now is the final position */ diff --git a/testes/utf8.lua b/testes/utf8.lua index 143c6d34..028995a4 100644 --- a/testes/utf8.lua +++ b/testes/utf8.lua @@ -152,11 +152,20 @@ checkerror("position out of bounds", utf8.offset, "", 1, -1) checkerror("continuation byte", utf8.offset, "𦧺", 1, 2) checkerror("continuation byte", utf8.offset, "𦧺", 1, 2) checkerror("continuation byte", utf8.offset, "\x80", 1) +checkerror("continuation byte", utf8.offset, "\x9c", -1) -- error in indices for len checkerror("out of bounds", utf8.len, "abc", 0, 2) checkerror("out of bounds", utf8.len, "abc", 1, 4) +do -- missing continuation bytes + -- get what is available + local p, e = utf8.offset("\xE0", 1) + assert(p == 1 and e == 1) + local p, e = utf8.offset("\xE0\x9e", -1) + assert(p == 1 and e == 2) +end + local s = "hello World" local t = {string.byte(s, 1, -1)} -- cgit v1.2.3-55-g6feb