utf8.offset also returns the final position of the character
'utf8.offset' returns two values: the initial and the final position of the given character.
This commit is contained in:
20
lutf8lib.c
20
lutf8lib.c
@@ -181,8 +181,8 @@ static int utfchar (lua_State *L) {
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
** offset(s, n, [i]) -> index where n-th character counting from
|
** offset(s, n, [i]) -> indices where n-th character counting from
|
||||||
** position 'i' starts; 0 means character at 'i'.
|
** position 'i' starts and ends; 0 means character at 'i'.
|
||||||
*/
|
*/
|
||||||
static int byteoffset (lua_State *L) {
|
static int byteoffset (lua_State *L) {
|
||||||
size_t len;
|
size_t len;
|
||||||
@@ -217,11 +217,19 @@ static int byteoffset (lua_State *L) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (n == 0) /* did it find given character? */
|
if (n != 0) { /* did not find given character? */
|
||||||
lua_pushinteger(L, posi + 1);
|
|
||||||
else /* no such character */
|
|
||||||
luaL_pushfail(L);
|
luaL_pushfail(L);
|
||||||
return 1;
|
return 1;
|
||||||
|
}
|
||||||
|
lua_pushinteger(L, posi + 1); /* initial position */
|
||||||
|
if ((s[posi] & 0x80) != 0) { /* multi-byte character? */
|
||||||
|
do {
|
||||||
|
posi++;
|
||||||
|
} while (iscontp(s + posi + 1)); /* skip to final byte */
|
||||||
|
}
|
||||||
|
/* else one-byte character: final position is the initial one */
|
||||||
|
lua_pushinteger(L, posi + 1); /* 'posi' now is the final position */
|
||||||
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -7958,21 +7958,27 @@ returns @fail plus the position of the first invalid byte.
|
|||||||
|
|
||||||
@LibEntry{utf8.offset (s, n [, i])|
|
@LibEntry{utf8.offset (s, n [, i])|
|
||||||
|
|
||||||
Returns the position (in bytes) where the encoding of the
|
Returns the position of the @id{n}-th character of @id{s}
|
||||||
@id{n}-th character of @id{s}
|
(counting from byte position @id{i}) as two integers:
|
||||||
(counting from position @id{i}) starts.
|
the index (in bytes) where its encoding starts and the
|
||||||
|
index (in bytes) where it ends.
|
||||||
|
|
||||||
|
If the specified character is right after the end of @id{s},
|
||||||
|
the function behaves as if there were a @Char{\0} there.
|
||||||
|
If the specified character is neither in the subject
|
||||||
|
nor right after its end,
|
||||||
|
the function returns @fail.
|
||||||
|
|
||||||
A negative @id{n} gets characters before position @id{i}.
|
A negative @id{n} gets characters before position @id{i}.
|
||||||
The default for @id{i} is 1 when @id{n} is non-negative
|
The default for @id{i} is 1 when @id{n} is non-negative
|
||||||
and @T{#s + 1} otherwise,
|
and @T{#s + 1} otherwise,
|
||||||
so that @T{utf8.offset(s, -n)} gets the offset of the
|
so that @T{utf8.offset(s, -n)} gets the offset of the
|
||||||
@id{n}-th character from the end of the string.
|
@id{n}-th character from the end of the string.
|
||||||
If the specified character is neither in the subject
|
|
||||||
nor right after its end,
|
|
||||||
the function returns @fail.
|
|
||||||
|
|
||||||
As a special case,
|
As a special case,
|
||||||
when @id{n} is 0 the function returns the start of the encoding
|
when @id{n} is 0 the function returns the start and end
|
||||||
of the character that contains the @id{i}-th byte of @id{s}.
|
of the encoding of the character that contains the
|
||||||
|
@id{i}-th byte of @id{s}.
|
||||||
|
|
||||||
This function assumes that @id{s} is a valid UTF-8 string.
|
This function assumes that @id{s} is a valid UTF-8 string.
|
||||||
|
|
||||||
|
|||||||
@@ -52,25 +52,35 @@ local function check (s, t, nonstrict)
|
|||||||
for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1'
|
for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1'
|
||||||
|
|
||||||
for i = 1, l do -- for all codepoints
|
for i = 1, l do -- for all codepoints
|
||||||
local pi = utf8.offset(s, i) -- position of i-th char
|
local pi, pie = utf8.offset(s, i) -- position of i-th char
|
||||||
local pi1 = utf8.offset(s, 2, pi) -- position of next char
|
local pi1 = utf8.offset(s, 2, pi) -- position of next char
|
||||||
|
assert(pi1 == pie + 1)
|
||||||
assert(string.find(string.sub(s, pi, pi1 - 1), justone))
|
assert(string.find(string.sub(s, pi, pi1 - 1), justone))
|
||||||
assert(utf8.offset(s, -1, pi1) == pi)
|
assert(utf8.offset(s, -1, pi1) == pi)
|
||||||
assert(utf8.offset(s, i - l - 1) == pi)
|
assert(utf8.offset(s, i - l - 1) == pi)
|
||||||
assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
|
assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
|
||||||
for j = pi, pi1 - 1 do
|
for j = pi, pi1 - 1 do
|
||||||
assert(utf8.offset(s, 0, j) == pi)
|
local off1, off2 = utf8.offset(s, 0, j)
|
||||||
|
assert(off1 == pi and off2 == pi1 - 1)
|
||||||
end
|
end
|
||||||
for j = pi + 1, pi1 - 1 do
|
for j = pi + 1, pi1 - 1 do
|
||||||
assert(not utf8.len(s, j))
|
assert(not utf8.len(s, j))
|
||||||
end
|
end
|
||||||
assert(utf8.len(s, pi, pi, nonstrict) == 1)
|
assert(utf8.len(s, pi, pi, nonstrict) == 1)
|
||||||
assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
|
assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
|
||||||
assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
|
assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
|
||||||
assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
|
assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
|
||||||
assert(utf8.len(s, 1, pi, nonstrict) == i)
|
assert(utf8.len(s, 1, pi, nonstrict) == i)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
local expected = 1 -- expected position of "current" character
|
||||||
|
for i = 1, l + 1 do
|
||||||
|
local p, e = utf8.offset(s, i)
|
||||||
|
assert(p == expected)
|
||||||
|
expected = e + 1
|
||||||
|
end
|
||||||
|
assert(expected - 1 == #s + 1)
|
||||||
|
|
||||||
local i = 0
|
local i = 0
|
||||||
for p, c in utf8.codes(s, nonstrict) do
|
for p, c in utf8.codes(s, nonstrict) do
|
||||||
i = i + 1
|
i = i + 1
|
||||||
@@ -94,20 +104,20 @@ end
|
|||||||
|
|
||||||
|
|
||||||
do -- error indication in utf8.len
|
do -- error indication in utf8.len
|
||||||
local function check (s, p)
|
local function checklen (s, p)
|
||||||
local a, b = utf8.len(s)
|
local a, b = utf8.len(s)
|
||||||
assert(not a and b == p)
|
assert(not a and b == p)
|
||||||
end
|
end
|
||||||
check("abc\xE3def", 4)
|
checklen("abc\xE3def", 4)
|
||||||
check("\xF4\x9F\xBF", 1)
|
checklen("\xF4\x9F\xBF", 1)
|
||||||
check("\xF4\x9F\xBF\xBF", 1)
|
checklen("\xF4\x9F\xBF\xBF", 1)
|
||||||
-- spurious continuation bytes
|
-- spurious continuation bytes
|
||||||
check("汉字\x80", #("汉字") + 1)
|
checklen("汉字\x80", #("汉字") + 1)
|
||||||
check("\x80hello", 1)
|
checklen("\x80hello", 1)
|
||||||
check("hel\x80lo", 4)
|
checklen("hel\x80lo", 4)
|
||||||
check("汉字\xBF", #("汉字") + 1)
|
checklen("汉字\xBF", #("汉字") + 1)
|
||||||
check("\xBFhello", 1)
|
checklen("\xBFhello", 1)
|
||||||
check("hel\xBFlo", 4)
|
checklen("hel\xBFlo", 4)
|
||||||
end
|
end
|
||||||
|
|
||||||
-- errors in utf8.codes
|
-- errors in utf8.codes
|
||||||
|
|||||||
Reference in New Issue
Block a user