Changes in the validation of UTF-8

All UTF-8 encoding functionality (including the escape
sequence '\u') accepts all values from the original UTF-8
specification (with sequences of up to six bytes).

By default, the decoding functions in the UTF-8 library do not
accept invalid Unicode code points, such as surrogates. A new
parameter 'nonstrict' makes them accept all code points up to
(2^31)-1, as in the original UTF-8 specification.
This commit is contained in:
Roberto Ierusalimschy
2019-03-15 13:14:17 -03:00
parent 8fa4f1380b
commit 1e0c73d5b6
6 changed files with 164 additions and 72 deletions

2
llex.c
View File

@@ -335,7 +335,7 @@ static unsigned long readutf8esc (LexState *ls) {
while ((save_and_next(ls), lisxdigit(ls->current))) { while ((save_and_next(ls), lisxdigit(ls->current))) {
i++; i++;
r = (r << 4) + luaO_hexavalue(ls->current); r = (r << 4) + luaO_hexavalue(ls->current);
esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large"); esccheck(ls, r <= 0x7FFFFFFFu, "UTF-8 value too large");
} }
esccheck(ls, ls->current == '}', "missing '}'"); esccheck(ls, ls->current == '}', "missing '}'");
next(ls); /* skip '}' */ next(ls); /* skip '}' */

View File

@@ -343,7 +343,7 @@ size_t luaO_str2num (const char *s, TValue *o) {
int luaO_utf8esc (char *buff, unsigned long x) { int luaO_utf8esc (char *buff, unsigned long x) {
int n = 1; /* number of bytes put in buffer (backwards) */ int n = 1; /* number of bytes put in buffer (backwards) */
lua_assert(x <= 0x10FFFF); lua_assert(x <= 0x7FFFFFFFu);
if (x < 0x80) /* ascii? */ if (x < 0x80) /* ascii? */
buff[UTF8BUFFSZ - 1] = cast_char(x); buff[UTF8BUFFSZ - 1] = cast_char(x);
else { /* need continuation bytes */ else { /* need continuation bytes */
@@ -435,9 +435,9 @@ const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
pushstr(L, buff, l); pushstr(L, buff, l);
break; break;
} }
case 'U': { /* an 'int' as a UTF-8 sequence */ case 'U': { /* a 'long' as a UTF-8 sequence */
char buff[UTF8BUFFSZ]; char buff[UTF8BUFFSZ];
int l = luaO_utf8esc(buff, cast(long, va_arg(argp, long))); int l = luaO_utf8esc(buff, va_arg(argp, long));
pushstr(L, buff + UTF8BUFFSZ - l, l); pushstr(L, buff + UTF8BUFFSZ - l, l);
break; break;
} }

View File

@@ -21,12 +21,14 @@
#include "lualib.h" #include "lualib.h"
#define MAXUNICODE 0x10FFFF #define MAXUNICODE 0x10FFFFu
#define MAXUTF 0x7FFFFFFFu
/* /*
** Integer type for decoded UTF-8 values; MAXUNICODE needs 21 bits. ** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
*/ */
#if LUAI_BITSINT >= 21 #if LUAI_BITSINT >= 31
typedef unsigned int utfint; typedef unsigned int utfint;
#else #else
typedef unsigned long utfint; typedef unsigned long utfint;
@@ -46,38 +48,46 @@ static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
/* /*
** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. ** Decode one UTF-8 sequence, returning NULL if byte sequence is
** invalid. The array 'limits' stores the minimum value for each
** sequence length, to check for overlong representations. Its first
** entry forces an error for non-ascii bytes with no continuation
** bytes (count == 0).
*/ */
static const char *utf8_decode (const char *o, utfint *val) { static const char *utf8_decode (const char *s, utfint *val, int strict) {
static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF}; static const utfint limits[] =
const unsigned char *s = (const unsigned char *)o; {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
unsigned int c = s[0]; unsigned int c = (unsigned char)s[0];
utfint res = 0; /* final result */ utfint res = 0; /* final result */
if (c < 0x80) /* ascii? */ if (c < 0x80) /* ascii? */
res = c; res = c;
else { else {
int count = 0; /* to count number of continuation bytes */ int count = 0; /* to count number of continuation bytes */
while (c & 0x40) { /* still have continuation bytes? */ for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */
int cc = s[++count]; /* read next byte */ unsigned int cc = (unsigned char)s[++count]; /* read next byte */
if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
return NULL; /* invalid byte sequence */ return NULL; /* invalid byte sequence */
res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
c <<= 1; /* to test next bit */
} }
res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */
if (count > 3 || res > MAXUNICODE || res <= limits[count]) if (count > 5 || res > MAXUTF || res < limits[count])
return NULL; /* invalid byte sequence */ return NULL; /* invalid byte sequence */
s += count; /* skip continuation bytes read */ s += count; /* skip continuation bytes read */
} }
if (strict) {
/* check for invalid code points; too large or surrogates */
if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu))
return NULL;
}
if (val) *val = res; if (val) *val = res;
return (const char *)s + 1; /* +1 to include first byte */ return s + 1; /* +1 to include first byte */
} }
/* /*
** utf8len(s [, i [, j]]) --> number of characters that start in the ** utf8len(s [, i [, j [, nonstrict]]]) --> number of characters that
** range [i,j], or nil + current position if 's' is not well formed in ** start in the range [i,j], or nil + current position if 's' is not
** that interval ** well formed in that interval
*/ */
static int utflen (lua_State *L) { static int utflen (lua_State *L) {
lua_Integer n = 0; /* counter for the number of characters */ lua_Integer n = 0; /* counter for the number of characters */
@@ -85,12 +95,13 @@ static int utflen (lua_State *L) {
const char *s = luaL_checklstring(L, 1, &len); const char *s = luaL_checklstring(L, 1, &len);
lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len); lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
int nonstrict = lua_toboolean(L, 4);
luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
"initial position out of string"); "initial position out of string");
luaL_argcheck(L, --posj < (lua_Integer)len, 3, luaL_argcheck(L, --posj < (lua_Integer)len, 3,
"final position out of string"); "final position out of string");
while (posi <= posj) { while (posi <= posj) {
const char *s1 = utf8_decode(s + posi, NULL); const char *s1 = utf8_decode(s + posi, NULL, !nonstrict);
if (s1 == NULL) { /* conversion error? */ if (s1 == NULL) { /* conversion error? */
lua_pushnil(L); /* return nil ... */ lua_pushnil(L); /* return nil ... */
lua_pushinteger(L, posi + 1); /* ... and current position */ lua_pushinteger(L, posi + 1); /* ... and current position */
@@ -105,14 +116,15 @@ static int utflen (lua_State *L) {
/* /*
** codepoint(s, [i, [j]]) -> returns codepoints for all characters ** codepoint(s, [i, [j [, nonstrict]]]) -> returns codepoints for all
** that start in the range [i,j] ** characters that start in the range [i,j]
*/ */
static int codepoint (lua_State *L) { static int codepoint (lua_State *L) {
size_t len; size_t len;
const char *s = luaL_checklstring(L, 1, &len); const char *s = luaL_checklstring(L, 1, &len);
lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len); lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
int nonstrict = lua_toboolean(L, 4);
int n; int n;
const char *se; const char *se;
luaL_argcheck(L, posi >= 1, 2, "out of range"); luaL_argcheck(L, posi >= 1, 2, "out of range");
@@ -126,7 +138,7 @@ static int codepoint (lua_State *L) {
se = s + pose; /* string end */ se = s + pose; /* string end */
for (s += posi - 1; s < se;) { for (s += posi - 1; s < se;) {
utfint code; utfint code;
s = utf8_decode(s, &code); s = utf8_decode(s, &code, !nonstrict);
if (s == NULL) if (s == NULL)
return luaL_error(L, "invalid UTF-8 code"); return luaL_error(L, "invalid UTF-8 code");
lua_pushinteger(L, code); lua_pushinteger(L, code);
@@ -137,8 +149,8 @@ static int codepoint (lua_State *L) {
static void pushutfchar (lua_State *L, int arg) { static void pushutfchar (lua_State *L, int arg) {
lua_Integer code = luaL_checkinteger(L, arg); lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range"); luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
lua_pushfstring(L, "%U", (long)code); lua_pushfstring(L, "%U", (long)code);
} }
@@ -209,7 +221,7 @@ static int byteoffset (lua_State *L) {
} }
static int iter_aux (lua_State *L) { static int iter_aux (lua_State *L, int strict) {
size_t len; size_t len;
const char *s = luaL_checklstring(L, 1, &len); const char *s = luaL_checklstring(L, 1, &len);
lua_Integer n = lua_tointeger(L, 2) - 1; lua_Integer n = lua_tointeger(L, 2) - 1;
@@ -223,8 +235,8 @@ static int iter_aux (lua_State *L) {
return 0; /* no more codepoints */ return 0; /* no more codepoints */
else { else {
utfint code; utfint code;
const char *next = utf8_decode(s + n, &code); const char *next = utf8_decode(s + n, &code, strict);
if (next == NULL || iscont(next)) if (next == NULL)
return luaL_error(L, "invalid UTF-8 code"); return luaL_error(L, "invalid UTF-8 code");
lua_pushinteger(L, n + 1); lua_pushinteger(L, n + 1);
lua_pushinteger(L, code); lua_pushinteger(L, code);
@@ -233,9 +245,19 @@ static int iter_aux (lua_State *L) {
} }
/*
** Strict variant of the iteration step: forwards to 'iter_aux' with
** 'strict' set to 1. 'iter_codes' pushes this function when its
** 'nonstrict' argument is false.
*/
static int iter_auxstrict (lua_State *L) {
return iter_aux(L, 1);
}
/*
** Non-strict variant of the iteration step: forwards to 'iter_aux'
** with 'strict' set to 0. 'iter_codes' pushes this function when its
** 'nonstrict' argument is true.
*/
static int iter_auxnostrict (lua_State *L) {
return iter_aux(L, 0);
}
static int iter_codes (lua_State *L) { static int iter_codes (lua_State *L) {
int nonstrict = lua_toboolean(L, 2);
luaL_checkstring(L, 1); luaL_checkstring(L, 1);
lua_pushcfunction(L, iter_aux); lua_pushcfunction(L, nonstrict ? iter_auxnostrict : iter_auxstrict);
lua_pushvalue(L, 1); lua_pushvalue(L, 1);
lua_pushinteger(L, 0); lua_pushinteger(L, 0);
return 3; return 3;
@@ -243,7 +265,7 @@ static int iter_codes (lua_State *L) {
/* pattern to match a single UTF-8 character */ /* pattern to match a single UTF-8 character */
#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*" #define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
static const luaL_Reg funcs[] = { static const luaL_Reg funcs[] = {

View File

@@ -1004,6 +1004,8 @@ the escape sequence @T{\u{@rep{XXX}}}
(note the mandatory enclosing brackets), (note the mandatory enclosing brackets),
where @rep{XXX} is a sequence of one or more hexadecimal digits where @rep{XXX} is a sequence of one or more hexadecimal digits
representing the character code point. representing the character code point.
This code point can be any value smaller than @M{2@sp{31}}.
(Lua uses the original UTF-8 specification here.)
Literal strings can also be defined using a long format Literal strings can also be defined using a long format
enclosed by @def{long brackets}. enclosed by @def{long brackets}.
@@ -6899,6 +6901,7 @@ x = string.gsub("$name-$version.tar.gz", "%$(%w+)", t)
} }
@LibEntry{string.len (s)| @LibEntry{string.len (s)|
Receives a string and returns its length. Receives a string and returns its length.
The empty string @T{""} has length 0. The empty string @T{""} has length 0.
Embedded zeros are counted, Embedded zeros are counted,
@@ -6907,6 +6910,7 @@ so @T{"a\000bc\000"} has length 5.
} }
@LibEntry{string.lower (s)| @LibEntry{string.lower (s)|
Receives a string and returns a copy of this string with all Receives a string and returns a copy of this string with all
uppercase letters changed to lowercase. uppercase letters changed to lowercase.
All other characters are left unchanged. All other characters are left unchanged.
@@ -6915,6 +6919,7 @@ The definition of what an uppercase letter is depends on the current locale.
} }
@LibEntry{string.match (s, pattern [, init])| @LibEntry{string.match (s, pattern [, init])|
Looks for the first @emph{match} of Looks for the first @emph{match} of
@id{pattern} @see{pm} in the string @id{s}. @id{pattern} @see{pm} in the string @id{s}.
If it finds one, then @id{match} returns If it finds one, then @id{match} returns
@@ -6946,6 +6951,7 @@ The format string cannot have the variable-length options
} }
@LibEntry{string.rep (s, n [, sep])| @LibEntry{string.rep (s, n [, sep])|
Returns a string that is the concatenation of @id{n} copies of Returns a string that is the concatenation of @id{n} copies of
the string @id{s} separated by the string @id{sep}. the string @id{s} separated by the string @id{sep}.
The default value for @id{sep} is the empty string The default value for @id{sep} is the empty string
@@ -6958,11 +6964,13 @@ with a single call to this function.)
} }
@LibEntry{string.reverse (s)| @LibEntry{string.reverse (s)|
Returns a string that is the string @id{s} reversed. Returns a string that is the string @id{s} reversed.
} }
@LibEntry{string.sub (s, i [, j])| @LibEntry{string.sub (s, i [, j])|
Returns the substring of @id{s} that Returns the substring of @id{s} that
starts at @id{i} and continues until @id{j}; starts at @id{i} and continues until @id{j};
@id{i} and @id{j} can be negative. @id{i} and @id{j} can be negative.
@@ -6998,6 +7006,7 @@ this function also returns the index of the first unread byte in @id{s}.
} }
@LibEntry{string.upper (s)| @LibEntry{string.upper (s)|
Receives a string and returns a copy of this string with all Receives a string and returns a copy of this string with all
lowercase letters changed to uppercase. lowercase letters changed to uppercase.
All other characters are left unchanged. All other characters are left unchanged.
@@ -7318,8 +7327,24 @@ or one plus the length of the subject string.
As in the string library, As in the string library,
negative indices count from the end of the string. negative indices count from the end of the string.
Functions that create byte sequences
accept all values up to @T{0x7FFFFFFF},
as defined in the original UTF-8 specification;
that implies byte sequences of up to six bytes.
Functions that interpret byte sequences only accept
valid sequences (well formed and not overlong).
By default, they only accept byte sequences
that result in valid Unicode code points,
rejecting values larger than @T{0x10FFFF} and surrogates.
A boolean argument @id{nonstrict}, when available,
lifts these checks,
so that all values up to @T{0x7FFFFFFF} are accepted.
(Sequences that are not well formed or that are overlong are still rejected.)
@LibEntry{utf8.char (@Cdots)| @LibEntry{utf8.char (@Cdots)|
Receives zero or more integers, Receives zero or more integers,
converts each one to its corresponding UTF-8 byte sequence converts each one to its corresponding UTF-8 byte sequence
and returns a string with the concatenation of all these sequences. and returns a string with the concatenation of all these sequences.
@@ -7327,14 +7352,15 @@ and returns a string with the concatenation of all these sequences.
} }
@LibEntry{utf8.charpattern| @LibEntry{utf8.charpattern|
The pattern (a string, not a function) @St{[\0-\x7F\xC2-\xF4][\x80-\xBF]*}
The pattern (a string, not a function) @St{[\0-\x7F\xC2-\xFD][\x80-\xBF]*}
@see{pm}, @see{pm},
which matches exactly one UTF-8 byte sequence, which matches exactly one UTF-8 byte sequence,
assuming that the subject is a valid UTF-8 string. assuming that the subject is a valid UTF-8 string.
} }
@LibEntry{utf8.codes (s)| @LibEntry{utf8.codes (s [, nonstrict])|
Returns values so that the construction Returns values so that the construction
@verbatim{ @verbatim{
@@ -7347,7 +7373,8 @@ It raises an error if it meets any invalid byte sequence.
} }
@LibEntry{utf8.codepoint (s [, i [, j]])| @LibEntry{utf8.codepoint (s [, i [, j [, nonstrict]]])|
Returns the codepoints (as integers) from all characters in @id{s} Returns the codepoints (as integers) from all characters in @id{s}
that start between byte position @id{i} and @id{j} (both included). that start between byte position @id{i} and @id{j} (both included).
The default for @id{i} is 1 and for @id{j} is @id{i}. The default for @id{i} is 1 and for @id{j} is @id{i}.
@@ -7355,7 +7382,8 @@ It raises an error if it meets any invalid byte sequence.
} }
@LibEntry{utf8.len (s [, i [, j]])| @LibEntry{utf8.len (s [, i [, j [, nonstrict]]])|
Returns the number of UTF-8 characters in string @id{s} Returns the number of UTF-8 characters in string @id{s}
that start between positions @id{i} and @id{j} (both inclusive). that start between positions @id{i} and @id{j} (both inclusive).
The default for @id{i} is @num{1} and for @id{j} is @num{-1}. The default for @id{i} is @num{1} and for @id{j} is @num{-1}.
@@ -7365,6 +7393,7 @@ returns a false value plus the position of the first invalid byte.
} }
@LibEntry{utf8.offset (s, n [, i])| @LibEntry{utf8.offset (s, n [, i])|
Returns the position (in bytes) where the encoding of the Returns the position (in bytes) where the encoding of the
@id{n}-th character of @id{s} @id{n}-th character of @id{s}
(counting from position @id{i}) starts. (counting from position @id{i}) starts.
@@ -8755,6 +8784,12 @@ You can enclose the call in parentheses if you need to
discard these extra results. discard these extra results.
} }
@item{
By default, the decoding functions in the @Lid{utf8} library
do not accept surrogates as valid code points.
An extra parameter in these functions (@id{nonstrict}) makes them more permissive, accepting surrogates and all values up to @T{0x7FFFFFFF}.
}
} }
} }

View File

@@ -56,16 +56,23 @@ assert("abc\z
assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0)) assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0))
-- limits for 1-byte sequences -- limits for 1-byte sequences
assert("\u{0}\u{7F}" == "\x00\z\x7F") assert("\u{0}\u{7F}" == "\x00\x7F")
-- limits for 2-byte sequences -- limits for 2-byte sequences
assert("\u{80}\u{7FF}" == "\xC2\x80\z\xDF\xBF") assert("\u{80}\u{7FF}" == "\xC2\x80\xDF\xBF")
-- limits for 3-byte sequences -- limits for 3-byte sequences
assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\z\xEF\xBF\xBF") assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\xEF\xBF\xBF")
-- limits for 4-byte sequences -- limits for 4-byte sequences
assert("\u{10000}\u{10FFFF}" == "\xF0\x90\x80\x80\z\xF4\x8F\xBF\xBF") assert("\u{10000}\u{1FFFFF}" == "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF")
-- limits for 5-byte sequences
assert("\u{200000}\u{3FFFFFF}" == "\xF8\x88\x80\x80\x80\xFB\xBF\xBF\xBF\xBF")
-- limits for 6-byte sequences
assert("\u{4000000}\u{7FFFFFFF}" ==
"\xFC\x84\x80\x80\x80\x80\xFD\xBF\xBF\xBF\xBF\xBF")
-- Error in escape sequences -- Error in escape sequences
@@ -94,7 +101,7 @@ lexerror([["xyz\300"]], [[\300"]])
lexerror([[" \256"]], [[\256"]]) lexerror([[" \256"]], [[\256"]])
-- errors in UTF-8 sequences -- errors in UTF-8 sequences
lexerror([["abc\u{110000}"]], [[abc\u{110000]]) -- too large lexerror([["abc\u{100000000}"]], [[abc\u{100000000]]) -- too large
lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{' lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{'
lexerror([["abc\u"]], [[abc\u"]]) -- missing '{' lexerror([["abc\u"]], [[abc\u"]]) -- missing '{'
lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}' lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}'

View File

@@ -21,62 +21,59 @@ local justone = "^" .. utf8.charpattern .. "$"
-- 't' is the list of codepoints of 's' -- 't' is the list of codepoints of 's'
local function checksyntax (s, t) local function checksyntax (s, t)
-- creates a string "return '\u{t[1]}...\u{t[n]}'"
local ts = {"return '"} local ts = {"return '"}
for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end
ts[#t + 2] = "'" ts[#t + 2] = "'"
ts = table.concat(ts) ts = table.concat(ts)
-- its execution should result in 's'
assert(assert(load(ts))() == s) assert(assert(load(ts))() == s)
end end
assert(utf8.offset("alo", 5) == nil) assert(utf8.offset("alo", 5) == nil)
assert(utf8.offset("alo", -4) == nil) assert(utf8.offset("alo", -4) == nil)
-- 't' is the list of codepoints of 's' -- 'check' makes several tests over the validity of string 's'.
local function check (s, t) -- 't' is the list of codepoints of 's'.
local l = utf8.len(s) local function check (s, t, nonstrict)
local l = utf8.len(s, 1, -1, nonstrict)
assert(#t == l and len(s) == l) assert(#t == l and len(s) == l)
assert(utf8.char(table.unpack(t)) == s) assert(utf8.char(table.unpack(t)) == s) -- 't' and 's' are equivalent
assert(utf8.offset(s, 0) == 1) assert(utf8.offset(s, 0) == 1)
checksyntax(s, t) checksyntax(s, t)
local t1 = {utf8.codepoint(s, 1, -1)} -- creates new table with all codepoints of 's'
local t1 = {utf8.codepoint(s, 1, -1, nonstrict)}
assert(#t == #t1) assert(#t == #t1)
for i = 1, #t do assert(t[i] == t1[i]) end for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1'
for i = 1, l do for i = 1, l do -- for all codepoints
local pi = utf8.offset(s, i) -- position of i-th char local pi = utf8.offset(s, i) -- position of i-th char
local pi1 = utf8.offset(s, 2, pi) -- position of next char local pi1 = utf8.offset(s, 2, pi) -- position of next char
assert(string.find(string.sub(s, pi, pi1 - 1), justone)) assert(string.find(string.sub(s, pi, pi1 - 1), justone))
assert(utf8.offset(s, -1, pi1) == pi) assert(utf8.offset(s, -1, pi1) == pi)
assert(utf8.offset(s, i - l - 1) == pi) assert(utf8.offset(s, i - l - 1) == pi)
assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi))) assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
for j = pi, pi1 - 1 do for j = pi, pi1 - 1 do
assert(utf8.offset(s, 0, j) == pi) assert(utf8.offset(s, 0, j) == pi)
end end
for j = pi + 1, pi1 - 1 do for j = pi + 1, pi1 - 1 do
assert(not utf8.len(s, j)) assert(not utf8.len(s, j))
end end
assert(utf8.len(s, pi, pi) == 1) assert(utf8.len(s, pi, pi, nonstrict) == 1)
assert(utf8.len(s, pi, pi1 - 1) == 1) assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
assert(utf8.len(s, pi) == l - i + 1) assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
assert(utf8.len(s, pi1) == l - i) assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
assert(utf8.len(s, 1, pi) == i) assert(utf8.len(s, 1, pi, -1, nonstrict) == i)
end end
local i = 0 local i = 0
for p, c in utf8.codes(s) do for p, c in utf8.codes(s, nonstrict) do
i = i + 1
assert(c == t[i] and p == utf8.offset(s, i))
assert(utf8.codepoint(s, p) == c)
end
assert(i == #t)
i = 0
for p, c in utf8.codes(s) do
i = i + 1 i = i + 1
assert(c == t[i] and p == utf8.offset(s, i)) assert(c == t[i] and p == utf8.offset(s, i))
assert(utf8.codepoint(s, p, p, nonstrict) == c)
end end
assert(i == #t) assert(i == #t)
@@ -105,13 +102,17 @@ do -- error indication in utf8.len
check("\xF4\x9F\xBF\xBF", 1) check("\xF4\x9F\xBF\xBF", 1)
end end
-- error in utf8.codes -- errors in utf8.codes
checkerror("invalid UTF%-8 code", do
function () local function errorcodes (s)
local s = "ab\xff" checkerror("invalid UTF%-8 code",
for c in utf8.codes(s) do assert(c) end function ()
end) for c in utf8.codes(s) do assert(c) end
end)
end
errorcodes("ab\xff")
errorcodes("\u{110000}")
end
-- error in initial position for offset -- error in initial position for offset
checkerror("position out of range", utf8.offset, "abc", 1, 5) checkerror("position out of range", utf8.offset, "abc", 1, 5)
@@ -141,14 +142,22 @@ do
assert(#t == 0) assert(#t == 0)
checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1) checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1)
checkerror("out of range", utf8.codepoint, s, 1, #s + 1) checkerror("out of range", utf8.codepoint, s, 1, #s + 1)
-- surrogates
assert(utf8.codepoint("\u{D7FF}") == 0xD800 - 1)
assert(utf8.codepoint("\u{E000}") == 0xDFFF + 1)
assert(utf8.codepoint("\u{D800}", 1, 1, true) == 0xD800)
assert(utf8.codepoint("\u{DFFF}", 1, 1, true) == 0xDFFF)
assert(utf8.codepoint("\u{7FFFFFFF}", 1, 1, true) == 0x7FFFFFFF)
end end
assert(utf8.char() == "") assert(utf8.char() == "")
assert(utf8.char(97, 98, 99) == "abc") assert(utf8.char(0, 97, 98, 99, 1) == "\0abc\1")
assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF) assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF)
assert(utf8.codepoint(utf8.char(0x7FFFFFFF), 1, 1, true) == (1<<31) - 1)
checkerror("value out of range", utf8.char, 0x10FFFF + 1) checkerror("value out of range", utf8.char, 0x7FFFFFFF + 1)
checkerror("value out of range", utf8.char, -1)
local function invalid (s) local function invalid (s)
checkerror("invalid UTF%-8 code", utf8.codepoint, s) checkerror("invalid UTF%-8 code", utf8.codepoint, s)
@@ -158,6 +167,10 @@ end
-- UTF-8 representation for 0x11ffff (value out of valid range) -- UTF-8 representation for 0x11ffff (value out of valid range)
invalid("\xF4\x9F\xBF\xBF") invalid("\xF4\x9F\xBF\xBF")
-- surrogates
invalid("\u{D800}")
invalid("\u{DFFF}")
-- overlong sequences -- overlong sequences
invalid("\xC0\x80") -- zero invalid("\xC0\x80") -- zero
invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte) invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte)
@@ -183,6 +196,21 @@ s = "\0 \x7F\z
s = string.gsub(s, " ", "") s = string.gsub(s, " ", "")
check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF}) check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF})
do
-- original UTF-8 values
local s = "\u{4000000}\u{7FFFFFFF}"
assert(#s == 12)
check(s, {0x4000000, 0x7FFFFFFF}, true)
s = "\u{200000}\u{3FFFFFF}"
assert(#s == 10)
check(s, {0x200000, 0x3FFFFFF}, true)
s = "\u{10000}\u{1fffff}"
assert(#s == 8)
check(s, {0x10000, 0x1FFFFF}, true)
end
x = "日本語a-4\0éó" x = "日本語a-4\0éó"
check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243}) check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})