Changes in the validation of UTF-8

All UTF-8 encoding functionality (including the escape sequence '\u') accepts all values from the original UTF-8 specification (with sequences of up to six bytes). By default, the decoding functions in the UTF-8 library do not accept invalid Unicode code points, such as surrogates. A new parameter 'nonstrict' makes them accept all code points up to (2^31)-1, as in the original UTF-8 specification.
2019-03-15 13:14:17 -03:00
parent 8fa4f1380b
commit 1e0c73d5b6
6 changed files with 164 additions and 72 deletions
--- a/llex.c
+++ b/llex.c
@@ -335,7 +335,7 @@ static unsigned long readutf8esc (LexState *ls) {
  while ((save_and_next(ls), lisxdigit(ls->current))) {
    i++;
    r = (r << 4) + luaO_hexavalue(ls->current);
-    esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
+    esccheck(ls, r <= 0x7FFFFFFFu, "UTF-8 value too large");
  }
  esccheck(ls, ls->current == '}', "missing '}'");
  next(ls);  /* skip '}' */
--- a/lobject.c
+++ b/lobject.c
@@ -343,7 +343,7 @@ size_t luaO_str2num (const char *s, TValue *o) {
 int luaO_utf8esc (char *buff, unsigned long x) {
  int n = 1;  /* number of bytes put in buffer (backwards) */
-  lua_assert(x <= 0x10FFFF);
+  lua_assert(x <= 0x7FFFFFFFu);
  if (x < 0x80)  /* ascii? */
    buff[UTF8BUFFSZ - 1] = cast_char(x);
  else {  /* need continuation bytes */
@@ -435,9 +435,9 @@ const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
        pushstr(L, buff, l);
        break;
      }
-      case 'U': {  /* an 'int' as a UTF-8 sequence */
+      case 'U': {  /* a 'long' as a UTF-8 sequence */
        char buff[UTF8BUFFSZ];
-        int l = luaO_utf8esc(buff, cast(long, va_arg(argp, long)));
+        int l = luaO_utf8esc(buff, va_arg(argp, long));
        pushstr(L, buff + UTF8BUFFSZ - l, l);
        break;
      }
--- a/lutf8lib.c
+++ b/lutf8lib.c
@@ -21,12 +21,14 @@
 #include "lualib.h"
-#define MAXUNICODE	0x10FFFF
+#define MAXUNICODE	0x10FFFFu
 #define MAXUTF		0x7FFFFFFFu
 /*
-** Integer type for decoded UTF-8 values; MAXUNICODE needs 21 bits.
+** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
 */
-#if LUAI_BITSINT >= 21
+#if LUAI_BITSINT >= 31
 typedef	unsigned int utfint;
 #else
 typedef unsigned long utfint;
@@ -46,38 +48,46 @@ static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
 /*
-** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
+** Decode one UTF-8 sequence, returning NULL if byte sequence is
 ** invalid.  The array 'limits' stores the minimum value for each
 ** sequence length, to check for overlong representations. Its first
 ** entry forces an error for non-ascii bytes with no continuation
 ** bytes (count == 0).
 */
-static const char *utf8_decode (const char *o, utfint *val) {
+static const char *utf8_decode (const char *s, utfint *val, int strict) {
-  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
+  static const utfint limits[] =
-  const unsigned char *s = (const unsigned char *)o;
+        {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
-  unsigned int c = s[0];
+  unsigned int c = (unsigned char)s[0];
  utfint res = 0;  /* final result */
  if (c < 0x80)  /* ascii? */
    res = c;
  else {
    int count = 0;  /* to count number of continuation bytes */
-    while (c & 0x40) {  /* still have continuation bytes? */
+    for (; c & 0x40; c <<= 1) {  /* while it needs continuation bytes... */
-      int cc = s[++count];  /* read next byte */
+      unsigned int cc = (unsigned char)s[++count];  /* read next byte */
      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
        return NULL;  /* invalid byte sequence */
      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
-      c <<= 1;  /* to test next bit */
    }
    res |= ((utfint)(c & 0x7F) << (count * 5));  /* add first byte */
-    if (count > 3 || res > MAXUNICODE || res <= limits[count])
+    if (count > 5 || res > MAXUTF || res < limits[count])
      return NULL;  /* invalid byte sequence */
    s += count;  /* skip continuation bytes read */
  }
  if (strict) {
    /* check for invalid code points; too large or surrogates */
    if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu))
      return NULL;
  }
  if (val) *val = res;
-  return (const char *)s + 1;  /* +1 to include first byte */
+  return s + 1;  /* +1 to include first byte */
 }
 /*
-** utf8len(s [, i [, j]]) --> number of characters that start in the
+** utf8len(s [, i [, j [, nonstrict]]]) --> number of characters that
-** range [i,j], or nil + current position if 's' is not well formed in
+** start in the range [i,j], or nil + current position if 's' is not
-** that interval
+** well formed in that interval
 */
 static int utflen (lua_State *L) {
  lua_Integer n = 0;  /* counter for the number of characters */
@@ -85,12 +95,13 @@ static int utflen (lua_State *L) {
  const char *s = luaL_checklstring(L, 1, &len);
  lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
  lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
  int nonstrict = lua_toboolean(L, 4);
  luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
                   "initial position out of string");
  luaL_argcheck(L, --posj < (lua_Integer)len, 3,
                   "final position out of string");
  while (posi <= posj) {
-    const char *s1 = utf8_decode(s + posi, NULL);
+    const char *s1 = utf8_decode(s + posi, NULL, !nonstrict);
    if (s1 == NULL) {  /* conversion error? */
      lua_pushnil(L);  /* return nil ... */
      lua_pushinteger(L, posi + 1);  /* ... and current position */
@@ -105,14 +116,15 @@ static int utflen (lua_State *L) {
 /*
-** codepoint(s, [i, [j]])  -> returns codepoints for all characters
+** codepoint(s, [i, [j [, nonstrict]]]) -> returns codepoints for all
-** that start in the range [i,j]
+** characters that start in the range [i,j]
 */
 static int codepoint (lua_State *L) {
  size_t len;
  const char *s = luaL_checklstring(L, 1, &len);
  lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
  lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
  int nonstrict = lua_toboolean(L, 4);
  int n;
  const char *se;
  luaL_argcheck(L, posi >= 1, 2, "out of range");
@@ -126,7 +138,7 @@ static int codepoint (lua_State *L) {
  se = s + pose;  /* string end */
  for (s += posi - 1; s < se;) {
    utfint code;
-    s = utf8_decode(s, &code);
+    s = utf8_decode(s, &code, !nonstrict);
    if (s == NULL)
      return luaL_error(L, "invalid UTF-8 code");
    lua_pushinteger(L, code);
@@ -137,8 +149,8 @@ static int codepoint (lua_State *L) {
 static void pushutfchar (lua_State *L, int arg) {
-  lua_Integer code = luaL_checkinteger(L, arg);
+  lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
-  luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
+  luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
  lua_pushfstring(L, "%U", (long)code);
 }
@@ -209,7 +221,7 @@ static int byteoffset (lua_State *L) {
 }
-static int iter_aux (lua_State *L) {
+static int iter_aux (lua_State *L, int strict) {
  size_t len;
  const char *s = luaL_checklstring(L, 1, &len);
  lua_Integer n = lua_tointeger(L, 2) - 1;
@@ -223,8 +235,8 @@ static int iter_aux (lua_State *L) {
    return 0;  /* no more codepoints */
  else {
    utfint code;
-    const char *next = utf8_decode(s + n, &code);
+    const char *next = utf8_decode(s + n, &code, strict);
-    if (next == NULL || iscont(next))
+    if (next == NULL)
      return luaL_error(L, "invalid UTF-8 code");
    lua_pushinteger(L, n + 1);
    lua_pushinteger(L, code);
@@ -233,9 +245,19 @@ static int iter_aux (lua_State *L) {
 }
 static int iter_auxstrict (lua_State *L) {
  return iter_aux(L, 1);
 }
 static int iter_auxnostrict (lua_State *L) {
  return iter_aux(L, 0);
 }
 static int iter_codes (lua_State *L) {
  int nonstrict = lua_toboolean(L, 2);
  luaL_checkstring(L, 1);
-  lua_pushcfunction(L, iter_aux);
+  lua_pushcfunction(L, nonstrict ? iter_auxnostrict : iter_auxstrict);
  lua_pushvalue(L, 1);
  lua_pushinteger(L, 0);
  return 3;
@@ -243,7 +265,7 @@ static int iter_codes (lua_State *L) {
 /* pattern to match a single UTF-8 character */
-#define UTF8PATT	"[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
+#define UTF8PATT	"[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
 static const luaL_Reg funcs[] = {
--- a/manual/manual.of
+++ b/manual/manual.of
@@ -1004,6 +1004,8 @@ the escape sequence @T{\u{@rep{XXX}}}
 (note the mandatory enclosing brackets),
 where @rep{XXX} is a sequence of one or more hexadecimal digits
 representing the character code point.
 This code point can be any value smaller than @M{2@sp{31}}.
 (Lua uses the original UTF-8 specification here.)
 Literal strings can also be defined using a long format
 enclosed by @def{long brackets}.
@@ -6899,6 +6901,7 @@ x = string.gsub("$name-$version.tar.gz", "%$(%w+)", t)
 }
@LibEntry{string.len (s)|
 Receives a string and returns its length.
 The empty string @T{""} has length 0.
 Embedded zeros are counted,
@@ -6907,6 +6910,7 @@ so @T{"a\000bc\000"} has length 5.
 }
@LibEntry{string.lower (s)|
 Receives a string and returns a copy of this string with all
 uppercase letters changed to lowercase.
 All other characters are left unchanged.
@@ -6915,6 +6919,7 @@ The definition of what an uppercase letter is depends on the current locale.
 }
@LibEntry{string.match (s, pattern [, init])|
 Looks for the first @emph{match} of
@id{pattern} @see{pm} in the string @id{s}.
 If it finds one, then @id{match} returns
@@ -6946,6 +6951,7 @@ The format string cannot have the variable-length options
 }
@LibEntry{string.rep (s, n [, sep])|
 Returns a string that is the concatenation of @id{n} copies of
 the string @id{s} separated by the string @id{sep}.
 The default value for @id{sep} is the empty string
@@ -6958,11 +6964,13 @@ with a single call to this function.)
 }
@LibEntry{string.reverse (s)|
 Returns a string that is the string @id{s} reversed.
 }
@LibEntry{string.sub (s, i [, j])|
 Returns the substring of @id{s} that
 starts at @id{i}  and continues until @id{j};
@id{i} and @id{j} can be negative.
@@ -6998,6 +7006,7 @@ this function also returns the index of the first unread byte in @id{s}.
 }
@LibEntry{string.upper (s)|
 Receives a string and returns a copy of this string with all
 lowercase letters changed to uppercase.
 All other characters are left unchanged.
@@ -7318,8 +7327,24 @@ or one plus the length of the subject string.
 As in the string library,
 negative indices count from the end of the string.
 Functions that create byte sequences
 accept all values up to @T{0x7FFFFFFF},
 as defined in the original UTF-8 specification;
 that implies byte sequences of up to six bytes.
 Functions that interpret byte sequences only accept
 valid sequences (well formed and not overlong).
 By default, they only accept byte sequences
 that result in valid Unicode code points,
 rejecting surrogates and values larger than @T{0x10FFFF}.
 A boolean argument @id{nonstrict}, when available,
 lifts these checks,
 so that all values up to @T{0x7FFFFFFF} are accepted.
 (Sequences that are not well formed or that are overlong are still rejected.)
@LibEntry{utf8.char (@Cdots)|
 Receives zero or more integers,
 converts each one to its corresponding UTF-8 byte sequence
 and returns a string with the concatenation of all these sequences.
@@ -7327,14 +7352,15 @@ and returns a string with the concatenation of all these sequences.
 }
@LibEntry{utf8.charpattern|
-The pattern (a string, not a function) @St{[\0-\x7F\xC2-\xF4][\x80-\xBF]*}
+
 The pattern (a string, not a function) @St{[\0-\x7F\xC2-\xFD][\x80-\xBF]*}
@see{pm},
 which matches exactly one UTF-8 byte sequence,
 assuming that the subject is a valid UTF-8 string.
 }
-@LibEntry{utf8.codes (s)|
+@LibEntry{utf8.codes (s [, nonstrict])|
 Returns values so that the construction
@verbatim{
@@ -7347,7 +7373,8 @@ It raises an error if it meets any invalid byte sequence.
 }
-@LibEntry{utf8.codepoint (s [, i [, j]])|
+@LibEntry{utf8.codepoint (s [, i [, j [, nonstrict]]])|
 Returns the codepoints (as integers) from all characters in @id{s}
 that start between byte position @id{i} and @id{j} (both included).
 The default for @id{i} is 1 and for @id{j} is @id{i}.
@@ -7355,7 +7382,8 @@ It raises an error if it meets any invalid byte sequence.
 }
-@LibEntry{utf8.len (s [, i [, j]])|
+@LibEntry{utf8.len (s [, i [, j [, nonstrict]]])|
 Returns the number of UTF-8 characters in string @id{s}
 that start between positions @id{i} and @id{j} (both inclusive).
 The default for @id{i} is @num{1} and for @id{j} is @num{-1}.
@@ -7365,6 +7393,7 @@ returns a false value plus the position of the first invalid byte.
 }
@LibEntry{utf8.offset (s, n [, i])|
 Returns the position (in bytes) where the encoding of the
@id{n}-th character of @id{s}
 (counting from position @id{i}) starts.
@@ -8755,6 +8784,12 @@ You can enclose the call in parentheses if you need to
 discard these extra results.
 }
@item{
 By default, the decoding functions in the @Lid{utf8} library
 do not accept surrogates as valid code points.
 An optional extra parameter, @id{nonstrict}, in these functions makes them more permissive.
 }
 }
 }
--- a/testes/literals.lua
+++ b/testes/literals.lua
@@ -56,16 +56,23 @@ assert("abc\z
 assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0))
 -- limits for 1-byte sequences
-assert("\u{0}\u{7F}" == "\x00\z\x7F")
+assert("\u{0}\u{7F}" == "\x00\x7F")
 -- limits for 2-byte sequences
-assert("\u{80}\u{7FF}" == "\xC2\x80\z\xDF\xBF")
+assert("\u{80}\u{7FF}" == "\xC2\x80\xDF\xBF")
 -- limits for 3-byte sequences
-assert("\u{800}\u{FFFF}" ==   "\xE0\xA0\x80\z\xEF\xBF\xBF")
+assert("\u{800}\u{FFFF}" ==   "\xE0\xA0\x80\xEF\xBF\xBF")
 -- limits for 4-byte sequences
-assert("\u{10000}\u{10FFFF}" == "\xF0\x90\x80\x80\z\xF4\x8F\xBF\xBF")
+assert("\u{10000}\u{1FFFFF}" == "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF")
 -- limits for 5-byte sequences
 assert("\u{200000}\u{3FFFFFF}" == "\xF8\x88\x80\x80\x80\xFB\xBF\xBF\xBF\xBF")
 -- limits for 6-byte sequences
 assert("\u{4000000}\u{7FFFFFFF}" ==
       "\xFC\x84\x80\x80\x80\x80\xFD\xBF\xBF\xBF\xBF\xBF")
 -- Error in escape sequences
@@ -94,7 +101,7 @@ lexerror([["xyz\300"]], [[\300"]])
 lexerror([["   \256"]], [[\256"]])
 -- errors in UTF-8 sequences
-lexerror([["abc\u{110000}"]], [[abc\u{110000]])   -- too large
+lexerror([["abc\u{100000000}"]], [[abc\u{100000000]])   -- too large
 lexerror([["abc\u11r"]], [[abc\u1]])    -- missing '{'
 lexerror([["abc\u"]], [[abc\u"]])    -- missing '{'
 lexerror([["abc\u{11r"]], [[abc\u{11r]])    -- missing '}'
--- a/testes/utf8.lua
+++ b/testes/utf8.lua
@@ -21,62 +21,59 @@ local justone = "^" .. utf8.charpattern .. "$"
 -- 't' is the list of codepoints of 's'
 local function checksyntax (s, t)
  -- creates a string "return '\u{t[1]}...\u{t[n]}'"
  local ts = {"return '"}
  for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end
  ts[#t + 2] = "'"
  ts = table.concat(ts)
  -- its execution should result in 's'
  assert(assert(load(ts))() == s)
 end
 assert(utf8.offset("alo", 5) == nil)
 assert(utf8.offset("alo", -4) == nil)
-- 't' is the list of codepoints of 's'
+-- 'check' runs several consistency tests on string 's'.
-local function check (s, t)
+-- 't' is the list of codepoints of 's'.
-  local l = utf8.len(s) 
+local function check (s, t, nonstrict)
  local l = utf8.len(s, 1, -1, nonstrict)
  assert(#t == l and len(s) == l)
-  assert(utf8.char(table.unpack(t)) == s)
+  assert(utf8.char(table.unpack(t)) == s)   -- 't' and 's' are equivalent
  assert(utf8.offset(s, 0) == 1)
  checksyntax(s, t)
-  local t1 = {utf8.codepoint(s, 1, -1)}
+  -- creates new table with all codepoints of 's'
  local t1 = {utf8.codepoint(s, 1, -1, nonstrict)}
  assert(#t == #t1)
-  for i = 1, #t do assert(t[i] == t1[i]) end
+  for i = 1, #t do assert(t[i] == t1[i]) end   -- 't' is equal to 't1'
-  for i = 1, l do
+  for i = 1, l do   -- for all codepoints
    local pi = utf8.offset(s, i)        -- position of i-th char
    local pi1 = utf8.offset(s, 2, pi)   -- position of next char
    assert(string.find(string.sub(s, pi, pi1 - 1), justone))
    assert(utf8.offset(s, -1, pi1) == pi)
    assert(utf8.offset(s, i - l - 1) == pi)
-    assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi)))
+    assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
    for j = pi, pi1 - 1 do
      assert(utf8.offset(s, 0, j) == pi)
    end
    for j = pi + 1, pi1 - 1 do
      assert(not utf8.len(s, j))
    end
-   assert(utf8.len(s, pi, pi) == 1)
+   assert(utf8.len(s, pi, pi, nonstrict) == 1)
-   assert(utf8.len(s, pi, pi1 - 1) == 1)
+   assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
-   assert(utf8.len(s, pi) == l - i + 1)
+   assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
-   assert(utf8.len(s, pi1) == l - i)
+   assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
-   assert(utf8.len(s, 1, pi) == i)
+   assert(utf8.len(s, 1, pi, -1, nonstrict) == i)
  end
  local i = 0
-  for p, c in utf8.codes(s) do
+  for p, c in utf8.codes(s, nonstrict) do
    i = i + 1
    assert(c == t[i] and p == utf8.offset(s, i))
    assert(utf8.codepoint(s, p) == c)
  end
  assert(i == #t)
  i = 0
  for p, c in utf8.codes(s) do
    i = i + 1
    assert(c == t[i] and p == utf8.offset(s, i))
    assert(utf8.codepoint(s, p, p, nonstrict) == c)
  end
  assert(i == #t)
@@ -105,13 +102,17 @@ do    -- error indication in utf8.len
  check("\xF4\x9F\xBF\xBF", 1)
 end
-- error in utf8.codes
+-- errors in utf8.codes
-checkerror("invalid UTF%-8 code",
+do
-  function ()
+  local function errorcodes (s)
-    local s = "ab\xff"
+    checkerror("invalid UTF%-8 code",
-    for c in utf8.codes(s) do assert(c) end
+      function ()
-  end)
+        for c in utf8.codes(s) do assert(c) end
-
+      end)
  end
  errorcodes("ab\xff")
  errorcodes("\u{110000}")
 end
 -- error in initial position for offset
 checkerror("position out of range", utf8.offset, "abc", 1, 5)
@@ -141,14 +142,22 @@ do
  assert(#t == 0)
  checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1)
  checkerror("out of range", utf8.codepoint, s, 1, #s + 1)
  -- surrogates
  assert(utf8.codepoint("\u{D7FF}") == 0xD800 - 1)
  assert(utf8.codepoint("\u{E000}") == 0xDFFF + 1)
  assert(utf8.codepoint("\u{D800}", 1, 1, true) == 0xD800)
  assert(utf8.codepoint("\u{DFFF}", 1, 1, true) == 0xDFFF)
  assert(utf8.codepoint("\u{7FFFFFFF}", 1, 1, true) == 0x7FFFFFFF)
 end
 assert(utf8.char() == "")
-assert(utf8.char(97, 98, 99) == "abc")
+assert(utf8.char(0, 97, 98, 99, 1) == "\0abc\1")
 assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF)
 assert(utf8.codepoint(utf8.char(0x7FFFFFFF), 1, 1, true) == (1<<31) - 1)
-checkerror("value out of range", utf8.char, 0x10FFFF + 1)
+checkerror("value out of range", utf8.char, 0x7FFFFFFF + 1)
 checkerror("value out of range", utf8.char, -1)
 local function invalid (s)
  checkerror("invalid UTF%-8 code", utf8.codepoint, s)
@@ -158,6 +167,10 @@ end
 -- UTF-8 representation for 0x11ffff (value out of valid range)
 invalid("\xF4\x9F\xBF\xBF")
 -- surrogates
 invalid("\u{D800}")
 invalid("\u{DFFF}")
 -- overlong sequences
 invalid("\xC0\x80")          -- zero
 invalid("\xC1\xBF")          -- 0x7F (should be coded in 1 byte)
@@ -183,6 +196,21 @@ s = "\0 \x7F\z
 s = string.gsub(s, " ", "")
 check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF})
 do
  -- original UTF-8 values
  local s = "\u{4000000}\u{7FFFFFFF}"
  assert(#s == 12)
  check(s, {0x4000000, 0x7FFFFFFF}, true)
  s = "\u{200000}\u{3FFFFFF}"
  assert(#s == 10)
  check(s, {0x200000, 0x3FFFFFF}, true)
  s = "\u{10000}\u{1fffff}"
  assert(#s == 8)
  check(s, {0x10000, 0x1FFFFF}, true)
 end
 x = "日本語a-4\0éó"
 check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})