Changes in the validation of UTF-8

All UTF-8 encoding functionality (including the escape
sequence '\u') accepts all values from the original UTF-8
specification (with sequences of up to six bytes).

By default, the decoding functions in the UTF-8 library do not
accept invalid Unicode code points, such as surrogates. A new
parameter 'nonstrict' makes them accept all code points up to
(2^31)-1, as in the original UTF-8 specification.
This commit is contained in:
Roberto Ierusalimschy
2019-03-15 13:14:17 -03:00
parent 8fa4f1380b
commit 1e0c73d5b6
6 changed files with 164 additions and 72 deletions

View File

@@ -56,16 +56,23 @@ assert("abc\z
assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0))
-- limits for 1-byte sequences
assert("\u{0}\u{7F}" == "\x00\z\x7F")
assert("\u{0}\u{7F}" == "\x00\x7F")
-- limits for 2-byte sequences
assert("\u{80}\u{7FF}" == "\xC2\x80\z\xDF\xBF")
assert("\u{80}\u{7FF}" == "\xC2\x80\xDF\xBF")
-- limits for 3-byte sequences
assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\z\xEF\xBF\xBF")
assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\xEF\xBF\xBF")
-- limits for 4-byte sequences
assert("\u{10000}\u{10FFFF}" == "\xF0\x90\x80\x80\z\xF4\x8F\xBF\xBF")
assert("\u{10000}\u{1FFFFF}" == "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF")
-- limits for 5-byte sequences
assert("\u{200000}\u{3FFFFFF}" == "\xF8\x88\x80\x80\x80\xFB\xBF\xBF\xBF\xBF")
-- limits for 6-byte sequences
assert("\u{4000000}\u{7FFFFFFF}" ==
"\xFC\x84\x80\x80\x80\x80\xFD\xBF\xBF\xBF\xBF\xBF")
-- Error in escape sequences
@@ -94,7 +101,7 @@ lexerror([["xyz\300"]], [[\300"]])
lexerror([[" \256"]], [[\256"]])
-- errors in UTF-8 sequences
lexerror([["abc\u{110000}"]], [[abc\u{110000]]) -- too large
lexerror([["abc\u{100000000}"]], [[abc\u{100000000]]) -- too large
lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{'
lexerror([["abc\u"]], [[abc\u"]]) -- missing '{'
lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}'