diff --git a/ext/liquid_c/lexer.c b/ext/liquid_c/lexer.c
index 490e5375..39429acc 100644
--- a/ext/liquid_c/lexer.c
+++ b/ext/liquid_c/lexer.c
@@ -144,19 +144,34 @@ const char *lex_one(const char *start, const char *end, lexer_token_t *token)
 
     if (is_special(c)) RETURN_TOKEN(c, 1);
 
+    // Number of bytes from str up to end; checked before reading str[1..3] below.
+    long remaining_str_len = end - str;
+
     // read multibyte UTF-8 character
     if ((c & 0x80) == 0) {
         // 1-byte character
         rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c", c);
     } else if ((c & 0xE0) == 0xC0) {
         // 2-byte character
-        rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c", c, str[1]);
+        if (remaining_str_len < 2) {
+            rb_raise(rb_eArgError, "invalid byte sequence in UTF-8");
+        } else {
+            rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c", c, str[1]);
+        }
     } else if ((c & 0xF0) == 0xE0) {
         // 3-byte character
-        rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c%c", c, str[1], str[2]);
+        if (remaining_str_len < 3) {
+            rb_raise(rb_eArgError, "invalid byte sequence in UTF-8");
+        } else {
+            rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c%c", c, str[1], str[2]);
+        }
     } else if ((c & 0xF8) == 0xF0) {
         // 4-byte character
-        rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c%c%c", c, str[1], str[2], str[3]);
+        if (remaining_str_len < 4) {
+            rb_raise(rb_eArgError, "invalid byte sequence in UTF-8");
+        } else {
+            rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c%c%c", c, str[1], str[2], str[3]);
+        }
     } else {
         // this should never happen
         rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c", c);
diff --git a/test/unit/variable_test.rb b/test/unit/variable_test.rb
index 61c550b8..beab9cf6 100644
--- a/test/unit/variable_test.rb
+++ b/test/unit/variable_test.rb
@@ -283,6 +283,44 @@ def test_encoding_error_message_with_multi_byte_characters
     )
   end
 
+  def test_invalid_utf8_sequence
+    # 2-byte character with 1 byte missing
+    exc = assert_raises(ArgumentError) do
+      variable_strict_parse("\xC0")
+    end
+    assert_equal("invalid byte sequence in UTF-8", exc.message)
+
+    # 3-byte character with 1 byte missing
+    exc = assert_raises(ArgumentError) do
+      variable_strict_parse("\xE0\x01")
+    end
+    assert_equal("invalid byte sequence in UTF-8", exc.message)
+
+    # 3-byte character with 2 bytes missing
+    exc = assert_raises(ArgumentError) do
+      variable_strict_parse("\xE0")
+    end
+    assert_equal("invalid byte sequence in UTF-8", exc.message)
+
+    # 4-byte character with 1 byte missing
+    exc = assert_raises(ArgumentError) do
+      variable_strict_parse("\xF0\x01\x01")
+    end
+    assert_equal("invalid byte sequence in UTF-8", exc.message)
+
+    # 4-byte character with 2 bytes missing
+    exc = assert_raises(ArgumentError) do
+      variable_strict_parse("\xF0\x01")
+    end
+    assert_equal("invalid byte sequence in UTF-8", exc.message)
+
+    # 4-byte character with 3 bytes missing
+    exc = assert_raises(ArgumentError) do
+      variable_strict_parse("\xF0")
+    end
+    assert_equal("invalid byte sequence in UTF-8", exc.message)
+  end
+
   private
 
   def variable_strict_parse(markup)