Skip to content

Commit

Permalink
check if invalid UTF-8 byte sequence is given
Browse files Browse the repository at this point in the history
  • Loading branch information
ggmichaelgo committed Nov 27, 2023
1 parent 5280ecb commit fab42f6
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 3 deletions.
20 changes: 17 additions & 3 deletions ext/liquid_c/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,19 +144,33 @@ const char *lex_one(const char *start, const char *end, lexer_token_t *token)

if (is_special(c)) RETURN_TOKEN(c, 1);

int remaining_str_len = end - str;

// read multibyte UTF-8 character
if ((c & 0x80) == 0) {
// 1-byte character
rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c", c);
} else if ((c & 0xE0) == 0xC0) {
// 2-byte character
rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c", c, str[1]);
if (remaining_str_len < 2) {
rb_raise(rb_eArgError, "invalid byte sequence in UTF-8");
} else {
rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c", c, str[1]);
}
} else if ((c & 0xF0) == 0xE0) {
// 3-byte character
rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c%c", c, str[1], str[2]);
if (remaining_str_len < 3) {
rb_raise(rb_eArgError, "invalid byte sequence in UTF-8");
} else {
rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c%c", c, str[1], str[2]);
}
} else if ((c & 0xF8) == 0xF0) {
// 4-byte character
rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c%c%c", c, str[1], str[2], str[3]);
if (remaining_str_len < 4) {
rb_raise(rb_eArgError, "invalid byte sequence in UTF-8");
} else {
rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c%c%c%c", c, str[1], str[2], str[3]);
}
} else {
// this should never happen
rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c", c);
Expand Down
38 changes: 38 additions & 0 deletions test/unit/variable_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,44 @@ def test_encoding_error_message_with_multi_byte_characters
)
end

def test_invalid_utf8_sequence
  # Each markup starts a multi-byte UTF-8 character but ends before all of
  # the bytes that character needs have been supplied.
  truncated_markups = [
    "\xC0",         # 2-byte character with 1 byte missing
    "\xE0\x01",     # 3-byte character with 1 byte missing
    "\xE0",         # 3-byte character with 2 bytes missing
    "\xF0\x01\x01", # 4-byte character with 1 byte missing
    "\xF0\x01",     # 4-byte character with 2 bytes missing
    "\xF0",         # 4-byte character with 3 bytes missing
  ]

  # Every truncated input must be rejected with the same ArgumentError.
  truncated_markups.each do |markup|
    error = assert_raises(ArgumentError) { variable_strict_parse(markup) }
    assert_equal("invalid byte sequence in UTF-8", error.message)
  end
end

private

def variable_strict_parse(markup)
Expand Down

0 comments on commit fab42f6

Please sign in to comment.