Skip to content

Commit

Permalink
ICU-22898 MF2 parser bug fixes
Browse files Browse the repository at this point in the history
ICU4C: Escape curly braces when serializing and normalizing
ICU4C: Escape '|' in patterns
ICU4C: When normalizing input, escape optionally-escaped characters in patterns
ICU4C/ICU4J: Allow trailing whitespace after a match
ICU4C: Fix parser to iterate over code points, not code units
Add tests with old reserved syntax as syntax-error tests
  • Loading branch information
catamorphism committed Sep 20, 2024
1 parent 9adcf8b commit 8f82fac
Show file tree
Hide file tree
Showing 9 changed files with 408 additions and 322 deletions.
620 changes: 321 additions & 299 deletions icu4c/source/i18n/messageformat2_parser.cpp

Large diffs are not rendered by default.

13 changes: 12 additions & 1 deletion icu4c/source/i18n/messageformat2_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,21 @@ namespace message2 {
SelectorKeys parseNonEmptyKeys(UErrorCode&);
void errorPattern(UErrorCode& status);
Pattern parseQuotedPattern(UErrorCode&);
bool isDeclarationStart();

UChar32 peek() const { return source.char32At(index) ; }
UChar32 peek(uint32_t i) const {
return source.char32At(source.moveIndex32(index, i));
}
void next() { index = source.moveIndex32(index, 1); }

bool inBounds() const { return (int32_t) index < source.length(); }
bool inBounds(uint32_t i) const { return source.moveIndex32(index, i) < source.length(); }
bool allConsumed() const { return (int32_t) index == source.length(); }

// The input string
const UnicodeString &source;
// The current position within the input string
// The current position within the input string -- counting in UChar32
uint32_t index;
// Represents the current line (and when an error is indicated),
// character offset within the line of the parse error
Expand Down
29 changes: 16 additions & 13 deletions icu4c/source/i18n/messageformat2_serializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,26 @@ void Serializer::emit(const std::u16string_view& token) {
void Serializer::emit(const Literal& l) {
if (l.isQuoted()) {
emit(PIPE);
const UnicodeString& contents = l.unquoted();
for (int32_t i = 0; i < contents.length(); i++) {
// Re-escape any PIPE or BACKSLASH characters
}
const UnicodeString& contents = l.unquoted();
for (int32_t i = 0; ((int32_t) i) < contents.length(); i++) {
// Re-escape any escaped-char characters
switch(contents[i]) {
case BACKSLASH:
case PIPE: {
emit(BACKSLASH);
break;
case PIPE:
case LEFT_CURLY_BRACE:
case RIGHT_CURLY_BRACE: {
emit(BACKSLASH);
break;
}
default: {
break;
break;
}
}
emit(contents[i]);
}
emit(PIPE);
} else {
emit(l.unquoted());
}
if (l.isQuoted()) {
emit(PIPE);
}
}

Expand Down Expand Up @@ -194,9 +196,10 @@ void Serializer::emit(const PatternPart& part) {
if (part.isText()) {
// Raw text
const UnicodeString& text = part.asText();
// Re-escape '{'/'}'/'\'
for (int32_t i = 0; i < text.length(); i++) {
// Re-escape '{'/'}'/'\''|'
for (int32_t i = 0; ((int32_t) i) < text.length(); i++) {
switch(text[i]) {
case PIPE:
case BACKSLASH:
case LEFT_CURLY_BRACE:
case RIGHT_CURLY_BRACE: {
Expand Down
2 changes: 2 additions & 0 deletions icu4c/source/test/intltest/messageformat2test_read_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) {
// Do tests for reserved statements/expressions
runTestsFromJsonFile(*this, "spec/unsupported-expressions.json", errorCode);
runTestsFromJsonFile(*this, "spec/unsupported-statements.json", errorCode);
// Uncomment when reserved syntax is removed
// runTestsFromJsonFile(*this, "syntax-errors-reserved.json", errorCode);

// Do valid spec tests
runTestsFromJsonFile(*this, "spec/syntax.json", errorCode);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -613,10 +613,8 @@ private MFDataModel.Variant getVariant() throws MFParseException {
}
keys.add(key);
}
// Only want to skip whitespace if we parsed at least one key
if (!keys.isEmpty()) {
skipOptionalWhitespaces();
}
// Trailing whitespace is allowed after the message
skipOptionalWhitespaces();
if (input.atEnd()) {
checkCondition(
keys.isEmpty(), "After selector keys it is mandatory to have a pattern.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ public class CoreTest extends CoreTestFmwk {
"syntax-errors-diagnostics.json",
"syntax-errors-diagnostics-multiline.json",
"syntax-errors-end-of-input.json",
"syntax-errors-reserved.json",
"tricky-declarations.json",
"valid-tests.json"};

Expand Down
9 changes: 4 additions & 5 deletions testdata/message2/syntax-errors-diagnostics.json
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,6 @@
"char": 28,
"comment": "Trailing characters that are not whitespace"
},
{
"src": ".match {$foo :string} {$bar :string} one * {{one}} * * {{other}} ",
"char": 66,
"comment": "Trailing whitespace at end of message should not be accepted either"
},
{
"src": "empty { }",
"char": 8,
Expand Down Expand Up @@ -343,6 +338,10 @@
"src": ".match {1} {{_}}",
"char": 12,
"comment": "Disambiguating a wrong .match from an unsupported statement"
},
{
"src": "{{{/p o4.􍅲 = 1}}}",
"char": 9
}
]
}
22 changes: 22 additions & 0 deletions testdata/message2/syntax-errors-reserved.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"scenario": "Reserved and private annotations",
"description": "Tests for old unsupported expression (reserved/private) syntax -- these are now syntax errors",
"defaultTestProperties": {
"locale": "en-US",
"expErrors": [
{
"type": "syntax-error"
}
]
},
"tests": [
{ "src" : "\\\\{ ? 󗟋 𫓜|@} |}" },
{ "src" : "\\\\{ ? 󗟋 𫓜|@} | \\} @𠟅򑚎𥪙𽧫=|| @򒘒򳷦㥞򉊷=$򸚶񽱆񅗽񤕞 @𰺱:񎫛񢶛򶈎񄮒}" },
{ "src" : "{ $iFN ^ @USh =$u @l}" },
{ "src" : ".local $dS ={ $p4 ^ |.| \\\\ @g:FV = $kd}{{@}}" },
{ "src" : ".D. \\\\ ||{1}{{}} " },
{ "src" : ".cIT ||{|@| % \\} } { *󔜫񥘃󸇀 }{{}}" },
{ "src" : ".򱋿󠆿 . |@| {$󛒁񓝖 & \\{ . @𯼼򋼡= $򵀀񓞈}{{\\\\}}" }
]
}

28 changes: 28 additions & 0 deletions testdata/message2/valid-tests.json
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,34 @@
"comment": "message -> complex-message -> *(declaration [s]) complex-body -> declaration complex-body -> input-declaration complex-body -> input variable-expression complex-body",
"src": ".input{$x}{{}}",
"exp": ""
},
{
"comment": "Test that escaped characters are re-escaped properly when serializing literals",
"src": "{|\\}|}",
"exp": "}"
},
{
"comment": "Test that escaped characters are re-escaped properly when serializing patterns",
"src": "\\|",
"exp": "|"
},
{
"comment": "Test that parser and serializer treat optionally-escaped characters consistently",
"src" : "{{{1}|}}",
"exp": "1|"
},
{
"comment": "Trailing whitespace after match is valid",
"src": ".match {1 :string} * {{}} ",
"exp": ""
},
{
"src" : "𴢸",
"exp" : "𴢸"
},
{
"src" : "{{􍅲}}",
"exp" : "􍅲"
}
]
}

0 comments on commit 8f82fac

Please sign in to comment.