From 7565c56c3ba5feba9a2873e327e7b70cb1013bc4 Mon Sep 17 00:00:00 2001 From: Gabriel Belingueres Date: Fri, 25 Jan 2019 19:42:07 -0300 Subject: [PATCH 1/2] #57 Uncaught IllegalArgumentException due to malformed unicode entity ref - Added a more readable error message by means of a XmlPullParserException. - Improved validation of the numeric character reference, according to XML 1.0 spec. (https://www.w3.org/TR/REC-xml/#NT-Char) --- .../plexus/util/xml/pull/MXParser.java | 47 +++++++++++++++---- .../plexus/util/xml/pull/MXParserTest.java | 42 +++++++++++++++++ 2 files changed, 79 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java index 5a119719..ff501216 100644 --- a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java +++ b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java @@ -2664,7 +2664,8 @@ protected char[] parseEntityRef() entityRefName = null; posStart = pos; char ch = more(); - StringBuilder sb = new StringBuilder(); + StringBuilder sb16 = new StringBuilder(); + StringBuilder sb10 = new StringBuilder(); if ( ch == '#' ) { // parse character reference @@ -2679,17 +2680,17 @@ protected char[] parseEntityRef() if ( ch >= '0' && ch <= '9' ) { charRef = (char) ( charRef * 16 + ( ch - '0' ) ); - sb.append( ch ); + sb16.append( ch ); } else if ( ch >= 'a' && ch <= 'f' ) { charRef = (char) ( charRef * 16 + ( ch - ( 'a' - 10 ) ) ); - sb.append( ch ); + sb16.append( ch ); } else if ( ch >= 'A' && ch <= 'F' ) { charRef = (char) ( charRef * 16 + ( ch - ( 'A' - 10 ) ) ); - sb.append( ch ); + sb16.append( ch ); } else if ( ch == ';' ) { @@ -2710,6 +2711,7 @@ else if ( ch >= 'A' && ch <= 'F' ) if ( ch >= '0' && ch <= '9' ) { charRef = (char) ( charRef * 10 + ( ch - '0' ) ); + sb10.append( ch ); } else if ( ch == ';' ) { @@ -2724,16 +2726,35 @@ else if ( ch >= 'A' && ch <= 'F' ) } } posEnd = pos - 1; - if ( sb.length() > 0 ) + if ( sb16.length() > 0 ) { - char[] tmp = toChars( Integer.parseInt( sb.toString(), 16 ) ); - charRefOneCharBuf = tmp; + try + { + charRefOneCharBuf = toChars( Integer.parseInt( sb16.toString(), 16 ) ); + } + catch ( IllegalArgumentException e ) + { + throw new XmlPullParserException( "character reference (with hex value " + sb16.toString() + + ") is invalid", this, null ); + } + if ( tokenize ) { text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length ); } return charRefOneCharBuf; } + + try + { + toChars( Integer.parseInt( sb10.toString(), 10 ) ); + } + catch ( IllegalArgumentException e ) + { + throw new XmlPullParserException( "character reference (with decimal value " + sb10.toString() + + ") is invalid", this, null ); + } + charRefOneCharBuf[0] = charRef; if ( tokenize ) { @@ -3996,15 +4017,21 @@ private static boolean isHighSurrogate( char ch ) return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch ); } - private static final int MIN_CODE_POINT = 0x000000; - private static final int MAX_CODE_POINT = 0x10FFFF; private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; + /** + * Check if the provided parameter is a valid Char, according to: {@link https://www.w3.org/TR/REC-xml/#NT-Char} + * + * @param codePoint the numeric value to check + * @return true if it is a valid numeric character reference. False otherwise. + */ private static boolean isValidCodePoint( int codePoint ) { - return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint ); + // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD || ( 0x20 <= codePoint && codePoint <= 0xD7FF ) + || ( 0xE000 <= codePoint && codePoint <= 0xFFFD ) || ( 0x10000 <= codePoint && codePoint <= 0X10FFFF ); } private static boolean isSupplementaryCodePoint( int codePoint ) diff --git a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java index a50b4507..f53b5338 100644 --- a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java +++ b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java @@ -17,6 +17,8 @@ */ import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import java.io.IOException; import java.io.StringReader; @@ -156,6 +158,46 @@ public void testUnicodeEntities() assertEquals( XmlPullParser.END_TAG, parser.nextToken() ); } + @Test + public void testInvalidCharacterReferenceHexa() + throws Exception + { + MXParser parser = new MXParser(); + String input = ""; + parser.setInput( new StringReader( input ) ); + + try + { + assertEquals( XmlPullParser.START_TAG, parser.nextToken() ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + fail( "Should fail since � is an illegal character reference" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "character reference (with hex value 110000) is invalid" ) ); + } + } + + @Test + public void testInvalidCharacterReferenceDecimal() + throws Exception + { + MXParser parser = new MXParser(); + String input = ""; + parser.setInput( new StringReader( input ) ); + + try + { + assertEquals( XmlPullParser.START_TAG, parser.nextToken() ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + fail( "Should fail since � is an illegal character reference" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "character reference (with decimal value 1114112) is invalid" ) ); + } + } + @Test public void testProcessingInstruction() throws Exception From e66c906c9ff5dbaf75226ed5f7853e63f4d68f6a Mon Sep 17 00:00:00 2001 From: Gabriel Belingueres Date: Sat, 9 Mar 2019 19:44:39 -0300 Subject: [PATCH 2/2] Added changes suggested by hboutemy. - Replaced two StringBuiders by one, added isHex boolean. - Added tests for valid char references. - Catched and fixed wrong parsing bug for decimal >= ✐ (supplemental) char refs. --- .../plexus/util/xml/pull/MXParser.java | 46 +++------- .../plexus/util/xml/pull/MXParserTest.java | 87 +++++++++++++++++++ 2 files changed, 101 insertions(+), 32 deletions(-) diff --git a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java index ff501216..e03d6bda 100644 --- a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java +++ b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java @@ -2664,14 +2664,16 @@ protected char[] parseEntityRef() entityRefName = null; posStart = pos; char ch = more(); - StringBuilder sb16 = new StringBuilder(); - StringBuilder sb10 = new StringBuilder(); if ( ch == '#' ) { // parse character reference + char charRef = 0; ch = more(); - if ( ch == 'x' ) + StringBuilder sb = new StringBuilder(); + boolean isHex = ( ch == 'x' ); + + if ( isHex ) { // encoded in hex while ( true ) @@ -2680,17 +2682,17 @@ protected char[] parseEntityRef() if ( ch >= '0' && ch <= '9' ) { charRef = (char) ( charRef * 16 + ( ch - '0' ) ); - sb16.append( ch ); + sb.append( ch ); } else if ( ch >= 'a' && ch <= 'f' ) { charRef = (char) ( charRef * 16 + ( ch - ( 'a' - 10 ) ) ); - sb16.append( ch ); + sb.append( ch ); } else if ( ch >= 'A' && ch <= 'F' ) { charRef = (char) ( charRef * 16 + ( ch - ( 'A' - 10 ) ) ); - sb16.append( ch ); + sb.append( ch ); } else if ( ch == ';' ) { @@ -2711,7 +2713,7 @@ else if ( ch >= 'A' && ch <= 'F' ) if ( ch >= '0' && ch <= '9' ) { charRef = (char) ( charRef * 10 + ( ch - '0' ) ); - sb10.append( ch ); + sb.append( ch ); } else if ( ch == ';' ) { @@ -2726,39 +2728,19 @@ else if ( ch >= 'A' && ch <= 'F' ) } } posEnd = pos - 1; - if ( sb16.length() > 0 ) - { - try - { - charRefOneCharBuf = toChars( Integer.parseInt( sb16.toString(), 16 ) ); - } - catch ( IllegalArgumentException e ) - { - throw new XmlPullParserException( "character reference (with hex value " + sb16.toString() - + ") is invalid", this, null ); - } - - if ( tokenize ) - { - text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length ); - } - return charRefOneCharBuf; - } - try { - toChars( Integer.parseInt( sb10.toString(), 10 ) ); + charRefOneCharBuf = toChars( Integer.parseInt( sb.toString(), isHex ? 16 : 10 ) ); } catch ( IllegalArgumentException e ) { - throw new XmlPullParserException( "character reference (with decimal value " + sb10.toString() - + ") is invalid", this, null ); + throw new XmlPullParserException( "character reference (with " + ( isHex ? "hex" : "decimal" ) + + " value " + sb.toString() + ") is invalid", this, null ); } - charRefOneCharBuf[0] = charRef; if ( tokenize ) { - text = newString( charRefOneCharBuf, 0, 1 ); + text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length ); } return charRefOneCharBuf; } @@ -4031,7 +4013,7 @@ private static boolean isValidCodePoint( int codePoint ) { // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD || ( 0x20 <= codePoint && codePoint <= 0xD7FF ) - || ( 0xE000 <= codePoint && codePoint <= 0xFFFD ) || ( 0x10000 <= codePoint && codePoint <= 0X10FFFF ); + || ( 0xE000 <= codePoint && codePoint <= 0xFFFD ) || ( 0x10000 <= codePoint && codePoint <= 0x10FFFF ); } private static boolean isSupplementaryCodePoint( int codePoint ) diff --git a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java index f53b5338..d26afe5b 100644 --- a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java +++ b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java @@ -178,6 +178,49 @@ public void testInvalidCharacterReferenceHexa() } } + @Test + public void testValidCharacterReferenceHexa() + throws Exception + { + MXParser parser = new MXParser(); + String input = " Ȁ퟿ᄁ�𐀀􏿽􏿿"; + parser.setInput( new StringReader( input ) ); + + try + { + assertEquals( XmlPullParser.START_TAG, parser.nextToken() ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0x9, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0xA, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0xD, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0x20, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0x200, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0xD7FF, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0xE000, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0xFFA2, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0xFFFD, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0x10000, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0x10FFFD, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 0x10FFFF, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.END_TAG, parser.nextToken() ); + } + catch ( XmlPullParserException e ) + { + fail( "Should success since the input represents all legal character references" ); + } + } + @Test public void testInvalidCharacterReferenceDecimal() throws Exception @@ -198,6 +241,50 @@ public void testInvalidCharacterReferenceDecimal() } } + @Test + public void testValidCharacterReferenceDecimal() + throws Exception + { + MXParser parser = new MXParser(); + String input = + " Ȁ퟿ᄁ�𐀀􏿽􏿿"; + parser.setInput( new StringReader( input ) ); + + try + { + assertEquals( XmlPullParser.START_TAG, parser.nextToken() ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 9, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 10, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 13, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 32, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 512, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 55295, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 57344, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 65442, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 65533, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 65536, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 1114109, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( 1114111, parser.getText().codePointAt( 0 ) ); + assertEquals( XmlPullParser.END_TAG, parser.nextToken() ); + } + catch ( XmlPullParserException e ) + { + fail( "Should success since the input represents all legal character references" ); + } + } + @Test public void testProcessingInstruction() throws Exception