Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#57 Uncaught IllegalArgumentException due to malformed unicode entity ref #58

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 37 additions & 10 deletions src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -2664,7 +2664,8 @@ protected char[] parseEntityRef()
entityRefName = null;
posStart = pos;
char ch = more();
StringBuilder sb = new StringBuilder();
StringBuilder sb16 = new StringBuilder();
StringBuilder sb10 = new StringBuilder();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of creating two separate string buffers, keeping `sb` and adding an `isHex` boolean would be a little more efficient and would permit code sharing between the two cases, since the only difference is the radix value, wouldn't it?

if ( ch == '#' )
{
// parse character reference
Expand All @@ -2679,17 +2680,17 @@ protected char[] parseEntityRef()
if ( ch >= '0' && ch <= '9' )
{
charRef = (char) ( charRef * 16 + ( ch - '0' ) );
sb.append( ch );
sb16.append( ch );
}
else if ( ch >= 'a' && ch <= 'f' )
{
charRef = (char) ( charRef * 16 + ( ch - ( 'a' - 10 ) ) );
sb.append( ch );
sb16.append( ch );
}
else if ( ch >= 'A' && ch <= 'F' )
{
charRef = (char) ( charRef * 16 + ( ch - ( 'A' - 10 ) ) );
sb.append( ch );
sb16.append( ch );
}
else if ( ch == ';' )
{
Expand All @@ -2710,6 +2711,7 @@ else if ( ch >= 'A' && ch <= 'F' )
if ( ch >= '0' && ch <= '9' )
{
charRef = (char) ( charRef * 10 + ( ch - '0' ) );
sb10.append( ch );
}
else if ( ch == ';' )
{
Expand All @@ -2724,16 +2726,35 @@ else if ( ch >= 'A' && ch <= 'F' )
}
}
posEnd = pos - 1;
if ( sb.length() > 0 )
if ( sb16.length() > 0 )
{
char[] tmp = toChars( Integer.parseInt( sb.toString(), 16 ) );
charRefOneCharBuf = tmp;
try
{
charRefOneCharBuf = toChars( Integer.parseInt( sb16.toString(), 16 ) );
}
catch ( IllegalArgumentException e )
{
throw new XmlPullParserException( "character reference (with hex value " + sb16.toString()
+ ") is invalid", this, null );
}

if ( tokenize )
{
text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length );
}
return charRefOneCharBuf;
}

try
{
toChars( Integer.parseInt( sb10.toString(), 10 ) );
}
catch ( IllegalArgumentException e )
{
throw new XmlPullParserException( "character reference (with decimal value " + sb10.toString()
+ ") is invalid", this, null );
}

charRefOneCharBuf[0] = charRef;
if ( tokenize )
{
Expand Down Expand Up @@ -3996,15 +4017,21 @@ private static boolean isHighSurrogate( char ch )
return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
}

private static final int MIN_CODE_POINT = 0x000000;

private static final int MAX_CODE_POINT = 0x10FFFF;

private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;

/**
 * Check if the provided parameter is a valid Char, according to the XML 1.0 production
 * <a href="https://www.w3.org/TR/REC-xml/#NT-Char">Char</a>.
 *
 * @param codePoint the numeric value to check
 * @return {@code true} if it is a valid numeric character reference, {@code false} otherwise
 */
private static boolean isValidCodePoint( int codePoint )
{
    // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    // A plain [0, 0x10FFFF] range test is not enough: it would also accept the other control
    // characters below #x20, the surrogate block and #xFFFE/#xFFFF, which the production excludes.
    return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
        || ( 0x20 <= codePoint && codePoint <= 0xD7FF )
        || ( 0xE000 <= codePoint && codePoint <= 0xFFFD )
        || ( 0x10000 <= codePoint && codePoint <= 0x10FFFF );
}

private static boolean isSupplementaryCodePoint( int codePoint )
Expand Down
42 changes: 42 additions & 0 deletions src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
*/

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.io.StringReader;
Expand Down Expand Up @@ -156,6 +158,46 @@ public void testUnicodeEntities()
assertEquals( XmlPullParser.END_TAG, parser.nextToken() );
}

/**
 * Feeds the parser a hexadecimal character reference ({@code &#x110000;}) and expects the
 * second {@code nextToken()} call to be rejected with an {@link XmlPullParserException} whose
 * message names the offending hex value.
 *
 * @throws Exception if the parser fails in any way other than the expected one
 */
@Test
public void testInvalidCharacterReferenceHexa()
    throws Exception
{
    MXParser mxParser = new MXParser();
    mxParser.setInput( new StringReader( "<root>&#x110000;</root>" ) );

    try
    {
        int openingToken = mxParser.nextToken();
        assertEquals( XmlPullParser.START_TAG, openingToken );

        // Reading the reference is the step expected to throw.
        int referenceToken = mxParser.nextToken();
        assertEquals( XmlPullParser.ENTITY_REF, referenceToken );

        fail( "Should fail since &#x110000; is an illegal character reference" );
    }
    catch ( XmlPullParserException expected )
    {
        String message = expected.getMessage();
        assertTrue( message.contains( "character reference (with hex value 110000) is invalid" ) );
    }
}

/**
 * Feeds the parser a decimal character reference ({@code &#1114112;}) and expects parsing
 * of the reference to fail with an {@link XmlPullParserException} whose message names the
 * offending decimal value.
 *
 * @throws Exception if the parser fails in any way other than the expected one
 */
@Test
public void testInvalidCharacterReferenceDecimal()
hboutemy marked this conversation as resolved.
Show resolved Hide resolved
throws Exception
{
MXParser parser = new MXParser();
// 1114112 == 0x110000, the decimal spelling of the value used in the hexadecimal test.
String input = "<root>&#1114112;</root>";
parser.setInput( new StringReader( input ) );

try
{
assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
// Reading the reference is the step expected to throw; fail() is reached only if it does not.
assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
fail( "Should fail since &#1114112; is an illegal character reference" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "character reference (with decimal value 1114112) is invalid" ) );
}
}

@Test
public void testProcessingInstruction()
throws Exception
Expand Down