From 32b72a6e8ff51e7b53510ecfc47ca46eaa4695ba Mon Sep 17 00:00:00 2001 From: Guillaume Nodet Date: Wed, 1 Mar 2023 23:01:46 +0100 Subject: [PATCH] Fix reading comments with UTF chars (fixes #238) (#240) * Fix reading comments with UTF chars (fixes #238) * Fix printable methods to account for UTF chars --- .../plexus/util/xml/pull/MXParser.java | 39 +++++++++++++++---- .../plexus/util/xml/pull/MXParserTest.java | 26 +++++++++++++ 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java index d44c9a7f..e9fc1182 100644 --- a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java +++ b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java @@ -2981,8 +2981,8 @@ private void parseComment() // implements XML 1.0 Section 2.5 Comments // ASSUMPTION: seen - ch = more(); + cch = more(); + int ch; + char cch2; + if ( Character.isHighSurrogate( cch ) ) + { + cch2 = more(); + ch = Character.toCodePoint( cch, cch2 ); + } + else + { + cch2 = 0; + ch = cch; + } if ( seenDashDash && ch != '>' ) { throw new XmlPullParserException( "in comment after two dashes (--) next character must be >" @@ -3074,7 +3086,11 @@ else if ( ch == '\n' ) { if ( pcEnd >= pc.length ) ensurePC( pcEnd ); - pc[pcEnd++] = ch; + pc[pcEnd++] = cch; + if ( cch2 != 0 ) + { + pc[pcEnd++] = cch2; + } } normalizedCR = false; } @@ -4153,7 +4169,7 @@ private static boolean isS( char ch ) // ch != '\u0000' ch < '\uFFFE' // private char printable(char ch) { return ch; } - private static String printable( char ch ) + private static String printable( int ch ) { if ( ch == '\n' ) { @@ -4175,18 +4191,25 @@ else if ( ch == '\'' ) { return "\\u" + Integer.toHexString( ch ); } - return "" + ch; + if ( Character.isBmpCodePoint( ch ) ) + { + return Character.toString( ( char ) ch ); + } + else + { + return new String( new char[] { Character.highSurrogate( ch ), Character.lowSurrogate( ch ) } ); + } } private static String printable( String s ) { if ( s == null ) return null; - final int sLen = s.length(); + final int sLen = s.codePointCount(0, s.length()); StringBuilder buf = new StringBuilder( sLen + 10 ); for ( int i = 0; i < sLen; ++i ) { - buf.append( printable( s.charAt( i ) ) ); + buf.append( printable( s.codePointAt( i ) ) ); } s = buf.toString(); return s; diff --git a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java index e5e04708..cba42b32 100644 --- a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java +++ b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java @@ -1511,4 +1511,30 @@ public void testReplacementInPCArrayWithShorterCharArray() fail( "should not raise exception: " + e ); } } + + /** + * Ensures emoji can be parsed correctly + */ + @Test + public void testUnicode() throws IOException { + String input = ""; + + try + { + MXParser parser = new MXParser(); + parser.setInput( new StringReader( input ) ); + + assertEquals( XmlPullParser.START_TAG, parser.nextToken() ); + assertEquals( "project", parser.getName() ); + assertEquals( XmlPullParser.COMMENT, parser.nextToken() ); + assertEquals( "ALL TEH BOMS! \uD83D\uDCA3 ", parser.getText() ); + assertEquals( XmlPullParser.END_TAG, parser.nextToken() ); + assertEquals( "project", parser.getName() ); + } + catch ( XmlPullParserException e ) + { + e.printStackTrace(); + fail( "should not raise exception: " + e ); + } + } }