Removed character range check (>= 0xD800 && <= 0xDFFF)

That was preventing the valid decode of &#55357;&#56495; to 💯. This rule must have been in the spec when the check was initially implemented, but I can't find a reference to it now. I'm assuming that the range has since been permitted, but I can't immediately identify why it was explicitly excluded originally. Fixes #2047
jhy · Dec 3, 2023 · 954c46a · 954c46a
1 parent e39b9b9
commit 954c46a
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 6 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -15,6 +15,8 @@
   correctly. [2067](https://github.com/jhy/jsoup/issues/2067)
 * When tracking the source position of a body fragment parse, a null pointer exception was
   thrown. [2068](https://github.com/jhy/jsoup/issues/2068)
+* A multi-point encoded emoji entity may be incorrectly decoded to the replacement
+  character. [2074](https://github.com/jhy/jsoup/issues/2074)
 
 ---
 Older changes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27) may be found in

diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java
@@ -205,8 +205,11 @@ void advanceTransition(TokeniserState newState) {
                 int base = isHexMode ? 16 : 10;
                 charval = Integer.valueOf(numRef, base);
             } catch (NumberFormatException ignored) {
-            } // skip
-            if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
+                // skip
+            }
+            // todo: check for extra illegal unicode points as parse errors - described https://html.spec.whatwg.org/multipage/syntax.html#character-references and in Infra
+            // The numeric character reference forms described above are allowed to reference any code point excluding U+000D CR, noncharacters, and controls other than ASCII whitespace.
+            if (charval == -1 || charval > 0x10FFFF) {
                 characterReferenceError("character [%s] outside of valid range", charval);
                 codeRef[0] = replacementChar;
             } else {

diff --git a/src/test/java/org/jsoup/nodes/EntitiesTest.java b/src/test/java/org/jsoup/nodes/EntitiesTest.java
@@ -175,4 +175,14 @@ public class EntitiesTest {
         String escaped2 = assertDoesNotThrow(() -> Entities.escape(text, clone2));
         assertEquals(escaped1, escaped2);
     }
+
+    @Test void parseHtmlEncodedEmojiMultipoint() {
+        String emoji = Parser.unescapeEntities("&#55357;&#56495;", false); // 💯
+        assertEquals("\uD83D\uDCAF", emoji);
+    }
+
+    @Test void parseHtmlEncodedEmoji() {
+        String emoji = Parser.unescapeEntities("&#128175;", false); // 💯
+        assertEquals("\uD83D\uDCAF", emoji);
+    }
 }
diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -851,7 +851,7 @@ private static Stream<Arguments> dupeAttributeData() {
     }
 
     @Test public void tracksErrorsWhenRequested() {
-        String html = "<p>One</p href='no'>\n<!DOCTYPE html>\n&arrgh;<font />&#33 &amp &#xD800;<br /></div><foo";
+        String html = "<p>One</p href='no'>\n<!DOCTYPE html>\n&arrgh;<font />&#33 &amp &#x110000;<br /></div><foo";
         Parser parser = Parser.htmlParser().setTrackErrors(500);
         Document doc = Jsoup.parse(html, "http://example.com", parser);
 
@@ -863,9 +863,9 @@ private static Stream<Arguments> dupeAttributeData() {
         assertEquals("<3:16>: Tag [font] cannot be self closing; not a void tag", errors.get(3).toString());
         assertEquals("<3:20>: Invalid character reference: missing semicolon on [&#33]", errors.get(4).toString());
         assertEquals("<3:25>: Invalid character reference: missing semicolon on [&amp]", errors.get(5).toString());
-        assertEquals("<3:34>: Invalid character reference: character [55296] outside of valid range", errors.get(6).toString());
-        assertEquals("<3:46>: Unexpected EndTag token [</div>] when in state [InBody]", errors.get(7).toString());
-        assertEquals("<3:51>: Unexpectedly reached end of file (EOF) in input state [TagName]", errors.get(8).toString());
+        assertEquals("<3:36>: Invalid character reference: character [1114112] outside of valid range", errors.get(6).toString());
+        assertEquals("<3:48>: Unexpected EndTag token [</div>] when in state [InBody]", errors.get(7).toString());
+        assertEquals("<3:53>: Unexpectedly reached end of file (EOF) in input state [TagName]", errors.get(8).toString());
     }
 
     @Test public void tracksLimitedErrorsWhenRequested() {
@@ -1874,4 +1874,18 @@ private static void assertMathNamespace(Element el) {
         assertMathNamespace(doc4.expectFirst("annotation-xml"));
         assertHtmlNamespace(doc4.expectFirst("divv"));
     }
+
+    @Test void parseEmojiFromMultipointEncoded() {
+        String html = "<img multi='&#55357;&#56495;' single='&#128175;' hexsingle='&#x1f4af;'>";
+        Document document = Jsoup.parse(html);
+        Element img = document.expectFirst("img");
+        assertEquals("\uD83D\uDCAF", img.attr("multi"));
+        assertEquals("\uD83D\uDCAF", img.attr("single"));
+        assertEquals("\uD83D\uDCAF", img.attr("hexsingle"));
+
+        assertEquals("<img multi=\"\uD83D\uDCAF\" single=\"\uD83D\uDCAF\" hexsingle=\"\uD83D\uDCAF\">", img.outerHtml());
+
+        img.ownerDocument().outputSettings().charset("ascii");
+        assertEquals("<img multi=\"&#x1f4af;\" single=\"&#x1f4af;\" hexsingle=\"&#x1f4af;\">", img.outerHtml());
+    }
 }