From 954c46a92a1a0c5052ed241d4398db8b0331e1f3 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 4 Dec 2023 10:15:43 +1100 Subject: [PATCH] Removed character range check (>= 0xD800 && <= 0xDFFF) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That was preventing the valid decode of &#55357;&#56495; to 💯. This rule must have been in the spec when initially implemented but I can't find a reference to it now. I'm assuming that the range had since been added, but can't immediately identify why it was explicitly excluded originally. Fixes #2047 --- CHANGES.md | 2 ++ src/main/java/org/jsoup/parser/Tokeniser.java | 7 ++++-- .../java/org/jsoup/nodes/EntitiesTest.java | 10 +++++++++ .../java/org/jsoup/parser/HtmlParserTest.java | 22 +++++++++++++++---- 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index a0852ec691..4f75af5058 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,6 +15,8 @@ correctly. [2067](https://github.com/jhy/jsoup/issues/2067) * When tracking the source position of a body fragment parse, a null pointer exception was thrown. [2068](https://github.com/jhy/jsoup/issues/2068) +* A multi-point encoded emoji entity may be incorrectly decoded to the replacement + character. [2047](https://github.com/jhy/jsoup/issues/2074) --- Older changes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27) may be found in diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java index d77fbf9e81..1d7b3ff68f 100644 --- a/src/main/java/org/jsoup/parser/Tokeniser.java +++ b/src/main/java/org/jsoup/parser/Tokeniser.java @@ -205,8 +205,11 @@ void advanceTransition(TokeniserState newState) { int base = isHexMode ? 
16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException ignored) { - } // skip - if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { + // skip + } + // todo: check for extra illegal unicode points as parse errors - described https://html.spec.whatwg.org/multipage/syntax.html#character-references and in Infra + // The numeric character reference forms described above are allowed to reference any code point excluding U+000D CR, noncharacters, and controls other than ASCII whitespace. + if (charval == -1 || charval > 0x10FFFF) { characterReferenceError("character [%s] outside of valid range", charval); codeRef[0] = replacementChar; } else { diff --git a/src/test/java/org/jsoup/nodes/EntitiesTest.java b/src/test/java/org/jsoup/nodes/EntitiesTest.java index 7243c2f55c..4139a9e964 100644 --- a/src/test/java/org/jsoup/nodes/EntitiesTest.java +++ b/src/test/java/org/jsoup/nodes/EntitiesTest.java @@ -175,4 +175,14 @@ public class EntitiesTest { String escaped2 = assertDoesNotThrow(() -> Entities.escape(text, clone2)); assertEquals(escaped1, escaped2); } + + @Test void parseHtmlEncodedEmojiMultipoint() { + String emoji = Parser.unescapeEntities("&#55357;&#56495;", false); // 💯 + assertEquals("\uD83D\uDCAF", emoji); + } + + @Test void parseHtmlEncodedEmoji() { + String emoji = Parser.unescapeEntities("&#128175;", false); // 💯 + assertEquals("\uD83D\uDCAF", emoji); + } } diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 5f9eca9d7c..68b39ed3ee 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -851,7 +851,7 @@ private static Stream dupeAttributeData() { } @Test public void tracksErrorsWhenRequested() { - String html = "

One

\n\n&arrgh;! & �
dupeAttributeData() { assertEquals("<3:16>: Tag [font] cannot be self closing; not a void tag", errors.get(3).toString()); assertEquals("<3:20>: Invalid character reference: missing semicolon on [!]", errors.get(4).toString()); assertEquals("<3:25>: Invalid character reference: missing semicolon on [&]", errors.get(5).toString()); - assertEquals("<3:34>: Invalid character reference: character [55296] outside of valid range", errors.get(6).toString()); - assertEquals("<3:46>: Unexpected EndTag token [] when in state [InBody]", errors.get(7).toString()); - assertEquals("<3:51>: Unexpectedly reached end of file (EOF) in input state [TagName]", errors.get(8).toString()); + assertEquals("<3:36>: Invalid character reference: character [1114112] outside of valid range", errors.get(6).toString()); + assertEquals("<3:48>: Unexpected EndTag token [] when in state [InBody]", errors.get(7).toString()); + assertEquals("<3:53>: Unexpectedly reached end of file (EOF) in input state [TagName]", errors.get(8).toString()); } @Test public void tracksLimitedErrorsWhenRequested() { @@ -1874,4 +1874,18 @@ private static void assertMathNamespace(Element el) { assertMathNamespace(doc4.expectFirst("annotation-xml")); assertHtmlNamespace(doc4.expectFirst("divv")); } + + @Test void parseEmojiFromMultipointEncoded() { + String html = ""; + Document document = Jsoup.parse(html); + Element img = document.expectFirst("img"); + assertEquals("\uD83D\uDCAF", img.attr("multi")); + assertEquals("\uD83D\uDCAF", img.attr("single")); + assertEquals("\uD83D\uDCAF", img.attr("hexsingle")); + + assertEquals("", img.outerHtml()); + + img.ownerDocument().outputSettings().charset("ascii"); + assertEquals("", img.outerHtml()); + } }