From 0ef4b70b2acf47c1094aedbb9954324a2e84c05e Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sat, 23 Nov 2024 11:45:24 +1100 Subject: [PATCH] Allow `<` in tag name state We used to have specific handling for this, but that moves us out of spec, and it's not a clear-cut intent. Fixes #2230 --- CHANGES.md | 5 ++++- .../org/jsoup/parser/CharacterReader.java | 3 +-- .../java/org/jsoup/parser/TokeniserState.java | 4 ---- .../java/org/jsoup/parser/HtmlParserTest.java | 19 +++++++++++++++++-- .../org/jsoup/parser/TokeniserStateTest.java | 17 ----------------- 5 files changed, 22 insertions(+), 26 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index b4199879b7..a0ea757120 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -35,7 +35,10 @@ applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831) * When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an attribute). [2207](https://github.com/jhy/jsoup/issues/2207) -* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204) +* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly + created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204) +* Follow the current HTML specification in the tokenizer to allow `<` as part of a tag name, instead of emitting it as a + character node. [2230](https://github.com/jhy/jsoup/issues/2230) ## 1.18.1 (2024-Jul-10) diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index 1e015b0c06..9c16d44da1 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -489,7 +489,7 @@ String consumeRawData() { String consumeTagName() { // '\t', '\n', '\r', '\f', ' ', '/', '>' - // NOTE: out of spec, added '<' to fix common author bugs; does not stop and append on nullChar but eats + // NOTE: out of spec; does not stop and append on nullChar but eats bufferUp(); int pos = bufPos; final int start = pos; @@ -505,7 +505,6 @@ String consumeTagName() { case ' ': case '/': case '>': - case '<': break OUTER; } pos++; diff --git a/src/main/java/org/jsoup/parser/TokeniserState.java b/src/main/java/org/jsoup/parser/TokeniserState.java index 081b1525fb..7567936af9 100644 --- a/src/main/java/org/jsoup/parser/TokeniserState.java +++ b/src/main/java/org/jsoup/parser/TokeniserState.java @@ -160,10 +160,6 @@ enum TokeniserState { case '/': t.transition(SelfClosingStartTag); break; - case '<': // NOTE: out of spec, but clear author intent - r.unconsume(); - t.error(this); - // intended fall through to next > case '>': t.emitTagPending(); t.transition(Data); diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index d87318bda0..07c94222ea 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -1649,9 +1649,9 @@ private boolean didAddElements(String input) { // when the Element is created, the name got normalized to "template" and so looked like there should be a // template on the stack during resetInsertionMode for the select. // The issue was that the normalization in Tag.valueOf did a trim which the Token.Tag did not - Document doc = Jsoup.parse(""); assertNotNull(doc); - assertEquals("", + assertEquals("", TextUtil.stripNewlines(doc.head().html())); } @@ -1924,4 +1924,19 @@ private static void assertMathNamespace(Element el) { TextUtil.normalizeSpaces(doc.body().html()) ); } + + @Test void gtAfterTagClose() { + // https://github.com/jhy/jsoup/issues/2230 + String html = "
Div OneHello"; + // this gives us an element "a Div One Hello
", TextUtil.normalizeSpaces(body.html())); + + Elements abs = doc.getElementsByTag("a

Two
", TextUtil.stripNewlines(doc.body().html())); - } - @Test public void testUnconsumeAtBufferBoundary() { String triggeringSnippet = "One \0Two");