From 0ef4b70b2acf47c1094aedbb9954324a2e84c05e Mon Sep 17 00:00:00 2001
From: Jonathan Hedley
Date: Sat, 23 Nov 2024 11:45:24 +1100
Subject: [PATCH] Allow `<` in tag name state
We used to have specific handling for this, but that moved us out of spec, and the author's intent in such input is not clear-cut.
Fixes #2230
---
CHANGES.md | 5 ++++-
.../org/jsoup/parser/CharacterReader.java | 3 +--
.../java/org/jsoup/parser/TokeniserState.java | 4 ----
.../java/org/jsoup/parser/HtmlParserTest.java | 19 +++++++++++++++++--
.../org/jsoup/parser/TokeniserStateTest.java | 17 -----------------
5 files changed, 22 insertions(+), 26 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
index b4199879b7..a0ea757120 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -35,7 +35,10 @@
applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831)
* When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an
attribute). [2207](https://github.com/jhy/jsoup/issues/2207)
-* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
+* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly
+ created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
+* Follow the current HTML specification in the tokenizer to allow `<` as part of a tag name, instead of emitting it as a
+ character node. [2230](https://github.com/jhy/jsoup/issues/2230)
## 1.18.1 (2024-Jul-10)
diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java
index 1e015b0c06..9c16d44da1 100644
--- a/src/main/java/org/jsoup/parser/CharacterReader.java
+++ b/src/main/java/org/jsoup/parser/CharacterReader.java
@@ -489,7 +489,7 @@ String consumeRawData() {
String consumeTagName() {
// '\t', '\n', '\r', '\f', ' ', '/', '>'
- // NOTE: out of spec, added '<' to fix common author bugs; does not stop and append on nullChar but eats
+ // NOTE: out of spec; does not stop and append on nullChar but eats
bufferUp();
int pos = bufPos;
final int start = pos;
@@ -505,7 +505,6 @@ String consumeTagName() {
case ' ':
case '/':
case '>':
- case '<':
break OUTER;
}
pos++;
diff --git a/src/main/java/org/jsoup/parser/TokeniserState.java b/src/main/java/org/jsoup/parser/TokeniserState.java
index 081b1525fb..7567936af9 100644
--- a/src/main/java/org/jsoup/parser/TokeniserState.java
+++ b/src/main/java/org/jsoup/parser/TokeniserState.java
@@ -160,10 +160,6 @@ enum TokeniserState {
case '/':
t.transition(SelfClosingStartTag);
break;
- case '<': // NOTE: out of spec, but clear author intent
- r.unconsume();
- t.error(this);
- // intended fall through to next >
case '>':
t.emitTagPending();
t.transition(Data);
diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
index d87318bda0..07c94222ea 100644
--- a/src/test/java/org/jsoup/parser/HtmlParserTest.java
+++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -1649,9 +1649,9 @@ private boolean didAddElements(String input) {
// when the Element is created, the name got normalized to "template" and so looked like there should be a
// template on the stack during resetInsertionMode for the select.
// The issue was that the normalization in Tag.valueOf did a trim which the Token.Tag did not
- Document doc = Jsoup.parse("