From 0ef4b70b2acf47c1094aedbb9954324a2e84c05e Mon Sep 17 00:00:00 2001
From: Jonathan Hedley <jonathan@hedley.net>
Date: Sat, 23 Nov 2024 11:45:24 +1100
Subject: [PATCH] Allow `<` in tag name state

We used to have specific handling for this, but that moved us out of spec, and the author's intent in these cases is not clear-cut.

Fixes #2230
---
 CHANGES.md                                    |  5 ++++-
 .../org/jsoup/parser/CharacterReader.java     |  3 +--
 .../java/org/jsoup/parser/TokeniserState.java |  4 ----
 .../java/org/jsoup/parser/HtmlParserTest.java | 19 +++++++++++++++++--
 .../org/jsoup/parser/TokeniserStateTest.java  | 17 -----------------
 5 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index b4199879b7..a0ea757120 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -35,7 +35,10 @@
   applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831)
 * When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an
   attribute). [2207](https://github.com/jhy/jsoup/issues/2207)
-* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
+* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly
+  created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
+* Follow the current HTML specification in the tokenizer to allow `<` as part of a tag name, instead of emitting it as a
+  character node. [2230](https://github.com/jhy/jsoup/issues/2230)
 
 ## 1.18.1 (2024-Jul-10)
 
diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java
index 1e015b0c06..9c16d44da1 100644
--- a/src/main/java/org/jsoup/parser/CharacterReader.java
+++ b/src/main/java/org/jsoup/parser/CharacterReader.java
@@ -489,7 +489,7 @@ String consumeRawData() {
 
     String consumeTagName() {
         // '\t', '\n', '\r', '\f', ' ', '/', '>'
-        // NOTE: out of spec, added '<' to fix common author bugs; does not stop and append on nullChar but eats
+        // NOTE: out of spec; does not stop and append on nullChar but eats it
         bufferUp();
         int pos = bufPos;
         final int start = pos;
@@ -505,7 +505,6 @@ String consumeTagName() {
                 case ' ':
                 case '/':
                 case '>':
-                case '<':
                     break OUTER;
             }
             pos++;
diff --git a/src/main/java/org/jsoup/parser/TokeniserState.java b/src/main/java/org/jsoup/parser/TokeniserState.java
index 081b1525fb..7567936af9 100644
--- a/src/main/java/org/jsoup/parser/TokeniserState.java
+++ b/src/main/java/org/jsoup/parser/TokeniserState.java
@@ -160,10 +160,6 @@ enum TokeniserState {
                 case '/':
                     t.transition(SelfClosingStartTag);
                     break;
-                case '<': // NOTE: out of spec, but clear author intent
-                    r.unconsume();
-                    t.error(this);
-                    // intended fall through to next >
                 case '>':
                     t.emitTagPending();
                     t.transition(Data);
diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
index d87318bda0..07c94222ea 100644
--- a/src/test/java/org/jsoup/parser/HtmlParserTest.java
+++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -1649,9 +1649,9 @@ private boolean didAddElements(String input) {
         // when the Element is created, the name got normalized to "template" and so looked like there should be a
         // template on the stack during resetInsertionMode for the select.
         // The issue was that the normalization in Tag.valueOf did a trim which the Token.Tag did not
-        Document doc = Jsoup.parse("<template\u001E<select<input<");
+        Document doc = Jsoup.parse("<template\u001E><select><input>");
         assertNotNull(doc);
-        assertEquals("<template><select></select><input>&lt;</template>",
+        assertEquals("<template><select></select><input></template>",
             TextUtil.stripNewlines(doc.head().html()));
     }
 
@@ -1924,4 +1924,19 @@ private static void assertMathNamespace(Element el) {
             TextUtil.normalizeSpaces(doc.body().html())
         );
     }
+
+    @Test void gtAfterTagClose() {
+        // https://github.com/jhy/jsoup/issues/2230
+        String html = "<div>Div</div<> <a>One<a<b>Hello</b>";
+        // this gives us an element "a<b", which is gross, but matches the spec & browsers
+        Document doc = Jsoup.parse(html);
+        Element body = doc.body();
+        assertEquals("<div> Div <a>One<a<b> Hello </a<b></a></div>", TextUtil.normalizeSpaces(body.html()));
+
+        Elements abs = doc.getElementsByTag("a<b");
+        assertEquals(1, abs.size());
+        Element ab = abs.first();
+        assertEquals("Hello", ab.text());
+        assertEquals("a<b", ab.tag().normalName());
+    }
 }
diff --git a/src/test/java/org/jsoup/parser/TokeniserStateTest.java b/src/test/java/org/jsoup/parser/TokeniserStateTest.java
index 6d9b5f7a77..90aced1708 100644
--- a/src/test/java/org/jsoup/parser/TokeniserStateTest.java
+++ b/src/test/java/org/jsoup/parser/TokeniserStateTest.java
@@ -198,13 +198,6 @@ public void testPublicAndSystemIdentifiersWithWhitespace() {
         }
     }
 
-    @Test public void handlesLessInTagThanAsNewTag() {
-        // out of spec, but clear author intent
-        String html = "<p\n<p<div id=one <span>Two";
-        Document doc = Jsoup.parse(html);
-        assertEquals("<p></p><p></p><div id=\"one\"><span>Two</span></div>", TextUtil.stripNewlines(doc.body().html()));
-    }
-
     @Test
     public void testUnconsumeAtBufferBoundary() {
         String triggeringSnippet = "<a href=\"\"foo";
@@ -250,16 +243,6 @@ public void testMalformedSelfClosingTag() {
         assertEquals(7, errorList.get(0).getPosition());
     }
 
-    @Test
-    public void testOpeningAngleBracketInTagName() {
-        String triggeringSnippet = "<html<";
-        ParseErrorList errorList = ParseErrorList.tracking(1);
-
-        Parser.parseFragment(triggeringSnippet, null, "", errorList);
-
-        assertEquals(5, errorList.get(0).getPosition());
-    }
-
     @Test
     public void rcData() {
         Document doc = Jsoup.parse("<title>One \0Two</title>");