diff --git a/CHANGES.md b/CHANGES.md index 5c2438b023..b4199879b7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -35,6 +35,7 @@ applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831) * When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an attribute). [2207](https://github.com/jhy/jsoup/issues/2207) +* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204) ## 1.18.1 (2024-Jul-10) diff --git a/src/main/java/org/jsoup/nodes/Attributes.java b/src/main/java/org/jsoup/nodes/Attributes.java index bc0f458795..12460ee3d0 100644 --- a/src/main/java/org/jsoup/nodes/Attributes.java +++ b/src/main/java/org/jsoup/nodes/Attributes.java @@ -389,6 +389,25 @@ public Range.AttributeRange sourceRange(String key) { return (Map) userData(AttrRangeKey); } + /** + Set the source ranges (start to end position) from which this attribute's name and value were parsed. + @param key the attribute name + @param range the range for the attribute's name and value + @return these attributes, for chaining + @since 1.18.2 + */ + public Attributes sourceRange(String key, Range.AttributeRange range) { + Validate.notNull(key); + Validate.notNull(range); + Map ranges = getRanges(); + if (ranges == null) { + ranges = new HashMap<>(); + userData(AttrRangeKey, ranges); + } + ranges.put(key, range); + return this; + } + @Override public Iterator iterator() { diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index 470a785a50..9a57c9c07d 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -7,6 +7,7 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.DocumentType; import org.jsoup.nodes.Element; +import org.jsoup.nodes.Range; import java.util.ArrayList; @@ -371,12 +372,7 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) { stack = tb.getStack(); if (stack.size() > 0) { Element html = tb.getStack().get(0); - if (startTag.hasAttributes()) { - for (Attribute attribute : startTag.attributes) { - if (!html.hasAttr(attribute.getKey())) - html.attributes().put(attribute); - } - } + mergeAttributes(startTag, html); } break; case "body": @@ -388,13 +384,8 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) { } else { tb.framesetOk(false); // will be on stack if this is a nested body. won't be if closed (which is a variance from spec, which leaves it on) - Element body; - if (startTag.hasAttributes() && (body = tb.getFromStack("body")) != null) { // we only ever put one body on stack - for (Attribute attribute : startTag.attributes) { - if (!body.hasAttr(attribute.getKey())) - body.attributes().put(attribute); - } - } + Element body = tb.getFromStack("body"); + if (body != null) mergeAttributes(startTag, body); } break; case "frameset": @@ -1841,6 +1832,20 @@ boolean processAsHtml(Token t, HtmlTreeBuilder tb) { } }; + private static void mergeAttributes(Token.StartTag source, Element dest) { + if (!source.hasAttributes()) return; + for (Attribute attr : source.attributes) { // only iterates public attributes + Attributes destAttrs = dest.attributes(); + if (!destAttrs.hasKey(attr.getKey())) { + Range.AttributeRange range = attr.sourceRange(); // need to grab range before its parent changes + destAttrs.put(attr); + if (source.trackSource) { // copy the attribute range + destAttrs.sourceRange(attr.getKey(), range); + } + } + } + } + private static final String nullString = String.valueOf('\u0000'); abstract boolean process(Token t, HtmlTreeBuilder tb); diff --git a/src/main/java/org/jsoup/parser/Token.java b/src/main/java/org/jsoup/parser/Token.java index a0c411ddc8..887024e5bc 100644 --- a/src/main/java/org/jsoup/parser/Token.java +++ b/src/main/java/org/jsoup/parser/Token.java @@ -196,16 +196,8 @@ private void trackAttributeRange(String name) { final boolean preserve = start.treeBuilder.settings.preserveAttributeCase(); assert attributes != null; - //noinspection unchecked - Map attrRanges = - (Map) attributes.userData(AttrRangeKey); - if (attrRanges == null) { - attrRanges = new HashMap<>(); - attributes.userData(AttrRangeKey, attrRanges); - } - if (!preserve) name = Normalizer.lowerCase(name); - if (attrRanges.containsKey(name)) return; // dedupe ranges as we go; actual attributes get deduped later for error count + if (attributes.sourceRange(name).nameRange().isTracked()) return; // dedupe ranges as we go; actual attributes get deduped later for error count // if there's no value (e.g. boolean), make it an implicit range at current if (!hasAttrValue) attrValStart = attrValEnd = attrNameEnd; @@ -218,7 +210,7 @@ private void trackAttributeRange(String name) { new Range.Position(attrValStart, r.lineNumber(attrValStart), r.columnNumber(attrValStart)), new Range.Position(attrValEnd, r.lineNumber(attrValEnd), r.columnNumber(attrValEnd))) ); - attrRanges.put(name, range); + attributes.sourceRange(name, range); } } diff --git a/src/test/java/org/jsoup/parser/PositionTest.java b/src/test/java/org/jsoup/parser/PositionTest.java index 9dd42971a7..75cb1be4d1 100644 --- a/src/test/java/org/jsoup/parser/PositionTest.java +++ b/src/test/java/org/jsoup/parser/PositionTest.java @@ -1,7 +1,9 @@ package org.jsoup.parser; import org.jsoup.Jsoup; +import org.jsoup.TextUtil; import org.jsoup.integration.servlets.FileServlet; +import org.jsoup.internal.Normalizer; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.CDataNode; import org.jsoup.nodes.Comment; @@ -567,6 +569,23 @@ private void printRange(Node node) { assertEquals(expectedRange, attr2.sourceRange().toString()); } + @Test void movedAttributesHaveRange() { + // https://github.com/jhy/jsoup/issues/2204 + String html = "OneTwo"; + // note that the attributes of the head el are not copied into the implicit head created by the span, per spec. html and body els are. + Document doc = Jsoup.parse(html, TrackingHtmlParser); + StringBuilder elTrack = new StringBuilder(); + doc.forEachNode(node -> accumulatePositions(node, elTrack)); + + StringBuilder atTrack = new StringBuilder(); + doc.forEachNode(node -> accumulateAttributePositions(node, atTrack)); + + assertEquals("#document:0-0~98-98; html:0-0~98-98; head:0-0~0-0; body:0-0~53-60; span:0-11~14-21; #text:11-14; #text:50-53; ", elTrack.toString()); + assertEquals("attr:27-31=32-35; class:42-47=48-49; data:89-93=94-97; id:6-8=9-10; ", atTrack.toString()); + + assertEquals("OneTwo ", TextUtil.normalizeSpaces(doc.html())); + } + static void accumulateAttributePositions(Node node, StringBuilder sb) { if (node instanceof LeafNode) return; // leafnode pseudo attributes are not tracked for (Attribute attribute : node.attributes()) { @@ -591,4 +610,4 @@ static void accumulatePositions(Attribute attr, StringBuilder sb) { sb.append("; "); } -} \ No newline at end of file +}