diff --git a/CHANGES b/CHANGES index e82d02d233..24c8d52b27 100644 --- a/CHANGES +++ b/CHANGES @@ -6,13 +6,17 @@ Release 1.16.2 [PENDING] matching process by ensuring that simpler evaluations (such as a tag name match) are conducted prior to more complex evaluations (such as an attribute regex, or a deep child scan with a :has). + * Improvement: added support for <svg> and <math> tags (and their children). This includes tag namespaces and case + preservation on applicable tags and attributes. + + * Improvement: when converting jsoup Documents to W3C Documents in W3CDom, HTML documents will be placed in the `http://www.w3.org/1999/xhtml` namespace by default, per the HTML5 spec. This can be controlled by setting `W3CDom#namespaceAware(false)`. * Improvement: speed optimized the Structural Evaluators by memoizing previous evaluations. Particularly the `~` - (any preceeding sibling) and `:nth-of-type` selectors are improved. + (any preceding sibling) and `:nth-of-type` selectors are improved. * Improvement: tweaked the performance of the Element nextElementSibling, previousElementSibling, firstElementSibling, diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java index 231410b0ab..3070d1024d 100644 --- a/src/main/java/org/jsoup/helper/W3CDom.java +++ b/src/main/java/org/jsoup/helper/W3CDom.java @@ -4,6 +4,7 @@ import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; import org.jsoup.parser.HtmlTreeBuilder; +import org.jsoup.parser.Parser; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; import org.jsoup.select.Selector; @@ -339,9 +340,9 @@ public String asString(Document doc) { * Implements the conversion by walking the input. 
*/ protected static class W3CBuilder implements NodeVisitor { + // TODO: move the namespace handling stuff into XmlTreeBuilder / HtmlTreeBuilder, now that Tags have namespaces private static final String xmlnsKey = "xmlns"; private static final String xmlnsPrefix = "xmlns:"; - private static final String xhtmlNs = "http://www.w3.org/1999/xhtml"; private final Document doc; private boolean namespaceAware = true; @@ -358,7 +359,7 @@ public W3CBuilder(Document doc) { final org.jsoup.nodes.Document inDoc = contextElement.ownerDocument(); if (namespaceAware && inDoc != null && inDoc.parser().getTreeBuilder() instanceof HtmlTreeBuilder) { // as per the WHATWG HTML5 spec § 2.1.3, elements are in the HTML namespace by default - namespacesStack.peek().put("", xhtmlNs); + namespacesStack.peek().put("", Parser.NamespaceHtml); } } diff --git a/src/main/java/org/jsoup/nodes/Document.java b/src/main/java/org/jsoup/nodes/Document.java index 9930dc5e24..b2c3f7c88a 100644 --- a/src/main/java/org/jsoup/nodes/Document.java +++ b/src/main/java/org/jsoup/nodes/Document.java @@ -15,7 +15,6 @@ import javax.annotation.Nullable; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; -import java.util.ArrayList; import java.util.List; /** @@ -31,17 +30,28 @@ public class Document extends Element { private boolean updateMetaCharset = false; /** - Create a new, empty Document. + Create a new, empty Document, in the specified namespace. + @param namespace the namespace of this Document's root node. @param baseUri base URI of document @see org.jsoup.Jsoup#parse @see #createShell */ - public Document(String baseUri) { - super(Tag.valueOf("#root", ParseSettings.htmlDefault), baseUri); + public Document(String namespace, String baseUri) { + super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri); this.location = baseUri; this.parser = Parser.htmlParser(); // default, but overridable } + /** + Create a new, empty Document, in the HTML namespace. 
+ @param baseUri base URI of document + @see org.jsoup.Jsoup#parse + @see #Document(String namespace, String baseUri) + */ + public Document(String baseUri) { + this(Parser.NamespaceHtml, baseUri); + } + /** Create a valid, empty shell of a document, suitable for adding more elements to. @param baseUri baseUri of document @@ -208,7 +218,7 @@ public void title(String title) { @return new element */ public Element createElement(String tagName) { - return new Element(Tag.valueOf(tagName, ParseSettings.preserveCase), this.baseUri()); + return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri()); } @Override @@ -312,7 +322,7 @@ public Document clone() { @Override public Document shallowClone() { - Document clone = new Document(baseUri()); + Document clone = new Document(this.tag().namespace(), baseUri()); if (attributes != null) clone.attributes = attributes.clone(); clone.outputSettings = this.outputSettings.clone(); diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index 3d5f8c15fa..86b304ecb9 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -5,6 +5,7 @@ import org.jsoup.internal.NonnullByDefault; import org.jsoup.internal.StringUtil; import org.jsoup.parser.ParseSettings; +import org.jsoup.parser.Parser; import org.jsoup.parser.Tag; import org.jsoup.select.Collector; import org.jsoup.select.Elements; @@ -51,11 +52,21 @@ public class Element extends Node { @Nullable Attributes attributes; // field is nullable but all methods for attributes are non-null /** - * Create a new, standalone element. + * Create a new, standalone element, in the specified namespace. 
* @param tag tag name + * @param namespace namespace for this element + */ + public Element(String tag, String namespace) { + this(Tag.valueOf(tag, namespace, ParseSettings.preserveCase), null); + } + + /** + * Create a new, standalone element, in the HTML namespace. + * @param tag tag name + * @see #Element(String tag, String namespace) */ public Element(String tag) { - this(Tag.valueOf(tag), "", null); + this(Tag.valueOf(tag, Parser.NamespaceHtml, ParseSettings.preserveCase), "", null); } /** @@ -172,8 +183,22 @@ public String normalName() { * @see Elements#tagName(String) */ public Element tagName(String tagName) { + return tagName(tagName, tag.namespace()); + } + + /** + * Change (rename) the tag of this element. For example, convert a {@code <span>} to a {@code <div>
} with + * {@code el.tagName("div");}. + * + * @param tagName new tag name for this element + * @param namespace the new namespace for this element + * @return this element, for chaining + * @see Elements#tagName(String) + */ + public Element tagName(String tagName, String namespace) { Validate.notEmptyParam(tagName, "tagName"); - tag = Tag.valueOf(tagName, NodeUtils.parser(this).settings()); // maintains the case option of the original parse + Validate.notEmptyParam(namespace, "namespace"); + tag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings()); // maintains the case option of the original parse return this; } @@ -679,7 +704,11 @@ public Element insertChildren(int index, Node... children) { * {@code parent.appendElement("h1").attr("id", "header").text("Welcome");} */ public Element appendElement(String tagName) { - Element child = new Element(Tag.valueOf(tagName, NodeUtils.parser(this).settings()), baseUri()); + return appendElement(tagName, tag.namespace()); + } + + public Element appendElement(String tagName, String namespace) { + Element child = new Element(Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings()), baseUri()); appendChild(child); return child; } @@ -692,7 +721,11 @@ public Element appendElement(String tagName) { * {@code parent.prependElement("h1").attr("id", "header").text("Welcome");} */ public Element prependElement(String tagName) { - Element child = new Element(Tag.valueOf(tagName, NodeUtils.parser(this).settings()), baseUri()); + return prependElement(tagName, tag.namespace()); + } + + public Element prependElement(String tagName, String namespace) { + Element child = new Element(Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings()), baseUri()); prependChild(child); return child; } diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 2e49e6050a..fc33a1ac02 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ 
b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -1,6 +1,7 @@ package org.jsoup.parser; import org.jsoup.helper.Validate; +import org.jsoup.internal.Normalizer; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.CDataNode; import org.jsoup.nodes.Comment; @@ -21,6 +22,7 @@ import static org.jsoup.internal.StringUtil.inSorted; import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster; +import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent; /** * HTML Tree Builder; creates a DOM from Tokens. @@ -42,6 +44,8 @@ public class HtmlTreeBuilder extends TreeBuilder { "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script", "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "ul", "wbr", "xmp"}; + static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"}; + static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"}; public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages @@ -165,7 +169,86 @@ List parseFragment(String inputFragment, @Nullable Element context, String @Override protected boolean process(Token token) { currentToken = token; - return this.state.process(token, this); + + if (shouldDispatchToCurrentInsertionMode(token)) { + return this.state.process(token, this); + } else { + return ForeignContent.process(token, this); + } + } + + boolean shouldDispatchToCurrentInsertionMode(Token token) { + // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction + // If the stack of open elements is empty + if (stack.isEmpty()) + return true; + final Element el = currentElement(); + final String ns = el.tag().namespace(); + + // If the adjusted current node is an element in the HTML namespace + if (Parser.NamespaceHtml.equals(ns)) + return true; + + // If the adjusted current node is a 
MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark" + // If the adjusted current node is a MathML text integration point and the token is a character token + if (isMathmlTextIntegration(el)) { + if (token.isStartTag() + && !"mglyph".equals(token.asStartTag().normalName) + && !"malignmark".equals(token.asStartTag().normalName)) + return true; + if (token.isCharacter()) + return true; + } + // If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg" + if (Parser.NamespaceMathml.equals(ns) + && el.normalName().equals("annotation-xml") + && token.isStartTag() + && "svg".equals(token.asStartTag().normalName)) + return true; + + // If the adjusted current node is an HTML integration point and the token is a start tag + // If the adjusted current node is an HTML integration point and the token is a character token + if (isHtmlIntegration(el) + && (token.isStartTag() || token.isCharacter())) + return true; + + // If the token is an end-of-file token + return token.isEOF(); + } + + boolean isMathmlTextIntegration(Element el) { + /* + A node is a MathML text integration point if it is one of the following elements: + A MathML mi element + A MathML mo element + A MathML mn element + A MathML ms element + A MathML mtext element + */ + return (Parser.NamespaceMathml.equals(el.tag().namespace()) + && StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration)); + } + + boolean isHtmlIntegration(Element el) { + /* + A node is an HTML integration point if it is one of the following elements: + A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html" + A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml" + An SVG 
foreignObject element + An SVG desc element + An SVG title element + */ + if (Parser.NamespaceMathml.equals(el.tag().namespace()) + && el.normalName().equals("annotation-xml")) { + String encoding = Normalizer.normalize(el.attr("encoding")); + if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml")) + return true; + } + if (Parser.NamespaceSvg.equals(el.tag().namespace()) + && StringUtil.in(el.tagName(), TagSvgHtmlIntegration)) // note using .tagName for case-sensitive hit here of foreignObject + return true; + + return false; } boolean process(Token token, HtmlTreeBuilderState state) { @@ -245,6 +328,23 @@ Element insert(final Token.StartTag startTag) { return el; } + /** + Inserts a foreign element. Preserves the case of the tag name and of the attributes. + */ + Element insertForeign(final Token.StartTag startTag, String namespace) { + dedupeAttributes(startTag); + Tag tag = tagFor(startTag.name(), namespace, ParseSettings.preserveCase); + Element el = new Element(tag, null, ParseSettings.preserveCase.normalizeAttributes(startTag.attributes)); + insert(el, startTag); + + if (startTag.isSelfClosing()) { + tag.setSelfClosing(); // remember this is self-closing for output + pop(); + } + + return el; + } + Element insertStartTag(String startTagName) { Element el = new Element(tagFor(startTagName, settings), null); insert(el); @@ -272,7 +372,7 @@ Element insertEmpty(Token.StartTag startTag) { if (!tag.isEmpty()) tokeniser.error("Tag [%s] cannot be self closing; not a void tag", tag.normalName()); } - else // unknown tag, remember this is self closing for output + else // unknown tag, remember this is self-closing for output tag.setSelfClosing(); } return el; @@ -306,6 +406,7 @@ void insert(Token.Character characterToken) { insert(characterToken, el); } + /** Inserts the provided character token into the provided element. 
*/ void insert(Token.Character characterToken, Element el) { final Node node; final String tagName = el.normalName(); @@ -321,7 +422,7 @@ else if (isContentForTagData(tagName)) onNodeInserted(node, characterToken); } - /** Inserts the provided character token into the provided element. Use when not going onto stack element */ + /** Inserts the provided Node into the current element. */ private void insertNode(Node node, @Nullable Token token) { // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc if (stack.isEmpty()) @@ -331,10 +432,14 @@ else if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), else currentElement().appendChild(node); - // connect form controls to their form element - if (node instanceof Element && ((Element) node).tag().isFormListed()) { - if (formElement != null) - formElement.addElement((Element) node); + if (node instanceof Element) { + Element el = (Element) node; + if (el.tag().isFormListed() && formElement != null) + formElement.addElement(el); // connect form controls to their form element + + // in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to + if (el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace())) + error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName()); } onNodeInserted(node, token); } diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index 3c5352ecec..363c9d9608 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -1,5 +1,6 @@ package org.jsoup.parser; +import org.jsoup.helper.Validate; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; @@ -556,13 +557,11 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) { break; case "math": tb.reconstructFormattingElements(); - // 
todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml) - tb.insert(startTag); + tb.insertForeign(startTag, Parser.NamespaceMathml); break; case "svg": tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "svg" (xlink, svg) - tb.insert(startTag); + tb.insertForeign(startTag, Parser.NamespaceSvg); break; // static final String[] Headings = new String[]{"h1", "h2", "h3", "h4", "h5", "h6"}; case "h1": @@ -1740,9 +1739,83 @@ boolean process(Token t, HtmlTreeBuilder tb) { } }, ForeignContent { + // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign boolean process(Token t, HtmlTreeBuilder tb) { + switch (t.type) { + case Character: + Token.Character c = t.asCharacter(); + if (c.getData().equals(nullString)) + tb.error(this); + else if (HtmlTreeBuilderState.isWhitespace(c)) + tb.insert(c); + else { + tb.insert(c); + tb.framesetOk(false); + } + break; + case Comment: + tb.insert(t.asComment()); + break; + case Doctype: + tb.error(this); + break; + case StartTag: + Token.StartTag start = t.asStartTag(); + if (StringUtil.in(start.normalName, InForeignToHtml)) + return processAsHtml(t, tb); + if (start.normalName.equals("font") && ( + start.attributes.hasKeyIgnoreCase("color") + || start.attributes.hasKeyIgnoreCase("face") + || start.attributes.hasKeyIgnoreCase("size"))) + return processAsHtml(t, tb); + + // Any other start: + // (whatwg says to fix up tag name and attribute case per a table - we will preserve original case instead) + tb.insertForeign(start, tb.currentElement().tag().namespace()); + // (self-closing handled in insert) + // if self-closing svg script -- level and execution elided + break; + + case EndTag: + Token.EndTag end = t.asEndTag(); + if (end.normalName.equals("br") || end.normalName.equals("p")) + return processAsHtml(t, tb); + if (end.normalName.equals("script") && tb.currentElementIs("script", Parser.NamespaceSvg)) { + // script level and execution elided. 
+ tb.pop(); + return true; + } + + // Any other end tag + ArrayList stack = tb.getStack(); + if (stack.isEmpty()) + Validate.wtf("Stack unexpectedly empty"); + int i = stack.size() - 1; + Element el = stack.get(i); + if (!el.normalName().equals(end.normalName)) + tb.error(this); + while (i != 0) { + if (el.normalName().equals(end.normalName)) { + tb.popStackToClose(el.normalName()); + return true; + } + i--; + el = stack.get(i); + if (el.tag().namespace().equals(Parser.NamespaceHtml)) { + return processAsHtml(t, tb); + } + } + break; + + case EOF: + // won't come through here, but for completion: + break; + } return true; - // todo: implement. Also; how do we get here? + } + + boolean processAsHtml(Token t, HtmlTreeBuilder tb) { + return tb.state().process(t, tb); } }; @@ -1817,5 +1890,6 @@ static final class Constants { static final String[] InCaptionIgnore = new String[]{"body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"}; static final String[] InTemplateToHead = new String[] {"base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "template", "title"}; static final String[] InTemplateToTable = new String[] {"caption", "colgroup", "tbody", "tfoot", "thead"}; + static final String[] InForeignToHtml = new String[] {"b", "big", "blockquote", "body", "br", "center", "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol", "p", "pre", "ruby", "s", "small", "span", "strike", "strong", "sub", "sup", "table", "tt", "u", "ul", "var"}; } } diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java index 83789460af..123ddc3f8d 100644 --- a/src/main/java/org/jsoup/parser/Parser.java +++ b/src/main/java/org/jsoup/parser/Parser.java @@ -14,6 +14,11 @@

Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded environment, use {@link #newInstance()} to make copies. */ public class Parser { + public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml"; + public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace"; + public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML"; + public static final String NamespaceSvg = "http://www.w3.org/2000/svg"; + private TreeBuilder treeBuilder; private ParseErrorList errors; private ParseSettings settings; @@ -148,6 +153,10 @@ public boolean isContentForTagData(String normalName) { return getTreeBuilder().isContentForTagData(normalName); } + public String defaultNamespace() { + return getTreeBuilder().defaultNamespace(); + } + // static parse functions below /** * Parse HTML into a Document. diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java index 97ed500402..0e50c52c39 100644 --- a/src/main/java/org/jsoup/parser/Tag.java +++ b/src/main/java/org/jsoup/parser/Tag.java @@ -7,15 +7,16 @@ import java.util.Map; /** - * HTML Tag capabilities. + * Tag capabilities. * * @author Jonathan Hedley, jonathan@hedley.net */ public class Tag implements Cloneable { - private static final Map tags = new HashMap<>(); // map of known tags + private static final Map Tags = new HashMap<>(); // map of known tags private String tagName; - private String normalName; // always the lower case version of this tag, regardless of case preservation mode + private final String normalName; // always the lower case version of this tag, regardless of case preservation mode + private String namespace; private boolean isBlock = true; // block private boolean formatAsBlock = true; // should be formatted as a block private boolean empty = false; // can hold nothing; e.g. 
img @@ -24,9 +25,10 @@ public class Tag implements Cloneable { private boolean formList = false; // a control that appears in forms: input, textarea, output etc private boolean formSubmit = false; // a control that can be submitted in a form: input etc - private Tag(String tagName) { + private Tag(String tagName, String namespace) { this.tagName = tagName; normalName = Normalizer.lowerCase(tagName); + this.namespace = namespace; } /** @@ -46,35 +48,44 @@ public String normalName() { return normalName; } + public String namespace() { + return namespace; + } + /** * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything. *

- * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals(). + * Pre-defined tags (p, div etc) will be ==, but unknown tags are not registered and will only .equals(). *

* - * @param tagName Name of tag, e.g. "p". Case insensitive. + * @param tagName Name of tag, e.g. "p". Case-insensitive. + * @param namespace the namespace for the tag. * @param settings used to control tag name sensitivity * @return The tag, either defined or new generic. */ - public static Tag valueOf(String tagName, ParseSettings settings) { - Validate.notNull(tagName); - Tag tag = tags.get(tagName); - - if (tag == null) { - tagName = settings.normalizeTag(tagName); // the name we'll use - Validate.notEmpty(tagName); - String normalName = Normalizer.lowerCase(tagName); // the lower-case name to get tag settings off - tag = tags.get(normalName); - - if (tag == null) { - // not defined: create default; go anywhere, do anything! (incl be inside a

) - tag = new Tag(tagName); - tag.isBlock = false; - } else if (settings.preserveTagCase() && !tagName.equals(normalName)) { + public static Tag valueOf(String tagName, String namespace, ParseSettings settings) { + Validate.notEmpty(tagName); + Validate.notNull(namespace); + Tag tag = Tags.get(tagName); + if (tag != null && tag.namespace.equals(namespace)) + return tag; + + tagName = settings.normalizeTag(tagName); // the name we'll use + Validate.notEmpty(tagName); + String normalName = Normalizer.lowerCase(tagName); // the lower-case name to get tag settings off + tag = Tags.get(normalName); + if (tag != null && tag.namespace.equals(namespace)) { + if (settings.preserveTagCase() && !tagName.equals(normalName)) { tag = tag.clone(); // get a new version vs the static one, so name update doesn't reset all tag.tagName = tagName; } + return tag; } + + // not defined: create default; go anywhere, do anything! (incl be inside a

) + tag = new Tag(tagName, namespace); + tag.isBlock = false; + return tag; } @@ -86,9 +97,25 @@ public static Tag valueOf(String tagName, ParseSettings settings) { * * @param tagName Name of tag, e.g. "p". Case sensitive. * @return The tag, either defined or new generic. + * @see #valueOf(String tagName, String namespace, ParseSettings settings) */ public static Tag valueOf(String tagName) { - return valueOf(tagName, ParseSettings.preserveCase); + return valueOf(tagName, Parser.NamespaceHtml, ParseSettings.preserveCase); + } + + /** + * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything. + *

+ * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals(). + *

+ * + * @param tagName Name of tag, e.g. "p". Case sensitive. + * @param settings used to control tag name sensitivity + * @return The tag, either defined or new generic. + * @see #valueOf(String tagName, String namespace, ParseSettings settings) + */ + public static Tag valueOf(String tagName, ParseSettings settings) { + return valueOf(tagName, Parser.NamespaceHtml, settings); } /** @@ -128,9 +155,9 @@ public boolean isEmpty() { } /** - * Get if this tag is self closing. + * Get if this tag is self-closing. * - * @return if this tag should be output as self closing. + * @return if this tag should be output as self-closing. */ public boolean isSelfClosing() { return empty || selfClosing; @@ -142,7 +169,7 @@ public boolean isSelfClosing() { * @return if a known tag */ public boolean isKnownTag() { - return tags.containsKey(tagName); + return Tags.containsKey(tagName); } /** @@ -152,7 +179,7 @@ public boolean isKnownTag() { * @return if known HTML tag */ public static boolean isKnownTag(String tagName) { - return tags.containsKey(tagName); + return Tags.containsKey(tagName); } /** @@ -247,7 +274,9 @@ protected Tag clone() { "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track", "data", "bdi", "s", "strike", "nobr", - "rb" // deprecated but still known / special handling + "rb", // deprecated but still known / special handling + "text", // in SVG NS + "mi", "mo", "msup", "mn", "mtext" // in MathML NS, to ensure inline }; private static final String[] emptyTags = { "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", @@ -270,14 +299,21 @@ protected Tag clone() { "input", "keygen", "object", "select", "textarea" }; + private static final Map namespaces = new HashMap<>(); + static { + namespaces.put(Parser.NamespaceMathml, new String[]{"math", "mi", "mo", "msup", 
"mn", "mtext"}); + namespaces.put(Parser.NamespaceSvg, new String[]{"svg", "text"}); + // We don't need absolute coverage here as other cases will be inferred by the HtmlTreeBuilder + } + static { // creates for (String tagName : blockTags) { - Tag tag = new Tag(tagName); + Tag tag = new Tag(tagName, Parser.NamespaceHtml); register(tag); } for (String tagName : inlineTags) { - Tag tag = new Tag(tagName); + Tag tag = new Tag(tagName, Parser.NamespaceHtml); tag.isBlock = false; tag.formatAsBlock = false; register(tag); @@ -285,37 +321,46 @@ protected Tag clone() { // mods: for (String tagName : emptyTags) { - Tag tag = tags.get(tagName); + Tag tag = Tags.get(tagName); Validate.notNull(tag); tag.empty = true; } for (String tagName : formatAsInlineTags) { - Tag tag = tags.get(tagName); + Tag tag = Tags.get(tagName); Validate.notNull(tag); tag.formatAsBlock = false; } for (String tagName : preserveWhitespaceTags) { - Tag tag = tags.get(tagName); + Tag tag = Tags.get(tagName); Validate.notNull(tag); tag.preserveWhitespace = true; } for (String tagName : formListedTags) { - Tag tag = tags.get(tagName); + Tag tag = Tags.get(tagName); Validate.notNull(tag); tag.formList = true; } for (String tagName : formSubmitTags) { - Tag tag = tags.get(tagName); + Tag tag = Tags.get(tagName); Validate.notNull(tag); tag.formSubmit = true; } + + // namespace setup + for (Map.Entry ns : namespaces.entrySet()) { + for (String tagName : ns.getValue()) { + Tag tag = Tags.get(tagName); + Validate.notNull(tag); + tag.namespace = ns.getKey(); + } + } } private static void register(Tag tag) { - tags.put(tag.tagName, tag); + Tags.put(tag.tagName, tag); } } diff --git a/src/main/java/org/jsoup/parser/Token.java b/src/main/java/org/jsoup/parser/Token.java index b0fc0af81f..e2ab656381 100644 --- a/src/main/java/org/jsoup/parser/Token.java +++ b/src/main/java/org/jsoup/parser/Token.java @@ -311,10 +311,11 @@ StartTag nameAttr(String name, Attributes attributes) { @Override public String toString() { + 
String closer = isSelfClosing() ? "/>" : ">"; if (hasAttributes() && attributes.size() > 0) - return "<" + toStringName() + " " + attributes.toString() + ">"; + return "<" + toStringName() + " " + attributes.toString() + closer; else - return "<" + toStringName() + ">"; + return "<" + toStringName() + closer; } } diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index 77083b2ea0..6c9a78b5bc 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -15,6 +15,8 @@ import java.util.List; import java.util.Map; +import static org.jsoup.parser.Parser.NamespaceHtml; + /** * @author Jonathan Hedley */ @@ -41,7 +43,7 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { Validate.notNullParam(baseUri, "baseUri"); Validate.notNull(parser); - doc = new Document(baseUri); + doc = new Document(parser.defaultNamespace(), baseUri); doc.parser(parser); this.parser = parser; settings = parser.settings(); @@ -132,7 +134,7 @@ protected Element currentElement() { } /** - Checks if the Current Element's normal name equals the supplied name. + Checks if the Current Element's normal name equals the supplied name, in the HTML namespace. @param normalName name to check @return true if there is a current element on the stack, and its name equals the supplied */ @@ -140,7 +142,22 @@ protected boolean currentElementIs(String normalName) { if (stack.size() == 0) return false; Element current = currentElement(); - return current != null && current.normalName().equals(normalName); + return current != null && current.normalName().equals(normalName) + && current.tag().namespace().equals(NamespaceHtml); + } + + /** + Checks if the Current Element's normal name equals the supplied name, in the specified namespace. 
+ @param normalName name to check + @param namespace the namespace + @return true if there is a current element on the stack, and its name equals the supplied + */ + protected boolean currentElementIs(String normalName, String namespace) { + if (stack.size() == 0) + return false; + Element current = currentElement(); + return current != null && current.normalName().equals(normalName) + && current.tag().namespace().equals(namespace); } /** @@ -170,13 +187,27 @@ protected boolean isContentForTagData(String normalName) { return false; } - protected Tag tagFor(String tagName, ParseSettings settings) { - Tag tag = seenTags.get(tagName); // note that we don't normalize the cache key. But tag via valueOf may be normalized. - if (tag == null) { - tag = Tag.valueOf(tagName, settings); + protected Tag tagFor(String tagName, String namespace, ParseSettings settings) { + Tag cached = seenTags.get(tagName); // note that we don't normalize the cache key. But tag via valueOf may be normalized. + if (cached == null || !cached.namespace().equals(namespace)) { + // only return from cache if the namespace is the same. 
not running nested cache to save double hit on the common flow + Tag tag = Tag.valueOf(tagName, namespace, settings); seenTags.put(tagName, tag); + return tag; } - return tag; + return cached; + } + + protected Tag tagFor(String tagName, ParseSettings settings) { + return tagFor(tagName, defaultNamespace(), settings); + } + + /** + Gets the default namespace for this TreeBuilder + * @return the default namespace + */ + protected String defaultNamespace() { + return NamespaceHtml; } /** diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java index 9660723935..e1999f9679 100644 --- a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java @@ -16,6 +16,8 @@ import java.io.StringReader; import java.util.List; +import static org.jsoup.parser.Parser.NamespaceXml; + /** * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the * document. @@ -51,6 +53,10 @@ XmlTreeBuilder newInstance() { return new XmlTreeBuilder(); } + @Override public String defaultNamespace() { + return NamespaceXml; + } + @Override protected boolean process(Token token) { // start tag, end tag, doctype, comment, character, eof @@ -90,15 +96,13 @@ protected void insertNode(Node node, Token token) { Element insert(Token.StartTag startTag) { Tag tag = tagFor(startTag.name(), settings); - // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html. if (startTag.hasAttributes()) startTag.attributes.deduplicate(settings); Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); insertNode(el, startTag); if (startTag.isSelfClosing()) { - if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output. see above. 
- tag.setSelfClosing(); + tag.setSelfClosing(); } else { stack.add(el); } diff --git a/src/main/java/org/jsoup/safety/Cleaner.java b/src/main/java/org/jsoup/safety/Cleaner.java index fb12b46e32..7b9317ec29 100644 --- a/src/main/java/org/jsoup/safety/Cleaner.java +++ b/src/main/java/org/jsoup/safety/Cleaner.java @@ -9,6 +9,7 @@ import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.parser.ParseErrorList; +import org.jsoup.parser.ParseSettings; import org.jsoup.parser.Parser; import org.jsoup.parser.Tag; import org.jsoup.select.NodeTraversor; @@ -179,7 +180,7 @@ private int copySafeNodes(Element source, Element dest) { private ElementMeta createSafeElement(Element sourceEl) { String sourceTag = sourceEl.tagName(); Attributes destAttrs = new Attributes(); - Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs); + Element dest = new Element(Tag.valueOf(sourceTag, sourceEl.tag().namespace(), ParseSettings.preserveCase), sourceEl.baseUri(), destAttrs); int numDiscarded = 0; Attributes sourceAttrs = sourceEl.attributes(); diff --git a/src/main/java/org/jsoup/select/Evaluator.java b/src/main/java/org/jsoup/select/Evaluator.java index d8929fc302..fb3b6a6909 100644 --- a/src/main/java/org/jsoup/select/Evaluator.java +++ b/src/main/java/org/jsoup/select/Evaluator.java @@ -10,6 +10,7 @@ import org.jsoup.nodes.PseudoTextElement; import org.jsoup.nodes.TextNode; import org.jsoup.nodes.XmlDeclaration; +import org.jsoup.parser.ParseSettings; import java.util.List; import java.util.regex.Matcher; @@ -969,7 +970,7 @@ public boolean matches(Element root, Element element) { List textNodes = element.textNodes(); for (TextNode textNode : textNodes) { PseudoTextElement pel = new PseudoTextElement( - org.jsoup.parser.Tag.valueOf(element.tagName()), element.baseUri(), element.attributes()); + org.jsoup.parser.Tag.valueOf(element.tagName(), element.tag().namespace(), ParseSettings.preserveCase), element.baseUri(), element.attributes()); 
textNode.replaceWith(pel); pel.appendChild(textNode); } diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 4172d0f59e..5f9eca9d7c 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -1752,4 +1752,126 @@ private boolean didAddElements(String input) { assertEquals(textContent, textArea.wholeText()); } + + @Test void svgParseTest() { + String html = "

One

"; + Document doc = Jsoup.parse(html); + + assertHtmlNamespace(doc); + Element div = doc.expectFirst("div"); + assertHtmlNamespace(div); + + Element svg = doc.expectFirst("svg"); + assertTrue(svg.attributes().hasKey("viewBox")); + assertSvgNamespace(svg); + assertSvgNamespace(doc.expectFirst("foreignObject")); + assertHtmlNamespace(doc.expectFirst("p")); + + String serialized = div.html(); + assertEquals("\n" + + " \n" + + "

One

\n" + + "
\n" + + "
", serialized); + } + + @Test void mathParseText() { + String html = "

One

Blah
"; + Document doc = Jsoup.parse(html); + + assertHtmlNamespace(doc.expectFirst("div")); + assertMathNamespace(doc.expectFirst("math")); + assertMathNamespace(doc.expectFirst("mi")); + assertHtmlNamespace(doc.expectFirst("p")); + assertSvgNamespace(doc.expectFirst("svg")); + assertSvgNamespace(doc.expectFirst("text")); + assertMathNamespace(doc.expectFirst("ms")); + + String serialized = doc.expectFirst("div").html(); + assertEquals("\n" + + " \n" + + "

One

\n" + + " \n" + + " Blah\n" + + "
\n" + + "
", serialized); + } + + private static void assertHtmlNamespace(Element el) { + assertEquals(Parser.NamespaceHtml, el.tag().namespace()); + } + + private static void assertSvgNamespace(Element el) { + assertEquals(Parser.NamespaceSvg, el.tag().namespace()); + } + + private static void assertMathNamespace(Element el) { + assertEquals(Parser.NamespaceMathml, el.tag().namespace()); + } + + @Test void mathSvgStyleTest() { + String html = ""; + Document doc = Jsoup.parse(html); + + Element htmlStyle = doc.expectFirst("style"); + assertHtmlNamespace(htmlStyle); + assertEquals("", htmlStyle.data()); // that's not an element, it's data (textish) + + Element svgStyle = doc.expectFirst("svg style"); + assertMathNamespace(svgStyle); // in inherited math namespace as not an HTML integration point + Element styleImg = svgStyle.expectFirst("img"); + assertHtmlNamespace(styleImg); // this one is an img tag - in foreign to html elements + + assertMathNamespace(doc.expectFirst("svg")); + assertMathNamespace(doc.expectFirst("math")); + } + + @Test void xmlnsAttributeError() { + String html = "

"; + Parser parser = Parser.htmlParser().setTrackErrors(10); + Document doc = Jsoup.parse(html, parser); + assertEquals(0, doc.parser().getErrors().size()); + + String html2 = "

"; + Document doc2 = Jsoup.parse(html2, parser); + assertEquals(1, doc2.parser().getErrors().size()); + assertEquals("Invalid xmlns attribute [xhtml] on tag [i]", parser.getErrors().get(0).getErrorMessage()); + } + + @Test void mathAnnotationSvg() { + String html = ""; // not in annotation, svg will be in math ns + Document doc = Jsoup.parse(html); + assertMathNamespace(doc.expectFirst("math")); + assertMathNamespace(doc.expectFirst("svg")); + + String html2 = ""; // svg will be in svg ns + Document doc2 = Jsoup.parse(html2); + assertMathNamespace(doc2.expectFirst("math")); + assertMathNamespace(doc2.expectFirst("annotation-xml")); + assertSvgNamespace(doc2.expectFirst("svg")); + } + + @Test void mathHtmlIntegrationPoint() { + String html = "

Hello"; + Document doc = Jsoup.parse(html); + assertMathNamespace(doc.expectFirst("math")); + assertHtmlNamespace(doc.expectFirst("div")); + + String html2 = "Hello"; + Document doc2 = Jsoup.parse(html2); + assertMathNamespace(doc2.expectFirst("math")); + assertMathNamespace(doc2.expectFirst("divv")); + + String html3 = "Hello"; + Document doc3 = Jsoup.parse(html3); + assertMathNamespace(doc3.expectFirst("math")); + assertMathNamespace(doc3.expectFirst("annotation-xml")); + assertMathNamespace(doc3.expectFirst("divv")); + + String html4 = "Hello"; + Document doc4 = Jsoup.parse(html4); + assertMathNamespace(doc4.expectFirst("math")); + assertMathNamespace(doc4.expectFirst("annotation-xml")); + assertHtmlNamespace(doc4.expectFirst("divv")); + } } diff --git a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java index 55b828e893..cb09e458cf 100644 --- a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java +++ b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java @@ -45,7 +45,7 @@ static void ensureSorted(List constants) { public void ensureArraysAreSorted() { List constants = findConstantArrays(Constants.class); ensureSorted(constants); - assertEquals(39, constants.size()); + assertEquals(40, constants.size()); } @Test public void ensureTagSearchesAreKnownTags() { diff --git a/src/test/java/org/jsoup/parser/HtmlTreeBuilderTest.java b/src/test/java/org/jsoup/parser/HtmlTreeBuilderTest.java index 10e6c254e9..dae8e4600f 100644 --- a/src/test/java/org/jsoup/parser/HtmlTreeBuilderTest.java +++ b/src/test/java/org/jsoup/parser/HtmlTreeBuilderTest.java @@ -18,7 +18,7 @@ public class HtmlTreeBuilderTest { public void ensureSearchArraysAreSorted() { List constants = HtmlTreeBuilderStateTest.findConstantArrays(HtmlTreeBuilder.class); HtmlTreeBuilderStateTest.ensureSorted(constants); - assertEquals(8, constants.size()); + assertEquals(10, constants.size()); } @Test diff --git 
a/src/test/java/org/jsoup/parser/TagTest.java b/src/test/java/org/jsoup/parser/TagTest.java index 65a794d847..999cfd0764 100644 --- a/src/test/java/org/jsoup/parser/TagTest.java +++ b/src/test/java/org/jsoup/parser/TagTest.java @@ -81,4 +81,26 @@ public void canBeInsensitive(Locale locale) { assertTrue(Tag.isKnownTag("div")); assertFalse(Tag.isKnownTag("explain")); } + + @Test public void knownSvgNamespace() { + Tag svgHtml = Tag.valueOf("svg"); // no namespace specified, defaults to html, so not the known tag + Tag svg = Tag.valueOf("svg", Parser.NamespaceSvg, ParseSettings.htmlDefault); + + assertEquals(Parser.NamespaceHtml, svgHtml.namespace()); + assertEquals(Parser.NamespaceSvg, svg.namespace()); + + assertFalse(svgHtml.isBlock()); // generated + assertTrue(svg.isBlock()); // known + } + + @Test public void unknownTagNamespace() { + Tag fooHtml = Tag.valueOf("foo"); // no namespace specified, defaults to html + Tag foo = Tag.valueOf("foo", Parser.NamespaceSvg, ParseSettings.htmlDefault); + + assertEquals(Parser.NamespaceHtml, fooHtml.namespace()); + assertEquals(Parser.NamespaceSvg, foo.namespace()); + + assertFalse(fooHtml.isBlock()); // generated + assertFalse(foo.isBlock()); // generated + } } diff --git a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java index d359a52881..2499f3f550 100644 --- a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java +++ b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java @@ -127,7 +127,7 @@ public void testDoesNotForceSelfClosingKnownTags() { public void testDoesHandleEOFInTag() { String html = "", xmlDoc.html()); + assertEquals("", xmlDoc.html()); } @Test @@ -312,6 +312,36 @@ public void handlesLTinScript() { assertEquals("FOO", t3.getName()); assertSame(t1, t2); assertSame(t3, t4); + } + + @Test void rootHasXmlSettings() { + Document doc = Jsoup.parse("", Parser.xmlParser()); + ParseSettings settings = doc.parser().settings(); + 
assertTrue(settings.preserveTagCase()); + assertTrue(settings.preserveAttributeCase()); + assertEquals(Parser.NamespaceXml, doc.parser().defaultNamespace()); + } + + @Test void xmlNamespace() { + String xml = "
Qux"; + Document doc = Jsoup.parse(xml, Parser.xmlParser()); + assertXmlNamespace(doc); + Elements els = doc.select("*"); + for (Element el : els) { + assertXmlNamespace(el); + } + + Document clone = doc.clone(); + assertXmlNamespace(clone); + assertXmlNamespace(clone.expectFirst("bar")); + + Document shallow = doc.shallowClone(); + assertXmlNamespace(shallow); } + + private static void assertXmlNamespace(Element el) { + assertEquals(Parser.NamespaceXml, el.tag().namespace(), String.format("Element %s not in XML namespace", el.tagName())); + } + }