Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for svg and math foreign elements #2008

Merged
merged 5 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@ Release 1.16.2 [PENDING]
matching process by ensuring that simpler evaluations (such as a tag name match) are conducted prior to more
complex evaluations (such as an attribute regex, or a deep child scan with a :has).

* Improvement: added support for <svg> and <math> tags (and their children). This includes tag namespaces and case
preservation on applicable tags and attributes.
<https://github.com/jhy/jsoup/pull/2008>

* Improvement: when converting jsoup Documents to W3C Documents in W3CDom, HTML documents will be placed in the
`http://www.w3.org/1999/xhtml` namespace by default, per the HTML5 spec. This can be controlled by setting
`W3CDom#namespaceAware(false)`.
<https://github.com/jhy/jsoup/pull/1848>

* Improvement: speed optimized the Structural Evaluators by memoizing previous evaluations. Particularly the `~`
(any preceeding sibling) and `:nth-of-type` selectors are improved.
(any preceding sibling) and `:nth-of-type` selectors are improved.
<https://github.com/jhy/jsoup/issues/1956>

* Improvement: tweaked the performance of the Element nextElementSibling, previousElementSibling, firstElementSibling,
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/org/jsoup/helper/W3CDom.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.parser.HtmlTreeBuilder;
import org.jsoup.parser.Parser;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.jsoup.select.Selector;
Expand Down Expand Up @@ -339,9 +340,9 @@ public String asString(Document doc) {
* Implements the conversion by walking the input.
*/
protected static class W3CBuilder implements NodeVisitor {
// TODO: move the namespace handling stuff into XmlTreeBuilder / HtmlTreeBuilder, now that Tags have namespaces
private static final String xmlnsKey = "xmlns";
private static final String xmlnsPrefix = "xmlns:";
private static final String xhtmlNs = "http://www.w3.org/1999/xhtml";

private final Document doc;
private boolean namespaceAware = true;
Expand All @@ -358,7 +359,7 @@ public W3CBuilder(Document doc) {
final org.jsoup.nodes.Document inDoc = contextElement.ownerDocument();
if (namespaceAware && inDoc != null && inDoc.parser().getTreeBuilder() instanceof HtmlTreeBuilder) {
// as per the WHATWG HTML5 spec § 2.1.3, elements are in the HTML namespace by default
namespacesStack.peek().put("", xhtmlNs);
namespacesStack.peek().put("", Parser.NamespaceHtml);
}
}

Expand Down
22 changes: 16 additions & 6 deletions src/main/java/org/jsoup/nodes/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import javax.annotation.Nullable;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.List;

/**
Expand All @@ -31,17 +30,28 @@ public class Document extends Element {
private boolean updateMetaCharset = false;

/**
Create a new, empty Document.
Create a new, empty Document, in the specified namespace.
@param namespace the namespace of this Document's root node.
@param baseUri base URI of document
@see org.jsoup.Jsoup#parse
@see #createShell
*/
public Document(String baseUri) {
super(Tag.valueOf("#root", ParseSettings.htmlDefault), baseUri);
public Document(String namespace, String baseUri) {
super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri);
this.location = baseUri;
this.parser = Parser.htmlParser(); // default, but overridable
}

/**
Create a new, empty Document, in the HTML namespace. This is a convenience constructor that delegates to
{@link #Document(String namespace, String baseUri)}, passing {@code Parser.NamespaceHtml} as the namespace.
@param baseUri base URI of document
@see org.jsoup.Jsoup#parse
@see #Document(String namespace, String baseUri)
*/
public Document(String baseUri) {
this(Parser.NamespaceHtml, baseUri);
}

/**
Create a valid, empty shell of a document, suitable for adding more elements to.
@param baseUri baseUri of document
Expand Down Expand Up @@ -208,7 +218,7 @@ public void title(String title) {
@return new element
*/
public Element createElement(String tagName) {
return new Element(Tag.valueOf(tagName, ParseSettings.preserveCase), this.baseUri());
return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
}

@Override
Expand Down Expand Up @@ -312,7 +322,7 @@ public Document clone() {

@Override
public Document shallowClone() {
Document clone = new Document(baseUri());
Document clone = new Document(this.tag().namespace(), baseUri());
if (attributes != null)
clone.attributes = attributes.clone();
clone.outputSettings = this.outputSettings.clone();
Expand Down
43 changes: 38 additions & 5 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.jsoup.internal.NonnullByDefault;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.Collector;
import org.jsoup.select.Elements;
Expand Down Expand Up @@ -51,11 +52,21 @@ public class Element extends Node {
@Nullable Attributes attributes; // field is nullable but all methods for attributes are non-null

/**
* Create a new, standalone element.
* Create a new, standalone element, in the specified namespace.
* @param tag tag name
* @param namespace namespace for this element
*/
public Element(String tag, String namespace) {
this(Tag.valueOf(tag, namespace, ParseSettings.preserveCase), null);
}

/**
* Create a new, standalone element, in the HTML namespace.
* @param tag tag name
* @see #Element(String tag, String namespace)
*/
public Element(String tag) {
this(Tag.valueOf(tag), "", null);
this(Tag.valueOf(tag, Parser.NamespaceHtml, ParseSettings.preserveCase), "", null);
}

/**
Expand Down Expand Up @@ -172,8 +183,22 @@ public String normalName() {
* @see Elements#tagName(String)
*/
public Element tagName(String tagName) {
return tagName(tagName, tag.namespace());
}

/**
* Change (rename) the tag of this element. For example, convert a {@code <span>} to a {@code <div>} with
* {@code el.tagName("div");}.
*
* @param tagName new tag name for this element
* @param namespace the new namespace for this element
* @return this element, for chaining
* @see Elements#tagName(String)
*/
public Element tagName(String tagName, String namespace) {
Validate.notEmptyParam(tagName, "tagName");
tag = Tag.valueOf(tagName, NodeUtils.parser(this).settings()); // maintains the case option of the original parse
Validate.notEmptyParam(namespace, "namespace");
tag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings()); // maintains the case option of the original parse
return this;
}

Expand Down Expand Up @@ -679,7 +704,11 @@ public Element insertChildren(int index, Node... children) {
* {@code parent.appendElement("h1").attr("id", "header").text("Welcome");}
*/
public Element appendElement(String tagName) {
Element child = new Element(Tag.valueOf(tagName, NodeUtils.parser(this).settings()), baseUri());
return appendElement(tagName, tag.namespace());
}

/**
 * Create a new element with the supplied tag name and namespace, and pass it to {@code appendChild} on this
 * element. The tag is resolved with the settings of this element's parser (as found via {@code NodeUtils.parser}).
 *
 * @param tagName the name of the tag for the new element
 * @param namespace the namespace of the new element's tag
 * @return the new element, to allow chaining of further calls on it
 */
public Element appendElement(String tagName, String namespace) {
    final Tag childTag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings());
    final Element created = new Element(childTag, baseUri());
    appendChild(created);
    return created;
}
Expand All @@ -692,7 +721,11 @@ public Element appendElement(String tagName) {
* {@code parent.prependElement("h1").attr("id", "header").text("Welcome");}
*/
public Element prependElement(String tagName) {
Element child = new Element(Tag.valueOf(tagName, NodeUtils.parser(this).settings()), baseUri());
return prependElement(tagName, tag.namespace());
}

/**
 * Create a new element with the supplied tag name and namespace, and pass it to {@code prependChild} on this
 * element. The tag is resolved with the settings of this element's parser (as found via {@code NodeUtils.parser}).
 *
 * @param tagName the name of the tag for the new element
 * @param namespace the namespace of the new element's tag
 * @return the new element, to allow chaining of further calls on it
 */
public Element prependElement(String tagName, String namespace) {
    final Tag childTag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings());
    final Element created = new Element(childTag, baseUri());
    prependChild(created);
    return created;
}
Expand Down
119 changes: 112 additions & 7 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
Expand All @@ -21,6 +22,7 @@

import static org.jsoup.internal.StringUtil.inSorted;
import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster;
import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent;

/**
* HTML Tree Builder; creates a DOM from Tokens.
Expand All @@ -42,6 +44,8 @@ public class HtmlTreeBuilder extends TreeBuilder {
"noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script",
"section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead",
"title", "tr", "ul", "wbr", "xmp"};
// Names of the MathML text integration point elements. Looked up with StringUtil.inSorted (see
// isMathmlTextIntegration), so presumably this array must be kept in sorted order -- confirm against StringUtil.
static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"};
// Names of the SVG elements that are HTML integration points. Matched with StringUtil.in against
// Element.tagName() (see isHtmlIntegration), i.e. case-sensitively; note the camel-cased "foreignObject".
static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"};

public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages

Expand Down Expand Up @@ -165,7 +169,86 @@ List<Node> parseFragment(String inputFragment, @Nullable Element context, String
@Override
protected boolean process(Token token) {
currentToken = token;
return this.state.process(token, this);

if (shouldDispatchToCurrentInsertionMode(token)) {
return this.state.process(token, this);
} else {
return ForeignContent.process(token, this);
}
}

boolean shouldDispatchToCurrentInsertionMode(Token token) {
    // Decides whether a token is handled by the current insertion mode (true) or by the foreign content
    // rules (false). Follows the dispatcher conditions listed at
    // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
    // The current element is used here where the spec speaks of the "adjusted current node".

    // Empty stack of open elements: always the current insertion mode
    if (stack.isEmpty())
        return true;

    final Element current = currentElement();
    final String namespace = current.tag().namespace();

    // An element in the HTML namespace
    if (Parser.NamespaceHtml.equals(namespace))
        return true;

    // A MathML text integration point: takes start tags other than "mglyph" / "malignmark", and character tokens
    if (isMathmlTextIntegration(current)) {
        if (token.isStartTag()) {
            final String startName = token.asStartTag().normalName;
            if (!("mglyph".equals(startName) || "malignmark".equals(startName)))
                return true;
        }
        if (token.isCharacter())
            return true;
    }

    // A MathML annotation-xml element, when the token is an "svg" start tag
    if (Parser.NamespaceMathml.equals(namespace) && current.normalName().equals("annotation-xml")) {
        if (token.isStartTag() && "svg".equals(token.asStartTag().normalName))
            return true;
    }

    // An HTML integration point: takes any start tag, and character tokens
    if (isHtmlIntegration(current)) {
        if (token.isStartTag() || token.isCharacter())
            return true;
    }

    // Otherwise only an end-of-file token goes to the current insertion mode
    return token.isEOF();
}

boolean isMathmlTextIntegration(Element el) {
    // A node is a MathML text integration point when it is a MathML mi, mo, mn, ms, or mtext element.
    // First require the MathML namespace, then test the normalized name against the known list.
    if (!Parser.NamespaceMathml.equals(el.tag().namespace()))
        return false;
    return StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration);
}

boolean isHtmlIntegration(Element el) {
    /*
     HTML integration points, per the spec, are:
       - a MathML annotation-xml element whose "encoding" attribute is an ASCII case-insensitive match for
         "text/html" or "application/xhtml+xml"
       - an SVG foreignObject, desc, or title element
    */
    final String namespace = el.tag().namespace();

    if (Parser.NamespaceMathml.equals(namespace) && el.normalName().equals("annotation-xml")) {
        // the attribute value is passed through Normalizer.normalize before being compared to the two literals
        final String encoding = Normalizer.normalize(el.attr("encoding"));
        if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml"))
            return true;
    }

    // uses tagName() rather than normalName() so that foreignObject is matched case-sensitively
    return Parser.NamespaceSvg.equals(namespace)
        && StringUtil.in(el.tagName(), TagSvgHtmlIntegration);
}

boolean process(Token token, HtmlTreeBuilderState state) {
Expand Down Expand Up @@ -245,6 +328,23 @@ Element insert(final Token.StartTag startTag) {
return el;
}

/**
 Inserts an element for a foreign (non-HTML namespace) start tag. The tag is created with
 {@code ParseSettings.preserveCase}, and the attributes are normalized with the same settings, so the case of
 the tag name and of the attribute names is kept as written.
 @param startTag the start tag token to insert an element for
 @param namespace the namespace to create the element's tag in
 @return the newly inserted element
 */
Element insertForeign(final Token.StartTag startTag, String namespace) {
    dedupeAttributes(startTag);

    final ParseSettings caseKeeping = ParseSettings.preserveCase;
    final Tag foreignTag = tagFor(startTag.name(), namespace, caseKeeping);
    final Element inserted = new Element(foreignTag, null, caseKeeping.normalizeAttributes(startTag.attributes));
    insert(inserted, startTag);

    if (!startTag.isSelfClosing())
        return inserted;

    // self-closing: remember that on the tag for output, and take the element straight back off the stack
    foreignTag.setSelfClosing();
    pop();
    return inserted;
}

Element insertStartTag(String startTagName) {
Element el = new Element(tagFor(startTagName, settings), null);
insert(el);
Expand Down Expand Up @@ -272,7 +372,7 @@ Element insertEmpty(Token.StartTag startTag) {
if (!tag.isEmpty())
tokeniser.error("Tag [%s] cannot be self closing; not a void tag", tag.normalName());
}
else // unknown tag, remember this is self closing for output
else // unknown tag, remember this is self-closing for output
tag.setSelfClosing();
}
return el;
Expand Down Expand Up @@ -306,6 +406,7 @@ void insert(Token.Character characterToken) {
insert(characterToken, el);
}

/** Inserts the provided character token into the provided element. */
void insert(Token.Character characterToken, Element el) {
final Node node;
final String tagName = el.normalName();
Expand All @@ -321,7 +422,7 @@ else if (isContentForTagData(tagName))
onNodeInserted(node, characterToken);
}

/** Inserts the provided character token into the provided element. Use when not going onto stack element */
/** Inserts the provided Node into the current element. */
private void insertNode(Node node, @Nullable Token token) {
// if the stack hasn't been set up yet, elements (doctype, comments) go into the doc
if (stack.isEmpty())
Expand All @@ -331,10 +432,14 @@ else if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(),
else
currentElement().appendChild(node);

// connect form controls to their form element
if (node instanceof Element && ((Element) node).tag().isFormListed()) {
if (formElement != null)
formElement.addElement((Element) node);
if (node instanceof Element) {
Element el = (Element) node;
if (el.tag().isFormListed() && formElement != null)
formElement.addElement(el); // connect form controls to their form element

// in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to
if (el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace()))
error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName());
}
onNodeInserted(node, token);
}
Expand Down
Loading