Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for svg and math foreign elements #2008

Merged
merged 5 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions src/main/java/org/jsoup/nodes/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import javax.annotation.Nullable;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.List;

/**
Expand All @@ -31,17 +30,28 @@ public class Document extends Element {
private boolean updateMetaCharset = false;

/**
 Create a new, empty Document, in the specified namespace.
 @param namespace the namespace of this Document's root node.
 @param baseUri base URI of document
 @see org.jsoup.Jsoup#parse
 @see #createShell
 */
public Document(String namespace, String baseUri) {
    // the synthetic "#root" tag is created in the requested namespace, using the HTML default parse settings
    super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri);
    this.location = baseUri;
    this.parser = Parser.htmlParser(); // default, but overridable
}

/**
Create a new, empty Document, in the HTML namespace. Delegates to the two-argument constructor, passing
{@code Parser.NamespaceHtml} as the namespace.
@param baseUri base URI of document
@see org.jsoup.Jsoup#parse
@see #Document(String namespace, String baseUri)
*/
public Document(String baseUri) {
this(Parser.NamespaceHtml, baseUri);
}

/**
Create a valid, empty shell of a document, suitable for adding more elements to.
@param baseUri baseUri of document
Expand Down Expand Up @@ -208,7 +218,7 @@ public void title(String title) {
@return new element
*/
public Element createElement(String tagName) {
    // The tag is resolved in the namespace reported by this document's parser, with the tag name's case preserved.
    return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
}

@Override
Expand Down Expand Up @@ -312,7 +322,7 @@ public Document clone() {

@Override
public Document shallowClone() {
Document clone = new Document(baseUri());
Document clone = new Document(this.tag().namespace(), baseUri());
if (attributes != null)
clone.attributes = attributes.clone();
clone.outputSettings = this.outputSettings.clone();
Expand Down
43 changes: 38 additions & 5 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.jsoup.internal.NonnullByDefault;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.Collector;
import org.jsoup.select.Elements;
Expand Down Expand Up @@ -51,11 +52,21 @@ public class Element extends Node {
@Nullable Attributes attributes; // field is nullable but all methods for attributes are non-null

/**
* Create a new, standalone element, in the specified namespace. The tag is resolved with
* {@code ParseSettings.preserveCase}, and no base URI is supplied.
* @param tag tag name
* @param namespace namespace for this element
*/
public Element(String tag, String namespace) {
this(Tag.valueOf(tag, namespace, ParseSettings.preserveCase), null);
}

/**
 * Create a new, standalone element, in the HTML namespace.
 * @param tag tag name
 * @see #Element(String tag, String namespace)
 */
public Element(String tag) {
    // explicit HTML namespace and case-preserving settings; empty base URI and no attributes
    this(Tag.valueOf(tag, Parser.NamespaceHtml, ParseSettings.preserveCase), "", null);
}

/**
Expand Down Expand Up @@ -172,8 +183,22 @@ public String normalName() {
* @see Elements#tagName(String)
*/
public Element tagName(String tagName) {
// Rename only: delegate to the two-argument overload, passing this element's current namespace so it is kept.
return tagName(tagName, tag.namespace());
}

/**
 * Change (rename) the tag of this element and set its namespace. For example, convert a {@code <span>} to a
 * {@code <div>} while keeping the current namespace with {@code el.tagName("div", el.tag().namespace());}.
 *
 * @param tagName new tag name for this element; validated as non-empty
 * @param namespace the new namespace for this element; validated as non-empty
 * @return this element, for chaining
 * @see #tagName(String)
 * @see Elements#tagName(String)
 */
public Element tagName(String tagName, String namespace) {
    // validate both arguments before touching the tag, so a bad namespace never leaves a half-applied rename
    Validate.notEmptyParam(tagName, "tagName");
    Validate.notEmptyParam(namespace, "namespace");
    tag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings()); // maintains the case option of the original parse
    return this;
}

Expand Down Expand Up @@ -679,7 +704,11 @@ public Element insertChildren(int index, Node... children) {
* {@code parent.appendElement("h1").attr("id", "header").text("Welcome");}
*/
public Element appendElement(String tagName) {
    // the new child is created in this element's own namespace
    return appendElement(tagName, tag.namespace());
}

/**
 * Create a new element by tag name and namespace, and append it as a child of this element.
 *
 * @param tagName the name of the tag (e.g. {@code div}).
 * @param namespace the namespace of the new element
 * @return the new element, to allow you to add content to it
 */
public Element appendElement(String tagName, String namespace) {
    // resolve the tag with the settings of the parser that owns this element, then build the child on it
    final Tag childTag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings());
    final Element added = new Element(childTag, baseUri());
    appendChild(added);
    return added;
}
Expand All @@ -692,7 +721,11 @@ public Element appendElement(String tagName) {
* {@code parent.prependElement("h1").attr("id", "header").text("Welcome");}
*/
public Element prependElement(String tagName) {
    // the new child is created in this element's own namespace
    return prependElement(tagName, tag.namespace());
}

/**
 * Create a new element by tag name and namespace, and prepend it as a child of this element.
 *
 * @param tagName the name of the tag (e.g. {@code div}).
 * @param namespace the namespace of the new element
 * @return the new element, to allow you to add content to it
 */
public Element prependElement(String tagName, String namespace) {
    // resolve the tag with the settings of the parser that owns this element, then build the child on it
    final Tag childTag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings());
    final Element added = new Element(childTag, baseUri());
    prependChild(added);
    return added;
}
Expand Down
104 changes: 102 additions & 2 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
Expand All @@ -21,6 +22,7 @@

import static org.jsoup.internal.StringUtil.inSorted;
import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster;
import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent;

/**
* HTML Tree Builder; creates a DOM from Tokens.
Expand All @@ -42,6 +44,8 @@ public class HtmlTreeBuilder extends TreeBuilder {
"noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script",
"section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead",
"title", "tr", "ul", "wbr", "xmp"};
// Tag names of the MathML text integration points. Looked up via StringUtil.inSorted, which by its name expects
// sorted input -- keep these entries in sorted order when editing.
static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"};
// Tag names of the SVG HTML integration points. Matched with StringUtil.in against Element.tagName(), i.e.
// case-sensitively, hence the camel-cased "foreignObject".
static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"};

public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages

Expand Down Expand Up @@ -165,7 +169,86 @@ List<Node> parseFragment(String inputFragment, @Nullable Element context, String
@Override
protected boolean process(Token token) {
    currentToken = token;

    // Tree construction dispatcher: tokens go to the current insertion mode unless the current node is foreign
    // (SVG / MathML) content, in which case the foreign content rules handle them.
    if (shouldDispatchToCurrentInsertionMode(token)) {
        return this.state.process(token, this);
    } else {
        return ForeignContent.process(token, this);
    }
}

/**
 Decides whether a token is handled by the current insertion mode (returns true) or by the foreign content rules
 (returns false), following the tree construction dispatcher at
 https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
 @param token the token about to be processed
 @return true if the current insertion mode should process the token
 */
boolean shouldDispatchToCurrentInsertionMode(Token token) {
    // Empty stack of open elements: nothing foreign can be open.
    if (stack.isEmpty())
        return true;

    final Element current = currentElement();
    final String namespace = current.tag().namespace();

    // Current node is an HTML element.
    if (Parser.NamespaceHtml.equals(namespace))
        return true;

    // MathML text integration point: character tokens, and any start tag other than mglyph / malignmark.
    if (isMathmlTextIntegration(current)) {
        if (token.isCharacter())
            return true;
        if (token.isStartTag()) {
            final String startName = token.asStartTag().normalName;
            if (!"mglyph".equals(startName) && !"malignmark".equals(startName))
                return true;
        }
    }

    // MathML annotation-xml element receiving an <svg> start tag.
    final boolean inAnnotationXml =
        Parser.NamespaceMathml.equals(namespace) && current.normalName().equals("annotation-xml");
    if (inAnnotationXml && token.isStartTag() && "svg".equals(token.asStartTag().normalName))
        return true;

    // HTML integration point: start tags and character tokens.
    if (isHtmlIntegration(current)) {
        if (token.isStartTag() || token.isCharacter())
            return true;
    }

    // Otherwise only an end-of-file token goes to the insertion mode.
    return token.isEOF();
}

/**
 Tests whether an element is a MathML text integration point: a MathML-namespace {@code mi}, {@code mo},
 {@code mn}, {@code ms} or {@code mtext} element.
 @param el the element to test
 @return true if the element is a MathML text integration point
 */
boolean isMathmlTextIntegration(Element el) {
    if (!Parser.NamespaceMathml.equals(el.tag().namespace()))
        return false;
    return StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration);
}

/**
 Tests whether an element is an HTML integration point. That is either a MathML {@code annotation-xml} element
 whose normalized {@code encoding} attribute is {@code text/html} or {@code application/xhtml+xml}, or an SVG
 {@code foreignObject}, {@code desc} or {@code title} element.
 @param el the element to test
 @return true if the element is an HTML integration point
 */
boolean isHtmlIntegration(Element el) {
    final String namespace = el.tag().namespace();

    if (Parser.NamespaceMathml.equals(namespace)) {
        if (el.normalName().equals("annotation-xml")) {
            final String encoding = Normalizer.normalize(el.attr("encoding"));
            if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml"))
                return true;
        }
    }

    // .tagName() rather than .normalName(): the match on foreignObject is case-sensitive
    return Parser.NamespaceSvg.equals(namespace) && StringUtil.in(el.tagName(), TagSvgHtmlIntegration);
}

boolean process(Token token, HtmlTreeBuilderState state) {
Expand Down Expand Up @@ -245,6 +328,23 @@ Element insert(final Token.StartTag startTag) {
return el;
}

/**
 Inserts an element for a start tag seen in foreign (e.g. SVG or MathML) content, in the given namespace. The
 case of the tag name and of the attributes is preserved. A self-closing start tag is recorded as such on the
 tag and the element is popped straight back off the stack.
 @param startTag the start tag token to insert
 @param namespace the namespace to create the element in
 @return the inserted element
 */
Element insertForeign(final Token.StartTag startTag, String namespace) {
    dedupeAttributes(startTag);

    final ParseSettings caseKeeping = ParseSettings.preserveCase;
    final Tag foreignTag = tagFor(startTag.name(), namespace, caseKeeping);
    final Element foreign = new Element(foreignTag, null, caseKeeping.normalizeAttributes(startTag.attributes));
    insert(foreign, startTag);

    if (!startTag.isSelfClosing())
        return foreign;

    foreignTag.setSelfClosing(); // remember this is self-closing for output
    pop();
    return foreign;
}

Element insertStartTag(String startTagName) {
Element el = new Element(tagFor(startTagName, settings), null);
insert(el);
Expand Down Expand Up @@ -272,7 +372,7 @@ Element insertEmpty(Token.StartTag startTag) {
if (!tag.isEmpty())
tokeniser.error("Tag [%s] cannot be self closing; not a void tag", tag.normalName());
}
else // unknown tag, remember this is self closing for output
else // unknown tag, remember this is self-closing for output
tag.setSelfClosing();
}
return el;
Expand Down
79 changes: 74 additions & 5 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
Expand Down Expand Up @@ -556,13 +557,11 @@
break;
case "math":
tb.reconstructFormattingElements();
// todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml)
tb.insert(startTag);
tb.insertForeign(startTag, Parser.NamespaceMathml);
break;
case "svg":
tb.reconstructFormattingElements();
// todo: handle A start tag whose tag name is "svg" (xlink, svg)
tb.insert(startTag);
tb.insertForeign(startTag, Parser.NamespaceSvg);
break;
// static final String[] Headings = new String[]{"h1", "h2", "h3", "h4", "h5", "h6"};
case "h1":
Expand Down Expand Up @@ -1740,9 +1739,78 @@
}
},
ForeignContent {
    /**
     Processes a token while the current node is in a foreign (SVG / MathML) namespace.
     See https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
     @param t the token to process
     @param tb the tree builder
     @return true when handled here; otherwise the result of the current HTML insertion mode
     */
    boolean process(Token t, HtmlTreeBuilder tb) {
        switch (t.type) {
            case Character:
                Token.Character c = t.asCharacter();
                if (c.getData().equals(nullString))
                    tb.error(this);
                else if (HtmlTreeBuilderState.isWhitespace(c))
                    tb.insert(c);
                else {
                    tb.insert(c);
                    tb.framesetOk(false);
                }
                break;
            case Comment:
                tb.insert(t.asComment());
                break;
            case Doctype:
                tb.error(this);
                break;
            case StartTag:
                Token.StartTag start = t.asStartTag();
                if (StringUtil.in(start.normalName, InForeignToHtml))
                    return processAsHtml(t, tb);
                // <font> only breaks out to HTML when it has a color, face or size attribute. The null check is
                // defensive, for a start tag that carries no attribute set at all (e.g. a bare <font>).
                if (start.normalName.equals("font")
                    && start.attributes != null
                    && (start.attributes.hasKeyIgnoreCase("color")
                        || start.attributes.hasKeyIgnoreCase("face")
                        || start.attributes.hasKeyIgnoreCase("size")))
                    return processAsHtml(t, tb);

                // Any other start:
                // (whatwg says to fix up tag name and attribute case per a table - we will preserve original case instead)
                tb.insertForeign(start, tb.currentElement().tag().namespace());
                // (self-closing handled in insert)
                // if self-closing svg script -- level and execution elided
                break;

            case EndTag:
                Token.EndTag end = t.asEndTag();
                if (end.normalName.equals("br") || end.normalName.equals("p"))
                    return processAsHtml(t, tb);
                if (end.normalName.equals("script") && tb.currentElementIs("script", Parser.NamespaceSvg)) {
                    // script level and execution elided.
                    tb.pop();
                    return true;
                }

                // Any other end tag: walk down the stack from the current node looking for a name match
                ArrayList<Element> stack = tb.getStack();
                if (stack.isEmpty())
                    Validate.wtf("Stack unexpectedly empty");
                int i = stack.size() - 1;
                Element el = stack.get(i);
                if (!el.normalName().equals(end.normalName))
                    tb.error(this);
                // stops before examining the bottom-most element of the stack (index 0)
                while (i != 0) {
                    if (el.normalName().equals(end.normalName)) {
                        tb.popStackToClose(el.normalName());
                        return true;
                    }
                    i--;
                    el = stack.get(i);
                    // reached an HTML element: the current insertion mode takes over
                    if (el.tag().namespace().equals(Parser.NamespaceHtml)) {
                        return processAsHtml(t, tb);
                    }
                }
                break;

            default:
                // remaining token types need no handling here
                break;
        }
        return true;
    }

    /** Hands the token to the tree builder's current (HTML) insertion mode. */
    boolean processAsHtml(Token t, HtmlTreeBuilder tb) {
        return tb.state().process(t, tb);
    }
};

Expand Down Expand Up @@ -1817,5 +1885,6 @@
static final String[] InCaptionIgnore = new String[]{"body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"};
static final String[] InTemplateToHead = new String[] {"base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "template", "title"};
static final String[] InTemplateToTable = new String[] {"caption", "colgroup", "tbody", "tfoot", "thead"};
// Start tag names that, when seen in foreign content, are handed back to the current HTML insertion mode
// (checked with StringUtil.in in ForeignContent's StartTag handling).
static final String[] InForeignToHtml = new String[] {"b", "big", "blockquote", "body", "br", "center", "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol", "p", "pre", "ruby", "s", "small", "span", "strike", "strong", "sub", "sup", "table", "tt", "u", "ul", "var"};
}
}
Loading