diff --git a/CHANGES b/CHANGES index 33ed77eb49..55b406af03 100644 --- a/CHANGES +++ b/CHANGES @@ -19,6 +19,10 @@ jsoup changelog so that they are indented correctly. + * Improvement: in Element#selectXpath(), disable namespace awareness. This makes it possible to always select elements + by their simple local name, regardless of whether an xmlns attribute was set. + + * Bugfix: when using the readToByteBuffer method, such as in Connection.Response.body(), if the document has not already been parsed and must be read fully, and there is any maximum buffer size being applied, only the default internal buffer size is read. diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java index 753c91265f..76ff4fb058 100644 --- a/src/main/java/org/jsoup/helper/W3CDom.java +++ b/src/main/java/org/jsoup/helper/W3CDom.java @@ -52,7 +52,6 @@ public class W3CDom { private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context - /** To get support for XPath versions > 1, set this property to the classname of an alternate XPathFactory implementation. (For e.g. {@code net.sf.saxon.xpath.XPathFactoryImpl}). @@ -60,12 +59,33 @@ public class W3CDom { public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup"; protected DocumentBuilderFactory factory; + private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience public W3CDom() { factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); } + /** + Returns if this W3C DOM is namespace aware. By default, this will be {@code true}, but is disabled for simplicity + when using XPath selectors in {@link org.jsoup.nodes.Element#selectXpath(String)}. + @return the current namespace aware setting. 
+ */ + public boolean namespaceAware() { + return namespaceAware; + } + + /** + Update the namespace aware setting. This impacts the factory that is used to create W3C nodes from jsoup nodes. + @param namespaceAware the updated setting + @return this W3CDom, for chaining. + */ + public W3CDom namespaceAware(boolean namespaceAware) { + this.namespaceAware = namespaceAware; + factory.setNamespaceAware(namespaceAware); + return this; + } + /** * Converts a jsoup DOM to a W3C DOM. * @@ -92,7 +112,6 @@ public static Document convert(org.jsoup.nodes.Document in) { * @see OutputKeys#STANDALONE * @see OutputKeys#STANDALONE * @see OutputKeys#DOCTYPE_PUBLIC - * @see OutputKeys#DOCTYPE_PUBLIC * @see OutputKeys#CDATA_SECTION_ELEMENTS * @see OutputKeys#INDENT * @see OutputKeys#MEDIA_TYPE @@ -314,7 +333,7 @@ public String asString(Document doc) { /** * Implements the conversion by walking the input. */ - protected static class W3CBuilder implements NodeVisitor { + protected class W3CBuilder implements NodeVisitor { private static final String xmlnsKey = "xmlns"; private static final String xmlnsPrefix = "xmlns:"; @@ -337,7 +356,7 @@ public void head(org.jsoup.nodes.Node source, int depth) { org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source; String prefix = updateNamespaces(sourceEl); - String namespace = namespacesStack.peek().get(prefix); + String namespace = namespaceAware ? namespacesStack.peek().get(prefix) : null; String tagName = sourceEl.tagName(); /* Tag names in XML are quite permissive, but less permissive than HTML. Rather than reimplement the validation, diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index e1ab957e71..948c6fd2c8 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -522,6 +522,8 @@ public boolean is(Evaluator evaluator) { /** Find Elements that match the supplied XPath expression. +

Note that for convenience of writing the XPath expression, namespaces are disabled, and queries can be + expressed using the element's local name only.

By default, XPath 1.0 expressions are supported. If you would like to use XPath 2.0 or higher, you can provide an alternate XPathFactory implementation:

    diff --git a/src/main/java/org/jsoup/nodes/NodeUtils.java b/src/main/java/org/jsoup/nodes/NodeUtils.java index 722a614f1b..e45f5c532f 100644 --- a/src/main/java/org/jsoup/nodes/NodeUtils.java +++ b/src/main/java/org/jsoup/nodes/NodeUtils.java @@ -42,7 +42,7 @@ static List selectXpath(String xpath, Element el, Class n Validate.notNull(el); Validate.notNull(nodeType); - W3CDom w3c = new W3CDom(); + W3CDom w3c = new W3CDom().namespaceAware(false); org.w3c.dom.Document wDoc = w3c.fromJsoup(el); org.w3c.dom.Node contextNode = w3c.contextNode(wDoc); NodeList nodeList = w3c.selectXpath(xpath, contextNode); diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java index 09fc66f352..205737280d 100644 --- a/src/test/java/org/jsoup/helper/W3CDomTest.java +++ b/src/test/java/org/jsoup/helper/W3CDomTest.java @@ -278,6 +278,20 @@ public void xmlnsXpathTest() throws XPathExpressionException { assertNull(nodeList); } + @Test + void canDisableNamespaces() throws XPathExpressionException { + W3CDom w3c = new W3CDom(); + assertTrue(w3c.namespaceAware()); + + w3c.namespaceAware(false); + assertFalse(w3c.namespaceAware()); + + String html = "
    hello
    "; + Document dom = w3c.fromJsoup(Jsoup.parse(html)); + NodeList nodeList = xpath(dom, "//body");// no ns, so needs no prefix + assertEquals("div", nodeList.item(0).getLocalName()); + } + private NodeList xpath(Document w3cDoc, String query) throws XPathExpressionException { XPathExpression xpath = XPathFactory.newInstance().newXPath().compile(query); return ((NodeList) xpath.evaluate(w3cDoc, XPathConstants.NODE)); diff --git a/src/test/java/org/jsoup/select/XpathTest.java b/src/test/java/org/jsoup/select/XpathTest.java index 2b3393a2ce..274800eecc 100644 --- a/src/test/java/org/jsoup/select/XpathTest.java +++ b/src/test/java/org/jsoup/select/XpathTest.java @@ -1,7 +1,6 @@ package org.jsoup.select; import org.jsoup.Jsoup; -import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; @@ -17,7 +16,6 @@ import javax.xml.xpath.XPathFactoryConfigurationException; import javax.xml.xpath.XPathFunctionResolver; import javax.xml.xpath.XPathVariableResolver; - import java.util.List; import java.util.stream.Stream; @@ -76,8 +74,8 @@ public void throwsSelectException() { } @Test - public void supportsNamespaces() { - String xhtml = "
    hello
    ";; + public void supportsLocalname() { + String xhtml = "
    hello
    "; Document doc = Jsoup.parse(xhtml, Parser.xmlParser()); Elements elements = doc.selectXpath("//*[local-name()='body']"); assertEquals(1, elements.size()); @@ -86,7 +84,7 @@ public void supportsNamespaces() { @Test public void canDitchNamespaces() { - String xhtml = "
    hello
    ";; + String xhtml = "
    hello
    "; Document doc = Jsoup.parse(xhtml, Parser.xmlParser()); doc.select("[xmlns]").removeAttr("xmlns"); Elements elements = doc.selectXpath("//*[local-name()='body']"); @@ -192,8 +190,45 @@ public void canSupplyAlternateFactoryImpl() { } assertTrue(threw); System.clearProperty(XPathFactoryProperty); + } + + @Test + public void notNamespaceAware() { + String xhtml = "
    hello
    "; + Document doc = Jsoup.parse(xhtml, Parser.xmlParser()); + Elements elements = doc.selectXpath("//body"); + assertEquals(1, elements.size()); + assertEquals("One", elements.first().id()); + } + + @Test + public void supportsPrefixes() { + // example from https://www.w3.org/TR/xml-names/ + String xml = "\n" + + "\n" + + " Cheaper by the Dozen\n" + + " 1568491379\n" + + ""; + Document doc = Jsoup.parse(xml, Parser.xmlParser()); + + //Elements elements = doc.selectXpath("//bk:book/bk:title"); + Elements elements = doc.selectXpath("//book/title"); + assertEquals(1, elements.size()); + assertEquals("Cheaper by the Dozen", elements.first().text()); + + // with prefix + Elements byPrefix = doc.selectXpath("//*[name()='bk:book']/*[name()='bk:title']"); + assertEquals(1, byPrefix.size()); + assertEquals("Cheaper by the Dozen", byPrefix.first().text()); + Elements byLocalName = doc.selectXpath("//*[local-name()='book']/*[local-name()='title']"); + assertEquals(1, byLocalName.size()); + assertEquals("Cheaper by the Dozen", byLocalName.first().text()); + Elements isbn = doc.selectXpath("//book/number"); + assertEquals(1, isbn.size()); + assertEquals("1568491379", isbn.first().text()); } // minimal, no-op implementation class to verify users can load a factory to support XPath 2.0 etc