diff --git a/CHANGES b/CHANGES index b108fed3fc..1bb4842593 100644 --- a/CHANGES +++ b/CHANGES @@ -12,6 +12,11 @@ jsoup changelog useful when elements can only be distinguished by e.g. specific case, or leading whitespace, etc. + * Improvement: when evaluating an XPath query against a context element, the complete document is now visible to the + query, vs only the context element's sub-tree. This enables support for queries outside (parent or sibling) the + element, e.g. ancestor-or-self::*. + + *** Release 1.14.3 [2021-Sep-30] * Improvement: added native XPath support in Element#selectXpath(String) diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java index 3b0b5df692..753c91265f 100644 --- a/src/main/java/org/jsoup/helper/W3CDom.java +++ b/src/main/java/org/jsoup/helper/W3CDom.java @@ -49,6 +49,9 @@ public class W3CDom { /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */ public static final String SourceProperty = "jsoupSource"; + private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc + private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context + /** To get support for XPath versions > 1, set this property to the classname of an alternate XPathFactory @@ -161,12 +164,15 @@ public Document fromJsoup(org.jsoup.nodes.Document in) { } /** - * Convert a jsoup Element to a W3C Document. The created nodes will link back to the original + * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not - * flow to the other). + * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is + * converted. 
(If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.) * * @param in jsoup element or doc * @return a W3C DOM Document representing the jsoup Document or Element contents. + * @see #sourceNodes(NodeList, Class) + * @see #contextNode(Document) */ public Document fromJsoup(org.jsoup.nodes.Element in) { Validate.notNull(in); @@ -174,9 +180,7 @@ public Document fromJsoup(org.jsoup.nodes.Element in) { try { builder = factory.newDocumentBuilder(); DOMImplementation impl = builder.getDOMImplementation(); - Document out; - - out = builder.newDocument(); + Document out = builder.newDocument(); org.jsoup.nodes.Document inDoc = in.ownerDocument(); org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null; if (doctype != null) { @@ -184,8 +188,10 @@ public Document fromJsoup(org.jsoup.nodes.Element in) { out.appendChild(documentType); } out.setXmlStandalone(true); - - convert(in, out); + // if in is Document, use the root element, not the wrapping document, as the context: + org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.child(0) : in; + out.setUserData(ContextProperty, context, null); + convert(inDoc != null ? inDoc : in, out); return out; } catch (ParserConfigurationException e) { throw new IllegalStateException(e); @@ -226,9 +232,25 @@ public void convert(org.jsoup.nodes.Element in, Document out) { NodeTraversor.traverse(builder, rootEl); } + /** + Evaluate an XPath query against the supplied document, and return the results. + @param xpath an XPath query + @param doc the document to evaluate against + @return the matched nodes + */ public NodeList selectXpath(String xpath, Document doc) { + return selectXpath(xpath, (Node) doc); + } + + /** + Evaluate an XPath query against the supplied context node, and return the results. 
+ @param xpath an XPath query + @param contextNode the context node to evaluate against + @return the matched nodes + */ + public NodeList selectXpath(String xpath, Node contextNode) { Validate.notEmpty(xpath); - Validate.notNull(doc); + Validate.notNull(contextNode); NodeList nodeList; try { @@ -239,7 +261,7 @@ public NodeList selectXpath(String xpath, Document doc) { XPathFactory.newInstance(); XPathExpression expression = xPathFactory.newXPath().compile(xpath); - nodeList = (NodeList) expression.evaluate(doc, XPathConstants.NODESET); // love the strong typing here /s + nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s Validate.notNull(nodeList); } catch (XPathExpressionException | XPathFactoryConfigurationException e) { throw new Selector.SelectorParseException("Could not evaluate XPath query [%s]: %s", xpath, e.getMessage()); @@ -247,6 +269,13 @@ public NodeList selectXpath(String xpath, Document doc) { return nodeList; } + /** + Retrieves the original jsoup DOM nodes from a nodelist created by this convertor. + @param nodeList the W3C nodes to get the original jsoup nodes from + @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc) + @param node type + @return a list of the original nodes + */ public List sourceNodes(NodeList nodeList, Class nodeType) { Validate.notNull(nodeList); Validate.notNull(nodeType); @@ -262,6 +291,15 @@ public List sourceNodes(NodeList nodeList, C return nodes; } + /** + For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node. + @param wDoc Document created by this class + @return the W3C Node corresponding to the jsoup Element that was used as the creating context. + */ + public Node contextNode(Document wDoc) { + return (Node) wDoc.getUserData(ContextNodeProperty); + } + /** * Serialize a W3C document to a String. The output format will be XML or HTML depending on the content of the doc. 
* @@ -284,11 +322,13 @@ protected static class W3CBuilder implements NodeVisitor { private final Stack> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn private Node dest; private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available. + @Nullable private final org.jsoup.nodes.Element contextElement; public W3CBuilder(Document doc) { this.doc = doc; - this.namespacesStack.push(new HashMap<>()); - this.dest = doc; + namespacesStack.push(new HashMap<>()); + dest = doc; + contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element } public void head(org.jsoup.nodes.Node source, int depth) { @@ -310,6 +350,8 @@ public void head(org.jsoup.nodes.Node source, int depth) { doc.createElementNS(namespace, tagName); copyAttributes(sourceEl, el); append(el, sourceEl); + if (sourceEl == contextElement) + doc.setUserData(ContextNodeProperty, el, null); dest = el; // descend } catch (DOMException e) { append(doc.createTextNode("<" + tagName + ">"), sourceEl); diff --git a/src/main/java/org/jsoup/nodes/NodeUtils.java b/src/main/java/org/jsoup/nodes/NodeUtils.java index ea6f08159f..722a614f1b 100644 --- a/src/main/java/org/jsoup/nodes/NodeUtils.java +++ b/src/main/java/org/jsoup/nodes/NodeUtils.java @@ -44,7 +44,8 @@ static List selectXpath(String xpath, Element el, Class n W3CDom w3c = new W3CDom(); org.w3c.dom.Document wDoc = w3c.fromJsoup(el); - NodeList nodeList = w3c.selectXpath(xpath, wDoc); + org.w3c.dom.Node contextNode = w3c.contextNode(wDoc); + NodeList nodeList = w3c.selectXpath(xpath, contextNode); return w3c.sourceNodes(nodeList, nodeType); } } diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java index f1ff904454..09fc66f352 100644 --- a/src/test/java/org/jsoup/helper/W3CDomTest.java +++ b/src/test/java/org/jsoup/helper/W3CDomTest.java @@ -328,7 
+328,7 @@ private void assertEqualsIgnoreCase(String want, String have) { Element jDiv = jdoc.selectFirst("div"); assertNotNull(jDiv); Document doc = w3CDom.fromJsoup(jDiv); - Node div = doc.getFirstChild(); + Node div = w3CDom.contextNode(doc); assertEquals("div", div.getLocalName()); assertEquals(jDiv, div.getUserData(W3CDom.SourceProperty)); diff --git a/src/test/java/org/jsoup/select/XpathTest.java b/src/test/java/org/jsoup/select/XpathTest.java index e46f17978c..2b3393a2ce 100644 --- a/src/test/java/org/jsoup/select/XpathTest.java +++ b/src/test/java/org/jsoup/select/XpathTest.java @@ -43,13 +43,15 @@ public void supportsXpath() { Element div = doc.selectFirst("div"); assertNotNull(div); + Element w3cDiv = div.selectXpath(".").first(); // self + assertSame(div, w3cDiv); - Elements els = div.selectXpath("/div/p"); + Elements els = div.selectXpath("p"); assertEquals(1, els.size()); assertEquals("One", els.get(0).text()); assertEquals("p", els.get(0).tagName()); - assertEquals(0, div.selectXpath("//body").size()); + assertEquals(1, div.selectXpath("//body").size()); // the whole document is visible on the div context assertEquals(1, doc.selectXpath("//body").size()); } @@ -146,6 +148,31 @@ private static Stream provideEvaluators() { assertEquals("/bar", hrefs.get(1)); } + @Test void selectOutsideOfElementTree() { + Document doc = Jsoup.parse("

One

Two

Three"); + Elements ps = doc.selectXpath("//p"); + assertEquals(3, ps.size()); + + Element p1 = ps.get(0); + assertEquals("One", p1.text()); + + Elements sibs = p1.selectXpath("following-sibling::p"); + assertEquals(2, sibs.size()); + assertEquals("Two", sibs.get(0).text()); + assertEquals("Three", sibs.get(1).text()); + } + + @Test void selectAncestorsOnContextElement() { + // https://github.com/jhy/jsoup/issues/1652 + Document doc = Jsoup.parse("

Hello"); + Element p = doc.selectFirst("p"); + assertNotNull(p); + Elements chain = p.selectXpath("ancestor-or-self::*"); + assertEquals(4, chain.size()); + assertEquals("html", chain.get(0).tagName()); + assertEquals("p", chain.get(3).tagName()); + } + @Test public void canSupplyAlternateFactoryImpl() { // previously we had a test to load Saxon and do an XPath 2.0 query. But we know Saxon works and so that's