Skip to content

Commit

Permalink
Sub-query should terminate on a combinator after any clause
Browse files Browse the repository at this point in the history
Missed setting seenCombinator for attributes / sub-clauses.

Fixes #2073
Regression in: d126488

Also added EvaluatorDebug which creates a XML outline and a sexpr for the query, to aid visualizing the query. Might make it public and add it to try.jsoup.org.
  • Loading branch information
jhy committed Dec 4, 2023
1 parent 0e94a36 commit 7e91601
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 44 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
thrown. [2068](https://github.com/jhy/jsoup/issues/2068)
* A multi-point encoded emoji entity may be incorrectly decoded to the replacement
character. [2074](https://github.com/jhy/jsoup/issues/2074)
* (Regression) in a selector like `parent [attr=va], other`, the `, OR` was binding to `[attr=va]` instead of
`parent [attr=va]`, causing incorrect selections. The fix includes a EvaluatorDebug class that generates a sexpr
to represent the query, allowing simpler and more thorough query parse
tests. [2073](https://github.com/jhy/jsoup/issues/2073)

---
Older changes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27) may be found in
Expand Down
18 changes: 9 additions & 9 deletions src/main/java/org/jsoup/select/QueryParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -145,21 +145,21 @@ private void combinator(char combinator) {

private String consumeSubQuery() {
StringBuilder sq = StringUtil.borrowBuilder();
boolean seenNonCombinator = false; // eat until we hit a combinator after eating something else
boolean seenClause = false; // eat until we hit a combinator after eating something else
while (!tq.isEmpty()) {
if (tq.matchesAny(Combinators)) {
if (seenClause)
break;
sq.append(tq.consume());
continue;
}
seenClause = true;
if (tq.matches("("))
sq.append("(").append(tq.chompBalanced('(', ')')).append(")");
else if (tq.matches("["))
sq.append("[").append(tq.chompBalanced('[', ']')).append("]");
else if (tq.matchesAny(Combinators))
if (seenNonCombinator)
break;
else
sq.append(tq.consume());
else {
seenNonCombinator = true;
else
sq.append(tq.consume());
}
}
return StringUtil.releaseBuilder(sq);
}
Expand Down
83 changes: 83 additions & 0 deletions src/test/java/org/jsoup/select/EvaluatorDebug.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package org.jsoup.select;

import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

public class EvaluatorDebug {

/**
Cast an Evaluator into a pseudo Document, to help visualize the query. Quite coupled to the current impl.
*/
public static Document asDocument(Evaluator eval) {
Document doc = new Document(null);
doc.outputSettings().outline(true).indentAmount(2);

Element el = asElement(eval);
doc.appendChild(el);

return doc;
}

public static Document asDocument(String query) {
Evaluator eval = QueryParser.parse(query);
return asDocument(eval);
}

public static Element asElement(Evaluator eval) {
Class<? extends Evaluator> evalClass = eval.getClass();
Element el = new Element(evalClass.getSimpleName());
el.attr("css", eval.toString());
el.attr("cost", Integer.toString(eval.cost()));

if (eval instanceof CombiningEvaluator) {
for (Evaluator inner : ((CombiningEvaluator) eval).sortedEvaluators) {
el.appendChild(asElement(inner));
}
} else if (eval instanceof StructuralEvaluator.ImmediateParentRun) {
for (Evaluator inner : ((StructuralEvaluator.ImmediateParentRun) eval).evaluators) {
el.appendChild(asElement(inner));
}
} else if (eval instanceof StructuralEvaluator) {
Evaluator inner = ((StructuralEvaluator) eval).evaluator;
el.appendChild(asElement(inner));
}

return el;
}

public static String sexpr(String query) {
Document doc = asDocument(query);

SexprVisitor sv = new SexprVisitor();
doc.childNode(0).traverse(sv); // skip outer #document
return sv.result();
}

static class SexprVisitor implements NodeVisitor {
StringBuilder sb = StringUtil.borrowBuilder();

@Override public void head(Node node, int depth) {
sb
.append('(')
.append(node.nodeName());

if (node.childNodeSize() == 0)
sb
.append(" '")
.append(node.attr("css"))
.append("'");
else
sb.append(" ");
}

@Override public void tail(Node node, int depth) {
sb.append(')');
}

String result() {
return StringUtil.releaseBuilder(sb);
}
}
}
115 changes: 80 additions & 35 deletions src/test/java/org/jsoup/select/QueryParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import org.jsoup.nodes.Document;
import org.junit.jupiter.api.Test;

import static org.jsoup.select.EvaluatorDebug.sexpr;
import static org.junit.jupiter.api.Assertions.*;

/**
Expand All @@ -26,49 +27,80 @@ public class QueryParserTest {

@Test public void testImmediateParentRun() {
String query = "div > p > bold.brass";
Evaluator eval1 = QueryParser.parse(query);
assertEquals(query, eval1.toString());

StructuralEvaluator.ImmediateParentRun run = (StructuralEvaluator.ImmediateParentRun) eval1;
assertTrue(run.evaluators.get(0) instanceof Evaluator.Tag);
assertTrue(run.evaluators.get(1) instanceof Evaluator.Tag);
assertTrue(run.evaluators.get(2) instanceof CombiningEvaluator.And);
assertEquals("(ImmediateParentRun (Tag 'div')(Tag 'p')(And (Tag 'bold')(Class '.brass')))", sexpr(query));

/*
<ImmediateParentRun css="div > p > bold.brass" cost="11">
<Tag css="div" cost="1"></Tag>
<Tag css="p" cost="1"></Tag>
<And css="bold.brass" cost="7">
<Tag css="bold" cost="1"></Tag>
<Class css=".brass" cost="6"></Class>
</And>
</ImmediateParentRun>
*/
}

@Test public void testOrGetsCorrectPrecedence() {
// tests that a selector "a b, c d, e f" evals to (a AND b) OR (c AND d) OR (e AND f)"
// top level or, three child ands
Evaluator eval = QueryParser.parse("a b, c d, e f");
assertTrue(eval instanceof CombiningEvaluator.Or);
CombiningEvaluator.Or or = (CombiningEvaluator.Or) eval;
assertEquals(3, or.evaluators.size());
for (Evaluator innerEval: or.evaluators) {
assertTrue(innerEval instanceof CombiningEvaluator.And);
CombiningEvaluator.And and = (CombiningEvaluator.And) innerEval;
assertEquals(2, and.evaluators.size());
assertTrue(and.evaluators.get(0) instanceof StructuralEvaluator.Parent);
assertTrue(and.evaluators.get(1) instanceof Evaluator.Tag);
}
String query = "a b, c d, e f";
String parsed = sexpr(query);
assertEquals("(Or (And (Tag 'b')(Parent (Tag 'a')))(And (Tag 'd')(Parent (Tag 'c')))(And (Tag 'f')(Parent (Tag 'e'))))", parsed);

/*
<Or css="a b, c d, e f" cost="9">
<And css="a b" cost="3">
<Tag css="b" cost="1"></Tag>
<Parent css="a " cost="2">
<Tag css="a" cost="1"></Tag>
</Parent>
</And>
<And css="c d" cost="3">
<Tag css="d" cost="1"></Tag>
<Parent css="c " cost="2">
<Tag css="c" cost="1"></Tag>
</Parent>
</And>
<And css="e f" cost="3">
<Tag css="f" cost="1"></Tag>
<Parent css="e " cost="2">
<Tag css="e" cost="1"></Tag>
</Parent>
</And>
</Or>
*/
}

@Test public void testParsesMultiCorrectly() {
String query = ".foo.qux > ol.bar, ol > li + li";
Evaluator eval = QueryParser.parse(query);
assertTrue(eval instanceof CombiningEvaluator.Or);
CombiningEvaluator.Or or = (CombiningEvaluator.Or) eval;
assertEquals(2, or.evaluators.size());

StructuralEvaluator.ImmediateParentRun run = (StructuralEvaluator.ImmediateParentRun) or.evaluators.get(0);
CombiningEvaluator.And andRight = (CombiningEvaluator.And) or.evaluators.get(1);

assertEquals(".foo.qux > ol.bar", run.toString());
assertEquals(2, run.evaluators.size());
Evaluator runAnd = run.evaluators.get(0);
assertTrue(runAnd instanceof CombiningEvaluator.And);
assertEquals(".foo.qux", runAnd.toString());
assertEquals("ol > li + li", andRight.toString());
assertEquals(2, andRight.evaluators.size());
assertEquals(query, eval.toString());
String query = ".foo.qux[attr=bar] > ol.bar, ol > li + li";
String parsed = sexpr(query);
assertEquals("(Or (And (Tag 'li')(ImmediatePreviousSibling (ImmediateParentRun (Tag 'ol')(Tag 'li'))))(ImmediateParentRun (And (AttributeWithValue '[attr=bar]')(Class '.foo')(Class '.qux'))(And (Tag 'ol')(Class '.bar'))))", parsed);

/*
<Or css=".foo.qux[attr=bar] > ol.bar, ol > li + li" cost="31">
<And css="ol > li + li" cost="7">
<Tag css="li" cost="1"></Tag>
<ImmediatePreviousSibling css="ol > li + " cost="6">
<ImmediateParentRun css="ol > li" cost="4">
<Tag css="ol" cost="1"></Tag>
<Tag css="li" cost="1"></Tag>
</ImmediateParentRun>
</ImmediatePreviousSibling>
</And>
<ImmediateParentRun css=".foo.qux[attr=bar] > ol.bar" cost="24">
<And css=".foo.qux[attr=bar]" cost="15">
<AttributeWithValue css="[attr=bar]" cost="3"></AttributeWithValue>
<Class css=".foo" cost="6"></Class>
<Class css=".qux" cost="6"></Class>
</And>
<And css="ol.bar" cost="7">
<Tag css="ol" cost="1"></Tag>
<Class css=".bar" cost="6"></Class>
</And>
</ImmediateParentRun>
</Or>
*/
}

@Test public void exceptionOnUncloseAttribute() {
Expand Down Expand Up @@ -97,5 +129,18 @@ public class QueryParserTest {
String q = "a:not(:has(span.foo)) b d > e + f ~ g";
Evaluator parse = QueryParser.parse(q);
assertEquals(q, parse.toString());
String parsed = sexpr(q);
assertEquals("(And (Tag 'g')(PreviousSibling (And (Tag 'f')(ImmediatePreviousSibling (ImmediateParentRun (And (Tag 'd')(Parent (And (Tag 'b')(Parent (And (Tag 'a')(Not (Has (And (Tag 'span')(Class '.foo')))))))))(Tag 'e'))))))", parsed);
}

@Test public void parsesOrAfterAttribute() {
// https://github.com/jhy/jsoup/issues/2073
String q = "#parent [class*=child], .some-other-selector .nested";
String parsed = sexpr(q);
assertEquals("(Or (And (Parent (Id '#parent'))(AttributeWithValueContaining '[class*=child]'))(And (Class '.nested')(Parent (Class '.some-other-selector'))))", parsed);

assertEquals("(Or (Class '.some-other-selector')(And (Parent (Id '#parent'))(AttributeWithValueContaining '[class*=child]')))", sexpr("#parent [class*=child], .some-other-selector"));
assertEquals("(Or (Class '.some-other-selector')(And (Id '#el')(AttributeWithValueContaining '[class*=child]')))", sexpr("#el[class*=child], .some-other-selector"));
assertEquals("(Or (And (Parent (Id '#parent'))(AttributeWithValueContaining '[class*=child]'))(And (Class '.nested')(Parent (Class '.some-other-selector'))))", sexpr("#parent [class*=child], .some-other-selector .nested"));
}
}
10 changes: 10 additions & 0 deletions src/test/java/org/jsoup/select/SelectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1220,4 +1220,14 @@ public void wildcardNamespaceMatchesNoNamespace() {
Evaluator parse = QueryParser.parse(query);
assertEquals(query, parse.toString());
}

@Test public void orAfterClass() {
// see also QueryParserTest#parsesOrAfterAttribute
// https://github.com/jhy/jsoup/issues/2073
Document doc = Jsoup.parse("<div id=parent><span class=child></span><span class=child></span><span class=child></span></div>");
String q = "#parent [class*=child], .some-other-selector .nested";
assertEquals("(Or (And (Parent (Id '#parent'))(AttributeWithValueContaining '[class*=child]'))(And (Class '.nested')(Parent (Class '.some-other-selector'))))", EvaluatorDebug.sexpr(q));
Elements els = doc.select(q);
assertEquals(3, els.size());
}
}

0 comments on commit 7e91601

Please sign in to comment.