Skip to content

Commit

Permalink
Add support for escaping selectors
Browse files Browse the repository at this point in the history
Fixes #838

Closes #1441
Closes #1442
Closes #598
  • Loading branch information
jhy committed Jan 19, 2023
1 parent 7ca20e8 commit e61f688
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 12 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
jsoup changelog

Release 1.15.4 [PENDING]
* Improvement: added the ability to escape CSS selectors (tags, IDs, classes) to match elements that don't follow
regular CSS syntax. For example, to match by classname <p class="one.two">, use document.select("p.one\\.two");
<https://github.com/jhy/jsoup/issues/838>

* Improvement: when pretty-printing, wrap text that follows a <br> tag.
<https://github.com/jhy/jsoup/issues/1858>

Expand Down
26 changes: 18 additions & 8 deletions src/main/java/org/jsoup/parser/TokenQueue.java
Original file line number Diff line number Diff line change
Expand Up @@ -323,11 +323,7 @@ public String consumeWord() {
* @return tag name
*/
public String consumeElementSelector() {
int start = pos;
while (!isEmpty() && (matchesWord() || matchesAny("*|","|", "_", "-")))
pos++;

return queue.substring(start, pos);
return consumeEscapedCssIdentifier("*|", "|", "_", "-");
}

/**
Expand All @@ -336,11 +332,25 @@ Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _)
@return identifier
*/
public String consumeCssIdentifier() {
return consumeEscapedCssIdentifier("-", "_");
}

private String consumeEscapedCssIdentifier(String... matches) {
int start = pos;
while (!isEmpty() && (matchesWord() || matchesAny('-', '_')))
pos++;
boolean escaped = false;
while (!isEmpty()) {
if (queue.charAt(pos) == ESC && remainingLength() >1 ) {
escaped = true;
pos+=2; // skip the escape and the escaped
} else if (matchesWord() || matchesAny(matches)) {
pos++;
} else {
break;
}
}

return queue.substring(start, pos);
String consumed = queue.substring(start, pos);
return escaped ? unescape(consumed) : consumed;
}

/**
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/jsoup/select/Selector.java
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@
* <tr><td><code>:empty</code></td><td>elements that have no children at all</td><td></td></tr>
* </table>
*
* <p>A word on using regular expressions in these selectors: depending on the content of the regex, you will need to quote the pattern using <b><code>Pattern.quote("regex")</code></b> for it to parse correclty through both the selector parser and the regex parser. E.g. <code>String query = "div:matches(" + Pattern.quote(regex) + ");"</code>.</p>
* <p>A word on using regular expressions in these selectors: depending on the content of the regex, you will need to quote the pattern using <b><code>Pattern.quote("regex")</code></b> for it to parse correctly through both the selector parser and the regex parser. E.g. <code>String query = "div:matches(" + Pattern.quote(regex) + ");"</code>.</p>
* <p><b>Escaping special characters:</b> to match a tag, ID, or other selector that does not follow the regular CSS syntax, the query must be escaped with the <code>\</code> character. For example, to match by ID {@code <p id="i.d">}, use {@code document.select("#i\\.d")}.</p>
*
* @author Jonathan Hedley, jonathan@hedley.net
* @see Element#select(String)
*/
public class Selector {
Expand Down
29 changes: 27 additions & 2 deletions src/test/java/org/jsoup/parser/TokenQueueTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@

import java.util.regex.Pattern;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;
import static org.junit.jupiter.api.Assertions.*;

/**
* Token queue tests.
Expand Down Expand Up @@ -106,4 +105,30 @@ public void testQuotedPattern() {
assertEquals("\n( foo2",doc.select("div:matches(" + Pattern.quote("(") + ")").get(0).childNode(0).toString());
assertEquals("\n1) foo3",doc.select("div:matches(" + Pattern.quote("1)") + ")").get(0).childNode(0).toString());
}

@Test public void consumeEscapedTag() {
    // Four space-separated element selectors, each containing one backslash-escaped character.
    TokenQueue queue = new TokenQueue("p\\\\p p\\.p p\\:p p\\!p");
    // The unescaped tag names expected back, in queue order.
    String[] expectedTags = {"p\\p", "p.p", "p:p", "p!p"};

    for (int i = 0; i < expectedTags.length; i++) {
        assertEquals(expectedTags[i], queue.consumeElementSelector());
        // every selector except the last is followed by separating whitespace
        if (i < expectedTags.length - 1) {
            assertTrue(queue.consumeWhitespace());
        }
    }

    // nothing may be left over once the last selector is consumed
    assertTrue(queue.isEmpty());
}

@Test public void consumeEscapedId() {
    // Two space-separated identifiers: one with an escaped dot, one with an escaped backslash.
    TokenQueue queue = new TokenQueue("i\\.d i\\\\d");
    // The unescaped identifiers expected back, in queue order.
    String[] expectedIds = {"i.d", "i\\d"};

    for (int i = 0; i < expectedIds.length; i++) {
        assertEquals(expectedIds[i], queue.consumeCssIdentifier());
        // only the first identifier is followed by separating whitespace
        if (i < expectedIds.length - 1) {
            assertTrue(queue.consumeWhitespace());
        }
    }

    // the queue must be fully consumed after the last identifier
    assertTrue(queue.isEmpty());
}
}
31 changes: 31 additions & 0 deletions src/test/java/org/jsoup/select/SelectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,17 @@ public class SelectorTest {
assertEquals(0, none.size());
}

@Test public void byEscapedTag() {
    // tested same result as js document.querySelector
    // Tag names here contain a '.' and a '\', so the queries escape those characters.
    Document doc = Jsoup.parse("<p.p>One</p.p> <p\\p>Two</p\\p>");

    assertEquals("One", doc.expectFirst("p\\.p").text());
    assertEquals("Two", doc.expectFirst("p\\\\p").text());
}

@Test public void testById() {
Elements els = Jsoup.parse("<div><p id=foo>Hello</p><p id=foo>Foo two!</p></div>").select("#foo");
assertEquals(2, els.size());
Expand All @@ -40,6 +51,19 @@ public class SelectorTest {
assertEquals(0, none.size());
}

@Test public void byEscapedId() {
    // IDs containing '.', '\' and '/' are selected by backslash-escaping those characters in the query.
    Document doc = Jsoup.parse("<p id='i.d'>One</p> <p id='i\\d'>Two</p> <p id='one-two/three'>Three</p>");

    assertEquals("One", doc.expectFirst("#i\\.d").text());
    assertEquals("Two", doc.expectFirst("#i\\\\d").text());
    // escaped ID combined with a tag-name prefix
    assertEquals("Three", doc.expectFirst("p#one-two\\/three").text());
}

@Test public void testByClass() {
Elements els = Jsoup.parse("<p id=0 class='ONE two'><p id=1 class='one'><p id=2 class='two'>").select("P.One");
assertEquals(2, els.size());
Expand All @@ -53,6 +77,13 @@ public class SelectorTest {
assertEquals(1, els2.size());
}

@Test public void byEscapedClass() {
    // The class name contains '.' and '#'; both are backslash-escaped in the query
    // so that they are matched as part of the class name.
    Element parsed = Jsoup.parse("<p class='one.two#three'>One</p>");

    assertEquals("One", parsed.expectFirst("p.one\\.two\\#three").text());
}

@Test public void testByClassCaseInsensitive() {
String html = "<p Class=foo>One <p Class=Foo>Two <p class=FOO>Three <p class=farp>Four";
Elements elsFromClass = Jsoup.parse(html).select("P.Foo");
Expand Down

0 comments on commit e61f688

Please sign in to comment.