From 9aef0721bb4e65f8364540152923fff8a8c8e0fa Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Tue, 24 Oct 2023 15:36:50 +1100 Subject: [PATCH 1/2] Extend Elements to update the backing DOM on set(), remove(), et al --- CHANGES | 5 + src/main/java/org/jsoup/select/Elements.java | 128 +++++++++++++- .../java/org/jsoup/select/ElementsTest.java | 164 ++++++++++++++++++ 3 files changed, 294 insertions(+), 3 deletions(-) diff --git a/CHANGES b/CHANGES index 8f7af76635..faabc93eea 100644 --- a/CHANGES +++ b/CHANGES @@ -1,5 +1,10 @@ jsoup changelog +Release 1.17.1 [PENDING] + * Improvement: in the Elements list, added direct support for `#set(index, element)`, `#remove(index)`, + `#remove(object)`, `#clear()`, `#removeAll(collection)`, `#retainAll(collection)`, `#removeIf(filter)`, + `#replaceAll(operator)`. These methods update the original DOM, as well as the Elements list. + Release 1.16.2 [20-Oct-2023] * Improvement: optimized the performance of complex CSS selectors, by adding a cost-based query planner. Evaluators are sorted by their relative execution cost, and executed in order of lower to higher cost. This speeds the diff --git a/src/main/java/org/jsoup/select/Elements.java b/src/main/java/org/jsoup/select/Elements.java index 31838e1f89..a64f322b41 100644 --- a/src/main/java/org/jsoup/select/Elements.java +++ b/src/main/java/org/jsoup/select/Elements.java @@ -14,14 +14,17 @@ import java.util.Arrays; import java.util.Collection; import java.util.HashSet; +import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; +import java.util.function.Predicate; +import java.util.function.UnaryOperator; /** A list of {@link Element}s, with methods that act on every element in the list. -

- To get an {@code Elements} object, use the {@link Element#select(String)} method. -

+

To get an {@code Elements} object, use the {@link Element#select(String)} method.

+

Methods that {@link #set(int, Element) set}, {@link #remove(int) remove}, or {@link #replaceAll(UnaryOperator) + replace} Elements in the list will also act on the underlying {@link org.jsoup.nodes.Document DOM}.

@author Jonathan Hedley, jonathan@hedley.net */ public class Elements extends ArrayList { @@ -431,6 +434,7 @@ public Elements empty() { /** * Remove each matched element from the DOM. This is similar to setting the outer HTML of each element to nothing. + *

The elements will still be retained in this list, in case further processing of them is desired.

*

* E.g. HTML: {@code

Hello

there

}
* doc.select("p").remove();
@@ -440,6 +444,7 @@ public Elements empty() { * @return this, for chaining * @see Element#empty() * @see #empty() + * @see #clear() */ public Elements remove() { for (Element element : this) { @@ -683,4 +688,121 @@ private List childNodesOfType(Class tClass) { return nodes; } + // list methods that update the DOM: + + /** + Replace the Element at the specified index in this list, and in the DOM. + * @param index index of the element to replace + * @param element element to be stored at the specified position + * @return the old Element at this index + * @since 1.17.1 + */ + @Override public Element set(int index, Element element) { + Validate.notNull(element); + Element old = super.set(index, element); + old.replaceWith(element); + return old; + } + + /** + Remove the Element at the specified index in this list, and from the DOM. + * @param index the index of the element to be removed + * @return the old element at this index + * @since 1.17.1 + */ + @Override public Element remove(int index) { + Element old = super.remove(index); + old.remove(); + return old; + } + + /** + Remove the specified Element from this list, and from the DOM. + * @param o element to be removed from this list, if present + * @return {@code true} if this list contained the Element + * @since 1.17.1 + */ + @Override public boolean remove(Object o) { + int index = super.indexOf(o); + if (index == -1) { + return false; + } else { + remove(index); + return true; + } + } + + /** + Removes all the elements from this list, and each of them from the DOM. + * @since 1.17.1 + * @see #remove() + */ + @Override public void clear() { + remove(); + super.clear(); + } + + /** + Removes from this list, and from the DOM, each of the elements that are contained in the specified collection and + are in this list. 
+ * @param c collection containing elements to be removed from this list + * @return {@code true} if elements were removed from this list + * @since 1.17.1 + */ + @Override public boolean removeAll(Collection c) { + boolean anyRemoved = false; + for (Object o : c) { + anyRemoved |= this.remove(o); + } + return anyRemoved; + } + + /** + Retain in this list, and in the DOM, only the elements that are in the specified collection and are in this list. + In other words, remove from this list and the DOM any item that is in this list but not in the specified + collection. + * @param c collection containing elements to be retained in this list + * @return {@code true} if elements were removed from this list + * @since 1.17.1 + */ + @Override public boolean retainAll(Collection c) { + boolean anyRemoved = false; + for (Iterator it = this.iterator(); it.hasNext(); ) { + Element el = it.next(); + if (!c.contains(el)) { + it.remove(); + anyRemoved = true; + } + } + return anyRemoved; + } + + /** + Remove from the list, and from the DOM, all elements in this list that match the given filter. + * @param filter a predicate which returns {@code true} for elements to be removed + * @return {@code true} if elements were removed from this list + * @since 1.17.1 + */ + @Override public boolean removeIf(Predicate filter) { + boolean anyRemoved = false; + for (Iterator it = this.iterator(); it.hasNext(); ) { + Element el = it.next(); + if (filter.test(el)) { + it.remove(); + anyRemoved = true; + } + } + return anyRemoved; + } + + /** + Replace each element in this list with the result of the operator, and update the DOM. 
+ * @param operator the operator to apply to each element + * @since 1.17.1 + */ + @Override public void replaceAll(UnaryOperator operator) { + for (int i = 0; i < this.size(); i++) { + this.set(i, operator.apply(this.get(i))); + } + } } diff --git a/src/test/java/org/jsoup/select/ElementsTest.java b/src/test/java/org/jsoup/select/ElementsTest.java index d1895752e8..b5ea4ef358 100644 --- a/src/test/java/org/jsoup/select/ElementsTest.java +++ b/src/test/java/org/jsoup/select/ElementsTest.java @@ -11,9 +11,11 @@ import org.jsoup.nodes.TextNode; import org.junit.jupiter.api.Test; +import java.util.Iterator; import java.util.List; import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertFalse; /** Tests for ElementList. @@ -435,4 +437,166 @@ public void tail(Node node, int depth) { assertEquals("http://example.com/bar", absAttrs.get(1)); assertEquals("http://example.com", absAttrs.get(2)); } + + @Test public void setElementByIndex() { + Document doc = Jsoup.parse("

One

Two

Three"); + Element newP = doc.createElement("p").text("New").attr("id", "new"); + + Elements ps = doc.select("p"); + Element two = ps.get(1); + Element old = ps.set(1, newP); + assertSame(old, two); + assertSame(newP, ps.get(1)); // replaced in list + assertEquals("

One

\n

New

\n

Three

", doc.body().html()); // replaced in dom + } + + @Test public void removeElementByIndex() { + Document doc = Jsoup.parse("

One

Two

Three"); + + Elements ps = doc.select("p"); + Element two = ps.get(1); + assertTrue(ps.contains(two)); + Element old = ps.remove(1); + assertSame(old, two); + + assertEquals(2, ps.size()); // removed from list + assertFalse(ps.contains(old)); + assertEquals("

One

\n

Three

", doc.body().html()); // removed from dom + } + + @Test public void removeElementByObject() { + Document doc = Jsoup.parse("

One

Two

Three"); + + Elements ps = doc.select("p"); + Element two = ps.get(1); + assertTrue(ps.contains(two)); + boolean removed = ps.remove(two); + assertTrue(removed); + + assertEquals(2, ps.size()); // removed from list + assertFalse(ps.contains(two)); + assertEquals("

One

\n

Three

", doc.body().html()); // removed from dom + } + + @Test public void removeElementObjectNoops() { + Document doc = Jsoup.parse("

One

Two

Three"); + String origHtml = doc.html(); + Element newP = doc.createElement("p").text("New"); + + Elements ps = doc.select("p"); + int size = ps.size(); + assertFalse(ps.remove(newP)); + assertFalse(ps.remove(newP.childNodes())); + assertEquals(origHtml, doc.html()); + assertEquals(size, ps.size()); + } + + @Test public void clear() { + Document doc = Jsoup.parse("

One

Two

Three
"); + Elements ps = doc.select("p"); + assertEquals(2, ps.size()); + ps.clear(); + assertEquals(0, ps.size()); + + assertEquals(0, doc.select("p").size()); + } + + @Test public void removeAll() { + Document doc = Jsoup.parse("

One

Two

Three

Four

Div"); + Elements ps = doc.select("p"); + assertEquals(4, ps.size()); + Elements midPs = doc.select("p:gt(0):lt(3)"); //Two and Three + assertEquals(2, midPs.size()); + + boolean removed = ps.removeAll(midPs); + assertEquals(2, ps.size()); + assertTrue(removed); + assertEquals(2, midPs.size()); + + Elements divs = doc.select("div"); + assertEquals(1, divs.size()); + assertFalse(ps.removeAll(divs)); + assertEquals(2, ps.size()); + + assertEquals("

One

\n

Four

\n
\n Div\n
", doc.body().html()); + } + + @Test public void retainAll() { + Document doc = Jsoup.parse("

One

Two

Three

Four

Div"); + Elements ps = doc.select("p"); + assertEquals(4, ps.size()); + Elements midPs = doc.select("p:gt(0):lt(3)"); //Two and Three + assertEquals(2, midPs.size()); + + boolean removed = ps.retainAll(midPs); + assertEquals(2, ps.size()); + assertTrue(removed); + assertEquals(2, midPs.size()); + + assertEquals("

Two

\n

Three

\n
\n Div\n
", doc.body().html()); + + Elements psAgain = doc.select("p"); + assertFalse(midPs.retainAll(psAgain)); + + assertEquals("

Two

\n

Three

\n
\n Div\n
", doc.body().html()); + } + + @Test public void iteratorRemovesFromDom() { + Document doc = Jsoup.parse("

One

Two

Three

Four"); + Elements ps = doc.select("p"); + + assertEquals(4, ps.size()); + for (Iterator it = ps.iterator(); it.hasNext(); ) { + Element el = it.next(); + if (el.text().contains("Two")) + it.remove(); + } + assertEquals(3, ps.size()); + assertEquals("

One

\n

Three

\n

Four

", doc.body().html()); + } + + @Test public void removeIf() { + Document doc = Jsoup.parse("

One

Two

Three

Four"); + Elements ps = doc.select("p"); + + assertEquals(4, ps.size()); + boolean removed = ps.removeIf(el -> el.text().contains("Two")); + assertTrue(removed); + assertEquals(3, ps.size()); + assertEquals("

One

\n

Three

\n

Four

", doc.body().html()); + + assertFalse(ps.removeIf(el -> el.text().contains("Five"))); + assertEquals("

One

\n

Three

\n

Four

", doc.body().html()); + } + + @Test public void removeIfSupportsConcurrentRead() { + Document doc = Jsoup.parse("

One

Two

Three

Four"); + Elements ps = doc.select("p"); + assertEquals(4, ps.size()); + + boolean removed = ps.removeIf(el -> ps.contains(el)); + assertTrue(removed); + assertEquals(0, ps.size()); + assertEquals("", doc.body().html()); + } + + @Test public void replaceAll() { + Document doc = Jsoup.parse("

One

Two

Three

Four"); + Elements ps = doc.select("p"); + assertEquals(4, ps.size()); + + ps.replaceAll(el -> { + Element div = doc.createElement("div"); + div.text(el.text()); + return div; + }); + + // Check Elements + for (Element p : ps) { + assertEquals("div", p.tagName()); + } + + // check dom + assertEquals("

One
Two
Three
Four
", TextUtil.normalizeSpaces(doc.body().html())); + } } From 0248ab1e45bfd8943bc00d5c613366776b3e0561 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Tue, 24 Oct 2023 15:42:54 +1100 Subject: [PATCH 2/2] Allow Predicate and UnaryOperator --- pom.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pom.xml b/pom.xml index 04b5c7ed25..6a987b1a3e 100644 --- a/pom.xml +++ b/pom.xml @@ -82,6 +82,8 @@ java.util.function.Supplier java.lang.ThreadLocal java.io.UncheckedIOException + java.util.function.Predicate + java.util.function.UnaryOperator