diff --git a/pagefind/features/exclusions.feature b/pagefind/features/exclusions.feature
index 3eafd169..c887c8df 100644
--- a/pagefind/features/exclusions.feature
+++ b/pagefind/features/exclusions.feature
@@ -75,3 +75,47 @@ Feature: Exclusions
Then There should be no logs
Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Hooray!"
Then The selector "[data-search-two]" should contain "0 result(s)"
+
+ Scenario: Indexing can be limited to a given element
+ Given I have a "public/index.html" file with the body:
+ """
+ <p>Nothing</p>
+ <p>Nothing</p>
+ """
+ Given I have a "public/cat/index.html" file with the body:
+ """
+ <h1>Outer Content</h1>
+ <div data-pagefind-body>
+ <p>Hello World, from Pagefind</p>
+ <p>Huzzah!</p>
+ </div>
+ <p>goodbye content</p>
+ <p data-pagefind-body>Little extra body</p>
+ """
+ # The above data-pagefind-body existing on a page should
+ # exclude all pages that do not include it.
+ Given I have a "public/dog/index.html" file with the body:
+ """
+ No selector
+ goodbye content
+ """
+ When I run my program
+ Then I should see "Running Pagefind" in stdout
+ When I serve the "public" directory
+ When I load "/"
+ When I evaluate:
+ """
+ async function() {
+ let pagefind = await import("/_pagefind/pagefind.js");
+
+ let searchone = await pagefind.search("hello");
+ let searchonedata = await searchone.results[0].data();
+ document.querySelector('[data-search-one]').innerText = searchonedata.content;
+
+ let searchtwo = await pagefind.search("goodbye");
+ document.querySelector('[data-search-two]').innerText = `${searchtwo.results.length} result(s)`;
+ }
+ """
+ Then There should be no logs
+ Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Huzzah! Little extra body."
+ Then The selector "[data-search-two]" should contain "0 result(s)"
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index 8563b7ef..dcaff15b 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -21,6 +21,7 @@ pub struct FossickedData {
pub file_path: PathBuf,
pub fragment: PageFragment,
pub word_data: HashMap<String, Vec<u32>>,
+ pub has_custom_body: bool,
}
#[derive(Debug)]
@@ -109,6 +110,7 @@ impl Fossicker {
Ok(FossickedData {
file_path: self.file_path.clone(),
+ has_custom_body: data.has_custom_body,
fragment: PageFragment {
hash,
page_number: 0,
diff --git a/pagefind/src/fossick/parser.rs b/pagefind/src/fossick/parser.rs
index bfa1cbae..7d2857c5 100644
--- a/pagefind/src/fossick/parser.rs
+++ b/pagefind/src/fossick/parser.rs
@@ -47,6 +47,22 @@ struct DomParserData {
meta: HashMap<String, String>,
}
+#[derive(Debug, PartialEq)]
+enum NodeStatus {
+ Indexing,
+ Ignored,
+ Body,
+ // There was a body element below us,
+ // so our content should be ignored.
+ ParentOfBody,
+}
+
+impl Default for NodeStatus {
+ fn default() -> Self {
+ Self::Indexing
+ }
+}
+
// A single HTML element that we're reading into.
// Contains a reference to the parent element,
// and since we collapse this tree upwards while we parse,
@@ -57,7 +73,7 @@ struct DomParsingNode {
parent: Option<Rc<RefCell<DomParsingNode>>>,
filter: Option<String>,
meta: Option<String>,
- ignore: bool,
+ status: NodeStatus,
}
/// The search-relevant data that was retrieved from the given input
@@ -66,6 +82,7 @@ pub struct DomParserResult {
pub digest: String,
pub filters: HashMap<String, Vec<String>>,
pub meta: HashMap<String, String>,
+ pub has_custom_body: bool,
}
// Some shorthand to clean up our use of Rc<RefCell<T>> in the lol_html macros
@@ -88,13 +105,22 @@ impl<'a> DomParser<'a> {
element_content_handlers: vec![
enclose! { (data) element!("html *", move |el| {
let should_ignore_el = el.has_attribute("data-pagefind-ignore") || REMOVE_SELECTORS.contains(&el.tag_name().as_str());
+ let treat_as_body = el.has_attribute("data-pagefind-body");
let filter = el.get_attribute("data-pagefind-filter").map(|attr| parse_attr_string(attr, el));
let meta = el.get_attribute("data-pagefind-meta").map(|attr| parse_attr_string(attr, el));
let tag_name = el.tag_name();
+ let status = if treat_as_body {
+ NodeStatus::Body
+ } else if should_ignore_el {
+ NodeStatus::Ignored
+ } else {
+ NodeStatus::Indexing
+ };
+
let node = Rc::new(RefCell::new(DomParsingNode{
parent: Some(Rc::clone(&data.borrow().current_node)),
- ignore: should_ignore_el,
+ status,
filter,
meta,
..DomParsingNode::default()
@@ -137,7 +163,7 @@ impl<'a> DomParser<'a> {
// If we bail out now, the content won't be persisted anywhere
// and the node + children will be dropped.
- if node.ignore {
+ if node.status == NodeStatus::Ignored {
return Ok(());
}
@@ -169,7 +195,27 @@ impl<'a> DomParser<'a> {
// and the order of tree traversal will mean that it
// is inserted in the correct position in the parent's content.
let mut parent = data.current_node.borrow_mut();
- parent.current_value.push_str(&node.current_value);
+
+ // If the parent is a parent of a body, we don't want to append
+ // any more content to it. (Unless, of course, we are another body)
+ if node.status != NodeStatus::Body && parent.status == NodeStatus::ParentOfBody {
+ return Ok(());
+ }
+ match node.status {
+ NodeStatus::Ignored => {},
+ NodeStatus::Indexing => {
+ parent.current_value.push_str(&node.current_value);
+ },
+ NodeStatus::Body | NodeStatus::ParentOfBody => {
+ // If our parent is already a parent of a body, then
+ // we're probably a subsequent body. Avoid clearing it out.
+ if parent.status != NodeStatus::ParentOfBody {
+ parent.current_value.clear();
+ }
+ parent.current_value.push_str(&node.current_value);
+ parent.status = NodeStatus::ParentOfBody;
+ }
+ };
Ok(())
}});
@@ -240,8 +286,20 @@ impl<'a> DomParser<'a> {
while node.borrow().parent.is_some() {
{
let node = node.borrow();
- let mut parent_node = node.parent.as_ref().unwrap().borrow_mut();
- parent_node.current_value.push_str(&node.current_value);
+ let mut parent = node.parent.as_ref().unwrap().borrow_mut();
+ if parent.status != NodeStatus::ParentOfBody {
+ match node.status {
+ NodeStatus::Ignored => {}
+ NodeStatus::Indexing => {
+ parent.current_value.push_str(&node.current_value);
+ }
+ NodeStatus::Body | NodeStatus::ParentOfBody => {
+ parent.current_value.clear();
+ parent.current_value.push_str(&node.current_value);
+ parent.status = NodeStatus::ParentOfBody;
+ }
+ };
+ }
}
let old_node = node.borrow();
let new_node = Rc::clone(old_node.parent.as_ref().unwrap());
@@ -254,6 +312,7 @@ impl<'a> DomParser<'a> {
digest: normalize_content(&node.current_value),
filters: data.filters,
meta: data.meta,
+ has_custom_body: node.status == NodeStatus::ParentOfBody,
}
}
}
diff --git a/pagefind/src/lib.rs b/pagefind/src/lib.rs
index e2ba0a58..aa36a2cb 100644
--- a/pagefind/src/lib.rs
+++ b/pagefind/src/lib.rs
@@ -50,10 +50,24 @@ impl SearchState {
.map(|f| f.fossick(&self.options))
.collect();
let all_pages = join_all(results).await;
- let pages_with_data = all_pages
- .into_iter()
- .flatten()
- .filter(|d| !d.word_data.is_empty());
+
+ let used_custom_body = all_pages.iter().flatten().any(|page| page.has_custom_body);
+ if used_custom_body {
+ println!(
+ "Found a data-pagefind-body element on the site.\n↳ Ignoring pages without this tag."
+ );
+ } else {
+ println!(
+ "Did not find a data-pagefind-body element on the site.\n↳ Indexing all elements on the site."
+ );
+ }
+
+ let pages_with_data = all_pages.into_iter().flatten().filter(|d| {
+ if used_custom_body && !d.has_custom_body {
+ return false;
+ }
+ !d.word_data.is_empty()
+ });
let indexes = build_indexes(pages_with_data, &self.options).await;
indexes.write_files(&self.options).await;