From 28ac09dc3582a500ce847f0c2b06bb9213eb2298 Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Tue, 21 Jun 2022 21:07:47 +1200
Subject: [PATCH] Add the `data-pagefind-body` selector

---
 pagefind/features/exclusions.feature | 44 +++++++++++++++++
 pagefind/src/fossick/mod.rs          |  2 +
 pagefind/src/fossick/parser.rs       | 71 +++++++++++++++++++++++++---
 pagefind/src/lib.rs                  | 22 +++++++--
 4 files changed, 129 insertions(+), 10 deletions(-)
diff --git a/pagefind/features/exclusions.feature b/pagefind/features/exclusions.feature
index 3eafd169..c887c8df 100644
--- a/pagefind/features/exclusions.feature
+++ b/pagefind/features/exclusions.feature
@@ -75,3 +75,47 @@ Feature: Exclusions
         Then There should be no logs
         Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Hooray!"
         Then The selector "[data-search-two]" should contain "0 result(s)"
+
+    Scenario: Indexing can be limited to a given element
+        Given I have a "public/index.html" file with the body:
+            """
+            <p data-search-one>Nothing</p>
+            <p data-search-two>Nothing</p>
+            """
+        Given I have a "public/cat/index.html" file with the body:
+            """
+            <h1>Outer Content</h1>
+            <div data-pagefind-body>
+                <p>Hello World, from Pagefind</p>
+                <p>Huzzah!</p>
+            </div>
+            <p>goodbye content</p>
+            <p data-pagefind-body>Little extra body</p>
+            """
+        # Because the page above uses data-pagefind-body, any page
+        # without it (such as the one below) should not be indexed.
+        Given I have a "public/dog/index.html" file with the body:
+            """
+            <h1>No selector</h1>
+            <p>goodbye content</p>
+            """
+        When I run my program
+        Then I should see "Running Pagefind" in stdout
+        When I serve the "public" directory
+        When I load "/"
+        When I evaluate:
+            """
+            async function() {
+                let pagefind = await import("/_pagefind/pagefind.js");
+
+                let searchone = await pagefind.search("hello");
+                let searchonedata = await searchone.results[0].data();
+                document.querySelector('[data-search-one]').innerText = searchonedata.content;
+
+                let searchtwo = await pagefind.search("goodbye");
+                document.querySelector('[data-search-two]').innerText = `${searchtwo.results.length} result(s)`;
+            }
+            """
+        Then There should be no logs
+        Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Huzzah! Little extra body."
+        Then The selector "[data-search-two]" should contain "0 result(s)"
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index 8563b7ef..dcaff15b 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -21,6 +21,7 @@ pub struct FossickedData {
     pub file_path: PathBuf,
     pub fragment: PageFragment,
     pub word_data: HashMap<String, Vec<u32>>,
+    pub has_custom_body: bool,
 }
 
 #[derive(Debug)]
@@ -109,6 +110,7 @@ impl Fossicker {
 
         Ok(FossickedData {
             file_path: self.file_path.clone(),
+            has_custom_body: data.has_custom_body,
             fragment: PageFragment {
                 hash,
                 page_number: 0,
diff --git a/pagefind/src/fossick/parser.rs b/pagefind/src/fossick/parser.rs
index bfa1cbae..7d2857c5 100644
--- a/pagefind/src/fossick/parser.rs
+++ b/pagefind/src/fossick/parser.rs
@@ -47,6 +47,22 @@ struct DomParserData {
     meta: HashMap<String, String>,
 }
 
+#[derive(Debug, PartialEq)]
+enum NodeStatus {
+    Indexing, // Default: this node's text is merged into its parent's content.
+    Ignored, // `data-pagefind-ignore` or a REMOVE_SELECTORS tag: node and children are dropped.
+    Body, // Element carries `data-pagefind-body` (takes precedence over the ignore checks).
+    // There was a body element below us,
+    // so our content should be ignored.
+    ParentOfBody,
+}
+
+impl Default for NodeStatus {
+    fn default() -> Self {
+        Self::Indexing // Nodes are indexed unless a handler assigns another status.
+    }
+}
+
 // A single HTML element that we're reading into.
 // Contains a reference to the parent element,
 // and since we collapse this tree upwards while we parse,
@@ -57,7 +73,7 @@ struct DomParsingNode {
     parent: Option<Rc<RefCell<DomParsingNode>>>,
     filter: Option<String>,
     meta: Option<String>,
-    ignore: bool,
+    status: NodeStatus,
 }
 
 /// The search-relevant data that was retrieved from the given input
@@ -66,6 +82,7 @@ pub struct DomParserResult {
     pub digest: String,
     pub filters: HashMap<String, Vec<String>>,
     pub meta: HashMap<String, String>,
+    pub has_custom_body: bool,
 }
 
 // Some shorthand to clean up our use of Rc<RefCell<*>> in the lol_html macros
@@ -88,13 +105,22 @@ impl<'a> DomParser<'a> {
                 element_content_handlers: vec![
                     enclose! { (data) element!("html *", move |el| {
                         let should_ignore_el = el.has_attribute("data-pagefind-ignore") || REMOVE_SELECTORS.contains(&el.tag_name().as_str());
+                        let treat_as_body = el.has_attribute("data-pagefind-body");
                         let filter = el.get_attribute("data-pagefind-filter").map(|attr| parse_attr_string(attr, el));
                         let meta = el.get_attribute("data-pagefind-meta").map(|attr| parse_attr_string(attr, el));
                         let tag_name = el.tag_name();
 
+                        let status = if treat_as_body {
+                            NodeStatus::Body
+                        } else if should_ignore_el {
+                            NodeStatus::Ignored
+                        } else {
+                            NodeStatus::Indexing
+                        };
+
                         let node = Rc::new(RefCell::new(DomParsingNode{
                             parent: Some(Rc::clone(&data.borrow().current_node)),
-                            ignore: should_ignore_el,
+                            status,
                             filter,
                             meta,
                             ..DomParsingNode::default()
@@ -137,7 +163,7 @@ impl<'a> DomParser<'a> {
 
                             // If we bail out now, the content won't be persisted anywhere
                             // and the node + children will be dropped.
-                            if node.ignore {
+                            if node.status == NodeStatus::Ignored {
                                 return Ok(());
                             }
 
@@ -169,7 +195,27 @@ impl<'a> DomParser<'a> {
                             // and the order of tree traversal will mean that it
                             // is inserted in the correct position in the parent's content.
                             let mut parent = data.current_node.borrow_mut();
-                            parent.current_value.push_str(&node.current_value);
+
+                            // Once the parent is a ParentOfBody, only a direct Body child may append to it.
+                            // NOTE(review): a later ParentOfBody child is skipped here too - confirm intended.
+                            if node.status != NodeStatus::Body && parent.status == NodeStatus::ParentOfBody {
+                                return Ok(());
+                            }
+                            match node.status {
+                                NodeStatus::Ignored => {},
+                                NodeStatus::Indexing => {
+                                    parent.current_value.push_str(&node.current_value);
+                                },
+                                NodeStatus::Body | NodeStatus::ParentOfBody => {
+                                    // If our parent is already a parent of a body, then
+                                    // we're probably a subsequent body. Avoid clearing it out.
+                                    if parent.status != NodeStatus::ParentOfBody {
+                                        parent.current_value.clear();
+                                    }
+                                    parent.current_value.push_str(&node.current_value);
+                                    parent.status = NodeStatus::ParentOfBody;
+                                }
+                            };
 
                             Ok(())
                         }});
@@ -240,8 +286,20 @@ impl<'a> DomParser<'a> {
         while node.borrow().parent.is_some() {
             {
                 let node = node.borrow();
-                let mut parent_node = node.parent.as_ref().unwrap().borrow_mut();
-                parent_node.current_value.push_str(&node.current_value);
+                let mut parent = node.parent.as_ref().unwrap().borrow_mut();
+                if parent.status != NodeStatus::ParentOfBody {
+                    match node.status {
+                        NodeStatus::Ignored => {}
+                        NodeStatus::Indexing => {
+                            parent.current_value.push_str(&node.current_value);
+                        }
+                        NodeStatus::Body | NodeStatus::ParentOfBody => {
+                            parent.current_value.clear();
+                            parent.current_value.push_str(&node.current_value);
+                            parent.status = NodeStatus::ParentOfBody;
+                        }
+                    };
+                }
             }
             let old_node = node.borrow();
             let new_node = Rc::clone(old_node.parent.as_ref().unwrap());
@@ -254,6 +312,7 @@ impl<'a> DomParser<'a> {
             digest: normalize_content(&node.current_value),
             filters: data.filters,
             meta: data.meta,
+            has_custom_body: node.status == NodeStatus::ParentOfBody,
         }
     }
 }
diff --git a/pagefind/src/lib.rs b/pagefind/src/lib.rs
index e2ba0a58..aa36a2cb 100644
--- a/pagefind/src/lib.rs
+++ b/pagefind/src/lib.rs
@@ -50,10 +50,24 @@ impl SearchState {
             .map(|f| f.fossick(&self.options))
             .collect();
         let all_pages = join_all(results).await;
-        let pages_with_data = all_pages
-            .into_iter()
-            .flatten()
-            .filter(|d| !d.word_data.is_empty());
+
+        let used_custom_body = all_pages.iter().flatten().any(|page| page.has_custom_body);
+        if used_custom_body {
+            println!(
+                "Found a data-pagefind-body element on the site.\n↳ Ignoring pages without this tag."
+            );
+        } else {
+            println!(
+                "Did not find a data-pagefind-body element on the site.\n↳ Indexing all <body> elements on the site."
+            );
+        }
+
+        let pages_with_data = all_pages.into_iter().flatten().filter(|d| {
+            if used_custom_body && !d.has_custom_body {
+                return false;
+            }
+            !d.word_data.is_empty()
+        });
 
         let indexes = build_indexes(pages_with_data, &self.options).await;
         indexes.write_files(&self.options).await;