From 28ac09dc3582a500ce847f0c2b06bb9213eb2298 Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Tue, 21 Jun 2022 21:07:47 +1200 Subject: [PATCH] Add the `data-pagefind-body` selector --- pagefind/features/exclusions.feature | 44 +++++++++++++++++ pagefind/src/fossick/mod.rs | 2 + pagefind/src/fossick/parser.rs | 71 +++++++++++++++++++++++++--- pagefind/src/lib.rs | 22 +++++++-- 4 files changed, 129 insertions(+), 10 deletions(-) diff --git a/pagefind/features/exclusions.feature b/pagefind/features/exclusions.feature index 3eafd169..c887c8df 100644 --- a/pagefind/features/exclusions.feature +++ b/pagefind/features/exclusions.feature @@ -75,3 +75,47 @@ Feature: Exclusions Then There should be no logs Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Hooray!" Then The selector "[data-search-two]" should contain "0 result(s)" + + Scenario: Indexing can be limited to a given element + Given I have a "public/index.html" file with the body: + """ +

<p data-search-one>Nothing</p>

+

<p data-search-two>Nothing</p>

+ """ + Given I have a "public/cat/index.html" file with the body: + """ +

<h1>Outer Content</h1>

+ <div data-pagefind-body>
+ <p>Hello World, from Pagefind</p>
+ <p>Huzzah!</p>
+ </div>
+

<p>goodbye content</p>

+

<p data-pagefind-body>Little extra body</p>

+ """ + # The above data-pagefind-body existing on a page should + # exclude all pages that do not include it. + Given I have a "public/dog/index.html" file with the body: + """ +

<h1>No selector</h1>

+

<p>goodbye content</p>

+ """ + When I run my program + Then I should see "Running Pagefind" in stdout + When I serve the "public" directory + When I load "/" + When I evaluate: + """ + async function() { + let pagefind = await import("/_pagefind/pagefind.js"); + + let searchone = await pagefind.search("hello"); + let searchonedata = await searchone.results[0].data(); + document.querySelector('[data-search-one]').innerText = searchonedata.content; + + let searchtwo = await pagefind.search("goodbye"); + document.querySelector('[data-search-two]').innerText = `${searchtwo.results.length} result(s)`; + } + """ + Then There should be no logs + Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Huzzah! Little extra body." + Then The selector "[data-search-two]" should contain "0 result(s)" diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index 8563b7ef..dcaff15b 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -21,6 +21,7 @@ pub struct FossickedData { pub file_path: PathBuf, pub fragment: PageFragment, pub word_data: HashMap>, + pub has_custom_body: bool, } #[derive(Debug)] @@ -109,6 +110,7 @@ impl Fossicker { Ok(FossickedData { file_path: self.file_path.clone(), + has_custom_body: data.has_custom_body, fragment: PageFragment { hash, page_number: 0, diff --git a/pagefind/src/fossick/parser.rs b/pagefind/src/fossick/parser.rs index bfa1cbae..7d2857c5 100644 --- a/pagefind/src/fossick/parser.rs +++ b/pagefind/src/fossick/parser.rs @@ -47,6 +47,22 @@ struct DomParserData { meta: HashMap, } +#[derive(Debug, PartialEq)] +enum NodeStatus { + Indexing, + Ignored, + Body, + // There was a body element below us, + // so our content should be ignored. + ParentOfBody, +} + +impl Default for NodeStatus { + fn default() -> Self { + Self::Indexing + } +} + // A single HTML element that we're reading into. 
// Contains a reference to the parent element, // and since we collapse this tree upwards while we parse, @@ -57,7 +73,7 @@ struct DomParsingNode { parent: Option>>, filter: Option, meta: Option, - ignore: bool, + status: NodeStatus, } /// The search-relevant data that was retrieved from the given input @@ -66,6 +82,7 @@ pub struct DomParserResult { pub digest: String, pub filters: HashMap>, pub meta: HashMap, + pub has_custom_body: bool, } // Some shorthand to clean up our use of Rc> in the lol_html macros @@ -88,13 +105,22 @@ impl<'a> DomParser<'a> { element_content_handlers: vec![ enclose! { (data) element!("html *", move |el| { let should_ignore_el = el.has_attribute("data-pagefind-ignore") || REMOVE_SELECTORS.contains(&el.tag_name().as_str()); + let treat_as_body = el.has_attribute("data-pagefind-body"); let filter = el.get_attribute("data-pagefind-filter").map(|attr| parse_attr_string(attr, el)); let meta = el.get_attribute("data-pagefind-meta").map(|attr| parse_attr_string(attr, el)); let tag_name = el.tag_name(); + let status = if treat_as_body { + NodeStatus::Body + } else if should_ignore_el { + NodeStatus::Ignored + } else { + NodeStatus::Indexing + }; + let node = Rc::new(RefCell::new(DomParsingNode{ parent: Some(Rc::clone(&data.borrow().current_node)), - ignore: should_ignore_el, + status, filter, meta, ..DomParsingNode::default() @@ -137,7 +163,7 @@ impl<'a> DomParser<'a> { // If we bail out now, the content won't be persisted anywhere // and the node + children will be dropped. - if node.ignore { + if node.status == NodeStatus::Ignored { return Ok(()); } @@ -169,7 +195,27 @@ impl<'a> DomParser<'a> { // and the order of tree traversal will mean that it // is inserted in the correct position in the parent's content. let mut parent = data.current_node.borrow_mut(); - parent.current_value.push_str(&node.current_value); + + // If the parent is a parent of a body, we don't want to append + // any more content to it. 
(Unless, of course, we are another body) + if node.status != NodeStatus::Body && parent.status == NodeStatus::ParentOfBody { + return Ok(()); + } + match node.status { + NodeStatus::Ignored => {}, + NodeStatus::Indexing => { + parent.current_value.push_str(&node.current_value); + }, + NodeStatus::Body | NodeStatus::ParentOfBody => { + // If our parent is already a parent of a body, then + // we're probably a subsequent body. Avoid clearing it out. + if parent.status != NodeStatus::ParentOfBody { + parent.current_value.clear(); + } + parent.current_value.push_str(&node.current_value); + parent.status = NodeStatus::ParentOfBody; + } + }; Ok(()) }}); @@ -240,8 +286,20 @@ impl<'a> DomParser<'a> { while node.borrow().parent.is_some() { { let node = node.borrow(); - let mut parent_node = node.parent.as_ref().unwrap().borrow_mut(); - parent_node.current_value.push_str(&node.current_value); + let mut parent = node.parent.as_ref().unwrap().borrow_mut(); + if parent.status != NodeStatus::ParentOfBody { + match node.status { + NodeStatus::Ignored => {} + NodeStatus::Indexing => { + parent.current_value.push_str(&node.current_value); + } + NodeStatus::Body | NodeStatus::ParentOfBody => { + parent.current_value.clear(); + parent.current_value.push_str(&node.current_value); + parent.status = NodeStatus::ParentOfBody; + } + }; + } } let old_node = node.borrow(); let new_node = Rc::clone(old_node.parent.as_ref().unwrap()); @@ -254,6 +312,7 @@ impl<'a> DomParser<'a> { digest: normalize_content(&node.current_value), filters: data.filters, meta: data.meta, + has_custom_body: node.status == NodeStatus::ParentOfBody, } } } diff --git a/pagefind/src/lib.rs b/pagefind/src/lib.rs index e2ba0a58..aa36a2cb 100644 --- a/pagefind/src/lib.rs +++ b/pagefind/src/lib.rs @@ -50,10 +50,24 @@ impl SearchState { .map(|f| f.fossick(&self.options)) .collect(); let all_pages = join_all(results).await; - let pages_with_data = all_pages - .into_iter() - .flatten() - .filter(|d| !d.word_data.is_empty()); + 
+ let used_custom_body = all_pages.iter().flatten().any(|page| page.has_custom_body); + if used_custom_body { + println!( + "Found a data-pagefind-body element on the site.\n↳ Ignoring pages without this tag." + ); + } else { + println!( + "Did not find a data-pagefind-body element on the site.\n↳ Indexing all elements on the site." + ); + } + + let pages_with_data = all_pages.into_iter().flatten().filter(|d| { + if used_custom_body && !d.has_custom_body { + return false; + } + !d.word_data.is_empty() + }); let indexes = build_indexes(pages_with_data, &self.options).await; indexes.write_files(&self.options).await;