Add the data-pagefind-body selector

bglw committed Jun 21, 2022
1 parent 85c4781 commit 28ac09d
Showing 4 changed files with 129 additions and 10 deletions.
44 changes: 44 additions & 0 deletions pagefind/features/exclusions.feature
@@ -75,3 +75,47 @@ Feature: Exclusions
Then There should be no logs
Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Hooray!"
Then The selector "[data-search-two]" should contain "0 result(s)"

Scenario: Indexing can be limited to a given element
Given I have a "public/index.html" file with the body:
"""
<p data-search-one>Nothing</p>
<p data-search-two>Nothing</p>
"""
Given I have a "public/cat/index.html" file with the body:
"""
<h1>Outer Content</h1>
<div data-pagefind-body>
<p>Hello World, from Pagefind</p>
<p>Huzzah!</p>
</div>
<p>goodbye content</p>
<p data-pagefind-body>Little extra body</p>
"""
# The above data-pagefind-body existing on a page should
# exclude all pages that do not include it.
Given I have a "public/dog/index.html" file with the body:
"""
<h1>No selector</h1>
<p>goodbye content</p>
"""
When I run my program
Then I should see "Running Pagefind" in stdout
When I serve the "public" directory
When I load "/"
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let searchone = await pagefind.search("hello");
let searchonedata = await searchone.results[0].data();
document.querySelector('[data-search-one]').innerText = searchonedata.content;
let searchtwo = await pagefind.search("goodbye");
document.querySelector('[data-search-two]').innerText = `${searchtwo.results.length} result(s)`;
}
"""
Then There should be no logs
Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Huzzah! Little extra body."
Then The selector "[data-search-two]" should contain "0 result(s)"
2 changes: 2 additions & 0 deletions pagefind/src/fossick/mod.rs
@@ -21,6 +21,7 @@ pub struct FossickedData {
pub file_path: PathBuf,
pub fragment: PageFragment,
pub word_data: HashMap<String, Vec<u32>>,
pub has_custom_body: bool,
}

#[derive(Debug)]
@@ -109,6 +110,7 @@ impl Fossicker {

Ok(FossickedData {
file_path: self.file_path.clone(),
has_custom_body: data.has_custom_body,
fragment: PageFragment {
hash,
page_number: 0,
71 changes: 65 additions & 6 deletions pagefind/src/fossick/parser.rs
@@ -47,6 +47,22 @@ struct DomParserData {
meta: HashMap<String, String>,
}

#[derive(Debug, PartialEq)]
enum NodeStatus {
// Content in this element is indexed as normal.
Indexing,
// Marked with data-pagefind-ignore (or an excluded tag),
// so this element's content is dropped.
Ignored,
// Marked with data-pagefind-body: this element's content
// forms the page's indexed body.
Body,
// There was a body element below us,
// so our content should be ignored.
ParentOfBody,
}

impl Default for NodeStatus {
fn default() -> Self {
Self::Indexing
}
}

// A single HTML element that we're reading into.
// Contains a reference to the parent element,
// and since we collapse this tree upwards while we parse,
@@ -57,7 +73,7 @@ struct DomParsingNode {
parent: Option<Rc<RefCell<DomParsingNode>>>,
filter: Option<String>,
meta: Option<String>,
ignore: bool,
status: NodeStatus,
}

/// The search-relevant data that was retrieved from the given input
@@ -66,6 +82,7 @@ pub struct DomParserResult {
pub digest: String,
pub filters: HashMap<String, Vec<String>>,
pub meta: HashMap<String, String>,
pub has_custom_body: bool,
}

// Some shorthand to clean up our use of Rc<RefCell<*>> in the lol_html macros
@@ -88,13 +105,22 @@ impl<'a> DomParser<'a> {
element_content_handlers: vec![
enclose! { (data) element!("html *", move |el| {
let should_ignore_el = el.has_attribute("data-pagefind-ignore") || REMOVE_SELECTORS.contains(&el.tag_name().as_str());
let treat_as_body = el.has_attribute("data-pagefind-body");
let filter = el.get_attribute("data-pagefind-filter").map(|attr| parse_attr_string(attr, el));
let meta = el.get_attribute("data-pagefind-meta").map(|attr| parse_attr_string(attr, el));
let tag_name = el.tag_name();

let status = if treat_as_body {
NodeStatus::Body
} else if should_ignore_el {
NodeStatus::Ignored
} else {
NodeStatus::Indexing
};

let node = Rc::new(RefCell::new(DomParsingNode{
parent: Some(Rc::clone(&data.borrow().current_node)),
ignore: should_ignore_el,
status,
filter,
meta,
..DomParsingNode::default()
@@ -137,7 +163,7 @@ impl<'a> DomParser<'a> {

// If we bail out now, the content won't be persisted anywhere
// and the node + children will be dropped.
if node.ignore {
if node.status == NodeStatus::Ignored {
return Ok(());
}

@@ -169,7 +195,27 @@ impl<'a> DomParser<'a> {
// and the order of tree traversal will mean that it
// is inserted in the correct position in the parent's content.
let mut parent = data.current_node.borrow_mut();
parent.current_value.push_str(&node.current_value);

// If the parent is a parent of a body, we don't want to append
// any more content to it. (Unless, of course, we are another body)
if node.status != NodeStatus::Body && parent.status == NodeStatus::ParentOfBody {
return Ok(());
}
match node.status {
NodeStatus::Ignored => {},
NodeStatus::Indexing => {
parent.current_value.push_str(&node.current_value);
},
NodeStatus::Body | NodeStatus::ParentOfBody => {
// If our parent is already a parent of a body, then
// we're probably a subsequent body. Avoid clearing it out.
if parent.status != NodeStatus::ParentOfBody {
parent.current_value.clear();
}
parent.current_value.push_str(&node.current_value);
parent.status = NodeStatus::ParentOfBody;
}
};

Ok(())
}});
@@ -240,8 +286,20 @@ impl<'a> DomParser<'a> {
while node.borrow().parent.is_some() {
{
let node = node.borrow();
let mut parent_node = node.parent.as_ref().unwrap().borrow_mut();
parent_node.current_value.push_str(&node.current_value);
let mut parent = node.parent.as_ref().unwrap().borrow_mut();
if parent.status != NodeStatus::ParentOfBody {
match node.status {
NodeStatus::Ignored => {}
NodeStatus::Indexing => {
parent.current_value.push_str(&node.current_value);
}
NodeStatus::Body | NodeStatus::ParentOfBody => {
parent.current_value.clear();
parent.current_value.push_str(&node.current_value);
parent.status = NodeStatus::ParentOfBody;
}
};
}
}
let old_node = node.borrow();
let new_node = Rc::clone(old_node.parent.as_ref().unwrap());
@@ -254,6 +312,7 @@ impl<'a> DomParser<'a> {
digest: normalize_content(&node.current_value),
filters: data.filters,
meta: data.meta,
has_custom_body: node.status == NodeStatus::ParentOfBody,
}
}
}
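The status handling above is easiest to follow in isolation. Below is a minimal, self-contained sketch (not the commit's code; the Node type and collapse function are hypothetical simplifications) of the rule the end-tag handler applies when folding a finished element into its parent: ignored content is dropped, normal content is appended, and a data-pagefind-body subtree replaces whatever the parent had collected and marks the parent so later non-body siblings are skipped, while further body elements still accumulate.

// Hypothetical, simplified model of the collapse step in parser.rs.
#[derive(PartialEq)]
enum NodeStatus {
    Indexing,
    Ignored,
    Body,
    ParentOfBody,
}

struct Node {
    status: NodeStatus,
    text: String,
}

// Fold a finished child element into its parent, mirroring the end-tag
// handler: Ignored drops its text, Indexing appends, and a Body subtree
// replaces the parent's text and marks the parent as ParentOfBody so
// later non-body siblings are skipped.
fn collapse(child: &Node, parent: &mut Node) {
    if child.status != NodeStatus::Body && parent.status == NodeStatus::ParentOfBody {
        return;
    }
    match child.status {
        NodeStatus::Ignored => {}
        NodeStatus::Indexing => parent.text.push_str(&child.text),
        NodeStatus::Body | NodeStatus::ParentOfBody => {
            if parent.status != NodeStatus::ParentOfBody {
                parent.text.clear();
            }
            parent.text.push_str(&child.text);
            parent.status = NodeStatus::ParentOfBody;
        }
    }
}

fn main() {
    let mut page = Node { status: NodeStatus::Indexing, text: "Outer Content".into() };
    let body = Node { status: NodeStatus::Body, text: "Hello World, from Pagefind Huzzah!".into() };
    let after = Node { status: NodeStatus::Indexing, text: " goodbye content".into() };
    let extra = Node { status: NodeStatus::Body, text: " Little extra body".into() };

    collapse(&body, &mut page);  // clears "Outer Content", keeps the body text
    collapse(&after, &mut page); // skipped: the page is now a ParentOfBody
    collapse(&extra, &mut page); // appended: subsequent bodies still accumulate

    assert_eq!(page.text, "Hello World, from Pagefind Huzzah! Little extra body");
}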
22 changes: 18 additions & 4 deletions pagefind/src/lib.rs
@@ -50,10 +50,24 @@ impl SearchState {
.map(|f| f.fossick(&self.options))
.collect();
let all_pages = join_all(results).await;
let pages_with_data = all_pages
.into_iter()
.flatten()
.filter(|d| !d.word_data.is_empty());

let used_custom_body = all_pages.iter().flatten().any(|page| page.has_custom_body);
if used_custom_body {
println!(
"Found a data-pagefind-body element on the site.\n↳ Ignoring pages without this tag."
);
} else {
println!(
"Did not find a data-pagefind-body element on the site.\n↳ Indexing all <body> elements on the site."
);
}

let pages_with_data = all_pages.into_iter().flatten().filter(|d| {
if used_custom_body && !d.has_custom_body {
return false;
}
!d.word_data.is_empty()
});

let indexes = build_indexes(pages_with_data, &self.options).await;
indexes.write_files(&self.options).await;
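As a rough sketch of the site-level rule added here (the Page struct and select_pages function are illustrative, not Pagefind's API): if any page on the site carried data-pagefind-body, only pages with that attribute are kept; otherwise every page that produced indexable words is kept.

// Illustrative model of the page-selection rule in lib.rs; names are
// hypothetical stand-ins for FossickedData and the filter chain above.
struct Page {
    url: &'static str,
    has_custom_body: bool,
    word_count: usize,
}

fn select_pages(pages: Vec<Page>) -> Vec<Page> {
    // If any page on the site used data-pagefind-body, every page
    // without it is dropped from the index.
    let used_custom_body = pages.iter().any(|p| p.has_custom_body);
    pages
        .into_iter()
        .filter(|p| {
            if used_custom_body && !p.has_custom_body {
                return false;
            }
            // Pages that produced no indexable words are always skipped.
            p.word_count > 0
        })
        .collect()
}

fn main() {
    let pages = vec![
        Page { url: "/", has_custom_body: false, word_count: 2 },
        Page { url: "/cat/", has_custom_body: true, word_count: 7 },
        Page { url: "/dog/", has_custom_body: false, word_count: 4 },
    ];
    let kept = select_pages(pages);
    let urls: Vec<_> = kept.iter().map(|p| p.url).collect();
    // Only the page carrying data-pagefind-body survives, as in the
    // exclusions.feature scenario above.
    assert_eq!(urls, vec!["/cat/"]);
}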
