Skip to content

Commit

Permalink
Merge pull request #1833 from mattheww/2022-06_searchindex
Browse files Browse the repository at this point in the history
Omit words longer than 80 characters from the search index
  • Loading branch information
ehuss authored Jun 27, 2022
2 parents 7e2752e + 000a93d commit b38792c
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 5 deletions.
20 changes: 18 additions & 2 deletions src/renderer/html_handlebars/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::path::Path;

use elasticlunr::Index;
use elasticlunr::{Index, IndexBuilder};
use pulldown_cmark::*;

use crate::book::{Book, BookItem};
Expand All @@ -13,9 +13,25 @@ use crate::utils;

use serde::Serialize;

/// Maximum length, in bytes, of a token that will be added to the search index.
/// Byte length (not char count) — presumably chosen to bound on-disk index
/// size even for multi-byte text; TODO confirm this is the intended metric.
const MAX_WORD_LENGTH_TO_INDEX: usize = 80;

/// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens.
///
/// Splits on whitespace and hyphens, lowercases each token, and discards
/// empty tokens as well as tokens longer than `MAX_WORD_LENGTH_TO_INDEX`.
fn tokenize(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        .filter(|s| !s.is_empty())
        // A split segment can never contain whitespace (every whitespace char
        // is a delimiter), so the upstream tokenizer's `.trim()` is a no-op
        // here and has been dropped.
        .map(str::to_lowercase)
        .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
        .collect()
}

/// Creates all files required for search.
pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> {
let mut index = Index::new(&["title", "body", "breadcrumbs"]);
let mut index = IndexBuilder::new()
.add_field_with_tokenizer("title", Box::new(&tokenize))
.add_field_with_tokenizer("body", Box::new(&tokenize))
.add_field_with_tokenizer("breadcrumbs", Box::new(&tokenize))
.build();

let mut doc_urls = Vec::with_capacity(book.sections.len());

for item in book.iter() {
Expand Down
4 changes: 3 additions & 1 deletion tests/dummy_book/src/first/no-headers.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
Capybara capybara capybara.

Capybara capybara capybara.
Capybara capybara capybara.

ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.
2 changes: 1 addition & 1 deletion tests/rendered_output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,7 @@ mod search {
);
assert_eq!(
docs[&no_headers]["body"],
"Capybara capybara capybara. Capybara capybara capybara."
"Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex."
);
}

Expand Down
2 changes: 1 addition & 1 deletion tests/searchindex_fixture.json
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@
"title": "Unicode stress tests"
},
"18": {
"body": "Capybara capybara capybara. Capybara capybara capybara.",
"body": "Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.",
"breadcrumbs": "First Chapter » No Headers",
"id": "18",
"title": "First Chapter"
Expand Down

0 comments on commit b38792c

Please sign in to comment.