Skip to content

Commit

Permalink
feat: better html to markdown converter (#840)
Browse files (browse the repository at this point in the history)
  • Loading branch information
sigoden committed Sep 6, 2024
1 parent 791b615 commit 555f4f5
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 25 deletions.
50 changes: 36 additions & 14 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ path-absolutize = "3.1.1"
hnsw_rs = "0.3.0"
rayon = "1.10.0"
uuid = { version = "1.9.1", features = ["v4"] }
html2text = "0.12.5"
scraper = { version = "0.20.0", default-features = false, features = ["deterministic"] }
sys-locale = "0.3.1"
html_to_markdown = "0.1.0"

[dependencies.reqwest]
version = "0.12.0"
Expand Down
18 changes: 18 additions & 0 deletions src/utils/html_to_md.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
use std::{cell::RefCell, rc::Rc};

use html_to_markdown::{markdown, TagHandler};

/// Converts an HTML document into Markdown using the `html_to_markdown` crate.
///
/// A fixed list of the crate's tag handlers is registered, in this order:
/// paragraph, heading, list, table, styled text, code, and
/// `WebpageChromeRemover` (presumably drops navigation/header/footer markup —
/// confirm against the crate docs).
///
/// Returns the converted Markdown on success. If the converter reports an
/// error, the input HTML is returned unchanged, so this function never fails.
pub fn html_to_md(html: &str) -> String {
    // The explicit element type lets each concrete handler coerce to `TagHandler`.
    let mut tag_handlers: Vec<TagHandler> = vec![
        Rc::new(RefCell::new(markdown::ParagraphHandler)),
        Rc::new(RefCell::new(markdown::HeadingHandler)),
        Rc::new(RefCell::new(markdown::ListHandler)),
        Rc::new(RefCell::new(markdown::TableHandler::new())),
        Rc::new(RefCell::new(markdown::StyledTextHandler)),
        Rc::new(RefCell::new(markdown::CodeHandler)),
        Rc::new(RefCell::new(markdown::WebpageChromeRemover)),
    ];

    let conversion = html_to_markdown::convert_html_to_markdown(html.as_bytes(), &mut tag_handlers);
    match conversion {
        Ok(markdown_text) => markdown_text,
        // Best-effort: hand back the raw HTML rather than surfacing the error.
        Err(_) => String::from(html),
    }
}
2 changes: 2 additions & 0 deletions src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ mod abort_signal;
mod clipboard;
mod command;
mod crypto;
mod html_to_md;
mod path;
mod prompt_input;
mod render_prompt;
Expand All @@ -12,6 +13,7 @@ pub use self::abort_signal::*;
pub use self::clipboard::set_text;
pub use self::command::*;
pub use self::crypto::*;
pub use self::html_to_md::*;
pub use self::path::*;
pub use self::prompt_input::*;
pub use self::render_prompt::render_prompt;
Expand Down
16 changes: 6 additions & 10 deletions src/utils/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ use reqwest::Url;
use scraper::{Html, Selector};
use serde::Deserialize;
use serde_json::Value;
use std::{collections::HashMap, time::Duration};
use std::{collections::HashSet, sync::Arc};
use std::{
collections::{HashMap, HashSet},
sync::Arc,
time::Duration,
};
use tokio::io::AsyncWriteExt;
use tokio::sync::Semaphore;

Expand Down Expand Up @@ -136,10 +139,7 @@ pub async fn fetch(
None => {
let contents = res.text().await?;
if extension == "html" {
(
html2text::from_read(contents.as_bytes(), usize::MAX),
"md".into(),
)
(html_to_md(&contents), "md".into())
} else {
(contents, extension)
}
Expand Down Expand Up @@ -387,10 +387,6 @@ async fn crawl_page(
Ok((path.to_string(), text, links.into_iter().collect()))
}

/// Renders `html` by passing its bytes to `html2text::from_read`.
///
/// `usize::MAX` is supplied as the second argument — presumably the wrap
/// width, which would mean output lines are effectively never wrapped
/// (confirm against the html2text documentation).
fn html_to_md(html: &str) -> String {
    let html_bytes = html.as_bytes();
    html2text::from_read(html_bytes, usize::MAX)
}

fn should_exclude_link(link: &str, exclude: &[String]) -> bool {
if link.contains("#") {
return true;
Expand Down

0 comments on commit 555f4f5

Please sign in to comment.