From 756a1a991297508e28d00d8485a8f66197fe7153 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Tue, 20 Jun 2023 12:35:25 -0400 Subject: [PATCH 01/16] Switch from std::mpsc to crossbeam-channel --- Cargo.lock | 1 + crates/noseyparker-cli/Cargo.toml | 1 + crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs | 6 +++--- crates/noseyparker/src/input_enumerator.rs | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cd3b632c4..cdc9d53a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2136,6 +2136,7 @@ dependencies = [ "assert_fs", "clap", "console", + "crossbeam-channel", "gix", "gix-features", "hex", diff --git a/crates/noseyparker-cli/Cargo.toml b/crates/noseyparker-cli/Cargo.toml index 57d90eefa..d9ea27159 100644 --- a/crates/noseyparker-cli/Cargo.toml +++ b/crates/noseyparker-cli/Cargo.toml @@ -35,6 +35,7 @@ vergen = { version = "8.1", features = ["build", "cargo", "git", "gitcl", "rustc anyhow = { version = "1.0" } clap = { version = "4.3", features = ["cargo", "derive", "env", "unicode", "wrap_help"] } console = "0.15" +crossbeam-channel = "0.5" gix-features = "0.30" gix = { version = "0.46", features = ["max-performance"] } hex = "0.4" diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index 74debfeff..c5959ec0a 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -1,8 +1,8 @@ use anyhow::{bail, Context, Result}; +use crossbeam_channel; use indicatif::{HumanBytes, HumanCount, HumanDuration}; use rayon::prelude::*; use std::str::FromStr; -use std::sync::mpsc; use std::sync::Mutex; use std::time::Instant; use tracing::{debug, debug_span, error, info, warn}; @@ -299,7 +299,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> // Create a channel pair so that matcher threads can get their results to the database // recorder. 
- let (send_matches, recv_matches) = mpsc::sync_channel::>(512); + let (send_matches, recv_matches) = crossbeam_channel::bounded::>(512); // We create a separate thread for writing matches to the database. // The database uses SQLite, which does best with a single writer. @@ -310,7 +310,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> let mut num_matches = 0u64; let mut num_added = 0usize; // keep reading until all the senders hang up; panic if recording matches fails - while let Ok(matches) = recv_matches.recv() { + for matches in recv_matches { num_matches += matches.len() as u64; num_added += datastore .record_matches(&matches) diff --git a/crates/noseyparker/src/input_enumerator.rs b/crates/noseyparker/src/input_enumerator.rs index 6e4ad82ce..9427ef414 100644 --- a/crates/noseyparker/src/input_enumerator.rs +++ b/crates/noseyparker/src/input_enumerator.rs @@ -105,7 +105,7 @@ impl<'t> Drop for Visitor<'t> { impl<'t> ignore::ParallelVisitor for Visitor<'t> { fn visit(&mut self, result: Result) -> ignore::WalkState { - // FIXME: dedupe based on (device, inode) on platforms where available + // FIXME: dedupe based on (device, inode) on platforms where available; see https://docs.rs/same-file/1.0.6/same_file/ for ideas let entry = match result { Err(e) => { From 532acf8d06dac9dff4034543e533b8371cc43cfa Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Tue, 20 Jun 2023 16:06:34 -0400 Subject: [PATCH 02/16] Centralize use of gix --- Cargo.lock | 4 ---- crates/noseyparker-cli/Cargo.toml | 3 --- .../src/bin/noseyparker/cmd_report.rs | 15 ++++--------- .../src/bin/noseyparker/cmd_scan.rs | 13 +++++------ crates/noseyparker/Cargo.toml | 1 - crates/noseyparker/src/blob_id.rs | 10 +++++++-- crates/noseyparker/src/digest.rs | 8 +++++++ crates/noseyparker/src/input_enumerator.rs | 22 +++++++++++++++++++ crates/noseyparker/src/lib.rs | 1 + 9 files changed, 49 insertions(+), 28 deletions(-) create mode 100644 crates/noseyparker/src/digest.rs 
diff --git a/Cargo.lock b/Cargo.lock index cdc9d53a8..77a17489d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2105,7 +2105,6 @@ dependencies = [ "chrono", "console", "gix", - "gix-features", "hex", "hyperx", "ignore", @@ -2137,9 +2136,6 @@ dependencies = [ "clap", "console", "crossbeam-channel", - "gix", - "gix-features", - "hex", "ignore", "indenter", "indicatif", diff --git a/crates/noseyparker-cli/Cargo.toml b/crates/noseyparker-cli/Cargo.toml index d9ea27159..a2742d37a 100644 --- a/crates/noseyparker-cli/Cargo.toml +++ b/crates/noseyparker-cli/Cargo.toml @@ -36,9 +36,6 @@ anyhow = { version = "1.0" } clap = { version = "4.3", features = ["cargo", "derive", "env", "unicode", "wrap_help"] } console = "0.15" crossbeam-channel = "0.5" -gix-features = "0.30" -gix = { version = "0.46", features = ["max-performance"] } -hex = "0.4" indenter = "0.3" # XXX Consider switching from indicatif to status_line: https://docs.rs/status-line/latest/status_line/struct.StatusLine.html indicatif = { version = "0.17", features = ["improved_unicode", "rayon"] } diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs index 8f3dcab1c..b364cf50c 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs @@ -8,6 +8,7 @@ use std::fmt::{Display, Formatter, Write}; use noseyparker::bstring_escape::Escaped; use noseyparker::datastore::{Datastore, MatchGroupMetadata}; +use noseyparker::digest::sha1_hexdigest; use noseyparker::match_type::Match; use noseyparker::provenance::Provenance; @@ -123,13 +124,9 @@ impl Reportable for DetailsReporter { let source_span = &m.location.source_span; // let offset_span = &m.location.offset_span; let uri = match m.provenance { - Provenance::File { path } => { - path.display().to_string() - } + Provenance::File { path } => path.display().to_string(), // FIXME: using this path is nonsense here - Provenance::GitRepo { 
path } => { - path.display().to_string() - } + Provenance::GitRepo { path } => path.display().to_string(), }; let location = sarif::LocationBuilder::default() @@ -170,11 +167,7 @@ impl Reportable for DetailsReporter { }) .collect::>()?; - let sha1_fingerprint = { - let mut h = gix_features::hash::Sha1::default(); - h.update(&metadata.match_content); - hex::encode(h.digest()) - }; + let sha1_fingerprint = sha1_hexdigest(&metadata.match_content); // Build the result for the match let result = sarif::ResultBuilder::default() diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index c5959ec0a..9a16e4421 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -297,12 +297,11 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> let mut progress = Progress::new_bytes_bar(total_blob_bytes, "Scanning content", progress_enabled); - // Create a channel pair so that matcher threads can get their results to the database - // recorder. + // Create a channel pair for matcher threads to get their results to the datastore recorder. let (send_matches, recv_matches) = crossbeam_channel::bounded::>(512); - // We create a separate thread for writing matches to the database. - // The database uses SQLite, which does best with a single writer. + // We create a separate thread for writing matches to the datastore. + // The datastore uses SQLite, which does best with a single writer. 
let match_writer = { std::thread::Builder::new() .name("Datastore Writer".to_string()) @@ -314,11 +313,11 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> num_matches += matches.len() as u64; num_added += datastore .record_matches(&matches) - .expect("should be able to record matches to the database"); + .expect("should be able to record matches to the datastore"); } datastore .analyze() - .expect("should be able to analyze the database"); + .expect("should be able to analyze the datastore"); // FIXME: `num_added` is not computed correctly (datastore, num_matches, num_added as u64) }) @@ -400,7 +399,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> if seen_blobs.contains(blob_id) { return; } - let blob = match repo.find_object(gix::hash::ObjectId::from(blob_id.as_bytes())) { + let blob = match repo.find_object(blob_id) { Err(e) => { error!( "Failed to read blob {} from Git repository at {}: {}", diff --git a/crates/noseyparker/Cargo.toml b/crates/noseyparker/Cargo.toml index 4c157b335..6d3e39fe4 100644 --- a/crates/noseyparker/Cargo.toml +++ b/crates/noseyparker/Cargo.toml @@ -25,7 +25,6 @@ atoi = "2.0" bstr = { version = "1.0", features = ["serde"] } chrono = { version = "0.4", default_features = false, features = ["std"] } console = "0.15" -gix-features = "0.30" gix = { version = "0.46", features = ["max-performance"] } hex = "0.4" hyperx = "1.4" diff --git a/crates/noseyparker/src/blob_id.rs b/crates/noseyparker/src/blob_id.rs index 206feeec8..dec8f65ca 100644 --- a/crates/noseyparker/src/blob_id.rs +++ b/crates/noseyparker/src/blob_id.rs @@ -12,7 +12,7 @@ impl BlobId { /// Create a new BlobId computed from the given input. 
#[inline] pub fn new(input: &[u8]) -> Self { - use gix_features::hash::Sha1; + use crate::digest::Sha1; use std::io::Write; // XXX implement a Write instance for `Sha1`, in an attempt to avoid allocations for @@ -82,7 +82,13 @@ impl<'a> From<&'a gix::ObjectId> for BlobId { .expect("oid should be a 20-byte value"), ) } - } +} + +impl<'a> From<&'a BlobId> for gix::ObjectId { + fn from(blob_id: &'a BlobId) -> Self { + gix::hash::ObjectId::from(blob_id.as_bytes()) + } +} // ------------------------------------------------------------------------------------------------- // test diff --git a/crates/noseyparker/src/digest.rs b/crates/noseyparker/src/digest.rs new file mode 100644 index 000000000..6bd6db6a6 --- /dev/null +++ b/crates/noseyparker/src/digest.rs @@ -0,0 +1,8 @@ +pub use gix::features::hash::Sha1; +use hex::encode; + +pub fn sha1_hexdigest(input: &[u8]) -> String { + let mut h = Sha1::default(); + h.update(input); + encode(h.digest()) +} diff --git a/crates/noseyparker/src/input_enumerator.rs b/crates/noseyparker/src/input_enumerator.rs index 9427ef414..377cb05dd 100644 --- a/crates/noseyparker/src/input_enumerator.rs +++ b/crates/noseyparker/src/input_enumerator.rs @@ -172,6 +172,13 @@ impl<'t> ignore::ParallelVisitor for Visitor<'t> { } } +/// Provides capabilities to recursively enumerate a filesystem. +/// +/// This provides a handful of features, including: +/// +/// - Enumeration of found files +/// - Enumeration of blobs found in Git repositories +/// - Support for ignoring files based on size or using path-based gitignore-style rules pub struct FilesystemEnumerator { walk_builder: WalkBuilder, @@ -185,6 +192,12 @@ impl FilesystemEnumerator { pub const DEFAULT_MAX_FILESIZE: u64 = 100 * 1024 * 1024; pub const DEFAULT_FOLLOW_LINKS: bool = false; + /// Create a new `FilesystemEnumerator` with the given set of input roots using default + /// settings. + /// + /// The default maximum file size is 100 MiB.
+ /// + /// The default behavior is to not follow symlinks. pub fn new>(inputs: &[T]) -> Result { let mut builder = WalkBuilder::new(&inputs[0]); for input in &inputs[1..] { @@ -201,11 +214,13 @@ impl FilesystemEnumerator { }) } + /// Set the number of parallel enumeration threads. pub fn threads(&mut self, threads: usize) -> &mut Self { self.walk_builder.threads(threads); self } + /// Add a set of gitignore-style rules from the given ignore file. pub fn add_ignore>(&mut self, path: T) -> Result<&mut Self> { match self.walk_builder.add_ignore(path) { Some(e) => Err(e)?, @@ -213,17 +228,24 @@ impl FilesystemEnumerator { } } + /// Enable or disable whether symbolic links are followed. pub fn follow_links(&mut self, follow_links: bool) -> &mut Self { self.walk_builder.follow_links(follow_links); self } + /// Set the maximum file size for enumerated files. + /// + /// Files larger than this value will be skipped. pub fn max_filesize(&mut self, max_filesize: Option) -> &mut Self { self.walk_builder.max_filesize(max_filesize); self.max_file_size = max_filesize; self } + /// Specify an ad-hoc filtering function to control which entries are enumerated. + /// + /// This can be used to skip entire directories. pub fn filter_entry

(&mut self, filter: P) -> &mut Self where P: Fn(&DirEntry) -> bool + Send + Sync + 'static diff --git a/crates/noseyparker/src/lib.rs b/crates/noseyparker/src/lib.rs index f55471089..0d7de511e 100644 --- a/crates/noseyparker/src/lib.rs +++ b/crates/noseyparker/src/lib.rs @@ -4,6 +4,7 @@ pub mod blob_id_set; pub mod bstring_escape; pub mod datastore; pub mod defaults; +pub mod digest; pub mod git_binary; pub mod git_url; pub mod github; From 781f5f4a4b83d6641f5ef9f876953b656c9d9448 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Tue, 20 Jun 2023 16:30:48 -0400 Subject: [PATCH 03/16] Reorganize code and add comments --- .../src/bin/noseyparker/cmd_scan.rs | 39 +-- crates/noseyparker/src/datastore.rs | 247 +++++++++++------- 2 files changed, 159 insertions(+), 127 deletions(-) diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index 9a16e4421..e787c451e 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -123,14 +123,13 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> args::GitCloneMode::Mirror => CloneMode::Mirror, args::GitCloneMode::Bare => CloneMode::Bare, }; - let clones_dir = datastore.clones_dir(); let git = Git::new(); let mut progress = Progress::new_bar(repo_urls.len() as u64, "Fetching Git repos", progress_enabled); for repo_url in repo_urls { - let output_dir = match clone_destination(&clones_dir, &repo_url) { + let output_dir = match datastore.clone_destination(&repo_url) { Err(e) => { progress.suspend(|| { error!("Failed to determine output directory for {repo_url}: {e}"); @@ -502,39 +501,3 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> Ok(()) } - -/// Get a path for a local clone of the given git URL underneath `root`. 
-fn clone_destination(root: &std::path::Path, repo: &GitUrl) -> Result { - Ok(root.join(repo.to_path_buf())) -} - -#[cfg(test)] -mod test { - macro_rules! clone_destination_success_tests { - ($($case_name:ident: ($root:expr, $repo:expr) => $expected:expr,)*) => { - mod clone_destination { - use noseyparker::git_url::GitUrl; - use pretty_assertions::assert_eq; - use std::path::{PathBuf, Path}; - use std::str::FromStr; - use super::super::clone_destination; - - $( - #[test] - fn $case_name() { - let expected: Option = Some(Path::new($expected).to_owned()); - - let root = Path::new($root); - let repo = GitUrl::from_str($repo).expect("repo should be a URL"); - assert_eq!(clone_destination(root, &repo).ok(), expected); - } - )* - } - } - } - - clone_destination_success_tests! { - https_01: ("rel_root", "https://example.com/testrepo.git") => "rel_root/https/example.com/testrepo.git", - https_02: ("/abs_root", "https://example.com/testrepo.git") => "/abs_root/https/example.com/testrepo.git", - } -} diff --git a/crates/noseyparker/src/datastore.rs b/crates/noseyparker/src/datastore.rs index 7e226b3dd..8c54643f0 100644 --- a/crates/noseyparker/src/datastore.rs +++ b/crates/noseyparker/src/datastore.rs @@ -7,6 +7,7 @@ use std::path::{Path, PathBuf}; use tracing::{debug, debug_span}; use crate::blob_id::BlobId; +use crate::git_url::GitUrl; use crate::location::{Location, OffsetSpan, SourcePoint, SourceSpan}; use crate::match_type::Match; use crate::provenance::Provenance; @@ -15,6 +16,14 @@ use crate::snippet::Snippet; // ------------------------------------------------------------------------------------------------- // Datastore // ------------------------------------------------------------------------------------------------- + +/// The source of truth for Nosey Parker findings and runtime state. 
+/// +/// A `Datastore` resides on disk as a directory, and stores a number of things: +/// +/// - A sqlite database for recording findings and scan information +/// - A scratch directory for providing temporary directories and files +/// - A directory used for storing clones of Git repositories pub struct Datastore { /// The root directory of everything contained in this `Datastore`. root_dir: PathBuf, @@ -23,6 +32,7 @@ pub struct Datastore { conn: Connection, } +// Public implementation impl Datastore { /// Create a new datastore at `root_dir` if one does not exist, /// or open an existing one if present. @@ -92,101 +102,18 @@ impl Datastore { self.root_dir.join("clones") } - fn new_connection(path: &Path) -> Result { - let conn = Connection::open(path)?; - - conn.pragma_update(None, "journal_mode", "wal")?; // https://www.sqlite.org/wal.html - conn.pragma_update(None, "foreign_keys", "on")?; // https://sqlite.org/foreignkeys.html - conn.pragma_update(None, "synchronous", "normal")?; // https://sqlite.org/pragma.html#pragma_synchronous - // - let limit: i64 = -512 * 1024; // 512MiB limit - conn.pragma_update(None, "cache_size", limit)?; // https://sqlite.org/pragma.html#pragma_cache_size - - Ok(conn) - } - - fn migrate(&mut self) -> Result { - let _span = debug_span!("Datastore::migrate", "{}", self.root_dir.display()).entered(); - let tx = self.conn.transaction()?; - - let get_user_version = || -> Result { - let user_version = tx.pragma_query_value(None, "user_version", |r| r.get(0))?; - Ok(user_version) - }; - - let set_user_version = |user_version: u64| -> Result<()> { - tx.pragma_update(None, "user_version", user_version)?; - Ok(()) - }; - - let user_version: u64 = get_user_version()?; - if user_version == 0 { - let new_user_version = user_version + 1; - debug!( - "Migrating database schema from version {} to {}", - user_version, new_user_version - ); - tx.execute_batch(indoc! 
{r#" - create table matches - -- This table is a fully denormalized representation of the matches found from - -- scanning. - -- - -- See the `noseyparker::match::Match` type for correspondence. - -- - -- Eventually we should refine the database schema, normalizing where appropriate. - -- Doing so could allow for better write performance and smaller databases. - ( - blob_id text not null, - - start_byte integer not null, - end_byte integer not null, - - start_line integer not null, - start_column integer not null, - - end_line integer not null, - end_column integer not null, - - before_snippet blob not null, - matching_input blob not null, - after_snippet blob not null, - - group_index integer not null, - group_input blob not null, - - rule_name text not null, - - provenance_type text not null, - provenance blob not null, - - -- NOTE: We really want this entire table to have unique values. - -- But checking just these fields ought to be sufficient to ensure that; - -- the remaining fields are either derived from these or are not relevant - -- to match deduping (like provenance). - -- Checking fewer fields should be cheaper than checking _all_ fields. - unique ( - blob_id, - start_byte, - end_byte, - group_index, - rule_name - ) - ); - - -- An index to allow quick grouping of equivalent matches - create index matches_grouping_index on matches (group_input, rule_name); - "#})?; - set_user_version(new_user_version)?; - tx.commit()?; - } - Ok(user_version) + /// Get a path for a local clone of the given git URL within this datastore's clones directory. + pub fn clone_destination(&self, repo: &GitUrl) -> Result { + clone_destination(&self.clones_dir(), repo) } + /// Analyze the datastore's sqlite database, potentially allowing for better query planning pub fn analyze(&self) -> Result<()> { self.conn.execute("analyze", [])?; Ok(()) } + /// Record the given matches into the datastore. 
pub fn record_matches<'a, T: IntoIterator>( &mut self, matches: T, @@ -245,6 +172,7 @@ impl Datastore { Ok(num_changed) } + /// Summarize all recorded findings. pub fn summarize(&self) -> Result { let _span = debug_span!("Datastore::summarize", "{}", self.root_dir.display()).entered(); @@ -272,10 +200,12 @@ impl Datastore { Ok(MatchSummary(es)) } + /// Get the root directory that contains this `Datastore`. pub fn root_dir(&self) -> &Path { &self.root_dir } + /// Get metadata for all groups of identical matches recorded within this `Datastore`. pub fn get_match_group_metadata(&self) -> Result> { let _span = debug_span!("Datastore::get_match_group_metadata", "{}", self.root_dir.display()).entered(); @@ -300,6 +230,7 @@ impl Datastore { Ok(es) } + /// Get up to `limit` matches that belong to the group with the given group metadata. pub fn get_match_group_matches( &self, metadata: &MatchGroupMetadata, @@ -372,6 +303,104 @@ impl Datastore { } } + +// Private implementation +impl Datastore { + fn new_connection(path: &Path) -> Result { + let conn = Connection::open(path)?; + + conn.pragma_update(None, "journal_mode", "wal")?; // https://www.sqlite.org/wal.html + conn.pragma_update(None, "foreign_keys", "on")?; // https://sqlite.org/foreignkeys.html + conn.pragma_update(None, "synchronous", "normal")?; // https://sqlite.org/pragma.html#pragma_synchronous + // + let limit: i64 = -512 * 1024; // 512MiB limit + conn.pragma_update(None, "cache_size", limit)?; // https://sqlite.org/pragma.html#pragma_cache_size + + Ok(conn) + } + + fn migrate(&mut self) -> Result { + let _span = debug_span!("Datastore::migrate", "{}", self.root_dir.display()).entered(); + let tx = self.conn.transaction()?; + + let get_user_version = || -> Result { + let user_version = tx.pragma_query_value(None, "user_version", |r| r.get(0))?; + Ok(user_version) + }; + + let set_user_version = |user_version: u64| -> Result<()> { + tx.pragma_update(None, "user_version", user_version)?; + Ok(()) + }; + + let 
user_version: u64 = get_user_version()?; + if user_version == 0 { + let new_user_version = user_version + 1; + debug!( + "Migrating database schema from version {} to {}", + user_version, new_user_version + ); + tx.execute_batch(indoc! {r#" + create table matches + -- This table is a fully denormalized representation of the matches found from + -- scanning. + -- + -- See the `noseyparker::match::Match` type for correspondence. + -- + -- Eventually we should refine the database schema, normalizing where appropriate. + -- Doing so could allow for better write performance and smaller databases. + ( + blob_id text not null, + + start_byte integer not null, + end_byte integer not null, + + start_line integer not null, + start_column integer not null, + + end_line integer not null, + end_column integer not null, + + before_snippet blob not null, + matching_input blob not null, + after_snippet blob not null, + + group_index integer not null, + group_input blob not null, + + rule_name text not null, + + provenance_type text not null, + provenance blob not null, + + -- NOTE: We really want this entire table to have unique values. + -- But checking just these fields ought to be sufficient to ensure that; + -- the remaining fields are either derived from these or are not relevant + -- to match deduping (like provenance). + -- Checking fewer fields should be cheaper than checking _all_ fields. 
+ unique ( + blob_id, + start_byte, + end_byte, + group_index, + rule_name + ) + ); + + -- An index to allow quick grouping of equivalent matches + create index matches_grouping_index on matches (group_input, rule_name); + "#})?; + set_user_version(new_user_version)?; + tx.commit()?; + } + Ok(user_version) + } +} + + +// ------------------------------------------------------------------------------------------------- +// Implementation Utilities +// ------------------------------------------------------------------------------------------------- fn provenance_from_parts(tag: String, path: String) -> Result { match tag.as_str() { "git" => Ok(Provenance::GitRepo { @@ -384,10 +413,48 @@ fn provenance_from_parts(tag: String, path: String) -> Result { } } + +/// Get a path for a local clone of the given git URL underneath `root`. +fn clone_destination(root: &std::path::Path, repo: &GitUrl) -> Result { + Ok(root.join(repo.to_path_buf())) +} + +#[cfg(test)] +mod test { + macro_rules! clone_destination_success_tests { + ($($case_name:ident: ($root:expr, $repo:expr) => $expected:expr,)*) => { + mod clone_destination { + use crate::git_url::GitUrl; + use pretty_assertions::assert_eq; + use std::path::{PathBuf, Path}; + use std::str::FromStr; + use super::super::clone_destination; + + $( + #[test] + fn $case_name() { + let expected: Option = Some(Path::new($expected).to_owned()); + + let root = Path::new($root); + let repo = GitUrl::from_str($repo).expect("repo should be a URL"); + assert_eq!(clone_destination(root, &repo).ok(), expected); + } + )* + } + } + } + + clone_destination_success_tests! 
{ + https_01: ("rel_root", "https://example.com/testrepo.git") => "rel_root/https/example.com/testrepo.git", + https_02: ("/abs_root", "https://example.com/testrepo.git") => "/abs_root/https/example.com/testrepo.git", + } +} + // ------------------------------------------------------------------------------------------------- // MatchSummary // ------------------------------------------------------------------------------------------------- -/// A summary of matches in a `Datastore` + +/// A summary of matches in a `Datastore`. #[derive(Deserialize, Serialize)] pub struct MatchSummary(pub Vec); @@ -410,6 +477,8 @@ impl std::fmt::Display for MatchSummary { // ------------------------------------------------------------------------------------------------- // MatchGroupMetadata // ------------------------------------------------------------------------------------------------- + +/// Metadata for a group of matches that have identical match content. #[derive(Debug, Deserialize, Serialize)] pub struct MatchGroupMetadata { /// The name of the rule of all the matches in the group From d71bbfdbc85ec9a73019e72645caf80bbac5aed3 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Wed, 21 Jun 2023 16:18:36 -0400 Subject: [PATCH 04/16] Avoid copying git blob data --- crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index e787c451e..29d17a1e5 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -395,6 +395,8 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> // debug!("Scanning {} size {} from {:?}", oid, size, path); // Check for duplicates before even loading the entire blob contents + // At this point, a blob may have already been seen by another scanner thread, + // so this 
check can avoid some needless work in that case. if seen_blobs.contains(blob_id) { return; } @@ -408,8 +410,10 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> ); return; } - // FIXME: get rid of this extra copy - Ok(blob) => Blob::new(*blob_id, blob.data.to_owned()), + Ok(mut blob) => { + let data = std::mem::take(&mut blob.data); // avoid a copy + Blob::new(*blob_id, data) + } }; let provenance = Provenance::GitRepo { path: path.to_path_buf(), From fc8e46968c35d1d3fb248900924235f2c70c7c26 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Wed, 21 Jun 2023 17:05:07 -0400 Subject: [PATCH 05/16] Fix a typo in color terminal detection --- crates/noseyparker-cli/src/bin/noseyparker/args.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/noseyparker-cli/src/bin/noseyparker/args.rs b/crates/noseyparker-cli/src/bin/noseyparker/args.rs index 43ce18b69..3104b6958 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/args.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/args.rs @@ -196,7 +196,7 @@ impl GlobalArgs { match self.color { Mode::Never => false, Mode::Always => true, - Mode::Auto => std::io::stdin().is_terminal(), + Mode::Auto => std::io::stdout().is_terminal(), } } From 597f95fe74d644bb9f92250863054f99e270c850 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Wed, 21 Jun 2023 17:15:26 -0400 Subject: [PATCH 06/16] Checkpoint --- Cargo.lock | 56 ++++++- .../src/bin/noseyparker/cmd_scan.rs | 88 +++++------ crates/noseyparker-content-guesser/Cargo.toml | 20 +++ crates/noseyparker-content-guesser/src/lib.rs | 147 ++++++++++++++++++ crates/noseyparker/Cargo.toml | 1 + crates/noseyparker/src/input_enumerator.rs | 6 +- crates/noseyparker/src/lib.rs | 1 + crates/noseyparker/src/provenance.rs | 9 ++ 8 files changed, 279 insertions(+), 49 deletions(-) create mode 100644 crates/noseyparker-content-guesser/Cargo.toml create mode 100644 crates/noseyparker-content-guesser/src/lib.rs diff --git a/Cargo.lock 
b/Cargo.lock index 77a17489d..261188ff6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -701,6 +701,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + [[package]] name = "errno" version = "0.3.1" @@ -1999,6 +2010,29 @@ version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +[[package]] +name = "magic" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87142e3acb1f4daa62eaea96605421a534119d4777a9fb43fb2784798fd89665" +dependencies = [ + "bitflags 1.3.2", + "errno 0.2.8", + "libc", + "magic-sys", + "thiserror", +] + +[[package]] +name = "magic-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eff86ae08895140d628119d407d568f3b657145ee8c265878064f717534bb3bc" +dependencies = [ + "libc", + "vcpkg", +] + [[package]] name = "matches" version = "0.1.10" @@ -2035,6 +2069,16 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "mime_guess" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +dependencies = [ + "mime", + "unicase", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2112,6 +2156,7 @@ dependencies = [ "indicatif", "indoc", "lazy_static", + "noseyparker_content_guesser", "pretty_assertions", "proptest", "regex", @@ -2160,6 +2205,15 @@ dependencies = [ "vergen", ] +[[package]] +name = "noseyparker_content_guesser" +version = "0.13.0-dev" +dependencies = [ + "magic", + 
"mime", + "mime_guess", +] + [[package]] name = "ntapi" version = "0.4.1" @@ -2730,7 +2784,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0" dependencies = [ "bitflags 1.3.2", - "errno", + "errno 0.3.1", "io-lifetimes", "libc", "linux-raw-sys", diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index 29d17a1e5..1054c1705 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -11,6 +11,7 @@ use crate::args; use noseyparker::blob::Blob; use noseyparker::blob_id_set::BlobIdSet; +use noseyparker::content_guesser; use noseyparker::datastore::Datastore; use noseyparker::defaults::DEFAULT_IGNORE_RULES; use noseyparker::git_binary::{CloneMode, Git}; @@ -297,11 +298,12 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> Progress::new_bytes_bar(total_blob_bytes, "Scanning content", progress_enabled); // Create a channel pair for matcher threads to get their results to the datastore recorder. - let (send_matches, recv_matches) = crossbeam_channel::bounded::>(512); + let channel_size = std::cmp::max(args.num_jobs * 32, 512); + let (send_matches, recv_matches) = crossbeam_channel::bounded::>(channel_size); // We create a separate thread for writing matches to the datastore. // The datastore uses SQLite, which does best with a single writer. 
- let match_writer = { + let datastore_writer = { std::thread::Builder::new() .name("Datastore Writer".to_string()) .spawn(move || { @@ -323,15 +325,33 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> .expect("should be able to start datastore writer thread") }; + let run_matcher = |send_matches: &crossbeam_channel::Sender::>, matcher: &mut Matcher, provenance: Provenance, blob: Blob| { + let matches = match matcher.scan_blob(&blob, &provenance) { + Err(e) => { + error!("Failed to scan blob {} from {}: {}", blob.id, provenance, e); + return; + } + Ok(v) => v, + }; + if matches.is_empty() { + return; + } + let matches = convert_blob_matches(&blob, matches, provenance); + send_matches + .send(matches) + .expect("should be able to send all matches"); + }; + // --------------------------------------------------------------------------------------------- // Scan plain files // --------------------------------------------------------------------------------------------- inputs.files.par_iter().for_each_init( || { let matcher = make_matcher().expect("should be able to create a matcher"); - (matcher, progress.clone()) + let guesser = content_guesser::Guesser::new(); + (matcher, guesser, progress.clone()) }, - |(matcher, progress), file_result: &FileResult| { + |(matcher, guesser, progress), file_result: &FileResult| { let fname = &file_result.path; let blob = match Blob::from_file(fname) { Err(e) => { @@ -344,20 +364,14 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> let provenance = Provenance::File { path: fname.clone(), }; - let matches = match matcher.scan_blob(&blob, &provenance) { - Err(e) => { - error!("Failed to scan blob from {}: {}", fname.display(), e); - return; - } - Ok(v) => v, - }; - if matches.is_empty() { - return; + + { + let input = content_guesser::Input::from_path_and_bytes(fname.as_path(), &blob.bytes); + let guess = guesser.guess(input); + info!("*** {:?}: {:?}", fname, 
guess.guessed_types()); } - let matches = convert_blob_matches(&blob, matches, provenance); - send_matches - .send(matches) - .expect("should be able to send all matches"); + + run_matcher(&send_matches, matcher, provenance, blob); }, ); @@ -385,21 +399,16 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> git_repo_result.blobs.par_iter().for_each_init( || { - let matcher = make_matcher().expect("should be able to create a matcher"); let repo = repository.to_thread_local(); - (repo, matcher, progress.clone()) + let matcher = make_matcher().expect("should be able to create a matcher"); + let guesser = content_guesser::Guesser::new(); + (repo, matcher, guesser, progress.clone()) }, - |(repo, matcher, progress), (blob_id, size)| { + |(repo, matcher, guesser, progress), (blob_id, size)| { progress.inc(*size); let path = &git_repo_result.path; // debug!("Scanning {} size {} from {:?}", oid, size, path); - // Check for duplicates before even loading the entire blob contents - // At this point, a blob may have already been seen by another scanner thread, - // so this check can avoid some needless work in that case. 
- if seen_blobs.contains(blob_id) { - return; - } let blob = match repo.find_object(blob_id) { Err(e) => { error!( @@ -418,25 +427,14 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> let provenance = Provenance::GitRepo { path: path.to_path_buf(), }; - match matcher.scan_blob(&blob, &provenance) { - Err(e) => { - error!( - "Failed to scan blob {} from Git repository at {}: {}", - blob_id, - path.display(), - e - ); - } - Ok(matches) => { - if matches.is_empty() { - return; - } - let matches = convert_blob_matches(&blob, matches, provenance); - send_matches - .send(matches) - .expect("should be able to send all matches"); - } + + { + let input = content_guesser::Input::from_bytes(&blob.bytes); + let guess = guesser.guess(input); + info!("*** {}: {:?}", blob_id, guess.guessed_types()); } + + run_matcher(&send_matches, matcher, provenance, blob); }, ); }); @@ -447,7 +445,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> // Get rid of the reference to the sending channel after starting the scanners, // to ensure things terminate as expected. 
drop(send_matches); - let (datastore, num_matches, num_new_matches) = match_writer.join().unwrap(); + let (datastore, num_matches, num_new_matches) = datastore_writer.join().unwrap(); progress.finish(); // --------------------------------------------------------------------------------------------- diff --git a/crates/noseyparker-content-guesser/Cargo.toml b/crates/noseyparker-content-guesser/Cargo.toml new file mode 100644 index 000000000..e745f79ec --- /dev/null +++ b/crates/noseyparker-content-guesser/Cargo.toml @@ -0,0 +1,20 @@ +[package] + +edition.workspace = true +rust-version.workspace = true +license.workspace = true +authors.workspace = true +homepage.workspace = true +repository.workspace = true +publish.workspace = true + +name = "noseyparker_content_guesser" +version.workspace = true + +[lib] +path = "src/lib.rs" + +[dependencies] +magic = "0.13" +mime_guess = "2" +mime = "0.3" diff --git a/crates/noseyparker-content-guesser/src/lib.rs b/crates/noseyparker-content-guesser/src/lib.rs new file mode 100644 index 000000000..22a115151 --- /dev/null +++ b/crates/noseyparker-content-guesser/src/lib.rs @@ -0,0 +1,147 @@ +use magic; +use mime_guess::MimeGuess; +use std::io::Read; +use std::path::Path; + +pub enum Content { + /// No content + None, + + /// An incomplete prefix of the entire contents of a file + Prefix(PrefixContent), + + /// The entire contents of a file + Full(T), +} + +pub struct PrefixContent { + /// The prefix of the full content + content: T, + + /// The length of the full content + full_length: Option, +} + +/// The input to a `Guesser`. +pub struct Input<'a, T> { + path: Option<&'a Path>, + content: Content, +} + +impl<'a, T> Input<'a, T> { + /// Create an `Input` from a path without any content. No I/O is performed. 
+ pub fn from_path_no_io(path: &'a Path) -> Self { + Self { + path: Some(path), + content: Content::None, + } + } +} + +impl<'a> Input<'a, &'a [u8]> { + pub fn from_path_and_bytes(path: &'a Path, bytes: &'a [u8]) -> Self { + Input { + path: Some(path), + content: Content::Full(bytes), + } + } + + pub fn from_bytes(bytes: &'a [u8]) -> Self { + Input { + path: None, + content: Content::Full(bytes), + } + } +} + +impl<'a> Input<'a, Vec> { + /// Create an `Input` from the given path, reading at most `max_length` bytes of input. + /// If no `max_length` is given, the entire file contents are read. + pub fn from_path(path: &'a Path, max_length: Option) -> std::io::Result { + let metadata = std::fs::metadata(path)?; + let expected_len = metadata.len(); + + let content = if let Some(max_length) = max_length { + let f = std::fs::File::open(path)?; + let mut buf = Vec::with_capacity(max_length); + let actual_len = f.take(max_length as u64).read_to_end(&mut buf)?; + if actual_len < expected_len as usize { + Content::Prefix(PrefixContent { + full_length: Some(expected_len as usize), + content: buf, + }) + } else { + Content::Full(buf) + } + } else { + Content::Full(std::fs::read(path)?) 
+ }; + + Ok(Self { + path: Some(path), + content, + }) + } +} + +pub struct Output { + /// Path-based media type guess + mime_guess: Option, + + magic_guess: Option, +} + +impl Output { + pub fn guessed_types(&self) -> Vec { + let mut guessed_types = Vec::new(); + + if let Some(mime_guess) = self.mime_guess { + guessed_types.extend(mime_guess.iter().map(|m| m.to_string())); + } + + if let Some(magic_guess) = &self.magic_guess { + guessed_types.push(magic_guess.to_string()) + } + + guessed_types + } +} + +pub struct Guesser { + magic_cookie: magic::Cookie, +} + +// Public Implementation +impl Guesser { + pub fn new() -> Self { + let magic_cookie = magic::Cookie::open(magic::CookieFlags::ERROR).expect("FIXME"); + // Load the default database + magic_cookie.load::<&str>(&[]).expect("FIXME"); + Guesser { magic_cookie } + } + + pub fn guess<'a, T>(&self, input: Input<'a, T>) -> Output + where + T: AsRef<[u8]>, + { + let mime_guess = input.path.map(MimeGuess::from_path); + + let magic_guess = match &input.content { + Content::None => None, + Content::Prefix(PrefixContent { content, .. 
}) | Content::Full(content) => { + self.magic_cookie.buffer(content.as_ref()).ok() + } + }; + + Output { + mime_guess, + magic_guess, + } + } +} + +impl Default for Guesser { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/noseyparker/Cargo.toml b/crates/noseyparker/Cargo.toml index 6d3e39fe4..c0ba6a121 100644 --- a/crates/noseyparker/Cargo.toml +++ b/crates/noseyparker/Cargo.toml @@ -34,6 +34,7 @@ indicatif = { version = "0.17", features = ["improved_unicode", "rayon"] } indoc = "2.0" ignore = "0.4" lazy_static = "1.4" +noseyparker_content_guesser = { path = "../noseyparker-content-guesser" } regex = "1.7" reqwest = { version = "0.11", features = ["json", "native-tls-vendored"] } rusqlite = { version = "0.29", features = ["bundled", "backup"] } diff --git a/crates/noseyparker/src/input_enumerator.rs b/crates/noseyparker/src/input_enumerator.rs index 377cb05dd..64e27eaef 100644 --- a/crates/noseyparker/src/input_enumerator.rs +++ b/crates/noseyparker/src/input_enumerator.rs @@ -247,8 +247,8 @@ impl FilesystemEnumerator { /// /// This can be used to skip entire directories. pub fn filter_entry

(&mut self, filter: P) -> &mut Self - where - P: Fn(&DirEntry) -> bool + Send + Sync + 'static + where + P: Fn(&DirEntry) -> bool + Send + Sync + 'static, { self.walk_builder.filter_entry(filter); self @@ -284,7 +284,7 @@ pub fn open_git_repo(path: &Path) -> Result> { match gix::open_opts(path, opts) { Err(gix::open::Error::NotARepository { .. }) => Ok(None), Err(err) => Err(err.into()), - Ok(r) => Ok(Some(r)), + Ok(repo) => Ok(Some(repo)), } } diff --git a/crates/noseyparker/src/lib.rs b/crates/noseyparker/src/lib.rs index 0d7de511e..f65c7e491 100644 --- a/crates/noseyparker/src/lib.rs +++ b/crates/noseyparker/src/lib.rs @@ -13,6 +13,7 @@ pub mod location; pub mod match_type; pub mod matcher; pub mod matcher_stats; +pub use noseyparker_content_guesser as content_guesser; pub mod progress; pub mod provenance; #[cfg(feature = "rule_profiling")] diff --git a/crates/noseyparker/src/provenance.rs b/crates/noseyparker/src/provenance.rs index 0edefbe3b..60a68f4c7 100644 --- a/crates/noseyparker/src/provenance.rs +++ b/crates/noseyparker/src/provenance.rs @@ -15,3 +15,12 @@ pub enum Provenance { path: PathBuf, }, } + +impl std::fmt::Display for Provenance { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Provenance::File { path } => write!(f, "file {:?}", path), + Provenance::GitRepo { path } => write!(f, "git repo {:?}", path), + } + } +} From 9cdc6a47d243b95f4aaa47f8b6ff350fdeedf503 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Thu, 22 Jun 2023 13:17:41 -0400 Subject: [PATCH 07/16] Update gix; Update cargo dependencies with `cargo update` --- Cargo.lock | 232 ++++++++++++++++++---------------- crates/noseyparker/Cargo.toml | 2 +- 2 files changed, 125 insertions(+), 109 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 261188ff6..6fd0325fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -77,15 +77,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.0" +version = "1.0.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" +checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" [[package]] name = "anstyle-parse" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e765fd216e48e067936442276d1d57399e37bce53c264d6fefbe298080cb57ee" +checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" dependencies = [ "utf8parse", ] @@ -198,7 +198,7 @@ dependencies = [ "lazycell", "log", "peeking_take_while", - "prettyplease 0.2.8", + "prettyplease 0.2.9", "proc-macro2", "quote", "regex", @@ -333,9 +333,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.4" +version = "4.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80672091db20273a15cf9fdd4e47ed43b5091ec9841bf4c6145c9dfbbcae09ed" +checksum = "2686c4115cb0810d9a984776e197823d08ec94f176549a89a9efded477c456dc" dependencies = [ "clap_builder", "clap_derive", @@ -344,9 +344,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.3.4" +version = "4.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1458a1df40e1e2afebb7ab60ce55c1fa8f431146205aa5f4887e0b111c27636" +checksum = "2e53afce1efce6ed1f633cf0e57612fe51db54a1ee4fd8f8503d078fe02d69ae" dependencies = [ "anstream", "anstyle", @@ -906,9 +906,9 @@ dependencies = [ [[package]] name = "gix" -version = "0.46.0" +version = "0.47.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99368b48a2f68c3fdc26e62c6425bdc4baeb4f30a4f24eb2e0904d29a2ba97ab" +checksum = "10f5281c55e0a7415877d91a15fae4a10ec7444615d64d78e48c07f20bcfcd9b" dependencies = [ "gix-actor", "gix-attributes", @@ -938,6 +938,7 @@ dependencies = [ "gix-revision", "gix-sec", "gix-tempfile", + "gix-trace", "gix-traverse", "gix-url", "gix-utils", @@ -953,9 +954,9 @@ dependencies = [ 
[[package]] name = "gix-actor" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fe73f9f6be1afbf1bd5be919a9636fa560e2f14d42262a934423ed6760cd838" +checksum = "b70d0d809ee387113df810ab4ebe585a076e35ae6ed59b5b280072146955a3ff" dependencies = [ "bstr", "btoi", @@ -967,9 +968,9 @@ dependencies = [ [[package]] name = "gix-attributes" -version = "0.13.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b79590ac382f80d87e06416f5fcac6fee5d83dcb152a00ed0bdbaa988acc31" +checksum = "03d7006cc5a508514207154046e18c3c39d98ba98f865ada83b6f3f3886543bb" dependencies = [ "bstr", "gix-glob", @@ -984,36 +985,36 @@ dependencies = [ [[package]] name = "gix-bitmap" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc02feb20ad313d52a450852f2005c2205d24f851e74d82b7807cbe12c371667" +checksum = "311e2fa997be6560c564b070c5da2d56d038b645a94e1e5796d5d85a350da33c" dependencies = [ "thiserror", ] [[package]] name = "gix-chunk" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7acf3bc6c4b91e8fb260086daf5e105ea3a6d913f5fd3318137f7e309d6e540" +checksum = "39db5ed0fc0a2e9b1b8265993f7efdbc30379dec268f3b91b7af0c2de4672fdd" dependencies = [ "thiserror", ] [[package]] name = "gix-command" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6141b70cfb21255223e42f3379855037cbbe8673b58dd8318d2f09b516fad1" +checksum = "bb49ab557a37b0abb2415bca2b10e541277dff0565deb5bd5e99fd95f93f51eb" dependencies = [ "bstr", ] [[package]] name = "gix-commitgraph" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8490ae1b3d55c47e6a71d247c082304a2f79f8d0332c1a2f5693d42a2021a09" +checksum = 
"0e498e98d0b477d6a1c1608bee39db201e7a38873460a130a97ce88b4d95b6e1" dependencies = [ "bstr", "gix-chunk", @@ -1025,9 +1026,9 @@ dependencies = [ [[package]] name = "gix-config" -version = "0.23.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51f310120ae1ba8f0ca52fb22876ce9bad5b15c8ffb3eb7302e4b64a3b9f681c" +checksum = "33b32541232a2c626849df7843e05b50cb43ac38a4f675abbe2f661874fc1e9d" dependencies = [ "bstr", "gix-config-value", @@ -1047,9 +1048,9 @@ dependencies = [ [[package]] name = "gix-config-value" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f216df1c33e6e1555923eff0096858a879e8aaadd35b5d788641e4e8064c892" +checksum = "4783caa23062f86acfd1bc9e72c62250923d1673171ce1a524d9486f8a4556a8" dependencies = [ "bitflags 2.3.2", "bstr", @@ -1060,9 +1061,9 @@ dependencies = [ [[package]] name = "gix-credentials" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6f89fea8acd28f5ef8fa5042146f1637afd4d834bc8f13439d8fd1e5aca0d65" +checksum = "7dcec518a8db5b2e342ea7a2e785f46fd176b1b689ddd3f43052701bf3fa8ee3" dependencies = [ "bstr", "gix-command", @@ -1076,9 +1077,9 @@ dependencies = [ [[package]] name = "gix-date" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc164145670e9130a60a21670d9b6f0f4f8de04e5dd256c51fa5a0340c625902" +checksum = "0213f923d63c2c7d10799c1977f42df38ec586ebbf1d14fd00dfa363ac994c2b" dependencies = [ "bstr", "itoa", @@ -1088,9 +1089,9 @@ dependencies = [ [[package]] name = "gix-diff" -version = "0.30.1" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9029ad0083cc286a4bd2f5b3bf66bb66398abc26f2731a2824cd5edfc41a0e33" +checksum = "5049dd5a60d5608912da0ab184f35064901f192f4adf737716789715faffa080" dependencies = [ "gix-hash", "gix-object", @@ -1100,9 
+1101,9 @@ dependencies = [ [[package]] name = "gix-discover" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba9c6c0d1f2b2efe65581de73de4305004612d49c83773e783202a7ef204f46" +checksum = "c14865cb9c6eb817d6a8d53595f1051239d2d31feae7a5e5b2f00910c94a8eb4" dependencies = [ "bstr", "dunce", @@ -1115,15 +1116,16 @@ dependencies = [ [[package]] name = "gix-features" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a8c493409bf6060d408eec9bbdd1b12ea351266b50012e2a522f75dfc7b8314" +checksum = "ae82dfceec06c034728c530399ee449f97b1e542e191247c52c169ca6af1fd89" dependencies = [ "bytesize", "crc32fast", "crossbeam-channel", "flate2", "gix-hash", + "gix-trace", "jwalk", "libc", "once_cell", @@ -1137,18 +1139,18 @@ dependencies = [ [[package]] name = "gix-fs" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30da8997008adb87f94e15beb7ee229f8a48e97af585a584bfee4a5a1880aab5" +checksum = "bb15956bc0256594c62a2399fcf6958a02a11724217eddfdc2b49b21b6292496" dependencies = [ "gix-features", ] [[package]] name = "gix-glob" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0ade1e80ab1f079703d1824e1daf73009096386aa7fd2f0477f6e4ac0a558e" +checksum = "f45cd7ab22faf154db0a9f5a8011ba9cda8b298b61b7299f43a21bbaf0b3f208" dependencies = [ "bitflags 2.3.2", "bstr", @@ -1158,9 +1160,9 @@ dependencies = [ [[package]] name = "gix-hash" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee181c85d3955f54c4426e6bfaeeada4428692e1a39b8788c2ac7785fc301dd8" +checksum = "a0dd58cdbe7ffa4032fc111864c80d5f8cecd9a2c9736c97ae7e5be834188272" dependencies = [ "hex", "thiserror", @@ -1168,9 +1170,9 @@ dependencies = [ [[package]] name = "gix-hashtable" -version = "0.2.1" +version = 
"0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd259bd0d96e6153e357a8cdaca76c48e103fd34208b6c0ce77b1ad995834bd2" +checksum = "2cfd7f4ea905c13579565e3c264ca2c4103d192bd5fce2300c5a884cf1977d61" dependencies = [ "gix-hash", "hashbrown 0.13.2", @@ -1179,9 +1181,9 @@ dependencies = [ [[package]] name = "gix-ignore" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc6f7f101a0ccce808dbf7008ba131dede94e20257e7bde7a44cbb2f8c775625" +checksum = "27e82dec6975012b710837c6cd56353c3111d2308e016118bfc59275fcc8b5d0" dependencies = [ "bstr", "gix-glob", @@ -1191,9 +1193,9 @@ dependencies = [ [[package]] name = "gix-index" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca0380cdab7863e67966eee4aed32856c864c20b077e026b637af6bb3a9281b4" +checksum = "2ef2fa392d351e62ac3a6309146f61880abfbe0c07474e075d3b2ac78a6834a5" dependencies = [ "bitflags 2.3.2", "bstr", @@ -1213,9 +1215,9 @@ dependencies = [ [[package]] name = "gix-lock" -version = "6.0.0" +version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ec5d5e6f07316d3553aa7425e3ecd935ec29882556021fe1696297a448af8d2" +checksum = "328f50aad713ab606caeaf834459ef915ccdfbb9133ac6cd54616d601aa9249f" dependencies = [ "gix-tempfile", "gix-utils", @@ -1224,39 +1226,42 @@ dependencies = [ [[package]] name = "gix-mailmap" -version = "0.13.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4653701922c920e009f1bc4309feaff14882ade017770788f9a150928da3fa6a" +checksum = "d0bef8d360a6a9fc5a6d872471588d8ca7db77b940e48ff20c3b4706ad5f481d" dependencies = [ "bstr", "gix-actor", + "gix-date", "thiserror", ] [[package]] name = "gix-negotiate" -version = "0.2.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"945c3ef1e912e44a5f405fc9e924edf42000566a1b257ed52cb1293300f6f08c" +checksum = "b626aafb9f4088058f1baa5d2029b2191820c84f6c81e43535ba70bfdc7b7d56" dependencies = [ "bitflags 2.3.2", "gix-commitgraph", + "gix-date", "gix-hash", "gix-object", - "gix-revision", + "gix-revwalk", "smallvec", "thiserror", ] [[package]] name = "gix-object" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8926c8f51c44dec3e709cb5dbc93deb9e8d4064c43c9efc54c158dcdfe8446c7" +checksum = "255e477ae4cc8d10778238f011e6125b01cc0e7067dc8df87acd67a428a81f20" dependencies = [ "bstr", "btoi", "gix-actor", + "gix-date", "gix-features", "gix-hash", "gix-validate", @@ -1269,11 +1274,12 @@ dependencies = [ [[package]] name = "gix-odb" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91d98eaba4f649fed17250651c4ddfaf997c80a30f5ee4b47ac9bc18ffe3eb16" +checksum = "6b73469f145d1e6afbcfd0ab6499a366fbbcb958c2999d41d283d6c7b94024b9" dependencies = [ "arc-swap", + "gix-date", "gix-features", "gix-hash", "gix-object", @@ -1287,9 +1293,9 @@ dependencies = [ [[package]] name = "gix-pack" -version = "0.37.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e9e228f18cd87e7596e687b38619b5e4caebc678644ae6bb3d842598166d72" +checksum = "a1f3bcd1aaa72aea7163b147d2bde2480a01eadefc774a479d38f29920f7f1c8" dependencies = [ "clru", "gix-chunk", @@ -1310,11 +1316,12 @@ dependencies = [ [[package]] name = "gix-path" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1226f2e50adeb4d76c754c1856c06f13a24cad1624801653fbf09b869e5b808" +checksum = "4ea2a19d82dd55e5fad1d606b8a1ad2f7a804e10caa2efbb169cd37e0a07ede0" dependencies = [ "bstr", + "gix-trace", "home", "once_cell", "thiserror", @@ -1322,9 +1329,9 @@ dependencies = [ [[package]] name = "gix-prompt" -version = "0.5.1" +version = 
"0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e15fe57fa48572b7d3bf465d6a2a0351cd3c55cba74fd5f0b9c23689f9c1a31e" +checksum = "8dfd363fd89a40c1e7bff9c9c1b136cd2002480f724b0c627c1bc771cd5480ec" dependencies = [ "gix-command", "gix-config-value", @@ -1335,9 +1342,9 @@ dependencies = [ [[package]] name = "gix-quote" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29d59489bff95b06dcdabe763b7266d3dc0a628cac1ac1caf65a7ca0a43eeae0" +checksum = "3874de636c2526de26a3405b8024b23ef1a327bebf4845d770d00d48700b6a40" dependencies = [ "bstr", "btoi", @@ -1346,11 +1353,12 @@ dependencies = [ [[package]] name = "gix-ref" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebdd999256f4ce8a5eefa89999879c159c263f3493a951d62aa5ce42c0397e1c" +checksum = "9b6c74873a9d8ff5d1310f2325f09164c15a91402ab5cde4d479ae12ff55ed69" dependencies = [ "gix-actor", + "gix-date", "gix-features", "gix-fs", "gix-hash", @@ -1366,9 +1374,9 @@ dependencies = [ [[package]] name = "gix-refspec" -version = "0.11.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72bfd622abc86dd8ad1ec51b9eb77b4f1a766b94e3a1b87cf4a022c5b5570cf4" +checksum = "ca1bc6c40bad62570683d642fcb04e977433ac8f76b674860ef7b1483c1f8990" dependencies = [ "bstr", "gix-hash", @@ -1380,9 +1388,9 @@ dependencies = [ [[package]] name = "gix-revision" -version = "0.15.2" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5044f56cd7a487ce9b034cbe0252ae0b6b47ff56ca3dabd79bc30214d0932cd7" +checksum = "f3751d6643d731fc5829d2f43ca049f4333c968f30908220ba0783c9dfe5010c" dependencies = [ "bstr", "gix-date", @@ -1395,11 +1403,12 @@ dependencies = [ [[package]] name = "gix-revwalk" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bc2623ba8747914f151f5e12b65adac576ab459dbed5f50a36c7a3e9cbf2d3ca" +checksum = "144995229c6e5788b1c7386f8a3f7146ace3745c9a6b56cef9123a7d83b110c5" dependencies = [ "gix-commitgraph", + "gix-date", "gix-hash", "gix-hashtable", "gix-object", @@ -1409,9 +1418,9 @@ dependencies = [ [[package]] name = "gix-sec" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b7b38b766eb95dcc5350a9c450030b69892c0902fa35f4a6d0809273bd9dae" +checksum = "47f09860e2ddc7b13119e410c46d8e9f870acc7933fb53ae65817af83a8c9f80" dependencies = [ "bitflags 2.3.2", "gix-path", @@ -1421,9 +1430,9 @@ dependencies = [ [[package]] name = "gix-tempfile" -version = "6.0.0" +version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3785cb010e9dc5c446dfbf02bc1119fc17d3a48a27c029efcb3a3c32953eb10" +checksum = "4fac8310c17406ea619af72f42ee46dac795110f68f41b4f4fa231b69889c6a2" dependencies = [ "gix-fs", "libc", @@ -1434,13 +1443,20 @@ dependencies = [ "tempfile", ] +[[package]] +name = "gix-trace" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ff8a60073500f4d6edd181432ee11394d843db7dcf05756aa137a1233b1cbf6" + [[package]] name = "gix-traverse" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8673546506391a10fdfd4e48c8e0f3ec92355cf1fac787d2e714c7d45e301ede" +checksum = "c3f6bba1686bfbc7e0e93d4932bc6e14d479c9c9524f7c8d65b25d2a9446a99e" dependencies = [ "gix-commitgraph", + "gix-date", "gix-hash", "gix-hashtable", "gix-object", @@ -1451,9 +1467,9 @@ dependencies = [ [[package]] name = "gix-url" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1663df25ac42047a2547618d2a6979a26f478073f6306997429235d2cd4c863" +checksum = "ff1f984816338039b151a9f5dae6100e1e51e438cf61242ea8136fedc574d825" dependencies = [ "bstr", "gix-features", 
@@ -1465,18 +1481,18 @@ dependencies = [ [[package]] name = "gix-utils" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbcfcb150c7ef553d76988467d223254045bdcad0dc6724890f32fbe96415da5" +checksum = "1ca284c260845bc0724050aec59c7a596407678342614cdf5a1d69e044f29a36" dependencies = [ "fastrand", ] [[package]] name = "gix-validate" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57ea5845b506c7728b9d89f4227cc369a5fc5a1d5b26c3add0f0d323413a3a60" +checksum = "8d092b594c8af00a3a31fe526d363ee8a51a6f29d8496cdb991ed2f01ec0ec13" dependencies = [ "bstr", "thiserror", @@ -1484,9 +1500,9 @@ dependencies = [ [[package]] name = "gix-worktree" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b32a0e7ed52577bfb050f5350bdee2741d1b08a9ed02a2f2df6effe353896ca" +checksum = "4ee22549d6723189366235e1c6959ccdac73b58197cdbb437684eaa2169edcb9" dependencies = [ "bstr", "filetime", @@ -1815,9 +1831,9 @@ checksum = "9f2cb48b81b1dc9f39676bf99f5499babfec7cd8fe14307f7b3d747208fb5690" [[package]] name = "insta" -version = "1.29.0" +version = "1.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a28d25139df397cbca21408bb742cf6837e04cdbebf1b07b760caf971d6a972" +checksum = "28491f7753051e5704d4d0ae7860d45fae3238d7d235bc4289dcd45c48d3cec3" dependencies = [ "console", "lazy_static", @@ -2276,9 +2292,9 @@ checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "openssl" -version = "0.10.54" +version = "0.10.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69b3f656a17a6cbc115b5c7a40c616947d213ba182135b014d6051b73ab6f019" +checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" dependencies = [ "bitflags 1.3.2", "cfg-if", @@ -2317,9 +2333,9 @@ dependencies = [ [[package]] name = 
"openssl-sys" -version = "0.9.88" +version = "0.9.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2ce0f250f34a308dcfdbb351f511359857d4ed2134ba715a4eadd46e1ffd617" +checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" dependencies = [ "cc", "libc", @@ -2380,9 +2396,9 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pest" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16833386b02953ca926d19f64af613b9bf742c48dcd5e09b32fbfc9740bf84e2" +checksum = "f73935e4d55e2abf7f130186537b19e7a4abc886a0252380b59248af473a3fc9" dependencies = [ "thiserror", "ucd-trie", @@ -2390,9 +2406,9 @@ dependencies = [ [[package]] name = "pest_derive" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7763190f9406839f99e5197afee8c9e759969f7dbfa40ad3b8dbee8757b745b5" +checksum = "aef623c9bbfa0eedf5a0efba11a5ee83209c326653ca31ff019bec3a95bfff2b" dependencies = [ "pest", "pest_generator", @@ -2400,9 +2416,9 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "249061b22e99973da1f5f5f1410284419e283bb60b79255bf5f42a94b66a2e00" +checksum = "b3e8cba4ec22bada7fc55ffe51e2deb6a0e0db2d0b7ab0b103acc80d2510c190" dependencies = [ "pest", "pest_meta", @@ -2413,9 +2429,9 @@ dependencies = [ [[package]] name = "pest_meta" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "457c310cfc9cf3f22bc58901cc7f0d3410ac5d6298e432a4f9a6138565cb6df6" +checksum = "a01f71cb40bd8bb94232df14b946909e14660e33fc05db3e50ae2a82d7ea0ca0" dependencies = [ "once_cell", "pest", @@ -2507,9 +2523,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.8" +version = "0.2.9" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b0377b720bde721213a46cda1289b2f34abf0a436907cad91578c20de0454d" +checksum = "9825a04601d60621feed79c4e6b56d65db77cdca55cef43b46b0de1096d1c282" dependencies = [ "proc-macro2", "syn 2.0.18", @@ -3324,9 +3340,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.25" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8803eee176538f94ae9a14b55b2804eb7e1441f8210b1c31290b3bccdccff73b" +checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", diff --git a/crates/noseyparker/Cargo.toml b/crates/noseyparker/Cargo.toml index c0ba6a121..03871881e 100644 --- a/crates/noseyparker/Cargo.toml +++ b/crates/noseyparker/Cargo.toml @@ -25,7 +25,7 @@ atoi = "2.0" bstr = { version = "1.0", features = ["serde"] } chrono = { version = "0.4", default_features = false, features = ["std"] } console = "0.15" -gix = { version = "0.46", features = ["max-performance"] } +gix = { version = "0.47", features = ["max-performance"] } hex = "0.4" hyperx = "1.4" include_dir = { version = "0.7", features = ["glob"] } From 971610f35e59e0e6ff62d0ea11705fabb1587f63 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Fri, 23 Jun 2023 18:01:48 -0400 Subject: [PATCH 08/16] Put content guessing behind a feature flag --- Cargo.toml | 14 +++- .../src/bin/noseyparker/cmd_scan.rs | 66 ++++++++++++------- crates/noseyparker/Cargo.toml | 3 +- crates/noseyparker/src/lib.rs | 1 + 4 files changed, 58 insertions(+), 26 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 31fa1d300..012864068 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,17 @@ [workspace] -members = ["crates/*"] +members = [ + "crates/noseyparker", + "crates/noseyparker-cli", + "crates/noseyparker-content-guesser", + "crates/vectorscan", + "crates/vectorscan-sys", +] +default-members = [ + "crates/noseyparker", + "crates/noseyparker-cli", + 
"crates/vectorscan", + "crates/vectorscan-sys", +] [workspace.package] edition = "2021" diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index 1054c1705..2f507de80 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -11,7 +11,12 @@ use crate::args; use noseyparker::blob::Blob; use noseyparker::blob_id_set::BlobIdSet; -use noseyparker::content_guesser; + +#[cfg(feature = "content_guesser")] +use noseyparker::{content_guesser, content_guesser::Guesser}; +#[cfg(not(feature = "content_guesser"))] +type Guesser = (); + use noseyparker::datastore::Datastore; use noseyparker::defaults::DEFAULT_IGNORE_RULES; use noseyparker::git_binary::{CloneMode, Git}; @@ -81,8 +86,9 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> ); let mut num_found: u64 = 0; let api_url = args.input_specifier_args.github_api_url.clone(); - for repo_string in github::enumerate_repo_urls(&repo_specifiers, api_url, Some(&mut progress)) - .context("Failed to enumerate GitHub repositories")? + for repo_string in + github::enumerate_repo_urls(&repo_specifiers, api_url, Some(&mut progress)) + .context("Failed to enumerate GitHub repositories")? 
{ match GitUrl::from_str(&repo_string) { Ok(repo_url) => repo_urls.push(repo_url), @@ -267,9 +273,16 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> let matcher_stats = Mutex::new(MatcherStats::default()); let seen_blobs = BlobIdSet::new(); - let make_matcher = || -> Result { + let make_matcher = || -> Result<(Matcher, Guesser)> { *num_matchers_counter.lock().unwrap() += 1; - Matcher::new(&rules_db, &seen_blobs, Some(&matcher_stats)) + let matcher = Matcher::new(&rules_db, &seen_blobs, Some(&matcher_stats))?; + + #[cfg(feature = "content_guesser")] + let guesser = content_guesser::Guesser::new(); + #[cfg(not(feature = "content_guesser"))] + let guesser = (); + + Ok((matcher, guesser)) }; // a function to convert BlobMatch into regular Match @@ -325,7 +338,25 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> .expect("should be able to start datastore writer thread") }; - let run_matcher = |send_matches: &crossbeam_channel::Sender::>, matcher: &mut Matcher, provenance: Provenance, blob: Blob| { + let run_matcher = |send_matches: &crossbeam_channel::Sender>, + matcher_guesser: &mut (Matcher, Guesser), + provenance: Provenance, + blob: Blob| { + #[allow(unused_variables)] + let (matcher, guesser) = matcher_guesser; + + #[cfg(feature = "content_guesser")] + { + let input = match provenance { + Provenance::File { path } => { + content_guesser::Input::from_path_and_bytes(&path, &blob.bytes) + } + Provenance::GitRepo {} => content_guesser::Input::from_bytes(&blob.bytes), + }; + let guess = guesser.guess(input); + info!("*** {}: {:?}", blob_id, guess.guessed_types()); + } + let matches = match matcher.scan_blob(&blob, &provenance) { Err(e) => { error!("Failed to scan blob {} from {}: {}", blob.id, provenance, e); @@ -348,10 +379,10 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> inputs.files.par_iter().for_each_init( || { let matcher = make_matcher().expect("should be able 
to create a matcher"); - let guesser = content_guesser::Guesser::new(); - (matcher, guesser, progress.clone()) + + (matcher, progress.clone()) }, - |(matcher, guesser, progress), file_result: &FileResult| { + |(matcher, progress), file_result: &FileResult| { let fname = &file_result.path; let blob = match Blob::from_file(fname) { Err(e) => { @@ -365,12 +396,6 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> path: fname.clone(), }; - { - let input = content_guesser::Input::from_path_and_bytes(fname.as_path(), &blob.bytes); - let guess = guesser.guess(input); - info!("*** {:?}: {:?}", fname, guess.guessed_types()); - } - run_matcher(&send_matches, matcher, provenance, blob); }, ); @@ -401,10 +426,9 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> || { let repo = repository.to_thread_local(); let matcher = make_matcher().expect("should be able to create a matcher"); - let guesser = content_guesser::Guesser::new(); - (repo, matcher, guesser, progress.clone()) + (repo, matcher, progress.clone()) }, - |(repo, matcher, guesser, progress), (blob_id, size)| { + |(repo, matcher, progress), (blob_id, size)| { progress.inc(*size); let path = &git_repo_result.path; // debug!("Scanning {} size {} from {:?}", oid, size, path); @@ -428,12 +452,6 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> path: path.to_path_buf(), }; - { - let input = content_guesser::Input::from_bytes(&blob.bytes); - let guess = guesser.guess(input); - info!("*** {}: {:?}", blob_id, guess.guessed_types()); - } - run_matcher(&send_matches, matcher, provenance, blob); }, ); diff --git a/crates/noseyparker/Cargo.toml b/crates/noseyparker/Cargo.toml index 03871881e..25523faf6 100644 --- a/crates/noseyparker/Cargo.toml +++ b/crates/noseyparker/Cargo.toml @@ -17,6 +17,7 @@ path = "src/lib.rs" [features] rule_profiling = [] +content_guesser = ["noseyparker_content_guesser"] [dependencies] # anyhow = { version = 
"1.0", features = ["backtrace"] } # add backtraces to errors -- not sure how expensive this is @@ -34,7 +35,7 @@ indicatif = { version = "0.17", features = ["improved_unicode", "rayon"] } indoc = "2.0" ignore = "0.4" lazy_static = "1.4" -noseyparker_content_guesser = { path = "../noseyparker-content-guesser" } +noseyparker_content_guesser = { path = "../noseyparker-content-guesser", optional = true } regex = "1.7" reqwest = { version = "0.11", features = ["json", "native-tls-vendored"] } rusqlite = { version = "0.29", features = ["bundled", "backup"] } diff --git a/crates/noseyparker/src/lib.rs b/crates/noseyparker/src/lib.rs index f65c7e491..e40cdeafd 100644 --- a/crates/noseyparker/src/lib.rs +++ b/crates/noseyparker/src/lib.rs @@ -13,6 +13,7 @@ pub mod location; pub mod match_type; pub mod matcher; pub mod matcher_stats; +#[cfg(feature = "content_guesser")] pub use noseyparker_content_guesser as content_guesser; pub mod progress; pub mod provenance; From ce2a383b502474c4ab87fd9d6032e688bd51caa5 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Mon, 26 Jun 2023 15:50:41 -0400 Subject: [PATCH 09/16] Checkpoint --- Cargo.lock | 2 + crates/noseyparker-cli/Cargo.toml | 1 + .../src/bin/noseyparker/cmd_scan.rs | 12 +- crates/noseyparker-cli/tests/help/mod.rs | 2 +- ...est_noseyparker__help__version_long-2.snap | 6 +- crates/noseyparker-content-guesser/Cargo.toml | 1 + .../noseyparker-content-guesser/src/error.rs | 5 + .../noseyparker-content-guesser/src/input.rs | 85 ++++++++++++ crates/noseyparker-content-guesser/src/lib.rs | 129 +++--------------- .../noseyparker-content-guesser/src/output.rs | 26 ++++ crates/noseyparker/Cargo.toml | 1 + crates/noseyparker/src/git_binary.rs | 43 ++---- crates/noseyparker/src/github/client.rs | 21 +-- .../noseyparker/src/github/client_builder.rs | 5 +- crates/noseyparker/src/github/error.rs | 41 ++---- crates/noseyparker/src/github/models/page.rs | 18 ++- 16 files changed, 193 insertions(+), 205 deletions(-) create mode 100644 
crates/noseyparker-content-guesser/src/error.rs create mode 100644 crates/noseyparker-content-guesser/src/input.rs create mode 100644 crates/noseyparker-content-guesser/src/output.rs diff --git a/Cargo.lock b/Cargo.lock index 6fd0325fe..f0e7edfc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2181,6 +2181,7 @@ dependencies = [ "secrecy", "serde", "serde_yaml", + "thiserror", "tokio", "tracing", "url", @@ -2228,6 +2229,7 @@ dependencies = [ "magic", "mime", "mime_guess", + "thiserror", ] [[package]] diff --git a/crates/noseyparker-cli/Cargo.toml b/crates/noseyparker-cli/Cargo.toml index a2742d37a..e0a0ec4d6 100644 --- a/crates/noseyparker-cli/Cargo.toml +++ b/crates/noseyparker-cli/Cargo.toml @@ -21,6 +21,7 @@ build = "build.rs" [features] rule_profiling = ["noseyparker/rule_profiling"] +content_guesser = ["noseyparker/content_guesser"] [[bin]] name = "noseyparker" diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index 2f507de80..6f61e69c8 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -278,7 +278,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> let matcher = Matcher::new(&rules_db, &seen_blobs, Some(&matcher_stats))?; #[cfg(feature = "content_guesser")] - let guesser = content_guesser::Guesser::new(); + let guesser = content_guesser::Guesser::new()?; #[cfg(not(feature = "content_guesser"))] let guesser = (); @@ -347,14 +347,16 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> #[cfg(feature = "content_guesser")] { - let input = match provenance { + let input = match &provenance { Provenance::File { path } => { - content_guesser::Input::from_path_and_bytes(&path, &blob.bytes) + content_guesser::Input::from_path_and_bytes(path, &blob.bytes) } - Provenance::GitRepo {} => content_guesser::Input::from_bytes(&blob.bytes), + Provenance::GitRepo {..} 
=> content_guesser::Input::from_bytes(&blob.bytes), }; let guess = guesser.guess(input); - info!("*** {}: {:?}", blob_id, guess.guessed_types()); + let guesses = guess.guessed_types(); + let guesses: Vec<_> = guesses.iter().map(|m| m.essence_str()).collect(); + info!("*** {}: {:?}", blob.id, guesses); } let matches = match matcher.scan_blob(&blob, &provenance) { diff --git a/crates/noseyparker-cli/tests/help/mod.rs b/crates/noseyparker-cli/tests/help/mod.rs index 7d7536930..15f467fe4 100644 --- a/crates/noseyparker-cli/tests/help/mod.rs +++ b/crates/noseyparker-cli/tests/help/mod.rs @@ -99,7 +99,7 @@ fn version_short() { fn version_long() { with_settings!({ filters => vec![ - (r"(?m)^( [^:]+:\s+).+$", r"$1") + (r"(?m)^( [^:]+:[ \t]+).*$", r"$1") ], }, { assert_cmd_snapshot!(noseyparker_success!("--version")); diff --git a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__version_long-2.snap b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__version_long-2.snap index d634dc078..c8255f891 100644 --- a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__version_long-2.snap +++ b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__version_long-2.snap @@ -1,5 +1,5 @@ --- -source: crates/noseyparker-cli/tests/test_noseyparker_help.rs +source: crates/noseyparker-cli/tests/help/mod.rs expression: stdout --- noseyparker 0.13.0-dev @@ -12,8 +12,8 @@ Build Configuration: Commit Branch: Commit SHA: - Cargo Features: - + Cargo Features: + Debug: Optimization: Target Triple: diff --git a/crates/noseyparker-content-guesser/Cargo.toml b/crates/noseyparker-content-guesser/Cargo.toml index e745f79ec..0d1675bc2 100644 --- a/crates/noseyparker-content-guesser/Cargo.toml +++ b/crates/noseyparker-content-guesser/Cargo.toml @@ -18,3 +18,4 @@ path = "src/lib.rs" magic = "0.13" mime_guess = "2" mime = "0.3" +thiserror = "1" diff --git a/crates/noseyparker-content-guesser/src/error.rs 
b/crates/noseyparker-content-guesser/src/error.rs new file mode 100644 index 000000000..f7d847dc3 --- /dev/null +++ b/crates/noseyparker-content-guesser/src/error.rs @@ -0,0 +1,5 @@ +#[derive(Debug, thiserror::Error)] +pub enum GuesserError { + #[error("libmagic error: {0}")] + MagicError(#[from] magic::MagicError), +} diff --git a/crates/noseyparker-content-guesser/src/input.rs b/crates/noseyparker-content-guesser/src/input.rs new file mode 100644 index 000000000..9b2aff653 --- /dev/null +++ b/crates/noseyparker-content-guesser/src/input.rs @@ -0,0 +1,85 @@ +use std::io::Read; +use std::path::Path; + +pub enum Content { + /// No content + None, + + /// An incomplete prefix of the entire contents of a file + Prefix(PrefixContent), + + /// The entire contents of a file + Full(T), +} + +#[allow(dead_code)] +pub struct PrefixContent { + /// The prefix of the full content + pub(crate) content: T, + + /// The length of the full content + pub(crate) full_length: Option, +} + +/// The input to a `Guesser`. +pub struct Input<'a, T> { + pub(crate) path: Option<&'a Path>, + pub(crate) content: Content, +} + +impl<'a, T> Input<'a, T> { + /// Create an `Input` from a path without any content. No I/O is performed. + pub fn from_path_no_io(path: &'a Path) -> Self { + Self { + path: Some(path), + content: Content::None, + } + } +} + +impl<'a> Input<'a, &'a [u8]> { + pub fn from_path_and_bytes(path: &'a Path, bytes: &'a [u8]) -> Self { + Input { + path: Some(path), + content: Content::Full(bytes), + } + } + + pub fn from_bytes(bytes: &'a [u8]) -> Self { + Input { + path: None, + content: Content::Full(bytes), + } + } +} + +impl<'a> Input<'a, Vec> { + /// Create an `Input` from the given path, reading at most `max_length` bytes of input. + /// If no `max_length` is given, the entire file contents are read. 
+ pub fn from_path(path: &'a Path, max_length: Option) -> std::io::Result { + let metadata = std::fs::metadata(path)?; + let expected_len = metadata.len(); + + let content = if let Some(max_length) = max_length { + let f = std::fs::File::open(path)?; + let mut buf = Vec::with_capacity(max_length); + let actual_len = f.take(max_length as u64).read_to_end(&mut buf)?; + if actual_len < expected_len as usize { + Content::Prefix(PrefixContent { + full_length: Some(expected_len as usize), + content: buf, + }) + } else { + Content::Full(buf) + } + } else { + Content::Full(std::fs::read(path)?) + }; + + Ok(Self { + path: Some(path), + content, + }) + } +} + diff --git a/crates/noseyparker-content-guesser/src/lib.rs b/crates/noseyparker-content-guesser/src/lib.rs index 22a115151..b59a5dbea 100644 --- a/crates/noseyparker-content-guesser/src/lib.rs +++ b/crates/noseyparker-content-guesser/src/lib.rs @@ -1,111 +1,16 @@ use magic; +pub use mime::Mime; use mime_guess::MimeGuess; -use std::io::Read; -use std::path::Path; -pub enum Content { - /// No content - None, +mod input; +pub use input::{Content, PrefixContent, Input}; - /// An incomplete prefix of the entire contents of a file - Prefix(PrefixContent), +mod output; +pub use output::Output; - /// The entire contents of a file - Full(T), -} - -pub struct PrefixContent { - /// The prefix of the full content - content: T, - - /// The length of the full content - full_length: Option, -} - -/// The input to a `Guesser`. -pub struct Input<'a, T> { - path: Option<&'a Path>, - content: Content, -} +mod error; +pub use error::GuesserError; -impl<'a, T> Input<'a, T> { - /// Create an `Input` from a path without any content. No I/O is performed. 
- pub fn from_path_no_io(path: &'a Path) -> Self { - Self { - path: Some(path), - content: Content::None, - } - } -} - -impl<'a> Input<'a, &'a [u8]> { - pub fn from_path_and_bytes(path: &'a Path, bytes: &'a [u8]) -> Self { - Input { - path: Some(path), - content: Content::Full(bytes), - } - } - - pub fn from_bytes(bytes: &'a [u8]) -> Self { - Input { - path: None, - content: Content::Full(bytes), - } - } -} - -impl<'a> Input<'a, Vec> { - /// Create an `Input` from the given path, reading at most `max_length` bytes of input. - /// If no `max_length` is given, the entire file contents are read. - pub fn from_path(path: &'a Path, max_length: Option) -> std::io::Result { - let metadata = std::fs::metadata(path)?; - let expected_len = metadata.len(); - - let content = if let Some(max_length) = max_length { - let f = std::fs::File::open(path)?; - let mut buf = Vec::with_capacity(max_length); - let actual_len = f.take(max_length as u64).read_to_end(&mut buf)?; - if actual_len < expected_len as usize { - Content::Prefix(PrefixContent { - full_length: Some(expected_len as usize), - content: buf, - }) - } else { - Content::Full(buf) - } - } else { - Content::Full(std::fs::read(path)?) 
- }; - - Ok(Self { - path: Some(path), - content, - }) - } -} - -pub struct Output { - /// Path-based media type guess - mime_guess: Option, - - magic_guess: Option, -} - -impl Output { - pub fn guessed_types(&self) -> Vec { - let mut guessed_types = Vec::new(); - - if let Some(mime_guess) = self.mime_guess { - guessed_types.extend(mime_guess.iter().map(|m| m.to_string())); - } - - if let Some(magic_guess) = &self.magic_guess { - guessed_types.push(magic_guess.to_string()) - } - - guessed_types - } -} pub struct Guesser { magic_cookie: magic::Cookie, @@ -113,11 +18,12 @@ pub struct Guesser { // Public Implementation impl Guesser { - pub fn new() -> Self { - let magic_cookie = magic::Cookie::open(magic::CookieFlags::ERROR).expect("FIXME"); + pub fn new() -> Result { + use magic::CookieFlags; + let magic_cookie = magic::Cookie::open(CookieFlags::ERROR | CookieFlags::MIME)?; // Load the default database - magic_cookie.load::<&str>(&[]).expect("FIXME"); - Guesser { magic_cookie } + magic_cookie.load::<&str>(&[])?; + Ok(Guesser { magic_cookie }) } pub fn guess<'a, T>(&self, input: Input<'a, T>) -> Output @@ -129,7 +35,10 @@ impl Guesser { let magic_guess = match &input.content { Content::None => None, Content::Prefix(PrefixContent { content, .. 
}) | Content::Full(content) => { - self.magic_cookie.buffer(content.as_ref()).ok() + match self.magic_cookie.buffer(content.as_ref()) { + Ok(m) => m.parse().ok(), + _ => None, + } } }; @@ -139,9 +48,3 @@ impl Guesser { } } } - -impl Default for Guesser { - fn default() -> Self { - Self::new() - } -} diff --git a/crates/noseyparker-content-guesser/src/output.rs b/crates/noseyparker-content-guesser/src/output.rs new file mode 100644 index 000000000..9ad6c0e26 --- /dev/null +++ b/crates/noseyparker-content-guesser/src/output.rs @@ -0,0 +1,26 @@ +use mime::Mime; +use mime_guess::MimeGuess; + +#[derive(Debug)] +pub struct Output { + /// Path-based media type guess + pub(crate) mime_guess: Option, + + pub(crate) magic_guess: Option, +} + +impl Output { + pub fn guessed_types(&self) -> Vec { + let mut guessed_types = Vec::new(); + + if let Some(mime_guess) = self.mime_guess { + guessed_types.extend(mime_guess); + } + + if let Some(magic_guess) = &self.magic_guess { + guessed_types.push(magic_guess.clone()); + } + + guessed_types + } +} diff --git a/crates/noseyparker/Cargo.toml b/crates/noseyparker/Cargo.toml index 25523faf6..ad5c65d68 100644 --- a/crates/noseyparker/Cargo.toml +++ b/crates/noseyparker/Cargo.toml @@ -42,6 +42,7 @@ rusqlite = { version = "0.29", features = ["bundled", "backup"] } secrecy = "0.8.0" serde = { version = "1.0", features = ["derive"] } serde_yaml = "0.9" +thiserror = "1" tokio = "1.23" tracing = "0.1" url = "2.3" diff --git a/crates/noseyparker/src/git_binary.rs b/crates/noseyparker/src/git_binary.rs index a31a26730..c8530d065 100644 --- a/crates/noseyparker/src/git_binary.rs +++ b/crates/noseyparker/src/git_binary.rs @@ -4,9 +4,15 @@ use tracing::{debug, debug_span}; use crate::git_url::GitUrl; -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum GitError { - IOError(std::io::Error), + #[error("git execution failed: {0}")] + IOError(#[from] std::io::Error), + + #[error("git execution 
failed\ncode={}\nstdout=```\n{}```\nstderr=```\n{}```", + .status, + String::from_utf8_lossy(.stdout), + String::from_utf8_lossy(.stderr))] GitError { stdout: Vec, stderr: Vec, @@ -14,39 +20,6 @@ pub enum GitError { }, } -impl From for GitError { - fn from(err: std::io::Error) -> GitError { - GitError::IOError(err) - } -} - -impl std::fmt::Display for GitError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - GitError::IOError(e) => write!(f, "git execution failed: {e}"), - GitError::GitError { - stdout, - stderr, - status, - } => write!( - f, - "git execution failed\ncode={status}\nstdout=```\n{}```\nstderr=```\n{}```", - String::from_utf8_lossy(stdout), - String::from_utf8_lossy(stderr) - ), - } - } -} - -impl std::error::Error for GitError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - GitError::IOError(e) => Some(e), - GitError::GitError { .. } => None, - } - } -} - pub struct Git { credentials: Vec, } diff --git a/crates/noseyparker/src/github/client.rs b/crates/noseyparker/src/github/client.rs index 820fa2673..8b60d7553 100644 --- a/crates/noseyparker/src/github/client.rs +++ b/crates/noseyparker/src/github/client.rs @@ -34,13 +34,13 @@ impl Client { pub async fn get_rate_limit(&self) -> Result { let response = self.get(&["rate_limit"]).await?; - let body = response.json().await.map_err(Error::ReqwestError)?; + let body = response.json().await?; Ok(body) } pub async fn get_user(&self, username: &str) -> Result { let response = self.get(&["users", username]).await?; - let body = response.json().await.map_err(Error::ReqwestError)?; + let body = response.json().await?; Ok(body) } @@ -126,12 +126,12 @@ fn url_from_path_parts_and_params( } buf.push_str(p); } - let url = base_url.join(&buf).map_err(Error::UrlParseError)?; + let url = base_url.join(&buf)?; let url = if params.is_empty() { - Url::parse(url.as_str()).map_err(Error::UrlParseError)? 
+ Url::parse(url.as_str()) } else { - Url::parse_with_params(url.as_str(), params).map_err(Error::UrlParseError)? - }; + Url::parse_with_params(url.as_str(), params) + }?; Ok(url) } @@ -258,7 +258,7 @@ impl Client { }; // send request and wait for response - let response = request_builder.send().await.map_err(Error::ReqwestError)?; + let response = request_builder.send().await?; // Check for rate limiting. // @@ -279,7 +279,7 @@ impl Client { if response.status() == StatusCode::FORBIDDEN { if let Some(retry_after) = response.headers().get("Retry-After") { let wait = atoi::atoi::(retry_after.as_bytes()).map(Duration::seconds); - let client_error = response.json().await.map_err(Error::ReqwestError)?; + let client_error = response.json().await?; return Err(Error::RateLimited { client_error, wait }); } @@ -304,11 +304,12 @@ impl Client { Some(reset_time - date) }(); - let client_error = response.json().await.map_err(Error::ReqwestError)?; + let client_error = response.json().await?; return Err(Error::RateLimited { client_error, wait }); } } - response.error_for_status().map_err(Error::ReqwestError) + let response = response.error_for_status()?; + Ok(response) } } diff --git a/crates/noseyparker/src/github/client_builder.rs b/crates/noseyparker/src/github/client_builder.rs index bae0a9e48..e625e65b5 100644 --- a/crates/noseyparker/src/github/client_builder.rs +++ b/crates/noseyparker/src/github/client_builder.rs @@ -26,7 +26,7 @@ impl ClientBuilder { /// Use the specified base URL. 
pub fn base_url(mut self, url: T) -> Result { - self.base_url = url.into_url().map_err(Error::ReqwestError)?; + self.base_url = url.into_url()?; Ok(self) } @@ -62,8 +62,7 @@ impl ClientBuilder { pub fn build(self) -> Result { let inner = reqwest::ClientBuilder::new() .user_agent(Self::USER_AGENT) - .build() - .map_err(Error::ReqwestError)?; + .build()?; Ok(Client { base_url: self.base_url, auth: self.auth, diff --git a/crates/noseyparker/src/github/error.rs b/crates/noseyparker/src/github/error.rs index 7d2f7585d..1bc8555fe 100644 --- a/crates/noseyparker/src/github/error.rs +++ b/crates/noseyparker/src/github/error.rs @@ -4,8 +4,9 @@ use super::models; // ------------------------------------------------------------------------------------------------- // Error // ------------------------------------------------------------------------------------------------- -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum Error { + #[error("request was rate-limited: {}", .client_error.message)] RateLimited { /// The client error returned by GitHub client_error: models::ClientError, @@ -13,35 +14,19 @@ pub enum Error { /// The duration to wait until trying again wait: Option, }, + + #[error("invalid base url: {0}")] UrlBaseError(url::Url), - UrlParseError(url::ParseError), + + #[error("error parsing URL: {0}")] + UrlParseError(#[from] url::ParseError), + + #[error("error building URL: component {0:?} contains a slash")] UrlSlashError(String), - ReqwestError(reqwest::Error), - InvalidTokenEnvVar(String), -} -impl std::fmt::Display for Error { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Error::RateLimited{client_error, ..} => write!(f, "request was rate-limited: {}", client_error.message), - Error::UrlBaseError(u) =>write!(f, "invalid base url: {u}"), - Error::UrlParseError(e) => write!(f, "error parsing URL: {e}"), - Error::UrlSlashError(p) => write!(f, "error building URL: component {p:?} contains a slash"), - 
Error::ReqwestError(e) => write!(f, "error making request: {e}"), - Error::InvalidTokenEnvVar(v) => write!(f, "error loading token: ill-formed value of {v} environment variable"), - } - } -} + #[error("error making request: {0}")] + ReqwestError(#[from] reqwest::Error), -impl std::error::Error for Error { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - Error::RateLimited{..} => None, - Error::UrlBaseError(_) => None, - Error::UrlParseError(e) => Some(e), - Error::UrlSlashError(_) => None, - Error::ReqwestError(e) => Some(e), - Error::InvalidTokenEnvVar(_) => None, - } - } + #[error("error loading token: ill-formed value of {0} environment variable")] + InvalidTokenEnvVar(String), } diff --git a/crates/noseyparker/src/github/models/page.rs b/crates/noseyparker/src/github/models/page.rs index e522d1ed2..8e9d4a130 100644 --- a/crates/noseyparker/src/github/models/page.rs +++ b/crates/noseyparker/src/github/models/page.rs @@ -1,4 +1,4 @@ -use crate::github::{Error, Result}; +use crate::github::Result; use url::Url; // ------------------------------------------------------------------------------------------------- @@ -9,10 +9,10 @@ pub struct Page { pub links: HeaderLinks, } -impl Page { +impl Page { pub async fn from_response(response: reqwest::Response) -> Result { let links = get_header_links(&response)?; - let items = response.json().await.map_err(Error::ReqwestError)?; + let items = response.json().await?; Ok(Page { items, links }) } } @@ -46,13 +46,17 @@ fn get_header_links(response: &reqwest::Response) -> Result { _ => None, }; if let Some(dst) = dst { - *dst = Some(Url::parse(value.link()).map_err(Error::UrlParseError)?); - }; + *dst = Some(Url::parse(value.link())?); + } } } } } - Ok(HeaderLinks { first, prev, next, last }) + Ok(HeaderLinks { + first, + prev, + next, + last, + }) } - From 9d1d3a4460d19a59b5c0c0a32f407c43772a403d Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Tue, 27 Jun 2023 17:57:56 -0400 Subject: [PATCH 
10/16] Checkpoint: blob metadata now recorded to the datastore --- CHANGELOG.md | 5 +- Cargo.lock | 2 + crates/noseyparker-cli/Cargo.toml | 1 + .../src/bin/noseyparker/cmd_scan.rs | 210 +++++++++++------- .../src/guesser.rs | 47 ++++ crates/noseyparker-content-guesser/src/lib.rs | 41 +--- .../noseyparker-content-guesser/src/output.rs | 19 +- crates/noseyparker/Cargo.toml | 1 + crates/noseyparker/src/datastore.rs | 152 ++++++++++++- 9 files changed, 348 insertions(+), 130 deletions(-) create mode 100644 crates/noseyparker-content-guesser/src/guesser.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 77fbc844f..108e22f70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,7 +42,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - When a Git repository is cloned, the default behavior is to match `git clone --bare` instead of `git clone --mirror`. This new default behavior results in cloning potentially less content, but avoids cloning content from forks from repositories hosted on GitHub. -- The command-line help has been refined for clarity +- The command-line help has been refined for clarity. + +- Scanning performance has been improved on particular workloads by as much as 2x by recording matches to the datastore in larger batches. + This is particularly relevant to heavy multithreaded scanning workloads where the inputs have many matches. 
### Fixes diff --git a/Cargo.lock b/Cargo.lock index f0e7edfc3..bcf8160b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2172,6 +2172,7 @@ dependencies = [ "indicatif", "indoc", "lazy_static", + "mime", "noseyparker_content_guesser", "pretty_assertions", "proptest", @@ -2204,6 +2205,7 @@ dependencies = [ "indoc", "insta", "lazy_static", + "mime", "noseyparker", "predicates", "pretty_assertions", diff --git a/crates/noseyparker-cli/Cargo.toml b/crates/noseyparker-cli/Cargo.toml index e0a0ec4d6..a2d2115c4 100644 --- a/crates/noseyparker-cli/Cargo.toml +++ b/crates/noseyparker-cli/Cargo.toml @@ -43,6 +43,7 @@ indicatif = { version = "0.17", features = ["improved_unicode", "rayon"] } indoc = "2.0" ignore = "0.4" lazy_static = "1.4" +mime = "0.3" noseyparker = { path = "../noseyparker" } prettytable-rs = "0.10" rayon = "1.5" diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index 6f61e69c8..b834c9391 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Context, Result}; use crossbeam_channel; use indicatif::{HumanBytes, HumanCount, HumanDuration}; +use mime::Mime; use rayon::prelude::*; use std::str::FromStr; use std::sync::Mutex; @@ -9,7 +10,7 @@ use tracing::{debug, debug_span, error, info, warn}; use crate::args; -use noseyparker::blob::Blob; +use noseyparker::blob::{Blob, BlobId}; use noseyparker::blob_id_set::BlobIdSet; #[cfg(feature = "content_guesser")] @@ -312,7 +313,11 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> // Create a channel pair for matcher threads to get their results to the datastore recorder. 
let channel_size = std::cmp::max(args.num_jobs * 32, 512); - let (send_matches, recv_matches) = crossbeam_channel::bounded::>(channel_size); + enum DatastoreMessage { + Matches(Vec), + Blob((BlobId, usize, Option)), + } + let (send_ds, recv_ds) = crossbeam_channel::bounded::(channel_size); // We create a separate thread for writing matches to the datastore. // The datastore uses SQLite, which does best with a single writer. @@ -322,13 +327,57 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> .spawn(move || { let mut num_matches = 0u64; let mut num_added = 0usize; + // keep reading until all the senders hang up; panic if recording matches fails - for matches in recv_matches { - num_matches += matches.len() as u64; + // + // accumulate messages in batches to avoid an excessive number of tiny datastore + // transactions (which kills performance) + const BATCH_SIZE: usize = 512; + let mut batch_matches: Vec = Vec::with_capacity(BATCH_SIZE); + let mut batch_blob_metadata: Vec<(BlobId, usize, Option)> = Vec::with_capacity(BATCH_SIZE); + + for message in recv_ds { + match message { + DatastoreMessage::Matches(matches) => { + batch_matches.extend(matches); + + if batch_matches.len() >= BATCH_SIZE { + num_matches += batch_matches.len() as u64; + num_added += datastore + .record_matches(&batch_matches) + .expect("should be able to record matches to the datastore"); + batch_matches.clear(); + } + } + DatastoreMessage::Blob((blob_id, len, mime)) => { + batch_blob_metadata.push((blob_id, len, mime)); + + if batch_blob_metadata.len() >= BATCH_SIZE { + datastore + .record_blob_metadata(&batch_blob_metadata) + .expect("should be able to record blob metadata to the datastore"); + batch_blob_metadata.clear() + } + } + } + } + + // record any remaining batched up items + if !batch_matches.is_empty() { + num_matches += batch_matches.len() as u64; num_added += datastore - .record_matches(&matches) + .record_matches(&batch_matches) .expect("should be 
able to record matches to the datastore"); + batch_matches.clear(); + } + + if !batch_blob_metadata.is_empty() { + datastore + .record_blob_metadata(&batch_blob_metadata) + .expect("should be able to record blob metadata to the datastore"); + batch_blob_metadata.clear() } + datastore .analyze() .expect("should be able to analyze the datastore"); @@ -338,58 +387,60 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> .expect("should be able to start datastore writer thread") }; - let run_matcher = |send_matches: &crossbeam_channel::Sender>, - matcher_guesser: &mut (Matcher, Guesser), + let run_matcher = |matcher_guesser: &mut (Matcher, Guesser), provenance: Provenance, - blob: Blob| { + blob: Blob| + -> Result<()> { #[allow(unused_variables)] let (matcher, guesser) = matcher_guesser; #[cfg(feature = "content_guesser")] - { + let mime: Option = { let input = match &provenance { Provenance::File { path } => { content_guesser::Input::from_path_and_bytes(path, &blob.bytes) } - Provenance::GitRepo {..} => content_guesser::Input::from_bytes(&blob.bytes), + Provenance::GitRepo { .. 
} => content_guesser::Input::from_bytes(&blob.bytes), }; let guess = guesser.guess(input); - let guesses = guess.guessed_types(); - let guesses: Vec<_> = guesses.iter().map(|m| m.essence_str()).collect(); - info!("*** {}: {:?}", blob.id, guesses); - } + guess.content_guess() + }; + #[cfg(not(feature = "content_guesser"))] + let mime: Option = None; + + send_ds.send(DatastoreMessage::Blob((blob.id, blob.len(), mime)))?; let matches = match matcher.scan_blob(&blob, &provenance) { Err(e) => { error!("Failed to scan blob {} from {}: {}", blob.id, provenance, e); - return; + return Ok(()); } Ok(v) => v, }; if matches.is_empty() { - return; + return Ok(()); } let matches = convert_blob_matches(&blob, matches, provenance); - send_matches - .send(matches) - .expect("should be able to send all matches"); + send_ds.send(DatastoreMessage::Matches(matches))?; + + Ok(()) }; // --------------------------------------------------------------------------------------------- // Scan plain files // --------------------------------------------------------------------------------------------- - inputs.files.par_iter().for_each_init( + inputs.files.par_iter().try_for_each_init( || { let matcher = make_matcher().expect("should be able to create a matcher"); (matcher, progress.clone()) }, - |(matcher, progress), file_result: &FileResult| { + |(matcher, progress), file_result: &FileResult| -> Result<()> { let fname = &file_result.path; let blob = match Blob::from_file(fname) { Err(e) => { error!("Failed to load blob from {}: {}", fname.display(), e); - return; + return Ok(()); } Ok(v) => v, }; @@ -398,73 +449,80 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> path: fname.clone(), }; - run_matcher(&send_matches, matcher, provenance, blob); + run_matcher(matcher, provenance, blob)?; + + Ok(()) }, - ); + )?; // --------------------------------------------------------------------------------------------- // Scan Git repo inputs // 
--------------------------------------------------------------------------------------------- - inputs.git_repos.par_iter().for_each(|git_repo_result| { - let repository = match open_git_repo(&git_repo_result.path) { - Ok(Some(repository)) => repository.into_sync(), - Ok(None) => { - error!( - "Failed to re-open previously-found repository at {}", - git_repo_result.path.display() - ); - return; - } - Err(err) => { - error!( - "Failed to re-open previously-found repository at {}: {err}", - git_repo_result.path.display() - ); - return; - } - }; + inputs + .git_repos + .par_iter() + .try_for_each(|git_repo_result| -> Result<()> { + let repository = match open_git_repo(&git_repo_result.path) { + Ok(Some(repository)) => repository.into_sync(), + Ok(None) => { + error!( + "Failed to re-open previously-found repository at {}", + git_repo_result.path.display() + ); + return Ok(()); + } + Err(err) => { + error!( + "Failed to re-open previously-found repository at {}: {err}", + git_repo_result.path.display() + ); + return Ok(()); + } + }; - git_repo_result.blobs.par_iter().for_each_init( - || { - let repo = repository.to_thread_local(); - let matcher = make_matcher().expect("should be able to create a matcher"); - (repo, matcher, progress.clone()) - }, - |(repo, matcher, progress), (blob_id, size)| { - progress.inc(*size); - let path = &git_repo_result.path; - // debug!("Scanning {} size {} from {:?}", oid, size, path); - - let blob = match repo.find_object(blob_id) { - Err(e) => { - error!( - "Failed to read blob {} from Git repository at {}: {}", - blob_id, - path.display(), - e - ); - return; - } - Ok(mut blob) => { - let data = std::mem::take(&mut blob.data); // avoid a copy - Blob::new(*blob_id, data) - } - }; - let provenance = Provenance::GitRepo { - path: path.to_path_buf(), - }; + git_repo_result.blobs.par_iter().try_for_each_init( + || { + let repo = repository.to_thread_local(); + let matcher = make_matcher().expect("should be able to create a matcher"); + (repo, 
matcher, progress.clone()) + }, + |(repo, matcher, progress), (blob_id, size)| -> Result<()> { + progress.inc(*size); + let path = &git_repo_result.path; + // debug!("Scanning {} size {} from {:?}", oid, size, path); + + let blob = match repo.find_object(blob_id) { + Err(e) => { + error!( + "Failed to read blob {} from Git repository at {}: {}", + blob_id, + path.display(), + e + ); + return Ok(()); + } + Ok(mut blob) => { + let data = std::mem::take(&mut blob.data); // avoid a copy + Blob::new(*blob_id, data) + } + }; + let provenance = Provenance::GitRepo { + path: path.to_path_buf(), + }; - run_matcher(&send_matches, matcher, provenance, blob); - }, - ); - }); + run_matcher(matcher, provenance, blob)?; + + Ok(()) + }, + ) + })?; // --------------------------------------------------------------------------------------------- // Wait for all inputs to be scanned and the database thread to finish // --------------------------------------------------------------------------------------------- // Get rid of the reference to the sending channel after starting the scanners, // to ensure things terminate as expected. 
- drop(send_matches); + drop(send_ds); let (datastore, num_matches, num_new_matches) = datastore_writer.join().unwrap(); progress.finish(); diff --git a/crates/noseyparker-content-guesser/src/guesser.rs b/crates/noseyparker-content-guesser/src/guesser.rs new file mode 100644 index 000000000..22d4b6150 --- /dev/null +++ b/crates/noseyparker-content-guesser/src/guesser.rs @@ -0,0 +1,47 @@ +use magic; +use mime_guess::MimeGuess; + +use crate::{ + error::GuesserError, + input::{Content, Input, PrefixContent}, + output::Output, +}; + +pub struct Guesser { + magic_cookie: magic::Cookie, +} + +// Public Implementation +impl Guesser { + pub fn new() -> Result { + use magic::CookieFlags; + let flags = CookieFlags::ERROR | CookieFlags::MIME; + assert!(!flags.contains(CookieFlags::DEBUG)); + let magic_cookie = magic::Cookie::open(flags)?; + // Load the default database + magic_cookie.load::<&str>(&[])?; + Ok(Guesser { magic_cookie }) + } + + pub fn guess<'a, T>(&self, input: Input<'a, T>) -> Output + where + T: AsRef<[u8]>, + { + let mime_guess = input.path.map(MimeGuess::from_path); + + let magic_guess = match &input.content { + Content::None => None, + Content::Prefix(PrefixContent { content, .. 
}) | Content::Full(content) => { + match self.magic_cookie.buffer(content.as_ref()) { + Ok(m) => m.parse().ok(), + _ => None, + } + } + }; + + Output { + mime_guess, + magic_guess, + } + } +} diff --git a/crates/noseyparker-content-guesser/src/lib.rs b/crates/noseyparker-content-guesser/src/lib.rs index b59a5dbea..824175799 100644 --- a/crates/noseyparker-content-guesser/src/lib.rs +++ b/crates/noseyparker-content-guesser/src/lib.rs @@ -1,6 +1,4 @@ -use magic; pub use mime::Mime; -use mime_guess::MimeGuess; mod input; pub use input::{Content, PrefixContent, Input}; @@ -11,40 +9,5 @@ pub use output::Output; mod error; pub use error::GuesserError; - -pub struct Guesser { - magic_cookie: magic::Cookie, -} - -// Public Implementation -impl Guesser { - pub fn new() -> Result { - use magic::CookieFlags; - let magic_cookie = magic::Cookie::open(CookieFlags::ERROR | CookieFlags::MIME)?; - // Load the default database - magic_cookie.load::<&str>(&[])?; - Ok(Guesser { magic_cookie }) - } - - pub fn guess<'a, T>(&self, input: Input<'a, T>) -> Output - where - T: AsRef<[u8]>, - { - let mime_guess = input.path.map(MimeGuess::from_path); - - let magic_guess = match &input.content { - Content::None => None, - Content::Prefix(PrefixContent { content, .. 
}) | Content::Full(content) => { - match self.magic_cookie.buffer(content.as_ref()) { - Ok(m) => m.parse().ok(), - _ => None, - } - } - }; - - Output { - mime_guess, - magic_guess, - } - } -} +mod guesser; +pub use guesser::Guesser; diff --git a/crates/noseyparker-content-guesser/src/output.rs b/crates/noseyparker-content-guesser/src/output.rs index 9ad6c0e26..c577e3a6b 100644 --- a/crates/noseyparker-content-guesser/src/output.rs +++ b/crates/noseyparker-content-guesser/src/output.rs @@ -6,21 +6,18 @@ pub struct Output { /// Path-based media type guess pub(crate) mime_guess: Option, + /// Content-based media type guess pub(crate) magic_guess: Option, } impl Output { - pub fn guessed_types(&self) -> Vec { - let mut guessed_types = Vec::new(); - - if let Some(mime_guess) = self.mime_guess { - guessed_types.extend(mime_guess); - } - - if let Some(magic_guess) = &self.magic_guess { - guessed_types.push(magic_guess.clone()); - } + /// Get the path-based media type guess + pub fn path_guess(&self) -> Option { + self.mime_guess.and_then(|g| g.first()) + } - guessed_types + /// Get the content-based media type guess + pub fn content_guess(&self) -> Option { + self.magic_guess.clone() } } diff --git a/crates/noseyparker/Cargo.toml b/crates/noseyparker/Cargo.toml index ad5c65d68..ff94a2d6d 100644 --- a/crates/noseyparker/Cargo.toml +++ b/crates/noseyparker/Cargo.toml @@ -35,6 +35,7 @@ indicatif = { version = "0.17", features = ["improved_unicode", "rayon"] } indoc = "2.0" ignore = "0.4" lazy_static = "1.4" +mime = "0.3" noseyparker_content_guesser = { path = "../noseyparker-content-guesser", optional = true } regex = "1.7" reqwest = { version = "0.11", features = ["json", "native-tls-vendored"] } diff --git a/crates/noseyparker/src/datastore.rs b/crates/noseyparker/src/datastore.rs index 8c54643f0..034a268e1 100644 --- a/crates/noseyparker/src/datastore.rs +++ b/crates/noseyparker/src/datastore.rs @@ -1,6 +1,8 @@ use anyhow::{bail, Context, Result}; use bstr::BString; use 
indoc::indoc; +use mime; +use mime::Mime; use rusqlite::Connection; use serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; @@ -24,6 +26,13 @@ use crate::snippet::Snippet; /// - A sqlite database for recording findings and scan information /// - A scratch directory for providing temporary directories and files /// - A directory used for storing clones of Git repositories +/// +/// Note that a `Datastore` is not `Sync`, and thus cannot be directly shared between threads. +/// The recommended pattern in a case that requires concurrent access is to have a single thread +/// that mediates access to the `Datastore`. +/// +/// Accessing a single `Datastore` from multiple processes is untested and may not work correctly. +/// This implementation has no built-in mechanism to check for or prevent multi-process access. pub struct Datastore { /// The root directory of everything contained in this `Datastore`. root_dir: PathBuf, @@ -113,7 +122,74 @@ impl Datastore { Ok(()) } + /// Record the given blob metadata into the datastore. + /// + /// The given entries are recorded in a single transaction. + pub fn record_blob_metadata<'a, T: IntoIterator)>>( + &mut self, + blob_metadata: T, + ) -> Result<()> { + let _span = debug_span!("Datastore::record_blob_metadata", "{}", self.root_dir.display()).entered(); + + let tx = self.conn.transaction()?; + { + let mut add_blob_id = tx.prepare_cached(indoc! {r#" + insert into blob_id(blob_id) values (?) + on conflict do update set id = id + returning id + "#})?; + + let mut add_mime_type = tx.prepare_cached(indoc! {r#" + insert into mime_type(essence) values (?) + on conflict do update set id = id + returning id + "#})?; + + let mut add_charset = tx.prepare_cached(indoc! {r#" + insert into charset(charset) values (?) + on conflict do update set id = id + returning id + "#})?; + + let mut set_blob_size = tx.prepare_cached(indoc! {r#" + insert or ignore into blob_size(blob_id, size) values (?, ?) 
+ "#})?; + + let mut set_blob_mime_type = tx.prepare_cached(indoc! {r#" + insert or ignore into blob_mime_type(blob_id, mime_type_id) values (?, ?) + "#})?; + + let mut set_blob_charset = tx.prepare_cached(indoc! {r#" + insert or ignore into blob_charset(blob_id, charset_id) values (?, ?) + "#})?; + + fn execute_get_id(stmt: &mut rusqlite::CachedStatement, params: P) -> rusqlite::Result { + stmt.query_row(params, |r| r.get(0)) + } + + for (blob_id, blob_len, mime) in blob_metadata { + let blob_id_id = execute_get_id(&mut add_blob_id, (blob_id.hex(), ))?; + set_blob_size.execute((blob_id_id, blob_len))?; + + if let Some(mime) = &mime { + let mime_type_id = execute_get_id(&mut add_mime_type, (mime.essence_str(), ))?; + set_blob_mime_type.execute((blob_id_id, mime_type_id))?; + + if let Some(charset) = mime.get_param(mime::CHARSET) { + let charset_id: i64 = execute_get_id(&mut add_charset, (charset.as_str(), ))?; + set_blob_charset.execute((blob_id_id, charset_id))?; + } + } + } + } + + tx.commit()?; + Ok(()) + } + /// Record the given matches into the datastore. + /// + /// The given entries are recorded in a single transaction. 
pub fn record_matches<'a, T: IntoIterator>( &mut self, matches: T, @@ -319,7 +395,7 @@ impl Datastore { Ok(conn) } - fn migrate(&mut self) -> Result { + fn migrate(&mut self) -> Result<()> { let _span = debug_span!("Datastore::migrate", "{}", self.root_dir.display()).entered(); let tx = self.conn.transaction()?; @@ -333,6 +409,9 @@ impl Datastore { Ok(()) }; + // ----------------------------------------------------------------------------------------- + // migration 1 + // ----------------------------------------------------------------------------------------- let user_version: u64 = get_user_version()?; if user_version == 0 { let new_user_version = user_version + 1; @@ -391,9 +470,76 @@ impl Datastore { create index matches_grouping_index on matches (group_input, rule_name); "#})?; set_user_version(new_user_version)?; - tx.commit()?; } - Ok(user_version) + + // ----------------------------------------------------------------------------------------- + // migration 2 + // ----------------------------------------------------------------------------------------- + let user_version: u64 = get_user_version()?; + if user_version == 1 { + let new_user_version = user_version + 1; + debug!( + "Migrating database schema from version {} to {}", + user_version, new_user_version + ); + + tx.execute_batch(indoc! {r#" + create table blob_id + -- Assigns a unique integer ID to a Git-style blob ID. + -- + -- Blob IDs are 40-character hexadecimal strings. + -- We introduce integer IDs to represent blob IDs instead of using them directly; + -- an integer is smaller and cheaper to manipulate than a 40-character string. + ( + id integer primary key, + blob_id text unique not null, + constraint valid_blob_id check( + length(blob_id) == 40 and not glob('*[^abcdefABCDEF1234567890]*', blob_id) + ) + ); + + create table blob_size + -- Records the size in bytes of a blob. 
+ ( + blob_id integer primary key references blob_id(id), + size integer unique not null, + constraint valid_size check(0 <= size) + ); + + create table mime_type + -- Assigns a unique integer ID to a MIME type. + ( + id integer primary key, + -- The MIME "essence" string, comprising type, subtype, and optional suffix + essence text unique not null + ); + + create table charset + -- Assigns a unique integer ID to charset strings. + ( + id integer primary key, + charset text unique not null + ); + + create table blob_mime_type + -- Records the guessed mime type of a blob. + ( + blob_id integer primary key references blob_id(id), + mime_type_id integer references mime_type(id) + ); + + create table blob_charset + -- Records the guessed charset of a blob. + ( + blob_id integer primary key references blob_id(id), + charset_id integer references charset(id) + ); + "#})?; + set_user_version(new_user_version)?; + } + + tx.commit()?; + Ok(()) } } From 047a085981bc468c70a73c40aaeb10d7fcdbe03a Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Tue, 27 Jun 2023 22:29:27 -0400 Subject: [PATCH 11/16] Fix database performance with content_guesser feature enabled --- .../src/bin/noseyparker/cmd_scan.rs | 9 +- crates/noseyparker/src/datastore.rs | 94 +++++++++++++++---- 2 files changed, 80 insertions(+), 23 deletions(-) diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index b834c9391..871855ff6 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -312,12 +312,13 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> Progress::new_bytes_bar(total_blob_bytes, "Scanning content", progress_enabled); // Create a channel pair for matcher threads to get their results to the datastore recorder. 
- let channel_size = std::cmp::max(args.num_jobs * 32, 512); + let channel_size = std::cmp::max(args.num_jobs * 32, 1024); enum DatastoreMessage { Matches(Vec), - Blob((BlobId, usize, Option)), + BlobMetadata((BlobId, usize, Option)), } let (send_ds, recv_ds) = crossbeam_channel::bounded::(channel_size); + // let (send_ds, recv_ds) = crossbeam_channel::unbounded::(); // We create a separate thread for writing matches to the datastore. // The datastore uses SQLite, which does best with a single writer. @@ -349,7 +350,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> batch_matches.clear(); } } - DatastoreMessage::Blob((blob_id, len, mime)) => { + DatastoreMessage::BlobMetadata((blob_id, len, mime)) => { batch_blob_metadata.push((blob_id, len, mime)); if batch_blob_metadata.len() >= BATCH_SIZE { @@ -408,7 +409,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> #[cfg(not(feature = "content_guesser"))] let mime: Option = None; - send_ds.send(DatastoreMessage::Blob((blob.id, blob.len(), mime)))?; + send_ds.send(DatastoreMessage::BlobMetadata((blob.id, blob.len(), mime)))?; let matches = match matcher.scan_blob(&blob, &provenance) { Err(e) => { diff --git a/crates/noseyparker/src/datastore.rs b/crates/noseyparker/src/datastore.rs index 034a268e1..7754841c2 100644 --- a/crates/noseyparker/src/datastore.rs +++ b/crates/noseyparker/src/datastore.rs @@ -135,48 +135,67 @@ impl Datastore { { let mut add_blob_id = tx.prepare_cached(indoc! {r#" insert into blob_id(blob_id) values (?) - on conflict do update set id = id - returning id + on conflict do nothing "#})?; + let mut get_blob_id_id = tx.prepare_cached(indoc! {r#" + select id from blob_id where blob_id = ? limit 1 + "#})?; + + let mut add_mime_type = tx.prepare_cached(indoc! {r#" insert into mime_type(essence) values (?) 
- on conflict do update set id = id - returning id + on conflict do nothing + "#})?; + + let mut get_mime_type_id = tx.prepare_cached(indoc! {r#" + select id from mime_type where essence = ? limit 1 "#})?; + let mut add_charset = tx.prepare_cached(indoc! {r#" insert into charset(charset) values (?) - on conflict do update set id = id - returning id + on conflict do nothing + "#})?; + + let mut get_charset_id = tx.prepare_cached(indoc! {r#" + select id from charset where charset = ? limit 1 "#})?; + let mut set_blob_size = tx.prepare_cached(indoc! {r#" - insert or ignore into blob_size(blob_id, size) values (?, ?) + insert into blob_size(blob_id, size) values (?, ?) + on conflict do nothing "#})?; let mut set_blob_mime_type = tx.prepare_cached(indoc! {r#" - insert or ignore into blob_mime_type(blob_id, mime_type_id) values (?, ?) + insert into blob_mime_type(blob_id, mime_type_id) values (?, ?) + on conflict do nothing "#})?; let mut set_blob_charset = tx.prepare_cached(indoc! {r#" - insert or ignore into blob_charset(blob_id, charset_id) values (?, ?) + insert into blob_charset(blob_id, charset_id) values (?, ?) 
+ on conflict do nothing "#})?; - fn execute_get_id(stmt: &mut rusqlite::CachedStatement, params: P) -> rusqlite::Result { - stmt.query_row(params, |r| r.get(0)) - } - for (blob_id, blob_len, mime) in blob_metadata { - let blob_id_id = execute_get_id(&mut add_blob_id, (blob_id.hex(), ))?; + add_blob_id.execute((blob_id.hex(), ))?; + let blob_id_id: i64 = get_blob_id_id.query_row((blob_id.hex(), ), |r| r.get(0))?; + set_blob_size.execute((blob_id_id, blob_len))?; if let Some(mime) = &mime { - let mime_type_id = execute_get_id(&mut add_mime_type, (mime.essence_str(), ))?; + let essence = mime.essence_str(); + add_mime_type.execute((essence, ))?; + let mime_type_id: i64 = get_mime_type_id.query_row((essence, ), |r| r.get(0))?; + set_blob_mime_type.execute((blob_id_id, mime_type_id))?; if let Some(charset) = mime.get_param(mime::CHARSET) { - let charset_id: i64 = execute_get_id(&mut add_charset, (charset.as_str(), ))?; + let charset_str = charset.as_str(); + add_charset.execute((charset_str, ))?; + let charset_id: i64 = get_charset_id.query_row((charset_str, ), |r| r.get(0))?; + set_blob_charset.execute((blob_id_id, charset_id))?; } } @@ -502,10 +521,21 @@ impl Datastore { -- Records the size in bytes of a blob. ( blob_id integer primary key references blob_id(id), - size integer unique not null, + size integer not null, constraint valid_size check(0 <= size) ); + create view blob_size_denorm as + -- A convenience view for the denormalized blob size data. + select + blob_id.id blob_id_id, + blob_id.blob_id, + blob_size.size + from + blob_size + inner join blob_id on (blob_size.blob_id = blob_id.id) + ; + create table mime_type -- Assigns a unique integer ID to a MIME type. ( @@ -525,15 +555,41 @@ impl Datastore { -- Records the guessed mime type of a blob. 
( blob_id integer primary key references blob_id(id), - mime_type_id integer references mime_type(id) + mime_type_id integer not null references mime_type(id) ); + create view blob_mime_type_denorm as + -- A convenience view for the denormalized blob mime type data. + select + blob_id.id blob_id_id, + blob_id.blob_id, + mime_type.id mime_type_id, + mime_type.essence + from + blob_mime_type + inner join blob_id on (blob_mime_type.blob_id = blob_id.id) + inner join mime_type on (blob_mime_type.mime_type_id = mime_type.id) + ; + create table blob_charset -- Records the guessed charset of a blob. ( blob_id integer primary key references blob_id(id), - charset_id integer references charset(id) + charset_id integer not null references charset(id) ); + + create view blob_charset_denorm as + -- A convenience view for the denormalized blob charset data. + select + blob_id.id blob_id_id, + blob_id.blob_id, + charset.id charset_id, + charset.charset + from + blob_charset + inner join blob_id on (blob_charset.blob_id = blob_id.id) + inner join charset on (blob_charset.charset_id = charset.id) + ; "#})?; set_user_version(new_user_version)?; } From 3ee680a8b79824eeb60abae501665a0f1eca75cc Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Wed, 28 Jun 2023 22:38:23 -0400 Subject: [PATCH 12/16] Rework recording of blob metadata This should have somewhat better performance for big parallel jobs on a machine with a slow disk. 
--- CHANGELOG.md | 5 + .../src/bin/noseyparker/args.rs | 4 + .../src/bin/noseyparker/cmd_scan.rs | 103 ++++++----- .../test_noseyparker__help__help_scan-2.snap | 7 + ..._noseyparker__help__help_scan_short-2.snap | 3 + crates/noseyparker-cli/tests/scan/mod.rs | 1 + crates/noseyparker/src/datastore.rs | 165 ++---------------- 7 files changed, 101 insertions(+), 187 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 108e22f70..92df73198 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - The Git repository cloning behavior in the `scan` command can now be controlled with the new `--git-clone-mode MODE` parameter. +- In the `scan` command, basic blob metadata is recorded in the datastore for each discovered blob, including blob size in bytes. + If the `content_guesser` Cargo feature is enabled, the recorded metadata additionally includes guessed mime type and charset. + This metadata is recorded for each blob in which matches are found, but this behavior can be enabled for all blobs using the new `--record-all-blobs true` parameter. + This newly recorded metadata is not currently used for anything in Nosey Parker. + ### Changes - Existing rules were modified to reduce both false positives and false negatives: diff --git a/crates/noseyparker-cli/src/bin/noseyparker/args.rs b/crates/noseyparker-cli/src/bin/noseyparker/args.rs index 3104b6958..f9dbd3dc6 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/args.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/args.rs @@ -387,6 +387,10 @@ pub struct ScanArgs { #[command(flatten)] pub content_filtering_args: ContentFilteringArgs, + + /// Enable or disable metadata recording for all discovered blobs instead of just those with matches. 
+ #[arg(long, default_value_t=false, action=ArgAction::Set, value_name="BOOL")] + pub record_all_blobs: bool, } /// The mode to use for cloning a Git repository diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index 871855ff6..d5f11ccb2 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -289,7 +289,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> // a function to convert BlobMatch into regular Match let convert_blob_matches = |blob: &Blob, matches: Vec, provenance: Provenance| -> Vec { - assert!(!matches.is_empty()); + // assert!(!matches.is_empty()); let loc_mapping = { match matches .iter() @@ -312,13 +312,11 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> Progress::new_bytes_bar(total_blob_bytes, "Scanning content", progress_enabled); // Create a channel pair for matcher threads to get their results to the datastore recorder. - let channel_size = std::cmp::max(args.num_jobs * 32, 1024); - enum DatastoreMessage { - Matches(Vec), - BlobMetadata((BlobId, usize, Option)), - } - let (send_ds, recv_ds) = crossbeam_channel::bounded::(channel_size); - // let (send_ds, recv_ds) = crossbeam_channel::unbounded::(); + // let channel_size = std::cmp::max(args.num_jobs * 32, 1024); + type Metadata = (BlobId, usize, Option); + type DatastoreMessage = (Vec, Metadata); + // let (send_ds, recv_ds) = crossbeam_channel::bounded::(channel_size); + let (send_ds, recv_ds) = crossbeam_channel::unbounded::(); // We create a separate thread for writing matches to the datastore. // The datastore uses SQLite, which does best with a single writer. 
@@ -333,50 +331,72 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> // // accumulate messages in batches to avoid an excessive number of tiny datastore // transactions (which kills performance) - const BATCH_SIZE: usize = 512; - let mut batch_matches: Vec = Vec::with_capacity(BATCH_SIZE); - let mut batch_blob_metadata: Vec<(BlobId, usize, Option)> = Vec::with_capacity(BATCH_SIZE); - - for message in recv_ds { - match message { - DatastoreMessage::Matches(matches) => { - batch_matches.extend(matches); - - if batch_matches.len() >= BATCH_SIZE { - num_matches += batch_matches.len() as u64; - num_added += datastore - .record_matches(&batch_matches) - .expect("should be able to record matches to the datastore"); - batch_matches.clear(); - } + + let mut last_tx_time = std::time::Instant::now(); + + const BUF_SIZE: usize = 16384; + let mut batch_matches: Vec> = Vec::with_capacity(BUF_SIZE); + let mut batch_matches_count: usize = 0; + let mut batch_metadata: Vec = Vec::with_capacity(BUF_SIZE); + + // Try to commit at least every second + const COMMIT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(1000); + + for (matches, metadata) in recv_ds.iter() { + batch_matches_count += matches.len(); + batch_matches.push(matches); + + batch_metadata.push(metadata); + + if batch_matches_count >= BUF_SIZE || batch_metadata.len() >= BUF_SIZE || last_tx_time.elapsed() >= COMMIT_INTERVAL { + let mut committed = false; + if batch_matches_count > 0 { + // let t1 = std::time::Instant::now(); + num_matches += batch_matches_count as u64; + num_added += datastore + .record_matches(batch_matches.iter().flatten()) + .expect("should be able to record matches to the datastore"); + // debug!("*** commit matches: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_matches_count, recv_ds.len()); + batch_matches.clear(); + batch_matches_count = 0; + committed = true; + } + + if !batch_metadata.is_empty() { + // let t1 = std::time::Instant::now(); + 
datastore + .record_blob_metadata(&batch_metadata) + .expect("should be able to record blob metadata to the datastore"); + // debug!("*** commit metadata: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_metadata.len(), recv_ds.len()); + batch_metadata.clear(); + committed = true; } - DatastoreMessage::BlobMetadata((blob_id, len, mime)) => { - batch_blob_metadata.push((blob_id, len, mime)); - - if batch_blob_metadata.len() >= BATCH_SIZE { - datastore - .record_blob_metadata(&batch_blob_metadata) - .expect("should be able to record blob metadata to the datastore"); - batch_blob_metadata.clear() - } + + if committed { + last_tx_time = std::time::Instant::now(); } } } // record any remaining batched up items if !batch_matches.is_empty() { - num_matches += batch_matches.len() as u64; + let t1 = std::time::Instant::now(); + num_matches += batch_matches_count as u64; num_added += datastore - .record_matches(&batch_matches) + .record_matches(batch_matches.iter().flatten()) .expect("should be able to record matches to the datastore"); + debug!("*** commit matches: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_matches_count, recv_ds.len()); batch_matches.clear(); + // batch_matches_count = 0; } - if !batch_blob_metadata.is_empty() { + if !batch_metadata.is_empty() { + let t1 = std::time::Instant::now(); datastore - .record_blob_metadata(&batch_blob_metadata) + .record_blob_metadata(&batch_metadata) .expect("should be able to record blob metadata to the datastore"); - batch_blob_metadata.clear() + debug!("*** commit metadata: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_metadata.len(), recv_ds.len()); + batch_metadata.clear(); } datastore @@ -409,7 +429,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> #[cfg(not(feature = "content_guesser"))] let mime: Option = None; - send_ds.send(DatastoreMessage::BlobMetadata((blob.id, blob.len(), mime)))?; + let metadata = (blob.id, blob.len(), mime); let matches = match matcher.scan_blob(&blob, 
&provenance) { Err(e) => { @@ -418,11 +438,12 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> } Ok(v) => v, }; - if matches.is_empty() { + if matches.is_empty() && !args.record_all_blobs { return Ok(()); } + let matches = convert_blob_matches(&blob, matches, provenance); - send_ds.send(DatastoreMessage::Matches(matches))?; + send_ds.send((matches, metadata))?; Ok(()) }; diff --git a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan-2.snap b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan-2.snap index 7da6d74cf..f3650b84c 100644 --- a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan-2.snap +++ b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan-2.snap @@ -129,6 +129,13 @@ Content Filtering Options: This option can be repeated. + --record-all-blobs + Enable or disable metadata recording for all discovered blobs instead of just those with + matches + + [default: false] + [possible values: true, false] + Global Options: -v, --verbose... Enable verbose output diff --git a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan_short-2.snap b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan_short-2.snap index 40eee5d3a..64296c2e2 100644 --- a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan_short-2.snap +++ b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan_short-2.snap @@ -29,6 +29,9 @@ Input Specifier Options: Content Filtering Options: --max-file-size Do not scan files larger than the specified size [default: 100] -i, --ignore Use custom path-based ignore rules from the specified file + --record-all-blobs Enable or disable metadata recording for all discovered blobs + instead of just those with matches [default: false] [possible + values: true, false] Global Options: -v, --verbose... 
Enable verbose output diff --git a/crates/noseyparker-cli/tests/scan/mod.rs b/crates/noseyparker-cli/tests/scan/mod.rs index 46d86996b..18019d923 100644 --- a/crates/noseyparker-cli/tests/scan/mod.rs +++ b/crates/noseyparker-cli/tests/scan/mod.rs @@ -13,3 +13,4 @@ mod snippet_length; // TODO: add test for scanning with `--git-clone-mode bare` and `--git-clone-mode mirror` // TODO: add test for scanning with `--github-api-url` // TODO: add tests for SARIF output format +// TODO: add tests for blob metadata recording diff --git a/crates/noseyparker/src/datastore.rs b/crates/noseyparker/src/datastore.rs index 7754841c2..69038de53 100644 --- a/crates/noseyparker/src/datastore.rs +++ b/crates/noseyparker/src/datastore.rs @@ -119,6 +119,7 @@ impl Datastore { /// Analyze the datastore's sqlite database, potentially allowing for better query planning pub fn analyze(&self) -> Result<()> { self.conn.execute("analyze", [])?; + // self.conn.execute("pragma wal_checkpoint(truncate)", [])?; Ok(()) } @@ -133,72 +134,18 @@ impl Datastore { let tx = self.conn.transaction()?; { - let mut add_blob_id = tx.prepare_cached(indoc! {r#" - insert into blob_id(blob_id) values (?) - on conflict do nothing - "#})?; - - let mut get_blob_id_id = tx.prepare_cached(indoc! {r#" - select id from blob_id where blob_id = ? limit 1 - "#})?; - - - let mut add_mime_type = tx.prepare_cached(indoc! {r#" - insert into mime_type(essence) values (?) - on conflict do nothing - "#})?; - - let mut get_mime_type_id = tx.prepare_cached(indoc! {r#" - select id from mime_type where essence = ? limit 1 - "#})?; - - - let mut add_charset = tx.prepare_cached(indoc! {r#" - insert into charset(charset) values (?) - on conflict do nothing - "#})?; - - let mut get_charset_id = tx.prepare_cached(indoc! {r#" - select id from charset where charset = ? limit 1 - "#})?; - - - let mut set_blob_size = tx.prepare_cached(indoc! {r#" - insert into blob_size(blob_id, size) values (?, ?) 
- on conflict do nothing - "#})?; - - let mut set_blob_mime_type = tx.prepare_cached(indoc! {r#" - insert into blob_mime_type(blob_id, mime_type_id) values (?, ?) - on conflict do nothing - "#})?; - - let mut set_blob_charset = tx.prepare_cached(indoc! {r#" - insert into blob_charset(blob_id, charset_id) values (?, ?) - on conflict do nothing + let mut stmt = tx.prepare_cached(indoc! {r#" + insert or replace into blob_metadata(blob_id, size, mime_essence, charset) + values (?, ?, ?, ?) "#})?; for (blob_id, blob_len, mime) in blob_metadata { - add_blob_id.execute((blob_id.hex(), ))?; - let blob_id_id: i64 = get_blob_id_id.query_row((blob_id.hex(), ), |r| r.get(0))?; - - set_blob_size.execute((blob_id_id, blob_len))?; - - if let Some(mime) = &mime { - let essence = mime.essence_str(); - add_mime_type.execute((essence, ))?; - let mime_type_id: i64 = get_mime_type_id.query_row((essence, ), |r| r.get(0))?; - - set_blob_mime_type.execute((blob_id_id, mime_type_id))?; - - if let Some(charset) = mime.get_param(mime::CHARSET) { - let charset_str = charset.as_str(); - add_charset.execute((charset_str, ))?; - let charset_id: i64 = get_charset_id.query_row((charset_str, ), |r| r.get(0))?; + let (mime_essence, charset) = match mime { + None => (None, None), + Some(mime) => (Some(mime.essence_str()), mime.get_param(mime::CHARSET).map(|n| n.as_str())), + }; - set_blob_charset.execute((blob_id_id, charset_id))?; - } - } + stmt.execute((&blob_id.hex(), blob_len, mime_essence, charset))?; } } @@ -407,8 +354,8 @@ impl Datastore { conn.pragma_update(None, "journal_mode", "wal")?; // https://www.sqlite.org/wal.html conn.pragma_update(None, "foreign_keys", "on")?; // https://sqlite.org/foreignkeys.html conn.pragma_update(None, "synchronous", "normal")?; // https://sqlite.org/pragma.html#pragma_synchronous - // - let limit: i64 = -512 * 1024; // 512MiB limit + + let limit: i64 = -8192 * 1024; // 8GiB limit conn.pragma_update(None, "cache_size", limit)?; // 
https://sqlite.org/pragma.html#pragma_cache_size Ok(conn) @@ -503,93 +450,19 @@ impl Datastore { ); tx.execute_batch(indoc! {r#" - create table blob_id - -- Assigns a unique integer ID to a Git-style blob ID. - -- - -- Blob IDs are 40-character hexadecimal strings. - -- We introduce integer IDs to represent blob IDs instead of using them directly; - -- an integer is smaller and cheaper to manipulate than a 40-character string. + create table blob_metadata ( - id integer primary key, - blob_id text unique not null, + blob_id text primary key, + size integer not null, + mime_essence text, + charset text /*, + constraint valid_blob_id check( length(blob_id) == 40 and not glob('*[^abcdefABCDEF1234567890]*', blob_id) - ) - ); - - create table blob_size - -- Records the size in bytes of a blob. - ( - blob_id integer primary key references blob_id(id), - size integer not null, + ), constraint valid_size check(0 <= size) + */ ); - - create view blob_size_denorm as - -- A convenience view for the denormalized blob size data. - select - blob_id.id blob_id_id, - blob_id.blob_id, - blob_size.size - from - blob_size - inner join blob_id on (blob_size.blob_id = blob_id.id) - ; - - create table mime_type - -- Assigns a unique integer ID to a MIME type. - ( - id integer primary key, - -- The MIME "essence" string, comprising type, subtype, and optional suffix - essence text unique not null - ); - - create table charset - -- Assigns a unique integer ID to charset strings. - ( - id integer primary key, - charset text unique not null - ); - - create table blob_mime_type - -- Records the guessed mime type of a blob. - ( - blob_id integer primary key references blob_id(id), - mime_type_id integer not null references mime_type(id) - ); - - create view blob_mime_type_denorm as - -- A convenience view for the denormalized blob mime type data. 
- select - blob_id.id blob_id_id, - blob_id.blob_id, - mime_type.id mime_type_id, - mime_type.essence - from - blob_mime_type - inner join blob_id on (blob_mime_type.blob_id = blob_id.id) - inner join mime_type on (blob_mime_type.mime_type_id = mime_type.id) - ; - - create table blob_charset - -- Records the guessed charset of a blob. - ( - blob_id integer primary key references blob_id(id), - charset_id integer not null references charset(id) - ); - - create view blob_charset_denorm as - -- A convenience view for the denormalized blob charset data. - select - blob_id.id blob_id_id, - blob_id.blob_id, - charset.id charset_id, - charset.charset - from - blob_charset - inner join blob_id on (blob_charset.blob_id = blob_id.id) - inner join charset on (blob_charset.charset_id = charset.id) - ; "#})?; set_user_version(new_user_version)?; } From 8906ad800dd278f923910ae025ef30346822e836 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Wed, 28 Jun 2023 23:01:11 -0400 Subject: [PATCH 13/16] comment out debugging code --- crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index d5f11ccb2..090f8326f 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -380,22 +380,22 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> // record any remaining batched up items if !batch_matches.is_empty() { - let t1 = std::time::Instant::now(); + // let t1 = std::time::Instant::now(); num_matches += batch_matches_count as u64; num_added += datastore .record_matches(batch_matches.iter().flatten()) .expect("should be able to record matches to the datastore"); - debug!("*** commit matches: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_matches_count, recv_ds.len()); + // debug!("*** commit matches: 
{:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_matches_count, recv_ds.len()); batch_matches.clear(); // batch_matches_count = 0; } if !batch_metadata.is_empty() { - let t1 = std::time::Instant::now(); + // let t1 = std::time::Instant::now(); datastore .record_blob_metadata(&batch_metadata) .expect("should be able to record blob metadata to the datastore"); - debug!("*** commit metadata: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_metadata.len(), recv_ds.len()); + // debug!("*** commit metadata: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_metadata.len(), recv_ds.len()); batch_metadata.clear(); } From 228ebfd5205ebdc99240bde4ceb82a4579f77811 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Fri, 30 Jun 2023 18:32:10 -0400 Subject: [PATCH 14/16] Clean up content guesser; include output in `report` --- CHANGELOG.md | 7 +- Cargo.lock | 22 ++--- Cargo.toml | 14 +-- .../Cargo.toml | 7 +- .../src/error.rs | 1 + .../src/guesser.rs | 28 ++++-- .../src/input.rs | 1 + .../src/lib.rs | 0 .../src/output.rs | 10 +++ crates/noseyparker-cli/Cargo.toml | 2 +- .../src/bin/noseyparker/args.rs | 1 + .../src/bin/noseyparker/cmd_report.rs | 84 ++++++++++++------ .../src/bin/noseyparker/cmd_scan.rs | 71 +++++++-------- ...yparker__scan__basic__scan_secrets1-7.snap | 8 +- ...ngth__scan_changing_snippet_length-14.snap | 6 ++ ...ength__scan_changing_snippet_length-7.snap | 6 ++ crates/noseyparker/Cargo.toml | 4 +- crates/noseyparker/src/blob_metadata.rs | 35 ++++++++ crates/noseyparker/src/datastore.rs | 88 +++++++++++-------- crates/noseyparker/src/lib.rs | 4 +- crates/noseyparker/src/match_type.rs | 3 +- crates/noseyparker/src/utils.rs | 6 +- 22 files changed, 258 insertions(+), 150 deletions(-) rename crates/{noseyparker-content-guesser => content-guesser}/Cargo.toml (74%) rename crates/{noseyparker-content-guesser => content-guesser}/src/error.rs (80%) rename crates/{noseyparker-content-guesser => content-guesser}/src/guesser.rs (53%) rename crates/{noseyparker-content-guesser 
=> content-guesser}/src/input.rs (99%) rename crates/{noseyparker-content-guesser => content-guesser}/src/lib.rs (100%) rename crates/{noseyparker-content-guesser => content-guesser}/src/output.rs (62%) create mode 100644 crates/noseyparker/src/blob_metadata.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 92df73198..7be360631 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,10 +29,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - The Git repository cloning behavior in the `scan` command can now be controlled with the new `--git-clone-mode MODE` parameter. -- In the `scan` command, basic blob metadata is recorded in the datastore for each discovered blob, including blob size in bytes. - If the `content_guesser` Cargo feature is enabled, the recorded metadata additionally includes guessed mime type and charset. +- In the `scan` command, basic blob metadata is recorded in the datastore for each discovered blob, including blob size in bytes and guessed mime type and charset when available. + A path-based mechanism is used to guess mime type; at present, this only works for plain file inputs (i.e., not for blobs found in Git history). + Optionally, if the `libmagic` Cargo feature is enabled, libmagic (the guts of the `file` command-line program) is used to guess mime type and charset based on content for blobs from all sources. This metadata is recorded for each blob in which matches are found, but this behavior can be enabled for all blobs using the new `--record-all-blobs true` parameter. - This newly recorded metadata is not currently used for anything in Nosey Parker. + This newly-recorded metadata is included in output of the `report` command. 
### Changes diff --git a/Cargo.lock b/Cargo.lock index bcf8160b6..9d6a78a8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -411,6 +411,16 @@ dependencies = [ "windows-sys 0.45.0", ] +[[package]] +name = "content_guesser" +version = "0.13.0-dev" +dependencies = [ + "magic", + "mime", + "mime_guess", + "thiserror", +] + [[package]] name = "core-foundation" version = "0.9.3" @@ -2164,6 +2174,7 @@ dependencies = [ "bstr", "chrono", "console", + "content_guesser", "gix", "hex", "hyperx", @@ -2173,7 +2184,6 @@ dependencies = [ "indoc", "lazy_static", "mime", - "noseyparker_content_guesser", "pretty_assertions", "proptest", "regex", @@ -2224,16 +2234,6 @@ dependencies = [ "vergen", ] -[[package]] -name = "noseyparker_content_guesser" -version = "0.13.0-dev" -dependencies = [ - "magic", - "mime", - "mime_guess", - "thiserror", -] - [[package]] name = "ntapi" version = "0.4.1" diff --git a/Cargo.toml b/Cargo.toml index 012864068..31fa1d300 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,17 +1,5 @@ [workspace] -members = [ - "crates/noseyparker", - "crates/noseyparker-cli", - "crates/noseyparker-content-guesser", - "crates/vectorscan", - "crates/vectorscan-sys", -] -default-members = [ - "crates/noseyparker", - "crates/noseyparker-cli", - "crates/vectorscan", - "crates/vectorscan-sys", -] +members = ["crates/*"] [workspace.package] edition = "2021" diff --git a/crates/noseyparker-content-guesser/Cargo.toml b/crates/content-guesser/Cargo.toml similarity index 74% rename from crates/noseyparker-content-guesser/Cargo.toml rename to crates/content-guesser/Cargo.toml index 0d1675bc2..672d5c603 100644 --- a/crates/noseyparker-content-guesser/Cargo.toml +++ b/crates/content-guesser/Cargo.toml @@ -8,14 +8,17 @@ homepage.workspace = true repository.workspace = true publish.workspace = true -name = "noseyparker_content_guesser" +name = "content_guesser" version.workspace = true +[features] +libmagic = ["magic"] + [lib] path = "src/lib.rs" [dependencies] -magic = "0.13" +magic = { version 
= "0.13", optional = true } mime_guess = "2" mime = "0.3" thiserror = "1" diff --git a/crates/noseyparker-content-guesser/src/error.rs b/crates/content-guesser/src/error.rs similarity index 80% rename from crates/noseyparker-content-guesser/src/error.rs rename to crates/content-guesser/src/error.rs index f7d847dc3..a85b63fde 100644 --- a/crates/noseyparker-content-guesser/src/error.rs +++ b/crates/content-guesser/src/error.rs @@ -1,5 +1,6 @@ #[derive(Debug, thiserror::Error)] pub enum GuesserError { + #[cfg(feature = "libmagic")] #[error("libmagic error: {0}")] MagicError(#[from] magic::MagicError), } diff --git a/crates/noseyparker-content-guesser/src/guesser.rs b/crates/content-guesser/src/guesser.rs similarity index 53% rename from crates/noseyparker-content-guesser/src/guesser.rs rename to crates/content-guesser/src/guesser.rs index 22d4b6150..d80a33d73 100644 --- a/crates/noseyparker-content-guesser/src/guesser.rs +++ b/crates/content-guesser/src/guesser.rs @@ -1,18 +1,19 @@ -use magic; use mime_guess::MimeGuess; use crate::{ error::GuesserError, - input::{Content, Input, PrefixContent}, + input::Input, output::Output, }; pub struct Guesser { + #[cfg(feature = "libmagic")] magic_cookie: magic::Cookie, } // Public Implementation impl Guesser { + #[cfg(feature = "libmagic")] pub fn new() -> Result { use magic::CookieFlags; let flags = CookieFlags::ERROR | CookieFlags::MIME; @@ -23,21 +24,32 @@ impl Guesser { Ok(Guesser { magic_cookie }) } + #[cfg(not(feature = "libmagic"))] + pub fn new() -> Result { + Ok(Guesser {}) + } + pub fn guess<'a, T>(&self, input: Input<'a, T>) -> Output where T: AsRef<[u8]>, { let mime_guess = input.path.map(MimeGuess::from_path); - let magic_guess = match &input.content { - Content::None => None, - Content::Prefix(PrefixContent { content, .. 
}) | Content::Full(content) => { - match self.magic_cookie.buffer(content.as_ref()) { - Ok(m) => m.parse().ok(), - _ => None, + #[cfg(feature = "libmagic")] + let magic_guess = { + use crate::input::{Content, PrefixContent}; + match &input.content { + Content::None => None, + Content::Prefix(PrefixContent { content, .. }) | Content::Full(content) => { + match self.magic_cookie.buffer(content.as_ref()) { + Ok(m) => m.parse().ok(), + _ => None, + } } } }; + #[cfg(not(feature = "libmagic"))] + let magic_guess = None; Output { mime_guess, diff --git a/crates/noseyparker-content-guesser/src/input.rs b/crates/content-guesser/src/input.rs similarity index 99% rename from crates/noseyparker-content-guesser/src/input.rs rename to crates/content-guesser/src/input.rs index 9b2aff653..890ed39a0 100644 --- a/crates/noseyparker-content-guesser/src/input.rs +++ b/crates/content-guesser/src/input.rs @@ -22,6 +22,7 @@ pub struct PrefixContent { } /// The input to a `Guesser`. +#[allow(dead_code)] pub struct Input<'a, T> { pub(crate) path: Option<&'a Path>, pub(crate) content: Content, diff --git a/crates/noseyparker-content-guesser/src/lib.rs b/crates/content-guesser/src/lib.rs similarity index 100% rename from crates/noseyparker-content-guesser/src/lib.rs rename to crates/content-guesser/src/lib.rs diff --git a/crates/noseyparker-content-guesser/src/output.rs b/crates/content-guesser/src/output.rs similarity index 62% rename from crates/noseyparker-content-guesser/src/output.rs rename to crates/content-guesser/src/output.rs index c577e3a6b..a7e127383 100644 --- a/crates/noseyparker-content-guesser/src/output.rs +++ b/crates/content-guesser/src/output.rs @@ -12,12 +12,22 @@ pub struct Output { impl Output { /// Get the path-based media type guess + #[inline] pub fn path_guess(&self) -> Option { self.mime_guess.and_then(|g| g.first()) } /// Get the content-based media type guess + #[inline] pub fn content_guess(&self) -> Option { self.magic_guess.clone() } + + /// Get the guessed 
mime type that is considered to be the best. + /// + /// If a content-based guess is available, that is used. + /// Otherwise, the path-based guess is used. + pub fn best_guess(&self) -> Option { + self.content_guess().or_else(|| self.path_guess()) + } } diff --git a/crates/noseyparker-cli/Cargo.toml b/crates/noseyparker-cli/Cargo.toml index a2d2115c4..d3f7bcd3f 100644 --- a/crates/noseyparker-cli/Cargo.toml +++ b/crates/noseyparker-cli/Cargo.toml @@ -21,7 +21,7 @@ build = "build.rs" [features] rule_profiling = ["noseyparker/rule_profiling"] -content_guesser = ["noseyparker/content_guesser"] +libmagic = ["noseyparker/libmagic"] [[bin]] name = "noseyparker" diff --git a/crates/noseyparker-cli/src/bin/noseyparker/args.rs b/crates/noseyparker-cli/src/bin/noseyparker/args.rs index f9dbd3dc6..6e22f47ad 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/args.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/args.rs @@ -604,6 +604,7 @@ impl std::fmt::Display for OutputFormat { // ----------------------------------------------------------------------------- // report writer // ----------------------------------------------------------------------------- +// FIXME: refactor this to avoid having to implement bogus methods pub trait Reportable { fn human_format(&self, writer: W) -> Result<()>; fn json_format(&self, writer: W) -> Result<()>; diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs index b364cf50c..220288be1 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs @@ -2,10 +2,11 @@ use anyhow::{bail, Context, Result}; use indenter::indented; use lazy_static::lazy_static; use noseyparker::rules::Rules; -use serde::{Deserialize, Serialize, Serializer}; +use serde::Serialize; use serde_sarif::sarif; use std::fmt::{Display, Formatter, Write}; +use noseyparker::blob_metadata::BlobMetadata; use 
noseyparker::bstring_escape::Escaped; use noseyparker::datastore::{Datastore, MatchGroupMetadata}; use noseyparker::digest::sha1_hexdigest; @@ -22,6 +23,22 @@ pub fn run(_global_args: &GlobalArgs, args: &ReportArgs) -> Result<()> { struct DetailsReporter(Datastore); +impl DetailsReporter { + fn get_matches( + &self, + metadata: &MatchGroupMetadata, + limit: Option, + ) -> Result> { + Ok(self + .0 + .get_match_group_data(&metadata, limit) + .with_context(|| format!("Failed to get match data for group {metadata:?}"))? + .into_iter() + .map(|(md, m)| BlobMetadataMatch { md, m }) + .collect()) + } +} + impl Reportable for DetailsReporter { fn human_format(&self, mut writer: W) -> Result<()> { let datastore = &self.0; @@ -32,9 +49,7 @@ impl Reportable for DetailsReporter { let num_findings = group_metadata.len(); for (finding_num, metadata) in group_metadata.into_iter().enumerate() { let finding_num = finding_num + 1; - let matches = datastore - .get_match_group_matches(&metadata, Some(3)) - .with_context(|| format!("Failed to get matches for group {metadata:?}"))?; + let matches = self.get_matches(&metadata, Some(3))?; let match_group = MatchGroup { metadata, matches }; writeln!( &mut writer, @@ -56,14 +71,11 @@ impl Reportable for DetailsReporter { let es = group_metadata .into_iter() .map(|metadata| { - let matches = datastore - .get_match_group_matches(&metadata, None) - .with_context(|| format!("Failed to get matches for group {metadata:?}"))?; + let matches = self.get_matches(&metadata, None)?; Ok(MatchGroup { metadata, matches }) }) .collect::, anyhow::Error>>()?; - let mut ser = serde_json::Serializer::pretty(writer); - ser.collect_seq(es)?; + serde_json::to_writer_pretty(writer, &es)?; Ok(()) } @@ -74,9 +86,7 @@ impl Reportable for DetailsReporter { .context("Failed to get match group metadata from datastore")?; for metadata in group_metadata.into_iter() { - let matches = datastore - .get_match_group_matches(&metadata, None) - .with_context(|| format!("Failed 
to get matches for group {metadata:?}"))?; + let matches = self.get_matches(&metadata, None)?; let match_group = MatchGroup { metadata, matches }; serde_json::to_writer(&mut writer, &match_group)?; @@ -95,13 +105,11 @@ impl Reportable for DetailsReporter { let results: Vec = group_metadata .into_iter() .map(|metadata| { - let matches = datastore - .get_match_group_matches(&metadata, None) - .with_context(|| format!("Failed to get matches for group {metadata:?}"))?; + let matches = self.get_matches(&metadata, None)?; let first_match_blob_id = match matches.first() { - Some(m) => m.blob_id.to_string(), - None => bail!("Failed to get group matches for group {metadata:?}"), + Some(entry) => entry.m.blob_id.to_string(), + None => bail!("Failed to get group match data for group {metadata:?}"), }; let message = sarif::MessageBuilder::default() .text(format!( @@ -120,7 +128,7 @@ impl Reportable for DetailsReporter { // Will store every match location for the runs.results.location array property let locations: Vec = matches .into_iter() - .map(|m| { + .map(|BlobMetadataMatch { md, m }| { let source_span = &m.location.source_span; // let offset_span = &m.location.offset_span; let uri = match m.provenance { @@ -129,6 +137,12 @@ impl Reportable for DetailsReporter { Provenance::GitRepo { path } => path.display().to_string(), }; + let properties = sarif::PropertyBagBuilder::default().additional_properties([ + (String::from("mime_essence"), serde_json::json!(md.mime_essence)), + (String::from("charset"), serde_json::json!(md.charset)), + (String::from("num_bytes"), serde_json::json!(md.num_bytes)), + ]).build()?; + let location = sarif::LocationBuilder::default() .physical_location( sarif::PhysicalLocationBuilder::default() @@ -161,6 +175,7 @@ impl Reportable for DetailsReporter { .logical_locations([sarif::LogicalLocationBuilder::default() .kind("blob") .name(m.blob_id.to_string()) + .properties(properties) .build()?]) .build()?; Ok(location) @@ -258,11 +273,19 @@ fn 
noseyparker_sarif_tool() -> Result { } /// A group of matches that all have the same rule and capture group content -#[derive(Serialize, Deserialize)] +#[derive(Serialize)] struct MatchGroup { #[serde(flatten)] metadata: MatchGroupMetadata, - matches: Vec, + matches: Vec, +} + +#[derive(Serialize)] +struct BlobMetadataMatch { + #[serde(rename="blob_metadata")] + md: BlobMetadata, + #[serde(flatten)] + m: Match, } lazy_static! { @@ -292,7 +315,6 @@ impl MatchGroup { } } -// XXX this implementation is grotty impl Display for MatchGroup { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { writeln!(f, "{}", STYLE_RULE.apply_to(self.rule_name()))?; @@ -325,20 +347,27 @@ impl Display for MatchGroup { // print matches let mut f = indented(f).with_str(" "); - for (i, m) in self.matches.iter().enumerate() { + for (i, BlobMetadataMatch { md, m }) in self.matches.iter().enumerate() { let i = i + 1; writeln!( f, "{}", STYLE_HEADING.apply_to(format!("Occurrence {}/{}", i, self.total_matches())) )?; + let blob_metadata = + format!("{} bytes, {}, {}", + md.len(), + md.mime_essence().unwrap_or("unknown type"), + md.charset().unwrap_or("unknown charset"), + ); match &m.provenance { Provenance::File { path } => { writeln!( f, - "{} {}", + "{} {} ({})", STYLE_HEADING.apply_to("File:"), - STYLE_METADATA.apply_to(path.display()) + STYLE_METADATA.apply_to(path.display()), + STYLE_METADATA.apply_to(blob_metadata), )?; } Provenance::GitRepo { path } => { @@ -346,13 +375,14 @@ impl Display for MatchGroup { f, "{} {}", STYLE_HEADING.apply_to("Git repo:"), - STYLE_METADATA.apply_to(path.display()) + STYLE_METADATA.apply_to(path.display()), )?; writeln!( f, - "{} {}", + "{} {} ({})", STYLE_HEADING.apply_to("Blob:"), - STYLE_METADATA.apply_to(&m.blob_id) + STYLE_METADATA.apply_to(&m.blob_id), + STYLE_METADATA.apply_to(blob_metadata), )?; } } diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index 
090f8326f..5d8397f3c 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -1,7 +1,6 @@ use anyhow::{bail, Context, Result}; use crossbeam_channel; use indicatif::{HumanBytes, HumanCount, HumanDuration}; -use mime::Mime; use rayon::prelude::*; use std::str::FromStr; use std::sync::Mutex; @@ -10,14 +9,9 @@ use tracing::{debug, debug_span, error, info, warn}; use crate::args; -use noseyparker::blob::{Blob, BlobId}; +use noseyparker::blob::Blob; use noseyparker::blob_id_set::BlobIdSet; - -#[cfg(feature = "content_guesser")] -use noseyparker::{content_guesser, content_guesser::Guesser}; -#[cfg(not(feature = "content_guesser"))] -type Guesser = (); - +use noseyparker::blob_metadata::BlobMetadata; use noseyparker::datastore::Datastore; use noseyparker::defaults::DEFAULT_IGNORE_RULES; use noseyparker::git_binary::{CloneMode, Git}; @@ -32,6 +26,7 @@ use noseyparker::progress::Progress; use noseyparker::provenance::Provenance; use noseyparker::rules::Rules; use noseyparker::rules_database::RulesDatabase; +use noseyparker::{content_guesser, content_guesser::Guesser}; /// This command scans multiple filesystem inputs for secrets. /// The implementation enumerates content in parallel, scans the enumerated content in parallel, @@ -277,12 +272,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> let make_matcher = || -> Result<(Matcher, Guesser)> { *num_matchers_counter.lock().unwrap() += 1; let matcher = Matcher::new(&rules_db, &seen_blobs, Some(&matcher_stats))?; - - #[cfg(feature = "content_guesser")] let guesser = content_guesser::Guesser::new()?; - #[cfg(not(feature = "content_guesser"))] - let guesser = (); - Ok((matcher, guesser)) }; @@ -313,8 +303,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> // Create a channel pair for matcher threads to get their results to the datastore recorder. 
// let channel_size = std::cmp::max(args.num_jobs * 32, 1024); - type Metadata = (BlobId, usize, Option); - type DatastoreMessage = (Vec, Metadata); + type DatastoreMessage = (BlobMetadata, Vec); // let (send_ds, recv_ds) = crossbeam_channel::bounded::(channel_size); let (send_ds, recv_ds) = crossbeam_channel::unbounded::(); @@ -337,18 +326,21 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> const BUF_SIZE: usize = 16384; let mut batch_matches: Vec> = Vec::with_capacity(BUF_SIZE); let mut batch_matches_count: usize = 0; - let mut batch_metadata: Vec = Vec::with_capacity(BUF_SIZE); + let mut batch_metadata: Vec = Vec::with_capacity(BUF_SIZE); // Try to commit at least every second const COMMIT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(1000); - for (matches, metadata) in recv_ds.iter() { + for (metadata, matches) in recv_ds.iter() { batch_matches_count += matches.len(); batch_matches.push(matches); batch_metadata.push(metadata); - if batch_matches_count >= BUF_SIZE || batch_metadata.len() >= BUF_SIZE || last_tx_time.elapsed() >= COMMIT_INTERVAL { + if batch_matches_count >= BUF_SIZE + || batch_metadata.len() >= BUF_SIZE + || last_tx_time.elapsed() >= COMMIT_INTERVAL + { let mut committed = false; if batch_matches_count > 0 { // let t1 = std::time::Instant::now(); @@ -415,22 +407,6 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> #[allow(unused_variables)] let (matcher, guesser) = matcher_guesser; - #[cfg(feature = "content_guesser")] - let mime: Option = { - let input = match &provenance { - Provenance::File { path } => { - content_guesser::Input::from_path_and_bytes(path, &blob.bytes) - } - Provenance::GitRepo { .. 
} => content_guesser::Input::from_bytes(&blob.bytes), - }; - let guess = guesser.guess(input); - guess.content_guess() - }; - #[cfg(not(feature = "content_guesser"))] - let mime: Option = None; - - let metadata = (blob.id, blob.len(), mime); - let matches = match matcher.scan_blob(&blob, &provenance) { Err(e) => { error!("Failed to scan blob {} from {}: {}", blob.id, provenance, e); @@ -438,12 +414,37 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> } Ok(v) => v, }; + if matches.is_empty() && !args.record_all_blobs { return Ok(()); } + let (mime_essence, charset) = { + let input = match &provenance { + Provenance::File { path } => { + content_guesser::Input::from_path_and_bytes(path, &blob.bytes) + } + Provenance::GitRepo { .. } => content_guesser::Input::from_bytes(&blob.bytes), + }; + let guess = guesser.guess(input); + match guess.best_guess() { + None => (None, None), + Some(m) => { + let essence = m.essence_str().to_owned(); + let charset = m.get_param(mime::CHARSET).map(|n| n.to_string()); + (Some(essence), charset) + } + } + }; + + let metadata = BlobMetadata { + id: blob.id, + num_bytes: blob.len(), + mime_essence, + charset, + }; let matches = convert_blob_matches(&blob, matches, provenance); - send_ds.send((matches, metadata))?; + send_ds.send((metadata, matches))?; Ok(()) }; diff --git a/crates/noseyparker-cli/tests/scan/basic/snapshots/test_noseyparker__scan__basic__scan_secrets1-7.snap b/crates/noseyparker-cli/tests/scan/basic/snapshots/test_noseyparker__scan__basic__scan_secrets1-7.snap index f84f90d11..70ad45797 100644 --- a/crates/noseyparker-cli/tests/scan/basic/snapshots/test_noseyparker__scan__basic__scan_secrets1-7.snap +++ b/crates/noseyparker-cli/tests/scan/basic/snapshots/test_noseyparker__scan__basic__scan_secrets1-7.snap @@ -1,5 +1,5 @@ --- -source: tests/test_noseyparker_scan.rs +source: crates/noseyparker-cli/tests/scan/basic/mod.rs expression: json_output --- [ @@ -8,6 +8,12 @@ expression: json_output 
"matches": [ { "blob_id": "7980f2571d9c04d65eb338f65f21edbff4469a11", + "blob_metadata": { + "charset": null, + "id": "7980f2571d9c04d65eb338f65f21edbff4469a11", + "mime_essence": "text/plain", + "num_bytes": 81 + }, "capture_group_index": 1, "location": { "offset_span": { diff --git a/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-14.snap b/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-14.snap index 27b15a3a1..3493e3abc 100644 --- a/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-14.snap +++ b/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-14.snap @@ -8,6 +8,12 @@ expression: json_output "matches": [ { "blob_id": "c3c55e6f7e1304573e25e85202e9f019bfc05087", + "blob_metadata": { + "charset": null, + "id": "c3c55e6f7e1304573e25e85202e9f019bfc05087", + "mime_essence": "text/plain", + "num_bytes": 1425 + }, "capture_group_index": 1, "location": { "offset_span": { diff --git a/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-7.snap b/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-7.snap index 13a9d11fd..b885d4ec1 100644 --- a/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-7.snap +++ b/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-7.snap @@ -8,6 +8,12 @@ expression: json_output "matches": [ { "blob_id": "c3c55e6f7e1304573e25e85202e9f019bfc05087", + "blob_metadata": { + "charset": null, + "id": "c3c55e6f7e1304573e25e85202e9f019bfc05087", 
+ "mime_essence": "text/plain", + "num_bytes": 1425 + }, "capture_group_index": 1, "location": { "offset_span": { diff --git a/crates/noseyparker/Cargo.toml b/crates/noseyparker/Cargo.toml index ff94a2d6d..ecbef4e56 100644 --- a/crates/noseyparker/Cargo.toml +++ b/crates/noseyparker/Cargo.toml @@ -17,7 +17,7 @@ path = "src/lib.rs" [features] rule_profiling = [] -content_guesser = ["noseyparker_content_guesser"] +libmagic = ["content_guesser/libmagic"] [dependencies] # anyhow = { version = "1.0", features = ["backtrace"] } # add backtraces to errors -- not sure how expensive this is @@ -26,6 +26,7 @@ atoi = "2.0" bstr = { version = "1.0", features = ["serde"] } chrono = { version = "0.4", default_features = false, features = ["std"] } console = "0.15" +content_guesser = { path = "../content-guesser" } gix = { version = "0.47", features = ["max-performance"] } hex = "0.4" hyperx = "1.4" @@ -36,7 +37,6 @@ indoc = "2.0" ignore = "0.4" lazy_static = "1.4" mime = "0.3" -noseyparker_content_guesser = { path = "../noseyparker-content-guesser", optional = true } regex = "1.7" reqwest = { version = "0.11", features = ["json", "native-tls-vendored"] } rusqlite = { version = "0.29", features = ["bundled", "backup"] } diff --git a/crates/noseyparker/src/blob_metadata.rs b/crates/noseyparker/src/blob_metadata.rs new file mode 100644 index 000000000..0a5212a7e --- /dev/null +++ b/crates/noseyparker/src/blob_metadata.rs @@ -0,0 +1,35 @@ +use crate::blob_id::BlobId; + +/// Metadata about a blob +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub struct BlobMetadata { + /// The blob ID this metadata applies to + pub id: BlobId, + + /// The length in bytes of the blob + pub num_bytes: usize, + + /// The guessed multimedia type of the blob + pub mime_essence: Option, + + /// The guessed charset of the blob + pub charset: Option, +} + +impl BlobMetadata { + /// Get the length of the blob in bytes. 
+ #[inline] + pub fn len(&self) -> usize { + self.num_bytes + } + + #[inline] + pub fn mime_essence(&self) -> Option<&str> { + self.mime_essence.as_ref().map(|s| s.as_str()) + } + + #[inline] + pub fn charset(&self) -> Option<&str> { + self.charset.as_ref().map(|s| s.as_str()) + } +} diff --git a/crates/noseyparker/src/datastore.rs b/crates/noseyparker/src/datastore.rs index 69038de53..d4c972d59 100644 --- a/crates/noseyparker/src/datastore.rs +++ b/crates/noseyparker/src/datastore.rs @@ -1,14 +1,13 @@ use anyhow::{bail, Context, Result}; use bstr::BString; use indoc::indoc; -use mime; -use mime::Mime; use rusqlite::Connection; -use serde::{Deserialize, Serialize}; +use serde::Serialize; use std::path::{Path, PathBuf}; use tracing::{debug, debug_span}; use crate::blob_id::BlobId; +use crate::blob_metadata::BlobMetadata; use crate::git_url::GitUrl; use crate::location::{Location, OffsetSpan, SourcePoint, SourceSpan}; use crate::match_type::Match; @@ -126,7 +125,7 @@ impl Datastore { /// Record the given blob metadata into the datastore. /// /// The given entries are recorded in a single transaction. - pub fn record_blob_metadata<'a, T: IntoIterator)>>( + pub fn record_blob_metadata<'a, T: IntoIterator>( &mut self, blob_metadata: T, ) -> Result<()> { @@ -139,13 +138,8 @@ impl Datastore { values (?, ?, ?, ?) "#})?; - for (blob_id, blob_len, mime) in blob_metadata { - let (mime_essence, charset) = match mime { - None => (None, None), - Some(mime) => (Some(mime.essence_str()), mime.get_param(mime::CHARSET).map(|n| n.as_str())), - }; - - stmt.execute((&blob_id.hex(), blob_len, mime_essence, charset))?; + for md in blob_metadata { + stmt.execute((&md.id.hex(), md.len(), md.mime_essence(), md.charset()))?; } } @@ -273,31 +267,36 @@ impl Datastore { } /// Get up to `limit` matches that belong to the group with the given group metadata. 
- pub fn get_match_group_matches( + pub fn get_match_group_data( &self, metadata: &MatchGroupMetadata, limit: Option, - ) -> Result> { - let _span = debug_span!("Datastore::match_groups", "{}", self.root_dir.display()).entered(); + ) -> Result> { + let _span = debug_span!("Datastore::get_match_group_data", "{}", self.root_dir.display()).entered(); let mut stmt = self.conn.prepare_cached(indoc! {r#" select - blob_id, - start_byte, - end_byte, - start_line, - start_column, - end_line, - end_column, - before_snippet, - matching_input, - after_snippet, - group_index, - provenance_type, - provenance - from matches - where rule_name = ? and group_input = ? - order by blob_id, start_byte, end_byte + m.blob_id, + m.start_byte, + m.end_byte, + m.start_line, + m.start_column, + m.end_line, + m.end_column, + m.before_snippet, + m.matching_input, + m.after_snippet, + m.group_index, + m.provenance_type, + m.provenance, + + b.size, + b.mime_essence, + b.charset + from matches m + inner join blob_metadata b on (m.blob_id = b.blob_id) + where m.rule_name = ? and m.group_input = ? + order by m.blob_id, m.start_byte, m.end_byte limit ? "#})?; @@ -307,8 +306,9 @@ impl Datastore { }; let entries = stmt.query_map((&metadata.rule_name, metadata.match_content.as_slice(), limit), |row| { let v0: String = row.get(0)?; - Ok(Match { - blob_id: BlobId::from_hex(&v0).expect("blob id from database should be valid"), + let blob_id = BlobId::from_hex(&v0).expect("blob id from database should be valid"); + let m = Match { + blob_id: blob_id.clone(), location: Location { offset_span: OffsetSpan { start: row.get(1)?, @@ -335,7 +335,16 @@ impl Datastore { rule_name: metadata.rule_name.clone(), provenance: provenance_from_parts(row.get(11)?, row.get(12)?) 
.expect("provenance value from database should be valid"), - }) + }; + let mime_essence: Option = row.get(14)?; + let charset: Option = row.get(15)?; + let b = BlobMetadata { + id: blob_id, + num_bytes: row.get(13)?, + mime_essence, + charset, + }; + Ok((b, m)) })?; let mut es = Vec::new(); for e in entries { @@ -355,7 +364,8 @@ impl Datastore { conn.pragma_update(None, "foreign_keys", "on")?; // https://sqlite.org/foreignkeys.html conn.pragma_update(None, "synchronous", "normal")?; // https://sqlite.org/pragma.html#pragma_synchronous - let limit: i64 = -8192 * 1024; // 8GiB limit + // FIXME: make this a command-line parameter + let limit: i64 = -8 * 1024 * 1024; // 8GiB limit conn.pragma_update(None, "cache_size", limit)?; // https://sqlite.org/pragma.html#pragma_cache_size Ok(conn) @@ -451,17 +461,17 @@ impl Datastore { tx.execute_batch(indoc! {r#" create table blob_metadata + -- This table records various bits of metadata about blobs. ( blob_id text primary key, size integer not null, mime_essence text, - charset text /*, + charset text, constraint valid_blob_id check( length(blob_id) == 40 and not glob('*[^abcdefABCDEF1234567890]*', blob_id) ), constraint valid_size check(0 <= size) - */ ); "#})?; set_user_version(new_user_version)?; @@ -530,10 +540,10 @@ mod test { // ------------------------------------------------------------------------------------------------- /// A summary of matches in a `Datastore`. -#[derive(Deserialize, Serialize)] +#[derive(Serialize)] pub struct MatchSummary(pub Vec); -#[derive(Deserialize, Serialize)] +#[derive(Serialize)] pub struct MatchSummaryEntry { pub rule_name: String, pub distinct_count: usize, @@ -554,7 +564,7 @@ impl std::fmt::Display for MatchSummary { // ------------------------------------------------------------------------------------------------- /// Metadata for a group of matches that have identical match content. 
-#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Serialize)] pub struct MatchGroupMetadata { /// The name of the rule of all the matches in the group pub rule_name: String, diff --git a/crates/noseyparker/src/lib.rs b/crates/noseyparker/src/lib.rs index e40cdeafd..aa5ee4af9 100644 --- a/crates/noseyparker/src/lib.rs +++ b/crates/noseyparker/src/lib.rs @@ -1,6 +1,7 @@ pub mod blob; pub mod blob_id; pub mod blob_id_set; +pub mod blob_metadata; pub mod bstring_escape; pub mod datastore; pub mod defaults; @@ -13,8 +14,7 @@ pub mod location; pub mod match_type; pub mod matcher; pub mod matcher_stats; -#[cfg(feature = "content_guesser")] -pub use noseyparker_content_guesser as content_guesser; +pub use content_guesser; pub mod progress; pub mod provenance; #[cfg(feature = "rule_profiling")] diff --git a/crates/noseyparker/src/match_type.rs b/crates/noseyparker/src/match_type.rs index 8d18d1c6c..2816c237b 100644 --- a/crates/noseyparker/src/match_type.rs +++ b/crates/noseyparker/src/match_type.rs @@ -6,12 +6,11 @@ use crate::snippet::Snippet; use crate::utils::BStringSerde; use bstr::BString; -use serde::{Deserialize, Serialize}; // ------------------------------------------------------------------------------------------------- // Match // ------------------------------------------------------------------------------------------------- -#[derive(Debug, Clone, Deserialize, Serialize)] +#[derive(Debug, Clone, serde::Serialize)] pub struct Match { /// The blob this match comes from pub blob_id: BlobId, diff --git a/crates/noseyparker/src/utils.rs b/crates/noseyparker/src/utils.rs index dbf1c9f2e..a557283a3 100644 --- a/crates/noseyparker/src/utils.rs +++ b/crates/noseyparker/src/utils.rs @@ -32,16 +32,14 @@ impl From for BString { } } -#[inline] -pub fn serialize_bytes_string_lossy( +fn serialize_bytes_string_lossy( bytes: &[u8], s: S, ) -> Result { s.serialize_str(&String::from_utf8_lossy(bytes)) } -#[inline] -pub fn deserialize_bytes_string<'de, D: 
serde::Deserializer<'de>>( +fn deserialize_bytes_string<'de, D: serde::Deserializer<'de>>( d: D, ) -> Result, D::Error> { let s: &str = serde::Deserialize::deserialize(d)?; From d189e9401b170a0dc8830bd67066f3e724f145d2 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Fri, 30 Jun 2023 18:47:39 -0400 Subject: [PATCH 15/16] Fix clippy nits --- crates/content-guesser/src/guesser.rs | 2 +- crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs | 4 ++-- crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs | 1 - crates/noseyparker/src/blob_metadata.rs | 6 +++--- crates/noseyparker/src/datastore.rs | 4 ++-- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/crates/content-guesser/src/guesser.rs b/crates/content-guesser/src/guesser.rs index d80a33d73..4e7582d0d 100644 --- a/crates/content-guesser/src/guesser.rs +++ b/crates/content-guesser/src/guesser.rs @@ -29,7 +29,7 @@ impl Guesser { Ok(Guesser {}) } - pub fn guess<'a, T>(&self, input: Input<'a, T>) -> Output + pub fn guess(&self, input: Input) -> Output where T: AsRef<[u8]>, { diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs index 220288be1..9a9e57691 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs @@ -31,7 +31,7 @@ impl DetailsReporter { ) -> Result> { Ok(self .0 - .get_match_group_data(&metadata, limit) + .get_match_group_data(metadata, limit) .with_context(|| format!("Failed to get match data for group {metadata:?}"))? 
.into_iter() .map(|(md, m)| BlobMetadataMatch { md, m }) @@ -356,7 +356,7 @@ impl Display for MatchGroup { )?; let blob_metadata = format!("{} bytes, {}, {}", - md.len(), + md.num_bytes(), md.mime_essence().unwrap_or("unknown type"), md.charset().unwrap_or("unknown charset"), ); diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index 5d8397f3c..2fe6f552b 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -1,5 +1,4 @@ use anyhow::{bail, Context, Result}; -use crossbeam_channel; use indicatif::{HumanBytes, HumanCount, HumanDuration}; use rayon::prelude::*; use std::str::FromStr; diff --git a/crates/noseyparker/src/blob_metadata.rs b/crates/noseyparker/src/blob_metadata.rs index 0a5212a7e..1e781764a 100644 --- a/crates/noseyparker/src/blob_metadata.rs +++ b/crates/noseyparker/src/blob_metadata.rs @@ -19,17 +19,17 @@ pub struct BlobMetadata { impl BlobMetadata { /// Get the length of the blob in bytes. 
#[inline] - pub fn len(&self) -> usize { + pub fn num_bytes(&self) -> usize { self.num_bytes } #[inline] pub fn mime_essence(&self) -> Option<&str> { - self.mime_essence.as_ref().map(|s| s.as_str()) + self.mime_essence.as_deref() } #[inline] pub fn charset(&self) -> Option<&str> { - self.charset.as_ref().map(|s| s.as_str()) + self.charset.as_deref() } } diff --git a/crates/noseyparker/src/datastore.rs b/crates/noseyparker/src/datastore.rs index d4c972d59..b670d5077 100644 --- a/crates/noseyparker/src/datastore.rs +++ b/crates/noseyparker/src/datastore.rs @@ -139,7 +139,7 @@ impl Datastore { "#})?; for md in blob_metadata { - stmt.execute((&md.id.hex(), md.len(), md.mime_essence(), md.charset()))?; + stmt.execute((&md.id.hex(), md.num_bytes(), md.mime_essence(), md.charset()))?; } } @@ -308,7 +308,7 @@ impl Datastore { let v0: String = row.get(0)?; let blob_id = BlobId::from_hex(&v0).expect("blob id from database should be valid"); let m = Match { - blob_id: blob_id.clone(), + blob_id: blob_id, location: Location { offset_span: OffsetSpan { start: row.get(1)?, From aff5b99b141c8d8d7b21c80ef5aa0e6f5bcf63f2 Mon Sep 17 00:00:00 2001 From: Brad Larsen Date: Fri, 30 Jun 2023 19:02:23 -0400 Subject: [PATCH 16/16] Fix clippy nits --- crates/noseyparker/src/datastore.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/noseyparker/src/datastore.rs b/crates/noseyparker/src/datastore.rs index b670d5077..7eb5bf9fb 100644 --- a/crates/noseyparker/src/datastore.rs +++ b/crates/noseyparker/src/datastore.rs @@ -308,7 +308,7 @@ impl Datastore { let v0: String = row.get(0)?; let blob_id = BlobId::from_hex(&v0).expect("blob id from database should be valid"); let m = Match { - blob_id: blob_id, + blob_id, location: Location { offset_span: OffsetSpan { start: row.get(1)?,