Skip to content

Commit

Permalink
Rework parallelism in enumeration and scanning (#221)
Browse files Browse the repository at this point in the history
- The `FilesystemEnumerator` type now _only_ enumerates directories and files, and as such is a relatively lightweight task.

- Heavyweight enumeration tasks, including walking through Git history and extensible enumerator output (i.e., from the `--enumerator=FILE` option) are now performed by scanner threads instead of filesystem enumerator threads.

- Git repositories are now opened just a single time instead of twice.

- A new `ParallelBlobIterator` trait codifies things that can be enumerated to produce blobs with provenance metadata. This includes for now Git repositories, regular files, extensible enumerators. This starts to put in place necessary pieces to allow for enumeration of tarfiles, compressed archives, etc.

- Large parts of the `scan` command logic are now extracted into functions.

- The overall parallelism approach is a bit easier to see now: parallel filesystem input enumeration ==channel==> parallel scanning ==channel==> sequential datastore persistence.
  • Loading branch information
bradlarsen authored Sep 23, 2024
1 parent 5b0bd98 commit 0518d80
Show file tree
Hide file tree
Showing 6 changed files with 742 additions and 672 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 12 additions & 6 deletions crates/input-enumerator/src/git_repo_enumerator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ pub struct GitRepoResult {
/// Path to the repository clone
pub path: PathBuf,

/// The opened Git repository
pub repository: Repository,

/// The blobs to be scanned
pub blobs: Vec<BlobMetadata>,

Expand Down Expand Up @@ -119,20 +122,20 @@ pub struct BlobMetadata {
// -------------------------------------------------------------------------------------------------
pub struct GitRepoWithMetadataEnumerator<'a> {
path: &'a Path,
repo: &'a Repository,
repo: Repository,
gitignore: &'a Gitignore,
}

impl<'a> GitRepoWithMetadataEnumerator<'a> {
pub fn new(path: &'a Path, repo: &'a Repository, gitignore: &'a Gitignore) -> Self {
pub fn new(path: &'a Path, repo: Repository, gitignore: &'a Gitignore) -> Self {
Self {
path,
repo,
gitignore,
}
}

pub fn run(&self) -> Result<GitRepoResult> {
pub fn run(self) -> Result<GitRepoResult> {
let t1 = Instant::now();

use gix::object::Kind;
Expand Down Expand Up @@ -254,6 +257,7 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
})
.collect();
Ok(GitRepoResult {
repository: self.repo,
path,
blobs,
commit_metadata,
Expand Down Expand Up @@ -348,6 +352,7 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
.collect();

Ok(GitRepoResult {
repository: self.repo,
path,
blobs,
commit_metadata,
Expand All @@ -362,15 +367,15 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
// -------------------------------------------------------------------------------------------------
pub struct GitRepoEnumerator<'a> {
path: &'a Path,
repo: &'a Repository,
repo: Repository,
}

impl<'a> GitRepoEnumerator<'a> {
pub fn new(path: &'a Path, repo: &'a Repository) -> Self {
pub fn new(path: &'a Path, repo: Repository) -> Self {
Self { path, repo }
}

pub fn run(&self) -> Result<GitRepoResult> {
pub fn run(self) -> Result<GitRepoResult> {
use gix::object::Kind;
use gix::odb::store::iter::Ordering;
use gix::prelude::*;
Expand Down Expand Up @@ -406,6 +411,7 @@ impl<'a> GitRepoEnumerator<'a> {
})
.collect();
Ok(GitRepoResult {
repository: self.repo,
path,
blobs,
commit_metadata: Default::default(),
Expand Down
135 changes: 26 additions & 109 deletions crates/input-enumerator/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,56 +2,35 @@ pub mod blob_appearance;
pub mod bstring_table;
pub mod git_commit_metadata;
pub mod git_metadata_graph;
pub use gix::{Repository, ThreadSafeRepository};

use anyhow::{bail, Result};
use bstr::BString;
use crossbeam_channel::Sender;
use ignore::{
gitignore::{Gitignore, GitignoreBuilder},
DirEntry, WalkBuilder, WalkState,
};
pub use ignore::gitignore::{Gitignore, GitignoreBuilder};
use ignore::{DirEntry, WalkBuilder, WalkState};
use std::path::{Path, PathBuf};
use std::time::Instant;
use tracing::{debug, error, warn};

mod git_repo_enumerator;
pub use git_repo_enumerator::{GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator};

pub enum FoundInput {
File(FileResult),
GitRepo(GitRepoResult),
EnumeratorBlob(EnumeratorBlobResult),
Directory(DirectoryResult),
EnumeratorFile(EnumeratorFileResult),
}

pub struct FileResult {
pub path: PathBuf,
pub num_bytes: u64,
}

#[derive(serde::Deserialize, serde::Serialize)]
pub enum Content {
#[serde(rename = "content_base64")]
Base64(#[serde(with = "bstring_serde::BStringBase64")] BString),

#[serde(rename = "content")]
Utf8(String),
}

impl Content {
pub fn as_bytes(&self) -> &[u8] {
match self {
Content::Base64(s) => s.as_slice(),
Content::Utf8(s) => s.as_bytes(),
}
}
pub struct EnumeratorFileResult {
pub path: PathBuf,
}

#[derive(serde::Deserialize, serde::Serialize)]
pub struct EnumeratorBlobResult {
#[serde(flatten)]
pub content: Content,

pub provenance: serde_json::Value,
pub struct DirectoryResult {
pub path: PathBuf,
}

pub type Output = Sender<FoundInput>;
Expand All @@ -61,9 +40,6 @@ pub type Output = Sender<FoundInput>;
// -------------------------------------------------------------------------------------------------
struct VisitorBuilder<'t> {
max_file_size: Option<u64>,
collect_git_metadata: bool,
enumerate_git_history: bool,
gitignore: &'t Gitignore,
output: &'t Output,
}

Expand All @@ -74,9 +50,6 @@ where
fn build(&mut self) -> Box<dyn ignore::ParallelVisitor + 's> {
Box::new(Visitor {
max_file_size: self.max_file_size,
collect_git_metadata: self.collect_git_metadata,
enumerate_git_history: self.enumerate_git_history,
gitignore: self.gitignore,
output: self.output,
})
}
Expand All @@ -86,10 +59,7 @@ where
// Visitor
// -------------------------------------------------------------------------------------------------
struct Visitor<'t> {
collect_git_metadata: bool,
enumerate_git_history: bool,
max_file_size: Option<u64>,
gitignore: &'t Gitignore,
output: &'t Output,
}

Expand All @@ -103,8 +73,8 @@ impl<'t> Visitor<'t> {
self.output.send(FoundInput::File(r)).unwrap();
}

fn found_git_repo(&mut self, r: GitRepoResult) {
self.output.send(FoundInput::GitRepo(r)).unwrap();
fn found_directory(&mut self, r: DirectoryResult) {
self.output.send(FoundInput::Directory(r)).unwrap();
}
}

Expand All @@ -131,68 +101,17 @@ impl<'t> ignore::ParallelVisitor for Visitor<'t> {
let is_dir = metadata.is_dir();

if metadata.is_file() {
let bytes = metadata.len();
let path = path.to_owned();
if self.file_too_big(bytes) {
debug!("Skipping {}: size {bytes} exceeds max size", path.display());
let num_bytes = metadata.len();
if self.file_too_big(num_bytes) {
debug!("Skipping {}: size {num_bytes} exceeds max size", path.display());
} else {
self.found_file(FileResult {
path,
num_bytes: bytes,
});
let path = path.to_owned();
self.found_file(FileResult { path, num_bytes });
}
} else if is_dir {
if self.enumerate_git_history {
match open_git_repo(path) {
Err(e) => {
error!(
"Failed to open Git repository at {}: {e}; skipping",
path.display()
);
return WalkState::Skip;
}
Ok(Some(repository)) => {
let t1 = Instant::now();
debug!("Found Git repository at {}", path.display());

if self.collect_git_metadata {
let enumerator = GitRepoWithMetadataEnumerator::new(
path,
&repository,
&self.gitignore,
);
match enumerator.run() {
Err(e) => {
error!(
"Failed to enumerate Git repository at {}: {e}; skipping",
path.display(),
);
return WalkState::Skip;
}
Ok(r) => self.found_git_repo(r),
}
} else {
let enumerator = GitRepoEnumerator::new(path, &repository);
match enumerator.run() {
Err(e) => {
error!(
"Failed to enumerate Git repository at {}: {e}; skipping",
path.display(),
);
return WalkState::Skip;
}
Ok(r) => self.found_git_repo(r),
}
}
debug!(
"Enumerated Git repository at {} in {:.6}s",
path.display(),
t1.elapsed().as_secs_f64()
);
}
Ok(None) => {}
}
}
self.found_directory(DirectoryResult {
path: path.to_owned(),
});
} else if metadata.is_symlink() {
// No problem; just ignore it
//
Expand Down Expand Up @@ -261,14 +180,12 @@ impl FilesystemEnumerator {
builder.max_filesize(max_file_size);
builder.standard_filters(false);

let gitignore_builder = GitignoreBuilder::new("");

Ok(FilesystemEnumerator {
walk_builder: builder,
max_file_size,
collect_git_metadata: Self::DEFAULT_COLLECT_GIT_METADATA,
enumerate_git_history: Self::DEFAULT_ENUMERATE_GIT_HISTORY,
gitignore_builder,
gitignore_builder: GitignoreBuilder::new(""),
})
}

Expand Down Expand Up @@ -330,14 +247,14 @@ impl FilesystemEnumerator {
self
}

pub fn run(&self, output: Output) -> Result<()> {
let gitignore = self.gitignore_builder.build()?;
/// Get the configured Gitignore for this enumerator.
pub fn gitignore(&self) -> Result<Gitignore> {
Ok(self.gitignore_builder.build()?)
}

pub fn run(&self, output: Output) -> Result<()> {
let mut visitor_builder = VisitorBuilder {
collect_git_metadata: self.collect_git_metadata,
enumerate_git_history: self.enumerate_git_history,
max_file_size: self.max_file_size,
gitignore: &gitignore,
output: &output,
};

Expand All @@ -350,7 +267,7 @@ impl FilesystemEnumerator {
}

/// Opens the given Git repository if it exists, returning None otherwise.
pub fn open_git_repo(path: &Path) -> Result<Option<gix::Repository>> {
pub fn open_git_repo(path: &Path) -> Result<Option<Repository>> {
let opts = gix::open::Options::isolated().open_path_as_is(true);
match gix::open_opts(path, opts) {
Err(gix::open::Error::NotARepository { .. }) => Ok(None),
Expand Down
1 change: 1 addition & 0 deletions crates/noseyparker-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ vergen = { version = "8.3", features = ["build", "cargo", "git", "gitcl", "rustc

[dependencies]
anyhow = { version = "1.0" }
bstring-serde = { path = "../bstring-serde" }
bstr = { version = "1.0" }
clap = { version = "4.3", features = ["cargo", "derive", "env", "unicode", "wrap_help"] }
clap_complete = "4.4"
Expand Down
Loading

0 comments on commit 0518d80

Please sign in to comment.