Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use gitoxide for list_files_git #13592

Merged
merged 11 commits into from
Mar 24, 2024
22 changes: 22 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ filetime = "0.2.23"
flate2 = { version = "1.0.28", default-features = false, features = ["zlib"] }
git2 = "0.18.2"
git2-curl = "0.19.0"
gix = { version = "0.61.0", default-features = false, features = ["blocking-http-transport-curl", "progress-tree", "revision", "parallel"] }
gix = { version = "0.61.0", default-features = false, features = ["blocking-http-transport-curl", "progress-tree", "revision", "parallel", "dirwalk"] }
glob = "0.3.1"
handlebars = { version = "5.1.0", features = ["dir_source"] }
hex = "0.4.3"
Expand Down
290 changes: 288 additions & 2 deletions src/cargo/sources/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::util::{internal, CargoResult, GlobalContext};
use anyhow::Context as _;
use cargo_util::paths;
use filetime::FileTime;
use gix::bstr::{ByteSlice, ByteVec};
use ignore::gitignore::GitignoreBuilder;
use tracing::{trace, warn};
use walkdir::WalkDir;
Expand Down Expand Up @@ -139,7 +140,16 @@ impl<'gctx> PathSource<'gctx> {
let root = pkg.root();
let no_include_option = pkg.manifest().include().is_empty();
let git_repo = if no_include_option {
self.discover_git_repo(root)?
if self
.gctx
.cli_unstable()
.gitoxide
.map_or(false, |features| features.fetch)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I don't know if it's great to above fetch. For now I left it as you might decide to use gix by default here. If not, one could add another feature toggle like list, even though I fear very low adoption/testing.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A separate flag might be of use if we want to stabilize piece-meal.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! This is implemented now, and I called the flag 'list-files'.

{
self.discover_gix_repo(root)?.map(Git2OrGixRepository::Gix)
} else {
self.discover_git_repo(root)?.map(Git2OrGixRepository::Git2)
}
} else {
None
};
Expand Down Expand Up @@ -197,7 +207,10 @@ impl<'gctx> PathSource<'gctx> {
// Attempt Git-prepopulate only if no `include` (see rust-lang/cargo#4135).
if no_include_option {
if let Some(repo) = git_repo {
return self.list_files_git(pkg, &repo, &filter);
return match repo {
Git2OrGixRepository::Git2(repo) => self.list_files_git(pkg, &repo, &filter),
Git2OrGixRepository::Gix(repo) => self.list_files_gix(pkg, &repo, &filter),
};
}
}
self.list_files_walk(pkg, &filter)
Expand Down Expand Up @@ -246,6 +259,52 @@ impl<'gctx> PathSource<'gctx> {
Ok(None)
}

/// Returns [`Some(gix::Repository)`](gix::Repository) if there is a sibling `Cargo.toml` and `.git`
arlosi marked this conversation as resolved.
Show resolved Hide resolved
/// directory; otherwise, the caller should fall back on full file list.
fn discover_gix_repo(&self, root: &Path) -> CargoResult<Option<gix::Repository>> {
let repo = match gix::discover(root) {
epage marked this conversation as resolved.
Show resolved Hide resolved
Ok(repo) => repo,
Err(e) => {
tracing::debug!(
"could not discover git repo at or above {}: {}",
root.display(),
e
);
return Ok(None);
}
};
let index = repo
.index_or_empty()
.with_context(|| format!("failed to open git index at {}", repo.path().display()))?;
let repo_root = repo.work_dir().ok_or_else(|| {
anyhow::format_err!(
"did not expect repo at {} to be bare",
repo.path().display()
)
})?;
let repo_relative_path = match paths::strip_prefix_canonical(root, repo_root) {
Ok(p) => p,
Err(e) => {
tracing::warn!(
"cannot determine if path `{:?}` is in git repo `{:?}`: {:?}",
root,
repo_root,
e
);
return Ok(None);
}
};
let manifest_path = gix::path::join_bstr_unix_pathsep(
gix::path::into_bstr(repo_relative_path),
"Cargo.toml",
);
if index.entry_index_by_path(&manifest_path).is_ok() {
return Ok(Some(repo));
}
// Package Cargo.toml is not in git, don't use git to guide our selection.
Ok(None)
}

/// Lists files relevant to building this package inside this source by
/// consulting both Git index (tracked) or status (untracked) under
/// a given Git repository.
Expand Down Expand Up @@ -411,6 +470,228 @@ impl<'gctx> PathSource<'gctx> {
}
}

/// Lists files relevant to building this package inside this source by
/// traversing the git working tree, while avoiding ignored files.
///
/// This looks into Git submodules as well, resolving them to individual files.
/// Symlinks to directories will also be resolved, which may include `.git` folders
/// in entirety if the link points to a submodule.
fn list_files_gix(
&self,
pkg: &Package,
repo: &gix::Repository,
filter: &dyn Fn(&Path, bool) -> bool,
) -> CargoResult<Vec<PathBuf>> {
use gix::dir::entry::Status;
warn!("list_files_gix {}", pkg.package_id());
epage marked this conversation as resolved.
Show resolved Hide resolved
let options = repo
.dirwalk_options()?
.emit_untracked(gix::dir::walk::EmissionMode::Matching)
.emit_ignored(None)
.emit_tracked(true)
.recurse_repositories(false)
.symlinks_to_directories_are_ignored_like_directories(true)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This option I added specifically to support a difference in behaviour when comparing libgit2 to Git. Symlinks to directories are treated as directories by the .gitignore machinery (on Unix only in libgit2, all platforms in gitoxide). Git itself always treats symlinks as files.

.emit_empty_directories(false);
let index = repo.index_or_empty()?;
let root = repo
.work_dir()
.ok_or_else(|| anyhow::format_err!("can't list files on a bare repository"))?;
assert!(
root.is_absolute(),
"BUG: paths used internally are absolute, and the repo inherits that"
);

let pkg_path = pkg.root();
let mut target_prefix;
let repo_relative_pkg_path = pkg_path.strip_prefix(root).ok().unwrap_or(Path::new(""));
arlosi marked this conversation as resolved.
Show resolved Hide resolved

let pathspec = {
let mut include = gix::path::to_unix_separators_on_windows(gix::path::into_bstr(
repo_relative_pkg_path,
));

target_prefix = include.clone();
target_prefix.to_mut().push_str(if include.is_empty() {
"target/"
} else {
"/target/"
});

// The `target` directory is never included if it is in the package root.
let exclude = {
let mut pattern = include.clone();
pattern
.to_mut()
.insert_str(0, if include.is_empty() { ":!" } else { ":!/" });
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This pathspec prefix is short for 'exclude from top', with the 'top' being required to avoid the CWD to affect the pathspec.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the : syntax and where can I read more about it?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, is it a path separator? Why are we using strings with separators rather than Vec?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pathspec syntax is document in the Git glossary of all places.

With a top-level Cargo.toml file, this code produces two pathspecs: :/ and :!/target/, with a sub-package it would look like :/sub-package and :!/sub-package/target/.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is a Rust API using stringly-typed values based on a command-line?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I see what you are getting at.

I didn't think of innovating here, following the API used by libgit2 and git2 respectively. Having strings for pathspecs runs deep, and they are treated more like paths or regex or globs rather than a data structure. The datastructure itself also isn't particularly well suited to be constructed by hand, as git settings affect it (like case-sensitivity). It's quite complex, but I can imagine to tackle this by adding a trait like TryIntoPathspec which then works for strings as well as for hand-created pathspecs.

I see how being able to not have to go through the parser here would be beneficial though, so I have put this into the roadmap.

(Since it's not worse than git2 which is also stringly typed, I hope this won't have to block this PR.)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, thank you for the explanation!

For me, it is more of trying to make the intent of the code clearer. I'll mark this as resolved but have some other thoughts for improving it with what we have now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, was tempted to encourage pathspec being added to the variables but the leading let pathspec = helps. Still easy to miss the structure these variables are intended to uphold. I feel like I could go either way.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's definitely quite complex code, with a bunch of variables and inter-dependencies to make it all line up. This could already be improved a lit if format!() could be used, but since paths are involved, that's not possible :/.
Maybe the pathspec-code can be refactored into something better nonetheless.

pattern.to_mut().push_str(b"/target/");
pattern
};
include.to_mut().insert_str(0, ":/");

vec![include, exclude]
};

let mut delegate = Delegate::new(self, pkg, pkg_path, repo, root, filter)?;
repo.dirwalk(
&index,
pathspec,
&Default::default(),
options,
&mut delegate,
)?;

let mut files = delegate.into_result()?;
// Append all normal files that might be tracked in `target/`.
files.extend(
index
.prefixed_entries(target_prefix.as_ref())
.unwrap_or_default()
.iter()
.filter(|entry| {
entry.stage() == 0
epage marked this conversation as resolved.
Show resolved Hide resolved
&& entry
.mode
.to_tree_entry_mode()
.map(|mode| mode.kind())
.map_or(false, |kind| {
use gix::object::tree::EntryKind::*;
matches!(kind, Blob | BlobExecutable | Link)
})
})
.map(|entry| root.join(gix::path::from_bstr(entry.path(&index)))),
);
return Ok(files);

struct Delegate<'a, 'gctx> {
root: &'a Path,
pkg: &'a Package,
pkg_path: &'a Path,
parent: &'a PathSource<'gctx>,
paths: Vec<PathBuf>,
submodules_by_rela_path: Vec<(gix::bstr::BString, gix::Submodule<'a>)>,
subpackages_found: Vec<PathBuf>,
filter: &'a dyn Fn(&Path, bool) -> bool,
err: Option<anyhow::Error>,
}

impl<'a, 'gctx> gix::dir::walk::Delegate for Delegate<'a, 'gctx> {
fn emit(
&mut self,
entry: gix::dir::EntryRef<'_>,
_collapsed_directory_status: Option<Status>,
) -> gix::dir::walk::Action {
match self.handle_entry(entry) {
Ok(()) => gix::dir::walk::Action::Continue,
Err(e) => {
self.err = Some(e.into());
gix::dir::walk::Action::Cancel
}
}
}
}

impl<'a, 'gctx> Delegate<'a, 'gctx> {
fn new(
parent: &'a PathSource<'gctx>,
pkg: &'a Package,
pkg_path: &'a Path,
repo: &'a gix::Repository,
root: &'a Path,
filter: &'a dyn Fn(&Path, bool) -> bool,
) -> CargoResult<Self> {
let submodules_by_rela_path: Vec<_> = repo
.submodules()?
.into_iter()
.flatten()
.map(|sm| {
sm.path()
.map(|path| path.into_owned())
.map(|path| (path, sm))
})
.collect::<Result<_, _>>()?;
Ok(Self {
root,
pkg,
pkg_path,
parent,
filter,
submodules_by_rela_path,
paths: vec![],
subpackages_found: vec![],
err: None,
})
}

fn into_result(self) -> CargoResult<Vec<PathBuf>> {
match self.err {
None => {
return Ok(self.paths);
}
Some(e) => return Err(e),
}
}

fn handle_entry(&mut self, entry: gix::dir::EntryRef<'_>) -> CargoResult<()> {
if entry.status == Status::Untracked && entry.rela_path.as_ref() == "Cargo.lock" {
return Ok(());
}
let file_path = self
.root
.join(gix::path::from_bstr(entry.rela_path.as_ref()));
if file_path.file_name().and_then(|name| name.to_str()) == Some("Cargo.toml") {
// Keep track of all sub-packages found and also strip out all
// matches we've found so far. Note, though, that if we find
// our own `Cargo.toml`, we keep going.
let path = file_path.parent().unwrap();
if path != self.pkg_path {
warn!("subpackage found: {}", path.display());
self.paths.retain(|p| !p.starts_with(path));
self.subpackages_found.push(path.to_path_buf());
return Ok(());
}
}

// If this file is part of any other sub-package we've found so far,
// skip it.
if self
.subpackages_found
.iter()
.any(|p| file_path.starts_with(p))
{
return Ok(());
}

let is_dir = entry.disk_kind.map_or(false, |kind| kind.is_dir());
if entry.disk_kind == Some(gix::dir::entry::Kind::Repository) {
match self
.submodules_by_rela_path
.binary_search_by(|(sm_path, _)| {
sm_path.as_bstr().cmp(entry.rela_path.as_ref())
})
.map(|idx| self.submodules_by_rela_path[idx].1.open())
{
Ok(Ok(Some(sm_repo))) => {
let files =
self.parent
.list_files_gix(self.pkg, &sm_repo, self.filter)?;
self.paths.extend(files);
}
_ => {
self.parent
.walk(&file_path, &mut self.paths, false, self.filter)?;
}
}
} else if (self.filter)(&file_path, is_dir) {
assert!(!is_dir);
// We found a file!
warn!(" found {}", file_path.display());
epage marked this conversation as resolved.
Show resolved Hide resolved
self.paths.push(file_path);
}
Ok(())
}
}
}

/// Lists files relevant to building this package inside this source by
/// walking the filesystem from the package root path.
///
Expand Down Expand Up @@ -541,6 +822,11 @@ impl<'gctx> PathSource<'gctx> {
}
}

enum Git2OrGixRepository {
Git2(git2::Repository),
Gix(gix::Repository),
}

impl<'gctx> Debug for PathSource<'gctx> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
write!(f, "the paths source")
Expand Down
Loading