Skip to content

Commit

Permalink
Exclude big files when performing a worktree diff.
Browse files Browse the repository at this point in the history
This was lost previously when switching it over to a read-only
implementation.
Implementing it with an ignore list will take time, 400ms in the GitLab
repository, but it's not slower than it was before and it is always
preferred to not dump objects into the ODB unnecessarily.
  • Loading branch information
Byron committed Aug 29, 2024
1 parent a0e2361 commit 79798c7
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 57 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions crates/gitbutler-branch-actions/src/status.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,16 @@ pub fn get_applied_status_cached(
worktree_changes: Option<gitbutler_diff::DiffByPathMap>,
) -> Result<VirtualBranchesStatus> {
assure_open_workspace_mode(ctx).context("ng applied status requires open workspace mode")?;
// TODO(ST): this was `get_workspace_head()`, which is slow and ideally, we don't dynamically
// calculate which should already be 'fixed' - why do we have the integration branch
// if we can't assume it's in the right state? So ideally, we assure that the code
// that affects the integration branch also updates it?
let integration_commit_id = ctx.repository().head_commit()?.id();
let mut virtual_branches = ctx
.project()
.virtual_branches()
.list_branches_in_workspace()?;
let base_file_diffs = worktree_changes.map(Ok).unwrap_or_else(|| {
// TODO(ST): this was `get_workspace_head()`, which is slow and ideally, we don't dynamically
// calculate which should already be 'fixed' - why do we have the integration branch
// if we can't assume it's in the right state? So ideally, we assure that the code
// that affects the integration branch also updates it?
let integration_commit_id = ctx.repository().head_commit()?.id();
gitbutler_diff::workdir(ctx.repository(), &integration_commit_id.to_owned())
.context("failed to diff workdir")
})?;
Expand Down
1 change: 1 addition & 0 deletions crates/gitbutler-command-context/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ gix.workspace = true
tracing.workspace = true
gitbutler-project.workspace = true
itertools = "0.13"
bstr = "1.10.0"
3 changes: 3 additions & 0 deletions crates/gitbutler-command-context/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,6 @@ impl CommandContext {
)?)
}
}

mod repository_ext;
pub use repository_ext::RepositoryExtLite;
50 changes: 50 additions & 0 deletions crates/gitbutler-command-context/src/repository_ext.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use anyhow::{Context, Result};
use gix::bstr::{BString, ByteVec};
use tracing::instrument;

/// An extension trait that should avoid pulling in large amounts of dependency so it can be used
/// in more places without causing cycles.
/// `gitbutler_repo::RepositoryExt` may not be usable everywhere due to that.
pub trait RepositoryExtLite {
/// Exclude files that are larger than `limit_in_bytes` (eg. database.sql which may never be intended to be committed)
/// so they don't show up in the next diff.
fn ignore_large_files_in_diffs(&self, limit_in_bytes: u64) -> Result<()>;
}

impl RepositoryExtLite for git2::Repository {
#[instrument(level = tracing::Level::DEBUG, skip(self), err(Debug))]
fn ignore_large_files_in_diffs(&self, limit_in_bytes: u64) -> Result<()> {
use gix::bstr::ByteSlice;
let repo = gix::open(self.path())?;
let worktree_dir = repo
.work_dir()
.context("All repos are expected to have a worktree")?;
let files_to_exclude: Vec<_> = repo
.dirwalk_iter(
repo.index_or_empty()?,
None::<BString>,
Default::default(),
repo.dirwalk_options()?
.emit_ignored(None)
.emit_pruned(false)
.emit_untracked(gix::dir::walk::EmissionMode::Matching),
)?
.filter_map(Result::ok)
.filter_map(|item| {
let path = worktree_dir.join(gix::path::from_bstr(item.entry.rela_path.as_bstr()));
let file_is_too_large = path
.metadata()
.map_or(false, |md| md.is_file() && md.len() > limit_in_bytes);
file_is_too_large
.then(|| Vec::from(item.entry.rela_path).into_string().ok())
.flatten()
})
.collect();
// TODO(ST): refactor this to be path-safe and ' ' save - the returned list is space separated (!!)
// Just make sure this isn't needed anymore.
let ignore_list = files_to_exclude.join(" ");
// In-memory, libgit2 internal ignore rule
self.add_ignore_rule(&ignore_list)?;
Ok(())
}
}
3 changes: 2 additions & 1 deletion crates/gitbutler-diff/src/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::{borrow::Cow, collections::HashMap, path::PathBuf, str};
use anyhow::{Context, Result};
use bstr::{BStr, BString, ByteSlice, ByteVec};
use gitbutler_cherry_pick::RepositoryExt;
use gitbutler_command_context::RepositoryExtLite;
use gitbutler_serde::BStringForFrontend;
use serde::{Deserialize, Serialize};
use tracing::instrument;
Expand Down Expand Up @@ -155,8 +156,8 @@ pub fn workdir(repo: &git2::Repository, commit_oid: &git2::Oid) -> Result<DiffBy
for conflict_path_to_resolve in paths_to_add {
index.add_path(conflict_path_to_resolve.as_ref())?;
}
repo.ignore_large_files_in_diffs(50_000_000)?;
let diff = repo.diff_tree_to_workdir_with_index(Some(&old_tree), Some(&mut diff_opts))?;
// TODO(ST): bring back support for skipped (large) files.
hunks_by_filepath(Some(repo), &diff)
}

Expand Down
1 change: 1 addition & 0 deletions crates/gitbutler-oplog/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ strum = { version = "0.26", features = ["derive"] }
tracing.workspace = true
gix = { workspace = true, features = ["dirwalk", "credentials", "parallel"] }
toml.workspace = true
gitbutler-command-context.workspace = true
gitbutler-project.workspace = true
gitbutler-branch.workspace = true
gitbutler-serde.workspace = true
Expand Down
56 changes: 5 additions & 51 deletions crates/gitbutler-oplog/src/oplog.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
use std::{
collections::{hash_map::Entry, HashMap},
fs,
path::{Path, PathBuf},
path::PathBuf,
str::{from_utf8, FromStr},
time::Duration,
};

use anyhow::{anyhow, bail, Context, Result};
use git2::{DiffOptions, FileMode};
use gitbutler_branch::{Branch, SignaturePurpose, VirtualBranchesHandle, VirtualBranchesState};
use gitbutler_command_context::RepositoryExtLite;
use gitbutler_diff::{hunks_by_filepath, FileDiff};
use gitbutler_project::{
access::{WorktreeReadPermission, WorktreeWritePermission},
Project,
};
use gitbutler_repo::RepositoryExt;
use gix::bstr::{BString, ByteSlice, ByteVec};
use tracing::instrument;

use super::{
Expand Down Expand Up @@ -286,11 +286,7 @@ impl OplogExt for Project {
let old_wd_tree_id = tree_from_applied_vbranches(&repo, commit.parent(0)?.id())?;
let old_wd_tree = repo.find_tree(old_wd_tree_id)?;

// Exclude files that are larger than the limit (eg. database.sql which may never be intended to be committed)
let files_to_exclude =
worktree_files_larger_than_limit_as_git2_ignore_rule(&repo, worktree_dir)?;
// In-memory, libgit2 internal ignore rule
repo.add_ignore_rule(&files_to_exclude)?;
repo.ignore_large_files_in_diffs(SNAPSHOT_FILE_LIMIT_BYTES)?;

let mut diff_opts = git2::DiffOptions::new();
diff_opts
Expand Down Expand Up @@ -579,11 +575,7 @@ fn restore_snapshot(
let workdir_tree_id = tree_from_applied_vbranches(&repo, snapshot_commit_id)?;
let workdir_tree = repo.find_tree(workdir_tree_id)?;

// Exclude files that are larger than the limit (eg. database.sql which may never be intended to be committed)
let files_to_exclude =
worktree_files_larger_than_limit_as_git2_ignore_rule(&repo, worktree_dir)?;
// In-memory, libgit2 internal ignore rule
repo.add_ignore_rule(&files_to_exclude)?;
repo.ignore_large_files_in_diffs(SNAPSHOT_FILE_LIMIT_BYTES)?;

// Define the checkout builder
let mut checkout_builder = git2::build::CheckoutBuilder::new();
Expand Down Expand Up @@ -711,38 +703,6 @@ fn write_conflicts_tree(
Ok(conflicts_tree)
}

/// Exclude files that are larger than the limit (eg. database.sql which may never be intended to be committed)
/// TODO(ST): refactor this to be path-safe and ' ' save - the returned list is space separated (!!)
#[instrument(level = tracing::Level::DEBUG, skip(repo), err(Debug))]
fn worktree_files_larger_than_limit_as_git2_ignore_rule(
repo: &git2::Repository,
worktree_dir: &Path,
) -> Result<String> {
let repo = gix::open(repo.path())?;
let files_to_exclude: Vec<_> = repo
.dirwalk_iter(
repo.index_or_empty()?,
None::<BString>,
Default::default(),
repo.dirwalk_options()?
.emit_ignored(None)
.emit_pruned(false)
.emit_untracked(gix::dir::walk::EmissionMode::Matching),
)?
.filter_map(Result::ok)
.filter_map(|item| {
let path = worktree_dir.join(gix::path::from_bstr(item.entry.rela_path.as_bstr()));
let file_is_too_large = path.metadata().map_or(false, |md| {
md.is_file() && md.len() > SNAPSHOT_FILE_LIMIT_BYTES
});
file_is_too_large
.then(|| Vec::from(item.entry.rela_path).into_string().ok())
.flatten()
})
.collect();
Ok(files_to_exclude.join(" "))
}

/// Returns the number of lines of code (added + removed) since the last snapshot in `project`.
/// Includes untracked files.
/// `repo` is an already opened project repository.
Expand All @@ -752,13 +712,7 @@ fn lines_since_snapshot(project: &Project, repo: &git2::Repository) -> Result<us
// This looks at the diff between the tree of the currently selected as 'default' branch (where new changes go)
// and that same tree in the last snapshot. For some reason, comparing workdir to the workdir subree from
// the snapshot simply does not give us what we need here, so instead using tree to tree comparison.
let worktree_dir = project.path.as_path();

// Exclude files that are larger than the limit (eg. database.sql which may never be intended to be committed)
let files_to_exclude =
worktree_files_larger_than_limit_as_git2_ignore_rule(repo, worktree_dir)?;
// In-memory, libgit2 internal ignore rule
repo.add_ignore_rule(&files_to_exclude)?;
repo.ignore_large_files_in_diffs(SNAPSHOT_FILE_LIMIT_BYTES)?;

let oplog_state = OplogHandle::new(&project.gb_dir());
let Some(oplog_commit_id) = oplog_state.oplog_head()? else {
Expand Down

0 comments on commit 79798c7

Please sign in to comment.