From 3aabafbaded1ea7bbff0a22d15e429374e39853b Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Wed, 18 Dec 2024 23:47:31 -0500 Subject: [PATCH] perf(cargo-package): match certain path prefix with pathspec `check_repo_state` checks the entire git repo status. This is usually fine if you have only a few packages in a workspace. For huge monorepos, it may hit performance issues. For example, on awslabs/aws-sdk-rust@2cbd34db53389aed1cafe4c56d23186ef5ff304a the workspace has roughly 434 members to publish. `git ls-files` reported us 204379 files in this Git repository. That means git may need to check status of all files 434 times. That would be `204379 * 434 = 88,700,486` checks! Moreover, the current algorithm is finding the intersection of `PathSource::list_files` and `git status`. It is an `O(n^2)` check. Let's assume files are evenly distributed into each package, so roughly 470 files per package. If we're unlucky to have some dirty files, say 100 files. We will have to do `470 * 100 = 47,000` times of path comparisons. Even worse, because we `git status` everything in the repo, we'll have to it for all members, even when those dirty files are not part of the current package in question. So it becomes `470 * 100 * 434 = 20,398,000`! Instead of comparing with the status of the entire repository, this patch use the magic pathspec[1] to tell git only reports paths that match a certain path prefix. This wouldn't help the `O(n^2)` algorithm, but at least it won't check dirty files outside the current package. [1]: https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec --- src/cargo/ops/cargo_package.rs | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/cargo/ops/cargo_package.rs b/src/cargo/ops/cargo_package.rs index f48b58df419d..d6f9fa51e578 100644 --- a/src/cargo/ops/cargo_package.rs +++ b/src/cargo/ops/cargo_package.rs @@ -815,7 +815,8 @@ fn check_repo_state( // - ignored (in case the user has an `include` directive that // conflicts with .gitignore). let mut dirty_files = Vec::new(); - collect_statuses(repo, &mut dirty_files)?; + let pathspec = relative_pathspec(repo, pkg.root()); + collect_statuses(repo, &[pathspec.as_str()], &mut dirty_files)?; // Include each submodule so that the error message can provide // specifically *which* files in a submodule are modified. status_submodules(repo, &mut dirty_files)?; @@ -857,16 +858,27 @@ fn check_repo_state( } } + /// Use pathspec so git only matches a certain path prefix + fn relative_pathspec(repo: &git2::Repository, pkg_root: &Path) -> String { + let workdir = repo.workdir().unwrap(); + let relpath = pkg_root.strip_prefix(workdir).unwrap_or(Path::new("")); + // to unix separators + relpath.to_str().unwrap().replace('\\', "/") + } + // Helper to collect dirty statuses for a single repo. fn collect_statuses( repo: &git2::Repository, + pathspecs: &[&str], dirty_files: &mut Vec, ) -> CargoResult<()> { let mut status_opts = git2::StatusOptions::new(); // Exclude submodules, as they are being handled manually by recursing // into each one so that details about specific files can be // retrieved. - status_opts + pathspecs + .iter() + .fold(&mut status_opts, git2::StatusOptions::pathspec) .exclude_submodules(true) .include_ignored(true) .include_untracked(true); @@ -901,7 +913,7 @@ fn check_repo_state( // If its files are required, then the verification step should fail. if let Ok(sub_repo) = submodule.open() { status_submodules(&sub_repo, dirty_files)?; - collect_statuses(&sub_repo, dirty_files)?; + collect_statuses(&sub_repo, &[], dirty_files)?; } } Ok(())