From 4c06c57d0dc303b2bc93a5a52f5b962cae48bbce Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 31 Dec 2024 13:24:38 -0500 Subject: [PATCH 1/6] refactor(cargo-util): one generic for each argument So `path` and `base` are able to accept different types --- crates/cargo-util/src/paths.rs | 6 +++--- src/cargo/ops/cargo_package/vcs.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/cargo-util/src/paths.rs b/crates/cargo-util/src/paths.rs index 5d7e3c5a681..cc73fbad892 100644 --- a/crates/cargo-util/src/paths.rs +++ b/crates/cargo-util/src/paths.rs @@ -710,9 +710,9 @@ pub fn set_file_time_no_err>(path: P, time: FileTime) { /// This canonicalizes both paths before stripping. This is useful if the /// paths are obtained in different ways, and one or the other may or may not /// have been normalized in some way. -pub fn strip_prefix_canonical>( - path: P, - base: P, +pub fn strip_prefix_canonical( + path: impl AsRef, + base: impl AsRef, ) -> Result { // Not all filesystems support canonicalize. Just ignore if it doesn't work. let safe_canonicalize = |path: &Path| match path.canonicalize() { diff --git a/src/cargo/ops/cargo_package/vcs.rs b/src/cargo/ops/cargo_package/vcs.rs index 3c447f32b72..990c944a6c1 100644 --- a/src/cargo/ops/cargo_package/vcs.rs +++ b/src/cargo/ops/cargo_package/vcs.rs @@ -235,11 +235,11 @@ fn dirty_metadata_paths(pkg: &Package, repo: &git2::Repository) -> CargoResult Date: Tue, 31 Dec 2024 12:40:23 -0500 Subject: [PATCH 2/6] refactor(package): simplify metadata path field report path join `abs_path` and `workdir.join(rel_path)` are the same at that point. --- src/cargo/ops/cargo_package/vcs.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/cargo/ops/cargo_package/vcs.rs b/src/cargo/ops/cargo_package/vcs.rs index 990c944a6c1..522133b94aa 100644 --- a/src/cargo/ops/cargo_package/vcs.rs +++ b/src/cargo/ops/cargo_package/vcs.rs @@ -242,12 +242,7 @@ fn dirty_metadata_paths(pkg: &Package, repo: &git2::Repository) -> CargoResult Date: Sat, 28 Dec 2024 22:42:11 -0500 Subject: [PATCH 3/6] refactor(source): preserve whether a path is under a symlink dir This is helpful for VCS status check. Paths emitted by PathSource are always under package root, We lose the track of file type info of paths under symlink dirs, so we need this extra bit of information. --- src/cargo/sources/path.rs | 40 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/cargo/sources/path.rs b/src/cargo/sources/path.rs index 8bafcf18099..c1e81528fbd 100644 --- a/src/cargo/sources/path.rs +++ b/src/cargo/sources/path.rs @@ -448,6 +448,8 @@ impl From for FileType { pub struct PathEntry { path: PathBuf, ty: FileType, + /// Whether this path was visited when traversing a symlink directory. + under_symlink_dir: bool, } impl PathEntry { @@ -469,10 +471,21 @@ impl PathEntry { /// Similar to [`std::path::Path::is_symlink`] /// but doesn't follow the symbolic link nor make any system call + /// + /// If the path is not a symlink but under a symlink parent directory, + /// this will return false. + /// See [`PathEntry::is_symlink_or_under_symlink`] for an alternative. pub fn is_symlink(&self) -> bool { matches!(self.ty, FileType::Symlink) } + /// Whether a path is a symlink or a path under a symlink directory. + /// + /// Use [`PathEntry::is_symlink`] to get the exact file type of the path only. + pub fn is_symlink_or_under_symlink(&self) -> bool { + self.is_symlink() || self.under_symlink_dir + } + /// Whether this path might be a plain text symlink. /// /// Git may check out symlinks as plain text files that contain the link texts, @@ -826,6 +839,9 @@ fn list_files_gix( files.push(PathEntry { path: file_path, ty, + // Git index doesn't include files from symlink diretory, + // symlink dirs are handled in `list_files_walk`. + under_symlink_dir: false, }); } } @@ -847,6 +863,10 @@ fn list_files_walk( ) -> CargoResult<()> { let walkdir = WalkDir::new(path) .follow_links(true) + // While this is the default, set it explicitly. + // We need walkdir to visit the directory tree in depth-first order, + // so we can ensure a path visited later be under a certain directory. + .contents_first(false) .into_iter() .filter_entry(|entry| { let path = entry.path(); @@ -876,10 +896,27 @@ fn list_files_walk( true }); + + let mut current_symlink_dir = None; for entry in walkdir { match entry { Ok(entry) => { let file_type = entry.file_type(); + + match current_symlink_dir.as_ref() { + Some(dir) if entry.path().starts_with(dir) => { + // Still walk under the same parent symlink dir, so keep it + } + Some(_) | None => { + // Not under any parent symlink dir, update the current one. + current_symlink_dir = if file_type.is_dir() && entry.path_is_symlink() { + Some(entry.path().to_path_buf()) + } else { + None + }; + } + } + if file_type.is_file() || file_type.is_symlink() { // We follow_links(true) here so check if entry was created from a symlink let ty = if entry.path_is_symlink() { @@ -890,6 +927,8 @@ fn list_files_walk( ret.push(PathEntry { path: entry.into_path(), ty, + // This rely on contents_first(false), which walks in depth-first order + under_symlink_dir: current_symlink_dir.is_some(), }); } } @@ -907,6 +946,7 @@ fn list_files_walk( Some(path) => ret.push(PathEntry { path: path.to_path_buf(), ty: FileType::Other, + under_symlink_dir: false, }), None => return Err(err.into()), }, From 014e516e743cc9edd567ce956304f98841c45f17 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 29 Dec 2024 01:43:43 -0500 Subject: [PATCH 4/6] test(package): symlink dirty also under dirtiness check This show that a regular file under a symlink directory is also not tarcked by the current vcs dirtiness check. --- tests/testsuite/package.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/testsuite/package.rs b/tests/testsuite/package.rs index 9c6d0493f83..7ee9433a2e3 100644 --- a/tests/testsuite/package.rs +++ b/tests/testsuite/package.rs @@ -1339,10 +1339,12 @@ fn dirty_file_outside_pkg_root_considered_dirty() { license-file = "../LICENSE" "#, ) + .file("original-dir/file", "before") .symlink("lib.rs", "isengard/src/lib.rs") .symlink("README.md", "isengard/README.md") .file(&main_outside_pkg_root, "fn main() {}") .symlink(&main_outside_pkg_root, "isengard/src/main.rs") + .symlink_dir("original-dir", "isengard/symlink-dir") }); git::commit(&repo); @@ -1352,6 +1354,7 @@ fn dirty_file_outside_pkg_root_considered_dirty() { // * Changes in files outside package root that source files symlink to p.change_file("README.md", "after"); p.change_file("lib.rs", "pub fn after() {}"); + p.change_file("original-dir/file", "after"); // * Changes in files outside pkg root that `license-file`/`readme` point to p.change_file("LICENSE", "after"); // * When workspace inheritance is involved and changed @@ -1388,7 +1391,7 @@ to proceed despite this and include the uncommitted changes, pass the `--allow-d p.cargo("package --workspace --no-verify --allow-dirty") .with_stderr_data(str![[r#" [PACKAGING] isengard v0.0.0 ([ROOT]/foo/isengard) -[PACKAGED] 8 files, [FILE_SIZE]B ([FILE_SIZE]B compressed) +[PACKAGED] 9 files, [FILE_SIZE]B ([FILE_SIZE]B compressed) "#]]) .run(); @@ -1411,6 +1414,7 @@ edition = "2021" "Cargo.toml.orig", "src/lib.rs", "src/main.rs", + "symlink-dir/file", "Cargo.lock", "LICENSE", "README.md", @@ -1418,6 +1422,7 @@ edition = "2021" [ ("src/lib.rs", str!["pub fn after() {}"]), ("src/main.rs", str![r#"fn main() { eprintln!("after"); }"#]), + ("symlink-dir/file", str!["after"]), ("README.md", str!["after"]), ("LICENSE", str!["after"]), ("Cargo.toml", cargo_toml), From de39f59e260eaeb7a6186da3311c8658ddf6c453 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 24 Dec 2024 00:09:17 -0500 Subject: [PATCH 5/6] fix(package): check dirtiness of symlink source files This adds a special case for checking source files are symlinks and have been modified when under a VCS control This is required because those paths may link to a file outside the current package root, but still under the git workdir, affecting the final packaged `.crate` file. This may have potential performance issue. If a package contains thousands of symlinks, Cargo will fire `git status` for each of them. --- src/cargo/ops/cargo_package/vcs.rs | 30 ++++++++++++++++++++++++++++++ tests/testsuite/package.rs | 5 ++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/cargo/ops/cargo_package/vcs.rs b/src/cargo/ops/cargo_package/vcs.rs index 522133b94aa..e007841e9e8 100644 --- a/src/cargo/ops/cargo_package/vcs.rs +++ b/src/cargo/ops/cargo_package/vcs.rs @@ -1,5 +1,6 @@ //! Helpers to gather the VCS information for `cargo package`. +use std::collections::HashSet; use std::path::Path; use std::path::PathBuf; @@ -187,6 +188,7 @@ fn git( .filter(|src_file| dirty_files.iter().any(|path| src_file.starts_with(path))) .map(|p| p.as_ref()) .chain(dirty_metadata_paths(pkg, repo)?.iter()) + .chain(dirty_symlinks(pkg, repo, src_files)?.iter()) .map(|path| { pathdiff::diff_paths(path, cwd) .as_ref() @@ -249,6 +251,34 @@ fn dirty_metadata_paths(pkg: &Package, repo: &git2::Repository) -> CargoResult CargoResult> { + let workdir = repo.workdir().unwrap(); + let mut dirty_symlinks = HashSet::new(); + for rel_path in src_files + .iter() + .filter(|p| p.is_symlink_or_under_symlink()) + .map(|p| p.as_ref().as_path()) + // If inside package root. Don't bother checking git status. + .filter(|p| paths::strip_prefix_canonical(p, pkg.root()).is_err()) + // Handle files outside package root but under git workdir, + .filter_map(|p| paths::strip_prefix_canonical(p, workdir).ok()) + { + if repo.status_file(&rel_path)? != git2::Status::CURRENT { + dirty_symlinks.insert(workdir.join(rel_path)); + } + } + Ok(dirty_symlinks) +} + /// Helper to collect dirty statuses for a single repo. fn collect_statuses(repo: &git2::Repository, dirty_files: &mut Vec) -> CargoResult<()> { let mut status_opts = git2::StatusOptions::new(); diff --git a/tests/testsuite/package.rs b/tests/testsuite/package.rs index 7ee9433a2e3..bb20f53e947 100644 --- a/tests/testsuite/package.rs +++ b/tests/testsuite/package.rs @@ -1378,10 +1378,13 @@ fn dirty_file_outside_pkg_root_considered_dirty() { p.cargo("package --workspace --no-verify") .with_status(101) .with_stderr_data(str![[r#" -[ERROR] 2 files in the working directory contain changes that were not yet committed into git: +[ERROR] 5 files in the working directory contain changes that were not yet committed into git: LICENSE README.md +README.md +lib.rs +original-dir/file to proceed despite this and include the uncommitted changes, pass the `--allow-dirty` flag From 24dd205d5a395ef9c462d1bc4599df3fb2c87f9a Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 31 Dec 2024 13:32:11 -0500 Subject: [PATCH 6/6] fix(package): deduplicate dirty symlink detection metdata path fields may point to a dirty symlilnk and cause duplicate report. This commit combines `dirty_metadata_paths` and `dirty_symlinks` into one so is able to de-duplicate them. --- src/cargo/ops/cargo_package/vcs.rs | 55 +++++++++++------------------- tests/testsuite/package.rs | 3 +- 2 files changed, 20 insertions(+), 38 deletions(-) diff --git a/src/cargo/ops/cargo_package/vcs.rs b/src/cargo/ops/cargo_package/vcs.rs index e007841e9e8..9f2f57284c4 100644 --- a/src/cargo/ops/cargo_package/vcs.rs +++ b/src/cargo/ops/cargo_package/vcs.rs @@ -1,7 +1,6 @@ //! Helpers to gather the VCS information for `cargo package`. use std::collections::HashSet; -use std::path::Path; use std::path::PathBuf; use anyhow::Context as _; @@ -187,8 +186,7 @@ fn git( .iter() .filter(|src_file| dirty_files.iter().any(|path| src_file.starts_with(path))) .map(|p| p.as_ref()) - .chain(dirty_metadata_paths(pkg, repo)?.iter()) - .chain(dirty_symlinks(pkg, repo, src_files)?.iter()) + .chain(dirty_files_outside_pkg_root(pkg, repo, src_files)?.iter()) .map(|path| { pathdiff::diff_paths(path, cwd) .as_ref() @@ -221,54 +219,39 @@ fn git( } } -/// Checks whether files at paths specified in `package.readme` and -/// `package.license-file` have been modified. +/// Checks whether "included" source files outside package root have been modified. /// -/// This is required because those paths may link to a file outside the -/// current package root, but still under the git workdir, affecting the -/// final packaged `.crate` file. -fn dirty_metadata_paths(pkg: &Package, repo: &git2::Repository) -> CargoResult> { - let mut dirty_files = Vec::new(); - let workdir = repo.workdir().unwrap(); - let root = pkg.root(); - let meta = pkg.manifest().metadata(); - for path in [&meta.license_file, &meta.readme] { - let Some(path) = path.as_deref().map(Path::new) else { - continue; - }; - let abs_path = paths::normalize_path(&root.join(path)); - if paths::strip_prefix_canonical(&abs_path, root).is_ok() { - // Inside package root. Don't bother checking git status. - continue; - } - if let Ok(rel_path) = paths::strip_prefix_canonical(&abs_path, workdir) { - // Outside package root but under git workdir, - if repo.status_file(&rel_path)? != git2::Status::CURRENT { - dirty_files.push(workdir.join(rel_path)) - } - } - } - Ok(dirty_files) -} - -/// Checks whether source files are symlinks and have been modified. +/// This currently looks at +/// +/// * `package.readme` and `package.license-file` pointing to paths outside package root +/// * symlinks targets reside outside package root /// /// This is required because those paths may link to a file outside the /// current package root, but still under the git workdir, affecting the /// final packaged `.crate` file. -fn dirty_symlinks( +fn dirty_files_outside_pkg_root( pkg: &Package, repo: &git2::Repository, src_files: &[PathEntry], ) -> CargoResult> { + let pkg_root = pkg.root(); let workdir = repo.workdir().unwrap(); + + let meta = pkg.manifest().metadata(); + let metadata_paths: Vec<_> = [&meta.license_file, &meta.readme] + .into_iter() + .filter_map(|p| p.as_deref()) + .map(|path| paths::normalize_path(&pkg_root.join(path))) + .collect(); + let mut dirty_symlinks = HashSet::new(); for rel_path in src_files .iter() .filter(|p| p.is_symlink_or_under_symlink()) - .map(|p| p.as_ref().as_path()) + .map(|p| p.as_ref()) + .chain(metadata_paths.iter()) // If inside package root. Don't bother checking git status. - .filter(|p| paths::strip_prefix_canonical(p, pkg.root()).is_err()) + .filter(|p| paths::strip_prefix_canonical(p, pkg_root).is_err()) // Handle files outside package root but under git workdir, .filter_map(|p| paths::strip_prefix_canonical(p, workdir).ok()) { diff --git a/tests/testsuite/package.rs b/tests/testsuite/package.rs index bb20f53e947..ae1c5d71024 100644 --- a/tests/testsuite/package.rs +++ b/tests/testsuite/package.rs @@ -1378,11 +1378,10 @@ fn dirty_file_outside_pkg_root_considered_dirty() { p.cargo("package --workspace --no-verify") .with_status(101) .with_stderr_data(str![[r#" -[ERROR] 5 files in the working directory contain changes that were not yet committed into git: +[ERROR] 4 files in the working directory contain changes that were not yet committed into git: LICENSE README.md -README.md lib.rs original-dir/file