From a8082a6c531c6c75f88b6829eecbc9570fd87400 Mon Sep 17 00:00:00 2001 From: Bas Zalmstra Date: Thu, 5 Dec 2024 11:18:22 +0100 Subject: [PATCH] fix: merge purls from matching records in lock-file (#965) --- crates/rattler_lock/src/builder.rs | 126 +++++++++++++++++- crates/rattler_lock/src/conda.rs | 97 +++++++++++++- crates/rattler_lock/src/lib.rs | 6 +- crates/rattler_lock/src/parse/deserialize.rs | 95 ++++++++++--- crates/rattler_lock/src/parse/v3.rs | 8 +- ...uilder__test__merge_records_and_purls.snap | 23 ++++ ...v5__similar-with-and-without-purl.yml.snap | 20 +++ .../v5/similar-with-and-without-purl.yml | 34 +++++ 8 files changed, 379 insertions(+), 30 deletions(-) create mode 100644 crates/rattler_lock/src/snapshots/rattler_lock__builder__test__merge_records_and_purls.snap create mode 100644 crates/rattler_lock/src/snapshots/rattler_lock__test__v5__similar-with-and-without-purl.yml.snap create mode 100644 test-data/conda-lock/v5/similar-with-and-without-purl.yml diff --git a/crates/rattler_lock/src/builder.rs b/crates/rattler_lock/src/builder.rs index c077b6608..d1ab10bd2 100644 --- a/crates/rattler_lock/src/builder.rs +++ b/crates/rattler_lock/src/builder.rs @@ -1,6 +1,7 @@ //! Builder for the creation of lock files. use std::{ + borrow::Cow, collections::{BTreeSet, HashMap}, sync::Arc, }; @@ -8,7 +9,7 @@ use std::{ use fxhash::FxHashMap; use indexmap::{IndexMap, IndexSet}; use pep508_rs::ExtraName; -use rattler_conda_types::Platform; +use rattler_conda_types::{Platform, Version}; use crate::{ file_format_version::FileFormatVersion, Channel, CondaBinaryData, CondaPackageData, @@ -118,11 +119,34 @@ pub struct LockFileBuilder { environments: IndexMap, /// A list of all package metadata stored in the lock file. - conda_packages: IndexSet, + conda_packages: IndexMap, pypi_packages: IndexSet, pypi_runtime_configurations: IndexSet, } +/// A unique identifier for a conda package. This is used to deduplicate +/// packages. This only includes the unique identifying aspects of a package. +#[derive(Debug, Hash, Eq, PartialEq)] +struct UniqueCondaIdentifier { + location: UrlOrPath, + normalized_name: String, + version: Version, + build: String, + subdir: String, +} + +impl<'a> From<&'a CondaPackageData> for UniqueCondaIdentifier { + fn from(value: &'a CondaPackageData) -> Self { + Self { + location: value.location().clone(), + normalized_name: value.record().name.as_normalized().to_string(), + version: value.record().version.version().clone(), + build: value.record().build.clone(), + subdir: value.record().subdir.clone(), + } + } +} + impl LockFileBuilder { /// Generate a new lock file using the builder pattern pub fn new() -> Self { @@ -184,15 +208,25 @@ impl LockFileBuilder { indexes: None, }); + let unique_identifier = UniqueCondaIdentifier::from(&locked_package); + // Add the package to the list of packages. - let package_idx = self.conda_packages.insert_full(locked_package).0; + let entry = self.conda_packages.entry(unique_identifier); + let package_idx = entry.index(); + entry + .and_modify(|pkg| { + if let Cow::Owned(merged_package) = pkg.merge(&locked_package) { + *pkg = merged_package; + } + }) + .or_insert(locked_package); // Add the package to the environment that it is intended for. environment .packages .entry(platform) .or_default() - .push(EnvironmentPackageData::Conda(package_idx)); + .insert(EnvironmentPackageData::Conda(package_idx)); self } @@ -231,7 +265,7 @@ impl LockFileBuilder { .packages .entry(platform) .or_default() - .push(EnvironmentPackageData::Pypi(package_idx, runtime_idx)); + .insert(EnvironmentPackageData::Pypi(package_idx, runtime_idx)); self } @@ -327,7 +361,7 @@ impl LockFileBuilder { LockFile { inner: Arc::new(LockFileInner { version: FileFormatVersion::LATEST, - conda_packages: self.conda_packages.into_iter().collect(), + conda_packages: self.conda_packages.into_values().collect(), pypi_packages: self.pypi_packages.into_iter().collect(), pypi_environment_package_data: self .pypi_runtime_configurations @@ -362,3 +396,83 @@ impl From for HashablePypiPackageEnvironmentData { } } } + +#[cfg(test)] +mod test { + use std::str::FromStr; + + use rattler_conda_types::{PackageName, PackageRecord, Platform, Version}; + use url::Url; + + use crate::{CondaBinaryData, LockFile}; + + #[test] + fn test_merge_records_and_purls() { + let record = PackageRecord { + subdir: "linux-64".into(), + ..PackageRecord::new( + PackageName::new_unchecked("foobar"), + Version::from_str("1.0.0").unwrap(), + "build".into(), + ) + }; + + let record_with_purls = PackageRecord { + purls: Some( + ["pkg:pypi/foobar@1.0.0".parse().unwrap()] + .into_iter() + .collect(), + ), + ..record.clone() + }; + + let lock_file = LockFile::builder() + .with_conda_package( + "default", + Platform::Linux64, + CondaBinaryData { + package_record: record.clone(), + location: Url::parse( + "https://prefix.dev/example/linux-64/foobar-1.0.0-build.tar.bz2", + ) + .unwrap() + .into(), + file_name: "foobar-1.0.0-build.tar.bz2".to_string(), + channel: None, + } + .into(), + ) + .with_conda_package( + "default", + Platform::Linux64, + CondaBinaryData { + package_record: record.clone(), + location: Url::parse( + "https://prefix.dev/example/linux-64/foobar-1.0.0-build.tar.bz2", + ) + .unwrap() + .into(), + file_name: "foobar-1.0.0-build.tar.bz2".to_string(), + channel: None, + } + .into(), + ) + .with_conda_package( + "foobar", + Platform::Linux64, + CondaBinaryData { + package_record: record_with_purls, + location: Url::parse( + "https://prefix.dev/example/linux-64/foobar-1.0.0-build.tar.bz2", + ) + .unwrap() + .into(), + file_name: "foobar-1.0.0-build.tar.bz2".to_string(), + channel: None, + } + .into(), + ) + .finish(); + insta::assert_snapshot!(lock_file.render_to_string().unwrap()); + } +} diff --git a/crates/rattler_lock/src/conda.rs b/crates/rattler_lock/src/conda.rs index 91ce6a839..82e7a9d94 100644 --- a/crates/rattler_lock/src/conda.rs +++ b/crates/rattler_lock/src/conda.rs @@ -1,4 +1,4 @@ -use std::{cmp::Ordering, hash::Hash}; +use std::{borrow::Cow, cmp::Ordering, hash::Hash}; use rattler_conda_types::{ ChannelUrl, MatchSpec, Matches, NamelessMatchSpec, PackageRecord, RepoDataRecord, @@ -78,6 +78,27 @@ impl CondaPackageData { Self::Source(data) => Some(data), } } + + /// Performs the best effort merge of two conda packages. + /// Some fields in the packages are optional, if one of the packages + /// contain an optional field they are merged. + pub(crate) fn merge(&self, other: &Self) -> Cow<'_, Self> { + match (self, other) { + (CondaPackageData::Binary(left), CondaPackageData::Binary(right)) => { + if let Cow::Owned(merged) = left.merge(right) { + return Cow::Owned(merged.into()); + } + } + (CondaPackageData::Source(left), CondaPackageData::Source(right)) => { + if let Cow::Owned(merged) = left.merge(right) { + return Cow::Owned(merged.into()); + } + } + _ => {} + } + + Cow::Borrowed(self) + } } /// Information about a binary conda package stored in the lock-file. @@ -102,6 +123,23 @@ impl From for CondaPackageData { } } +impl CondaBinaryData { + pub(crate) fn merge(&self, other: &Self) -> Cow<'_, Self> { + if self.location == other.location { + if let Cow::Owned(merged) = + merge_package_record(&self.package_record, &other.package_record) + { + return Cow::Owned(Self { + package_record: merged, + ..self.clone() + }); + } + } + + Cow::Borrowed(self) + } +} + /// Information about a source package stored in the lock-file. #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub struct CondaSourceData { @@ -121,6 +159,23 @@ impl From for CondaPackageData { } } +impl CondaSourceData { + pub(crate) fn merge(&self, other: &Self) -> Cow<'_, Self> { + if self.location == other.location { + if let Cow::Owned(merged) = + merge_package_record(&self.package_record, &other.package_record) + { + return Cow::Owned(Self { + package_record: merged, + ..self.clone() + }); + } + } + + Cow::Borrowed(self) + } +} + /// A record of input files that were used to define the metadata of the /// package. #[derive(Clone, Debug, PartialEq, Eq, Hash)] @@ -270,3 +325,43 @@ impl Matches for CondaPackageData { spec.matches(self.record()) } } + +fn merge_package_record<'a>( + left: &'a PackageRecord, + right: &PackageRecord, +) -> Cow<'a, PackageRecord> { + let mut result = Cow::Borrowed(left); + + // If the left package doesn't contain purls we merge those from the right one. + if left.purls.is_none() && right.purls.is_some() { + result = Cow::Owned(PackageRecord { + purls: right.purls.clone(), + ..result.into_owned() + }); + } + + // If the left package doesn't contain run_exports we merge those from the right + // one. + if left.run_exports.is_none() && right.run_exports.is_some() { + result = Cow::Owned(PackageRecord { + run_exports: right.run_exports.clone(), + ..result.into_owned() + }); + } + + // Merge hashes if the left package doesn't contain them. + if left.md5.is_none() && right.md5.is_some() { + result = Cow::Owned(PackageRecord { + md5: right.md5, + ..result.into_owned() + }); + } + if left.sha256.is_none() && right.sha256.is_some() { + result = Cow::Owned(PackageRecord { + sha256: right.sha256, + ..result.into_owned() + }); + } + + result +} diff --git a/crates/rattler_lock/src/lib.rs b/crates/rattler_lock/src/lib.rs index be147a03c..080356cdc 100644 --- a/crates/rattler_lock/src/lib.rs +++ b/crates/rattler_lock/src/lib.rs @@ -79,6 +79,7 @@ use std::{collections::HashMap, io::Read, path::Path, str::FromStr, sync::Arc}; use fxhash::FxHashMap; +use indexmap::IndexSet; use rattler_conda_types::{Platform, RepoDataRecord}; mod builder; @@ -136,7 +137,7 @@ struct LockFileInner { /// enum and might contain additional data that is specific to the environment. /// For instance different environments might select the same Pypi package but /// with different extras. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)] enum EnvironmentPackageData { Conda(usize), Pypi(usize, usize), @@ -158,7 +159,7 @@ struct EnvironmentData { /// For each individual platform this environment supports we store the /// package identifiers associated with the environment. - packages: FxHashMap>, + packages: FxHashMap>, } impl LockFile { @@ -532,6 +533,7 @@ mod test { #[case::v4_pypi_path("v4/path-based-lock.yml")] #[case::v4_pypi_absolute_path("v4/absolute-path-lock.yml")] #[case::v5_pypi_flat_index("v5/flat-index-lock.yml")] + #[case::v5_with_and_without_purl("v5/similar-with-and-without-purl.yml")] #[case::v6_conda_source_path("v6/conda-path-lock.yml")] #[case::v6_derived_channel("v6/derived-channel-lock.yml")] fn test_parse(#[case] file_name: &str) { diff --git a/crates/rattler_lock/src/parse/deserialize.rs b/crates/rattler_lock/src/parse/deserialize.rs index 1a63ae32e..1c6eb47b6 100644 --- a/crates/rattler_lock/src/parse/deserialize.rs +++ b/crates/rattler_lock/src/parse/deserialize.rs @@ -1,4 +1,5 @@ use std::{ + borrow::Cow, collections::{BTreeMap, BTreeSet}, marker::PhantomData, sync::Arc, @@ -6,6 +7,7 @@ use std::{ use fxhash::FxHashMap; use indexmap::IndexSet; +use itertools::Either; use pep508_rs::ExtraName; use rattler_conda_types::{PackageName, Platform, VersionWithSource}; use serde::{de::Error, Deserialize, Deserializer}; @@ -219,37 +221,92 @@ fn parse_from_lock

( // older lock-file there can be more than one package // with the same url. Instead, we should look at the // subdir to disambiguate. - let package_index = if file_version + let package_index: Option = if file_version < FileFormatVersion::V6 { - // Find the first package that matches the subdir of - // this environment, or the first package if there - // is no match. - all_packages + // Find the packages that match the subdir of + // this environment + let mut all_packages_with_subdir = all_packages .iter() - .find(|&idx| { + .filter(|&idx| { let conda_package = &conda_packages[*idx]; conda_package.record().subdir.as_str() == platform.as_str() }) - .or_else(|| all_packages.first()) + .peekable(); + + // If there are no packages for the subdir, use all + // packages. + let mut matching_packages = + if all_packages_with_subdir.peek().is_some() { + Either::Left(all_packages_with_subdir) + } else { + Either::Right(all_packages.iter()) + }; + + // Merge all matching packages. + if let Some(&first_package_idx) = + matching_packages.next() + { + let merged_package = Cow::Borrowed( + &conda_packages[first_package_idx], + ); + let package = matching_packages.fold( + merged_package, + |acc, &next_package_idx| { + if let Cow::Owned(merged) = acc.merge( + &conda_packages[next_package_idx], + ) { + Cow::Owned(merged) + } else { + acc + } + }, + ); + Some(match package { + Cow::Borrowed(_) => first_package_idx, + Cow::Owned(package) => { + conda_packages.push(package); + conda_packages.len() - 1 + } + }) + } else { + None + } } else { - all_packages.iter().find(|&idx| { - let conda_package = &conda_packages[*idx]; - name.as_ref().map_or(true, |name| { - name == &conda_package.record().name - }) && version.as_ref().map_or(true, |version| { - version == &conda_package.record().version - }) && build.as_ref().map_or(true, |build| { - build == &conda_package.record().build - }) && subdir.as_ref().map_or(true, |subdir| { - subdir == &conda_package.record().subdir + // Find the package that matches all the fields from + // the selector. + all_packages + .iter() + .find(|&idx| { + let conda_package = &conda_packages[*idx]; + name.as_ref().map_or(true, |name| { + name == &conda_package.record().name + }) && version.as_ref().map_or( + true, + |version| { + version + == &conda_package + .record() + .version + }, + ) && build.as_ref().map_or(true, |build| { + build == &conda_package.record().build + }) && subdir.as_ref().map_or( + true, + |subdir| { + subdir + == &conda_package + .record() + .subdir + }, + ) }) - }) + .copied() }; EnvironmentPackageData::Conda( - package_index.copied().ok_or_else(|| { + package_index.ok_or_else(|| { ParseCondaLockError::MissingPackage( environment_name.clone(), platform, diff --git a/crates/rattler_lock/src/parse/v3.rs b/crates/rattler_lock/src/parse/v3.rs index cfdf7b51d..2e05225bd 100644 --- a/crates/rattler_lock/src/parse/v3.rs +++ b/crates/rattler_lock/src/parse/v3.rs @@ -138,7 +138,8 @@ pub fn parse_v3_or_lower( let mut conda_packages = IndexSet::with_capacity(lock_file.package.len()); let mut pypi_packages = IndexSet::with_capacity(lock_file.package.len()); let mut pypi_runtime_configs = IndexSet::with_capacity(lock_file.package.len()); - let mut per_platform: FxHashMap> = FxHashMap::default(); + let mut per_platform: FxHashMap> = + FxHashMap::default(); for package in lock_file.package { let LockedPackageV3 { platform, kind } = package; @@ -245,7 +246,10 @@ pub fn parse_v3_or_lower( } }; - per_platform.entry(package.platform).or_default().push(pkg); + per_platform + .entry(package.platform) + .or_default() + .insert(pkg); } // Construct the default environment diff --git a/crates/rattler_lock/src/snapshots/rattler_lock__builder__test__merge_records_and_purls.snap b/crates/rattler_lock/src/snapshots/rattler_lock__builder__test__merge_records_and_purls.snap new file mode 100644 index 000000000..25f71a688 --- /dev/null +++ b/crates/rattler_lock/src/snapshots/rattler_lock__builder__test__merge_records_and_purls.snap @@ -0,0 +1,23 @@ +--- +source: crates/rattler_lock/src/builder.rs +expression: lock_file.render_to_string().unwrap() +--- +version: 6 +environments: + default: + channels: [] + packages: + linux-64: + - conda: https://prefix.dev/example/linux-64/foobar-1.0.0-build.tar.bz2 + foobar: + channels: [] + packages: + linux-64: + - conda: https://prefix.dev/example/linux-64/foobar-1.0.0-build.tar.bz2 +packages: +- conda: https://prefix.dev/example/linux-64/foobar-1.0.0-build.tar.bz2 + arch: x86_64 + platform: linux + channel: null + purls: + - pkg:pypi/foobar@1.0.0 diff --git a/crates/rattler_lock/src/snapshots/rattler_lock__test__v5__similar-with-and-without-purl.yml.snap b/crates/rattler_lock/src/snapshots/rattler_lock__test__v5__similar-with-and-without-purl.yml.snap new file mode 100644 index 000000000..088a3d5f2 --- /dev/null +++ b/crates/rattler_lock/src/snapshots/rattler_lock__test__v5__similar-with-and-without-purl.yml.snap @@ -0,0 +1,20 @@ +--- +source: crates/rattler_lock/src/lib.rs +expression: conda_lock +--- +version: 6 +environments: + default: + channels: + - url: "https://conda.anaconda.org/conda-forge/" + packages: + win-64: + - conda: "https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda" +packages: + - conda: "https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda" + sha256: 4fde5c3008bf5d2db82f2b50204464314cc3c91c1d953652f7bd01d9e52aefdf + md5: 8ac3367aafb1cc0a068483c580af8015 + license: LicenseRef-Public-Domain + purls: [] + size: 122354 + timestamp: 1728047496079 diff --git a/test-data/conda-lock/v5/similar-with-and-without-purl.yml b/test-data/conda-lock/v5/similar-with-and-without-purl.yml new file mode 100644 index 000000000..636ce0222 --- /dev/null +++ b/test-data/conda-lock/v5/similar-with-and-without-purl.yml @@ -0,0 +1,34 @@ +version: 5 +environments: + default: + channels: + - url: https://conda.anaconda.org/conda-forge/ + packages: + win-64: + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda +packages: +- kind: conda + name: tzdata + version: 2024b + build: hc8b5060_0 + subdir: noarch + noarch: generic + url: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda + sha256: 4fde5c3008bf5d2db82f2b50204464314cc3c91c1d953652f7bd01d9e52aefdf + md5: 8ac3367aafb1cc0a068483c580af8015 + license: LicenseRef-Public-Domain + size: 122354 + timestamp: 1728047496079 +- kind: conda + name: tzdata + version: 2024b + build: hc8b5060_0 + subdir: noarch + noarch: generic + url: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda + sha256: 4fde5c3008bf5d2db82f2b50204464314cc3c91c1d953652f7bd01d9e52aefdf + md5: 8ac3367aafb1cc0a068483c580af8015 + license: LicenseRef-Public-Domain + purls: [] + size: 122354 + timestamp: 1728047496079