From d9eeafd688b0769be588c04160f058d9f0728aa1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 5 Oct 2024 15:46:29 -0700 Subject: [PATCH 01/26] implement downsample_max_hash in terms of downsample_scaled --- src/core/src/sketch/minhash.rs | 74 ++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index e949162cc..d777b4f50 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -723,21 +723,7 @@ impl KmerMinHash { // create a downsampled copy of self pub fn downsample_max_hash(&self, max_hash: u64) -> Result { let scaled = scaled_for_max_hash(max_hash); - - let mut new_mh = KmerMinHash::new( - scaled, - self.ksize, - self.hash_function.clone(), - self.seed, - self.abunds.is_some(), - self.num, - ); - if self.abunds.is_some() { - new_mh.add_many_with_abund(&self.to_vec_abunds())?; - } else { - new_mh.add_many(&self.mins)?; - } - Ok(new_mh) + self.downsample_scaled(scaled) } pub fn sum_abunds(&self) -> u64 { @@ -783,8 +769,25 @@ impl KmerMinHash { // create a downsampled copy of self pub fn downsample_scaled(&self, scaled: u64) -> Result { - let max_hash = max_hash_for_scaled(scaled); - self.downsample_max_hash(max_hash) + // @CTB shouldn't we check that new scaled > old scaled? + if self.scaled() == scaled { + Ok(self.clone()) // avoid clone CTB + } else { + let mut new_mh = KmerMinHash::new( + scaled, + self.ksize, + self.hash_function.clone(), + self.seed, + self.abunds.is_some(), + self.num, + ); + if self.abunds.is_some() { + new_mh.add_many_with_abund(&self.to_vec_abunds())?; + } else { + new_mh.add_many(&self.mins)?; + } + Ok(new_mh) + } } pub fn inflate(&mut self, abunds_from: &KmerMinHash) -> Result<(), Error> { @@ -1531,27 +1534,30 @@ impl KmerMinHashBTree { // create a downsampled copy of self pub fn downsample_max_hash(&self, max_hash: u64) -> Result { let scaled = scaled_for_max_hash(max_hash); - - let mut new_mh = KmerMinHashBTree::new( - scaled, - self.ksize, - self.hash_function.clone(), - self.seed, - self.abunds.is_some(), - self.num, - ); - if self.abunds.is_some() { - new_mh.add_many_with_abund(&self.to_vec_abunds())?; - } else { - new_mh.add_many(&self.mins())?; - } - Ok(new_mh) + self.downsample_scaled(scaled) } // create a downsampled copy of self pub fn downsample_scaled(&self, scaled: u64) -> Result { - let max_hash = max_hash_for_scaled(scaled); - self.downsample_max_hash(max_hash) + // @CTB shouldn't we check that new scaled > old scaled? + if self.scaled() == scaled { + Ok(self.clone()) // CTB avoid clone... + } else { + let mut new_mh = KmerMinHashBTree::new( + scaled, + self.ksize, + self.hash_function.clone(), + self.seed, + self.abunds.is_some(), + self.num, + ); + if self.abunds.is_some() { + new_mh.add_many_with_abund(&self.to_vec_abunds())?; + } else { + new_mh.add_many(&self.mins())?; + } + Ok(new_mh) + } } pub fn to_vec_abunds(&self) -> Vec<(u64, u64)> { From 4ab87e4b3ed4de1ec55f3177fe8a3fb4b070c814 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 6 Oct 2024 19:14:39 -0400 Subject: [PATCH 02/26] adjust references & use clone too much --- src/core/src/signature.rs | 2 +- src/core/src/sketch/minhash.rs | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 0ab8190f9..8cc4ba915 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -842,7 +842,7 @@ impl Select for Signature { // TODO: also account for LargeMinHash if let Sketch::MinHash(mh) = sketch { if (mh.scaled() as u32) < sel_scaled { - *sketch = Sketch::MinHash(mh.downsample_scaled(sel_scaled as u64)?); + *sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled as u64)?); } } } diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index d777b4f50..df518aa92 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -543,7 +543,7 @@ impl KmerMinHash { } else { (other, self) }; - let downsampled_mh = second.downsample_max_hash(first.max_hash)?; + let downsampled_mh = second.clone().downsample_max_hash(first.max_hash)?; first.count_common(&downsampled_mh, false) } else { self.check_compatible(other)?; @@ -691,7 +691,7 @@ impl KmerMinHash { } else { (other, self) }; - let downsampled_mh = second.downsample_max_hash(first.max_hash)?; + let downsampled_mh = second.clone().downsample_max_hash(first.max_hash)?; first.similarity(&downsampled_mh, ignore_abundance, false) } else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() { self.jaccard(other) @@ -721,7 +721,7 @@ impl KmerMinHash { } // create a downsampled copy of self - pub fn downsample_max_hash(&self, max_hash: u64) -> Result { + pub fn downsample_max_hash(self, max_hash: u64) -> Result { let scaled = scaled_for_max_hash(max_hash); self.downsample_scaled(scaled) } @@ -768,7 +768,7 @@ impl KmerMinHash { } // create a downsampled copy of self - pub fn downsample_scaled(&self, scaled: u64) -> Result { + pub fn downsample_scaled(self, scaled: u64) -> Result { // @CTB shouldn't we check that new scaled > old scaled? if self.scaled() == scaled { Ok(self.clone()) // avoid clone CTB @@ -1365,7 +1365,7 @@ impl KmerMinHashBTree { } else { (other, self) }; - let downsampled_mh = second.downsample_max_hash(first.max_hash)?; + let downsampled_mh = second.clone().downsample_max_hash(first.max_hash)?; first.count_common(&downsampled_mh, false) } else { self.check_compatible(other)?; @@ -1496,7 +1496,7 @@ impl KmerMinHashBTree { } else { (other, self) }; - let downsampled_mh = second.downsample_max_hash(first.max_hash)?; + let downsampled_mh = second.clone().downsample_max_hash(first.max_hash)?; first.similarity(&downsampled_mh, ignore_abundance, false) } else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() { self.jaccard(other) @@ -1532,13 +1532,13 @@ impl KmerMinHashBTree { } // create a downsampled copy of self - pub fn downsample_max_hash(&self, max_hash: u64) -> Result { + pub fn downsample_max_hash(self, max_hash: u64) -> Result { let scaled = scaled_for_max_hash(max_hash); self.downsample_scaled(scaled) } // create a downsampled copy of self - pub fn downsample_scaled(&self, scaled: u64) -> Result { + pub fn downsample_scaled(self, scaled: u64) -> Result { // @CTB shouldn't we check that new scaled > old scaled? if self.scaled() == scaled { Ok(self.clone()) // CTB avoid clone... From ffa968345074260fc7158ac3e4420a0b4efd9b23 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 6 Oct 2024 19:23:57 -0400 Subject: [PATCH 03/26] switch to using scaled; avoid unnecessary clones? --- src/core/src/sketch/minhash.rs | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index df518aa92..a59703711 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -537,13 +537,13 @@ impl KmerMinHash { } pub fn count_common(&self, other: &KmerMinHash, downsample: bool) -> Result { - if downsample && self.max_hash != other.max_hash { - let (first, second) = if self.max_hash < other.max_hash { + if downsample && self.scaled() != other.scaled() { + let (first, second) = if self.scaled() > other.scaled() { (self, other) } else { (other, self) }; - let downsampled_mh = second.clone().downsample_max_hash(first.max_hash)?; + let downsampled_mh = second.clone().downsample_scaled(first.scaled())?; first.count_common(&downsampled_mh, false) } else { self.check_compatible(other)?; @@ -685,13 +685,14 @@ impl KmerMinHash { ignore_abundance: bool, downsample: bool, ) -> Result { - if downsample && self.max_hash != other.max_hash { - let (first, second) = if self.max_hash < other.max_hash { + if downsample && self.scaled() != other.scaled() { + // downsample to larger of two scaled + let (first, second) = if self.scaled() > other.scaled() { (self, other) } else { (other, self) }; - let downsampled_mh = second.clone().downsample_max_hash(first.max_hash)?; + let downsampled_mh = second.clone().downsample_scaled(first.scaled())?; first.similarity(&downsampled_mh, ignore_abundance, false) } else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() { self.jaccard(other) @@ -771,7 +772,7 @@ impl KmerMinHash { pub fn downsample_scaled(self, scaled: u64) -> Result { // @CTB shouldn't we check that new scaled > old scaled? if self.scaled() == scaled { - Ok(self.clone()) // avoid clone CTB + Ok(self) } else { let mut new_mh = KmerMinHash::new( scaled, @@ -1359,13 +1360,14 @@ impl KmerMinHashBTree { } pub fn count_common(&self, other: &KmerMinHashBTree, downsample: bool) -> Result { - if downsample && self.max_hash != other.max_hash { - let (first, second) = if self.max_hash < other.max_hash { + if downsample && self.scaled() != other.scaled() { + // downsample to the larger of the two scaled values + let (first, second) = if self.scaled() > other.scaled() { (self, other) } else { (other, self) }; - let downsampled_mh = second.clone().downsample_max_hash(first.max_hash)?; + let downsampled_mh = second.clone().downsample_scaled(first.scaled())?; first.count_common(&downsampled_mh, false) } else { self.check_compatible(other)?; @@ -1490,13 +1492,14 @@ impl KmerMinHashBTree { ignore_abundance: bool, downsample: bool, ) -> Result { - if downsample && self.max_hash != other.max_hash { - let (first, second) = if self.max_hash < other.max_hash { + if downsample && self.scaled() != other.scaled() { + // downsample to larger of two scaled + let (first, second) = if self.scaled() > other.scaled() { (self, other) } else { (other, self) }; - let downsampled_mh = second.clone().downsample_max_hash(first.max_hash)?; + let downsampled_mh = second.clone().downsample_scaled(first.scaled())?; first.similarity(&downsampled_mh, ignore_abundance, false) } else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() { self.jaccard(other) @@ -1541,7 +1544,7 @@ impl KmerMinHashBTree { pub fn downsample_scaled(self, scaled: u64) -> Result { // @CTB shouldn't we check that new scaled > old scaled? if self.scaled() == scaled { - Ok(self.clone()) // CTB avoid clone... + Ok(self) } else { let mut new_mh = KmerMinHashBTree::new( scaled, From 5f70fcc6573baecd37e520763beeba8f60626a60 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 07:49:15 -0400 Subject: [PATCH 04/26] update to fix? scaled=0 situation --- src/core/src/sketch/minhash.rs | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index a59703711..3c351b646 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -20,7 +20,7 @@ use crate::Error; pub fn max_hash_for_scaled(scaled: u64) -> u64 { match scaled { - 0 => 0, + 0 => 0, // scaled == 0 indicates this is a num minhash 1 => u64::MAX, _ => (u64::MAX as f64 / scaled as f64) as u64, } @@ -28,7 +28,7 @@ pub fn max_hash_for_scaled(scaled: u64) -> u64 { pub fn scaled_for_max_hash(max_hash: u64) -> u64 { match max_hash { - 0 => 0, + 0 => 0, // scaled == 0 indicates this is a num minhash _ => (u64::MAX as f64 / max_hash as f64) as u64, } } @@ -723,8 +723,12 @@ impl KmerMinHash { // create a downsampled copy of self pub fn downsample_max_hash(self, max_hash: u64) -> Result { - let scaled = scaled_for_max_hash(max_hash); - self.downsample_scaled(scaled) + if self.max_hash == 0 { + Ok(self) + } else { + let scaled = scaled_for_max_hash(max_hash); + self.downsample_scaled(scaled) + } } pub fn sum_abunds(&self) -> u64 { @@ -770,8 +774,8 @@ impl KmerMinHash { // create a downsampled copy of self pub fn downsample_scaled(self, scaled: u64) -> Result { - // @CTB shouldn't we check that new scaled > old scaled? - if self.scaled() == scaled { + // @CTB: check that new scaled > old scaled + if self.scaled() == scaled || self.scaled() == 0 { Ok(self) } else { let mut new_mh = KmerMinHash::new( @@ -1536,14 +1540,18 @@ impl KmerMinHashBTree { // create a downsampled copy of self pub fn downsample_max_hash(self, max_hash: u64) -> Result { - let scaled = scaled_for_max_hash(max_hash); - self.downsample_scaled(scaled) + if self.max_hash == 0 { + Ok(self) + } else { + let scaled = scaled_for_max_hash(max_hash); + self.downsample_scaled(scaled) + } } // create a downsampled copy of self pub fn downsample_scaled(self, scaled: u64) -> Result { - // @CTB shouldn't we check that new scaled > old scaled? - if self.scaled() == scaled { + // @CTB TODO: check that new scaled > old scaled + if self.scaled() == scaled || self.scaled() == 0 { Ok(self) } else { let mut new_mh = KmerMinHashBTree::new( From 17f56a6d78447a9a17d179e3c7de17f5c5522a29 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 07:56:37 -0400 Subject: [PATCH 05/26] error on upsampling --- src/core/src/sketch/minhash.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 3c351b646..f0114f6b6 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -774,9 +774,10 @@ impl KmerMinHash { // create a downsampled copy of self pub fn downsample_scaled(self, scaled: u64) -> Result { - // @CTB: check that new scaled > old scaled if self.scaled() == scaled || self.scaled() == 0 { Ok(self) + } else if self.scaled() > scaled { // cannot upsample + Err(Error::MismatchScaled) // @CTB new error } else { let mut new_mh = KmerMinHash::new( scaled, @@ -1550,9 +1551,10 @@ impl KmerMinHashBTree { // create a downsampled copy of self pub fn downsample_scaled(self, scaled: u64) -> Result { - // @CTB TODO: check that new scaled > old scaled if self.scaled() == scaled || self.scaled() == 0 { Ok(self) + } else if self.scaled() > scaled { // cannot upsample + Err(Error::MismatchScaled) // @CTB new error } else { let mut new_mh = KmerMinHashBTree::new( scaled, From ffd8ffe33f8c7939eeaf6c501057a3aab6c3fbfa Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 08:20:08 -0400 Subject: [PATCH 06/26] define new error --- src/core/src/errors.rs | 5 +++++ src/core/src/sketch/minhash.rs | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/core/src/errors.rs b/src/core/src/errors.rs index 90c028eb3..1eba37518 100644 --- a/src/core/src/errors.rs +++ b/src/core/src/errors.rs @@ -7,6 +7,9 @@ pub enum SourmashError { #[error("internal error: {message:?}")] Internal { message: String }, + #[error("new scaled smaller than previous; cannot upsample")] + CannotUpsample, + #[error("must have same num: {n1} != {n2}")] MismatchNum { n1: u32, n2: u32 }, @@ -104,6 +107,7 @@ pub enum SourmashErrorCode { NonEmptyMinHash = 1_06, MismatchNum = 1_07, NeedsAbundanceTracking = 1_08, + CannotUpsample = 1_09, // Input sequence errors InvalidDNA = 11_01, InvalidProt = 11_02, @@ -132,6 +136,7 @@ impl SourmashErrorCode { match error { SourmashError::Internal { .. } => SourmashErrorCode::Internal, SourmashError::Panic { .. } => SourmashErrorCode::Panic, + SourmashError::CannotUpsample { .. } => SourmashErrorCode::CannotUpsample, SourmashError::MismatchNum { .. } => SourmashErrorCode::MismatchNum, SourmashError::NeedsAbundanceTracking { .. } => { SourmashErrorCode::NeedsAbundanceTracking diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index f0114f6b6..ecf3cd14e 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -20,7 +20,7 @@ use crate::Error; pub fn max_hash_for_scaled(scaled: u64) -> u64 { match scaled { - 0 => 0, // scaled == 0 indicates this is a num minhash + 0 => 0, // scaled == 0 indicates this is a num minhash 1 => u64::MAX, _ => (u64::MAX as f64 / scaled as f64) as u64, } @@ -28,7 +28,7 @@ pub fn max_hash_for_scaled(scaled: u64) -> u64 { pub fn scaled_for_max_hash(max_hash: u64) -> u64 { match max_hash { - 0 => 0, // scaled == 0 indicates this is a num minhash + 0 => 0, // scaled == 0 indicates this is a num minhash _ => (u64::MAX as f64 / max_hash as f64) as u64, } } @@ -776,8 +776,8 @@ impl KmerMinHash { pub fn downsample_scaled(self, scaled: u64) -> Result { if self.scaled() == scaled || self.scaled() == 0 { Ok(self) - } else if self.scaled() > scaled { // cannot upsample - Err(Error::MismatchScaled) // @CTB new error + } else if self.scaled() > scaled { + Err(Error::CannotUpsample) } else { let mut new_mh = KmerMinHash::new( scaled, @@ -1553,8 +1553,8 @@ impl KmerMinHashBTree { pub fn downsample_scaled(self, scaled: u64) -> Result { if self.scaled() == scaled || self.scaled() == 0 { Ok(self) - } else if self.scaled() > scaled { // cannot upsample - Err(Error::MismatchScaled) // @CTB new error + } else if self.scaled() > scaled { + Err(Error::CannotUpsample) } else { let mut new_mh = KmerMinHashBTree::new( scaled, From b67fc04295fd271e6e777f84fb2cdb241ced97d9 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 09:42:56 -0400 Subject: [PATCH 07/26] bump sourmash core to 0.16.0 b/c of API change --- src/core/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index bc5c3e336..abae4d3dd 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash" -version = "0.15.2" +version = "0.16.0" authors = ["Luiz Irber ", "N. Tessa Pierce-Ward "] description = "tools for comparing biological sequences with k-mer sketches" repository = "https://github.com/sourmash-bio/sourmash" From 9b9fc5a4d40521e14390766fb6ffde4c6921062c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 09:45:18 -0400 Subject: [PATCH 08/26] update include/sourmash.h --- Cargo.lock | 2 +- include/sourmash.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index e76320240..a759b33ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1646,7 +1646,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" [[package]] name = "sourmash" -version = "0.15.2" +version = "0.16.0" dependencies = [ "az", "byteorder", diff --git a/include/sourmash.h b/include/sourmash.h index 410a07eab..d1cd50b45 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -30,6 +30,7 @@ enum SourmashErrorCode { SOURMASH_ERROR_CODE_NON_EMPTY_MIN_HASH = 106, SOURMASH_ERROR_CODE_MISMATCH_NUM = 107, SOURMASH_ERROR_CODE_NEEDS_ABUNDANCE_TRACKING = 108, + SOURMASH_ERROR_CODE_CANNOT_UPSAMPLE = 109, SOURMASH_ERROR_CODE_INVALID_DNA = 1101, SOURMASH_ERROR_CODE_INVALID_PROT = 1102, SOURMASH_ERROR_CODE_INVALID_CODON_LENGTH = 1103, From a0b93eba788ce4d7200e130bccd7b136b554d92b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 10:47:42 -0400 Subject: [PATCH 09/26] add some downsampling-focused tests --- src/core/tests/minhash.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index 195091056..67937a120 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -281,6 +281,7 @@ fn oracle_mins_scaled(hashes in vec(u64::ANY, 1..10000)) { let mut e = a.downsample_max_hash(100).unwrap(); let scaled = scaled_for_max_hash(100); let mut f = b.downsample_scaled(scaled).unwrap(); + assert_eq!(f.scaled(), scaled); // Can't compare different scaled without explicit downsample assert!(c.similarity(&e, false, false).is_err()); @@ -889,3 +890,16 @@ fn test_n_unique_kmers() { mh.add_hash(30); assert_eq!(mh.n_unique_kmers(), 30) } + + +#[test] +fn test_scaled_downsampling() { + let mh = KmerMinHash::new(10, 21, HashFunctions::Murmur64Dna, 42, true, 0); + + // downsampling is OK: + let new_mh = mh.clone().downsample_scaled(100).unwrap(); + assert_eq!(new_mh.scaled(), 100); + + // upsampling not ok + assert!(mh.clone().downsample_scaled(1).is_err()); +} From 79e7b7d74a5850016d229b3cd2e9e1ae1c8c514a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 10:59:14 -0400 Subject: [PATCH 10/26] rename error type; test specific error type --- src/core/src/errors.rs | 6 +++--- src/core/src/sketch/minhash.rs | 6 +++--- src/core/tests/minhash.rs | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/core/src/errors.rs b/src/core/src/errors.rs index 1eba37518..c1f3562e6 100644 --- a/src/core/src/errors.rs +++ b/src/core/src/errors.rs @@ -8,7 +8,7 @@ pub enum SourmashError { Internal { message: String }, #[error("new scaled smaller than previous; cannot upsample")] - CannotUpsample, + CannotUpsampleScaled, #[error("must have same num: {n1} != {n2}")] MismatchNum { n1: u32, n2: u32 }, @@ -107,7 +107,7 @@ pub enum SourmashErrorCode { NonEmptyMinHash = 1_06, MismatchNum = 1_07, NeedsAbundanceTracking = 1_08, - CannotUpsample = 1_09, + CannotUpsampleScaled = 1_09, // Input sequence errors InvalidDNA = 11_01, InvalidProt = 11_02, @@ -136,7 +136,7 @@ impl SourmashErrorCode { match error { SourmashError::Internal { .. } => SourmashErrorCode::Internal, SourmashError::Panic { .. } => SourmashErrorCode::Panic, - SourmashError::CannotUpsample { .. } => SourmashErrorCode::CannotUpsample, + SourmashError::CannotUpsampleScaled { .. } => SourmashErrorCode::CannotUpsampleScaled, SourmashError::MismatchNum { .. } => SourmashErrorCode::MismatchNum, SourmashError::NeedsAbundanceTracking { .. } => { SourmashErrorCode::NeedsAbundanceTracking diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index ecf3cd14e..23b183e75 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -723,7 +723,7 @@ impl KmerMinHash { // create a downsampled copy of self pub fn downsample_max_hash(self, max_hash: u64) -> Result { - if self.max_hash == 0 { + if self.max_hash == 0 { // does this make sense? @CTB Ok(self) } else { let scaled = scaled_for_max_hash(max_hash); @@ -777,7 +777,7 @@ impl KmerMinHash { if self.scaled() == scaled || self.scaled() == 0 { Ok(self) } else if self.scaled() > scaled { - Err(Error::CannotUpsample) + Err(Error::CannotUpsampleScaled) } else { let mut new_mh = KmerMinHash::new( scaled, @@ -1554,7 +1554,7 @@ impl KmerMinHashBTree { if self.scaled() == scaled || self.scaled() == 0 { Ok(self) } else if self.scaled() > scaled { - Err(Error::CannotUpsample) + Err(Error::CannotUpsampleScaled) } else { let mut new_mh = KmerMinHashBTree::new( scaled, diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index 67937a120..c8e9e7587 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -901,5 +901,6 @@ fn test_scaled_downsampling() { assert_eq!(new_mh.scaled(), 100); // upsampling not ok - assert!(mh.clone().downsample_scaled(1).is_err()); + let e = mh.clone().downsample_scaled(1).unwrap_err(); + assert!(matches!(e, sourmash::Error::CannotUpsampleScaled)); } From b11877989c543f05d0002fe75d0f129669e37354 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 11:37:02 -0400 Subject: [PATCH 11/26] cargo fmt --- src/core/src/sketch/minhash.rs | 3 ++- src/core/tests/minhash.rs | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 23b183e75..c12cdfb62 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -723,7 +723,8 @@ impl KmerMinHash { // create a downsampled copy of self pub fn downsample_max_hash(self, max_hash: u64) -> Result { - if self.max_hash == 0 { // does this make sense? @CTB + if self.max_hash == 0 { + // does this make sense? @CTB Ok(self) } else { let scaled = scaled_for_max_hash(max_hash); diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index c8e9e7587..5922142b9 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -891,7 +891,6 @@ fn test_n_unique_kmers() { assert_eq!(mh.n_unique_kmers(), 30) } - #[test] fn test_scaled_downsampling() { let mh = KmerMinHash::new(10, 21, HashFunctions::Murmur64Dna, 42, true, 0); From 464ec60965ba7adb6e22889b939097ccb32fb564 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 15:10:18 -0400 Subject: [PATCH 12/26] fix by doing too much downsampling --- include/sourmash.h | 2 +- src/core/src/index/mod.rs | 12 ++++++++++-- src/core/src/index/revindex/disk_revindex.rs | 17 ++++++++++++----- src/core/src/index/revindex/mod.rs | 2 +- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/include/sourmash.h b/include/sourmash.h index d1cd50b45..d5f0365ec 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -30,7 +30,7 @@ enum SourmashErrorCode { SOURMASH_ERROR_CODE_NON_EMPTY_MIN_HASH = 106, SOURMASH_ERROR_CODE_MISMATCH_NUM = 107, SOURMASH_ERROR_CODE_NEEDS_ABUNDANCE_TRACKING = 108, - SOURMASH_ERROR_CODE_CANNOT_UPSAMPLE = 109, + SOURMASH_ERROR_CODE_CANNOT_UPSAMPLE_SCALED = 109, SOURMASH_ERROR_CODE_INVALID_DNA = 1101, SOURMASH_ERROR_CODE_INVALID_PROT = 1102, SOURMASH_ERROR_CODE_INVALID_CODON_LENGTH = 1103, diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 4816a56cc..53147ceee 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -13,6 +13,7 @@ pub mod search; use std::path::Path; +use std::cmp::max; use getset::{CopyGetters, Getters, Setters}; use log::trace; use serde::{Deserialize, Serialize}; @@ -220,10 +221,17 @@ pub fn calculate_gather_stats( confidence: Option, ) -> Result { // get match_mh - let match_mh = match_sig.minhash().unwrap(); + let match_mh = match_sig.minhash().expect("cannot retrieve sketch"); + let match_mh = match_mh.clone(); + + eprintln!("XXX 2 {}, {}", match_mh.scaled(), query.scaled()); + + let max_scaled = max(match_mh.scaled(), query.scaled()); + let query = query.downsample_scaled(max_scaled).expect("cannot downsample query"); + let match_mh = match_mh.downsample_scaled(max_scaled).expect("cannot downsample match"); // calculate intersection - let isect = match_mh.intersection(&query)?; + let isect = match_mh.intersection(&query).expect("could not do intersection"); let isect_size = isect.0.len(); trace!("isect_size: {}", isect_size); trace!("query.size: {}", query.size()); diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 7aba74fea..12335da16 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -1,4 +1,5 @@ use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; +use std::cmp::max; use std::path::Path; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, RwLock}; @@ -398,8 +399,14 @@ impl RevIndexOps for RevIndex { // get downsampled minhashes for comparison. let match_mh = match_sig.minhash().unwrap().clone(); - query = query.downsample_scaled(match_mh.scaled())?; - orig_query_ds = orig_query_ds.downsample_scaled(match_mh.scaled())?; + + let max_scaled = max(query.scaled(), match_mh.scaled()); + + let match_mh = match_mh.downsample_scaled(max_scaled).expect("cannot downsample match"); + + eprintln!("XXX {}, {}, {}", query.scaled(), match_mh.scaled(), orig_query_ds.scaled()); + query = query.downsample_scaled(max_scaled)?; + orig_query_ds = orig_query_ds.downsample_scaled(max_scaled)?; // just calculate essentials here let gather_result_rank = matches.len(); @@ -407,7 +414,7 @@ impl RevIndexOps for RevIndex { let query_mh = KmerMinHash::from(query.clone()); // grab the specific intersection: - let isect = match_mh.intersection(&query_mh)?; + let isect = match_mh.intersection(&query_mh).expect("failed to intersect"); let mut isect_mh = match_mh.clone(); isect_mh.clear(); isect_mh.add_many(&isect.0)?; @@ -415,7 +422,7 @@ impl RevIndexOps for RevIndex { // Calculate stats let gather_result = calculate_gather_stats( &orig_query_ds, - KmerMinHash::from(query.clone()), + query_mh, match_sig, match_size, gather_result_rank, @@ -424,7 +431,7 @@ impl RevIndexOps for RevIndex { calc_abund_stats, calc_ani_ci, ani_confidence_interval_fraction, - )?; + ).expect("could not calculate gather stats"); // keep track of the sum weighted found sum_weighted_found = gather_result.sum_weighted_found(); matches.push(gather_result); diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index 013c17081..d34816138 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -910,7 +910,7 @@ mod test { 0, &query, Some(selection.clone()), - )?; + ).expect("failed to gather!"); { let mut index = index; From 641d9fc162bb8d0d6a06ee7d0086e8370f190106 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 15:37:46 -0400 Subject: [PATCH 13/26] upd --- src/core/src/index/revindex/disk_revindex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 12335da16..5b021bfec 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -406,7 +406,7 @@ impl RevIndexOps for RevIndex { eprintln!("XXX {}, {}, {}", query.scaled(), match_mh.scaled(), orig_query_ds.scaled()); query = query.downsample_scaled(max_scaled)?; - orig_query_ds = orig_query_ds.downsample_scaled(max_scaled)?; + // orig_query_ds = orig_query_ds.downsample_scaled(max_scaled)?; // just calculate essentials here let gather_result_rank = matches.len(); From a9b91c98dc9edf52289a07664639f2a6b4be0133 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 15:39:59 -0400 Subject: [PATCH 14/26] upd --- src/core/src/index/revindex/disk_revindex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 5b021bfec..12335da16 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -406,7 +406,7 @@ impl RevIndexOps for RevIndex { eprintln!("XXX {}, {}, {}", query.scaled(), match_mh.scaled(), orig_query_ds.scaled()); query = query.downsample_scaled(max_scaled)?; - // orig_query_ds = orig_query_ds.downsample_scaled(max_scaled)?; + orig_query_ds = orig_query_ds.downsample_scaled(max_scaled)?; // just calculate essentials here let gather_result_rank = matches.len(); From 79afb857967d5f48393341a77e43ea27ab3caf22 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 15:51:28 -0400 Subject: [PATCH 15/26] simplify --- src/core/src/index/mod.rs | 14 +++++++++---- src/core/src/index/revindex/disk_revindex.rs | 21 ++++++++++---------- src/core/src/index/revindex/mod.rs | 18 +++++++++-------- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 53147ceee..6abdacaad 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -13,11 +13,11 @@ pub mod search; use std::path::Path; -use std::cmp::max; use getset::{CopyGetters, Getters, Setters}; use log::trace; use serde::{Deserialize, Serialize}; use stats::{median, stddev}; +use std::cmp::max; use typed_builder::TypedBuilder; use crate::ani_utils::{ani_ci_from_containment, ani_from_containment}; @@ -227,11 +227,17 @@ pub fn calculate_gather_stats( eprintln!("XXX 2 {}, {}", match_mh.scaled(), query.scaled()); let max_scaled = max(match_mh.scaled(), query.scaled()); - let query = query.downsample_scaled(max_scaled).expect("cannot downsample query"); - let match_mh = match_mh.downsample_scaled(max_scaled).expect("cannot downsample match"); + let query = query + .downsample_scaled(max_scaled) + .expect("cannot downsample query"); + let match_mh = match_mh + .downsample_scaled(max_scaled) + .expect("cannot downsample match"); // calculate intersection - let isect = match_mh.intersection(&query).expect("could not do intersection"); + let isect = match_mh + .intersection(&query) + .expect("could not do intersection"); let isect_size = isect.0.len(); trace!("isect_size: {}", isect_size); trace!("query.size: {}", query.size()); diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 12335da16..2cd5a56b8 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -1,5 +1,4 @@ use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; -use std::cmp::max; use std::path::Path; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, RwLock}; @@ -373,7 +372,7 @@ impl RevIndexOps for RevIndex { let mut query = KmerMinHashBTree::from(orig_query.clone()); let mut sum_weighted_found = 0; let _selection = selection.unwrap_or_else(|| self.collection.selection()); - let mut orig_query_ds = orig_query.clone(); + let orig_query_ds = orig_query.clone(); let total_weighted_hashes = orig_query.sum_abunds(); // or set this with user --track-abundance? @@ -399,14 +398,11 @@ impl RevIndexOps for RevIndex { // get downsampled minhashes for comparison. let match_mh = match_sig.minhash().unwrap().clone(); + let scaled = query.scaled(); - let max_scaled = max(query.scaled(), match_mh.scaled()); - - let match_mh = match_mh.downsample_scaled(max_scaled).expect("cannot downsample match"); - - eprintln!("XXX {}, {}, {}", query.scaled(), match_mh.scaled(), orig_query_ds.scaled()); - query = query.downsample_scaled(max_scaled)?; - orig_query_ds = orig_query_ds.downsample_scaled(max_scaled)?; + let match_mh = match_mh + .downsample_scaled(scaled) + .expect("cannot downsample match"); // just calculate essentials here let gather_result_rank = matches.len(); @@ -414,7 +410,9 @@ impl RevIndexOps for RevIndex { let query_mh = KmerMinHash::from(query.clone()); // grab the specific intersection: - let isect = match_mh.intersection(&query_mh).expect("failed to intersect"); + let isect = match_mh + .intersection(&query_mh) + .expect("failed to intersect"); let mut isect_mh = match_mh.clone(); isect_mh.clear(); isect_mh.add_many(&isect.0)?; @@ -431,7 +429,8 @@ impl RevIndexOps for RevIndex { calc_abund_stats, calc_ani_ci, ani_confidence_interval_fraction, - ).expect("could not calculate gather stats"); + ) + .expect("could not calculate gather stats"); // keep track of the sum weighted found sum_weighted_found = gather_result.sum_weighted_found(); matches.push(gather_result); diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index d34816138..f1248be71 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -903,14 +903,16 @@ mod test { let (counter, query_colors, hash_to_color) = index.prepare_gather_counters(&query); - let matches_external = index.gather( - counter, - query_colors, - hash_to_color, - 0, - &query, - Some(selection.clone()), - ).expect("failed to gather!"); + let matches_external = index + .gather( + counter, + query_colors, + hash_to_color, + 0, + &query, + Some(selection.clone()), + ) + .expect("failed to gather!"); { let mut index = index; From ddcb049e99749f1d16c414e0fdb2d06d55a38db7 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 15:55:01 -0400 Subject: [PATCH 16/26] upd again --- src/core/src/index/mod.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 6abdacaad..d319c12cf 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -222,15 +222,13 @@ pub fn calculate_gather_stats( ) -> Result { // get match_mh let match_mh = match_sig.minhash().expect("cannot retrieve sketch"); - let match_mh = match_mh.clone(); - - eprintln!("XXX 2 {}, {}", match_mh.scaled(), query.scaled()); let max_scaled = max(match_mh.scaled(), query.scaled()); let query = query .downsample_scaled(max_scaled) .expect("cannot downsample query"); let match_mh = match_mh + .clone() .downsample_scaled(max_scaled) .expect("cannot downsample match"); From 6eb86a390c53fc243bf65dd38fab3d1712c9f579 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Oct 2024 06:04:42 -0400 Subject: [PATCH 17/26] simplify --- src/core/src/index/mod.rs | 1 - src/core/src/index/revindex/disk_revindex.rs | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index d319c12cf..c71bb5e58 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -206,7 +206,6 @@ where } } -// note all mh should be selected/downsampled prior to being passed in here. #[allow(clippy::too_many_arguments)] pub fn calculate_gather_stats( orig_query: &KmerMinHash, diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 2cd5a56b8..b11ddb168 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -372,7 +372,6 @@ impl RevIndexOps for RevIndex { let mut query = KmerMinHashBTree::from(orig_query.clone()); let mut sum_weighted_found = 0; let _selection = selection.unwrap_or_else(|| self.collection.selection()); - let orig_query_ds = orig_query.clone(); let total_weighted_hashes = orig_query.sum_abunds(); // or set this with user --track-abundance? @@ -393,7 +392,6 @@ impl RevIndexOps for RevIndex { break; } - // this should downsample mh for us let match_sig = self.collection.sig_for_dataset(dataset_id)?; // get downsampled minhashes for comparison. @@ -419,7 +417,7 @@ impl RevIndexOps for RevIndex { // Calculate stats let gather_result = calculate_gather_stats( - &orig_query_ds, + &orig_query, query_mh, match_sig, match_size, From b47a6c4d08189b7c81d5e40ec4e0e7dc1f2224ba Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Oct 2024 06:11:26 -0400 Subject: [PATCH 18/26] comment --- src/core/src/sketch/minhash.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index c12cdfb62..734103167 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -724,7 +724,7 @@ impl KmerMinHash { // create a downsampled copy of self pub fn downsample_max_hash(self, max_hash: u64) -> Result { if self.max_hash == 0 { - // does this make sense? @CTB + // CTB: this is a num minhash. Should we just blithely return? Ok(self) } else { let scaled = scaled_for_max_hash(max_hash); @@ -1543,6 +1543,7 @@ impl KmerMinHashBTree { // create a downsampled copy of self pub fn downsample_max_hash(self, max_hash: u64) -> Result { if self.max_hash == 0 { + // CTB: this is a num minhash. Just blithely return. Ok(self) } else { let scaled = scaled_for_max_hash(max_hash); From 62f03eb3de8f4b05307efad74f321ced04de40f1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Oct 2024 06:16:34 -0400 Subject: [PATCH 19/26] add KmerMinHashBTree test --- src/core/tests/minhash.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index 5922142b9..040f3506b 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -892,9 +892,30 @@ fn test_n_unique_kmers() { } #[test] -fn test_scaled_downsampling() { +fn test_scaled_downsampling_kmerminhash() { let mh = KmerMinHash::new(10, 21, HashFunctions::Murmur64Dna, 42, true, 0); + // downsampling to same scaled is OK: + let new_mh = mh.clone().downsample_scaled(10).unwrap(); + assert_eq!(new_mh.scaled(), 10); + + // downsampling is OK: + let new_mh = mh.clone().downsample_scaled(100).unwrap(); + assert_eq!(new_mh.scaled(), 100); + + // upsampling not ok + let e = mh.clone().downsample_scaled(1).unwrap_err(); + assert!(matches!(e, sourmash::Error::CannotUpsampleScaled)); +} + +#[test] +fn test_scaled_downsampling_kmerminhashbtree() { + let mh = KmerMinHashBTree::new(10, 21, HashFunctions::Murmur64Dna, 42, true, 0); + + // downsampling to same scaled is OK: + let new_mh = mh.clone().downsample_scaled(10).unwrap(); + assert_eq!(new_mh.scaled(), 10); + // downsampling is OK: let new_mh = mh.clone().downsample_scaled(100).unwrap(); assert_eq!(new_mh.scaled(), 100); From ff4eeda317541270b824441b5f6e6f897dcfcd19 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 13 Oct 2024 11:15:22 +0000 Subject: [PATCH 20/26] feat: Implement TryInto to convert Signature and SigStore into KmerMinHash (#3348) Ref: https://github.com/sourmash-bio/sourmash_plugin_branchwater/pull/467/files#r1797783380 Implement `TryInto` for Signature and SigStore to avoid having to clone a (potentially big) minhash sketch. --- src/core/src/collection.rs | 1 - src/core/src/errors.rs | 15 +++++++++++++++ src/core/src/signature.rs | 22 ++++++++++++++++++++++ src/core/src/storage/mod.rs | 10 ++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index aa8e33e6e..9526eab23 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -241,7 +241,6 @@ mod test { use crate::prelude::Select; use crate::selection::Selection; use crate::signature::Signature; - use crate::Result; #[test] fn sigstore_selection_with_downsample() { diff --git a/src/core/src/errors.rs b/src/core/src/errors.rs index c1f3562e6..30d269a13 100644 --- a/src/core/src/errors.rs +++ b/src/core/src/errors.rs @@ -31,6 +31,15 @@ pub enum SourmashError { #[error("sketch needs abundance for this operation")] NeedsAbundanceTracking, + #[error("Expected a MinHash sketch in this signature")] + NoMinHashFound, + + #[error("Empty signature")] + EmptySignature, + + #[error("Multiple sketches found, expected one")] + MultipleSketchesFound, + #[error("Invalid hash function: {function:?}")] InvalidHashFunction { function: String }, @@ -108,6 +117,9 @@ pub enum SourmashErrorCode { MismatchNum = 1_07, NeedsAbundanceTracking = 1_08, CannotUpsampleScaled = 1_09, + NoMinHashFound = 1_10, + EmptySignature = 1_11, + MultipleSketchesFound = 1_12, // Input sequence errors InvalidDNA = 11_01, InvalidProt = 11_02, @@ -147,6 +159,9 @@ impl SourmashErrorCode { SourmashError::MismatchSeed => SourmashErrorCode::MismatchSeed, SourmashError::MismatchSignatureType => SourmashErrorCode::MismatchSignatureType, SourmashError::NonEmptyMinHash { .. } => SourmashErrorCode::NonEmptyMinHash, + SourmashError::NoMinHashFound => SourmashErrorCode::NoMinHashFound, + SourmashError::EmptySignature => SourmashErrorCode::EmptySignature, + SourmashError::MultipleSketchesFound => SourmashErrorCode::MultipleSketchesFound, SourmashError::InvalidDNA { .. } => SourmashErrorCode::InvalidDNA, SourmashError::InvalidProt { .. } => SourmashErrorCode::InvalidProt, SourmashError::InvalidCodonLength { .. } => SourmashErrorCode::InvalidCodonLength, diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 8cc4ba915..520f8ad04 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -887,6 +887,28 @@ impl PartialEq for Signature { } } +impl TryInto for Signature { + type Error = Error; + + fn try_into(self) -> Result { + match self.signatures.len() { + 1 => self + .signatures + .into_iter() + .find_map(|sk| { + if let Sketch::MinHash(mh) = sk { + Some(mh) + } else { + None + } + }) + .ok_or_else(|| Error::NoMinHashFound), + 0 => Err(Error::EmptySignature), + 2.. => Err(Error::MultipleSketchesFound), + } + } +} + #[cfg(test)] mod test { use std::fs::File; diff --git a/src/core/src/storage/mod.rs b/src/core/src/storage/mod.rs index 12f456fc2..536ec5292 100644 --- a/src/core/src/storage/mod.rs +++ b/src/core/src/storage/mod.rs @@ -16,6 +16,7 @@ use typed_builder::TypedBuilder; use crate::errors::ReadDataError; use crate::prelude::*; use crate::signature::SigsTrait; +use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; use crate::{Error, Result}; @@ -550,6 +551,15 @@ impl From for SigStore { } } +impl TryInto for SigStore { + type Error = crate::Error; + + fn try_into(self) -> std::result::Result { + let sig: Signature = self.into(); + sig.try_into() + } +} + impl Comparable for SigStore { fn similarity(&self, other: &SigStore) -> f64 { let ng: &Signature = self.data().unwrap(); From e4e5555fd81a9a8677bbe065cf7f528270b01fed Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Oct 2024 07:19:13 -0400 Subject: [PATCH 21/26] update include/sourmash.h --- include/sourmash.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/sourmash.h b/include/sourmash.h index d5f0365ec..65585519d 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -31,6 +31,9 @@ enum SourmashErrorCode { SOURMASH_ERROR_CODE_MISMATCH_NUM = 107, SOURMASH_ERROR_CODE_NEEDS_ABUNDANCE_TRACKING = 108, SOURMASH_ERROR_CODE_CANNOT_UPSAMPLE_SCALED = 109, + SOURMASH_ERROR_CODE_NO_MIN_HASH_FOUND = 110, + SOURMASH_ERROR_CODE_EMPTY_SIGNATURE = 111, + SOURMASH_ERROR_CODE_MULTIPLE_SKETCHES_FOUND = 112, SOURMASH_ERROR_CODE_INVALID_DNA = 1101, SOURMASH_ERROR_CODE_INVALID_PROT = 1102, SOURMASH_ERROR_CODE_INVALID_CODON_LENGTH = 1103, From ceaea393d95b3b85575b51c20784d3b9442da149 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Oct 2024 07:56:15 -0400 Subject: [PATCH 22/26] fix two rs errors --- src/core/src/collection.rs | 2 ++ src/core/src/signature.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index 9526eab23..9ad2d891b 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -241,6 +241,8 @@ mod test { use crate::prelude::Select; use crate::selection::Selection; use crate::signature::Signature; + #[cfg(all(feature = "branchwater", not(target_arch = "wasm32")))] + use crate::Result; #[test] fn sigstore_selection_with_downsample() { diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 520f8ad04..da5841d66 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -904,7 +904,7 @@ impl TryInto for Signature { }) .ok_or_else(|| Error::NoMinHashFound), 0 => Err(Error::EmptySignature), - 2.. => Err(Error::MultipleSketchesFound), + _ => Err(Error::MultipleSketchesFound), } } } From 0eeca48e07ac3d2f3a6f144785c9f7aeae00c3d8 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 14 Oct 2024 14:25:01 -0400 Subject: [PATCH 23/26] refactor calculate_gather_stats to take ds query & return isect --- src/core/src/index/mod.rs | 31 +++++++++++--------- src/core/src/index/revindex/disk_revindex.rs | 31 +++++++++++--------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index c71bb5e58..3e91cdb6d 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -17,7 +17,6 @@ use getset::{CopyGetters, Getters, Setters}; use log::trace; use serde::{Deserialize, Serialize}; use stats::{median, stddev}; -use std::cmp::max; use typed_builder::TypedBuilder; use crate::ani_utils::{ani_ci_from_containment, ani_from_containment}; @@ -29,6 +28,7 @@ use crate::signature::SigsTrait; use crate::sketch::minhash::KmerMinHash; use crate::storage::SigStore; use crate::Result; +use crate::Error::CannotUpsampleScaled; #[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)] pub struct GatherResult { @@ -209,7 +209,7 @@ where #[allow(clippy::too_many_arguments)] pub fn calculate_gather_stats( orig_query: &KmerMinHash, - query: KmerMinHash, + query_foo: KmerMinHash, match_sig: SigStore, match_size: usize, gather_result_rank: usize, @@ -218,29 +218,31 @@ pub fn calculate_gather_stats( calc_abund_stats: bool, calc_ani_ci: bool, confidence: Option, -) -> Result { +) -> Result<(GatherResult, (Vec, u64))> { // get match_mh let match_mh = match_sig.minhash().expect("cannot retrieve sketch"); - let max_scaled = max(match_mh.scaled(), query.scaled()); - let query = query - .downsample_scaled(max_scaled) - .expect("cannot downsample query"); + // it's ok to downsample match, but query is often big and repeated, + // so we do not allow downsampling here. + if match_mh.scaled() > query_foo.scaled() { + return Err(CannotUpsampleScaled); + } + let match_mh = match_mh .clone() - .downsample_scaled(max_scaled) + .downsample_scaled(query_foo.scaled()) .expect("cannot downsample match"); // calculate intersection let isect = match_mh - .intersection(&query) + .intersection(&query_foo) .expect("could not do intersection"); let isect_size = isect.0.len(); trace!("isect_size: {}", isect_size); - trace!("query.size: {}", query.size()); + trace!("query.size: {}", query_foo.size()); //bp remaining in subtracted query - let remaining_bp = (query.size() - isect_size) * query.scaled() as usize; + let remaining_bp = (query_foo.size() - isect_size) * query_foo.scaled() as usize; // stats for this match vs original query let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap(); @@ -300,7 +302,7 @@ pub fn calculate_gather_stats( // If abundance, calculate abund-related metrics (vs current query) if calc_abund_stats { // take abunds from subtracted query - let (abunds, unique_weighted_found) = match match_mh.inflated_abundances(&query) { + let (abunds, unique_weighted_found) = match match_mh.inflated_abundances(&query_foo) { Ok((abunds, unique_weighted_found)) => (abunds, unique_weighted_found), Err(e) => { return Err(e); @@ -347,7 +349,7 @@ pub fn calculate_gather_stats( .sum_weighted_found(sum_total_weighted_found) .total_weighted_hashes(total_weighted_hashes) .build(); - Ok(result) + Ok((result, isect)) } #[cfg(test)] @@ -403,7 +405,7 @@ mod test_calculate_gather_stats { let gather_result_rank = 0; let calc_abund_stats = true; let calc_ani_ci = false; - let result = calculate_gather_stats( + let (result, _isect) = calculate_gather_stats( &orig_query, query, match_sig.into(), @@ -416,6 +418,7 @@ mod test_calculate_gather_stats { None, ) .unwrap(); + // first, print all results assert_eq!(result.filename(), "match-filename"); assert_eq!(result.name(), "match-name"); diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index b11ddb168..70c8c0984 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -1,3 +1,4 @@ +use std::cmp::max; use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; use std::path::Path; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -393,30 +394,26 @@ impl RevIndexOps for RevIndex { } let match_sig = self.collection.sig_for_dataset(dataset_id)?; - - // get downsampled minhashes for comparison. let match_mh = match_sig.minhash().unwrap().clone(); - let scaled = query.scaled(); + + // make downsampled minhashes + let max_scaled = max(match_mh.scaled(), query.scaled()); let match_mh = match_mh - .downsample_scaled(scaled) + .downsample_scaled(max_scaled) .expect("cannot downsample match"); + query = query + .downsample_scaled(max_scaled) + .expect("cannot downsample query"); + let query_mh = KmerMinHash::from(query.clone()); + // just calculate essentials here let gather_result_rank = matches.len(); - let query_mh = KmerMinHash::from(query.clone()); - // grab the specific intersection: - let isect = match_mh - .intersection(&query_mh) - .expect("failed to intersect"); - let mut isect_mh = match_mh.clone(); - isect_mh.clear(); - isect_mh.add_many(&isect.0)?; - // Calculate stats - let gather_result = calculate_gather_stats( + let (gather_result, isect) = calculate_gather_stats( &orig_query, query_mh, match_sig, @@ -429,6 +426,12 @@ impl RevIndexOps for RevIndex { ani_confidence_interval_fraction, ) .expect("could not calculate gather stats"); + + // use intersection from calc_gather_stats to make a KmerMinHash. + let mut isect_mh = match_mh.clone(); + isect_mh.clear(); + isect_mh.add_many(&isect.0)?; + // keep track of the sum weighted found sum_weighted_found = gather_result.sum_weighted_found(); matches.push(gather_result); From cd0a209f1757921a34dc55ef406ba65b548c9274 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 14 Oct 2024 14:35:06 -0400 Subject: [PATCH 24/26] fix format --- src/core/src/index/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 3e91cdb6d..072090c61 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -27,8 +27,8 @@ use crate::selection::Selection; use crate::signature::SigsTrait; use crate::sketch::minhash::KmerMinHash; use crate::storage::SigStore; -use crate::Result; use crate::Error::CannotUpsampleScaled; +use crate::Result; #[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)] pub struct GatherResult { @@ -223,7 +223,7 @@ pub fn calculate_gather_stats( let match_mh = match_sig.minhash().expect("cannot retrieve sketch"); // it's ok to downsample match, but query is often big and repeated, - // so we do not allow downsampling here. + // so we do not allow downsampling of query in this function. if match_mh.scaled() > query_foo.scaled() { return Err(CannotUpsampleScaled); } From 2da0084c03837a49907117919c6102835bfab9f2 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 14 Oct 2024 15:20:21 -0400 Subject: [PATCH 25/26] fix name of query to remaining_query --- src/core/src/index/mod.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 072090c61..8ed7f63e0 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -209,7 +209,7 @@ where #[allow(clippy::too_many_arguments)] pub fn calculate_gather_stats( orig_query: &KmerMinHash, - query_foo: KmerMinHash, + remaining_query: KmerMinHash, match_sig: SigStore, match_size: usize, gather_result_rank: usize, @@ -224,25 +224,25 @@ pub fn calculate_gather_stats( // it's ok to downsample match, but query is often big and repeated, // so we do not allow downsampling of query in this function. - if match_mh.scaled() > query_foo.scaled() { + if match_mh.scaled() > remaining_query.scaled() { return Err(CannotUpsampleScaled); } let match_mh = match_mh .clone() - .downsample_scaled(query_foo.scaled()) + .downsample_scaled(remaining_query.scaled()) .expect("cannot downsample match"); // calculate intersection let isect = match_mh - .intersection(&query_foo) + .intersection(&remaining_query) .expect("could not do intersection"); let isect_size = isect.0.len(); trace!("isect_size: {}", isect_size); - trace!("query.size: {}", query_foo.size()); + trace!("query.size: {}", remaining_query.size()); //bp remaining in subtracted query - let remaining_bp = (query_foo.size() - isect_size) * query_foo.scaled() as usize; + let remaining_bp = (remaining_query.size() - isect_size) * remaining_query.scaled() as usize; // stats for this match vs original query let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap(); @@ -302,7 +302,7 @@ pub fn calculate_gather_stats( // If abundance, calculate abund-related metrics (vs current query) if calc_abund_stats { // take abunds from subtracted query - let (abunds, unique_weighted_found) = match match_mh.inflated_abundances(&query_foo) { + let (abunds, unique_weighted_found) = match match_mh.inflated_abundances(&remaining_query) { Ok((abunds, unique_weighted_found)) => (abunds, unique_weighted_found), Err(e) => { return Err(e); From 72d204422a4bc7747f0278db324e7caacb6bb4de Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 15 Oct 2024 13:19:36 -0400 Subject: [PATCH 26/26] fix merge mistakes --- src/core/src/index/mod.rs | 1 - src/core/src/index/revindex/disk_revindex.rs | 14 +++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 306f47948..8ed7f63e0 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -17,7 +17,6 @@ use getset::{CopyGetters, Getters, Setters}; use log::trace; use serde::{Deserialize, Serialize}; use stats::{median, stddev}; -use std::cmp::max; use typed_builder::TypedBuilder; use crate::ani_utils::{ani_ci_from_containment, ani_from_containment}; diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index ba5720881..7386e9ebb 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -395,12 +395,20 @@ impl RevIndexOps for RevIndex { let match_sig = self.collection.sig_for_dataset(dataset_id)?; let match_mh = match_sig.minhash().unwrap().clone(); - let scaled = query.scaled(); + + // make downsampled minhashes + let max_scaled = max(match_mh.scaled(), query.scaled()); let match_mh = match_mh - .downsample_scaled(scaled) + .downsample_scaled(max_scaled) .expect("cannot downsample match"); ->>>>>>> 7d111732d66faa0f957f7612d39f50e0d05aff01 + + // repeatedly downsample query, then extract to KmerMinHash + // => calculate_gather_stats + query = query + .downsample_scaled(max_scaled) + .expect("cannot downsample query"); + let query_mh = KmerMinHash::from(query.clone()); // just calculate essentials here let gather_result_rank = matches.len();