Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: refactor calculate_gather_stats to disallow repeated downsampling #3352

Merged
merged 29 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d9eeafd
implement downsample_max_hash in terms of downsample_scaled
ctb Oct 5, 2024
4ab87e4
adjust references & use clone too much
ctb Oct 6, 2024
ffa9683
switch to using scaled; avoid unnecessary clones?
ctb Oct 6, 2024
5f70fcc
update to fix? scaled=0 situation
ctb Oct 11, 2024
17f56a6
error on upsampling
ctb Oct 11, 2024
b60fbb0
Merge branch 'latest' of github.com:sourmash-bio/sourmash into refact…
ctb Oct 11, 2024
ffd8ffe
define new error
ctb Oct 11, 2024
b67fc04
bump sourmash core to 0.16.0 b/c of API change
ctb Oct 11, 2024
9b9fc5a
update include/sourmash.h
ctb Oct 11, 2024
a0b93eb
add some downsampling-focused tests
ctb Oct 11, 2024
79e7b7d
rename error type; test specific error type
ctb Oct 11, 2024
b118779
cargo fmt
ctb Oct 11, 2024
464ec60
fix by doing too much downsampling
ctb Oct 11, 2024
641d9fc
upd
ctb Oct 11, 2024
a9b91c9
upd
ctb Oct 11, 2024
79afb85
simplify
ctb Oct 11, 2024
ddcb049
upd again
ctb Oct 11, 2024
6eb86a3
simplify
ctb Oct 12, 2024
b47a6c4
comment
ctb Oct 12, 2024
62f03eb
add KmerMinHashBTree test
ctb Oct 12, 2024
ff4eeda
feat: Implement TryInto to convert Signature and SigStore into KmerMi…
luizirber Oct 13, 2024
e4e5555
update include/sourmash.h
ctb Oct 13, 2024
ceaea39
fix two rs errors
ctb Oct 13, 2024
0eeca48
refactor calculate_gather_stats to take ds query & return isect
ctb Oct 14, 2024
cd0a209
fix format
ctb Oct 14, 2024
2da0084
fix name of query to remaining_query
ctb Oct 14, 2024
405c518
Merge branch 'latest' of github.com:sourmash-bio/sourmash into gather…
ctb Oct 14, 2024
03102a5
Merge branch 'latest' of github.com:sourmash-bio/sourmash into gather…
ctb Oct 15, 2024
72d2044
fix merge mistakes
ctb Oct 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ enum SourmashErrorCode {
SOURMASH_ERROR_CODE_NON_EMPTY_MIN_HASH = 106,
SOURMASH_ERROR_CODE_MISMATCH_NUM = 107,
SOURMASH_ERROR_CODE_NEEDS_ABUNDANCE_TRACKING = 108,
SOURMASH_ERROR_CODE_CANNOT_UPSAMPLE_SCALED = 109,
SOURMASH_ERROR_CODE_NO_MIN_HASH_FOUND = 110,
SOURMASH_ERROR_CODE_EMPTY_SIGNATURE = 111,
SOURMASH_ERROR_CODE_MULTIPLE_SKETCHES_FOUND = 112,
SOURMASH_ERROR_CODE_INVALID_DNA = 1101,
SOURMASH_ERROR_CODE_INVALID_PROT = 1102,
SOURMASH_ERROR_CODE_INVALID_CODON_LENGTH = 1103,
Expand Down
2 changes: 1 addition & 1 deletion src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.15.2"
version = "0.16.0"
authors = ["Luiz Irber <luiz@sourmash.bio>", "N. Tessa Pierce-Ward <tessa@sourmash.bio>"]
description = "tools for comparing biological sequences with k-mer sketches"
repository = "https://github.com/sourmash-bio/sourmash"
Expand Down
1 change: 1 addition & 0 deletions src/core/src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ mod test {
use crate::prelude::Select;
use crate::selection::Selection;
use crate::signature::Signature;
#[cfg(all(feature = "branchwater", not(target_arch = "wasm32")))]
use crate::Result;

#[test]
Expand Down
20 changes: 20 additions & 0 deletions src/core/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ pub enum SourmashError {
#[error("internal error: {message:?}")]
Internal { message: String },

#[error("new scaled smaller than previous; cannot upsample")]
CannotUpsampleScaled,

#[error("must have same num: {n1} != {n2}")]
MismatchNum { n1: u32, n2: u32 },

Expand All @@ -28,6 +31,15 @@ pub enum SourmashError {
#[error("sketch needs abundance for this operation")]
NeedsAbundanceTracking,

#[error("Expected a MinHash sketch in this signature")]
NoMinHashFound,

#[error("Empty signature")]
EmptySignature,

#[error("Multiple sketches found, expected one")]
MultipleSketchesFound,

#[error("Invalid hash function: {function:?}")]
InvalidHashFunction { function: String },

Expand Down Expand Up @@ -104,6 +116,10 @@ pub enum SourmashErrorCode {
NonEmptyMinHash = 1_06,
MismatchNum = 1_07,
NeedsAbundanceTracking = 1_08,
CannotUpsampleScaled = 1_09,
NoMinHashFound = 1_10,
EmptySignature = 1_11,
MultipleSketchesFound = 1_12,
// Input sequence errors
InvalidDNA = 11_01,
InvalidProt = 11_02,
Expand Down Expand Up @@ -132,6 +148,7 @@ impl SourmashErrorCode {
match error {
SourmashError::Internal { .. } => SourmashErrorCode::Internal,
SourmashError::Panic { .. } => SourmashErrorCode::Panic,
SourmashError::CannotUpsampleScaled { .. } => SourmashErrorCode::CannotUpsampleScaled,
SourmashError::MismatchNum { .. } => SourmashErrorCode::MismatchNum,
SourmashError::NeedsAbundanceTracking { .. } => {
SourmashErrorCode::NeedsAbundanceTracking
Expand All @@ -142,6 +159,9 @@ impl SourmashErrorCode {
SourmashError::MismatchSeed => SourmashErrorCode::MismatchSeed,
SourmashError::MismatchSignatureType => SourmashErrorCode::MismatchSignatureType,
SourmashError::NonEmptyMinHash { .. } => SourmashErrorCode::NonEmptyMinHash,
SourmashError::NoMinHashFound => SourmashErrorCode::NoMinHashFound,
SourmashError::EmptySignature => SourmashErrorCode::EmptySignature,
SourmashError::MultipleSketchesFound => SourmashErrorCode::MultipleSketchesFound,
SourmashError::InvalidDNA { .. } => SourmashErrorCode::InvalidDNA,
SourmashError::InvalidProt { .. } => SourmashErrorCode::InvalidProt,
SourmashError::InvalidCodonLength { .. } => SourmashErrorCode::InvalidCodonLength,
Expand Down
34 changes: 24 additions & 10 deletions src/core/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
use crate::signature::SigsTrait;
use crate::sketch::minhash::KmerMinHash;
use crate::storage::SigStore;
use crate::Error::CannotUpsampleScaled;
use crate::Result;

#[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)]
Expand Down Expand Up @@ -205,11 +206,10 @@
}
}

// note all mh should be selected/downsampled prior to being passed in here.
#[allow(clippy::too_many_arguments)]
pub fn calculate_gather_stats(
orig_query: &KmerMinHash,
query: KmerMinHash,
remaining_query: KmerMinHash,
match_sig: SigStore,
match_size: usize,
gather_result_rank: usize,
Expand All @@ -218,18 +218,31 @@
calc_abund_stats: bool,
calc_ani_ci: bool,
confidence: Option<f64>,
) -> Result<GatherResult> {
) -> Result<(GatherResult, (Vec<u64>, u64))> {
// get match_mh
let match_mh = match_sig.minhash().unwrap();
let match_mh = match_sig.minhash().expect("cannot retrieve sketch");

// it's ok to downsample match, but query is often big and repeated,
// so we do not allow downsampling of query in this function.
if match_mh.scaled() > remaining_query.scaled() {
return Err(CannotUpsampleScaled);

Check warning on line 228 in src/core/src/index/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/index/mod.rs#L228

Added line #L228 was not covered by tests
}

let match_mh = match_mh
.clone()
.downsample_scaled(remaining_query.scaled())
.expect("cannot downsample match");

// calculate intersection
let isect = match_mh.intersection(&query)?;
let isect = match_mh
.intersection(&remaining_query)
.expect("could not do intersection");
let isect_size = isect.0.len();
trace!("isect_size: {}", isect_size);
trace!("query.size: {}", query.size());
trace!("query.size: {}", remaining_query.size());

//bp remaining in subtracted query
let remaining_bp = (query.size() - isect_size) * query.scaled() as usize;
let remaining_bp = (remaining_query.size() - isect_size) * remaining_query.scaled() as usize;

// stats for this match vs original query
let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap();
Expand Down Expand Up @@ -289,7 +302,7 @@
// If abundance, calculate abund-related metrics (vs current query)
if calc_abund_stats {
// take abunds from subtracted query
let (abunds, unique_weighted_found) = match match_mh.inflated_abundances(&query) {
let (abunds, unique_weighted_found) = match match_mh.inflated_abundances(&remaining_query) {
Ok((abunds, unique_weighted_found)) => (abunds, unique_weighted_found),
Err(e) => {
return Err(e);
Expand Down Expand Up @@ -336,7 +349,7 @@
.sum_weighted_found(sum_total_weighted_found)
.total_weighted_hashes(total_weighted_hashes)
.build();
Ok(result)
Ok((result, isect))
}

#[cfg(test)]
Expand Down Expand Up @@ -392,7 +405,7 @@
let gather_result_rank = 0;
let calc_abund_stats = true;
let calc_ani_ci = false;
let result = calculate_gather_stats(
let (result, _isect) = calculate_gather_stats(
&orig_query,
query,
match_sig.into(),
Expand All @@ -405,6 +418,7 @@
None,
)
.unwrap();

// first, print all results
assert_eq!(result.filename(), "match-filename");
assert_eq!(result.name(), "match-name");
Expand Down
41 changes: 24 additions & 17 deletions src/core/src/index/revindex/disk_revindex.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::cmp::max;
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
Expand Down Expand Up @@ -372,7 +373,6 @@ impl RevIndexOps for RevIndex {
let mut query = KmerMinHashBTree::from(orig_query.clone());
let mut sum_weighted_found = 0;
let _selection = selection.unwrap_or_else(|| self.collection.selection());
let mut orig_query_ds = orig_query.clone();
let total_weighted_hashes = orig_query.sum_abunds();

// or set this with user --track-abundance?
Expand All @@ -393,29 +393,29 @@ impl RevIndexOps for RevIndex {
break;
}

// this should downsample mh for us
let match_sig = self.collection.sig_for_dataset(dataset_id)?;

// get downsampled minhashes for comparison.
let match_mh = match_sig.minhash().unwrap().clone();
query = query.downsample_scaled(match_mh.scaled())?;
orig_query_ds = orig_query_ds.downsample_scaled(match_mh.scaled())?;

// just calculate essentials here
let gather_result_rank = matches.len();
// make downsampled minhashes
let max_scaled = max(match_mh.scaled(), query.scaled());

let match_mh = match_mh
.downsample_scaled(max_scaled)
.expect("cannot downsample match");

query = query
.downsample_scaled(max_scaled)
.expect("cannot downsample query");
let query_mh = KmerMinHash::from(query.clone());

// grab the specific intersection:
let isect = match_mh.intersection(&query_mh)?;
let mut isect_mh = match_mh.clone();
isect_mh.clear();
isect_mh.add_many(&isect.0)?;
// just calculate essentials here
let gather_result_rank = matches.len();

// grab the specific intersection:
// Calculate stats
let gather_result = calculate_gather_stats(
&orig_query_ds,
KmerMinHash::from(query.clone()),
let (gather_result, isect) = calculate_gather_stats(
&orig_query,
query_mh,
match_sig,
match_size,
gather_result_rank,
Expand All @@ -424,7 +424,14 @@ impl RevIndexOps for RevIndex {
calc_abund_stats,
calc_ani_ci,
ani_confidence_interval_fraction,
)?;
)
.expect("could not calculate gather stats");

// use intersection from calc_gather_stats to make a KmerMinHash.
let mut isect_mh = match_mh.clone();
isect_mh.clear();
isect_mh.add_many(&isect.0)?;

// keep track of the sum weighted found
sum_weighted_found = gather_result.sum_weighted_found();
matches.push(gather_result);
Expand Down
18 changes: 10 additions & 8 deletions src/core/src/index/revindex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -903,14 +903,16 @@ mod test {

let (counter, query_colors, hash_to_color) = index.prepare_gather_counters(&query);

let matches_external = index.gather(
counter,
query_colors,
hash_to_color,
0,
&query,
Some(selection.clone()),
)?;
let matches_external = index
.gather(
counter,
query_colors,
hash_to_color,
0,
&query,
Some(selection.clone()),
)
.expect("failed to gather!");

{
let mut index = index;
Expand Down
24 changes: 23 additions & 1 deletion src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,7 @@
// TODO: also account for LargeMinHash
if let Sketch::MinHash(mh) = sketch {
if (mh.scaled() as u32) < sel_scaled {
*sketch = Sketch::MinHash(mh.downsample_scaled(sel_scaled as u64)?);
*sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled as u64)?);
}
}
}
Expand Down Expand Up @@ -887,6 +887,28 @@
}
}

impl TryInto<KmerMinHash> for Signature {
type Error = Error;

fn try_into(self) -> Result<KmerMinHash, Error> {
match self.signatures.len() {
1 => self

Check warning on line 895 in src/core/src/signature.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/signature.rs#L893-L895

Added lines #L893 - L895 were not covered by tests
.signatures
.into_iter()
.find_map(|sk| {
if let Sketch::MinHash(mh) = sk {

Check warning on line 899 in src/core/src/signature.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/signature.rs#L898-L899

Added lines #L898 - L899 were not covered by tests
Some(mh)
} else {
None

Check warning on line 902 in src/core/src/signature.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/signature.rs#L902

Added line #L902 was not covered by tests
}
})
.ok_or_else(|| Error::NoMinHashFound),
0 => Err(Error::EmptySignature),
_ => Err(Error::MultipleSketchesFound),

Check warning on line 907 in src/core/src/signature.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/signature.rs#L905-L907

Added lines #L905 - L907 were not covered by tests
}
}
}

#[cfg(test)]
mod test {
use std::fs::File;
Expand Down
Loading
Loading