Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: fix RocksDB-based gather & other rust-based infelicities revealed by plugins #3193

Merged
merged 14 commits into from
Jun 9, 2024
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.13.1"
version = "0.14.0"
authors = ["Luiz Irber <luiz@sourmash.bio>", "N. Tessa Pierce-Ward <tessa@sourmash.bio>"]
description = "tools for comparing biological sequences with k-mer sketches"
repository = "https://github.com/sourmash-bio/sourmash"
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ impl std::fmt::Display for HashFunctions {
f,
"{}",
match self {
HashFunctions::Murmur64Dna => "dna",
HashFunctions::Murmur64Dna => "DNA",
HashFunctions::Murmur64Protein => "protein",
HashFunctions::Murmur64Dayhoff => "dayhoff",
HashFunctions::Murmur64Hp => "hp",
Expand Down
14 changes: 11 additions & 3 deletions src/core/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub mod search;
use std::path::Path;

use getset::{CopyGetters, Getters, Setters};
use log::trace;
use serde::{Deserialize, Serialize};
use stats::{median, stddev};
use typed_builder::TypedBuilder;
Expand Down Expand Up @@ -220,8 +221,15 @@ pub fn calculate_gather_stats(
) -> Result<GatherResult> {
// get match_mh
let match_mh = match_sig.minhash().unwrap();

// calculate intersection
let isect = match_mh.intersection(&query)?;
let isect_size = isect.0.len();
trace!("isect_size: {}", isect_size);
trace!("query.size: {}", query.size());

//bp remaining in subtracted query
let remaining_bp = (query.size() - match_size) * query.scaled() as usize;
let remaining_bp = (query.size() - isect_size) * query.scaled() as usize;

// stats for this match vs original query
let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap();
Expand All @@ -231,8 +239,8 @@ pub fn calculate_gather_stats(

// stats for this match vs current (subtracted) query
let f_match = match_size as f64 / match_mh.size() as f64;
let unique_intersect_bp = match_mh.scaled() as usize * match_size;
let f_unique_to_query = match_size as f64 / orig_query.size() as f64;
let unique_intersect_bp = match_mh.scaled() as usize * isect_size;
let f_unique_to_query = isect_size as f64 / orig_query.size() as f64;

// // get ANI values
let ksize = match_mh.ksize() as f64;
Expand Down
19 changes: 12 additions & 7 deletions src/core/src/index/revindex/disk_revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,14 @@ impl RevIndexOps for RevIndex {
// just calculate essentials here
let gather_result_rank = matches.len();

let query_mh = KmerMinHash::from(query.clone());

// grab the specific intersection:
let isect = match_mh.intersection(&query_mh)?;
let mut isect_mh = match_mh.clone();
isect_mh.clear();
isect_mh.add_many(&isect.0)?;

// Calculate stats
let gather_result = calculate_gather_stats(
&orig_query_ds,
Expand All @@ -371,8 +379,9 @@ impl RevIndexOps for RevIndex {

// TODO: Use HashesToColors here instead. If not initialized,
// build it.
match_mh
.iter_mins()
isect
.0
.iter()
Comment on lines +382 to +384
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice! And with this change there is a block below:

counter.entry(dataset).and_modify(|e| {
    if *e > 0 {
        *e -= 1
    }
});

that can be

counter.entry(dataset).and_modify(|e| { *e -= 1 });

instead, and will panic if e goes below 0.

With this change the error triggers in latest in the index::revindex::test::revindex_load_and_gather_2 test, and doesn't trigger in this PR

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added in 231a134, thanks!

.filter_map(|hash| hash_to_color.get(hash))
.flat_map(|color| {
// TODO: remove this clone
Expand All @@ -381,11 +390,7 @@ impl RevIndexOps for RevIndex {
.for_each(|dataset| {
// TODO: collect the flat_map into a Counter, and remove more
// than one at a time...
counter.entry(dataset).and_modify(|e| {
if *e > 0 {
*e -= 1
}
});
counter.entry(dataset).and_modify(|e| { *e -= 1 });
});

counter.remove(&dataset_id);
Expand Down
152 changes: 140 additions & 12 deletions src/core/src/index/revindex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -644,14 +644,13 @@ mod test {
counter,
query_colors,
hash_to_color,
0,
5, // 50kb threshold
&query,
Some(selection),
)?;

// should be 11, based on test_gather_metagenome_num_results @CTB.
// see sourmash#3139 and sourmash_plugin_branchwater#322.
assert_eq!(matches.len(), 6);
// should be 11, based on test_gather_metagenome_num_results
assert_eq!(matches.len(), 11);

fn round5(a: f64) -> f64 {
(a * 1e5).round() / 1e5
Expand Down Expand Up @@ -691,18 +690,147 @@ mod test {
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_009486.1");
assert_eq!(round5(match_.f_match()), round5(0.4842105));
assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557));

let match_ = &matches[6];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_006905.1");
assert_eq!(round5(match_.f_match()), round5(0.161016949152542));
assert_eq!(
round5(match_.f_unique_to_query()),
round5(0.0518417462482947)
);

let match_ = &matches[7];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_011080.1");
assert_eq!(round5(match_.f_match()), round5(0.125799573560768));
assert_eq!(
round5(match_.f_unique_to_query()),
round5(0.04024556616643930)
);

let match_ = &matches[8];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_011274.1");
assert_eq!(round5(match_.f_match()), round5(0.0919037199124727));
assert_eq!(
round5(match_.f_unique_to_query()),
round5(0.0286493860845839)
);

let match_ = &matches[9];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_006511.1");
assert_eq!(round5(match_.f_match()), round5(0.0725995316159251));
assert_eq!(
round5(match_.f_unique_to_query()),
round5(0.021145975443383400)
);

let match_ = &matches[10];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_011294.1");
assert_eq!(round5(match_.f_match()), round5(0.0148619957537155));
assert_eq!(
round5(match_.f_unique_to_query()),
round5(0.0047748976807639800)
);

Ok(())
}

#[test]
// a more detailed/focused version of revindex_load_and_gather_2,
// added in sourmash#3193 for debugging purposes.
fn revindex_load_and_gather_3() -> Result<()> {
let mut basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
basedir.push("../../tests/test-data/gather/");

let against = vec![
"GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig",
"GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig",
"GCF_000008545.1_ASM854v1_genomic.fna.gz.sig",
];
let against: Vec<_> = against
.iter()
.map(|sig| {
let mut filename = basedir.clone();
filename.push(sig);
filename
})
.collect();

// build 'against' sketches into a revindex
let selection = Selection::builder().ksize(21).scaled(10000).build();
let output = TempDir::new()?;

// @CTB this fails: 0.43158 != 0.48421
// assert_eq!(round5(match_.f_match()), round5(0.4842105));
let collection = Collection::from_paths(&against)?.select(&selection)?;
let _index = RevIndex::create(output.path(), collection.try_into()?, false);

let index = RevIndex::open(output.path(), true, None)?;

let mut query = None;
let mut query_filename = basedir.clone();
query_filename.push("combined.sig");
let query_sig = Signature::from_path(query_filename)?
.swap_remove(0)
.select(&selection)?;

if let Some(q) = prepare_query(query_sig, &selection) {
query = Some(q);
}
let query = query.unwrap();

let (counter, query_colors, hash_to_color) = index.prepare_gather_counters(&query);

let matches = index.gather(
counter,
query_colors,
hash_to_color,
0,
&query,
Some(selection),
)?;

// @CTB this fails: 0.05593 != 0.06276
// assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557));
// should be 3.
// see sourmash#3193.
assert_eq!(matches.len(), 3);

// @CTB fails
// assert_eq!(match_.unique_intersect_bp, 820000);
fn round5(a: f64) -> f64 {
(a * 1e5).round() / 1e5
}

// @CTB fails
// assert_eq!(match_.remaining_bp, 2170000);
let match_ = &matches[0];
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_000853.1");
assert_eq!(match_.f_match(), 1.0);
assert_eq!(round5(match_.f_unique_to_query()), round5(0.13096862));
assert_eq!(match_.unique_intersect_bp, 1920000);
assert_eq!(match_.remaining_bp, 12740000);

let match_ = &matches[1];
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_011978.1");
assert_eq!(match_.f_match(), 0.898936170212766);
assert_eq!(round5(match_.f_unique_to_query()), round5(0.115279));
assert_eq!(match_.unique_intersect_bp, 1690000);
assert_eq!(match_.remaining_bp, 11050000);

let match_ = &matches[2];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_009486.1");
assert_eq!(round5(match_.f_match()), round5(0.4842105));
assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557));
assert_eq!(match_.unique_intersect_bp, 920000);
assert_eq!(match_.remaining_bp, 10130000);

Ok(())
}
Expand Down
31 changes: 31 additions & 0 deletions src/core/src/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,37 @@ mod test {
}
}

#[test]
fn manifest_to_writer_moltype_dna() {
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));

let test_sigs = vec![PathBuf::from("../../tests/test-data/47.fa.sig")];

let full_paths: Vec<PathBuf> = test_sigs
.into_iter()
.map(|sig| base_path.join(sig))
.collect();

let manifest = Manifest::from(&full_paths[..]); // pass full_paths as a slice

let temp_dir = TempDir::new().unwrap();
let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf())
.expect("Path should be valid UTF-8");

let filename = utf8_output.join("sigs.manifest.csv");
let mut wtr = File::create(&filename).expect("Failed to create file");

manifest.to_writer(&mut wtr).unwrap();

// check that we can reopen the file as a manifest + properly check abund
let infile = File::open(&filename).expect("Failed to open file");
let m2 = Manifest::from_reader(&infile).unwrap();
for record in m2.iter() {
eprintln!("{:?} {}", record.name(), record.moltype());
assert_eq!(record.moltype().to_string(), "DNA");
}
}

#[test]
fn manifest_selection() {
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
Expand Down
2 changes: 1 addition & 1 deletion tests/test_sourmash_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def test_protein_override_bad_rust_foo():
with pytest.raises(ValueError) as exc:
sig.add_protein(record.sequence)

assert 'Invalid hash function: "dna"' in str(exc)
assert 'Invalid hash function: "DNA"' in str(exc)


def test_dayhoff_defaults():
Expand Down
Loading