Skip to content

Commit

Permalink
feat: deprecate old alignment params
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov committed Oct 11, 2023
1 parent f85a6bc commit 6ed138e
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 138 deletions.
12 changes: 6 additions & 6 deletions packages_rs/nextclade/src/align/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,12 @@ pub fn align_nuc(
);
}

if ref_len + qry_len < (10 * params.seed_length) {
// for very short sequences, use full square
let stripes = full_matrix(ref_len, qry_len);
trace!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band construction: short sequences, using full matrix");
return Ok(align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes));
}
// if ref_len + qry_len < (10 * params.seed_length) {
// // for very short sequences, use full square
// let stripes = full_matrix(ref_len, qry_len);
// trace!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band construction: short sequences, using full matrix");
// return Ok(align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes));
// }

// otherwise, determine seed matches roughly regularly spaced along the query sequence
let SeedMatchesResult {
Expand Down
107 changes: 72 additions & 35 deletions packages_rs/nextclade/src/align/params.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
use crate::{make_error, o};
use clap::{Parser, ValueEnum};
use eyre::Report;
use itertools::Itertools;
use optfield::optfield;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;

#[derive(ValueEnum, Copy, Clone, Debug, Deserialize, Serialize, schemars::JsonSchema)]
#[serde(rename_all = "kebab-case")]
Expand Down Expand Up @@ -52,30 +56,6 @@ pub struct AlignPairwiseParams {
#[clap(long)]
pub max_band_area: usize,

/// Maximum length of insertions or deletions allowed to proceed with alignment. Alignments with long indels are slow to compute and require substantial memory in the current implementation. Alignment of sequences with indels longer that this value, will not be attempted and a warning will be emitted.
#[clap(long)]
pub max_indel: usize,

/// k-mer length to determine approximate alignments between query and reference and determine the bandwidth of the banded alignment.
#[clap(long)]
pub seed_length: usize,

/// Maximum number of mismatching nucleotides allowed for a seed to be considered a match.
#[clap(long)]
pub mismatches_allowed: usize,

/// Minimum number of seeds to search for during nucleotide alignment. Relevant for short sequences. In long sequences, the number of seeds is determined by `--seed-spacing`.
#[clap(long)]
pub min_seeds: i32,

/// Minimum seed mathing rate (a ratio of seed matches to total number of attempted seeds)
#[clap(long)]
pub min_match_rate: f64,

/// Spacing between seeds during nucleotide alignment.
#[clap(long)]
pub seed_spacing: i32,

/// Retry seed matching step with a reverse complement if the first attempt failed
#[clap(long)]
#[clap(num_args=0..=1, default_missing_value = "true")]
Expand Down Expand Up @@ -107,15 +87,15 @@ pub struct AlignPairwiseParams {
#[clap(long, value_enum)]
pub gap_alignment_side: GapAlignmentSide,

/// Length of exactly matching kmers used in the seed alignment of the query to the reference.
/// Length of exactly matching k-mers used in the seed alignment of the query to the reference.
#[clap(long)]
pub kmer_length: usize,

/// Interval of successive kmers on the query sequence. Should be small compared to the query length.
/// Interval of successive k-mers on the query sequence. Should be small compared to the query length.
#[clap(long)]
pub kmer_distance: usize,

/// Exactly matching kmers are extended to the left and right until more
/// Exactly matching k-mers are extended to the left and right until more
/// than `allowed_mismatches` are observed in a sliding window (`window_size`).
#[clap(long)]
pub allowed_mismatches: usize,
Expand All @@ -124,7 +104,7 @@ pub struct AlignPairwiseParams {
#[clap(long)]
pub window_size: usize,

/// Minimum length of extended kmers
/// Minimum length of extended k-mers
#[clap(long)]
pub min_match_length: usize,

Expand All @@ -136,6 +116,31 @@ pub struct AlignPairwiseParams {
/// Number of times Nextclade will retry alignment with more relaxed results if alignment band boundaries are hit
#[clap(long)]
pub max_alignment_attempts: usize,

// The following args are deprecated and are kept for backwards compatibility (to emit errors if they are set)
/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub max_indel: Option<f64>,

/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub seed_length: Option<f64>,

/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub mismatches_allowed: Option<f64>,

/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub min_seeds: Option<f64>,

/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub min_match_rate: Option<f64>,

/// REMOVED
#[clap(long, hide_long_help = true, hide_short_help = true)]
pub seed_spacing: Option<f64>,
}

impl Default for AlignPairwiseParams {
Expand All @@ -149,12 +154,6 @@ impl Default for AlignPairwiseParams {
penalty_mismatch: 1,
score_match: 3,
max_band_area: 500_000_000, // requires around 500Mb for paths, 2GB for the scores
max_indel: 400, // obsolete
seed_length: 21, // obsolete
min_seeds: 10, // obsolete
min_match_rate: 0.3, // obsolete
seed_spacing: 100, // obsolete
mismatches_allowed: 3, // obsolete
retry_reverse_complement: false,
no_translate_past_stop: false,
left_terminal_gaps_free: true,
Expand All @@ -164,11 +163,49 @@ impl Default for AlignPairwiseParams {
terminal_bandwidth: 50,
min_seed_cover: 0.33,
kmer_length: 10, // Should not be much larger than 1/divergence of amino acids
kmer_distance: 50, // Distance between successive kmers
kmer_distance: 50, // Distance between successive k-mers
min_match_length: 40, // Experimentally determined, to keep off-target matches reasonably low
allowed_mismatches: 8, // Ns count as mismatches
window_size: 30,
max_alignment_attempts: 3,

// The following args are deprecated and are kept for backwards compatibility (to emit errors if they are set)
max_indel: None,
seed_length: None,
min_seeds: None,
min_match_rate: None,
seed_spacing: None,
mismatches_allowed: None,
}
}
}

impl AlignPairwiseParams {
pub fn validate(&self) -> Result<(), Report> {
#[rustfmt::skip]
let deprecated = BTreeMap::from([
(o!("--min-seeds (minSeeds)"), &self.min_seeds),
(o!("--seed-length (seedLength)"), &self.seed_length),
(o!("--seed-spacing (seedSpacing)"), &self.seed_spacing),
(o!("--min-match-rate (minMatchRate)"), &self.min_match_rate),
(o!("--mismatches-allowed (mismatchesAllowed)"), &self.mismatches_allowed),
(o!("--max-indel (maxIndel)"), &self.max_indel),
]);

if deprecated.values().any(|val| val.is_some()) {
let keys = deprecated.keys().map(|key| format!(" {key}")).join("\n");

return make_error!(
r#"The following alignment parameters (CLI arguments and config file fields) were removed in Nextclade 3.0.0 due to changes in the alignment algorithm:
{keys}
Please remove them from your dataset and/or from command-line invocation.
Refer to user documentation for more details."#
);
}

Ok(())
}
}
91 changes: 0 additions & 91 deletions packages_rs/nextclade/src/align/seed_alignment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,97 +45,6 @@ pub struct SeedMatch {
pub score: usize,
}

/// Determine seed matches between query and reference sequence. will only attempt to
/// match k-mers without ambiguous characters. Search is performed via a left-to-right
/// search starting and the previous valid seed and extending at most to the maximally
/// allowed insertion/deletion (shift) distance.
pub fn get_seed_matches<L: Letter<L>>(
qry_seq: &[L],
ref_seq: &[L],
params: &AlignPairwiseParams,
) -> (Vec<SeedMatch>, i32) {
let mut seed_matches = Vec::<SeedMatch>::new();

// get list of valid k-mer start positions
let map_to_good_positions = get_map_to_good_positions(qry_seq, params.seed_length);
let n_good_positions = map_to_good_positions.len();
if n_good_positions < params.seed_length {
return (seed_matches, 0);
}

// use 1/seed_spacing for long sequences, min_seeds otherwise
let n_seeds = if ref_seq.len() > (params.min_seeds * params.seed_spacing) as usize {
(ref_seq.len() as f32 / params.seed_spacing as f32) as i32
} else {
params.min_seeds
};

// distance of first seed from the end of the sequence (third of seed spacing)
let margin = (ref_seq.len() as f32 / (n_seeds * 3) as f32).round() as i32;

// Generate kmers equally spaced on the query
let effective_margin = (margin as f32).min(n_good_positions as f32 / 4.0);
let seed_cover = n_good_positions as f32 - 2.0 * effective_margin;
let kmer_spacing = (seed_cover - 1.0) / ((n_seeds - 1) as f32);

// loop over seeds and find matches, store in seed_matches
let mut start_pos = 0; // start position of ref search
let mut end_pos = ref_seq.len(); // end position of ref search
let qry_pos = 0;

for ni in 0..n_seeds {
// pick index of of seed in map
let good_position_index = (effective_margin + (kmer_spacing * ni as f32)).round() as usize;
// get new query kmer-position
let qry_pos_old = qry_pos;
let qry_pos = map_to_good_positions[good_position_index];
// increment upper bound for search in reference
end_pos += qry_pos - qry_pos_old;

//extract seed and find match
let seed = &qry_seq[qry_pos..(qry_pos + params.seed_length)];
let tmp_match = seed_match(seed, ref_seq, start_pos, end_pos, params.mismatches_allowed);

// Only use seeds with at most allowed_mismatches
if tmp_match.score >= params.seed_length - params.mismatches_allowed {
// if this isn't the first match, check that reference position of current match after previous
// if previous seed matched AFTER current seed, remove previous seed
if let Some(prev_match) = seed_matches.last() {
if tmp_match.ref_pos > prev_match.ref_pos {
start_pos = prev_match.ref_pos;
} else {
// warn!("Crossed over seed removed. {:?}", prev_match);
seed_matches.pop();
}
}

let seed_match = SeedMatch {
qry_pos,
ref_pos: tmp_match.ref_pos,
score: tmp_match.score,
};

// check that current seed matches AFTER previous seed (-2 if already triggered above)
// and add current seed to list of matches
if seed_matches
.last()
.map_or(true, |prev_match| prev_match.ref_pos < tmp_match.ref_pos)
{
//ensure seed positions increase strictly monotonically
seed_matches.push(seed_match);
// } else {
// warn!(
// "Seed not used because of identical ref_pos with previous seed: {:?}",
// seed_match
// );
}
// increment the "reference search end-pos" as the current reference + maximally allowed indel
end_pos = tmp_match.ref_pos + params.max_indel;
}
}
(seed_matches, n_seeds)
}

fn abs_shift(seed1: &SeedMatch2, seed2: &SeedMatch2) -> isize {
abs(seed2.offset - seed1.offset)
}
Expand Down
2 changes: 1 addition & 1 deletion packages_rs/nextclade/src/run/nextclade_wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ impl Nextclade {
virus_properties,
} = inputs;

let params = NextcladeInputParams::from_optional(params, &virus_properties);
let params = NextcladeInputParams::from_optional(params, &virus_properties)?;
let ref_seq = to_nuc_seq(&ref_record.seq).wrap_err("When converting reference sequence")?;
let seed_index = CodonSpacedIndex::from_sequence(&ref_seq);

Expand Down
16 changes: 12 additions & 4 deletions packages_rs/nextclade/src/run/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@ use crate::align::params::{AlignPairwiseParams, AlignPairwiseParamsOptional};
use crate::analyze::virus_properties::VirusProperties;
use crate::run::params_general::{NextcladeGeneralParams, NextcladeGeneralParamsOptional};
use crate::tree::params::{TreeBuilderParams, TreeBuilderParamsOptional};
use crate::{make_error, o};
use clap::Parser;
use eyre::Report;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;

#[derive(Parser, Debug, Default, Clone, Serialize, Deserialize, schemars::JsonSchema)]
#[serde(rename_all = "camelCase")]
Expand All @@ -27,9 +31,11 @@ pub struct NextcladeInputParams {
}

impl NextcladeInputParams {
pub fn from_optional(params: &NextcladeInputParamsOptional, virus_properties: &VirusProperties) -> Self {
pub fn from_optional(
params: &NextcladeInputParamsOptional,
virus_properties: &VirusProperties,
) -> Result<Self, Report> {
// FIXME: this code is repetitive and error prone

let general = {
// Start with defaults
let mut general_params = NextcladeGeneralParams::default();
Expand Down Expand Up @@ -58,6 +64,8 @@ impl NextcladeInputParams {
alignment_params
};

alignment.validate()?;

let tree_builder = {
// Start with defaults
let mut tree_builder_params = TreeBuilderParams::default();
Expand All @@ -72,10 +80,10 @@ impl NextcladeInputParams {
tree_builder_params
};

Self {
Ok(Self {
general,
tree_builder,
alignment,
}
})
}
}
2 changes: 1 addition & 1 deletion packages_rs/nextclade/src/translate/translate_genes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ pub fn translate_cds(
// Set to false for internal genes
left_terminal_gaps_free: first(&qry_cds_seq)?.is_gap(),
right_terminal_gaps_free: last(&qry_cds_seq)?.is_gap(),
..*params
..params.clone()
};

// Make sure subsequent gap stripping does not introduce frame shift
Expand Down

0 comments on commit 6ed138e

Please sign in to comment.