Skip to content

Commit

Permalink
Added HashMap representing DNA ambig chars
Browse files Browse the repository at this point in the history
Added a HashMap that is used to represent ambiguous DNA characters that can be reused in
other places so that the ambiguous characters are always fixed and exist only in one place
  • Loading branch information
junniest committed Dec 5, 2023
1 parent f6c3634 commit ee31c1d
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 45 deletions.
1 change: 1 addition & 0 deletions phylo/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ argmin = "0.8.1"
assert_matches = "1.5.0"
bio = "1.3.1"
log = "0.4.19"
map-macro = "0.2.6"
nalgebra = "0.32.3"
ordered-float = "3.7.0"
pest = "2.7.2"
Expand Down
13 changes: 5 additions & 8 deletions phylo/src/pip_model/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use nalgebra::{Const, DMatrix, DVector, DimMin};
use crate::evolutionary_models::{EvolutionaryModel, EvolutionaryModelInfo};
use crate::likelihood::LikelihoodCostFunction;
use crate::phylo_info::PhyloInfo;
use crate::sequences::GAP;
use crate::substitution_models::dna_models::nucleotide_index;
use crate::substitution_models::protein_models::{aminoacid_index, ProteinSubstModel};
use crate::substitution_models::FreqVector;
Expand Down Expand Up @@ -107,10 +108,8 @@ impl EvolutionaryModel<4> for PIPModel<4> {
}

fn get_char_probability(&self, char: u8) -> FreqVector {
if char == b'-' {
let mut probs = FreqVector::from_column_slice(&[0.0; 5]);
probs[4] = 1.0;
probs
if char == GAP {
FreqVector::from_column_slice(&[0.0, 0.0, 0.0, 0.0, 1.0])
} else {
self.subst_model
.get_char_probability(char)
Expand Down Expand Up @@ -143,10 +142,8 @@ impl EvolutionaryModel<20> for PIPModel<20> {
}

fn get_char_probability(&self, char: u8) -> FreqVector {
if char == b'-' {
let mut probs = FreqVector::from_column_slice(&[0.0; 21]);
probs[20] = 1.0;
probs
if char == GAP {
FreqVector::from_column_slice(&[vec![0.0; 20], vec![1.0]].concat())
} else {
self.subst_model
.get_char_probability(char)
Expand Down
86 changes: 49 additions & 37 deletions phylo/src/substitution_models/dna_models/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};

use anyhow::bail;
use log::warn;
use map_macro::{hash_map, hash_set};
use ordered_float::OrderedFloat;

use crate::evolutionary_models::{EvolutionaryModel, EvolutionaryModelInfo};
Expand All @@ -28,6 +29,35 @@ pub use jc69::*;
pub use k80::*;
pub use tn93::*;

fn dna_ambiguous_chars() -> HashMap<u8, HashSet<u8>> {
hash_map! {
b'V' => hash_set! {b'C', b'A', b'G'},
b'v' => hash_set! {b'C', b'A', b'G'},
b'D' => hash_set! {b'T', b'A', b'G'},
b'd' => hash_set! {b'T', b'A', b'G'},
b'B' => hash_set! {b'T', b'C', b'G'},
b'b' => hash_set! {b'T', b'C', b'G'},
b'H' => hash_set! {b'T', b'C' ,b'A'},
b'h' => hash_set! {b'T', b'C', b'A'},
b'M' => hash_set! {b'A', b'C'},
b'm' => hash_set! {b'A', b'C'},
b'R' => hash_set! {b'A', b'G'},
b'r' => hash_set! {b'A', b'G'},
b'W' => hash_set! {b'A', b'T'},
b'w' => hash_set! {b'A', b'T'},
b'S' => hash_set! {b'C', b'G'},
b's' => hash_set! {b'C', b'G'},
b'Y' => hash_set! {b'C', b'T'},
b'y' => hash_set! {b'C', b'T'},
b'K' => hash_set! {b'G', b'T'},
b'k' => hash_set! {b'G', b'T'},
b'X' => hash_set! {b'A', b'C', b'G', b'T'},
b'x' => hash_set! {b'A', b'C', b'G', b'T'},
b'N' => hash_set! {b'A', b'C', b'G', b'T'},
b'n' => hash_set! {b'A', b'C', b'G', b'T'},
}
}

fn make_dna_model(params: Vec<f64>, q: SubstMatrix, pi: FreqVector) -> DNASubstModel {
DNASubstModel {
params,
Expand Down Expand Up @@ -65,49 +95,31 @@ impl EvolutionaryModel<4> for DNASubstModel {
}

fn get_char_probability(&self, char: u8) -> FreqVector {
let mut vec = FreqVector::zeros(4);
let mut probs = FreqVector::zeros(4);
let dna_ambiguous_chars = dna_ambiguous_chars();
let dna_char_set: HashSet<u8> =
HashSet::from_iter(NUCLEOTIDES_STR.as_bytes().iter().cloned());
if NUCLEOTIDES_STR.contains(char as char) {
vec[self.index[char as usize] as usize] = 1.0;
probs[self.index[char as usize] as usize] = 1.0;
} else {
vec = FreqVector::from_column_slice(self.get_stationary_distribution().as_slice());
match char {
b'V' => vec[self.index[b'T' as usize] as usize] = 0.0,
b'D' => vec[self.index[b'C' as usize] as usize] = 0.0,
b'B' => vec[self.index[b'A' as usize] as usize] = 0.0,
b'H' => vec[self.index[b'G' as usize] as usize] = 0.0,
b'M' => {
vec[self.index[b'T' as usize] as usize] = 0.0;
vec[self.index[b'G' as usize] as usize] = 0.0
}
b'R' => {
vec[self.index[b'T' as usize] as usize] = 0.0;
vec[self.index[b'C' as usize] as usize] = 0.0
}
b'W' => {
vec[self.index[b'G' as usize] as usize] = 0.0;
vec[self.index[b'C' as usize] as usize] = 0.0
}
b'S' => {
vec[self.index[b'T' as usize] as usize] = 0.0;
vec[self.index[b'A' as usize] as usize] = 0.0
}
b'Y' => {
vec[self.index[b'A' as usize] as usize] = 0.0;
vec[self.index[b'G' as usize] as usize] = 0.0
}
b'K' => {
vec[self.index[b'C' as usize] as usize] = 0.0;
vec[self.index[b'A' as usize] as usize] = 0.0
probs = FreqVector::from_column_slice(self.get_stationary_distribution().as_slice());
let other = dna_ambiguous_chars.get(&char);
match other {
Some(other) => {
let difference = dna_char_set.difference(other);
for &char in difference {
probs[self.index[char as usize] as usize] = 0.0;
}
}
_ => {
None => {
warn!("Unknown character {} encountered, treating it as X.", char);
vec =
probs =
FreqVector::from_column_slice(self.get_stationary_distribution().as_slice())
}
};
}
}
vec.scale_mut(1.0 / vec.sum());
vec
probs.scale_mut(1.0 / probs.sum());
probs
}
}

Expand Down

0 comments on commit ee31c1d

Please sign in to comment.