Skip to content

Commit

Permalink
Alphabet is defined in Sequences and only there
Browse files Browse the repository at this point in the history
The sequence struct now defines the Alphabet for the whole setup and that is the only
place where the Alphabet exists. The Alphabet is used to compute leaf sequence encodings
to simplify likelihood computation.
  • Loading branch information
junniest committed Aug 14, 2024
1 parent 16eb6fd commit f7c7a4b
Show file tree
Hide file tree
Showing 10 changed files with 124 additions and 170 deletions.
2 changes: 1 addition & 1 deletion phylo/src/alignment/alignment_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ impl<'a> AlignmentBuilder<'a> {
/// This assumes that the tree structure matches the alignment structure.
fn build_from_map(self) -> Result<Alignment> {
let mut alignment = Alignment {
seqs: Sequences::new(Vec::new()),
seqs: Sequences::with_attrs(Vec::new(), self.seqs.gap_handling()),
leaf_map: HashMap::new(),
node_map: self.node_map,
};
Expand Down
5 changes: 5 additions & 0 deletions phylo/src/alignment/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use std::collections::HashMap;

use bio::io::fasta::Record;

use crate::alphabets::Alphabet;
use crate::tree::{NodeIdx, NodeIdx::Internal as Int, NodeIdx::Leaf, Tree};
use crate::Result;

Expand Down Expand Up @@ -54,6 +55,10 @@ pub struct Alignment {
}

impl Alignment {
pub fn alphabet(&self) -> &Alphabet {
&self.seqs.alphabet
}

pub fn len(&self) -> usize {
self.leaf_map.len()
}
Expand Down
45 changes: 32 additions & 13 deletions phylo/src/alignment/sequences.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use crate::alphabets::GAP;
use crate::{
alphabets::{detect_alphabet, Alphabet, GAP},
phylo_info::GapHandling,
};

use bio::io::fasta::Record;

Expand All @@ -7,23 +10,35 @@ pub struct Sequences {
pub(crate) s: Vec<Record>,
pub(crate) aligned: bool,
pub(crate) msa_len: usize,
pub(crate) alphabet: Alphabet,
}

impl Sequences {
/// Creates a new Sequences object from a vector of bio::io::fasta::Record.
/// The Sequences object is considered aligned if all sequences have the same length.
/// By default gap handling is set to proper.
pub fn new(s: Vec<Record>) -> Sequences {
Self::with_attrs(s, &GapHandling::Proper)
}

/// Creates a new Sequences object from a vector of bio::io::fasta::Record.
/// The Sequences object is considered aligned if all sequences have the same length.
pub fn with_attrs(s: Vec<Record>, gap_handling: &GapHandling) -> Sequences {
let len = if s.is_empty() { 0 } else { s[0].seq().len() };
let alphabet = detect_alphabet(&s, gap_handling);
if s.iter().filter(|rec| rec.seq().len() != len).count() == 0 {
Sequences {
s,
aligned: true,
msa_len: len,
alphabet,
}
} else {
Sequences {
s,
aligned: false,
msa_len: 0,
alphabet,
}
}
}
Expand Down Expand Up @@ -56,7 +71,16 @@ impl Sequences {
self.msa_len
}

pub fn without_gaps(&self) -> Sequences {
pub fn alphabet(&self) -> &Alphabet {
&self.alphabet
}

pub fn gap_handling(&self) -> &GapHandling {
&self.alphabet.gap_handling
}

/// Removes all gaps from the sequences and returns a new Sequences object.
pub fn without_gaps(self) -> Sequences {
let seqs = self
.s
.iter()
Expand All @@ -70,16 +94,11 @@ impl Sequences {
Record::with_attrs(rec.id(), rec.desc(), &sequence)
})
.collect();
Sequences::new(seqs)
}

/// Converts the given sequences to uppercase and returns a new vector.
pub fn make_uppercase(self) -> Sequences {
let seqs = self
.s
.iter()
.map(|rec| Record::with_attrs(rec.id(), rec.desc(), &rec.seq().to_ascii_uppercase()))
.collect();
Sequences::new(seqs)
Sequences {
s: seqs,
aligned: false,
msa_len: 0,
alphabet: self.alphabet,
}
}
}
12 changes: 6 additions & 6 deletions phylo/src/alphabets/alphabets_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,26 @@ use rstest::*;

use std::path::PathBuf;

use crate::alignment::Sequences;
use crate::alphabets::sequence_type;
use crate::alphabets::detect_alphabet;
use crate::evolutionary_models::ModelType;
use crate::io::read_sequences_from_file;
use crate::phylo_info::GapHandling;

#[rstest]
#[case::aligned("./data/sequences_DNA1.fasta")]
#[case::unaligned("./data/sequences_DNA2_unaligned.fasta")]
#[case::long("./data/sequences_long.fasta")]
fn dna_type_test(#[case] input: &str) {
let seqs = read_sequences_from_file(&PathBuf::from(input)).unwrap();
let alphabet = sequence_type(&Sequences::new(seqs));
assert_matches!(alphabet, ModelType::DNA(_));
let alphabet = detect_alphabet(&seqs, &GapHandling::Proper);
assert_matches!(alphabet.model_type, ModelType::DNA(_));
}

#[rstest]
#[case("./data/sequences_protein1.fasta")]
#[case("./data/sequences_protein2.fasta")]
fn protein_type_test(#[case] input: &str) {
let seqs = read_sequences_from_file(&PathBuf::from(input)).unwrap();
let alphabet = sequence_type(&Sequences::new(seqs));
assert_matches!(alphabet, ModelType::Protein(_));
let alphabet = detect_alphabet(&seqs, &GapHandling::Proper);
assert_matches!(alphabet.model_type, ModelType::Protein(_));
}
96 changes: 50 additions & 46 deletions phylo/src/alphabets/mod.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::collections::HashSet;

use bio::io::fasta::Record;
use lazy_static::lazy_static;

use crate::alignment::Sequences;
use crate::evolutionary_models::{
DNAModelType,
ModelType::{self, *},
Expand All @@ -18,13 +18,15 @@ pub static NUCLEOTIDES: &[u8] = b"TCAG";
pub static AMB_NUCLEOTIDES: &[u8] = b"RYSWKMBDHVNZX";
pub static GAP: u8 = b'-';

#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Clone)]
pub struct Alphabet {
model_type: ModelType,
pub(crate) gap_handling: GapHandling,
symbols: &'static [u8],
ambiguous: &'static [u8],
char_sets: &'static [FreqVector],
index: &'static [usize; 255],
valid_symbols: HashSet<u8>,
valid_symbols: &'static HashSet<u8>,
}

impl Alphabet {
Expand Down Expand Up @@ -55,14 +57,14 @@ impl Alphabet {
}
}

pub fn sequence_type(sequences: &Sequences) -> ModelType {
let dna_alphabet = dna_alphabet(&GapHandling::Ambiguous);
pub fn detect_alphabet(sequences: &[Record], gap_handling: &GapHandling) -> Alphabet {
let dna_alphabet = dna_alphabet(gap_handling);
for record in sequences.iter() {
if !dna_alphabet.is_word(record.seq()) {
return Protein(ProteinModelType::UNDEF);
return protein_alphabet(gap_handling);
}
}
DNA(DNAModelType::UNDEF)
dna_alphabet
}

pub fn alphabet_from_type(model_type: ModelType, gap_handling: &GapHandling) -> Alphabet {
Expand All @@ -73,46 +75,34 @@ pub fn alphabet_from_type(model_type: ModelType, gap_handling: &GapHandling) ->
}

fn dna_alphabet(gap_handling: &GapHandling) -> Alphabet {
let mut valid_symbols: HashSet<u8> = NUCLEOTIDES.iter().cloned().collect();
valid_symbols.extend(AMB_NUCLEOTIDES.iter().cloned());
valid_symbols.insert(GAP);
match gap_handling {
GapHandling::Ambiguous => Alphabet {
symbols: NUCLEOTIDES,
ambiguous: AMB_NUCLEOTIDES,
char_sets: &NUCLEOTIDE_SETS,
index: &NUCLEOTIDE_INDEX,
valid_symbols,
},
_ => Alphabet {
symbols: NUCLEOTIDES,
ambiguous: AMB_NUCLEOTIDES,
char_sets: &NUCLEOTIDE_GAP_SETS,
index: &NUCLEOTIDE_INDEX,
valid_symbols,
},
let char_sets: &Vec<FreqVector> = match gap_handling {
GapHandling::Ambiguous => &NUCLEOTIDE_SETS_AMBIG,
_ => &NUCLEOTIDE_SETS_PROPER,
};
Alphabet {
model_type: DNA(DNAModelType::UNDEF),
gap_handling: gap_handling.clone(),
symbols: NUCLEOTIDES,
ambiguous: AMB_NUCLEOTIDES,
index: &NUCLEOTIDE_INDEX,
valid_symbols: &VALID_NUCLEOTIDES,
char_sets,
}
}

fn protein_alphabet(gap_handlind: &GapHandling) -> Alphabet {
let mut valid_symbols: HashSet<u8> = AMINOACIDS.iter().cloned().collect();
valid_symbols.extend(AMB_AMINOACIDS.iter().cloned());
valid_symbols.insert(GAP);
match gap_handlind {
GapHandling::Ambiguous => Alphabet {
symbols: AMINOACIDS,
ambiguous: AMB_AMINOACIDS,
char_sets: &AMINOACID_SETS_AMBIG,
index: &AMINOACID_INDEX,
valid_symbols,
},
_ => Alphabet {
symbols: AMINOACIDS,
ambiguous: AMB_AMINOACIDS,
char_sets: &AMINOACID_SETS_PROPER,
index: &AMINOACID_INDEX,
valid_symbols,
},
fn protein_alphabet(gap_handling: &GapHandling) -> Alphabet {
let char_sets: &Vec<FreqVector> = match gap_handling {
GapHandling::Ambiguous => &AMINOACID_SETS_AMBIG,
_ => &AMINOACID_SETS_PROPER,
};
Alphabet {
model_type: Protein(ProteinModelType::UNDEF),
gap_handling: gap_handling.clone(),
symbols: AMINOACIDS,
ambiguous: AMB_AMINOACIDS,
index: &AMINOACID_INDEX,
valid_symbols: &VALID_AMINOACIDS,
char_sets,
}
}

Expand All @@ -126,7 +116,7 @@ lazy_static! {
index[GAP as usize] = 4;
index
};
pub static ref NUCLEOTIDE_GAP_SETS: Vec<FreqVector> = {
pub static ref NUCLEOTIDE_SETS_PROPER: Vec<FreqVector> = {
let mut map = vec![frequencies!(&[0.0; 4]); 255];
for (i, elem) in map.iter_mut().enumerate() {
let char = i as u8;
Expand All @@ -138,14 +128,21 @@ lazy_static! {
}
map
};
pub static ref NUCLEOTIDE_SETS: Vec<FreqVector> = {
pub static ref NUCLEOTIDE_SETS_AMBIG: Vec<FreqVector> = {
let mut map = vec![frequencies!(&[0.0; 4]); 255];
for (i, elem) in map.iter_mut().enumerate() {
let char = i as u8;
elem.set_column(0, &generic_nucleotide_sets(char));
}
map
};
pub static ref VALID_NUCLEOTIDES: HashSet<u8> = {
NUCLEOTIDES
.iter()
.chain(AMB_NUCLEOTIDES.iter().chain([GAP].iter()))
.cloned()
.collect()
};
}

fn generic_nucleotide_sets(char: u8) -> FreqVector {
Expand Down Expand Up @@ -207,6 +204,13 @@ lazy_static! {
}
map
};
pub static ref VALID_AMINOACIDS: HashSet<u8> = {
AMINOACIDS
.iter()
.chain(AMB_AMINOACIDS.iter().chain([GAP].iter()))
.cloned()
.collect()
};
}

fn generic_aminoacid_sets(char: u8) -> FreqVector {
Expand Down
Loading

0 comments on commit f7c7a4b

Please sign in to comment.