Alphabet is defined in Sequences and only there

The sequence struct now defines the Alphabet for the whole setup and that is the only place where the Alphabet exists. The Alphabet is used to compute leaf sequence encodings to simplify likelihood computation.
acg-team · Aug 14, 2024 · f7c7a4b · f7c7a4b
1 parent 16eb6fd
commit f7c7a4b
Show file tree

Hide file tree

Showing 10 changed files with 124 additions and 170 deletions.
diff --git a/phylo/src/alignment/alignment_builder.rs b/phylo/src/alignment/alignment_builder.rs
@@ -81,7 +81,7 @@ impl<'a> AlignmentBuilder<'a> {
     /// This assumes that the tree structure matches the alignment structure.
     fn build_from_map(self) -> Result<Alignment> {
         let mut alignment = Alignment {
-            seqs: Sequences::new(Vec::new()),
+            seqs: Sequences::with_attrs(Vec::new(), self.seqs.gap_handling()),
             leaf_map: HashMap::new(),
             node_map: self.node_map,
         };

diff --git a/phylo/src/alignment/mod.rs b/phylo/src/alignment/mod.rs
@@ -2,6 +2,7 @@ use std::collections::HashMap;
 
 use bio::io::fasta::Record;
 
+use crate::alphabets::Alphabet;
 use crate::tree::{NodeIdx, NodeIdx::Internal as Int, NodeIdx::Leaf, Tree};
 use crate::Result;
 
@@ -54,6 +55,10 @@ pub struct Alignment {
 }
 
 impl Alignment {
+    pub fn alphabet(&self) -> &Alphabet {
+        &self.seqs.alphabet
+    }
+
     pub fn len(&self) -> usize {
         self.leaf_map.len()
     }

diff --git a/phylo/src/alignment/sequences.rs b/phylo/src/alignment/sequences.rs
@@ -1,4 +1,7 @@
-use crate::alphabets::GAP;
+use crate::{
+    alphabets::{detect_alphabet, Alphabet, GAP},
+    phylo_info::GapHandling,
+};
 
 use bio::io::fasta::Record;
 
@@ -7,23 +10,35 @@ pub struct Sequences {
     pub(crate) s: Vec<Record>,
     pub(crate) aligned: bool,
     pub(crate) msa_len: usize,
+    pub(crate) alphabet: Alphabet,
 }
 
 impl Sequences {
     /// Creates a new Sequences object from a vector of bio::io::fasta::Record.
+    /// The Sequences object is considered aligned if all sequences have the same length.
+    /// By default gap handling is set to proper.
     pub fn new(s: Vec<Record>) -> Sequences {
+        Self::with_attrs(s, &GapHandling::Proper)
+    }
+
+    /// Creates a new Sequences object from a vector of bio::io::fasta::Record.
+    /// The Sequences object is considered aligned if all sequences have the same length.
+    pub fn with_attrs(s: Vec<Record>, gap_handling: &GapHandling) -> Sequences {
         let len = if s.is_empty() { 0 } else { s[0].seq().len() };
+        let alphabet = detect_alphabet(&s, gap_handling);
         if s.iter().filter(|rec| rec.seq().len() != len).count() == 0 {
             Sequences {
                 s,
                 aligned: true,
                 msa_len: len,
+                alphabet,
             }
         } else {
             Sequences {
                 s,
                 aligned: false,
                 msa_len: 0,
+                alphabet,
             }
         }
     }
@@ -56,7 +71,16 @@ impl Sequences {
         self.msa_len
     }
 
-    pub fn without_gaps(&self) -> Sequences {
+    pub fn alphabet(&self) -> &Alphabet {
+        &self.alphabet
+    }
+
+    pub fn gap_handling(&self) -> &GapHandling {
+        &self.alphabet.gap_handling
+    }
+
+    /// Removes all gaps from the sequences and returns a new Sequences object.
+    pub fn without_gaps(self) -> Sequences {
         let seqs = self
             .s
             .iter()
@@ -70,16 +94,11 @@ impl Sequences {
                 Record::with_attrs(rec.id(), rec.desc(), &sequence)
             })
             .collect();
-        Sequences::new(seqs)
-    }
-
-    /// Converts the given sequences to uppercase and returns a new vector.
-    pub fn make_uppercase(self) -> Sequences {
-        let seqs = self
-            .s
-            .iter()
-            .map(|rec| Record::with_attrs(rec.id(), rec.desc(), &rec.seq().to_ascii_uppercase()))
-            .collect();
-        Sequences::new(seqs)
+        Sequences {
+            s: seqs,
+            aligned: false,
+            msa_len: 0,
+            alphabet: self.alphabet,
+        }
     }
 }
diff --git a/phylo/src/alphabets/alphabets_tests.rs b/phylo/src/alphabets/alphabets_tests.rs
@@ -3,26 +3,26 @@ use rstest::*;
 
 use std::path::PathBuf;
 
-use crate::alignment::Sequences;
-use crate::alphabets::sequence_type;
+use crate::alphabets::detect_alphabet;
 use crate::evolutionary_models::ModelType;
 use crate::io::read_sequences_from_file;
+use crate::phylo_info::GapHandling;
 
 #[rstest]
 #[case::aligned("./data/sequences_DNA1.fasta")]
 #[case::unaligned("./data/sequences_DNA2_unaligned.fasta")]
 #[case::long("./data/sequences_long.fasta")]
 fn dna_type_test(#[case] input: &str) {
     let seqs = read_sequences_from_file(&PathBuf::from(input)).unwrap();
-    let alphabet = sequence_type(&Sequences::new(seqs));
-    assert_matches!(alphabet, ModelType::DNA(_));
+    let alphabet = detect_alphabet(&seqs, &GapHandling::Proper);
+    assert_matches!(alphabet.model_type, ModelType::DNA(_));
 }
 
 #[rstest]
 #[case("./data/sequences_protein1.fasta")]
 #[case("./data/sequences_protein2.fasta")]
 fn protein_type_test(#[case] input: &str) {
     let seqs = read_sequences_from_file(&PathBuf::from(input)).unwrap();
-    let alphabet = sequence_type(&Sequences::new(seqs));
-    assert_matches!(alphabet, ModelType::Protein(_));
+    let alphabet = detect_alphabet(&seqs, &GapHandling::Proper);
+    assert_matches!(alphabet.model_type, ModelType::Protein(_));
 }
diff --git a/phylo/src/alphabets/mod.rs b/phylo/src/alphabets/mod.rs
@@ -1,8 +1,8 @@
 use std::collections::HashSet;
 
+use bio::io::fasta::Record;
 use lazy_static::lazy_static;
 
-use crate::alignment::Sequences;
 use crate::evolutionary_models::{
     DNAModelType,
     ModelType::{self, *},
@@ -18,13 +18,15 @@ pub static NUCLEOTIDES: &[u8] = b"TCAG";
 pub static AMB_NUCLEOTIDES: &[u8] = b"RYSWKMBDHVNZX";
 pub static GAP: u8 = b'-';
 
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone)]
 pub struct Alphabet {
+    model_type: ModelType,
+    pub(crate) gap_handling: GapHandling,
     symbols: &'static [u8],
     ambiguous: &'static [u8],
     char_sets: &'static [FreqVector],
     index: &'static [usize; 255],
-    valid_symbols: HashSet<u8>,
+    valid_symbols: &'static HashSet<u8>,
 }
 
 impl Alphabet {
@@ -55,14 +57,14 @@ impl Alphabet {
     }
 }
 
-pub fn sequence_type(sequences: &Sequences) -> ModelType {
-    let dna_alphabet = dna_alphabet(&GapHandling::Ambiguous);
+pub fn detect_alphabet(sequences: &[Record], gap_handling: &GapHandling) -> Alphabet {
+    let dna_alphabet = dna_alphabet(gap_handling);
     for record in sequences.iter() {
         if !dna_alphabet.is_word(record.seq()) {
-            return Protein(ProteinModelType::UNDEF);
+            return protein_alphabet(gap_handling);
         }
     }
-    DNA(DNAModelType::UNDEF)
+    dna_alphabet
 }
 
 pub fn alphabet_from_type(model_type: ModelType, gap_handling: &GapHandling) -> Alphabet {
@@ -73,46 +75,34 @@ pub fn alphabet_from_type(model_type: ModelType, gap_handling: &GapHandling) ->
 }
 
 fn dna_alphabet(gap_handling: &GapHandling) -> Alphabet {
-    let mut valid_symbols: HashSet<u8> = NUCLEOTIDES.iter().cloned().collect();
-    valid_symbols.extend(AMB_NUCLEOTIDES.iter().cloned());
-    valid_symbols.insert(GAP);
-    match gap_handling {
-        GapHandling::Ambiguous => Alphabet {
-            symbols: NUCLEOTIDES,
-            ambiguous: AMB_NUCLEOTIDES,
-            char_sets: &NUCLEOTIDE_SETS,
-            index: &NUCLEOTIDE_INDEX,
-            valid_symbols,
-        },
-        _ => Alphabet {
-            symbols: NUCLEOTIDES,
-            ambiguous: AMB_NUCLEOTIDES,
-            char_sets: &NUCLEOTIDE_GAP_SETS,
-            index: &NUCLEOTIDE_INDEX,
-            valid_symbols,
-        },
+    let char_sets: &Vec<FreqVector> = match gap_handling {
+        GapHandling::Ambiguous => &NUCLEOTIDE_SETS_AMBIG,
+        _ => &NUCLEOTIDE_SETS_PROPER,
+    };
+    Alphabet {
+        model_type: DNA(DNAModelType::UNDEF),
+        gap_handling: gap_handling.clone(),
+        symbols: NUCLEOTIDES,
+        ambiguous: AMB_NUCLEOTIDES,
+        index: &NUCLEOTIDE_INDEX,
+        valid_symbols: &VALID_NUCLEOTIDES,
+        char_sets,
     }
 }
 
-fn protein_alphabet(gap_handlind: &GapHandling) -> Alphabet {
-    let mut valid_symbols: HashSet<u8> = AMINOACIDS.iter().cloned().collect();
-    valid_symbols.extend(AMB_AMINOACIDS.iter().cloned());
-    valid_symbols.insert(GAP);
-    match gap_handlind {
-        GapHandling::Ambiguous => Alphabet {
-            symbols: AMINOACIDS,
-            ambiguous: AMB_AMINOACIDS,
-            char_sets: &AMINOACID_SETS_AMBIG,
-            index: &AMINOACID_INDEX,
-            valid_symbols,
-        },
-        _ => Alphabet {
-            symbols: AMINOACIDS,
-            ambiguous: AMB_AMINOACIDS,
-            char_sets: &AMINOACID_SETS_PROPER,
-            index: &AMINOACID_INDEX,
-            valid_symbols,
-        },
+fn protein_alphabet(gap_handling: &GapHandling) -> Alphabet {
+    let char_sets: &Vec<FreqVector> = match gap_handling {
+        GapHandling::Ambiguous => &AMINOACID_SETS_AMBIG,
+        _ => &AMINOACID_SETS_PROPER,
+    };
+    Alphabet {
+        model_type: Protein(ProteinModelType::UNDEF),
+        gap_handling: gap_handling.clone(),
+        symbols: AMINOACIDS,
+        ambiguous: AMB_AMINOACIDS,
+        index: &AMINOACID_INDEX,
+        valid_symbols: &VALID_AMINOACIDS,
+        char_sets,
     }
 }
 
@@ -126,7 +116,7 @@ lazy_static! {
         index[GAP as usize] = 4;
         index
     };
-    pub static ref NUCLEOTIDE_GAP_SETS: Vec<FreqVector> = {
+    pub static ref NUCLEOTIDE_SETS_PROPER: Vec<FreqVector> = {
         let mut map = vec![frequencies!(&[0.0; 4]); 255];
         for (i, elem) in map.iter_mut().enumerate() {
             let char = i as u8;
@@ -138,14 +128,21 @@ lazy_static! {
         }
         map
     };
-    pub static ref NUCLEOTIDE_SETS: Vec<FreqVector> = {
+    pub static ref NUCLEOTIDE_SETS_AMBIG: Vec<FreqVector> = {
         let mut map = vec![frequencies!(&[0.0; 4]); 255];
         for (i, elem) in map.iter_mut().enumerate() {
             let char = i as u8;
             elem.set_column(0, &generic_nucleotide_sets(char));
         }
         map
     };
+    pub static ref VALID_NUCLEOTIDES: HashSet<u8> = {
+        NUCLEOTIDES
+            .iter()
+            .chain(AMB_NUCLEOTIDES.iter().chain([GAP].iter()))
+            .cloned()
+            .collect()
+    };
 }
 
 fn generic_nucleotide_sets(char: u8) -> FreqVector {
@@ -207,6 +204,13 @@ lazy_static! {
         }
         map
     };
+    pub static ref VALID_AMINOACIDS: HashSet<u8> = {
+        AMINOACIDS
+            .iter()
+            .chain(AMB_AMINOACIDS.iter().chain([GAP].iter()))
+            .cloned()
+            .collect()
+    };
 }
 
 fn generic_aminoacid_sets(char: u8) -> FreqVector {