Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: use array instead of Vec for Codon representation (and other free perf improvements) #169

Merged
merged 1 commit into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ serde = { version = "1.0", features = ["derive"] }
thiserror = "1.0"
indexmap = { version = "2", features = ["serde"] }
biocommons-bioutils = "0.1.0"
ahash = "0.8.11"

[dev-dependencies]
anyhow = "1.0"
Expand Down
75 changes: 41 additions & 34 deletions src/sequences.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
//!
//! Partially ported over from `bioutils.sequences`.

use ahash::AHashMap;
use md5::{Digest, Md5};
use rustc_hash::FxHashMap;

pub use crate::sequences::error::Error;

Expand Down Expand Up @@ -713,42 +713,48 @@ lazy_static::lazy_static! {
("YTR", "L"),
];

static ref AA1_TO_AA3: FxHashMap<&'static [u8], &'static str> = {
let mut m = FxHashMap::default();
static ref AA1_TO_AA3: AHashMap<&'static [u8], &'static str> = {
let mut m = AHashMap::default();
for (aa3, aa1) in AA3_TO_AA1_VEC.iter() {
m.insert(aa1.as_bytes(), *aa3);
}
m
};

static ref AA3_TO_AA1: FxHashMap<&'static [u8], &'static str> = {
let mut m = FxHashMap::default();
static ref AA3_TO_AA1: AHashMap<&'static [u8], &'static str> = {
let mut m = AHashMap::default();
for (aa3, aa1) in AA3_TO_AA1_VEC.iter() {
m.insert(aa3.as_bytes(), *aa1);
}
m
};

static ref DNA_TO_AA1_LUT: FxHashMap<Vec<u8>, u8> = {
let mut m = FxHashMap::default();
static ref DNA_TO_AA1_LUT: AHashMap<Codon, u8> = {
let mut m = AHashMap::default();
for (dna, aa1) in DNA_TO_AA1_LUT_VEC.iter() {
m.insert(Vec::from(dna.as_bytes()), aa1.as_bytes()[0]);
assert_eq!(dna.len(), 3);
let d = dna.as_bytes();
m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]);
}
m
};

static ref DNA_TO_AA1_SEC: FxHashMap<Vec<u8>, u8> = {
let mut m = FxHashMap::default();
static ref DNA_TO_AA1_SEC: AHashMap<Codon, u8> = {
let mut m = AHashMap::default();
for (dna, aa1) in DNA_TO_AA1_SEC_VEC.iter() {
m.insert(Vec::from(dna.as_bytes()), aa1.as_bytes()[0]);
assert_eq!(dna.len(), 3);
let d = dna.as_bytes();
m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]);
}
m
};

static ref DNA_TO_AA1_CHRMT_VERTEBRATE: FxHashMap<Vec<u8>, u8> = {
let mut m = FxHashMap::default();
static ref DNA_TO_AA1_CHRMT_VERTEBRATE: AHashMap<Codon, u8> = {
let mut m = AHashMap::default();
for (dna, aa1) in DNA_TO_AA1_CHRMT_VERTEBRATE_VEC.iter() {
m.insert(Vec::from(dna.as_bytes()), aa1.as_bytes()[0]);
assert_eq!(dna.len(), 3);
let d = dna.as_bytes();
m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]);
}
m
};
Expand Down Expand Up @@ -926,6 +932,8 @@ fn looks_like_aa3_p(seq: &str) -> bool {
seq.len() % 3 == 0 && seq.chars().nth(1).map(|c| c.is_lowercase()).unwrap_or(true)
}

type Codon = [u8; 3];

/// Allow translation of `&[u8]` DNA codons to `u8` amino acids.
///
/// We use separate structs here to encapsulate getting the lazy static global data.
Expand All @@ -940,10 +948,10 @@ struct CodonTranslator {
/// Mapping from 2bit DNA codon to amino acid 1-letter ASCII.
codon_2bit_to_aa1: &'static [u8; 64],
/// Mapping from normalized ASCII DNA codon to amino acid 1-letter ASCII, including degenerate codons.
full_dna_to_aa1: &'static FxHashMap<Vec<u8>, u8>,
full_dna_to_aa1: &'static AHashMap<Codon, u8>,

/// Buffer.
codon: Vec<u8>,
codon: Codon,
}

impl CodonTranslator {
Expand All @@ -965,7 +973,7 @@ impl CodonTranslator {
TranslationTable::VertebrateMitochondrial => &DNA_TO_AA1_CHRMT_VERTEBRATE,
},

codon: Vec::with_capacity(3),
codon: [0; 3],
}
}

Expand All @@ -981,31 +989,31 @@ impl CodonTranslator {
pub fn translate(&mut self, codon: &[u8]) -> Result<u8, Error> {
// Normalize (to upper case etc.) codon.
self.normalize_codon(codon);
// Attempt fast translation of codon.
if let Some(aa) = self.codon_to_aa1(&self.codon) {
return Ok(aa);
}
if let Some(aa) = self.full_dna_to_aa1.get(&self.codon) {

let translation = self
// Attempt fast translation of codon
.codon_to_aa1(&self.codon)
// Fast translation fails, but slower hash map succeeded.
Ok(*aa)
} else {
.or_else(|| self.full_dna_to_aa1.get(&self.codon).copied())
// If the codon contains an IUPAC ambiguity code, set aa to X; otherwise, return an error.
for c in codon.iter() {
if self.iupac_ambiguity_codes.contains(c) {
return Ok(b'X');
}
}
return Err(Error::UndefinedCodon(
.or_else(|| {
codon
.iter()
.any(|c| self.iupac_ambiguity_codes.contains(c))
.then_some(b'X')
});
translation.ok_or_else(|| {
Error::UndefinedCodon(
std::str::from_utf8(codon)
.expect("cannot decode UTF-8")
.to_owned(),
));
}
)
})
}

fn dna3_to_2bit(&self, c: &[u8]) -> Option<u8> {
let mut result = 0;
for i in c.iter().take(3) {
for i in &c[..3] {
result <<= 2;
let tmp = self.dna_ascii_to_2bit[*i as usize];
if tmp == 255 {
Expand All @@ -1018,7 +1026,6 @@ impl CodonTranslator {

/// Helper function to extract normalized codon to `self.codon`.
fn normalize_codon(&mut self, codon: &[u8]) {
self.codon.resize(3, 0);
for (i, c) in codon.iter().enumerate() {
self.codon[i] = self.dna_ascii_map[*c as usize];
}
Expand Down
Loading