Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Nov 19, 2023
1 parent bb64be2 commit f81a41f
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 10 deletions.
15 changes: 15 additions & 0 deletions src/data/cdot/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use crate::{
self, GeneInfoRecord, TxExonsRecord, TxForRegionRecord, TxIdentityInfo, TxInfoRecord,
TxMappingOptionsRecord, TxSimilarityRecord,
},
sequences::TranslationTable,
};
use biocommons_bioutils::assemblies::{Assembly, ASSEMBLY_INFOS};

Expand Down Expand Up @@ -977,6 +978,15 @@ impl TxProvider {
.get(tx_ac)
.ok_or(Error::NoTranscriptFound(tx_ac.to_string()))?;

let needle = "UGA stop codon recoded as selenocysteine";
let is_selenoprotein = tx.genome_builds.iter().any(|(_, genome_alignment)| {
genome_alignment
.note
.clone()
.unwrap_or_default()
.contains(needle)
});

let hgnc = tx
.gene_name
.as_ref()
Expand All @@ -1003,6 +1013,11 @@ impl TxProvider {
cds_end_i: tx.stop_codon.unwrap_or_default(),
lengths,
hgnc,
translation_table: if is_selenoprotein {
TranslationTable::Selenocysteine
} else {
TranslationTable::Standard
},
})
}

Expand Down
4 changes: 3 additions & 1 deletion src/data/interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use chrono::NaiveDateTime;
use indexmap::IndexMap;

use crate::data::error::Error;
use crate::{data::error::Error, sequences::TranslationTable};
use biocommons_bioutils::assemblies::Assembly;

/// Information about a gene.
Expand Down Expand Up @@ -140,6 +140,8 @@ pub struct TxIdentityInfo {
pub cds_end_i: i32,
pub lengths: Vec<i32>,
pub hgnc: String,
/// The translation table to use for this transcript.
pub translation_table: TranslationTable,
}

/// ```text
Expand Down
24 changes: 21 additions & 3 deletions src/data/uta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use quick_cache::sync::Cache;
use std::fmt::Debug;
use std::sync::Mutex;

use crate::sequences::seq_md5;
use crate::sequences::{seq_md5, TranslationTable};
use biocommons_bioutils::assemblies::{Assembly, ASSEMBLY_INFOS};

use crate::data::{
Expand Down Expand Up @@ -113,18 +113,35 @@ impl TryFrom<Row> for TxForRegionRecord {
}
}

/// HGNC symbols of selenoproteins.
///
/// Transcripts loaded from UTA whose `hgnc` value is contained in this list are
/// translated with `TranslationTable::Selenocysteine`; all other transcripts use
/// `TranslationTable::Standard`.  The lookup is an exact string comparison.
///
/// This is a hand-maintained snapshot, obtained on 2023-11-19 from
/// <https://www.genenames.org/data/genegroup/#!/group/890>.  When updating the
/// list, keep the array length in the type in sync with the number of entries.
const SELENOPROTEIN_SYMBOLS: [&str; 25] = [
"DIO1", "DIO3", "GPX1", "GPX2", "GPX3", "GPX4", "GPX6", "SELENOF", "SELENOH", "SELENOI",
"SELENOK", "SELENOM", "SELENON", "SELENOO", "SELENOP", "MSRB1", "SELENOS", "SELENOT",
"SELENOV", "SELENOW", "DIO2", "SEPHS2", "TXNRD1", "TXNRD2", "TXNRD3",
];

impl TryFrom<Row> for TxIdentityInfo {
    type Error = Error;

    /// Build a `TxIdentityInfo` from one UTA result row.
    ///
    /// Reads the columns `hgnc`, `tx_ac`, `alt_ac`, `alt_aln_method`,
    /// `cds_start_i`, `cds_end_i`, and `lengths` by name (in that order).
    ///
    /// # Errors
    ///
    /// Returns an error as soon as one of the columns cannot be read from
    /// `row`; the underlying error is converted into `Error` by `?`.
    fn try_from(row: Row) -> Result<Self, Self::Error> {
        // Read the gene symbol once as an owned value: it is needed both for
        // the selenoprotein lookup and as the `hgnc` field of the result.
        let hgnc: String = row.try_get("hgnc")?;

        // UTA does not flag selenoproteins (yet), so fall back to matching
        // the gene symbol against the curated list of selenoprotein symbols.
        let translation_table = if SELENOPROTEIN_SYMBOLS.contains(&hgnc.as_str()) {
            TranslationTable::Selenocysteine
        } else {
            TranslationTable::Standard
        };

        Ok(Self {
            tx_ac: row.try_get("tx_ac")?,
            alt_ac: row.try_get("alt_ac")?,
            alt_aln_method: row.try_get("alt_aln_method")?,
            cds_start_i: row.try_get("cds_start_i")?,
            cds_end_i: row.try_get("cds_end_i")?,
            lengths: row.try_get("lengths")?,
            hgnc,
            translation_table,
        })
    }
}
Expand Down Expand Up @@ -783,7 +800,8 @@ mod test {
format!("{:?}", &record),
"TxIdentityInfo { tx_ac: \"ENST00000421528\", alt_ac: \"ENST00000421528\", \
alt_aln_method: \"transcript\", cds_start_i: 0, cds_end_i: 985, lengths: \
[24, 229, 174, 108, 129, 75, 150, 143, 1073], hgnc: \"OMA1\" }",
[24, 229, 174, 108, 129, 75, 150, 143, 1073], hgnc: \"OMA1\", translation_table: \
Standard }",
);

Ok(())
Expand Down
14 changes: 11 additions & 3 deletions src/mapper/altseq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ pub struct RefTranscriptData {
pub cds_stop: i32,
/// Accession of the protein or `MD5_${md5sum}`.
pub protein_accession: String,
/// The translation table to use.
pub translation_table: TranslationTable,
}

impl RefTranscriptData {
Expand Down Expand Up @@ -56,8 +58,7 @@ impl RefTranscriptData {
));
}

let aa_sequence =
translate_cds(tx_seq_to_translate, true, "*", TranslationTable::Standard)?;
let aa_sequence = translate_cds(tx_seq_to_translate, true, "*", tx_info.translation_table)?;
let protein_accession = if let Some(pro_ac) = pro_ac {
pro_ac.to_owned()
} else if let Some(pro_ac) = provider.as_ref().get_pro_ac_for_tx_ac(tx_ac)? {
Expand All @@ -84,6 +85,7 @@ impl RefTranscriptData {
cds_start,
cds_stop,
protein_accession,
translation_table: tx_info.translation_table,
})
}
}
Expand Down Expand Up @@ -130,12 +132,13 @@ impl AltTranscriptData {
ref_aa_sequence: &str,
is_substitution: bool,
is_ambiguous: bool,
translation_table: TranslationTable,
) -> Result<Self, Error> {
let transcript_sequence = seq.to_owned();
let aa_sequence = if !seq.is_empty() {
let seq_cds = &transcript_sequence[((cds_start - 1) as usize)..];
let seq_aa = if variant_start_aa.is_some() {
translate_cds(seq_cds, false, "X", TranslationTable::Standard)?
translate_cds(seq_cds, false, "X", translation_table)?
} else {
ref_aa_sequence.to_owned()
};
Expand Down Expand Up @@ -442,6 +445,7 @@ impl AltSeqBuilder {
&self.reference_data.aa_sequence,
is_substitution,
self.ref_has_multiple_stops && self.first_stop_pos.map(|p| p <= start).unwrap_or(false),
self.reference_data.translation_table,
)
}

Expand Down Expand Up @@ -474,6 +478,7 @@ impl AltSeqBuilder {
&self.reference_data.aa_sequence,
false,
self.ref_has_multiple_stops && self.first_stop_pos.map(|p| p <= start).unwrap_or(false),
self.reference_data.translation_table,
)
}

Expand Down Expand Up @@ -505,6 +510,7 @@ impl AltSeqBuilder {
&self.reference_data.aa_sequence,
false,
self.ref_has_multiple_stops && self.first_stop_pos.map(|p| p <= start).unwrap_or(false),
self.reference_data.translation_table,
)
}

Expand All @@ -520,6 +526,7 @@ impl AltSeqBuilder {
&self.reference_data.aa_sequence,
false,
true,
self.reference_data.translation_table,
)
}

Expand All @@ -535,6 +542,7 @@ impl AltSeqBuilder {
&self.reference_data.aa_sequence,
false,
false,
self.reference_data.translation_table,
)
}
}
Expand Down
1 change: 1 addition & 0 deletions src/mapper/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1252,6 +1252,7 @@ mod test {
cds_end_i: record.cds_end_i,
lengths: Vec::new(),
hgnc: "MOCK".to_string(),
..Default::default()
});
}
}
Expand Down
15 changes: 14 additions & 1 deletion src/sequences.rs
Original file line number Diff line number Diff line change
Expand Up @@ -588,9 +588,22 @@ lazy_static::lazy_static! {
static IUPAC_AMBIGUITY_CODES: &[u8] = b"BDHVNUWSMKRYZ";

/// Allow selection of translation table.
///
/// A value of this type is attached to each transcript by the data providers
/// and passed on to the CDS translation so that the matching genetic code is
/// used.  Variant order is significant for the derived `PartialOrd`/`Ord`.
#[derive(
    Debug,
    Default,
    Clone,
    Copy,
    PartialEq,
    Eq,
    Hash,
    PartialOrd,
    Ord,
    serde::Serialize,
    serde::Deserialize,
)]
pub enum TranslationTable {
    /// The standard genetic code; this is the `Default` value.
    #[default]
    Standard,
    /// Genetic code for selenoproteins, in which the UGA stop codon is
    /// recoded as selenocysteine.
    Selenocysteine,
}

Expand Down
4 changes: 2 additions & 2 deletions tests/data/mapper/real_cp.tsv
Git LFS file not shown

0 comments on commit f81a41f

Please sign in to comment.