From 546c9e855e88a60792203655b2e426e45f24a060 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Sat, 21 Oct 2023 13:48:57 +0200 Subject: [PATCH] fix: problem with annotating stop_retained insertions (#131) --- Cargo.toml | 3 +- src/mapper/altseq.rs | 13 +++-- ...gvs__mapper__variant__test__issue_131.snap | 10 ++++ src/mapper/variant.rs | 13 +++++ src/parser/ds.rs | 52 +++++++++---------- tests/data/data/bootstrap.sh | 1 + tests/data/data/uta_20210129-subset.pgd.gz | 4 +- tests/data/seqrepo_cache.fasta | 4 +- 8 files changed, 65 insertions(+), 35 deletions(-) create mode 100644 src/mapper/snapshots/hgvs__mapper__variant__test__issue_131.snap diff --git a/Cargo.toml b/Cargo.toml index 53b6d19..211719c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,13 +31,14 @@ seqrepo = { version = "0.8", features = ["cached"] } serde_json = "1.0" serde = { version = "1.0", features = ["derive"] } thiserror = "1.0" -indexmap = { version = "2.0.0", features = ["serde"] } +indexmap = { version = "2", features = ["serde"] } [dev-dependencies] anyhow = "1.0" criterion = "0.5" csv = "1.2" env_logger = "0.10" +insta = { version = "1", features = ["yaml"] } pretty_assertions = "1.3" rstest = "0.18" test-log = "0.2" diff --git a/src/mapper/altseq.rs b/src/mapper/altseq.rs index 0ad9585..9e301de 100644 --- a/src/mapper/altseq.rs +++ b/src/mapper/altseq.rs @@ -88,6 +88,7 @@ impl RefTranscriptData { } } +#[derive(Debug, Clone)] pub struct AltTranscriptData { /// Transcript nucleotide sequence. #[allow(dead_code)] @@ -697,10 +698,14 @@ impl AltSeqToHgvsp { .last() .expect("should not happen; checked for being non-empty above") + 1; - ( - format!("{}{}", deletion, &ref_sub[..max_diff]), - format!("{}{}", insertion, &alt_sub[..max_diff]), - ) + if max_diff > ref_sub.len() || max_diff > alt_sub.len() { + (deletion.clone(), insertion.clone()) + } else { + ( + format!("{}{}", deletion, &ref_sub[..max_diff]), + format!("{}{}", insertion, &alt_sub[..max_diff]), + ) + } } else { (deletion, insertion) }; diff --git a/src/mapper/snapshots/hgvs__mapper__variant__test__issue_131.snap b/src/mapper/snapshots/hgvs__mapper__variant__test__issue_131.snap new file mode 100644 index 0000000..5071dd6 --- /dev/null +++ b/src/mapper/snapshots/hgvs__mapper__variant__test__issue_131.snap @@ -0,0 +1,10 @@ +--- +source: src/mapper/variant.rs +expression: "&var_p_test" +--- +ProtVariant: + accession: + value: NP_001240838.1 + gene_symbol: ~ + loc_edit: NoChange + diff --git a/src/mapper/variant.rs b/src/mapper/variant.rs index a57b44e..103c8a8 100644 --- a/src/mapper/variant.rs +++ b/src/mapper/variant.rs @@ -959,6 +959,19 @@ mod test { use super::{Config, Mapper}; + #[test] + fn issue_131() -> Result<(), Error> { + let mapper = build_mapper()?; + + let var_c = HgvsVariant::from_str("NM_001253909.2:c.416_417insGTG")?; + let var_p_test = mapper.c_to_p(&var_c, None)?; + + assert_eq!(format!("{}", &var_p_test), "NP_001240838.1:p.="); + insta::assert_yaml_snapshot!(&var_p_test); + + Ok(()) + } + #[test] fn test_sync() { fn is_sync() {} diff --git a/src/parser/ds.rs b/src/parser/ds.rs index 1913240..c577b5f 100644 --- a/src/parser/ds.rs +++ b/src/parser/ds.rs @@ -6,7 +6,7 @@ use crate::parser::error::Error; use log::warn; /// Expression of "maybe uncertain". -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub enum Mu { /// Certain variant of `T`. Certain(T), @@ -53,7 +53,7 @@ impl Mu { } /// Representation of gene symbol, e.g., `TTN` or `Ttn`. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct GeneSymbol { pub value: String, } @@ -79,7 +79,7 @@ impl Deref for GeneSymbol { } /// Edit of nucleic acids. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub enum NaEdit { /// A substitution where both reference and alternative allele are nucleic acid strings /// (or empty). @@ -189,7 +189,7 @@ impl NaEdit { } /// Uncertain change through extension. -#[derive(Clone, Debug, PartialEq, Default)] +#[derive(Clone, Debug, PartialEq, Default, serde::Serialize, serde::Deserialize)] pub enum UncertainLengthChange { #[default] None, @@ -198,7 +198,7 @@ pub enum UncertainLengthChange { } /// Representation of accession, e.g., `NM_01234.5`. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct Accession { pub value: String, } @@ -224,7 +224,7 @@ impl Accession { } /// Protein edit with interval end edit. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub enum ProteinEdit { Fs { alternative: Option, @@ -259,7 +259,7 @@ pub enum ProteinEdit { } /// A HGVS variant specification. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub enum HgvsVariant { /// Variant specification with `c.` location. CdsVariant { @@ -563,7 +563,7 @@ impl HgvsVariant { } /// Coding sequence location with edit. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct CdsLocEdit { /// Location on the CDS. pub loc: Mu, @@ -596,7 +596,7 @@ impl CdsLocEdit { } /// CDS position interval. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct CdsInterval { /// Start position pub start: CdsPos, @@ -627,14 +627,14 @@ impl TryFrom for Range { /// Specifies whether the CDS position is relative to the CDS start or /// CDS end. -#[derive(Clone, Copy, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub enum CdsFrom { Start, End, } /// CDS position. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct CdsPos { /// Base position. pub base: i32, @@ -645,7 +645,7 @@ pub struct CdsPos { } /// Genome sequence location with edit. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct GenomeLocEdit { /// Location on the genome. pub loc: Mu, @@ -678,7 +678,7 @@ impl GenomeLocEdit { } /// Genome position interval. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct GenomeInterval { /// Start position pub start: Option, @@ -701,7 +701,7 @@ impl TryInto> for GenomeInterval { } /// Mitochondrial sequence location with edit. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct MtLocEdit { /// Location on the mitochondrium. pub loc: Mu, @@ -733,7 +733,7 @@ impl MtLocEdit { } } /// Mitochondrial position interval. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct MtInterval { /// Start position pub start: Option, @@ -756,7 +756,7 @@ impl TryInto> for MtInterval { } /// Transcript sequence location with edit. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct TxLocEdit { /// Loction on a transcript. pub loc: Mu, @@ -789,7 +789,7 @@ impl TxLocEdit { } /// Transcript position interval. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct TxInterval { /// Start position pub start: TxPos, @@ -813,7 +813,7 @@ impl From for Range { } /// Transcript position. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct TxPos { /// Base position. pub base: i32, @@ -822,7 +822,7 @@ pub struct TxPos { } /// RNA sequence location with edit. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct RnaLocEdit { /// Location on a transcript. pub loc: Mu, @@ -854,7 +854,7 @@ impl RnaLocEdit { } } /// RNA position interval. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct RnaInterval { /// Start position pub start: RnaPos, @@ -878,7 +878,7 @@ impl From for Range { } /// RNA position. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct RnaPos { /// Base position. pub base: i32, @@ -887,7 +887,7 @@ pub struct RnaPos { } /// Protein sequence location with edit or special. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub enum ProtLocEdit { Ordinary { loc: Mu, @@ -908,7 +908,7 @@ pub enum ProtLocEdit { } /// Protein position interval. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct ProtInterval { /// Start position pub start: ProtPos, @@ -927,7 +927,7 @@ impl From for Range { } /// Protein position. -#[derive(Clone, Debug, PartialEq, Default)] +#[derive(Clone, Debug, PartialEq, Default, serde::Serialize, serde::Deserialize)] pub struct ProtPos { /// Amino acid value. pub aa: String, @@ -973,13 +973,13 @@ mod test { assert_eq!(Mu::from(Some(1), false), Mu::Uncertain(Some(1))); } - #[derive(Clone, Debug, PartialEq)] + #[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct TestInterval { pub start: TestPos, pub end: TestPos, } - #[derive(Clone, Debug, PartialEq)] + #[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] pub struct TestPos { pub base: i32, pub offset: Option, diff --git a/tests/data/data/bootstrap.sh b/tests/data/data/bootstrap.sh index ffbab02..d28a084 100644 --- a/tests/data/data/bootstrap.sh +++ b/tests/data/data/bootstrap.sh @@ -67,6 +67,7 @@ ADGRL3 ADRA2B ADRB2 AGBL5 +AKR1C3 ALG9 AOAH ASB18 diff --git a/tests/data/data/uta_20210129-subset.pgd.gz b/tests/data/data/uta_20210129-subset.pgd.gz index bb5b891..8f2ea26 100644 --- a/tests/data/data/uta_20210129-subset.pgd.gz +++ b/tests/data/data/uta_20210129-subset.pgd.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:280b647608377b041e4bf084159e3903a94d957704c1e9b0381cd3ad58127595 -size 1945350 +oid sha256:1509d58b13c96381d36253952aaa3b2806a14c24506aba065559e73410e783f1 +size 1955694 diff --git a/tests/data/seqrepo_cache.fasta b/tests/data/seqrepo_cache.fasta index 6118e39..3413c31 100644 --- a/tests/data/seqrepo_cache.fasta +++ b/tests/data/seqrepo_cache.fasta @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce29049101f0c60081c12f64debb272e79e2f1cb519879450963c7c9af6791cc -size 662064 +oid sha256:4c6bf18bf1961d2b3856ee78b4853b4227ac20e215e85c5c24bc99933d5c3ba7 +size 663119