From 78dfe638237b6251584a121ed521b7137d0c8a1f Mon Sep 17 00:00:00 2001 From: Aliaksandr Dziarkach <18146690+AliaksandrDziarkach@users.noreply.github.com> Date: Tue, 10 Sep 2024 12:21:44 +0300 Subject: [PATCH] #2336 - Ribose sugar doesn't allow to load IDT custom mixed bases (#2342) --- .../integration/ref/formats/idt_to_ket.py.out | 2 + .../integration/ref/formats/ket_to_idt.py.out | 1 + .../integration/tests/formats/idt_to_ket.py | 2 + .../integration/tests/formats/ket_to_idt.py | 1 + .../formats/ref/idt_rna_dna_mixed_custom.ket | 1109 +++++++++++++++++ core/indigo-core/molecule/monomer_commons.h | 7 +- .../molecule/src/sequence_loader.cpp | 4 +- 7 files changed, 1123 insertions(+), 3 deletions(-) create mode 100644 api/tests/integration/tests/formats/ref/idt_rna_dna_mixed_custom.ket diff --git a/api/tests/integration/ref/formats/idt_to_ket.py.out b/api/tests/integration/ref/formats/idt_to_ket.py.out index 749f5565bb..b7e05e094a 100644 --- a/api/tests/integration/ref/formats/idt_to_ket.py.out +++ b/api/tests/integration/ref/formats/idt_to_ket.py.out @@ -22,6 +22,7 @@ idt_mixed_std.ket:SUCCEED idt_mod_phosphates.ket:SUCCEED idt_modifications.ket:SUCCEED idt_prefix_suffix.ket:SUCCEED +idt_rna_dna_mixed_custom.ket:SUCCEED idt_single_nucleoside.ket:SUCCEED idt_std_phosphates.ket:SUCCEED idt_t_i2moera.ket:SUCCEED @@ -46,5 +47,6 @@ Test 'Ar/3Phos/': got expected error 'Sugar prefix could not be used with modifi Test 'T/52MOErA/': got expected error 'IDT alias '52MOErA' cannot be used at three prime end.' Test 'T/5Phos/': got expected error 'IDT alias '5Phos' cannot be used at three prime end.' Test 'm/5Phos/A': got expected error 'Sugar prefix could not be used with modified monomer.' +Test 'r(B1:50003000)(B1)': got expected error 'Unknown mixed base 'B1'' Test 'r+A': got expected error 'Sugar prefix 'r' whithout base.' Test 'r/5Phos/A': got expected error 'Sugar prefix could not be used with modified monomer.' diff --git a/api/tests/integration/ref/formats/ket_to_idt.py.out b/api/tests/integration/ref/formats/ket_to_idt.py.out index 67ee5b7644..223ce78375 100644 --- a/api/tests/integration/ref/formats/ket_to_idt.py.out +++ b/api/tests/integration/ref/formats/ket_to_idt.py.out @@ -23,6 +23,7 @@ idt_mod_phosphates.ket:SUCCEED idt_modifications.ket:SUCCEED idt_more_than_80_chars.ket:SUCCEED idt_prefix_suffix.ket:SUCCEED +idt_rna_dna_mixed_custom.ket:SUCCEED idt_single_nucleoside.ket:SUCCEED idt_std_phosphates.ket:SUCCEED idt_t_i2moera.ket:SUCCEED diff --git a/api/tests/integration/tests/formats/idt_to_ket.py b/api/tests/integration/tests/formats/idt_to_ket.py index 769b675cca..5ecb097c67 100644 --- a/api/tests/integration/tests/formats/idt_to_ket.py +++ b/api/tests/integration/tests/formats/idt_to_ket.py @@ -58,6 +58,7 @@ def find_diff(a, b): "idt_unsplit": "/5UNSPLIT//iUNSPLIT//3UNSPLIT/", "idt_mixed_std": "ARAS", "idt_mixed_custom": "(N1:10203050)(N1)N", + "idt_rna_dna_mixed_custom": "r(R1:50003000)(R1)", } lib = indigo.loadMonomerLibraryFromFile( @@ -99,6 +100,7 @@ def find_diff(a, b): "/52MOErA/*/3Phos/": "Symbol '*' could be placed only between two nucleotides/nucleosides.", "/52MOErA//32MOErA/*": "Monomer /32MOErA/ doesn't have phosphate, so '*' couldn't be applied.", "/3Phos/*": "Symbol '*' could be placed only between two nucleotides/nucleosides.", + "r(B1:50003000)(B1)": "Unknown mixed base 'B1'", } for idt_seq in sorted(idt_errors.keys()): error = idt_errors[idt_seq] diff --git a/api/tests/integration/tests/formats/ket_to_idt.py b/api/tests/integration/tests/formats/ket_to_idt.py index d6a132af0e..f26f155180 100644 --- a/api/tests/integration/tests/formats/ket_to_idt.py +++ b/api/tests/integration/tests/formats/ket_to_idt.py @@ -63,6 +63,7 @@ def find_diff(a, b): "idt_more_than_80_chars": "/52MOErA//i2MOErA//i2MOErA//i2MOErA//i2MOErA//i2MOErA//i2MOErA//i2MOErA//i2MOErA//i2MOErA//i2MOErA//i2MOErA//3Phos/", "idt_mixed_std": "ARAS", "idt_mixed_custom": "(N1:10203050)(N1)N", + "idt_rna_dna_mixed_custom": "r(R1:50003000)(R1)", } for filename in sorted(idt_data.keys()): diff --git a/api/tests/integration/tests/formats/ref/idt_rna_dna_mixed_custom.ket b/api/tests/integration/tests/formats/ref/idt_rna_dna_mixed_custom.ket new file mode 100644 index 0000000000..86760e2c57 --- /dev/null +++ b/api/tests/integration/tests/formats/ref/idt_rna_dna_mixed_custom.ket @@ -0,0 +1,1109 @@ +{ + "root": { + "nodes": [ + { + "$ref": "monomer0" + }, + { + "$ref": "ambiguousMonomer-1" + }, + { + "$ref": "monomer2" + }, + { + "$ref": "monomer3" + }, + { + "$ref": "ambiguousMonomer-4" + } + ], + "connections": [ + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer0", + "attachmentPointId": "R3" + }, + "endpoint2": { + "monomerId": "ambiguousMonomer-1", + "attachmentPointId": "R1" + } + }, + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer0", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer2", + "attachmentPointId": "R1" + } + }, + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer3", + "attachmentPointId": "R3" + }, + "endpoint2": { + "monomerId": "ambiguousMonomer-4", + "attachmentPointId": "R1" + } + }, + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer2", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer3", + "attachmentPointId": "R1" + } + } + ], + "templates": [ + { + "$ref": "monomerTemplate-A___Adenine" + }, + { + "$ref": "monomerTemplate-G___Guanine" + }, + { + "$ref": "monomerTemplate-R___Ribose" + }, + { + "$ref": "monomerTemplate-P___Phosphate" + }, + { + "$ref": "monomerTemplate-dR___Deoxy-Ribose" + }, + { + "$ref": "ambiguousMonomerTemplate-(R1)" + } + ] + }, + "monomer0": { + "type": "monomer", + "id": "0", + "seqid": 0, + "position": { + "x": 0.000000, + "y": -0.000000 + }, + "alias": "R", + "templateId": "R___Ribose" + }, + "ambiguousMonomer-1": { + "type": "ambiguousMonomer", + "id": "1", + "position": { + "x": 0.000000, + "y": -1.600000 + }, + "seqid": 0, + "alias": "(R1)", + "templateId": "(R1)" + }, + "monomer2": { + "type": "monomer", + "id": "2", + "seqid": 0, + "position": { + "x": 1.600000, + "y": -0.000000 + }, + "alias": "P", + "templateId": "P___Phosphate" + }, + "monomer3": { + "type": "monomer", + "id": "3", + "seqid": 1, + "position": { + "x": 3.200000, + "y": -0.000000 + }, + "alias": "dR", + "templateId": "dR___Deoxy-Ribose" + }, + "ambiguousMonomer-4": { + "type": "ambiguousMonomer", + "id": "4", + "position": { + "x": 3.200000, + "y": -1.600000 + }, + "seqid": 1, + "alias": "(R1)", + "templateId": "(R1)" + }, + "monomerTemplate-A___Adenine": { + "type": "monomerTemplate", + "id": "A___Adenine", + "class": "Base", + "classHELM": "RNA", + "fullName": "Adenine", + "alias": "A", + "naturalAnalogShort": "A", + "attachmentPoints": [ + { + "attachmentAtom": 6, + "type": "left", + "leavingGroup": { + "atoms": [ + 10 + ] + } + } + ], + "atoms": [ + { + "label": "C", + "location": [ + 1.035400, + 0.249800, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.079200, + -0.754000, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -1.505700, + -0.290600, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -1.817700, + 1.176600, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.703100, + 2.180400, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + 0.723500, + 1.717000, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -2.387100, + -1.503400, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -1.505300, + -2.716800, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -0.078700, + -2.253200, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + 2.176800, + -0.120900, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -3.587100, + -1.503400, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 0, + 9 + ] + }, + { + "type": 2, + "atoms": [ + 0, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 8, + 1 + ] + }, + { + "type": 2, + "atoms": [ + 1, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 2, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 4, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 7 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 10 + ] + }, + { + "type": 2, + "atoms": [ + 7, + 8 + ] + } + ] + }, + "monomerTemplate-G___Guanine": { + "type": "monomerTemplate", + "id": "G___Guanine", + "class": "Base", + "classHELM": "RNA", + "fullName": "Guanine", + "alias": "G", + "naturalAnalogShort": "G", + "attachmentPoints": [ + { + "attachmentAtom": 6, + "type": "left", + "leavingGroup": { + "atoms": [ + 11 + ] + } + } + ], + "atoms": [ + { + "label": "C", + "location": [ + 1.035400, + 0.249800, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.079200, + -0.754000, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -1.505700, + -0.290600, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -1.817700, + 1.176600, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.703100, + 2.180400, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + 0.723500, + 1.717000, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -2.387100, + -1.503400, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -1.505300, + -2.716800, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -0.078700, + -2.253200, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 2.176800, + -0.120900, + 0.000000 + ] + }, + { + "label": "N", + "location": [ + -0.952700, + 3.354200, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -3.587100, + -1.503400, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 2, + "atoms": [ + 0, + 9 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 8, + 1 + ] + }, + { + "type": 2, + "atoms": [ + 1, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 2, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 4, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 4, + 10 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 7 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 11 + ] + }, + { + "type": 2, + "atoms": [ + 7, + 8 + ] + } + ] + }, + "monomerTemplate-R___Ribose": { + "type": "monomerTemplate", + "id": "R___Ribose", + "class": "Sugar", + "classHELM": "RNA", + "fullName": "Ribose", + "alias": "R", + "naturalAnalogShort": "R", + "attachmentPoints": [ + { + "attachmentAtom": 9, + "type": "left", + "leavingGroup": { + "atoms": [ + 10 + ] + } + }, + { + "attachmentAtom": 5, + "type": "right", + "leavingGroup": { + "atoms": [ + 11 + ] + } + }, + { + "attachmentAtom": 2, + "type": "side", + "leavingGroup": { + "atoms": [ + 8 + ] + } + } + ], + "atoms": [ + { + "label": "O", + "location": [ + -1.101700, + -1.066300, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.589700, + 0.343600, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "C", + "location": [ + 0.080900, + -1.988900, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "C", + "location": [ + 0.909500, + 0.292400, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "C", + "location": [ + 1.323900, + -1.149300, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "O", + "location": [ + 1.828500, + 1.475500, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 2.451800, + -1.558900, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -1.431000, + 1.583400, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 0.039900, + -3.188100, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + -2.927900, + 1.475500, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -3.601700, + 2.468400, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + 3.017400, + 1.312500, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 7 + ], + "stereo": 6 + }, + { + "type": 1, + "atoms": [ + 2, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 8 + ], + "stereo": 6 + }, + { + "type": 1, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 3, + 5 + ], + "stereo": 1 + }, + { + "type": 1, + "atoms": [ + 4, + 6 + ], + "stereo": 1 + }, + { + "type": 1, + "atoms": [ + 5, + 11 + ] + }, + { + "type": 1, + "atoms": [ + 7, + 9 + ] + }, + { + "type": 1, + "atoms": [ + 9, + 10 + ] + } + ] + }, + "monomerTemplate-P___Phosphate": { + "type": "monomerTemplate", + "id": "P___Phosphate", + "class": "Phosphate", + "classHELM": "RNA", + "fullName": "Phosphate", + "alias": "P", + "naturalAnalogShort": "P", + "attachmentPoints": [ + { + "attachmentAtom": 0, + "type": "left", + "leavingGroup": { + "atoms": [ + 1 + ] + } + }, + { + "attachmentAtom": 0, + "type": "right", + "leavingGroup": { + "atoms": [ + 3 + ] + } + } + ], + "atoms": [ + { + "label": "P", + "location": [ + -0.239900, + 0.000000, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + -1.439900, + 0.000000, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 0.359800, + -1.039400, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 0.960100, + 0.000000, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 0.359800, + 1.039400, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 2, + "atoms": [ + 0, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 4 + ] + } + ] + }, + "monomerTemplate-dR___Deoxy-Ribose": { + "type": "monomerTemplate", + "id": "dR___Deoxy-Ribose", + "class": "Sugar", + "classHELM": "RNA", + "fullName": "Deoxy-Ribose", + "alias": "dR", + "naturalAnalogShort": "R", + "attachmentPoints": [ + { + "attachmentAtom": 8, + "type": "left", + "leavingGroup": { + "atoms": [ + 9 + ] + } + }, + { + "attachmentAtom": 5, + "type": "right", + "leavingGroup": { + "atoms": [ + 10 + ] + } + }, + { + "attachmentAtom": 2, + "type": "side", + "leavingGroup": { + "atoms": [ + 7 + ] + } + } + ], + "atoms": [ + { + "label": "O", + "location": [ + -0.878800, + -1.208000, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -0.366800, + 0.201900, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "C", + "location": [ + 0.303800, + -2.130700, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "C", + "location": [ + 1.132300, + 0.150600, + 0.000000 + ], + "stereoLabel": "abs" + }, + { + "label": "C", + "location": [ + 1.546800, + -1.291000, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 2.051500, + 1.333800, + 0.000000 + ] + }, + { + "label": "C", + "location": [ + -1.208100, + 1.441700, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + 0.262800, + -3.329900, + 0.000000 + ] + }, + { + "label": "O", + "location": [ + -2.705000, + 1.333800, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + -3.378800, + 2.326700, + 0.000000 + ] + }, + { + "label": "H", + "location": [ + 3.240300, + 1.170900, + 0.000000 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 6 + ], + "stereo": 6 + }, + { + "type": 1, + "atoms": [ + 2, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 7 + ], + "stereo": 6 + }, + { + "type": 1, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 3, + 5 + ], + "stereo": 1 + }, + { + "type": 1, + "atoms": [ + 5, + 10 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 8 + ] + }, + { + "type": 1, + "atoms": [ + 8, + 9 + ] + } + ] + }, + "ambiguousMonomerTemplate-(R1)": { + "type": "ambiguousMonomerTemplate", + "subtype": "mixture", + "id": "(R1)", + "alias": "(R1)", + "options": [ + { + "templateId": "A___Adenine", + "ratio": 50.000000 + }, + { + "templateId": "G___Guanine", + "ratio": 30.000000 + } + ] + } +} \ No newline at end of file diff --git a/core/indigo-core/molecule/monomer_commons.h b/core/indigo-core/molecule/monomer_commons.h index 9361d554ff..7a3338f273 100644 --- a/core/indigo-core/molecule/monomer_commons.h +++ b/core/indigo-core/molecule/monomer_commons.h @@ -133,7 +133,10 @@ namespace indigo {{"E", "Q"}, "Z"}, {{"A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "Y"}, "X"}}; static const std::map, std::string> STANDARD_MIXED_BASES_TO_ALIAS = { - {{"A", "G"}, "R"}, {{"C", "T"}, "Y"}, {{"A", "C"}, "M"}, {{"G", "T"}, "K"}, {{"G", "C"}, "S"}, {{"A", "T"}, "W"}, - {{"A", "C", "T"}, "H"}, {{"C", "G", "T"}, "B"}, {{"A", "C", "G"}, "V"}, {{"A", "G", "T"}, "D"}, {{"A", "C", "G", "T"}, "N"}}; + {{"A", "G"}, "R"}, {{"C", "T"}, "Y"}, {{"C", "U"}, "rY"}, {{"A", "C"}, "M"}, {{"G", "T"}, "K"}, + {{"G", "U"}, "rK"}, {{"G", "C"}, "S"}, {{"A", "T"}, "W"}, {{"A", "U"}, "rW"}, {{"A", "C", "T"}, "H"}, + {{"A", "C", "U"}, "rH"}, {{"C", "G", "T"}, "B"}, {{"C", "G", "U"}, "rB"}, {{"A", "C", "G"}, "V"}, {{"A", "G", "T"}, "D"}, + {{"A", "G", "U"}, "rD"}, {{"A", "C", "G", "T"}, "N"}, {{"A", "C", "G", "U"}, "rN"}}; + static const std::set RNA_DNA_MIXED_BASES = {"R", "M", "S", "V"}; } #endif \ No newline at end of file diff --git a/core/indigo-core/molecule/src/sequence_loader.cpp b/core/indigo-core/molecule/src/sequence_loader.cpp index 315ce0e671..8a3c223c69 100644 --- a/core/indigo-core/molecule/src/sequence_loader.cpp +++ b/core/indigo-core/molecule/src/sequence_loader.cpp @@ -1093,8 +1093,10 @@ void SequenceLoader::loadIdt(KetDocument& document) mixed_base = mixed_base[0]; } } - if (sugar == "R") + if (sugar == "R" && RNA_DNA_MIXED_BASES.count(mixed_base) == 0) + { idt_alias = 'r' + idt_alias; + } if (!document.hasVariantMonomerTemplate(idt_alias)) { auto it = STANDARD_MIXED_BASES.find(mixed_base);