From 5819db52fe806dee5b428579aefd6d29aac7687f Mon Sep 17 00:00:00 2001 From: Aliaksandr Dziarkach <18146690+AliaksandrDziarkach@users.noreply.github.com> Date: Fri, 13 Sep 2024 14:42:19 +0300 Subject: [PATCH] #2341 Save to Sequence and FASTA doesn't work for monomers loaded from HELM with inline SMILES Fix FASTA export. Add UT. --- .../ref/formats/ket_to_fasta.py.out | 1 + .../integration/tests/formats/ket_to_fasta.py | 5 +- .../formats/molecules/2341-no-analog.ket | 784 ++++++++++++++++++ .../tests/formats/ref/2341-no-analog.fasta | 2 + .../molecule/src/sequence_saver.cpp | 22 +- 5 files changed, 809 insertions(+), 5 deletions(-) create mode 100644 api/tests/integration/tests/formats/molecules/2341-no-analog.ket create mode 100644 api/tests/integration/tests/formats/ref/2341-no-analog.fasta diff --git a/api/tests/integration/ref/formats/ket_to_fasta.py.out b/api/tests/integration/ref/formats/ket_to_fasta.py.out index 34d04436b8..f4d9f28502 100644 --- a/api/tests/integration/ref/formats/ket_to_fasta.py.out +++ b/api/tests/integration/ref/formats/ket_to_fasta.py.out @@ -2,4 +2,5 @@ 1822-peptide.fasta:SUCCEED 1843-rna.fasta:SUCCEED 1950-mixed-seq.fasta:SUCCEED +2341-no-analog.fasta:SUCCEED nucleotides.fasta:SUCCEED diff --git a/api/tests/integration/tests/formats/ket_to_fasta.py b/api/tests/integration/tests/formats/ket_to_fasta.py index b0557a5d4d..e3c11ea62b 100644 --- a/api/tests/integration/tests/formats/ket_to_fasta.py +++ b/api/tests/integration/tests/formats/ket_to_fasta.py @@ -27,6 +27,7 @@ def find_diff(a, b): "1843-rna", "1950-mixed-seq", "nucleotides", + "2341-no-analog", ] lib = indigo.loadMonomerLibraryFromFile( @@ -35,9 +36,9 @@ def find_diff(a, b): files.sort() for filename in files: - mol = indigo.loadMoleculeFromFile(os.path.join(root, filename + ".ket")) + mol = indigo.loadKetDocumentFromFile(os.path.join(root, filename + ".ket")) # with open(os.path.join(ref_path, filename) + ".fasta", "w") as file: - # file.write(mol.fasta()) + # file.write(mol.fasta(lib)) with open(os.path.join(ref_path, filename) + ".fasta", "r") as file: seq_ref = file.read() seq = mol.fasta(lib) diff --git a/api/tests/integration/tests/formats/molecules/2341-no-analog.ket b/api/tests/integration/tests/formats/molecules/2341-no-analog.ket new file mode 100644 index 0000000000..59c37ff9c0 --- /dev/null +++ b/api/tests/integration/tests/formats/molecules/2341-no-analog.ket @@ -0,0 +1,784 @@ +{ + "root": { + "nodes": [ + { + "$ref": "monomer65" + }, + { + "$ref": "monomer66" + }, + { + "$ref": "monomer67" + }, + { + "$ref": "monomer68" + } + ], + "connections": [ + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer65", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer66", + "attachmentPointId": "R1" + } + }, + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer66", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer67", + "attachmentPointId": "R1" + } + }, + { + "connectionType": "single", + "endpoint1": { + "monomerId": "monomer67", + "attachmentPointId": "R2" + }, + "endpoint2": { + "monomerId": "monomer68", + "attachmentPointId": "R1" + } + } + ], + "templates": [ + { + "$ref": "monomerTemplate-Mod0" + }, + { + "$ref": "monomerTemplate-Mod1" + }, + { + "$ref": "monomerTemplate-Mod2" + }, + { + "$ref": "monomerTemplate-Mod3" + } + ] + }, + "monomer65": { + "type": "monomer", + "id": "65", + "position": { + "x": 17.500000000000004, + "y": -11.3625 + }, + "alias": "Mod0", + "templateId": "Mod0", + "seqid": 1 + }, + "monomerTemplate-Mod0": { + "type": "monomerTemplate", + "atoms": [ + { + "label": "N", + "location": [ + -0.5, + 0.866025, + 0 + ] + }, + { + "label": "C", + "location": [ + 0, + 0, + 0 + ] + }, + { + "label": "C", + "location": [ + 1, + 0, + 0 + ] + }, + { + "label": "O", + "location": [ + 1.5, + 0.866025, + 0 + ] + }, + { + "label": "C", + "location": [ + -0.5, + -0.866025, + 0 + ] + }, + { + "label": "H", + "location": [ + 1.5, + -0.866025, + 0 + ] + }, + { + "label": "H", + "location": [ + -1.5, + 0.866025, + 0 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 2 + ] + }, + { + "type": 2, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 5, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 0 + ] + } + ], + "class": "AminoAcid", + "id": "Mod0", + "fullName": "Mod0", + "alias": "Mod0", + "attachmentPoints": [ + { + "attachmentAtom": 0, + "label": "R1", + "leavingGroup": { + "atoms": [ + 6 + ] + } + }, + { + "attachmentAtom": 2, + "label": "R2", + "leavingGroup": { + "atoms": [ + 5 + ] + } + } + ], + "naturalAnalogShort": "" + }, + "monomer66": { + "type": "monomer", + "id": "66", + "position": { + "x": 19.100000000000005, + "y": -11.3625 + }, + "alias": "Mod1", + "templateId": "Mod1", + "seqid": 2 + }, + "monomerTemplate-Mod1": { + "type": "monomerTemplate", + "atoms": [ + { + "label": "C", + "location": [ + 1, + 0, + 0 + ] + }, + { + "label": "C", + "location": [ + 0, + 0, + 0 + ] + }, + { + "label": "C", + "location": [ + -0.5, + -0.866025, + 0 + ] + }, + { + "label": "S", + "location": [ + -1.5, + -0.866025, + 0 + ] + }, + { + "label": "N", + "location": [ + -0.5, + 0.866025, + 0 + ] + }, + { + "label": "O", + "location": [ + 1.5, + -0.866025, + 0 + ] + }, + { + "label": "H", + "location": [ + 1.5, + 0.866025, + 0 + ] + }, + { + "label": "H", + "location": [ + -1.5, + 0.866026, + 0 + ] + }, + { + "label": "H", + "location": [ + -2, + -1.73205, + 0 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 4 + ] + }, + { + "type": 2, + "atoms": [ + 0, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 6, + 0 + ] + }, + { + "type": 1, + "atoms": [ + 7, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 8, + 3 + ] + } + ], + "class": "AminoAcid", + "id": "Mod1", + "fullName": "Mod1", + "alias": "Mod1", + "attachmentPoints": [ + { + "attachmentAtom": 4, + "label": "R1", + "leavingGroup": { + "atoms": [ + 7 + ] + } + }, + { + "attachmentAtom": 0, + "label": "R2", + "leavingGroup": { + "atoms": [ + 6 + ] + } + }, + { + "attachmentAtom": 3, + "label": "R3", + "leavingGroup": { + "atoms": [ + 8 + ] + } + } + ], + "naturalAnalogShort": "" + }, + "monomer67": { + "type": "monomer", + "id": "67", + "position": { + "x": 20.700000000000003, + "y": -11.3625 + }, + "alias": "Mod2", + "templateId": "Mod2", + "seqid": 3 + }, + "monomerTemplate-Mod2": { + "type": "monomerTemplate", + "atoms": [ + { + "label": "C", + "location": [ + -0.866025, + -0.5, + 0 + ] + }, + { + "label": "C", + "location": [ + 0, + 0, + 0 + ] + }, + { + "label": "C", + "location": [ + 0.866025, + -0.5, + 0 + ] + }, + { + "label": "C", + "location": [ + 1.732051, + 0, + 0 + ] + }, + { + "label": "O", + "location": [ + 2.598076, + -0.5, + 0 + ] + }, + { + "label": "O", + "location": [ + 1.732051, + 1, + 0 + ] + }, + { + "label": "N", + "location": [ + 0, + 1, + 0 + ] + }, + { + "label": "O", + "location": [ + -0.866026, + -1.5, + 0 + ] + }, + { + "label": "H", + "location": [ + -0.866025, + 1.5, + 0 + ] + }, + { + "label": "H", + "location": [ + -1.732051, + 0, + 0 + ] + }, + { + "label": "H", + "location": [ + 3.464102, + 0, + 0 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 3, + 4 + ] + }, + { + "type": 2, + "atoms": [ + 3, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 6 + ] + }, + { + "type": 2, + "atoms": [ + 0, + 7 + ] + }, + { + "type": 1, + "atoms": [ + 8, + 6 + ] + }, + { + "type": 1, + "atoms": [ + 9, + 0 + ] + }, + { + "type": 1, + "atoms": [ + 10, + 4 + ] + } + ], + "class": "AminoAcid", + "id": "Mod2", + "fullName": "Mod2", + "alias": "Mod2", + "attachmentPoints": [ + { + "attachmentAtom": 6, + "label": "R1", + "leavingGroup": { + "atoms": [ + 8 + ] + } + }, + { + "attachmentAtom": 0, + "label": "R2", + "leavingGroup": { + "atoms": [ + 9 + ] + } + }, + { + "attachmentAtom": 4, + "label": "R3", + "leavingGroup": { + "atoms": [ + 10 + ] + } + } + ], + "naturalAnalogShort": "" + }, + "monomer68": { + "type": "monomer", + "id": "68", + "position": { + "x": 22.300000000000004, + "y": -11.3625 + }, + "alias": "Mod3", + "templateId": "Mod3", + "seqid": 4 + }, + "monomerTemplate-Mod3": { + "type": "monomerTemplate", + "atoms": [ + { + "label": "C", + "location": [ + 0.866025, + -0.5, + 0 + ] + }, + { + "label": "C", + "location": [ + 0, + 0, + 0 + ] + }, + { + "label": "C", + "location": [ + -0.866025, + -0.5, + 0 + ] + }, + { + "label": "O", + "location": [ + -0.866025, + -1.5, + 0 + ] + }, + { + "label": "N", + "location": [ + 0, + 1, + 0 + ] + }, + { + "label": "C", + "location": [ + 1.732051, + 0, + 0 + ] + }, + { + "label": "C", + "location": [ + 1.73205, + 1, + 0 + ] + }, + { + "label": "C", + "location": [ + 2.598076, + -0.5, + 0 + ] + }, + { + "label": "H", + "location": [ + -1.732051, + 0, + 0 + ] + }, + { + "label": "H", + "location": [ + -0.866026, + 1.5, + 0 + ] + } + ], + "bonds": [ + { + "type": 1, + "atoms": [ + 0, + 1 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 2 + ] + }, + { + "type": 2, + "atoms": [ + 2, + 3 + ] + }, + { + "type": 1, + "atoms": [ + 1, + 4 + ] + }, + { + "type": 1, + "atoms": [ + 0, + 5 + ] + }, + { + "type": 1, + "atoms": [ + 5, + 6 + ] + }, + { + "type": 1, + "atoms": [ + 5, + 7 + ] + }, + { + "type": 1, + "atoms": [ + 8, + 2 + ] + }, + { + "type": 1, + "atoms": [ + 9, + 4 + ] + } + ], + "class": "AminoAcid", + "id": "Mod3", + "fullName": "Mod3", + "alias": "Mod3", + "attachmentPoints": [ + { + "attachmentAtom": 4, + "label": "R1", + "leavingGroup": { + "atoms": [ + 9 + ] + } + }, + { + "attachmentAtom": 2, + "label": "R2", + "leavingGroup": { + "atoms": [ + 8 + ] + } + } + ], + "naturalAnalogShort": "" + } +} \ No newline at end of file diff --git a/api/tests/integration/tests/formats/ref/2341-no-analog.fasta b/api/tests/integration/tests/formats/ref/2341-no-analog.fasta new file mode 100644 index 0000000000..b85d38217c --- /dev/null +++ b/api/tests/integration/tests/formats/ref/2341-no-analog.fasta @@ -0,0 +1,2 @@ +>Sequence1 +XXXX \ No newline at end of file diff --git a/core/indigo-core/molecule/src/sequence_saver.cpp b/core/indigo-core/molecule/src/sequence_saver.cpp index 15df4d0264..06c84c9806 100644 --- a/core/indigo-core/molecule/src/sequence_saver.cpp +++ b/core/indigo-core/molecule/src/sequence_saver.cpp @@ -769,7 +769,8 @@ void SequenceSaver::saveKetDocument(KetDocument& doc, SeqFormat sf) auto monomer_alias = monomer->alias(); if (monomer_class == MonomerClass::CHEM) throw Error("Can't save chem '%s' to sequence format", monomer_alias.c_str()); - if (monomer_class == MonomerClass::Sugar || monomer_class == MonomerClass::Phosphate) + if (monomer_class == MonomerClass::Sugar || monomer_class == MonomerClass::Phosphate || + (monomer_class == MonomerClass::Base && sequence.size() == 1)) continue; if (monomer_alias.size() > 1 || @@ -778,8 +779,23 @@ void SequenceSaver::saveKetDocument(KetDocument& doc, SeqFormat sf) (monomer_class == MonomerClass::Base && STANDARD_NUCLEOTIDES.count(monomer_alias) == 0 && STANDARD_MIXED_BASES.count(monomer_alias) == 0)) { const auto& monomer_template = doc.templates().at(monomer->templateId()); - if (monomer_template.hasStringProp("naturalAnalogShort")) - monomer_alias = monomer_template.getStringProp("naturalAnalogShort"); + std::string short_analog; + if (monomer_template.hasStringProp("naturalAnalog")) + { + std::string analog = monomer_template.getStringProp("naturalAnalog"); + short_analog = monomerAliasByName(MonomerTemplate::MonomerClassToStr(monomer_class), analog); + if (short_analog == analog && analog.size() > 1) + short_analog = ""; + } + if (short_analog.size() == 0 && monomer_template.hasStringProp("naturalAnalogShort")) + { + short_analog = monomer_template.getStringProp("naturalAnalogShort"); + } + + if (short_analog.size() == 1) + monomer_alias = short_analog; + else if (monomer_class == MonomerClass::AminoAcid) + monomer_alias = "X"; else throw Error("Can't save '%s' to sequence format", monomer_alias.c_str()); }