diff --git a/mutalyzer/parsers/genbank.py b/mutalyzer/parsers/genbank.py index b85dd531..266def8f 100644 --- a/mutalyzer/parsers/genbank.py +++ b/mutalyzer/parsers/genbank.py @@ -441,8 +441,12 @@ def create_record(self, filename): # the genbank file) are from the original NC reference. We try to # set the .id field to the working value in the caller. record.source_id = biorecord.id - record.source_accession, record.source_version = biorecord.id.split('.')[:2] - record.source_gi = biorecord.annotations['gi'] + try: + record.source_accession, record.source_version = biorecord.id.split('.')[:2] + except ValueError: + record.source_accession = biorecord.id + record.source_version = '1' + record.source_gi = biorecord.annotations.get('gi') record.organism = biorecord.annotations['organism'] # Todo: This will change once we support protein references diff --git a/tests/data/UD_143772172095.gb.bz2 b/tests/data/UD_143772172095.gb.bz2 new file mode 100644 index 00000000..30f97e92 Binary files /dev/null and b/tests/data/UD_143772172095.gb.bz2 differ diff --git a/tests/data/references.yml b/tests/data/references.yml index 17afd29a..6ab753a8 100644 --- a/tests/data/references.yml +++ b/tests/data/references.yml @@ -144,6 +144,10 @@ MARK1: - null - - XM_005273136 - null +ADAC: + accession: UD_143772172095 + checksum: 0b7f7991c1fb50bdfd04d3b0e405ecf3 + filename: UD_143772172095.gb.bz2 NG_008939.1: checksum: 114a03e16ad2f63531d796c2fb0d7039 filename: NG_008939.1.gb.bz2 diff --git a/tests/test_parsers_genbank.py b/tests/test_parsers_genbank.py index 2248318e..f997e89c 100644 --- a/tests/test_parsers_genbank.py +++ b/tests/test_parsers_genbank.py @@ -48,3 +48,17 @@ def test_only_complete_genes_included(settings, references, parser): filename = os.path.join(settings.CACHE_DIR, '%s.gb.bz2' % accession) record = parser.create_record(filename) assert [g.name for g in record.geneList] == ['A1BG'] + +@with_references('ADAC') +def test_no_version(settings, references, parser): + """ + Genbank file without 'version' field, so BioPython record.id is the + accession number without version. Our parser used to crash on that. + + This genbank file was contributed by Gerard Schaafsma (original + source unknown). + """ + accession = references[0].accession + genbank_filename = os.path.join(settings.CACHE_DIR, + '%s.gb.bz2' % accession) + parser.create_record(genbank_filename)