diff --git a/CHANGES.md b/CHANGES.md index 0a56c1c79..734a2b83d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -32,6 +32,8 @@ * If `TreeTimeError` is encountered Augur now exits with code 2 rather than 0. (This restores the original behaviour.) [#1367][] (@jameshadfield) * ancestral, translate: Avoid incompatibilities with Biopython 1.82. [#1374][] (@victorlin) * ancestral, translate: Address Biopython deprecation warnings. [#1379][] (@victorlin) +* ancestral: Previously, the help text for `--genes` falsely claimed that it could accept a file. Now, it can truly claim that. [#1353][] (@victorlin) +* Deprecate `read_strains` from `augur.utils` and add it to the public API under `augur.io`. [#1353][] (@victorlin) [#1344]: https://github.com/nextstrain/augur/pull/1344 @@ -43,6 +45,7 @@ [#1374]: https://github.com/nextstrain/augur/pull/1374 [#1379]: https://github.com/nextstrain/augur/pull/1379 [#1352]: https://github.com/nextstrain/augur/pull/1352 +[#1353]: https://github.com/nextstrain/augur/pull/1353 ## 23.1.1 (7 November 2023) diff --git a/DEPRECATED.md b/DEPRECATED.md index 22eb6f35c..18fe3938c 100644 --- a/DEPRECATED.md +++ b/DEPRECATED.md @@ -4,6 +4,16 @@ These features are deprecated, which means they are no longer maintained and will go away in a future major version of Augur. They are currently still available for backwards compatibility, but should not be used in new code. +## `augur.utils.read_strains` + +*Deprecated in December 2023. Planned for removal March 2024 or after.* + +This is part of a [larger effort](https://github.com/nextstrain/augur/issues/1011) +to formalize Augur's Python API. + +We recognize the existing usage of this function, so it has been moved to +`augur.io.read_strains`. + ## `augur export v1` *Deprecated in version 22.2.0 (July 2023). Planned for [removal](https://github.com/nextstrain/augur/issues/1266) diff --git a/augur/ancestral.py b/augur/ancestral.py index 621faeb6a..6eab76daf 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -28,7 +28,7 @@ from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -from .utils import read_tree, InvalidTreeError, write_json, get_json_name +from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name from treetime.vcf_utils import read_vcf, write_vcf from collections import defaultdict @@ -320,9 +320,11 @@ def run(args): 'strand': '+', 'type': 'source'}} if not is_vcf and args.genes: + genes = parse_genes_argument(args.genes) + from .utils import load_features ## load features; only requested features if genes given - features = load_features(args.annotation, args.genes) + features = load_features(args.annotation, genes) # Ensure the already-created nuc annotation coordinates match those parsed from the reference file if (features['nuc'].location.start+1 != anc_seqs['annotations']['nuc']['start'] or features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']): @@ -330,7 +332,7 @@ def run(args): f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).") print("Read in {} features from reference sequence file".format(len(features))) - for gene in args.genes: + for gene in genes: print(f"Processing gene: {gene}") fname = args.translations.replace("%GENE", gene) feat = features[gene] diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py index 9889ba2e6..9ea1e03e8 100644 --- a/augur/filter/include_exclude_rules.py +++ b/augur/filter/include_exclude_rules.py @@ -9,8 +9,8 @@ from augur.errors import AugurError from augur.io.metadata import METADATA_DATE_COLUMN from augur.io.print import print_err +from augur.io.strains import read_strains from augur.io.vcf import is_vcf as filename_is_vcf -from augur.utils import read_strains from . import constants try: diff --git a/augur/io/__init__.py b/augur/io/__init__.py index 685f9ff78..4a721bffb 100644 --- a/augur/io/__init__.py +++ b/augur/io/__init__.py @@ -5,3 +5,4 @@ from .file import open_file # noqa: F401 from .metadata import read_metadata # noqa: F401 from .sequences import read_sequences, write_sequences # noqa: F401 +from .strains import read_strains # noqa: F401 diff --git a/augur/io/strains.py b/augur/io/strains.py new file mode 100644 index 000000000..170212633 --- /dev/null +++ b/augur/io/strains.py @@ -0,0 +1,27 @@ +from augur.utils import read_entries + + +def read_strains(*files, comment_char="#"): + """Reads strain names from one or more plain text files and returns the + set of distinct strains. + + Strain names can be commented with full-line or inline comments. For + example, the following is a valid strain names file:: + + # this is a comment at the top of the file + strain1 # exclude strain1 because it isn't sequenced properly + strain2 + # this is an empty line that will be ignored. + + Parameters + ---------- + files : iterable of str + one or more names of text files with one strain name per line + + Returns + ------- + set : + strain names from the given input files + + """ + return set(read_entries(*files, comment_char=comment_char)) diff --git a/augur/translate.py b/augur/translate.py index bc4a1bc13..79ce911e7 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -13,11 +13,11 @@ The mutation positions in the node-data JSON are one-based. """ -import os, sys +import sys import numpy as np from Bio import SeqIO, Seq, SeqRecord, Phylo from .io.vcf import write_VCF_translation -from .utils import read_node_data, load_features, write_json, get_json_name +from .utils import parse_genes_argument, read_node_data, load_features, write_json, get_json_name from treetime.vcf_utils import read_vcf from augur.errors import AugurError from textwrap import dedent @@ -303,24 +303,6 @@ def assign_aa_fasta(tree, translations): return aa_muts -def get_genes_from_file(fname): - genes = [] - if os.path.isfile(fname): - with open(fname, encoding='utf-8') as ifile: - for line in ifile: - fields = line.strip().split('#') - if fields[0].strip(): - genes.append(fields[0].strip()) - else: - print("File with genes not found. Looking for", fname) - - unique_genes = np.unique(np.array(genes)) - if len(unique_genes) != len(genes): - print("You have duplicates in your genes file. They are being ignored.") - print("Read in {} specified genes to translate.".format(len(unique_genes))) - - return unique_genes - def sequences_vcf(reference_fasta, vcf): """ Extract the nucleotide variation in the VCF @@ -395,11 +377,7 @@ def run(args): is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]) check_arg_combinations(args, is_vcf) - # If genes is a file, read in the genes to translate - if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]): - genes = get_genes_from_file(args.genes[0]) - else: - genes = args.genes + genes = parse_genes_argument(args.genes) ## load features; only requested features if genes given features = load_features(args.reference_sequence, genes) diff --git a/augur/utils.py b/augur/utils.py index 96bcb65b5..f691a774b 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -5,10 +5,12 @@ import os, json, sys import pandas as pd from collections import defaultdict, OrderedDict +from textwrap import dedent from .__version__ import __version__ from augur.data import as_file from augur.io.file import open_file +from augur.io.print import print_err from augur.types import ValidationMode from augur.errors import AugurError @@ -749,11 +751,17 @@ def load_mask_sites(mask_file): def read_strains(*files, comment_char="#"): - """Reads strain names from one or more plain text files and returns the - set of distinct strains. + print_err(dedent(""" + DEPRECATION WARNING: augur.utils.read_strains is no longer maintained and will be removed in the future. + Please use augur.io.read_strains instead.""")) + return set(read_entries(*files, comment_char=comment_char)) - Strain names can be commented with full-line or inline comments. For - example, the following is a valid strain names file:: + +def read_entries(*files, comment_char="#"): + """Reads entries (one per line) from one or more plain text files. + + Entries can be commented with full-line or inline comments. For example, the + following is a valid file:: # this is a comment at the top of the file strain1 # exclude strain1 because it isn't sequenced properly @@ -763,21 +771,48 @@ def read_strains(*files, comment_char="#"): Parameters ---------- files : iterable of str - one or more names of text files with one strain name per line + one or more names of text files with one entry per line Returns ------- set : - strain names from the given input files + lines from the given input files """ - strains = set() + entries = list() for input_file in files: with open_file(input_file, 'r') as ifile: for line in ifile: # Allow comments anywhere in a given line. - strain_name = line.split(comment_char)[0].strip() - if len(strain_name) > 0: - strains.add(strain_name) + entry = line.split(comment_char)[0].strip() + if len(entry) > 0: + entries.append(entry) + + return entries + + +def parse_genes_argument(input): + if input is None: + return None + + # If input is a file, read in the genes to translate + if len(input) == 1 and os.path.isfile(input[0]): + return _get_genes_from_file(input[0]) + + # Otherwise, the input itself is assumed to be a list of genes + return input + + +def _get_genes_from_file(fname): + if os.path.isfile(fname): + genes = read_entries(fname) + else: + print("File with genes not found. Looking for", fname) + genes = [] + + unique_genes = np.unique(np.array(genes)) + if len(unique_genes) != len(genes): + print("You have duplicates in your genes file. They are being ignored.") + print("Read in {} specified genes to translate.".format(len(unique_genes))) - return strains + return unique_genes diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t new file mode 100644 index 000000000..09ed75bb4 --- /dev/null +++ b/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t @@ -0,0 +1,22 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Infer ancestral nucleotide and amino acid sequences, using a genes file. + + $ ${AUGUR} ancestral \ + > --tree $TESTDIR/../data/tree.nwk \ + > --alignment $TESTDIR/../data/aligned.fasta \ + > --annotation $TESTDIR/../data/zika_outgroup.gb \ + > --genes $TESTDIR/../data/genes.txt \ + > --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \ + > --output-node-data ancestral_mutations.json \ + > --output-sequences ancestral_sequences.fasta \ + > --output-translations ancestral_aa_sequences_%GENE.fasta > /dev/null + +Check that the reference length was correctly exported as the nuc annotation + + $ grep -E "\"(ENV|PRO|nuc)\": {" ancestral_mutations.json + "ENV": { + "PRO": { + "nuc": { diff --git a/tests/functional/ancestral/data/genes.txt b/tests/functional/ancestral/data/genes.txt new file mode 100644 index 000000000..7451d03eb --- /dev/null +++ b/tests/functional/ancestral/data/genes.txt @@ -0,0 +1,2 @@ +ENV +PRO diff --git a/tests/io/test_strains.py b/tests/io/test_strains.py new file mode 100644 index 000000000..dfd4ffe8b --- /dev/null +++ b/tests/io/test_strains.py @@ -0,0 +1,19 @@ +from pathlib import Path + +from augur.io.strains import read_strains + + +def test_read_strains(tmpdir): + # Write one list of filenames with some unnecessary whitespace. + strains1 = Path(tmpdir) / Path("strains1.txt") + with open(strains1, "w") as oh: + oh.write("strain1 # this is an inline comment about strain 1\nstrain2\n # this is a comment preceded by whitespace.\n") + + # Write another list of filenames with a comment. + strains2 = Path(tmpdir) / Path("strains2.txt") + with open(strains2, "w") as oh: + oh.write("# this is a comment. ignore this.\nstrain2\nstrain3\n") + + strains = read_strains(strains1, strains2) + assert len(strains) == 3 + assert "strain1" in strains diff --git a/tests/test_utils.py b/tests/test_utils.py index e4661a249..a1183cf58 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -79,21 +79,6 @@ def test_read_mask_file_drm_file(self, tmpdir): fh.write("\n".join(drm_lines)) assert utils.read_mask_file(drm_file) == expected_sites - def test_read_strains(self, tmpdir): - # Write one list of filenames with some unnecessary whitespace. - strains1 = Path(tmpdir) / Path("strains1.txt") - with open(strains1, "w") as oh: - oh.write("strain1 # this is an inline comment about strain 1\nstrain2\n # this is a comment preceded by whitespace.\n") - - # Write another list of filenames with a comment. - strains2 = Path(tmpdir) / Path("strains2.txt") - with open(strains2, "w") as oh: - oh.write("# this is a comment. ignore this.\nstrain2\nstrain3\n") - - strains = utils.read_strains(strains1, strains2) - assert len(strains) == 3 - assert "strain1" in strains - def test_write_json_data_types(self, tmpdir): """write_json should be able to serialize various data types.""" data = {