Add a deprecation warning that the id column will be reordered in the…

… future. Instead of the refactor and breaking change from 8dc0694, this PR adds a deprecation warning to the `parse` step that the default `id` field used will be reordered from ('name', 'strain') to ('strain', 'name'). This will give users time to update their scripts and workflows with `--output-id-field 'name'` before the change is made.
nextstrain · Feb 7, 2024 · 40e9a00 · 40e9a00
1 parent f587119
commit 40e9a00
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 8 deletions.
diff --git a/DEPRECATED.md b/DEPRECATED.md
@@ -4,6 +4,14 @@ These features are deprecated, which means they are no longer maintained and
 will go away in a future major version of Augur. They are currently still
 available for backwards compatibility, but should not be used in new code.
 
+## `augur parse` `PARSE_DEFAULT_ID_COLUMNS` reordering
+
+*Deprecated in February 2024. Planned to be reordered June 2024 or after.*
+
+Existing usage of `augur parse` has a hardcoded check for a 'name' field and then a 'strain' field to use as a sequence ID. This order is deprecated in favor of searching for a 'strain' field and then a 'name' field, to be more consistent with the rest of Augur.
+
+Users who have both 'name' and 'strain' fields in their data, and want to keep using the 'name' field, should add the following `augur parse` parameter: `--output-id-field name`.
+
 ## `augur.utils.read_strains`
 
 *Deprecated in December 2023. Planned for removal March 2024 or after.*

diff --git a/augur/parse.py b/augur/parse.py
@@ -6,10 +6,11 @@
 
 from .io.file import open_file
 from .io.sequences import read_sequences, write_sequences
-from .io.metadata import DEFAULT_ID_COLUMNS
 from .dates import get_numerical_date_from_value
 from .errors import AugurError
 
+PARSE_DEFAULT_ID_COLUMNS = ("name", "strain")
+
 forbidden_characters = str.maketrans(
     {' ': None,
      '(': '_',
@@ -143,7 +144,7 @@ def register_parser(parent_subparsers):
     parser.add_argument('--output-sequences', required=True, help="output sequences file")
     parser.add_argument('--output-metadata', required=True, help="output metadata file")
     parser.add_argument('--output-id-field', required=False,
-                        help=f"The record field to use as the sequence identifier in the FASTA output. If not provided, this will use the first available of {DEFAULT_ID_COLUMNS}. If none of those are available, this will use the first field in the fasta header.")
+                        help=f"The record field to use as the sequence identifier in the FASTA output. If not provided, this will use the first available of {PARSE_DEFAULT_ID_COLUMNS}. If none of those are available, this will use the first field in the fasta header.")
     parser.add_argument('--fields', required=True, nargs='+', help="fields in fasta header")
     parser.add_argument('--prettify-fields', nargs='+', help="apply string prettifying operations (underscores to spaces, capitalization, etc) to specified metadata fields")
     parser.add_argument('--separator', default='|', help="separator of fasta header")
@@ -169,9 +170,11 @@ def run(args):
             raise AugurError(f"Output id field '{args.output_id_field}' not found in fields {args.fields}.")
         strain_key = args.output_id_field
     else:
-        for possible_id in DEFAULT_ID_COLUMNS:
+        for possible_id in PARSE_DEFAULT_ID_COLUMNS:
             if possible_id in args.fields:
                 strain_key = possible_id
+                if possible_id == "name" and "strain" in args.fields:
+                    print("DEPRECATED: The default search order for the ID field will be changing from ('name', 'strain') to ('strain', 'name').\nUsers who prefer to keep using 'name' instead of 'strain' should use the parameter: --output-id-field 'name'", file=sys.stderr)
                 break
         if not strain_key:
             strain_key = args.fields[0]

diff --git a/tests/functional/parse.t b/tests/functional/parse.t
@@ -62,7 +62,7 @@ This should fail.
   ERROR: Output id field 'notexist' not found in fields ['strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url'].
   [2]
 
-Parse Zika sequences into sequences and metadata, preferred default ids is 'strain', then 'name', then first field.
+Parse Zika sequences into sequences and metadata; the preferred default id is 'name', then 'strain', then the first field.
 
   $ ${AUGUR} parse \
   >   --sequences parse/zika.fasta \
@@ -71,18 +71,20 @@ Parse Zika sequences into sequences and metadata, preferred default ids is 'stra
   >   --fields strain virus name date region country division city db segment authors url title journal paper_url \
   >   --prettify-fields region country division city \
   >   --fix-dates monthfirst
+  DEPRECATED: The default search order for the ID field will be changing from ('name', 'strain') to ('strain', 'name').
+  Users who prefer to keep using 'name' instead of 'strain' should use the parameter: --output-id-field 'name'
 
-  $ diff -u "parse/sequences.fasta" "$TMP/sequences.fasta"
+  $ diff -u "parse/sequences_other.fasta" "$TMP/sequences.fasta"
   $ rm -f "$TMP/sequences.fasta" "$TMP/metadata.tsv"
 
-Parse Zika sequences into sequences and metadata when there is no 'strain' field.
-This should use the 2nd entry in DEFAULT_ID_COLUMNS ('strain', 'name') instead.
+Parse Zika sequences into sequences and metadata when there is no 'name' field.
+This should use the 2nd entry in PARSE_DEFAULT_ID_COLUMNS ('name', 'strain') instead.
 
   $ ${AUGUR} parse \
   >   --sequences parse/zika.fasta \
   >   --output-sequences "$TMP/sequences.fasta" \
   >   --output-metadata "$TMP/metadata.tsv" \
-  >   --fields col1 virus name date region country division city db segment authors url title journal paper_url \
+  >   --fields col1 virus strain date region country division city db segment authors url title journal paper_url \
   >   --prettify-fields region country division city \
   >   --fix-dates monthfirst