Merge remote-tracking branch 'origin/victorlin/parse-reorder-id-columns'

nextstrain · Sep 17, 2024 · 3de75cd · 3de75cd
2 parents d672de0 + 5e0bc54
commit 3de75cd
Show file tree

Hide file tree

Showing 5 changed files with 24 additions and 16 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -5,6 +5,7 @@
 ### Major Changes
 
 * filter: Duplicate header names in the FASTA file (`--sequences`) will now result in an error. [#1613] (@victorlin)
+* parse: When both `strain` and `name` fields are present, the `strain` field will now be used as the sequence ID field. To keep using the `name` field, pass `--output-id-field 'name'`. [#1629][] (@victorlin)
 * merge: Generated source columns (e.g. `__source_metadata_{NAME}`) are now omitted by default.  They may be explicitly included with `--source-columns=TEMPLATE` or explicitly omitted with `--no-source-columns`.  This may be a breaking change for any existing uses of `augur merge` relying on the generated columns, though as `augur merge` is relatively new we believe usage to be scant if extant at all. [#1625][] [#1632][] (@tsibley)
 
 ### Bug Fixes
@@ -15,6 +16,7 @@
 [#1598]: https://github.com/nextstrain/augur/issues/1598
 [#1613]: https://github.com/nextstrain/augur/pull/1613
 [#1625]: https://github.com/nextstrain/augur/issues/1625
+[#1629]: https://github.com/nextstrain/augur/pull/1629
 [#1632]: https://github.com/nextstrain/augur/issues/1632
 
 ## 25.4.0 (3 September 2024)

diff --git a/DEPRECATED.md b/DEPRECATED.md
@@ -10,9 +10,7 @@ available for backwards compatibility, but should not be used in new code.
 
 ## `augur parse` preference of `name` over `strain` as the sequence ID field
 
-*Deprecated in version 24.2.0 (February 2024). Planned to be reordered June 2024 or after.*
-
-Currently, `augur parse` checks for a 'name' field and then a 'strain' field to use as a sequence ID. This order will be changed in favor of searching for a 'strain' and then a 'name' field to be more consistent with the rest of Augur.
+*Deprecated in version 24.2.0 (February 2024). Reordered in version 26.0.0 (September 2024).*
 
 Users who have both 'name' and 'strain' fields in their data and want to keep using the 'name' field as the sequence ID should add the following `augur parse` parameter: `--output-id-field 'name'`.
 

diff --git a/augur/parse.py b/augur/parse.py
@@ -1,15 +1,17 @@
 """
 Parse delimited fields from FASTA sequence names into a TSV and FASTA file.
 """
+import Bio.SeqRecord
 import pandas as pd
 import sys
+from typing import Dict, Sequence, Tuple
 
 from .io.file import open_file
 from .io.sequences import read_sequences, write_sequences
 from .dates import get_numerical_date_from_value
 from .errors import AugurError
 
-PARSE_DEFAULT_ID_COLUMNS = ("name", "strain")
+PARSE_DEFAULT_ID_COLUMNS = ("strain", "name")
 
 forbidden_characters = str.maketrans(
     {' ': None,
@@ -88,27 +90,34 @@ def prettify(x, trim=0, camelCase=False, etal=None, removeComma=False):
     return res
 
 
-def parse_sequence(sequence, fields, strain_key="strain", separator="|", prettify_fields=None, fix_dates_format=None):
+def parse_sequence(
+        sequence: Bio.SeqRecord.SeqRecord,
+        fields: Sequence[str],
+        strain_key: str,
+        separator: str,
+        prettify_fields: Sequence[str],
+        fix_dates_format: str,
+    ) -> Tuple[Bio.SeqRecord.SeqRecord, Dict[str, str]]:
     """Parse a single sequence record into a sequence record and associated metadata.
 
     Parameters
     ----------
-    sequence : Bio.SeqRecord.SeqRecord
+    sequence
         a BioPython sequence record to parse with metadata stored in its description field.
 
-    fields : list or tuple
+    fields
         a list of names for fields expected in the given record's description.
 
-    strain_key : str
+    strain_key
         name of the field to use as the given sequence's unique id
 
-    separator : str
+    separator
         delimiter to split record description by.
 
-    prettify_fields : list or tuple
+    prettify_fields
         a list of field names for which the values in those fields should be prettified.
 
-    fix_dates_format : str
+    fix_dates_format
         parse "date" field into the requested canonical format ("dayfirst" or "monthfirst").
 
     Returns
@@ -178,8 +187,6 @@ def run(args):
         for possible_id in PARSE_DEFAULT_ID_COLUMNS:
             if possible_id in args.fields:
                 strain_key = possible_id
-                if possible_id == "name" and "strain" in args.fields:
-                    print("DEPRECATED: The default search order for the ID field will be changing from ('name', 'strain') to ('strain', 'name').\nUsers who prefer to keep using 'name' instead of 'strain' should use the parameter: --output-id-field 'name'", file=sys.stderr)
                 break
         if not strain_key:
             strain_key = args.fields[0]

diff --git a/tests/functional/parse.t b/tests/functional/parse.t
@@ -69,10 +69,9 @@ Parse Zika sequences into sequences and metadata, preferred default ids is 'name
   >   --output-sequences "$TMP/sequences.fasta" \
   >   --output-metadata "$TMP/metadata.tsv" \
   >   --fields strain virus name date region country division city db segment authors url title journal paper_url \
+  >   --output-id-field 'name' \
   >   --prettify-fields region country division city \
   >   --fix-dates monthfirst
-  DEPRECATED: The default search order for the ID field will be changing from ('name', 'strain') to ('strain', 'name').
-  Users who prefer to keep using 'name' instead of 'strain' should use the parameter: --output-id-field 'name'
 
   $ diff -u "parse/sequences_other.fasta" "$TMP/sequences.fasta"
   $ rm -f "$TMP/sequences.fasta" "$TMP/metadata.tsv"

diff --git a/tests/test_parse.py b/tests/test_parse.py
@@ -71,7 +71,9 @@ def test_parse_sequence(self):
             sequence_record,
             fields=fields,
             strain_key="strain",
-            prettify_fields=["region"]
+            separator="|",
+            prettify_fields=["region"],
+            fix_dates_format=None,
         )
 
         assert sequence.id == metadata["strain"]