From a14a35007cf50e8c0a932cbe0d5cc1691ba289c1 Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Wed, 13 Sep 2023 13:55:23 -0700
Subject: [PATCH 1/2] Copy transform-strain-names from monkeypox

Copied from:
https://github.com/nextstrain/monkeypox/blob/5969604dfe426745b789746427b580c69d484790/ingest/bin/transform-strain-names
---
Review amendments: tolerate a missing, null, or non-string 'strain' value
instead of raising, add a Cram case for it, fix a typo in the test prose, and
end the test file with a newline. The blob ids on the `index` lines below
predate these amendments.

 README.md                                     |  1 +
 .../transform-strain-names.t                  | 25 +++++++++
 transform-strain-names                        | 52 +++++++++++++++++++
 3 files changed, 78 insertions(+)
 create mode 100644 tests/transform-strain-names/transform-strain-names.t
 create mode 100755 transform-strain-names

diff --git a/README.md b/README.md
index 0fbe706..008ec43 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,7 @@ Potential augur curate scripts
 - [transform-authors](transform-authors) - Abbreviates full author lists to ' et al.'
 - [transform-field-names](transform-field-names) - Rename fields of NDJSON records
 - [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"[:][, ]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/)
+- [transform-strain-names](transform-strain-names) - Ordered search for strain names across several fields.
 
 ## Software requirements
 
diff --git a/tests/transform-strain-names/transform-strain-names.t b/tests/transform-strain-names/transform-strain-names.t
new file mode 100644
index 0000000..1c05df7
--- /dev/null
+++ b/tests/transform-strain-names/transform-strain-names.t
@@ -0,0 +1,25 @@
+Look for strain name in "strain" or a list of backup fields.
+
+If strain entry exists, do not do anything.
+
+  $ echo '{"strain": "i/am/a/strain", "strain_s": "other"}' \
+  >   | $TESTDIR/../../transform-strain-names \
+  >     --strain-regex '^.+$' \
+  >     --backup-fields strain_s accession
+  {"strain":"i/am/a/strain","strain_s":"other"}
+
+If strain entry does not exist, search the backup fields
+
+  $ echo '{"strain_s": "other"}' \
+  >   | $TESTDIR/../../transform-strain-names \
+  >     --strain-regex '^.+$' \
+  >     --backup-fields accession strain_s
+  {"strain_s":"other","strain":"other"}
+
+If strain entry is null, search the backup fields instead of raising an error
+
+  $ echo '{"strain": null, "strain_s": "other"}' \
+  >   | $TESTDIR/../../transform-strain-names \
+  >     --strain-regex '^.+$' \
+  >     --backup-fields accession strain_s
+  {"strain":"other","strain_s":"other"}
diff --git a/transform-strain-names b/transform-strain-names
new file mode 100755
index 0000000..d86c0e4
--- /dev/null
+++ b/transform-strain-names
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+"""
+Verifies strain name pattern in the 'strain' field of the NDJSON record from
+stdin. Adds a 'strain' field to the record if it does not already exist.
+
+Outputs the modified records to stdout.
+"""
+import argparse
+import json
+import re
+from sys import stderr, stdin, stdout
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("--strain-regex", default="^.+$",
+        help="Regex pattern for strain names. " +
+             "Strain names that do not match the pattern will be dropped.")
+    parser.add_argument("--backup-fields", nargs="*",
+        help="List of backup fields to use as strain name if the value in 'strain' " +
+             "does not match the strain regex pattern. " +
+             "If multiple fields are provided, will use the first field that has a non-empty string.")
+
+    args = parser.parse_args()
+
+    strain_name_pattern = re.compile(args.strain_regex)
+
+    for index, record in enumerate(stdin):
+        record = json.loads(record)
+
+        # Verify strain name matches the strain regex pattern; a missing, null,
+        # or non-string value can never match and falls through to the backups.
+        strain = record.get('strain')
+        if not isinstance(strain, str) or strain_name_pattern.match(strain) is None:
+            # Default to empty string if not matching pattern
+            record['strain'] = ''
+            # Use non-empty value of backup fields if provided
+            if args.backup_fields:
+                for field in args.backup_fields:
+                    if record.get(field):
+                        record['strain'] = str(record[field])
+                        break
+
+        if record['strain'] == '':
+            print(f"WARNING: Record number {index} has an empty string as the strain name.", file=stderr)
+
+
+        json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
+        print()
From 6f196f77968ccab63d895fbadb11b87f72e8404d Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Fri, 15 Sep 2023 16:40:17 -0700
Subject: [PATCH 2/2] Add Cram tests

---
Review amendment: end ci.yaml with a trailing newline. The blob ids on the
`index` line below predate this amendment.

 .github/workflows/ci.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 610bbe0..c6a218a 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -13,3 +13,11 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: nextstrain/.github/actions/shellcheck@master
+
+  cram:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+      - run: pip install cram
+      - run: cram tests/