From 1a168b5d3107e9fd5ce9b95fd298774c62ef06a6 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Tue, 17 Dec 2024 16:37:14 -0800
Subject: [PATCH] Ingest: change `genbank_accession` to `accession`

To match the pathogen repo guide, change:

* `genbank_accession` to `accession`
* `genbank_accession_rev` to `accession_version`

There should be a subsequent change in the phylogenetic workflow
---
 ingest/defaults/config.yaml                    | 18 +++++++++---------
 ingest/rules/fetch_from_ncbi.smk               |  6 +++---
 ingest/rules/nextclade.smk                     |  3 +++
 ...ene-converage-from-nextclade-translation.py | 14 ++++++++++----
 4 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 51e960ea..f5c16b2b 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -42,8 +42,8 @@ curate:
   # The original field names should match the ncbi_datasets_fields provided above.
   # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
   field_map:
-    accession: genbank_accession
-    accession-rev: genbank_accession_rev
+    accession: accession
+    accession_version: accession_version
     isolate-lineage: strain
     sourcedb: database
     geo-region: region
@@ -63,7 +63,7 @@ curate:
   # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
   strain_regex: "^.+$"
   # Back up strain name field to use if "strain" doesn"t match regex above
-  strain_backup_fields: ["genbank_accession"]
+  strain_backup_fields: ["accession"]
   # List of date fields to standardize to ISO format YYYY-MM-DD
   date_fields: ["date", "date_released", "date_updated"]
   # List of expected date formats that are present in the date fields provided above
@@ -94,18 +94,18 @@ curate:
   # Serotype field name inferred from NCBI Genbank annotation
   serotype_field: "serotype_genbank"
   # The ID field in the metadata to use to merge the manual annotations
-  annotations_id: "genbank_accession"
+  annotations_id: "accession"
   # The ID field in the metadata to use as the sequence id in the output FASTA file
-  output_id_field: "genbank_accession"
+  output_id_field: "accession"
   # The field in the NDJSON record that contains the actual genomic sequence
   output_sequence_field: "sequence"
   # The field in the NDJSON record that contains the actual GenBank accession
-  genbank_accession: "genbank_accession"
+  genbank_accession: "accession"
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: [
     "strain",
-    "genbank_accession",
-    "genbank_accession_rev",
+    "accession",
+    "accession_version",
     "date",
     "region",
     "country",
@@ -129,7 +129,7 @@ nextclade:
   gene: ["E","C","M","pr","NS1","NS2A","NS2B","NS3","NS4A","2K","NS4B","NS5"]
   # Nextclade Fields to rename to metadata field names.
   field_map:
-    seqName: genbank_accession # ID field used to merge annotations
+    seqName: accession # ID field used to merge annotations
     clade: genotype_nextclade
     alignmentStart: alignmentStart
     alignmentEnd: alignmentEnd
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index 55a66936..0932c2e2 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -77,8 +77,8 @@ rule format_ncbi_dataset_report:
             --elide-header \
             | csvtk fix-quotes -Ht \
             | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \
-            | csvtk rename -t -f accession -n accession-rev \
-            | csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | csvtk rename -t -f accession -n accession_version \
+            | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \
             | csvtk del-quotes -t \
             | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
@@ -99,7 +99,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column accession-rev \
+            --seq-id-column accession_version \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index 643fbb5e..06644d61 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -107,10 +107,13 @@ rule calculate_gene_coverage:
         gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv",
     wildcard_constraints:
         serotype=SEROTYPE_CONSTRAINTS,
+    params:
+        id_field=config["curate"]["output_id_field"],
     shell:
         """
         python scripts/calculate-gene-converage-from-nextclade-translation.py \
           --fasta {input.nextclade_translation} \
+          --out-id {params.id_field} \
           --out-col {wildcards.gene}_coverage \
           > {output.gene_coverage}
         """
diff --git a/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py b/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py
index 4ab7c199..c6bdcb5d 100644
--- a/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py
+++ b/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py
@@ -17,6 +17,12 @@ def parse_args():
         type=str,
         help="FASTA file of CDS translations from Nextclade.",
     )
+    parser.add_argument(
+        "--out-id",
+        type=str,
+        default="accession",
+        help="Output record ID.",
+    )
     parser.add_argument(
         "--out-col",
         type=str,
@@ -26,16 +32,16 @@ def parse_args():
     return parser.parse_args()
 
 
-def calculate_gene_coverage_from_nextclade_cds(fasta, out_col):
+def calculate_gene_coverage_from_nextclade_cds(fasta, out_id, out_col):
     """
     Calculate gene coverage from amino acid sequence in gene translation FASTA file from Nextclade.
     """
-    print(f"genbank_accession\t{out_col}")
+    print(f"{out_id}\t{out_col}")
     # Iterate over the sequences in the FASTA file
     for record in SeqIO.parse(fasta, "fasta"):
         sequence_id = record.id
         sequence = str(record.seq)
-        
+
         # Calculate gene coverage
         results = re.findall(r"([ACDEFGHIKLMNPQRSTVWY])",  sequence.upper())
         gene_coverage = round(len(results) / len(sequence), 3)
@@ -47,7 +53,7 @@ def calculate_gene_coverage_from_nextclade_cds(fasta, out_col):
 def main():
     args = parse_args()
 
-    calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_col)
+    calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_id, args.out_col)
 
 
 if __name__ == "__main__":