From 1a168b5d3107e9fd5ce9b95fd298774c62ef06a6 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 17 Dec 2024 16:37:14 -0800 Subject: [PATCH] Ingest: change `genbank_accession` to `accession` To match the pathogen repo guide, change: * `genbank_accession` to `accession` * `genbank_accession_rev` to `accession_version` There should be a subsequent change in the phylogenetic workflow --- ingest/defaults/config.yaml | 18 +++++++++--------- ingest/rules/fetch_from_ncbi.smk | 6 +++--- ingest/rules/nextclade.smk | 3 +++ ...ene-converage-from-nextclade-translation.py | 14 ++++++++++---- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 51e960ea..f5c16b2b 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -42,8 +42,8 @@ curate: # The original field names should match the ncbi_datasets_fields provided above. # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names field_map: - accession: genbank_accession - accession-rev: genbank_accession_rev + accession: accession + accession_version: accession_version isolate-lineage: strain sourcedb: database geo-region: region @@ -63,7 +63,7 @@ curate: # Currently accepts any characters because we do not have a clear standard for strain names across pathogens strain_regex: "^.+$" # Back up strain name field to use if "strain" doesn"t match regex above - strain_backup_fields: ["genbank_accession"] + strain_backup_fields: ["accession"] # List of date fields to standardize to ISO format YYYY-MM-DD date_fields: ["date", "date_released", "date_updated"] # List of expected date formats that are present in the date fields provided above @@ -94,18 +94,18 @@ curate: # Serotype field name inferred from NCBI Genbank annotation serotype_field: "serotype_genbank" # The ID field in the metadata to use to merge the manual annotations - annotations_id: "genbank_accession" + annotations_id: "accession" # The ID field in the metadata to use as the sequence id in the output FASTA file - output_id_field: "genbank_accession" + output_id_field: "accession" # The field in the NDJSON record that contains the actual genomic sequence output_sequence_field: "sequence" # The field in the NDJSON record that contains the actual GenBank accession - genbank_accession: "genbank_accession" + genbank_accession: "accession" # The list of metadata columns to keep in the final output of the curation pipeline. metadata_columns: [ "strain", - "genbank_accession", - "genbank_accession_rev", + "accession", + "accession_version", "date", "region", "country", @@ -129,7 +129,7 @@ nextclade: gene: ["E","C","M","pr","NS1","NS2A","NS2B","NS3","NS4A","2K","NS4B","NS5"] # Nextclade Fields to rename to metadata field names. field_map: - seqName: genbank_accession # ID field used to merge annotations + seqName: accession # ID field used to merge annotations clade: genotype_nextclade alignmentStart: alignmentStart alignmentEnd: alignmentEnd diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index 55a66936..0932c2e2 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -77,8 +77,8 @@ rule format_ncbi_dataset_report: --elide-header \ | csvtk fix-quotes -Ht \ | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \ - | csvtk rename -t -f accession -n accession-rev \ - | csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \ + | csvtk rename -t -f accession -n accession_version \ + | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \ | csvtk del-quotes -t \ | tsv-select -H -f accession --rest last \ > {output.ncbi_dataset_tsv} @@ -99,7 +99,7 @@ rule format_ncbi_datasets_ndjson: augur curate passthru \ --metadata {input.ncbi_dataset_tsv} \ --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column accession-rev \ + --seq-id-column accession_version \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \ diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 643fbb5e..06644d61 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -107,10 +107,13 @@ rule calculate_gene_coverage: gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv", wildcard_constraints: serotype=SEROTYPE_CONSTRAINTS, + params: + id_field=config["curate"]["output_id_field"], shell: """ python scripts/calculate-gene-converage-from-nextclade-translation.py \ --fasta {input.nextclade_translation} \ + --out-id {params.id_field} \ --out-col {wildcards.gene}_coverage \ > {output.gene_coverage} """ diff --git a/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py b/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py index 4ab7c199..c6bdcb5d 100644 --- a/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py +++ b/ingest/scripts/calculate-gene-converage-from-nextclade-translation.py @@ -17,6 +17,12 @@ def parse_args(): type=str, help="FASTA file of CDS translations from Nextclade.", ) + parser.add_argument( + "--out-id", + type=str, + default="accession", + help="Output record ID.", + ) parser.add_argument( "--out-col", type=str, @@ -26,16 +32,16 @@ def parse_args(): return parser.parse_args() -def calculate_gene_coverage_from_nextclade_cds(fasta, out_col): +def calculate_gene_coverage_from_nextclade_cds(fasta, out_id, out_col): """ Calculate gene coverage from amino acid sequence in gene translation FASTA file from Nextclade. """ - print(f"genbank_accession\t{out_col}") + print(f"{out_id}\t{out_col}") # Iterate over the sequences in the FASTA file for record in SeqIO.parse(fasta, "fasta"): sequence_id = record.id sequence = str(record.seq) - + # Calculate gene coverage results = re.findall(r"([ACDEFGHIKLMNPQRSTVWY])", sequence.upper()) gene_coverage = round(len(results) / len(sequence), 3) @@ -47,7 +53,7 @@ def calculate_gene_coverage_from_nextclade_cds(fasta, out_col): def main(): args = parse_args() - calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_col) + calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_id, args.out_col) if __name__ == "__main__":