Skip to content

Commit

Permalink
Ingest: change genbank_accession to accession
Browse files Browse the repository at this point in the history
To match the pathogen repo guide, change:

* `genbank_accession` to `accession`
* `genbank_accession_rev` to `accession_version`

There should be a subsequent change in the phylogenetic workflow
  • Loading branch information
j23414 committed Dec 18, 2024
1 parent 57a8457 commit 1a168b5
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 16 deletions.
18 changes: 9 additions & 9 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ curate:
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
field_map:
accession: genbank_accession
accession-rev: genbank_accession_rev
accession: accession
accession_version: accession_version
isolate-lineage: strain
sourcedb: database
geo-region: region
Expand All @@ -63,7 +63,7 @@ curate:
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: "^.+$"
# Back up strain name field to use if "strain" doesn"t match regex above
strain_backup_fields: ["genbank_accession"]
strain_backup_fields: ["accession"]
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: ["date", "date_released", "date_updated"]
# List of expected date formats that are present in the date fields provided above
Expand Down Expand Up @@ -94,18 +94,18 @@ curate:
# Serotype field name inferred from NCBI Genbank annotation
serotype_field: "serotype_genbank"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: "genbank_accession"
annotations_id: "accession"
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: "genbank_accession"
output_id_field: "accession"
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: "sequence"
# The field in the NDJSON record that contains the actual GenBank accession
genbank_accession: "genbank_accession"
genbank_accession: "accession"
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
"strain",
"genbank_accession",
"genbank_accession_rev",
"accession",
"accession_version",
"date",
"region",
"country",
Expand All @@ -129,7 +129,7 @@ nextclade:
gene: ["E","C","M","pr","NS1","NS2A","NS2B","NS3","NS4A","2K","NS4B","NS5"]
# Nextclade Fields to rename to metadata field names.
field_map:
seqName: genbank_accession # ID field used to merge annotations
seqName: accession # ID field used to merge annotations
clade: genotype_nextclade
alignmentStart: alignmentStart
alignmentEnd: alignmentEnd
Expand Down
6 changes: 3 additions & 3 deletions ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ rule format_ncbi_dataset_report:
--elide-header \
| csvtk fix-quotes -Ht \
| csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \
| csvtk rename -t -f accession -n accession-rev \
| csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \
| csvtk rename -t -f accession -n accession_version \
| csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \
| csvtk del-quotes -t \
| tsv-select -H -f accession --rest last \
> {output.ncbi_dataset_tsv}
Expand All @@ -99,7 +99,7 @@ rule format_ncbi_datasets_ndjson:
augur curate passthru \
--metadata {input.ncbi_dataset_tsv} \
--fasta {input.ncbi_dataset_sequences} \
--seq-id-column accession-rev \
--seq-id-column accession_version \
--seq-field sequence \
--unmatched-reporting warn \
--duplicate-reporting warn \
Expand Down
3 changes: 3 additions & 0 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,13 @@ rule calculate_gene_coverage:
gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv",
wildcard_constraints:
serotype=SEROTYPE_CONSTRAINTS,
params:
id_field=config["curate"]["output_id_field"],
shell:
"""
python scripts/calculate-gene-converage-from-nextclade-translation.py \
--fasta {input.nextclade_translation} \
--out-id {params.id_field} \
--out-col {wildcards.gene}_coverage \
> {output.gene_coverage}
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ def parse_args():
type=str,
help="FASTA file of CDS translations from Nextclade.",
)
parser.add_argument(
"--out-id",
type=str,
default="accession",
help="Output record ID.",
)
parser.add_argument(
"--out-col",
type=str,
Expand All @@ -26,16 +32,16 @@ def parse_args():
return parser.parse_args()


def calculate_gene_coverage_from_nextclade_cds(fasta, out_col):
def calculate_gene_coverage_from_nextclade_cds(fasta, out_id, out_col):
"""
Calculate gene coverage from amino acid sequence in gene translation FASTA file from Nextclade.
"""
print(f"genbank_accession\t{out_col}")
print(f"{out_id}\t{out_col}")
# Iterate over the sequences in the FASTA file
for record in SeqIO.parse(fasta, "fasta"):
sequence_id = record.id
sequence = str(record.seq)

# Calculate gene coverage
results = re.findall(r"([ACDEFGHIKLMNPQRSTVWY])", sequence.upper())
gene_coverage = round(len(results) / len(sequence), 3)
Expand All @@ -47,7 +53,7 @@ def calculate_gene_coverage_from_nextclade_cds(fasta, out_col):
def main():
args = parse_args()

calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_col)
calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_id, args.out_col)


if __name__ == "__main__":
Expand Down

0 comments on commit 1a168b5

Please sign in to comment.