Skip to content

Commit

Permalink
fix: one method to rescue title data from GenBank
Browse files Browse the repository at this point in the history
The issue this commit is trying to fix is that ingest/bin/genbank-url line 71 is fetching the GenBank Definition Line instead of the references title.

An example GenBank Reference title is at:

https://github.com/nextstrain/dengue/blob/0591cf83fe81b9f75225e492651186d75fd09694/config/reference_dengue_all.gb#L15

To fetch the correct titles:

* Use biopython Entrez queries in batches of 200 to fetch GenBank Reference Titles
* If one of the 200 GenBank IDs fails for any reason, loop fetch one citation at a time with a time delay between each fetch
* For GenBank IDs on genbank-url, but not on Entrez, pass through the ndjson record without modifiying the title
* Capture log messages and incorporate title correction into the fetch snakemake rule
  • Loading branch information
j23414 committed Apr 4, 2023
1 parent 430ff78 commit 099600c
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 1 deletion.
118 changes: 118 additions & 0 deletions ingest/bin/transform-citations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#! /usr/bin/env python
"""
Updates the title and journal fields of the NDJSON record from stdin and outputs by making Entrez queries to fetch citations
"""

import argparse
import json
import re
from sys import stderr, stdin, stdout
import time

from Bio import SeqIO
from Bio import Entrez
import requests
import warnings


def parse_args():
parser = argparse.ArgumentParser(
description="Use Entrez query to extract citations. Fetch in batches of group size."
)
parser.add_argument(
"--genbank-id-field",
default="genbank_accession",
)
parser.add_argument(
"--group-size",
default=200,
help="Fetch in batches of group size [default:200]",
required=False,
)
parser.add_argument(
"--entrez-email",
default="hello@nextstrain.org",
help="Entrez email address [default:hello@nextstrain.org]",
required=False,
)
return parser.parse_args()


def fetch_and_print_citations(data: dict, genbank_id_field: str):
genbank_ids = ",".join([d[genbank_id_field] for d in data])

try:
handle = Entrez.efetch(
db="nucleotide", id=genbank_ids, rettype="gb", retmode="text", retmax=1000
)
record = SeqIO.parse(handle, "genbank")

results_dict = dict()
for row in record:
results_dict[row.id.split(".")[0]] = row.annotations["references"][0].title

# Maintain the order in data
for d in data:
if d["genbank_accession"] in results_dict:
d["title"] = results_dict[d["genbank_accession"]]

# Always print the record
json.dump(d, stdout, allow_nan=False, indent=None, separators=",:")
print()

handle.close()
except Exception as exception_msg:
for d in data:
fetch_one_citation(d, genbank_id_field)
time.sleep(1)

return None


def fetch_one_citation(data: dict, genbank_id_field: str):
genbank_id = data[genbank_id_field]

try:
handle = Entrez.efetch(
db="nucleotide", id=genbank_id, rettype="gb", retmode="text", retmax=1000
)
record = SeqIO.read(handle, "genbank")

data["title"] = record.annotations["references"][0].title

json.dump(data, stdout, allow_nan=False, indent=None, separators=",:")
print()

handle.close()

except Exception as exception_msg:
warnings.warn(f"Pass through and skip title processing for {data[genbank_id_field]}: {exception_msg}", stacklevel=2)

# example: GenBank ON123563-81 records are in the ndjson but were removed from Entrez
json.dump(data, stdout, allow_nan=False, indent=None, separators=",:")
print()

return None


def main():
args = parse_args()

data = []
chunk_size = args.group_size
genbank_id_field = args.genbank_id_field

Entrez.email = args.entrez_email

for index, record in enumerate(stdin):
data.append(json.loads(record))
if (index + 1) % chunk_size == 0:
fetch_and_print_citations(data, genbank_id_field)
data = []

if data:
fetch_and_print_citations(data, genbank_id_field)


if __name__ == "__main__":
main()
5 changes: 4 additions & 1 deletion ingest/workflow/snakemake_rules/fetch_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,12 @@ rule fetch_from_genbank:
genbank_ndjson="data/genbank_{serotype}.ndjson",
params:
serotype_tax_id=download_serotype,
log:
"logs/fetch_from_genbank_{serotype}.txt",
shell:
"""
./bin/fetch-from-genbank {params.serotype_tax_id} > {output.genbank_ndjson}
./bin/fetch-from-genbank {params.serotype_tax_id} > {output.genbank_ndjson}_temp
( cat {output.genbank_ndjson}_temp | ./bin/transform-citations.py > {output.genbank_ndjson} ) 2>> {log}
"""


Expand Down

0 comments on commit 099600c

Please sign in to comment.