-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: one method to rescue title data from GenBank
The issue this commit is trying to fix is that ingest/bin/genbank-url line 71 is fetching the GenBank Definition Line instead of the references title. An example GenBank Reference title is at: https://github.com/nextstrain/dengue/blob/0591cf83fe81b9f75225e492651186d75fd09694/config/reference_dengue_all.gb#L15 To fetch the correct titles: * Use biopython Entrez queries in batches of 200 to fetch GenBank Reference Titles * If one of the 200 GenBank IDs fails for any reason, loop fetch one citation at a time with a time delay between each fetch * For GenBank IDs on genbank-url, but not on Entrez, pass through the ndjson record without modifiying the title * Capture log messages and incorporate title correction into the fetch snakemake rule
- Loading branch information
Showing
2 changed files
with
122 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
#! /usr/bin/env python | ||
""" | ||
Updates the title and journal fields of the NDJSON record from stdin and outputs by making Entrez queries to fetch citations | ||
""" | ||
|
||
import argparse | ||
import json | ||
import re | ||
from sys import stderr, stdin, stdout | ||
import time | ||
|
||
from Bio import SeqIO | ||
from Bio import Entrez | ||
import requests | ||
import warnings | ||
|
||
|
||
def parse_args(): | ||
parser = argparse.ArgumentParser( | ||
description="Use Entrez query to extract citations. Fetch in batches of group size." | ||
) | ||
parser.add_argument( | ||
"--genbank-id-field", | ||
default="genbank_accession", | ||
) | ||
parser.add_argument( | ||
"--group-size", | ||
default=200, | ||
help="Fetch in batches of group size [default:200]", | ||
required=False, | ||
) | ||
parser.add_argument( | ||
"--entrez-email", | ||
default="hello@nextstrain.org", | ||
help="Entrez email address [default:hello@nextstrain.org]", | ||
required=False, | ||
) | ||
return parser.parse_args() | ||
|
||
|
||
def fetch_and_print_citations(data: dict, genbank_id_field: str): | ||
genbank_ids = ",".join([d[genbank_id_field] for d in data]) | ||
|
||
try: | ||
handle = Entrez.efetch( | ||
db="nucleotide", id=genbank_ids, rettype="gb", retmode="text", retmax=1000 | ||
) | ||
record = SeqIO.parse(handle, "genbank") | ||
|
||
results_dict = dict() | ||
for row in record: | ||
results_dict[row.id.split(".")[0]] = row.annotations["references"][0].title | ||
|
||
# Maintain the order in data | ||
for d in data: | ||
if d["genbank_accession"] in results_dict: | ||
d["title"] = results_dict[d["genbank_accession"]] | ||
|
||
# Always print the record | ||
json.dump(d, stdout, allow_nan=False, indent=None, separators=",:") | ||
print() | ||
|
||
handle.close() | ||
except Exception as exception_msg: | ||
for d in data: | ||
fetch_one_citation(d, genbank_id_field) | ||
time.sleep(1) | ||
|
||
return None | ||
|
||
|
||
def fetch_one_citation(data: dict, genbank_id_field: str): | ||
genbank_id = data[genbank_id_field] | ||
|
||
try: | ||
handle = Entrez.efetch( | ||
db="nucleotide", id=genbank_id, rettype="gb", retmode="text", retmax=1000 | ||
) | ||
record = SeqIO.read(handle, "genbank") | ||
|
||
data["title"] = record.annotations["references"][0].title | ||
|
||
json.dump(data, stdout, allow_nan=False, indent=None, separators=",:") | ||
print() | ||
|
||
handle.close() | ||
|
||
except Exception as exception_msg: | ||
warnings.warn(f"Pass through and skip title processing for {data[genbank_id_field]}: {exception_msg}", stacklevel=2) | ||
|
||
# example: GenBank ON123563-81 records are in the ndjson but were removed from Entrez | ||
json.dump(data, stdout, allow_nan=False, indent=None, separators=",:") | ||
print() | ||
|
||
return None | ||
|
||
|
||
def main(): | ||
args = parse_args() | ||
|
||
data = [] | ||
chunk_size = args.group_size | ||
genbank_id_field = args.genbank_id_field | ||
|
||
Entrez.email = args.entrez_email | ||
|
||
for index, record in enumerate(stdin): | ||
data.append(json.loads(record)) | ||
if (index + 1) % chunk_size == 0: | ||
fetch_and_print_citations(data, genbank_id_field) | ||
data = [] | ||
|
||
if data: | ||
fetch_and_print_citations(data, genbank_id_field) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters