From 099600cc4e9a2c2f5f34210b9a373c88c3e940f9 Mon Sep 17 00:00:00 2001
From: j23414 <jennifer.chang.bioinform@gmail.com>
Date: Fri, 31 Mar 2023 16:39:59 -0700
Subject: [PATCH] fix: one method to rescue title data from GenBank

The issue this commit is trying to fix is that ingest/bin/genbank-url line 71 is fetching the GenBank Definition Line instead of the references title.

An example GenBank Reference title is at:

https://github.com/nextstrain/dengue/blob/0591cf83fe81b9f75225e492651186d75fd09694/config/reference_dengue_all.gb#L15

To fetch the correct titles:

* Use biopython Entrez queries in batches of 200 to fetch GenBank Reference Titles
* If one of the 200 GenBank IDs fails for any reason, loop fetch one citation at a time with a time delay between each fetch
* For GenBank IDs on genbank-url, but not on Entrez, pass through the ndjson record without modifiying the title
* Capture log messages and incorporate title correction into the fetch snakemake rule
---
 ingest/bin/transform-citations.py             | 118 ++++++++++++++++++
 .../snakemake_rules/fetch_sequences.smk       |   5 +-
 2 files changed, 122 insertions(+), 1 deletion(-)
 create mode 100755 ingest/bin/transform-citations.py

diff --git a/ingest/bin/transform-citations.py b/ingest/bin/transform-citations.py
new file mode 100755
index 00000000..03f80b15
--- /dev/null
+++ b/ingest/bin/transform-citations.py
@@ -0,0 +1,118 @@
+#! /usr/bin/env python
+"""
+Updates the title and journal fields of the NDJSON record from stdin and outputs by making Entrez queries to fetch citations
+"""
+
+import argparse
+import json
+import re
+from sys import stderr, stdin, stdout
+import time
+
+from Bio import SeqIO
+from Bio import Entrez
+import requests
+import warnings
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Use Entrez query to extract citations. Fetch in batches of group size."
+    )
+    parser.add_argument(
+        "--genbank-id-field",
+        default="genbank_accession",
+    )
+    parser.add_argument(
+        "--group-size",
+        default=200,
+        help="Fetch in batches of group size [default:200]",
+        required=False,
+    )
+    parser.add_argument(
+        "--entrez-email",
+        default="hello@nextstrain.org",
+        help="Entrez email address [default:hello@nextstrain.org]",
+        required=False,
+    )
+    return parser.parse_args()
+
+
+def fetch_and_print_citations(data: dict, genbank_id_field: str):
+    genbank_ids = ",".join([d[genbank_id_field] for d in data])
+
+    try:
+        handle = Entrez.efetch(
+            db="nucleotide", id=genbank_ids, rettype="gb", retmode="text", retmax=1000
+        )
+        record = SeqIO.parse(handle, "genbank")
+
+        results_dict = dict()
+        for row in record:
+            results_dict[row.id.split(".")[0]] = row.annotations["references"][0].title
+
+        # Maintain the order in data
+        for d in data:
+            if d["genbank_accession"] in results_dict:
+                d["title"] = results_dict[d["genbank_accession"]]
+
+            # Always print the record
+            json.dump(d, stdout, allow_nan=False, indent=None, separators=",:")
+            print()
+
+        handle.close()
+    except Exception as exception_msg:
+        for d in data:
+            fetch_one_citation(d, genbank_id_field)
+            time.sleep(1)
+
+    return None
+
+
+def fetch_one_citation(data: dict, genbank_id_field: str):
+    genbank_id = data[genbank_id_field]
+
+    try:
+        handle = Entrez.efetch(
+            db="nucleotide", id=genbank_id, rettype="gb", retmode="text", retmax=1000
+        )
+        record = SeqIO.read(handle, "genbank")
+
+        data["title"] = record.annotations["references"][0].title
+
+        json.dump(data, stdout, allow_nan=False, indent=None, separators=",:")
+        print()
+
+        handle.close()
+
+    except Exception as exception_msg:
+        warnings.warn(f"Pass through and skip title processing for {data[genbank_id_field]}: {exception_msg}", stacklevel=2)
+
+        # example: GenBank ON123563-81 records are in the ndjson but were removed from Entrez
+        json.dump(data, stdout, allow_nan=False, indent=None, separators=",:")
+        print()
+
+    return None
+
+
+def main():
+    args = parse_args()
+
+    data = []
+    chunk_size = args.group_size
+    genbank_id_field = args.genbank_id_field
+
+    Entrez.email = args.entrez_email
+
+    for index, record in enumerate(stdin):
+        data.append(json.loads(record))
+        if (index + 1) % chunk_size == 0:
+            fetch_and_print_citations(data, genbank_id_field)
+            data = []
+
+    if data:
+        fetch_and_print_citations(data, genbank_id_field)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
index 22c2d2c6..31b62ee3 100644
--- a/ingest/workflow/snakemake_rules/fetch_sequences.smk
+++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -31,9 +31,12 @@ rule fetch_from_genbank:
         genbank_ndjson="data/genbank_{serotype}.ndjson",
     params:
         serotype_tax_id=download_serotype,
+    log:
+        "logs/fetch_from_genbank_{serotype}.txt",
     shell:
         """
-        ./bin/fetch-from-genbank {params.serotype_tax_id} > {output.genbank_ndjson}
+        ./bin/fetch-from-genbank {params.serotype_tax_id} > {output.genbank_ndjson}_temp
+        ( cat {output.genbank_ndjson}_temp | ./bin/transform-citations.py > {output.genbank_ndjson} ) 2>> {log}
         """