Skip to content

Commit

Permalink
Add option ensembl_tark and ensembl_rest in CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
Xiaoyun Liu committed Apr 25, 2024
1 parent fba47d5 commit b3c76c0
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 116 deletions.
7 changes: 1 addition & 6 deletions mutalyzer_retriever/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _parse_args(args):
parser.add_argument("--id", help="the reference id")

parser.add_argument(
"-s", "--source", help="retrieval source", choices=["ncbi", "ensembl", "lrg"]
"-s", "--source", help="retrieval source", choices=["ncbi", "ensembl_tark", "ensembl_rest", "lrg"]
)

parser.add_argument(
Expand All @@ -49,10 +49,6 @@ def _parse_args(args):
default="all",
)

parser.add_argument(
"-a", "--api", help="retrieval api", choices=["tark", "rest"]
)

parser.add_argument(
"-r", "--related", help="retrieve related reference ids", action="store_true"
)
Expand Down Expand Up @@ -162,7 +158,6 @@ def _retrieve_model(args):
reference_source=args.source,
reference_type=args.type,
model_type=args.model_type,
reference_api=args.api,
size_off=args.sizeoff,
timeout=args.timeout,
)
Expand Down
3 changes: 1 addition & 2 deletions mutalyzer_retriever/parsers/json_ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,7 @@ def parse(tark_result):

exon_features = _exons(tark_result["exons"])

# TODO: find examples of null or 2
# one translation per transcript, sometimes null or 2 (strand -1/1)
# one translation per transcript; null for non-coding, rarely 2 for different versions
translation_features = _translation(tark_result["translations"])

transcript_features = _transcript(tark_result, exon_features, translation_features)
Expand Down
15 changes: 6 additions & 9 deletions mutalyzer_retriever/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def _fetch_unknown_source(reference_id, reference_type, size_off=True, timeout=1
# Ensembl
try:
reference_content, reference_type = ensembl.fetch(
reference_id, reference_type, timeout
reference_id, reference_type, None, timeout
)
except (NameError, ConnectionError, ValueError) as e:
status["ensembl"]["errors"].append(e)
Expand All @@ -104,7 +104,6 @@ def retrieve_raw(
reference_id,
reference_source=None,
reference_type=None,
reference_api=None,
size_off=True,
timeout=1,
):
Expand All @@ -128,22 +127,20 @@ def retrieve_raw(
elif reference_source == "ncbi":
reference_content, reference_type = ncbi.fetch(
reference_id, reference_type, timeout
)
elif reference_source == "ensembl":
reference_content, reference_type = ensembl.fetch(reference_id,reference_type,reference_api)
)
elif reference_source in ["ensembl","ensembl_tark", "ensembl_rest"]:
reference_content, reference_type = ensembl.fetch(reference_id,reference_type,reference_source,timeout)
elif reference_source == "lrg":
reference_content = lrg.fetch_lrg(reference_id, timeout=timeout)
if reference_content:
reference_type = "lrg"
print(reference_content)
return reference_content, reference_type, reference_source


def retrieve_model(
reference_id,
reference_source=None,
reference_type=None,
reference_api=None,
size_off=True,
model_type="all",
timeout=1,
Expand All @@ -163,7 +160,7 @@ def retrieve_model(


reference_content, reference_type, reference_source = retrieve_raw(
reference_id, reference_source, reference_type, reference_api, size_off, timeout=timeout
reference_id, reference_source, reference_type, size_off, timeout=timeout
)

if reference_type == "lrg":
Expand Down Expand Up @@ -199,7 +196,7 @@ def retrieve_model(
}

elif reference_type == "json":
if reference_source == "ensembl":
if "ensembl" in reference_source:
return parser.parse(reference_content, "json")


Expand Down
129 changes: 53 additions & 76 deletions mutalyzer_retriever/sources/ensembl.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,10 @@
import json

import requests

from ..configuration import settings
from ..request import Http400, RequestErrors, request
from ..util import f_e
import pprint


def fetch_json(feature_id, api_base, timeout=1):
    """Look up ``feature_id`` through the ``lookup/id`` endpoint of ``api_base``.

    The query asks for the gene, transcript and cds features with
    ``expand`` set to 1 and sends a JSON content-type header.

    Returns whatever the project's ``request`` helper returns on success.

    Raises:
        ConnectionError: when ``request`` raises ``RequestErrors``.
        NameError: when an ``Http400`` response body reports that the
            ID was not found.
        Http400: re-raised unchanged for any other 400 response.
    """
    lookup_url = f"{api_base}/lookup/id/{feature_id}"
    query = {"feature": ["gene", "transcript", "cds"], "expand": 1}
    json_headers = {"Content-Type": "application/json"}
    try:
        return request(lookup_url, query, json_headers, timeout=timeout)
    except RequestErrors as e:
        raise ConnectionError(f"(json) {str(e)}")
    except Http400 as e:
        payload = e.response.json()
        # Only the exact "not found" message is translated to NameError.
        if payload and payload.get("error") == "ID '{}' not found".format(
            feature_id
        ):
            raise NameError(f"(json) {str(e)}")
        raise e


def fetch_fasta(feature_id, api_base, timeout=1):
Expand Down Expand Up @@ -68,10 +49,10 @@ def fetch_gff3(feature_id, api_base, timeout=1):
return response


def _get_tark_versions(reference_id, api_base, timeout=4):
def _get_tark_versions(reference_id, api_base, timeout=1):
endpoint = "transcript"
params = {"stable_id": reference_id}
tark_req = json.loads(request(url=f"{api_base}/{endpoint}", params=params))
tark_req = json.loads(request(url=f"{api_base}/{endpoint}", params=params, timeout=timeout))
tark_versions_38 = []
tark_versions_37 = []
if tark_req["results"]:
Expand All @@ -80,9 +61,10 @@ def _get_tark_versions(reference_id, api_base, timeout=4):
tark_versions_37.append(int(r["stable_id_version"]))
elif r["assembly"] == "GRCh38":
tark_versions_38.append(int(r["stable_id_version"]))

return tark_versions_38, tark_versions_37


def _get_most_recent_version(reference_id, api_base, timeout=1):
    """Return the ``version`` field reported for ``reference_id`` as an int.

    The lookup is delegated to ``_get_reference_information`` using the
    given ``api_base`` and ``timeout``.
    """
    reference_info = _get_reference_information(reference_id, api_base, timeout)
    reported_version = reference_info["version"]
    return int(reported_version)

Expand All @@ -109,70 +91,65 @@ def _get_id_and_version(reference_id):
return r_id, r_version


def _in_grch37(r_id, r_version, r_info, timeout):
    """Tell whether ``r_id`` at ``r_version`` is the version served by the GRCh37 API.

    The GRCh37 endpoint is only queried when ``r_info`` describes a
    ``homo_sapiens`` reference whose reported version is newer than
    ``r_version``; in every other case the answer is ``False``.
    """
    grch37_api = settings.get("ENSEMBL_API_GRCH37")
    # The version comparison is evaluated only for human references.
    candidate = (
        r_info["species"] == "homo_sapiens"
        and int(r_info["version"]) > r_version
    )
    if not candidate:
        return False
    grch37_version = _get_most_recent_version(r_id, grch37_api, timeout)
    return bool(grch37_version and grch37_version == r_version)

def fetch_json(reference_id, reference_version, api_base, assembly="GRCh38", timeout=1):
    """Query the ``transcript`` endpoint of ``api_base`` and return the decoded JSON.

    The request filters on stable id, assembly name and stable id version,
    and asks for translations, genes and exons to be expanded.
    ``timeout`` is forwarded to the HTTP call.
    """
    query = {
        "stable_id": reference_id,
        "assembly_name": assembly,
        "stable_id_version": reference_version,
        "expand": "translations, genes, exons",
    }
    transcript_url = f"{api_base}/transcript"
    response = requests.request(
        "get", transcript_url, params=query, timeout=timeout
    )
    return response.json()


def get_rest_api_base(r_id, r_version):
    """Choose the Ensembl REST base URL and assembly name for ``r_id``.

    Returns a ``(api_base, assembly)`` tuple: the ``ENSEMBL_API`` setting
    with ``"GRCh38"`` when no version is requested or the requested version
    equals the one reported by that API; otherwise the
    ``ENSEMBL_API_GRCH37`` setting with ``"GRCh37"`` when the requested
    version equals the one reported there.

    Raises:
        ValueError: when the requested version matches neither API.
    """
    grch38_api = settings.get("ENSEMBL_API")
    grch38_version = _get_most_recent_version(r_id, grch38_api)
    if r_version is None or r_version == grch38_version:
        return grch38_api, "GRCh38"

    # The GRCh37 API is consulted only after the GRCh38 check fails.
    grch37_api = settings.get("ENSEMBL_API_GRCH37")
    if r_version == _get_most_recent_version(r_id, grch37_api):
        return grch37_api, "GRCh37"

    raise ValueError(f"Cannot fetch {r_id}.{r_version} from Ensembl REST")

def fetch_tark(reference_id, reference_version, api_base, assembly="GRCh38", timeout=None):
    """Query the ``transcript`` endpoint of ``api_base`` and return the decoded JSON.

    Args:
        reference_id: stable id sent as the ``stable_id`` query parameter.
        reference_version: sent as ``stable_id_version``.
        api_base: base URL the ``transcript`` endpoint is appended to.
        assembly: sent as ``assembly_name``.
        timeout: forwarded to the HTTP call. The default ``None`` keeps the
            previous behaviour (no time limit); pass a number of seconds to
            bound the request.

    Returns:
        The JSON-decoded response body.
    """
    endpoint = "transcript"
    params = {
        "stable_id": reference_id,
        "assembly_name": assembly,
        "stable_id_version": reference_version,
        "expand": "translations, genes, exons",
    }
    req = requests.request(
        method="get", url=f"{api_base}/{endpoint}", params=params, timeout=timeout
    )
    return req.json()

def get_transcript_api_base(r_id, r_version, r_source):
    """Choose the API base URL and assembly name for a transcript id.

    When ``r_source`` is ``"ensembl_rest"`` the decision is delegated to
    ``get_rest_api_base``. Otherwise the versions listed by
    ``_get_tark_versions`` for the ``ENSEMBL_TARK_API`` setting decide the
    assembly.

    Returns:
        A ``(api_base, assembly)`` tuple.

    Raises:
        ValueError: when a version was requested that is listed for neither
            assembly. The message names the requested version, since this
            branch is only reachable when ``r_version`` is not ``None``.
    """
    if r_source == "ensembl_rest":
        return get_rest_api_base(r_id, r_version)

    tark_api = settings.get("ENSEMBL_TARK_API")
    tark_versions_38, tark_versions_37 = _get_tark_versions(r_id, tark_api)
    # No explicit version defaults to GRCh38.
    if r_version is None or r_version in tark_versions_38:
        return tark_api, "GRCh38"
    if r_version in tark_versions_37:
        return tark_api, "GRCh37"
    raise ValueError(f"Cannot fetch {r_id}.{r_version} from Ensembl Tark")


def get_api_base(r_id, r_version, transcript = False):
    """Choose the API base URL and assembly name for ``r_id`` at ``r_version``.

    Ids containing ``"ENST"`` are always handled as transcripts; other ids
    are handled as transcripts only when ``transcript`` is truthy.

    Non-transcripts are resolved against the REST APIs only (GRCh38 first,
    then GRCh37). Transcripts are checked in this order: REST GRCh38
    version, Tark GRCh38 versions (also used when no version is given),
    REST GRCh37 version, Tark GRCh37 versions.

    Returns:
        A ``(api_base, assembly)`` tuple.

    Raises:
        ValueError: when no API matches the requested version.
    """
    as_transcript = "ENST" in r_id or bool(transcript)
    rest_api_38 = settings.get("ENSEMBL_API")
    rest_version_38 = _get_most_recent_version(r_id, rest_api_38)

    if not as_transcript:
        if r_version in [None, rest_version_38]:
            return rest_api_38, "GRCh38"
        rest_api_37 = settings.get("ENSEMBL_API_GRCH37")
        if r_version == _get_most_recent_version(r_id, rest_api_37):
            return rest_api_37, "GRCh37"
    else:
        tark_api = settings.get("ENSEMBL_TARK_API")
        tark_versions_38, tark_versions_37 = _get_tark_versions(r_id, tark_api)
        if r_version == rest_version_38:
            return rest_api_38, "GRCh38"
        if r_version is None or r_version in tark_versions_38:
            return tark_api, "GRCh38"
        # The REST GRCh37 lookup is made only after both GRCh38 checks fail.
        rest_api_37 = settings.get("ENSEMBL_API_GRCH37")
        if r_version == _get_most_recent_version(r_id, rest_api_37):
            return rest_api_37, "GRCh37"
        if r_version in tark_versions_37:
            return tark_api, "GRCh37"

    raise ValueError(f"Cannot fetch {r_id} with version {r_version} from Ensembl")


def fetch(reference_id, reference_type=None, reference_api=None, timeout=20):
def fetch(reference_id, reference_type=None, reference_source=None, timeout=1):
r_id, r_version = _get_id_and_version(reference_id)
if r_id is None:
raise NameError
api_base, assembly = get_api_base(r_id, r_version, reference_api)
if reference_api in [None, "rest"]:
if reference_type in [None, "gff3"]:

if "ENST" in r_id:
api_base, assembly = get_transcript_api_base(r_id, r_version, reference_source)
else:
api_base, assembly = get_rest_api_base(r_id, r_version)

if reference_type is None:
try:
return fetch_gff3(r_id, api_base, timeout), "gff3"
elif reference_type == "fasta":
return fetch_fasta(r_id, api_base, timeout), "fasta"
if reference_api in [None, "tark"]:
if reference_type == "json" and api_base == settings.get("ENSEMBL_TARK_API"):
return fetch_tark(r_id, r_version, api_base,assembly), "json"
except ConnectionError:
return fetch_json(r_id, r_version, api_base, assembly, timeout), "json"
elif reference_type == "gff3":
return fetch_gff3(r_id, api_base, timeout), "gff3"
elif reference_type == "fasta":
return fetch_fasta(r_id, api_base, timeout), "fasta"
elif reference_type == "json":
if reference_source in [None, "ensembl_tark"]:
return fetch_json(r_id, r_version, api_base, assembly, timeout), "json"

elif reference_type == "genbank":
return None, "genbank"

raise ValueError(
f"Ensembl fetch does not support {reference_type} reference type from {api_base} for {reference_id}."
)

raise ValueError(f"{reference_source} fetch does not support {reference_type} reference type.")
12 changes: 6 additions & 6 deletions tests/commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ def patch_retriever(monkeypatch):
from .test_fetch import (
_fetch_gff3,
_fetch_json,
_fetch_tark,
_get_reference_information,
_get_tark_versions,
)
Expand All @@ -25,7 +24,6 @@ def patch_retriever(monkeypatch):
monkeypatch.setattr(
"mutalyzer_retriever.sources.ensembl._get_tark_versions", _get_tark_versions
)
monkeypatch.setattr("mutalyzer_retriever.sources.ensembl.fetch_tark", _fetch_tark)
monkeypatch.setattr("mutalyzer_retriever.sources.ensembl.fetch_json", _fetch_json)
monkeypatch.setattr("mutalyzer_retriever.retriever.retrieve_raw", _retrieve_raw)

Expand Down Expand Up @@ -58,7 +56,7 @@ def _retrieve_raw(
elif r_id.startswith("LRG_"):
return _get_content("data/" + r_id + ".lrg"), "lrg", "lrg"
elif r_type == "json":
return (json.loads(_get_content("data/" + r_id + ".tark_raw.json")),"json","ensembl")
return (json.loads(_get_content("data/" + r_id + ".tark_raw.json")),"json","ensembl_tark")
else:
return _get_content("data/" + r_id + ".gff3"), "gff3", "ncbi"

Expand Down Expand Up @@ -88,19 +86,21 @@ def _retrieve_raw(
"NR_023343.1",
]
},
"ensembl": {
"ensembl_rest": {
"gff3": [
"ENSG00000147889",
"ENST00000383925",
"ENST00000304494",
"ENSG00000198899",
],
]
},
"ensembl_tark":{
"json": [
"ENST00000383925.1",
"ENST00000383925",
"ENST00000304494",
"ENST00000304494.10",
],
]
},
"lrg": {"lrg": ["LRG_11", "LRG_417", "LRG_857"]},
}
19 changes: 4 additions & 15 deletions tests/test_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
}


def _fetch_tark(r_id, r_version, api_base, assembly):
def _fetch_json(r_id, r_version, api_base, assembly, timeout):
if api_base == TARK_API_BASE:
return _get_content(
f"data/{r_id}.{r_version}.tark_raw.model.json"
Expand All @@ -43,17 +43,6 @@ def _get_tark_versions(r_id, api_base, timeout=1):
)


def _fetch_json(r_id, api_base, timeout=1):
    """Test stand-in that returns stored REST JSON content for ``r_id``.

    The file is read with ``_get_content`` from ``data/`` and its name is
    built from the id and a version looked up in a module-level map.
    Returns ``None`` implicitly when ``api_base`` matches neither branch.
    """
    if api_base == API_BASE_GRCH37:
        version = API_BASE_GRCH37_MAP[r_id]["version"]
        return _get_content(f"data/{r_id}.{version}.rest_raw.model.json")
    # NOTE(review): this compares api_base with API_BASE_GRCH37_MAP and then
    # indexes API_BASE; the two names look swapped with a GRCh38 base/map
    # pair -- confirm against the module-level constants.
    if api_base == API_BASE_GRCH37_MAP:
        version = API_BASE[r_id]["version"]
        return _get_content(f"data/{r_id}.{version}.rest_raw.model.json")


def _fetch_gff3(feature_id, api_base, timeout=1):
if api_base == API_BASE_GRCH37:
return _get_content(
Expand Down Expand Up @@ -89,10 +78,10 @@ def test_ensembl_fetch_transcript_rest_38(r_id):
assert fetch(r_id)[0] == _get_content(f"data/{r_id}.gff3")


@pytest.mark.parametrize("r_id, r_type", [("ENST00000304494.5", "json")])
def test_ensembl_fetch_transcript_rest_37(r_id, r_type):
@pytest.mark.parametrize("r_id, r_type, r_source", [("ENST00000304494.5", "json", "ensembl_rest")])
def test_ensembl_fetch_transcript_rest_37(r_id, r_type, r_source):
with pytest.raises(ValueError):
assert fetch(r_id, r_type)[0] == None
fetch(r_id, r_type, r_source)


@pytest.mark.parametrize("r_id, r_type", [("ENST00000304494.7", "json")])
Expand Down
5 changes: 3 additions & 2 deletions tests/test_model_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def get_references_content(references):
path_gb = (
Path(Path(__file__).parent)
/ "data"
/ f"{r_id}.tark.model.{r_type}"
/ f"{r_id}.tark_raw.{r_type}"
)
r_content = json.loads(path_gb.open().read())
else:
Expand Down Expand Up @@ -57,7 +57,8 @@ def test_schema_validation(r_source, r_type, r_content, r_id, monkeypatch: pytes
reference_type=r_type,
reference_source=r_source,
)
if r_source == "lrg":

if r_source in ["ensembl_tark", "lrg"]:
assert validate(r_model["annotations"]) is None
else:
assert validate(r_model) is None

0 comments on commit b3c76c0

Please sign in to comment.