Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix pfam, rhea, and mirbase importers #158

Merged
merged 2 commits into from
Sep 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions src/pyobo/sources/mirbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def get_obo(force: bool = False) -> Obo:
def get_terms(version: str, force: bool = False) -> List[Term]:
"""Parse miRNA data from filepath and convert it to dictionary."""
url = f"{BASE_URL}/{version}/miRNA.dat.gz"
definitions_path = ensure_path(PREFIX, url=url, version=version, force=force)
definitions_path = ensure_path(PREFIX, url=url, version=version, force=force, verify=False)

file_handle = (
gzip.open(definitions_path, "rt")
Expand All @@ -63,13 +63,21 @@ def get_terms(version: str, force: bool = False) -> List[Term]:

def _prepare_organisms(version: str, force: bool = False):
    """Download and index the miRBase organisms file.

    :param version: The miRBase data version, used to build the download URL.
    :param force: If true, re-download the file even if it is already cached.
    :return: A mapping from miRBase division code to a tuple of
        (NCBI taxonomy identifier, organism name).
    """
    url = f"{BASE_URL}/{version}/organisms.txt.gz"
    # The taxid column is forced to str so leading zeros / large IDs are not
    # mangled by pandas' integer inference.
    # NOTE(review): verify=False matches the other miRBase downloads in this
    # module — presumably a certificate issue on the miRBase host; confirm.
    df = ensure_df(
        PREFIX,
        url=url,
        sep="\t",
        dtype={"#NCBI-taxid": str},
        version=version,
        verify=False,
        force=force,
    )
    # Columns (by position): index, division, name, tree, taxonomy id.
    return {division: (taxonomy_id, name) for _, division, name, _tree, taxonomy_id in df.values}


def _prepare_aliases(version: str, force: bool = False) -> Mapping[str, List[str]]:
url = f"{BASE_URL}/{version}/aliases.txt.gz"
df = ensure_df(PREFIX, url=url, sep="\t", version=version)
df = ensure_df(PREFIX, url=url, sep="\t", version=version, verify=False, force=force)
return {
mirbase_id: [s.strip() for s in synonyms.split(";") if s and s.strip()]
for mirbase_id, synonyms in df.values
Expand Down
1 change: 1 addition & 0 deletions src/pyobo/sources/pfam.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def get_pfam_clan_df(version: str, force: bool = False) -> pd.DataFrame:
version=version,
dtype=str,
force=force,
backend="urllib",
)


Expand Down
50 changes: 23 additions & 27 deletions src/pyobo/sources/rhea.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,22 @@ def get_obo(force: bool = False) -> Obo:

def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
"""Iterate over terms in Rhea."""
url = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
graph = pystow.ensure_rdf(
"pyobo", "raw", PREFIX, version, url=url, force=force, parse_kwargs=dict(format="xml")
)
result = graph.query(
"""
PREFIX rh:<http://rdf.rhea-db.org/>
SELECT ?reaction ?reactionId ?reactionLabel WHERE {
?reaction rdfs:subClassOf rh:Reaction .
?reaction rh:id ?reactionId .
?reaction rdfs:label ?reactionLabel .
}
"""
)
names = {str(identifier): name for _, identifier, name in result}

terms = {}

directions = ensure_df(
Expand All @@ -50,10 +66,12 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
force=force,
)
for master, lr, rl, bi in directions.values:
terms[master] = Term(reference=Reference(PREFIX, master))
terms[lr] = Term(reference=Reference(PREFIX, lr))
terms[rl] = Term(reference=Reference(PREFIX, rl))
terms[bi] = Term(reference=Reference(PREFIX, bi))
terms[master] = Term(
reference=Reference(prefix=PREFIX, identifier=master, name=names.get(master))
)
terms[lr] = Term(reference=Reference(prefix=PREFIX, identifier=lr, name=names.get(lr)))
terms[rl] = Term(reference=Reference(prefix=PREFIX, identifier=rl, name=names.get(rl)))
terms[bi] = Term(reference=Reference(prefix=PREFIX, identifier=bi, name=names.get(bi)))

terms[master].append_relationship(has_left_to_right_reaction, terms[lr])
terms[master].append_relationship(has_right_to_left_reaction, terms[rl])
Expand Down Expand Up @@ -97,33 +115,11 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
xref_id,
)
continue
terms[rhea_id].append_xref(Reference(xref_prefix, xref_id))
terms[rhea_id].append_xref(Reference(prefix=xref_prefix, identifier=xref_id))

# TODO are EC codes equivalent?
# TODO uniprot enabled by (RO:0002333)
# TODO names?

url = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
graph = pystow.ensure_rdf(
"pyobo", "raw", PREFIX, version, url=url, force=force, parse_kwargs=dict(format="xml")
)
result = graph.query(
"""
PREFIX rh:<http://rdf.rhea-db.org/>
SELECT ?reaction ?reactionId ?reactionLabel WHERE {
?reaction rdfs:subClassOf rh:Reaction .
?reaction rh:id ?reactionId .
?reaction rdfs:label ?reactionLabel .
}
"""
)
for _, identifier, name in result:
identifier = str(identifier)
if identifier not in terms:
logger.debug("isolated element in rdf: rhea:%s ! %s", identifier, name)
continue
terms[identifier].reference.name = name

# TODO participants?

yield from terms.values()
Expand Down
2 changes: 1 addition & 1 deletion src/pyobo/struct/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def _ensure_ref(reference: ReferenceHint) -> Reference:
raise ValueError
return _rv
if isinstance(reference, tuple):
return Reference(*reference)
return Reference(prefix=reference[0], identifier=reference[1])
if isinstance(reference, Reference):
return reference
raise TypeError
Expand Down
4 changes: 3 additions & 1 deletion src/pyobo/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from typing import Dict, Iterable, List, Mapping, Optional, Set, Tuple, TypeVar, Union
from xml.etree.ElementTree import Element

import pandas as pd
from lxml import etree
from tqdm.auto import tqdm

Expand Down Expand Up @@ -108,7 +109,8 @@ def multisetdict(pairs: Iterable[Tuple[X, Y]]) -> Dict[X, Set[Y]]:
"""Accumulate a multisetdict from a list of pairs."""
rv = defaultdict(set)
for key, value in pairs:
rv[key].add(value)
if pd.notna(value):
rv[key].add(value)
return dict(rv)


Expand Down
29 changes: 27 additions & 2 deletions src/pyobo/utils/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

import logging
from pathlib import Path
from typing import Callable, Optional, Union
from typing import Any, Callable, Dict, Literal, Optional, Union

import pandas as pd
import requests_ftp
from pystow.utils import download, name_from_url, read_tarfile_csv

from .misc import cleanup_version
Expand All @@ -26,6 +27,8 @@

VersionHint = Union[None, str, Callable[[], str]]

requests_ftp.monkeypatch_session()


def prefix_directory_join(
prefix: str,
Expand Down Expand Up @@ -62,6 +65,8 @@ def ensure_path(
name: Optional[str] = None,
force: bool = False,
error_on_missing: bool = False,
backend: Literal["requests", "urllib"] = "urllib",
verify: bool = True,
) -> str:
"""Download a file if it doesn't exist."""
if name is None:
Expand All @@ -72,10 +77,19 @@ def ensure_path(
if not path.exists() and error_on_missing:
raise FileNotFoundError

kwargs: Dict[str, Any]
if verify:
kwargs = {"backend": backend}
else:
if backend != "requests":
logger.warning("using requests since verify=False")
kwargs = {"backend": "requests", "verify": False}

download(
url=url,
path=path,
force=force,
**kwargs,
)
return path.as_posix()

Expand All @@ -89,10 +103,21 @@ def ensure_df(
force: bool = False,
sep: str = "\t",
dtype=str,
verify: bool = True,
backend: Literal["requests", "urllib"] = "urllib",
**kwargs,
) -> pd.DataFrame:
"""Download a file and open as a dataframe."""
_path = ensure_path(prefix, *parts, url=url, version=version, name=name, force=force)
_path = ensure_path(
prefix,
*parts,
url=url,
version=version,
name=name,
force=force,
verify=verify,
backend=backend,
)
return pd.read_csv(_path, sep=sep, dtype=dtype, **kwargs)


Expand Down
Loading