Skip to content

Commit

Permalink
Update sources (#147)
Browse files: browse the repository at this point in the history
* Update sources

* Update pombase.py

* Update pombase.py

* Update expasy.py

* Update hgnc.py
  • Loading branch information
cthoyt authored Mar 18, 2023
1 parent a619342 commit a292753
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 18 deletions.
14 changes: 14 additions & 0 deletions src/pyobo/sources/expasy.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,20 @@ class ExpasyGetter(Obo):

bioversions_key = ontology = PREFIX
typedefs = [has_member, enables]
root_terms = [
Reference("eccode", "1"),
Reference("eccode", "2"),
Reference("eccode", "3"),
Reference("eccode", "4"),
Reference("eccode", "5"),
Reference("eccode", "6"),
Reference("eccode", "7"),
]
idspaces = {
"uniprot": "https://bioregistry.io/uniprot:",
"eccode": "https://bioregistry.io/eccode:",
"go": "http://purl.obolibrary.org/obo/GO_",
}

def iter_terms(self, force: bool = False) -> Iterable[Term]:
"""Iterate over terms in the ontology."""
Expand Down
42 changes: 32 additions & 10 deletions src/pyobo/sources/hgnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

"""Converter for HGNC."""

import itertools as itt
import json
import logging
import typing
Expand Down Expand Up @@ -145,7 +146,7 @@

IDSPACES = {
prefix: f"https://bioregistry.io/{prefix}:"
for prefix in [
for prefix in {
"rgd",
"mgi",
"eccode",
Expand All @@ -154,17 +155,34 @@
"uniprot",
"mirbase",
"snornabase",
"hgnc",
"hgnc.genegroup",
]
"debio",
"ensembl",
"NCBIGene",
"vega",
"ucsc",
"ena",
"ccds",
"omim",
"cosmic",
"merops",
"orphanet",
"pseudogene",
"lncipedia",
"refseq",
}
}
IDSPACES["NCBITaxon"] = "http://purl.obolibrary.org/obo/NCBITaxon_"
IDSPACES.update(
NCBITaxon="http://purl.obolibrary.org/obo/NCBITaxon_",
SO="http://purl.obolibrary.org/obo/SO_",
)


class HGNCGetter(Obo):
"""An ontology representation of HGNC's gene nomenclature."""

bioversions_key = ontology = PREFIX
dynamic_version = True
typedefs = [
from_species,
has_gene_product,
Expand All @@ -180,6 +198,7 @@ class HGNCGetter(Obo):
alias_name_type,
alias_symbol_type,
]
root_terms = [Reference("SO", so_id) for so_id in sorted(set(LOCUS_TYPE_TO_SO.values()))]

def iter_terms(self, force: bool = False) -> Iterable[Term]:
"""Iterate over terms in the ontology."""
Expand Down Expand Up @@ -207,6 +226,7 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
with open(path) as file:
entries = json.load(file)["response"]["docs"]

yield Term.from_triple("NCBITaxon", "9606", "Homo sapiens")
yield from sorted(
{
Term(reference=Reference.auto("SO", so_id))
Expand Down Expand Up @@ -242,14 +262,14 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
for uniprot_id in entry.pop("uniprot_ids", []):
term.append_relationship(
has_gene_product,
Reference.auto("uniprot", uniprot_id),
Reference("uniprot", uniprot_id),
)
for ec_code in entry.pop("enzyme_id", []):
if "-" in ec_code:
continue # only add concrete annotations
term.append_relationship(
gene_product_member_of,
Reference.auto("eccode", ec_code),
Reference("eccode", ec_code),
)
for rna_central_ids in entry.pop("rna_central_id", []):
for rna_central_id in rna_central_ids.split(","):
Expand All @@ -261,7 +281,7 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
if mirbase_id:
term.append_relationship(
transcribes_to,
Reference.auto(
Reference(
"mirbase",
mirbase_id,
),
Expand All @@ -279,7 +299,7 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
rgd_id = rgd_curie[len("RGD:") :]
term.append_relationship(
orthologous,
Reference.auto(prefix="rgd", identifier=rgd_id),
Reference(prefix="rgd", identifier=rgd_id),
)
for mgi_curie in entry.pop("mgd_id", []):
if not mgi_curie.startswith("MGI:"):
Expand All @@ -290,7 +310,7 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
continue
term.append_relationship(
orthologous,
Reference.auto(prefix="mgi", identifier=mgi_id),
Reference(prefix="mgi", identifier=mgi_id),
)

for xref_prefix, key in gene_xrefs:
Expand Down Expand Up @@ -321,7 +341,9 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
term.append_synonym(Synonym(name=alias_symbol, type=alias_symbol_type))
for alias_name in entry.pop("alias_name", []):
term.append_synonym(Synonym(name=alias_name, type=alias_name_type))
for previous_symbol in entry.pop("previous_symbol", []):
for previous_symbol in itt.chain(
entry.pop("previous_symbol", []), entry.pop("prev_symbol", [])
):
term.append_synonym(Synonym(name=previous_symbol, type=previous_symbol_type))
for previous_name in entry.pop("prev_name", []):
term.append_synonym(Synonym(name=previous_name, type=previous_name_type))
Expand Down
2 changes: 1 addition & 1 deletion src/pyobo/sources/mgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def get_terms(force: bool = False) -> Iterable[Term]:
mgi_to_ensemble_protein_ids[mgi_id].append(ensemble_protein_id)

for mgi_curie, name, definition in tqdm(
df[COLUMNS].values, total=len(df.index), desc=f"Mapping {PREFIX}"
df[COLUMNS].values, total=len(df.index), desc=f"Mapping {PREFIX}", unit_scale=True
):
identifier = mgi_curie[len("MGI:") :]
term = Term(
Expand Down
9 changes: 7 additions & 2 deletions src/pyobo/sources/mirbase_mature.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from typing import Iterable

import pandas as pd
from tqdm.auto import tqdm

from .mirbase_constants import get_mature_df
Expand Down Expand Up @@ -35,9 +36,13 @@ def get_obo(force: bool = False) -> Obo:
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
"""Get miRBase mature terms."""
df = get_mature_df(version, force=force)
for _, name, previous_name, mirbase_mature_id in tqdm(df.values, total=len(df.index)):
for _, name, previous_name, mirbase_mature_id in tqdm(
df.values, total=len(df.index), unit_scale=True
):
yield Term(
reference=Reference(prefix=PREFIX, identifier=mirbase_mature_id, name=name),
reference=Reference(
prefix=PREFIX, identifier=mirbase_mature_id, name=name if pd.notna(name) else None
),
synonyms=[
Synonym(name=previous_name),
],
Expand Down
12 changes: 8 additions & 4 deletions src/pyobo/sources/pombase.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import pyobo
from pyobo import Reference
from pyobo.struct import Obo, Synonym, Term, from_species, has_gene_product, orthologous
from pyobo.struct import Obo, Term, from_species, has_gene_product, orthologous
from pyobo.utils.path import ensure_df

__all__ = [
Expand Down Expand Up @@ -43,6 +43,8 @@ def get_obo(force: bool = False) -> Obo:

#: A mapping from PomBase gene types to Sequence Ontology (SO) term identifiers
POMBASE_TO_SO = {
# None: "0000704", # gene,
"gene_type": "0000704", # unannotated
"protein coding gene": "0001217",
"pseudogene": "0000336",
"tRNA gene": "0001272",
Expand Down Expand Up @@ -74,6 +76,8 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
for _, reference in sorted(so.items()):
yield Term(reference=reference)
for identifier, _, symbol, chromosome, name, uniprot_id, gtype, synonyms in tqdm(df.values):
if pd.isna(identifier):
continue
term = Term.from_triple(
prefix=PREFIX,
identifier=identifier,
Expand All @@ -84,12 +88,12 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
term.append_parent(so[gtype])
term.set_species(identifier="4896", name="Schizosaccharomyces pombe")
for hgnc_id in identifier_to_hgnc_ids.get(identifier, []):
term.append_relationship(orthologous, Reference.auto("hgnc", hgnc_id))
term.append_relationship(orthologous, Reference("hgnc", hgnc_id))
if uniprot_id and pd.notna(uniprot_id):
term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))
term.append_relationship(has_gene_product, Reference("uniprot", uniprot_id))
if synonyms and pd.notna(synonyms):
for synonym in synonyms.split(","):
term.append_synonym(Synonym(synonym))
term.append_synonym(synonym.strip())
yield term


Expand Down
5 changes: 4 additions & 1 deletion src/pyobo/struct/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,14 +453,17 @@ class Obo:
#: For super-sized datasets that shouldn't be read into memory
iter_only: ClassVar[bool] = False

#: Set to true for resources that are unversioned/very dynamic, like HGNC
#: Set to true for resources that are unversioned/very dynamic, like MGI
dynamic_version: ClassVar[bool] = False

#: Set to a static version for the resource (i.e., the resource is not itself versioned)
static_version: ClassVar[Optional[str]] = None

bioversions_key: ClassVar[Optional[str]] = None

#: Root terms to use for the ontology
root_terms: ClassVar[Optional[List[Reference]]] = None

#: The date the ontology was generated
date: Optional[datetime] = field(default_factory=datetime.today)

Expand Down

0 comments on commit a292753

Please sign in to comment.