diff --git a/.gitignore b/.gitignore index fefaf5f..b26b624 100644 --- a/.gitignore +++ b/.gitignore @@ -899,3 +899,4 @@ FodyWeavers.xsd # End of https://www.toptal.com/developers/gitignore/api/macos,linux,windows,python,jupyternotebooks,jetbrains,pycharm,vim,emacs,visualstudiocode,visualstudio scratch/ +lexica/obo/cache diff --git a/lexica/obo/README.md b/lexica/obo/README.md new file mode 100644 index 0000000..42baa86 --- /dev/null +++ b/lexica/obo/README.md @@ -0,0 +1,62 @@ +# OBO Foundry Lexicon + +This contains all the terms from OBO Foundry ontologies +(minus Protein Ontology, which is stubborn and won't download). + +The following script can be adapted to check new ontologies against existing terms: + +```python +import json +import gilda +from urllib.request import urlretrieve + +# download the URL until https://github.com/gyorilab/gilda/pull/132 +# is accepted, then the URL can be used in gilda.Grounder directly +url = "https://github.com/biopragmatics/biolexica/raw/main/lexica/obo/terms.tsv.gz" +path = "terms.tsv.gz" +urlretrieve(url, path) + +grounder = gilda.Grounder(path) + +obo_prefix = ... +obo_uri_prefix = f"http://purl.obolibrary.org/obo/{obo_prefix}_" +path_to_obograph_json = ... 
+with open(path_to_obograph_json) as file: + data = json.load(file) + +safe = [] + +print("## Lexical matching returned results\n") +for graph in data['graphs']: + for node in sorted(graph['nodes'], key=lambda n: n['id']): + if node['type'] == "PROPERTY": + continue + uri = node['id'] + if not uri.startswith(obo_uri_prefix): + continue + + identifier = uri[len(obo_uri_prefix) :] + name = node['lbl'] + + results = [] + results.extend(grounder.ground(name)) + results.extend( + scored_match + for synonym in node.get("meta", {}).get("synonyms", []) + for scored_match in grounder.ground(synonym['val']) + ) + + if not results: + safe.append((identifier, name)) + else: + print(f'- `{obo_prefix}:{identifier}`', name) + for res in results: + curie = res.term.get_curie() + print(f' - [`{curie}`](https://bioregistry.io/{curie}) {res.term.entry_name} ({round(res.score, 3)})') + +print("\n## Lexical matching returned no results\n") +for identifier, name in safe: + print(f'- `{obo_prefix}:{identifier}`', name) +``` + +Inspired by https://gist.github.com/cthoyt/d26df3ec12f6a15f3157546c6ebee3a2. 
def main() -> None:
    """Assemble the combined OBO lexicon and write it to ``TERMS_OUTPUT_PATH``.

    Prefixes are taken from the Bioregistry: every resource that has an OBO
    preferred prefix, is not deprecated, is not flagged as having no terms of
    its own, and is not in the local skip list. For each prefix the terms are
    read from a per-prefix file under ``CACHE`` when one exists; otherwise
    they are generated with the ``bioontologies`` processor and written to
    that cache file. All terms are then dumped together into one file.
    """
    # NOTE(review): "pr" is presumably the Protein Ontology, which the
    # accompanying README says will not download -- confirm.
    skip = {"pr"}
    prefixes = sorted(
        resource.prefix
        for resource in bioregistry.resources()
        if resource.get_obo_preferred_prefix()
        and not resource.is_deprecated()
        and not resource.no_own_terms
        and resource.prefix not in skip
    )

    all_terms = []
    for prefix in tqdm(prefixes):
        # Build the file name directly rather than via ``with_suffix``:
        # ``with_suffix`` replaces everything after the last dot, so a prefix
        # that itself contains a dot would be truncated and could collide
        # with another prefix's cache file.
        path = CACHE.joinpath(f"{prefix}.tsv.gz")
        if path.is_file():
            # Cache hit: reuse the terms dumped by a previous run.
            all_terms.extend(load_entries_from_terms_file(path))
        else:
            local_terms = list(iter_terms_by_prefix(prefix, processor="bioontologies"))
            # Route log output through tqdm while dumping so it does not
            # break up the progress bar.
            with logging_redirect_tqdm():
                dump_terms(local_terms, path)
            all_terms.extend(local_terms)

    dump_terms(all_terms, TERMS_OUTPUT_PATH)
Input(BaseModel): +class Input(BaseModel): # type:ignore """An input towards lexicon assembly.""" processor: Processor @@ -59,7 +59,7 @@ class Configuration(BaseModel): ) -PREDEFINED = ["cell", "anatomy", "phenotype"] +PREDEFINED = ["cell", "anatomy", "phenotype", "obo"] URL_FMT = "https://github.com/biopragmatics/biolexica/raw/main/lexica/{key}/terms.tsv.gz"