diff --git a/src/biolexica/literature/__init__.py b/src/biolexica/literature/__init__.py
index 0ecae59..7328cfb 100644
--- a/src/biolexica/literature/__init__.py
+++ b/src/biolexica/literature/__init__.py
@@ -6,12 +6,13 @@
     annotate_abstracts_from_pubmeds,
     annotate_abstracts_from_search,
 )
-from .retrieve import get_pubmed_dataframe
-from .search import query_pubmed
+from .retrieve import get_article_dataframe_from_pubmeds
+from .search import get_article_dataframe_from_search, query_pubmed

 __all__ = [
     "query_pubmed",
-    "get_pubmed_dataframe",
+    "get_article_dataframe_from_pubmeds",
+    "get_article_dataframe_from_search",
     "AnnotatedArticle",
     "Annotation",
     "annotate_abstracts_from_pubmeds",
diff --git a/src/biolexica/literature/analyze.py b/src/biolexica/literature/analyze.py
index 89141e8..e2846d4 100644
--- a/src/biolexica/literature/analyze.py
+++ b/src/biolexica/literature/analyze.py
@@ -8,10 +8,12 @@
 from curies import Reference

 from .annotate import AnnotatedArticle
+from ..api import GrounderHint, load_grounder

 __all__ = [
     "count_references",
     "count_cooccurrences",
+    "analyze_pretokens",
 ]


@@ -35,3 +37,64 @@
         for annotated_article in annotated_articles
         for pair in combinations(annotated_article.count_references(), 2)
     )
+
+
+def analyze_pretokens(
+    text: str, *, grounder: GrounderHint, min_length: int = 1, max_length: int = 4
+) -> t.Counter[str]:
+    """Take a histogram over tokens appearing before matches to identify more detailed terms for curation.
+
+    :param text: The text to analyze
+    :param grounder: The grounder
+    :param min_length: The minimum number of pre-tokens to count in the histogram
+    :param max_length: The maximum number of pre-tokens to count in the histogram
+    :returns: A counter of pre-token phrases in the given length range
+
+    Here's an example where we look at recent literature about dementia and try
+    to identify:
+
+    1. synonyms that could be curated in one of the upstream first-party lexical resources
+       or third-party lexical resources like Biosynonyms
+    2. terms that can be added to upstream ontologies, databases, etc.
+
+    .. code-block:: python
+
+        from collections import Counter
+        from tabulate import tabulate
+        import biolexica
+        from biolexica.literature import get_article_dataframe_from_search
+        from biolexica.literature.analyze import analyze_pretokens
+
+        grounder = biolexica.load_grounder("phenotype")
+        df = get_article_dataframe_from_search("dementia")
+        counter = Counter()
+        for abstract in df["abstract"]:
+            counter.update(analyze_pretokens(abstract, grounder=grounder))
+
+        table = tabulate(counter.most_common(), headers=["phrase", "count"], tablefmt="github")
+        print(table)
+    """
+    from gilda.ner import stop_words
+
+    grounder = load_grounder(grounder)
+    text = text.replace("\n", " ").replace("  ", " ")
+    rv: t.Counter[str] = Counter()
+    for annotation in grounder.annotate(text):
+        parts = text[: annotation.start].split()
+        for i in range(min_length, max_length + 1):
+            reduced_parts = parts[-i:]
+            if len(reduced_parts) < i:  # ran out of preceding text; avoid recounting shorter slices
+                continue
+            if reduced_parts[0].lower() in stop_words:
+                # doesn't make sense for a named entity to start
+                # with one of these words, like "of"
+                continue
+            if reduced_parts[0].isnumeric():
+                continue
+            if any(part.strip().endswith(".") for part in reduced_parts):
+                # If any of the parts ends with a dot, it means that this
+                # set of pre-words goes into the previous sentence, so skip
+                continue
+            pre = " ".join(reduced_parts)
+            rv[pre] += 1
+    return rv
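For quick testing, the new helper also works on a single string rather than a whole search result. A minimal sketch, assuming the bundled "phenotype" grounder preset and an illustrative sentence:

    import biolexica
    from biolexica.literature.analyze import analyze_pretokens

    grounder = biolexica.load_grounder("phenotype")
    text = "Patients with early-onset Alzheimer's disease showed rapid memory loss."
    # histogram of the 1-4 token phrases that immediately precede each grounded match
    for phrase, count in analyze_pretokens(text, grounder=grounder).most_common():
        print(phrase, count)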
diff --git a/src/biolexica/literature/annotate.py b/src/biolexica/literature/annotate.py
index 7c492d7..5d1d57d 100644
--- a/src/biolexica/literature/annotate.py
+++ b/src/biolexica/literature/annotate.py
@@ -3,18 +3,16 @@
 from __future__ import annotations

 import logging
-import time
 import typing as t
 from collections import Counter
 from typing import List, Optional, Union

 from curies import Reference
-from more_itertools import batched
 from pydantic import BaseModel
 from tqdm.auto import tqdm

 from biolexica.api import Annotation, GrounderHint, load_grounder
-from biolexica.literature.retrieve import get_pubmed_dataframe
+from biolexica.literature.retrieve import _iter_dataframes_from_pubmeds
 from biolexica.literature.search import query_pubmed

 __all__ = [
@@ -63,48 +61,33 @@ def annotate_abstracts_from_pubmeds(
     grounder: GrounderHint,
     *,
     use_indra_db: bool = True,
-    batch_size: int = 20_000,
+    batch_size: Optional[int] = None,
     show_progress: bool = True,
 ) -> List[AnnotatedArticle]:
     """Annotate the given articles using the given Gilda grounder."""
-    n_pmids = len(pubmed_ids)
-
-    rv: List[AnnotatedArticle] = []
-
     grounder = load_grounder(grounder)
-
-    outer_it = tqdm(
-        batched(pubmed_ids, batch_size),
-        total=1 + n_pmids // batch_size,
-        unit="batch",
-        desc="Annotating articles",
-        disable=not show_progress,
+    df_iterator = _iter_dataframes_from_pubmeds(
+        pubmed_ids=pubmed_ids,
+        batch_size=batch_size,
+        use_indra_db=use_indra_db,
+        show_progress=show_progress,
     )
-    for i, pubmed_batch in enumerate(outer_it, start=1):
-        t = time.time()
-        pubmed_batch = list(pubmed_batch)
-        articles_df = get_pubmed_dataframe(pubmed_batch, use_indra_db=use_indra_db).reset_index()
-        n_retrieved = len(articles_df.index)
-        tqdm.write(
-            f"[batch {i}] Got {n_retrieved:,} articles "
-            f"({n_retrieved/len(pubmed_batch):.1%}) in {time.time() - t:.2f} seconds"
+    rv: List[AnnotatedArticle] = [
+        AnnotatedArticle(
+            pubmed=pubmed,
+            title=title,
+            abstract=abstract,
+            annotations=grounder.annotate(abstract),
         )
-        for pmid, title, abstract in tqdm(
-            articles_df.values,
+        for i, df in enumerate(df_iterator, start=1)
+        for pubmed, title, abstract in tqdm(
+            df.itertuples(),
             desc=f"Annotating batch {i}",
             unit_scale=True,
             unit="article",
-            total=n_retrieved,
+            total=len(df.index),
             leave=False,
             disable=not show_progress,
-        ):
-            rv.append(
-                AnnotatedArticle(
-                    pubmed=pmid,
-                    title=title,
-                    abstract=abstract,
-                    annotations=grounder.annotate(abstract),
-                )
-            )
-
+        )
+    ]
     return rv
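A minimal sketch of the refactored entrypoint, assuming the "phenotype" grounder preset and placeholder PubMed identifiers; passing batch_size=None now falls back to the 20,000-record default inside _iter_dataframes_from_pubmeds:

    from biolexica.literature import annotate_abstracts_from_pubmeds

    # the identifiers below are placeholders for real PubMed IDs
    articles = annotate_abstracts_from_pubmeds(
        ["12345678", "23456789"],
        grounder="phenotype",
        show_progress=False,
    )
    for article in articles:
        print(article.pubmed, article.title, len(article.annotations))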
diff --git a/src/biolexica/literature/retrieve.py b/src/biolexica/literature/retrieve.py
index e271e77..d6caf23 100644
--- a/src/biolexica/literature/retrieve.py
+++ b/src/biolexica/literature/retrieve.py
@@ -3,14 +3,16 @@
 from __future__ import annotations

 import logging
-from typing import Dict, Iterable, List, Union
+import time
+from typing import Dict, Iterable, List, Optional, Union

 import pandas as pd
+from more_itertools import batched
 from tqdm.asyncio import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm

 __all__ = [
-    "get_pubmed_dataframe",
+    "get_article_dataframe_from_pubmeds",
     "PUBMED_DATAFRAME_COLUMNS",
     "clean_df",
 ]
@@ -21,10 +23,27 @@
 PUBMED_DATAFRAME_COLUMNS = ["pubmed", "title", "abstract"]


-def get_pubmed_dataframe(
-    pubmed_ids: Iterable[Union[str, int]], *, use_indra_db: bool = True, db=None
+def get_article_dataframe_from_pubmeds(
+    pubmed_ids: Iterable[Union[str, int]],
+    *,
+    use_indra_db: bool = True,
+    db=None,
+    batch_size: Optional[int] = None,
+    show_progress: bool = True,
 ) -> pd.DataFrame:
     """Get a dataframe indexed by PubMed identifier (str) with title and abstract columns."""
+    return pd.concat(
+        _iter_dataframes_from_pubmeds(
+            pubmed_ids=pubmed_ids,
+            use_indra_db=use_indra_db,
+            db=db,
+            batch_size=batch_size,
+            show_progress=show_progress,
+        )
+    )
+
+
+def _get_batch(pubmed_ids: Iterable[Union[str, int]], *, use_indra_db: bool = True, db=None) -> pd.DataFrame:
     if use_indra_db:
         try:
             return _from_indra_db(pubmed_ids, db=db)
@@ -36,6 +55,41 @@
         return _from_api(pubmed_ids)


+def _iter_dataframes_from_pubmeds(
+    pubmed_ids: Iterable[Union[str, int]],
+    *,
+    use_indra_db: bool = True,
+    db=None,
+    batch_size: Optional[int] = None,
+    show_progress: bool = True,
+) -> Iterable[pd.DataFrame]:
+    """Iterate over dataframes of titles and abstracts for batches of the given PubMed identifiers."""
+    if batch_size is None:
+        batch_size = 20_000
+
+    pubmed_ids = _clean_pubmeds(pubmed_ids)
+    if len(pubmed_ids) <= batch_size:
+        # only a single batch, so a progress bar is unnecessary
+        show_progress = False
+    outer_it = tqdm(
+        batched(pubmed_ids, batch_size),
+        total=(len(pubmed_ids) + batch_size - 1) // batch_size,
+        unit="batch",
+        desc="Getting articles",
+        disable=not show_progress,
+    )
+    for i, pubmed_batch in enumerate(outer_it, start=1):
+        pubmed_batch = list(pubmed_batch)
+        t = time.time()
+        df = _get_batch(pubmed_batch, use_indra_db=use_indra_db, db=db)
+        n_retrieved = len(df.index)
+        outer_it.write(
+            f"[batch {i}] Got {n_retrieved:,} articles "
+            f"({n_retrieved/len(pubmed_batch):.1%}) in {time.time() - t:.2f} seconds"
+        )
+        yield df
+
+
 def _clean_pubmeds(pubmeds: Iterable[Union[str, int]]) -> List[str]:
     return sorted(map(str, pubmeds), key=int)

@@ -58,7 +112,8 @@ def _from_api(pmids: Iterable[Union[str, int]]) -> pd.DataFrame:
             desc="Getting PubMed titles/abstracts",
         )
     ]
-    df = pd.DataFrame(rows, columns=PUBMED_DATAFRAME_COLUMNS).set_index("pubmed")
+    df = pd.DataFrame(rows, columns=PUBMED_DATAFRAME_COLUMNS)
+    df = df.set_index("pubmed")
     df = clean_df(df)
     return df

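A sketch of batched retrieval under the new name, again with placeholder identifiers; setting use_indra_db=False forces the PubMed API path that _get_batch would otherwise only take when the INDRA database client is unavailable:

    from biolexica.literature import get_article_dataframe_from_pubmeds

    df = get_article_dataframe_from_pubmeds(
        ["12345678", "23456789"],
        use_indra_db=False,  # skip INDRA and hit the PubMed API directly
        batch_size=1_000,
    )
    print(df.loc["12345678", "title"])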
diff --git a/src/biolexica/literature/search.py b/src/biolexica/literature/search.py
index d8234fa..2127e41 100644
--- a/src/biolexica/literature/search.py
+++ b/src/biolexica/literature/search.py
@@ -5,15 +5,46 @@
 import subprocess
 from typing import Any, List, Literal, Optional

+import pandas as pd
+
+from .retrieve import get_article_dataframe_from_pubmeds
+
 __all__ = [
+    "get_article_dataframe_from_search",
     "query_pubmed",
 ]

+
+Method = Literal["api", "esearch"]
+
+
+def get_article_dataframe_from_search(
+    search_term: str,
+    *,
+    method: Optional[Method] = None,
+    use_indra_db: bool = True,
+    db=None,
+    batch_size: Optional[int] = None,
+    show_progress: bool = True,
+    limit: Optional[int] = None,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Search PubMed with the given term and return a dataframe of the matching articles' titles and abstracts."""
+    pubmed_ids = query_pubmed(search_term, method=method, **kwargs)
+    if limit:
+        pubmed_ids = pubmed_ids[:limit]
+    return get_article_dataframe_from_pubmeds(
+        pubmed_ids,
+        use_indra_db=use_indra_db,
+        db=db,
+        batch_size=batch_size,
+        show_progress=show_progress,
+    )
+

 def query_pubmed(
     search_term: str,
     *,
-    method: Optional[Literal["api", "esearch"]] = None,
+    method: Optional[Method] = None,
     **kwargs: Any,
 ) -> List[str]:
     """Query PubMed for article identifiers based on a given search."""
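The end-to-end search path, as a minimal sketch; the query string and limit are illustrative, and method="api" selects the API-backed search rather than the esearch route:

    from biolexica.literature import get_article_dataframe_from_search

    # cap the search at 100 PubMed identifiers before retrieval
    df = get_article_dataframe_from_search("dementia", method="api", limit=100)
    print(df.head())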