Change API for getting article dataframes and add pretoken analysis #12

Merged 3 commits on Feb 5, 2024
Changes from all commits
7 changes: 4 additions & 3 deletions src/biolexica/literature/__init__.py
@@ -6,12 +6,13 @@
     annotate_abstracts_from_pubmeds,
     annotate_abstracts_from_search,
 )
-from .retrieve import get_pubmed_dataframe
-from .search import query_pubmed
+from .retrieve import get_article_dataframe_from_pubmeds
+from .search import get_article_dataframe_from_search, query_pubmed
 
 __all__ = [
     "query_pubmed",
-    "get_pubmed_dataframe",
+    "get_article_dataframe_from_pubmeds",
+    "get_article_dataframe_from_search",
     "AnnotatedArticle",
     "Annotation",
     "annotate_abstracts_from_pubmeds",
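For orientation, a minimal sketch of how the renamed public API chains together; the call pattern follows the signatures in this diff, and the search term and limit are illustrative:

from biolexica.literature import (
    get_article_dataframe_from_pubmeds,
    get_article_dataframe_from_search,
    query_pubmed,
)

# Two-step: search for identifiers, then fetch titles and abstracts.
pubmed_ids = query_pubmed("dementia")
df = get_article_dataframe_from_pubmeds(pubmed_ids)

# One-step equivalent, optionally capping the number of articles.
df = get_article_dataframe_from_search("dementia", limit=100)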
63 changes: 63 additions & 0 deletions src/biolexica/literature/analyze.py
@@ -8,10 +8,12 @@
 from curies import Reference
 
 from .annotate import AnnotatedArticle
+from ..api import GrounderHint, load_grounder
 
 __all__ = [
     "count_references",
     "count_cooccurrences",
+    "analyze_pretokens",
 ]


@@ -35,3 +37,64 @@
         for annotated_article in annotated_articles
         for pair in combinations(annotated_article.count_references(), 2)
     )
+
+
+def analyze_pretokens(
+    text: str, *, grounder: GrounderHint, min_length: int = 1, max_length: int = 4
+) -> t.Counter[str]:
+    """Take a histogram over tokens appearing before matches to identify more detailed terms for curation.
+
+    :param text: The text to analyze
+    :param grounder: The grounder
+    :param min_length: The minimum number of pre-tokens to keep in the histogram
+    :param max_length: The maximum number of pre-tokens to keep in the histogram
+    :returns: A counter of pre-tokens in the given length range
+
+    Here's an example where we look at recent literature about dementia and try to
+    identify if there are:
+
+    1. synonyms that could be curated in one of the upstream first-party lexical
+       resources or third-party lexical resources like Biosynonyms
+    2. terms that can be added to upstream ontologies, databases, etc.
+
+    .. code-block:: python
+
+        from collections import Counter
+        from tabulate import tabulate
+        import biolexica
+        from biolexica.literature import get_article_dataframe_from_search
+        from biolexica.literature.analyze import analyze_pretokens
+
+        grounder = biolexica.load_grounder("phenotype")
+        df = get_article_dataframe_from_search("dementia")
+        counter = Counter()
+        for abstract in df["abstract"]:
+            counter.update(analyze_pretokens(abstract, grounder=grounder))
+
+        table = tabulate(counter.most_common(), headers=["phrase", "count"], tablefmt="github")
+        print(table)
+    """
+    from gilda.ner import stop_words
+
+    grounder = load_grounder(grounder)
+    text = text.replace("\n", " ").replace("  ", " ")
+    rv: t.Counter[str] = Counter()
+    for annotation in grounder.annotate(text):
+        parts = text[: annotation.start].split()
+        for i in range(min_length, max_length + 1):
+            reduced_parts = parts[-i:]
+            if len(reduced_parts) < min_length:
+                continue
+            if reduced_parts[0].lower() in stop_words:
+                # it doesn't make sense for a named entity to start
+                # with one of these words, like "of"
+                continue
+            if reduced_parts[0].isnumeric():
+                continue
+            if any(part.strip().endswith(".") for part in reduced_parts):
+                # if any of the parts ends with a dot, this run of
+                # pre-words crosses into the previous sentence, so skip
+                continue
+            pre = " ".join(reduced_parts)
+            rv[pre] += 1
+    return rv
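The context lines above show the tail of count_cooccurrences. A short sketch of how annotation feeds these counters; annotate_abstracts_from_search's parameters are assumed by analogy with annotate_abstracts_from_pubmeds elsewhere in this PR, and the search term is illustrative:

from biolexica.literature import annotate_abstracts_from_search
from biolexica.literature.analyze import count_cooccurrences

# Annotate matching abstracts, then count how often pairs of
# grounded references co-occur in the same article.
articles = annotate_abstracts_from_search("dementia", grounder="phenotype")
pair_counts = count_cooccurrences(articles)
for (ref_a, ref_b), count in pair_counts.most_common(5):
    print(ref_a, ref_b, count)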
55 changes: 19 additions & 36 deletions src/biolexica/literature/annotate.py
@@ -3,18 +3,16 @@
 from __future__ import annotations
 
 import logging
-import time
 import typing as t
 from collections import Counter
 from typing import List, Optional, Union
 
 from curies import Reference
-from more_itertools import batched
 from pydantic import BaseModel
 from tqdm.auto import tqdm
 
 from biolexica.api import Annotation, GrounderHint, load_grounder
-from biolexica.literature.retrieve import get_pubmed_dataframe
+from biolexica.literature.retrieve import _iter_dataframes_from_pubmeds
 from biolexica.literature.search import query_pubmed
 
 __all__ = [
@@ -63,48 +61,33 @@ def annotate_abstracts_from_pubmeds(
     grounder: GrounderHint,
     *,
     use_indra_db: bool = True,
-    batch_size: int = 20_000,
+    batch_size: Optional[int] = None,
     show_progress: bool = True,
 ) -> List[AnnotatedArticle]:
     """Annotate the given articles using the given Gilda grounder."""
-    n_pmids = len(pubmed_ids)
-
-    rv: List[AnnotatedArticle] = []
-
     grounder = load_grounder(grounder)
-
-    outer_it = tqdm(
-        batched(pubmed_ids, batch_size),
-        total=1 + n_pmids // batch_size,
-        unit="batch",
-        desc="Annotating articles",
-        disable=not show_progress,
+    df_iterator = _iter_dataframes_from_pubmeds(
+        pubmed_ids=pubmed_ids,
+        batch_size=batch_size,
+        use_indra_db=use_indra_db,
+        show_progress=show_progress,
     )
-    for i, pubmed_batch in enumerate(outer_it, start=1):
-        t = time.time()
-        pubmed_batch = list(pubmed_batch)
-        articles_df = get_pubmed_dataframe(pubmed_batch, use_indra_db=use_indra_db).reset_index()
-        n_retrieved = len(articles_df.index)
-        tqdm.write(
-            f"[batch {i}] Got {n_retrieved:,} articles "
-            f"({n_retrieved/len(pubmed_batch):.1%}) in {time.time() - t:.2f} seconds"
+    rv: List[AnnotatedArticle] = [
+        AnnotatedArticle(
+            pubmed=pubmed,
+            title=title,
+            abstract=abstract,
+            annotations=grounder.annotate(abstract),
         )
-        for pmid, title, abstract in tqdm(
-            articles_df.values,
+        for i, df in enumerate(df_iterator, start=1)
+        for pubmed, title, abstract in tqdm(
+            df.itertuples(),
             desc=f"Annotating batch {i}",
             unit_scale=True,
             unit="article",
-            total=n_retrieved,
+            total=len(df.index),
             leave=False,
             disable=not show_progress,
-        ):
-            rv.append(
-                AnnotatedArticle(
-                    pubmed=pmid,
-                    title=title,
-                    abstract=abstract,
-                    annotations=grounder.annotate(abstract),
-                )
-            )
-
+        )
+    ]
     return rv
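The practical upshot of this refactor: batching, retrieval, and timing move into _iter_dataframes_from_pubmeds in retrieve.py, and annotation becomes a flat comprehension over the yielded dataframes. A minimal call sketch with hypothetical PubMed identifiers; batch_size=None now defers to the retrieval layer's 20,000-identifier default:

from biolexica.literature import annotate_abstracts_from_pubmeds

articles = annotate_abstracts_from_pubmeds(
    ["12345678", "23456789"],  # hypothetical PubMed identifiers
    grounder="phenotype",
    batch_size=None,  # was 20_000 here; None defers to the default downstream
)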
65 changes: 60 additions & 5 deletions src/biolexica/literature/retrieve.py
@@ -3,14 +3,16 @@
 from __future__ import annotations
 
 import logging
-from typing import Dict, Iterable, List, Union
+import time
+from typing import Dict, Iterable, List, Optional, Union
 
 import pandas as pd
+from more_itertools import batched
 from tqdm.asyncio import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
 
 __all__ = [
-    "get_pubmed_dataframe",
+    "get_article_dataframe_from_pubmeds",
     "PUBMED_DATAFRAME_COLUMNS",
     "clean_df",
 ]
@@ -21,10 +23,27 @@
 PUBMED_DATAFRAME_COLUMNS = ["pubmed", "title", "abstract"]
 
 
-def get_pubmed_dataframe(
-    pubmed_ids: Iterable[Union[str, int]], *, use_indra_db: bool = True, db=None
+def get_article_dataframe_from_pubmeds(
+    pubmed_ids: Iterable[Union[str, int]],
+    *,
+    use_indra_db: bool = True,
+    db=None,
+    batch_size: Optional[int] = None,
+    show_progress: bool = True,
 ) -> pd.DataFrame:
     """Get a dataframe indexed by PubMed identifier (str) with title and abstract columns."""
+    return pd.concat(
+        _iter_dataframes_from_pubmeds(
+            pubmed_ids=pubmed_ids,
+            use_indra_db=use_indra_db,
+            db=db,
+            batch_size=batch_size,
+            show_progress=show_progress,
+        )
+    )
+
+
+def _get_batch(pubmed_ids: Iterable[Union[str, int]], *, use_indra_db: bool = True, db=None):
     if use_indra_db:
         try:
             return _from_indra_db(pubmed_ids, db=db)
@@ -36,6 +55,41 @@
     return _from_api(pubmed_ids)
 
 
+def _iter_dataframes_from_pubmeds(
+    pubmed_ids: Iterable[Union[str, int]],
+    *,
+    use_indra_db: bool = True,
+    db=None,
+    batch_size: Optional[int] = None,
+    show_progress: bool = True,
+) -> Iterable[pd.DataFrame]:
+    """Iterate over dataframes of titles/abstracts for the given PubMed identifiers, fetched in batches."""
+    if batch_size is None:
+        batch_size = 20_000
+
+    pubmed_ids = _clean_pubmeds(pubmed_ids)
+    if len(pubmed_ids) < batch_size:
+        # only a single batch, so the outer progress bar isn't needed
+        show_progress = False
+    outer_it = tqdm(
+        batched(pubmed_ids, batch_size),
+        total=1 + len(pubmed_ids) // batch_size,
+        unit="batch",
+        desc="Getting articles",
+        disable=not show_progress,
+    )
+    for i, pubmed_batch in enumerate(outer_it, start=1):
+        pubmed_batch = list(pubmed_batch)
+        t = time.time()
+        df = _get_batch(pubmed_batch, use_indra_db=use_indra_db, db=db)
+        n_retrieved = len(df.index)
+        outer_it.write(
+            f"[batch {i}] Got {n_retrieved:,} articles "
+            f"({n_retrieved/len(pubmed_batch):.1%}) in {time.time() - t:.2f} seconds"
+        )
+        yield df
+
+
 def _clean_pubmeds(pubmeds: Iterable[Union[str, int]]) -> List[str]:
     return sorted(map(str, pubmeds), key=int)
 
@@ -58,7 +112,8 @@
             desc="Getting PubMed titles/abstracts",
         )
     ]
-    df = pd.DataFrame(rows, columns=PUBMED_DATAFRAME_COLUMNS).set_index("pubmed")
+    df = pd.DataFrame(rows, columns=PUBMED_DATAFRAME_COLUMNS)
+    df = df.set_index("pubmed")
     df = clean_df(df)
     return df
 
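A usage sketch of the new batched retrieval entrypoint, with hypothetical identifiers; use_indra_db=False forces the PubMed API fallback that _get_batch implements:

from biolexica.literature import get_article_dataframe_from_pubmeds

df = get_article_dataframe_from_pubmeds(
    ["12345678", "23456789"],  # hypothetical PubMed identifiers
    use_indra_db=False,  # skip the INDRA database and hit the PubMed API directly
    batch_size=100,  # smaller batches mean more frequent progress reports
)
# Indexed by PubMed identifier (str), with title and abstract columns.
print(df.head())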
33 changes: 32 additions & 1 deletion src/biolexica/literature/search.py
@@ -5,15 +5,46 @@
 import subprocess
 from typing import Any, List, Literal, Optional
 
+import pandas as pd
+
+from .retrieve import get_article_dataframe_from_pubmeds
+
 __all__ = [
+    "get_article_dataframe_from_search",
     "query_pubmed",
 ]
 
+Method = Literal["api", "esearch"]
+
+
+def get_article_dataframe_from_search(
+    search_term: str,
+    *,
+    method: Optional[Method] = None,
+    use_indra_db: bool = True,
+    db=None,
+    batch_size: Optional[int] = None,
+    show_progress: bool = True,
+    limit: Optional[int] = None,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Query PubMed for article identifiers based on a given search and get a dataframe."""
+    pubmed_ids = query_pubmed(search_term, method=method, **kwargs)
+    if limit:
+        pubmed_ids = pubmed_ids[:limit]
+    return get_article_dataframe_from_pubmeds(
+        pubmed_ids,
+        use_indra_db=use_indra_db,
+        db=db,
+        batch_size=batch_size,
+        show_progress=show_progress,
+    )
+
 
 def query_pubmed(
     search_term: str,
     *,
-    method: Optional[Literal["api", "esearch"]] = None,
+    method: Optional[Method] = None,
     **kwargs: Any,
 ) -> List[str]:
     """Query PubMed for article identifiers based on a given search."""
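Finally, the one-call entrypoint added here combines search, optional truncation, and retrieval; the search term and settings are illustrative:

from biolexica.literature import get_article_dataframe_from_search

df = get_article_dataframe_from_search(
    "dementia",
    method="esearch",  # or "api"; None lets query_pubmed pick its default
    limit=500,  # truncate the identifier list before retrieval
)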