Change API for getting article dataframes and add pretoken analysis #12

Merged 3 commits on Feb 5, 2024
Changes from all commits
7 changes: 4 additions & 3 deletions src/biolexica/literature/__init__.py
@@ -6,12 +6,13 @@
     annotate_abstracts_from_pubmeds,
     annotate_abstracts_from_search,
 )
-from .retrieve import get_pubmed_dataframe
-from .search import query_pubmed
+from .retrieve import get_article_dataframe_from_pubmeds
+from .search import get_article_dataframe_from_search, query_pubmed
 
 __all__ = [
     "query_pubmed",
-    "get_pubmed_dataframe",
+    "get_article_dataframe_from_pubmeds",
+    "get_article_dataframe_from_search",
     "AnnotatedArticle",
     "Annotation",
     "annotate_abstracts_from_pubmeds",
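For orientation, a minimal sketch of how the renamed public API chains together; the call pattern follows the signatures in this diff, and the search term and limit are illustrative:

from biolexica.literature import (
    get_article_dataframe_from_pubmeds,
    get_article_dataframe_from_search,
    query_pubmed,
)

# Two-step: search for identifiers, then fetch titles and abstracts.
pubmed_ids = query_pubmed("dementia")
df = get_article_dataframe_from_pubmeds(pubmed_ids)

# One-step equivalent, optionally capping the number of articles.
df = get_article_dataframe_from_search("dementia", limit=100)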
63 changes: 63 additions & 0 deletions src/biolexica/literature/analyze.py
@@ -8,10 +8,12 @@
 from curies import Reference
 
 from .annotate import AnnotatedArticle
+from ..api import GrounderHint, load_grounder
 
 __all__ = [
     "count_references",
     "count_cooccurrences",
+    "analyze_pretokens",
 ]


@@ -35,3 +37,64 @@
         for annotated_article in annotated_articles
         for pair in combinations(annotated_article.count_references(), 2)
     )
+
+
+def analyze_pretokens(
+    text: str, *, grounder: GrounderHint, min_length: int = 1, max_length: int = 4
+) -> t.Counter[str]:
+    """Take a histogram over tokens appearing before matches to identify more detailed terms for curation.
+
+    :param text: The text to analyze
+    :param grounder: The grounder
+    :param min_length: The minimum number of pre-tokens to keep in the histogram
+    :param max_length: The maximum number of pre-tokens to keep in the histogram
+    :returns: A counter of pre-tokens in the given length range
+
+    Here's an example where we look at recent literature about dementia and try to
+    identify if there are:
+
+    1. synonyms that could be curated in one of the upstream first-party lexical
+       resources or third-party lexical resources like Biosynonyms
+    2. terms that can be added to upstream ontologies, databases, etc.
+
+    .. code-block:: python
+
+        from collections import Counter
+        from tabulate import tabulate
+        import biolexica
+        from biolexica.literature import get_article_dataframe_from_search
+        from biolexica.literature.analyze import analyze_pretokens
+
+        grounder = biolexica.load_grounder("phenotype")
+        df = get_article_dataframe_from_search("dementia")
+        counter = Counter()
+        for abstract in df["abstract"]:
+            counter.update(analyze_pretokens(abstract, grounder=grounder))
+
+        table = tabulate(counter.most_common(), headers=["phrase", "count"], tablefmt="github")
+        print(table)
+    """
+    from gilda.ner import stop_words
+
+    grounder = load_grounder(grounder)
+    text = text.replace("\n", " ").replace("  ", " ")
+    rv: t.Counter[str] = Counter()
+    for annotation in grounder.annotate(text):
+        parts = text[: annotation.start].split()
+        for i in range(min_length, max_length + 1):
+            reduced_parts = parts[-i:]
+            if len(reduced_parts) < min_length:
+                continue
+            if reduced_parts[0].lower() in stop_words:
+                # it doesn't make sense for a named entity to start
+                # with one of these words, like "of"
+                continue
+            if reduced_parts[0].isnumeric():
+                continue
+            if any(part.strip().endswith(".") for part in reduced_parts):
+                # if any of the parts ends with a dot, this run of
+                # pre-words crosses into the previous sentence, so skip
+                continue
+            pre = " ".join(reduced_parts)
+            rv[pre] += 1
+    return rv
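The context lines above show the tail of count_cooccurrences. A short sketch of how annotation feeds these counters; annotate_abstracts_from_search's parameters are assumed by analogy with annotate_abstracts_from_pubmeds elsewhere in this PR, and the search term is illustrative:

from biolexica.literature import annotate_abstracts_from_search
from biolexica.literature.analyze import count_cooccurrences

# Annotate matching abstracts, then count how often pairs of
# grounded references co-occur in the same article.
articles = annotate_abstracts_from_search("dementia", grounder="phenotype")
pair_counts = count_cooccurrences(articles)
for (ref_a, ref_b), count in pair_counts.most_common(5):
    print(ref_a, ref_b, count)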
55 changes: 19 additions & 36 deletions src/biolexica/literature/annotate.py
@@ -3,18 +3,16 @@
 from __future__ import annotations
 
 import logging
-import time
 import typing as t
 from collections import Counter
 from typing import List, Optional, Union
 
 from curies import Reference
-from more_itertools import batched
 from pydantic import BaseModel
 from tqdm.auto import tqdm
 
 from biolexica.api import Annotation, GrounderHint, load_grounder
-from biolexica.literature.retrieve import get_pubmed_dataframe
+from biolexica.literature.retrieve import _iter_dataframes_from_pubmeds
 from biolexica.literature.search import query_pubmed
 
 __all__ = [
@@ -63,48 +61,33 @@ def annotate_abstracts_from_pubmeds(
     grounder: GrounderHint,
     *,
     use_indra_db: bool = True,
-    batch_size: int = 20_000,
+    batch_size: Optional[int] = None,
     show_progress: bool = True,
 ) -> List[AnnotatedArticle]:
     """Annotate the given articles using the given Gilda grounder."""
-    n_pmids = len(pubmed_ids)
-
-    rv: List[AnnotatedArticle] = []
-
     grounder = load_grounder(grounder)
-
-    outer_it = tqdm(
-        batched(pubmed_ids, batch_size),
-        total=1 + n_pmids // batch_size,
-        unit="batch",
-        desc="Annotating articles",
-        disable=not show_progress,
+    df_iterator = _iter_dataframes_from_pubmeds(
+        pubmed_ids=pubmed_ids,
+        batch_size=batch_size,
+        use_indra_db=use_indra_db,
+        show_progress=show_progress,
     )
-    for i, pubmed_batch in enumerate(outer_it, start=1):
-        t = time.time()
-        pubmed_batch = list(pubmed_batch)
-        articles_df = get_pubmed_dataframe(pubmed_batch, use_indra_db=use_indra_db).reset_index()
-        n_retrieved = len(articles_df.index)
-        tqdm.write(
-            f"[batch {i}] Got {n_retrieved:,} articles "
-            f"({n_retrieved/len(pubmed_batch):.1%}) in {time.time() - t:.2f} seconds"
+    rv: List[AnnotatedArticle] = [
+        AnnotatedArticle(
+            pubmed=pubmed,
+            title=title,
+            abstract=abstract,
+            annotations=grounder.annotate(abstract),
         )
-        for pmid, title, abstract in tqdm(
-            articles_df.values,
+        for i, df in enumerate(df_iterator, start=1)
+        for pubmed, title, abstract in tqdm(
+            df.itertuples(),
             desc=f"Annotating batch {i}",
             unit_scale=True,
             unit="article",
-            total=n_retrieved,
+            total=len(df.index),
             leave=False,
             disable=not show_progress,
-        ):
-            rv.append(
-                AnnotatedArticle(
-                    pubmed=pmid,
-                    title=title,
-                    abstract=abstract,
-                    annotations=grounder.annotate(abstract),
-                )
-            )
-
+        )
+    ]
     return rv
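The practical upshot of this refactor: batching, retrieval, and timing move into _iter_dataframes_from_pubmeds in retrieve.py, and annotation becomes a flat comprehension over the yielded dataframes. A minimal call sketch with hypothetical PubMed identifiers; batch_size=None now defers to the retrieval layer's 20,000-identifier default:

from biolexica.literature import annotate_abstracts_from_pubmeds

articles = annotate_abstracts_from_pubmeds(
    ["12345678", "23456789"],  # hypothetical PubMed identifiers
    grounder="phenotype",
    batch_size=None,  # was 20_000 here; None defers to the default downstream
)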
65 changes: 60 additions & 5 deletions src/biolexica/literature/retrieve.py
@@ -3,14 +3,16 @@
 from __future__ import annotations
 
 import logging
-from typing import Dict, Iterable, List, Union
+import time
+from typing import Dict, Iterable, List, Optional, Union
 
 import pandas as pd
+from more_itertools import batched
 from tqdm.asyncio import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
 
 __all__ = [
-    "get_pubmed_dataframe",
+    "get_article_dataframe_from_pubmeds",
     "PUBMED_DATAFRAME_COLUMNS",
     "clean_df",
 ]
@@ -21,10 +23,27 @@
 PUBMED_DATAFRAME_COLUMNS = ["pubmed", "title", "abstract"]
 
 
-def get_pubmed_dataframe(
-    pubmed_ids: Iterable[Union[str, int]], *, use_indra_db: bool = True, db=None
+def get_article_dataframe_from_pubmeds(
+    pubmed_ids: Iterable[Union[str, int]],
+    *,
+    use_indra_db: bool = True,
+    db=None,
+    batch_size: Optional[int] = None,
+    show_progress: bool = True,
 ) -> pd.DataFrame:
     """Get a dataframe indexed by PubMed identifier (str) with title and abstract columns."""
+    return pd.concat(
+        _iter_dataframes_from_pubmeds(
+            pubmed_ids=pubmed_ids,
+            use_indra_db=use_indra_db,
+            db=db,
+            batch_size=batch_size,
+            show_progress=show_progress,
+        )
+    )
+
+
+def _get_batch(pubmed_ids: Iterable[Union[str, int]], *, use_indra_db: bool = True, db=None):
     if use_indra_db:
         try:
             return _from_indra_db(pubmed_ids, db=db)
@@ -36,6 +55,41 @@
     return _from_api(pubmed_ids)
 
 
+def _iter_dataframes_from_pubmeds(
+    pubmed_ids: Iterable[Union[str, int]],
+    *,
+    use_indra_db: bool = True,
+    db=None,
+    batch_size: Optional[int] = None,
+    show_progress: bool = True,
+) -> Iterable[pd.DataFrame]:
+    """Iterate over dataframes of titles/abstracts for the given PubMed identifiers, fetched in batches."""
+    if batch_size is None:
+        batch_size = 20_000
+
+    pubmed_ids = _clean_pubmeds(pubmed_ids)
+    if len(pubmed_ids) < batch_size:
+        # only a single batch, so the outer progress bar isn't needed
+        show_progress = False
+    outer_it = tqdm(
+        batched(pubmed_ids, batch_size),
+        total=1 + len(pubmed_ids) // batch_size,
+        unit="batch",
+        desc="Getting articles",
+        disable=not show_progress,
+    )
+    for i, pubmed_batch in enumerate(outer_it, start=1):
+        pubmed_batch = list(pubmed_batch)
+        t = time.time()
+        df = _get_batch(pubmed_batch, use_indra_db=use_indra_db, db=db)
+        n_retrieved = len(df.index)
+        outer_it.write(
+            f"[batch {i}] Got {n_retrieved:,} articles "
+            f"({n_retrieved/len(pubmed_batch):.1%}) in {time.time() - t:.2f} seconds"
+        )
+        yield df
+
+
 def _clean_pubmeds(pubmeds: Iterable[Union[str, int]]) -> List[str]:
     return sorted(map(str, pubmeds), key=int)
 
@@ -58,7 +112,8 @@
             desc="Getting PubMed titles/abstracts",
         )
     ]
-    df = pd.DataFrame(rows, columns=PUBMED_DATAFRAME_COLUMNS).set_index("pubmed")
+    df = pd.DataFrame(rows, columns=PUBMED_DATAFRAME_COLUMNS)
+    df = df.set_index("pubmed")
     df = clean_df(df)
     return df
 
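A usage sketch of the new batched retrieval entrypoint, with hypothetical identifiers; use_indra_db=False forces the PubMed API fallback that _get_batch implements:

from biolexica.literature import get_article_dataframe_from_pubmeds

df = get_article_dataframe_from_pubmeds(
    ["12345678", "23456789"],  # hypothetical PubMed identifiers
    use_indra_db=False,  # skip the INDRA database and hit the PubMed API directly
    batch_size=100,  # smaller batches mean more frequent progress reports
)
# Indexed by PubMed identifier (str), with title and abstract columns.
print(df.head())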
33 changes: 32 additions & 1 deletion src/biolexica/literature/search.py
@@ -5,15 +5,46 @@
 import subprocess
 from typing import Any, List, Literal, Optional
 
+import pandas as pd
+
+from .retrieve import get_article_dataframe_from_pubmeds
+
 __all__ = [
+    "get_article_dataframe_from_search",
     "query_pubmed",
 ]
 
+Method = Literal["api", "esearch"]
+
+
+def get_article_dataframe_from_search(
+    search_term: str,
+    *,
+    method: Optional[Method] = None,
+    use_indra_db: bool = True,
+    db=None,
+    batch_size: Optional[int] = None,
+    show_progress: bool = True,
+    limit: Optional[int] = None,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Query PubMed for article identifiers based on a given search and get a dataframe."""
+    pubmed_ids = query_pubmed(search_term, method=method, **kwargs)
+    if limit:
+        pubmed_ids = pubmed_ids[:limit]
+    return get_article_dataframe_from_pubmeds(
+        pubmed_ids,
+        use_indra_db=use_indra_db,
+        db=db,
+        batch_size=batch_size,
+        show_progress=show_progress,
+    )
+
 
 def query_pubmed(
     search_term: str,
     *,
-    method: Optional[Literal["api", "esearch"]] = None,
+    method: Optional[Method] = None,
     **kwargs: Any,
 ) -> List[str]:
     """Query PubMed for article identifiers based on a given search."""
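Finally, the one-call entrypoint added here combines search, optional truncation, and retrieval; the search term and settings are illustrative:

from biolexica.literature import get_article_dataframe_from_search

df = get_article_dataframe_from_search(
    "dementia",
    method="esearch",  # or "api"; None lets query_pubmed pick its default
    limit=500,  # truncate the identifier list before retrieval
)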