Skip to content

Commit

Permalink
Merge pull request #105 from umcu/improve-default-settings
Browse files Browse the repository at this point in the history
Simplify Term defaults
  • Loading branch information
vmenger authored Jun 27, 2024
2 parents 46990cb + 77b2634 commit 33280b4
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 37 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* In `InformationExtractionDataset`, renamed `span_counts`, `label_counts` and `qualifier_counts` to `span_freqs`, `label_freqs` and `qualifier_freqs` respectively.
* The `clinlp_component` utility now returns the class itself, rather than a helper function for making it
* Changed order of `direction` and `qualifier` arguments of `ContextRule`
* Simplified default settings for `clinlp` components
* Simplified default settings for `clinlp` components and `Term` class

## 0.8.0 (2024-06-03)

Expand Down
Binary file removed media/clinlp-features-v0.6.png
Binary file not shown.
29 changes: 15 additions & 14 deletions src/clinlp/ie/entity.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,19 @@
"""Component for rule based entity matching."""

import intervaltree as ivt
import numpy as np
import pandas as pd
import pydantic
from spacy.language import Doc, Language
from spacy.matcher import Matcher, PhraseMatcher
from spacy.pipeline import Pipe
from spacy.tokens import Span

from clinlp.ie.term import Term, _defaults_term
from clinlp.ie.term import Term
from clinlp.util import clinlp_component

SPANS_KEY = "ents"


_non_phrase_matcher_fields = ["proximity", "fuzzy", "fuzzy_min_len"]


def create_concept_dict(path: str, concept_col: str = "concept") -> dict:
"""
Create a dictionary of concepts and their terms from a ``csv`` file.
Expand All @@ -42,10 +38,13 @@ def create_concept_dict(path: str, concept_col: str = "concept") -> dict:
RuntimeError
If a value in the input ``csv`` cannot be parsed.
"""
df = pd.read_csv(path).replace([np.nan], [None])
df = pd.read_csv(path)

try:
df["term"] = df.apply(lambda x: Term(**x.to_dict()), axis=1)
df["term"] = df.apply(
lambda x: Term(**{k: v for k, v in x.to_dict().items() if not pd.isna(v)}),
axis=1,
)
except pydantic.ValidationError as e:
msg = (
"There is a value in your input csv which cannot be"
Expand Down Expand Up @@ -85,6 +84,8 @@ class RuleBasedEntityMatcher(Pipe):
matcher level are overridden by the settings at the term level.
"""

_non_phrase_matcher_fields = ("proximity", "fuzzy", "fuzzy_min_len")

def __init__(
self,
nlp: Language,
Expand Down Expand Up @@ -117,7 +118,6 @@ def __init__(
Whether to resolve overlapping entities.
"""
self.nlp = nlp
self.attr = attr

self.resolve_overlap = resolve_overlap

Expand All @@ -130,7 +130,7 @@ def __init__(
}

self._matcher = Matcher(self.nlp.vocab)
self._phrase_matcher = PhraseMatcher(self.nlp.vocab, attr=self.attr)
self._phrase_matcher = PhraseMatcher(self.nlp.vocab, attr=attr)

self._terms = {}
self._concepts = {}
Expand All @@ -148,10 +148,11 @@ def _use_phrase_matcher(self) -> bool:
``bool``
Whether the phrase matcher can be used.
"""
term_defaults = Term.defaults()

return all(
self.term_args[field] == _defaults_term[field]
for field in _non_phrase_matcher_fields
if field in self.term_args
self.term_args[field] == term_defaults[field]
for field in self._non_phrase_matcher_fields
)

def load_concepts(self, concepts: dict) -> None:
Expand Down Expand Up @@ -195,7 +196,7 @@ def load_concepts(self, concepts: dict) -> None:
term_args_with_override = {}

for field, value in self.term_args.items():
if getattr(concept_term, field) is not None:
if field in concept_term.fields_set:
term_args_with_override[field] = getattr(
concept_term, field
)
Expand All @@ -214,7 +215,7 @@ def load_concepts(self, concepts: dict) -> None:
else:
msg = (
f"Not sure how to load a term with type {type(concept_term)}, "
f"please provide str, list or clinlp.Term."
f"please provide str, list or clinlp.ie.Term."
)
raise TypeError(msg)

Expand Down
58 changes: 36 additions & 22 deletions src/clinlp/ie/term.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,7 @@
import pydantic
from spacy.language import Language

_defaults_term = {
"attr": "TEXT",
"proximity": 0,
"fuzzy": 0,
"fuzzy_min_len": 0,
"pseudo": False,
}
from clinlp.util import get_class_init_signature


class Term(pydantic.BaseModel):
Expand All @@ -20,19 +14,19 @@ class Term(pydantic.BaseModel):
phrase: str
"""The literal phrase to match."""

attr: Optional[str] = None
attr: Optional[str] = "TEXT"
"""The attribute to match on."""

proximity: Optional[int] = None
proximity: Optional[int] = 0
""" The number of tokens to allow between each token in the phrase."""

fuzzy: Optional[int] = None
fuzzy: Optional[int] = 0
"""The threshold for fuzzy matching."""

fuzzy_min_len: Optional[int] = None
fuzzy_min_len: Optional[int] = 0
"""The minimum length for fuzzy matching."""

pseudo: Optional[bool] = None
pseudo: Optional[bool] = False
"""Whether this term is a pseudo-term, which is excluded from matches."""

model_config = {"extra": "ignore"}
Expand All @@ -41,6 +35,32 @@ class Term(pydantic.BaseModel):
def __init__(self, phrase: str, **kwargs) -> None:
super().__init__(phrase=phrase, **kwargs)

@classmethod
def defaults(cls) -> dict:
    """
    Get the default values for each term attribute, if any.

    NOTE(review): presumably derived by inspecting the class ``__init__``
    signature via ``get_class_init_signature`` — confirm that helper returns
    ``(args, defaults)`` in that order.

    Returns
    -------
    ``dict``
        A mapping from attribute name to its default value, for those
        attributes that declare one.
    """
    _, defaults = get_class_init_signature(cls)

    return defaults

@property
def fields_set(self) -> set[str]:
    """
    Get the fields explicitly set for this term.

    Thin wrapper around pydantic's ``__pydantic_fields_set__``, which
    tracks which fields were passed at construction time (as opposed to
    filled in from defaults).

    Returns
    -------
    ``set[str]``
        The names of the fields explicitly set for this term.
    """
    return self.__pydantic_fields_set__

def to_spacy_pattern(self, nlp: Language) -> list[dict]:
"""
Convert the term to a ``spaCy`` pattern.
Expand All @@ -55,25 +75,19 @@ def to_spacy_pattern(self, nlp: Language) -> list[dict]:
``list[dict]``
The ``spaCy`` pattern.
"""
fields = {
field: getattr(self, field) or _defaults_term[field]
for field in ["attr", "proximity", "fuzzy", "fuzzy_min_len", "pseudo"]
}

spacy_pattern = []

phrase_tokens = [token.text for token in nlp.tokenizer(self.phrase)]

for i, token in enumerate(phrase_tokens):
if (fields["fuzzy"] > 0) and (len(token) >= fields["fuzzy_min_len"]):
token_pattern = {f"FUZZY{fields['fuzzy']}": token}
if (self.fuzzy > 0) and (len(token) >= self.fuzzy_min_len):
token_pattern = {f"FUZZY{self.fuzzy}": token}
else:
token_pattern = token

spacy_pattern.append({fields["attr"]: token_pattern})
spacy_pattern.append({self.attr: token_pattern})

if i != len(phrase_tokens) - 1:
for _ in range(fields["proximity"]):
spacy_pattern.append({"OP": "?"}) # noqa: PERF401
spacy_pattern += [{"OP": "?"}] * self.proximity

return spacy_pattern
23 changes: 23 additions & 0 deletions tests/unit/ie/test_term.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,29 @@ def test_term_from_dict_with_extra_items(self):
with pytest.raises(AttributeError):
_ = t.comment

def test_defaults(self):
    # Act
    result = Term.defaults()

    # Assert
    expected = {
        "attr": "TEXT",
        "proximity": 0,
        "fuzzy": 0,
        "fuzzy_min_len": 0,
        "pseudo": False,
    }
    assert result == expected

def test_fields_set(self):
    # Arrange
    term = Term(phrase="Diabetes", fuzzy=1)

    # Act / Assert: only explicitly-passed fields are reported as set
    assert term.fields_set == {"phrase", "fuzzy"}

def test_spacy_pattern(self, nlp):
# Arrange
t = Term(phrase="diabetes", attr="NORM")
Expand Down

0 comments on commit 33280b4

Please sign in to comment.