diff --git a/CHANGELOG.md b/CHANGELOG.md
index 466706c..9b357e4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,7 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * In `InformationExtractionDataset`, renamed `span_counts`, `label_counts` and `qualifier_counts` to `span_freqs`, `label_freqs` and `qualifier_freqs` respectively.
 * The `clinlp_component` utility now returns the class itself, rather than a helper function for making it
 * Changed order of `direction` and `qualifier` arguments of `ContextRule`
-* Simplified default settings for `clinlp` components
+* Simplified default settings for `clinlp` components and `Term` class
 
 ## 0.8.0 (2024-06-03)
 
diff --git a/media/clinlp-features-v0.6.png b/media/clinlp-features-v0.6.png
deleted file mode 100644
index 65d593c..0000000
Binary files a/media/clinlp-features-v0.6.png and /dev/null differ
diff --git a/src/clinlp/ie/entity.py b/src/clinlp/ie/entity.py
index b17b830..8570df6 100644
--- a/src/clinlp/ie/entity.py
+++ b/src/clinlp/ie/entity.py
@@ -1,7 +1,6 @@
 """Component for rule based entity matching."""
 
 import intervaltree as ivt
-import numpy as np
 import pandas as pd
 import pydantic
 from spacy.language import Doc, Language
@@ -9,15 +8,12 @@
 from spacy.pipeline import Pipe
 from spacy.tokens import Span
 
-from clinlp.ie.term import Term, _defaults_term
+from clinlp.ie.term import Term
 from clinlp.util import clinlp_component
 
 SPANS_KEY = "ents"
 
-_non_phrase_matcher_fields = ["proximity", "fuzzy", "fuzzy_min_len"]
-
-
 def create_concept_dict(path: str, concept_col: str = "concept") -> dict:
     """
     Create a dictionary of concepts and their terms from a ``csv`` file.
 
@@ -42,10 +38,13 @@ def create_concept_dict(path: str, concept_col: str = "concept") -> dict:
     RuntimeError
         If a value in the input ``csv`` cannot be parsed.
     """
-    df = pd.read_csv(path).replace([np.nan], [None])
+    df = pd.read_csv(path)
 
     try:
-        df["term"] = df.apply(lambda x: Term(**x.to_dict()), axis=1)
+        df["term"] = df.apply(
+            lambda x: Term(**{k: v for k, v in x.to_dict().items() if not pd.isna(v)}),
+            axis=1,
+        )
     except pydantic.ValidationError as e:
         msg = (
             "There is a value in your input csv which cannot be"
@@ -85,6 +84,8 @@ class RuleBasedEntityMatcher(Pipe):
     matcher level are overridden by the settings at the term level.
     """
 
+    _non_phrase_matcher_fields = ("proximity", "fuzzy", "fuzzy_min_len")
+
     def __init__(
         self,
         nlp: Language,
@@ -117,7 +118,6 @@ def __init__(
             Whether to resolve overlapping entities.
         """
         self.nlp = nlp
-        self.attr = attr
         self.resolve_overlap = resolve_overlap
 
         self.term_args = {
@@ -130,7 +130,7 @@ def __init__(
         }
 
         self._matcher = Matcher(self.nlp.vocab)
-        self._phrase_matcher = PhraseMatcher(self.nlp.vocab, attr=self.attr)
+        self._phrase_matcher = PhraseMatcher(self.nlp.vocab, attr=attr)
 
         self._terms = {}
         self._concepts = {}
@@ -148,10 +148,11 @@ def _use_phrase_matcher(self) -> bool:
         ``bool``
             Whether the phrase matcher can be used.
         """
+        term_defaults = Term.defaults()
+
         return all(
-            self.term_args[field] == _defaults_term[field]
-            for field in _non_phrase_matcher_fields
-            if field in self.term_args
+            self.term_args[field] == term_defaults[field]
+            for field in self._non_phrase_matcher_fields
         )
 
     def load_concepts(self, concepts: dict) -> None:
@@ -195,7 +196,7 @@ def load_concepts(self, concepts: dict) -> None:
                     term_args_with_override = {}
 
                     for field, value in self.term_args.items():
-                        if getattr(concept_term, field) is not None:
+                        if field in concept_term.fields_set:
                             term_args_with_override[field] = getattr(
                                 concept_term, field
                             )
@@ -214,7 +215,7 @@ def load_concepts(self, concepts: dict) -> None:
                 else:
                     msg = (
                         f"Not sure how to load a term with type {type(concept_term)}, "
-                        f"please provide str, list or clinlp.Term."
+                        f"please provide str, list or clinlp.ie.Term."
                     )
                     raise TypeError(msg)
 
diff --git a/src/clinlp/ie/term.py b/src/clinlp/ie/term.py
index 5ae7180..5dc17db 100644
--- a/src/clinlp/ie/term.py
+++ b/src/clinlp/ie/term.py
@@ -5,13 +5,7 @@
 import pydantic
 from spacy.language import Language
 
-_defaults_term = {
-    "attr": "TEXT",
-    "proximity": 0,
-    "fuzzy": 0,
-    "fuzzy_min_len": 0,
-    "pseudo": False,
-}
+from clinlp.util import get_class_init_signature
 
 
 class Term(pydantic.BaseModel):
@@ -20,19 +14,19 @@ class Term(pydantic.BaseModel):
     phrase: str
     """The literal phrase to match."""
 
-    attr: Optional[str] = None
+    attr: Optional[str] = "TEXT"
     """The attribute to match on."""
 
-    proximity: Optional[int] = None
+    proximity: Optional[int] = 0
     """The number of tokens to allow between each token in the phrase."""
 
-    fuzzy: Optional[int] = None
+    fuzzy: Optional[int] = 0
     """The threshold for fuzzy matching."""
 
-    fuzzy_min_len: Optional[int] = None
+    fuzzy_min_len: Optional[int] = 0
     """The minimum length for fuzzy matching."""
 
-    pseudo: Optional[bool] = None
+    pseudo: Optional[bool] = False
    """Whether this term is a pseudo-term, which is excluded from matches."""
 
     model_config = {"extra": "ignore"}
 
@@ -41,6 +35,32 @@
     def __init__(self, phrase: str, **kwargs) -> None:
         super().__init__(phrase=phrase, **kwargs)
 
+    @classmethod
+    def defaults(cls) -> dict:
+        """
+        Get the default values for each term attribute, if any.
+
+        Returns
+        -------
+        ``dict``
+            The default values for each attribute, if any.
+        """
+        _, defaults = get_class_init_signature(cls)
+
+        return defaults
+
+    @property
+    def fields_set(self) -> set[str]:
+        """
+        Get the fields set for this term.
+
+        Returns
+        -------
+        ``set[str]``
+            The fields set for this term.
+        """
+        return self.__pydantic_fields_set__
+
     def to_spacy_pattern(self, nlp: Language) -> list[dict]:
         """
         Convert the term to a ``spaCy`` pattern.
@@ -55,25 +75,19 @@
         ``list[dict]``
             The ``spaCy`` pattern.
         """
-        fields = {
-            field: getattr(self, field) or _defaults_term[field]
-            for field in ["attr", "proximity", "fuzzy", "fuzzy_min_len", "pseudo"]
-        }
-
         spacy_pattern = []
 
         phrase_tokens = [token.text for token in nlp.tokenizer(self.phrase)]
 
         for i, token in enumerate(phrase_tokens):
-            if (fields["fuzzy"] > 0) and (len(token) >= fields["fuzzy_min_len"]):
-                token_pattern = {f"FUZZY{fields['fuzzy']}": token}
+            if (self.fuzzy > 0) and (len(token) >= self.fuzzy_min_len):
+                token_pattern = {f"FUZZY{self.fuzzy}": token}
             else:
                 token_pattern = token
 
-            spacy_pattern.append({fields["attr"]: token_pattern})
+            spacy_pattern.append({self.attr: token_pattern})
 
             if i != len(phrase_tokens) - 1:
-                for _ in range(fields["proximity"]):
-                    spacy_pattern.append({"OP": "?"})  # noqa: PERF401
+                spacy_pattern += [{"OP": "?"}] * self.proximity
 
         return spacy_pattern
diff --git a/tests/unit/ie/test_term.py b/tests/unit/ie/test_term.py
index ea911c8..c03c6bb 100644
--- a/tests/unit/ie/test_term.py
+++ b/tests/unit/ie/test_term.py
@@ -36,6 +36,29 @@ def test_term_from_dict_with_extra_items(self):
         with pytest.raises(AttributeError):
             _ = t.comment
 
+    def test_defaults(self):
+        # Act
+        defaults = Term.defaults()
+
+        # Assert
+        assert defaults == {
+            "attr": "TEXT",
+            "proximity": 0,
+            "fuzzy": 0,
+            "fuzzy_min_len": 0,
+            "pseudo": False,
+        }
+
+    def test_fields_set(self):
+        # Arrange
+        term = Term(phrase="Diabetes", fuzzy=1)
+
+        # Act
+        fields_set = term.fields_set
+
+        # Assert
+        assert fields_set == {"phrase", "fuzzy"}
+
     def test_spacy_pattern(self, nlp):
         # Arrange
         t = Term(phrase="diabetes", attr="NORM")