Skip to content

Commit

Permalink
Merge pull request #105 from umcu/improve-default-settings
Browse files Browse the repository at this point in the history
Simplify Term defaults
  • Loading branch information
vmenger authored Jun 27, 2024
2 parents 46990cb + 77b2634 commit 33280b4
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 37 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* In `InformationExtractionDataset`, renamed `span_counts`, `label_counts` and `qualifier_counts` to `span_freqs`, `label_freqs` and `qualifier_freqs` respectively.
* The `clinlp_component` utility now returns the class itself, rather than a helper function for making it
* Changed order of `direction` and `qualifier` arguments of `ContextRule`
* Simplified default settings for `clinlp` components
* Simplified default settings for `clinlp` components and `Term` class

## 0.8.0 (2024-06-03)

Expand Down
Binary file removed media/clinlp-features-v0.6.png
Binary file not shown.
29 changes: 15 additions & 14 deletions src/clinlp/ie/entity.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,19 @@
"""Component for rule based entity matching."""

import intervaltree as ivt
import numpy as np
import pandas as pd
import pydantic
from spacy.language import Doc, Language
from spacy.matcher import Matcher, PhraseMatcher
from spacy.pipeline import Pipe
from spacy.tokens import Span

from clinlp.ie.term import Term, _defaults_term
from clinlp.ie.term import Term
from clinlp.util import clinlp_component

SPANS_KEY = "ents"


_non_phrase_matcher_fields = ["proximity", "fuzzy", "fuzzy_min_len"]


def create_concept_dict(path: str, concept_col: str = "concept") -> dict:
"""
Create a dictionary of concepts and their terms from a ``csv`` file.
Expand All @@ -42,10 +38,13 @@ def create_concept_dict(path: str, concept_col: str = "concept") -> dict:
RuntimeError
If a value in the input ``csv`` cannot be parsed.
"""
df = pd.read_csv(path).replace([np.nan], [None])
df = pd.read_csv(path)

try:
df["term"] = df.apply(lambda x: Term(**x.to_dict()), axis=1)
df["term"] = df.apply(
lambda x: Term(**{k: v for k, v in x.to_dict().items() if not pd.isna(v)}),
axis=1,
)
except pydantic.ValidationError as e:
msg = (
"There is a value in your input csv which cannot be"
Expand Down Expand Up @@ -85,6 +84,8 @@ class RuleBasedEntityMatcher(Pipe):
matcher level are overridden by the settings at the term level.
"""

_non_phrase_matcher_fields = ("proximity", "fuzzy", "fuzzy_min_len")

def __init__(
self,
nlp: Language,
Expand Down Expand Up @@ -117,7 +118,6 @@ def __init__(
Whether to resolve overlapping entities.
"""
self.nlp = nlp
self.attr = attr

self.resolve_overlap = resolve_overlap

Expand All @@ -130,7 +130,7 @@ def __init__(
}

self._matcher = Matcher(self.nlp.vocab)
self._phrase_matcher = PhraseMatcher(self.nlp.vocab, attr=self.attr)
self._phrase_matcher = PhraseMatcher(self.nlp.vocab, attr=attr)

self._terms = {}
self._concepts = {}
Expand All @@ -148,10 +148,11 @@ def _use_phrase_matcher(self) -> bool:
``bool``
Whether the phrase matcher can be used.
"""
term_defaults = Term.defaults()

return all(
self.term_args[field] == _defaults_term[field]
for field in _non_phrase_matcher_fields
if field in self.term_args
self.term_args[field] == term_defaults[field]
for field in self._non_phrase_matcher_fields
)

def load_concepts(self, concepts: dict) -> None:
Expand Down Expand Up @@ -195,7 +196,7 @@ def load_concepts(self, concepts: dict) -> None:
term_args_with_override = {}

for field, value in self.term_args.items():
if getattr(concept_term, field) is not None:
if field in concept_term.fields_set:
term_args_with_override[field] = getattr(
concept_term, field
)
Expand All @@ -214,7 +215,7 @@ def load_concepts(self, concepts: dict) -> None:
else:
msg = (
f"Not sure how to load a term with type {type(concept_term)}, "
f"please provide str, list or clinlp.Term."
f"please provide str, list or clinlp.ie.Term."
)
raise TypeError(msg)

Expand Down
58 changes: 36 additions & 22 deletions src/clinlp/ie/term.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,7 @@
import pydantic
from spacy.language import Language

_defaults_term = {
"attr": "TEXT",
"proximity": 0,
"fuzzy": 0,
"fuzzy_min_len": 0,
"pseudo": False,
}
from clinlp.util import get_class_init_signature


class Term(pydantic.BaseModel):
Expand All @@ -20,19 +14,19 @@ class Term(pydantic.BaseModel):
phrase: str
"""The literal phrase to match."""

attr: Optional[str] = None
attr: Optional[str] = "TEXT"
"""The attribute to match on."""

proximity: Optional[int] = None
proximity: Optional[int] = 0
""" The number of tokens to allow between each token in the phrase."""

fuzzy: Optional[int] = None
fuzzy: Optional[int] = 0
"""The threshold for fuzzy matching."""

fuzzy_min_len: Optional[int] = None
fuzzy_min_len: Optional[int] = 0
"""The minimum length for fuzzy matching."""

pseudo: Optional[bool] = None
pseudo: Optional[bool] = False
"""Whether this term is a pseudo-term, which is excluded from matches."""

model_config = {"extra": "ignore"}
Expand All @@ -41,6 +35,32 @@ class Term(pydantic.BaseModel):
def __init__(self, phrase: str, **kwargs) -> None:
super().__init__(phrase=phrase, **kwargs)

@classmethod
def defaults(cls) -> dict:
    """
    Get the default values for each term attribute, if any.

    NOTE(review): presumably derived by inspecting the class ``__init__``
    signature via ``get_class_init_signature`` — confirm that helper returns
    ``(args, defaults)`` in that order.

    Returns
    -------
    ``dict``
        A mapping from attribute name to its default value, for those
        attributes that declare one.
    """
    _, defaults = get_class_init_signature(cls)

    return defaults

@property
def fields_set(self) -> set[str]:
    """
    Get the fields explicitly set for this term.

    Thin wrapper around pydantic's ``__pydantic_fields_set__``, which
    tracks which fields were passed at construction time (as opposed to
    filled in from defaults).

    Returns
    -------
    ``set[str]``
        The names of the fields explicitly set for this term.
    """
    return self.__pydantic_fields_set__

def to_spacy_pattern(self, nlp: Language) -> list[dict]:
"""
Convert the term to a ``spaCy`` pattern.
Expand All @@ -55,25 +75,19 @@ def to_spacy_pattern(self, nlp: Language) -> list[dict]:
``list[dict]``
The ``spaCy`` pattern.
"""
fields = {
field: getattr(self, field) or _defaults_term[field]
for field in ["attr", "proximity", "fuzzy", "fuzzy_min_len", "pseudo"]
}

spacy_pattern = []

phrase_tokens = [token.text for token in nlp.tokenizer(self.phrase)]

for i, token in enumerate(phrase_tokens):
if (fields["fuzzy"] > 0) and (len(token) >= fields["fuzzy_min_len"]):
token_pattern = {f"FUZZY{fields['fuzzy']}": token}
if (self.fuzzy > 0) and (len(token) >= self.fuzzy_min_len):
token_pattern = {f"FUZZY{self.fuzzy}": token}
else:
token_pattern = token

spacy_pattern.append({fields["attr"]: token_pattern})
spacy_pattern.append({self.attr: token_pattern})

if i != len(phrase_tokens) - 1:
for _ in range(fields["proximity"]):
spacy_pattern.append({"OP": "?"}) # noqa: PERF401
spacy_pattern += [{"OP": "?"}] * self.proximity

return spacy_pattern
23 changes: 23 additions & 0 deletions tests/unit/ie/test_term.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,29 @@ def test_term_from_dict_with_extra_items(self):
with pytest.raises(AttributeError):
_ = t.comment

def test_defaults(self):
    # Act
    result = Term.defaults()

    # Assert
    expected = {
        "attr": "TEXT",
        "proximity": 0,
        "fuzzy": 0,
        "fuzzy_min_len": 0,
        "pseudo": False,
    }
    assert result == expected

def test_fields_set(self):
    # Arrange
    term = Term(phrase="Diabetes", fuzzy=1)

    # Act / Assert: only explicitly-passed fields are reported as set
    assert term.fields_set == {"phrase", "fuzzy"}

def test_spacy_pattern(self, nlp):
# Arrange
t = Term(phrase="diabetes", attr="NORM")
Expand Down

0 comments on commit 33280b4

Please sign in to comment.