Merge pull request #106 from umcu/normalize-using-casefold

Use casefold for case normalizing
umcu · Jun 27, 2024 · ea02ca8 · ea02ca8
2 parents 33280b4 + 0693544
commit ea02ca8
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * The `clinlp_component` utility now returns the class itself, rather than a helper function for making it
 * Changed order of `direction` and `qualifier` arguments of `ContextRule`
 * Simplified default settings for `clinlp` components and `Term` class
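+* `Normalizer` uses `str.casefold` rather than `str.lower` for case normalization, so e.g. `ß` is normalized to `ss` and the micro sign `µ` to the Greek letter mu `μ`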
 
 ## 0.8.0 (2024-06-03)
 

diff --git a/src/clinlp/normalizer.py b/src/clinlp/normalizer.py
@@ -49,7 +49,7 @@ def _lowercase(text: str) -> str:
         ``str``
             The lowercased text.
         """
-        return text.lower()
+        return text.casefold()
 
     @staticmethod
     def _map_non_ascii_char(char: str) -> str:

diff --git a/tests/unit/test_normalizer.py b/tests/unit/test_normalizer.py
@@ -18,6 +18,8 @@ class TestNormalizer:
             ("test", "test"),
             ("Test", "test"),
             ("TEST", "test"),
+            ("ß", "ss"),
+            ("µg", "μg")
         ],
     )
     def test_lowercase(self, input_text, expected_lowercased_text):
@@ -92,7 +94,7 @@ def test_map_non_ascii_string(self, input_string, expected_non_ascii_string):
 
     def test_call_normalizer_default(self, mock_doc):
         # Arrange
-        expected_norms = ["patient", "250", "µg", "toedienen"]
+        expected_norms = ["patient", "250", "μg", "toedienen"]
         n = Normalizer()
 
         # Act
@@ -118,7 +120,7 @@ def test_call_normalizer_disable_lowercase(self, mock_doc):
 
     def test_call_normalizer_disable_map_non_ascii(self, mock_doc):
         # Arrange
-        expected_norms = ["patiënt", "250", "µg", "toedienen"]
+        expected_norms = ["patiënt", "250", "μg", "toedienen"]
         n = Normalizer(map_non_ascii=False)
 
         # Act