From 6306d90d9dad2ccbf800b3cabcd812644add5729 Mon Sep 17 00:00:00 2001 From: sebastien cossin Date: Sat, 11 Mar 2023 21:12:05 -0300 Subject: [PATCH 1/4] Docs: add missing init documentation --- docs/source/api_doc.rst | 28 ++++++++++++++++++++++++++++ src/iamsystem/fuzzy/api.py | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/source/api_doc.rst b/docs/source/api_doc.rst index 1e9da26..ba9bbd6 100644 --- a/docs/source/api_doc.rst +++ b/docs/source/api_doc.rst @@ -75,6 +75,8 @@ Keyword :undoc-members: :show-inheritance: + .. automethod:: __init__ + Entity ^^^^^^ .. autoclass:: iamsystem.Entity @@ -82,12 +84,16 @@ Entity :undoc-members: :show-inheritance: + .. automethod:: __init__ + Terminology ^^^^^^^^^^^ .. autoclass:: iamsystem.Terminology :members: :show-inheritance: + .. automethod:: __init__ + Tokenization ------- @@ -168,6 +174,8 @@ Stopwords :undoc-members: :show-inheritance: + .. automethod:: __init__ + NegativeStopwords ^^^^^^^^^^^^ .. autoclass:: iamsystem.NegativeStopwords @@ -175,6 +183,8 @@ NegativeStopwords :undoc-members: :show-inheritance: + .. automethod:: __init__ + NoStopwords ^^^^^^^^^^ .. autoclass:: iamsystem.NoStopwords @@ -217,6 +227,8 @@ CacheFuzzyAlgos :undoc-members: :show-inheritance: + .. automethod:: __init__ + Abbreviations ^^^^^^^^^^^^^ @@ -235,6 +247,8 @@ FuzzyRegex :undoc-members: :show-inheritance: + .. automethod:: __init__ + WordNormalizer ^^^^^^^^^^^^^^ .. autoclass:: iamsystem.WordNormalizer @@ -242,6 +256,8 @@ WordNormalizer :undoc-members: :show-inheritance: + .. automethod:: __init__ + SpellWise ^^^^^^^^^ @@ -301,6 +317,8 @@ ContSeqFormatter :undoc-members: :show-inheritance: + .. automethod:: __init__ + ContSeqStopFormatter """""""""""""" .. autoclass:: iamsystem.ContSeqStopFormatter @@ -308,6 +326,8 @@ ContSeqStopFormatter :undoc-members: :show-inheritance: + .. automethod:: __init__ + TokenFormatter """""""""""""" .. 
autoclass:: iamsystem.TokenFormatter @@ -315,6 +335,8 @@ TokenFormatter :undoc-members: :show-inheritance: + .. automethod:: __init__ + SpanFormatter """""""""""""" .. autoclass:: iamsystem.SpanFormatter @@ -322,6 +344,8 @@ SpanFormatter :undoc-members: :show-inheritance: + .. automethod:: __init__ + BratDocument ^^^^^^^^^^^^ .. autoclass:: iamsystem.BratDocument @@ -329,6 +353,8 @@ BratDocument :undoc-members: :show-inheritance: + .. automethod:: __init__ + BratEntity ^^^^^^^^^^^^^ .. autoclass:: iamsystem.BratEntity @@ -353,6 +379,8 @@ BratWriter :members: :show-inheritance: + .. automethod:: __init__ + spaCy ----- diff --git a/src/iamsystem/fuzzy/api.py b/src/iamsystem/fuzzy/api.py index 193be07..49f949d 100644 --- a/src/iamsystem/fuzzy/api.py +++ b/src/iamsystem/fuzzy/api.py @@ -102,7 +102,7 @@ def get_synonyms( :param tokens: the sequence of tokens of the document. Useful when the fuzzy algorithm needs context, namely the tokens - around the token of interest given by 'i' parameter. + around the token of interest. :param token: the token of this sequence for which synonyms are expected. :param transitions: the state transitions in which the algorithm From ac8d104e70a86cc4d9215d7fa8d149df2cc980e4 Mon Sep 17 00:00:00 2001 From: sebastien cossin Date: Tue, 21 Mar 2023 20:33:55 -0300 Subject: [PATCH 2/4] #18: no new annotation when keywords are repeated - add a (failing) test to show the new expected behavior. --- tests/test_matcher.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_matcher.py b/tests/test_matcher.py index 7473458..f6e513e 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -562,6 +562,16 @@ def test_fuzzy_algorithms_with_negative_stopwords(self): annots = matcher.annot_text(text="k poumons") self.assertEqual(1, len(annots)) + def test_repeated_words(self): + """Check repeated words are annotated multiple times. 
+ https://github.com/scossin/iamsystem_python/issues/18 + """ + from iamsystem import Matcher + + matcher = Matcher.build(keywords=["cancer"]) + annots = matcher.annot_text(text="cancer cancer") + self.assertEqual(2, len(annots)) + if __name__ == "__main__": unittest.main() From cd3aea50003b9e6f9cf68fc7019332e41f38f5ab Mon Sep 17 00:00:00 2001 From: sebastien cossin Date: Tue, 21 Mar 2023 20:52:52 -0300 Subject: [PATCH 3/4] #18: remove checking if a transition state already exists before creating an annotation. Fix #18 --- src/iamsystem/matcher/strategy.py | 26 +++++------------------- tests/test_matcher.py | 33 +++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/iamsystem/matcher/strategy.py b/src/iamsystem/matcher/strategy.py index a6595b3..984eaaa 100644 --- a/src/iamsystem/matcher/strategy.py +++ b/src/iamsystem/matcher/strategy.py @@ -79,15 +79,7 @@ def detect( count_not_stopword=count_not_stopword, ) new_trans.append(next_trans) - # Why 'next_trans not in transitions: - # Don't create multiple annotations for the same transition - # For example 'cancer cancer' with keyword 'cancer': - # if an annotation was created for the first 'cancer' - # occurent, don't create a new one of the second occurence. - if ( - next_node.is_a_final_state() - and next_trans not in transitions - ): + if next_node.is_a_final_state(): annot = create_annot( last_trans=next_trans, stop_tokens=stop_tokens ) @@ -183,19 +175,11 @@ def detect( ) new_trans.add(next_trans) for trans in new_trans: - # create an annotation if: - # 1) node is a final state - # 2) an annotation wasn't created yet for this state: - # 2.1 there is no previous 'none-obsolete state'. 
if trans.node.is_a_final_state(): - old_trans = transitions.get(trans.id, None) - if old_trans is None or old_trans.is_obsolete( - count_not_stopword=count_not_stopword, w=w - ): - annot = create_annot( - last_trans=trans, stop_tokens=stop_tokens - ) - annots.append(annot) + annot = create_annot( + last_trans=trans, stop_tokens=stop_tokens + ) + annots.append(annot) for nexttoken in trans.node.get_children_tokens(): avaible_trans[nexttoken].add(trans.id) transitions[trans.id] = trans diff --git a/tests/test_matcher.py b/tests/test_matcher.py index f6e513e..186e40c 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -211,28 +211,36 @@ def test_duplicate_states_generate_lot_of_overlaps(self): If the algorithm takes all possible paths then it outputs 16 annotations. By storing algorithms' states in a set rather than in an array, an existing state is replaced. + New behavior due to + https://github.com/scossin/iamsystem_python/issues/18 issue: + two annotations are created since prostate is repeated. """ matcher = Matcher.build(keywords=["cancer de la prostate"], w=3) annots = matcher.annot_text( text="cancer cancer de de la la prostate prostate" ) - self.assertEqual(len(annots), 1) + self.assertEqual(len(annots), 2) self.assertEqual( str(annots[0]), "cancer de la prostate 7 13;17 19;23 34 cancer de la prostate", ) - def test_duplicate_states_annotations_created(self): - """Check it creates two annotations, one for the first occurence of - 'cancer', the next one using the last occurence of 'cancer'.""" + def test_states_override(self): + """State overriding avoids multiple overlapping annotations. + See https://github.com/scossin/iamsystem_python/issues/11 + Here it creates three annotations: 1) first occurrence of + 'cancer', 2) second occurrence of cancer, 3) a single annotation for + cancer de la prostate (state 'cancer' overrides the previous ones). 
+ """ matcher = Matcher.build( keywords=["cancer", "cancer de la prostate"], w=10 ) annots = matcher.annot_text(text="cancer cancer cancer de la prostate") - self.assertEqual(len(annots), 2) + self.assertEqual(len(annots), 3) self.assertEqual(str(annots[0]), "cancer 0 6 cancer") + self.assertEqual(str(annots[1]), "cancer 7 13 cancer") self.assertEqual( - str(annots[1]), + str(annots[2]), "cancer de la prostate 14 35 cancer de la prostate", ) @@ -566,12 +574,21 @@ def test_repeated_words(self): """Check repeated words are annotated multiple times. https://github.com/scossin/iamsystem_python/issues/18 """ - from iamsystem import Matcher - matcher = Matcher.build(keywords=["cancer"]) annots = matcher.annot_text(text="cancer cancer") self.assertEqual(2, len(annots)) + def test_repeated_words_large_window(self): + """Check repeated words are annotated multiple times with the large + window strategy. + https://github.com/scossin/iamsystem_python/issues/18 + """ + matcher = Matcher.build( + keywords=["cancer"], strategy=EMatchingStrategy.LARGE_WINDOW + ) + annots = matcher.annot_text(text="cancer cancer") + self.assertEqual(2, len(annots)) + if __name__ == "__main__": unittest.main() From f6b655c2d2c37fd846e92eef817a133f88098652 Mon Sep 17 00:00:00 2001 From: sebastien cossin Date: Tue, 21 Mar 2023 21:02:27 -0300 Subject: [PATCH 4/4] update to version 0.5.0 --- CHANGELOG.md | 3 +++ docs/source/conf.py | 2 +- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68f9a08..98a8828 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # ChangeLog +## Version 0.5.0 (beta) +- Fix issue 18: create multiple annotations when a keyword is repeated in the same window. 
+ ## Version 0.4.0 (beta) ### Breaking changes diff --git a/docs/source/conf.py b/docs/source/conf.py index 0582ec2..ce971a0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,7 +24,7 @@ author = "Sebastien Cossin" # The full version, including alpha/beta/rc tags -release = "0.4.0" +release = "0.5.0" # -- General configuration --------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 5a9355c..fa22ab5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "iamsystem" -version = "0.4.0" +version = "0.5.0" authors = [ { name="Sebastien Cossin", email="cossin.sebastien@gmail.com" }, ]