From 6306d90d9dad2ccbf800b3cabcd812644add5729 Mon Sep 17 00:00:00 2001
From: sebastien cossin <cossin.sebastien@gmail.com>
Date: Sat, 11 Mar 2023 21:12:05 -0300
Subject: [PATCH 1/4] Docs: add missing init documentation

---
 docs/source/api_doc.rst    | 28 ++++++++++++++++++++++++++++
 src/iamsystem/fuzzy/api.py |  2 +-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/docs/source/api_doc.rst b/docs/source/api_doc.rst
index 1e9da26..ba9bbd6 100644
--- a/docs/source/api_doc.rst
+++ b/docs/source/api_doc.rst
@@ -75,6 +75,8 @@ Keyword
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 Entity
 ^^^^^^
 .. autoclass:: iamsystem.Entity
@@ -82,12 +84,16 @@ Entity
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 Terminology
 ^^^^^^^^^^^
 .. autoclass:: iamsystem.Terminology
    :members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 Tokenization
 -------
 
@@ -168,6 +174,8 @@ Stopwords
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 NegativeStopwords
 ^^^^^^^^^^^^
 .. autoclass:: iamsystem.NegativeStopwords
@@ -175,6 +183,8 @@ NegativeStopwords
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 NoStopwords
 ^^^^^^^^^^
 .. autoclass:: iamsystem.NoStopwords
@@ -217,6 +227,8 @@ CacheFuzzyAlgos
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 Abbreviations
 ^^^^^^^^^^^^^
 
@@ -235,6 +247,8 @@ FuzzyRegex
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 WordNormalizer
 ^^^^^^^^^^^^^^
 .. autoclass:: iamsystem.WordNormalizer
@@ -242,6 +256,8 @@ WordNormalizer
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 SpellWise
 ^^^^^^^^^
 
@@ -301,6 +317,8 @@ ContSeqFormatter
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 ContSeqStopFormatter
 """"""""""""""
 .. autoclass:: iamsystem.ContSeqStopFormatter
@@ -308,6 +326,8 @@ ContSeqStopFormatter
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 TokenFormatter
 """"""""""""""
 .. autoclass:: iamsystem.TokenFormatter
@@ -315,6 +335,8 @@ TokenFormatter
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 SpanFormatter
 """"""""""""""
 .. autoclass:: iamsystem.SpanFormatter
@@ -322,6 +344,8 @@ SpanFormatter
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 BratDocument
 ^^^^^^^^^^^^
 .. autoclass:: iamsystem.BratDocument
@@ -329,6 +353,8 @@ BratDocument
    :undoc-members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 BratEntity
 ^^^^^^^^^^^^^
 .. autoclass:: iamsystem.BratEntity
@@ -353,6 +379,8 @@ BratWriter
    :members:
    :show-inheritance:
 
+   .. automethod:: __init__
+
 spaCy
 -----
 
diff --git a/src/iamsystem/fuzzy/api.py b/src/iamsystem/fuzzy/api.py
index 193be07..49f949d 100644
--- a/src/iamsystem/fuzzy/api.py
+++ b/src/iamsystem/fuzzy/api.py
@@ -102,7 +102,7 @@ def get_synonyms(
 
         :param tokens: the sequence of tokens of the document.
             Useful when the fuzzy algorithm needs context, namely the tokens
-            around the token of interest given by 'i' parameter.
+            around the token of interest.
         :param token: the token of this sequence for which synonyms
             are expected.
         :param transitions: the state transitions in which the algorithm

From ac8d104e70a86cc4d9215d7fa8d149df2cc980e4 Mon Sep 17 00:00:00 2001
From: sebastien cossin <cossin.sebastien@gmail.com>
Date: Tue, 21 Mar 2023 20:33:55 -0300
Subject: [PATCH 2/4] #18: no new annotation when keywords are repeated - add a
 (failing) test to show the new expected behavior.

---
 tests/test_matcher.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/test_matcher.py b/tests/test_matcher.py
index 7473458..f6e513e 100644
--- a/tests/test_matcher.py
+++ b/tests/test_matcher.py
@@ -562,6 +562,16 @@ def test_fuzzy_algorithms_with_negative_stopwords(self):
         annots = matcher.annot_text(text="k poumons")
         self.assertEqual(1, len(annots))
 
+    def test_repeated_words(self):
+        """Check repeated words are annotated multiple times.
+        https://github.com/scossin/iamsystem_python/issues/18
+        """
+        from iamsystem import Matcher
+
+        matcher = Matcher.build(keywords=["cancer"])
+        annots = matcher.annot_text(text="cancer cancer")
+        self.assertEqual(2, len(annots))
+
 
 if __name__ == "__main__":
     unittest.main()

From cd3aea50003b9e6f9cf68fc7019332e41f38f5ab Mon Sep 17 00:00:00 2001
From: sebastien cossin <cossin.sebastien@gmail.com>
Date: Tue, 21 Mar 2023 20:52:52 -0300
Subject: [PATCH 3/4] #18: remove checking if a transition state already exists
 before creating an annotation. Fix #18

---
 src/iamsystem/matcher/strategy.py | 26 +++++-------------------
 tests/test_matcher.py             | 33 +++++++++++++++++++++++--------
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/src/iamsystem/matcher/strategy.py b/src/iamsystem/matcher/strategy.py
index a6595b3..984eaaa 100644
--- a/src/iamsystem/matcher/strategy.py
+++ b/src/iamsystem/matcher/strategy.py
@@ -79,15 +79,7 @@ def detect(
                         count_not_stopword=count_not_stopword,
                     )
                     new_trans.append(next_trans)
-                    # Why 'next_trans not in transitions:
-                    # Don't create multiple annotations for the same transition
-                    # For example 'cancer cancer' with keyword 'cancer':
-                    # if an annotation was created for the first 'cancer'
-                    # occurent, don't create a new one of the second occurence.
-                    if (
-                        next_node.is_a_final_state()
-                        and next_trans not in transitions
-                    ):
+                    if next_node.is_a_final_state():
                         annot = create_annot(
                             last_trans=next_trans, stop_tokens=stop_tokens
                         )
@@ -183,19 +175,11 @@ def detect(
                     )
                     new_trans.add(next_trans)
             for trans in new_trans:
-                # create an annotation if:
-                # 1) node is a final state
-                # 2) an annotation wasn't created yet for this state:
-                # 2.1 there is no previous 'none-obsolete state'.
                 if trans.node.is_a_final_state():
-                    old_trans = transitions.get(trans.id, None)
-                    if old_trans is None or old_trans.is_obsolete(
-                        count_not_stopword=count_not_stopword, w=w
-                    ):
-                        annot = create_annot(
-                            last_trans=trans, stop_tokens=stop_tokens
-                        )
-                        annots.append(annot)
+                    annot = create_annot(
+                        last_trans=trans, stop_tokens=stop_tokens
+                    )
+                    annots.append(annot)
                 for nexttoken in trans.node.get_children_tokens():
                     avaible_trans[nexttoken].add(trans.id)
                 transitions[trans.id] = trans
diff --git a/tests/test_matcher.py b/tests/test_matcher.py
index f6e513e..186e40c 100644
--- a/tests/test_matcher.py
+++ b/tests/test_matcher.py
@@ -211,28 +211,36 @@ def test_duplicate_states_generate_lot_of_overlaps(self):
         If the algorithm takes all possible paths then it outputs 16
         annotations. By storing algorithms' states in a set rather than in
         an array, an existing state is replaced.
+        New behavior since issue 18
+        (https://github.com/scossin/iamsystem_python/issues/18):
+        two annotations are created because 'prostate' is repeated.
         """
         matcher = Matcher.build(keywords=["cancer de la prostate"], w=3)
         annots = matcher.annot_text(
             text="cancer cancer de de la la prostate prostate"
         )
-        self.assertEqual(len(annots), 1)
+        self.assertEqual(len(annots), 2)
         self.assertEqual(
             str(annots[0]),
             "cancer de la prostate	7 13;17 19;23 34	cancer de la prostate",
         )
 
-    def test_duplicate_states_annotations_created(self):
-        """Check it creates two annotations, one for the first occurence of
-        'cancer', the next one using the last occurence of 'cancer'."""
+    def test_states_override(self):
+        """State overriding avoids multiple overlapping annotations.
+        See https://github.com/scossin/iamsystem_python/issues/11
+        Here it creates three annotations: 1) first occurrence of
+        'cancer', 2) second occurrence of cancer, 3) a single annotation for
+        cancer de la prostate (state 'cancer' overrides the previous ones).
+        """
         matcher = Matcher.build(
             keywords=["cancer", "cancer de la prostate"], w=10
         )
         annots = matcher.annot_text(text="cancer cancer cancer de la prostate")
-        self.assertEqual(len(annots), 2)
+        self.assertEqual(len(annots), 3)
         self.assertEqual(str(annots[0]), "cancer	0 6	cancer")
+        self.assertEqual(str(annots[1]), "cancer	7 13	cancer")
         self.assertEqual(
-            str(annots[1]),
+            str(annots[2]),
             "cancer de la prostate	14 35	cancer de la prostate",
         )
 
@@ -566,12 +574,21 @@ def test_repeated_words(self):
         """Check repeated words are annotated multiple times.
         https://github.com/scossin/iamsystem_python/issues/18
         """
-        from iamsystem import Matcher
-
         matcher = Matcher.build(keywords=["cancer"])
         annots = matcher.annot_text(text="cancer cancer")
         self.assertEqual(2, len(annots))
 
+    def test_repeated_words_large_window(self):
+        """Check repeated words are annotated multiple times with the large
+        window strategy.
+        https://github.com/scossin/iamsystem_python/issues/18
+        """
+        matcher = Matcher.build(
+            keywords=["cancer"], strategy=EMatchingStrategy.LARGE_WINDOW
+        )
+        annots = matcher.annot_text(text="cancer cancer")
+        self.assertEqual(2, len(annots))
+
 
 if __name__ == "__main__":
     unittest.main()

From f6b655c2d2c37fd846e92eef817a133f88098652 Mon Sep 17 00:00:00 2001
From: sebastien cossin <cossin.sebastien@gmail.com>
Date: Tue, 21 Mar 2023 21:02:27 -0300
Subject: [PATCH 4/4] update to version 0.5.0

---
 CHANGELOG.md        | 3 +++
 docs/source/conf.py | 2 +-
 pyproject.toml      | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 68f9a08..98a8828 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # ChangeLog
 
+## Version 0.5.0 (beta)
+- Fix issue 18: create multiple annotations when a keyword is repeated in the same window.
+
 ## Version 0.4.0 (beta)
 
 ### Breaking changes
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0582ec2..ce971a0 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -24,7 +24,7 @@
 author = "Sebastien Cossin"
 
 # The full version, including alpha/beta/rc tags
-release = "0.4.0"
+release = "0.5.0"
 
 # -- General configuration ---------------------------------------------------
 
diff --git a/pyproject.toml b/pyproject.toml
index 5a9355c..fa22ab5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "iamsystem"
-version = "0.4.0"
+version = "0.5.0"
 authors = [
   { name="Sebastien Cossin", email="cossin.sebastien@gmail.com" },
 ]