Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #18 no new annotation when keywords are repeated window strategy #19

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# ChangeLog

## Version 0.5.0 (beta)
- Fix issue 18: create multiple annotations when a keyword is repeated in the same window.

## Version 0.4.0 (beta)

### Breaking changes
Expand Down
28 changes: 28 additions & 0 deletions docs/source/api_doc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,19 +75,25 @@ Keyword
:undoc-members:
:show-inheritance:

.. automethod:: __init__

Entity
^^^^^^
.. autoclass:: iamsystem.Entity
:members:
:undoc-members:
:show-inheritance:

.. automethod:: __init__

Terminology
^^^^^^^^^^^
.. autoclass:: iamsystem.Terminology
:members:
:show-inheritance:

.. automethod:: __init__

Tokenization
------------

Expand Down Expand Up @@ -168,13 +174,17 @@ Stopwords
:undoc-members:
:show-inheritance:

.. automethod:: __init__

NegativeStopwords
^^^^^^^^^^^^^^^^^
.. autoclass:: iamsystem.NegativeStopwords
:members:
:undoc-members:
:show-inheritance:

.. automethod:: __init__

NoStopwords
^^^^^^^^^^^
.. autoclass:: iamsystem.NoStopwords
Expand Down Expand Up @@ -217,6 +227,8 @@ CacheFuzzyAlgos
:undoc-members:
:show-inheritance:

.. automethod:: __init__

Abbreviations
^^^^^^^^^^^^^

Expand All @@ -235,13 +247,17 @@ FuzzyRegex
:undoc-members:
:show-inheritance:

.. automethod:: __init__

WordNormalizer
^^^^^^^^^^^^^^
.. autoclass:: iamsystem.WordNormalizer
:members:
:undoc-members:
:show-inheritance:

.. automethod:: __init__

SpellWise
^^^^^^^^^

Expand Down Expand Up @@ -301,34 +317,44 @@ ContSeqFormatter
:undoc-members:
:show-inheritance:

.. automethod:: __init__

ContSeqStopFormatter
""""""""""""""""""""
.. autoclass:: iamsystem.ContSeqStopFormatter
:members:
:undoc-members:
:show-inheritance:

.. automethod:: __init__

TokenFormatter
""""""""""""""
.. autoclass:: iamsystem.TokenFormatter
:members:
:undoc-members:
:show-inheritance:

.. automethod:: __init__

SpanFormatter
""""""""""""""
.. autoclass:: iamsystem.SpanFormatter
:members:
:undoc-members:
:show-inheritance:

.. automethod:: __init__

BratDocument
^^^^^^^^^^^^
.. autoclass:: iamsystem.BratDocument
:members:
:undoc-members:
:show-inheritance:

.. automethod:: __init__

BratEntity
^^^^^^^^^^^^^
.. autoclass:: iamsystem.BratEntity
Expand All @@ -353,6 +379,8 @@ BratWriter
:members:
:show-inheritance:

.. automethod:: __init__

spaCy
-----

Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
author = "Sebastien Cossin"

# The full version, including alpha/beta/rc tags
release = "0.4.0"
release = "0.5.0"

# -- General configuration ---------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "iamsystem"
version = "0.4.0"
version = "0.5.0"
authors = [
{ name="Sebastien Cossin", email="cossin.sebastien@gmail.com" },
]
Expand Down
2 changes: 1 addition & 1 deletion src/iamsystem/fuzzy/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def get_synonyms(

:param tokens: the sequence of tokens of the document.
Useful when the fuzzy algorithm needs context, namely the tokens
around the token of interest given by 'i' parameter.
around the token of interest.
:param token: the token of this sequence for which synonyms
are expected.
:param transitions: the state transitions in which the algorithm
Expand Down
26 changes: 5 additions & 21 deletions src/iamsystem/matcher/strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,7 @@ def detect(
count_not_stopword=count_not_stopword,
)
new_trans.append(next_trans)
# Why 'next_trans not in transitions':
# Don't create multiple annotations for the same transition.
# For example, 'cancer cancer' with keyword 'cancer':
# if an annotation was created for the first occurrence of 'cancer',
# don't create a new one for the second occurrence.
if (
next_node.is_a_final_state()
and next_trans not in transitions
):
if next_node.is_a_final_state():
annot = create_annot(
last_trans=next_trans, stop_tokens=stop_tokens
)
Expand Down Expand Up @@ -183,19 +175,11 @@ def detect(
)
new_trans.add(next_trans)
for trans in new_trans:
# create an annotation if:
# 1) node is a final state
# 2) an annotation wasn't created yet for this state:
# 2.1 there is no previous non-obsolete state.
if trans.node.is_a_final_state():
old_trans = transitions.get(trans.id, None)
if old_trans is None or old_trans.is_obsolete(
count_not_stopword=count_not_stopword, w=w
):
annot = create_annot(
last_trans=trans, stop_tokens=stop_tokens
)
annots.append(annot)
annot = create_annot(
last_trans=trans, stop_tokens=stop_tokens
)
annots.append(annot)
for nexttoken in trans.node.get_children_tokens():
avaible_trans[nexttoken].add(trans.id)
transitions[trans.id] = trans
Expand Down
39 changes: 33 additions & 6 deletions tests/test_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,28 +211,36 @@ def test_duplicate_states_generate_lot_of_overlaps(self):
If the algorithm takes all possible paths then it outputs 16
annotations. By storing algorithms' states in a set rather than in
an array, an existing state is replaced.
New behavior introduced by issue 18
(https://github.com/scossin/iamsystem_python/issues/18):
two annotations are created because 'prostate' is repeated.
"""
matcher = Matcher.build(keywords=["cancer de la prostate"], w=3)
annots = matcher.annot_text(
text="cancer cancer de de la la prostate prostate"
)
self.assertEqual(len(annots), 1)
self.assertEqual(len(annots), 2)
self.assertEqual(
str(annots[0]),
"cancer de la prostate 7 13;17 19;23 34 cancer de la prostate",
)

def test_duplicate_states_annotations_created(self):
"""Check it creates two annotations, one for the first occurence of
'cancer', the next one using the last occurence of 'cancer'."""
def test_states_override(self):
"""States overriding avoid multiple overlapping.
See https://github.com/scossin/iamsystem_python/issues/11
Here it creates three annotations: 1) first occurrence of
'cancer', 2) second occurrence of 'cancer', 3) a single annotation for
'cancer de la prostate' (state 'cancer' overrides the previous ones).
"""
matcher = Matcher.build(
keywords=["cancer", "cancer de la prostate"], w=10
)
annots = matcher.annot_text(text="cancer cancer cancer de la prostate")
self.assertEqual(len(annots), 2)
self.assertEqual(len(annots), 3)
self.assertEqual(str(annots[0]), "cancer 0 6 cancer")
self.assertEqual(str(annots[1]), "cancer 7 13 cancer")
self.assertEqual(
str(annots[1]),
str(annots[2]),
"cancer de la prostate 14 35 cancer de la prostate",
)

Expand Down Expand Up @@ -562,6 +570,25 @@ def test_fuzzy_algorithms_with_negative_stopwords(self):
annots = matcher.annot_text(text="k poumons")
self.assertEqual(1, len(annots))

def test_repeated_words(self):
    """Check that each occurrence of a repeated keyword is annotated.

    Regression test for
    https://github.com/scossin/iamsystem_python/issues/18
    """
    text = "cancer cancer"
    keyword_matcher = Matcher.build(keywords=["cancer"])
    found = keyword_matcher.annot_text(text=text)
    # One annotation is expected per occurrence of 'cancer' in the text.
    self.assertEqual(2, len(found))

def test_repeated_words_large_window(self):
    """Check that each occurrence of a repeated keyword is annotated
    when the matcher is built with the large window strategy.

    Regression test for
    https://github.com/scossin/iamsystem_python/issues/18
    """
    text = "cancer cancer"
    strategy = EMatchingStrategy.LARGE_WINDOW
    keyword_matcher = Matcher.build(keywords=["cancer"], strategy=strategy)
    found = keyword_matcher.annot_text(text=text)
    # One annotation is expected per occurrence of 'cancer' in the text.
    self.assertEqual(2, len(found))


# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
unittest.main()