Merge pull request #35 from maxbachmann/patch-1

Improve rapidfuzz usage
chrisjbryant · Apr 14, 2022 · bd99745
2 parents 7f41822 + 9cb7a0c
commit bd99745
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 5 deletions.
diff --git a/errant/alignment.py b/errant/alignment.py
@@ -1,5 +1,5 @@
 from itertools import groupby
-from rapidfuzz import fuzz
+from rapidfuzz.distance import Indel
 import spacy.parts_of_speech as POS
 from errant.edit import Edit
 
@@ -94,7 +94,7 @@ def get_sub_cost(self, o, c):
         elif o.pos in self._open_pos and c.pos in self._open_pos: pos_cost = 0.25
         else: pos_cost = 0.5
         # Char cost
-        char_cost = 1-(fuzz.ratio(o.text, c.text)/100)
+        char_cost = Indel.normalized_distance(o.text, c.text)
         # Combine the costs
         return lemma_cost + pos_cost + char_cost
 
@@ -171,4 +171,4 @@ def __str__(self):
         cost_matrix = "\n".join(["Cost Matrix:"]+[str(row) for row in self.cost_matrix])
         op_matrix = "\n".join(["Operation Matrix:"]+[str(row) for row in self.op_matrix])
         seq = "Best alignment: "+str([a[0] for a in self.align_seq])
-        return "\n".join([orig, cor, cost_matrix, op_matrix, seq])
+        return "\n".join([orig, cor, cost_matrix, op_matrix, seq])
diff --git a/errant/en/classifier.py b/errant/en/classifier.py
@@ -214,7 +214,7 @@ def get_two_sided_type(o_toks, c_toks):
                 # Use string similarity to detect true spelling errors.
                 else:
                     # Normalised Lev distance works better than Lev ratio
-                    str_sim = 1-Levenshtein.normalized_distance(o_toks[0].lower_, c_toks[0].lower_)
+                    str_sim = Levenshtein.normalized_similarity(o_toks[0].lower_, c_toks[0].lower_)
                     # WARNING: THIS IS AN APPROXIMATION.
                     # Thresholds tuned manually on FCE_train + W&I_train
                     # str_sim > 0.55 is almost always a true spelling error
@@ -328,7 +328,7 @@ def get_two_sided_type(o_toks, c_toks):
         # These rules are quite language specific.
         if o_toks[0].text.isalpha() and c_toks[0].text.isalpha():
             # Normalised Lev distance works better than Lev ratio
-            str_sim = 1-Levenshtein.normalized_distance(o_toks[0].lower_, c_toks[0].lower_)
+            str_sim = Levenshtein.normalized_similarity(o_toks[0].lower_, c_toks[0].lower_)
             # WARNING: THIS IS AN APPROXIMATION.
             # Thresholds tuned manually on FCE_train + W&I_train
             # A. Short sequences are likely to be SPELL or function word errors