diff --git a/errant/alignment.py b/errant/alignment.py index 5ed0320..0091879 100644 --- a/errant/alignment.py +++ b/errant/alignment.py @@ -1,5 +1,5 @@ from itertools import groupby -from rapidfuzz import fuzz +from rapidfuzz.distance import Indel import spacy.parts_of_speech as POS from errant.edit import Edit @@ -94,7 +94,7 @@ def get_sub_cost(self, o, c): elif o.pos in self._open_pos and c.pos in self._open_pos: pos_cost = 0.25 else: pos_cost = 0.5 # Char cost - char_cost = 1-(fuzz.ratio(o.text, c.text)/100) + char_cost = Indel.normalized_distance(o.text, c.text) # Combine the costs return lemma_cost + pos_cost + char_cost @@ -171,4 +171,4 @@ def __str__(self): cost_matrix = "\n".join(["Cost Matrix:"]+[str(row) for row in self.cost_matrix]) op_matrix = "\n".join(["Operation Matrix:"]+[str(row) for row in self.op_matrix]) seq = "Best alignment: "+str([a[0] for a in self.align_seq]) - return "\n".join([orig, cor, cost_matrix, op_matrix, seq]) \ No newline at end of file + return "\n".join([orig, cor, cost_matrix, op_matrix, seq]) diff --git a/errant/en/classifier.py b/errant/en/classifier.py index 8e61e0d..4a071ed 100644 --- a/errant/en/classifier.py +++ b/errant/en/classifier.py @@ -214,7 +214,7 @@ def get_two_sided_type(o_toks, c_toks): # Use string similarity to detect true spelling errors. else: # Normalised Lev distance works better than Lev ratio - str_sim = 1-Levenshtein.normalized_distance(o_toks[0].lower_, c_toks[0].lower_) + str_sim = Levenshtein.normalized_similarity(o_toks[0].lower_, c_toks[0].lower_) # WARNING: THIS IS AN APPROXIMATION. # Thresholds tuned manually on FCE_train + W&I_train # str_sim > 0.55 is almost always a true spelling error @@ -328,7 +328,7 @@ def get_two_sided_type(o_toks, c_toks): # These rules are quite language specific. if o_toks[0].text.isalpha() and c_toks[0].text.isalpha(): # Normalised Lev distance works better than Lev ratio - str_sim = 1-Levenshtein.normalized_distance(o_toks[0].lower_, c_toks[0].lower_) + str_sim = Levenshtein.normalized_similarity(o_toks[0].lower_, c_toks[0].lower_) # WARNING: THIS IS AN APPROXIMATION. # Thresholds tuned manually on FCE_train + W&I_train # A. Short sequences are likely to be SPELL or function word errors