Skip to content

Commit

Permalink
Merge pull request #35 from maxbachmann/patch-1
Browse files Browse the repository at this point in the history
Improve rapidfuzz usage
  • Loading branch information
chrisjbryant authored Apr 14, 2022
2 parents 7f41822 + 9cb7a0c commit bd99745
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 5 deletions.
6 changes: 3 additions & 3 deletions errant/alignment.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from itertools import groupby
-from rapidfuzz import fuzz
+from rapidfuzz.distance import Indel
import spacy.parts_of_speech as POS
from errant.edit import Edit

Expand Down Expand Up @@ -94,7 +94,7 @@ def get_sub_cost(self, o, c):
elif o.pos in self._open_pos and c.pos in self._open_pos: pos_cost = 0.25
else: pos_cost = 0.5
# Char cost
-char_cost = 1-(fuzz.ratio(o.text, c.text)/100)
+char_cost = Indel.normalized_distance(o.text, c.text)
# Combine the costs
return lemma_cost + pos_cost + char_cost

Expand Down Expand Up @@ -171,4 +171,4 @@ def __str__(self):
cost_matrix = "\n".join(["Cost Matrix:"]+[str(row) for row in self.cost_matrix])
op_matrix = "\n".join(["Operation Matrix:"]+[str(row) for row in self.op_matrix])
seq = "Best alignment: "+str([a[0] for a in self.align_seq])
-return "\n".join([orig, cor, cost_matrix, op_matrix, seq])
+return "\n".join([orig, cor, cost_matrix, op_matrix, seq])
4 changes: 2 additions & 2 deletions errant/en/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def get_two_sided_type(o_toks, c_toks):
# Use string similarity to detect true spelling errors.
else:
# Normalised Lev distance works better than Lev ratio
-str_sim = 1-Levenshtein.normalized_distance(o_toks[0].lower_, c_toks[0].lower_)
+str_sim = Levenshtein.normalized_similarity(o_toks[0].lower_, c_toks[0].lower_)
# WARNING: THIS IS AN APPROXIMATION.
# Thresholds tuned manually on FCE_train + W&I_train
# str_sim > 0.55 is almost always a true spelling error
Expand Down Expand Up @@ -328,7 +328,7 @@ def get_two_sided_type(o_toks, c_toks):
# These rules are quite language specific.
if o_toks[0].text.isalpha() and c_toks[0].text.isalpha():
# Normalised Lev distance works better than Lev ratio
-str_sim = 1-Levenshtein.normalized_distance(o_toks[0].lower_, c_toks[0].lower_)
+str_sim = Levenshtein.normalized_similarity(o_toks[0].lower_, c_toks[0].lower_)
# WARNING: THIS IS AN APPROXIMATION.
# Thresholds tuned manually on FCE_train + W&I_train
# A. Short sequences are likely to be SPELL or function word errors
Expand Down

0 comments on commit bd99745

Please sign in to comment.