Skip to content

Commit

Permalink
Merge pull request #77 from maxbachmann/master
Browse files Browse the repository at this point in the history
add support for rapidfuzz
  • Loading branch information
orsinium authored Jun 29, 2022
2 parents 5f3a0dc + b18e949 commit b8dbc02
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 24 deletions.
10 changes: 6 additions & 4 deletions .drone.star
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ def main(ctx):
steps=[
dict(
name="install task",
image="alpine:latest",
image="debian:latest",
commands=[
"apk add --no-cache wget",
"apt update",
"apt install -y wget",
"wget https://taskfile.dev/install.sh",
"sh install.sh -- latest",
"rm install.sh",
Expand All @@ -34,14 +35,15 @@ def main(ctx):
def step(env, python):
result = dict(
name="{} (py{})".format(env, python),
image="python:{}-alpine".format(python),
image="python:{}-buster".format(python),
depends_on=["install task"],
environment=dict(
# set coverage database file name to avoid conflicts between steps
COVERAGE_FILE=".coverage.{}.{}".format(env, python),
),
commands=[
"apk add curl git gcc libc-dev",
"apt update",
"apt install -y curl git build-essential",
"./bin/task PYTHON_BIN=python3 VENVS=/opt/py{python}/ -f {env}:run".format(
python=python,
env=env,
Expand Down
8 changes: 3 additions & 5 deletions Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ vars:
ISORT_ENV: "{{.VENVS}}isort"
TWINE_ENV: "{{.VENVS}}twine"

TESTS_PATH: tests/

tasks:
venv:create:
status:
Expand All @@ -21,7 +19,7 @@ tasks:
- "{{.ENV}}/bin/python3 -m pip install -U pip setuptools wheel"
pip:install:
sources:
- pyproject.toml
- setup.py
- "{{.ENV}}/bin/activate"
deps:
- task: venv:create
Expand Down Expand Up @@ -74,7 +72,7 @@ tasks:
ENV: "{{.PYTEST_PURE_ENV}}"
EXTRA: test
cmds:
- "{{.PYTEST_PURE_ENV}}/bin/pytest -m 'not external' {{.ARGS}} {{.TESTS_PATH}}"
- "{{.PYTEST_PURE_ENV}}/bin/pytest -m 'not external' {{.CLI_ARGS}}"

pytest-external:run:
deps:
Expand All @@ -83,7 +81,7 @@ tasks:
ENV: "{{.PYTEST_EXT_ENV}}"
EXTRA: test,benchmark
cmds:
- "{{.PYTEST_EXT_ENV}}/bin/pytest {{.ARGS}} {{.TESTS_PATH}}"
- "{{.PYTEST_EXT_ENV}}/bin/pytest {{.CLI_ARGS}}"

isort:run:
sources:
Expand Down
6 changes: 6 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
'numpy', # for SmithWaterman and other
'python-Levenshtein', # for Jaro and Levenshtein
'pyxDamerauLevenshtein', # for DamerauLevenshtein
'rapidfuzz>=2.0.0', # for Jaro, Levenshtein and other
],

# needed for benchmarking, optimization and testing
Expand All @@ -22,6 +23,7 @@
'numpy',
'python-Levenshtein',
'pyxDamerauLevenshtein',
'rapidfuzz>=2.0.0',
# slow
'distance',
'pylev',
Expand All @@ -43,17 +45,21 @@
],
'Hamming': [
'python-Levenshtein', # only same length and strings
'rapidfuzz>=2.0.0', # only same length, any iterators of hashable elements
'jellyfish', # only strings, any length
'distance', # only same length, any iterators
'abydos', # any iterators
],
'Jaro': [
'rapidfuzz>=2.0.0', # any iterators of hashable elements
'python-Levenshtein', # only text
],
'JaroWinkler': [
'rapidfuzz>=2.0.0', # any iterators of hashable elements
'jellyfish', # only text
],
'Levenshtein': [
'rapidfuzz>=2.0.0', # any iterators of hashable elements
'python-Levenshtein', # only text
# yeah, other libs slower than textdistance
],
Expand Down
45 changes: 31 additions & 14 deletions tests/test_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,15 @@

libraries = prototype.clone()

# numpy throws a bunch of warnings about abydos using `np.int` instead of `int`.
# Each entry below is a warning filter that is unpacked into
# `pytest.mark.filterwarnings` on the external-library tests in this module.
ABYDOS_WARNINGS = (
'ignore:`np.int` is a deprecated alias',
'ignore:`np.float` is a deprecated alias',
'ignore:Using or importing the ABCs',
)


@pytest.mark.filterwarnings(*ABYDOS_WARNINGS)
@pytest.mark.external
@pytest.mark.parametrize('alg', libraries.get_algorithms())
@hypothesis.settings(deadline=None)
Expand All @@ -37,34 +45,40 @@ def test_compare(left, right, alg):
assert isclose(int_result, ext_result), str(lib)


@pytest.mark.filterwarnings(*ABYDOS_WARNINGS)
@pytest.mark.external
@pytest.mark.parametrize('alg', libraries.get_algorithms())
@hypothesis.given(
left=hypothesis.strategies.text(min_size=1),
right=hypothesis.strategies.text(min_size=1),
)
def test_qval(left, right, alg):
@pytest.mark.parametrize('qval', (None, 1, 2, 3))
def test_qval(left, right, alg, qval):
for lib in libraries.get_libs(alg):
conditions = lib.conditions or {}
internal_func = getattr(textdistance, alg)(external=False, **conditions)
external_func = lib.get_function()
# algorithm doesn't support q-grams
if not hasattr(internal_func, 'qval'):
continue
for qval in (None, 1, 2, 3):
internal_func.qval = qval
# skip this lib if it does not support the qval that was just set
s1, s2 = internal_func._get_sequences(left, right)
if not lib.check_conditions(internal_func, s1, s2):
continue

# test
int_result = internal_func(left, right)
s1, s2 = lib.prepare(s1, s2)
ext_result = external_func(s1, s2)
assert isclose(int_result, ext_result), str(lib)

internal_func.qval = qval
# skip this lib if it does not support the qval that was just set
s1, s2 = internal_func._get_sequences(left, right)
if not lib.check_conditions(internal_func, s1, s2):
continue
quick_answer = internal_func.quick_answer(s1, s2)
if quick_answer is not None:
continue

# test
int_result = internal_func(left, right)
s1, s2 = lib.prepare(s1, s2)
ext_result = external_func(s1, s2)
assert isclose(int_result, ext_result), f'{lib}({repr(s1)}, {repr(s2)})'


@pytest.mark.filterwarnings(*ABYDOS_WARNINGS)
@pytest.mark.external
@pytest.mark.parametrize('alg', libraries.get_algorithms())
@hypothesis.given(
Expand All @@ -79,10 +93,13 @@ def test_list_of_numbers(left, right, alg):
if external_func is None:
raise RuntimeError('cannot import {}'.format(str(lib)))

quick_answer = internal_func.quick_answer(left, right)
if quick_answer is not None:
continue
if not lib.check_conditions(internal_func, left, right):
continue

int_result = internal_func(left, right)
s1, s2 = lib.prepare(left, right)
ext_result = external_func(s1, s2)
assert isclose(int_result, ext_result), str(lib)
assert isclose(int_result, ext_result), f'{lib}({repr(s1)}, {repr(s2)})'
9 changes: 8 additions & 1 deletion textdistance/algorithms/edit_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class Hamming(_Base):
https://en.wikipedia.org/wiki/Hamming_distance
"""

def __init__(self, qval=1, test_func=None, truncate=False, external=True):
self.qval = qval
self.test_func = test_func or self._ident
Expand Down Expand Up @@ -62,6 +63,7 @@ class Levenshtein(_Base):
https://en.wikipedia.org/wiki/Levenshtein_distance
TODO: https://gist.github.com/kylebgorman/1081951/9b38b7743a3cb5167ab2c6608ac8eea7fc629dca
"""

def __init__(self, qval=1, test_func=None, external=True):
self.qval = qval
self.test_func = test_func or self._ident
Expand Down Expand Up @@ -130,6 +132,7 @@ class DamerauLevenshtein(_Base):
https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
"""

def __init__(self, qval=1, test_func=None, external=True):
self.qval = qval
self.test_func = test_func or self._ident
Expand Down Expand Up @@ -229,6 +232,7 @@ class JaroWinkler(_BaseSimilarity):
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro.js
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js
"""

def __init__(self, long_tolerance=False, winklerize=True, qval=1, external=True):
self.qval = qval
self.long_tolerance = long_tolerance
Expand Down Expand Up @@ -302,7 +306,7 @@ def __call__(self, s1, s2, prefix_weight=0.1):
# adjust for up to first 4 chars in common
j = min(min_len, 4)
i = 0
while i < j and s1[i] == s2[i] and s1[i]:
while i < j and s1[i] == s2[i]:
i += 1
if i:
weight += i * prefix_weight * (1.0 - weight)
Expand Down Expand Up @@ -422,6 +426,7 @@ class SmithWaterman(_BaseSimilarity):
https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/smith-waterman.js
"""

def __init__(self, gap_cost=1.0, sim_func=None, qval=1, external=True):
self.qval = qval
self.gap_cost = gap_cost
Expand Down Expand Up @@ -464,6 +469,7 @@ class Gotoh(NeedlemanWunsch):
penalties:
https://www.cs.umd.edu/class/spring2003/cmsc838t/papers/gotoh1982.pdf
"""

def __init__(self, gap_open=1, gap_ext=0.4, sim_func=None, qval=1, external=True):
self.qval = qval
self.gap_open = gap_open
Expand Down Expand Up @@ -687,6 +693,7 @@ class MLIPNS(_BaseSimilarity):
http://www.sial.iias.spb.su/files/386-386-1-PB.pdf
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/mlipns.js
"""

def __init__(self, threshold=0.25, maxmismatches=2, qval=1, external=True):
self.qval = qval
self.threshold = threshold
Expand Down
16 changes: 16 additions & 0 deletions textdistance/libraries.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
"Levenshtein",
"hamming"
],
[
"rapidfuzz.distance.Hamming",
"distance"
],
[
"jellyfish",
"hamming_distance"
Expand All @@ -32,6 +36,10 @@
]
],
"Jaro": [
[
"rapidfuzz.distance.Jaro",
"similarity"
],
[
"Levenshtein",
"jaro"
Expand All @@ -46,12 +54,20 @@
]
],
"JaroWinkler": [
[
"rapidfuzz.distance.JaroWinkler",
"similarity"
],
[
"jellyfish",
"jaro_winkler_similarity"
]
],
"Levenshtein": [
[
"rapidfuzz.distance.Levenshtein",
"distance"
],
[
"Levenshtein",
"distance"
Expand Down
5 changes: 5 additions & 0 deletions textdistance/libraries.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,17 @@ class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
prototype.register('Hamming', SameLengthLibrary('distance', 'hamming'))
prototype.register('Hamming', SameLengthTextLibrary('Levenshtein', 'hamming'))
prototype.register('Hamming', TextLibrary('jellyfish', 'hamming_distance'))
prototype.register('Hamming', SameLengthLibrary('rapidfuzz.distance.Hamming', 'distance'))

prototype.register('Jaro', TextLibrary('jellyfish', 'jaro_similarity'))
prototype.register('Jaro', LibraryBase('rapidfuzz.distance.Jaro', 'similarity'))
# prototype.register('Jaro', TextLibrary('Levenshtein', 'jaro'))
# prototype.register('Jaro', TextLibrary('py_stringmatching.similarity_measure.jaro', 'jaro'))

# prototype.register('JaroWinkler', LibraryBase('py_stringmatching.similarity_measure.jaro_winkler', 'jaro_winkler'))
prototype.register('JaroWinkler', TextLibrary('jellyfish', 'jaro_winkler_similarity', conditions=dict(winklerize=True)))
prototype.register('JaroWinkler', LibraryBase('rapidfuzz.distance.JaroWinkler', 'similarity',
conditions=dict(winklerize=True)))
# https://github.com/life4/textdistance/issues/39
# prototype.register('JaroWinkler', TextLibrary('Levenshtein', 'jaro_winkler', conditions=dict(winklerize=True)))

Expand All @@ -174,4 +178,5 @@ class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
prototype.register('Levenshtein', LibraryBase('pylev', 'levenshtein'))
prototype.register('Levenshtein', TextLibrary('jellyfish', 'levenshtein_distance'))
prototype.register('Levenshtein', TextLibrary('Levenshtein', 'distance'))
prototype.register('Levenshtein', LibraryBase('rapidfuzz.distance.Levenshtein', 'distance'))
# prototype.register('Levenshtein', TextLibrary('py_stringmatching.similarity_measure.levenshtein', 'levenshtein'))

0 comments on commit b8dbc02

Please sign in to comment.