From 2d95794d4191b1fdb95b476de60b1610b7c44c0d Mon Sep 17 00:00:00 2001 From: atomflunder <80397293+atomflunder@users.noreply.github.com> Date: Wed, 27 Apr 2022 14:32:59 +0200 Subject: [PATCH 1/6] v0.5.0 - Initial commit - Renamed "*_with_score" functions to "*_with_ratio" - Removed Exceptions --- .github/workflows/build.yml | 2 +- CHANGELOG.md | 15 +++- README.md | 20 +++--- stringmatch/__init__.py | 3 +- stringmatch/exceptions.py | 16 ----- stringmatch/match.py | 137 +++++++++++++++--------------------- stringmatch/ratio.py | 11 ++- tests/test_match.py | 45 ++++++------ tests/test_ratio.py | 7 +- 9 files changed, 109 insertions(+), 147 deletions(-) delete mode 100644 stringmatch/exceptions.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3c29f35..519b200 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,4 +1,4 @@ -name: build +name: Build on: [push, pull_request] diff --git a/CHANGELOG.md b/CHANGELOG.md index 976e8d4..df36cce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ This is a broad overview of the changes that have been made over the lifespan of this library. 
+## v0.5.0 - 2022-04-27 + +- Renamed *_with_score functions to *_with_ratio to be consistent with naming + - This affects the three functions added in v0.4.0 +- Removed Exceptions + - Returning a score of 0 instead of raising EmptySearchException + - Using "levenshtein" as default instead of raising InvalidScorerException + - Setting no limit instead of raising InvalidLimitException, if a limit less than 1 is set + - Updated docstrings to reflect these changes + - Updated tests to reflect these changes + ## v0.4.1 - 2022-04-27 - Added proper Python Versions to setup classifiers @@ -9,7 +20,7 @@ This is a broad overview of the changes that have been made over the lifespan of ## v0.4.0 - 2022-04-27 - Added match_with_score, get_best_match_with_score and get_best_matches_with_score functions -- Added tests for those functions + - Added tests for those functions - Updated documentation a bit ## v0.3.1 - 2022-04-26 @@ -26,7 +37,7 @@ This is a broad overview of the changes that have been made over the lifespan of - Made library public and installable via git - Added multiple scorers - Added new kwargs to Match functions -- Added tests for those + - Added tests for those - Improved various functions - Added exception type - Some documentation improvements diff --git a/README.md b/README.md index d8ea409..010836f 100644 --- a/README.md +++ b/README.md @@ -35,11 +35,9 @@ pip install -U git+https://github.com/atomflunder/stringmatch ```python from stringmatch import Match, Ratio, Strings +# Basic usage: match = Match() -ratio = Ratio() -strings = Strings() -# Basic usage: match.match("searchlib", "srchlib") # returns True match.match("searchlib", "something else") # returns False @@ -49,19 +47,23 @@ match.get_best_match("searchlib", searches) # returns "searchli" match.get_best_matches("searchlib", searches) # returns ['searchli', 'searhli', 'search'] # Ratios: +ratio = Ratio() + ratio.ratio("searchlib", "searchlib") # returns 100 ratio.ratio("searchlib", "srechlib") # 
returns 82 searches = ["searchlib", "srechlib"] ratio.ratio_list("searchlib", searches) # returns [100, 82] # Getting matches and ratios: -match.match_with_score("searchlib", "srechlib") # returns (True, 82) +match.match_with_ratio("searchlib", "srechlib") # returns (True, 82) searches = ["test", "nope", "tset"] -match.get_best_match_with_score("test", searches) # returns ("test", 100) -match.get_best_matches_with_score("test", searches) # returns [("test", 100), ("tset", 75)] +match.get_best_match_with_ratio("test", searches) # returns ("test", 100) +match.get_best_matches_with_ratio("test", searches) # returns [("test", 100), ("tset", 75)] # Modify strings: # This is meant for internal use, but you can also use it yourself, if you choose to. +strings = Strings() + strings.latinise("Héllö, world!") # returns "Hello, world!" strings.remove_punctuation("wh'at;, ever") # returns "what ever" strings.only_letters("Héllö, world!") # returns "Hll world" @@ -82,7 +84,7 @@ match("searchlib", "srechlib", score=70) # returns True #### `limit=int` -The limit of how many matches to return. Only available for `Matches().get_best_matches()`. By default this is set to `5`. +The limit of how many matches to return. Only available for `Match().get_best_matches()`. If you want to return every match, set this to 0. By default this is set to `5`. ```python searches = ["limit 5", "limit 4", "limit 3", "limit 2", "limit 1", "limit 0"] @@ -110,7 +112,7 @@ match("test", "TEST", ignore_case=False) # returns False #### `remove_punctuation=bool` -Removes commonly used punctuation symbols from the strings, like `.,;:!?` and so on. Be careful when using this, because if you pass in a string that is only made up of punctuation symbols, you will get an `EmptySearchException`. By default turned off. +Removes commonly used punctuation symbols from the strings, like `.,;:!?` and so on. By default turned off. 
```python match("test,---....", "test", remove_punctuation=True) # returns True @@ -119,7 +121,7 @@ match("test,---....", "test", remove_punctuation=False) # returns False #### `only_letters=bool` -Removes every character that is not in the latin alphabet, a more extreme version of `remove_punctuation`. The same rules apply here, be careful when you use it or you might get an `EmptySearchException`. By default turned off. +Removes every character that is not in the latin alphabet, a more extreme version of `remove_punctuation`. By default turned off. ```python match("»»ᅳtestᅳ►", "test", only_letters=True) # returns True diff --git a/stringmatch/__init__.py b/stringmatch/__init__.py index 16f96ad..f1f4ac3 100644 --- a/stringmatch/__init__.py +++ b/stringmatch/__init__.py @@ -1,8 +1,7 @@ # flake8: noqa -from .exceptions import * from .match import * from .ratio import * from .strings import * __title__ = "stringmatch" -__version__ = "0.4.1" +__version__ = "0.5.0" diff --git a/stringmatch/exceptions.py b/stringmatch/exceptions.py deleted file mode 100644 index ad4a3ae..0000000 --- a/stringmatch/exceptions.py +++ /dev/null @@ -1,16 +0,0 @@ -class EmptySearchException(Exception): - """Exception that will get raised when you try to compare an empty string to something.""" - - pass - - -class InvalidLimitException(Exception): - """Exception that will get raised when you try to set a limit that is less than 1.""" - - pass - - -class InvalidScorerException(Exception): - """Exception that will get raised when you try to set an invalid scorer.""" - - pass diff --git a/stringmatch/match.py b/stringmatch/match.py index 28b9851..3566111 100644 --- a/stringmatch/match.py +++ b/stringmatch/match.py @@ -1,6 +1,5 @@ from typing import Optional -from stringmatch.exceptions import EmptySearchException, InvalidLimitException from stringmatch.ratio import Ratio from stringmatch.strings import Strings @@ -21,8 +20,6 @@ def match( scorer: str = "levenshtein", ) -> bool: """Matches two 
strings, returns True if they are similar enough. - Be careful when using remove_punctuation and only_letters, - if they remove everything in the string, this will raise an EmptySearchException. Parameters ---------- @@ -33,29 +30,24 @@ def match( score : int, optional The cutoff for the score, by default 70. latinise : bool, optional - If special unicode characters should be removed from the strings, by default False + If special unicode characters should be removed from the strings, by default False. ignore_case : bool, optional - If the strings should be compared ignoring case, by default False + If the strings should be compared ignoring case, by default False. remove_punctuation : bool, optional - If punctuation should be removed from the strings, by default False + If punctuation should be removed from the strings, by default False. only_letters : bool, optional - If the strings should only be compared by their latin letters, by default False + If the strings should only be compared by their latin letters, by default False. scorer : str, optional The scorer to use, by default "levenshtein". Available scorers: - "levenshtein" - "jaro" - "jaro_winkler" + "levenshtein", + "jaro", + "jaro_winkler". Returns ------- bool If the strings are similar enough. - - Raises - ------ - EmptySearchException - If one of the strings to compare is empty. """ if latinise: @@ -76,12 +68,9 @@ def match( string2 ) - if not string1 or not string2: - raise EmptySearchException("Cannot compare an empty string.") - return Ratio().ratio(string1, string2, scorer=scorer) >= score - def match_with_score( + def match_with_ratio( self, string1: str, string2: str, @@ -104,29 +93,24 @@ def match_with_score( score : int, optional The cutoff for the score, by default 70. latinise : bool, optional - If special unicode characters should be removed from the strings, by default False + If special unicode characters should be removed from the strings, by default False. 
ignore_case : bool, optional - If the strings should be compared ignoring case, by default False + If the strings should be compared ignoring case, by default False. remove_punctuation : bool, optional - If punctuation should be removed from the strings, by default False + If punctuation should be removed from the strings, by default False. only_letters : bool, optional - If the strings should only be compared by their latin letters, by default False + If the strings should only be compared by their latin letters, by default False. scorer : str, optional The scorer to use, by default "levenshtein". Available scorers: - "levenshtein" - "jaro" - "jaro_winkler" + "levenshtein", + "jaro", + "jaro_winkler". Returns ------- tuple[bool, int] If the strings are similar and their score. - - Raises - ------ - EmptySearchException - If one of the strings to compare is empty. """ kwargs = { "score": score, @@ -155,8 +139,6 @@ def get_best_match( scorer: str = "levenshtein", ) -> Optional[str]: """Returns the best match from a list of strings. - Be careful when using remove_punctuation and only_letters, - if they remove everything in the string, this will raise an EmptySearchException. Parameters ---------- @@ -165,21 +147,21 @@ def get_best_match( string_list : list[str] The list of strings to compare to. score : int, optional - The cutoff for the score, by default 70 + The cutoff for the score, by default 70. latinise : bool, optional - If special unicode characters should be removed from the strings, by default False + If special unicode characters should be removed from the strings, by default False. ignore_case : bool, optional - If the strings should be compared ignoring case, by default False + If the strings should be compared ignoring case, by default False. remove_punctuation : bool, optional - If punctuation should be removed from the strings, by default False + If punctuation should be removed from the strings, by default False. 
only_letters : bool, optional - If the strings should only be compared by their latin letters, by default False + If the strings should only be compared by their latin letters, by default False. scorer : str, optional The scorer to use, by default "levenshtein". Available scorers: - "levenshtein" - "jaro" - "jaro_winkler" + "levenshtein", + "jaro", + "jaro_winkler". Returns ------- @@ -201,7 +183,7 @@ def get_best_match( else None ) - def get_best_match_with_score( + def get_best_match_with_ratio( self, string: str, string_list: list[str], @@ -222,21 +204,21 @@ def get_best_match_with_score( string_list : list[str] The list of strings to compare to. score : int, optional - The cutoff for the score, by default 70 + The cutoff for the score, by default 70. latinise : bool, optional - If special unicode characters should be removed from the strings, by default False + If special unicode characters should be removed from the strings, by default False. ignore_case : bool, optional - If the strings should be compared ignoring case, by default False + If the strings should be compared ignoring case, by default False. remove_punctuation : bool, optional - If punctuation should be removed from the strings, by default False + If punctuation should be removed from the strings, by default False. only_letters : bool, optional - If the strings should only be compared by their latin letters, by default False + If the strings should only be compared by their latin letters, by default False. scorer : str, optional The scorer to use, by default "levenshtein". Available scorers: - "levenshtein" - "jaro" - "jaro_winkler" + "levenshtein", + "jaro", + "jaro_winkler". Returns ------- @@ -274,8 +256,6 @@ def get_best_matches( If there are more than `limit` matches, only the `limit` best matches are returned, sorted by score. If no matches are found, returns an empty list. 
- Be careful when using remove_punctuation and only_letters, - if they remove everything in the string, this will raise an EmptySearchException. Parameters ---------- @@ -284,36 +264,33 @@ def get_best_matches( string_list : list[str] The list of strings to compare to. score : int, optional - The cutoff for the score, by default 70 + The cutoff for the score, by default 70. limit : int, optional - The number of matches to return, by default 5 + The number of matches to return, by default 5. + If you want to return every match, set this to 0. latinise : bool, optional - If special unicode characters should be removed from the strings, by default False + If special unicode characters should be removed from the strings, by default False. ignore_case : bool, optional - If the strings should be compared ignoring case, by default False + If the strings should be compared ignoring case, by default False. remove_punctuation : bool, optional - If punctuation should be removed from the strings, by default False + If punctuation should be removed from the strings, by default False. only_letters : bool, optional - If the strings should only be compared by their latin letters, by default False + If the strings should only be compared by their latin letters, by default False. scorer : str, optional The scorer to use, by default "levenshtein". Available scorers: - "levenshtein" - "jaro" - "jaro_winkler" + "levenshtein", + "jaro", + "jaro_winkler". Returns ------- list[str] All of the matches found. - - Raises - ------ - InvalidLimitException - If the limit used is less than 1. 
""" + # we return every match found if the limit is 0 or less if limit < 1: - raise InvalidLimitException("Limit must be greater than 1.") + limit = None kwargs = { "score": score, @@ -331,7 +308,7 @@ def get_best_matches( reverse=True, )[:limit] - def get_best_matches_with_score( + def get_best_matches_with_ratio( self, string: str, string_list: list[str], @@ -353,33 +330,29 @@ def get_best_matches_with_score( string_list : list[str] The list of strings to compare to. score : int, optional - The cutoff for the score, by default 70 + The cutoff for the score, by default 70. limit : int, optional - The number of matches to return, by default 5 + The number of matches to return, by default 5. + If you want to return every match, set this to 0. latinise : bool, optional - If special unicode characters should be removed from the strings, by default False + If special unicode characters should be removed from the strings, by default False. ignore_case : bool, optional - If the strings should be compared ignoring case, by default False + If the strings should be compared ignoring case, by default False. remove_punctuation : bool, optional - If punctuation should be removed from the strings, by default False + If punctuation should be removed from the strings, by default False. only_letters : bool, optional - If the strings should only be compared by their latin letters, by default False + If the strings should only be compared by their latin letters, by default False. scorer : str, optional The scorer to use, by default "levenshtein". Available scorers: - "levenshtein" - "jaro" - "jaro_winkler" + "levenshtein", + "jaro", + "jaro_winkler". Returns ------- list[tuple[str, int]] All of the matches found. - - Raises - ------ - InvalidLimitException - If the limit used is less than 1. 
""" kwargs = { "score": score, diff --git a/stringmatch/ratio.py b/stringmatch/ratio.py index 7a24107..8b6ca5b 100644 --- a/stringmatch/ratio.py +++ b/stringmatch/ratio.py @@ -1,7 +1,5 @@ import Levenshtein -from stringmatch.exceptions import InvalidScorerException - class Ratio: """Contains functions for calculating the ratio of similarity between two strings.""" @@ -37,7 +35,11 @@ def ratio(self, string1: str, string2: str, scorer: str = "levenshtein") -> int: """ if scorer not in self.scorers: - raise InvalidScorerException("Scorer not in available scorers.") + scorer = "levenshtein" + + # if either string is empty we wanna return 0 + if not string1 or not string2: + return 0 return round(self.scorers[scorer](string1, string2) * 100) @@ -64,7 +66,4 @@ def ratio_list( list[int] The scores between 0 and 100. """ - if scorer not in self.scorers: - raise InvalidScorerException("Scorer not in available scorers.") - return [self.ratio(string, s, scorer=scorer) for s in string_list] diff --git a/tests/test_match.py b/tests/test_match.py index 6486dc4..24bd69e 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -1,6 +1,3 @@ -import pytest - -from stringmatch.exceptions import EmptySearchException, InvalidLimitException from stringmatch.match import Match @@ -17,8 +14,7 @@ def test_match(): assert Match().match("test", "TEST", ignore_case=False) is False assert Match().match("test", "TEST", ignore_case=True) is True assert Match().match("test", "-- test --!<<><", only_letters=True) is True - with pytest.raises(EmptySearchException): - assert Match().match("", "f") + assert Match().match("", "f") is False assert Match().match("séärçh", "search", latinise=True) is True assert Match().match("séärçh", "search", latinise=False) is False @@ -33,11 +29,11 @@ def test_match(): assert Match().match("test", "th test", scorer="jaro_winkler") is False -def test_match_with_score(): - assert Match().match_with_score("test", "test") == (True, 100) - assert 
Match().match_with_score("test", "nope") == (False, 25) - assert Match().match_with_score("searchlib", "srechlib") == (True, 82) - assert Match().match_with_score("test", "th test", scorer="jaro_winkler") == ( +def test_match_with_ratio(): + assert Match().match_with_ratio("test", "test") == (True, 100) + assert Match().match_with_ratio("test", "nope") == (False, 25) + assert Match().match_with_ratio("searchlib", "srechlib") == (True, 82) + assert Match().match_with_ratio("test", "th test", scorer="jaro_winkler") == ( False, 60, ) @@ -52,20 +48,18 @@ def test_get_best_match(): ) == "srechlib" ) - with pytest.raises(EmptySearchException): - assert Match().get_best_match("", ["f"]) + assert Match().get_best_match("", ["f"]) is None - with pytest.raises(EmptySearchException): - assert Match().get_best_match("....-", ["f"], remove_punctuation=True) + assert Match().get_best_match("....-", ["f"], remove_punctuation=True) is None -def test_get_best_match_with_score(): - assert Match().get_best_match_with_score("test", ["test", "nope", "tset"]) == ( +def test_get_best_match_with_ratio(): + assert Match().get_best_match_with_ratio("test", ["test", "nope", "tset"]) == ( "test", 100, ) assert ( - Match().get_best_match_with_score("whatever", ["test", "nope", "tset"]) is None + Match().get_best_match_with_ratio("whatever", ["test", "nope", "tset"]) is None ) @@ -79,18 +73,21 @@ def test_get_best_matches(): ["limit 5", "limit 4", "limit 3", "limit 2", "limit 1", "limit 0"], limit=2, ) == ["limit 5", "limit 4"] - with pytest.raises(EmptySearchException): - assert Match().get_best_matches("", ["f"]) - with pytest.raises(InvalidLimitException): - assert Match().get_best_matches("test", ["test", "nope", "tset"], limit=-1) + + assert Match().get_best_matches("", ["f"]) == [] + + assert Match().get_best_matches("test", ["test", "nope", "tset"], limit=0) == [ + "test", + "tset", + ] -def test_get_best_matches_with_score(): - assert Match().get_best_matches_with_score("test", ["test", 
"nope", "tset"]) == [ +def test_get_best_matches_with_ratio(): + assert Match().get_best_matches_with_ratio("test", ["test", "nope", "tset"]) == [ ("test", 100), ("tset", 75), ] - assert Match().get_best_matches_with_score( + assert Match().get_best_matches_with_ratio( "limit 5", ["limit 5", "limit 4", "limit 3", "limit 2", "limit 1", "limit 0"], limit=2, diff --git a/tests/test_ratio.py b/tests/test_ratio.py index ed44575..b1ba852 100644 --- a/tests/test_ratio.py +++ b/tests/test_ratio.py @@ -1,6 +1,3 @@ -import pytest - -from stringmatch.exceptions import InvalidScorerException from stringmatch.ratio import Ratio @@ -11,8 +8,8 @@ def test_ratio(): assert Ratio().ratio("searchlib", "srechlib", scorer="jaro_winkler") == 93 assert Ratio().ratio("test", "th test", scorer="levenshtein") == 73 assert Ratio().ratio("test", "th test", scorer="jaro_winkler") == 60 - with pytest.raises(InvalidScorerException): - assert Ratio().ratio("searchlib", "srechlib", scorer="nope") + assert Ratio().ratio("searchlib", "srechlib", scorer="nope") == 82 + assert Ratio().ratio("", "f") == 0 def test_ratio_list(): From be2f8cd7ab80bb63dca2f06d988fa44a100e4eb1 Mon Sep 17 00:00:00 2001 From: atomflunder <80397293+atomflunder@users.noreply.github.com> Date: Wed, 27 Apr 2022 15:03:11 +0200 Subject: [PATCH 2/6] v0.5.0 - Passing in different scorers in initialiser I thought the scoring algorithms don't make much sense as an argument, they are better off being passed in while initialising the class. Reducing a bit of the arguments needed in the functions. 
--- CHANGELOG.md | 1 + README.md | 14 +++++--- stringmatch/match.py | 76 ++++++++++++++------------------------------ stringmatch/ratio.py | 42 ++++++++++-------------- tests/test_match.py | 6 ++-- tests/test_ratio.py | 10 +++--- 6 files changed, 59 insertions(+), 90 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df36cce..36c8c99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This is a broad overview of the changes that have been made over the lifespan of ## v0.5.0 - 2022-04-27 +- Removed scorer argument from functions, added it into `__init__` in both Match() and Ratio() - Renamed *_with_score functions to *_with_ratio to be consistent with naming - This affects the three functions added in v0.4.0 - Removed Exceptions diff --git a/README.md b/README.md index 010836f..f3eaa76 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Inspired by [seatgeek/thefuzz](https://github.com/seatgeek/thefuzz), which did n - [Installation](#installation) - [Basic Usage](#basic-usage) - [Additional Arguments](#additional-arguments) + - [Using different scorers](#using-different-scorers) - [Links](#links) ## Requirements @@ -128,13 +129,18 @@ match("»»ᅳtestᅳ►", "test", only_letters=True) # returns True match("»»ᅳtestᅳ►", "test", only_letters=False) # returns False ``` -#### `scorer=str` +### Using different scorers -The scoring algorithm to use, the available options are: [`"levenshtein"`](https://en.wikipedia.org/wiki/Levenshtein_distance), [`"jaro"`](https://en.wikipedia.org/wiki/Jaro–Winkler_distance#Jaro_similarity), [`"jaro_winkler"`](https://en.wikipedia.org/wiki/Jaro–Winkler_distance#Jaro–Winkler_similarity). Different algorithms will produce different results, obviously. By default set to `"levenshtein"`. +You can pass in different scoring algorithms when initialising the Match() and Ratio() classes. 
+The available options are: [`"levenshtein"`](https://en.wikipedia.org/wiki/Levenshtein_distance), [`"jaro"`](https://en.wikipedia.org/wiki/Jaro–Winkler_distance#Jaro_similarity), [`"jaro_winkler"`](https://en.wikipedia.org/wiki/Jaro–Winkler_distance#Jaro–Winkler_similarity). +Different algorithms will produce different results, obviously. By default set to `"levenshtein"`. ```python -match("test", "th test", scorer="levenshtein") # returns True (score = 73) -match("test", "th test", scorer="jaro_winkler") # returns False (score = 60) +levenshtein_matcher = Match(scorer="levenshtein") +jaro_winkler_matcher = Match(scorer="jaro_winkler") + +levenshtein_matcher.match("test", "th test") # returns True (score = 73) +jaro_winkler_matcher.match("test", "th test") # returns False (score = 60) ``` diff --git a/stringmatch/match.py b/stringmatch/match.py index 3566111..e6c87b4 100644 --- a/stringmatch/match.py +++ b/stringmatch/match.py @@ -7,6 +7,21 @@ class Match: """Contains methods for comparing and matching strings.""" + def __init__(self, scorer: str = "levenshtein") -> None: + """Initialise the Match class with the correct scoring algorithm, + to be passed along to the Ratio class. + + Parameters + ---------- + scorer : str, optional + The scorer to use, by default "levenshtein" + Available scorers: + "levenshtein", + "jaro", + "jaro_winkler". + """ + self.scorer = scorer + def match( self, string1: str, @@ -17,7 +32,6 @@ def match( ignore_case: bool = False, remove_punctuation: bool = False, only_letters: bool = False, - scorer: str = "levenshtein", ) -> bool: """Matches two strings, returns True if they are similar enough. @@ -37,12 +51,6 @@ def match( If punctuation should be removed from the strings, by default False. only_letters : bool, optional If the strings should only be compared by their latin letters, by default False. - scorer : str, optional - The scorer to use, by default "levenshtein". - Available scorers: - "levenshtein", - "jaro", - "jaro_winkler". 
Returns ------- @@ -68,7 +76,7 @@ def match( string2 ) - return Ratio().ratio(string1, string2, scorer=scorer) >= score + return Ratio(scorer=self.scorer).ratio(string1, string2) >= score def match_with_ratio( self, @@ -80,7 +88,6 @@ def match_with_ratio( ignore_case: bool = False, remove_punctuation: bool = False, only_letters: bool = False, - scorer: str = "levenshtein", ) -> tuple[bool, int]: """Same as match, but returns the boolean in a tuple, together with the score. @@ -100,12 +107,6 @@ def match_with_ratio( If punctuation should be removed from the strings, by default False. only_letters : bool, optional If the strings should only be compared by their latin letters, by default False. - scorer : str, optional - The scorer to use, by default "levenshtein". - Available scorers: - "levenshtein", - "jaro", - "jaro_winkler". Returns ------- @@ -118,12 +119,11 @@ def match_with_ratio( "ignore_case": ignore_case, "remove_punctuation": remove_punctuation, "only_letters": only_letters, - "scorer": scorer, } return ( self.match(string1, string2, **kwargs), - Ratio().ratio(string1, string2, scorer=scorer), + Ratio(scorer=self.scorer).ratio(string1, string2), ) def get_best_match( @@ -136,7 +136,6 @@ def get_best_match( ignore_case: bool = False, remove_punctuation: bool = False, only_letters: bool = False, - scorer: str = "levenshtein", ) -> Optional[str]: """Returns the best match from a list of strings. @@ -156,12 +155,6 @@ def get_best_match( If punctuation should be removed from the strings, by default False. only_letters : bool, optional If the strings should only be compared by their latin letters, by default False. - scorer : str, optional - The scorer to use, by default "levenshtein". - Available scorers: - "levenshtein", - "jaro", - "jaro_winkler". 
Returns ------- @@ -174,11 +167,10 @@ def get_best_match( "remove_punctuation": remove_punctuation, "ignore_case": ignore_case, "only_letters": only_letters, - "scorer": scorer, } return ( - max(string_list, key=lambda s: Ratio().ratio(string, s, scorer=scorer)) + max(string_list, key=lambda s: Ratio(scorer=self.scorer).ratio(string, s)) if any(s for s in string_list if self.match(string, s, **kwargs)) else None ) @@ -193,7 +185,6 @@ def get_best_match_with_ratio( ignore_case: bool = False, remove_punctuation: bool = False, only_letters: bool = False, - scorer: str = "levenshtein", ) -> Optional[tuple[str, int]]: """Same as get_best_match, but returns a tuple with the best match and its score. @@ -213,12 +204,6 @@ def get_best_match_with_ratio( If punctuation should be removed from the strings, by default False. only_letters : bool, optional If the strings should only be compared by their latin letters, by default False. - scorer : str, optional - The scorer to use, by default "levenshtein". - Available scorers: - "levenshtein", - "jaro", - "jaro_winkler". Returns ------- @@ -232,12 +217,13 @@ def get_best_match_with_ratio( "remove_punctuation": remove_punctuation, "ignore_case": ignore_case, "only_letters": only_letters, - "scorer": scorer, } match = self.get_best_match(string, string_list, **kwargs) - return (match, Ratio().ratio(string, match, scorer=scorer)) if match else None + return ( + (match, Ratio(scorer=self.scorer).ratio(string, match)) if match else None + ) def get_best_matches( self, @@ -250,7 +236,6 @@ def get_best_matches( ignore_case: bool = False, remove_punctuation: bool = False, only_letters: bool = False, - scorer: str = "levenshtein", ) -> list[str]: """Matches a string to a list of strings, returns the strings found that are similar. If there are more than `limit` matches, @@ -276,12 +261,6 @@ def get_best_matches( If punctuation should be removed from the strings, by default False. 
only_letters : bool, optional If the strings should only be compared by their latin letters, by default False. - scorer : str, optional - The scorer to use, by default "levenshtein". - Available scorers: - "levenshtein", - "jaro", - "jaro_winkler". Returns ------- @@ -298,12 +277,11 @@ def get_best_matches( "remove_punctuation": remove_punctuation, "ignore_case": ignore_case, "only_letters": only_letters, - "scorer": scorer, } return sorted( [s for s in string_list if self.match(string, s, **kwargs)], - key=lambda s: Ratio().ratio(string, s, scorer=scorer), + key=lambda s: Ratio(scorer=self.scorer).ratio(string, s), # by default this would sort the list from lowest to highest. reverse=True, )[:limit] @@ -319,7 +297,6 @@ def get_best_matches_with_ratio( ignore_case: bool = False, remove_punctuation: bool = False, only_letters: bool = False, - scorer: str = "levenshtein", ) -> list[tuple[str, int]]: """Same as get_best_matches, but returns a list of tuples with the best matches and their score. @@ -342,12 +319,6 @@ def get_best_matches_with_ratio( If punctuation should be removed from the strings, by default False. only_letters : bool, optional If the strings should only be compared by their latin letters, by default False. - scorer : str, optional - The scorer to use, by default "levenshtein". - Available scorers: - "levenshtein", - "jaro", - "jaro_winkler". 
Returns ------- @@ -361,11 +332,10 @@ def get_best_matches_with_ratio( "remove_punctuation": remove_punctuation, "ignore_case": ignore_case, "only_letters": only_letters, - "scorer": scorer, } matches = self.get_best_matches(string, string_list, **kwargs) return [ - (match, Ratio().ratio(string, match, scorer=scorer)) for match in matches + (match, Ratio(scorer=self.scorer).ratio(string, match)) for match in matches ] diff --git a/stringmatch/ratio.py b/stringmatch/ratio.py index 8b6ca5b..058159d 100644 --- a/stringmatch/ratio.py +++ b/stringmatch/ratio.py @@ -4,14 +4,24 @@ class Ratio: """Contains functions for calculating the ratio of similarity between two strings.""" - def __init__(self): - self.scorers = { + def __init__(self, scorer: str = "levenshtein") -> None: + """Initialise the Ratio class with the correct scoring algorithm. + + Parameters + ---------- + scorer : str, optional + The scorer to use ("levenshtein", "jaro" or "jaro_winkler"), by default "levenshtein". + """ + self.available_scorers = { "levenshtein": Levenshtein.ratio, "jaro": Levenshtein.jaro, "jaro_winkler": Levenshtein.jaro_winkler, } - def ratio(self, string1: str, string2: str, scorer: str = "levenshtein") -> int: + # if the scorer is not found, use levenshtein as the default. + self.scorer = self.available_scorers.get(scorer, Levenshtein.ratio) + + def ratio(self, string1: str, string2: str) -> int: """Returns the similarity score between two strings. Parameters @@ -20,12 +30,6 @@ def ratio(self, string1: str, string2: str, scorer: str = "levenshtein") -> int: The first string to compare. string2 : str The second string to compare. - scorer : str, optional - The scorer to use, by default "levenshtein". - Available scorers: - "levenshtein" - "jaro" - "jaro_winkler" Returns @@ -34,18 +38,10 @@ def ratio(self, string1: str, string2: str, scorer: str = "levenshtein") -> int: The score between 0 and 100. 
""" - if scorer not in self.scorers: - scorer = "levenshtein" - # if either string is empty we wanna return 0 - if not string1 or not string2: - return 0 + return round(self.scorer(string1, string2) * 100) if string1 and string2 else 0 - return round(self.scorers[scorer](string1, string2) * 100) - - def ratio_list( - self, string: str, string_list: list[str], scorer: str = "levenshtein" - ) -> list[int]: + def ratio_list(self, string: str, string_list: list[str]) -> list[int]: """Returns the similarity score between a string and a list of strings. Parameters @@ -54,16 +50,10 @@ def ratio_list( The string to compare. string_list : list[str] The list of strings to compare to. - scorer : str, optional - The scorer to use, by default "levenshtein". - Available scorers: - "levenshtein" - "jaro" - "jaro_winkler" Returns ------- list[int] The scores between 0 and 100. """ - return [self.ratio(string, s, scorer=scorer) for s in string_list] + return [self.ratio(string, s) for s in string_list] diff --git a/tests/test_match.py b/tests/test_match.py index 24bd69e..7b1d974 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -25,15 +25,15 @@ def test_match(): assert Match().match("»»ᅳtestᅳ►", "test", only_letters=True) is True assert Match().match("»»ᅳtestᅳ►", "test", only_letters=False) is False - assert Match().match("test", "th test", scorer="levenshtein") is True - assert Match().match("test", "th test", scorer="jaro_winkler") is False + assert Match(scorer="levenshtein").match("test", "th test") is True + assert Match(scorer="jaro_winkler").match("test", "th test") is False def test_match_with_ratio(): assert Match().match_with_ratio("test", "test") == (True, 100) assert Match().match_with_ratio("test", "nope") == (False, 25) assert Match().match_with_ratio("searchlib", "srechlib") == (True, 82) - assert Match().match_with_ratio("test", "th test", scorer="jaro_winkler") == ( + assert Match(scorer="jaro_winkler").match_with_ratio("test", "th test") == ( False, 
60, ) diff --git a/tests/test_ratio.py b/tests/test_ratio.py index b1ba852..dccf12d 100644 --- a/tests/test_ratio.py +++ b/tests/test_ratio.py @@ -5,11 +5,13 @@ def test_ratio(): assert Ratio().ratio("test", "test") == 100 assert Ratio().ratio("bla", "nope") == 0 assert Ratio().ratio("searchlib", "srechlib") == 82 - assert Ratio().ratio("searchlib", "srechlib", scorer="jaro_winkler") == 93 - assert Ratio().ratio("test", "th test", scorer="levenshtein") == 73 - assert Ratio().ratio("test", "th test", scorer="jaro_winkler") == 60 - assert Ratio().ratio("searchlib", "srechlib", scorer="nope") == 82 + assert Ratio(scorer="jaro_winkler").ratio("searchlib", "srechlib") == 93 + assert Ratio(scorer="levenshtein").ratio("test", "th test") == 73 + assert Ratio(scorer="jaro_winkler").ratio("test", "th test") == 60 + assert Ratio(scorer="nope").ratio("searchlib", "srechlib") == 82 assert Ratio().ratio("", "f") == 0 + assert Ratio("levenshtein").ratio_list("test", ["th test", "hwatever"]) == [73, 33] + assert Ratio("jaro_winkler").ratio_list("test", ["th test", "hwatever"]) == [60, 58] def test_ratio_list(): From 9536ad2c29b60a05d93c80f269580fdf8bb8fa60 Mon Sep 17 00:00:00 2001 From: atomflunder <80397293+atomflunder@users.noreply.github.com> Date: Wed, 27 Apr 2022 15:10:56 +0200 Subject: [PATCH 3/6] v0.5.0 - Set limit to be an Optional argument --- stringmatch/match.py | 10 +++++----- tests/test_match.py | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/stringmatch/match.py b/stringmatch/match.py index e6c87b4..d1bbc78 100644 --- a/stringmatch/match.py +++ b/stringmatch/match.py @@ -231,7 +231,7 @@ def get_best_matches( string_list: list[str], *, score: int = 70, - limit: int = 5, + limit: Optional[int] = 5, latinise: bool = False, ignore_case: bool = False, remove_punctuation: bool = False, @@ -252,7 +252,7 @@ def get_best_matches( The cutoff for the score, by default 70. limit : int, optional The number of matches to return, by default 5. 
- If you want to return every match, set this to 0. + If you want to return every match, set this to 0 (or less than 0) or None. latinise : bool, optional If special unicode characters should be removed from the strings, by default False. ignore_case : bool, optional @@ -268,7 +268,7 @@ def get_best_matches( All of the matches found. """ # we return every match found if the limit is 0 or less - if limit < 1: + if limit is not None and limit < 1: limit = None kwargs = { @@ -292,7 +292,7 @@ def get_best_matches_with_ratio( string_list: list[str], *, score: int = 70, - limit: int = 5, + limit: Optional[int] = 5, latinise: bool = False, ignore_case: bool = False, remove_punctuation: bool = False, @@ -310,7 +310,7 @@ def get_best_matches_with_ratio( The cutoff for the score, by default 70. limit : int, optional The number of matches to return, by default 5. - If you want to return every match, set this to 0. + If you want to return every match, set this to 0 (or less than 0) or None. latinise : bool, optional If special unicode characters should be removed from the strings, by default False. 
ignore_case : bool, optional diff --git a/tests/test_match.py b/tests/test_match.py index 7b1d974..f67d573 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -81,6 +81,11 @@ def test_get_best_matches(): "tset", ] + assert Match().get_best_matches("test", ["test", "nope", "tset"], limit=None) == [ + "test", + "tset", + ] + def test_get_best_matches_with_ratio(): assert Match().get_best_matches_with_ratio("test", ["test", "nope", "tset"]) == [ From 7a7f59ba78a6b674b152b0e57d93a76e26d1a3d3 Mon Sep 17 00:00:00 2001 From: atomflunder <80397293+atomflunder@users.noreply.github.com> Date: Wed, 27 Apr 2022 18:23:44 +0200 Subject: [PATCH 4/6] v0.5.0 - Improved Readme --- README.md | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index f3eaa76..c246d1b 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,9 @@ Inspired by [seatgeek/thefuzz](https://github.com/seatgeek/thefuzz), which did n - [Requirements](#requirements) - [Installation](#installation) - [Basic Usage](#basic-usage) - - [Additional Arguments](#additional-arguments) - - [Using different scorers](#using-different-scorers) +- [Advanced Usage](#advanced-usage) + - [Keyword Arguments](#keyword-arguments) + - [Scoring Algorithms](#scoring-algorithms) - [Links](#links) ## Requirements @@ -71,11 +72,12 @@ strings.only_letters("Héllö, world!") # returns "Hll world" strings.ignore_case("test test!", lower=False) # returns "TEST TEST!" ``` -### Additional Arguments -You can pass in additional arguments for the `Match()` functions to customise your search further: +## Advanced Usage -#### `score=int` +### Keyword Arguments +You can pass in additional arguments for the `Match()` functions to customise your search further: +**`score=70`** The score cutoff for matching, by default set to 70. 
```python @@ -83,8 +85,9 @@ match("searchlib", "srechlib", score=85) # returns False match("searchlib", "srechlib", score=70) # returns True ``` -#### `limit=int` +--- +**`limit=5`** The limit of how many matches to return. Only available for `Match().get_best_matches()` and `Match().get_best_matches_with_ratio()`. If you want to return every match, set this to 0 (or `None`). By default this is set to `5`. ```python @@ -93,8 +96,9 @@ get_best_matches("limit 5", searches, limit=2) # returns ["limit 5", "limit 4"] get_best_matches("limit 5", searches, limit=1) # returns ["limit 5"] ``` -#### `latinise=bool` +--- +**`latinise=False`** Replaces special unicode characters with their latin alphabet equivalents. By default turned off. ```python @@ -102,8 +106,9 @@ match("séärçh", "search", latinise=True) # returns True match("séärçh", "search", latinise=False) # returns False ``` -#### `ignore_case=bool` +--- +**`ignore_case=False`** If you want to ignore case sensitivity while searching. By default turned off. ```python @@ -111,8 +116,9 @@ match("test", "TEST", ignore_case=True) # returns True match("test", "TEST", ignore_case=False) # returns False ``` -#### `remove_punctuation=bool` +--- +**`remove_punctuation=False`** Removes commonly used punctuation symbols from the strings, like `.,;:!?` and so on. By default turned off. ```python @@ -120,8 +126,9 @@ match("test,---....", "test", remove_punctuation=True) # returns True match("test,---....", "test", remove_punctuation=False) # returns False ``` -#### `only_letters=bool` +--- +**`only_letters=False`** Removes every character that is not in the latin alphabet, a more extreme version of `remove_punctuation`. By default turned off. ```python @@ -129,9 +136,9 @@ match("»»ᅳtestᅳ►", "test", only_letters=True) # returns True match("»»ᅳtestᅳ►", "test", only_letters=False) # returns False ``` -### Using different scorers +### Scoring Algorithms -You can pass in different scorering algorithms when initialising the Match() and Ratio() classes.
+You can pass in different scoring algorithms when initialising the `Match()` and `Ratio()` classes. The available options are: [`"levenshtein"`](https://en.wikipedia.org/wiki/Levenshtein_distance), [`"jaro"`](https://en.wikipedia.org/wiki/Jaro–Winkler_distance#Jaro_similarity), [`"jaro_winkler"`](https://en.wikipedia.org/wiki/Jaro–Winkler_distance#Jaro–Winkler_similarity). Different algorithms will produce different results, obviously. By default set to `"levenshtein"`. From 6bb85b0ec15b2339b4fccbe2cd82d4aaf809fe56 Mon Sep 17 00:00:00 2001 From: atomflunder <80397293+atomflunder@users.noreply.github.com> Date: Wed, 27 Apr 2022 18:41:19 +0200 Subject: [PATCH 5/6] v0.5.0 - Split up basic usage part of Readme --- README.md | 71 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index c246d1b..3bcc607 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,10 @@ Inspired by [seatgeek/thefuzz](https://github.com/seatgeek/thefuzz), which did n - [Requirements](#requirements) - [Installation](#installation) - [Basic Usage](#basic-usage) + - [Matching](#matching) + - [Ratios](#ratios) + - [Matching & Ratios](#matching--ratios) + - [Strings](#strings) - [Advanced Usage](#advanced-usage) - [Keyword Arguments](#keyword-arguments) - [Scoring Algorithms](#scoring-algorithms) @@ -34,42 +38,71 @@ pip install -U git+https://github.com/atomflunder/stringmatch ## Basic Usage +### Matching + +The match functions allow you to compare 2 strings and check if they are "similar enough" to each other, or get the best match(es) from a list of strings: + ```python -from stringmatch import Match, Ratio, Strings +from stringmatch import Match -# Basic usage: match = Match() -match.match("searchlib", "srchlib") # returns True -match.match("searchlib", "something else") # returns False +# Checks if the strings are similar. 
+match.match("searchlib", "srchlib") # returns True +match.match("searchlib", "something else") # returns False -# Matching lists: +# Returns the best match(es) found in the list. searches = ["searchli", "searhli", "search", "lib", "whatever", "s"] -match.get_best_match("searchlib", searches) # returns "searchli" -match.get_best_matches("searchlib", searches) # returns ['searchli', 'searhli', 'search'] +match.get_best_match("searchlib", searches) # returns "searchli" +match.get_best_matches("searchlib", searches) # returns ['searchli', 'searhli', 'search'] +``` + +### Ratios + +You can get the "ratio of similarity" between strings like this: + +```python +from stringmatch import Ratio -# Ratios: ratio = Ratio() -ratio.ratio("searchlib", "searchlib") # returns 100 -ratio.ratio("searchlib", "srechlib") # returns 82 +# Getting the ratio between the two strings. +ratio.ratio("searchlib", "searchlib") # returns 100 +ratio.ratio("searchlib", "srechlib") # returns 82 + +# Getting the ratio between the first string and the list of strings at once. 
searches = ["searchlib", "srechlib"] -ratio.ratio_list("searchlib", searches) # returns [100, 82] +ratio.ratio_list("searchlib", searches) # returns [100, 82] +``` -# Getting matches and ratios: -match.match_with_ratio("searchlib", "srechlib") # returns (True, 82) +### Matching & Ratios + +You can also get both the match and the ratio together in a tuple using these functions: + +```python +from stringmatch import Match + +match = Match() searches = ["test", "nope", "tset"] + +match.match_with_ratio("searchlib", "srechlib") # returns (True, 82) match.get_best_match_with_ratio("test", searches) # returns ("test", 100) match.get_best_matches_with_ratio("test", searches) # returns [("test", 100), ("tset", 75)] +``` + +### Strings + +This is primarily meant for internal usage, but you can also use this library to modify strings: + +```python +from stringmatch import Strings -# Modify strings: -# This is meant for internal use, but you can also use it yourself, if you choose to. strings = Strings() -strings.latinise("Héllö, world!") # returns "Hello, world!" -strings.remove_punctuation("wh'at;, ever") # returns "what ever" -strings.only_letters("Héllö, world!") # returns "Hll world" -strings.ignore_case("test test!", lower=False) # returns "TEST TEST!" +strings.latinise("Héllö, world!") # returns "Hello, world!" +strings.remove_punctuation("wh'at;, ever") # returns "what ever" +strings.only_letters("Héllö, world!") # returns "Hll world" +strings.ignore_case("test test!", lower=False) # returns "TEST TEST!" 
``` ## Advanced Usage From 7153742df7a3dd4fbeb6f4177d5879ebb0d46181 Mon Sep 17 00:00:00 2001 From: atomflunder <80397293+atomflunder@users.noreply.github.com> Date: Wed, 27 Apr 2022 18:58:09 +0200 Subject: [PATCH 6/6] v0.5.0 - Adding contributing guidelines --- .github/CONTRIBUTING.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .github/CONTRIBUTING.md diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000..b479013 --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,18 @@ +# Contributing to stringmatch + +First off, thanks for being interested in contributing to stringmatch! Every contribution is greatly appreciated. The following are some guidelines to get you started. They are *guidelines* and not strict rules. + +If you just want to ask a question, go ahead and visit the [GitHub Discussions Tab](https://github.com/atomflunder/stringmatch/discussions). + +## Bug reports + +When submitting a bug report, make sure to follow the template and be clear about how to reproduce the bug. If you already know how to fix the bug, go ahead and either describe it in the report, or submit a pull request directly. + +## Pull requests + +Submitting a pull request is just as straightforward as submitting a bug report. Follow the template and you will be fine. +If you make any changes to the functionality of the code, please make sure to test the functionality beforehand; writing tests is greatly encouraged. +It would also be greatly appreciated if you stick to the general style of the library, but this is not strictly required. + +Thanks again for your interest in contributing! +If you still have doubts about contributing to this library, I can assure you there is no such thing as a bad contribution.