diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ca1664e..97c46335 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ ## Changelog +### [1.2.0] - 2022-08-20 +#### Added +- added damerau levenshtein implementation + - Not API stable yet, since it will be extended with weights in a future version + ### [1.1.1] - 2022-07-29 #### Performance - improve performance for banded Levenshtein implementation diff --git a/CMakeLists.txt b/CMakeLists.txt index 14c61156..3fb4a062 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt") endif() -project(rapidfuzz LANGUAGES CXX VERSION 1.1.1) +project(rapidfuzz LANGUAGES CXX VERSION 1.2.0) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") include(GNUInstallDirs) diff --git a/extras/rapidfuzz_amalgamated.hpp b/extras/rapidfuzz_amalgamated.hpp index 95ae5d9e..386761f6 100644 --- a/extras/rapidfuzz_amalgamated.hpp +++ b/extras/rapidfuzz_amalgamated.hpp @@ -1,7 +1,7 @@ // Licensed under the MIT License . // SPDX-License-Identifier: MIT // RapidFuzz v1.0.2 -// Generated: 2022-08-18 23:05:26.329830 +// Generated: 2022-08-20 03:56:31.798204 // ---------------------------------------------------------- // This file is an amalgamation of multiple different files. // You probably shouldn't edit it directly. @@ -293,10 +293,8 @@ auto vector_slice(const Vec& vec, int start, int stop, int step) -> Vec { Vec new_vec; - if (step == 0) - throw std::invalid_argument("slice step cannot be zero"); - if (step < 0) - throw std::invalid_argument("step sizes below 0 lead to an invalid order of editops"); + if (step == 0) throw std::invalid_argument("slice step cannot be zero"); + if (step < 0) throw std::invalid_argument("step sizes below 0 lead to an invalid order of editops"); if (start < 0) start = std::max(start + static_cast(vec.size()), 0); @@ -308,8 +306,7 @@ auto vector_slice(const Vec& vec, int start, int stop, int step) -> Vec else if (stop > static_cast(vec.size())) stop = static_cast(vec.size()); - if (start >= stop) - return new_vec; + if (start >= stop) return new_vec; int count = (stop - 1 - start) / step + 1; new_vec.reserve(static_cast(count)); @@ -323,10 +320,8 @@ auto vector_slice(const Vec& vec, int start, int stop, int step) -> Vec template void vector_remove_slice(Vec& vec, int start, int stop, int step) { - if (step == 0) - throw std::invalid_argument("slice step cannot be zero"); - if (step < 0) - throw std::invalid_argument("step sizes below 0 lead to an invalid order of editops"); + if (step == 0) throw std::invalid_argument("slice step cannot be zero"); + if (step < 0) throw std::invalid_argument("step sizes below 0 lead to an invalid order of editops"); if (start < 0) start = std::max(start + static_cast(vec.size()), 0); @@ -338,8 +333,7 @@ void vector_remove_slice(Vec& vec, int start, int stop, int step) else if (stop > static_cast(vec.size())) stop = static_cast(vec.size()); - if (start >= stop) - return; + if (start >= stop) return; auto iter = vec.begin() + start; for (int i = start; i < static_cast(vec.size()); i++) @@ -673,11 +667,9 @@ class Opcodes : private std::vector { inline bool operator==(const Opcodes& lhs, const Opcodes& rhs) { - if (lhs.get_src_len() != rhs.get_src_len() || lhs.get_dest_len() != rhs.get_dest_len()) - return false; + if (lhs.get_src_len() != rhs.get_src_len() || lhs.get_dest_len() != rhs.get_dest_len()) 
return false; - if (lhs.size() != rhs.size()) - return false; + if (lhs.size() != rhs.size()) return false; return std::equal(lhs.begin(), lhs.end(), rhs.begin()); } @@ -1780,8 +1772,6 @@ double CachedHamming::normalized_similarity(const Sentence2& s2, double return hamming_normalized_similarity(s1, s2, score_cutoff); } -/**@}*/ - } // namespace rapidfuzz @@ -1993,7 +1983,6 @@ struct PatternMatchVector { template uint64_t get(CharT key) const noexcept { - /** treat char as value between 0 and 127 for performance reasons */ if (key >= 0 && key <= 255) return m_extendedAscii[static_cast(key)]; else @@ -2010,7 +1999,8 @@ struct PatternMatchVector { void insert_mask(char key, uint64_t mask) noexcept { - insert_mask(static_cast(key), mask); + /** treat char as value between 0 and 127 for performance reasons */ + m_extendedAscii[static_cast(key)] |= mask; } template @@ -2032,8 +2022,7 @@ struct BlockPatternMatchVector { BlockPatternMatchVector(size_t str_len) : m_block_count(ceil_div(str_len, 64)), m_map(nullptr), m_extendedAscii(256, m_block_count, 0) - { - } + {} template BlockPatternMatchVector(Range s) : BlockPatternMatchVector(static_cast(s.size())) @@ -2082,10 +2071,8 @@ struct BlockPatternMatchVector { assert(block < size()); if (key >= 0 && key <= 255) m_extendedAscii[static_cast(key)][block] |= mask; - else - { - if (!m_map) - m_map = new BitvectorHashmap[m_block_count]; + else { + if (!m_map) m_map = new BitvectorHashmap[m_block_count]; m_map[block].insert_mask(key, mask); } } @@ -2100,7 +2087,7 @@ struct BlockPatternMatchVector { { if (key >= 0 && key <= 255) return m_extendedAscii[static_cast(key)][block]; - else if(m_map) + else if (m_map) return m_map[block].get(key); else return 0; @@ -3638,22 +3625,19 @@ int64_t levenshtein_hyrroe2003_small_band(const BlockPatternMatchVector& PM, Ran /* Searching */ ptrdiff_t i = 0; - for (; i < s1.size() - max; ++i,++start_pos) { + for (; i < s1.size() - max; ++i, ++start_pos) { /* Step 1: Computing D0 */ uint64_t PM_j = 0; - if (start_pos < 0) - { + if (start_pos < 0) { PM_j = PM.get(0, s2[i]) << (-start_pos); } - else - { + else { size_t word = static_cast(start_pos) / 64; size_t word_pos = static_cast(start_pos) % 64; PM_j = PM.get(word, s2[i]) >> word_pos; - if (word + 1 < words && word_pos != 0) - PM_j |= PM.get(word + 1, s2[i]) << (64 - word_pos); + if (word + 1 < words && word_pos != 0) PM_j |= PM.get(word + 1, s2[i]) << (64 - word_pos); } uint64_t X = PM_j; uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN; @@ -3670,22 +3654,19 @@ int64_t levenshtein_hyrroe2003_small_band(const BlockPatternMatchVector& PM, Ran VN = (D0 >> 1) & HP; } - for (; i < s2.size(); ++i,++start_pos) { + for (; i < s2.size(); ++i, ++start_pos) { /* Step 1: Computing D0 */ uint64_t PM_j = 0; - if (start_pos < 0) - { + if (start_pos < 0) { PM_j = PM.get(0, s2[i]) << (-start_pos); } - else - { + else { size_t word = static_cast(start_pos) / 64; size_t word_pos = static_cast(start_pos) % 64; PM_j = PM.get(word, s2[i]) >> word_pos; - if (word + 1 < words && word_pos != 0) - PM_j |= PM.get(word + 1, s2[i]) << (64 - word_pos); + if (word + 1 < words && word_pos != 0) PM_j |= PM.get(word + 1, s2[i]) << (64 - word_pos); } uint64_t X = PM_j; uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN; @@ -4511,6 +4492,610 @@ double CachedLevenshtein::normalized_similarity(const Sentence2& s2, dou } } // namespace rapidfuzz +#include +#include +#include +#include + +namespace rapidfuzz { +namespace detail { + +/* + * based on the paper Linear space string correction algorithm using the 
Damerau-Levenshtein distance + * from Chunchun Zhao and Sartaj Sahni + */ +template +int64_t damerau_levenshtein(Range s1, Range s2, int64_t max); + +} // namespace detail + +/* the API will require a change when adding custom weights */ +namespace experimental { +/** + * @brief Calculates the Damerau Levenshtein distance between two strings. + * + * + * @tparam Sentence1 This is a string that can be converted to + * basic_string_view + * @tparam Sentence2 This is a string that can be converted to + * basic_string_view + * + * @param s1 + * string to compare with s2 (for type info check Template parameters above) + * @param s2 + * string to compare with s1 (for type info check Template parameters above) + * @param max + * Maximum Damerau Levenshtein distance between s1 and s2, that is + * considered as a result. If the distance is bigger than max, + * max + 1 is returned instead. Default is std::numeric_limits::max(), + * which deactivates this behaviour. + * + * @return Damerau Levenshtein distance between s1 and s2 + */ +template +int64_t damerau_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, + int64_t score_cutoff = std::numeric_limits::max()); + +template +int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2, + int64_t score_cutoff = std::numeric_limits::max()); + +template +int64_t damerau_levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, + int64_t score_cutoff = 0); + +template +int64_t damerau_levenshtein_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0); + +template +double damerau_levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2, double score_cutoff = 1.0); + +template +double damerau_levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2, + double score_cutoff = 1.0); + +/** + * @brief Calculates a normalized Damerau Levenshtein similarity + * + * @details + * Both string require a similar length + * + * + * @tparam Sentence1 This is a string that can be converted to + * basic_string_view + * @tparam Sentence2 This is a string that can be converted to + * basic_string_view + * + * @param s1 + * string to compare with s2 (for type info check Template parameters above) + * @param s2 + * string to compare with s1 (for type info check Template parameters above) + * @param score_cutoff + * Optional argument for a score threshold as a float between 0 and 1.0. + * For ratio < score_cutoff 0 is returned instead. Default is 0, + * which deactivates this behaviour. 
+ * + * @return Normalized Damerau Levenshtein distance between s1 and s2 + * as a float between 0 and 1.0 + */ +template +double damerau_levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2, double score_cutoff = 0.0); + +template +double damerau_levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2, + double score_cutoff = 0.0); + +template +struct CachedDamerauLevenshtein { + template + CachedDamerauLevenshtein(const Sentence1& s1_) + : CachedDamerauLevenshtein(detail::to_begin(s1_), detail::to_end(s1_)) + {} + + template + CachedDamerauLevenshtein(InputIt1 first1, InputIt1 last1) : s1(first1, last1) + {} + + template + int64_t distance(InputIt2 first2, InputIt2 last2, + int64_t score_cutoff = std::numeric_limits::max()) const; + + template + int64_t distance(const Sentence2& s2, int64_t score_cutoff = std::numeric_limits::max()) const; + + template + int64_t similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0) const; + + template + int64_t similarity(const Sentence2& s2, int64_t score_cutoff = 0) const; + + template + double normalized_distance(InputIt2 first2, InputIt2 last2, double score_cutoff = 1.0) const; + + template + double normalized_distance(const Sentence2& s2, double score_cutoff = 1.0) const; + + template + double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) const; + + template + double normalized_similarity(const Sentence2& s2, double score_cutoff = 0.0) const; + +private: + std::basic_string s1; +}; + +#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L) +template +CachedDamerauLevenshtein(const Sentence1& s1_) -> CachedDamerauLevenshtein>; + +template +CachedDamerauLevenshtein(InputIt1 first1, InputIt1 last1) -> CachedDamerauLevenshtein>; +#endif + +} // namespace experimental +} // namespace rapidfuzz + + +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include + +namespace rapidfuzz { +namespace detail { + +/* hashmap for integers which can only grow, but can't remove elements */ +template +struct GrowingHashmap { + using key_type = T_Key; + using value_type = T_Entry; + using size_type = unsigned int; + +private: + static constexpr size_type min_size = 8; + struct MapElem { + key_type key; + value_type value = _empty_val; + }; + + int used; + int fill; + int mask; + MapElem* m_map; + +public: + GrowingHashmap() : used(0), fill(0), mask(-1), m_map(NULL) + {} + ~GrowingHashmap() + { + delete[] m_map; + } + + GrowingHashmap(const GrowingHashmap& other) : used(other.used), fill(other.fill), mask(other.mask) + { + int size = mask + 1; + m_map = new MapElem[size]; + std::copy(other.m_map, other.m_map + size, m_map); + } + + GrowingHashmap(GrowingHashmap&& other) noexcept : GrowingHashmap() + { + swap(*this, other); + } + + GrowingHashmap& operator=(GrowingHashmap other) + { + swap(*this, other); + return *this; + } + + friend void swap(GrowingHashmap& first, GrowingHashmap& second) noexcept + { + std::swap(first.used, second.used); + std::swap(first.fill, second.fill); + std::swap(first.mask, second.mask); + std::swap(first.m_map, second.m_map); + } + + size_type size() const + { + return used; + } + size_type capacity() const + { + return mask + 1; + } + bool empty() const + { + return used == 0; + } + + value_type get(key_type key) const noexcept + { + if (m_map == NULL) return _empty_val; + + return m_map[lookup(static_cast(key))].value; + } + + void insert(key_type key, 
value_type val) + { + if (m_map == NULL) allocate(); + + size_t i = lookup(static_cast(key)); + + if (m_map[i].value == _empty_val) { + /* resize when 2/3 full */ + if (++fill * 3 >= (mask + 1) * 2) { + grow((used + 1) * 2); + i = lookup(static_cast(key)); + } + + used++; + } + + m_map[i].key = key; + m_map[i].value = val; + } + +private: + void allocate() + { + mask = min_size - 1; + m_map = new MapElem[min_size]; + } + + /** + * lookup key inside the hashmap using a similar collision resolution + * strategy to CPython and Ruby + */ + size_t lookup(size_t key) const + { + size_t i = key & static_cast(mask); + + if (m_map[i].value == _empty_val || m_map[i].key == key) return i; + + size_t perturb = key; + while (true) { + i = (i * 5 + perturb + 1) & static_cast(mask); + if (m_map[i].value == _empty_val || m_map[i].key == key) return i; + + perturb >>= 5; + } + } + + void grow(int minUsed) + { + int newSize = mask + 1; + while (newSize <= minUsed) + newSize <<= 1; + + MapElem* oldMap = m_map; + m_map = new MapElem[static_cast(newSize)]; + + fill = used; + mask = newSize - 1; + + for (int i = 0; used > 0; i++) + if (oldMap[i].value != _empty_val) { + size_t j = lookup(static_cast(oldMap[i].key)); + + m_map[j].key = oldMap[i].key; + m_map[j].value = oldMap[i].value; + used--; + } + + used = fill; + delete[] oldMap; + } +}; + +template +struct HybridGrowingHashmap { + using key_type = T_Key; + using value_type = T_Entry; + + HybridGrowingHashmap() + { + m_extendedAscii.fill(_empty_val); + } + + value_type get(char key) const noexcept + { + /** treat char as value between 0 and 127 for performance reasons */ + return m_extendedAscii[static_cast(key)]; + } + + template + value_type get(CharT key) const noexcept + { + if (key >= 0 && key <= 255) + return m_extendedAscii[static_cast(key)]; + else + return m_map.get(static_cast(key)); + } + + value_type insert(char key, value_type val) noexcept + { + /** treat char as value between 0 and 127 for performance reasons */ + m_extendedAscii[static_cast(key)] = val; + } + + template + void insert(CharT key, value_type val) + { + if (key >= 0 && key <= 255) + m_extendedAscii[static_cast(key)] = val; + else + m_map.insert(static_cast(key), val); + } + +private: + GrowingHashmap m_map; + std::array m_extendedAscii; +}; + +} // namespace detail +} // namespace rapidfuzz +namespace rapidfuzz { +namespace detail { + +/* + * based on the paper + * "Linear space string correction algorithm using the Damerau-Levenshtein distance" + * from Chunchun Zhao and Sartaj Sahni + */ +template +int64_t damerau_levenshtein_distance_zhao(Range s1, Range s2, int64_t max) +{ + IntType len1 = static_cast(s1.size()); + IntType len2 = static_cast(s2.size()); + IntType maxVal = static_cast(std::max(len1, len2) + 1); + assert(std::numeric_limits::max() > maxVal); + + HybridGrowingHashmap last_row_id; + size_t size = static_cast(s2.size() + 2); + assume(size != 0); + std::vector FR_arr(size, maxVal); + std::vector R1_arr(size, maxVal); + std::vector R_arr(size); + R_arr[0] = maxVal; + std::iota(R_arr.begin() + 1, R_arr.end(), IntType(0)); + + IntType* R = &R_arr[1]; + IntType* R1 = &R1_arr[1]; + IntType* FR = &FR_arr[1]; + + for (IntType i = 1; i <= len1; i++) { + std::swap(R, R1); + IntType last_col_id = -1; + IntType last_i2l1 = R[0]; + R[0] = i; + IntType T = maxVal; + + for (IntType j = 1; j <= len2; j++) { + ptrdiff_t diag = R1[j - 1] + static_cast(s1[i - 1] != s2[j - 1]); + ptrdiff_t left = R[j - 1] + 1; + ptrdiff_t up = R1[j] + 1; + ptrdiff_t temp = std::min({diag, left, 
up}); + + if (s1[i - 1] == s2[j - 1]) { + last_col_id = j; // last occurence of s1_i + FR[j] = R1[j - 2]; // save H_k-1,j-2 + T = last_i2l1; // save H_i-2,l-1 + } + else { + ptrdiff_t k = last_row_id.get(static_cast(s2[j - 1])); + ptrdiff_t l = last_col_id; + + if ((j - l) == 1) { + ptrdiff_t transpose = FR[j] + (i - k); + temp = std::min(temp, transpose); + } + else if ((i - k) == 1) { + ptrdiff_t transpose = T + (j - l); + temp = std::min(temp, transpose); + } + } + + last_i2l1 = R[j]; + R[j] = static_cast(temp); + } + last_row_id.insert(static_cast(s1[i - 1]), i); + } + + int64_t dist = R[s2.size()]; + return (dist <= max) ? dist : max + 1; +} + +template +int64_t damerau_levenshtein_distance(Range s1, Range s2, int64_t max) +{ + int64_t min_edits = std::abs(s1.size() - s2.size()); + if (min_edits > max) return max + 1; + + /* common affix does not effect Levenshtein distance */ + remove_common_affix(s1, s2); + + ptrdiff_t maxVal = std::max(s1.size(), s2.size()) + 1; + if (std::numeric_limits::max() > maxVal) + return damerau_levenshtein_distance_zhao(s1, s2, max); + else if (std::numeric_limits::max() > maxVal) + return damerau_levenshtein_distance_zhao(s1, s2, max); + else + return damerau_levenshtein_distance_zhao(s1, s2, max); +} + +template +int64_t damerau_levenshtein_similarity(Range s1, Range s2, int64_t score_cutoff) +{ + auto maximum = std::max(s1.size(), s2.size()); + int64_t cutoff_distance = maximum - score_cutoff; + int64_t dist = damerau_levenshtein_distance(s1, s2, cutoff_distance); + int64_t sim = maximum - dist; + return (sim >= score_cutoff) ? sim : 0; +} + +template +double damerau_levenshtein_normalized_distance(Range s1, Range s2, double score_cutoff) +{ + auto maximum = std::max(s1.size(), s2.size()); + int64_t cutoff_distance = static_cast(std::ceil(static_cast(maximum) * score_cutoff)); + int64_t dist = damerau_levenshtein_distance(s1, s2, cutoff_distance); + double norm_dist = (maximum) ? static_cast(dist) / static_cast(maximum) : 0.0; + return (norm_dist <= score_cutoff) ? norm_dist : 1.0; +} + +template +double damerau_levenshtein_normalized_similarity(Range s1, Range s2, double score_cutoff) +{ + double cutoff_score = detail::NormSim_to_NormDist(score_cutoff); + double norm_dist = damerau_levenshtein_normalized_distance(s1, s2, cutoff_score); + double norm_sim = 1.0 - norm_dist; + return (norm_sim >= score_cutoff) ? 
norm_sim : 0.0; +} + +} // namespace detail + +namespace experimental { + +template +int64_t damerau_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, + int64_t max) +{ + return detail::damerau_levenshtein_distance(detail::make_range(first1, last1), + detail::make_range(first2, last2), max); +} + +template +int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2, int64_t max) +{ + return detail::damerau_levenshtein_distance(detail::make_range(s1), detail::make_range(s2), max); +} + +template +double damerau_levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2, double score_cutoff) +{ + return detail::damerau_levenshtein_normalized_distance(detail::make_range(first1, last1), + detail::make_range(first2, last2), score_cutoff); +} + +template +double damerau_levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff) +{ + return detail::damerau_levenshtein_normalized_distance(detail::make_range(s1), detail::make_range(s2), + score_cutoff); +} + +template +int64_t damerau_levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, + int64_t score_cutoff) +{ + return detail::damerau_levenshtein_similarity(detail::make_range(first1, last1), + detail::make_range(first2, last2), score_cutoff); +} + +template +int64_t damerau_levenshtein_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff) +{ + return detail::damerau_levenshtein_similarity(detail::make_range(s1), detail::make_range(s2), + score_cutoff); +} + +template +double damerau_levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2, double score_cutoff) +{ + return detail::damerau_levenshtein_normalized_similarity(detail::make_range(first1, last1), + detail::make_range(first2, last2), score_cutoff); +} + +template +double damerau_levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2, + double score_cutoff) +{ + return detail::damerau_levenshtein_normalized_similarity(detail::make_range(s1), detail::make_range(s2), + score_cutoff); +} + +template +template +int64_t CachedDamerauLevenshtein::distance(InputIt2 first2, InputIt2 last2, + int64_t score_cutoff) const +{ + return damerau_levenshtein_distance(detail::to_begin(s1), detail::to_end(s1), first2, last2, + score_cutoff); +} + +template +template +int64_t CachedDamerauLevenshtein::distance(const Sentence2& s2, int64_t score_cutoff) const +{ + return damerau_levenshtein_distance(s1, s2, score_cutoff); +} + +template +template +int64_t CachedDamerauLevenshtein::similarity(InputIt2 first2, InputIt2 last2, + int64_t score_cutoff) const +{ + return damerau_levenshtein_similarity(detail::to_begin(s1), detail::to_end(s1), first2, last2, + score_cutoff); +} + +template +template +int64_t CachedDamerauLevenshtein::similarity(const Sentence2& s2, int64_t score_cutoff) const +{ + return damerau_levenshtein_similarity(s1, s2, score_cutoff); +} + +template +template +double CachedDamerauLevenshtein::normalized_distance(InputIt2 first2, InputIt2 last2, + double score_cutoff) const +{ + return damerau_levenshtein_normalized_distance(detail::to_begin(s1), detail::to_end(s1), first2, last2, + score_cutoff); +} + +template +template +double CachedDamerauLevenshtein::normalized_distance(const Sentence2& s2, double score_cutoff) const +{ + return damerau_levenshtein_normalized_distance(s1, s2, score_cutoff); +} + +template +template +double 
CachedDamerauLevenshtein::normalized_similarity(InputIt2 first2, InputIt2 last2, + double score_cutoff) const +{ + return damerau_levenshtein_normalized_similarity(detail::to_begin(s1), detail::to_end(s1), first2, last2, + score_cutoff); +} + +template +template +double CachedDamerauLevenshtein::normalized_similarity(const Sentence2& s2, double score_cutoff) const +{ + return damerau_levenshtein_normalized_similarity(s1, s2, score_cutoff); +} + +} // namespace experimental +} // namespace rapidfuzz + namespace rapidfuzz { template diff --git a/rapidfuzz/details/GrowingHashmap.hpp b/rapidfuzz/details/GrowingHashmap.hpp new file mode 100644 index 00000000..e2c260aa --- /dev/null +++ b/rapidfuzz/details/GrowingHashmap.hpp @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright (c) 2022 Max Bachmann */ + +#pragma once + +#include +#include +#include +#include + +namespace rapidfuzz { +namespace detail { + +/* hashmap for integers which can only grow, but can't remove elements */ +template +struct GrowingHashmap { + using key_type = T_Key; + using value_type = T_Entry; + using size_type = unsigned int; + +private: + static constexpr size_type min_size = 8; + struct MapElem { + key_type key; + value_type value = _empty_val; + }; + + int used; + int fill; + int mask; + MapElem* m_map; + +public: + GrowingHashmap() : used(0), fill(0), mask(-1), m_map(NULL) + {} + ~GrowingHashmap() + { + delete[] m_map; + } + + GrowingHashmap(const GrowingHashmap& other) : used(other.used), fill(other.fill), mask(other.mask) + { + int size = mask + 1; + m_map = new MapElem[size]; + std::copy(other.m_map, other.m_map + size, m_map); + } + + GrowingHashmap(GrowingHashmap&& other) noexcept : GrowingHashmap() + { + swap(*this, other); + } + + GrowingHashmap& operator=(GrowingHashmap other) + { + swap(*this, other); + return *this; + } + + friend void swap(GrowingHashmap& first, GrowingHashmap& second) noexcept + { + std::swap(first.used, second.used); + std::swap(first.fill, second.fill); + std::swap(first.mask, second.mask); + std::swap(first.m_map, second.m_map); + } + + size_type size() const + { + return used; + } + size_type capacity() const + { + return mask + 1; + } + bool empty() const + { + return used == 0; + } + + value_type get(key_type key) const noexcept + { + if (m_map == NULL) return _empty_val; + + return m_map[lookup(static_cast(key))].value; + } + + void insert(key_type key, value_type val) + { + if (m_map == NULL) allocate(); + + size_t i = lookup(static_cast(key)); + + if (m_map[i].value == _empty_val) { + /* resize when 2/3 full */ + if (++fill * 3 >= (mask + 1) * 2) { + grow((used + 1) * 2); + i = lookup(static_cast(key)); + } + + used++; + } + + m_map[i].key = key; + m_map[i].value = val; + } + +private: + void allocate() + { + mask = min_size - 1; + m_map = new MapElem[min_size]; + } + + /** + * lookup key inside the hashmap using a similar collision resolution + * strategy to CPython and Ruby + */ + size_t lookup(size_t key) const + { + size_t i = key & static_cast(mask); + + if (m_map[i].value == _empty_val || m_map[i].key == key) return i; + + size_t perturb = key; + while (true) { + i = (i * 5 + perturb + 1) & static_cast(mask); + if (m_map[i].value == _empty_val || m_map[i].key == key) return i; + + perturb >>= 5; + } + } + + void grow(int minUsed) + { + int newSize = mask + 1; + while (newSize <= minUsed) + newSize <<= 1; + + MapElem* oldMap = m_map; + m_map = new MapElem[static_cast(newSize)]; + + fill = used; + mask = newSize - 1; + + for (int i = 0; used > 0; i++) + if 
(oldMap[i].value != _empty_val) { + size_t j = lookup(static_cast(oldMap[i].key)); + + m_map[j].key = oldMap[i].key; + m_map[j].value = oldMap[i].value; + used--; + } + + used = fill; + delete[] oldMap; + } +}; + +template +struct HybridGrowingHashmap { + using key_type = T_Key; + using value_type = T_Entry; + + HybridGrowingHashmap() + { + m_extendedAscii.fill(_empty_val); + } + + value_type get(char key) const noexcept + { + /** treat char as value between 0 and 127 for performance reasons */ + return m_extendedAscii[static_cast(key)]; + } + + template + value_type get(CharT key) const noexcept + { + if (key >= 0 && key <= 255) + return m_extendedAscii[static_cast(key)]; + else + return m_map.get(static_cast(key)); + } + + value_type insert(char key, value_type val) noexcept + { + /** treat char as value between 0 and 127 for performance reasons */ + m_extendedAscii[static_cast(key)] = val; + } + + template + void insert(CharT key, value_type val) + { + if (key >= 0 && key <= 255) + m_extendedAscii[static_cast(key)] = val; + else + m_map.insert(static_cast(key), val); + } + +private: + GrowingHashmap m_map; + std::array m_extendedAscii; +}; + +} // namespace detail +} // namespace rapidfuzz \ No newline at end of file diff --git a/rapidfuzz/details/PatternMatchVector.hpp b/rapidfuzz/details/PatternMatchVector.hpp index 557e1ad9..0ce3b15a 100644 --- a/rapidfuzz/details/PatternMatchVector.hpp +++ b/rapidfuzz/details/PatternMatchVector.hpp @@ -103,7 +103,6 @@ struct PatternMatchVector { template uint64_t get(CharT key) const noexcept { - /** treat char as value between 0 and 127 for performance reasons */ if (key >= 0 && key <= 255) return m_extendedAscii[static_cast(key)]; else @@ -120,7 +119,8 @@ struct PatternMatchVector { void insert_mask(char key, uint64_t mask) noexcept { - insert_mask(static_cast(key), mask); + /** treat char as value between 0 and 127 for performance reasons */ + m_extendedAscii[static_cast(key)] |= mask; } template @@ -142,8 +142,7 @@ struct BlockPatternMatchVector { BlockPatternMatchVector(size_t str_len) : m_block_count(ceil_div(str_len, 64)), m_map(nullptr), m_extendedAscii(256, m_block_count, 0) - { - } + {} template BlockPatternMatchVector(Range s) : BlockPatternMatchVector(static_cast(s.size())) @@ -192,10 +191,8 @@ struct BlockPatternMatchVector { assert(block < size()); if (key >= 0 && key <= 255) m_extendedAscii[static_cast(key)][block] |= mask; - else - { - if (!m_map) - m_map = new BitvectorHashmap[m_block_count]; + else { + if (!m_map) m_map = new BitvectorHashmap[m_block_count]; m_map[block].insert_mask(key, mask); } } @@ -210,7 +207,7 @@ struct BlockPatternMatchVector { { if (key >= 0 && key <= 255) return m_extendedAscii[static_cast(key)][block]; - else if(m_map) + else if (m_map) return m_map[block].get(key); else return 0; diff --git a/rapidfuzz/details/types.hpp b/rapidfuzz/details/types.hpp index 8d54031d..eb87962d 100644 --- a/rapidfuzz/details/types.hpp +++ b/rapidfuzz/details/types.hpp @@ -112,10 +112,8 @@ auto vector_slice(const Vec& vec, int start, int stop, int step) -> Vec { Vec new_vec; - if (step == 0) - throw std::invalid_argument("slice step cannot be zero"); - if (step < 0) - throw std::invalid_argument("step sizes below 0 lead to an invalid order of editops"); + if (step == 0) throw std::invalid_argument("slice step cannot be zero"); + if (step < 0) throw std::invalid_argument("step sizes below 0 lead to an invalid order of editops"); if (start < 0) start = std::max(start + static_cast(vec.size()), 0); @@ -127,8 +125,7 @@ auto 
vector_slice(const Vec& vec, int start, int stop, int step) -> Vec else if (stop > static_cast(vec.size())) stop = static_cast(vec.size()); - if (start >= stop) - return new_vec; + if (start >= stop) return new_vec; int count = (stop - 1 - start) / step + 1; new_vec.reserve(static_cast(count)); @@ -142,10 +139,8 @@ auto vector_slice(const Vec& vec, int start, int stop, int step) -> Vec template void vector_remove_slice(Vec& vec, int start, int stop, int step) { - if (step == 0) - throw std::invalid_argument("slice step cannot be zero"); - if (step < 0) - throw std::invalid_argument("step sizes below 0 lead to an invalid order of editops"); + if (step == 0) throw std::invalid_argument("slice step cannot be zero"); + if (step < 0) throw std::invalid_argument("step sizes below 0 lead to an invalid order of editops"); if (start < 0) start = std::max(start + static_cast(vec.size()), 0); @@ -157,8 +152,7 @@ void vector_remove_slice(Vec& vec, int start, int stop, int step) else if (stop > static_cast(vec.size())) stop = static_cast(vec.size()); - if (start >= stop) - return; + if (start >= stop) return; auto iter = vec.begin() + start; for (int i = start; i < static_cast(vec.size()); i++) @@ -492,11 +486,9 @@ class Opcodes : private std::vector { inline bool operator==(const Opcodes& lhs, const Opcodes& rhs) { - if (lhs.get_src_len() != rhs.get_src_len() || lhs.get_dest_len() != rhs.get_dest_len()) - return false; + if (lhs.get_src_len() != rhs.get_src_len() || lhs.get_dest_len() != rhs.get_dest_len()) return false; - if (lhs.size() != rhs.size()) - return false; + if (lhs.size() != rhs.size()) return false; return std::equal(lhs.begin(), lhs.end(), rhs.begin()); } diff --git a/rapidfuzz/distance.hpp b/rapidfuzz/distance.hpp index 11d522ae..bad1b486 100644 --- a/rapidfuzz/distance.hpp +++ b/rapidfuzz/distance.hpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace rapidfuzz { diff --git a/rapidfuzz/distance/DamerauLevenshtein.hpp b/rapidfuzz/distance/DamerauLevenshtein.hpp new file mode 100644 index 00000000..b7fd21d4 --- /dev/null +++ b/rapidfuzz/distance/DamerauLevenshtein.hpp @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2022-present Max Bachmann */ + +#include +#include +#include +#include +#include + +namespace rapidfuzz { +namespace detail { + +/* + * based on the paper Linear space string correction algorithm using the Damerau-Levenshtein distance + * from Chunchun Zhao and Sartaj Sahni + */ +template +int64_t damerau_levenshtein(Range s1, Range s2, int64_t max); + +} // namespace detail + +/* the API will require a change when adding custom weights */ +namespace experimental { +/** + * @brief Calculates the Damerau Levenshtein distance between two strings. + * + * + * @tparam Sentence1 This is a string that can be converted to + * basic_string_view + * @tparam Sentence2 This is a string that can be converted to + * basic_string_view + * + * @param s1 + * string to compare with s2 (for type info check Template parameters above) + * @param s2 + * string to compare with s1 (for type info check Template parameters above) + * @param max + * Maximum Damerau Levenshtein distance between s1 and s2, that is + * considered as a result. If the distance is bigger than max, + * max + 1 is returned instead. Default is std::numeric_limits::max(), + * which deactivates this behaviour. 
+ * + * @return Damerau Levenshtein distance between s1 and s2 + */ +template +int64_t damerau_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, + int64_t score_cutoff = std::numeric_limits::max()); + +template +int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2, + int64_t score_cutoff = std::numeric_limits::max()); + +template +int64_t damerau_levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, + int64_t score_cutoff = 0); + +template +int64_t damerau_levenshtein_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff = 0); + +template +double damerau_levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2, double score_cutoff = 1.0); + +template +double damerau_levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2, + double score_cutoff = 1.0); + +/** + * @brief Calculates a normalized Damerau Levenshtein similarity + * + * @details + * Both string require a similar length + * + * + * @tparam Sentence1 This is a string that can be converted to + * basic_string_view + * @tparam Sentence2 This is a string that can be converted to + * basic_string_view + * + * @param s1 + * string to compare with s2 (for type info check Template parameters above) + * @param s2 + * string to compare with s1 (for type info check Template parameters above) + * @param score_cutoff + * Optional argument for a score threshold as a float between 0 and 1.0. + * For ratio < score_cutoff 0 is returned instead. Default is 0, + * which deactivates this behaviour. + * + * @return Normalized Damerau Levenshtein distance between s1 and s2 + * as a float between 0 and 1.0 + */ +template +double damerau_levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2, double score_cutoff = 0.0); + +template +double damerau_levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2, + double score_cutoff = 0.0); + +template +struct CachedDamerauLevenshtein { + template + CachedDamerauLevenshtein(const Sentence1& s1_) + : CachedDamerauLevenshtein(detail::to_begin(s1_), detail::to_end(s1_)) + {} + + template + CachedDamerauLevenshtein(InputIt1 first1, InputIt1 last1) : s1(first1, last1) + {} + + template + int64_t distance(InputIt2 first2, InputIt2 last2, + int64_t score_cutoff = std::numeric_limits::max()) const; + + template + int64_t distance(const Sentence2& s2, int64_t score_cutoff = std::numeric_limits::max()) const; + + template + int64_t similarity(InputIt2 first2, InputIt2 last2, int64_t score_cutoff = 0) const; + + template + int64_t similarity(const Sentence2& s2, int64_t score_cutoff = 0) const; + + template + double normalized_distance(InputIt2 first2, InputIt2 last2, double score_cutoff = 1.0) const; + + template + double normalized_distance(const Sentence2& s2, double score_cutoff = 1.0) const; + + template + double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0) const; + + template + double normalized_similarity(const Sentence2& s2, double score_cutoff = 0.0) const; + +private: + std::basic_string s1; +}; + +#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L) +template +CachedDamerauLevenshtein(const Sentence1& s1_) -> CachedDamerauLevenshtein>; + +template +CachedDamerauLevenshtein(InputIt1 first1, InputIt1 last1) -> CachedDamerauLevenshtein>; +#endif + +} // namespace experimental +} // namespace rapidfuzz + +#include 
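For reference, a minimal usage sketch of the experimental API declared in the new DamerauLevenshtein.hpp above. It is only a sketch: it relies solely on the free functions and the CachedDamerauLevenshtein scorer introduced in this patch and on the header path added here, and the expected values mirror the "CA"/"ABC" case exercised in the new test file further below.

#include <rapidfuzz/distance/DamerauLevenshtein.hpp>

#include <cassert>
#include <string>

int main()
{
    using namespace rapidfuzz::experimental;

    std::string s1 = "CA";
    std::string s2 = "ABC";

    /* unrestricted Damerau-Levenshtein: "CA" -> "AC" (transposition) -> "ABC" (insertion) */
    assert(damerau_levenshtein_distance(s1, s2) == 2);

    /* normalized similarity = 1 - dist / max(len1, len2) = 1 - 2/3 */
    assert(damerau_levenshtein_normalized_similarity(s1, s2) > 0.33);

    /* the cached scorer stores s1 once so it can be compared against many candidates */
    CachedDamerauLevenshtein<char> scorer(s1);
    assert(scorer.distance(s2) == 2);
}
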
diff --git a/rapidfuzz/distance/DamerauLevenshtein.impl b/rapidfuzz/distance/DamerauLevenshtein.impl new file mode 100644 index 00000000..c0f222f9 --- /dev/null +++ b/rapidfuzz/distance/DamerauLevenshtein.impl @@ -0,0 +1,261 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2022-present Max Bachmann */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace rapidfuzz { +namespace detail { + +/* + * based on the paper + * "Linear space string correction algorithm using the Damerau-Levenshtein distance" + * from Chunchun Zhao and Sartaj Sahni + */ +template +int64_t damerau_levenshtein_distance_zhao(Range s1, Range s2, int64_t max) +{ + IntType len1 = static_cast(s1.size()); + IntType len2 = static_cast(s2.size()); + IntType maxVal = static_cast(std::max(len1, len2) + 1); + assert(std::numeric_limits::max() > maxVal); + + HybridGrowingHashmap last_row_id; + size_t size = static_cast(s2.size() + 2); + assume(size != 0); + std::vector FR_arr(size, maxVal); + std::vector R1_arr(size, maxVal); + std::vector R_arr(size); + R_arr[0] = maxVal; + std::iota(R_arr.begin() + 1, R_arr.end(), IntType(0)); + + IntType* R = &R_arr[1]; + IntType* R1 = &R1_arr[1]; + IntType* FR = &FR_arr[1]; + + for (IntType i = 1; i <= len1; i++) { + std::swap(R, R1); + IntType last_col_id = -1; + IntType last_i2l1 = R[0]; + R[0] = i; + IntType T = maxVal; + + for (IntType j = 1; j <= len2; j++) { + ptrdiff_t diag = R1[j - 1] + static_cast(s1[i - 1] != s2[j - 1]); + ptrdiff_t left = R[j - 1] + 1; + ptrdiff_t up = R1[j] + 1; + ptrdiff_t temp = std::min({diag, left, up}); + + if (s1[i - 1] == s2[j - 1]) { + last_col_id = j; // last occurence of s1_i + FR[j] = R1[j - 2]; // save H_k-1,j-2 + T = last_i2l1; // save H_i-2,l-1 + } + else { + ptrdiff_t k = last_row_id.get(static_cast(s2[j - 1])); + ptrdiff_t l = last_col_id; + + if ((j - l) == 1) { + ptrdiff_t transpose = FR[j] + (i - k); + temp = std::min(temp, transpose); + } + else if ((i - k) == 1) { + ptrdiff_t transpose = T + (j - l); + temp = std::min(temp, transpose); + } + } + + last_i2l1 = R[j]; + R[j] = static_cast(temp); + } + last_row_id.insert(static_cast(s1[i - 1]), i); + } + + int64_t dist = R[s2.size()]; + return (dist <= max) ? dist : max + 1; +} + +template +int64_t damerau_levenshtein_distance(Range s1, Range s2, int64_t max) +{ + int64_t min_edits = std::abs(s1.size() - s2.size()); + if (min_edits > max) return max + 1; + + /* common affix does not effect Levenshtein distance */ + remove_common_affix(s1, s2); + + ptrdiff_t maxVal = std::max(s1.size(), s2.size()) + 1; + if (std::numeric_limits::max() > maxVal) + return damerau_levenshtein_distance_zhao(s1, s2, max); + else if (std::numeric_limits::max() > maxVal) + return damerau_levenshtein_distance_zhao(s1, s2, max); + else + return damerau_levenshtein_distance_zhao(s1, s2, max); +} + +template +int64_t damerau_levenshtein_similarity(Range s1, Range s2, int64_t score_cutoff) +{ + auto maximum = std::max(s1.size(), s2.size()); + int64_t cutoff_distance = maximum - score_cutoff; + int64_t dist = damerau_levenshtein_distance(s1, s2, cutoff_distance); + int64_t sim = maximum - dist; + return (sim >= score_cutoff) ? 
sim : 0; +} + +template +double damerau_levenshtein_normalized_distance(Range s1, Range s2, double score_cutoff) +{ + auto maximum = std::max(s1.size(), s2.size()); + int64_t cutoff_distance = static_cast(std::ceil(static_cast(maximum) * score_cutoff)); + int64_t dist = damerau_levenshtein_distance(s1, s2, cutoff_distance); + double norm_dist = (maximum) ? static_cast(dist) / static_cast(maximum) : 0.0; + return (norm_dist <= score_cutoff) ? norm_dist : 1.0; +} + +template +double damerau_levenshtein_normalized_similarity(Range s1, Range s2, double score_cutoff) +{ + double cutoff_score = detail::NormSim_to_NormDist(score_cutoff); + double norm_dist = damerau_levenshtein_normalized_distance(s1, s2, cutoff_score); + double norm_sim = 1.0 - norm_dist; + return (norm_sim >= score_cutoff) ? norm_sim : 0.0; +} + +} // namespace detail + +namespace experimental { + +template +int64_t damerau_levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, + int64_t max) +{ + return detail::damerau_levenshtein_distance(detail::make_range(first1, last1), + detail::make_range(first2, last2), max); +} + +template +int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2, int64_t max) +{ + return detail::damerau_levenshtein_distance(detail::make_range(s1), detail::make_range(s2), max); +} + +template +double damerau_levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2, double score_cutoff) +{ + return detail::damerau_levenshtein_normalized_distance(detail::make_range(first1, last1), + detail::make_range(first2, last2), score_cutoff); +} + +template +double damerau_levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2, double score_cutoff) +{ + return detail::damerau_levenshtein_normalized_distance(detail::make_range(s1), detail::make_range(s2), + score_cutoff); +} + +template +int64_t damerau_levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, + int64_t score_cutoff) +{ + return detail::damerau_levenshtein_similarity(detail::make_range(first1, last1), + detail::make_range(first2, last2), score_cutoff); +} + +template +int64_t damerau_levenshtein_similarity(const Sentence1& s1, const Sentence2& s2, int64_t score_cutoff) +{ + return detail::damerau_levenshtein_similarity(detail::make_range(s1), detail::make_range(s2), + score_cutoff); +} + +template +double damerau_levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, + InputIt2 last2, double score_cutoff) +{ + return detail::damerau_levenshtein_normalized_similarity(detail::make_range(first1, last1), + detail::make_range(first2, last2), score_cutoff); +} + +template +double damerau_levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2, + double score_cutoff) +{ + return detail::damerau_levenshtein_normalized_similarity(detail::make_range(s1), detail::make_range(s2), + score_cutoff); +} + +template +template +int64_t CachedDamerauLevenshtein::distance(InputIt2 first2, InputIt2 last2, + int64_t score_cutoff) const +{ + return damerau_levenshtein_distance(detail::to_begin(s1), detail::to_end(s1), first2, last2, + score_cutoff); +} + +template +template +int64_t CachedDamerauLevenshtein::distance(const Sentence2& s2, int64_t score_cutoff) const +{ + return damerau_levenshtein_distance(s1, s2, score_cutoff); +} + +template +template +int64_t CachedDamerauLevenshtein::similarity(InputIt2 first2, InputIt2 last2, + int64_t score_cutoff) const +{ + return 
damerau_levenshtein_similarity(detail::to_begin(s1), detail::to_end(s1), first2, last2, + score_cutoff); +} + +template +template +int64_t CachedDamerauLevenshtein::similarity(const Sentence2& s2, int64_t score_cutoff) const +{ + return damerau_levenshtein_similarity(s1, s2, score_cutoff); +} + +template +template +double CachedDamerauLevenshtein::normalized_distance(InputIt2 first2, InputIt2 last2, + double score_cutoff) const +{ + return damerau_levenshtein_normalized_distance(detail::to_begin(s1), detail::to_end(s1), first2, last2, + score_cutoff); +} + +template +template +double CachedDamerauLevenshtein::normalized_distance(const Sentence2& s2, double score_cutoff) const +{ + return damerau_levenshtein_normalized_distance(s1, s2, score_cutoff); +} + +template +template +double CachedDamerauLevenshtein::normalized_similarity(InputIt2 first2, InputIt2 last2, + double score_cutoff) const +{ + return damerau_levenshtein_normalized_similarity(detail::to_begin(s1), detail::to_end(s1), first2, last2, + score_cutoff); +} + +template +template +double CachedDamerauLevenshtein::normalized_similarity(const Sentence2& s2, double score_cutoff) const +{ + return damerau_levenshtein_normalized_similarity(s1, s2, score_cutoff); +} + +} // namespace experimental +} // namespace rapidfuzz diff --git a/rapidfuzz/distance/Hamming.impl b/rapidfuzz/distance/Hamming.impl index 8368bd48..143a3058 100644 --- a/rapidfuzz/distance/Hamming.impl +++ b/rapidfuzz/distance/Hamming.impl @@ -139,6 +139,4 @@ double CachedHamming::normalized_similarity(const Sentence2& s2, double return hamming_normalized_similarity(s1, s2, score_cutoff); } -/**@}*/ - } // namespace rapidfuzz diff --git a/rapidfuzz/distance/Levenshtein.impl b/rapidfuzz/distance/Levenshtein.impl index 51cb4c53..e1fece8c 100644 --- a/rapidfuzz/distance/Levenshtein.impl +++ b/rapidfuzz/distance/Levenshtein.impl @@ -229,22 +229,19 @@ int64_t levenshtein_hyrroe2003_small_band(const BlockPatternMatchVector& PM, Ran /* Searching */ ptrdiff_t i = 0; - for (; i < s1.size() - max; ++i,++start_pos) { + for (; i < s1.size() - max; ++i, ++start_pos) { /* Step 1: Computing D0 */ uint64_t PM_j = 0; - if (start_pos < 0) - { + if (start_pos < 0) { PM_j = PM.get(0, s2[i]) << (-start_pos); } - else - { + else { size_t word = static_cast(start_pos) / 64; size_t word_pos = static_cast(start_pos) % 64; PM_j = PM.get(word, s2[i]) >> word_pos; - if (word + 1 < words && word_pos != 0) - PM_j |= PM.get(word + 1, s2[i]) << (64 - word_pos); + if (word + 1 < words && word_pos != 0) PM_j |= PM.get(word + 1, s2[i]) << (64 - word_pos); } uint64_t X = PM_j; uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN; @@ -261,22 +258,19 @@ int64_t levenshtein_hyrroe2003_small_band(const BlockPatternMatchVector& PM, Ran VN = (D0 >> 1) & HP; } - for (; i < s2.size(); ++i,++start_pos) { + for (; i < s2.size(); ++i, ++start_pos) { /* Step 1: Computing D0 */ uint64_t PM_j = 0; - if (start_pos < 0) - { + if (start_pos < 0) { PM_j = PM.get(0, s2[i]) << (-start_pos); } - else - { + else { size_t word = static_cast(start_pos) / 64; size_t word_pos = static_cast(start_pos) % 64; PM_j = PM.get(word, s2[i]) >> word_pos; - if (word + 1 < words && word_pos != 0) - PM_j |= PM.get(word + 1, s2[i]) << (64 - word_pos); + if (word + 1 < words && word_pos != 0) PM_j |= PM.get(word + 1, s2[i]) << (64 - word_pos); } uint64_t X = PM_j; uint64_t D0 = (((X & VP) + VP) ^ VP) | X | VN; diff --git a/test/distance/CMakeLists.txt b/test/distance/CMakeLists.txt index 2a700544..c4824775 100644 --- a/test/distance/CMakeLists.txt +++ 
b/test/distance/CMakeLists.txt @@ -9,3 +9,4 @@ rapidfuzz_add_test(Hamming) rapidfuzz_add_test(Indel) rapidfuzz_add_test(LCSseq) rapidfuzz_add_test(Levenshtein) +rapidfuzz_add_test(DamerauLevenshtein) diff --git a/test/distance/tests-DamerauLevenshtein.cpp b/test/distance/tests-DamerauLevenshtein.cpp new file mode 100644 index 00000000..079dc33d --- /dev/null +++ b/test/distance/tests-DamerauLevenshtein.cpp @@ -0,0 +1,84 @@ +#include "rapidfuzz/details/Range.hpp" +#include "rapidfuzz/details/types.hpp" +#include +#include +#include +#include + +#include + +template +std::basic_string str_multiply(std::basic_string a, unsigned int b) +{ + std::basic_string output; + while (b--) + output += a; + + return output; +} + +template +int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2, + int64_t max = std::numeric_limits::max()) +{ + int64_t res1 = rapidfuzz::experimental::damerau_levenshtein_distance(s1, s2, max); + rapidfuzz::experimental::CachedDamerauLevenshtein scorer(s1); + int64_t res2 = scorer.distance(s2, max); + REQUIRE(res1 == res2); + return res1; +} + +template +double damerau_levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2, + double score_cutoff = 0.0) +{ + double res1 = rapidfuzz::experimental::damerau_levenshtein_normalized_similarity(s1, s2, score_cutoff); + rapidfuzz::experimental::CachedDamerauLevenshtein scorer(s1); + double res2 = scorer.normalized_similarity(s2, score_cutoff); + REQUIRE(res1 == Catch::Approx(res2).epsilon(0.0001)); + return res1; +} + +TEST_CASE("Levenshtein") +{ + std::string test = "aaaa"; + std::wstring no_suffix = L"aaa"; + std::string no_suffix2 = "aaab"; + std::string swapped1 = "abaa"; + std::string swapped2 = "baaa"; + std::string replace_all = "bbbb"; + + SECTION("damerau levenshtein calculates correct distances") + { + REQUIRE(damerau_levenshtein_distance(test, test) == 0); + REQUIRE(damerau_levenshtein_distance(test, no_suffix) == 1); + REQUIRE(damerau_levenshtein_distance(swapped1, swapped2) == 1); + REQUIRE(damerau_levenshtein_distance(test, no_suffix2) == 1); + REQUIRE(damerau_levenshtein_distance(test, replace_all) == 4); + + { + std::string s1 = "CA"; + std::string s2 = "ABC"; + REQUIRE(damerau_levenshtein_distance(s1, s2) == 2); + } + } + + SECTION("weighted levenshtein calculates correct ratios") + { + REQUIRE(damerau_levenshtein_normalized_similarity(test, test) == 1.0); + REQUIRE(damerau_levenshtein_normalized_similarity(test, no_suffix) == + Catch::Approx(0.75).epsilon(0.0001)); + REQUIRE(damerau_levenshtein_normalized_similarity(swapped1, swapped2) == + Catch::Approx(0.75).epsilon(0.0001)); + REQUIRE(damerau_levenshtein_normalized_similarity(test, no_suffix2) == + Catch::Approx(0.75).epsilon(0.0001)); + REQUIRE(damerau_levenshtein_normalized_similarity(test, replace_all) == 0.0); + + { + std::string s1 = "CA"; + std::string s2 = "ABC"; + REQUIRE(damerau_levenshtein_normalized_similarity(s1, s2) == + Catch::Approx(0.33333).epsilon(0.0001)); + } + } +} diff --git a/test/distance/tests-Levenshtein.cpp b/test/distance/tests-Levenshtein.cpp index 828b0355..1092f4e1 100644 --- a/test/distance/tests-Levenshtein.cpp +++ b/test/distance/tests-Levenshtein.cpp @@ -9,7 +9,8 @@ #include template -std::basic_string str_multiply(std::basic_string a, unsigned int b) { +std::basic_string str_multiply(std::basic_string a, unsigned int b) +{ std::basic_string output; while (b--) output += a; @@ -50,6 +51,15 @@ TEST_CASE("Levenshtein") std::string swapped2 = "baaa"; std::string replace_all = "bbbb"; + 
SECTION("levenshtein calculates correct distances") + { + REQUIRE(levenshtein_distance(test, test) == 0); + REQUIRE(levenshtein_distance(test, no_suffix) == 1); + REQUIRE(levenshtein_distance(swapped1, swapped2) == 2); + REQUIRE(levenshtein_distance(test, no_suffix2) == 1); + REQUIRE(levenshtein_distance(test, replace_all) == 4); + } + SECTION("weighted levenshtein calculates correct distances") { REQUIRE(levenshtein_distance(test, test, {1, 1, 2}) == 0); @@ -125,22 +135,28 @@ TEST_CASE("Levenshtein") } { - std::string s1 = "accccccccccaaaaaaaccccccccccccccccccccccccccccccaccccccccccccccccccccccccccccccccccccccccccccccccccaaaaaaaaaaaaacccccccccccccccccccccc"; - std::string s2 = "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccbcccb"; + std::string s1 = "accccccccccaaaaaaaccccccccccccccccccccccccccccccacccccccccccccccccccccccccccccc" + "ccccccccccccccccccccaaaaaaaaaaaaacccccccccccccccccccccc"; + std::string s2 = "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" + "ccccccccccccccccccccccccccccccccccccbcccb"; REQUIRE(levenshtein_distance(s1, s2, {1, 1, 1}) == 24); REQUIRE(levenshtein_distance(s1, s2, {1, 1, 1}, 25) == 24); } { - std::string s1 = "miiiiiiiiiiliiiiiiibghiiaaaaaaaaaaaaaaacccfccccedddaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - std::string s2 = "aaaaaaajaaaaaaaabghiiaaaaaaaaaaaaaaacccfccccedddaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaajjdim"; + std::string s1 = "miiiiiiiiiiliiiiiiibghiiaaaaaaaaaaaaaaacccfccccedddaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + "aaaaaaaaaaaaa"; + std::string s2 = + "aaaaaaajaaaaaaaabghiiaaaaaaaaaaaaaaacccfccccedddaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaajjdim"; REQUIRE(levenshtein_distance(s1, s2, {1, 1, 1}) == 27); REQUIRE(levenshtein_distance(s1, s2, {1, 1, 1}, 27) == 27); } { - std::string s1 = "lllllfllllllllllllllllllllllllllllllllllllllllllllllllglllllilldcaaaaaaaaaaaaaaaaaaadbbllllllllllhllllllllllllllllllllllllllgl"; - std::string s2 = "aaaaaaaaaaaaaadbbllllllllllllllelllllllllllllllllllllllllllllllglllllilldcaaaaaaaaaaaaaaaaaaadbbllllllllllllllellllllllllllllhlllllllllill"; + std::string s1 = "lllllfllllllllllllllllllllllllllllllllllllllllllllllllglllllilldcaaaaaaaaaaaaaa" + "aaaaadbbllllllllllhllllllllllllllllllllllllllgl"; + std::string s2 = "aaaaaaaaaaaaaadbbllllllllllllllelllllllllllllllllllllllllllllllglllllilldcaaaaa" + "aaaaaaaaaaaaaadbbllllllllllllllellllllllllllllhlllllllllill"; REQUIRE(levenshtein_distance(s1, s2, {1, 1, 1}) == 23); REQUIRE(levenshtein_distance(s1, s2, {1, 1, 1}, 27) == 23); REQUIRE(levenshtein_distance(s1, s2, {1, 1, 1}, 28) == 23); @@ -173,9 +189,8 @@ TEST_CASE("Levenshtein_find_hirschberg_pos") std::string s1 = str_multiply(std::string("abb"), 2); std::string s2 = str_multiply(std::string("ccccca"), 2); - auto hpos = rapidfuzz::detail::find_hirschberg_pos( - rapidfuzz::detail::make_range(s1), rapidfuzz::detail::make_range(s2) - ); + auto hpos = rapidfuzz::detail::find_hirschberg_pos(rapidfuzz::detail::make_range(s1), + rapidfuzz::detail::make_range(s2)); REQUIRE(hpos.left_score == 5); REQUIRE(hpos.right_score == 6); REQUIRE(hpos.s2_mid == 6); @@ -183,12 +198,11 @@ TEST_CASE("Levenshtein_find_hirschberg_pos") } { - std::string s1 = str_multiply(std::string("abb"), 8*64); - std::string s2 = str_multiply(std::string("ccccca"), 8*64); + std::string s1 = str_multiply(std::string("abb"), 8 * 64); + std::string s2 = str_multiply(std::string("ccccca"), 8 * 64); - auto hpos = rapidfuzz::detail::find_hirschberg_pos( - rapidfuzz::detail::make_range(s1), 
rapidfuzz::detail::make_range(s2) - ); + auto hpos = rapidfuzz::detail::find_hirschberg_pos(rapidfuzz::detail::make_range(s1), + rapidfuzz::detail::make_range(s2)); REQUIRE(hpos.left_score == 1280); REQUIRE(hpos.right_score == 1281); REQUIRE(hpos.s2_mid == 1536); @@ -214,10 +228,9 @@ TEST_CASE("Levenshtein_editops[fuzzing_regressions]") } { - std::string s1 = str_multiply(std::string("abb"), 8*64); - std::string s2 = str_multiply(std::string("ccccca"), 8*64); + std::string s1 = str_multiply(std::string("abb"), 8 * 64); + std::string s2 = str_multiply(std::string("ccccca"), 8 * 64); rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2); REQUIRE(s2 == rapidfuzz::editops_apply(ops, s1, s2)); } - } \ No newline at end of file
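
One behaviour of the new scorer worth spelling out, since the header's doc comment states it but the new tests do not exercise it directly: when the computed distance is bigger than the max argument, max + 1 is returned instead of the exact value. A small sketch of that documented contract, using only the API added in this patch; the exact distance of 4 for "aaaa" vs "bbbb" comes from the test cases above.

#include <rapidfuzz/distance/DamerauLevenshtein.hpp>

#include <cassert>
#include <string>

int main()
{
    using rapidfuzz::experimental::damerau_levenshtein_distance;

    std::string a = "aaaa";
    std::string b = "bbbb"; /* true distance is 4 (four substitutions) */

    /* without a cutoff the exact distance is returned */
    assert(damerau_levenshtein_distance(a, b) == 4);

    /* with max = 2 the distance exceeds the cutoff, so max + 1 == 3 is returned,
       signalling to the caller that the cutoff was exceeded */
    assert(damerau_levenshtein_distance(a, b, 2) == 3);
}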