generated from eigerco/beerus
-
Notifications
You must be signed in to change notification settings - Fork 104
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add levenshtein distance calculator (#260)
This PR adds calculation of the Levenshtein distance between two `ByteArray` values. ## Pull Request type <!-- Please try to limit your pull request to one type; submit multiple pull requests if needed. --> Please check the type of change your PR introduces: - [ ] Bugfix - [x] Feature - [ ] Code style update (formatting, renaming) - [ ] Refactoring (no functional changes, no API changes) - [ ] Build-related changes - [ ] Documentation content changes - [ ] Other (please describe): ## What is the current behavior? <!-- Please describe the current behavior that you are modifying, or link to a relevant issue. --> Issue Number: N/A ## What is the new behavior? <!-- Please describe the behavior or changes that are being added by this PR. --> - Calculates the Levenshtein distance between two `ByteArray` values. - All tests pass. ## Does this introduce a breaking change? - [ ] Yes - [x] No <!-- If this does introduce a breaking change, please describe the impact and migration path for existing applications below. --> ## Other information <!-- Any other information that is important to this PR, such as screenshots of how the component looks before and after the change. -->
- Loading branch information
Showing
5 changed files
with
197 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
// The Levenshtein Distance | ||
use dict::Felt252DictTrait; | ||
|
||
|
||
/// Compute the Levenshtein (edit) distance between two byte arrays.
///
/// The distance is the minimum number of single-byte insertions, deletions
/// and substitutions needed to turn one array into the other. Only a single
/// row of the dynamic-programming table is kept, stored in a `Felt252Dict`
/// keyed by column index, and it is overwritten in place while `arr2` is
/// walked byte by byte.
///
/// # Arguments
/// * `arr1` - The first byte array.
/// * `arr2` - The second byte array.
/// # Returns
/// * `usize` - The edit distance between the two byte arrays.
fn levenshtein_distance(arr1: @ByteArray, arr2: @ByteArray) -> usize {
    let arr1_len = arr1.len();
    let arr2_len = arr2.len();

    // If either array is empty the distance is the length of the other one.
    // Answering here avoids filling the dictionary for nothing.
    if arr1_len == 0 {
        return arr2_len;
    }
    if arr2_len == 0 {
        return arr1_len;
    }

    // Seed the row for the empty prefix of `arr2`: entry `index` holds `index`,
    // for every index in 0..=arr1_len.
    let mut prev_distances = felt252_dict_new::<usize>();
    let mut index: usize = 0;
    loop {
        if index == arr1_len + 1 {
            break;
        }
        prev_distances.insert(index.into(), index);
        index += 1;
    };

    // One pass per byte of `arr2`; each pass rewrites the row in place.
    let mut current_row: usize = 0;
    loop {
        if current_row == arr2_len {
            break;
        }
        let second_array_element = arr2.at(current_row).unwrap();

        // Remember the previous row's value at column 0 (the diagonal for the
        // first cell) before overwriting it with this row's value.
        let mut previous_substitution_cost = prev_distances.get(0);
        prev_distances.insert(0, current_row + 1);

        let mut current_column: usize = 0;
        loop {
            if current_column == arr1_len {
                break;
            }
            let first_array_element = arr1.at(current_column).unwrap();
            let next_column: felt252 = (current_column + 1).into();

            // Left neighbour: this entry was already rewritten for the current row.
            let deletion_cost = prev_distances.get(current_column.into()) + 1;
            // Upper neighbour: still the previous row's value. It is read once
            // and reused below as the diagonal for the next column.
            let upper_distance = prev_distances.get(next_column);
            let insertion_cost = upper_distance + 1;
            // Diagonal neighbour: free when the bytes match, one edit otherwise.
            let substitution_cost = if first_array_element == second_array_element {
                previous_substitution_cost
            } else {
                previous_substitution_cost + 1
            };
            previous_substitution_cost = upper_distance;

            // Keep the cheapest of the three candidates.
            let mut min_cost = deletion_cost;
            if insertion_cost < min_cost {
                min_cost = insertion_cost;
            }
            if substitution_cost < min_cost {
                min_cost = substitution_cost;
            }
            prev_distances.insert(next_column, min_cost);

            current_column += 1;
        };

        current_row += 1;
    };

    // The last entry of the final row is the distance between the full arrays.
    prev_distances.get(arr1_len.into())
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
mod binary_search; | ||
mod bm_search; | ||
mod dijkstra; | ||
mod levenshtein_distance; | ||
|
||
#[cfg(test)] | ||
mod tests; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
mod binary_search_test; | ||
mod bm_search_test; | ||
mod dijkstra_test; | ||
mod levenshtein_distance_test; |
114 changes: 114 additions & 0 deletions
114
src/searching/src/tests/levenshtein_distance_test.cairo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
use alexandria_searching::levenshtein_distance::levenshtein_distance; | ||
|
||
|
||
// Distance between "FROG" and "DOG" is expected to be 2.
// NOTE(review): the `bm_search` prefix looks copied from another test file;
// this test exercises `levenshtein_distance`.
#[test]
#[available_gas(5000000)]
fn bm_search_test_1() {
    // "FROG" -> 0x46, 0x52, 0x4f, 0x47
    let mut frog: ByteArray = Default::default();
    frog.append_byte(0x46_u8);
    frog.append_byte(0x52_u8);
    frog.append_byte(0x4f_u8);
    frog.append_byte(0x47_u8);

    // "DOG" -> 0x44, 0x4f, 0x47
    let mut dog: ByteArray = Default::default();
    dog.append_byte(0x44_u8);
    dog.append_byte(0x4f_u8);
    dog.append_byte(0x47_u8);

    assert(levenshtein_distance(@frog, @dog) == 2, 'invalid result');
}
|
||
// Two empty byte arrays are expected to be at distance 0.
// NOTE(review): the `bm_search` prefix looks copied from another test file;
// this test exercises `levenshtein_distance`.
#[test]
#[available_gas(5000000)]
fn bm_search_test_2() {
    let left: ByteArray = Default::default();
    let right: ByteArray = Default::default();

    assert(levenshtein_distance(@left, @right) == 0, 'invalid result');
}
|
||
// Empty first array against a one-byte second array: expected distance 1.
// NOTE(review): the `bm_search` prefix looks copied from another test file;
// this test exercises `levenshtein_distance`.
#[test]
#[available_gas(5000000)]
fn bm_search_test_3() {
    let empty: ByteArray = Default::default();

    // "a" -> 0x61
    let mut single: ByteArray = Default::default();
    single.append_byte(0x61_u8);

    assert(levenshtein_distance(@empty, @single) == 1, 'invalid result');
}
|
||
// One-byte first array against an empty second array: expected distance 1.
// NOTE(review): the `bm_search` prefix looks copied from another test file;
// this test exercises `levenshtein_distance`.
#[test]
#[available_gas(5000000)]
fn bm_search_test_4() {
    // "a" -> 0x61
    let mut single: ByteArray = Default::default();
    single.append_byte(0x61_u8);

    let empty: ByteArray = Default::default();

    assert(levenshtein_distance(@single, @empty) == 1, 'invalid result');
}
|
||
// "ab" against "a": expected distance 1.
// NOTE(review): the `bm_search` prefix looks copied from another test file;
// this test exercises `levenshtein_distance`.
#[test]
#[available_gas(5000000)]
fn bm_search_test_5() {
    // "ab" -> 0x61, 0x62
    let mut longer: ByteArray = Default::default();
    longer.append_byte(0x61_u8);
    longer.append_byte(0x62_u8);

    // "a" -> 0x61
    let mut shorter: ByteArray = Default::default();
    shorter.append_byte(0x61_u8);

    assert(levenshtein_distance(@longer, @shorter) == 1, 'invalid result');
}
|
||
// Two identical "foobar" arrays are expected to be at distance 0.
// NOTE(review): the `bm_search` prefix looks copied from another test file;
// this test exercises `levenshtein_distance`.
#[test]
#[available_gas(5000000)]
fn bm_search_test_6() {
    // "foobar" -> 0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
    let mut first: ByteArray = Default::default();
    first.append_byte(0x66_u8);
    first.append_byte(0x6f_u8);
    first.append_byte(0x6f_u8);
    first.append_byte(0x62_u8);
    first.append_byte(0x61_u8);
    first.append_byte(0x72_u8);

    // Same bytes again, built as a separate array.
    let mut second: ByteArray = Default::default();
    second.append_byte(0x66_u8);
    second.append_byte(0x6f_u8);
    second.append_byte(0x6f_u8);
    second.append_byte(0x62_u8);
    second.append_byte(0x61_u8);
    second.append_byte(0x72_u8);

    assert(levenshtein_distance(@first, @second) == 0, 'invalid result');
}
|
||
// "foobar" against "barfoo": expected distance 6.
// NOTE(review): the `bm_search` prefix looks copied from another test file;
// this test exercises `levenshtein_distance`.
#[test]
#[available_gas(5000000)]
fn bm_search_test_7() {
    // "foobar" -> 0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
    let mut foobar: ByteArray = Default::default();
    foobar.append_byte(0x66_u8);
    foobar.append_byte(0x6f_u8);
    foobar.append_byte(0x6f_u8);
    foobar.append_byte(0x62_u8);
    foobar.append_byte(0x61_u8);
    foobar.append_byte(0x72_u8);

    // "barfoo" -> 0x62, 0x61, 0x72, 0x66, 0x6f, 0x6f
    let mut barfoo: ByteArray = Default::default();
    barfoo.append_byte(0x62_u8);
    barfoo.append_byte(0x61_u8);
    barfoo.append_byte(0x72_u8);
    barfoo.append_byte(0x66_u8);
    barfoo.append_byte(0x6f_u8);
    barfoo.append_byte(0x6f_u8);

    assert(levenshtein_distance(@foobar, @barfoo) == 6, 'invalid result');
}