Skip to content

Commit

Permalink
feat: add levenshtein distance calculator (#260)
Browse files Browse the repository at this point in the history
This PR adds a function that calculates the Levenshtein distance between two `ByteArray` values.

## Pull Request type

<!-- Please try to limit your pull request to one type; submit multiple
pull requests if needed. -->

Please check the type of change your PR introduces:

- [ ] Bugfix
- [x] Feature
- [ ] Code style update (formatting, renaming)
- [ ] Refactoring (no functional changes, no API changes)
- [ ] Build-related changes
- [ ] Documentation content changes
- [ ] Other (please describe):

## What is the current behavior?

<!-- Please describe the current behavior that you are modifying, or
link to a relevant issue. -->

Issue Number: N/A

## What is the new behavior?

<!-- Please describe the behavior or changes that are being added by
this PR. -->

- Calculates the Levenshtein distance between two `ByteArray` values.
- All tests pass.

## Does this introduce a breaking change?

- [ ] Yes
- [x] No

<!-- If this does introduce a breaking change, please describe the
impact and migration path for existing applications below. -->

## Other information

<!-- Any other information that is important to this PR, such as
screenshots of how the component looks before and after the change. -->
  • Loading branch information
Soptq authored Jan 25, 2024
1 parent 7dea99e commit c2308aa
Show file tree
Hide file tree
Showing 5 changed files with 197 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/searching/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ The binary search algorithm is a simple search in an ordered array-like compound
## [Dijkstra](./src/dijkstra.cairo)

Dijkstra's algorithm is a graph search algorithm that finds the shortest path from a source node to all other nodes in a weighted graph, ensuring the shortest distances are progressively updated as it explores nodes. It maintains a priority queue of nodes based on their tentative distances from the source and greedily selects the node with the smallest distance at each step.

## [Levenshtein distance](./src/levenshtein_distance.cairo)

The Levenshtein distance is a string metric for measuring the difference between two sequences. It is the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one string into the other. This version of the algorithm optimizes the space complexity by keeping a single row of the distance table. Time complexity: O(nm). Space complexity: O(n), where n and m are the lengths of the first and second byte arrays.
77 changes: 77 additions & 0 deletions src/searching/src/levenshtein_distance.cairo
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// The Levenshtein Distance
use dict::Felt252DictTrait;


/// Compute the Levenshtein (edit) distance between two byte arrays.
///
/// Only a single row of the dynamic-programming table is kept. It lives in a
/// `Felt252Dict` keyed by column index (`0..=arr1.len()`) and is overwritten
/// in place while the bytes of `arr2` are processed one at a time, so the
/// working storage grows with `arr1.len()` and the number of cell updates
/// with `arr1.len() * arr2.len()`.
/// # Arguments
/// * `arr1` - The first byte array.
/// * `arr2` - The second byte array.
/// # Returns
/// * `usize` - The edit distance between the two byte arrays.
fn levenshtein_distance(arr1: @ByteArray, arr2: @ByteArray) -> usize {
    let arr1_len = arr1.len();
    let arr2_len = arr2.len();

    // When one side is empty the distance is simply the length of the other
    // side; returning here also avoids seeding a row that is never used.
    if arr1_len == 0 {
        return arr2_len;
    }
    if arr2_len == 0 {
        return arr1_len;
    }

    // Seed the row with 0, 1, ..., arr1_len: the cost of turning the first
    // `k` bytes of `arr1` into an empty sequence.
    let mut distances = felt252_dict_new::<usize>();
    let mut index: usize = 0;
    loop {
        if index == arr1_len + 1 {
            break;
        }
        distances.insert(index.into(), index);
        index += 1;
    };

    let mut row: usize = 0;
    loop {
        if row == arr2_len {
            break;
        }
        let row_byte = arr2.at(row).unwrap();

        // `diagonal` carries the previous row's value for the column to the
        // left of the cell being computed; grab it before it is overwritten.
        let mut diagonal = distances.get(0);
        distances.insert(0, row + 1);

        let mut column: usize = 0;
        loop {
            if column == arr1_len {
                break;
            }
            let next_column = column + 1;
            // Left neighbour: already updated for the current row.
            let left = distances.get(column.into());
            // Upper neighbour: still holds the previous row's value.
            let above = distances.get(next_column.into());

            // Start from the diagonal move (free when the bytes match), then
            // keep whichever of the three moves is cheapest.
            let mut min_cost = if arr1.at(column).unwrap() == row_byte {
                diagonal
            } else {
                diagonal + 1
            };
            if left + 1 < min_cost {
                min_cost = left + 1;
            }
            if above + 1 < min_cost {
                min_cost = above + 1;
            }

            // The value just read as `above` is the diagonal of the next
            // column, so reuse it instead of reading the dictionary again.
            diagonal = above;
            distances.insert(next_column.into(), min_cost);

            column = next_column;
        };

        row += 1;
    };

    // The last cell of the final row is the distance between the full inputs.
    distances.get(arr1_len.into())
}
1 change: 1 addition & 0 deletions src/searching/src/lib.cairo
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod binary_search;
mod bm_search;
mod dijkstra;
mod levenshtein_distance;

#[cfg(test)]
mod tests;
1 change: 1 addition & 0 deletions src/searching/src/tests.cairo
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mod binary_search_test;
mod bm_search_test;
mod dijkstra_test;
mod levenshtein_distance_test;
114 changes: 114 additions & 0 deletions src/searching/src/tests/levenshtein_distance_test.cairo
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
use alexandria_searching::levenshtein_distance::levenshtein_distance;


// "FROG" against "DOG": the expected edit distance is 2.
// NOTE(review): the `bm_search_` prefix looks inherited from the Boyer-Moore
// tests; this case exercises `levenshtein_distance`.
#[test]
#[available_gas(5000000)]
fn bm_search_test_1() {
    // "DOG" = 0x44 0x4f 0x47
    let mut dog: ByteArray = Default::default();
    dog.append_byte(0x44_u8);
    dog.append_byte(0x4f_u8);
    dog.append_byte(0x47_u8);

    // "FROG" = 0x46 0x52 0x4f 0x47
    let mut frog: ByteArray = Default::default();
    frog.append_byte(0x46_u8);
    frog.append_byte(0x52_u8);
    frog.append_byte(0x4f_u8);
    frog.append_byte(0x47_u8);

    assert(levenshtein_distance(@frog, @dog) == 2, 'invalid result');
}

// Two empty byte arrays: the expected edit distance is 0.
#[test]
#[available_gas(5000000)]
fn bm_search_test_2() {
    let first: ByteArray = Default::default();
    let second: ByteArray = Default::default();

    assert(levenshtein_distance(@first, @second) == 0, 'invalid result');
}

// Empty first array against a single byte: the expected edit distance is 1.
#[test]
#[available_gas(5000000)]
fn bm_search_test_3() {
    let empty: ByteArray = Default::default();

    // "a" = 0x61
    let mut single: ByteArray = Default::default();
    single.append_byte(0x61_u8);

    assert(levenshtein_distance(@empty, @single) == 1, 'invalid result');
}

// A single byte against an empty second array: the expected edit distance is 1.
#[test]
#[available_gas(5000000)]
fn bm_search_test_4() {
    let empty: ByteArray = Default::default();

    // "a" = 0x61
    let mut single: ByteArray = Default::default();
    single.append_byte(0x61_u8);

    assert(levenshtein_distance(@single, @empty) == 1, 'invalid result');
}

// "ab" against "a": the expected edit distance is 1.
#[test]
#[available_gas(5000000)]
fn bm_search_test_5() {
    // "a" = 0x61
    let mut shorter: ByteArray = Default::default();
    shorter.append_byte(0x61_u8);

    // "ab" = 0x61 0x62
    let mut longer: ByteArray = Default::default();
    longer.append_byte(0x61_u8);
    longer.append_byte(0x62_u8);

    assert(levenshtein_distance(@longer, @shorter) == 1, 'invalid result');
}

// "foobar" against an identical "foobar": the expected edit distance is 0.
#[test]
#[available_gas(5000000)]
fn bm_search_test_6() {
    // "foobar" = 0x66 0x6f 0x6f 0x62 0x61 0x72
    let mut left: ByteArray = Default::default();
    left.append_byte(0x66_u8);
    left.append_byte(0x6f_u8);
    left.append_byte(0x6f_u8);
    left.append_byte(0x62_u8);
    left.append_byte(0x61_u8);
    left.append_byte(0x72_u8);

    // Same six bytes, built independently.
    let mut right: ByteArray = Default::default();
    right.append_byte(0x66_u8);
    right.append_byte(0x6f_u8);
    right.append_byte(0x6f_u8);
    right.append_byte(0x62_u8);
    right.append_byte(0x61_u8);
    right.append_byte(0x72_u8);

    assert(levenshtein_distance(@left, @right) == 0, 'invalid result');
}

// "foobar" against "barfoo": the expected edit distance is 6.
#[test]
#[available_gas(5000000)]
fn bm_search_test_7() {
    // "barfoo" = 0x62 0x61 0x72 0x66 0x6f 0x6f
    let mut barfoo: ByteArray = Default::default();
    barfoo.append_byte(0x62_u8);
    barfoo.append_byte(0x61_u8);
    barfoo.append_byte(0x72_u8);
    barfoo.append_byte(0x66_u8);
    barfoo.append_byte(0x6f_u8);
    barfoo.append_byte(0x6f_u8);

    // "foobar" = 0x66 0x6f 0x6f 0x62 0x61 0x72
    let mut foobar: ByteArray = Default::default();
    foobar.append_byte(0x66_u8);
    foobar.append_byte(0x6f_u8);
    foobar.append_byte(0x6f_u8);
    foobar.append_byte(0x62_u8);
    foobar.append_byte(0x61_u8);
    foobar.append_byte(0x72_u8);

    assert(levenshtein_distance(@foobar, @barfoo) == 6, 'invalid result');
}

0 comments on commit c2308aa

Please sign in to comment.