Skip to content

Commit

Permalink
Try to group highlighted changes together
Browse files Browse the repository at this point in the history
Sometimes the highlighting of a change is not as clear as it could
be. For example the change

    -printf "%s\n" s y y ...
    +test_write_lines s y n ...

is highlighted as

    -[printf "%]s[\n"] [s ]y y ...
    +[test_write_lines ]s y y ...

rather than

    -[printf "%s\n"] s y n ...
    +[test_write_lines] s y y ...

This is because the Levenshtein distance calculation only checks if
the current tokens match without considering if the previous tokens
matched or not. Adding a small penalty for starting a run of changed
tokens forces the changes to be grouped together whenever possible. A
knock on effect of adding the penalty is that the cost a substitution
needs to be increased relative to the cost of an insertion/deletion
otherwise the lowest cost for "ab" -> "ba" is two substitutions rather
than an insertion, match and deletion.

There are several changes to the tests

 - the Levenshtein distance is updated to reflect the new calculation in
   tests::align::*

 - several new tests are added to tests::align to check the grouping of
   different combinations of insertions and deletions

 - tests::edits::test_infer_edits_10 shows an improvement in the
   highlighting as the unchanged space at the end of the changed tokens
   is no longer highlighted. This is because it is no longer considered
   to be deleted by the Levenshtein matching as deleting the space
   between the deleted words now has a lower cost.

 - there is a new test in tests::edits using the example in this
   message.
  • Loading branch information
phillipwood committed Nov 25, 2022
1 parent 115c38d commit b3057d4
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 22 deletions.
146 changes: 129 additions & 17 deletions src/align.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
use std::cmp::max;
use std::collections::VecDeque;

const SUBSTITUTION_COST: usize = 1;
const DELETION_COST: usize = 1;
const INSERTION_COST: usize = 1;
// Cost invariants:
// SUBSTITUTION_COST < DELETION_COST + INSERTION_COST
// DELETION_COST + INSERTION_COST + INITIAL_MISMATCH_PENALITY < 2 * SUBSTITUTION_COST
const SUBSTITUTION_COST: usize = 3;
const DELETION_COST: usize = 2;
const INSERTION_COST: usize = 2;
// extra cost for starting a new group of changed tokens
const INITIAL_MISMATCH_PENALITY: usize = 1;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Operation {
Expand Down Expand Up @@ -61,14 +66,14 @@ impl<'a> Alignment<'a> {
self.table[i] = Cell {
parent: 0,
operation: Deletion,
cost: i,
cost: i * DELETION_COST + INITIAL_MISMATCH_PENALITY,
};
}
for j in 1..self.dim[0] {
self.table[j * self.dim[1]] = Cell {
parent: 0,
operation: Insertion,
cost: j,
cost: j * INSERTION_COST + INITIAL_MISMATCH_PENALITY,
};
}

Expand All @@ -77,26 +82,31 @@ impl<'a> Alignment<'a> {
let (left, diag, up) =
(self.index(i, j + 1), self.index(i, j), self.index(i + 1, j));
// The order of the candidates matters if two of them have the
// same cost as in that case we choose the first one. Insertions
// are preferred to deletions in order to highlight moved tokens
// as a deletion followed by an insertion (as the edit sequence
// is read backwards we need to choose the insertion first)
// same cost as in that case we choose the first one. Consider
// insertions and deletions before matches in order to group
// changes together. Insertions are preferred to deletions in
// order to highlight moved tokens as a deletion followed by an
// insertion (as the edit sequence is read backwards we need to
// choose the insertion first)
let candidates = [
Cell {
parent: up,
operation: Insertion,
cost: self.table[up].cost + INSERTION_COST,
cost: self.mismatch_cost(up, INSERTION_COST),
},
Cell {
parent: left,
operation: Deletion,
cost: self.table[left].cost + DELETION_COST,
cost: self.mismatch_cost(left, DELETION_COST),
},
Cell {
parent: diag,
operation: if x_i == y_j { NoOp } else { Substitution },
cost: self.table[diag].cost
+ if x_i == y_j { 0 } else { SUBSTITUTION_COST },
cost: if x_i == y_j {
self.table[diag].cost
} else {
self.mismatch_cost(diag, SUBSTITUTION_COST)
},
},
];
let index = self.index(i + 1, j + 1);
Expand All @@ -109,6 +119,16 @@ impl<'a> Alignment<'a> {
}
}

fn mismatch_cost(&self, parent: usize, basic_cost: usize) -> usize {
self.table[parent].cost
+ basic_cost
+ if self.table[parent].operation == NoOp {
INITIAL_MISMATCH_PENALITY
} else {
0
}
}

/// Read edit operations from the table.
pub fn operations(&self) -> Vec<Operation> {
let mut ops = VecDeque::with_capacity(max(self.x.len(), self.y.len()));
Expand Down Expand Up @@ -248,7 +268,7 @@ mod tests {
TestCase {
before: "aaa",
after: "aba",
distance: 1,
distance: 4,
parts: (1, 3),
operations: vec![NoOp, Substitution, NoOp],
}
Expand All @@ -260,7 +280,7 @@ mod tests {
TestCase {
before: "ááb",
after: "áaa",
distance: 2,
distance: 7,
parts: (2, 3),
operations: vec![NoOp, Substitution, Substitution],
}
Expand All @@ -272,7 +292,7 @@ mod tests {
TestCase {
before: "kitten",
after: "sitting",
distance: 3,
distance: 11,
parts: (3, 7),
operations: vec![
Substitution, // K S
Expand All @@ -292,7 +312,7 @@ mod tests {
TestCase {
before: "saturday",
after: "sunday",
distance: 3,
distance: 9,
parts: (3, 8),
operations: vec![
NoOp, // S S
Expand All @@ -308,6 +328,98 @@ mod tests {
.run();
}

#[test]
fn test_3() {
TestCase {
// Prefer [Deletion NoOp Insertion] over [Substitution Substitution]
before: "ab",
after: "ba",
distance: 6,
parts: (2, 3),
operations: vec![
Deletion, // a -
NoOp, // b b
Insertion, // - a
],
}
.run();
}

#[test]
fn test_4() {
// Deletions are grouped together.
TestCase {
before: "AABB",
after: "AB",
distance: 5,
parts: (2, 4),
operations: vec![
NoOp, // A A
Deletion, // A -
Deletion, // B -
NoOp, // B B
],
}
.run();
}

#[test]
fn test_5() {
// Insertions are grouped together.
TestCase {
before: "AB",
after: "AABB",
distance: 5,
parts: (2, 4),
operations: vec![
NoOp, // A A
Insertion, // - A
Insertion, // - B
NoOp, // B B
],
}
.run();
}

#[test]
fn test_6() {
// Deletion and substitution are grouped together.
TestCase {
before: "AAABBB",
after: "ACB",
distance: 10,
parts: (4, 6),
operations: vec![
NoOp, // A A
Substitution, // A C
Deletion, // A -
Deletion, // B -
Deletion, // B -
NoOp, // B B
],
}
.run();
}

#[test]
fn test_7() {
// Insertion and Substitution are grouped together
TestCase {
before: "ACB",
after: "AADBB",
distance: 8,
parts: (3, 5),
operations: vec![
NoOp, // A A
Substitution, // C A
Insertion, // - D
Insertion, // - B
NoOp, // B B
],
}
.run();
}

struct TestCase<'a> {
before: &'a str,
after: &'a str,
Expand Down
29 changes: 24 additions & 5 deletions src/edits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -691,17 +691,15 @@ mod tests {
(Deletion, "the"),
(Deletion, " "),
(Deletion, "commit"),
(Deletion, " "),
(Deletion, "number "),
(MinusNoop, "from any one of them."),
(Deletion, " number"),
(MinusNoop, " from any one of them."),
]],
vec![vec![
(PlusNoop, "so it is safe to read "),
(Insertion, "build"),
(Insertion, " "),
(Insertion, "info"),
(Insertion, " "),
(PlusNoop, "from any one of them."),
(PlusNoop, " from any one of them."),
]],
),
1.0,
Expand Down Expand Up @@ -811,6 +809,27 @@ mod tests {
);
}

#[test]
fn test_infer_edits_15() {
assert_paired_edits(
vec![r#"printf "%s\n" s y y | git add -p &&"#],
vec!["test_write_lines s y y | git add -p &&"],
(
vec![vec![
(MinusNoop, ""),
(Deletion, "printf"),
(Deletion, r#" "%s\n""#),
(MinusNoop, " s y y | git add -p &&"),
]],
vec![vec![
(PlusNoop, ""),
(Insertion, "test_write_lines"),
(PlusNoop, " s y y | git add -p &&"),
]],
),
);
}

fn assert_edits(
minus_lines: Vec<&str>,
plus_lines: Vec<&str>,
Expand Down

0 comments on commit b3057d4

Please sign in to comment.