Skip to content

Commit

Permalink
Stop using substitutions in Levenshtein calculation
Browse files Browse the repository at this point in the history
Now that the lowest cost edit path groups deletions and insertions
together there seems to be little point in keeping substitutions. Indeed
the lower cost of substitutions compared to a deletion followed by an
insertion can adversely affect the highlighting. If the length of the
line does not change delta will prefer to use substitutions over
deletions and insertions even if it results in unchanged tokens being
marked as changed. For example in

    -[a] a a a a a [b b b]
    +[c] a a a a a [a c c]

unchanged tokens are marked as deleted and inserted. Without
substitutions this is highlighted as

    -a a a a a a [b b b]
    +[c ]a a a a a a [c c]

which highlights the insertion at the beginning of the line and the
change at the end of the line more clearly.

There is a change to the distance calculation as substitutions only
contributed the length of the deletion whereas now the same change will
add the length of both the deletion and the insertion. This is addressed
by doubling the contribution of unchanged sections so that the
denominator equals the sum of the distance contributions of the plus
line and minus line.
  • Loading branch information
phillipwood committed Nov 25, 2022
1 parent b3057d4 commit ef4a6aa
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 78 deletions.
103 changes: 41 additions & 62 deletions src/align.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
use std::cmp::max;
use std::collections::VecDeque;

// Cost invariants:
// SUBSTITUTION_COST < DELETION_COST + INSERTION_COST
// DELETION_COST + INSERTION_COST + INITIAL_MISMATCH_PENALITY < 2 * SUBSTITUTION_COST
const SUBSTITUTION_COST: usize = 3;
const DELETION_COST: usize = 2;
const INSERTION_COST: usize = 2;
// extra cost for starting a new group of changed tokens
Expand All @@ -13,7 +9,6 @@ const INITIAL_MISMATCH_PENALITY: usize = 1;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Operation {
NoOp,
Substitution,
Deletion,
Insertion,
}
Expand Down Expand Up @@ -101,11 +96,11 @@ impl<'a> Alignment<'a> {
},
Cell {
parent: diag,
operation: if x_i == y_j { NoOp } else { Substitution },
operation: NoOp,
cost: if x_i == y_j {
self.table[diag].cost
} else {
self.mismatch_cost(diag, SUBSTITUTION_COST)
usize::MAX
},
},
];
Expand Down Expand Up @@ -186,7 +181,6 @@ impl<'a> Alignment<'a> {
let parent = &self.table[cell.parent];
let op = match cell.operation {
Deletion => "-",
Substitution => "*",
Insertion => "+",
NoOp => ".",
};
Expand Down Expand Up @@ -268,9 +262,9 @@ mod tests {
TestCase {
before: "aaa",
after: "aba",
distance: 4,
parts: (1, 3),
operations: vec![NoOp, Substitution, NoOp],
distance: 5,
parts: (2, 4),
operations: vec![NoOp, Deletion, Insertion, NoOp],
}
.run();
}
Expand All @@ -280,9 +274,9 @@ mod tests {
TestCase {
before: "ááb",
after: "áaa",
distance: 7,
parts: (2, 3),
operations: vec![NoOp, Substitution, Substitution],
distance: 9,
parts: (4, 5),
operations: vec![NoOp, Deletion, Deletion, Insertion, Insertion],
}
.run();
}
Expand All @@ -292,16 +286,18 @@ mod tests {
TestCase {
before: "kitten",
after: "sitting",
distance: 11,
parts: (3, 7),
distance: 13,
parts: (5, 9),
operations: vec![
Substitution, // K S
NoOp, // I I
NoOp, // T T
NoOp, // T T
Substitution, // E I
NoOp, // N N
Insertion, // - G
Deletion, // K -
Insertion, // - S
NoOp, // I I
NoOp, // T T
NoOp, // T T
Deletion, // E -
Insertion, // - I
NoOp, // N N
Insertion, // - G
],
}
.run();
Expand All @@ -312,17 +308,18 @@ mod tests {
TestCase {
before: "saturday",
after: "sunday",
distance: 9,
parts: (3, 8),
distance: 10,
parts: (4, 9),
operations: vec![
NoOp, // S S
Deletion, // A -
Deletion, // T -
NoOp, // U U
Substitution, // R N
NoOp, // D D
NoOp, // A A
NoOp, // Y Y
NoOp, // S S
Deletion, // A -
Deletion, // T -
NoOp, // U U
Deletion, // R -
Insertion, // - N
NoOp, // D D
NoOp, // A A
NoOp, // Y Y
],
}
.run();
Expand All @@ -331,7 +328,7 @@ mod tests {
#[test]
fn test_3() {
TestCase {
// Prefer [Deletion NoOp Insertion] over [Substitution Substitution]
// Prefer [Deletion NoOp Insertion] over [Insertion NoOp Deletion]
before: "ab",
after: "ba",
distance: 6,
Expand Down Expand Up @@ -383,38 +380,20 @@ mod tests {

#[test]
fn test_6() {
// Deletion and substitution are grouped together.
// Insertion and Deletion are grouped together.
TestCase {
before: "AAABBB",
after: "ACB",
distance: 10,
parts: (4, 6),
operations: vec![
NoOp, // A A
Substitution, // A C
Deletion, // A -
Deletion, // B -
Deletion, // B -
NoOp, // B B
],
}
.run();
}

#[test]
fn test_7() {
// Insertion and Substitution are grouped together
TestCase {
before: "ACB",
after: "AADBB",
distance: 8,
parts: (3, 5),
distance: 11,
parts: (5, 7),
operations: vec![
NoOp, // A A
Substitution, // C A
Insertion, // - D
Insertion, // - B
NoOp, // B B
NoOp, // A A
Deletion, // A -
Deletion, // A -
Deletion, // B -
Deletion, // B -
Insertion, // - C
NoOp, // B B
],
}
.run();
Expand Down
46 changes: 30 additions & 16 deletions src/edits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ where
align::Operation::NoOp => {
let minus_section = minus_section(n, &mut x_offset);
let n_d = distance_contribution(minus_section);
d_denom += n_d;
d_denom += 2 * n_d;
let is_space = minus_section.trim().is_empty();
let coalesce_space_with_previous = is_space
&& ((minus_op_prev == deletion
Expand All @@ -239,16 +239,6 @@ where
minus_op_prev = noop_deletion;
plus_op_prev = noop_insertion;
}
align::Operation::Substitution => {
let minus_section = minus_section(n, &mut x_offset);
let n_d = distance_contribution(minus_section);
d_denom += n_d;
d_numer += n_d;
annotated_minus_line.push((deletion, minus_section));
annotated_plus_line.push((insertion, plus_section(n, &mut y_offset)));
minus_op_prev = deletion;
plus_op_prev = insertion;
}
align::Operation::Insertion => {
let plus_section = plus_section(n, &mut y_offset);
let n_d = distance_contribution(plus_section);
Expand Down Expand Up @@ -688,10 +678,9 @@ mod tests {
// TODO: Coalesce runs of the same operation.
vec![vec![
(MinusNoop, "so it is safe to read "),
(Deletion, "the"),
(Deletion, "the commit"),
(Deletion, " "),
(Deletion, "commit"),
(Deletion, " number"),
(Deletion, "number"),
(MinusNoop, " from any one of them."),
]],
vec![vec![
Expand Down Expand Up @@ -817,8 +806,7 @@ mod tests {
(
vec![vec![
(MinusNoop, ""),
(Deletion, "printf"),
(Deletion, r#" "%s\n""#),
(Deletion, r#"printf "%s\n""#),
(MinusNoop, " s y y | git add -p &&"),
]],
vec![vec![
Expand All @@ -830,6 +818,32 @@ mod tests {
);
}

#[test]
fn test_infer_edits_16() {
assert_edits(
vec!["a a a a a a b b b"],
vec!["c a a a a a a c c"],
(
vec![vec![
(MinusNoop, ""),
(MinusNoop, "a a a a a a "),
(Deletion, "b b"),
(Deletion, " "),
(Deletion, "b"),
]],
vec![vec![
(PlusNoop, ""),
(Insertion, "c "),
(PlusNoop, "a a a a a a "),
(Insertion, "c"),
(Insertion, " "),
(Insertion, "c"),
]],
),
1.0,
);
}

fn assert_edits(
minus_lines: Vec<&str>,
plus_lines: Vec<&str>,
Expand Down

0 comments on commit ef4a6aa

Please sign in to comment.