From ef4a6aa978f8a9da1edd1eb48beff2a6cccc4ed3 Mon Sep 17 00:00:00 2001 From: Phillip Wood Date: Wed, 5 Jan 2022 15:00:39 +0000 Subject: [PATCH] Stop using substitutions in Levenshtein calculation Now that the lowest cost edit path groups deletions and insertions together there seems to be little point in keeping substitutions. Indeed the lower cost of substitutions compared to a deletion followed by an insertion can adversely affect the highlighting. If the length of the line does not change delta will prefer to use substitutions over deletions and insertions even if it results in unchanged tokens being marked as changed. For example in -[a] a a a a a [b b b] +[c] a a a a a [a c c] unchanged tokens are marked as deleted and inserted. Without substitutions this is highlighted as -a a a a a a [b b b] +[c ]a a a a a a [c c] which highlights the insertion at the beginning of the line and the change at the end of the line more clearly. There is a change to the distance calculation as substitutions only contributed the length of the deletion whereas now the same change will add the length of both the deletion and the insertion. This is addressed by doubling the contribution of unchanged sections so that the denominator equals the sum of the distance contributions of the plus line and minus line. --- src/align.rs | 103 ++++++++++++++++++++------------------------------- src/edits.rs | 46 +++++++++++++++-------- 2 files changed, 71 insertions(+), 78 deletions(-) diff --git a/src/align.rs b/src/align.rs index 8fa5a96a9..b2e2bdfee 100644 --- a/src/align.rs +++ b/src/align.rs @@ -1,10 +1,6 @@ use std::cmp::max; use std::collections::VecDeque; -// Cost invariants: -// SUBSTITUTION_COST < DELETION_COST + INSERTION_COST -// DELETION_COST + INSERTION_COST + INITIAL_MISMATCH_PENALITY < 2 * SUBSTITUTION_COST -const SUBSTITUTION_COST: usize = 3; const DELETION_COST: usize = 2; const INSERTION_COST: usize = 2; // extra cost for starting a new group of changed tokens @@ -13,7 +9,6 @@ const INITIAL_MISMATCH_PENALITY: usize = 1; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum Operation { NoOp, - Substitution, Deletion, Insertion, } @@ -101,11 +96,11 @@ impl<'a> Alignment<'a> { }, Cell { parent: diag, - operation: if x_i == y_j { NoOp } else { Substitution }, + operation: NoOp, cost: if x_i == y_j { self.table[diag].cost } else { - self.mismatch_cost(diag, SUBSTITUTION_COST) + usize::MAX }, }, ]; @@ -186,7 +181,6 @@ impl<'a> Alignment<'a> { let parent = &self.table[cell.parent]; let op = match cell.operation { Deletion => "-", - Substitution => "*", Insertion => "+", NoOp => ".", }; @@ -268,9 +262,9 @@ mod tests { TestCase { before: "aaa", after: "aba", - distance: 4, - parts: (1, 3), - operations: vec![NoOp, Substitution, NoOp], + distance: 5, + parts: (2, 4), + operations: vec![NoOp, Deletion, Insertion, NoOp], } .run(); } @@ -280,9 +274,9 @@ mod tests { TestCase { before: "ááb", after: "áaa", - distance: 7, - parts: (2, 3), - operations: vec![NoOp, Substitution, Substitution], + distance: 9, + parts: (4, 5), + operations: vec![NoOp, Deletion, Deletion, Insertion, Insertion], } .run(); } @@ -292,16 +286,18 @@ mod tests { TestCase { before: "kitten", after: "sitting", - distance: 11, - parts: (3, 7), + distance: 13, + parts: (5, 9), operations: vec![ - Substitution, // K S - NoOp, // I I - NoOp, // T T - NoOp, // T T - Substitution, // E I - NoOp, // N N - Insertion, // - G + Deletion, // K - + Insertion, // - S + NoOp, // I I + NoOp, // T T + NoOp, // T T + Deletion, // E - + Insertion, // - I + NoOp, // N N + Insertion, // - G ], } .run(); @@ -312,17 +308,18 @@ mod tests { TestCase { before: "saturday", after: "sunday", - distance: 9, - parts: (3, 8), + distance: 10, + parts: (4, 9), operations: vec![ - NoOp, // S S - Deletion, // A - - Deletion, // T - - NoOp, // U U - Substitution, // R N - NoOp, // D D - NoOp, // A A - NoOp, // Y Y + NoOp, // S S + Deletion, // A - + Deletion, // T - + NoOp, // U U + Deletion, // R - + Insertion, // - N + NoOp, // D D + NoOp, // A A + NoOp, // Y Y ], } .run(); @@ -331,7 +328,7 @@ mod tests { #[test] fn test_3() { TestCase { - // Prefer [Deletion NoOp Insertion] over [Substitution Substitution] + // Prefer [Deletion NoOp Insertion] over [Insertion NoOp Deletion] before: "ab", after: "ba", distance: 6, @@ -383,38 +380,20 @@ mod tests { #[test] fn test_6() { - // Deletion and substitution are grouped together. + // Insertion and Deletion are grouped together. TestCase { before: "AAABBB", after: "ACB", - distance: 10, - parts: (4, 6), - operations: vec![ - NoOp, // A A - Substitution, // A C - Deletion, // A - - Deletion, // B - - Deletion, // B - - NoOp, // B B - ], - } - .run(); - } - - #[test] - fn test_7() { - // Insertion and Substitution are grouped together - TestCase { - before: "ACB", - after: "AADBB", - distance: 8, - parts: (3, 5), + distance: 11, + parts: (5, 7), operations: vec![ - NoOp, // A A - Substitution, // C A - Insertion, // - D - Insertion, // - B - NoOp, // B B + NoOp, // A A + Deletion, // A - + Deletion, // A - + Deletion, // B - + Deletion, // B - + Insertion, // - C + NoOp, // B B ], } .run(); diff --git a/src/edits.rs b/src/edits.rs index b71dd45c5..bcbead11b 100644 --- a/src/edits.rs +++ b/src/edits.rs @@ -213,7 +213,7 @@ where align::Operation::NoOp => { let minus_section = minus_section(n, &mut x_offset); let n_d = distance_contribution(minus_section); - d_denom += n_d; + d_denom += 2 * n_d; let is_space = minus_section.trim().is_empty(); let coalesce_space_with_previous = is_space && ((minus_op_prev == deletion @@ -239,16 +239,6 @@ where minus_op_prev = noop_deletion; plus_op_prev = noop_insertion; } - align::Operation::Substitution => { - let minus_section = minus_section(n, &mut x_offset); - let n_d = distance_contribution(minus_section); - d_denom += n_d; - d_numer += n_d; - annotated_minus_line.push((deletion, minus_section)); - annotated_plus_line.push((insertion, plus_section(n, &mut y_offset))); - minus_op_prev = deletion; - plus_op_prev = insertion; - } align::Operation::Insertion => { let plus_section = plus_section(n, &mut y_offset); let n_d = distance_contribution(plus_section); @@ -688,10 +678,9 @@ mod tests { // TODO: Coalesce runs of the same operation. vec![vec![ (MinusNoop, "so it is safe to read "), - (Deletion, "the"), + (Deletion, "the commit"), (Deletion, " "), - (Deletion, "commit"), - (Deletion, " number"), + (Deletion, "number"), (MinusNoop, " from any one of them."), ]], vec![vec![ @@ -817,8 +806,7 @@ mod tests { ( vec![vec![ (MinusNoop, ""), - (Deletion, "printf"), - (Deletion, r#" "%s\n""#), + (Deletion, r#"printf "%s\n""#), (MinusNoop, " s y y | git add -p &&"), ]], vec![vec![ @@ -830,6 +818,32 @@ mod tests { ); } + #[test] + fn test_infer_edits_16() { + assert_edits( + vec!["a a a a a a b b b"], + vec!["c a a a a a a c c"], + ( + vec![vec![ + (MinusNoop, ""), + (MinusNoop, "a a a a a a "), + (Deletion, "b b"), + (Deletion, " "), + (Deletion, "b"), + ]], + vec![vec![ + (PlusNoop, ""), + (Insertion, "c "), + (PlusNoop, "a a a a a a "), + (Insertion, "c"), + (Insertion, " "), + (Insertion, "c"), + ]], + ), + 1.0, + ); + } + fn assert_edits( minus_lines: Vec<&str>, plus_lines: Vec<&str>,