Merge pull request #524 from tomdaffurn/tom/fuzzy_match_experiment

Experimental Improved Search Algorithm
moov-io · Dec 14, 2023 · f476bbd · f476bbd
2 parents 193f8ec + ddb46e7
commit f476bbd
Show file tree

Hide file tree

Showing 9 changed files with 305 additions and 105 deletions.
diff --git a/README.md b/README.md
@@ -184,14 +184,18 @@ You should get this response:
 PONG
 ```
 
-### Configuration settings
+### Configuration settings 
 
 | Environmental Variable | Description | Default |
 |-----|-----|-----|
 | `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. | 12h |
 | `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty |
 | `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for the highest similarity score. | 3 |
 | `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
+| `LENGTH_DIFFERENCE_CUTOFF_FACTOR` | Minimum ratio for the length of two matching tokens, before their score is penalised. | 0.9       | 
+| `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3    |
+| `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9   |
+| `UNMATCHED_INDEX_TOKEN_WEIGHT` | Weight of penalty applied to scores when part of the indexed name isn't matched. | 0.15    |
 | `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 |
 | `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |
 | `WEBHOOK_BATCH_SIZE` | How many watches to read from database per batch of async searches. | 100 |

diff --git a/cmd/server/issue115_test.go b/cmd/server/issue115_test.go
@@ -29,13 +29,13 @@ func TestIssue115__TopSDNs(t *testing.T) {
 	s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "2680", SDNName: "HABBASH, George", SDNType: "INDIVIDUAL"}}, nil, pipe)
 
 	out := s.TopSDNs(1, 0.00, "george bush", keeper)
-	eql(t, "issue115: top SDN 2680", out[0].match, 0.732)
+	eql(t, "issue115: top SDN 2680", out[0].match, 0.687)
 
 	// was 88.3% match
 	s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "9432", SDNName: "CHIWESHE, George", SDNType: "INDIVIDUAL"}}, nil, pipe)
 
 	out = s.TopSDNs(1, 0.00, "george bush", keeper)
-	eql(t, "issue115: top SDN 18996", out[0].match, 0.764)
+	eql(t, "issue115: top SDN 18996", out[0].match, 0.650)
 
 	// another example
 	s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "0", SDNName: "Bush, George W", SDNType: "INDIVIDUAL"}}, nil, pipe)
@@ -47,5 +47,5 @@ func TestIssue115__TopSDNs(t *testing.T) {
 	eql(t, "issue115: top SDN 0", out[0].match, 1.0)
 
 	out = s.TopSDNs(1, 0.00, "george bush", keeper)
-	eql(t, "issue115: top SDN 0", out[0].match, 0.667)
+	eql(t, "issue115: top SDN 0", out[0].match, 0.986)
 }
diff --git a/cmd/server/new_algorithm_test.go b/cmd/server/new_algorithm_test.go
@@ -0,0 +1,76 @@
+// Copyright 2022 The Moov Authors
+// Use of this source code is governed by an Apache License
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) {
+	// Words in the query should be matched against at most one indexed word. Doubled names on the sanctioned list can
+	// skew results
+	// 1. SDN Entity 40273, VLADIMIROV, Vladimir Vladimirovich
+	oldScore, newScore := compareAlgorithms("vladimirov vladimir vladimirovich", "vladimir levenshtein")
+	eql(t, "Score is too high", oldScore, 0.961)
+	eql(t, "New score is better", newScore, 0.603)
+
+	// 2. SDN Entity 7788 "SHAQIRI, Shaqir"
+	oldScore, newScore = compareAlgorithms("shaqiri shaqir", "zaid shakir")
+	eql(t, "Score is too high", oldScore, 0.908)
+	eql(t, "New score is better", newScore, 0.704)
+
+	// A single-word sanctioned name shouldn't score highly against every query that merely contains that name part
+	// 1. SDN Entity 15050 "HADI"
+	oldScore, newScore = compareAlgorithms("hadi", "hadi alwai")
+	eql(t, "Score is too high", oldScore, 0.900)
+	eql(t, "New score is better", newScore, 0.615)
+
+	// Name-part scores should be weighted by the character length. If not, small words can have unfair weight
+	// 1. SDN Entity "LI, Shangfu"
+	oldScore, newScore = compareAlgorithms("li shangfu", "li shanlan")
+	eql(t, "Score is too high", oldScore, 0.914)
+	eql(t, "New score is better", newScore, 0.867)
+
+	// Words with different lengths shouldn't match very highly
+	oldScore, newScore = compareAlgorithms("browningweight", "brown")
+	eql(t, "Score is too high", oldScore, 0.871)
+	eql(t, "New score is better", newScore, 0.703)
+
+	// Words that start with different letters shouldn't match very highly
+	oldScore, newScore = compareAlgorithms("dominguez", "jimenez")
+	eql(t, "Score is too high", oldScore, 0.690)
+	eql(t, "New score is better", newScore, 0.580)
+}
+
+func TestBestPairsJaroWinkler__TruePositives(t *testing.T) {
+	// Unmatched indexed words had a large weight, causing false negatives for missing "middle names"
+	// 1. Saddam Hussein
+	oldScore, newScore := compareAlgorithms("saddam hussein al tikriti", "saddam hussien")
+	eql(t, "Score is too low", oldScore, 0.656)
+	eql(t, "New score is better", newScore, 0.924)
+
+	// 2. SDN Entity 7574 "VALENCIA TRUJILLO, Joaquin Mario"
+	oldScore, newScore = compareAlgorithms("valencia trujillo joaquin mario", "valencia trujillo joaquin")
+	eql(t, "Score is too low", oldScore, 0.868)
+	eql(t, "New score is better", newScore, 0.973)
+
+	// 3. SDN Entity 9760 "LUKASHENKO, Alexander Grigoryevich"
+	oldScore, newScore = compareAlgorithms("lukashenko alexander grigoryevich", "alexander lukashenko")
+	eql(t, "Score is too low", oldScore, 0.765)
+	eql(t, "New score is better", newScore, 0.942)
+
+	// Small words had too much weight, causing false negatives
+	// 1. SDN Entity 4691 "A.I.C. SOGO KENKYUSHO"
+	oldScore, newScore = compareAlgorithms("a i c sogo kenkyusho", "sogo kenkyusho")
+	eql(t, "Score is too low", oldScore, 0.400)
+	eql(t, "New score is better", newScore, 0.972)
+}
+
+func compareAlgorithms(indexedName string, query string) (float64, float64) {
+	oldScore := jaroWinkler(indexedName, query)
+	newScore := bestPairsJaroWinkler(strings.Fields(query), indexedName)
+	return oldScore, newScore
+}