From 8c5555a64fab89997c59fc457e5c265b93b39a7f Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 13 Jun 2022 12:43:18 +0200 Subject: [PATCH] feat(analysis): append ordinal suffix to numeric street names --- lib/analysis/Token.js | 4 ++++ lib/analysis/ordinals.js | 45 ++++++++++++++++++++++++++++++++++++++++ lib/cleanup_v2.js | 4 ++++ test/cleanup_v2.js | 25 ++++++++++++++++++++-- 4 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 lib/analysis/ordinals.js diff --git a/lib/analysis/Token.js b/lib/analysis/Token.js index abdd9f49a..9969216c7 100644 --- a/lib/analysis/Token.js +++ b/lib/analysis/Token.js @@ -9,6 +9,10 @@ class Token { return _.isString(this.body) && !_.isEmpty(this.body); } + isNumeric() { + return /^\d+$/.test(this.body); + } + findCase() { if (this.body === _.toLower(this.body)) { return Token.LOWERCASED; } if (this.body === _.toUpper(this.body)) { return Token.UPPERCASED; } diff --git a/lib/analysis/ordinals.js b/lib/analysis/ordinals.js new file mode 100644 index 000000000..870a3f39e --- /dev/null +++ b/lib/analysis/ordinals.js @@ -0,0 +1,45 @@ +const _ = require('lodash'); + +// The ordinals function appends an ordinal suffix to a numeric street name (e.g. 30 street) +// so that it reads as an ordinal (e.g. 30th street); at most one token per name is changed. 
+// note: this is currently only configured for the English language + +function ordinals(opts) { + return (tokens) => { + + // consider all but the final token, since a match requires a following token + for (var o = 0; o < tokens.length-1; o++) { + + // token must be entirely numeric + if (!tokens[o].isNumeric()) { continue; } + + // token must be followed by a street type token + if (!_.has(opts.dict.streetTypes, _.toLower(tokens[o+1].body))) { continue; } + + // token must either be the leftmost token or be preceded by a directional token + if(o !== 0) { + if (!_.has(opts.dict.directionalExpansions, _.toLower(tokens[o-1].body))) { + continue; + } + } + + // append the English ordinal suffix (st/nd/rd/th) to the numeric token in place + tokens[o].body += englishOrdinalSuffix(tokens[o].body); + + // maximum of one replacement + break; + } + + return tokens; + }; +} + +function englishOrdinalSuffix(i) { + const j = i % 10, k = i % 100; + if (j === 1 && k !== 11) { return 'st'; } + if (j === 2 && k !== 12) { return 'nd'; } + if (j === 3 && k !== 13) { return 'rd'; } + return 'th'; +} + +module.exports = ordinals; diff --git a/lib/cleanup_v2.js b/lib/cleanup_v2.js index 8a91f2633..45f612ab1 100644 --- a/lib/cleanup_v2.js +++ b/lib/cleanup_v2.js @@ -1,6 +1,7 @@ const _ = require('lodash'); const dictionary = require('./analysis/dictionary'); const synonyms = require('./analysis/synonyms'); +const ordinals = require('./analysis/ordinals'); const Token = require('./analysis/Token'); /** @@ -116,6 +117,9 @@ function cleanupStreetName(input) { // capitalize lowercased tokens (leaving mixed case tokens unchanged) tokens.forEach(token => token.selectivelyCapitalize()); + // add ordinals to English numeric street names + tokens = ordinals({ dict })(tokens); + // convert objects to strings and join by whitespace return tokens.map(token => token.body).join(' '); } diff --git a/test/cleanup_v2.js b/test/cleanup_v2.js index 2f4b48c65..27c6465a5 100644 --- a/test/cleanup_v2.js +++ b/test/cleanup_v2.js @@ -181,6 +181,27 @@ tape('contract english diagonals - last 
token position', (t) => { t.end(); }); +// add missing English street name ordinals (a number that already has a suffix, or that follows a non-directional word, gains none) +tape('add missing English street name ordinals', (t) => { + t.equal(analyzer('W 26 St'), 'West 26th Street'); + t.equal(analyzer('W 26th St'), 'West 26th Street'); + t.equal(analyzer('1 St'), '1st Street'); + t.equal(analyzer('2 Rd'), '2nd Road'); + t.equal(analyzer('3 Ave'), '3rd Avenue'); + t.equal(analyzer('4 Ln'), '4th Lane'); + t.equal(analyzer('11 St'), '11th Street'); + t.equal(analyzer('12 Rd'), '12th Road'); + t.equal(analyzer('13 Ave'), '13th Avenue'); + t.equal(analyzer('14 Ln'), '14th Lane'); + t.equal(analyzer('101 St'), '101st Street'); + t.equal(analyzer('102 Rd'), '102nd Road'); + t.equal(analyzer('103 Ave'), '103rd Avenue'); + t.equal(analyzer('104 Ln'), '104th Lane'); + t.equal(analyzer('no 1 st'), 'No 1 Street'); + t.equal(analyzer('no #1 st'), 'No #1 Street'); + t.end(); +}); + // --- NOOP inputs which should never change --- // no-ops, these inputs should not change regardless of the algorithm used @@ -223,7 +244,7 @@ tape('misc', (t) => { t.equal(analyzer('YELLOWSTONE BLVD'), 'Yellowstone Boulevard'); t.equal(analyzer('YESHIVA LN'), 'Yeshiva Lane'); t.equal(analyzer('WYGANT PL'), 'Wygant Place'); - t.equal(analyzer('W 262 ST'), 'West 262 Street'); + t.equal(analyzer('W 262 ST'), 'West 262nd Street'); t.equal(analyzer('W 26TH ST'), 'West 26th Street'); t.equal(analyzer('WILLIE MC DONALD WAY'), 'Willie Mc Donald Way'); t.equal(analyzer('West 93rd Street'), 'West 93rd Street'); @@ -232,7 +253,7 @@ tape('misc', (t) => { t.equal(analyzer('E HAMPTON BLVD'), 'East Hampton Boulevard'); t.equal(analyzer('MARATHON PKWY'), 'Marathon Parkway'); t.equal(analyzer('ANDREWS AVE S'), 'Andrews Avenue South'); - t.equal(analyzer('W 13 ST'), 'West 13 Street'); + t.equal(analyzer('W 13 ST'), 'West 13th Street'); t.end(); });