Skip to content

Commit

Permalink
feat(street_name_normalization): improvements to dedupe algo, more co…
Browse files Browse the repository at this point in the history
…nservative approach
  • Loading branch information
missinglink committed Jan 28, 2021
1 parent 5be4d95 commit 8aabc40
Show file tree
Hide file tree
Showing 15 changed files with 328 additions and 165 deletions.
3 changes: 2 additions & 1 deletion lib/analysis/Token.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@ class Token {
const casing = this.findCase();
if (casing !== Token.UPPERCASED) { return false; }
if (this.body.endsWith('.')) { return true; }
if (this.body.length > 3){ return false; }
if (this.containsVowels()) { return false; }
if (this.isOrdinalNumber()) { return false; }
if (_.has(surnamePrefixes, _.toLower(this.body))){ return false; }
return true;
return false;
}

removeLeadingZeros() {
Expand Down
9 changes: 4 additions & 5 deletions lib/analysis/dictionaries/en/directionals.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
center|c|ctr
central|c|cn|ctrl|cntrl
centre|c|ctr
center|ctr
central|cn|ctrl|cntrl
east|e
eastern|eastrn|estrn|estn
lower|lowr|lwr
Expand All @@ -15,6 +14,6 @@ southeast|south east|southe|south e|seast|s east|se|s e|soeast|so east|so e|sthe
southeastern|south eastern|southeastrn|south eastrn|southestrn|south estrn|southestn|south estn|seastern|s eastern|seastrn|s eastrn|sestrn|s estrn|sestn|s estn|soeastern|so eastern|soeastrn|so eastrn|soestrn|so estrn|soestn|so estn|stheastern|sth eastern|stheastrn|sth eastrn|sthestrn|sth estrn|sthestn|sth estn
southwest|south west|southw|south w|southwst|south wst|swest|s west|sw|s w|swst|s wst|sowest|so west|so w|sowst|so wst|sthwest|sth west|sthw|sth w|sthwst|sth wst
southwestern|south western|southwestrn|south westrn|southwstrn|south wstrn|southwstn|south wstn|swestern|s western|swestrn|s westrn|swstrn|s wstrn|swstn|s wstn|sowestern|so western|sowestrn|so westrn|sowstrn|so wstrn|sowstn|so wstn|sthwestern|sth western|sthwestrn|sth westrn|sthwstrn|sth wstrn|sthwstn|sth wstn
upper|uppr|upr|up
upper|uppr|upr
west|w|wst
western|westrn|wstrn|wstn
western|westrn|wstrn|wstn
10 changes: 8 additions & 2 deletions lib/analysis/dictionaries/en/highway_types.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
state highway|s.highway|s highway|st.highway|st highway|sh|s.h.|s.h|s h|st.h|st h|s.hw|s hw|st.hw|st hw|s.hwy|s hwy|shwy|s.hgwy|s hgwy|st.hgwy|st hgwy|s.hway|s hway|st.hway|st hway|s.hwy|s hwy|st.hwy|st hwy|s.hi|s hi|st.hi|st hi|statehighway
state road|sr|stateroad|s.r.|s.r|s r|s.road|s road|st.road|st road|staterd|srd|s.rd|s rd|state rd|strd|st.rd|st rd
state highway|s.highway|s highway|st.highway|st highway|sh|s.h.|s.h|s h|st.h|st h|s.hw|s hw|st.hw|st hw|s.hwy|s hwy|shwy|s.hgwy|s hgwy|st.hgwy|st hgwy|s.hway|s hway|st.hway|st hway|s hwy|st.hwy|st hwy|s.hi|s hi|st.hi|st hi|statehighway
state route|sr|stateroute|s.r.|s.r|s r|s.route|s route|st.route|st route|statert|srt|s.rt|s rt|srte|s.rte|s rte|state rt|state rte|strt|strte|st.rt|st rt|st.rte|st rte
county highway|ch|c.h.|c.h|c h|c.hw|c hw|co.hw|co hw|cty.hw|cty hw|c.hgwy|c hgwy|co.hgwy|co hgwy|cty.hgwy|cty hgwy|c.hway|c hway|co.hway|co hway|cty.hway|cty hway|c.hwy|c hwy|co.hwy|co hwy|cty.hwy|cty hwy|c.hi|c hi|co.hi|co hi|cty.hi|cty hi
county route|cr|c.r.|c.r|c r|co.r|co r|c.rt|c rt|co.rt|co rt|cty.r|cty r|cty.rt|cty rt|c.rte|c rte|co.rte|co rte|cty.rte|cty rte|county touring route
rural route|rr|r.r|r r
township highway|th|t.h.|t.h|t h|twp.h|twp h|tshp.h|tshp h|t.hw|t hw|twp.hw|twp hw|tshp.hw|tshp hw|t.hgwy|t hgwy|twp.hgwy|twp hgwy|tshp.hgwy|tshp hgwy|t.hway|t hway|twp.hway|twp hway|tshp.hway|tshp hway|t.hwy|t hwy|twp.hwy|twp hwy|tshp.hwy|tshp hwy|t.hi|t hi|twp.hi|twp hi|tshp.hi|tshp hi
township route|tr|t.r.|t.r|t r|t rt|t.rt|trt|t.rte|t rte|twpr|twp.r|twp r|twp.rt|twp rt|twp.rte|twp rte|tshp.r|tshp r|tshp.rt|tshp rt|tshp.rte|tshp rte
us highway|us hwy|u s hwy
us route|us rte|u s rte
78 changes: 0 additions & 78 deletions lib/analysis/dictionaries/en/personal_titles.txt
Original file line number Diff line number Diff line change
@@ -1,80 +1,2 @@
ambassador
arch bishop|archbishop
bishop
brigadier general|brig gen
cardinal
colonel|col
commander|cmdr
congressman|congress man
congresswoman|congress woman
corporal|cpl
captain|capt|cpt
chairman|chair man
chairwoman|chair woman
czar|tsar
dame
deputy prime minister|deputy pm
district judge
doctor|dr|doc
doctors|drs|docs
duke
duchess|dutchess
emperor
brother|br
father|fr
sister|sr
his royal highness|hrh|h r h
her royal highness|hrh|h r h
general|gen
his honor|his honour
her honor|her honour
honorable|honourable|hon
judge
lady
lieutenant|lieut|lgt|lt
lieutenant governor|lieut governor|lgt governor|lieut gov|lgt gov|lt governor|lt gov
lieutenant colonel|lieut colonel|lgt colonel|lieut col|lgt col|lt colonel|lt col
lieutenant commander|lieut commander|lgt commander|lieut cmdr|lgt cmdr|lt commander|lt cmdr
lieutenant general|lieut general|lgt general|lieut gen|lgt gen|lt general|lt gen
lord
king|kg
madame
madames
major|maj
major general|maj gen|maj general|major gen
messrs
mp|member of parliament
mps|members of parliament
mr|mister
mrs|misses
ms|miss
officer|ofcr
pope
president|pres
prime minister|pm
prince
princess
private first class|pfc|p f c
professor|prof
professors|profs
queen
reverend|rev
right honorable|right honourable|right hon|rt honorable|rt honourable|rt hon|rh|r h|r hon|right and honorable|right and honourable
saint|st
saints|ss
sainte|ste
san|s
santa|sta
sergeant|sargeant|sgt
secretary|sec
sir
sirs
representative|rep
representatives|reps
senator|sen
senators|sens
vice chairman|vice chair man
vice chairperson|vice chair|vice chair person
vice chairwoman|vice chair woman
vice president|vice pres
vice prime minister|vice pm
5 changes: 5 additions & 0 deletions lib/analysis/dictionaries/en/street_prefixes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
beach|bch
fort|ft
mount|mt
court|ct|crt
square|sqr|sqre|squ|sq
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
abbey|abby
access|accs|acc
acres|acrs
alley|aly|ally|alee|al
alleyway|alwy|allyway|allwy
amble|ambl
Expand Down Expand Up @@ -29,11 +28,8 @@ boulevard|blvd|bd|bde|blv|bl|blvde|blvrd|boulavard|boul|boulv|bvd|boulevarde
bottom|bot|bottm|btm|bttm
bottoms|bttms|btms|bottms
boundary|bdy
bowl|bl
brace|br|brce
branch|br|brnch|brch
brae|br
break|brk
brace|brce
branch|brnch|brch
bridge|bdge|br|brdg|bri|brg
broadway|bdwy|bway|bwy|brdway
brook|brk
Expand All @@ -45,8 +41,7 @@ burrow|burw
butte|btte|bte
bypass|bypa|byps|bps|byp
byway|bywy
camp|cp
cape|cpe|cp
cape|cpe
canyon|cyn|cnyn
caravan|cvan|cvn|c van
causeway|csway|cswy|causewy|caus|cause|cway
Expand All @@ -71,21 +66,18 @@ concord|cncd|cncrd
concession|conc
concourse|con|concs|concse|cnc
connection|cntn|cxn
connector|conr|cnctr|cntr
connector|conr|cnctr
copse|cps
corner|cnr|crn|cor
corners|cnrs|crns|cors
corseo|cseo
corso|cso
county highway|ch|c.h.|c.h|c h|c.hw|c hw|co.hw|co hw|cty.hw|cty hw|c.hgwy|c hgwy|co.hgwy|co hgwy|cty.hgwy|cty hgwy|c.hway|c hway|co.hway|co hway|cty.hway|cty hway|c.hwy|c hwy|co.hwy|co hwy|cty.hwy|cty hwy|c.hi|c hi|co.hi|co hi|cty.hi|cty hi
county road|cr|c.r.|c.r|c r|co.r|co r|c.rd|c rd|co.rd|co rd|cty.r|cty r|cty.rd|cty rd
county route|cr|c.r.|c.r|c r|co.r|co r|c.rt|c rt|co.rt|co rt|cty.r|cty r|cty.rt|cty rt|c.rte|c rte|co.rte|co rte|cty.rte|cty rte|county touring route
course|crse
court|ct|crt
courts|crts|cts
courtyard|cyd|ctyd
cove|cov|ce|cv
creek|cr|crk
creek|crk
crescent|cr|cres|crs|crecent
crest|crst|cst
crief|crf
Expand All @@ -94,8 +86,7 @@ cross|cs|crss
crossing|crsg|xing|csg|x ing|x-ing
crossroad|crd|cross road|xroad|x-road|x road|xrd|x-rd|x rd
crossroads|cross roads|xrds
crossway|cowy|crwy|xway|xwy|x-way|x way|x-wy|xwy
cruiseway|cuwy|crwy
crossway|cowy|crwy|xway|xwy|x-way|x way|x-wy
cul
cul de sac|cul-de-sac|culdesac|cds|cusac|csac
curve|cve|crv|crve|curv
Expand Down Expand Up @@ -126,14 +117,13 @@ exit
expressway|exp|expwy|expway|expy|exwy
extension|ex|ext|extn|exten
extensions|exts
fairway|fawy|fy
fall|fl
fairway|fawy
falls|fls
fare
farm|frm
farms|frms
fern
ferry|fry|fy
ferry|fry
field|fld|fd
fields|flds|fds
fireline|fline|fire line|flne
Expand Down Expand Up @@ -203,18 +193,15 @@ landing|ldg|lndg|landng
lane|l|ln|la
laneway|lnwy
lees
light|lgt|lt
light|lgt
limits|lmts
line|ln
link|lnk|lk
little|ltl|lttl|littl|litl|lit|lt
little|ltl|lttl|littl|litl|lit
loaf|lf
lookout|lkt|look out
loop|lp
loops|lps
lot|lt
lynne|lynn
mall|ml
manor|mnr
maze
meadow|mdw
Expand All @@ -224,7 +211,6 @@ meander|mndr|mdr|mr
mew|mw
mews|mws
mile|mi
mill|ml
mills|mls
moor
motorway|mway|mwy|mtwy
Expand All @@ -242,28 +228,26 @@ overpass|over pass|opas
paddock|padk
palms|plms
parade|pde|prd|prde|pard
park|pk|prk
park|prk
parklands|pkld|pklds|parkland|park lands|park land
parkway|pkwy|parkwy|pky|pkway|prkwy|prkway|pkw|pwy|prkw
parkways|pkwys
part|prt
pass|ps
passage|psge|pass|pasg
path|pth
pathway|phwy|pway|pthway|pthwy|ptway|ptwy
peninsula|psla
piazza|piaz|pzza
pier
pike|pk|pke
pike|pke
pine|pne|pn
pines|pns|pnes
pond
place|pl|pla|plc|plac
plain|pln|pl
plain|pln
plains|plns|pls
plateau|plat|plt
plaza|plz|plza|pz
prairie|prarie|pr
pocket|pkt|pokt|pckt
point|piont|pnt|pt
pointe|pte|pnte
Expand All @@ -274,9 +258,8 @@ priors|prrs
private|pvt
promenade|prom|prm
pursuit|pur
quad|qd
quadrangle|qdgl
quadrant|qdrt|qd
quadrant|qdrt
quay|quy|qy
quays|quys|qys
radial|radl
Expand All @@ -292,7 +275,7 @@ reach|rch
reef
reserve|res|resrv|resv|rsrv|rserv|rserve|rsrve
rest|rst
retreat|rt|rtt
retreat|rtt
return|rtn
ride
ridge|rdge|rdg
Expand All @@ -316,7 +299,6 @@ roundabout
route|rt|rte
row|rw
run|rn
rural route|rr|r.r|r r
service road|svc rd|svc road|service rd|sv rd|svrd
serviceway|swy|svwy|svcwy
shoal|shl
Expand All @@ -340,32 +322,27 @@ stairway|stwy|strwy|stair way|st.wy|st wy|str.way|str way
steps|stps
strand|stra|strnd|strd
strands|strnds|strds
stravenue|stra|strav
stravenue|strav
street|st|str|stre|stree|strt
streets|sts
strip|strp
subdivision|subdiv
subway|sbwy
summit|smt|sumt
tarn|tn
terrace|tce|ter|tr|terr|terace|terrac|terrasse|tsse
thicket|thick
thoroughfare|thor|throughfare|thorough fare|thfr
thoroughway|thwy
throughway|thru|thro|thruway|trwy|thwy
tollway|tlwy|twy
top
tor
township highway|th|t.h.|t.h|t h|twp.h|twp h|tshp.h|tshp h|t.hw|t hw|twp.hw|twp hw|tshp.hw|tshp hw|t.hgwy|t hgwy|twp.hgwy|twp hgwy|tshp.hgwy|tshp hgwy|t.hway|t hway|twp.hway|twp hway|tshp.hway|tshp hway|t.hwy|t hwy|twp.hwy|twp hwy|tshp.hwy|tshp hwy|t.hi|t hi|twp.hi|twp hi|tshp.hi|tshp hi
township road|tr|t.r.|t.r|t r|t rd|t.rd|trd|twpr|twp.r|twp r|twp.rd|twp rd|tshp.r|tshp r|tshp.rd|tshp rd|township rd|tp rd
township route|tr|t.r.|t.r|t r|t rt|t.rt|trt|t.rte|t rte|twpr|twp.r|twp r|twp.rt|twp rt|twp.rte|twp rte|tshp.r|tshp r|tshp.rt|tshp rt|tshp.rte|tshp rte
tower|twr
towers|twrs
townline|tline
trace|trce|trc
track|tr|trk|trak
track|trk|trak
trafficway|trfy
trail|tr|trl
trail|trl
trailer|trlr
tram
tramway|tmwy
Expand All @@ -380,8 +357,6 @@ turnpike|tpk|tpke
underpass|upas|upass|ups|under pass
union|un
unions|uns
us highway|us hwy|u s hwy
us route|us rte|u s rte
vale|va|vl
valley|vlly|vly|vy
valleys|vlys|vllys
Expand Down
21 changes: 1 addition & 20 deletions lib/analysis/dictionaries/en/street_types_overrides.txt
Original file line number Diff line number Diff line change
@@ -1,20 +1 @@
creek|cr
crossway|crwy
fairway|fy
flat|fl
lane|ln
mill|ml
park|pk
place|pl
port|prt
quadrant|qd
route|rt
terrasse|tr
turn|tn
boulevarde|bl
bridge|br
brook|brk
center|cntr
crescent|cr
passage|pass
pike|pk
concourse|conc
Loading

0 comments on commit 8aabc40

Please sign in to comment.