Skip to content

Commit

Permalink
feat(unit-number-extractor): add new mapper stream to separate concat…
Browse files Browse the repository at this point in the history
…enated unit numbers
  • Loading branch information
missinglink committed Feb 3, 2022
1 parent 71a7849 commit 2a02967
Show file tree
Hide file tree
Showing 6 changed files with 268 additions and 2 deletions.
4 changes: 3 additions & 1 deletion lib/streams/recordStream.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ const ContentHashStream = require('./contentHashStream');
const ValidRecordFilterStream = require('./validRecordFilterStream');
const DocumentStream = require('./documentStream');
const gnafMapperStreamFactory = require('./gnafMapperStream');
const unitSplittingMapperStreamFactory = require('./unitSplittingMapperStream');

/*
* Construct a suitable id prefix for a CSV file given
Expand Down Expand Up @@ -65,7 +66,8 @@ function createRecordStream( filePath, dirPath ){
.pipe( validRecordFilterStream )
.pipe( cleanupStream )
.pipe( documentStream )
.pipe( gnafMapperStreamFactory() );
.pipe( gnafMapperStreamFactory() )
.pipe( unitSplittingMapperStreamFactory() );
}

function geojsonStream(stream) {
Expand Down
73 changes: 73 additions & 0 deletions lib/streams/unitSplittingMapperStream.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/**
The unit splitting mapper is responsible for detecting when the address.number
field contains the concatenation of the unit and the housenumber.
eg. Flat 2 14 Smith St
In this case we attempt to split the two terms into their constituent parts.
note: Addressing formats vary between countries; it's unlikely that a pattern
which works for one country will also work internationally. For this reason this
mapper accepts a country code which can be used to select the appropriate pattern(s).
Feel free to make changes to this mapping file!
**/

const _ = require('lodash');
const through = require('through2');
const logger = require('pelias-logger').get('openaddresses');
const mappers = {};

// Australasian Unit Number Mapper
// https://auspost.com.au/content/dam/auspost_corp/media/documents/Appendix-01.pdf
// https://www.nzpost.co.nz/sites/nz/files/2021-10/adv358-address-standards.pdf
//
// Reads the 'number' address field of `doc` and, when it matches one of the
// known "unit + housenumber" layouts, writes the unit to the 'unit' field and
// the remaining housenumber back to the 'number' field. The first matching
// layout wins. Values which are not strings, are shorter than three characters
// or match no layout are left untouched. Always returns undefined.
const australasian = (doc) => {
  const value = doc.getAddress('number');
  const candidate = _.isString(value) && value.length >= 3;
  if (!candidate) { return; }

  // each layout names the capture groups holding the unit and the housenumber
  const layouts = [
    // 2/14
    { pattern: /^(\d+)\s*\/\s*(\d+)$/, unit: 1, housenumber: 2 },
    // Flat 2 14 | F 2 14 | Unit 2 14 | APT 2 14
    { pattern: /^(flat|f|unit|apartment|apt)\s+(\d+)\s+(\d+)$/i, unit: 2, housenumber: 3 }
  ];

  for (const layout of layouts) {
    const captured = layout.pattern.exec(value);
    if (captured === null) { continue; }

    // unit is written before number, a match ends the search
    doc.setAddress('unit', captured[layout.unit]);
    doc.setAddress('number', captured[layout.housenumber]);
    return;
  }
};

// register the mapper used for each supported country code
// (Australia and New Zealand share the same australasian patterns)
Object.assign(mappers, {
  AU: australasian,
  NZ: australasian
});

module.exports = function () {
return through.obj((doc, enc, next) => {
try {
// only applies to records with a 'number' set and no 'unit' set (yet).
if (doc.hasAddress('number') && !doc.hasAddress('unit')) {

// select the appropriate mapper based on country code
const mapper = _.get(mappers, doc.getMeta('country_code'));
if (_.isFunction(mapper)) {

// run the country-specific mapper
mapper(doc);
}
}
}

catch (e) {
logger.error('unit_mapper error');
logger.error(e.stack);
logger.error(JSON.stringify(doc, null, 2));
}

return next(null, doc);
});
};
11 changes: 11 additions & 0 deletions test/data/au/input_file_4.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
LON,LAT,HASH,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID
144.9804144,-37.8723977,710daac656ffd0c3,10/244,BARKLY STREET,,ST KILDA,,VIC,"3182","50579518"
145.0378718,-37.8637847,92862c98c20bbe3d,10/244-246,WATTLETREE ROAD,,MALVERN,,VIC,"3144","208518759"
145.0003807,-37.8289596,d0a21035cebcd8ab,10/244-246,MARY STREET,,RICHMOND,,VIC,"3121","51463974"
144.978361,-37.8002503,4e891155eb009dc3,10/244,BRUNSWICK STREET,,FITZROY,,VIC,"3065","210464257"
144.9591621,-37.8331898,e20c57c01d5d42c0,110/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672310"
144.9591621,-37.8331898,50c85f85cce9181f,210/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672321"
144.9591621,-37.8331898,4e737a8cc6ada9ec,310/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672332"
144.9591621,-37.8331898,d6ed0494e8c53ff8,410/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672343"
144.9591621,-37.8331898,fa0691071a173dab,510/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672353"
144.925714,-37.7516895,00be263cea28bea0,10/244,PASCOE VALE ROAD,,ESSENDON,,VIC,"3040","429232726"
2 changes: 1 addition & 1 deletion test/data/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -354,4 +354,4 @@
}
}
}
]
]
179 changes: 179 additions & 0 deletions test/streams/unitSplittingMapperStream.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
var tape = require('tape');
const through = require('through2');
const mapper = require('../../lib/streams/unitSplittingMapperStream');
const Document = require('pelias-model').Document;

module.exports.tests = {};

// test exports
// checks the module exports a factory function which produces an object
// exposing both the readable and writable stream internals
module.exports.tests.interface = function (test) {
  test('interface: factory', (t) => {
    const exportedType = typeof mapper;
    t.equal(exportedType, 'function', 'stream factory');
    t.end();
  });

  test('interface: stream', (t) => {
    const instance = mapper();
    t.equal(typeof instance, 'object', 'valid stream');
    t.equal(typeof instance._read, 'function', 'valid readable');
    t.equal(typeof instance._write, 'function', 'valid writeable');
    t.end();
  });
};

// ===================== australasian unit number mapping ======================

// '2/14' should be split into unit '2' and housenumber '14'
module.exports.tests.australasian_solidus = function (test) {
  const input = new Document('oa', 'example', 1);
  input.setName('default', '2/14 Smith Street');
  input.setAddress('number', '2/14');
  input.setAddress('street', 'Smith Street');
  input.setMeta('country_code', 'AU');

  test('maps - split unit from housenumber', (t) => {
    const stream = mapper();

    // sink which inspects the document emitted by the mapper
    const sink = through.obj((output, enc, next) => {
      t.deepEqual(output.getName('default'), '2/14 Smith Street', 'unchanged');
      t.deepEqual(output.getAddress('unit'), '2', 'mapped');
      t.deepEqual(output.getAddress('number'), '14', 'mapped');
      t.deepEqual(output.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    });

    stream.pipe(sink);
    stream.write(input);
  });
};

// '2 / 14' (whitespace around the solidus) should be split like '2/14'
module.exports.tests.australasian_solidus_with_whitespace = function (test) {
  const input = new Document('oa', 'example', 1);
  input.setName('default', '2 / 14 Smith Street');
  input.setAddress('number', '2 / 14');
  input.setAddress('street', 'Smith Street');
  input.setMeta('country_code', 'AU');

  test('maps - split unit from housenumber', (t) => {
    const stream = mapper();

    // sink which inspects the document emitted by the mapper
    const sink = through.obj((output, enc, next) => {
      t.deepEqual(output.getName('default'), '2 / 14 Smith Street', 'unchanged');
      t.deepEqual(output.getAddress('unit'), '2', 'mapped');
      t.deepEqual(output.getAddress('number'), '14', 'mapped');
      t.deepEqual(output.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    });

    stream.pipe(sink);
    stream.write(input);
  });
};

// 'Flat 2 14' should be split into unit '2' and housenumber '14'
module.exports.tests.australasian_flat_prefix = function (test) {
  const input = new Document('oa', 'example', 1);
  input.setName('default', 'Flat 2 14 Smith Street');
  input.setAddress('number', 'Flat 2 14');
  input.setAddress('street', 'Smith Street');
  input.setMeta('country_code', 'AU');

  test('maps - split unit from housenumber', (t) => {
    const stream = mapper();

    // sink which inspects the document emitted by the mapper
    const sink = through.obj((output, enc, next) => {
      t.deepEqual(output.getName('default'), 'Flat 2 14 Smith Street', 'unchanged');
      t.deepEqual(output.getAddress('unit'), '2', 'mapped');
      t.deepEqual(output.getAddress('number'), '14', 'mapped');
      t.deepEqual(output.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    });

    stream.pipe(sink);
    stream.write(input);
  });
};

// 'F 2 14' (abbreviated flat prefix) should be split into unit '2' and housenumber '14'
module.exports.tests.australasian_flat_prefix_abbreviated = function (test) {
  const input = new Document('oa', 'example', 1);
  input.setName('default', 'F 2 14 Smith Street');
  input.setAddress('number', 'F 2 14');
  input.setAddress('street', 'Smith Street');
  input.setMeta('country_code', 'AU');

  test('maps - split unit from housenumber', (t) => {
    const stream = mapper();

    // sink which inspects the document emitted by the mapper
    const sink = through.obj((output, enc, next) => {
      t.deepEqual(output.getName('default'), 'F 2 14 Smith Street', 'unchanged');
      t.deepEqual(output.getAddress('unit'), '2', 'mapped');
      t.deepEqual(output.getAddress('number'), '14', 'mapped');
      t.deepEqual(output.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    });

    stream.pipe(sink);
    stream.write(input);
  });
};

// 'Unit 2 14' should be split into unit '2' and housenumber '14'
module.exports.tests.australasian_unit_prefix = function (test) {
  const input = new Document('oa', 'example', 1);
  input.setName('default', 'Unit 2 14 Smith Street');
  input.setAddress('number', 'Unit 2 14');
  input.setAddress('street', 'Smith Street');
  input.setMeta('country_code', 'AU');

  test('maps - split unit from housenumber', (t) => {
    const stream = mapper();

    // sink which inspects the document emitted by the mapper
    const sink = through.obj((output, enc, next) => {
      t.deepEqual(output.getName('default'), 'Unit 2 14 Smith Street', 'unchanged');
      t.deepEqual(output.getAddress('unit'), '2', 'mapped');
      t.deepEqual(output.getAddress('number'), '14', 'mapped');
      t.deepEqual(output.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    });

    stream.pipe(sink);
    stream.write(input);
  });
};

// 'Apartment 2 14' should be split into unit '2' and housenumber '14'
module.exports.tests.australasian_apartment_prefix = function (test) {
  const input = new Document('oa', 'example', 1);
  input.setName('default', 'Apartment 2 14 Smith Street');
  input.setAddress('number', 'Apartment 2 14');
  input.setAddress('street', 'Smith Street');
  input.setMeta('country_code', 'AU');

  test('maps - split unit from housenumber', (t) => {
    const stream = mapper();

    // sink which inspects the document emitted by the mapper
    const sink = through.obj((output, enc, next) => {
      t.deepEqual(output.getName('default'), 'Apartment 2 14 Smith Street', 'unchanged');
      t.deepEqual(output.getAddress('unit'), '2', 'mapped');
      t.deepEqual(output.getAddress('number'), '14', 'mapped');
      t.deepEqual(output.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    });

    stream.pipe(sink);
    stream.write(input);
  });
};

// 'APT 2 14' (abbreviated, upper-case prefix) should be split into unit '2' and housenumber '14'
module.exports.tests.australasian_apartment_prefix_abbreviated = function (test) {
  const input = new Document('oa', 'example', 1);
  input.setName('default', 'APT 2 14 Smith Street');
  input.setAddress('number', 'APT 2 14');
  input.setAddress('street', 'Smith Street');
  input.setMeta('country_code', 'AU');

  test('maps - split unit from housenumber', (t) => {
    const stream = mapper();

    // sink which inspects the document emitted by the mapper
    const sink = through.obj((output, enc, next) => {
      t.deepEqual(output.getName('default'), 'APT 2 14 Smith Street', 'unchanged');
      t.deepEqual(output.getAddress('unit'), '2', 'mapped');
      t.deepEqual(output.getAddress('number'), '14', 'mapped');
      t.deepEqual(output.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    });

    stream.pipe(sink);
    stream.write(input);
  });
};


// registers a tape test whose name is prefixed with this suite's label;
// returns whatever tape returns
function test(name, testFunction) {
  const label = `unit_splitting_mapper: ${name}`;
  return tape(label, testFunction);
}

// invoke every registered test group, handing it the prefixed test wrapper
Object.keys(module.exports.tests).forEach((groupName) => {
  module.exports.tests[groupName](test);
});
1 change: 1 addition & 0 deletions test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ require( './streams/gnafMapperStream' );
require( './streams/germanicAbbreviationStream');
require( './streams/isUSorCAHouseNumberZero' );
require( './streams/recordStream' );
require( './streams/unitSplittingMapperStream' );

0 comments on commit 2a02967

Please sign in to comment.