Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mapper stream to separate concatenated unit numbers #502

Merged
merged 2 commits into from
Feb 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 18 additions & 16 deletions lib/streams/documentStream.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
const through = require( 'through2' );
const peliasModel = require( 'pelias-model' );

// examples: GAACT718519668, GASA_424005553
const GNAF_PID_PATTERN = /^(GA)(NSW|VIC|QLD|SA_|WA_|TAS|NT_|ACT|OT_)([0-9]{9})$/;
// pattern to match a two character country code from the directory prefix
const COUNTRY_CODE_PATTERN = /^([A-Za-z]{2})\//;

/*
* Create a stream of Documents from valid, cleaned CSV records
Expand All @@ -22,26 +22,28 @@ function createDocumentStream(id_prefix, stats) {
uid++;

try {
const addrDoc = new peliasModel.Document( 'openaddresses', 'address', model_id )
.setName( 'default', (record.NUMBER + ' ' + record.STREET) )
.setCentroid( { lon: record.LON, lat: record.LAT } );

addrDoc.setAddress( 'number', record.NUMBER );

addrDoc.setAddress( 'street', record.STREET );
const doc = new peliasModel.Document('openaddresses', 'address', model_id)
.setName('default', `${record.NUMBER} ${record.STREET}`)
.setAddress('number', record.NUMBER)
.setAddress('street', record.STREET)
.setCentroid({ lon: record.LON, lat: record.LAT });

if (record.POSTCODE) {
addrDoc.setAddress( 'zip', record.POSTCODE );
doc.setAddress('zip', record.POSTCODE);
}

// detect Australian G-NAF PID concordances
if (id_prefix.startsWith('au/')) {
if (record.ID.length === 14 && record.ID.match(GNAF_PID_PATTERN)) {
addrDoc.setAddendum('concordances', {'gnaf:pid': record.ID});
}
// attempt to set the country code based on the directory prefix
const match = id_prefix.match(COUNTRY_CODE_PATTERN);
if (match && match[1]) {
doc.setMeta('country_code', match[1].toUpperCase());
}

this.push( addrDoc );
// store a reference to the original OA record in a 'meta'
// field, this is available through the pipeline but is not
// saved to elasticsearch.
doc.setMeta('oa', record);

this.push(doc);
}
catch ( ex ){
stats.badRecordCount++;
Expand Down
34 changes: 34 additions & 0 deletions lib/streams/gnafMapperStream.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/**
The GNAF mapper is responsible for extracting Australian GNAF
identifiers from the OA 'ID' property, where available.
**/

const _ = require('lodash');
const through = require('through2');
const logger = require('pelias-logger').get('openaddresses');

// examples: GAACT718519668, GASA_424005553
const GNAF_PID_PATTERN = /^(GA)(NSW|VIC|QLD|SA_|WA_|TAS|NT_|ACT|OT_)([0-9]{9})$/;

module.exports = function () {
return through.obj((doc, enc, next) => {
try {
if (doc.getMeta('country_code') === 'AU') {

// detect Australian G-NAF PID concordances
const oaid = _.get(doc.getMeta('oa'), 'ID');
if (oaid.length === 14 && oaid.match(GNAF_PID_PATTERN)) {
doc.setAddendum('concordances', { 'gnaf:pid': oaid });
}
}
}

catch (e) {
logger.error('gnaf_mapper error');
logger.error(e.stack);
logger.error(JSON.stringify(doc, null, 2));
}

return next(null, doc);
});
};
6 changes: 5 additions & 1 deletion lib/streams/recordStream.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ const CleanupStream = require('./cleanupStream');
const ContentHashStream = require('./contentHashStream');
const ValidRecordFilterStream = require('./validRecordFilterStream');
const DocumentStream = require('./documentStream');
const gnafMapperStreamFactory = require('./gnafMapperStream');
const unitSplittingMapperStreamFactory = require('./unitSplittingMapperStream');

/*
* Construct a suitable id prefix for a CSV file given
Expand Down Expand Up @@ -63,7 +65,9 @@ function createRecordStream( filePath, dirPath ){
.pipe( contentHashStream )
.pipe( validRecordFilterStream )
.pipe( cleanupStream )
.pipe( documentStream );
.pipe( documentStream )
.pipe( gnafMapperStreamFactory() )
.pipe( unitSplittingMapperStreamFactory() );
}

function geojsonStream(stream) {
Expand Down
73 changes: 73 additions & 0 deletions lib/streams/unitSplittingMapperStream.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/**
The unit splitting mapper is responsible for detecting when the address.number
field contains the concatenation of the unit and the housenumber.

eg. Flat 2 14 Smith St

In this case we attempt to split the two terms into their constituent parts.

note: Addressing formats vary between countries, it's unlikely that a pattern
which works for one country will also work internationally. For this reason this
mapper accepts a country code which can be used to select the appropriate pattern(s).

Feel free to make changes to this mapping file!
**/

const _ = require('lodash');
const through = require('through2');
const logger = require('pelias-logger').get('openaddresses');
const mappers = {};

// Australasian Unit Number Mapper
// https://auspost.com.au/content/dam/auspost_corp/media/documents/Appendix-01.pdf
// https://www.nzpost.co.nz/sites/nz/files/2021-10/adv358-address-standards.pdf
const australasian = (doc) => {
  // Splits a concatenated unit + housenumber held in the 'number' address
  // field into separate 'unit' and 'number' fields, mutating doc in place.
  // The document is left untouched when no rule matches.
  const raw = doc.getAddress('number');

  // the shortest value any rule below can match is three characters (eg. '2/1')
  if (!_.isString(raw) || raw.length < 3) { return; }

  // each rule pairs a pattern with the indices of the capture groups
  // which hold the unit and the housenumber respectively.
  const rules = [
    // 2/14
    { pattern: /^(\d+)\s*\/\s*(\d+)$/, unit: 1, number: 2 },

    // Flat 2 14 | F 2 14 | Unit 2 14 | APT 2 14
    { pattern: /^(flat|f|unit|apartment|apt)\s*(\d+)\s+(\d+)$/i, unit: 2, number: 3 }
  ];

  // rules are tried in order, the first one to match wins
  for (const rule of rules) {
    const found = raw.match(rule.pattern);
    if (found) {
      doc.setAddress('unit', found[rule.unit]);
      doc.setAddress('number', found[rule.number]);
      return;
    }
  }
};

// register the mapper to run for each supported country code;
// Australia and New Zealand share the same Australasian mapper.
Object.assign(mappers, { AU: australasian, NZ: australasian });

module.exports = function () {
return through.obj((doc, enc, next) => {
try {
// only applies to records with a 'number' set and no 'unit' set (yet).
if (doc.hasAddress('number') && !doc.hasAddress('unit')) {

// select the appropriate mapper based on country code
const mapper = _.get(mappers, doc.getMeta('country_code'));
if (_.isFunction(mapper)) {

// run the country-specific mapper
mapper(doc);
}
}
}

catch (e) {
logger.error('unit_mapper error');
logger.error(e.stack);
logger.error(JSON.stringify(doc, null, 2));
}

return next(null, doc);
});
};
2 changes: 2 additions & 0 deletions test/data/au/input_file_3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID
144.931874,-37.791488,10,Smith Street,,input city,input district,input region,input postcode,GAVIC718519668
11 changes: 11 additions & 0 deletions test/data/au/input_file_4.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
LON,LAT,HASH,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID
144.9804144,-37.8723977,710daac656ffd0c3,10/244,BARKLY STREET,,ST KILDA,,VIC,"3182","50579518"
145.0378718,-37.8637847,92862c98c20bbe3d,10/244-246,WATTLETREE ROAD,,MALVERN,,VIC,"3144","208518759"
145.0003807,-37.8289596,d0a21035cebcd8ab,10/244-246,MARY STREET,,RICHMOND,,VIC,"3121","51463974"
144.978361,-37.8002503,4e891155eb009dc3,10/244,BRUNSWICK STREET,,FITZROY,,VIC,"3065","210464257"
144.9591621,-37.8331898,e20c57c01d5d42c0,110/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672310"
144.9591621,-37.8331898,50c85f85cce9181f,210/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672321"
144.9591621,-37.8331898,4e737a8cc6ada9ec,310/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672332"
144.9591621,-37.8331898,d6ed0494e8c53ff8,410/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672343"
144.9591621,-37.8331898,fa0691071a173dab,510/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672353"
144.925714,-37.7516895,00be263cea28bea0,10/244,PASCOE VALE ROAD,,ESSENDON,,VIC,"3040","429232726"
Loading