Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor dedupe middleware #1222

Merged
merged 6 commits into from
Oct 30, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions helper/TypeMapping.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ var TypeMapping = function(){
*/
this.layer_aliases = {};

/*
* A list of the canonical sources included in the default Pelias configuration
*/
this.canonical_sources = [];

/*
* An object that contains all sources or aliases. The key is the source or alias,
* the value is either that source, or the canonical name for that alias if it's an alias.
Expand Down Expand Up @@ -65,6 +70,11 @@ TypeMapping.prototype.setLayerAliases = function( aliases ){
this.layer_aliases = aliases;
};

// canonical sources setter
// stores the supplied list on the instance as `canonical_sources`, replacing
// whatever was there before. the value is assigned as-is (no copy, no validation).
// `getCanonicalLayers` later tests source names for membership in this list.
TypeMapping.prototype.setCanonicalSources = function( sources ){
this.canonical_sources = sources;
};

// generate mappings after setters have been run
TypeMapping.prototype.generateMappings = function(){
this.sources = Object.keys( this.layers_by_source );
Expand All @@ -75,6 +85,17 @@ TypeMapping.prototype.generateMappings = function(){
this.layer_mapping = TypeMapping.addStandardTargetsToAliases(this.layers, this.layer_aliases);
};

// build the de-duplicated list of layers belonging to canonical sources.
// walks every source in `layers_by_source`, keeps only those which are listed in
// `canonical_sources`, and gathers their layers in iteration order.
// returns a new array; the first occurrence of each layer is kept.
TypeMapping.prototype.getCanonicalLayers = function(){
  var layersBySource = this.layers_by_source;
  var canonicalSources = this.canonical_sources;
  var collected = [];

  for( var sourceName in layersBySource ){
    // ignore sources which are not part of the canonical configuration
    if( !_.includes( canonicalSources, sourceName ) ){ continue; }

    collected = collected.concat( layersBySource[sourceName] );
  }

  // de-duplicating once at the end keeps the same first-occurrence ordering
  return _.uniq( collected );
};

// load values from targets block
TypeMapping.prototype.loadTargets = function( targetsBlock ){

Expand All @@ -84,6 +105,7 @@ TypeMapping.prototype.loadTargets = function( targetsBlock ){
this.setSourceAliases( targetsBlock.source_aliases || {} );
this.setLayersBySource( targetsBlock.layers_by_source || {} );
this.setLayerAliases( targetsBlock.layer_aliases || {} );
this.setCanonicalSources( targetsBlock.canonical_sources || [] );

// generate the mappings
this.generateMappings();
Expand Down
236 changes: 105 additions & 131 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
@@ -1,176 +1,150 @@
var _ = require('lodash');
var placeTypes = require('./placeTypes');
const _ = require('lodash');
const placeTypes = require('./placeTypes');
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
const field = require('../helper/fieldValue');

/**
* Compare the layer properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
* Returns false if the objects are the same, else true.
*/
function assertLayerMatch(item1, item2) {
if (item1.layer === item2.layer) {
return false;
function isLayerDifferent(item1, item2){
if( isPropertyDifferent(item1, item2, 'layer') ){
// consider all custom layers to be analogous to a venue
if( ( item1.layer === 'venue' || !_.includes( canonicalLayers, item1.layer ) ) &&
( item2.layer === 'venue' || !_.includes( canonicalLayers, item2.layer ) ) ){
return false;
}
return true;
}

throw new Error('different');
return false;
}

/**
* Compare the parent.*_id properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
* Compare the parent properties if they exist.
* Returns false if the objects are the same, else true.
*/
function assertParentHierarchyMatch(item1, item2) {
// if neither object has parent, assume same
if (!item1.hasOwnProperty('parent') && !item2.hasOwnProperty('parent')) {
return false;
}
function isParentHierarchyDifferent(item1, item2){
let parent1 = _.get(item1, 'parent');
let parent2 = _.get(item2, 'parent');

// if both have parent, do the rest of the checking
if (item1.hasOwnProperty('parent') && item2.hasOwnProperty('parent')) {
placeTypes.forEach(function (placeType) {
// don't consider its own id
if (placeType === item1.layer) {
return;
}
propMatch(item1.parent, item2.parent, placeType + '_id');
});
return false;
}
// check if these are plain 'ol javascript objects
let isPojo1 = _.isPlainObject(parent1);
let isPojo2 = _.isPlainObject(parent2);

// if neither object has parent info, we consider them the same
if( !isPojo1 && !isPojo2 ){ return false; }

// if only one has parent info, we consider them the same
// note: this really shouldn't happen as at least one parent should exist
if( !isPojo1 || !isPojo2 ){ return false; }

// else both have parent info
// iterate over all the placetypes, comparing between items
return placeTypes.some( placeType => {

// if one has parent and the other doesn't consider different
throw new Error('different');
// skip the parent field corresponding to the item placetype
if( placeType === item1.layer ){ return false; }

// ensure the parent ids are the same for all placetypes
return isPropertyDifferent( item1.parent, item2.parent, placeType + '_id' );
});
}

/**
* Compare the name.* properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
* Compare the name properties if they exist.
* Returns false if the objects are the same, else true.
*/
function assertNameMatch(item1, item2) {
if (item1.hasOwnProperty('name') && item2.hasOwnProperty('name')) {
for (var lang in item1.name) {
if(item2.name.hasOwnProperty(lang) || lang === 'default') {
// do not consider absence of an additional name as a difference
propMatch(item1.name, item2.name, lang);
}
function isNameDifferent(item1, item2){
let names1 = _.get(item1, 'name');
let names2 = _.get(item2, 'name');

// check if these are plain 'ol javascript objects
let isPojo1 = _.isPlainObject(names1);
let isPojo2 = _.isPlainObject(names2);

// if neither object has name info, we consider them the same
if( !isPojo1 && !isPojo2 ){ return false; }

// if only one has name info, we consider them the same
// note: this really shouldn't happen as name is a mandatory field
if( !isPojo1 || !isPojo2 ){ return false; }

// else both have name info
// iterate over all the languages in item1, comparing between items
return Object.keys(names1).some( lang => {

// do not consider absence of an additional name as a difference
// but strictly enforce that 'default' must be present and match
if( _.has(names2, lang) || lang === 'default' ){

// do not consider absence of an additional name as a difference
return isPropertyDifferent(names1, names2, lang);
}
}
else {
propMatch(item1, item2, 'name');
}
});
}

/**
* Compare the address_parts properties if they exist.
* Returns false if the objects are the same, and throws
* an exception with the message 'different' if not.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
* Returns false if the objects are the same, else true.
*/
function assertAddressMatch(item1, item2) {
// if neither record has address, assume same
if (!item1.hasOwnProperty('address_parts') && !item2.hasOwnProperty('address_parts')) {
return false;
}
function isAddressDifferent(item1, item2){
let address1 = _.get(item1, 'address_parts');
let address2 = _.get(item2, 'address_parts');

// if both have address, check parts
if (item1.hasOwnProperty('address_parts') && item2.hasOwnProperty('address_parts')) {
propMatch(item1.address_parts, item2.address_parts, 'number');
propMatch(item1.address_parts, item2.address_parts, 'street');
// check if these are plain 'ol javascript objects
let isPojo1 = _.isPlainObject(address1);
let isPojo2 = _.isPlainObject(address2);

// only compare zip if both records have it, otherwise just ignore and assume it's the same
// since by this time we've already compared parent hierarchies
if (item1.address_parts.hasOwnProperty('zip') && item2.address_parts.hasOwnProperty('zip')) {
propMatch(item1.address_parts, item2.address_parts, 'zip');
}
// if neither object has address info, we consider them the same
if( !isPojo1 && !isPojo2 ){ return false; }

// if only one has address info, we consider them the same
if( !isPojo1 || !isPojo2 ){ return false; }

return false;
// else both have address info
if( isPropertyDifferent(address1, address2, 'number') ){ return true; }
if( isPropertyDifferent(address1, address2, 'street') ){ return true; }

// only compare zip if both records have it, otherwise just ignore and assume it's the same
// since by this time we've already compared parent hierarchies
if( _.has(address1, 'zip') && _.has(address2, 'zip') ){
if( isPropertyDifferent(address1, address2, 'zip') ){ return true; }
}

// one has address and the other doesn't, different!
throw new Error('different');
return false;
}

/**
* Compare the two records and return true if they differ and false if same.
*
* @param {object} item1
* @param {object} item2
* @returns {boolean}
* @throws {Error}
*/
function isDifferent(item1, item2) {
try {
assertLayerMatch(item1, item2);
assertParentHierarchyMatch(item1, item2);
assertNameMatch(item1, item2);
assertAddressMatch(item1, item2);
}
catch (err) {
if (err.message === 'different') {
return true;
}
throw err;
}

function isDifferent(item1, item2){
  // the comparisons run in a fixed order: layer, parent hierarchy, name, address.
  // evaluation stops at the first comparison which reports a difference.
  const comparisons = [
    isLayerDifferent,
    isParentHierarchyDifferent,
    isNameDifferent,
    isAddressDifferent
  ];

  // true when any comparison reports a difference, false when none do
  return comparisons.some( compare => compare( item1, item2 ) );
}

/**
* Throw exception if properties are different
*
* @param {object} item1
* @param {object} item2
* @param {string} prop
* @throws {Error}
* return true if properties are different
*/
function propMatch(item1, item2, prop) {
var prop1 = item1[prop];
var prop2 = item2[prop];
function isPropertyDifferent(item1, item2, prop ){

// in the case the property is an array (currently only in parent schema)
// simply take the 1st item. this will change in the near future to support multiple hierarchies
if (_.isArray(prop1)) { prop1 = prop1[0]; }
if (_.isArray(prop2)) { prop2 = prop2[0]; }
// if neither item has prop, we consider them the same
if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; }

if (normalizeString(prop1) !== normalizeString(prop2)) {
throw new Error('different');
}
// handle arrays and other non-string values
var prop1 = field.getStringValue( _.get( item1, prop ) );
var prop2 = field.getStringValue( _.get( item2, prop ) );

// compare strings
return normalizeString(prop1) !== normalizeString(prop2);
}

/**
* Remove punctuation and lowercase
*
* @param {string} str
* @returns {string}
* lowercase characters and remove some punctuation
*/
function normalizeString(str) {
if (!_.isString(str)) {
return str;
}

if (_.isEmpty(str)) {
return '';
}

function normalizeString(str){
  // case-insensitive comparison: fold everything to lowercase first
  const lowered = str.toLowerCase();

  // treat any run of spaces, commas and hyphens as a single separator...
  const tokens = lowered.split(/[ ,-]+/);

  // ...and rejoin the pieces with exactly one space between them
  return tokens.join(' ');
}

Expand Down
Loading