Skip to content

Commit

Permalink
Merge pull request #1222 from pelias/dedupe
Browse files Browse the repository at this point in the history
refactor dedupe middleware
  • Loading branch information
orangejulius authored Oct 30, 2018
2 parents 6f20209 + b206960 commit 3535f9e
Show file tree
Hide file tree
Showing 7 changed files with 307 additions and 203 deletions.
22 changes: 22 additions & 0 deletions helper/TypeMapping.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ var TypeMapping = function(){
*/
this.layer_aliases = {};

/*
* A list of the canonical sources included in the default Pelias configuration
*/
this.canonical_sources = [];

/*
* An object that contains all sources or aliases. The key is the source or alias,
* the value is either that source, or the canonical name for that alias if it's an alias.
Expand Down Expand Up @@ -65,6 +70,11 @@ TypeMapping.prototype.setLayerAliases = function( aliases ){
this.layer_aliases = aliases;
};

// canonical sources setter
// Stores the given list on `this.canonical_sources` by reference (no copy is made).
// getCanonicalLayers() later reads this list to decide which entries of
// `layers_by_source` contribute layers to the canonical layer list.
// NOTE(review): callers are presumably expected to pass an array of source
// names; loadTargets() falls back to [] when the config omits the key.
TypeMapping.prototype.setCanonicalSources = function( sources ){
this.canonical_sources = sources;
};

// generate mappings after setters have been run
TypeMapping.prototype.generateMappings = function(){
this.sources = Object.keys( this.layers_by_source );
Expand All @@ -75,6 +85,17 @@ TypeMapping.prototype.generateMappings = function(){
this.layer_mapping = TypeMapping.addStandardTargetsToAliases(this.layers, this.layer_aliases);
};

/**
 * Generate a list of all layers which are part of the canonical Pelias
 * configuration.
 *
 * Walks the own keys of `this.layers_by_source` and, for every source that is
 * listed in `this.canonical_sources`, collects that source's layers. Each
 * layer appears once in the result, in order of first occurrence.
 *
 * @returns {Array} de-duplicated layers belonging to canonical sources
 */
TypeMapping.prototype.getCanonicalLayers = function(){
  const layersBySource = this.layers_by_source || {};

  // [].concat() tolerates an unset list or a single bare source name
  const canonicalSources = new Set( [].concat( this.canonical_sources || [] ) );

  // a Set de-duplicates while preserving first-insertion order
  const canonicalLayers = new Set();

  // own keys only: inherited enumerable properties are not sources
  Object.keys( layersBySource ).forEach( source => {
    if( !canonicalSources.has( source ) ){ return; }

    // [].concat() accepts either an array of layers or a single layer value
    [].concat( layersBySource[source] ).forEach( layer => canonicalLayers.add( layer ) );
  });

  return Array.from( canonicalLayers );
};

// load values from targets block
TypeMapping.prototype.loadTargets = function( targetsBlock ){

Expand All @@ -84,6 +105,7 @@ TypeMapping.prototype.loadTargets = function( targetsBlock ){
this.setSourceAliases( targetsBlock.source_aliases || {} );
this.setLayersBySource( targetsBlock.layers_by_source || {} );
this.setLayerAliases( targetsBlock.layer_aliases || {} );
this.setCanonicalSources( targetsBlock.canonical_sources || [] );

// generate the mappings
this.generateMappings();
Expand Down
236 changes: 105 additions & 131 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
@@ -1,176 +1,150 @@
var _ = require('lodash');
var placeTypes = require('./placeTypes');
const _ = require('lodash');
const placeTypes = require('./placeTypes');
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
const field = require('../helper/fieldValue');

/**
 * Compare the layer properties if they exist.
 * Returns false if the objects are the same, else true.
 *
 * Layers which are not part of the canonical layer list (custom layers) are
 * treated as analogous to 'venue', so two records whose layers are each either
 * 'venue' or custom are not considered different on layer alone.
 *
 * @param {object} item1
 * @param {object} item2
 * @returns {boolean} true when the layers differ
 */
function isLayerDifferent(item1, item2){
  // equal layers (or no layer on either record) are never a difference
  if( !isPropertyDifferent(item1, item2, 'layer') ){ return false; }

  // consider all custom layers to be analogous to a venue
  const isVenueLike = layer => layer === 'venue' || !_.includes( canonicalLayers, layer );

  return !( isVenueLike( item1.layer ) && isVenueLike( item2.layer ) );
}

/**
 * Compare the parent properties if they exist.
 * Returns false if the objects are the same, else true.
 *
 * @param {object} item1
 * @param {object} item2
 * @returns {boolean} true when any compared parent `*_id` differs
 */
function isParentHierarchyDifferent(item1, item2){
  const parent1 = _.get(item1, 'parent');
  const parent2 = _.get(item2, 'parent');

  // if neither record, or only one record, carries parent info as a plain
  // object we consider them the same
  // note: this really shouldn't happen as at least one parent should exist
  if( !_.isPlainObject(parent1) || !_.isPlainObject(parent2) ){ return false; }

  // both have parent info: iterate over all the placetypes, comparing between items
  return placeTypes.some( placeType => {

    // skip the parent field corresponding to the item placetype
    if( placeType === item1.layer ){ return false; }

    // ensure the parent ids are the same for all placetypes
    return isPropertyDifferent( parent1, parent2, placeType + '_id' );
  });
}

/**
 * Compare the name properties if they exist.
 * Returns false if the objects are the same, else true.
 *
 * Only the languages present on item1 are inspected. A language missing from
 * item2 is not counted as a difference, with the exception of 'default',
 * which is always compared.
 *
 * @param {object} item1
 * @param {object} item2
 * @returns {boolean} true when a compared name differs
 */
function isNameDifferent(item1, item2){
  const names1 = _.get(item1, 'name');
  const names2 = _.get(item2, 'name');

  // if neither record, or only one record, carries name info as a plain
  // object we consider them the same
  // note: this really shouldn't happen as name is a mandatory field
  if( !_.isPlainObject(names1) || !_.isPlainObject(names2) ){ return false; }

  // both have name info: iterate over all the languages in item1
  return Object.keys(names1).some( lang => {

    // do not consider absence of an additional name as a difference
    // but strictly enforce that 'default' is always compared
    const shouldCompare = ( lang === 'default' || _.has(names2, lang) );

    return shouldCompare && isPropertyDifferent(names1, names2, lang);
  });
}

/**
 * Compare the address_parts properties if they exist.
 * Returns false if the objects are the same, else true.
 *
 * @param {object} item1
 * @param {object} item2
 * @returns {boolean} true when number, street or (if both present) zip differ
 */
function isAddressDifferent(item1, item2){
  const address1 = _.get(item1, 'address_parts');
  const address2 = _.get(item2, 'address_parts');

  // if neither record, or only one record, carries address info as a plain
  // object we consider them the same
  if( !_.isPlainObject(address1) || !_.isPlainObject(address2) ){ return false; }

  // both have address info: number and street are always compared
  const fieldsToCompare = [ 'number', 'street' ];

  // only compare zip if both records have it, otherwise just ignore and assume it's the same
  // since by this time we've already compared parent hierarchies
  if( _.has(address1, 'zip') && _.has(address2, 'zip') ){
    fieldsToCompare.push( 'zip' );
  }

  return fieldsToCompare.some( prop => isPropertyDifferent(address1, address2, prop) );
}

/**
 * Compare the two records and return true if they differ and false if same.
 *
 * The checks run in a fixed order (layer, parent hierarchy, name, address)
 * and stop at the first one that reports a difference.
 *
 * @param {object} item1
 * @param {object} item2
 * @returns {boolean}
 */
function isDifferent(item1, item2){
  const checks = [
    isLayerDifferent,
    isParentHierarchyDifferent,
    isNameDifferent,
    isAddressDifferent
  ];

  // some() short-circuits on the first check reporting a difference
  return checks.some( check => check( item1, item2 ) );
}

/**
 * Return true if properties are different.
 *
 * @param {object} item1
 * @param {object} item2
 * @param {string} prop - property (path) to compare on both items
 * @returns {boolean} true when the normalized values differ
 */
function isPropertyDifferent(item1, item2, prop ){

  // if neither item has prop, we consider them the same
  if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; }

  // handle arrays and other non-string values
  // NOTE(review): relies on field.getStringValue() always returning a string,
  // since normalizeString() calls string methods on it — confirm in helper/fieldValue
  const value1 = field.getStringValue( _.get( item1, prop ) );
  const value2 = field.getStringValue( _.get( item2, prop ) );

  // compare strings
  return normalizeString(value1) !== normalizeString(value2);
}

/**
 * Lowercase characters and remove some punctuation.
 *
 * Every run of spaces, commas and hyphens is collapsed into a single space.
 * Leading and trailing runs are collapsed too, not trimmed.
 *
 * @param {string} str - must be a string; string methods are called on it directly
 * @returns {string}
 */
function normalizeString(str){
  return str.toLowerCase().split(/[ ,-]+/).join(' ');
}

Expand Down
Loading

0 comments on commit 3535f9e

Please sign in to comment.