From 8af64f309886ddc6e37ed18b6557e75ab9a19c4b Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 2 May 2019 16:13:50 +0200 Subject: [PATCH 01/55] feat(parser): replace addressit with pelias native parser --- controller/predicates/is_addressit_parse.js | 14 - controller/predicates/is_pelias_parse.js | 14 + package.json | 1 + query/autocomplete.js | 2 +- query/search_addressit.js | 6 +- query/text_parser_pelias.js | 88 ++++ routes/v1.js | 12 +- sanitizer/_text_pelias_parser.js | 133 ++++++ sanitizer/_tokenizer.js | 6 +- sanitizer/autocomplete.js | 2 +- ...addressit.js => defer_to_pelias_parser.js} | 2 +- ..._addressit_parse.js => is_pelias_parse.js} | 24 +- .../fixture/search_full_address_original.js | 20 +- .../search_partial_address_original.js | 9 - .../search_regions_address_original.js | 11 +- test/unit/query/autocomplete.js | 5 +- test/unit/query/search_addressit.js | 24 +- test/unit/run.js | 5 +- test/unit/sanitizer/_text_pelias_parser.js | 419 ++++++++++++++++++ test/unit/sanitizer/_tokenizer.js | 8 +- test/unit/sanitizer/autocomplete.js | 6 +- ...addressit.js => defer_to_pelias_parser.js} | 28 +- 22 files changed, 723 insertions(+), 116 deletions(-) delete mode 100644 controller/predicates/is_addressit_parse.js create mode 100644 controller/predicates/is_pelias_parse.js create mode 100644 query/text_parser_pelias.js create mode 100644 sanitizer/_text_pelias_parser.js rename sanitizer/{defer_to_addressit.js => defer_to_pelias_parser.js} (92%) rename test/unit/controller/predicates/{is_addressit_parse.js => is_pelias_parse.js} (59%) create mode 100644 test/unit/sanitizer/_text_pelias_parser.js rename test/unit/sanitizer/{defer_to_addressit.js => defer_to_pelias_parser.js} (73%) diff --git a/controller/predicates/is_addressit_parse.js b/controller/predicates/is_addressit_parse.js deleted file mode 100644 index 288b173cb..000000000 --- a/controller/predicates/is_addressit_parse.js +++ /dev/null @@ -1,14 +0,0 @@ -const _ = require('lodash'); -const Debug = 
require('../../helper/debug'); -const debugLog = new Debug('controller:predicates:is_addressit_parse'); -const stackTraceLine = require('../../helper/stackTraceLine'); - -// returns true IFF req.clean.parser is addressit -module.exports = (req, res) => { - const is_addressit_parse = _.get(req, 'clean.parser') === 'addressit'; - debugLog.push(req, () => ({ - reply: is_addressit_parse, - stack_trace: stackTraceLine() - })); - return is_addressit_parse; -}; diff --git a/controller/predicates/is_pelias_parse.js b/controller/predicates/is_pelias_parse.js new file mode 100644 index 000000000..79a6149ad --- /dev/null +++ b/controller/predicates/is_pelias_parse.js @@ -0,0 +1,14 @@ +const _ = require('lodash'); +const Debug = require('../../helper/debug'); +const debugLog = new Debug('controller:predicates:is_pelias_parse'); +const stackTraceLine = require('../../helper/stackTraceLine'); + +// returns true IFF req.clean.parser is pelias +module.exports = (req, res) => { + const is_pelias_parse = _.get(req, 'clean.parser') === 'pelias'; + debugLog.push(req, () => ({ + reply: is_pelias_parse, + stack_trace: stackTraceLine() + })); + return is_pelias_parse; +}; diff --git a/package.json b/package.json index 0b5102519..359beb0f0 100644 --- a/package.json +++ b/package.json @@ -56,6 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", + "pelias-parser": "^1.2.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", diff --git a/query/autocomplete.js b/query/autocomplete.js index c2ee3b5fe..446e5e3fe 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -1,6 +1,6 @@ const peliasQuery = require('pelias-query'); const defaults = require('./autocomplete_defaults'); -const textParser = require('./text_parser_addressit'); +const textParser = require('./text_parser_pelias'); const check = require('check-types'); const logger = require('pelias-logger').get('api'); const config = 
require('pelias-config').generate(); diff --git a/query/search_addressit.js b/query/search_addressit.js index 04823dd80..00d3acf12 100644 --- a/query/search_addressit.js +++ b/query/search_addressit.js @@ -1,6 +1,6 @@ const peliasQuery = require('pelias-query'); const defaults = require('./search_defaults'); -const textParser = require('./text_parser_addressit'); +const textParser = require('./text_parser_pelias'); const check = require('check-types'); const logger = require('pelias-logger').get('api'); const config = require('pelias-config').generate().api; @@ -37,8 +37,8 @@ query.score( peliasQuery.view.address('postcode') ); // country_a and region_a are left as matches here because the text-analyzer // can sometimes detect them, in which case a query more specific than a // multi_match is appropriate. -query.score( peliasQuery.view.admin('country_a') ); -query.score( peliasQuery.view.admin('region_a') ); +// query.score( peliasQuery.view.admin('country_a') ); +// query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin_multi_match(adminFields, 'peliasAdmin') ); query.score( views.custom_boosts( config.customBoosts ) ); diff --git a/query/text_parser_pelias.js b/query/text_parser_pelias.js new file mode 100644 index 000000000..394b80d3d --- /dev/null +++ b/query/text_parser_pelias.js @@ -0,0 +1,88 @@ +const _ = require('lodash'); +const logger = require('pelias-logger').get('api'); +const placeTypes = require('../helper/placeTypes'); + +/* +This list should only contain admin fields we are comfortable matching in the case +when we can't identify parts of an address. This shouldn't contain fields like country_a +or postalcode because we should only try to match those when we're sure that's what they are. 
+ */ +const adminFields = placeTypes.concat([ + 'region_a' +]); + +// all the address parsing logic +function addParsedVariablesToQueryVariables(clean, vs) { + // ==== add parsed matches [address components] ==== + + // prefix (any unparsed text before any matched fields) + if (!_.isEmpty(clean.parsed_text.name)) { + vs.var('input:query', clean.parsed_text.name); + } + + // housenumber + if (!_.isEmpty(clean.parsed_text.housenumber)) { + vs.var('input:housenumber', clean.parsed_text.housenumber); + } + + // street name + if (!_.isEmpty(clean.parsed_text.street)) { + vs.var('input:street', clean.parsed_text.street); + } + + // cross street name + if (!_.isEmpty(clean.parsed_text.cross_street)) { + vs.var('input:cross_street', clean.parsed_text.cross_street); + } + + // postcode + if (!_.isEmpty(clean.parsed_text.postcode)) { + vs.var('input:postcode', clean.parsed_text.postcode); + } + + // ==== legacy components ==== + // @todo: can we remove this functionality? + + // is the 'name' label set? + if (clean.parsed_text.name) { + vs.var('input:name', clean.parsed_text.name); + } + else { + // is it a street address? + var isStreetAddress = !_.isEmpty(clean.parsed_text.housenumber) && !_.isEmpty(clean.parsed_text.street); + if (isStreetAddress) { + vs.var('input:name', clean.parsed_text.housenumber + ' ' + clean.parsed_text.street); + } + } + + // ==== add parsed matches [admin components] ==== + + // // locality + // if (!_.isEmpty(clean.parsed_text.locality)) { + // vs.var('input:locality', clean.parsed_text.locality); + // } + + // // region + // if (!_.isEmpty(clean.parsed_text.region)) { + // vs.var('input:region', clean.parsed_text.region); + // } + + // // country + // if (!_.isEmpty(clean.parsed_text.country)) { + // vs.var('input:country', clean.parsed_text.country); + // } + + // postfix + if (!_.isEmpty(clean.parsed_text.admin_parts)) { + // assign postfix to any admin fields which currently don't have a value assigned. 
+ + // cycle through fields and set fields which are still currently unset + adminFields.forEach(key => { + if (!vs.isset('input:' + key)) { + vs.var('input:' + key, clean.parsed_text.admin_parts); + } + }); + } +} + +module.exports = addParsedVariablesToQueryVariables; diff --git a/routes/v1.js b/routes/v1.js index 29895ea3e..535203a49 100644 --- a/routes/v1.js +++ b/routes/v1.js @@ -11,7 +11,7 @@ var sanitizers = { autocomplete: require('../sanitizer/autocomplete'), place: require('../sanitizer/place'), search: require('../sanitizer/search'), - defer_to_addressit: require('../sanitizer/defer_to_addressit'), + defer_to_pelias_parser: require('../sanitizer/defer_to_pelias_parser'), structured_geocoding: require('../sanitizer/structured_geocoding'), reverse: require('../sanitizer/reverse'), nearby: require('../sanitizer/nearby') @@ -74,7 +74,7 @@ const hasRequestErrors = require('../controller/predicates/has_request_errors'); const isCoarseReverse = require('../controller/predicates/is_coarse_reverse'); const isAdminOnlyAnalysis = require('../controller/predicates/is_admin_only_analysis'); const hasResultsAtLayers = require('../controller/predicates/has_results_at_layers'); -const isAddressItParse = require('../controller/predicates/is_addressit_parse'); +const isPeliasItParse = require('../controller/predicates/is_pelias_parse'); const hasRequestCategories = require('../controller/predicates/has_request_parameter')('categories'); const isOnlyNonAdminLayers = require('../controller/predicates/is_only_non_admin_layers'); const isRequestLayersAnyAddressRelated = require('../controller/predicates/is_request_layers_any_address_related'); @@ -224,8 +224,8 @@ function addRoutes(app, peliasConfig) { not(placeholderShouldHaveExecuted) ); - // defer to addressit for analysis IF there's no response AND placeholder should not have executed - const shouldDeferToAddressIt = all( + // defer to pelias parser for analysis IF there's no response AND placeholder should not have 
executed + const shouldDeferToPeliasParser = all( not(hasRequestErrors), not(hasResponseData) ); @@ -233,7 +233,7 @@ function addRoutes(app, peliasConfig) { // call search addressit query if addressit was the parser const searchAddressitShouldExecute = all( not(hasRequestErrors), - isAddressItParse + isPeliasItParse ); // get language adjustments if: @@ -291,7 +291,7 @@ function addRoutes(app, peliasConfig) { // try 3 different query types: address search using ids, cascading fallback, addressit controllers.search(peliasConfig.api, esclient, queries.address_using_ids, searchWithIdsShouldExecute), controllers.search(peliasConfig.api, esclient, queries.cascading_fallback, fallbackQueryShouldExecute), - sanitizers.defer_to_addressit(shouldDeferToAddressIt), //run additional sanitizers needed for addressit parser + sanitizers.defer_to_pelias_parser(shouldDeferToPeliasParser), //run additional sanitizers needed for pelias parser controllers.search(peliasConfig.api, esclient, queries.search_addressit, searchAddressitShouldExecute), postProc.trimByGranularity(), postProc.distances('focus.point.'), diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js new file mode 100644 index 000000000..6c56df7af --- /dev/null +++ b/sanitizer/_text_pelias_parser.js @@ -0,0 +1,133 @@ +const Tokenizer = require('pelias-parser/tokenization/Tokenizer'); +const Solution = require('pelias-parser/solver/Solution'); +const AddressParser = require('pelias-parser/parser/AddressParser'); +const parser = new AddressParser(); +const _ = require('lodash'); + +/** + this module provides fulltext parsing using the pelias/parser module. 
+ see: https://github.com/pelias/parser + + 'pelias parser' provides the following fields: + 'name', + 'housenumber', 'street', 'postcode', + 'locality', 'region', 'country', + 'admin_parts' +**/ + +// validate texts, convert types and apply defaults +function _sanitize (raw, clean) { + // error & warning messages + var messages = { errors: [], warnings: [] }; + + // invalid input 'text' + const text = _.trim(raw.text); + if (!_.isString(text) || _.isEmpty(text)) { + messages.errors.push('invalid param \'text\': text length, must be >0'); + } + + // valid input 'text' + else { + // parse text with pelias/parser + clean.text = text; + clean.parser = 'pelias'; + clean.parsed_text = parse(clean); + } + + return messages; +} + +function parse (clean) { + // parse text + const t = new Tokenizer(clean.text); + parser.classify(t); + parser.solve(t); + + // only use the first solution generated + // @todo: we could expand this in the future to accomodate more solutions + let solution = new Solution(); + if (t.solution.length) { solution = t.solution[0]; } + + // 1. map the output of the parser in to parsed_text + let parsed_text = {}; + + solution.pair.forEach(p => { + let field = p.classification.label; + + // handle intersections + if (field === 'street') { + field = (!parsed_text.street) ? 'street' : 'cross_street'; + } + + // set field + parsed_text[field] = p.span.body; + }); + + // 2. find any unclassified characters: + + // generate a classification mask, eg: + // 'Foo Cafe 10 Main St London 10010 Earth' + // ' NN SSSSSSS AAAAAA PPPPP ' + let mask = solution.mask(t); + + // the entire input text as seen by the parser with any postcode classification(s) removed + let body = t.span.body.split('') + .map((c, i) => (mask[i] !== 'P') ? 
c : ' ') + .join(''); + + // scan through the input text and 'bucket' characters in to one of two buckets: + // prefix: all unparsed characters that came before any parsed fields + // postfix: all unparsed characters from the first admin field to the end of the string + + // set cursor to the first classified character + let cursor = mask.search(/\S/); + if (cursor === -1) { cursor = body.length; } + let prefix = _.trim(body.substr(0, cursor), ' ,'); + + // set cursor to the first character of the first classified admin field + cursor = mask.indexOf('A'); + if (cursor === -1) { cursor = body.length; } + let postfix = _.trim(body.substr(cursor), ' ,'); + + // clean up spacing around commas + prefix = prefix.split(/[,\n\t]/).join(', '); + postfix = postfix.split(/[,\n\t]/).join(', '); + + // squash multiple adjacent whitespace characters into a single space + prefix = prefix.replace(/\s\s+/g, ' ').trim(); + postfix = postfix.replace(/\s\s+/g, ' ').trim(); + + // handle the case where 'parsed_text' is completely empty + // ie. the parser was not able to classify anything at all + // note: this is common for venue names + if (Object.keys(parsed_text).length === 0) { + if (prefix.length && !postfix.length) { + // if the prefix contains a comma + // then only use the first part for the prefix for the + // name and use the remaining tokens for the postfix + // eg. 'Friendly Cafe, Footown' + // note: this is how the old 'naive' parser worked + let split = prefix.split(','); + if (split.length > 1) { + prefix = split[0].trim(); + postfix = split.slice(1).join(', ').trim(); + } + } + } + + // 3. 
store the unparsed characters in fields which can be used for querying + if (prefix.length) { parsed_text.name = prefix; } + if (postfix.length) { parsed_text.admin_parts = postfix; } + + return parsed_text; +} + +function _expected () { + return [{ name: 'text' }]; +} + +// export function +module.exports = () => ({ + sanitize: _sanitize, + expected: _expected +}); diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index 966b88be4..cc02e42a1 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -36,14 +36,14 @@ function _sanitize( raw, clean ){ // else handle the case where parsed_text.street was produced but // no parsed_text.name is produced. - // additionally, handle the case where parsed_text.number is present + // additionally, handle the case where parsed_text.housenumber is present // note: the addressit module may also produce parsed_text.unit info // for now, we discard that information as we don't have an appropriate else if( _.has(clean.parsed_text, 'street') ){ text = [ - clean.parsed_text.number, + clean.parsed_text.housenumber, clean.parsed_text.street - ].filter(function(el){return el;}) + ].filter((el) => el) .join(' '); // remove empty elements } } diff --git a/sanitizer/autocomplete.js b/sanitizer/autocomplete.js index e987da2da..1abfb7cad 100644 --- a/sanitizer/autocomplete.js +++ b/sanitizer/autocomplete.js @@ -6,7 +6,7 @@ module.exports.middleware = (_api_pelias_config) => { var sanitizers = { singleScalarParameters: require('../sanitizer/_single_scalar_parameters')(), debug: require('../sanitizer/_debug')(), - text: require('../sanitizer/_text_addressit')(), + text: require('../sanitizer/_text_pelias_parser')(), tokenizer: require('../sanitizer/_tokenizer')(), size: require('../sanitizer/_size')(/* use defaults*/), layers: require('../sanitizer/_targets')('layers', type_mapping.layer_mapping), diff --git a/sanitizer/defer_to_addressit.js b/sanitizer/defer_to_pelias_parser.js similarity index 92% rename from 
sanitizer/defer_to_addressit.js rename to sanitizer/defer_to_pelias_parser.js index 371687046..4ff8bdb7b 100644 --- a/sanitizer/defer_to_addressit.js +++ b/sanitizer/defer_to_pelias_parser.js @@ -1,7 +1,7 @@ const sanitizeAll = require('../sanitizer/sanitizeAll'), sanitizers = { debug: require('../sanitizer/_debug')(), - text: require('../sanitizer/_text_addressit')() + text: require('../sanitizer/_text_pelias_parser')() }; const logger = require('pelias-logger').get('api'); diff --git a/test/unit/controller/predicates/is_addressit_parse.js b/test/unit/controller/predicates/is_pelias_parse.js similarity index 59% rename from test/unit/controller/predicates/is_addressit_parse.js rename to test/unit/controller/predicates/is_pelias_parse.js index 219c36fff..07fecefdf 100644 --- a/test/unit/controller/predicates/is_addressit_parse.js +++ b/test/unit/controller/predicates/is_pelias_parse.js @@ -1,24 +1,24 @@ const _ = require('lodash'); -const is_addressit_parse = require('../../../../controller/predicates/is_addressit_parse'); +const is_pelias_parse = require('../../../../controller/predicates/is_pelias_parse'); module.exports.tests = {}; module.exports.tests.interface = (test, common) => { test('valid interface', t => { - t.ok(_.isFunction(is_addressit_parse), 'is_addressit_parse is a function'); + t.ok(_.isFunction(is_pelias_parse), 'is_pelias_parse is a function'); t.end(); }); }; module.exports.tests.true_conditions = (test, common) => { - test('request.clean.parser=addressit should return true', t => { + test('request.clean.parser=pelias should return true', t => { const req = { clean: { - parser: 'addressit' + parser: 'pelias' } }; - t.ok(is_addressit_parse(req)); + t.ok(is_pelias_parse(req)); t.end(); }); @@ -27,14 +27,14 @@ module.exports.tests.true_conditions = (test, common) => { module.exports.tests.false_conditions = (test, common) => { test('undefined request should return false', t => { - t.notOk(is_addressit_parse(undefined)); + 
t.notOk(is_pelias_parse(undefined)); t.end(); }); test('undefined request.clean should return false', t => { const req = {}; - t.notOk(is_addressit_parse(req)); + t.notOk(is_pelias_parse(req)); t.end(); }); @@ -43,18 +43,18 @@ module.exports.tests.false_conditions = (test, common) => { clean: {} }; - t.notOk(is_addressit_parse(req)); + t.notOk(is_pelias_parse(req)); t.end(); }); - test('non-\'addressit\' request.clean.parser should return false', t => { + test('non-\'pelias\' request.clean.parser should return false', t => { const req = { clean: { - parser: 'not addressit' + parser: 'not pelias' } }; - t.notOk(is_addressit_parse(req)); + t.notOk(is_pelias_parse(req)); t.end(); }); @@ -62,7 +62,7 @@ module.exports.tests.false_conditions = (test, common) => { module.exports.all = (tape, common) => { function test(name, testFunction) { - return tape(`GET /is_addressit_parse ${name}`, testFunction); + return tape(`GET /is_pelias_parse ${name}`, testFunction); } for( const testCase in module.exports.tests ){ diff --git a/test/unit/fixture/search_full_address_original.js b/test/unit/fixture/search_full_address_original.js index e400def60..9caff384b 100644 --- a/test/unit/fixture/search_full_address_original.js +++ b/test/unit/fixture/search_full_address_original.js @@ -105,24 +105,6 @@ module.exports = { 'analyzer': vs['address:postcode:analyzer'] } } - }, { - 'match': { - 'parent.country_a': { - 'query': 'USA', - 'cutoff_frequency': 0.01, - 'boost': vs['admin:country_a:boost'], - 'analyzer': vs['admin:country_a:analyzer'] - } - } - }, { - 'match': { - 'parent.region_a': { - 'query': 'NY', - 'cutoff_frequency': 0.01, - 'boost': vs['admin:region_a:boost'], - 'analyzer': vs['admin:region_a:analyzer'] - } - } }, { 'multi_match': { 'fields': [ @@ -135,7 +117,7 @@ module.exports = { 'parent.neighbourhood^1', 'parent.region_a^1' ], - 'query': 'new york', + 'query': 'new york ny US', 'analyzer': 'peliasAdmin', 'cutoff_frequency': 0.01 } diff --git 
a/test/unit/fixture/search_partial_address_original.js b/test/unit/fixture/search_partial_address_original.js index 57bd61f84..f808896d4 100644 --- a/test/unit/fixture/search_partial_address_original.js +++ b/test/unit/fixture/search_partial_address_original.js @@ -77,15 +77,6 @@ module.exports = { 'weight': 2 }] } - }, { - 'match': { - 'parent.region_a': { - 'analyzer': 'peliasAdmin', - 'boost': 1, - 'cutoff_frequency': 0.01, - 'query': 'NY' - } - } }, { 'multi_match': { 'fields': [ diff --git a/test/unit/fixture/search_regions_address_original.js b/test/unit/fixture/search_regions_address_original.js index 9d05aaefa..c5053a037 100644 --- a/test/unit/fixture/search_regions_address_original.js +++ b/test/unit/fixture/search_regions_address_original.js @@ -95,15 +95,6 @@ module.exports = { 'analyzer': vs['address:street:analyzer'] } } - }, { - 'match': { - 'parent.region_a': { - 'query': 'NY', - 'cutoff_frequency': 0.01, - 'boost': vs['admin:region_a:boost'], - 'analyzer': vs['admin:region_a:analyzer'] - } - } }, { 'multi_match': { 'fields': [ @@ -116,7 +107,7 @@ module.exports = { 'parent.neighbourhood^1', 'parent.region_a^1' ], - 'query': 'manhattan', + 'query': 'manhattan ny', 'analyzer': 'peliasAdmin', 'cutoff_frequency': 0.01 } diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index 16d16f4b6..6b324c898 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -57,7 +57,6 @@ module.exports.tests.query = function(test, common) { text: 'one two, three', parsed_text: { name: 'one two', - regions: [ 'one two', 'three' ], admin_parts: 'three' }, tokens: ['one','two'], @@ -258,9 +257,9 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'k road, laird', parsed_text: { - name: 'k road', street: 'k road', - regions: [ 'laird' ] + locality: 'laird', + admin_parts: 'laird' }, tokens: ['k', 'road'], tokens_complete: ['k', 'road'], diff --git a/test/unit/query/search_addressit.js 
b/test/unit/query/search_addressit.js index 2d1180e23..401d282e2 100644 --- a/test/unit/query/search_addressit.js +++ b/test/unit/query/search_addressit.js @@ -106,12 +106,12 @@ module.exports.tests.query = function(test, common) { layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, parsed_text: { - number: '123', + housenumber: '123', street: 'main st', - state: 'NY', - country: 'USA', - postalcode: '10010', - regions: [ 'new york' ] + region: 'new york', + locality: 'ny', + postcode: '10010', + admin_parts: 'new york ny US' } }); @@ -127,9 +127,9 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'soho grand, new york', layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: { name: 'soho grand', - state: 'NY', - regions: [ 'soho grand' ], + parsed_text: { + name: 'soho grand', + region: 'new york', admin_parts: 'new york' } }); @@ -146,10 +146,12 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: '1 water st manhattan ny', layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, - parsed_text: { number: '1', + parsed_text: { + housenumber: '1', street: 'water st', - state: 'NY', - regions: [ 'manhattan' ] + locality: 'manhattan', + region: 'ny', + admin_parts: 'manhattan ny' } }); diff --git a/test/unit/run.js b/test/unit/run.js index a4f826991..1d38d5ac6 100644 --- a/test/unit/run.js +++ b/test/unit/run.js @@ -24,7 +24,7 @@ var tests = [ require('./controller/predicates/has_results_at_layers'), require('./controller/predicates/has_request_parameter'), require('./controller/predicates/has_request_errors'), - require('./controller/predicates/is_addressit_parse'), + require('./controller/predicates/is_pelias_parse'), require('./controller/predicates/is_admin_only_analysis'), 
require('./controller/predicates/is_coarse_reverse'), require('./controller/predicates/is_only_non_admin_layers'), @@ -98,6 +98,7 @@ var tests = [ require('./sanitizer/_synthesize_analysis'), require('./sanitizer/_text'), require('./sanitizer/_text_addressit'), + require('./sanitizer/_text_pelias_parser'), require('./sanitizer/_tokenizer'), require('./sanitizer/_categories'), require('./sanitizer/_boundary_gid'), @@ -108,7 +109,7 @@ var tests = [ require('./sanitizer/reverse'), require('./sanitizer/sanitizeAll'), require('./sanitizer/search'), - require('./sanitizer/defer_to_addressit'), + require('./sanitizer/defer_to_pelias_parser'), require('./sanitizer/wrap'), require('./service/configurations/Interpolation'), require('./service/configurations/Language'), diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js new file mode 100644 index 000000000..9a40ef7fc --- /dev/null +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -0,0 +1,419 @@ +var sanitizer = require('../../../sanitizer/_text_pelias_parser')(); +var type_mapping = require('../../../helper/type_mapping'); + +module.exports.tests = {}; + +module.exports.tests.text_parser = function (test, common) { + test('short input text has admin layers set ', function (t) { + var raw = { + text: 'emp' //start of empire state building + }; + var clean = { + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + + var usQueries = [ + { name: 'soho', admin_parts: 'new york', region: 'NY' }, + { name: '123 main', admin_parts: 'new york', region: 'NY' } + ]; + + usQueries.forEach(function (query) { + test('naive parsing ' + query, function (t) { + var raw = { + text: query.name + ', ' + query.admin_parts + }; + var clean = {}; + + var expected_clean = { + text: raw.text.trim(), + parser: 'pelias', + parsed_text: { + name: query.name, + region: 
query.admin_parts, + admin_parts: query.admin_parts + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + test('naive parsing ' + query + ' without spaces', function (t) { + var raw = { + text: query.name + ',' + query.admin_parts + }; + var clean = {}; + + var expected_clean = { + text: raw.text.trim(), + parser: 'pelias', + parsed_text: { + name: query.name, + region: query.admin_parts, + admin_parts: query.admin_parts + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + test('naive parsing ' + query + ' with leading and trailing junk', function (t) { + var raw = { + text: ' , ' + query.name + ',' + query.admin_parts + ' , ' + }; + var clean = {}; + + var expected_clean = { + text: raw.text.trim(), + parser: 'pelias', + parsed_text: { + name: query.name, + region: query.admin_parts, + admin_parts: query.admin_parts + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + }); + + var nonUSQueries = [ + { name: 'chelsea', admin_parts: 'london' }, + ]; + + nonUSQueries.forEach(function (query) { + test('naive parsing ' + query, function (t) { + var raw = { + text: query.name + ', ' + query.admin_parts + }; + var clean = {}; + + var expected_clean = { + text: query.name + ', ' + query.admin_parts, + parser: 'pelias', + parsed_text: { + locality: query.name, + admin_parts: query.name + ', ' + query.admin_parts + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + test('naive parsing ' + query + ' without spaces', function (t) { + var raw = { + text: query.name + ',' + query.admin_parts 
+ }; + var clean = {}; + + var expected_clean = { + text: query.name + ',' + query.admin_parts, + parser: 'pelias', + parsed_text: { + locality: query.name, + admin_parts: query.name + ', ' + query.admin_parts + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + }); + + test('query with one token', function (t) { + var raw = { + text: 'yugolsavia' + }; + var clean = {}; + clean.parsed_text = 'this should be removed'; + + var expected_clean = { + parser: 'pelias', + text: 'yugolsavia', + parsed_text: { + name: 'yugolsavia' + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + test('query with two tokens, no numbers', function (t) { + var raw = { + text: 'small town' + }; + var clean = {}; + clean.parsed_text = 'this should be removed'; + + var expected_clean = { + parser: 'pelias', + text: 'small town', + parsed_text: { + name: 'small town' + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + test('query with two tokens, number first', function (t) { + var raw = { + text: '123 main' + }; + var clean = {}; + clean.parsed_text = 'this should be removed'; + + var expected_clean = { + parser: 'pelias', + text: '123 main', + parsed_text: { + name: '123 main' + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + test('query with two tokens, number second', function (t) { + var raw = { + text: 'main 123' + }; + var clean = {}; + clean.parsed_text = 'this should be removed'; + + var expected_clean = { + parser: 'pelias', + text: 'main 123', + parsed_text: { + name: 'main 123' + } + }; 
+ + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + test('query with many tokens', function (t) { + var raw = { + text: 'main particle new york' + }; + var clean = {}; + clean.parsed_text = 'this should be removed'; + + var expected_clean = { + text: 'main particle new york', + parser: 'pelias', + parsed_text: { + name: 'main particle', + region: 'new york', + admin_parts: 'new york' + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + test('valid address, house number', function (t) { + var raw = { + text: '123 main st new york ny' + }; + var clean = {}; + + var expected_clean = { + text: '123 main st new york ny', + parser: 'pelias', + parsed_text: { + housenumber: '123', + street: 'main st', + region: 'new york', + locality: 'ny', + admin_parts: 'new york ny' + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + test('valid address, zipcode', function (t) { + var raw = { + text: '123 main st new york ny 10010' + }; + var clean = {}; + + var expected_clean = { + text: '123 main st new york ny 10010', + parser: 'pelias', + parsed_text: { + housenumber: '123', + street: 'main st', + region: 'new york', + locality: 'ny', + postcode: '10010', + admin_parts: 'new york ny' + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + }); + + test('valid address with leading 0s in zipcode', function (t) { + var raw = { + text: '339 W Main St, Cheshire, 06410' + }; + var clean = {}; + + var expected_clean = { + text: '339 W Main St, Cheshire, 06410', + parser: 'pelias', + parsed_text: { + housenumber: '339', 
+ street: 'W Main St', + postcode: '06410', + region: 'Cheshire', + admin_parts: 'Cheshire' + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + }); + + test('valid address without spaces after commas', function (t) { + var raw = { + text: '339 W Main St,Lancaster,PA' + }; + var clean = {}; + + var expected_clean = { + text: '339 W Main St,Lancaster,PA', + parser: 'pelias', + parsed_text: { + housenumber: '339', + street: 'W Main St', + locality: 'Lancaster', + region: 'PA', + admin_parts: 'Lancaster, PA' + } + }; + + var messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }); + t.deepEqual(clean, expected_clean); + t.end(); + + }); + + test('whitespace-only input counts as empty', (t) => { + const raw = { text: ' ' }; + const clean = {}; + + const expected_clean = {}; + + const messages = sanitizer.sanitize(raw, clean); + + t.deepEquals(clean, expected_clean); + t.deepEquals(messages.errors, ['invalid param \'text\': text length, must be >0']); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + }); + + test('return an array of expected parameters in object form for validation', (t) => { + const expected = [{ name: 'text' }]; + const validParameters = sanitizer.expected(); + t.deepEquals(validParameters, expected); + t.end(); + }); + + test('Australia - state only', (t) => { + const raw = { text: 'NSW' }; + const clean = {}; + const expected_clean = { text: 'NSW', parser: 'pelias', parsed_text: { + region: 'NSW', + admin_parts: 'NSW' + }}; + const messages = sanitizer.sanitize(raw, clean); + + t.deepEquals(clean, expected_clean); + t.deepEquals(messages.errors, []); + t.deepEquals(messages.warnings, [], 'no warnings'); + t.end(); + }); +}; + +module.exports.all = function (tape, common) { + function test(name, testFunction) { + return tape('sanitizer _text: ' + name, testFunction); + } + + for 
(var testCase in module.exports.tests) { + module.exports.tests[testCase](test, common); + } +}; diff --git a/test/unit/sanitizer/_tokenizer.js b/test/unit/sanitizer/_tokenizer.js index 3de769d5c..181a32106 100644 --- a/test/unit/sanitizer/_tokenizer.js +++ b/test/unit/sanitizer/_tokenizer.js @@ -183,12 +183,12 @@ module.exports.tests.sanity_checks = function(test, common) { }); test('favor clean.parsed_text street data over clean.text', function(t) { - var clean = { parsed_text: { number: '190', street: 'foo st' }, text: 'bar' }; + var clean = { parsed_text: { housenumber: '190', street: 'foo st' }, text: 'bar' }; var messages = sanitizer.sanitize({}, clean); // favor clean.parsed_text.name over clean.text - t.deepEquals(clean.tokens, [ '190', 'foo', 'st' ], 'use street name + number'); - t.deepEquals(clean.tokens_complete, [ '190', 'foo', 'st' ], 'use street name + number'); + t.deepEquals(clean.tokens, [ '190', 'foo', 'st' ], 'use street name + housenumber'); + t.deepEquals(clean.tokens_complete, [ '190', 'foo', 'st' ], 'use street name + housenumber'); t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); // no errors/warnings produced @@ -199,7 +199,7 @@ module.exports.tests.sanity_checks = function(test, common) { }); test('favor clean.parsed_text.name over clean.parsed_text street data', function(t) { - var clean = { parsed_text: { number: '190', street: 'foo st', name: 'foo' }, text: 'bar' }; + var clean = { parsed_text: { housenumber: '190', street: 'foo st', name: 'foo' }, text: 'bar' }; var messages = sanitizer.sanitize({}, clean); // favor clean.parsed_text.name over all other variables diff --git a/test/unit/sanitizer/autocomplete.js b/test/unit/sanitizer/autocomplete.js index 1ff106dcd..2f7227a61 100644 --- a/test/unit/sanitizer/autocomplete.js +++ b/test/unit/sanitizer/autocomplete.js @@ -24,10 +24,10 @@ module.exports.tests.sanitizers = function(test, common) { } }; }, - '../sanitizer/_text_addressit': function () { + 
'../sanitizer/_text_pelias_parser': function () { return { sanitize: () => { - called_sanitizers.push('_text_addressit'); + called_sanitizers.push('_text_pelias_parser'); return { errors: [], warnings: [] }; } }; @@ -142,7 +142,7 @@ module.exports.tests.sanitizers = function(test, common) { const expected_sanitizers = [ '_single_scalar_parameters', '_debug', - '_text_addressit', + '_text_pelias_parser', '_tokenizer', '_size', '_targets/layers', diff --git a/test/unit/sanitizer/defer_to_addressit.js b/test/unit/sanitizer/defer_to_pelias_parser.js similarity index 73% rename from test/unit/sanitizer/defer_to_addressit.js rename to test/unit/sanitizer/defer_to_pelias_parser.js index d649c2970..d245f078b 100644 --- a/test/unit/sanitizer/defer_to_addressit.js +++ b/test/unit/sanitizer/defer_to_pelias_parser.js @@ -11,11 +11,11 @@ module.exports.tests.sanitize = (test, common) => { // rather than re-verify the functionality of all the sanitizers, this test just verifies that they // were all called correctly - const defer_to_addressit = proxyquire('../../../sanitizer/defer_to_addressit', { - '../sanitizer/_text_addressit': function () { + const defer_to_pelias_parser = proxyquire('../../../sanitizer/defer_to_pelias_parser', { + '../sanitizer/_text_pelias_parser': function () { return { sanitize: () => { - t.fail('_text_addressit should not have been called'); + t.fail('_text_pelias_parser should not have been called'); } }; }, @@ -29,25 +29,25 @@ module.exports.tests.sanitize = (test, common) => { } })(() => false); - defer_to_addressit({}, {}, () => { + defer_to_pelias_parser({}, {}, () => { t.equals(logger.getInfoMessages().length, 0); t.end(); }); }); - test('verify that _text_addressit sanitizer was called when should_execute returns true', (t) => { + test('verify that _text_pelias_parser sanitizer was called when should_execute returns true', (t) => { t.plan(2); const logger = mock_logger(); // rather than re-verify the functionality of all the sanitizers, this test 
just verifies that they // were all called correctly - const defer_to_addressit = proxyquire('../../../sanitizer/defer_to_addressit', { - '../sanitizer/_text_addressit': function () { + const defer_to_pelias_parser = proxyquire('../../../sanitizer/defer_to_pelias_parser', { + '../sanitizer/_text_pelias_parser': function () { return { sanitize: () => { - t.pass('_text_addressit should have been called'); + t.pass('_text_pelias_parser should have been called'); return { errors: [], warnings: [] }; } }; @@ -73,7 +73,7 @@ module.exports.tests.sanitize = (test, common) => { } }; - defer_to_addressit(req, {}, () => { + defer_to_pelias_parser(req, {}, () => { t.end(); }); @@ -86,11 +86,11 @@ module.exports.tests.sanitize = (test, common) => { // rather than re-verify the functionality of all the sanitizers, this test just verifies that they // were all called correctly - const defer_to_addressit = proxyquire('../../../sanitizer/defer_to_addressit', { - '../sanitizer/_text_addressit': function () { + const defer_to_pelias_parser = proxyquire('../../../sanitizer/defer_to_pelias_parser', { + '../sanitizer/_text_pelias_parser': function () { return { sanitize: () => { - t.pass('_text_addressit should have been called'); + t.pass('_text_pelias_parser should have been called'); return { errors: [], warnings: [] }; } }; @@ -113,7 +113,7 @@ module.exports.tests.sanitize = (test, common) => { } }; - defer_to_addressit(req, {}, () => { + defer_to_pelias_parser(req, {}, () => { t.deepEquals(logger.getInfoMessages(), []); t.end(); }); @@ -124,7 +124,7 @@ module.exports.tests.sanitize = (test, common) => { module.exports.all = function (tape, common) { function test(name, testFunction) { - return tape(`SANITIZE /defer_to_addressit ${name}`, testFunction); + return tape(`SANITIZE /defer_to_pelias_parser ${name}`, testFunction); } for( var testCase in module.exports.tests ){ From 62d23e2ceac5a75243ba8c4288ceae9eb7616794 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 2 May 2019 
16:34:29 +0200 Subject: [PATCH 02/55] feat(parser): improved postfix cursor position for text with no admin classification --- sanitizer/_text_pelias_parser.js | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index 6c56df7af..c4cc7199c 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -77,16 +77,23 @@ function parse (clean) { // scan through the input text and 'bucket' characters in to one of two buckets: // prefix: all unparsed characters that came before any parsed fields - // postfix: all unparsed characters from the first admin field to the end of the string + // postfix: all characters from the first admin field to the end of the string // set cursor to the first classified character let cursor = mask.search(/\S/); if (cursor === -1) { cursor = body.length; } let prefix = _.trim(body.substr(0, cursor), ' ,'); - // set cursor to the first character of the first classified admin field - cursor = mask.indexOf('A'); - if (cursor === -1) { cursor = body.length; } + // solution includes address classification + // set cursor after the last classified address character + if (mask.search(/[NS]/) > -1) { + cursor = Math.max(mask.lastIndexOf('N'), mask.lastIndexOf('S')) + 1; + } + // solution includes admin classification + // set cursor to the first classified admin character + else if( mask.includes('A') ){ cursor = mask.indexOf('A'); } + // else set cursor to end-of-text + else { cursor = body.length; } let postfix = _.trim(body.substr(cursor), ' ,'); // clean up spacing around commas From 463245246595d9069a449c0f730b1e9407756b2e Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 3 May 2019 13:12:11 +0200 Subject: [PATCH 03/55] feat(parser): pelias/parser improvements --- package.json | 2 +- query/text_parser_pelias.js | 25 ++------ sanitizer/_text_pelias_parser.js | 46 +++++++++++-- sanitizer/_tokenizer.js | 17 +---- 
test/unit/query/autocomplete.js | 6 +- test/unit/query/search_addressit.js | 9 ++- test/unit/sanitizer/_text_pelias_parser.js | 75 +++++++++++++--------- test/unit/sanitizer/_tokenizer.js | 10 ++- 8 files changed, 112 insertions(+), 78 deletions(-) diff --git a/package.json b/package.json index 359beb0f0..0ac55677a 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.2.0", + "pelias-parser": "^1.3.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", diff --git a/query/text_parser_pelias.js b/query/text_parser_pelias.js index 394b80d3d..779a3895b 100644 --- a/query/text_parser_pelias.js +++ b/query/text_parser_pelias.js @@ -15,9 +15,9 @@ const adminFields = placeTypes.concat([ function addParsedVariablesToQueryVariables(clean, vs) { // ==== add parsed matches [address components] ==== - // prefix (any unparsed text before any matched fields) - if (!_.isEmpty(clean.parsed_text.name)) { - vs.var('input:query', clean.parsed_text.name); + // name + if (!_.isEmpty(clean.parsed_text.subject)) { + vs.var('input:name', clean.parsed_text.subject); } // housenumber @@ -40,21 +40,6 @@ function addParsedVariablesToQueryVariables(clean, vs) { vs.var('input:postcode', clean.parsed_text.postcode); } - // ==== legacy components ==== - // @todo: can we remove this functionality? - - // is the 'name' label set? - if (clean.parsed_text.name) { - vs.var('input:name', clean.parsed_text.name); - } - else { - // is it a street address? 
- var isStreetAddress = !_.isEmpty(clean.parsed_text.housenumber) && !_.isEmpty(clean.parsed_text.street); - if (isStreetAddress) { - vs.var('input:name', clean.parsed_text.housenumber + ' ' + clean.parsed_text.street); - } - } - // ==== add parsed matches [admin components] ==== // // locality @@ -73,13 +58,13 @@ function addParsedVariablesToQueryVariables(clean, vs) { // } // postfix - if (!_.isEmpty(clean.parsed_text.admin_parts)) { + if (!_.isEmpty(clean.parsed_text.admin)) { // assign postfix to any admin fields which currently don't have a value assigned. // cycle through fields and set fields which are still currently unset adminFields.forEach(key => { if (!vs.isset('input:' + key)) { - vs.var('input:' + key, clean.parsed_text.admin_parts); + vs.var('input:' + key, clean.parsed_text.admin); } }); } diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index c4cc7199c..c2426480d 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -12,7 +12,7 @@ const _ = require('lodash'); 'name', 'housenumber', 'street', 'postcode', 'locality', 'region', 'country', - 'admin_parts' + 'admin' **/ // validate texts, convert types and apply defaults @@ -49,7 +49,7 @@ function parse (clean) { if (t.solution.length) { solution = t.solution[0]; } // 1. map the output of the parser in to parsed_text - let parsed_text = {}; + let parsed_text = { subject: undefined }; solution.pair.forEach(p => { let field = p.classification.label; @@ -107,7 +107,8 @@ function parse (clean) { // handle the case where 'parsed_text' is completely empty // ie. 
the parser was not able to classify anything at all // note: this is common for venue names - if (Object.keys(parsed_text).length === 0) { + // note: length == 1 accounts for 'subject' + if (Object.keys(parsed_text).length === 1) { if (prefix.length && !postfix.length) { // if the prefix contains a comma // then only use the first part for the prefix for the @@ -124,7 +125,44 @@ function parse (clean) { // 3. store the unparsed characters in fields which can be used for querying if (prefix.length) { parsed_text.name = prefix; } - if (postfix.length) { parsed_text.admin_parts = postfix; } + if (postfix.length) { parsed_text.admin = postfix; } + + // 4. set 'subject', this is the text which will target the 'name.*' + // fields in elasticsearch queries + + // an address query + if (!_.isEmpty(parsed_text.housenumber) && !_.isEmpty(parsed_text.street)) { + parsed_text.subject = `${parsed_text.housenumber} ${parsed_text.street}`; + } + // a street query + else if (!_.isEmpty(parsed_text.street)) { + parsed_text.subject = parsed_text.street; + } + // query with a name such as a venue query + else if (!_.isEmpty(parsed_text.name)){ + parsed_text.subject = parsed_text.name; + } + // a postcode query + else if (!_.isEmpty(parsed_text.postcode)) { + parsed_text.subject = parsed_text.postcode; + } + // a locality query + else if (!_.isEmpty(parsed_text.locality)) { + parsed_text.subject = parsed_text.locality; + } + // a region query + else if (!_.isEmpty(parsed_text.region)) { + parsed_text.subject = parsed_text.region; + } + // a country query + else if (!_.isEmpty(parsed_text.country)) { + parsed_text.subject = parsed_text.country; + } + + // unknown query type + else { + parsed_text.subject = t.span.body; + } return parsed_text; } diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index cc02e42a1..4dbfbd9a9 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -30,21 +30,8 @@ function _sanitize( raw, clean ){ inputParserRanSuccessfully = true; 
// parsed_text.name is set, this is the highest priority, use this string - if( _.has(clean.parsed_text, 'name') ){ - text = clean.parsed_text.name; // use this string instead - } - - // else handle the case where parsed_text.street was produced but - // no parsed_text.name is produced. - // additionally, handle the case where parsed_text.housenumber is present - // note: the addressit module may also produce parsed_text.unit info - // for now, we discard that information as we don't have an appropriate - else if( _.has(clean.parsed_text, 'street') ){ - text = [ - clean.parsed_text.housenumber, - clean.parsed_text.street - ].filter((el) => el) - .join(' '); // remove empty elements + if( _.has(clean.parsed_text, 'subject') ){ + text = clean.parsed_text.subject; // use this string instead } } diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index 6b324c898..fee51406b 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -56,8 +56,9 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'one two, three', parsed_text: { + subject: 'one two', name: 'one two', - admin_parts: 'three' + admin: 'three' }, tokens: ['one','two'], tokens_complete: ['one','two'], @@ -257,9 +258,10 @@ module.exports.tests.query = function(test, common) { var query = generate({ text: 'k road, laird', parsed_text: { + subject: 'k road', street: 'k road', locality: 'laird', - admin_parts: 'laird' + admin: 'laird' }, tokens: ['k', 'road'], tokens_complete: ['k', 'road'], diff --git a/test/unit/query/search_addressit.js b/test/unit/query/search_addressit.js index 401d282e2..88b7a0655 100644 --- a/test/unit/query/search_addressit.js +++ b/test/unit/query/search_addressit.js @@ -106,12 +106,13 @@ module.exports.tests.query = function(test, common) { layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, parsed_text: { + subject: '123 main st', 
housenumber: '123', street: 'main st', region: 'new york', locality: 'ny', postcode: '10010', - admin_parts: 'new york ny US' + admin: 'new york ny US' } }); @@ -128,9 +129,10 @@ module.exports.tests.query = function(test, common) { layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, parsed_text: { + subject: 'soho grand', name: 'soho grand', region: 'new york', - admin_parts: 'new york' + admin: 'new york' } }); @@ -147,11 +149,12 @@ module.exports.tests.query = function(test, common) { layers: [ 'address', 'venue', 'country', 'region', 'county', 'neighbourhood', 'locality', 'localadmin' ], querySize: 10, parsed_text: { + subject: '1 water st', housenumber: '1', street: 'water st', locality: 'manhattan', region: 'ny', - admin_parts: 'manhattan ny' + admin: 'manhattan ny' } }); diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index 9a40ef7fc..3a0fd3ea1 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -20,14 +20,14 @@ module.exports.tests.text_parser = function (test, common) { }); var usQueries = [ - { name: 'soho', admin_parts: 'new york', region: 'NY' }, - { name: '123 main', admin_parts: 'new york', region: 'NY' } + { name: 'soho', admin: 'new york', region: 'NY' }, + { name: '123 main', admin: 'new york', region: 'NY' } ]; usQueries.forEach(function (query) { test('naive parsing ' + query, function (t) { var raw = { - text: query.name + ', ' + query.admin_parts + text: query.name + ', ' + query.admin }; var clean = {}; @@ -35,9 +35,10 @@ module.exports.tests.text_parser = function (test, common) { text: raw.text.trim(), parser: 'pelias', parsed_text: { + subject: query.name, name: query.name, - region: query.admin_parts, - admin_parts: query.admin_parts + locality: query.admin, + admin: query.admin } }; @@ -51,7 +52,7 @@ module.exports.tests.text_parser = function (test, common) { 
test('naive parsing ' + query + ' without spaces', function (t) { var raw = { - text: query.name + ',' + query.admin_parts + text: query.name + ',' + query.admin }; var clean = {}; @@ -59,9 +60,10 @@ module.exports.tests.text_parser = function (test, common) { text: raw.text.trim(), parser: 'pelias', parsed_text: { + subject: query.name, name: query.name, - region: query.admin_parts, - admin_parts: query.admin_parts + locality: query.admin, + admin: query.admin } }; @@ -75,7 +77,7 @@ module.exports.tests.text_parser = function (test, common) { test('naive parsing ' + query + ' with leading and trailing junk', function (t) { var raw = { - text: ' , ' + query.name + ',' + query.admin_parts + ' , ' + text: ' , ' + query.name + ',' + query.admin + ' , ' }; var clean = {}; @@ -83,9 +85,10 @@ module.exports.tests.text_parser = function (test, common) { text: raw.text.trim(), parser: 'pelias', parsed_text: { + subject: query.name, name: query.name, - region: query.admin_parts, - admin_parts: query.admin_parts + locality: query.admin, + admin: query.admin } }; @@ -99,22 +102,23 @@ module.exports.tests.text_parser = function (test, common) { }); var nonUSQueries = [ - { name: 'chelsea', admin_parts: 'london' }, + { name: 'chelsea', admin: 'london' }, ]; nonUSQueries.forEach(function (query) { test('naive parsing ' + query, function (t) { var raw = { - text: query.name + ', ' + query.admin_parts + text: query.name + ', ' + query.admin }; var clean = {}; var expected_clean = { - text: query.name + ', ' + query.admin_parts, + text: query.name + ', ' + query.admin, parser: 'pelias', parsed_text: { + subject: query.name, locality: query.name, - admin_parts: query.name + ', ' + query.admin_parts + admin: query.name + ', ' + query.admin } }; @@ -128,16 +132,17 @@ module.exports.tests.text_parser = function (test, common) { test('naive parsing ' + query + ' without spaces', function (t) { var raw = { - text: query.name + ',' + query.admin_parts + text: query.name + ',' + 
query.admin }; var clean = {}; var expected_clean = { - text: query.name + ',' + query.admin_parts, + text: query.name + ',' + query.admin, parser: 'pelias', parsed_text: { + subject: query.name, locality: query.name, - admin_parts: query.name + ', ' + query.admin_parts + admin: query.name + ', ' + query.admin } }; @@ -162,6 +167,7 @@ module.exports.tests.text_parser = function (test, common) { parser: 'pelias', text: 'yugolsavia', parsed_text: { + subject: 'yugolsavia', name: 'yugolsavia' } }; @@ -185,6 +191,7 @@ module.exports.tests.text_parser = function (test, common) { parser: 'pelias', text: 'small town', parsed_text: { + subject: 'small town', name: 'small town' } }; @@ -208,6 +215,7 @@ module.exports.tests.text_parser = function (test, common) { parser: 'pelias', text: '123 main', parsed_text: { + subject: '123 main', name: '123 main' } }; @@ -231,6 +239,7 @@ module.exports.tests.text_parser = function (test, common) { parser: 'pelias', text: 'main 123', parsed_text: { + subject: 'main 123', name: 'main 123' } }; @@ -254,9 +263,10 @@ module.exports.tests.text_parser = function (test, common) { text: 'main particle new york', parser: 'pelias', parsed_text: { + subject: 'main particle', name: 'main particle', - region: 'new york', - admin_parts: 'new york' + locality: 'new york', + admin: 'new york' } }; @@ -278,11 +288,12 @@ module.exports.tests.text_parser = function (test, common) { text: '123 main st new york ny', parser: 'pelias', parsed_text: { + subject: '123 main st', housenumber: '123', street: 'main st', - region: 'new york', - locality: 'ny', - admin_parts: 'new york ny' + locality: 'new york', + region: 'ny', + admin: 'new york ny' } }; @@ -304,12 +315,13 @@ module.exports.tests.text_parser = function (test, common) { text: '123 main st new york ny 10010', parser: 'pelias', parsed_text: { + subject: '123 main st', housenumber: '123', street: 'main st', - region: 'new york', - locality: 'ny', + locality: 'new york', + region: 'ny', postcode: 
'10010', - admin_parts: 'new york ny' + admin: 'new york ny' } }; @@ -330,11 +342,12 @@ module.exports.tests.text_parser = function (test, common) { text: '339 W Main St, Cheshire, 06410', parser: 'pelias', parsed_text: { + subject: '339 W Main St', housenumber: '339', street: 'W Main St', + locality: 'Cheshire', postcode: '06410', - region: 'Cheshire', - admin_parts: 'Cheshire' + admin: 'Cheshire' } }; @@ -355,11 +368,12 @@ module.exports.tests.text_parser = function (test, common) { text: '339 W Main St,Lancaster,PA', parser: 'pelias', parsed_text: { + subject: '339 W Main St', housenumber: '339', street: 'W Main St', locality: 'Lancaster', region: 'PA', - admin_parts: 'Lancaster, PA' + admin: 'Lancaster, PA' } }; @@ -396,8 +410,9 @@ module.exports.tests.text_parser = function (test, common) { const raw = { text: 'NSW' }; const clean = {}; const expected_clean = { text: 'NSW', parser: 'pelias', parsed_text: { + subject: 'NSW', region: 'NSW', - admin_parts: 'NSW' + admin: 'NSW' }}; const messages = sanitizer.sanitize(raw, clean); diff --git a/test/unit/sanitizer/_tokenizer.js b/test/unit/sanitizer/_tokenizer.js index 181a32106..f02de49f7 100644 --- a/test/unit/sanitizer/_tokenizer.js +++ b/test/unit/sanitizer/_tokenizer.js @@ -167,7 +167,7 @@ module.exports.tests.sanity_checks = function(test, common) { }); test('favor clean.parsed_text.name over clean.text', function(t) { - var clean = { parsed_text: { name: 'foo' }, text: 'bar' }; + var clean = { parsed_text: { subject: 'foo' }, text: 'bar' }; var messages = sanitizer.sanitize({}, clean); // favor clean.parsed_text.name over clean.text @@ -183,7 +183,9 @@ module.exports.tests.sanity_checks = function(test, common) { }); test('favor clean.parsed_text street data over clean.text', function(t) { - var clean = { parsed_text: { housenumber: '190', street: 'foo st' }, text: 'bar' }; + var clean = { parsed_text: { + housenumber: '190', street: 'foo st', subject: '190 foo st' + }, text: 'bar' }; var messages = 
sanitizer.sanitize({}, clean); // favor clean.parsed_text.name over clean.text @@ -199,7 +201,9 @@ module.exports.tests.sanity_checks = function(test, common) { }); test('favor clean.parsed_text.name over clean.parsed_text street data', function(t) { - var clean = { parsed_text: { housenumber: '190', street: 'foo st', name: 'foo' }, text: 'bar' }; + var clean = { parsed_text: { + housenumber: '190', street: 'foo st', subject: 'foo' + }, text: 'bar' }; var messages = sanitizer.sanitize({}, clean); // favor clean.parsed_text.name over all other variables From 10c98890291fb42a44307b84faea22cc39432003 Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 3 May 2019 15:44:19 +0200 Subject: [PATCH 04/55] feat(parser): bump pelias/parser version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 0ac55677a..f76b35ca8 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.3.0", + "pelias-parser": "^1.4.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From 559fdb041d78237d0663f10bf74dd1bde2f11817 Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 13 May 2019 14:35:03 +0200 Subject: [PATCH 05/55] feat(parser): bump pelias/parser version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index f76b35ca8..bc8cf457e 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.4.0", + "pelias-parser": "^1.9.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From 30a62866828a3e840ba8c21014dea876d05ab019 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 14 May 2019 15:58:13 +0200 Subject: [PATCH 06/55] feat(parser): bump pelias/parser version --- 
package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index bc8cf457e..382e43f1b 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.9.0", + "pelias-parser": "^1.12.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From b24bf31ee9e2463a02630eeb05519757173b7a9a Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 14 May 2019 18:01:02 +0200 Subject: [PATCH 07/55] feat(parser): bump pelias/parser version --- package.json | 2 +- sanitizer/_text_pelias_parser.js | 8 +- test/unit/sanitizer/_text_pelias_parser.js | 523 ++++++--------------- 3 files changed, 151 insertions(+), 382 deletions(-) diff --git a/package.json b/package.json index 382e43f1b..ca0a0d883 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.12.0", + "pelias-parser": "^1.13.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index c2426480d..607d1aefb 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -100,10 +100,6 @@ function parse (clean) { prefix = prefix.split(/[,\n\t]/).join(', '); postfix = postfix.split(/[,\n\t]/).join(', '); - // squash multiple adjacent whitespace characters into a single space - prefix = prefix.replace(/\s\s+/g, ' ').trim(); - postfix = postfix.replace(/\s\s+/g, ' ').trim(); - // handle the case where 'parsed_text' is completely empty // ie. 
the parser was not able to classify anything at all // note: this is common for venue names @@ -123,6 +119,10 @@ function parse (clean) { } } + // squash multiple adjacent whitespace characters into a single space + prefix = prefix.replace(/\s+/g, ' ').trim(); + postfix = postfix.replace(/\s+/g, ' ').trim(); + // 3. store the unparsed characters in fields which can be used for querying if (prefix.length) { parsed_text.name = prefix; } if (postfix.length) { parsed_text.admin = postfix; } diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index 3a0fd3ea1..e49dd60b1 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -19,370 +19,155 @@ module.exports.tests.text_parser = function (test, common) { t.end(); }); - var usQueries = [ - { name: 'soho', admin: 'new york', region: 'NY' }, - { name: '123 main', admin: 'new york', region: 'NY' } - ]; - - usQueries.forEach(function (query) { - test('naive parsing ' + query, function (t) { - var raw = { - text: query.name + ', ' + query.admin - }; - var clean = {}; - - var expected_clean = { - text: raw.text.trim(), - parser: 'pelias', - parsed_text: { - subject: query.name, - name: query.name, - locality: query.admin, - admin: query.admin - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('naive parsing ' + query + ' without spaces', function (t) { - var raw = { - text: query.name + ',' + query.admin - }; - var clean = {}; - - var expected_clean = { - text: raw.text.trim(), - parser: 'pelias', - parsed_text: { - subject: query.name, - name: query.name, - locality: query.admin, - admin: query.admin - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('naive parsing ' + 
query + ' with leading and trailing junk', function (t) { - var raw = { - text: ' , ' + query.name + ',' + query.admin + ' , ' - }; - var clean = {}; - - var expected_clean = { - text: raw.text.trim(), - parser: 'pelias', - parsed_text: { - subject: query.name, - name: query.name, - locality: query.admin, - admin: query.admin - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - }); - - var nonUSQueries = [ - { name: 'chelsea', admin: 'london' }, - ]; - - nonUSQueries.forEach(function (query) { - test('naive parsing ' + query, function (t) { - var raw = { - text: query.name + ', ' + query.admin - }; - var clean = {}; - - var expected_clean = { - text: query.name + ', ' + query.admin, - parser: 'pelias', - parsed_text: { - subject: query.name, - locality: query.name, - admin: query.name + ', ' + query.admin - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('naive parsing ' + query + ' without spaces', function (t) { - var raw = { - text: query.name + ',' + query.admin - }; - var clean = {}; - - var expected_clean = { - text: query.name + ',' + query.admin, - parser: 'pelias', - parsed_text: { - subject: query.name, - locality: query.name, - admin: query.name + ', ' + query.admin + let cases = []; + + // USA queries + cases.push(['soho, new york, NY', { + subject: 'soho', + name: 'soho', + locality: 'new york', + region: 'NY', + admin: 'new york, NY' + }]); + cases.push(['123 main st, new york, NY', { + subject: '123 main st', + housenumber: '123', + street: 'main st', + locality: 'new york', + region: 'NY', + admin: 'new york, NY' + }]); + + // GBR queries + cases.push(['chelsea, london', { + subject: 'chelsea', + locality: 'chelsea', + admin: 'chelsea, london' + }]); + + // Query with one token + 
cases.push(['yugolsavia', { + subject: 'yugolsavia', + name: 'yugolsavia' + }]); + + // Query with two tokens, no numbers + cases.push(['small town', { + subject: 'small town', + name: 'small town' + }]); + + // Query with two tokens, number first + cases.push(['123 main', { + subject: '123 main', + name: '123 main' + }]); + + // Query with two tokens, number second + cases.push(['main 123', { + subject: 'main 123', + name: 'main 123' + }]); + + // Query with many tokens + cases.push(['main particle new york', { + subject: 'main particle', + name: 'main particle', + locality: 'new york', + admin: 'new york' + }]); + + // Valid address with housenumber + cases.push(['123 main st new york ny', { + subject: '123 main st', + housenumber: '123', + street: 'main st', + locality: 'new york', + region: 'ny', + admin: 'new york ny' + }]); + + // Valid address with postcode + cases.push(['123 main st new york ny 10010', { + subject: '123 main st', + housenumber: '123', + street: 'main st', + locality: 'new york', + region: 'ny', + postcode: '10010', + admin: 'new york ny' + }]); + + // Valid address with leading 0 in postcode + cases.push(['339 W Main St, Cheshire, 06410', { + subject: '339 W Main St', + housenumber: '339', + street: 'W Main St', + locality: 'Cheshire', + postcode: '06410', + admin: 'Cheshire' + }]); + + // Valid address with no spaces after comma + cases.push(['339 W Main St,Lancaster,PA', { + subject: '339 W Main St', + housenumber: '339', + street: 'W Main St', + locality: 'Lancaster', + region: 'PA', + admin: 'Lancaster, PA' + }]); + + // Valid address without commas + cases.push(['123 main st new york ny', { + subject: '123 main st', + housenumber: '123', + street: 'main st', + locality: 'new york', + region: 'ny', + admin: 'new york ny' + }]); + + // AUS - state only + cases.push(['NSW', { + subject: 'NSW', + region: 'NSW', + admin: 'NSW' + }]); + + cases.forEach(testcase => { + let input = testcase[0]; + let expected = testcase[1]; + + function 
assert(label, replacement, replaceAdmin) { + let text = input.replace(/\s+/, ' '); + let clone = Object.assign({}, expected); + if (Array.isArray(replacement) && replacement.length === 2) { + text = text.replace(replacement[0], replacement[1]); + if (replaceAdmin === true && clone.admin) { + clone.admin = clone.admin.replace(replacement[0], replacement[1]).trim(); } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - }); - - test('query with one token', function (t) { - var raw = { - text: 'yugolsavia' - }; - var clean = {}; - clean.parsed_text = 'this should be removed'; - - var expected_clean = { - parser: 'pelias', - text: 'yugolsavia', - parsed_text: { - subject: 'yugolsavia', - name: 'yugolsavia' - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('query with two tokens, no numbers', function (t) { - var raw = { - text: 'small town' - }; - var clean = {}; - clean.parsed_text = 'this should be removed'; - - var expected_clean = { - parser: 'pelias', - text: 'small town', - parsed_text: { - subject: 'small town', - name: 'small town' - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('query with two tokens, number first', function (t) { - var raw = { - text: '123 main' - }; - var clean = {}; - clean.parsed_text = 'this should be removed'; - - var expected_clean = { - parser: 'pelias', - text: '123 main', - parsed_text: { - subject: '123 main', - name: '123 main' - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('query with two tokens, number second', 
function (t) { - var raw = { - text: 'main 123' - }; - var clean = {}; - clean.parsed_text = 'this should be removed'; - - var expected_clean = { - parser: 'pelias', - text: 'main 123', - parsed_text: { - subject: 'main 123', - name: 'main 123' - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('query with many tokens', function (t) { - var raw = { - text: 'main particle new york' - }; - var clean = {}; - clean.parsed_text = 'this should be removed'; - - var expected_clean = { - text: 'main particle new york', - parser: 'pelias', - parsed_text: { - subject: 'main particle', - name: 'main particle', - locality: 'new york', - admin: 'new york' - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('valid address, house number', function (t) { - var raw = { - text: '123 main st new york ny' - }; - var clean = {}; - - var expected_clean = { - text: '123 main st new york ny', - parser: 'pelias', - parsed_text: { - subject: '123 main st', - housenumber: '123', - street: 'main st', - locality: 'new york', - region: 'ny', - admin: 'new york ny' - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('valid address, zipcode', function (t) { - var raw = { - text: '123 main st new york ny 10010' - }; - var clean = {}; - - var expected_clean = { - text: '123 main st new york ny 10010', - parser: 'pelias', - parsed_text: { - subject: '123 main st', - housenumber: '123', - street: 'main st', - locality: 'new york', - region: 'ny', - postcode: '10010', - admin: 'new york ny' - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - 
t.deepEqual(clean, expected_clean); - t.end(); - }); - - test('valid address with leading 0s in zipcode', function (t) { - var raw = { - text: '339 W Main St, Cheshire, 06410' - }; - var clean = {}; - - var expected_clean = { - text: '339 W Main St, Cheshire, 06410', - parser: 'pelias', - parsed_text: { - subject: '339 W Main St', - housenumber: '339', - street: 'W Main St', - locality: 'Cheshire', - postcode: '06410', - admin: 'Cheshire' } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - }); - - test('valid address without spaces after commas', function (t) { - var raw = { - text: '339 W Main St,Lancaster,PA' - }; - var clean = {}; - - var expected_clean = { - text: '339 W Main St,Lancaster,PA', - parser: 'pelias', - parsed_text: { - subject: '339 W Main St', - housenumber: '339', - street: 'W Main St', - locality: 'Lancaster', - region: 'PA', - admin: 'Lancaster, PA' + if (clone.admin) { + clone.admin = clone.admin.replace(/\s+/g, ' ').trim(); } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] }); - t.deepEqual(clean, expected_clean); - t.end(); - + test(`${label}: ${text}`, t => { + let raw = { text: text }; + let clean = { parsed_text: 'this should be removed' }; + let messages = sanitizer.sanitize(raw, clean); + + t.deepEqual(messages, { errors: [], warnings: [] }, 'messages'); + t.equal(clean.text, raw.text.trim(), 'text'); + t.equal(clean.parser, 'pelias', 'parser'); + t.deepEqual(clean.parsed_text, clone, `${label}: ${text}`); + t.end(); + }); + } + + assert('literal'); + assert('no commas', [/,/g, ' '], true); + assert('no space after comma', [/,\s+/g, ',']); + assert('leading and trailing junk', [/^(.+)$/g, ' , $1 , ']); }); test('whitespace-only input counts as empty', (t) => { @@ -405,22 +190,6 @@ module.exports.tests.text_parser = function (test, common) { 
t.deepEquals(validParameters, expected); t.end(); }); - - test('Australia - state only', (t) => { - const raw = { text: 'NSW' }; - const clean = {}; - const expected_clean = { text: 'NSW', parser: 'pelias', parsed_text: { - subject: 'NSW', - region: 'NSW', - admin: 'NSW' - }}; - const messages = sanitizer.sanitize(raw, clean); - - t.deepEquals(clean, expected_clean); - t.deepEquals(messages.errors, []); - t.deepEquals(messages.warnings, [], 'no warnings'); - t.end(); - }); }; module.exports.all = function (tape, common) { From 2d82d38a6a9abefa8b75780cc1b98af82c7f9fae Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 15 May 2019 13:19:01 +0200 Subject: [PATCH 08/55] feat(parser): updates to tokenizer sanitizer --- sanitizer/_tokenizer.js | 13 +++++--- test/unit/sanitizer/_text_pelias_parser.js | 1 + test/unit/sanitizer/_tokenizer.js | 36 ++++++---------------- 3 files changed, 20 insertions(+), 30 deletions(-) diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index 4dbfbd9a9..c3205ed72 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -22,16 +22,21 @@ function _sanitize( raw, clean ){ var text = clean.text; // a boolean to track whether the input parser successfully ran; or not. - var inputParserRanSuccessfully = false; + var parserConsumedAllTokens = false; // if the text parser has run then we only tokenize the 'name' section // of the 'parsed_text' object, ignoring the 'admin' parts. if( _.isPlainObject(clean, 'parsed_text') && !_.isEmpty(clean.parsed_text) ) { - inputParserRanSuccessfully = true; - // parsed_text.name is set, this is the highest priority, use this string if( _.has(clean.parsed_text, 'subject') ){ text = clean.parsed_text.subject; // use this string instead + + // when both housenumber and street fields are present then the pelias parser + // will simply set $subject to be a concatination of these fields. 
+ // in this case we can be sure that all tokens were complete + if (_.has(clean.parsed_text, 'housenumber') && _.has(clean.parsed_text, 'street')){ + parserConsumedAllTokens = true; + } } } @@ -66,7 +71,7 @@ function _sanitize( raw, clean ){ if( clean.tokens.length ){ // if all the tokens are complete, simply copy them from clean.tokens - if( inputParserRanSuccessfully ){ + if( parserConsumedAllTokens ){ // all these tokens are complete! clean.tokens_complete = clean.tokens.slice(); diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index e49dd60b1..ddc32bee2 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -29,6 +29,7 @@ module.exports.tests.text_parser = function (test, common) { region: 'NY', admin: 'new york, NY' }]); + cases.push(['123 main st, new york, NY', { subject: '123 main st', housenumber: '123', diff --git a/test/unit/sanitizer/_tokenizer.js b/test/unit/sanitizer/_tokenizer.js index f02de49f7..a0eb2371e 100644 --- a/test/unit/sanitizer/_tokenizer.js +++ b/test/unit/sanitizer/_tokenizer.js @@ -149,7 +149,7 @@ module.exports.tests.sanity_checks = function(test, common) { t.end(); }); - test('clean.parsed_text set but clean.parsed_text.name invalid', function(t) { + test('clean.parsed_text set but clean.parsed_text.subject invalid', function(t) { var clean = { parsed_text: { text: {} } }; var messages = sanitizer.sanitize({}, clean); @@ -165,15 +165,15 @@ module.exports.tests.sanity_checks = function(test, common) { t.end(); }); - test('favor clean.parsed_text.name over clean.text', function(t) { + test('favor clean.parsed_text.subject over clean.text', function(t) { var clean = { parsed_text: { subject: 'foo' }, text: 'bar' }; var messages = sanitizer.sanitize({}, clean); - // favor clean.parsed_text.name over clean.text - t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); - t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use 
clean.parsed_text.name'); - t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + // favor clean.parsed_text.subject over clean.text + t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.subject'); + t.deepEquals(clean.tokens_complete, [], 'complete'); + t.deepEquals(clean.tokens_incomplete, [ 'foo' ], 'incomplete'); // no errors/warnings produced t.deepEquals(messages.errors, [], 'no errors'); @@ -184,11 +184,13 @@ module.exports.tests.sanity_checks = function(test, common) { test('favor clean.parsed_text street data over clean.text', function(t) { var clean = { parsed_text: { - housenumber: '190', street: 'foo st', subject: '190 foo st' + subject: '190 foo st', + housenumber: '190', + street: 'foo st' }, text: 'bar' }; var messages = sanitizer.sanitize({}, clean); - // favor clean.parsed_text.name over clean.text + // favor clean.parsed_text.subject over clean.text t.deepEquals(clean.tokens, [ '190', 'foo', 'st' ], 'use street name + housenumber'); t.deepEquals(clean.tokens_complete, [ '190', 'foo', 'st' ], 'use street name + housenumber'); t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); @@ -197,24 +199,6 @@ module.exports.tests.sanity_checks = function(test, common) { t.deepEquals(messages.errors, [], 'no errors'); t.deepEquals(messages.warnings, [], 'no warnings'); - t.end(); - }); - test('favor clean.parsed_text.name over clean.parsed_text street data', function(t) { - - var clean = { parsed_text: { - housenumber: '190', street: 'foo st', subject: 'foo' - }, text: 'bar' }; - var messages = sanitizer.sanitize({}, clean); - - // favor clean.parsed_text.name over all other variables - t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.name'); - t.deepEquals(clean.tokens_complete, [ 'foo' ], 'use clean.parsed_text.name'); - t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); - - // no errors/warnings produced - t.deepEquals(messages.errors, [], 'no errors'); - t.deepEquals(messages.warnings, [], 'no warnings'); - t.end(); }); }; 
From 4de4983f2c810fcd007eca3f5d31d635ed00f7fb Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 15 May 2019 13:20:03 +0200 Subject: [PATCH 09/55] typo --- sanitizer/_tokenizer.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index c3205ed72..4017576e5 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -24,10 +24,10 @@ function _sanitize( raw, clean ){ // a boolean to track whether the input parser successfully ran; or not. var parserConsumedAllTokens = false; - // if the text parser has run then we only tokenize the 'name' section + // if the text parser has run then we only tokenize the 'subject' section // of the 'parsed_text' object, ignoring the 'admin' parts. if( _.isPlainObject(clean, 'parsed_text') && !_.isEmpty(clean.parsed_text) ) { - // parsed_text.name is set, this is the highest priority, use this string + // parsed_text.subject is set, this is the highest priority, use this string if( _.has(clean.parsed_text, 'subject') ){ text = clean.parsed_text.subject; // use this string instead From 88e2390aaa4961e57a9458bfda73d1d43ba154c2 Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 15 May 2019 13:34:28 +0200 Subject: [PATCH 10/55] feat(parser): stricter tokenization of exact matching admin queries --- sanitizer/_tokenizer.js | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index 4017576e5..e1cec8f91 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -37,6 +37,14 @@ function _sanitize( raw, clean ){ if (_.has(clean.parsed_text, 'housenumber') && _.has(clean.parsed_text, 'street')){ parserConsumedAllTokens = true; } + + // when $subject exactly equals one of the admin fields + else if ( + text === clean.parsed_text.locality || + text === clean.parsed_text.region || + text === clean.parsed_text.country) { + parserConsumedAllTokens = true; + } } } From 
308df52a91b1893a9ab3cf05ba0f1b0e7f175504 Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 15 May 2019 14:51:07 +0200 Subject: [PATCH 11/55] feat(parser): switch to using multi_match for admin subqueries --- query/autocomplete.js | 15 ++-- .../autocomplete_linguistic_with_admin.js | 90 +++---------------- .../autocomplete_single_character_street.js | 86 ++++-------------- 3 files changed, 34 insertions(+), 157 deletions(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index 446e5e3fe..5d786d733 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -4,6 +4,7 @@ const textParser = require('./text_parser_pelias'); const check = require('check-types'); const logger = require('pelias-logger').get('api'); const config = require('pelias-config').generate(); +const placeTypes = require('../helper/placeTypes'); // additional views (these may be merged in to pelias/query at a later date) var views = { @@ -17,6 +18,10 @@ var views = { focus_point_filter: require('./view/focus_point_distance_filter') }; +// region_a is also an admin field. pelias/parser tries to detect +// region_a, in which case we use a match query specifically for it. 
+var adminFields = placeTypes.concat(['region_a']); + //------------------------------ // autocomplete query //------------------------------ @@ -32,15 +37,7 @@ query.score( peliasQuery.view.address('street') ); query.score( peliasQuery.view.address('postcode') ); // admin components -query.score( peliasQuery.view.admin('country') ); -query.score( peliasQuery.view.admin('country_a') ); -query.score( peliasQuery.view.admin('region') ); -query.score( peliasQuery.view.admin('region_a') ); -query.score( peliasQuery.view.admin('county') ); -query.score( peliasQuery.view.admin('borough') ); -query.score( peliasQuery.view.admin('localadmin') ); -query.score( peliasQuery.view.admin('locality') ); -query.score( peliasQuery.view.admin('neighbourhood') ); +query.score( peliasQuery.view.admin_multi_match(adminFields, 'peliasAdmin') ); // scoring boost query.score( views.boost_exact_matches ); diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 54437fa07..412711ecc 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -17,83 +17,19 @@ module.exports = { ], 'should': [ { - 'match': { - 'parent.country.ngram': { - 'analyzer': 'peliasAdmin', - 'boost': 800, - 'cutoff_frequency': 0.01, - 'query': 'three' - } - } - }, - { - 'match': { - 'parent.region.ngram': { - 'analyzer': 'peliasAdmin', - 'cutoff_frequency': 0.01, - 'boost': 600, - 'query': 'three' - } - } - }, - { - 'match': { - 'parent.region_a.ngram': { - 'analyzer': 'peliasAdmin', - 'cutoff_frequency': 0.01, - 'boost': 600, - 'query': 'three' - } - } - }, - { - 'match': { - 'parent.county.ngram': { - 'analyzer': 'peliasAdmin', - 'cutoff_frequency': 0.01, - 'boost': 400, - 'query': 'three' - } - } - }, - { - 'match': { - 'parent.borough.ngram': { - 'analyzer': 'peliasAdmin', - 'cutoff_frequency': 0.01, - 'boost': 600, - 'query': 'three' - } - } - }, - { - 'match': { - 
'parent.localadmin.ngram': { - 'analyzer': 'peliasAdmin', - 'cutoff_frequency': 0.01, - 'boost': 200, - 'query': 'three' - } - } - }, - { - 'match': { - 'parent.locality.ngram': { - 'analyzer': 'peliasAdmin', - 'cutoff_frequency': 0.01, - 'boost': 200, - 'query': 'three' - } - } - }, - { - 'match': { - 'parent.neighbourhood.ngram': { - 'analyzer': 'peliasAdmin', - 'cutoff_frequency': 0.01, - 'boost': 200, - 'query': 'three' - } + 'multi_match': { + 'fields': [ + 'parent.country.ngram^800', + 'parent.region.ngram^600', + 'parent.county.ngram^400', + 'parent.localadmin.ngram^200', + 'parent.locality.ngram^200', + 'parent.borough.ngram^600', + 'parent.neighbourhood.ngram^200', + 'parent.region_a.ngram^600' + ], + 'query': 'three', + 'analyzer': 'peliasAdmin' } }, { diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index a625e5e04..756ccb0f3 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -23,77 +23,21 @@ module.exports = { 'analyzer': 'peliasStreet' } } - }, { - 'match': { - 'parent.country.ngram': { - 'query': 'laird', - 'cutoff_frequency': 0.01, - 'boost': 800, - 'analyzer': 'peliasAdmin' - } - } - }, { - 'match': { - 'parent.region.ngram': { - 'query': 'laird', - 'cutoff_frequency': 0.01, - 'boost': 600, - 'analyzer': 'peliasAdmin' - } - } - }, { - 'match': { - 'parent.region_a.ngram': { - 'query': 'laird', - 'cutoff_frequency': 0.01, - 'boost': 600, - 'analyzer': 'peliasAdmin' - } - } - }, { - 'match': { - 'parent.county.ngram': { - 'query': 'laird', - 'cutoff_frequency': 0.01, - 'boost': 400, - 'analyzer': 'peliasAdmin' - } - } - }, { - 'match': { - 'parent.borough.ngram': { - 'analyzer': 'peliasAdmin', - 'cutoff_frequency': 0.01, - 'boost': 600, - 'query': 'laird' - } - } - }, { - 'match': { - 'parent.localadmin.ngram': { - 'query': 'laird', - 'cutoff_frequency': 0.01, - 'boost': 200, - 
'analyzer': 'peliasAdmin' - } - } - }, { - 'match': { - 'parent.locality.ngram': { - 'query': 'laird', - 'cutoff_frequency': 0.01, - 'boost': 200, - 'analyzer': 'peliasAdmin' - } - } - }, { - 'match': { - 'parent.neighbourhood.ngram': { - 'query': 'laird', - 'cutoff_frequency': 0.01, - 'boost': 200, - 'analyzer': 'peliasAdmin' - } + }, + { + 'multi_match': { + 'fields': [ + 'parent.country.ngram^800', + 'parent.region.ngram^600', + 'parent.county.ngram^400', + 'parent.localadmin.ngram^200', + 'parent.locality.ngram^200', + 'parent.borough.ngram^600', + 'parent.neighbourhood.ngram^200', + 'parent.region_a.ngram^600' + ], + 'query': 'laird', + 'analyzer': 'peliasAdmin' } }, { From f5dcf3cb9ff19aae3499b238df51fee0a9fd773b Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 15 May 2019 16:14:53 +0200 Subject: [PATCH 12/55] feat(admin_subqueries): test cross_fields query --- query/autocomplete_defaults.js | 3 ++- test/unit/fixture/autocomplete_linguistic_with_admin.js | 4 +++- test/unit/fixture/autocomplete_single_character_street.js | 4 +++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index c9c3a9291..0609b550e 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -51,7 +51,8 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'address:postcode:boost': 2000, 'address:postcode:cutoff_frequency': 0.01, - // generic multi_match cutoff_frequency + // generic multi_match config + 'multi_match:type': 'cross_fields', 'multi_match:cutoff_frequency': 0.01, 'admin:country_a:analyzer': 'standard', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 412711ecc..cc5f18911 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -29,7 +29,9 @@ module.exports = { 'parent.region_a.ngram^600' ], 'query': 'three', - 
'analyzer': 'peliasAdmin' + 'analyzer': 'peliasAdmin', + 'type': 'cross_fields', + 'cutoff_frequency': 0.01 } }, { diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 756ccb0f3..61410114e 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -37,7 +37,9 @@ module.exports = { 'parent.region_a.ngram^600' ], 'query': 'laird', - 'analyzer': 'peliasAdmin' + 'analyzer': 'peliasAdmin', + 'type': 'cross_fields', + 'cutoff_frequency': 0.01 } }, { From 58f8171b4e149464d87019dd2592295732e82680 Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 15 May 2019 16:46:38 +0200 Subject: [PATCH 13/55] feat(admin_subqueries): test operator:and query --- query/autocomplete_defaults.js | 1 + test/unit/fixture/autocomplete_linguistic_with_admin.js | 1 + test/unit/fixture/autocomplete_single_character_street.js | 1 + 3 files changed, 3 insertions(+) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index 0609b550e..35aa6c80c 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -53,6 +53,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { // generic multi_match config 'multi_match:type': 'cross_fields', + 'multi_match:operator': 'and', 'multi_match:cutoff_frequency': 0.01, 'admin:country_a:analyzer': 'standard', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index cc5f18911..245d4f0f5 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -31,6 +31,7 @@ module.exports = { 'query': 'three', 'analyzer': 'peliasAdmin', 'type': 'cross_fields', + 'operator': 'and', 'cutoff_frequency': 0.01 } }, diff --git a/test/unit/fixture/autocomplete_single_character_street.js 
b/test/unit/fixture/autocomplete_single_character_street.js index 61410114e..6a0e4cd48 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -39,6 +39,7 @@ module.exports = { 'query': 'laird', 'analyzer': 'peliasAdmin', 'type': 'cross_fields', + 'operator': 'and', 'cutoff_frequency': 0.01 } }, From cd2f159d05aa13683c2cf191a90e1ceeb150012d Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 15 May 2019 16:53:14 +0200 Subject: [PATCH 14/55] feat(admin_subqueries): set all boosts to 1 --- query/autocomplete_defaults.js | 18 +++++++++--------- .../autocomplete_linguistic_with_admin.js | 16 ++++++++-------- .../autocomplete_single_character_street.js | 16 ++++++++-------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index 35aa6c80c..980c94dc9 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -58,47 +58,47 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:country_a:analyzer': 'standard', 'admin:country_a:field': 'parent.country_a.ngram', - 'admin:country_a:boost': 1000, + 'admin:country_a:boost': 1, 'admin:country_a:cutoff_frequency': 0.01, 'admin:country:analyzer': 'peliasAdmin', 'admin:country:field': 'parent.country.ngram', - 'admin:country:boost': 800, + 'admin:country:boost': 1, 'admin:country:cutoff_frequency': 0.01, 'admin:region:analyzer': 'peliasAdmin', 'admin:region:field': 'parent.region.ngram', - 'admin:region:boost': 600, + 'admin:region:boost': 1, 'admin:region:cutoff_frequency': 0.01, 'admin:region_a:analyzer': 'peliasAdmin', 'admin:region_a:field': 'parent.region_a.ngram', - 'admin:region_a:boost': 600, + 'admin:region_a:boost': 1, 'admin:region_a:cutoff_frequency': 0.01, 'admin:county:analyzer': 'peliasAdmin', 'admin:county:field': 'parent.county.ngram', - 'admin:county:boost': 400, + 'admin:county:boost': 1, 'admin:county:cutoff_frequency': 
0.01, 'admin:localadmin:analyzer': 'peliasAdmin', 'admin:localadmin:field': 'parent.localadmin.ngram', - 'admin:localadmin:boost': 200, + 'admin:localadmin:boost': 1, 'admin:localadmin:cutoff_frequency': 0.01, 'admin:locality:analyzer': 'peliasAdmin', 'admin:locality:field': 'parent.locality.ngram', - 'admin:locality:boost': 200, + 'admin:locality:boost': 1, 'admin:locality:cutoff_frequency': 0.01, 'admin:neighbourhood:analyzer': 'peliasAdmin', 'admin:neighbourhood:field': 'parent.neighbourhood.ngram', - 'admin:neighbourhood:boost': 200, + 'admin:neighbourhood:boost': 1, 'admin:neighbourhood:cutoff_frequency': 0.01, 'admin:borough:analyzer': 'peliasAdmin', 'admin:borough:field': 'parent.borough.ngram', - 'admin:borough:boost': 600, + 'admin:borough:boost': 1, 'admin:borough:cutoff_frequency': 0.01, 'popularity:field': 'popularity', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 245d4f0f5..75cfc6d17 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -19,14 +19,14 @@ module.exports = { { 'multi_match': { 'fields': [ - 'parent.country.ngram^800', - 'parent.region.ngram^600', - 'parent.county.ngram^400', - 'parent.localadmin.ngram^200', - 'parent.locality.ngram^200', - 'parent.borough.ngram^600', - 'parent.neighbourhood.ngram^200', - 'parent.region_a.ngram^600' + 'parent.country.ngram^1', + 'parent.region.ngram^1', + 'parent.county.ngram^1', + 'parent.localadmin.ngram^1', + 'parent.locality.ngram^1', + 'parent.borough.ngram^1', + 'parent.neighbourhood.ngram^1', + 'parent.region_a.ngram^1' ], 'query': 'three', 'analyzer': 'peliasAdmin', diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 6a0e4cd48..1f62be732 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ 
b/test/unit/fixture/autocomplete_single_character_street.js @@ -27,14 +27,14 @@ module.exports = { { 'multi_match': { 'fields': [ - 'parent.country.ngram^800', - 'parent.region.ngram^600', - 'parent.county.ngram^400', - 'parent.localadmin.ngram^200', - 'parent.locality.ngram^200', - 'parent.borough.ngram^600', - 'parent.neighbourhood.ngram^200', - 'parent.region_a.ngram^600' + 'parent.country.ngram^1', + 'parent.region.ngram^1', + 'parent.county.ngram^1', + 'parent.localadmin.ngram^1', + 'parent.locality.ngram^1', + 'parent.borough.ngram^1', + 'parent.neighbourhood.ngram^1', + 'parent.region_a.ngram^1' ], 'query': 'laird', 'analyzer': 'peliasAdmin', From 22d69aabf5e7a1edb022c552d143bf0455fa559c Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 15 May 2019 17:14:16 +0200 Subject: [PATCH 15/55] feat(admin_subqueries): add locality_a and country_a to multi_match --- query/autocomplete.js | 5 ++--- query/autocomplete_defaults.js | 5 +++++ query/text_parser_pelias.js | 1 + test/unit/fixture/autocomplete_linguistic_with_admin.js | 4 +++- test/unit/fixture/autocomplete_single_character_street.js | 4 +++- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index 5d786d733..a0e24db0b 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -18,9 +18,8 @@ var views = { focus_point_filter: require('./view/focus_point_distance_filter') }; -// region_a is also an admin field. pelias/parser tries to detect -// region_a, in which case we use a match query specifically for it. -var adminFields = placeTypes.concat(['region_a']); +// add abbrevations for the fields pelias/parser is able to detect. 
+var adminFields = placeTypes.concat(['locality_a', 'region_a', 'country_a']); //------------------------------ // autocomplete query diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index 980c94dc9..f0b835fc9 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -91,6 +91,11 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:locality:boost': 1, 'admin:locality:cutoff_frequency': 0.01, + 'admin:locality_a:analyzer': 'peliasAdmin', + 'admin:locality_a:field': 'parent.locality_a.ngram', + 'admin:locality_a:boost': 1, + 'admin:locality_a:cutoff_frequency': 0.01, + 'admin:neighbourhood:analyzer': 'peliasAdmin', 'admin:neighbourhood:field': 'parent.neighbourhood.ngram', 'admin:neighbourhood:boost': 1, diff --git a/query/text_parser_pelias.js b/query/text_parser_pelias.js index 779a3895b..141ad2df6 100644 --- a/query/text_parser_pelias.js +++ b/query/text_parser_pelias.js @@ -65,6 +65,7 @@ function addParsedVariablesToQueryVariables(clean, vs) { adminFields.forEach(key => { if (!vs.isset('input:' + key)) { vs.var('input:' + key, clean.parsed_text.admin); + vs.var('input:' + key + '_a', clean.parsed_text.admin); } }); } diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 75cfc6d17..9f37c2bac 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -26,7 +26,9 @@ module.exports = { 'parent.locality.ngram^1', 'parent.borough.ngram^1', 'parent.neighbourhood.ngram^1', - 'parent.region_a.ngram^1' + 'parent.locality_a.ngram^1', + 'parent.region_a.ngram^1', + 'parent.country_a.ngram^1' ], 'query': 'three', 'analyzer': 'peliasAdmin', diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 1f62be732..8d0b3b277 100644 --- 
a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -34,7 +34,9 @@ module.exports = { 'parent.locality.ngram^1', 'parent.borough.ngram^1', 'parent.neighbourhood.ngram^1', - 'parent.region_a.ngram^1' + 'parent.locality_a.ngram^1', + 'parent.region_a.ngram^1', + 'parent.country_a.ngram^1' ], 'query': 'laird', 'analyzer': 'peliasAdmin', From 79c5c4557403ab06c876493aaef5dcd389ad2a86 Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 15 May 2019 17:20:42 +0200 Subject: [PATCH 16/55] feat(admin_subqueries): revert to operator:or --- query/autocomplete_defaults.js | 1 - test/unit/fixture/autocomplete_linguistic_with_admin.js | 1 - test/unit/fixture/autocomplete_single_character_street.js | 1 - 3 files changed, 3 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index f0b835fc9..ba7d54b79 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -53,7 +53,6 @@ module.exports = _.merge({}, peliasQuery.defaults, { // generic multi_match config 'multi_match:type': 'cross_fields', - 'multi_match:operator': 'and', 'multi_match:cutoff_frequency': 0.01, 'admin:country_a:analyzer': 'standard', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 9f37c2bac..95b5c4404 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -33,7 +33,6 @@ module.exports = { 'query': 'three', 'analyzer': 'peliasAdmin', 'type': 'cross_fields', - 'operator': 'and', 'cutoff_frequency': 0.01 } }, diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 8d0b3b277..ccea321b9 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -41,7 +41,6 @@ 
module.exports = { 'query': 'laird', 'analyzer': 'peliasAdmin', 'type': 'cross_fields', - 'operator': 'and', 'cutoff_frequency': 0.01 } }, From f31d5ae4484745a8da6093eda66a4caa02b3ef1b Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 15 May 2019 17:31:56 +0200 Subject: [PATCH 17/55] feat(admin_subqueries): remove cutoff_frequency --- query/autocomplete_defaults.js | 1 - test/unit/fixture/autocomplete_linguistic_with_admin.js | 3 +-- test/unit/fixture/autocomplete_single_character_street.js | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index ba7d54b79..ea121e055 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -53,7 +53,6 @@ module.exports = _.merge({}, peliasQuery.defaults, { // generic multi_match config 'multi_match:type': 'cross_fields', - 'multi_match:cutoff_frequency': 0.01, 'admin:country_a:analyzer': 'standard', 'admin:country_a:field': 'parent.country_a.ngram', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 95b5c4404..f737a52b9 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -32,8 +32,7 @@ module.exports = { ], 'query': 'three', 'analyzer': 'peliasAdmin', - 'type': 'cross_fields', - 'cutoff_frequency': 0.01 + 'type': 'cross_fields' } }, { diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index ccea321b9..978f0ddc3 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -40,8 +40,7 @@ module.exports = { ], 'query': 'laird', 'analyzer': 'peliasAdmin', - 'type': 'cross_fields', - 'cutoff_frequency': 0.01 + 'type': 'cross_fields' } }, { From 325c0706bdd6c0856e45da734127b6cdf864a542 Mon Sep 17 00:00:00 2001 
From: missinglink Date: Wed, 15 May 2019 17:50:12 +0200 Subject: [PATCH 18/55] feat(admin_subqueries): move admin matching to MUST condition --- query/autocomplete.js | 6 +-- .../autocomplete_linguistic_with_admin.js | 8 ++-- .../autocomplete_single_character_street.js | 37 +++++++++---------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index a0e24db0b..8b967c2a5 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -30,14 +30,14 @@ var query = new peliasQuery.layout.FilteredBooleanQuery(); query.score( views.phrase_first_tokens_only, 'must' ); query.score( views.ngrams_last_token_only, 'must' ); +// admin components +query.score(peliasQuery.view.admin_multi_match(adminFields, 'peliasAdmin'), 'must'); + // address components query.score( peliasQuery.view.address('housenumber') ); query.score( peliasQuery.view.address('street') ); query.score( peliasQuery.view.address('postcode') ); -// admin components -query.score( peliasQuery.view.admin_multi_match(adminFields, 'peliasAdmin') ); - // scoring boost query.score( views.boost_exact_matches ); query.score( peliasQuery.view.focus( views.ngrams_strict ) ); diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index f737a52b9..bb58888ba 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -13,9 +13,7 @@ module.exports = { 'query': 'one two' } } - } - ], - 'should': [ + }, { 'multi_match': { 'fields': [ @@ -34,7 +32,9 @@ module.exports = { 'analyzer': 'peliasAdmin', 'type': 'cross_fields' } - }, + } + ], + 'should': [ { 'match': { 'phrase.default': { diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 978f0ddc3..90a481394 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ 
b/test/unit/fixture/autocomplete_single_character_street.js @@ -12,6 +12,24 @@ module.exports = { 'query': 'k road' } } + }, { + 'multi_match': { + 'fields': [ + 'parent.country.ngram^1', + 'parent.region.ngram^1', + 'parent.county.ngram^1', + 'parent.localadmin.ngram^1', + 'parent.locality.ngram^1', + 'parent.borough.ngram^1', + 'parent.neighbourhood.ngram^1', + 'parent.locality_a.ngram^1', + 'parent.region_a.ngram^1', + 'parent.country_a.ngram^1' + ], + 'query': 'laird', + 'analyzer': 'peliasAdmin', + 'type': 'cross_fields' + } }], 'should':[ { @@ -24,25 +42,6 @@ module.exports = { } } }, - { - 'multi_match': { - 'fields': [ - 'parent.country.ngram^1', - 'parent.region.ngram^1', - 'parent.county.ngram^1', - 'parent.localadmin.ngram^1', - 'parent.locality.ngram^1', - 'parent.borough.ngram^1', - 'parent.neighbourhood.ngram^1', - 'parent.locality_a.ngram^1', - 'parent.region_a.ngram^1', - 'parent.country_a.ngram^1' - ], - 'query': 'laird', - 'analyzer': 'peliasAdmin', - 'type': 'cross_fields' - } - }, { 'match': { 'phrase.default': { From 5e82ec692d8c47bfe5fed341e50e29c3aebd9ea4 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 08:18:38 +0200 Subject: [PATCH 19/55] feat(tokenizer): consider query as complete if the final char is a numeral --- sanitizer/_tokenizer.js | 8 +++++- test/unit/sanitizer/_tokenizer.js | 41 ++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index e1cec8f91..56b41b7c3 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -48,6 +48,12 @@ function _sanitize( raw, clean ){ } } + // if the final character is a numeral then consider all tokens + // as complete in order to avoid prefix matching numerals. + if (/[0-9]$/.test(text) ) { + parserConsumedAllTokens = true; + } + // always set 'clean.tokens*' arrays for consistency and to avoid upstream errors. 
clean.tokens = []; clean.tokens_complete = []; @@ -61,7 +67,7 @@ function _sanitize( raw, clean ){ // see: settings.analysis.tokenizer.peliasNameTokenizer clean.tokens = text .split(/[\s,\\\/]+/) // split on delimeters - .filter(function(el){return el;}); // remove empty elements + .filter(el => el); // remove empty elements } else { // text is empty, this sanitizer should be a no-op return messages; diff --git a/test/unit/sanitizer/_tokenizer.js b/test/unit/sanitizer/_tokenizer.js index a0eb2371e..6ead9f76b 100644 --- a/test/unit/sanitizer/_tokenizer.js +++ b/test/unit/sanitizer/_tokenizer.js @@ -428,13 +428,11 @@ module.exports.tests.final_token_single_gram = function(test, common) { // all but last token marked as 'complete' t.deepEquals(clean.tokens_complete, [ - 'grolmanstrasse', + 'grolmanstrasse', '1' ], 'tokens produced'); // last token marked as 'incomplete' - t.deepEquals(clean.tokens_incomplete, [ - '1' - ], 'tokens produced'); + t.deepEquals(clean.tokens_incomplete, [], 'tokens produced'); // no errors/warnings produced t.deepEquals(messages.errors, [], 'no errors'); @@ -534,6 +532,41 @@ module.exports.tests.mixed_delimiter = function(test, common) { }); }; +module.exports.tests.numeric_final_char = function (test, common) { + test('numeric final char, single token', function (t) { + + var clean = { text: '7-11', parsed_text: { subject: '7-11' } }; + var messages = sanitizer.sanitize({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, ['7-11'], 'tokens produced'); + t.deepEquals(clean.tokens_complete, ['7-11'], 'complete'); + t.deepEquals(clean.tokens_incomplete, [], 'incomplete'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); + test('numeric final char, multiple token', function (t) { + + var clean = { text: 'stop 3', parsed_text: { subject: 'stop 3' } }; + var messages = sanitizer.sanitize({}, clean); + + // tokens produced + 
t.deepEquals(clean.tokens, ['stop', '3'], 'tokens produced'); + t.deepEquals(clean.tokens_complete, ['stop', '3'], 'complete'); + t.deepEquals(clean.tokens_incomplete, [], 'incomplete'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { return tape('sanitizeR _tokenizer: ' + name, testFunction); From 74a337df914ac067660709489580f462cd261f3e Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 10:16:04 +0200 Subject: [PATCH 20/55] feat(autocomplete): test removing exact_matching subquery --- query/autocomplete.js | 1 - test/unit/fixture/autocomplete_custom_boosts.json | 12 ------------ .../fixture/autocomplete_linguistic_final_token.js | 11 ----------- .../autocomplete_linguistic_multiple_tokens.js | 12 ------------ .../fixture/autocomplete_linguistic_with_admin.js | 12 ------------ .../fixture/autocomplete_single_character_street.js | 12 ------------ .../autocomplete_token_matching_permutations.js | 12 ++---------- test/unit/query/autocomplete_with_custom_boosts.js | 2 +- 8 files changed, 3 insertions(+), 71 deletions(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index 8b967c2a5..8ad2606d6 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -39,7 +39,6 @@ query.score( peliasQuery.view.address('street') ); query.score( peliasQuery.view.address('postcode') ); // scoring boost -query.score( views.boost_exact_matches ); query.score( peliasQuery.view.focus( views.ngrams_strict ) ); query.score( peliasQuery.view.popularity( views.pop_subquery ) ); query.score( peliasQuery.view.population( views.pop_subquery ) ); diff --git a/test/unit/fixture/autocomplete_custom_boosts.json b/test/unit/fixture/autocomplete_custom_boosts.json index 361de970f..6bf9b7994 100644 --- a/test/unit/fixture/autocomplete_custom_boosts.json +++ 
b/test/unit/fixture/autocomplete_custom_boosts.json @@ -18,18 +18,6 @@ } ], "should": [ - { - "match": { - "phrase.default": { - "analyzer": "peliasPhrase", - "cutoff_frequency": 0.01, - "type": "phrase", - "boost": 1, - "slop": 3, - "query": "foo" - } - } - }, { "function_score": { "query": { diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js index 5967f9b08..5bfab75d6 100644 --- a/test/unit/fixture/autocomplete_linguistic_final_token.js +++ b/test/unit/fixture/autocomplete_linguistic_final_token.js @@ -14,17 +14,6 @@ module.exports = { } }], 'should':[{ - 'match': { - 'phrase.default': { - 'analyzer': 'peliasPhrase', - 'cutoff_frequency': 0.01, - 'boost': 1, - 'slop': 3, - 'query': 'one', - 'type': 'phrase' - } - } - },{ 'function_score': { 'query': { 'match_all': {} diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index d6fb9275f..679fa5aa8 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -31,18 +31,6 @@ module.exports = { } }], 'should':[ - { - 'match': { - 'phrase.default': { - 'analyzer' : 'peliasPhrase', - 'type' : 'phrase', - 'boost' : 1, - 'slop' : 3, - 'cutoff_frequency': 0.01, - 'query' : 'one two' - } - } - }, { 'function_score': { 'query': { diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index bb58888ba..e746149ae 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -35,18 +35,6 @@ module.exports = { } ], 'should': [ - { - 'match': { - 'phrase.default': { - 'analyzer' : 'peliasPhrase', - 'cutoff_frequency': 0.01, - 'type' : 'phrase', - 'boost' : 1, - 'slop' : 3, - 'query' : 'one two' - } - } - }, { 'function_score': { 'query': { diff 
--git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 90a481394..6fa061ed4 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -42,18 +42,6 @@ module.exports = { } } }, - { - 'match': { - 'phrase.default': { - 'analyzer' : 'peliasPhrase', - 'type' : 'phrase', - 'boost' : 1, - 'slop' : 3, - 'cutoff_frequency': 0.01, - 'query' : 'k road' - } - } - }, { 'function_score': { 'query': { diff --git a/test/unit/query/autocomplete_token_matching_permutations.js b/test/unit/query/autocomplete_token_matching_permutations.js index 0806014a9..069465ed8 100644 --- a/test/unit/query/autocomplete_token_matching_permutations.js +++ b/test/unit/query/autocomplete_token_matching_permutations.js @@ -17,8 +17,7 @@ const defaults = new peliasQuery.Vars( require('../../../query/autocomplete_defa const views = { ngrams_last_token_only: require('../../../query/view/ngrams_last_token_only'), phrase_first_tokens_only: require('../../../query/view/phrase_first_tokens_only'), - pop_subquery: require('../../../query/view/pop_subquery'), - boost_exact_matches: require('../../../query/view/boost_exact_matches') + pop_subquery: require('../../../query/view/pop_subquery') }; module.exports.tests = {}; @@ -44,7 +43,7 @@ function assert( t, actual, expected, debug ){ } t.deepEqual(_actual.type, 'autocomplete', 'query type set'); - t.deepEqual(_actual.body.query.bool, _expected); + t.deepEqual(_actual.body.query.bool, _expected, 'autocomplete_token_matching_permutations'); t.end(); } @@ -83,7 +82,6 @@ module.exports.tests.single_token = function(test, common) { assert( t, generate( clean ), { must: [ views.phrase_first_tokens_only( vs ) ], should: [ - views.boost_exact_matches( vs ), peliasQuery.view.popularity( views.pop_subquery )( vs ), peliasQuery.view.population( views.pop_subquery )( vs ) ] @@ -124,7 +122,6 @@ 
module.exports.tests.single_token = function(test, common) { assert( t, generate( clean ), { must: [ views.phrase_first_tokens_only( vs ) ], should: [ - views.boost_exact_matches( vs ), peliasQuery.view.popularity( views.pop_subquery )( vs ), peliasQuery.view.population( views.pop_subquery )( vs ) ] @@ -165,7 +162,6 @@ module.exports.tests.single_token = function(test, common) { assert( t, generate( clean ), { must: [ views.phrase_first_tokens_only( vs ) ], should: [ - views.boost_exact_matches( vs ), peliasQuery.view.popularity( views.pop_subquery )( vs ), peliasQuery.view.population( views.pop_subquery )( vs ) ] @@ -191,7 +187,6 @@ module.exports.tests.multiple_tokens = function(test, common) { views.ngrams_last_token_only( vs ) ], should: [ - views.boost_exact_matches( vs ), peliasQuery.view.popularity( views.pop_subquery )( vs ), peliasQuery.view.population( views.pop_subquery )( vs ) ] @@ -214,7 +209,6 @@ module.exports.tests.multiple_tokens = function(test, common) { views.phrase_first_tokens_only( vs ) ], should: [ - views.boost_exact_matches( vs ), peliasQuery.view.popularity( views.pop_subquery )( vs ), peliasQuery.view.population( views.pop_subquery )( vs ) ] @@ -238,7 +232,6 @@ module.exports.tests.multiple_tokens = function(test, common) { views.ngrams_last_token_only( vs ) ], should: [ - views.boost_exact_matches( vs ), peliasQuery.view.popularity( views.pop_subquery )( vs ), peliasQuery.view.population( views.pop_subquery )( vs ) ] @@ -261,7 +254,6 @@ module.exports.tests.multiple_tokens = function(test, common) { views.phrase_first_tokens_only( vs ) ], should: [ - views.boost_exact_matches( vs ), peliasQuery.view.popularity( views.pop_subquery )( vs ), peliasQuery.view.population( views.pop_subquery )( vs ) ] diff --git a/test/unit/query/autocomplete_with_custom_boosts.js b/test/unit/query/autocomplete_with_custom_boosts.js index 01c292757..3cc38ef8d 100644 --- a/test/unit/query/autocomplete_with_custom_boosts.js +++ 
b/test/unit/query/autocomplete_with_custom_boosts.js @@ -40,7 +40,7 @@ module.exports.tests.query = function(test, common) { const actual_query = JSON.parse( JSON.stringify( autocomplete_query_module(clean) ) ); - t.deepEqual(actual_query, expected_query, 'query as expected'); + t.deepEqual(actual_query, expected_query, 'autocomplete_custom_boosts'); t.pass(); t.end(); }); From e0b8a6b350c8d5fd054015fef8f3234f8e1d46ce Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 10:19:24 +0200 Subject: [PATCH 21/55] feat(admin_subqueries): add cutoff_frequency --- query/autocomplete_defaults.js | 1 + test/unit/fixture/autocomplete_linguistic_with_admin.js | 3 ++- test/unit/fixture/autocomplete_single_character_street.js | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index ea121e055..ba7d54b79 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -53,6 +53,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { // generic multi_match config 'multi_match:type': 'cross_fields', + 'multi_match:cutoff_frequency': 0.01, 'admin:country_a:analyzer': 'standard', 'admin:country_a:field': 'parent.country_a.ngram', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index e746149ae..2d74a79a8 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -30,7 +30,8 @@ module.exports = { ], 'query': 'three', 'analyzer': 'peliasAdmin', - 'type': 'cross_fields' + 'type': 'cross_fields', + 'cutoff_frequency': 0.01 } } ], diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 6fa061ed4..5aeb95275 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ 
-28,7 +28,8 @@ module.exports = { ], 'query': 'laird', 'analyzer': 'peliasAdmin', - 'type': 'cross_fields' + 'type': 'cross_fields', + 'cutoff_frequency': 0.01 } }], 'should':[ From 235fafef2499c61fa8a0cb468fa148b003421a9c Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 10:51:09 +0200 Subject: [PATCH 22/55] feat(pelias_parser): admin queries - remove subject from admin subquery --- sanitizer/_text_pelias_parser.js | 30 ++++++++++++++++++++++ test/unit/sanitizer/_text_pelias_parser.js | 20 ++++++++++++--- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index 607d1aefb..ed3e1a738 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -149,14 +149,44 @@ function parse (clean) { // a locality query else if (!_.isEmpty(parsed_text.locality)) { parsed_text.subject = parsed_text.locality; + + // remove the locality name from $admin + if ( parsed_text.admin ) { + let width = parsed_text.subject.length; + let cut = parsed_text.admin.substr(0, width); + if( cut === parsed_text.subject ){ + parsed_text.admin = _.trim(parsed_text.admin.substr(width), ', '); + if( !parsed_text.admin.length ){ delete parsed_text.admin; } + } + } } // a region query else if (!_.isEmpty(parsed_text.region)) { parsed_text.subject = parsed_text.region; + + // remove the region name from $admin + if (parsed_text.admin) { + let width = parsed_text.subject.length; + let cut = parsed_text.admin.substr(0, width); + if (cut === parsed_text.subject) { + parsed_text.admin = _.trim(parsed_text.admin.substr(width), ', '); + if( !parsed_text.admin.length ){ delete parsed_text.admin; } + } + } } // a country query else if (!_.isEmpty(parsed_text.country)) { parsed_text.subject = parsed_text.country; + + // remove the country name from $admin + if (parsed_text.admin) { + let width = parsed_text.subject.length; + let cut = parsed_text.admin.substr(0, width); + if (cut === 
parsed_text.subject) { + parsed_text.admin = _.trim(parsed_text.admin.substr(width), ', '); + if (!parsed_text.admin.length) { delete parsed_text.admin; } + } + } } // unknown query type diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index ddc32bee2..79e30f697 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -43,7 +43,7 @@ module.exports.tests.text_parser = function (test, common) { cases.push(['chelsea, london', { subject: 'chelsea', locality: 'chelsea', - admin: 'chelsea, london' + admin: 'london' }]); // Query with one token @@ -132,8 +132,22 @@ module.exports.tests.text_parser = function (test, common) { // AUS - state only cases.push(['NSW', { subject: 'NSW', - region: 'NSW', - admin: 'NSW' + region: 'NSW' + }]); + + // when admin name is $subject it should + // be removed from $admin + cases.push(['paris texas', { + subject: 'paris', + locality: 'paris', + region: 'texas', + admin: 'texas' + }]); + cases.push(['rome italy', { + subject: 'rome', + locality: 'rome', + country: 'italy', + admin: 'italy' }]); cases.forEach(testcase => { From f2816882e15ac379033c1e0be75b3d5f3d4168af Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 11:10:11 +0200 Subject: [PATCH 23/55] feat(admin_subqueries): remove cutoff_frequency --- query/autocomplete_defaults.js | 5 ++++- test/unit/fixture/autocomplete_linguistic_with_admin.js | 3 +-- test/unit/fixture/autocomplete_single_character_street.js | 3 +-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index ba7d54b79..121d64284 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -53,7 +53,10 @@ module.exports = _.merge({}, peliasQuery.defaults, { // generic multi_match config 'multi_match:type': 'cross_fields', - 'multi_match:cutoff_frequency': 0.01, + + // setting 'cutoff_frequency' will result 
in very common + // terms such as country not scoring at all + // 'multi_match:cutoff_frequency': 0.01, 'admin:country_a:analyzer': 'standard', 'admin:country_a:field': 'parent.country_a.ngram', diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 2d74a79a8..e746149ae 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -30,8 +30,7 @@ module.exports = { ], 'query': 'three', 'analyzer': 'peliasAdmin', - 'type': 'cross_fields', - 'cutoff_frequency': 0.01 + 'type': 'cross_fields' } } ], diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 5aeb95275..6fa061ed4 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -28,8 +28,7 @@ module.exports = { ], 'query': 'laird', 'analyzer': 'peliasAdmin', - 'type': 'cross_fields', - 'cutoff_frequency': 0.01 + 'type': 'cross_fields' } }], 'should':[ From cfbd5f7f0189048a644e6837860162e261d2195f Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 11:12:15 +0200 Subject: [PATCH 24/55] feat(autocomplete): use phrase index for complete tokens --- query/autocomplete_defaults.js | 2 +- test/unit/fixture/autocomplete_custom_boosts.json | 2 +- test/unit/fixture/autocomplete_linguistic_final_token.js | 2 +- test/unit/fixture/autocomplete_linguistic_multiple_tokens.js | 2 +- test/unit/fixture/autocomplete_linguistic_with_admin.js | 2 +- test/unit/fixture/autocomplete_single_character_street.js | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index 121d64284..b5a5ed2bf 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -22,7 +22,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 
'ngram:cutoff_frequency': 0.01, 'phrase:analyzer': 'peliasQuery', - 'phrase:field': 'name.default', + 'phrase:field': 'phrase.default', 'phrase:boost': 1, 'phrase:slop': 3, 'phrase:cutoff_frequency': 0.01, diff --git a/test/unit/fixture/autocomplete_custom_boosts.json b/test/unit/fixture/autocomplete_custom_boosts.json index 6bf9b7994..a29ceab55 100644 --- a/test/unit/fixture/autocomplete_custom_boosts.json +++ b/test/unit/fixture/autocomplete_custom_boosts.json @@ -6,7 +6,7 @@ "must": [ { "match": { - "name.default": { + "phrase.default": { "analyzer": "peliasQuery", "cutoff_frequency": 0.01, "type": "phrase", diff --git a/test/unit/fixture/autocomplete_linguistic_final_token.js b/test/unit/fixture/autocomplete_linguistic_final_token.js index 5bfab75d6..a55f842a9 100644 --- a/test/unit/fixture/autocomplete_linguistic_final_token.js +++ b/test/unit/fixture/autocomplete_linguistic_final_token.js @@ -3,7 +3,7 @@ module.exports = { 'bool': { 'must': [{ 'match': { - 'name.default': { + 'phrase.default': { 'analyzer': 'peliasQuery', 'cutoff_frequency': 0.01, 'boost': 1, diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index 679fa5aa8..5e8db15e1 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -3,7 +3,7 @@ module.exports = { 'bool': { 'must': [{ 'match': { - 'name.default': { + 'phrase.default': { 'analyzer': 'peliasQuery', 'type': 'phrase', 'boost': 1, diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index e746149ae..088b8da65 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -4,7 +4,7 @@ module.exports = { 'must': [ { 'match': { - 'name.default': { + 'phrase.default': { 'analyzer': 'peliasQuery', 'type': 'phrase', 'boost': 1, diff 
--git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 6fa061ed4..a4ea7695a 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -3,7 +3,7 @@ module.exports = { 'bool': { 'must': [{ 'match': { - 'name.default': { + 'phrase.default': { 'analyzer': 'peliasQuery', 'cutoff_frequency': 0.01, 'type': 'phrase', From 68a0776db98ae728b62d1aaf4e16e05553257894 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 11:34:28 +0200 Subject: [PATCH 25/55] feat(parser): remove parsed_text.name --- sanitizer/_text_pelias_parser.js | 8 ++++---- test/unit/sanitizer/_text_pelias_parser.js | 14 ++++---------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index ed3e1a738..722dd03b7 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -124,7 +124,7 @@ function parse (clean) { postfix = postfix.replace(/\s+/g, ' ').trim(); // 3. store the unparsed characters in fields which can be used for querying - if (prefix.length) { parsed_text.name = prefix; } + // if (prefix.length) { parsed_text.name = prefix; } if (postfix.length) { parsed_text.admin = postfix; } // 4. 
set 'subject', this is the text which will target the 'name.*' @@ -138,9 +138,9 @@ function parse (clean) { else if (!_.isEmpty(parsed_text.street)) { parsed_text.subject = parsed_text.street; } - // query with a name such as a venue query - else if (!_.isEmpty(parsed_text.name)){ - parsed_text.subject = parsed_text.name; + // query with a $prefix such as a venue query + else if (!_.isEmpty(prefix)){ + parsed_text.subject = prefix; } // a postcode query else if (!_.isEmpty(parsed_text.postcode)) { diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index 79e30f697..462572ce8 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -24,7 +24,6 @@ module.exports.tests.text_parser = function (test, common) { // USA queries cases.push(['soho, new york, NY', { subject: 'soho', - name: 'soho', locality: 'new york', region: 'NY', admin: 'new york, NY' @@ -48,32 +47,27 @@ module.exports.tests.text_parser = function (test, common) { // Query with one token cases.push(['yugolsavia', { - subject: 'yugolsavia', - name: 'yugolsavia' + subject: 'yugolsavia' }]); // Query with two tokens, no numbers cases.push(['small town', { - subject: 'small town', - name: 'small town' + subject: 'small town' }]); // Query with two tokens, number first cases.push(['123 main', { - subject: '123 main', - name: '123 main' + subject: '123 main' }]); // Query with two tokens, number second cases.push(['main 123', { - subject: 'main 123', - name: 'main 123' + subject: 'main 123' }]); // Query with many tokens cases.push(['main particle new york', { subject: 'main particle', - name: 'main particle', locality: 'new york', admin: 'new york' }]); From 9258f2e264a3f87fee783301acb7cf7fe29f5964 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 11:38:34 +0200 Subject: [PATCH 26/55] feat(parser): so not consider address parses as safe to use with an ngrams index due to parses potentially 
containing partial suffixes --- sanitizer/_tokenizer.js | 11 ++++------- test/unit/sanitizer/_tokenizer.js | 4 ++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index 56b41b7c3..3b9bb9c7e 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -31,15 +31,12 @@ function _sanitize( raw, clean ){ if( _.has(clean.parsed_text, 'subject') ){ text = clean.parsed_text.subject; // use this string instead - // when both housenumber and street fields are present then the pelias parser - // will simply set $subject to be a concatination of these fields. - // in this case we can be sure that all tokens were complete - if (_.has(clean.parsed_text, 'housenumber') && _.has(clean.parsed_text, 'street')){ - parserConsumedAllTokens = true; - } + // note: we cannot be sure that the input is complete if a street is + // detected because the parser will detect partially completed suffixes + // which are not safe to match against an ngrams index // when $subject exactly equals one of the admin fields - else if ( + if ( text === clean.parsed_text.locality || text === clean.parsed_text.region || text === clean.parsed_text.country) { diff --git a/test/unit/sanitizer/_tokenizer.js b/test/unit/sanitizer/_tokenizer.js index 6ead9f76b..626639911 100644 --- a/test/unit/sanitizer/_tokenizer.js +++ b/test/unit/sanitizer/_tokenizer.js @@ -192,8 +192,8 @@ module.exports.tests.sanity_checks = function(test, common) { // favor clean.parsed_text.subject over clean.text t.deepEquals(clean.tokens, [ '190', 'foo', 'st' ], 'use street name + housenumber'); - t.deepEquals(clean.tokens_complete, [ '190', 'foo', 'st' ], 'use street name + housenumber'); - t.deepEquals(clean.tokens_incomplete, [], 'no tokens'); + t.deepEquals(clean.tokens_complete, [ '190', 'foo' ], 'complete'); + t.deepEquals(clean.tokens_incomplete, [ 'st' ], 'incomplete'); // no errors/warnings produced t.deepEquals(messages.errors, [], 'no errors'); From 
7a62b3d3ecd01e38129a7234f944c3aefce0c66b Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 13:00:49 +0200 Subject: [PATCH 27/55] feat(parser): bump pelias/parser version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index ca0a0d883..c58bfaa2a 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.13.0", + "pelias-parser": "^1.14.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From 6bcd91de596eee79b4445d6d3ba114372595e9a6 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 13:53:11 +0200 Subject: [PATCH 28/55] feat(tokenizer): consider query as complete if the $subject is not at the end of $clean.text --- sanitizer/_tokenizer.js | 15 ++++++++++++--- test/unit/sanitizer/_tokenizer.js | 30 ++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index 3b9bb9c7e..f4612e11c 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -33,10 +33,19 @@ function _sanitize( raw, clean ){ // note: we cannot be sure that the input is complete if a street is // detected because the parser will detect partially completed suffixes - // which are not safe to match against an ngrams index - + // which are not safe to match against a phrase index + if( _.has(clean.parsed_text, 'housenumber') && _.has(clean.parsed_text, 'street') ){ + parserConsumedAllTokens = false; + } + + // when $subject is not the end of $clean.text + // then there must be tokens coming afterwards + else if (!clean.text.endsWith(text)) { + parserConsumedAllTokens = true; + } + // when $subject exactly equals one of the admin fields - if ( + else if ( text === clean.parsed_text.locality || text === clean.parsed_text.region || text === clean.parsed_text.country) { diff 
--git a/test/unit/sanitizer/_tokenizer.js b/test/unit/sanitizer/_tokenizer.js index 626639911..9edd4a4ce 100644 --- a/test/unit/sanitizer/_tokenizer.js +++ b/test/unit/sanitizer/_tokenizer.js @@ -172,8 +172,8 @@ module.exports.tests.sanity_checks = function(test, common) { // favor clean.parsed_text.subject over clean.text t.deepEquals(clean.tokens, [ 'foo' ], 'use clean.parsed_text.subject'); - t.deepEquals(clean.tokens_complete, [], 'complete'); - t.deepEquals(clean.tokens_incomplete, [ 'foo' ], 'incomplete'); + t.deepEquals(clean.tokens_complete, [ 'foo' ], 'complete'); + t.deepEquals(clean.tokens_incomplete, [ ], 'incomplete'); // no errors/warnings produced t.deepEquals(messages.errors, [], 'no errors'); @@ -567,6 +567,32 @@ module.exports.tests.numeric_final_char = function (test, common) { }); }; +module.exports.tests.subject_complete = function (test, common) { + test('subject complete', function (t) { + + var clean = { + text: '혜화로, seoul', + parsed_text: { + subject: '혜화로', + locality: 'seoul', + admin: 'seoul' + } + }; + var messages = sanitizer.sanitize({}, clean); + + // tokens produced + t.deepEquals(clean.tokens, ['혜화로'], 'tokens produced'); + t.deepEquals(clean.tokens_complete, ['혜화로'], 'complete'); + t.deepEquals(clean.tokens_incomplete, [], 'incomplete'); + + // no errors/warnings produced + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [], 'no warnings'); + + t.end(); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { return tape('sanitizeR _tokenizer: ' + name, testFunction); From 5923a1ace4d213dd5b31e3b4b894e11d357da997 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 15:55:56 +0200 Subject: [PATCH 29/55] feat(parser): bump pelias/parser version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index c58bfaa2a..9fd0e65bb 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ 
"pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.14.0", + "pelias-parser": "^1.16.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From 12af8cc9a8df0d882b1e69b0016389a95f3d9aba Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 16 May 2019 16:15:53 +0200 Subject: [PATCH 30/55] feat(autocomplete): experiment adding name.default to admin multi_match --- query/autocomplete.js | 8 ++++++++ test/unit/fixture/autocomplete_linguistic_with_admin.js | 3 ++- test/unit/fixture/autocomplete_single_character_street.js | 3 ++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index 8ad2606d6..475c85a95 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -21,6 +21,9 @@ var views = { // add abbrevations for the fields pelias/parser is able to detect. var adminFields = placeTypes.concat(['locality_a', 'region_a', 'country_a']); +// add name field to improve venue matching +adminFields = adminFields.concat(['add_name_to_multimatch']); + //------------------------------ // autocomplete query //------------------------------ @@ -160,6 +163,11 @@ function generateQuery( clean ){ textParser( clean, vs ); } + let isAdminSet = adminFields.some(field => vs.isset('input:' + field)); + if ( isAdminSet ){ vs.var('input:add_name_to_multimatch', 'enabled'); } + + vs.var('admin:add_name_to_multimatch:field', 'name.default'); + return { type: 'autocomplete', body: query.render(vs) diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 088b8da65..3d1e77583 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -26,7 +26,8 @@ module.exports = { 'parent.neighbourhood.ngram^1', 'parent.locality_a.ngram^1', 'parent.region_a.ngram^1', - 'parent.country_a.ngram^1' + 
'parent.country_a.ngram^1', + 'name.default^1' ], 'query': 'three', 'analyzer': 'peliasAdmin', diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index a4ea7695a..952da297f 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -24,7 +24,8 @@ module.exports = { 'parent.neighbourhood.ngram^1', 'parent.locality_a.ngram^1', 'parent.region_a.ngram^1', - 'parent.country_a.ngram^1' + 'parent.country_a.ngram^1', + 'name.default^1' ], 'query': 'laird', 'analyzer': 'peliasAdmin', From 43d727b0ff94fa7195e43a43714169faf3a07dfc Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 3 Jun 2019 12:46:01 +0200 Subject: [PATCH 31/55] feat(autocomplete): progress commit --- package.json | 2 +- query/autocomplete_defaults.js | 10 ++++++ sanitizer/_address_layer_filter.js | 17 ++++++++-- sanitizer/_text_pelias_parser.js | 31 +++++++++++++++++-- sanitizer/_tokenizer.js | 2 +- .../autocomplete_linguistic_with_admin.js | 2 ++ .../autocomplete_single_character_street.js | 2 ++ test/unit/sanitizer/_text_pelias_parser.js | 8 +++++ 8 files changed, 67 insertions(+), 7 deletions(-) diff --git a/package.json b/package.json index 9fd0e65bb..b6de70d2f 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.16.0", + "pelias-parser": "^1.21.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index b5a5ed2bf..64ee3cad1 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -68,6 +68,11 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:country:boost': 1, 'admin:country:cutoff_frequency': 0.01, + 'admin:dependency:analyzer': 'peliasAdmin', + 'admin:dependency:field':
'parent.dependency.ngram', + 'admin:dependency:boost': 1, + 'admin:dependency:cutoff_frequency': 0.01, + 'admin:region:analyzer': 'peliasAdmin', 'admin:region:field': 'parent.region.ngram', 'admin:region:boost': 1, @@ -78,6 +83,11 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:region_a:boost': 1, 'admin:region_a:cutoff_frequency': 0.01, + 'admin:macroregion:analyzer': 'peliasAdmin', + 'admin:macroregion:field': 'parent.macroregion.ngram', + 'admin:macroregion:boost': 1, + 'admin:macroregion:cutoff_frequency': 0.01, + 'admin:county:analyzer': 'peliasAdmin', 'admin:county:field': 'parent.county.ngram', 'admin:county:boost': 1, diff --git a/sanitizer/_address_layer_filter.js b/sanitizer/_address_layer_filter.js index 87c4be5a4..3b6febd2d 100644 --- a/sanitizer/_address_layer_filter.js +++ b/sanitizer/_address_layer_filter.js @@ -21,6 +21,8 @@ const check = require('check-types'); * Update: added additional check that enforces that the input must also contain at least one numeral */ + // note: this runs before libpostal (which is a service) + const ADDRESS_FILTER_WARNING = 'performance optimization: excluding \'address\' layer'; function _setup(tm) { @@ -51,9 +53,15 @@ function _setup(tm) { // be subject to change. 
if (check.nonEmptyObject(clean.parsed_text)) { - // if 'addressit' or 'libpostal' identified input as a street address var isStreetAddress = clean.parsed_text.hasOwnProperty('number') && clean.parsed_text.hasOwnProperty('street'); - if (isStreetAddress) { + + // use $subject where available (pelias parser) + if (_.has(clean, 'parsed_text.subject')) { + input = clean.parsed_text.subject; + } + + // if 'addressit' or 'libpostal' identified input as a street address + else if (isStreetAddress) { input = clean.parsed_text.number + ' ' + clean.parsed_text.street; } @@ -69,6 +77,11 @@ function _setup(tm) { // check that at least one numeral was specified let hasNumeral = /\d/.test(input); + // do not consider numeric street names, such as '26 st' in numeric check. + if( _.has(clean, 'parsed_text.street') ){ + hasNumeral = /\d/.test(input.replace(clean.parsed_text.street, '')); + } + // if less than two words were specified /or no numeral is present // then it is safe to apply the layer filter if (totalWords < 2 || !hasNumeral) { diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index 722dd03b7..73d87abc3 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -67,9 +67,22 @@ function parse (clean) { // generate a classification mask, eg: // 'Foo Cafe 10 Main St London 10010 Earth' - // ' NN SSSSSSS AAAAAA PPPPP ' + // ' VVVV NN SSSSSSS AAAAAA PPPPP ' let mask = solution.mask(t); + // special handling of intersection queries + // here we do not trust intersection parses which also contain another + // classification, such as a house number, postcode or admin field. 
+ // this is to avoid errors for queries such as: + // eg 'air & space museum, washington, dc' + if (parsed_text.street && parsed_text.cross_street) { + if (Object.keys(parsed_text).length > 3) { + delete parsed_text.street; + delete parsed_text.cross_street; + mask = mask.replace(/S/g, ' '); + } + } + // the entire input text as seen by the parser with any postcode classification(s) removed let body = t.span.body.split('') .map((c, i) => (mask[i] !== 'P') ? c : ' ') @@ -79,8 +92,13 @@ function parse (clean) { // prefix: all unparsed characters that came before any parsed fields // postfix: all characters from the first admin field to the end of the string - // set cursor to the first classified character - let cursor = mask.search(/\S/); + // set cursor to the first classified character from selected classes + let cursor = mask.search(/[NSAP]/); + + // >> solution includes venue classification + // set cursor after the venue name + if (mask.includes('V')) { cursor = mask.lastIndexOf('V') +1; } + if (cursor === -1) { cursor = body.length; } let prefix = _.trim(body.substr(0, cursor), ' ,'); @@ -92,6 +110,9 @@ function parse (clean) { // solution includes admin classification // set cursor to the first classified admin character else if( mask.includes('A') ){ cursor = mask.indexOf('A'); } + // >> solution includes venue classification + // set cursor after the venue name + else if (mask.includes('V')) { cursor = mask.lastIndexOf('V') + 1; } // else set cursor to end-of-text else { cursor = body.length; } let postfix = _.trim(body.substr(cursor), ' ,'); @@ -134,6 +155,10 @@ function parse (clean) { if (!_.isEmpty(parsed_text.housenumber) && !_.isEmpty(parsed_text.street)) { parsed_text.subject = `${parsed_text.housenumber} ${parsed_text.street}`; } + // an intersection query + else if (!_.isEmpty(parsed_text.street) && !_.isEmpty(parsed_text.cross_street)) { + parsed_text.subject = `${parsed_text.street} & ${parsed_text.cross_street}`; + } // a street query else if 
(!_.isEmpty(parsed_text.street)) { parsed_text.subject = parsed_text.street; diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index f4612e11c..739e3a9cf 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -34,7 +34,7 @@ function _sanitize( raw, clean ){ // note: we cannot be sure that the input is complete if a street is // detected because the parser will detect partially completed suffixes // which are not safe to match against a phrase index - if( _.has(clean.parsed_text, 'housenumber') && _.has(clean.parsed_text, 'street') ){ + if( _.has(clean.parsed_text, 'street') ){ parserConsumedAllTokens = false; } diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index 3d1e77583..cd652dc0f 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -18,6 +18,8 @@ module.exports = { 'multi_match': { 'fields': [ 'parent.country.ngram^1', + 'parent.dependency.ngram^1', + 'parent.macroregion.ngram^1', 'parent.region.ngram^1', 'parent.county.ngram^1', 'parent.localadmin.ngram^1', diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 952da297f..25d08b243 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -16,6 +16,8 @@ module.exports = { 'multi_match': { 'fields': [ 'parent.country.ngram^1', + 'parent.dependency.ngram^1', + 'parent.macroregion.ngram^1', 'parent.region.ngram^1', 'parent.county.ngram^1', 'parent.localadmin.ngram^1', diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index 462572ce8..317f43b25 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -144,6 +144,14 @@ module.exports.tests.text_parser = function (test, 
common) { admin: 'italy' }]); + // university + cases.push(['Union College, Kentucky', { + subject: 'Union College', + place: 'Union College', + region: 'Kentucky', + admin: 'Kentucky' + }]); + cases.forEach(testcase => { let input = testcase[0]; let expected = testcase[1]; From 61cceebe56e18b22e5cfc96b538c52066f0b294d Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 4 Jun 2019 10:08:56 +0200 Subject: [PATCH 32/55] feat(autocomplete): typo --- routes/v1.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/routes/v1.js b/routes/v1.js index 535203a49..aaf52004f 100644 --- a/routes/v1.js +++ b/routes/v1.js @@ -74,7 +74,7 @@ const hasRequestErrors = require('../controller/predicates/has_request_errors'); const isCoarseReverse = require('../controller/predicates/is_coarse_reverse'); const isAdminOnlyAnalysis = require('../controller/predicates/is_admin_only_analysis'); const hasResultsAtLayers = require('../controller/predicates/has_results_at_layers'); -const isPeliasItParse = require('../controller/predicates/is_pelias_parse'); +const isPeliasParse = require('../controller/predicates/is_pelias_parse'); const hasRequestCategories = require('../controller/predicates/has_request_parameter')('categories'); const isOnlyNonAdminLayers = require('../controller/predicates/is_only_non_admin_layers'); const isRequestLayersAnyAddressRelated = require('../controller/predicates/is_request_layers_any_address_related'); @@ -233,7 +233,7 @@ function addRoutes(app, peliasConfig) { // call search addressit query if addressit was the parser const searchAddressitShouldExecute = all( not(hasRequestErrors), - isPeliasItParse + isPeliasParse ); // get language adjustments if: From dc69e0d31458d666f5ef54414979a5285ff48f76 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 4 Jun 2019 11:23:37 +0200 Subject: [PATCH 33/55] feat(autocomplete): improved matching at the cusp --- query/autocomplete.js | 3 +- query/autocomplete_defaults.js | 4 +-- 
query/view/ngrams_last_token_only_multi.js | 36 +++++++++++++++++++ ...autocomplete_linguistic_multiple_tokens.js | 29 +++++++++------ .../autocomplete_linguistic_with_admin.js | 4 +-- .../autocomplete_single_character_street.js | 4 +-- ...utocomplete_token_matching_permutations.js | 13 +++++-- 7 files changed, 74 insertions(+), 19 deletions(-) create mode 100644 query/view/ngrams_last_token_only_multi.js diff --git a/query/autocomplete.js b/query/autocomplete.js index 475c85a95..6be8eb8ac 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -11,6 +11,7 @@ var views = { custom_boosts: require('./view/boost_sources_and_layers'), ngrams_strict: require('./view/ngrams_strict'), ngrams_last_token_only: require('./view/ngrams_last_token_only'), + ngrams_last_token_only_multi: require('./view/ngrams_last_token_only_multi'), phrase_first_tokens_only: require('./view/phrase_first_tokens_only'), pop_subquery: require('./view/pop_subquery'), boost_exact_matches: require('./view/boost_exact_matches'), @@ -31,7 +32,7 @@ var query = new peliasQuery.layout.FilteredBooleanQuery(); // mandatory matches query.score( views.phrase_first_tokens_only, 'must' ); -query.score( views.ngrams_last_token_only, 'must' ); +query.score( views.ngrams_last_token_only_multi( adminFields ), 'must' ); // admin components query.score(peliasQuery.view.admin_multi_match(adminFields, 'peliasAdmin'), 'must'); diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index 64ee3cad1..fe20d06a1 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -60,7 +60,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:country_a:analyzer': 'standard', 'admin:country_a:field': 'parent.country_a.ngram', - 'admin:country_a:boost': 1, + 'admin:country_a:boost': 4, 'admin:country_a:cutoff_frequency': 0.01, 'admin:country:analyzer': 'peliasAdmin', @@ -80,7 +80,7 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:region_a:analyzer': 
'peliasAdmin', 'admin:region_a:field': 'parent.region_a.ngram', - 'admin:region_a:boost': 1, + 'admin:region_a:boost': 4, 'admin:region_a:cutoff_frequency': 0.01, 'admin:macroregion:analyzer': 'peliasAdmin', diff --git a/query/view/ngrams_last_token_only_multi.js b/query/view/ngrams_last_token_only_multi.js new file mode 100644 index 000000000..369bef183 --- /dev/null +++ b/query/view/ngrams_last_token_only_multi.js @@ -0,0 +1,36 @@ +const peliasQuery = require('pelias-query'); +const ngrams_last_token_only = require('./ngrams_last_token_only'); + +module.exports = function (adminFields){ + const subview = peliasQuery.view.admin_multi_match( adminFields, 'peliasQueryPartialToken' ); + + return function (vs) { + + // get a copy of the *tokens_incomplete* tokens produced from the input:name + var tokens = vs.var('input:name:tokens_incomplete').get(); + + // no valid tokens to use, fail now, don't render this view. + if (!tokens || tokens.length < 1) { return null; } + + var complete_tokens = vs.var('input:name:tokens_complete').get(); + if (!complete_tokens || complete_tokens.length < 1) { return ngrams_last_token_only(vs); } + + // make a copy Vars so we don't mutate the original + var vsCopy = new peliasQuery.Vars( vs.export() ); + + adminFields.forEach(field => { + // set the admin variables in the copy to only the last token + vsCopy.var(`input:${field}`).set(tokens.join(' ')); + }); + + var rendered = subview( vsCopy ); + if( !rendered ){ return rendered; } + + // return the view rendered using the copy + return { + 'constant_score': { + 'query': rendered + } + }; + }; +}; diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js index 5e8db15e1..f1cd848de 100644 --- a/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens.js @@ -16,16 +16,25 @@ module.exports = { { 'constant_score': { 'query': { - 'match': { - 
'name.default': { - 'analyzer': 'peliasQuery', - 'boost': 100, - 'query': 'three', - 'type': 'phrase', - 'operator': 'and', - 'cutoff_frequency': 0.01, - 'slop': 3 - } + 'multi_match': { + 'fields': [ + 'parent.country.ngram^1', + 'parent.dependency.ngram^1', + 'parent.macroregion.ngram^1', + 'parent.region.ngram^1', + 'parent.county.ngram^1', + 'parent.localadmin.ngram^1', + 'parent.locality.ngram^1', + 'parent.borough.ngram^1', + 'parent.neighbourhood.ngram^1', + 'parent.locality_a.ngram^1', + 'parent.region_a.ngram^4', + 'parent.country_a.ngram^4', + 'name.default^1' + ], + 'query': 'three', + 'analyzer': 'peliasQuery', + 'type': 'cross_fields' } } } diff --git a/test/unit/fixture/autocomplete_linguistic_with_admin.js b/test/unit/fixture/autocomplete_linguistic_with_admin.js index cd652dc0f..6b7a5b39d 100644 --- a/test/unit/fixture/autocomplete_linguistic_with_admin.js +++ b/test/unit/fixture/autocomplete_linguistic_with_admin.js @@ -27,8 +27,8 @@ module.exports = { 'parent.borough.ngram^1', 'parent.neighbourhood.ngram^1', 'parent.locality_a.ngram^1', - 'parent.region_a.ngram^1', - 'parent.country_a.ngram^1', + 'parent.region_a.ngram^4', + 'parent.country_a.ngram^4', 'name.default^1' ], 'query': 'three', diff --git a/test/unit/fixture/autocomplete_single_character_street.js b/test/unit/fixture/autocomplete_single_character_street.js index 25d08b243..fb113cbcf 100644 --- a/test/unit/fixture/autocomplete_single_character_street.js +++ b/test/unit/fixture/autocomplete_single_character_street.js @@ -25,8 +25,8 @@ module.exports = { 'parent.borough.ngram^1', 'parent.neighbourhood.ngram^1', 'parent.locality_a.ngram^1', - 'parent.region_a.ngram^1', - 'parent.country_a.ngram^1', + 'parent.region_a.ngram^4', + 'parent.country_a.ngram^4', 'name.default^1' ], 'query': 'laird', diff --git a/test/unit/query/autocomplete_token_matching_permutations.js b/test/unit/query/autocomplete_token_matching_permutations.js index 069465ed8..597f543a6 100644 --- 
a/test/unit/query/autocomplete_token_matching_permutations.js +++ b/test/unit/query/autocomplete_token_matching_permutations.js @@ -6,6 +6,10 @@ const defaultPeliasConfig = { } }; +// admin fields +const placeTypes = require('../../../helper/placeTypes'); +var adminFields = placeTypes.concat(['locality_a', 'region_a', 'country_a', 'add_name_to_multimatch']); + var generate = proxyquire('../../../query/autocomplete', { 'pelias-config': defaultPeliasConfig }); @@ -16,6 +20,7 @@ const defaults = new peliasQuery.Vars( require('../../../query/autocomplete_defa // additional views const views = { ngrams_last_token_only: require('../../../query/view/ngrams_last_token_only'), + ngrams_last_token_only_multi: require('../../../query/view/ngrams_last_token_only_multi')(adminFields), phrase_first_tokens_only: require('../../../query/view/phrase_first_tokens_only'), pop_subquery: require('../../../query/view/pop_subquery') }; @@ -180,11 +185,13 @@ module.exports.tests.multiple_tokens = function(test, common) { }; var vs = vars( clean ); + vs.var('input:add_name_to_multimatch', 'enabled'); + vs.var('admin:add_name_to_multimatch:field', 'name.default'); assert( t, generate( clean ), { must: [ views.phrase_first_tokens_only( vs ), - views.ngrams_last_token_only( vs ) + views.ngrams_last_token_only_multi( vs ) ], should: [ peliasQuery.view.popularity( views.pop_subquery )( vs ), @@ -225,11 +232,13 @@ module.exports.tests.multiple_tokens = function(test, common) { }; var vs = vars( clean ); + vs.var('input:add_name_to_multimatch', 'enabled'); + vs.var('admin:add_name_to_multimatch:field', 'name.default'); assert( t, generate( clean ), { must: [ views.phrase_first_tokens_only( vs ), - views.ngrams_last_token_only( vs ) + views.ngrams_last_token_only_multi( vs ) ], should: [ peliasQuery.view.popularity( views.pop_subquery )( vs ), From 0cdc5e8eea1025e589de189f4f6cdde0ad859dd4 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 4 Jun 2019 13:05:13 +0200 Subject: [PATCH 34/55] 
feat(autocomplete): improved performance and reduced noise for admin matching --- query/autocomplete.js | 5 ++- query/view/admin_multi_match_first.js | 46 ++++++++++++++++++++++ query/view/admin_multi_match_last.js | 38 ++++++++++++++++++ query/view/ngrams_last_token_only_multi.js | 3 ++ 4 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 query/view/admin_multi_match_first.js create mode 100644 query/view/admin_multi_match_last.js diff --git a/query/autocomplete.js b/query/autocomplete.js index 6be8eb8ac..7a51fad25 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -12,6 +12,8 @@ var views = { ngrams_strict: require('./view/ngrams_strict'), ngrams_last_token_only: require('./view/ngrams_last_token_only'), ngrams_last_token_only_multi: require('./view/ngrams_last_token_only_multi'), + admin_multi_match_first: require('./view/admin_multi_match_first'), + admin_multi_match_last: require('./view/admin_multi_match_last'), phrase_first_tokens_only: require('./view/phrase_first_tokens_only'), pop_subquery: require('./view/pop_subquery'), boost_exact_matches: require('./view/boost_exact_matches'), @@ -35,7 +37,8 @@ query.score( views.phrase_first_tokens_only, 'must' ); query.score( views.ngrams_last_token_only_multi( adminFields ), 'must' ); // admin components -query.score(peliasQuery.view.admin_multi_match(adminFields, 'peliasAdmin'), 'must'); +query.score( views.admin_multi_match_first( adminFields ), 'must'); +query.score( views.admin_multi_match_last( adminFields ), 'must'); // address components query.score( peliasQuery.view.address('housenumber') ); diff --git a/query/view/admin_multi_match_first.js b/query/view/admin_multi_match_first.js new file mode 100644 index 000000000..bcecd1387 --- /dev/null +++ b/query/view/admin_multi_match_first.js @@ -0,0 +1,46 @@ +const peliasQuery = require('pelias-query'); + +module.exports = function (adminFields) { + const subview = peliasQuery.view.admin_multi_match(adminFields, 'peliasAdmin'); + + 
return (vs) => { + + // check which of the possible admin_properties are actually set + // from the query + var valid_admin_properties = adminFields.filter(admin_property => { + return admin_property && + vs.isset('input:' + admin_property) && + vs.isset('admin:' + admin_property + ':field'); + }); + + if (valid_admin_properties.length === 0) { + return null; + } + + // the actual query text is simply taken from the first valid admin field + // this assumes all the values would be the same, which is probably not true + // TODO: handle the case where not all admin area input values are the same + var tokens = vs.var('input:' + valid_admin_properties[0]).get().split(/\s+/g); + + // no valid tokens to use, fail now, don't render this view. + if (!tokens || tokens.length < 2) { return null; } + + // make a copy Vars so we don't mutate the original + var vsCopy = new peliasQuery.Vars(vs.export()); + + // change field mappings + vsCopy.var('admin:add_name_to_multimatch:field', 'phrase.default'); + adminFields.forEach(field => { + if( vsCopy.isset(`admin:${field}:field`) ){ + vsCopy.var(`admin:${field}:field`, vsCopy.var(`admin:${field}:field`).get().replace('.ngram', '')); + } + }); + + adminFields.forEach(field => { + // set the admin variables in the copy to only the last token + vsCopy.var(`input:${field}`).set(tokens.slice(0, -1).join(' ')); + }); + + return subview(vsCopy); + }; +}; diff --git a/query/view/admin_multi_match_last.js b/query/view/admin_multi_match_last.js new file mode 100644 index 000000000..ceaab8a98 --- /dev/null +++ b/query/view/admin_multi_match_last.js @@ -0,0 +1,38 @@ +const peliasQuery = require('pelias-query'); + +module.exports = function (adminFields) { + const subview = peliasQuery.view.admin_multi_match(adminFields, 'peliasAdmin'); + + return (vs) => { + + // check which of the possible admin_properties are actually set + // from the query + var valid_admin_properties = adminFields.filter(admin_property => { + return admin_property && + 
vs.isset('input:' + admin_property) && + vs.isset('admin:' + admin_property + ':field'); + }); + + if (valid_admin_properties.length === 0) { + return null; + } + + // the actual query text is simply taken from the first valid admin field + // this assumes all the values would be the same, which is probably not true + // TODO: handle the case where not all admin area input values are the same + var tokens = vs.var('input:' + valid_admin_properties[0]).get().split(/\s+/g); + + // no valid tokens to use, fail now, don't render this view. + if (!tokens || tokens.length < 1) { return null; } + + // make a copy Vars so we don't mutate the original + var vsCopy = new peliasQuery.Vars(vs.export()); + + adminFields.forEach(field => { + // set the admin variables in the copy to only the last token + vsCopy.var(`input:${field}`).set(tokens[ tokens.length -1 ]); + }); + + return subview(vsCopy); + }; +}; diff --git a/query/view/ngrams_last_token_only_multi.js b/query/view/ngrams_last_token_only_multi.js index 369bef183..3eaaddea9 100644 --- a/query/view/ngrams_last_token_only_multi.js +++ b/query/view/ngrams_last_token_only_multi.js @@ -6,6 +6,9 @@ module.exports = function (adminFields){ return function (vs) { + // return the simple view for address queries + if( vs.isset('input:street') ){ return ngrams_last_token_only(vs); } + // get a copy of the *tokens_incomplete* tokens produced from the input:name var tokens = vs.var('input:name:tokens_incomplete').get(); From 4b79aa138ae074256406d20f9e30eeb8e0f7cc16 Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 5 Jun 2019 19:26:34 +0200 Subject: [PATCH 35/55] feat(deps): bump parser dep version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index b6de70d2f..070d62d07 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.21.0", + 
"pelias-parser": "^1.24.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From 1045c84a5395793dbc78efa33e14cc9069010d08 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 6 Jun 2019 13:30:14 +0200 Subject: [PATCH 36/55] feat(deps): bump parser dep version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 070d62d07..3ce4c731d 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.24.0", + "pelias-parser": "^1.25.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From 2743575507cb9bedec4aa3cb27e963cb3339ae9e Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 6 Jun 2019 15:39:54 +0200 Subject: [PATCH 37/55] feat(deps): bump parser dep version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 3ce4c731d..8874daf1b 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.25.0", + "pelias-parser": "^1.27.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From f079baf1a60203707e087ece2e8f619ef20df45b Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 6 Jun 2019 15:52:41 +0200 Subject: [PATCH 38/55] test: disable parserConsumedAllTokens for admin parses --- sanitizer/_tokenizer.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index 739e3a9cf..8e58105f1 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -45,12 +45,12 @@ function _sanitize( raw, clean ){ } // when $subject exactly equals one of the admin fields - else if ( - text === clean.parsed_text.locality || - text === 
clean.parsed_text.region || - text === clean.parsed_text.country) { - parserConsumedAllTokens = true; - } + // else if ( + // text === clean.parsed_text.locality || + // text === clean.parsed_text.region || + // text === clean.parsed_text.country) { + // parserConsumedAllTokens = true; + // } } } From f07cb90fc9b4ffb6cd6b6223861c481676ddf038 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 6 Jun 2019 19:00:00 +0200 Subject: [PATCH 39/55] feat(deps): bump parser dep version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 8874daf1b..c840d4572 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.27.0", + "pelias-parser": "^1.28.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From 40b62bc821a3619de174e77e483e84e2a149c35f Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 7 Jun 2019 15:03:46 +0200 Subject: [PATCH 40/55] feat(query): add should subquery for cross_street matching --- query/autocomplete.js | 1 + query/autocomplete_defaults.js | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/query/autocomplete.js b/query/autocomplete.js index 7a51fad25..454a96e62 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -43,6 +43,7 @@ query.score( views.admin_multi_match_last( adminFields ), 'must'); // address components query.score( peliasQuery.view.address('housenumber') ); query.score( peliasQuery.view.address('street') ); +query.score( peliasQuery.view.address('cross_street') ); query.score( peliasQuery.view.address('postcode') ); // scoring boost diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index fe20d06a1..e2d7e3ee3 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -46,6 +46,11 @@ module.exports = _.merge({}, peliasQuery.defaults, { 
'address:street:boost': 5, 'address:street:cutoff_frequency': 0.01, + 'address:cross_street:analyzer': 'peliasStreet', + 'address:cross_street:field': 'address_parts.cross_street', + 'address:cross_street:boost': 5, + 'address:cross_street:cutoff_frequency': 0.01, + 'address:postcode:analyzer': 'peliasZip', 'address:postcode:field': 'address_parts.zip', 'address:postcode:boost': 2000, From 9d69fe183a7e7e67dd620a0ebdda9dab0d8939e8 Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 10 Jun 2019 16:35:11 +0200 Subject: [PATCH 41/55] feat(logging): add summary logging for pelias parser --- sanitizer/_text_pelias_parser.js | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index 73d87abc3..005ca57ae 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -1,3 +1,4 @@ +const logger = require('pelias-logger').get('api'); const Tokenizer = require('pelias-parser/tokenization/Tokenizer'); const Solution = require('pelias-parser/solver/Solution'); const AddressParser = require('pelias-parser/parser/AddressParser'); @@ -38,11 +39,19 @@ function _sanitize (raw, clean) { } function parse (clean) { + // parse text + let start = new Date(); const t = new Tokenizer(clean.text); parser.classify(t); parser.solve(t); + // log summary info + logger.info('pelias_parser', { + took: (new Date()) - start, + solutions: t.solution.length + }); + // only use the first solution generated // @todo: we could expand this in the future to accomodate more solutions let solution = new Solution(); From 5974f7a83ae16ec1189b3f55e0de91652995167d Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 12 Jun 2019 14:22:04 +0200 Subject: [PATCH 42/55] feat(deps): bump parser dep version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index c840d4572..52958540a 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ 
"pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.28.0", + "pelias-parser": "^1.34.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From 85693a83c65f4079fca7f41a63ccbe6904ddd3e9 Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 17 Jun 2019 16:04:59 +0200 Subject: [PATCH 43/55] feat(pelias_parser): additional parser tests --- test/unit/sanitizer/_text_pelias_parser.js | 208 ++++++++++++++++++++- 1 file changed, 207 insertions(+), 1 deletion(-) diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index 317f43b25..abdae52bd 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -152,9 +152,211 @@ module.exports.tests.text_parser = function (test, common) { admin: 'Kentucky' }]); + // street (USA style) + cases.push(['M', { subject: 'M' }, true]); + cases.push(['Ma', { subject: 'Ma' }, true]); + cases.push(['Mai', { subject: 'Mai' }, true]); + cases.push(['Main', { subject: 'Main' }, true]); + cases.push(['Main ', { subject: 'Main' }, true]); + cases.push(['Main S', { subject: 'Main S' }, true]); + cases.push(['Main St', { subject: 'Main St' }, true]); + cases.push(['Main St S', { subject: 'Main St' }, true]); + // cases.push(['Main St Se', { subject: 'Main St' }, true]); // jitter on SE + cases.push(['Main St Sea', { subject: 'Main St' }, true]); + cases.push(['Main St Seat', { subject: 'Main St' }, true]); + cases.push(['Main St Seatt', { subject: 'Main St' }, true]); + cases.push(['Main St Seattl', { subject: 'Main St' }, true]); + cases.push(['Main St Seattle', { subject: 'Main St' }, true]); + + // address (USA style) + cases.push(['1', { subject: '1' }, true]); + cases.push(['10', { subject: '10' }, true]); + cases.push(['10 ', { subject: '10' }, true]); + cases.push(['10 M', { subject: '10 M' }, true]); + cases.push(['10 Ma', { subject: '10 Ma' }, true]); + 
cases.push(['10 Mai', { subject: '10 Mai' }, true]); + cases.push(['10 Main', { subject: '10 Main' }, true]); + cases.push(['10 Main ', { subject: '10 Main' }, true]); + cases.push(['10 Main S', { subject: '10 Main S' }, true]); + cases.push(['10 Main St', { subject: '10 Main St' }, true]); + cases.push(['10 Main St S', { subject: '10 Main St' }, true]); + // cases.push(['10 Main St Se', { subject: '10 Main St' }, true]); // jitter issue + cases.push(['10 Main St Sea', { subject: '10 Main St' }, true]); + cases.push(['10 Main St Seat', { subject: '10 Main St' }, true]); + cases.push(['10 Main St Seatt', { subject: '10 Main St' }, true]); + cases.push(['10 Main St Seattl', { subject: '10 Main St' }, true]); + cases.push(['10 Main St Seattle', { subject: '10 Main St' }, true]); + + // street (ESP style) + cases.push(['C', { subject: 'C' }, true]); + cases.push(['Ca', { subject: 'Ca' }, true]); + cases.push(['Cal', { subject: 'Cal' }, true]); + cases.push(['Call', { subject: 'Call' }, true]); + cases.push(['Calle', { subject: 'Calle' }, true]); + cases.push(['Calle ', { subject: 'Calle' }, true]); + cases.push(['Calle P', { subject: 'Calle P' }, true]); + cases.push(['Calle Pr', { subject: 'Calle Pr' }, true]); + cases.push(['Calle Pri', { subject: 'Calle Pri' }, true]); + cases.push(['Calle Prin', { subject: 'Calle Prin' }, true]); + cases.push(['Calle Princ', { subject: 'Calle Princ' }, true]); + cases.push(['Calle Princi', { subject: 'Calle Princi' }, true]); + cases.push(['Calle Princip', { subject: 'Calle Princip' }, true]); + cases.push(['Calle Principa', { subject: 'Calle Principa' }, true]); + cases.push(['Calle Principal', { subject: 'Calle Principal' }, true]); + cases.push(['Calle Principal ', { subject: 'Calle Principal' }, true]); + cases.push(['Calle Principal B', { subject: 'Calle Principal' }, true]); + // cases.push(['Calle Principal Ba', { subject: 'Calle Principal' }, true]); // jitter issue + cases.push(['Calle Principal Bar', { subject: 'Calle 
Principal' }, true]); + cases.push(['Calle Principal Barc', { subject: 'Calle Principal' }, true]); + // cases.push(['Calle Principal Barce', { subject: 'Calle Principal' }, true]); // jitter issue + // cases.push(['Calle Principal Barcel', { subject: 'Calle Principal' }, true]); // jitter issue + // cases.push(['Calle Principal Barcelo', { subject: 'Calle Principal' }, true]); // jitter issue + // cases.push(['Calle Principal Barcelon', { subject: 'Calle Principal' }, true]); // jitter issue + cases.push(['Calle Principal Barcelona', { subject: 'Calle Principal' }, true]); + + // address (ESP style) + cases.push(['Calle Principal 20', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20 ', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20 B', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20 Ba', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20 Bar', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20 Barc', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20 Barce', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20 Barcel', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20 Barcelo', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20 Barcelon', { subject: '20 Calle Principal' }, true]); + cases.push(['Calle Principal 20 Barcelona', { subject: '20 Calle Principal' }, true]); + + // street (DEU style) + cases.push(['H', { subject: 'H' }, true]); + cases.push(['Ha', { subject: 'Ha' }, true]); + cases.push(['Hau', { subject: 'Hau' }, true]); + cases.push(['Haup', { subject: 'Haup' }, true]); + cases.push(['Haupt', { subject: 'Haupt' }, true]); + cases.push(['Haupts', { subject: 'Haupts' }, true]); + cases.push(['Hauptst', { subject: 'Hauptst' }, true]); + 
cases.push(['Hauptstr', { subject: 'Hauptstr' }, true]); + cases.push(['Hauptstra', { subject: 'Hauptstra' }, true]); + cases.push(['Hauptstraß', { subject: 'Hauptstraß' }, true]); + cases.push(['Hauptstraße', { subject: 'Hauptstraße' }, true]); + cases.push(['Hauptstraße ', { subject: 'Hauptstraße' }, true]); + cases.push(['Hauptstraße B', { subject: 'Hauptstraße' }, true]); + cases.push(['Hauptstraße Be', { subject: 'Hauptstraße' }, true]); + cases.push(['Hauptstraße Ber', { subject: 'Hauptstraße' }, true]); + cases.push(['Hauptstraße Berl', { subject: 'Hauptstraße' }, true]); + cases.push(['Hauptstraße Berli', { subject: 'Hauptstraße' }, true]); + cases.push(['Hauptstraße Berlin', { subject: 'Hauptstraße' }, true]); + + // address (DEU style) + cases.push(['H', { subject: 'H' }, true]); + cases.push(['Ha', { subject: 'Ha' }, true]); + cases.push(['Hau', { subject: 'Hau' }, true]); + cases.push(['Haup', { subject: 'Haup' }, true]); + cases.push(['Haupt', { subject: 'Haupt' }, true]); + cases.push(['Haupts', { subject: 'Haupts' }, true]); + cases.push(['Hauptst', { subject: 'Hauptst' }, true]); + cases.push(['Hauptstr', { subject: 'Hauptstr' }, true]); + cases.push(['Hauptstra', { subject: 'Hauptstra' }, true]); + cases.push(['Hauptstraß', { subject: 'Hauptstraß' }, true]); + cases.push(['Hauptstraße', { subject: 'Hauptstraße' }, true]); + cases.push(['Hauptstraße ', { subject: 'Hauptstraße' }, true]); + cases.push(['Hauptstraße 5', { subject: '5 Hauptstraße' }, true]); + cases.push(['Hauptstraße 50', { subject: '50 Hauptstraße' }, true]); + cases.push(['Hauptstraße 50 ', { subject: '50 Hauptstraße' }, true]); + cases.push(['Hauptstraße 50 B', { subject: '50 Hauptstraße' }, true]); + cases.push(['Hauptstraße 50 Be', { subject: '50 Hauptstraße' }, true]); + cases.push(['Hauptstraße 50 Ber', { subject: '50 Hauptstraße' }, true]); + cases.push(['Hauptstraße 50 Berl', { subject: '50 Hauptstraße' }, true]); + cases.push(['Hauptstraße 50 Berli', { subject: '50 
Hauptstraße' }, true]); + cases.push(['Hauptstraße 50 Berlin', { subject: '50 Hauptstraße' }, true]); + + // venues + cases.push(['K', { subject: 'K' }, true]); + cases.push(['Ka', { subject: 'Ka' }, true]); + cases.push(['Kas', { subject: 'Kas' }, true]); + cases.push(['Kasc', { subject: 'Kasc' }, true]); + cases.push(['Kasch', { subject: 'Kasch' }, true]); + cases.push(['Kaschk', { subject: 'Kaschk' }, true]); + cases.push(['Kaschk ', { subject: 'Kaschk' }, true]); + // cases.push(['Kaschk B', { subject: 'Kaschk' }, true]); // jitter issue + cases.push(['Kaschk Be', { subject: 'Kaschk' }, true]); + // cases.push(['Kaschk Ber', { subject: 'Kaschk' }, true]); // jitter issue + // cases.push(['Kaschk Berl', { subject: 'Kaschk' }, true]); // jitter issue + // cases.push(['Kaschk Berli', { subject: 'Kaschk' }, true]); // jitter issue + cases.push(['Kaschk Berlin', { subject: 'Kaschk' }, true]); + + cases.push(['A', { subject: 'A' }, true]); + cases.push(['Ai', { subject: 'Ai' }, true]); + cases.push(['Air', { subject: 'Air' }, true]); + cases.push(['Air ', { subject: 'Air' }, true]); + cases.push(['Air &', { subject: 'Air &' }, true]); + cases.push(['Air & ', { subject: 'Air &' }, true]); + cases.push(['Air & S', { subject: 'Air & S' }, true]); + cases.push(['Air & Sp', { subject: 'Air & Sp' }, true]); + cases.push(['Air & Spa', { subject: 'Air & Spa' }, true]); + cases.push(['Air & Spac', { subject: 'Air & Spac' }, true]); + cases.push(['Air & Space', { subject: 'Air & Space' }, true]); + cases.push(['Air & Space ', { subject: 'Air & Space' }, true]); + // cases.push(['Air & Space M', { subject: 'Air & Space M' }, true]); // jitter issue + // cases.push(['Air & Space Mu', { subject: 'Air & Space Mu' }, true]); // jitter issue + cases.push(['Air & Space Mus', { subject: 'Air & Space Mus' }, true]); + // cases.push(['Air & Space Muse', { subject: 'Air & Space Muse' }, true]); // jitter issue + // cases.push(['Air & Space Museu', { subject: 'Air & Space Museu' }, 
true]); // jitter issue + cases.push(['Air & Space Museum', { subject: 'Air & Space Museum' }, true]); + cases.push(['Air & Space Museum ', { subject: 'Air & Space Museum' }, true]); + cases.push(['Air & Space Museum D', { subject: 'Air & Space Museum' }, true]); + cases.push(['Air & Space Museum DC', { subject: 'Air & Space Museum' }, true]); + + // admin areas + cases.push(['N', { subject: 'N' }, true]); + cases.push(['Ne', { subject: 'Ne' }, true]); + cases.push(['New', { subject: 'New' }, true]); + cases.push(['New ', { subject: 'New' }, true]); + cases.push(['New Y', { subject: 'New Y' }, true]); + // cases.push(['New Yo', { subject: 'New Yo' }, true]); // jitter issue + // cases.push(['New Yor', { subject: 'New Yor' }, true]); // jitter issue + cases.push(['New York', { subject: 'New York' }, true]); + cases.push(['New York N', { subject: 'New York' }, true]); + cases.push(['New York NY', { subject: 'New York' }, true]); + + cases.push(['B', { subject: 'B' }, true]); + cases.push(['Be', { subject: 'Be' }, true]); + cases.push(['Ber', { subject: 'Ber' }, true]); + cases.push(['Berl', { subject: 'Berl' }, true]); + cases.push(['Berli', { subject: 'Berli' }, true]); + cases.push(['Berlin', { subject: 'Berlin' }, true]); + cases.push(['Berlin ', { subject: 'Berlin' }, true]); + cases.push(['Berlin D', { subject: 'Berlin' }, true]); + cases.push(['Berlin De', { subject: 'Berlin' }, true]); + cases.push(['Berlin Deu', { subject: 'Berlin' }, true]); + cases.push(['Berlin Deut', { subject: 'Berlin' }, true]); + cases.push(['Berlin Deuts', { subject: 'Berlin' }, true]); + cases.push(['Berlin Deutsc', { subject: 'Berlin' }, true]); + cases.push(['Berlin Deutsch', { subject: 'Berlin' }, true]); + cases.push(['Berlin Deutschl', { subject: 'Berlin' }, true]); + cases.push(['Berlin Deutschla', { subject: 'Berlin' }, true]); + cases.push(['Berlin Deutschlan', { subject: 'Berlin' }, true]); + cases.push(['Berlin Deutschland', { subject: 'Berlin' }, true]); + + // postcodes + 
cases.push(['2000', { subject: '2000' }, true]); + cases.push(['Sydney 2000', { subject: '2000' }, true]); + cases.push(['10010', { subject: '10010' }, true]); + cases.push(['New York 10010', { subject: '10010' }, true]); + cases.push(['10437', { subject: '10437' }, true]); + cases.push(['Berlin 10437', { subject: '10437' }, true]); + cases.push(['E81DN', { subject: 'E81DN' }, true]); + cases.push(['London E81DN', { subject: 'E81DN' }, true]); + cases.push(['e81dn', { subject: 'e81dn' }, true]); + cases.push(['london e81dn', { subject: 'e81dn' }, true]); + cases.push(['e8 1dn', { subject: 'e8 1dn' }, true]); + // cases.push(['london e8 1dn', { subject: 'e8 1dn' }, true]); // issue + cases.forEach(testcase => { let input = testcase[0]; let expected = testcase[1]; + let subjectOnly = (testcase[2] === true); function assert(label, replacement, replaceAdmin) { let text = input.replace(/\s+/, ' '); @@ -176,7 +378,11 @@ module.exports.tests.text_parser = function (test, common) { t.deepEqual(messages, { errors: [], warnings: [] }, 'messages'); t.equal(clean.text, raw.text.trim(), 'text'); t.equal(clean.parser, 'pelias', 'parser'); - t.deepEqual(clean.parsed_text, clone, `${label}: ${text}`); + if( subjectOnly ){ + t.equals(clean.parsed_text.subject, clone.subject, `${label}: ${text}`); + } else { + t.deepEqual(clean.parsed_text, clone, `${label}: ${text}`); + } t.end(); }); } From abeb48f013a5937836966db1c3168f4939583371 Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 17 Jun 2019 16:06:23 +0200 Subject: [PATCH 44/55] feat(deps): bump parser dep version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 52958540a..b62eb01fc 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.34.0", + "pelias-parser": "^1.36.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", 
"predicates": "^2.0.0", From 770e820bb57670058e598d1ef97b81f7e492316d Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 10 Jul 2019 13:20:49 +0200 Subject: [PATCH 45/55] feat(pelias_parser): fix tests --- test/unit/query/autocomplete.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index fee51406b..c38a6cab6 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -309,7 +309,7 @@ module.exports.tests.query = function(test, common) { var expected = require('../fixture/autocomplete_linguistic_bbox_san_francisco'); t.deepEqual(compiled.type, 'autocomplete', 'query type set'); - t.deepEqual(compiled.body, expected, 'autocomplete_linguistic_focus_null_island'); + t.deepEqual(compiled.body, expected, 'autocomplete_linguistic_bbox_san_francisco'); t.end(); }); From 2257ec77952a5a9759c76a13f6f42c39d752ec53 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 15 Aug 2019 16:34:10 +0200 Subject: [PATCH 46/55] feat(search_addressit): generate cross_street subquery where available --- query/search_addressit.js | 1 + 1 file changed, 1 insertion(+) diff --git a/query/search_addressit.js b/query/search_addressit.js index 00d3acf12..44151f16f 100644 --- a/query/search_addressit.js +++ b/query/search_addressit.js @@ -31,6 +31,7 @@ query.score( peliasQuery.view.population( peliasQuery.view.phrase ) ); // address components query.score( peliasQuery.view.address('housenumber') ); query.score( peliasQuery.view.address('street') ); +query.score( peliasQuery.view.address('cross_street') ); query.score( peliasQuery.view.address('postcode') ); // admin components From d7d5f7b1dfce8bcb7583ab24f61e9fa31257aa8d Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 16 Sep 2019 12:48:23 +0200 Subject: [PATCH 47/55] feat(pelias_parser): limit input text to 140 characters --- sanitizer/_text.js | 3 ++- sanitizer/_text_pelias_parser.js | 18 ++++++++++++++---- 
test/unit/sanitizer/_text_pelias_parser.js | 17 +++++++++++++++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/sanitizer/_text.js b/sanitizer/_text.js index 477c5e2e2..6003fa37e 100644 --- a/sanitizer/_text.js +++ b/sanitizer/_text.js @@ -10,9 +10,10 @@ function _sanitize( raw, clean ){ // error & warning messages const messages = { errors: [], warnings: [] }; - // invalid input 'text' + // remove superfluous whitespace and quotes let text = _.trim( _.trim( raw.text ), QUOTES ); + // validate input 'text' if( !_.isString(text) || _.isEmpty(text) ){ messages.errors.push(`invalid param 'text': text length, must be >0`); } else { diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index 005ca57ae..ec619f3d2 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -4,6 +4,7 @@ const Solution = require('pelias-parser/solver/Solution'); const AddressParser = require('pelias-parser/parser/AddressParser'); const parser = new AddressParser(); const _ = require('lodash'); +const MAX_TEXT_LENGTH = 140; /** this module provides fulltext parsing using the pelias/parser module. 
@@ -21,14 +22,23 @@ function _sanitize (raw, clean) { // error & warning messages var messages = { errors: [], warnings: [] }; - // invalid input 'text' - const text = _.trim(raw.text); - if (!_.isString(text) || _.isEmpty(text)) { - messages.errors.push('invalid param \'text\': text length, must be >0'); + // remove superfluous whitespace + let text = _.trim(raw.text); + + // validate input 'text' + if( !_.isString(text) || _.isEmpty(text) ){ + messages.errors.push(`invalid param 'text': text length, must be >0`); } // valid input 'text' else { + + // truncate text to $MAX_TEXT_LENGTH chars + if (text.length > MAX_TEXT_LENGTH) { + messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`); + text = text.substring(0, MAX_TEXT_LENGTH); + } + // parse text with pelias/parser clean.text = text; clean.parser = 'pelias'; diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index abdae52bd..fb998eec7 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -413,6 +413,23 @@ module.exports.tests.text_parser = function (test, common) { t.deepEquals(validParameters, expected); t.end(); }); + + test('should truncate very long text inputs', (t) => { + const raw = { + text: ` +Sometimes we make the process more complicated than we need to. +We will never make a journey of a thousand miles by fretting about +how long it will take or how hard it will be. 
+We make the journey by taking each day step by step and then repeating +it again and again until we reach our destination.` }; + const clean = {}; + const messages = sanitizer.sanitize(raw, clean); + + t.equals(clean.text.length, 140); + t.deepEquals(messages.errors, [], 'no errors'); + t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]); + t.end(); + }); }; module.exports.all = function (tape, common) { From 670666cb54eac42f4e381a8682c23b091fdb99f1 Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 25 Sep 2019 13:47:54 +0200 Subject: [PATCH 48/55] feat(pelias_parser): replace peliasQueryPartialToken analyzer with peliasQuery --- query/view/ngrams_last_token_only_multi.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/query/view/ngrams_last_token_only_multi.js b/query/view/ngrams_last_token_only_multi.js index 3eaaddea9..d0845a62a 100644 --- a/query/view/ngrams_last_token_only_multi.js +++ b/query/view/ngrams_last_token_only_multi.js @@ -2,7 +2,7 @@ const peliasQuery = require('pelias-query'); const ngrams_last_token_only = require('./ngrams_last_token_only'); module.exports = function (adminFields){ - const subview = peliasQuery.view.admin_multi_match( adminFields, 'peliasQueryPartialToken' ); + const subview = peliasQuery.view.admin_multi_match( adminFields, 'peliasQuery' ); return function (vs) { From 30760b96afcf4f192f1f21757f7e2897d702f81c Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 25 Sep 2019 14:16:55 +0200 Subject: [PATCH 49/55] feat(pelias_parser): disable "ngrams_last_token_only_multi" view when every "completed" token is numeric --- query/view/ngrams_last_token_only_multi.js | 5 ++ ...uistic_multiple_tokens_complete_numeric.js | 74 +++++++++++++++++++ test/unit/query/autocomplete.js | 17 +++++ 3 files changed, 96 insertions(+) create mode 100644 test/unit/fixture/autocomplete_linguistic_multiple_tokens_complete_numeric.js diff --git a/query/view/ngrams_last_token_only_multi.js 
b/query/view/ngrams_last_token_only_multi.js index d0845a62a..17fa6aeec 100644 --- a/query/view/ngrams_last_token_only_multi.js +++ b/query/view/ngrams_last_token_only_multi.js @@ -15,9 +15,14 @@ module.exports = function (adminFields){ // no valid tokens to use, fail now, don't render this view. if (!tokens || tokens.length < 1) { return null; } + // return the simple view for queries with no complete tokens var complete_tokens = vs.var('input:name:tokens_complete').get(); if (!complete_tokens || complete_tokens.length < 1) { return ngrams_last_token_only(vs); } + // return the simple view when every complete token is numeric + var all_complete_tokens_numeric = complete_tokens.every(token => !token.replace(/[0-9]/g, '').length); + if (all_complete_tokens_numeric) { return ngrams_last_token_only(vs); } + // make a copy Vars so we don't mutate the original var vsCopy = new peliasQuery.Vars( vs.export() ); diff --git a/test/unit/fixture/autocomplete_linguistic_multiple_tokens_complete_numeric.js b/test/unit/fixture/autocomplete_linguistic_multiple_tokens_complete_numeric.js new file mode 100644 index 000000000..c9f53cff9 --- /dev/null +++ b/test/unit/fixture/autocomplete_linguistic_multiple_tokens_complete_numeric.js @@ -0,0 +1,74 @@ +module.exports = { + 'query': { + 'bool': { + 'must': [{ + 'match': { + 'phrase.default': { + 'analyzer': 'peliasQuery', + 'type': 'phrase', + 'boost': 1, + 'slop': 3, + 'cutoff_frequency': 0.01, + 'query': '1 2' + } + } + }, + { + 'constant_score': { + 'query': { + 'match': { + 'name.default': { + 'analyzer': 'peliasQuery', + 'boost': 100, + 'query': 'three', + 'cutoff_frequency': 0.01, + 'type': 'phrase', + 'operator': 'and', + 'slop': 3 + } + } + } + } + }], + 'should': [ + { + 'function_score': { + 'query': { + 'match_all': {} + }, + 'max_boost': 20, + 'score_mode': 'first', + 'boost_mode': 'replace', + 'functions': [{ + 'field_value_factor': { + 'modifier': 'log1p', + 'field': 'popularity', + 'missing': 1 + }, + 'weight': 1 + }] + 
} + }, { + 'function_score': { + 'query': { + 'match_all': {} + }, + 'max_boost': 20, + 'score_mode': 'first', + 'boost_mode': 'replace', + 'functions': [{ + 'field_value_factor': { + 'modifier': 'log1p', + 'field': 'population', + 'missing': 1 + }, + 'weight': 3 + }] + } + }] + } + }, + 'sort': ['_score'], + 'size': 20, + 'track_scores': true +}; diff --git a/test/unit/query/autocomplete.js b/test/unit/query/autocomplete.js index c38a6cab6..4651502f1 100644 --- a/test/unit/query/autocomplete.js +++ b/test/unit/query/autocomplete.js @@ -52,6 +52,23 @@ module.exports.tests.query = function(test, common) { t.end(); }); + // This is to prevent a query like '30 west' from considering the 'west' part as an admin component + test('valid lingustic autocomplete with 3 tokens - first two are numeric', function (t) { + var query = generate({ + text: '1 1 three', + tokens: ['1', '2', 'three'], + tokens_complete: ['1', '2'], + tokens_incomplete: ['three'] + }); + + var compiled = JSON.parse(JSON.stringify(query)); + var expected = require('../fixture/autocomplete_linguistic_multiple_tokens_complete_numeric'); + + t.deepEqual(compiled.type, 'autocomplete', 'query type set'); + t.deepEqual(compiled.body, expected, 'autocomplete_linguistic_multiple_tokens_complete_numeric'); + t.end(); + }); + test('valid lingustic autocomplete with comma delimited admin section', function(t) { var query = generate({ text: 'one two, three', From 866c479bbd3c3f1d07e2d518d9174a068dbe6ba3 Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Wed, 25 Sep 2019 11:19:07 -0400 Subject: [PATCH 50/55] Add context to pelias parser logs --- sanitizer/_text_pelias_parser.js | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index ec619f3d2..b0a10e24d 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -58,8 +58,10 @@ function parse (clean) { // log summary info 
logger.info('pelias_parser', { - took: (new Date()) - start, - solutions: t.solution.length + response_time: (new Date()) - start, + params: clean, + solutions: t.solution.length, + text_length: _.get(clean, 'text.length', 0) }); // only use the first solution generated From c0749a0cf78a5118e6a62b1774d912d31510e67d Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Wed, 25 Sep 2019 12:43:17 -0400 Subject: [PATCH 51/55] Pin to pelias-parser-1.38.0 for now --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index b62eb01fc..1d90bdf44 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "^1.36.0", + "pelias-parser": "1.38.0", "pelias-query": "^9.14.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From 97f6496ac99a9d909e7e876609b2c7c898b09c96 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 1 Oct 2019 15:16:45 +0200 Subject: [PATCH 52/55] refactor(pelias_parser): add code comments relating to "add_name_to_multimatch", clean up related code --- query/autocomplete.js | 18 +++++++++++++++--- query/autocomplete_defaults.js | 7 ++++++- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/query/autocomplete.js b/query/autocomplete.js index 454a96e62..537c22574 100644 --- a/query/autocomplete.js +++ b/query/autocomplete.js @@ -24,7 +24,16 @@ var views = { // add abbrevations for the fields pelias/parser is able to detect. var adminFields = placeTypes.concat(['locality_a', 'region_a', 'country_a']); -// add name field to improve venue matching +// add some name field(s) to the admin fields in order to improve venue matching +// note: this is a bit of a hacky way to add a 'name' field to the list +// of multimatch fields normally reserved for admin subquerying. 
+// in some cases we are not sure if certain tokens refer to admin components +// or are part of the place name (such as some venue names). +// the variable name 'add_name_to_multimatch' is arbitrary, it can be any value so +// long as there is a corresponding 'admin:*:field' variable set which defines +// the name of the field to use. +// this functionality is not enabled unless the 'input:add_name_to_multimatch' +// variable is set to a non-empty value at query-time. adminFields = adminFields.concat(['add_name_to_multimatch']); //------------------------------ @@ -168,11 +177,14 @@ function generateQuery( clean ){ textParser( clean, vs ); } + // set the 'add_name_to_multimatch' variable only in the case where one + // or more of the admin variables are set. + // the value 'enabled' is not relevant, it just needs to be any non-empty + // value so that the associated field is added to the multimatch query. + // see code comments above for additional information. let isAdminSet = adminFields.some(field => vs.isset('input:' + field)); if ( isAdminSet ){ vs.var('input:add_name_to_multimatch', 'enabled'); } - vs.var('admin:add_name_to_multimatch:field', 'name.default'); - return { type: 'autocomplete', body: query.render(vs) diff --git a/query/autocomplete_defaults.js b/query/autocomplete_defaults.js index e2d7e3ee3..e78e56800 100644 --- a/query/autocomplete_defaults.js +++ b/query/autocomplete_defaults.js @@ -123,6 +123,11 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'admin:borough:boost': 1, 'admin:borough:cutoff_frequency': 0.01, + // an additional 'name' field to add to admin multi-match queries. + // this is used to improve venue matching in cases where the we + // are unsure if the tokens represent admin or name components. 
+ 'admin:add_name_to_multimatch:field': 'name.default', + 'popularity:field': 'popularity', 'popularity:modifier': 'log1p', 'popularity:max_boost': 20, @@ -139,4 +144,4 @@ module.exports = _.merge({}, peliasQuery.defaults, { 'custom:boosting:max_boost': 50, // maximum boosting which can be applied (max_boost/boost = max_score) 'custom:boosting:score_mode': 'sum', // sum all function scores before multiplying the boost 'custom:boosting:boost_mode': 'multiply' // this mode is not relevant because there is no query section -}); +}); \ No newline at end of file From b2d3b160c41dcfcfba937992c4016cd1a6e420e5 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 1 Oct 2019 15:23:40 +0200 Subject: [PATCH 53/55] refactor(pelias_parser): remove disused code/comments --- query/search_addressit.js | 5 ----- query/text_parser_pelias.js | 19 +------------------ sanitizer/_tokenizer.js | 8 -------- 3 files changed, 1 insertion(+), 31 deletions(-) diff --git a/query/search_addressit.js b/query/search_addressit.js index 44151f16f..7fc4af306 100644 --- a/query/search_addressit.js +++ b/query/search_addressit.js @@ -35,11 +35,6 @@ query.score( peliasQuery.view.address('cross_street') ); query.score( peliasQuery.view.address('postcode') ); // admin components -// country_a and region_a are left as matches here because the text-analyzer -// can sometimes detect them, in which case a query more specific than a -// multi_match is appropriate. 
-// query.score( peliasQuery.view.admin('country_a') ); -// query.score( peliasQuery.view.admin('region_a') ); query.score( peliasQuery.view.admin_multi_match(adminFields, 'peliasAdmin') ); query.score( views.custom_boosts( config.customBoosts ) ); diff --git a/query/text_parser_pelias.js b/query/text_parser_pelias.js index 141ad2df6..8a43ae0b0 100644 --- a/query/text_parser_pelias.js +++ b/query/text_parser_pelias.js @@ -40,24 +40,7 @@ function addParsedVariablesToQueryVariables(clean, vs) { vs.var('input:postcode', clean.parsed_text.postcode); } - // ==== add parsed matches [admin components] ==== - - // // locality - // if (!_.isEmpty(clean.parsed_text.locality)) { - // vs.var('input:locality', clean.parsed_text.locality); - // } - - // // region - // if (!_.isEmpty(clean.parsed_text.region)) { - // vs.var('input:region', clean.parsed_text.region); - // } - - // // country - // if (!_.isEmpty(clean.parsed_text.country)) { - // vs.var('input:country', clean.parsed_text.country); - // } - - // postfix + // ==== add admin components [postfix] ==== if (!_.isEmpty(clean.parsed_text.admin)) { // assign postfix to any admin fields which currently don't have a value assigned. 
diff --git a/sanitizer/_tokenizer.js b/sanitizer/_tokenizer.js index 8e58105f1..cc6008571 100644 --- a/sanitizer/_tokenizer.js +++ b/sanitizer/_tokenizer.js @@ -43,14 +43,6 @@ function _sanitize( raw, clean ){ else if (!clean.text.endsWith(text)) { parserConsumedAllTokens = true; } - - // when $subject exactly equals one of the admin fields - // else if ( - // text === clean.parsed_text.locality || - // text === clean.parsed_text.region || - // text === clean.parsed_text.country) { - // parserConsumedAllTokens = true; - // } } } From 1e1cf245c2f07caf0b780b74db7392be53147589 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 1 Oct 2019 15:44:25 +0200 Subject: [PATCH 54/55] feat(pelias_parser): completely remove "addressit" and references to it --- middleware/confidenceScore.js | 2 +- package.json | 3 +- query/search.js | 2 +- ...h_addressit.js => search_pelias_parser.js} | 9 +- query/text_parser_addressit.js | 99 ---- routes/v1.js | 10 +- sanitizer/_address_layer_filter.js | 2 +- sanitizer/_text_addressit.js | 123 ----- .../fixture/search_with_custom_boosts.json | 2 +- test/unit/middleware/confidenceScore.js | 12 +- ...h_addressit.js => search_pelias_parser.js} | 26 +- test/unit/query/search_with_custom_boosts.js | 2 +- test/unit/run.js | 3 +- test/unit/sanitizer/_address_layer_filter.js | 4 +- test/unit/sanitizer/_text_addressit.js | 429 ------------------ 15 files changed, 37 insertions(+), 691 deletions(-) rename query/{search_addressit.js => search_pelias_parser.js} (93%) delete mode 100644 query/text_parser_addressit.js delete mode 100644 sanitizer/_text_addressit.js rename test/unit/query/{search_addressit.js => search_pelias_parser.js} (87%) delete mode 100644 test/unit/sanitizer/_text_addressit.js diff --git a/middleware/confidenceScore.js b/middleware/confidenceScore.js index c132868b5..111513eac 100644 --- a/middleware/confidenceScore.js +++ b/middleware/confidenceScore.js @@ -29,7 +29,7 @@ function computeScores(req, res, next) { // do nothing if no 
result data set or if query is not of the original variety if (check.undefined(req.clean) || check.undefined(res) || check.undefined(res.data) || check.undefined(res.meta) || - res.meta.query_type !== 'search_addressit') { + res.meta.query_type !== 'search_pelias_parser') { return next(); } diff --git a/package.json b/package.json index 1d90bdf44..1cf56c48e 100644 --- a/package.json +++ b/package.json @@ -36,8 +36,8 @@ "node": ">=8.0.0" }, "dependencies": { + "@hapi/joi": "^15.0.0", "@mapbox/geojson-extent": "^0.3.1", - "addressit": "1.7.0", "async": "^3.0.1", "check-types": "^10.0.0", "elasticsearch": "^16.0.0", @@ -45,7 +45,6 @@ "geojson": "^0.5.0", "geolib": "^3.0.0", "iso-639-3": "^1.0.0", - "@hapi/joi": "^15.0.0", "locale": "^0.1.0", "lodash": "^4.17.4", "markdown": "^0.5.0", diff --git a/query/search.js b/query/search.js index 2cbe93310..04a4bcc8c 100644 --- a/query/search.js +++ b/query/search.js @@ -129,7 +129,7 @@ function getQuery(vs) { }; } - // returning undefined is a signal to a later step that the addressit-parsed + // returning undefined is a signal to a later step that a fallback parser // query should be queried for return undefined; diff --git a/query/search_addressit.js b/query/search_pelias_parser.js similarity index 93% rename from query/search_addressit.js rename to query/search_pelias_parser.js index 7fc4af306..418e9cc1e 100644 --- a/query/search_addressit.js +++ b/query/search_pelias_parser.js @@ -8,10 +8,9 @@ const config = require('pelias-config').generate().api; var placeTypes = require('../helper/placeTypes'); var views = { custom_boosts: require('./view/boost_sources_and_layers') }; -// region_a is also an admin field. addressit tries to detect -// region_a, in which case we use a match query specifically for it. -// but address it doesn't know about all of them so it helps to search -// against this with the other admin parts as a fallback +// region_a is also an admin field which can be identified by +// the pelias_parser. 
this functionality was inherited from the +// previous parser we used prior to the creation of pelias_parser. var adminFields = placeTypes.concat(['region_a']); //------------------------------ @@ -138,7 +137,7 @@ function generateQuery( clean ){ } return { - type: 'search_addressit', + type: 'search_pelias_parser', body: query.render(vs) }; } diff --git a/query/text_parser_addressit.js b/query/text_parser_addressit.js deleted file mode 100644 index 65c7ea775..000000000 --- a/query/text_parser_addressit.js +++ /dev/null @@ -1,99 +0,0 @@ -var logger = require('pelias-logger').get('api'); -var placeTypes = require('../helper/placeTypes'); - -/* -This list should only contain admin fields we are comfortable matching in the case -when we can't identify parts of an address. This shouldn't contain fields like country_a -or postalcode because we should only try to match those when we're sure that's what they are. - */ -var adminFields = placeTypes.concat([ - 'region_a' -]); - -/** - @todo: refactor me -**/ - -// all the address parsing logic -function addParsedVariablesToQueryVariables( clean, vs ){ - - // is it a street address? - var isStreetAddress = clean.parsed_text.hasOwnProperty('number') && clean.parsed_text.hasOwnProperty('street'); - if( isStreetAddress ){ - vs.var( 'input:name', clean.parsed_text.number + ' ' + clean.parsed_text.street ); - } - - // if the 'naive parser' was used, input is equal to 'name' - // see: 'sanitizer/_text_addressit.js' function 'naive' - else if (clean.parsed_text.admin_parts && clean.parsed_text.name ) { - vs.var( 'input:name', clean.parsed_text.name ); - } - - // ? 
- else { - logger.warn( 'chaos monkey asks: what happens now?', { - params: clean - }); - } - - // ==== add parsed matches [address components] ==== - - // house number - if( clean.parsed_text.hasOwnProperty('number') ){ - vs.var( 'input:housenumber', clean.parsed_text.number ); - } - - // street name - if( clean.parsed_text.hasOwnProperty('street') ){ - vs.var( 'input:street', clean.parsed_text.street ); - } - - // postal code - if( clean.parsed_text.hasOwnProperty('postalcode') ){ - vs.var( 'input:postcode', clean.parsed_text.postalcode ); - } - - // ==== add parsed matches [admin components] ==== - - // city - if( clean.parsed_text.hasOwnProperty('city') ){ - vs.var( 'input:county', clean.parsed_text.city ); - } - - // state - if( clean.parsed_text.hasOwnProperty('state') ){ - vs.var( 'input:region_a', clean.parsed_text.state ); - } - - // country - if( clean.parsed_text.hasOwnProperty('country') ){ - vs.var( 'input:country_a', clean.parsed_text.country ); - } - - // ==== deal with the 'leftover' components ==== - // @todo: clean up this code - - // a concept called 'leftovers' which is just 'admin_parts' /or 'regions'. - var leftoversString = ''; - if( clean.parsed_text.hasOwnProperty('admin_parts') ){ - leftoversString = clean.parsed_text.admin_parts; - } - else if( clean.parsed_text.hasOwnProperty('regions') ){ - leftoversString = clean.parsed_text.regions.join(' '); - } - - // if we have 'leftovers' then assign them to any fields which - // currently don't have a value assigned. 
- if( leftoversString.length ){ - - // cycle through fields and set fields which - // are still currently unset - adminFields.forEach( function( key ){ - if( !vs.isset( 'input:' + key ) ){ - vs.var( 'input:' + key, leftoversString ); - } - }); - } -} - -module.exports = addParsedVariablesToQueryVariables; diff --git a/routes/v1.js b/routes/v1.js index aaf52004f..3290ec5fe 100644 --- a/routes/v1.js +++ b/routes/v1.js @@ -38,7 +38,7 @@ var controllers = { var queries = { cascading_fallback: require('../query/search'), - search_addressit: require('../query/search_addressit'), + search_pelias_parser: require('../query/search_pelias_parser'), structured_geocoding: require('../query/structured_geocoding'), reverse: require('../query/reverse'), autocomplete: require('../query/autocomplete'), @@ -230,8 +230,8 @@ function addRoutes(app, peliasConfig) { not(hasResponseData) ); - // call search addressit query if addressit was the parser - const searchAddressitShouldExecute = all( + // call search_pelias_parser query if pelias_parser was the parser + const searchPeliasParserShouldExecute = all( not(hasRequestErrors), isPeliasParse ); @@ -288,11 +288,11 @@ function addRoutes(app, peliasConfig) { controllers.libpostal(libpostalService, libpostalShouldExecute), controllers.placeholder(placeholderService, geometricFiltersApply, placeholderGeodisambiguationShouldExecute), controllers.placeholder(placeholderService, geometricFiltersApply, placeholderIdsLookupShouldExecute), - // try 3 different query types: address search using ids, cascading fallback, addressit + // try 3 different query types: address search using ids, cascading fallback, pelias parser controllers.search(peliasConfig.api, esclient, queries.address_using_ids, searchWithIdsShouldExecute), controllers.search(peliasConfig.api, esclient, queries.cascading_fallback, fallbackQueryShouldExecute), sanitizers.defer_to_pelias_parser(shouldDeferToPeliasParser), //run additional sanitizers needed for pelias parser - 
controllers.search(peliasConfig.api, esclient, queries.search_addressit, searchAddressitShouldExecute), + controllers.search(peliasConfig.api, esclient, queries.search_pelias_parser, searchPeliasParserShouldExecute), postProc.trimByGranularity(), postProc.distances('focus.point.'), postProc.confidenceScores(peliasConfig.api), diff --git a/sanitizer/_address_layer_filter.js b/sanitizer/_address_layer_filter.js index 3b6febd2d..c4e545798 100644 --- a/sanitizer/_address_layer_filter.js +++ b/sanitizer/_address_layer_filter.js @@ -60,7 +60,7 @@ function _setup(tm) { input = clean.parsed_text.subject; } - // if 'addressit' or 'libpostal' identified input as a street address + // if 'pelias_parser' or 'libpostal' identified input as a street address else if (isStreetAddress) { input = clean.parsed_text.number + ' ' + clean.parsed_text.street; } diff --git a/sanitizer/_text_addressit.js b/sanitizer/_text_addressit.js deleted file mode 100644 index 4c674218a..000000000 --- a/sanitizer/_text_addressit.js +++ /dev/null @@ -1,123 +0,0 @@ -const addressit = require('addressit'); -const _ = require('lodash'); -const logger = require('pelias-logger').get('api'); -const MAX_TEXT_LENGTH = 140; - -/** - this module provides extremely basic parsing using two methods. - - note: this code is old and well due for a makover/replacement, we - are not happy with either of these methods but they remain in place - for purely legacy reasons. 
- - 'naive parser' provides the following fields: - 'name', 'admin_parts' - - 'addressit parser' provides the following fields: - 'unit', 'number', 'street', 'state', 'country', 'postalcode', 'regions' -**/ - -// ref: https://en.wikipedia.org/wiki/Quotation_mark -const QUOTES = `"'«»‘’‚‛“”„‟‹›⹂「」『』〝〞〟﹁﹂﹃﹄"'「」`; -const DELIM = ','; -const ADDRESSIT_MIN_CHAR_LENGTH = 4; - -// validate texts, convert types and apply defaults -function _sanitize( raw, clean ){ - - // error & warning messages - var messages = { errors: [], warnings: [] }; - - // remove superfluous whitespace & quotes - let text = _.trim( _.trim( raw.text ), QUOTES ); - - // validate input 'text' - if (!_.isString(text) || _.isEmpty(text)) { - messages.errors.push(`invalid param 'text': text length, must be >0`); - } - - // valid input 'text' - else { - - // truncate text to $MAX_TEXT_LENGTH chars - if (text.length > MAX_TEXT_LENGTH) { - messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`); - text = text.substring(0, MAX_TEXT_LENGTH); - } - - // parse text with query parser - clean.text = text; - clean.parser = 'addressit'; - clean.parsed_text = parse(clean); - } - - return messages; -} - -// naive approach - for admin matching during query time -// split 'flatiron, new york, ny' into 'flatiron' and 'new york, ny' -var naive = function(tokens) { - var parsed_text = {}; - - if( tokens.length > 1 ){ - parsed_text.name = tokens[0]; - - // 1. slice away all parts after the first one - // 2. trim spaces from each part just in case - // 3. 
join the parts back together with appropriate delimiter and spacing - parsed_text.admin_parts = tokens.slice(1).join(`${DELIM} `); - } - - return parsed_text; -}; - -function parse(clean) { - - // split query on delimiter, trim tokens and remove empty elements - var tokens = clean.text.split(DELIM) - .map( part => part.trim() ) - .filter( part => part.length > 0 ); - - // call the naive parser to try and split tokens - var parsed_text = naive(tokens); - - // join tokens back togther with normalized delimiters - var joined = tokens.join(`${DELIM} `); - - // query addressit - perform full address parsing - // except on queries so short they obviously can't contain an address - if( joined.length >= ADDRESSIT_MIN_CHAR_LENGTH ) { - var parsed = addressit(joined); - - // copy fields from addressit response to parsed_text - for( var attr in parsed ){ - if( 'text' === attr ){ continue; } // ignore 'text' - if( !_.isEmpty( parsed[ attr ] ) && _.isUndefined( parsed_text[ attr ] ) ){ - parsed_text[ attr ] = parsed[ attr ]; - } - } - } - - // if all we found was regions, ignore it as it is not enough information to make smarter decisions - if( Object.keys(parsed_text).length === 1 && !_.isUndefined(parsed_text.regions) ){ - logger.info('Ignoring address parser output, regions only', { - parsed: parsed_text, - params: clean - }); - - // return empty parsed_text - return {}; - } - - return parsed_text; -} - -function _expected(){ - return [{ name: 'text' }]; -} - -// export function -module.exports = () => ({ - sanitize: _sanitize, - expected: _expected -}); diff --git a/test/unit/fixture/search_with_custom_boosts.json b/test/unit/fixture/search_with_custom_boosts.json index 6da91c123..a17912d01 100644 --- a/test/unit/fixture/search_with_custom_boosts.json +++ b/test/unit/fixture/search_with_custom_boosts.json @@ -1,5 +1,5 @@ { - "type": "search_addressit", + "type": "search_pelias_parser", "body": { "query": { "bool": { diff --git a/test/unit/middleware/confidenceScore.js 
b/test/unit/middleware/confidenceScore.js index 36a505c0c..f9386d552 100644 --- a/test/unit/middleware/confidenceScore.js +++ b/test/unit/middleware/confidenceScore.js @@ -47,7 +47,7 @@ module.exports.tests.confidenceScore = function(test, common) { }], meta: { scores: [10], - query_type: 'search_addressit' + query_type: 'search_pelias_parser' } }; @@ -89,7 +89,7 @@ module.exports.tests.confidenceScore = function(test, common) { }], meta: { scores: [10], - query_type: 'search_addressit' + query_type: 'search_pelias_parser' } }; @@ -125,7 +125,7 @@ module.exports.tests.confidenceScore = function(test, common) { }], meta: { scores: [10], - query_type: 'search_addressit' + query_type: 'search_pelias_parser' } }; @@ -134,7 +134,7 @@ module.exports.tests.confidenceScore = function(test, common) { t.end(); }); - test('should only work for search_addressit query_type', function(t) { + test('should only work for search_pelias_parser query_type', function(t) { var req = { clean: { text: '123 Main St, City, NM', @@ -191,7 +191,7 @@ module.exports.tests.confidenceScore = function(test, common) { }], meta: { scores: [10], - query_type: 'search_addressit' + query_type: 'search_pelias_parser' } }; @@ -223,7 +223,7 @@ module.exports.tests.confidenceScore = function(test, common) { }], meta: { scores: [10], - query_type: 'search_addressit' + query_type: 'search_pelias_parser' } }; diff --git a/test/unit/query/search_addressit.js b/test/unit/query/search_pelias_parser.js similarity index 87% rename from test/unit/query/search_addressit.js rename to test/unit/query/search_pelias_parser.js index 88b7a0655..814fd2836 100644 --- a/test/unit/query/search_addressit.js +++ b/test/unit/query/search_pelias_parser.js @@ -6,7 +6,7 @@ const defaultPeliasConfig = { } }; -var generate = proxyquire('../../../query/search_addressit', { +var generate = proxyquire('../../../query/search_pelias_parser', { 'pelias-config': defaultPeliasConfig }); @@ -34,7 +34,7 @@ module.exports.tests.query = 
function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_bbox_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_linguistic_focus_bbox_original'); t.end(); }); @@ -52,7 +52,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_bbox_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_linguistic_bbox'); t.end(); }); @@ -66,7 +66,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_only_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_linguistic_only'); t.end(); }); @@ -81,7 +81,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_linguistic_focus'); t.end(); }); @@ -96,7 +96,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_linguistic_focus_null_island_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); 
t.deepEqual(compiled.body, expected, 'search_linguistic_focus_null_island'); t.end(); }); @@ -119,7 +119,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_full_address_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_full_address'); t.end(); }); @@ -139,7 +139,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_partial_address_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_partial_address'); t.end(); }); @@ -161,7 +161,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_regions_address_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_regions_address'); t.end(); }); @@ -176,7 +176,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_boundary_country_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search: valid boundary.country query'); t.end(); }); @@ -190,7 +190,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_with_source_filtering_original'); - t.deepEqual(compiled.type, 
'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search: valid search query with source filtering'); t.end(); }); @@ -204,7 +204,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_with_category_filtering_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'correct search_with_category_filtering_original query'); t.end(); }); @@ -219,7 +219,7 @@ module.exports.tests.query = function(test, common) { var compiled = JSON.parse( JSON.stringify( query ) ); var expected = require('../fixture/search_boundary_gid_original'); - t.deepEqual(compiled.type, 'search_addressit', 'query type set'); + t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search: valid boundary.gid filter'); t.end(); }); diff --git a/test/unit/query/search_with_custom_boosts.js b/test/unit/query/search_with_custom_boosts.js index 911c5d055..cd2efdf33 100644 --- a/test/unit/query/search_with_custom_boosts.js +++ b/test/unit/query/search_with_custom_boosts.js @@ -31,7 +31,7 @@ module.exports.tests.query = function(test, common) { var expected_query = require('../fixture/search_with_custom_boosts.json'); - const search_query_module = proxyquire('../../../query/search_addressit', { + const search_query_module = proxyquire('../../../query/search_pelias_parser', { 'pelias-config': config_with_boosts }); diff --git a/test/unit/run.js b/test/unit/run.js index 1d38d5ac6..82e94cceb 100644 --- a/test/unit/run.js +++ b/test/unit/run.js @@ -71,7 +71,7 @@ var tests = [ require('./query/search'), require('./query/search_with_custom_boosts'), require('./query/search_defaults'), - require('./query/search_addressit'), + 
require('./query/search_pelias_parser'), require('./query/structured_geocoding'), require('./query/text_parser'), require('./query/view/boost_sources_and_layers'), @@ -97,7 +97,6 @@ var tests = [ require('./sanitizer/_address_layer_filter'), require('./sanitizer/_synthesize_analysis'), require('./sanitizer/_text'), - require('./sanitizer/_text_addressit'), require('./sanitizer/_text_pelias_parser'), require('./sanitizer/_tokenizer'), require('./sanitizer/_categories'), diff --git a/test/unit/sanitizer/_address_layer_filter.js b/test/unit/sanitizer/_address_layer_filter.js index 5543a7293..681096843 100644 --- a/test/unit/sanitizer/_address_layer_filter.js +++ b/test/unit/sanitizer/_address_layer_filter.js @@ -120,14 +120,14 @@ module.exports.tests.parsed_text = function (test, common) { t.end(); }); - test('addressit/libpostal - do not apply filter for numeric addresses', (t) => { + test('pelias_parser/libpostal - do not apply filter for numeric addresses', (t) => { let clean = { text: 'A', parsed_text: { number: '1', street: 'Main St' } }; t.deepEqual(s.sanitize(null, clean), NO_MESSAGES); t.false(clean.layers); t.end(); }); - test('addressit/libpostal - apply filter for non-numeric addresses', (t) => { + test('pelias_parser/libpostal - apply filter for non-numeric addresses', (t) => { let clean = { text: 'A', parsed_text: { number: 'Foo', street: 'Main St' } }; t.deepEqual(s.sanitize(null, clean), STD_MESSAGES); t.deepEqual(clean.layers, ['A', 'B', 'C']); diff --git a/test/unit/sanitizer/_text_addressit.js b/test/unit/sanitizer/_text_addressit.js deleted file mode 100644 index ee071567c..000000000 --- a/test/unit/sanitizer/_text_addressit.js +++ /dev/null @@ -1,429 +0,0 @@ -var sanitizer = require('../../../sanitizer/_text_addressit')(); -var type_mapping = require('../../../helper/type_mapping'); - -module.exports.tests = {}; - -module.exports.tests.text_parser = function(test, common) { - test('short input text has admin layers set ', function(t) { - var raw = 
{ - text: 'emp' //start of empire state building - }; - var clean = { - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEquals(messages.errors, [], 'no errors'); - t.deepEquals(messages.warnings, [], 'no warnings'); - - t.end(); - }); - - var usQueries = [ - { name: 'soho', admin_parts: 'new york', state: 'NY' }, - { name: '123 main', admin_parts: 'new york', state: 'NY' } - ]; - - usQueries.forEach(function (query) { - test('naive parsing ' + query, function(t) { - var raw = { - text: query.name + ', ' + query.admin_parts - }; - var clean = {}; - - var expected_clean = { - text: raw.text.trim(), - parser: 'addressit', - parsed_text: { - name: query.name, - regions: [ query.name ], - admin_parts: query.admin_parts, - state: query.state - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('naive parsing ' + query + ' without spaces', function(t) { - var raw = { - text: query.name + ',' + query.admin_parts - }; - var clean = {}; - - var expected_clean = { - text: raw.text.trim(), - parser: 'addressit', - parsed_text: { - name: query.name, - regions: [ query.name ], - admin_parts: query.admin_parts, - state: query.state - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('naive parsing ' + query + ' with leading and trailing junk', function(t) { - var raw = { - text: ' , ' + query.name + ',' + query.admin_parts + ' , ' - }; - var clean = {}; - - var expected_clean = { - text: raw.text.trim(), - parser: 'addressit', - parsed_text: { - name: query.name, - regions: [ query.name ], - admin_parts: query.admin_parts, - state: query.state - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - 
t.end(); - - }); - }); - - var nonUSQueries = [ - { name: 'chelsea', admin_parts: 'london' }, - ]; - - nonUSQueries.forEach(function (query) { - test('naive parsing ' + query, function(t) { - var raw = { - text: query.name + ', ' + query.admin_parts - }; - var clean = {}; - - var expected_clean = { - text: query.name + ', ' + query.admin_parts, - parser: 'addressit', - parsed_text: { - name: query.name, - regions: [ query.name, query.admin_parts ], - admin_parts: query.admin_parts - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('naive parsing ' + query + ' without spaces', function(t) { - var raw = { - text: query.name + ',' + query.admin_parts - }; - var clean = {}; - - var expected_clean = { - text: query.name + ',' + query.admin_parts, - parser: 'addressit', - parsed_text: { - name: query.name, - regions: [ query.name, query.admin_parts ], - admin_parts: query.admin_parts - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - }); - - test('query with one token', function (t) { - var raw = { - text: 'yugolsavia' - }; - var clean = {}; - clean.parsed_text = 'this should be removed'; - - var expected_clean = { - parser: 'addressit', - text: 'yugolsavia', - parsed_text: {} - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('query with two tokens, no numbers', function (t) { - var raw = { - text: 'small town' - }; - var clean = {}; - clean.parsed_text = 'this should be removed'; - - var expected_clean = { - parser: 'addressit', - text: 'small town', - parsed_text: {} - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - 
t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('query with two tokens, number first', function (t) { - var raw = { - text: '123 main' - }; - var clean = {}; - clean.parsed_text = 'this should be removed'; - - var expected_clean = { - parser: 'addressit', - text: '123 main', - parsed_text: {} - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('query with two tokens, number second', function (t) { - var raw = { - text: 'main 123' - }; - var clean = {}; - clean.parsed_text = 'this should be removed'; - - var expected_clean = { - parser: 'addressit', - text: 'main 123', - parsed_text: {} - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('query with many tokens', function(t) { - var raw = { - text: 'main particle new york' - }; - var clean = {}; - clean.parsed_text = 'this should be removed'; - - var expected_clean = { - text: 'main particle new york', - parser: 'addressit', - parsed_text: { - regions: [ 'main particle' ], - state: 'NY' - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('valid address, house number', function(t) { - var raw = { - text: '123 main st new york ny' - }; - var clean = {}; - - var expected_clean = { - text: '123 main st new york ny', - parser: 'addressit', - parsed_text: { - number: '123', - street: 'main st', - state: 'NY', - regions: [ 'new york' ] - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('valid address, zipcode', function(t) { - var raw = { - text: '123 main st new york ny 10010' - }; - var clean = 
{}; - - var expected_clean = { - text: '123 main st new york ny 10010', - parser: 'addressit', - parsed_text: { - number: '123', - street: 'main st', - state: 'NY', - postalcode: '10010', - regions: [ 'new york' ] - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - }); - - test('valid address with leading 0s in zipcode', function(t) { - var raw = { - text: '339 W Main St, Cheshire, 06410' - }; - var clean = {}; - - var expected_clean = { - text: '339 W Main St, Cheshire, 06410', - parser: 'addressit', - parsed_text: { - name: '339 W Main St', - number: '339', - street: 'W Main St', - postalcode: '06410', - regions: [ 'Cheshire' ], - admin_parts: 'Cheshire, 06410' - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - }); - - test('valid address without spaces after commas', function(t) { - var raw = { - text: '339 W Main St,Lancaster,PA' - }; - var clean = {}; - - var expected_clean = { - text: '339 W Main St,Lancaster,PA', - parser: 'addressit', - parsed_text: { - name: '339 W Main St', - number: '339', - street: 'W Main St', - state: 'PA', - regions: [ 'Lancaster' ], - admin_parts: 'Lancaster, PA' - } - }; - - var messages = sanitizer.sanitize(raw, clean); - - t.deepEqual(messages, { errors: [], warnings: [] } ); - t.deepEqual(clean, expected_clean); - t.end(); - - }); - - test('whitespace-only input counts as empty', (t) => { - const raw = { text: ' ' }; - const clean = {}; - - const expected_clean = {}; - - const messages = sanitizer.sanitize(raw, clean); - - t.deepEquals(clean, expected_clean); - t.deepEquals(messages.errors, ['invalid param \'text\': text length, must be >0']); - t.deepEquals(messages.warnings, [], 'no warnings'); - t.end(); - }); - - test('return an array of expected parameters in object form for validation', (t) 
=> { - const expected = [{ name: 'text' }]; - const validParameters = sanitizer.expected(); - t.deepEquals(validParameters, expected); - t.end(); - }); - - test('Australia - state only', (t) => { - const raw = { text: 'NSW' }; - const clean = {}; - const expected_clean = { text: 'NSW', parser: 'addressit', parsed_text: {} }; - const messages = sanitizer.sanitize(raw, clean); - - t.deepEquals(clean, expected_clean); - t.deepEquals(messages.errors, []); - t.deepEquals(messages.warnings, [], 'no warnings'); - t.end(); - }); - - test('should truncate very long text inputs', (t) => { - const raw = { - text: ` -Sometimes we make the process more complicated than we need to. -We will never make a journey of a thousand miles by fretting about -how long it will take or how hard it will be. -We make the journey by taking each day step by step and then repeating -it again and again until we reach our destination.` }; - const clean = {}; - const messages = sanitizer.sanitize(raw, clean); - - t.equals(clean.text.length, 140); - t.deepEquals(messages.errors, [], 'no errors'); - t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]); - t.end(); - }); -}; - -module.exports.all = function (tape, common) { - function test(name, testFunction) { - return tape('sanitizer _text: ' + name, testFunction); - } - - for( var testCase in module.exports.tests ){ - module.exports.tests[testCase](test, common); - } -}; From 88656e0a793663a673a4e2eb3ba545f590cb68d9 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 1 Oct 2019 16:00:25 +0200 Subject: [PATCH 55/55] refactor(pelias_parser): remove references to "original style queries" --- middleware/confidenceScore.js | 2 +- ... search_pelias_parser_boundary_country.js} | 0 ...s => search_pelias_parser_boundary_gid.js} | 0 ...s => search_pelias_parser_full_address.js} | 0 ...> search_pelias_parser_linguistic_bbox.js} | 0 ... 
search_pelias_parser_linguistic_focus.js} | 0 ...ch_pelias_parser_linguistic_focus_bbox.js} | 0 ...as_parser_linguistic_focus_null_island.js} | 0 ...> search_pelias_parser_linguistic_only.js} | 0 ...> search_pelias_parser_partial_address.js} | 0 ...> search_pelias_parser_regions_address.js} | 0 ..._pelias_parser_with_category_filtering.js} | 0 ...ch_pelias_parser_with_source_filtering.js} | 0 test/unit/query/search_pelias_parser.js | 28 +++++++++---------- 14 files changed, 15 insertions(+), 15 deletions(-) rename test/unit/fixture/{search_boundary_country_original.js => search_pelias_parser_boundary_country.js} (100%) rename test/unit/fixture/{search_boundary_gid_original.js => search_pelias_parser_boundary_gid.js} (100%) rename test/unit/fixture/{search_full_address_original.js => search_pelias_parser_full_address.js} (100%) rename test/unit/fixture/{search_linguistic_bbox_original.js => search_pelias_parser_linguistic_bbox.js} (100%) rename test/unit/fixture/{search_linguistic_focus_original.js => search_pelias_parser_linguistic_focus.js} (100%) rename test/unit/fixture/{search_linguistic_focus_bbox_original.js => search_pelias_parser_linguistic_focus_bbox.js} (100%) rename test/unit/fixture/{search_linguistic_focus_null_island_original.js => search_pelias_parser_linguistic_focus_null_island.js} (100%) rename test/unit/fixture/{search_linguistic_only_original.js => search_pelias_parser_linguistic_only.js} (100%) rename test/unit/fixture/{search_partial_address_original.js => search_pelias_parser_partial_address.js} (100%) rename test/unit/fixture/{search_regions_address_original.js => search_pelias_parser_regions_address.js} (100%) rename test/unit/fixture/{search_with_category_filtering_original.js => search_pelias_parser_with_category_filtering.js} (100%) rename test/unit/fixture/{search_with_source_filtering_original.js => search_pelias_parser_with_source_filtering.js} (100%) diff --git a/middleware/confidenceScore.js b/middleware/confidenceScore.js index 
111513eac..c75b88c58 100644 --- a/middleware/confidenceScore.js +++ b/middleware/confidenceScore.js @@ -26,7 +26,7 @@ function setup(peliasConfig) { } function computeScores(req, res, next) { - // do nothing if no result data set or if query is not of the original variety + // do nothing if no result data set or if query is not of the pelias_parser variety if (check.undefined(req.clean) || check.undefined(res) || check.undefined(res.data) || check.undefined(res.meta) || res.meta.query_type !== 'search_pelias_parser') { diff --git a/test/unit/fixture/search_boundary_country_original.js b/test/unit/fixture/search_pelias_parser_boundary_country.js similarity index 100% rename from test/unit/fixture/search_boundary_country_original.js rename to test/unit/fixture/search_pelias_parser_boundary_country.js diff --git a/test/unit/fixture/search_boundary_gid_original.js b/test/unit/fixture/search_pelias_parser_boundary_gid.js similarity index 100% rename from test/unit/fixture/search_boundary_gid_original.js rename to test/unit/fixture/search_pelias_parser_boundary_gid.js diff --git a/test/unit/fixture/search_full_address_original.js b/test/unit/fixture/search_pelias_parser_full_address.js similarity index 100% rename from test/unit/fixture/search_full_address_original.js rename to test/unit/fixture/search_pelias_parser_full_address.js diff --git a/test/unit/fixture/search_linguistic_bbox_original.js b/test/unit/fixture/search_pelias_parser_linguistic_bbox.js similarity index 100% rename from test/unit/fixture/search_linguistic_bbox_original.js rename to test/unit/fixture/search_pelias_parser_linguistic_bbox.js diff --git a/test/unit/fixture/search_linguistic_focus_original.js b/test/unit/fixture/search_pelias_parser_linguistic_focus.js similarity index 100% rename from test/unit/fixture/search_linguistic_focus_original.js rename to test/unit/fixture/search_pelias_parser_linguistic_focus.js diff --git a/test/unit/fixture/search_linguistic_focus_bbox_original.js 
b/test/unit/fixture/search_pelias_parser_linguistic_focus_bbox.js similarity index 100% rename from test/unit/fixture/search_linguistic_focus_bbox_original.js rename to test/unit/fixture/search_pelias_parser_linguistic_focus_bbox.js diff --git a/test/unit/fixture/search_linguistic_focus_null_island_original.js b/test/unit/fixture/search_pelias_parser_linguistic_focus_null_island.js similarity index 100% rename from test/unit/fixture/search_linguistic_focus_null_island_original.js rename to test/unit/fixture/search_pelias_parser_linguistic_focus_null_island.js diff --git a/test/unit/fixture/search_linguistic_only_original.js b/test/unit/fixture/search_pelias_parser_linguistic_only.js similarity index 100% rename from test/unit/fixture/search_linguistic_only_original.js rename to test/unit/fixture/search_pelias_parser_linguistic_only.js diff --git a/test/unit/fixture/search_partial_address_original.js b/test/unit/fixture/search_pelias_parser_partial_address.js similarity index 100% rename from test/unit/fixture/search_partial_address_original.js rename to test/unit/fixture/search_pelias_parser_partial_address.js diff --git a/test/unit/fixture/search_regions_address_original.js b/test/unit/fixture/search_pelias_parser_regions_address.js similarity index 100% rename from test/unit/fixture/search_regions_address_original.js rename to test/unit/fixture/search_pelias_parser_regions_address.js diff --git a/test/unit/fixture/search_with_category_filtering_original.js b/test/unit/fixture/search_pelias_parser_with_category_filtering.js similarity index 100% rename from test/unit/fixture/search_with_category_filtering_original.js rename to test/unit/fixture/search_pelias_parser_with_category_filtering.js diff --git a/test/unit/fixture/search_with_source_filtering_original.js b/test/unit/fixture/search_pelias_parser_with_source_filtering.js similarity index 100% rename from test/unit/fixture/search_with_source_filtering_original.js rename to 
test/unit/fixture/search_pelias_parser_with_source_filtering.js diff --git a/test/unit/query/search_pelias_parser.js b/test/unit/query/search_pelias_parser.js index 814fd2836..7c2eba0c9 100644 --- a/test/unit/query/search_pelias_parser.js +++ b/test/unit/query/search_pelias_parser.js @@ -32,10 +32,10 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_linguistic_focus_bbox_original'); + var expected = require('../fixture/search_pelias_parser_linguistic_focus_bbox'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); - t.deepEqual(compiled.body, expected, 'search_linguistic_focus_bbox_original'); + t.deepEqual(compiled.body, expected, 'search_linguistic_focus_bbox'); t.end(); }); @@ -50,7 +50,7 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_linguistic_bbox_original'); + var expected = require('../fixture/search_pelias_parser_linguistic_bbox'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_linguistic_bbox'); @@ -64,7 +64,7 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_linguistic_only_original'); + var expected = require('../fixture/search_pelias_parser_linguistic_only'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_linguistic_only'); @@ -79,7 +79,7 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_linguistic_focus_original'); + var expected = require('../fixture/search_pelias_parser_linguistic_focus'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); 
t.deepEqual(compiled.body, expected, 'search_linguistic_focus'); @@ -94,7 +94,7 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_linguistic_focus_null_island_original'); + var expected = require('../fixture/search_pelias_parser_linguistic_focus_null_island'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_linguistic_focus_null_island'); @@ -117,7 +117,7 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_full_address_original'); + var expected = require('../fixture/search_pelias_parser_full_address'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_full_address'); @@ -137,7 +137,7 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_partial_address_original'); + var expected = require('../fixture/search_pelias_parser_partial_address'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_partial_address'); @@ -159,7 +159,7 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_regions_address_original'); + var expected = require('../fixture/search_pelias_parser_regions_address'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search_regions_address'); @@ -174,7 +174,7 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_boundary_country_original'); + var expected = 
require('../fixture/search_pelias_parser_boundary_country'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search: valid boundary.country query'); @@ -188,7 +188,7 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_with_source_filtering_original'); + var expected = require('../fixture/search_pelias_parser_with_source_filtering'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search: valid search query with source filtering'); @@ -202,10 +202,10 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_with_category_filtering_original'); + var expected = require('../fixture/search_pelias_parser_with_category_filtering'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); - t.deepEqual(compiled.body, expected, 'correct search_with_category_filtering_original query'); + t.deepEqual(compiled.body, expected, 'correct search_with_category_filtering query'); t.end(); }); @@ -217,7 +217,7 @@ module.exports.tests.query = function(test, common) { }); var compiled = JSON.parse( JSON.stringify( query ) ); - var expected = require('../fixture/search_boundary_gid_original'); + var expected = require('../fixture/search_pelias_parser_boundary_gid'); t.deepEqual(compiled.type, 'search_pelias_parser', 'query type set'); t.deepEqual(compiled.body, expected, 'search: valid boundary.gid filter');