From be13193229763d5cd94c3de0cfcf759a2f5c2255 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Tue, 12 Jan 2016 15:38:14 +0100 Subject: [PATCH 01/15] better remove_ordinals regex pattern --- integration/analyzer_peliasStreet.js | 39 ++++++++++++++++++++++++++++ settings.js | 4 +-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index 2ef143af..27c392fa 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -63,6 +63,45 @@ module.exports.tests.normalize_punctuation = function(test, common){ }); }; +module.exports.tests.remove_ordinals = function(test, common){ + test( 'remove ordinals', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + assertAnalysis( 'ordindals', "1st", ["1"] ); + assertAnalysis( 'ordindals', "22nd", ["22"] ); + assertAnalysis( 'ordindals', "333rd", ["333"] ); + assertAnalysis( 'ordindals', "4444th", ["4444"] ); + assertAnalysis( 'ordindals', "2500th", ["2500"] ); + + assertAnalysis( 'uppercase', "1ST", ["1"] ); + assertAnalysis( 'uppercase', "22ND", ["22"] ); + assertAnalysis( 'uppercase', "333RD", ["333"] ); + assertAnalysis( 'uppercase', "4444TH", ["4444"] ); + + assertAnalysis( 'autocomplete', "26", ["26"] ); + assertAnalysis( 'autocomplete', "26t", ["26"] ); + assertAnalysis( 'autocomplete', "26th", ["26"] ); + assertAnalysis( 'autocomplete', "3", ["3"] ); + assertAnalysis( 'autocomplete', "3r", ["3"] ); + assertAnalysis( 'autocomplete', "3rd", ["3"] ); + + assertAnalysis( 'wrong suffix (do nothing)', "0th", ["0th"] ); + assertAnalysis( 'wrong suffix (do nothing)', "26s", ["26s"] ); + assertAnalysis( 'wrong suffix (do nothing)', "26st", ["26st"] ); + assertAnalysis( 'wrong suffix (do nothing)', "31t", ["31t"] 
); + assertAnalysis( 'wrong suffix (do nothing)', "31th", ["31th"] ); + assertAnalysis( 'wrong suffix (do nothing)', "21r", ["21r"] ); + assertAnalysis( 'wrong suffix (do nothing)', "21rd", ["21rd"] ); + assertAnalysis( 'wrong suffix (do nothing)', "29n", ["29n"] ); + assertAnalysis( 'wrong suffix (do nothing)', "29nd", ["29nd"] ); + + suite.run( t.end ); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { diff --git a/settings.js b/settings.js index a0597bfc..98004227 100644 --- a/settings.js +++ b/settings.js @@ -148,8 +148,8 @@ function generate(){ }, "remove_ordinals" : { "type" : "pattern_replace", - "pattern": "(([0-9])(st|nd|rd|th))", - "replacement": "$2" + "pattern": "(([0-9]*1)st?|([0-9]*2)nd?|([0-9]*3)rd?|([0-9]*[456789])th?|([0-9]+0)th?)", + "replacement": "$2$3$4$5$6" }, "remove_duplicate_spaces" : { "type" : "pattern_replace", From 1d5adc286fa5ee538b65d493dec86004ec4cc202 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Tue, 12 Jan 2016 16:01:42 +0100 Subject: [PATCH 02/15] handle teens differently --- integration/analyzer_peliasStreet.js | 24 ++++++++++++++++++++++++ settings.js | 4 ++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index 27c392fa..b3255e6b 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -76,6 +76,30 @@ module.exports.tests.remove_ordinals = function(test, common){ assertAnalysis( 'ordindals', "4444th", ["4444"] ); assertAnalysis( 'ordindals', "2500th", ["2500"] ); + // teens + assertAnalysis( 'teens', "11th", ["11"] ); + assertAnalysis( 'teens', "12th", ["12"] ); + assertAnalysis( 'teens', "13th", ["13"] ); + assertAnalysis( 'teens', "14th", ["14"] ); + assertAnalysis( 'teens', "15th", ["15"] ); + assertAnalysis( 'teens', "16th", ["16"] ); + assertAnalysis( 'teens', "17th", ["17"] ); + assertAnalysis( 'teens', "18th", ["18"] ); + assertAnalysis( 
'teens', "19th", ["19"] ); + assertAnalysis( 'teens', "20th", ["20"] ); + + // teens (hundreds) + assertAnalysis( 'teens - hundreds', "111th", ["111"] ); + assertAnalysis( 'teens - hundreds', "112th", ["112"] ); + assertAnalysis( 'teens - hundreds', "113th", ["113"] ); + assertAnalysis( 'teens - hundreds', "114th", ["114"] ); + assertAnalysis( 'teens - hundreds', "115th", ["115"] ); + assertAnalysis( 'teens - hundreds', "116th", ["116"] ); + assertAnalysis( 'teens - hundreds', "117th", ["117"] ); + assertAnalysis( 'teens - hundreds', "118th", ["118"] ); + assertAnalysis( 'teens - hundreds', "119th", ["119"] ); + assertAnalysis( 'teens - hundreds', "120th", ["120"] ); + assertAnalysis( 'uppercase', "1ST", ["1"] ); assertAnalysis( 'uppercase', "22ND", ["22"] ); assertAnalysis( 'uppercase', "333RD", ["333"] ); diff --git a/settings.js b/settings.js index 98004227..18a633c3 100644 --- a/settings.js +++ b/settings.js @@ -148,8 +148,8 @@ function generate(){ }, "remove_ordinals" : { "type" : "pattern_replace", - "pattern": "(([0-9]*1)st?|([0-9]*2)nd?|([0-9]*3)rd?|([0-9]*[456789])th?|([0-9]+0)th?)", - "replacement": "$2$3$4$5$6" + "pattern": "(([0-9]*1)st?|([0-9]*2)nd?|([0-9]*3)rd?|([0-9]*[456789])th?|([0-9]+0)th?|([0-9]*1[0-9])th?)", + "replacement": "$2$3$4$5$6$7" }, "remove_duplicate_spaces" : { "type" : "pattern_replace", From 14424efc276085cef96d2ee5b278abb10921365b Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Tue, 12 Jan 2016 16:02:57 +0100 Subject: [PATCH 03/15] fix tests --- test/fixtures/expected.json | 4 ++-- test/settings.js | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 84de6cbc..8773db54 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -629,8 +629,8 @@ }, "remove_ordinals": { "type": "pattern_replace", - "pattern": "(([0-9])(st|nd|rd|th))", - "replacement": "$2" + "pattern": 
"(([0-9]*1)st?|([0-9]*2)nd?|([0-9]*3)rd?|([0-9]*[456789])th?|([0-9]+0)th?|([0-9]*1[0-9])th?)", + "replacement": "$2$3$4$5$6$7" }, "remove_duplicate_spaces": { "type": "pattern_replace", diff --git a/test/settings.js b/test/settings.js index b36155f7..32a1a421 100644 --- a/test/settings.js +++ b/test/settings.js @@ -365,8 +365,8 @@ module.exports.tests.removeOrdinalsFilter = function(test, common) { t.equal(typeof s.analysis.filter.remove_ordinals, 'object', 'there is an remove_ordinals filter'); var filter = s.analysis.filter.remove_ordinals; t.equal(filter.type, 'pattern_replace'); - t.equal(filter.pattern, '(([0-9])(st|nd|rd|th))'); - t.equal(filter.replacement, '$2'); + t.equal(filter.pattern, '(([0-9]*1)st?|([0-9]*2)nd?|([0-9]*3)rd?|([0-9]*[456789])th?|([0-9]+0)th?|([0-9]*1[0-9])th?)'); + t.equal(filter.replacement, '$2$3$4$5$6$7'); t.end(); }); }; From 4e72b129f463062209ee4ef258e36a0e588e0525 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Tue, 12 Jan 2016 19:22:12 +0100 Subject: [PATCH 04/15] yak shaving --- integration/analyzer_peliasStreet.js | 11 +++++++++++ settings.js | 4 ++-- test/fixtures/expected.json | 4 ++-- test/settings.js | 4 ++-- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index b3255e6b..5efd831d 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -100,11 +100,21 @@ module.exports.tests.remove_ordinals = function(test, common){ assertAnalysis( 'teens - hundreds', "119th", ["119"] ); assertAnalysis( 'teens - hundreds', "120th", ["120"] ); + // teens (wrong suffix) + assertAnalysis( 'teens - wrong suffix', "11st", ["11st"] ); + assertAnalysis( 'teens - wrong suffix', "12nd", ["12nd"] ); + assertAnalysis( 'teens - wrong suffix', "13rd", ["13rd"] ); + assertAnalysis( 'teens - wrong suffix', "111st", ["111st"] ); + assertAnalysis( 'teens - wrong suffix', "112nd", ["112nd"] ); + assertAnalysis( 'teens - wrong 
suffix', "113rd", ["113rd"] ); + + // uppercase assertAnalysis( 'uppercase', "1ST", ["1"] ); assertAnalysis( 'uppercase', "22ND", ["22"] ); assertAnalysis( 'uppercase', "333RD", ["333"] ); assertAnalysis( 'uppercase', "4444TH", ["4444"] ); + // autocomplete assertAnalysis( 'autocomplete', "26", ["26"] ); assertAnalysis( 'autocomplete', "26t", ["26"] ); assertAnalysis( 'autocomplete', "26th", ["26"] ); @@ -112,6 +122,7 @@ module.exports.tests.remove_ordinals = function(test, common){ assertAnalysis( 'autocomplete', "3r", ["3"] ); assertAnalysis( 'autocomplete', "3rd", ["3"] ); + // wrong suffix assertAnalysis( 'wrong suffix (do nothing)', "0th", ["0th"] ); assertAnalysis( 'wrong suffix (do nothing)', "26s", ["26s"] ); assertAnalysis( 'wrong suffix (do nothing)', "26st", ["26st"] ); diff --git a/settings.js b/settings.js index 18a633c3..9407324b 100644 --- a/settings.js +++ b/settings.js @@ -148,8 +148,8 @@ function generate(){ }, "remove_ordinals" : { "type" : "pattern_replace", - "pattern": "(([0-9]*1)st?|([0-9]*2)nd?|([0-9]*3)rd?|([0-9]*[456789])th?|([0-9]+0)th?|([0-9]*1[0-9])th?)", - "replacement": "$2$3$4$5$6$7" + "pattern": "(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))", + "replacement": "$2$4$5$6$7$9$10$12$14$15$16$17$18" }, "remove_duplicate_spaces" : { "type" : "pattern_replace", diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 8773db54..8a41eac1 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -629,8 +629,8 @@ }, "remove_ordinals": { "type": "pattern_replace", - "pattern": "(([0-9]*1)st?|([0-9]*2)nd?|([0-9]*3)rd?|([0-9]*[456789])th?|([0-9]+0)th?|([0-9]*1[0-9])th?)", - "replacement": "$2$3$4$5$6$7" + "pattern": "(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))", + "replacement": "$2$4$5$6$7$9$10$12$14$15$16$17$18" }, "remove_duplicate_spaces": { "type": 
"pattern_replace", diff --git a/test/settings.js b/test/settings.js index 32a1a421..37b54e5a 100644 --- a/test/settings.js +++ b/test/settings.js @@ -365,8 +365,8 @@ module.exports.tests.removeOrdinalsFilter = function(test, common) { t.equal(typeof s.analysis.filter.remove_ordinals, 'object', 'there is an remove_ordinals filter'); var filter = s.analysis.filter.remove_ordinals; t.equal(filter.type, 'pattern_replace'); - t.equal(filter.pattern, '(([0-9]*1)st?|([0-9]*2)nd?|([0-9]*3)rd?|([0-9]*[456789])th?|([0-9]+0)th?|([0-9]*1[0-9])th?)'); - t.equal(filter.replacement, '$2$3$4$5$6$7'); + t.equal(filter.pattern, '(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))'); + t.equal(filter.replacement, '$2$4$5$6$7$9$10$12$14$15$16$17$18'); t.end(); }); }; From 357885882b29b66e0dae4aa22506a254d2ddb0a5 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Wed, 16 Mar 2016 13:39:09 +0100 Subject: [PATCH 05/15] refactor analyzers, add tests --- ....js => analyzer_peliasIndexOneEdgeGram.js} | 12 +- ....js => analyzer_peliasIndexTwoEdgeGram.js} | 14 +- integration/analyzer_peliasQueryFullToken.js | 137 ++++++++++++++ .../analyzer_peliasQueryPartialToken.js | 153 ++++++++++++++++ integration/autocomplete_synonym_expansion.js | 0 integration/dynamic_templates.js | 6 +- integration/run.js | 6 +- mappings/document.js | 2 +- schema.js | 2 +- settings.js | 48 ++++- test/compile.js | 18 +- test/document.js | 2 +- test/fixtures/expected.json | 170 +++++++++++++++--- test/settings.js | 36 ++-- 14 files changed, 533 insertions(+), 73 deletions(-) rename integration/{analyzer_peliasOneEdgeGram.js => analyzer_peliasIndexOneEdgeGram.js} (89%) rename integration/{analyzer_peliasTwoEdgeGram.js => analyzer_peliasIndexTwoEdgeGram.js} (89%) create mode 100644 integration/analyzer_peliasQueryFullToken.js create mode 100644 integration/analyzer_peliasQueryPartialToken.js create mode 100644 
integration/autocomplete_synonym_expansion.js diff --git a/integration/analyzer_peliasOneEdgeGram.js b/integration/analyzer_peliasIndexOneEdgeGram.js similarity index 89% rename from integration/analyzer_peliasOneEdgeGram.js rename to integration/analyzer_peliasIndexOneEdgeGram.js index 82eb9d1e..51f15424 100644 --- a/integration/analyzer_peliasOneEdgeGram.js +++ b/integration/analyzer_peliasIndexOneEdgeGram.js @@ -12,7 +12,7 @@ module.exports.tests.analyze = function(test, common){ test( 'analyze', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); - var assertAnalysis = analyze.bind( null, suite, t, 'peliasOneEdgeGram' ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexOneEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'lowercase', 'F', ['f']); @@ -26,7 +26,7 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'ampersand', 'a & b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'a and & and b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'land', ['l','la','lan','land'] ); // should not replace inside tokens - assertAnalysis( 'peliasOneEdgeGramFilter', '1 a ab abc abcdefghij', ['1','a','ab','abc','abcd','abcde','abcdef','abcdefg','abcdefgh','abcdefghi','abcdefghij'] ); + assertAnalysis( 'peliasIndexOneEdgeGramFilter', '1 a ab abc abcdefghij', ['1','a','ab','abc','abcd','abcde','abcdef','abcdefg','abcdefgh','abcdefghi','abcdefghij'] ); assertAnalysis( 'removeAllZeroNumericPrefix', '00001', ['1'] ); assertAnalysis( 'unique', '1 1 1', ['1'] ); assertAnalysis( 'notnull', ' / / ', [] ); @@ -55,7 +55,7 @@ module.exports.tests.address_suffix_expansions = function(test, common){ test( 'address suffix expansions', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); - var assertAnalysis = analyze.bind( null, suite, t, 'peliasOneEdgeGram' ); + var assertAnalysis = analyze.bind( null, suite, t, 
'peliasIndexOneEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'safe expansions', 'aly', [ @@ -83,7 +83,7 @@ module.exports.tests.stop_words = function(test, common){ test( 'stop words', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); - var assertAnalysis = analyze.bind( null, suite, t, 'peliasOneEdgeGram' ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexOneEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'street suffix', 'AB street', [ @@ -102,7 +102,7 @@ module.exports.tests.functional = function(test, common){ test( 'functional', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); - var assertAnalysis = analyze.bind( null, suite, t, 'peliasOneEdgeGram' ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexOneEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'country', 'Trinidad and Tobago', [ @@ -124,7 +124,7 @@ module.exports.tests.functional = function(test, common){ module.exports.all = function (tape, common) { function test(name, testFunction) { - return tape('peliasOneEdgeGram: ' + name, testFunction); + return tape('peliasIndexOneEdgeGram: ' + name, testFunction); } for( var testCase in module.exports.tests ){ diff --git a/integration/analyzer_peliasTwoEdgeGram.js b/integration/analyzer_peliasIndexTwoEdgeGram.js similarity index 89% rename from integration/analyzer_peliasTwoEdgeGram.js rename to integration/analyzer_peliasIndexTwoEdgeGram.js index c8ac0f5d..1e6fcdab 100644 --- a/integration/analyzer_peliasTwoEdgeGram.js +++ b/integration/analyzer_peliasIndexTwoEdgeGram.js @@ -12,7 +12,7 @@ module.exports.tests.analyze = function(test, common){ test( 'analyze', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); - 
var assertAnalysis = analyze.bind( null, suite, t, 'peliasTwoEdgeGram' ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexTwoEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'lowercase', 'FA', ['fa']); @@ -31,7 +31,7 @@ module.exports.tests.analyze = function(test, common){ // assertAnalysis( 'ampersand', 'aa & bb', ['aa','&','bb'] ); // assertAnalysis( 'ampersand', 'aa and & and bb', ['aa','&','bb'] ); - assertAnalysis( 'peliasTwoEdgeGramFilter', '1 a ab abc abcdefghij', ['ab','abc','abcd','abcde','abcdef','abcdefg','abcdefgh','abcdefghi','abcdefghij'] ); + assertAnalysis( 'peliasIndexTwoEdgeGramFilter', '1 a ab abc abcdefghij', ['ab','abc','abcd','abcde','abcdef','abcdefg','abcdefgh','abcdefghi','abcdefghij'] ); assertAnalysis( 'removeAllZeroNumericPrefix', '0002 00011', ['11'] ); assertAnalysis( 'unique', '11 11 11', ['11'] ); assertAnalysis( 'notnull', ' / / ', [] ); @@ -63,7 +63,7 @@ module.exports.tests.address_suffix_expansions = function(test, common){ test( 'address suffix expansions', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); - var assertAnalysis = analyze.bind( null, suite, t, 'peliasTwoEdgeGram' ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexTwoEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'safe expansions', 'aly', [ @@ -91,7 +91,7 @@ module.exports.tests.stop_words = function(test, common){ test( 'stop words', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); - var assertAnalysis = analyze.bind( null, suite, t, 'peliasTwoEdgeGram' ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexTwoEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'street suffix', 'AB street', [ @@ -110,7 +110,7 @@ 
module.exports.tests.functional = function(test, common){ test( 'functional', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); - var assertAnalysis = analyze.bind( null, suite, t, 'peliasTwoEdgeGram' ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexTwoEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'country', 'Trinidad and Tobago', [ @@ -134,7 +134,7 @@ module.exports.tests.functional = function(test, common){ test( 'address suffix expansion', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); - var assertAnalysis = analyze.bind( null, suite, t, 'peliasTwoEdgeGram' ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexTwoEdgeGram' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'street', 'FOO rd', [ @@ -152,7 +152,7 @@ module.exports.tests.functional = function(test, common){ module.exports.all = function (tape, common) { function test(name, testFunction) { - return tape('peliasTwoEdgeGram: ' + name, testFunction); + return tape('peliasIndexTwoEdgeGram: ' + name, testFunction); } for( var testCase in module.exports.tests ){ diff --git a/integration/analyzer_peliasQueryFullToken.js b/integration/analyzer_peliasQueryFullToken.js new file mode 100644 index 00000000..0a175363 --- /dev/null +++ b/integration/analyzer_peliasQueryFullToken.js @@ -0,0 +1,137 @@ + +// validate analyzer is behaving as expected + +var tape = require('tape'), + elastictest = require('elastictest'), + schema = require('../schema'), + punctuation = require('../punctuation'); + +module.exports.tests = {}; + +module.exports.tests.analyze = function(test, common){ + test( 'analyze', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryFullToken' ); + suite.action( function( 
done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + assertAnalysis( 'lowercase', 'F', ['f']); + assertAnalysis( 'asciifolding', 'é', ['e']); + assertAnalysis( 'asciifolding', 'ß', ['ss']); + assertAnalysis( 'asciifolding', 'æ', ['ae']); + assertAnalysis( 'asciifolding', 'ł', ['l']); + assertAnalysis( 'asciifolding', 'ɰ', ['m']); + assertAnalysis( 'trim', ' f ', ['f'] ); + assertAnalysis( 'ampersand', 'a and b', ['a','&','b'] ); + assertAnalysis( 'ampersand', 'a & b', ['a','&','b'] ); + assertAnalysis( 'ampersand', 'a and & and b', ['a','&','b'] ); + assertAnalysis( 'ampersand', 'land', ['land'] ); // should not replace inside tokens + assertAnalysis( 'peliasQueryFullTokenFilter', '1 a ab abc abcdefghij', ['1','a','ab','abc','abcdefghij'] ); + assertAnalysis( 'removeAllZeroNumericPrefix', '00001', ['1'] ); + assertAnalysis( 'unique', '1 1 1', ['1'] ); + assertAnalysis( 'notnull', ' / / ', [] ); + + assertAnalysis( 'kstem', 'mcdonalds', ['mcdonald'] ); + assertAnalysis( 'kstem', 'McDonald\'s', ['mcdonald'] ); + assertAnalysis( 'kstem', 'peoples', ['people'] ); + + // remove punctuation (handled by the char_filter) + assertAnalysis( 'punctuation', punctuation.all.join(''), ['-&'] ); + + // ensure that very large tokens are created + assertAnalysis( 'largeGrams', 'grolmanstrasse', [ 'grolmanstrasse' ]); + + suite.run( t.end ); + }); +}; + +// address suffix expansions should only performed in a way that is +// safe for 'partial tokens'. 
+module.exports.tests.address_suffix_expansions = function(test, common){ + test( 'address suffix expansions', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryFullToken' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + assertAnalysis( 'safe expansions', 'aly', [ 'alley' ]); + + assertAnalysis( 'safe expansions', 'xing', [ 'crossing' ]); + + assertAnalysis( 'safe expansions', 'rd', [ 'road' ]); + + assertAnalysis( 'unsafe expansion', 'ct st', [ 'ct', 'st' ]); + + suite.run( t.end ); + }); +}; + +// stop words should be disabled so that the entire token is used +module.exports.tests.stop_words = function(test, common){ + test( 'stop words', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryFullToken' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + assertAnalysis( 'street suffix', 'AB street', [ 'ab', 'street' ]); + + assertAnalysis( 'street suffix (abbreviation)', 'AB st', [ 'ab', 'st' ]); + + suite.run( t.end ); + }); +}; + +module.exports.tests.functional = function(test, common){ + test( 'functional', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryFullToken' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + assertAnalysis( 'country', 'Trinidad and Tobago', [ + 'trinidad', '&', 'tobago' + ]); + + assertAnalysis( 'place', 'Toys "R" Us!', [ + 'toy', 'r', 'us' + ]); + + assertAnalysis( 'address', '101 mapzen place', [ + '101', 'mapzen', 'place' + ]); + + suite.run( t.end ); + }); +}; + +module.exports.all = function (tape, common) { + + function test(name, testFunction) { + return 
tape('peliasQueryFullToken: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +}; + +function analyze( suite, t, analyzer, comment, text, expected ){ + suite.assert( function( done ){ + suite.client.indices.analyze({ + index: suite.props.index, + analyzer: analyzer, + text: text + }, function( err, res ){ + if( err ) console.error( err ); + t.deepEqual( simpleTokens( res.tokens ), expected, comment ); + done(); + }); + }); +} + +function simpleTokens( tokens ){ + return tokens.map( function( t ){ + return t.token; + }); +} diff --git a/integration/analyzer_peliasQueryPartialToken.js b/integration/analyzer_peliasQueryPartialToken.js new file mode 100644 index 00000000..23771473 --- /dev/null +++ b/integration/analyzer_peliasQueryPartialToken.js @@ -0,0 +1,153 @@ + +// validate analyzer is behaving as expected + +var tape = require('tape'), + elastictest = require('elastictest'), + schema = require('../schema'), + punctuation = require('../punctuation'); + +module.exports.tests = {}; + +module.exports.tests.analyze = function(test, common){ + test( 'analyze', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryPartialToken' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + assertAnalysis( 'lowercase', 'F', ['f']); + assertAnalysis( 'asciifolding', 'é', ['e']); + assertAnalysis( 'asciifolding', 'ß', ['s','ss']); + assertAnalysis( 'asciifolding', 'æ', ['a','ae']); + assertAnalysis( 'asciifolding', 'ł', ['l']); + assertAnalysis( 'asciifolding', 'ɰ', ['m']); + assertAnalysis( 'trim', ' f ', ['f'] ); + assertAnalysis( 'ampersand', 'a and b', ['a','&','b'] ); + assertAnalysis( 'ampersand', 'a & b', ['a','&','b'] ); + assertAnalysis( 'ampersand', 'a and & and b', ['a','&','b'] ); + assertAnalysis( 'ampersand', 'land', ['l','la','lan','land'] ); 
// should not replace inside tokens + assertAnalysis( 'peliasQueryPartialTokenFilter', '1 a ab abc abcdefghij', ['1','a','ab','abc','abcd','abcde','abcdef','abcdefg','abcdefgh','abcdefghi','abcdefghij'] ); + assertAnalysis( 'removeAllZeroNumericPrefix', '00001', ['1'] ); + assertAnalysis( 'unique', '1 1 1', ['1'] ); + assertAnalysis( 'notnull', ' / / ', [] ); + + assertAnalysis( 'kstem', 'mcdonalds', ['m', 'mc', 'mcd', 'mcdo', 'mcdon', 'mcdona', 'mcdonal', 'mcdonald'] ); + assertAnalysis( 'kstem', 'McDonald\'s', ['m', 'mc', 'mcd', 'mcdo', 'mcdon', 'mcdona', 'mcdonal', 'mcdonald'] ); + assertAnalysis( 'kstem', 'peoples', ['p', 'pe', 'peo', 'peop', 'peopl', 'people'] ); + + // remove punctuation (handled by the char_filter) + assertAnalysis( 'punctuation', punctuation.all.join(''), ['-','-&'] ); + + // ensure that very large grams are created + assertAnalysis( 'largeGrams', 'grolmanstrasse', [ + 'g','gr','gro','grol','grolm','grolma','grolman','grolmans','grolmanst', + 'grolmanstr','grolmanstra','grolmanstras','grolmanstrass', + 'grolmanstrasse' + ]); + + suite.run( t.end ); + }); +}; + +// address suffix expansions should only performed in a way that is +// safe for 'partial tokens'. 
+module.exports.tests.address_suffix_expansions = function(test, common){ + test( 'address suffix expansions', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryPartialToken' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + assertAnalysis( 'safe expansions', 'aly', [ + 'a', 'al', 'all', 'alle', 'alley' + ]); + + assertAnalysis( 'safe expansions', 'xing', [ + 'c', 'cr', 'cro', 'cros', 'cross', 'crossi', 'crossin', 'crossing' + ]); + + assertAnalysis( 'safe expansions', 'rd', [ + 'r', 'ro', 'roa', 'road' + ]); + + assertAnalysis( 'unsafe expansion', 'ct st', [ + 'c', 'ct', 's', 'st' + ]); + + suite.run( t.end ); + }); +}; + +// stop words should be disabled so that the entire street prefix is indexed as ngrams +module.exports.tests.stop_words = function(test, common){ + test( 'stop words', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryPartialToken' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + assertAnalysis( 'street suffix', 'AB street', [ + 'a', 'ab', 's', 'st', 'str', 'stre', 'stree', 'street' + ]); + + assertAnalysis( 'street suffix (abbreviation)', 'AB st', [ + 'a', 'ab', 's', 'st' + ]); + + suite.run( t.end ); + }); +}; + +module.exports.tests.functional = function(test, common){ + test( 'functional', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryPartialToken' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + assertAnalysis( 'country', 'Trinidad and Tobago', [ + 't', 'tr', 'tri', 'trin', 'trini', 'trinid', 'trinida', 'trinidad', '&', 'to', 'tob', 'toba', 'tobag', 'tobago' + ]); + + 
assertAnalysis( 'place', 'Toys "R" Us!', [ + 't', 'to', 'toy', 'r', 'u', 'us' + ]); + + assertAnalysis( 'address', '101 mapzen place', [ + '1', '10', '101', 'm', 'ma', 'map', 'mapz', 'mapze', 'mapzen', 'p', 'pl', 'pla', 'plac', 'place' + ]); + + suite.run( t.end ); + }); +}; + +module.exports.all = function (tape, common) { + + function test(name, testFunction) { + return tape('peliasQueryPartialToken: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +}; + +function analyze( suite, t, analyzer, comment, text, expected ){ + suite.assert( function( done ){ + suite.client.indices.analyze({ + index: suite.props.index, + analyzer: analyzer, + text: text + }, function( err, res ){ + if( err ) console.error( err ); + t.deepEqual( simpleTokens( res.tokens ), expected, comment ); + done(); + }); + }); +} + +function simpleTokens( tokens ){ + return tokens.map( function( t ){ + return t.token; + }); +} diff --git a/integration/autocomplete_synonym_expansion.js b/integration/autocomplete_synonym_expansion.js new file mode 100644 index 00000000..e69de29b diff --git a/integration/dynamic_templates.js b/integration/dynamic_templates.js index 6eb7ea68..82b3a706 100644 --- a/integration/dynamic_templates.js +++ b/integration/dynamic_templates.js @@ -9,9 +9,9 @@ module.exports.tests = {}; // 'admin' mappings have a different 'name' dynamic_template to the other types module.exports.tests.dynamic_templates_name = function(test, common){ - test( 'admin->name (legacy)', nameAssertion( 'admin0', 'peliasOneEdgeGram' ) ); - test( 'admin->name', nameAssertion( 'country', 'peliasOneEdgeGram' ) ); - test( 'document->name', nameAssertion( 'myType', 'peliasTwoEdgeGram' ) ); + test( 'admin->name (legacy)', nameAssertion( 'admin0', 'peliasIndexOneEdgeGram' ) ); + test( 'admin->name', nameAssertion( 'country', 'peliasIndexOneEdgeGram' ) ); + test( 'document->name', nameAssertion( 'myType', 'peliasIndexTwoEdgeGram' ) 
); }; // all types share the same phrase mapping diff --git a/integration/run.js b/integration/run.js index e6f3e495..8e96d0fd 100644 --- a/integration/run.js +++ b/integration/run.js @@ -5,8 +5,10 @@ var common = {}; var tests = [ require('./validate.js'), require('./dynamic_templates.js'), - require('./analyzer_peliasOneEdgeGram.js'), - require('./analyzer_peliasTwoEdgeGram.js'), + require('./analyzer_peliasIndexOneEdgeGram.js'), + require('./analyzer_peliasIndexTwoEdgeGram.js'), + require('./analyzer_peliasQueryPartialToken.js'), + require('./analyzer_peliasQueryFullToken.js'), require('./analyzer_peliasPhrase.js'), require('./analyzer_peliasAdmin.js'), require('./analyzer_peliasHousenumber.js'), diff --git a/mappings/document.js b/mappings/document.js index b79d6beb..7a0eb25c 100644 --- a/mappings/document.js +++ b/mappings/document.js @@ -109,7 +109,7 @@ var schema = { match_mapping_type: 'string', mapping: { type: 'string', - analyzer: 'peliasTwoEdgeGram', + analyzer: 'peliasIndexTwoEdgeGram', fielddata : { format : 'fst', loading: 'eager_global_ordinals' diff --git a/schema.js b/schema.js index 0369ddfc..b90623f4 100644 --- a/schema.js +++ b/schema.js @@ -7,7 +7,7 @@ var oneGramMapping = { match_mapping_type: 'string', mapping: { type: 'string', - analyzer: 'peliasOneEdgeGram', + analyzer: 'peliasIndexOneEdgeGram', fielddata : { format : 'fst', loading: 'eager_global_ordinals' diff --git a/settings.js b/settings.js index 3dc469ef..a78419ac 100644 --- a/settings.js +++ b/settings.js @@ -25,7 +25,8 @@ function generate(){ "notnull" ] }, - "peliasOneEdgeGram" : { + + "peliasIndexOneEdgeGram" : { "type": "custom", "tokenizer" : "whitespace", "char_filter" : ["punctuation"], @@ -33,7 +34,7 @@ function generate(){ "lowercase", "asciifolding", "trim", - "address_suffix_expansion", + "full_token_address_suffix_expansion", "ampersand", "removeAllZeroNumericPrefix", "kstem", @@ -42,7 +43,7 @@ function generate(){ "notnull" ] }, - "peliasTwoEdgeGram" : { + 
"peliasIndexTwoEdgeGram" : { "type": "custom", "tokenizer" : "whitespace", "char_filter" : ["punctuation"], @@ -50,7 +51,7 @@ function generate(){ "lowercase", "asciifolding", "trim", - "address_suffix_expansion", + "full_token_address_suffix_expansion", "ampersand", "removeAllZeroNumericPrefix", "kstem", @@ -59,6 +60,39 @@ function generate(){ "notnull" ] }, + "peliasQueryPartialToken" : { + "type": "custom", + "tokenizer" : "whitespace", + "char_filter" : ["punctuation"], + "filter": [ + "lowercase", + "asciifolding", + "trim", + "partial_token_address_suffix_expansion", + "ampersand", + "removeAllZeroNumericPrefix", + "kstem", + "peliasOneEdgeGramFilter", + "unique", + "notnull" + ] + }, + "peliasQueryFullToken" : { + "type": "custom", + "tokenizer" : "whitespace", + "char_filter" : ["punctuation"], + "filter": [ + "lowercase", + "asciifolding", + "trim", + "full_token_address_suffix_expansion", + "ampersand", + "removeAllZeroNumericPrefix", + "kstem", + "unique", + "notnull" + ] + }, "peliasPhrase": { "type": "custom", "tokenizer":"whitespace", @@ -139,7 +173,11 @@ function generate(){ "type": "synonym", "synonyms": street_suffix.synonyms }, - "address_suffix_expansion": { + "partial_token_address_suffix_expansion": { + "type": "synonym", + "synonyms": street_suffix.safe_expansions + }, + "full_token_address_suffix_expansion": { "type": "synonym", "synonyms": street_suffix.safe_expansions }, diff --git a/test/compile.js b/test/compile.js index b8525ef8..e3293200 100644 --- a/test/compile.js +++ b/test/compile.js @@ -19,25 +19,25 @@ module.exports.tests.compile = function(test, common) { module.exports.tests.indeces = function(test, common) { test('contains "_default_" index definition', function(t) { t.equal(typeof schema.mappings._default_, 'object', 'mappings present'); - t.equal(schema.mappings._default_.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasTwoEdgeGram'); + t.equal(schema.mappings._default_.dynamic_templates[0].nameGram.mapping.analyzer, 
'peliasIndexTwoEdgeGram'); t.end(); }); test('explicitly specify some admin indeces and their analyzer', function(t) { t.equal(typeof schema.mappings.country, 'object', 'mappings present'); - t.equal(schema.mappings.country.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasOneEdgeGram'); + t.equal(schema.mappings.country.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasIndexOneEdgeGram'); t.equal(typeof schema.mappings.region, 'object', 'mappings present'); - t.equal(schema.mappings.region.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasOneEdgeGram'); + t.equal(schema.mappings.region.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasIndexOneEdgeGram'); t.equal(typeof schema.mappings.county, 'object', 'mappings present'); - t.equal(schema.mappings.county.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasOneEdgeGram'); + t.equal(schema.mappings.county.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasIndexOneEdgeGram'); t.end(); }); test('explicitly specify some admin indeces and their analyzer (legacy)', function(t) { t.equal(typeof schema.mappings.admin0, 'object', 'mappings present'); - t.equal(schema.mappings.admin0.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasOneEdgeGram'); + t.equal(schema.mappings.admin0.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasIndexOneEdgeGram'); t.equal(typeof schema.mappings.admin1, 'object', 'mappings present'); - t.equal(schema.mappings.admin1.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasOneEdgeGram'); + t.equal(schema.mappings.admin1.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasIndexOneEdgeGram'); t.equal(typeof schema.mappings.admin2, 'object', 'mappings present'); - t.equal(schema.mappings.admin2.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasOneEdgeGram'); + t.equal(schema.mappings.admin2.dynamic_templates[0].nameGram.mapping.analyzer, 'peliasIndexOneEdgeGram'); t.end(); }); }; @@ -51,7 +51,7 @@ module.exports.tests.dynamic_templates = function(test, 
common) { t.equal(template.match_mapping_type, 'string'); t.deepEqual(template.mapping, { type: 'string', - analyzer: 'peliasOneEdgeGram', + analyzer: 'peliasIndexOneEdgeGram', fielddata: { format: 'fst', loading: 'eager_global_ordinals' @@ -70,7 +70,7 @@ module.exports.tests.dynamic_templates_legacy = function(test, common) { t.equal(template.match_mapping_type, 'string'); t.deepEqual(template.mapping, { type: 'string', - analyzer: 'peliasOneEdgeGram', + analyzer: 'peliasIndexOneEdgeGram', fielddata: { format: 'fst', loading: 'eager_global_ordinals' diff --git a/test/document.js b/test/document.js index 639b8517..90c51b7b 100644 --- a/test/document.js +++ b/test/document.js @@ -123,7 +123,7 @@ module.exports.tests.dynamic_templates = function(test, common) { t.equal(template.match_mapping_type, 'string'); t.deepEqual(template.mapping, { type: 'string', - analyzer: 'peliasTwoEdgeGram', + analyzer: 'peliasIndexTwoEdgeGram', fielddata: { format: 'fst', loading: 'eager_global_ordinals' diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index a25a5259..1ffc2c9f 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -16,17 +16,15 @@ "notnull" ] }, - "peliasOneEdgeGram": { + "peliasIndexOneEdgeGram" : { "type": "custom", - "tokenizer": "whitespace", - "char_filter": [ - "punctuation" - ], + "tokenizer" : "whitespace", + "char_filter" : ["punctuation"], "filter": [ "lowercase", "asciifolding", "trim", - "address_suffix_expansion", + "full_token_address_suffix_expansion", "ampersand", "removeAllZeroNumericPrefix", "kstem", @@ -35,17 +33,15 @@ "notnull" ] }, - "peliasTwoEdgeGram": { + "peliasIndexTwoEdgeGram" : { "type": "custom", - "tokenizer": "whitespace", - "char_filter": [ - "punctuation" - ], + "tokenizer" : "whitespace", + "char_filter" : ["punctuation"], "filter": [ "lowercase", "asciifolding", "trim", - "address_suffix_expansion", + "full_token_address_suffix_expansion", "ampersand", "removeAllZeroNumericPrefix", "kstem", 
@@ -54,6 +50,39 @@ "notnull" ] }, + "peliasQueryPartialToken" : { + "type": "custom", + "tokenizer" : "whitespace", + "char_filter" : ["punctuation"], + "filter": [ + "lowercase", + "asciifolding", + "trim", + "partial_token_address_suffix_expansion", + "ampersand", + "removeAllZeroNumericPrefix", + "kstem", + "peliasOneEdgeGramFilter", + "unique", + "notnull" + ] + }, + "peliasQueryFullToken" : { + "type": "custom", + "tokenizer" : "whitespace", + "char_filter" : ["punctuation"], + "filter": [ + "lowercase", + "asciifolding", + "trim", + "full_token_address_suffix_expansion", + "ampersand", + "removeAllZeroNumericPrefix", + "kstem", + "unique", + "notnull" + ] + }, "peliasPhrase": { "type": "custom", "tokenizer": "whitespace", @@ -514,7 +543,108 @@ "way => wy" ] }, - "address_suffix_expansion": { + "partial_token_address_suffix_expansion": { + "type": "synonym", + "synonyms": [ + "aly => alley", + "anx => annex", + "byu => bayou", + "bch => beach", + "bnd => bend", + "blf => bluff", + "blfs => bluffs", + "btm => bottom", + "blvd => boulevard", + "brg => bridge", + "brk => brook", + "cyn => canyon", + "cp => cape", + "cswy => causeway", + "ctr => center", + "chnnl => channel", + "clf => cliff", + "clb => club", + "cmn => common", + "cmns => commons", + "crse => course", + "cv => cove", + "crk => creek", + "crst => crest", + "xing => crossing", + "xrd => crossroad", + "xrds => crossroads", + "dl => dale", + "dm => dam", + "expy => expressway", + "fls => falls", + "fry => ferry", + "fld => field", + "flds => fields", + "flt => flat", + "flts => flats", + "frd => ford", + "frst => forest", + "frg => forge", + "frk => fork", + "frks => forks", + "fwy => freeway", + "gdn => garden", + "gdns => gardens", + "gtwy => gateway", + "gln => glenn", + "grn => green", + "grv => grove", + "hbr => harbor", + "hvn => haven", + "hts => heights", + "hwy => highway", + "hl => hill", + "hls => hills", + "holw => hollow", + "jct => junction", + "ky => key", + "kys => keys", + "knl => 
knoll", + "knls => knolls", + "lndg => landing", + "ln => lane", + "lgt => light", + "lgts => lights", + "lck => lock", + "lcks => locks", + "mnr => manor", + "mdw => meadow", + "mdws => meadows", + "ml => mill", + "mls => mills", + "mnt => mountain", + "mtwy => motorway", + "nck => neck", + "pkwy => parkway", + "psge => pasage", + "pne => pine", + "pnes => pines", + "plz => plaza", + "rnch => ranch", + "rdg => ridge", + "rdgs => ridges", + "rd => road", + "rte => route", + "shr => shore", + "shrs => shores", + "skwy => skyway", + "spg => spring", + "spgs => springs", + "ste => suite", + "trfy => trafficway", + "tunl => tunnel", + "tpke => turnpike", + "vly => valley", + "vlg => village", + "wy => way" + ] + }, + "full_token_address_suffix_expansion": { "type": "synonym", "synonyms": [ "aly => alley", @@ -1592,7 +1722,7 @@ "match_mapping_type": "string", "mapping": { "type": "string", - "analyzer": "peliasTwoEdgeGram", + "analyzer": "peliasIndexTwoEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" @@ -1633,7 +1763,7 @@ "match_mapping_type": "string", "mapping": { "type": "string", - "analyzer": "peliasOneEdgeGram", + "analyzer": "peliasIndexOneEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" @@ -1651,7 +1781,7 @@ "match_mapping_type": "string", "mapping": { "type": "string", - "analyzer": "peliasOneEdgeGram", + "analyzer": "peliasIndexOneEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" @@ -1669,7 +1799,7 @@ "match_mapping_type": "string", "mapping": { "type": "string", - "analyzer": "peliasOneEdgeGram", + "analyzer": "peliasIndexOneEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" @@ -1687,7 +1817,7 @@ "match_mapping_type": "string", "mapping": { "type": "string", - "analyzer": "peliasOneEdgeGram", + "analyzer": "peliasIndexOneEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" @@ -1705,7 +1835,7 @@ "match_mapping_type": 
"string", "mapping": { "type": "string", - "analyzer": "peliasOneEdgeGram", + "analyzer": "peliasIndexOneEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" @@ -1723,7 +1853,7 @@ "match_mapping_type": "string", "mapping": { "type": "string", - "analyzer": "peliasOneEdgeGram", + "analyzer": "peliasIndexOneEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" diff --git a/test/settings.js b/test/settings.js index a3db4d65..efd1955f 100644 --- a/test/settings.js +++ b/test/settings.js @@ -44,24 +44,24 @@ module.exports.tests.peliasAdminAnalyzer = function(test, common) { }); }; -module.exports.tests.peliasOneEdgeGramAnalyzer = function(test, common) { - test('has peliasOneEdgeGram analyzer', function(t) { +module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) { + test('has peliasIndexOneEdgeGram analyzer', function(t) { var s = settings(); - t.equal(typeof s.analysis.analyzer.peliasOneEdgeGram, 'object', 'there is a peliasOneEdgeGram analyzer'); - var analyzer = s.analysis.analyzer.peliasOneEdgeGram; + t.equal(typeof s.analysis.analyzer.peliasIndexOneEdgeGram, 'object', 'there is a peliasIndexOneEdgeGram analyzer'); + var analyzer = s.analysis.analyzer.peliasIndexOneEdgeGram; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); t.deepEqual(analyzer.char_filter, ["punctuation"], 'punctuation filter specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); - test('peliasOneEdgeGram token filters', function(t) { - var analyzer = settings().analysis.analyzer.peliasOneEdgeGram; + test('peliasIndexOneEdgeGram token filters', function(t) { + var analyzer = settings().analysis.analyzer.peliasIndexOneEdgeGram; t.deepEqual( analyzer.filter, [ "lowercase", "asciifolding", "trim", - "address_suffix_expansion", + "full_token_address_suffix_expansion", "ampersand", "removeAllZeroNumericPrefix", "kstem", @@ -73,24 
+73,24 @@ module.exports.tests.peliasOneEdgeGramAnalyzer = function(test, common) { }); }; -module.exports.tests.peliasTwoEdgeGramAnalyzer = function(test, common) { - test('has peliasTwoEdgeGram analyzer', function(t) { +module.exports.tests.peliasIndexTwoEdgeGramAnalyzer = function(test, common) { + test('has peliasIndexTwoEdgeGram analyzer', function(t) { var s = settings(); - t.equal(typeof s.analysis.analyzer.peliasTwoEdgeGram, 'object', 'there is a peliasTwoEdgeGram analyzer'); - var analyzer = s.analysis.analyzer.peliasTwoEdgeGram; + t.equal(typeof s.analysis.analyzer.peliasIndexTwoEdgeGram, 'object', 'there is a peliasIndexTwoEdgeGram analyzer'); + var analyzer = s.analysis.analyzer.peliasIndexTwoEdgeGram; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); t.deepEqual(analyzer.char_filter, ["punctuation"], 'punctuation filter specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); - test('peliasTwoEdgeGram token filters', function(t) { - var analyzer = settings().analysis.analyzer.peliasTwoEdgeGram; + test('peliasIndexTwoEdgeGram token filters', function(t) { + var analyzer = settings().analysis.analyzer.peliasIndexTwoEdgeGram; t.deepEqual( analyzer.filter, [ "lowercase", "asciifolding", "trim", - "address_suffix_expansion", + "full_token_address_suffix_expansion", "ampersand", "removeAllZeroNumericPrefix", "kstem", @@ -256,9 +256,9 @@ module.exports.tests.notnullFilter = function(test, common) { // this filter creates edgeNGrams with the minimum size of 1 module.exports.tests.peliasOneEdgeGramFilter = function(test, common) { - test('has peliasOneEdgeGram filter', function(t) { + test('has peliasIndexOneEdgeGram filter', function(t) { var s = settings(); - t.equal(typeof s.analysis.filter.peliasOneEdgeGramFilter, 'object', 'there is a peliasOneEdgeGram filter'); + t.equal(typeof s.analysis.filter.peliasOneEdgeGramFilter, 'object', 'there is a 
peliasIndexOneEdgeGram filter'); var filter = s.analysis.filter.peliasOneEdgeGramFilter; t.equal(filter.type, 'edgeNGram'); t.equal(filter.min_gram, 1); @@ -269,9 +269,9 @@ module.exports.tests.peliasOneEdgeGramFilter = function(test, common) { // this filter creates edgeNGrams with the minimum size of 2 module.exports.tests.peliasTwoEdgeGramFilter = function(test, common) { - test('has peliasTwoEdgeGram filter', function(t) { + test('has peliasIndexTwoEdgeGram filter', function(t) { var s = settings(); - t.equal(typeof s.analysis.filter.peliasTwoEdgeGramFilter, 'object', 'there is a peliasTwoEdgeGram filter'); + t.equal(typeof s.analysis.filter.peliasTwoEdgeGramFilter, 'object', 'there is a peliasIndexTwoEdgeGram filter'); var filter = s.analysis.filter.peliasTwoEdgeGramFilter; t.equal(filter.type, 'edgeNGram'); t.equal(filter.min_gram, 2); From 939f9d9bcd04a3ab3013e9274fb8c5ca0eba1438 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Wed, 16 Mar 2016 15:10:40 +0100 Subject: [PATCH 06/15] more tests, better handling of directional synonyms --- .../analyzer_peliasIndexOneEdgeGram.js | 7 +- .../analyzer_peliasIndexTwoEdgeGram.js | 13 +- integration/analyzer_peliasQueryFullToken.js | 7 +- .../analyzer_peliasQueryPartialToken.js | 5 + .../autocomplete_abbreviated_street_names.js | 141 ++++++++++ ...ocomplete_directional_synonym_expansion.js | 252 ++++++++++++++++++ .../autocomplete_street_synonym_expansion.js | 252 ++++++++++++++++++ integration/autocomplete_synonym_expansion.js | 0 integration/openstreetmap | 1 + integration/run.js | 5 +- settings.js | 10 +- street_suffix.js | 21 +- test/fixtures/expected.json | 16 +- test/settings.js | 1 + 14 files changed, 722 insertions(+), 9 deletions(-) create mode 100644 integration/autocomplete_abbreviated_street_names.js create mode 100644 integration/autocomplete_directional_synonym_expansion.js create mode 100644 integration/autocomplete_street_synonym_expansion.js delete mode 100644 
integration/autocomplete_synonym_expansion.js create mode 160000 integration/openstreetmap diff --git a/integration/analyzer_peliasIndexOneEdgeGram.js b/integration/analyzer_peliasIndexOneEdgeGram.js index 51f15424..81aef6a5 100644 --- a/integration/analyzer_peliasIndexOneEdgeGram.js +++ b/integration/analyzer_peliasIndexOneEdgeGram.js @@ -16,7 +16,7 @@ module.exports.tests.analyze = function(test, common){ suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'lowercase', 'F', ['f']); - assertAnalysis( 'asciifolding', 'é', ['e']); + assertAnalysis( 'asciifolding', 'á', ['a']); assertAnalysis( 'asciifolding', 'ß', ['s','ss']); assertAnalysis( 'asciifolding', 'æ', ['a','ae']); assertAnalysis( 'asciifolding', 'ł', ['l']); @@ -26,6 +26,11 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'ampersand', 'a & b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'a and & and b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'land', ['l','la','lan','land'] ); // should not replace inside tokens + + // full_token_address_suffix_expansion + assertAnalysis( 'full_token_address_suffix_expansion', 'rd', ['r','ro','roa','road'] ); + assertAnalysis( 'full_token_address_suffix_expansion', 'ctr', ['c','ce','cen','cent','cente','center'] ); + assertAnalysis( 'peliasIndexOneEdgeGramFilter', '1 a ab abc abcdefghij', ['1','a','ab','abc','abcd','abcde','abcdef','abcdefg','abcdefgh','abcdefghi','abcdefghij'] ); assertAnalysis( 'removeAllZeroNumericPrefix', '00001', ['1'] ); assertAnalysis( 'unique', '1 1 1', ['1'] ); diff --git a/integration/analyzer_peliasIndexTwoEdgeGram.js b/integration/analyzer_peliasIndexTwoEdgeGram.js index 1e6fcdab..cde75a0a 100644 --- a/integration/analyzer_peliasIndexTwoEdgeGram.js +++ b/integration/analyzer_peliasIndexTwoEdgeGram.js @@ -16,12 +16,17 @@ module.exports.tests.analyze = function(test, common){ suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es 
to bring some shards up assertAnalysis( 'lowercase', 'FA', ['fa']); - assertAnalysis( 'asciifolding', 'éA', ['ea']); + assertAnalysis( 'asciifolding', 'lé', ['le']); assertAnalysis( 'asciifolding', 'ß', ['ss']); assertAnalysis( 'asciifolding', 'æ', ['ae']); assertAnalysis( 'asciifolding', 'łA', ['la']); assertAnalysis( 'asciifolding', 'ɰA', ['ma']); assertAnalysis( 'trim', ' fA ', ['fa'] ); + + // full_token_address_suffix_expansion + assertAnalysis( 'full_token_address_suffix_expansion', 'rd', ['ro','roa','road'] ); + assertAnalysis( 'full_token_address_suffix_expansion', 'ctr', ['ce','cen','cent','cente','center'] ); + assertAnalysis( 'ampersand', 'aa and bb', ['aa','bb'] ); assertAnalysis( 'ampersand', 'land', ['la','lan','land'] ); // should not replace inside tokens @@ -46,6 +51,12 @@ module.exports.tests.analyze = function(test, common){ // ensure that single grams are not created assertAnalysis( '1grams', 'a aa b bb 1 11', ['aa','bb','11'] ); + // for directionals (north/south/east/west) we allow single grams + assertAnalysis( 'direction_synonym_contraction_keep_original', 'a', [] ); + assertAnalysis( 'direction_synonym_contraction_keep_original', 'n', ['no','nor','nort','north','n'] ); + // note the single gram created below + assertAnalysis( 'direction_synonym_contraction_keep_original', 'north', ['no','nor','nort','north','n'] ); + // ensure that very large grams are created assertAnalysis( 'largeGrams', 'grolmanstrasse', [ 'gr','gro','grol','grolm','grolma','grolman','grolmans','grolmanst', diff --git a/integration/analyzer_peliasQueryFullToken.js b/integration/analyzer_peliasQueryFullToken.js index 0a175363..4521c8cc 100644 --- a/integration/analyzer_peliasQueryFullToken.js +++ b/integration/analyzer_peliasQueryFullToken.js @@ -16,7 +16,7 @@ module.exports.tests.analyze = function(test, common){ suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'lowercase', 'F', ['f']); - assertAnalysis( 
'asciifolding', 'é', ['e']); + assertAnalysis( 'asciifolding', 'á', ['a']); assertAnalysis( 'asciifolding', 'ß', ['ss']); assertAnalysis( 'asciifolding', 'æ', ['ae']); assertAnalysis( 'asciifolding', 'ł', ['l']); @@ -26,6 +26,11 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'ampersand', 'a & b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'a and & and b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'land', ['land'] ); // should not replace inside tokens + + // full_token_address_suffix_expansion + assertAnalysis( 'full_token_address_suffix_expansion', 'rd', ['road'] ); + assertAnalysis( 'full_token_address_suffix_expansion', 'ctr', ['center'] ); + assertAnalysis( 'peliasQueryFullTokenFilter', '1 a ab abc abcdefghij', ['1','a','ab','abc','abcdefghij'] ); assertAnalysis( 'removeAllZeroNumericPrefix', '00001', ['1'] ); assertAnalysis( 'unique', '1 1 1', ['1'] ); diff --git a/integration/analyzer_peliasQueryPartialToken.js b/integration/analyzer_peliasQueryPartialToken.js index 23771473..60cbe04f 100644 --- a/integration/analyzer_peliasQueryPartialToken.js +++ b/integration/analyzer_peliasQueryPartialToken.js @@ -26,6 +26,11 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'ampersand', 'a & b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'a and & and b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'land', ['l','la','lan','land'] ); // should not replace inside tokens + + // partial_token_address_suffix_expansion + assertAnalysis( 'partial_token_address_suffix_expansion', 'rd', ['r','ro','roa','road'] ); + assertAnalysis( 'partial_token_address_suffix_expansion', 'ctr', ['c','ce','cen','cent','cente','center'] ); + assertAnalysis( 'peliasQueryPartialTokenFilter', '1 a ab abc abcdefghij', ['1','a','ab','abc','abcd','abcde','abcdef','abcdefg','abcdefgh','abcdefghi','abcdefghij'] ); assertAnalysis( 'removeAllZeroNumericPrefix', '00001', ['1'] ); assertAnalysis( 'unique', '1 1 1', ['1'] ); diff --git 
a/integration/autocomplete_abbreviated_street_names.js b/integration/autocomplete_abbreviated_street_names.js new file mode 100644 index 00000000..122b739c --- /dev/null +++ b/integration/autocomplete_abbreviated_street_names.js @@ -0,0 +1,141 @@ + +// Tests to ensure no regressions in the way the autocomplete analyzers handle +// synonym expansions and the corresponding matching of those tokens. + +// The greater issue is described in: https://github.com/pelias/pelias/issues/211 +// The cases tested here are described in: https://github.com/pelias/schema/issues/105 + +var tape = require('tape'), + elastictest = require('elastictest'), + schema = require('../schema'), + punctuation = require('../punctuation'); + +module.exports.tests = {}; + +// index the name as 'Grolmanstraße' and then retrieve with partially complete token 'Grolmanstr.' +module.exports.tests.index_expanded_form_search_contracted = function(test, common){ + test( 'index and retrieve expanded form', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document with a name which contains a synonym (center) + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: 'Grolmanstraße' } } + }, done); + }); + + // search using 'peliasQueryPartialToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryPartialToken', + 'query': 'Grolmanstr.' 
+ } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + // search using 'peliasQueryFullToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'Grolmanstr.' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + suite.run( t.end ); + }); +}; + +// index the name as 'Grolmanstr.' and then retrieve with 'Grolmanstraße' +module.exports.tests.index_contracted_form_search_expanded = function(test, common){ + test( 'index and retrieve contracted form', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document with a name which contains a synonym (center) + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: 'Grolmanstr.' } } + }, done); + }); + + // search using 'peliasQueryPartialToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryPartialToken', + 'query': 'Grolmanstraße' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + // search using 'peliasQueryFullToken' + // @note: this case is currently not supported. + // Please index your data in the expanded form. 
+ + // suite.assert( function( done ){ + // suite.client.search({ + // index: suite.props.index, + // type: 'test', + // body: { query: { match: { + // 'name.default': { + // 'analyzer': 'peliasQueryFullToken', + // 'query': 'Grolmanstraße' + // } + // }}} + // }, function( err, res ){ + // t.equal( err, undefined ); + // t.equal( res.hits.total, 1, 'document found' ); + // done(); + // }); + // }); + + suite.run( t.end ); + }); +}; + +module.exports.all = function (tape, common) { + + function test(name, testFunction) { + return tape('autocomplete street synonym expansion: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +}; diff --git a/integration/autocomplete_directional_synonym_expansion.js b/integration/autocomplete_directional_synonym_expansion.js new file mode 100644 index 00000000..7d5f314a --- /dev/null +++ b/integration/autocomplete_directional_synonym_expansion.js @@ -0,0 +1,252 @@ + +// Tests to ensure no regressions in the way the autocomplete analyzers handle +// synonym expansions and the corresponding matching of those tokens. 
+ +// The greater issue is described in: https://github.com/pelias/pelias/issues/211 +// The cases tested here are described in: https://github.com/pelias/schema/issues/105 + +var tape = require('tape'), + elastictest = require('elastictest'), + schema = require('../schema'), + punctuation = require('../punctuation'); + +module.exports.tests = {}; + +// index the name as 'north' and then retrieve with partially complete token 'nor' +module.exports.tests.index_and_retrieve_expanded_form = function(test, common){ + test( 'index and retrieve expanded form', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document with a name which contains a synonym (center) + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: 'north' } } + }, done); + }); + + // search using 'peliasQueryPartialToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryPartialToken', + 'query': 'nor' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + // search using 'peliasQueryFullToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'north' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + suite.run( t.end ); + }); +}; + +// index the name as 'n' and then retrieve with 'n' +module.exports.tests.index_and_retrieve_contracted_form = function(test, common){ + test( 'index and retrieve contracted form', function(t){ + + 
var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document with a name which contains a synonym (center) + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: 'n' } } + }, done); + }); + + // search using 'peliasQueryPartialToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryPartialToken', + 'query': 'n' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + // search using 'peliasQueryFullToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'n' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + suite.run( t.end ); + }); +}; + +// index the name as 'n' and then retrieve with partially complete token 'nor' +module.exports.tests.index_and_retrieve_mixed_form_1 = function(test, common){ + test( 'index and retrieve mixed form 1', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document with a name which contains a synonym (center) + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: 'n' } } + }, done); + }); + + // search using 'peliasQueryPartialToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { 
match: { + 'name.default': { + 'analyzer': 'peliasQueryPartialToken', + 'query': 'nor' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + // search using 'peliasQueryFullToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'north' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + suite.run( t.end ); + }); +}; + +// index the name as 'north' and then retrieve with 'n' +module.exports.tests.index_and_retrieve_mixed_form_2 = function(test, common){ + test( 'index and retrieve mixed form 2', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document with a name which contains a synonym (center) + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: 'north' } } + }, done); + }); + + // search using 'peliasQueryPartialToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryPartialToken', + 'query': 'n' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + // search using 'peliasQueryFullToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'n' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' 
); + done(); + }); + }); + + suite.run( t.end ); + }); +}; + +module.exports.all = function (tape, common) { + + function test(name, testFunction) { + return tape('autocomplete directional synonym expansion: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +}; diff --git a/integration/autocomplete_street_synonym_expansion.js b/integration/autocomplete_street_synonym_expansion.js new file mode 100644 index 00000000..afb81da9 --- /dev/null +++ b/integration/autocomplete_street_synonym_expansion.js @@ -0,0 +1,252 @@ + +// Tests to ensure no regressions in the way the autocomplete analyzers handle +// synonym expansions and the corresponding matching of those tokens. + +// The greater issue is descriped in: https://github.com/pelias/pelias/issues/211 +// The cases tested here are described in: https://github.com/pelias/schema/issues/105 + +var tape = require('tape'), + elastictest = require('elastictest'), + schema = require('../schema'), + punctuation = require('../punctuation'); + +module.exports.tests = {}; + +// index the name as 'center' and then retrieve with partially complete token 'cent' +module.exports.tests.index_and_retrieve_expanded_form = function(test, common){ + test( 'index and retrieve expanded form', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document with a name which contains a synonym (center) + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: 'center' } } + }, done); + }); + + // search using 'peliasQueryPartialToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryPartialToken', + 'query': 'cent' + } + 
}}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + // search using 'peliasQueryFullToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'center' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + suite.run( t.end ); + }); +}; + +// index the name as 'ctr' and then retrieve with 'ctr' +module.exports.tests.index_and_retrieve_contracted_form = function(test, common){ + test( 'index and retrieve contracted form', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document with a name which contains a synonym (center) + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: 'ctr' } } + }, done); + }); + + // search using 'peliasQueryPartialToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryPartialToken', + 'query': 'ctr' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + // search using 'peliasQueryFullToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'ctr' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + suite.run( t.end ); + }); +}; + +// index the name as 
'ctr' and then retrieve with partially complete token 'cent' +module.exports.tests.index_and_retrieve_mixed_form_1 = function(test, common){ + test( 'index and retrieve mixed form 1', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document with a name which contains a synonym (center) + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: 'ctr' } } + }, done); + }); + + // search using 'peliasQueryPartialToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryPartialToken', + 'query': 'cent' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + // search using 'peliasQueryFullToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'center' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + suite.run( t.end ); + }); +}; + +// index the name as 'center' and then retrieve with 'ctr' +module.exports.tests.index_and_retrieve_mixed_form_2 = function(test, common){ + test( 'index and retrieve mixed form 2', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document with a name which contains a synonym (center) + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: 
'center' } } + }, done); + }); + + // search using 'peliasQueryPartialToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryPartialToken', + 'query': 'ctr' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + // search using 'peliasQueryFullToken' + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'ctr' + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + suite.run( t.end ); + }); +}; + +module.exports.all = function (tape, common) { + + function test(name, testFunction) { + return tape('autocomplete street synonym expansion: ' + name, testFunction); + } + + for( var testCase in module.exports.tests ){ + module.exports.tests[testCase](test, common); + } +}; diff --git a/integration/autocomplete_synonym_expansion.js b/integration/autocomplete_synonym_expansion.js deleted file mode 100644 index e69de29b..00000000 diff --git a/integration/openstreetmap b/integration/openstreetmap new file mode 160000 index 00000000..a4f9a3bd --- /dev/null +++ b/integration/openstreetmap @@ -0,0 +1 @@ +Subproject commit a4f9a3bd44b1c68c29cfbf8892a6ce5bd3ff8a80 diff --git a/integration/run.js b/integration/run.js index 8e96d0fd..fbdb398c 100644 --- a/integration/run.js +++ b/integration/run.js @@ -17,7 +17,10 @@ var tests = [ require('./address_matching.js'), require('./admin_matching.js'), require('./source_layer_sourceid_filtering.js'), - require('./bounding_box.js') + require('./bounding_box.js'), + require('./autocomplete_street_synonym_expansion.js'), + require('./autocomplete_directional_synonym_expansion.js'), + 
require('./autocomplete_abbreviated_street_names.js') ]; tests.map(function(t) { diff --git a/settings.js b/settings.js index a78419ac..040c02ce 100644 --- a/settings.js +++ b/settings.js @@ -25,7 +25,6 @@ function generate(){ "notnull" ] }, - "peliasIndexOneEdgeGram" : { "type": "custom", "tokenizer" : "whitespace", @@ -56,6 +55,7 @@ function generate(){ "removeAllZeroNumericPrefix", "kstem", "peliasTwoEdgeGramFilter", + "direction_synonym_contraction_keep_original", "unique", "notnull" ] @@ -175,16 +175,20 @@ function generate(){ }, "partial_token_address_suffix_expansion": { "type": "synonym", - "synonyms": street_suffix.safe_expansions + "synonyms": street_suffix.partial_token_safe_expansions }, "full_token_address_suffix_expansion": { "type": "synonym", - "synonyms": street_suffix.safe_expansions + "synonyms": street_suffix.full_token_safe_expansions }, "direction_synonym": { "type": "synonym", "synonyms": street_suffix.direction_synonyms }, + "direction_synonym_contraction_keep_original": { + "type": "synonym", + "synonyms": street_suffix.direction_synonyms_keep_original + }, "remove_ordinals" : { "type" : "pattern_replace", "pattern": "(([0-9])(st|nd|rd|th))", diff --git a/street_suffix.js b/street_suffix.js index 24f6c017..e279f695 100644 --- a/street_suffix.js +++ b/street_suffix.js @@ -160,6 +160,15 @@ module.exports.direction_synonyms = [ "west => w" ]; +// note: this is a bit of a hack, it can be placed AFTER an 2+ ngram filter in +// order to allow single grams in the index. +module.exports.direction_synonyms_keep_original = [ + "north => north,n", + "south => south,s", + "east => east,e", + "west => west,w" +]; + /** a list of 'safe' street suffix expansions. 
@@ -183,7 +192,7 @@ module.exports.direction_synonyms = [ please use judgement when adding new expansions as it may cause the 'jitter' behaviour as outlined in https://github.com/pelias/schema/pull/83 **/ -module.exports.safe_expansions = [ +module.exports.partial_token_safe_expansions = [ "aly => alley", "anx => annex", "byu => bayou", @@ -283,3 +292,13 @@ module.exports.safe_expansions = [ "vlg => village", "wy => way" ]; + +module.exports.full_token_safe_expansions = []; + +// copy the unsafe expansions +module.exports.partial_token_safe_expansions.forEach( function( expansion ){ + module.exports.full_token_safe_expansions.push( expansion ); +}); + +// add the expansions which are only safe on complete tokens (not partial tokens) +module.exports.full_token_safe_expansions.push( "n => north", "s => south", "e => east", "w => west" ); diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 1ffc2c9f..4f9bbd18 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -46,6 +46,7 @@ "removeAllZeroNumericPrefix", "kstem", "peliasTwoEdgeGramFilter", + "direction_synonym_contraction_keep_original", "unique", "notnull" ] @@ -742,7 +743,11 @@ "tpke => turnpike", "vly => valley", "vlg => village", - "wy => way" + "wy => way", + "n => north", + "s => south", + "e => east", + "w => west" ] }, "direction_synonym": { @@ -758,6 +763,15 @@ "west => w" ] }, + "direction_synonym_contraction_keep_original": { + "type": "synonym", + "synonyms": [ + "north => north,n", + "south => south,s", + "east => east,e", + "west => west,w" + ] + }, "remove_ordinals": { "type": "pattern_replace", "pattern": "(([0-9])(st|nd|rd|th))", diff --git a/test/settings.js b/test/settings.js index efd1955f..d4d2018d 100644 --- a/test/settings.js +++ b/test/settings.js @@ -95,6 +95,7 @@ module.exports.tests.peliasIndexTwoEdgeGramAnalyzer = function(test, common) { "removeAllZeroNumericPrefix", "kstem", "peliasTwoEdgeGramFilter", + 
"direction_synonym_contraction_keep_original", "unique", "notnull" ]); From 7b8597f680b978087b6ae17bc27580863124fb75 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Wed, 16 Mar 2016 15:12:35 +0100 Subject: [PATCH 07/15] delete this directory which shouldnt be here --- integration/openstreetmap | 1 - 1 file changed, 1 deletion(-) delete mode 160000 integration/openstreetmap diff --git a/integration/openstreetmap b/integration/openstreetmap deleted file mode 160000 index a4f9a3bd..00000000 --- a/integration/openstreetmap +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a4f9a3bd44b1c68c29cfbf8892a6ce5bd3ff8a80 From 659509dbb1c742a5565efe8b440d7fb595b4212e Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Mon, 28 Mar 2016 15:42:18 +0200 Subject: [PATCH 08/15] custom tokenizers peliasNameTokenizer & peliasStreetTokenizer --- integration/analyzer_peliasAdmin.js | 25 +++++++++++++++++++++++++ integration/analyzer_peliasPhrase.js | 25 +++++++++++++++++++++++++ integration/analyzer_peliasStreet.js | 22 ++++++++++++++++++++++ punctuation.js | 6 +++--- settings.js | 20 +++++++++++++++----- 5 files changed, 90 insertions(+), 8 deletions(-) diff --git a/integration/analyzer_peliasAdmin.js b/integration/analyzer_peliasAdmin.js index 9c3dfeb8..e69afff2 100644 --- a/integration/analyzer_peliasAdmin.js +++ b/integration/analyzer_peliasAdmin.js @@ -74,6 +74,31 @@ module.exports.tests.functional = function(test, common){ }); }; +module.exports.tests.tokenizer = function(test, common){ + test( 'tokenizer', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasAdmin' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // specify 2 parts with a delimeter + assertAnalysis( 'forward slash', 'Trinidad/Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'forward slash', 'Trinidad /Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'forward 
slash', 'Trinidad/ Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'back slash', 'Trinidad\\Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'back slash', 'Trinidad \\Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'back slash', 'Trinidad\\ Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'comma', 'Trinidad,Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'comma', 'Trinidad ,Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'comma', 'Trinidad, Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'space', 'Trinidad,Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'space', 'Trinidad ,Tobago', [ 'trinidad', 'tobago' ]); + assertAnalysis( 'space', 'Trinidad, Tobago', [ 'trinidad', 'tobago' ]); + + suite.run( t.end ); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { diff --git a/integration/analyzer_peliasPhrase.js b/integration/analyzer_peliasPhrase.js index f7f752a1..d9f3197f 100644 --- a/integration/analyzer_peliasPhrase.js +++ b/integration/analyzer_peliasPhrase.js @@ -95,6 +95,31 @@ module.exports.tests.functional = function(test, common){ }); }; +module.exports.tests.tokenizer = function(test, common){ + test( 'tokenizer', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasPhrase' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // specify 2 parts with a delimeter + assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', [ 
'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'space', 'Bedell Street,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'space', 'Bedell Street ,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + assertAnalysis( 'space', 'Bedell Street, 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + + suite.run( t.end ); + }); +}; + // @ref: https://www.elastic.co/guide/en/elasticsearch/guide/current/phrase-matching.html // @ref: https://www.elastic.co/guide/en/elasticsearch/guide/current/slop.html module.exports.tests.slop = function(test, common){ diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index 2ef143af..f9b3761c 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -63,6 +63,28 @@ module.exports.tests.normalize_punctuation = function(test, common){ }); }; +module.exports.tests.tokenizer = function(test, common){ + test( 'tokenizer', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // specify 2 streets with a delimeter + assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', [ 'bedell st', '133 ave' ]); + assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', [ 'bedell st', '133 ave' ]); + assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', [ 'bedell st', '133 ave' ]); + assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', [ 'bedell st', 
'133 ave' ]); + assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', [ 'bedell st', '133 ave' ]); + assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', [ 'bedell st', '133 ave' ]); + assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', [ 'bedell st', '133 ave' ]); + assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', [ 'bedell st', '133 ave' ]); + assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', [ 'bedell st', '133 ave' ]); + + suite.run( t.end ); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { diff --git a/punctuation.js b/punctuation.js index 5cc843f8..574a0d78 100644 --- a/punctuation.js +++ b/punctuation.js @@ -3,8 +3,8 @@ // @see: org/apache/lucene/analysis/cn/smart/stopwords.txt module.exports.all = [ - ",",".","`","-","_","=","?","'","|","\"","(",")","{","}","[","]","<",">","*", - "#","&","^","$","@","!","~",":",";","+","/","\\\\","《","》","—","-",",","。", + ".","`","-","_","=","?","'","|","\"","(",")","{","}","[","]","<",">","*", + "#","&","^","$","@","!","~",":",";","+","《","》","—","-",",","。", "、", ":",";","!","·","?","“","”",")","(","【","】","[","]","●" ]; @@ -21,4 +21,4 @@ module.exports.allowed.forEach(function(item){ if( index > -1 ){ module.exports.blacklist.splice(index, 1); } -}); \ No newline at end of file +}); diff --git a/settings.js b/settings.js index 3dc469ef..0f982764 100644 --- a/settings.js +++ b/settings.js @@ -12,10 +12,20 @@ function generate(){ // Default settings var settings = { "analysis": { + "tokenizer": { + "peliasNameTokenizer": { + "type": "pattern", + "pattern": "[\\s,/\\\\]+" + }, + "peliasStreetTokenizer": { + "type": "pattern", + "pattern": "[,/\\\\]+" + } + }, "analyzer": { "peliasAdmin": { "type": "custom", - "tokenizer": "whitespace", + "tokenizer": "peliasNameTokenizer", "char_filter" : ["punctuation"], "filter": [ "lowercase", @@ -27,7 +37,7 @@ function generate(){ }, "peliasOneEdgeGram" : { "type": "custom", - "tokenizer" : "whitespace", + 
"tokenizer" : "peliasNameTokenizer", "char_filter" : ["punctuation"], "filter": [ "lowercase", @@ -44,7 +54,7 @@ function generate(){ }, "peliasTwoEdgeGram" : { "type": "custom", - "tokenizer" : "whitespace", + "tokenizer" : "peliasNameTokenizer", "char_filter" : ["punctuation"], "filter": [ "lowercase", @@ -61,7 +71,7 @@ function generate(){ }, "peliasPhrase": { "type": "custom", - "tokenizer":"whitespace", + "tokenizer":"peliasNameTokenizer", "char_filter" : ["punctuation"], "filter": [ "lowercase", @@ -91,7 +101,7 @@ function generate(){ }, "peliasStreet": { "type": "custom", - "tokenizer":"keyword", + "tokenizer":"peliasStreetTokenizer", "char_filter" : ["punctuation"], "filter": [ "lowercase", From 6b5701b57f019ddd3ba64aa44e4157d17880f3a0 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Mon, 28 Mar 2016 18:18:19 +0200 Subject: [PATCH 09/15] final changes before staging --- integration/analyzer_peliasQueryFullToken.js | 18 ++++++------ integration/run.js | 30 ++++++++++---------- settings.js | 4 +++ 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/integration/analyzer_peliasQueryFullToken.js b/integration/analyzer_peliasQueryFullToken.js index 0942092d..b29d3c45 100644 --- a/integration/analyzer_peliasQueryFullToken.js +++ b/integration/analyzer_peliasQueryFullToken.js @@ -118,15 +118,15 @@ module.exports.tests.tokenizer = function(test, common){ suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up // specify 2 streets with a delimeter - assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', [ 'bedell st', '133 ave' ]); - 
assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', [ 'bedell st', '133 ave' ]); + assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); suite.run( t.end ); }); diff --git a/integration/run.js b/integration/run.js index fbdb398c..f57955ae 100644 --- a/integration/run.js +++ b/integration/run.js @@ -4,23 +4,23 @@ var common = {}; var tests = [ require('./validate.js'), - require('./dynamic_templates.js'), - require('./analyzer_peliasIndexOneEdgeGram.js'), - require('./analyzer_peliasIndexTwoEdgeGram.js'), + // require('./dynamic_templates.js'), + // require('./analyzer_peliasIndexOneEdgeGram.js'), + // require('./analyzer_peliasIndexTwoEdgeGram.js'), require('./analyzer_peliasQueryPartialToken.js'), require('./analyzer_peliasQueryFullToken.js'), - require('./analyzer_peliasPhrase.js'), - require('./analyzer_peliasAdmin.js'), - 
require('./analyzer_peliasHousenumber.js'), - require('./analyzer_peliasZip.js'), - require('./analyzer_peliasStreet.js'), - require('./address_matching.js'), - require('./admin_matching.js'), - require('./source_layer_sourceid_filtering.js'), - require('./bounding_box.js'), - require('./autocomplete_street_synonym_expansion.js'), - require('./autocomplete_directional_synonym_expansion.js'), - require('./autocomplete_abbreviated_street_names.js') + // require('./analyzer_peliasPhrase.js'), + // require('./analyzer_peliasAdmin.js'), + // require('./analyzer_peliasHousenumber.js'), + // require('./analyzer_peliasZip.js'), + // require('./analyzer_peliasStreet.js'), + // require('./address_matching.js'), + // require('./admin_matching.js'), + // require('./source_layer_sourceid_filtering.js'), + // require('./bounding_box.js'), + // require('./autocomplete_street_synonym_expansion.js'), + // require('./autocomplete_directional_synonym_expansion.js'), + // require('./autocomplete_abbreviated_street_names.js') ]; tests.map(function(t) { diff --git a/settings.js b/settings.js index fb0b1add..ca4bfa49 100644 --- a/settings.js +++ b/settings.js @@ -45,6 +45,7 @@ function generate(){ "trim", "full_token_address_suffix_expansion", "ampersand", + "remove_ordinals", "removeAllZeroNumericPrefix", "kstem", "peliasOneEdgeGramFilter", @@ -62,6 +63,7 @@ function generate(){ "trim", "full_token_address_suffix_expansion", "ampersand", + "remove_ordinals", "removeAllZeroNumericPrefix", "kstem", "peliasTwoEdgeGramFilter", @@ -80,6 +82,7 @@ function generate(){ "trim", "partial_token_address_suffix_expansion", "ampersand", + "remove_ordinals", "removeAllZeroNumericPrefix", "kstem", "peliasOneEdgeGramFilter", @@ -95,6 +98,7 @@ function generate(){ "lowercase", "asciifolding", "trim", + "remove_ordinals", "full_token_address_suffix_expansion", "ampersand", "removeAllZeroNumericPrefix", From 96da14091f4f46b156506e73f985668e7a450015 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 7 
Apr 2016 13:03:57 +0200 Subject: [PATCH 10/15] allow single digit grams in the 2gram index --- .../analyzer_peliasIndexTwoEdgeGram.js | 27 +++++++++++++---- integration/run.js | 30 +++++++++---------- settings.js | 7 +++++ 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/integration/analyzer_peliasIndexTwoEdgeGram.js b/integration/analyzer_peliasIndexTwoEdgeGram.js index cde75a0a..a1f1d4fd 100644 --- a/integration/analyzer_peliasIndexTwoEdgeGram.js +++ b/integration/analyzer_peliasIndexTwoEdgeGram.js @@ -36,8 +36,8 @@ module.exports.tests.analyze = function(test, common){ // assertAnalysis( 'ampersand', 'aa & bb', ['aa','&','bb'] ); // assertAnalysis( 'ampersand', 'aa and & and bb', ['aa','&','bb'] ); - assertAnalysis( 'peliasIndexTwoEdgeGramFilter', '1 a ab abc abcdefghij', ['ab','abc','abcd','abcde','abcdef','abcdefg','abcdefgh','abcdefghi','abcdefghij'] ); - assertAnalysis( 'removeAllZeroNumericPrefix', '0002 00011', ['11'] ); + assertAnalysis( 'peliasIndexTwoEdgeGramFilter', '1 a ab abc abcdefghij', ['1', 'ab','abc','abcd','abcde','abcdef','abcdefg','abcdefgh','abcdefghi','abcdefghij'] ); + assertAnalysis( 'removeAllZeroNumericPrefix', '0002 00011', ['2', '11'] ); assertAnalysis( 'unique', '11 11 11', ['11'] ); assertAnalysis( 'notnull', ' / / ', [] ); @@ -49,7 +49,7 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'punctuation', punctuation.all.join(''), ['-&'] ); // ensure that single grams are not created - assertAnalysis( '1grams', 'a aa b bb 1 11', ['aa','bb','11'] ); + assertAnalysis( '1grams', 'a aa b bb 1 11', ['aa','bb','1','11'] ); // for directionals (north/south/east/west) we allow single grams assertAnalysis( 'direction_synonym_contraction_keep_original', 'a', [] ); @@ -140,8 +140,7 @@ module.exports.tests.functional = function(test, common){ }); }; - -module.exports.tests.functional = function(test, common){ +module.exports.tests.address_suffix_expansions = function(test, common){ test( 'address suffix 
expansion', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); @@ -160,6 +159,24 @@ module.exports.tests.functional = function(test, common){ }); }; +// handle special cases for numerals +module.exports.tests.numerals = function(test, common){ + test( 'numerals', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + var assertAnalysis = analyze.bind( null, suite, t, 'peliasIndexTwoEdgeGram' ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // allow single grams for single digit numbers + assertAnalysis( 'single digit', '1 2', [ '1', '2' ]); + + // do not produce single grams for 2+ digit numbers + assertAnalysis( 'multi digits', '12 999', [ '12', '99', '999' ]); + + suite.run( t.end ); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { diff --git a/integration/run.js b/integration/run.js index f57955ae..fbdb398c 100644 --- a/integration/run.js +++ b/integration/run.js @@ -4,23 +4,23 @@ var common = {}; var tests = [ require('./validate.js'), - // require('./dynamic_templates.js'), - // require('./analyzer_peliasIndexOneEdgeGram.js'), - // require('./analyzer_peliasIndexTwoEdgeGram.js'), + require('./dynamic_templates.js'), + require('./analyzer_peliasIndexOneEdgeGram.js'), + require('./analyzer_peliasIndexTwoEdgeGram.js'), require('./analyzer_peliasQueryPartialToken.js'), require('./analyzer_peliasQueryFullToken.js'), - // require('./analyzer_peliasPhrase.js'), - // require('./analyzer_peliasAdmin.js'), - // require('./analyzer_peliasHousenumber.js'), - // require('./analyzer_peliasZip.js'), - // require('./analyzer_peliasStreet.js'), - // require('./address_matching.js'), - // require('./admin_matching.js'), - // require('./source_layer_sourceid_filtering.js'), - // require('./bounding_box.js'), - // require('./autocomplete_street_synonym_expansion.js'), - // 
require('./autocomplete_directional_synonym_expansion.js'), - // require('./autocomplete_abbreviated_street_names.js') + require('./analyzer_peliasPhrase.js'), + require('./analyzer_peliasAdmin.js'), + require('./analyzer_peliasHousenumber.js'), + require('./analyzer_peliasZip.js'), + require('./analyzer_peliasStreet.js'), + require('./address_matching.js'), + require('./admin_matching.js'), + require('./source_layer_sourceid_filtering.js'), + require('./bounding_box.js'), + require('./autocomplete_street_synonym_expansion.js'), + require('./autocomplete_directional_synonym_expansion.js'), + require('./autocomplete_abbreviated_street_names.js') ]; tests.map(function(t) { diff --git a/settings.js b/settings.js index ca4bfa49..22c856c4 100644 --- a/settings.js +++ b/settings.js @@ -66,7 +66,9 @@ function generate(){ "remove_ordinals", "removeAllZeroNumericPrefix", "kstem", + "prefixZeroToSingleDigitNumbers", "peliasTwoEdgeGramFilter", + "removeAllZeroNumericPrefix", "direction_synonym_contraction_keep_original", "unique", "notnull" @@ -174,6 +176,11 @@ function generate(){ "min_gram" : 2, "max_gram" : 18 }, + "prefixZeroToSingleDigitNumbers" :{ + "type" : "pattern_replace", + "pattern" : "^([0-9])$", + "replacement" : "0$1" + }, "removeAllZeroNumericPrefix" :{ "type" : "pattern_replace", "pattern" : "^(0*)", From 034114665aaf752db574b569cc5bbce82af2c93b Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 7 Apr 2016 13:04:05 +0200 Subject: [PATCH 11/15] slop tests --- integration/analyzer_peliasQueryFullToken.js | 44 ++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/integration/analyzer_peliasQueryFullToken.js b/integration/analyzer_peliasQueryFullToken.js index b29d3c45..c7cbf69f 100644 --- a/integration/analyzer_peliasQueryFullToken.js +++ b/integration/analyzer_peliasQueryFullToken.js @@ -132,6 +132,50 @@ module.exports.tests.tokenizer = function(test, common){ }); }; +// test the minimum amount of slop required to retrieve address documents 
+module.exports.tests.slop = function(test, common){ + test( 'slop', function(t){ + + var suite = new elastictest.Suite( null, { schema: schema } ); + suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + + // index a document + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, + type: 'test', + id: '1', + body: { name: { default: '52 Görlitzer Straße' } } + }, done); + }); + + // search using 'peliasQueryFullToken' + // in this case we require a slop of 3 to return the same + // record with the street number and street name reversed. + // (as is common in European countries, such as Germany). + suite.assert( function( done ){ + suite.client.search({ + index: suite.props.index, + type: 'test', + body: { query: { match: { + 'name.default': { + 'analyzer': 'peliasQueryFullToken', + 'query': 'Görlitzer Straße 52', + 'type': 'phrase', + 'slop': 3, + } + }}} + }, function( err, res ){ + t.equal( err, undefined ); + t.equal( res.hits.total, 1, 'document found' ); + done(); + }); + }); + + suite.run( t.end ); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { From eb9c17f42b67c255d08f77ad07af4aa9e36f8d1e Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 18 Apr 2016 20:57:17 +0200 Subject: [PATCH 12/15] change peliasQueryPartialToken analysis to produce single tokens per word instead of ngrams --- .../analyzer_peliasQueryPartialToken.js | 67 ++++++------------- .../autocomplete_abbreviated_street_names.js | 44 ++++++------ settings.js | 1 - test/fixtures/expected.json | 1 - 4 files changed, 44 insertions(+), 69 deletions(-) diff --git a/integration/analyzer_peliasQueryPartialToken.js b/integration/analyzer_peliasQueryPartialToken.js index 60cbe04f..fe52a95c 100644 --- a/integration/analyzer_peliasQueryPartialToken.js +++ b/integration/analyzer_peliasQueryPartialToken.js @@ -17,38 +17,34 @@ module.exports.tests.analyze = function(test, common){ 
assertAnalysis( 'lowercase', 'F', ['f']); assertAnalysis( 'asciifolding', 'é', ['e']); - assertAnalysis( 'asciifolding', 'ß', ['s','ss']); - assertAnalysis( 'asciifolding', 'æ', ['a','ae']); + assertAnalysis( 'asciifolding', 'ß', ['ss']); + assertAnalysis( 'asciifolding', 'æ', ['ae']); assertAnalysis( 'asciifolding', 'ł', ['l']); assertAnalysis( 'asciifolding', 'ɰ', ['m']); assertAnalysis( 'trim', ' f ', ['f'] ); assertAnalysis( 'ampersand', 'a and b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'a & b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'a and & and b', ['a','&','b'] ); - assertAnalysis( 'ampersand', 'land', ['l','la','lan','land'] ); // should not replace inside tokens + assertAnalysis( 'ampersand', 'land', ['land'] ); // should not replace inside tokens // partial_token_address_suffix_expansion - assertAnalysis( 'partial_token_address_suffix_expansion', 'rd', ['r','ro','roa','road'] ); - assertAnalysis( 'partial_token_address_suffix_expansion', 'ctr', ['c','ce','cen','cent','cente','center'] ); + assertAnalysis( 'partial_token_address_suffix_expansion', 'rd', ['road'] ); + assertAnalysis( 'partial_token_address_suffix_expansion', 'ctr', ['center'] ); - assertAnalysis( 'peliasQueryPartialTokenFilter', '1 a ab abc abcdefghij', ['1','a','ab','abc','abcd','abcde','abcdef','abcdefg','abcdefgh','abcdefghi','abcdefghij'] ); + assertAnalysis( 'peliasQueryPartialTokenFilter', '1 a ab abc abcdefghij', ['1','a','ab','abc','abcdefghij'] ); assertAnalysis( 'removeAllZeroNumericPrefix', '00001', ['1'] ); assertAnalysis( 'unique', '1 1 1', ['1'] ); assertAnalysis( 'notnull', ' / / ', [] ); - assertAnalysis( 'kstem', 'mcdonalds', ['m', 'mc', 'mcd', 'mcdo', 'mcdon', 'mcdona', 'mcdonal', 'mcdonald'] ); - assertAnalysis( 'kstem', 'McDonald\'s', ['m', 'mc', 'mcd', 'mcdo', 'mcdon', 'mcdona', 'mcdonal', 'mcdonald'] ); - assertAnalysis( 'kstem', 'peoples', ['p', 'pe', 'peo', 'peop', 'peopl', 'people'] ); + assertAnalysis( 'kstem', 'mcdonalds', ['mcdonald'] ); + 
assertAnalysis( 'kstem', 'McDonald\'s', ['mcdonald'] ); + assertAnalysis( 'kstem', 'peoples', ['people'] ); // remove punctuation (handled by the char_filter) - assertAnalysis( 'punctuation', punctuation.all.join(''), ['-','-&'] ); + assertAnalysis( 'punctuation', punctuation.all.join(''), ['-&'] ); // ensure that very large grams are created - assertAnalysis( 'largeGrams', 'grolmanstrasse', [ - 'g','gr','gro','grol','grolm','grolma','grolman','grolmans','grolmanst', - 'grolmanstr','grolmanstra','grolmanstras','grolmanstrass', - 'grolmanstrasse' - ]); + assertAnalysis( 'largeGrams', 'grolmanstrasse', ['grolmanstrasse']); suite.run( t.end ); }); @@ -63,21 +59,11 @@ module.exports.tests.address_suffix_expansions = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryPartialToken' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up - assertAnalysis( 'safe expansions', 'aly', [ - 'a', 'al', 'all', 'alle', 'alley' - ]); + assertAnalysis( 'safe expansions', 'aly', [ 'alley' ]); + assertAnalysis( 'safe expansions', 'xing', [ 'crossing' ]); + assertAnalysis( 'safe expansions', 'rd', [ 'road' ]); - assertAnalysis( 'safe expansions', 'xing', [ - 'c', 'cr', 'cro', 'cros', 'cross', 'crossi', 'crossin', 'crossing' - ]); - - assertAnalysis( 'safe expansions', 'rd', [ - 'r', 'ro', 'roa', 'road' - ]); - - assertAnalysis( 'unsafe expansion', 'ct st', [ - 'c', 'ct', 's', 'st' - ]); + assertAnalysis( 'unsafe expansion', 'ct st', [ 'ct', 'st' ]); suite.run( t.end ); }); @@ -91,13 +77,8 @@ module.exports.tests.stop_words = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryPartialToken' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up - assertAnalysis( 'street suffix', 'AB street', [ - 'a', 'ab', 's', 'st', 'str', 'stre', 'stree', 'street' - ]); - - assertAnalysis( 'street suffix (abbreviation)', 'AB st', [ - 'a', 
'ab', 's', 'st' - ]); + assertAnalysis( 'street suffix', 'AB street', [ 'ab', 'street' ]); + assertAnalysis( 'street suffix (abbreviation)', 'AB st', [ 'ab', 'st' ]); suite.run( t.end ); }); @@ -110,17 +91,9 @@ module.exports.tests.functional = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasQueryPartialToken' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up - assertAnalysis( 'country', 'Trinidad and Tobago', [ - 't', 'tr', 'tri', 'trin', 'trini', 'trinid', 'trinida', 'trinidad', '&', 'to', 'tob', 'toba', 'tobag', 'tobago' - ]); - - assertAnalysis( 'place', 'Toys "R" Us!', [ - 't', 'to', 'toy', 'r', 'u', 'us' - ]); - - assertAnalysis( 'address', '101 mapzen place', [ - '1', '10', '101', 'm', 'ma', 'map', 'mapz', 'mapze', 'mapzen', 'p', 'pl', 'pla', 'plac', 'place' - ]); + assertAnalysis( 'country', 'Trinidad and Tobago', [ 'trinidad', '&', 'tobago' ]); + assertAnalysis( 'place', 'Toys "R" Us!', [ 'toy', 'r', 'us' ]); + assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]); suite.run( t.end ); }); diff --git a/integration/autocomplete_abbreviated_street_names.js b/integration/autocomplete_abbreviated_street_names.js index 122b739c..6d9778ed 100644 --- a/integration/autocomplete_abbreviated_street_names.js +++ b/integration/autocomplete_abbreviated_street_names.js @@ -14,7 +14,7 @@ module.exports.tests = {}; // index the name as 'Grolmanstraße' and then retrieve with partially complete token 'Grolmanstr.' 
module.exports.tests.index_expanded_form_search_contracted = function(test, common){ - test( 'index and retrieve expanded form', function(t){ + test( 'index expanded and retrieve contracted form', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up @@ -71,7 +71,7 @@ module.exports.tests.index_expanded_form_search_contracted = function(test, comm // index the name as 'Grolmanstr.' and then retrieve with 'Grolmanstraße' module.exports.tests.index_contracted_form_search_expanded = function(test, common){ - test( 'index and retrieve contracted form', function(t){ + test( 'index contracted and search expanded', function(t){ var suite = new elastictest.Suite( null, { schema: schema } ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up @@ -86,23 +86,27 @@ module.exports.tests.index_contracted_form_search_expanded = function(test, comm }, done); }); - // search using 'peliasQueryPartialToken' - suite.assert( function( done ){ - suite.client.search({ - index: suite.props.index, - type: 'test', - body: { query: { match: { - 'name.default': { - 'analyzer': 'peliasQueryPartialToken', - 'query': 'Grolmanstraße' - } - }}} - }, function( err, res ){ - t.equal( err, undefined ); - t.equal( res.hits.total, 1, 'document found' ); - done(); - }); - }); + // @note: these tests are commented out, the issue would be better solved + // with https://github.com/pelias/openaddresses/pull/68 + + // + // // search using 'peliasQueryPartialToken' + // suite.assert( function( done ){ + // suite.client.search({ + // index: suite.props.index, + // type: 'test', + // body: { query: { match: { + // 'name.default': { + // 'analyzer': 'peliasQueryPartialToken', + // 'query': 'Grolmanstraße' + // } + // }}} + // }, function( err, res ){ + // t.equal( err, undefined ); + // t.equal( res.hits.total, 1, 'document found' ); + // 
done(); + // }); + // }); // search using 'peliasQueryFullToken' // @note: this case is currently not supported. @@ -132,7 +136,7 @@ module.exports.tests.index_contracted_form_search_expanded = function(test, comm module.exports.all = function (tape, common) { function test(name, testFunction) { - return tape('autocomplete street synonym expansion: ' + name, testFunction); + return tape('autocomplete abbreviated street names: ' + name, testFunction); } for( var testCase in module.exports.tests ){ diff --git a/settings.js b/settings.js index 040c02ce..4838d26c 100644 --- a/settings.js +++ b/settings.js @@ -72,7 +72,6 @@ function generate(){ "ampersand", "removeAllZeroNumericPrefix", "kstem", - "peliasOneEdgeGramFilter", "unique", "notnull" ] diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 4f9bbd18..fba7f427 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -63,7 +63,6 @@ "ampersand", "removeAllZeroNumericPrefix", "kstem", - "peliasOneEdgeGramFilter", "unique", "notnull" ] From 3dea77f0f16f3332f731701427ddd2b5bc6728e1 Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 18 Apr 2016 20:57:53 +0200 Subject: [PATCH 13/15] add delete command required by update_settings.js --- scripts/update_settings.js | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/update_settings.js b/scripts/update_settings.js index d668c7b9..7fc52890 100644 --- a/scripts/update_settings.js +++ b/scripts/update_settings.js @@ -8,6 +8,7 @@ var _index = 'pelias'; if( schema.settings.hasOwnProperty('index') && schema.settings.index.hasOwnProperty('number_of_shards') ){ delete schema.settings.index.number_of_shards; + delete schema.settings.index.number_of_replicas; } client.indices.close( { index: _index }, function( err, res ){ From 2234bcd30d135a44a547c0a3b171e4a595ba62d8 Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 22 Apr 2016 21:08:38 +0200 Subject: [PATCH 14/15] add difflet support for debugging complex objects using
common.diff() --- package.json | 1 + test/compile.js | 2 +- test/run.js | 11 +++++++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/package.json b/package.json index 867f52a7..ed2c7a53 100644 --- a/package.json +++ b/package.json @@ -35,6 +35,7 @@ "pelias-config": "latest" }, "devDependencies": { + "difflet": "^1.0.1", "elastictest": "^1.2.0", "pelias-esclient": "latest", "tap-spec": "^4.1.1", diff --git a/test/compile.js b/test/compile.js index e3293200..327267cf 100644 --- a/test/compile.js +++ b/test/compile.js @@ -94,7 +94,7 @@ module.exports.tests.current_schema = function(test, common) { delete process.env.PELIAS_CONFIG; // code intentionally commented to allow quick debugging of expected.json - // console.log( JSON.stringify( schemaCopy, null, 2 ) ); + // common.diff(fixture, schemaCopy); t.deepEqual(schemaCopy, fixture); t.end(); diff --git a/test/run.js b/test/run.js index 468d5eeb..24f4a23b 100644 --- a/test/run.js +++ b/test/run.js @@ -1,6 +1,13 @@ -var tape = require('tape'); -var common = {}; +var tape = require('tape'), + diff = require('difflet')({ indent : 2, comment : true }); + +var common = { + // a visual deep diff rendered using console.error() + diff: function( actual, expected ){ + console.error( diff.compare( actual, expected ) ); + } +}; var tests = [ require('./compile.js'), From 44f71eb96e7e8b3be1ee6aa7522fee6cff68e42a Mon Sep 17 00:00:00 2001 From: missinglink Date: Fri, 22 Apr 2016 21:09:10 +0200 Subject: [PATCH 15/15] update tests --- test/fixtures/expected.json | 46 ++++++++++++++++++++++++++----------- test/settings.js | 6 ++++- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index d3ea0f07..8c3af149 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -1,10 +1,20 @@ { "settings": { "analysis": { + "tokenizer": { + "peliasNameTokenizer": { + "type": "pattern", + "pattern": "[\\s,/\\\\]+" + }, + 
"peliasStreetTokenizer": { + "type": "pattern", + "pattern": "[,/\\\\]+" + } + }, "analyzer": { "peliasAdmin": { "type": "custom", - "tokenizer": "whitespace", + "tokenizer": "peliasNameTokenizer", "char_filter": [ "punctuation" ], @@ -18,7 +28,7 @@ }, "peliasIndexOneEdgeGram" : { "type": "custom", - "tokenizer" : "whitespace", + "tokenizer" : "peliasNameTokenizer", "char_filter" : ["punctuation"], "filter": [ "lowercase", @@ -26,6 +36,7 @@ "trim", "full_token_address_suffix_expansion", "ampersand", + "remove_ordinals", "removeAllZeroNumericPrefix", "kstem", "peliasOneEdgeGramFilter", @@ -35,7 +46,7 @@ }, "peliasIndexTwoEdgeGram" : { "type": "custom", - "tokenizer" : "whitespace", + "tokenizer" : "peliasNameTokenizer", "char_filter" : ["punctuation"], "filter": [ "lowercase", @@ -43,9 +54,12 @@ "trim", "full_token_address_suffix_expansion", "ampersand", + "remove_ordinals", "removeAllZeroNumericPrefix", "kstem", + "prefixZeroToSingleDigitNumbers", "peliasTwoEdgeGramFilter", + "removeAllZeroNumericPrefix", "direction_synonym_contraction_keep_original", "unique", "notnull" @@ -53,7 +67,7 @@ }, "peliasQueryPartialToken" : { "type": "custom", - "tokenizer" : "whitespace", + "tokenizer" : "peliasNameTokenizer", "char_filter" : ["punctuation"], "filter": [ "lowercase", @@ -61,6 +75,7 @@ "trim", "partial_token_address_suffix_expansion", "ampersand", + "remove_ordinals", "removeAllZeroNumericPrefix", "kstem", "unique", @@ -69,12 +84,13 @@ }, "peliasQueryFullToken" : { "type": "custom", - "tokenizer" : "whitespace", + "tokenizer" : "peliasNameTokenizer", "char_filter" : ["punctuation"], "filter": [ "lowercase", "asciifolding", "trim", + "remove_ordinals", "full_token_address_suffix_expansion", "ampersand", "removeAllZeroNumericPrefix", @@ -85,7 +101,7 @@ }, "peliasPhrase": { "type": "custom", - "tokenizer": "whitespace", + "tokenizer": "peliasNameTokenizer", "char_filter": [ "punctuation" ], @@ -121,7 +137,7 @@ }, "peliasStreet": { "type": "custom", - "tokenizer": 
"keyword", + "tokenizer": "peliasStreetTokenizer", "char_filter": [ "punctuation" ], @@ -283,6 +299,11 @@ "min_gram": 2, "max_gram": 18 }, + "prefixZeroToSingleDigitNumbers" : { + "type" : "pattern_replace", + "pattern" : "^([0-9])$", + "replacement" : "0$1" + }, "removeAllZeroNumericPrefix": { "type": "pattern_replace", "pattern": "^(0*)", @@ -1426,7 +1447,6 @@ "punctuation": { "type": "mapping", "mappings": [ - ",=>", ".=>", "`=>", "_=>", @@ -1453,8 +1473,6 @@ ":=>", ";=>", "+=>", - "/=>", - "\\\\=>", "《=>", "》=>", "—=>", @@ -1794,7 +1812,7 @@ "match_mapping_type": "string", "mapping": { "type": "string", - "analyzer": "peliasOneEdgeGram", + "analyzer": "peliasIndexOneEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" @@ -1830,7 +1848,7 @@ "match_mapping_type": "string", "mapping": { "type": "string", - "analyzer": "peliasOneEdgeGram", + "analyzer": "peliasIndexOneEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" @@ -1866,7 +1884,7 @@ "match_mapping_type": "string", "mapping": { "type": "string", - "analyzer": "peliasOneEdgeGram", + "analyzer": "peliasIndexOneEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" @@ -1884,7 +1902,7 @@ "match_mapping_type": "string", "mapping": { "type": "string", - "analyzer": "peliasOneEdgeGram", + "analyzer": "peliasIndexOneEdgeGram", "fielddata": { "format": "fst", "loading": "eager_global_ordinals" diff --git a/test/settings.js b/test/settings.js index 3ba0860f..b434377b 100644 --- a/test/settings.js +++ b/test/settings.js @@ -63,6 +63,7 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) { "trim", "full_token_address_suffix_expansion", "ampersand", + "remove_ordinals", "removeAllZeroNumericPrefix", "kstem", "peliasOneEdgeGramFilter", @@ -92,9 +93,12 @@ module.exports.tests.peliasIndexTwoEdgeGramAnalyzer = function(test, common) { "trim", "full_token_address_suffix_expansion", "ampersand", + "remove_ordinals", 
"removeAllZeroNumericPrefix", "kstem", + "prefixZeroToSingleDigitNumbers", "peliasTwoEdgeGramFilter", + "removeAllZeroNumericPrefix", "direction_synonym_contraction_keep_original", "unique", "notnull" @@ -384,7 +388,7 @@ module.exports.tests.punctuationCharFilter = function(test, common) { var char_filter = s.analysis.char_filter.punctuation; t.equal(char_filter.type, 'mapping'); t.true(Array.isArray(char_filter.mappings)); - t.equal(char_filter.mappings.length, 50); + t.equal(char_filter.mappings.length, 47); t.end(); }); };