From f08bc0b06b9c23cada3b6defe9b71d5170e809cc Mon Sep 17 00:00:00 2001 From: Siarhei Fedartsou Date: Thu, 26 Sep 2024 11:26:27 +0200 Subject: [PATCH] feature(punctuation): amend punctuation list --- integration/analyzer_peliasPhrase.js | 3 ++- punctuation.js | 21 ++++++++------------- test/fixtures/expected.json | 14 ++++++++++++-- test/settings.js | 2 +- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/integration/analyzer_peliasPhrase.js b/integration/analyzer_peliasPhrase.js index 11828cd6..75262a87 100644 --- a/integration/analyzer_peliasPhrase.js +++ b/integration/analyzer_peliasPhrase.js @@ -47,7 +47,8 @@ module.exports.tests.analyze = function(test, common){ // remove punctuation (handled by the char_filter) assertAnalysis( 'punctuation', punctuation.all.join(''), ['0:&', '0:and', '0:und'] ); assertAnalysis( 'punctuation', 'Hawai‘i', ['hawaii'] ); - + assertAnalysis( 'punctuation - « in between', '«res»pub«lika»', ['respublika'] ); + assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] ); assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] ); diff --git a/punctuation.js b/punctuation.js index 5c36452d..67191534 100644 --- a/punctuation.js +++ b/punctuation.js @@ -1,23 +1,18 @@ // These characters will be removed from ngrams/shingles // @see: org/apache/lucene/analysis/cn/smart/stopwords.txt -module.exports.all = [ - ".","`","‘","-","_","=","?","'","|","\"","(",")","{","}","[","]","<",">","*", - "#","&","^","$","@","!","~",":",";","+","《","》","—","-",",","。", - "、", ":",";","!","·","?","„","“","”",")","(","【","】","[","]","●" +const all = [ + ".","`","‘","’","‛","-","_","=","?","'","|","\"","(",")","{","}","[","]","<",">","*", + "#","&","^","$","@","!","~",":",";","+","《","》","—","-",",","。","‹","›","⹂","〝","〞", + "、", ":",";","!","·","?","„","“","”","‟",")","(","【","】","[","]","●","«","»" ]; -module.exports.allowed = [ +const allowed = [ "-", // allow hypens "&" // allow ampersands ]; -module.exports.blacklist = module.exports.all.slice(); - // remove alowed chars from blacklist -module.exports.allowed.forEach(function(item){ - var index = module.exports.blacklist.indexOf(item); - if( index > -1 ){ - module.exports.blacklist.splice(index, 1); - } -}); +const blacklist = all.filter(s => !allowed.includes(s)); + +module.exports = { all, allowed, blacklist }; \ No newline at end of file diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 40d2cdc1..8bddef1e 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -2277,6 +2277,8 @@ ".=>", "`=>", "‘=>", + "’=>", + "‛=>", "_=>", "==>", "?=>", @@ -2307,6 +2309,11 @@ "-=>", ",=>", "。=>", + "‹=>", + "›=>", + "⹂=>", + "〝=>", + "〞=>", "、=>", ":=>", ";=>", @@ -2316,13 +2323,16 @@ "„=>", "“=>", "”=>", + "‟=>", ")=>", "(=>", "【=>", "】=>", "[=>", "]=>", - "●=>" + "●=>", + "«=>", + "»=>" ] }, "alphanumeric": { @@ -3023,4 +3033,4 @@ }, "dynamic": "strict" } -} +} \ No newline at end of file diff --git a/test/settings.js b/test/settings.js index d23b1c63..78c6b2ba 100644 --- a/test/settings.js +++ b/test/settings.js @@ -591,7 +591,7 @@ module.exports.tests.punctuationCharFilter = function(test, common) { var char_filter = s.analysis.char_filter.punctuation; t.equal(char_filter.type, 'mapping'); t.true(Array.isArray(char_filter.mappings)); - t.equal(char_filter.mappings.length, 49); + t.equal(char_filter.mappings.length, 59); t.end(); }); };