From 9c589eff4524037bdeb9d15216109cbf78c3293a Mon Sep 17 00:00:00 2001
From: Russ Cam
Date: Mon, 1 Apr 2019 15:51:07 +1000
Subject: [PATCH] Add User Dictionary Rules to NoriTokenizer

Relates #3615, elastic/elasticsearch#3620
---
NOTE(review): this patch was recovered from a whitespace-collapsed copy in
which angle-bracketed text had been stripped. Line breaks were rebuilt from
the hunk line counts; tab indentation, XML doc tags (summary / remarks /
inheritdoc) and generic type arguments (IEnumerable<string>,
TokenizerAssertionBase<...>) are reconstructed from the surrounding code and
should be confirmed against the upstream commit before applying. The author
e-mail address was also lost and has not been restored.

 src/Nest/Analysis/Tokenizers/NoriTokenizer.cs | 25 +++++++++++++++++--
 .../Analysis/Tokenizers/TokenizerTests.cs     | 23 +++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs b/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs
index 761f4e3d8c4..6ab009e2b9b 100644
--- a/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs
+++ b/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs
@@ -1,4 +1,5 @@
-using System.Runtime.Serialization;
+using System.Collections.Generic;
+using System.Runtime.Serialization;
 using Newtonsoft.Json;
 using Newtonsoft.Json.Converters;
 
@@ -32,10 +33,20 @@ public interface INoriTokenizer : ITokenizer
 
 		/// <summary>
 		/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG) may be appended to
-		/// the default dictionary. This property allows you to specify this file on disk
+		/// the default dictionary. This property allows you to specify a path to this file on disk
 		/// </summary>
 		[JsonProperty("user_dictionary")]
 		string UserDictionary { get; set; }
+
+		/// <summary>
+		/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG)
+		/// can be specified inline with this property
+		/// </summary>
+		/// <remarks>
+		/// Valid for Elasticsearch 6.6.0+
+		/// </remarks>
+		[JsonProperty("user_dictionary_rules")]
+		IEnumerable<string> UserDictionaryRules { get; set; }
 	}
 
 	/// <inheritdoc />
@@ -48,6 +59,9 @@ public class NoriTokenizer : TokenizerBase, INoriTokenizer
 
 		/// <inheritdoc />
 		public string UserDictionary { get; set; }
+
+		/// <inheritdoc />
+		public IEnumerable<string> UserDictionaryRules { get; set; }
 	}
 
 	/// <inheritdoc />
@@ -58,11 +72,18 @@ public class NoriTokenizerDescriptor
 
 		NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }
 		string INoriTokenizer.UserDictionary { get; set; }
+		IEnumerable<string> INoriTokenizer.UserDictionaryRules { get; set; }
 
 		/// <inheritdoc />
 		public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode) => Assign(a => a.DecompoundMode = mode);
 
 		/// <inheritdoc />
 		public NoriTokenizerDescriptor UserDictionary(string path) => Assign(a => a.UserDictionary = path);
+
+		/// <inheritdoc />
+		public NoriTokenizerDescriptor UserDictionaryRules(params string[] rules) => Assign(a => a.UserDictionaryRules = rules);
+
+		/// <inheritdoc />
+		public NoriTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules) => Assign(a => a.UserDictionaryRules = rules);
 	}
 }
diff --git a/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs b/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
index bb9a8e9858c..c124b1ca055 100644
--- a/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
+++ b/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
@@ -222,6 +222,29 @@ public class NoriTests : TokenizerAssertionBase
 			public override string Name => "nori";
 		}
 
+		[SkipVersion("<6.6.0", "inline user dictionary rules introduced in 6.6.0")]
+		public class NoriWithUserDictionaryTests : TokenizerAssertionBase<NoriWithUserDictionaryTests>
+		{
+			public override FuncTokenizer Fluent => (n, t) => t.Nori(n, e => e
+				.DecompoundMode(NoriDecompoundMode.Mixed)
+				.UserDictionaryRules("c++", "C샤프", "세종", "세종시 세종 시")
+			);
+
+			public override ITokenizer Initializer => new NoriTokenizer
+			{
+				DecompoundMode = NoriDecompoundMode.Mixed,
+				UserDictionaryRules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
+			};
+
+			public override object Json => new
+			{
+				type = "nori_tokenizer",
+				decompound_mode = "mixed",
+				user_dictionary_rules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
+			};
+			public override string Name => "nori_userdictionary";
+		}
+
 		[SkipVersion("<6.4.0", "char_group introduced in 6.4.0")]
 		public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
 		{