Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add User Dictionary Rules to NoriTokenizer #3634

Merged
merged 1 commit into from
Apr 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions src/Nest/Analysis/Tokenizers/NoriTokenizer.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Runtime.Serialization;
using System.Collections.Generic;
using System.Runtime.Serialization;
using Newtonsoft.Json;
using Newtonsoft.Json.Converters;

Expand Down Expand Up @@ -32,10 +33,20 @@ public interface INoriTokenizer : ITokenizer

/// <summary>
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG) may be appended to
/// the default dictionary. This property allows you to specify this file on disk
/// the default dictionary. This property allows you to specify a path to this file on disk
/// </summary>
[JsonProperty("user_dictionary")]
string UserDictionary { get; set; }

/// <summary>
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG)
/// can be specified inline with this property
/// </summary>
/// <remarks>
/// Valid for Elasticsearch 6.6.0+
/// </remarks>
[JsonProperty("user_dictionary_rules")]
IEnumerable<string> UserDictionaryRules { get; set; }
}

/// <inheritdoc cref="INoriTokenizer" />
Expand All @@ -48,6 +59,9 @@ public class NoriTokenizer : TokenizerBase, INoriTokenizer

/// <inheritdoc cref="INoriTokenizer.UserDictionary" />
public string UserDictionary { get; set; }

/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
public IEnumerable<string> UserDictionaryRules { get; set; }
}

/// <inheritdoc cref="INoriTokenizer" />
Expand All @@ -58,11 +72,18 @@ public class NoriTokenizerDescriptor

// Explicit INoriTokenizer implementations. The public fluent methods of this
// descriptor share these property names, so the properties themselves are only
// reachable through the INoriTokenizer interface.
NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }
string INoriTokenizer.UserDictionary { get; set; }
IEnumerable<string> INoriTokenizer.UserDictionaryRules { get; set; }

/// <inheritdoc cref="INoriTokenizer.DecompoundMode" />
/// <param name="mode">The decompound mode to store on the tokenizer; <c>null</c> is accepted.</param>
public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode)
{
    return Assign(tokenizer => tokenizer.DecompoundMode = mode);
}

/// <inheritdoc cref="INoriTokenizer.UserDictionary" />
/// <param name="path">The path to the user dictionary file.</param>
public NoriTokenizerDescriptor UserDictionary(string path)
{
    return Assign(tokenizer => tokenizer.UserDictionary = path);
}

/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
/// <param name="rules">The inline user dictionary rules, one rule per argument.</param>
public NoriTokenizerDescriptor UserDictionaryRules(params string[] rules)
{
    return Assign(tokenizer => tokenizer.UserDictionaryRules = rules);
}

/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
/// <param name="rules">The inline user dictionary rules as a sequence; stored as given, without copying.</param>
public NoriTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules)
{
    return Assign(tokenizer => tokenizer.UserDictionaryRules = rules);
}
}
}
23 changes: 23 additions & 0 deletions src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,29 @@ public class NoriTests : TokenizerAssertionBase<NoriTests>
public override string Name => "nori";
}

/// <summary>
/// Supplies the fluent, object-initializer and expected-JSON forms of a nori
/// tokenizer configured with mixed decompounding and inline user dictionary rules.
/// </summary>
[SkipVersion("<6.6.0", "inline user dictionary rules introduced in 6.6.0")]
public class NoriWithUserDictionaryTests : TokenizerAssertionBase<NoriWithUserDictionaryTests>
{
    /// <summary>The name supplied for this tokenizer definition.</summary>
    public override string Name
    {
        get { return "nori_userdictionary"; }
    }

    /// <summary>The tokenizer expressed through the fluent descriptor API.</summary>
    public override FuncTokenizer Fluent
    {
        get
        {
            return (name, tokenizers) => tokenizers.Nori(
                name,
                nori => nori
                    .DecompoundMode(NoriDecompoundMode.Mixed)
                    .UserDictionaryRules(InlineRules()));
        }
    }

    /// <summary>The tokenizer expressed through the object-initializer API.</summary>
    public override ITokenizer Initializer
    {
        get
        {
            return new NoriTokenizer
            {
                DecompoundMode = NoriDecompoundMode.Mixed,
                UserDictionaryRules = InlineRules()
            };
        }
    }

    /// <summary>The JSON shape both API forms are expected to correspond to.</summary>
    public override object Json
    {
        get
        {
            return new
            {
                type = "nori_tokenizer",
                decompound_mode = "mixed",
                user_dictionary_rules = InlineRules()
            };
        }
    }

    // Builds a fresh array on every call so each consumer gets its own instance,
    // exactly as the separate inline array literals did.
    private static string[] InlineRules()
    {
        return new[] { "c++", "C샤프", "세종", "세종시 세종 시" };
    }
}

[SkipVersion("<6.4.0", "char_group introduced in 6.4.0")]
public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
{
Expand Down