Skip to content

Commit

Permalink
Add discard_compound_token to kuromoji tokenizer (#4912) (#4928)
Browse files Browse the repository at this point in the history
Relates: elastic/elasticsearch#57421

Co-authored-by: Russ Cam <russ.cam@elastic.co>
  • Loading branch information
github-actions[bot] and russcam authored Aug 5, 2020
1 parent 3719053 commit b93c631
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 8 deletions.
32 changes: 24 additions & 8 deletions src/Nest/Analysis/Plugins/Kuromoji/KuromojiTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,16 @@ public interface IKuromojiTokenizer : ITokenizer
[JsonFormatter(typeof(NullableStringBooleanFormatter))]
bool? DiscardPunctuation { get; set; }

/// <summary>
/// Whether the original compound tokens should be discarded from the output when
/// <see cref="Mode"/> is <see cref="KuromojiTokenizationMode.Search"/>. Defaults to <c>false</c>.
/// <para />
/// Valid in Elasticsearch 7.9.0+
/// </summary>
[DataMember(Name ="discard_compound_token")]
[JsonFormatter(typeof(NullableStringBooleanFormatter))]
bool? DiscardCompoundToken { get; set; }

/// <summary>
/// The tokenization mode determines how the tokenizer handles compound and unknown words.
/// </summary>
Expand Down Expand Up @@ -64,6 +74,9 @@ public class KuromojiTokenizer : TokenizerBase, IKuromojiTokenizer
/// <inheritdoc cref="IKuromojiTokenizer.DiscardPunctuation" />
public bool? DiscardPunctuation { get; set; }

/// <inheritdoc cref="IKuromojiTokenizer.DiscardCompoundToken" />
public bool? DiscardCompoundToken { get; set; }

/// <inheritdoc cref="IKuromojiTokenizer.Mode" />
public KuromojiTokenizationMode? Mode { get; set; }

Expand All @@ -86,32 +99,35 @@ public class KuromojiTokenizerDescriptor
{
/// <summary>Type name identifying this tokenizer.</summary>
protected override string Type => "kuromoji_tokenizer";

// Explicit IKuromojiTokenizer implementations: the descriptor's public surface is the
// same-named fluent methods, which write to these properties through Assign.
bool? IKuromojiTokenizer.DiscardPunctuation { get; set; }

bool? IKuromojiTokenizer.DiscardCompoundToken { get; set; }
KuromojiTokenizationMode? IKuromojiTokenizer.Mode { get; set; }
int? IKuromojiTokenizer.NBestCost { get; set; }
string IKuromojiTokenizer.NBestExamples { get; set; }
string IKuromojiTokenizer.UserDictionary { get; set; }
IEnumerable<string> IKuromojiTokenizer.UserDictionaryRules { get; set; }

/// <inheritdoc cref="IKuromojiTokenizer.Mode" />
public KuromojiTokenizerDescriptor Mode(KuromojiTokenizationMode? mode) => Assign(mode, (a, v) => a.Mode = v);

/// <inheritdoc cref="IKuromojiTokenizer.DiscardPunctuation" />
public KuromojiTokenizerDescriptor DiscardPunctuation(bool? discard = true) => Assign(discard, (a, v) => a.DiscardPunctuation = v);

/// <inheritdoc cref="IKuromojiTokenizer.DiscardCompoundToken" />
public KuromojiTokenizerDescriptor DiscardCompoundToken(bool? discard = true) => Assign(discard, (a, v) => a.DiscardCompoundToken = v);

/// <inheritdoc cref="IKuromojiTokenizer.UserDictionary" />
public KuromojiTokenizerDescriptor UserDictionary(string userDictionary) => Assign(userDictionary, (a, v) => a.UserDictionary = v);

/// <inheritdoc cref="IKuromojiTokenizer.NBestExamples" />
public KuromojiTokenizerDescriptor NBestExamples(string examples) => Assign(examples, (a, v) => a.NBestExamples = v);

/// <inheritdoc cref="IKuromojiTokenizer.NBestCost" />
public KuromojiTokenizerDescriptor NBestCost(int? cost) => Assign(cost, (a, v) => a.NBestCost = v);

/// <inheritdoc cref="IKuromojiTokenizer.UserDictionaryRules" />
/// <param name="rules">The user dictionary rules; handed to <c>Assign</c> as the value that the setter lambda stores.</param>
public KuromojiTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules) => Assign(rules, (a, v) => a.UserDictionaryRules = v);

/// <inheritdoc cref="IKuromojiTokenizer.UserDictionaryRules" />
/// <param name="rules">The user dictionary rules; handed to <c>Assign</c> as the value that the setter lambda stores.</param>
public KuromojiTokenizerDescriptor UserDictionaryRules(params string[] rules) => Assign(rules, (a, v) => a.UserDictionaryRules = v);
}
}
28 changes: 28 additions & 0 deletions tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,34 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
public override string Name => "kuro";
}

/// <summary>
/// Supplies the fluent, object-initializer and expected JSON forms of a kuromoji tokenizer
/// in search mode with <c>discard_compound_token</c> enabled.
/// </summary>
[SkipVersion("<7.9.0", "discard_compound_token introduced in 7.9.0")]
public class KuromojiDiscardCompoundTokenTests : TokenizerAssertionBase<KuromojiDiscardCompoundTokenTests>
{
    /// <summary>Fluent form; <c>DiscardCompoundToken()</c> relies on its default argument of <c>true</c>.</summary>
    public override FuncTokenizer Fluent => (n, t) => t
        .Kuromoji(n, e => e
            .Mode(KuromojiTokenizationMode.Search)
            .DiscardCompoundToken()
        );

    /// <summary>Object-initializer form of the same tokenizer.</summary>
    public override ITokenizer Initializer => new KuromojiTokenizer
    {
        Mode = KuromojiTokenizationMode.Search,
        DiscardCompoundToken = true,
    };

    /// <summary>Expected JSON for both forms.</summary>
    public override object Json => new
    {
        discard_compound_token = true,
        mode = "search",
        type = "kuromoji_tokenizer",
    };

    /// <summary>Name under which this tokenizer is registered in the test.</summary>
    public override string Name => "kuro_discard_compound_token";
}

public class UaxTests : TokenizerAssertionBase<UaxTests>
{
public override FuncTokenizer Fluent => (n, t) => t.UaxEmailUrl(n, e => e
Expand Down

0 comments on commit b93c631

Please sign in to comment.