Skip to content

Commit

Permalink
Add prebuilt ICU Analyzer (#3635)
Browse files Browse the repository at this point in the history
  • Loading branch information
russcam committed May 8, 2019
1 parent 82b4ba7 commit e647e4a
Show file tree
Hide file tree
Showing 17 changed files with 138 additions and 21 deletions.
1 change: 1 addition & 0 deletions src/Nest/Analysis/Analyzers/AnalyzerJsonConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist
case "fingerprint": return o.ToObject<FingerprintAnalyzer>(ElasticContractResolver.Empty);
case "kuromoji": return o.ToObject<KuromojiAnalyzer>(ElasticContractResolver.Empty);
case "nori": return o.ToObject<NoriAnalyzer>(ElasticContractResolver.Empty);
case "icu_analyzer": return o.ToObject<IcuAnalyzer>(ElasticContractResolver.Empty);
default:
if (o.Property("tokenizer") != null)
return o.ToObject<CustomAnalyzer>(ElasticContractResolver.Empty);
Expand Down
4 changes: 4 additions & 0 deletions src/Nest/Analysis/Analyzers/Analyzers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -107,5 +107,9 @@ public AnalyzersDescriptor Kuromoji(string name, Func<KuromojiAnalyzerDescriptor
/// <inheritdoc cref="INoriAnalyzer" />
/// <param name="name">The name handed to <c>Assign</c> together with the configured analyzer.</param>
/// <param name="selector">
/// Configures a fresh <see cref="NoriAnalyzerDescriptor" />; when <c>null</c>, a <c>null</c> analyzer is assigned.
/// </param>
public AnalyzersDescriptor Nori(string name, Func<NoriAnalyzerDescriptor, INoriAnalyzer> selector)
{
	var analyzer = selector?.Invoke(new NoriAnalyzerDescriptor());
	return Assign(name, analyzer);
}

/// <inheritdoc cref="IIcuAnalyzer" />
/// <param name="name">The name handed to <c>Assign</c> together with the configured analyzer.</param>
/// <param name="selector">
/// Configures a fresh <see cref="IcuAnalyzerDescriptor" />; when <c>null</c>, a <c>null</c> analyzer is assigned.
/// </param>
public AnalyzersDescriptor Icu(string name, Func<IcuAnalyzerDescriptor, IIcuAnalyzer> selector)
{
	var analyzer = selector?.Invoke(new IcuAnalyzerDescriptor());
	return Assign(name, analyzer);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ namespace Nest
/// Sets the alternate handling for strength quaternary to be either shifted or non-ignorable,
/// which boils down to ignoring punctuation and whitespace.
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuCollationAlternate
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
namespace Nest
{
/// <summary>
/// Sets the alternate handling for strength quaternary to be either shifted or non-ignorable.
/// Which boils down to ignoring punctuation and whitespace.
/// Controls which case is sorted first when case is not ignored for
/// strength tertiary. The default depends on the collation.
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuCollationCaseFirst
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ namespace Nest
/// great many of the world’s languages do not require text normalization, most locales
/// set no as the default decomposition mode.
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuCollationDecomposition
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ namespace Nest
/// difference considered significant during comparison.
/// See also: http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuCollationStrength
{
Expand Down
54 changes: 54 additions & 0 deletions src/Nest/Analysis/Plugins/Icu/IcuAnalyzer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// An ICU analyzer that performs basic normalization, tokenization and character folding,
/// using the <see cref="IIcuNormalizationCharFilter" /> char filter,
/// <see cref="IIcuTokenizer" /> and <see cref="IIcuNormalizationTokenFilter" /> token filter.
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed and Elasticsearch 6.6.0+
/// </remarks>
public interface IIcuAnalyzer : IAnalyzer
{
/// <summary>
/// Normalization method, serialized as <c>method</c>.
/// Default is <see cref="IcuNormalizationType.CompatibilityCaseFold" />
/// </summary>
[JsonProperty("method")]
IcuNormalizationType? Method { get; set; }

/// <summary>
/// Normalization mode, serialized as <c>mode</c>.
/// Default is <see cref="IcuNormalizationMode.Compose" />
/// </summary>
[JsonProperty("mode")]
IcuNormalizationMode? Mode { get; set; }
}

/// <inheritdoc cref="IIcuAnalyzer" />
public class IcuAnalyzer : AnalyzerBase, IIcuAnalyzer
{
/// <summary>
/// Creates an ICU analyzer, passing the <c>icu_analyzer</c> type name to the base analyzer.
/// </summary>
public IcuAnalyzer() : base("icu_analyzer") { }

/// <inheritdoc />
public IcuNormalizationType? Method { get; set; }

/// <inheritdoc />
public IcuNormalizationMode? Mode { get; set; }
}

/// <inheritdoc cref="IIcuAnalyzer" />
public class IcuAnalyzerDescriptor : AnalyzerDescriptorBase<IcuAnalyzerDescriptor, IIcuAnalyzer>, IIcuAnalyzer
{
	/// <summary>
	/// The analyzer type name reported by this descriptor: <c>icu_analyzer</c>.
	/// </summary>
	protected override string Type => "icu_analyzer";

	IcuNormalizationType? IIcuAnalyzer.Method { get; set; }
	IcuNormalizationMode? IIcuAnalyzer.Mode { get; set; }

	// The fluent setters pass the value as an argument instead of capturing it, so the
	// lambdas close over nothing. This matches the Assign(value, (a, v) => ...) form used
	// by IcuCollationTokenFilterDescriptor.

	/// <inheritdoc cref="IIcuAnalyzer.Method"/>
	public IcuAnalyzerDescriptor Method(IcuNormalizationType? method) => Assign(method, (a, v) => a.Method = v);

	/// <inheritdoc cref="IIcuAnalyzer.Mode"/>
	public IcuAnalyzerDescriptor Mode(IcuNormalizationMode? mode) => Assign(mode, (a, v) => a.Mode = v);
}
}
29 changes: 16 additions & 13 deletions src/Nest/Analysis/Plugins/Icu/IcuCollationTokenFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ namespace Nest
/// defaults to using the DUCET collation, which is a best-effort attempt at language-neutral sorting.
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuCollationTokenFilter : ITokenFilter
{
/// <summary>
Expand Down Expand Up @@ -75,7 +78,7 @@ public interface IIcuCollationTokenFilter : ITokenFilter
string Variant { get; set; }
}

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter" />
public class IcuCollationTokenFilter : TokenFilterBase, IIcuCollationTokenFilter
{
public IcuCollationTokenFilter() : base("icu_collation") { }
Expand Down Expand Up @@ -114,7 +117,7 @@ public IcuCollationTokenFilter() : base("icu_collation") { }
public string Variant { get; set; }
}

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter" />
public class IcuCollationTokenFilterDescriptor
: TokenFilterDescriptorBase<IcuCollationTokenFilterDescriptor, IIcuCollationTokenFilter>, IIcuCollationTokenFilter
{
Expand All @@ -132,38 +135,38 @@ public class IcuCollationTokenFilterDescriptor
string IIcuCollationTokenFilter.VariableTop { get; set; }
string IIcuCollationTokenFilter.Variant { get; set; }

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Language" />
public IcuCollationTokenFilterDescriptor Language(string language) => Assign(language, (a, v) => a.Language = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Country" />
public IcuCollationTokenFilterDescriptor Country(string country) => Assign(country, (a, v) => a.Country = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Variant" />
public IcuCollationTokenFilterDescriptor Variant(string variant) => Assign(variant, (a, v) => a.Variant = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Strength" />
public IcuCollationTokenFilterDescriptor Strength(IcuCollationStrength? strength) => Assign(strength, (a, v) => a.Strength = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Decomposition" />
public IcuCollationTokenFilterDescriptor Decomposition(IcuCollationDecomposition? decomposition) =>
Assign(decomposition, (a, v) => a.Decomposition = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Alternate" />
public IcuCollationTokenFilterDescriptor Alternate(IcuCollationAlternate? alternate) => Assign(alternate, (a, v) => a.Alternate = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.CaseFirst" />
public IcuCollationTokenFilterDescriptor CaseFirst(IcuCollationCaseFirst? caseFirst) => Assign(caseFirst, (a, v) => a.CaseFirst = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.CaseLevel" />
public IcuCollationTokenFilterDescriptor CaseLevel(bool? caseLevel = true) => Assign(caseLevel, (a, v) => a.CaseLevel = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Numeric" />
public IcuCollationTokenFilterDescriptor Numeric(bool? numeric = true) => Assign(numeric, (a, v) => a.Numeric = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.HiraganaQuaternaryMode" />
public IcuCollationTokenFilterDescriptor HiraganaQuaternaryMode(bool? mode = true) => Assign(mode, (a, v) => a.HiraganaQuaternaryMode = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.VariableTop" />
public IcuCollationTokenFilterDescriptor VariableTop(string variableTop) => Assign(variableTop, (a, v) => a.VariableTop = v);
}
}
4 changes: 3 additions & 1 deletion src/Nest/Analysis/Plugins/Icu/IcuFoldingTokenFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ namespace Nest
{
/// <summary>
/// Case folding of Unicode characters based on UTR#30, like the ASCII-folding token filter on steroids.
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuFoldingTokenFilter : ITokenFilter
{
/// <summary>
Expand Down
4 changes: 3 additions & 1 deletion src/Nest/Analysis/Plugins/Icu/IcuNormalizationCharFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ namespace Nest
{
/// <summary>
/// Normalizes as defined here: http://userguide.icu-project.org/transforms/normalization
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuNormalizationCharFilter : ICharFilter
{
/// <summary>
Expand Down
4 changes: 3 additions & 1 deletion src/Nest/Analysis/Plugins/Icu/IcuNormalizationTokenFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ namespace Nest
{
/// <summary>
/// Normalizes as defined here: http://userguide.icu-project.org/transforms/normalization
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuNormalizationTokenFilter : ITokenFilter
{
/// <summary>
Expand Down
4 changes: 3 additions & 1 deletion src/Nest/Analysis/Plugins/Icu/IcuTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ namespace Nest
/// like the standard tokenizer, but adds better support for some Asian languages by using a dictionary-based approach
/// to identify words in Thai, Lao, Chinese, Japanese, and Korean, and using custom rules to break Myanmar and Khmer
/// text into syllables.
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuTokenizer : ITokenizer
{
/// <summary>
Expand Down
4 changes: 3 additions & 1 deletion src/Nest/Analysis/Plugins/Icu/IcuTransformTokenFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ namespace Nest
/// <summary>
/// Transforms are used to process Unicode text in many different ways, such as case mapping,
/// normalization, transliteration and bidirectional text handling.
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuTransformTokenFilter : ITokenFilter
{
/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ namespace Nest
/// <summary>
/// Normalization mode https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuNormalizationMode
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ namespace Nest
/// <summary>
/// Normalization forms https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuNormalizationType
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ namespace Nest
/// <summary>
/// Forward (default) for LTR and reverse for RTL
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuTransformDirection
{
Expand Down
26 changes: 25 additions & 1 deletion src/Tests/Tests/Analysis/Analyzers/AnalyzerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public class SimpleTests : AnalyzerAssertionBase<SimpleTests>
public override string Name => "mySimple";
}

public class LanguageTests : AnalyzerAssertionBase<SimpleTests>
public class LanguageTests : AnalyzerAssertionBase<LanguageTests>
{
public override FuncTokenizer Fluent => (n, an) => an
.Language("myLanguage", a => a.Language(Language.Dutch));
Expand Down Expand Up @@ -216,5 +216,29 @@ public class NoriTests : AnalyzerAssertionBase<NoriTests>

public override string Name => "nori";
}

/// <summary>
/// Describes the ICU analyzer in its fluent, object-initializer and JSON forms,
/// each using the canonical normalization type and the decompose mode.
/// </summary>
/// <remarks>
/// Annotated to skip versions below 6.6.0, where the analyzer was introduced.
/// </remarks>
[SkipVersion("<6.6.0", "introduced in 6.6.0")]
public class IcuTests : AnalyzerAssertionBase<IcuTests>
{
	/// <summary>Name under which the analyzer is registered.</summary>
	public override string Name
	{
		get { return "icu_analyzer"; }
	}

	/// <summary>JSON form of the analyzer.</summary>
	public override object Json
	{
		get
		{
			return new
			{
				type = "icu_analyzer",
				method = "nfc",
				mode = "decompose"
			};
		}
	}

	/// <summary>Object-initializer form of the analyzer.</summary>
	public override IAnalyzer Initializer
	{
		get
		{
			return new IcuAnalyzer
			{
				Method = IcuNormalizationType.Canonical,
				Mode = IcuNormalizationMode.Decompose
			};
		}
	}

	/// <summary>Fluent form of the analyzer.</summary>
	public override FuncTokenizer Fluent
	{
		get
		{
			return (analyzerName, analyzers) => analyzers.Icu(
				analyzerName,
				icu => icu
					.Method(IcuNormalizationType.Canonical)
					.Mode(IcuNormalizationMode.Decompose)
			);
		}
	}
}
}
}

0 comments on commit e647e4a

Please sign in to comment.