Skip to content

Commit

Permalink
Move language analyzers from server to analysis-common module. (#31300)
Browse files Browse the repository at this point in the history
The following analyzers were moved from server module to analysis-common module:
`greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`,
`lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`,
`sorani`, `spanish`, `swedish`, `turkish` and `thai`.

Relates to #23658
  • Loading branch information
martijnvg committed Jun 18, 2018
1 parent a26b9b7 commit e81ee30
Show file tree
Hide file tree
Showing 23 changed files with 655 additions and 255 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
Expand All @@ -52,17 +53,27 @@
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
import org.apache.lucene.analysis.gl.GalicianAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
Expand All @@ -79,19 +90,26 @@
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.th.ThaiTokenizer;
import org.apache.lucene.analysis.tr.ApostropheFilter;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.logging.Loggers;
Expand Down Expand Up @@ -130,6 +148,8 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
analyzers.put("standard_html_strip", StandardHtmlStripAnalyzerProvider::new);
analyzers.put("pattern", PatternAnalyzerProvider::new);
analyzers.put("snowball", SnowballAnalyzerProvider::new);

// Language analyzers:
analyzers.put("arabic", ArabicAnalyzerProvider::new);
analyzers.put("armenian", ArmenianAnalyzerProvider::new);
analyzers.put("basque", BasqueAnalyzerProvider::new);
Expand All @@ -147,6 +167,24 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
analyzers.put("french", FrenchAnalyzerProvider::new);
analyzers.put("galician", GalicianAnalyzerProvider::new);
analyzers.put("german", GermanAnalyzerProvider::new);
analyzers.put("greek", GreekAnalyzerProvider::new);
analyzers.put("hindi", HindiAnalyzerProvider::new);
analyzers.put("hungarian", HungarianAnalyzerProvider::new);
analyzers.put("indonesian", IndonesianAnalyzerProvider::new);
analyzers.put("irish", IrishAnalyzerProvider::new);
analyzers.put("italian", ItalianAnalyzerProvider::new);
analyzers.put("latvian", LatvianAnalyzerProvider::new);
analyzers.put("lithuanian", LithuanianAnalyzerProvider::new);
analyzers.put("norwegian", NorwegianAnalyzerProvider::new);
analyzers.put("persian", PersianAnalyzerProvider::new);
analyzers.put("portuguese", PortugueseAnalyzerProvider::new);
analyzers.put("romanian", RomanianAnalyzerProvider::new);
analyzers.put("russian", RussianAnalyzerProvider::new);
analyzers.put("sorani", SoraniAnalyzerProvider::new);
analyzers.put("spanish", SpanishAnalyzerProvider::new);
analyzers.put("swedish", SwedishAnalyzerProvider::new);
analyzers.put("turkish", TurkishAnalyzerProvider::new);
analyzers.put("thai", ThaiAnalyzerProvider::new);
return analyzers;
}

Expand Down Expand Up @@ -248,13 +286,15 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
@Override
public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactories() {
List<PreBuiltAnalyzerProviderFactory> analyzers = new ArrayList<>();
analyzers.add(new PreBuiltAnalyzerProviderFactory("standard_html_strip", CachingStrategy.LUCENE,
analyzers.add(new PreBuiltAnalyzerProviderFactory("standard_html_strip", CachingStrategy.ELASTICSEARCH,
() -> new StandardHtmlStripAnalyzer(CharArraySet.EMPTY_SET)));
analyzers.add(new PreBuiltAnalyzerProviderFactory("pattern", CachingStrategy.ELASTICSEARCH,
() -> new PatternAnalyzer(Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true,
CharArraySet.EMPTY_SET)));
analyzers.add(new PreBuiltAnalyzerProviderFactory("snowball", CachingStrategy.LUCENE,
() -> new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS_SET)));

// Language analyzers:
analyzers.add(new PreBuiltAnalyzerProviderFactory("arabic", CachingStrategy.LUCENE, ArabicAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("armenian", CachingStrategy.LUCENE, ArmenianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("basque", CachingStrategy.LUCENE, BasqueAnalyzer::new));
Expand All @@ -263,7 +303,7 @@ public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactorie
analyzers.add(new PreBuiltAnalyzerProviderFactory("bulgarian", CachingStrategy.LUCENE, BulgarianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("catalan", CachingStrategy.LUCENE, CatalanAnalyzer::new));
// chinese analyzer: only for old indices, best effort
analyzers.add(new PreBuiltAnalyzerProviderFactory("chinese", CachingStrategy.LUCENE, StandardAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("chinese", CachingStrategy.ONE, StandardAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("cjk", CachingStrategy.LUCENE, CJKAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("czech", CachingStrategy.LUCENE, CzechAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("danish", CachingStrategy.LUCENE, DanishAnalyzer::new));
Expand All @@ -273,6 +313,24 @@ public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactorie
analyzers.add(new PreBuiltAnalyzerProviderFactory("french", CachingStrategy.LUCENE, FrenchAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("galician", CachingStrategy.LUCENE, GalicianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("german", CachingStrategy.LUCENE, GermanAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("greek", CachingStrategy.LUCENE, GreekAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("hindi", CachingStrategy.LUCENE, HindiAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("hungarian", CachingStrategy.LUCENE, HungarianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("indonesian", CachingStrategy.LUCENE, IndonesianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("irish", CachingStrategy.LUCENE, IrishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("italian", CachingStrategy.LUCENE, ItalianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("latvian", CachingStrategy.LUCENE, LatvianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("lithuanian", CachingStrategy.LUCENE, LithuanianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("norwegian", CachingStrategy.LUCENE, NorwegianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("persian", CachingStrategy.LUCENE, PersianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("portuguese", CachingStrategy.LUCENE, PortugueseAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("romanian", CachingStrategy.LUCENE, RomanianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("russian", CachingStrategy.LUCENE, RussianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("sorani", CachingStrategy.LUCENE, SoraniAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("spanish", CachingStrategy.LUCENE, SpanishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("swedish", CachingStrategy.LUCENE, SwedishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("turkish", CachingStrategy.LUCENE, TurkishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("thai", CachingStrategy.LUCENE, ThaiAnalyzer::new));
return analyzers;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,20 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class GreekAnalyzerProvider extends AbstractIndexAnalyzerProvider<GreekAnalyzer> {

private final GreekAnalyzer analyzer;

public GreekAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
GreekAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new GreekAnalyzer(
Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, GreekAnalyzer.getDefaultStopSet()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class HindiAnalyzerProvider extends AbstractIndexAnalyzerProvider<HindiAnalyzer> {

private final HindiAnalyzer analyzer;

public HindiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
HindiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new HindiAnalyzer(
Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, HindiAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class HungarianAnalyzerProvider extends AbstractIndexAnalyzerProvider<HungarianAnalyzer> {

private final HungarianAnalyzer analyzer;

public HungarianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
HungarianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new HungarianAnalyzer(
Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, HungarianAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class IndonesianAnalyzerProvider extends AbstractIndexAnalyzerProvider<IndonesianAnalyzer> {

private final IndonesianAnalyzer analyzer;

public IndonesianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
IndonesianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new IndonesianAnalyzer(
Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, IndonesianAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

/**
* Provider for {@link IrishAnalyzer}
Expand All @@ -32,7 +34,7 @@ public class IrishAnalyzerProvider extends AbstractIndexAnalyzerProvider<IrishAn

private final IrishAnalyzer analyzer;

public IrishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
IrishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new IrishAnalyzer(
Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, IrishAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class ItalianAnalyzerProvider extends AbstractIndexAnalyzerProvider<ItalianAnalyzer> {

private final ItalianAnalyzer analyzer;

public ItalianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
ItalianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new ItalianAnalyzer(
Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, ItalianAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class LatvianAnalyzerProvider extends AbstractIndexAnalyzerProvider<LatvianAnalyzer> {

private final LatvianAnalyzer analyzer;

public LatvianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
LatvianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new LatvianAnalyzer(
Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, LatvianAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

/**
* Provider for {@link LithuanianAnalyzer}
Expand All @@ -32,7 +34,7 @@ public class LithuanianAnalyzerProvider extends AbstractIndexAnalyzerProvider<Li

private final LithuanianAnalyzer analyzer;

public LithuanianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
LithuanianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new LithuanianAnalyzer(
Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, LithuanianAnalyzer.getDefaultStopSet()),
Expand Down
Loading

0 comments on commit e81ee30

Please sign in to comment.