diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 02a4197fba94a..69c8afb3e2fc6 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -193,6 +193,7 @@ public Map> getTokenizers() { tokenizers.put("pattern", PatternTokenizerFactory::new); tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new); tokenizers.put("whitespace", WhitespaceTokenizerFactory::new); + tokenizers.put("keyword", KeywordTokenizerFactory::new); return tokenizers; } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/KeywordTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeywordTokenizerFactory.java similarity index 89% rename from server/src/main/java/org/elasticsearch/index/analysis/KeywordTokenizerFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeywordTokenizerFactory.java index 1d94cad150785..abe88462cb996 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/KeywordTokenizerFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeywordTokenizerFactory.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; @@ -30,7 +30,7 @@ public class KeywordTokenizerFactory extends AbstractTokenizerFactory { private final int bufferSize; - public KeywordTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + KeywordTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); bufferSize = settings.getAsInt("buffer_size", 256); } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 7deadcbcc25f6..5084306587847 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -24,7 +24,6 @@ import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory; import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; -import org.elasticsearch.index.analysis.KeywordTokenizerFactory; import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory; import org.elasticsearch.index.analysis.SynonymTokenFilterFactory; import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; @@ -56,6 +55,7 @@ protected Map> getTokenizers() { tokenizers.put("pattern", PatternTokenizerFactory.class); tokenizers.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class); tokenizers.put("whitespace", WhitespaceTokenizerFactory.class); + tokenizers.put("keyword", KeywordTokenizerFactory.class); return tokenizers; } diff --git 
a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml index cffd4496f1fb7..9a7c158fc4734 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml @@ -5,9 +5,22 @@ indices.analyze: body: text: Foo Bar! + explain: true tokenizer: keyword - - length: { tokens: 1 } - - match: { tokens.0.token: Foo Bar! } + - length: { detail.tokenizer.tokens: 1 } + - match: { detail.tokenizer.name: keyword } + - match: { detail.tokenizer.tokens.0.token: Foo Bar! } + + - do: + indices.analyze: + body: + text: Foo Bar! + explain: true + tokenizer: + type: keyword + - length: { detail.tokenizer.tokens: 1 } + - match: { detail.tokenizer.name: _anonymous_tokenizer } + - match: { detail.tokenizer.tokens.0.token: Foo Bar! } --- "nGram": diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_analyze.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_analyze.yml index 1737d743a6d1c..936736e93de93 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_analyze.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_analyze.yml @@ -97,3 +97,19 @@ - length: { tokens: 2 } - match: { tokens.0.token: sha } - match: { tokens.1.token: hay } + +--- +"Custom normalizer in request": + - do: + indices.analyze: + body: + text: ABc + explain: true + filter: ["lowercase"] + + - length: { detail.tokenizer.tokens: 1 } + - length: { detail.tokenfilters.0.tokens: 1 } + - match: { detail.tokenizer.name: keyword_for_normalizer } + - match: { detail.tokenizer.tokens.0.token: ABc } + - match: { detail.tokenfilters.0.name: lowercase } + - match: { 
detail.tokenfilters.0.tokens.0.token: abc } diff --git a/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml b/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml index 521d8f0714070..c9ff2b2fb6463 100644 --- a/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml +++ b/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml @@ -16,9 +16,11 @@ body: filter: [icu_normalizer] text: Foo Bar Ruß - tokenizer: keyword - - length: { tokens: 1 } - - match: { tokens.0.token: foo bar russ } + tokenizer: standard + - length: { tokens: 3 } + - match: { tokens.0.token: foo} + - match: { tokens.1.token: bar } + - match: { tokens.2.token: russ } --- "Normalization charfilter": - do: @@ -26,9 +28,11 @@ body: char_filter: [icu_normalizer] text: Foo Bar Ruß - tokenizer: keyword - - length: { tokens: 1 } - - match: { tokens.0.token: foo bar russ } + tokenizer: standard + - length: { tokens: 3 } + - match: { tokens.0.token: foo } + - match: { tokens.1.token: bar } + - match: { tokens.2.token: russ } --- "Folding filter": - do: @@ -36,9 +40,11 @@ body: filter: [icu_folding] text: Foo Bar résumé - tokenizer: keyword - - length: { tokens: 1 } - - match: { tokens.0.token: foo bar resume } + tokenizer: standard + - length: { tokens: 3 } + - match: { tokens.0.token: foo } + - match: { tokens.1.token: bar } + - match: { tokens.2.token: resume } --- "Normalization with a UnicodeSet Filter": - do: @@ -64,25 +70,34 @@ index: test body: char_filter: ["charfilter_icu_normalizer"] - tokenizer: keyword + tokenizer: standard text: charfilter Föo Bâr Ruß - - length: { tokens: 1 } - - match: { tokens.0.token: charfilter föo bâr ruß } + - length: { tokens: 4 } + - match: { tokens.0.token: charfilter } + - match: { tokens.1.token: föo } + - match: { tokens.2.token: bâr } + - match: { tokens.3.token: ruß } - do: indices.analyze: index: test body: - tokenizer: keyword + tokenizer: 
standard filter: ["tokenfilter_icu_normalizer"] text: tokenfilter Föo Bâr Ruß - - length: { tokens: 1 } - - match: { tokens.0.token: tokenfilter föo Bâr ruß } + - length: { tokens: 4 } + - match: { tokens.0.token: tokenfilter } + - match: { tokens.1.token: föo } + - match: { tokens.2.token: Bâr } + - match: { tokens.3.token: ruß } - do: indices.analyze: index: test body: - tokenizer: keyword + tokenizer: standard filter: ["tokenfilter_icu_folding"] text: icufolding Föo Bâr Ruß - - length: { tokens: 1 } - - match: { tokens.0.token: icufolding foo bâr russ } + - length: { tokens: 4 } + - match: { tokens.0.token: icufolding } + - match: { tokens.1.token: foo } + - match: { tokens.2.token: bâr } + - match: { tokens.3.token: russ } diff --git a/plugins/analysis-stempel/src/test/resources/rest-api-spec/test/analysis_stempel/10_basic.yml b/plugins/analysis-stempel/src/test/resources/rest-api-spec/test/analysis_stempel/10_basic.yml index 1941126c64fb8..3400a7f9bdf1a 100644 --- a/plugins/analysis-stempel/src/test/resources/rest-api-spec/test/analysis_stempel/10_basic.yml +++ b/plugins/analysis-stempel/src/test/resources/rest-api-spec/test/analysis_stempel/10_basic.yml @@ -5,7 +5,7 @@ indices.analyze: body: text: studenci - tokenizer: keyword + tokenizer: standard filter: [polish_stem] - length: { tokens: 1 } - match: { tokens.0.token: student } diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml index d95417c16ca5d..85861a23d5943 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml @@ -75,19 +75,3 @@ - match: { detail.tokenizer.tokens.2.token: buzz } - match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter" } - match: { detail.tokenfilters.0.tokens.0.token: bar } - ---- -"Custom normalizer in request": - - do: - 
indices.analyze: - body: - text: ABc - explain: true - filter: ["lowercase"] - - - length: { detail.tokenizer.tokens: 1 } - - length: { detail.tokenfilters.0.tokens: 1 } - - match: { detail.tokenizer.name: keyword_for_normalizer } - - match: { detail.tokenizer.tokens.0.token: ABc } - - match: { detail.tokenfilters.0.name: lowercase } - - match: { detail.tokenfilters.0.tokens.0.token: abc } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index 63e56651cd000..77be68fbbe2e5 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -548,6 +548,10 @@ private void processNormalizerFactory( TokenizerFactory keywordTokenizerFactory, Map tokenFilters, Map charFilters) { + if (keywordTokenizerFactory == null) { + throw new IllegalStateException("keyword tokenizer factory is null, normalizers require analysis-common module"); + } + if (normalizerFactory instanceof CustomNormalizerProvider) { ((CustomNormalizerProvider) normalizerFactory).build(keywordTokenizerFactory, charFilters, tokenFilters); } diff --git a/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 1054744422638..bc590381c3c7c 100644 --- a/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -56,7 +56,6 @@ import org.elasticsearch.index.analysis.IrishAnalyzerProvider; import org.elasticsearch.index.analysis.ItalianAnalyzerProvider; import org.elasticsearch.index.analysis.KeywordAnalyzerProvider; -import org.elasticsearch.index.analysis.KeywordTokenizerFactory; import org.elasticsearch.index.analysis.LatvianAnalyzerProvider; import 
org.elasticsearch.index.analysis.LithuanianAnalyzerProvider; import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider; @@ -225,7 +224,6 @@ static Map setupPreConfiguredTokenizers(List> setupTokenizers(List plugins) { NamedRegistry> tokenizers = new NamedRegistry<>("tokenizer"); tokenizers.register("standard", StandardTokenizerFactory::new); - tokenizers.register("keyword", KeywordTokenizerFactory::new); tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers); return tokenizers; } diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java index de11902d9141d..c0404a47ab237 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java @@ -19,6 +19,7 @@ package org.elasticsearch.action.admin.indices; import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.elasticsearch.Version; import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest; @@ -37,6 +38,7 @@ import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.indices.analysis.AnalysisModule; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.AnalysisModuleTests.AppendCharFilter; @@ -107,6 +109,12 @@ public Map> getCharFilters() { return singletonMap("append", AppendCharFilterFactory::new); } + @Override + public Map> getTokenizers() { + return singletonMap("keyword", (indexSettings, environment, name, settings) -> + () -> new 
MockTokenizer(MockTokenizer.KEYWORD, false)); + } + @Override public Map> getTokenFilters() { return singletonMap("mock", MockFactory::new); diff --git a/server/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java b/server/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java index c55e4851edbc2..b5a596401cbbc 100644 --- a/server/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java +++ b/server/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java @@ -37,10 +37,13 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.engine.VersionConflictEngineException; import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.test.MockKeywordPlugin; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -58,6 +61,12 @@ import static org.hamcrest.Matchers.nullValue; public class GetTermVectorsIT extends AbstractTermVectorsTestCase { + + @Override + protected Collection> nodePlugins() { + return Collections.singleton(MockKeywordPlugin.class); + } + public void testNoSuchDoc() throws Exception { XContentBuilder mapping = jsonBuilder().startObject().startObject("type1") .startObject("properties") diff --git a/server/src/test/java/org/elasticsearch/gateway/GatewayIndexStateIT.java b/server/src/test/java/org/elasticsearch/gateway/GatewayIndexStateIT.java index aeadcf30e3678..209fbd37c5953 100644 --- a/server/src/test/java/org/elasticsearch/gateway/GatewayIndexStateIT.java +++ b/server/src/test/java/org/elasticsearch/gateway/GatewayIndexStateIT.java @@ -432,7 +432,7 @@ public void testRecoverMissingAnalyzer() throws Exception { logger.info("--> starting one node"); internalCluster().startNode(); prepareCreate("test").setSettings(Settings.builder() - 
.put("index.analysis.analyzer.test.tokenizer", "keyword") + .put("index.analysis.analyzer.test.tokenizer", "standard") .put("index.number_of_shards", "1")) .addMapping("type1", "{\n" + " \"type1\": {\n" + diff --git a/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java b/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java index 7d8d64e6962d5..e2025145241c0 100644 --- a/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java +++ b/server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java @@ -20,6 +20,8 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.MockLowerCaseFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; @@ -71,7 +73,7 @@ public void testTokenizer() throws IOException { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings)); + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN)); assertEquals("Custom normalizer [my_normalizer] cannot configure a tokenizer", e.getMessage()); } @@ -135,7 +137,7 @@ public Reader create(Reader reader) { @Override public int read(char[] cbuf, int off, int len) throws IOException { int result = reader.read(cbuf, off, len); - for (int i = off; i < result; i++) { + for (int i = off; i < off + len; i++) { if (cbuf[i] == 'a') { cbuf[i] = 'z'; } @@ -157,5 +159,11 @@ public Object getMultiTermComponent() { return new Factory(); }); } + + @Override + public Map> getTokenizers() { + return singletonMap("keyword", (indexSettings, environment, name, settings) -> + () -> new MockTokenizer(MockTokenizer.KEYWORD, false)); 
+ } } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java index bffb1737eeb93..86cf7b4b76619 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java @@ -20,6 +20,8 @@ package org.elasticsearch.index.mapper; import org.apache.lucene.analysis.MockLowerCaseFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; @@ -33,7 +35,9 @@ import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.IndexService; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; +import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.index.mapper.MapperService.MergeReason; +import org.elasticsearch.indices.analysis.AnalysisModule; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESSingleNodeTestCase; @@ -44,8 +48,10 @@ import java.util.Arrays; import java.util.Collection; import java.util.List; +import java.util.Map; import static java.util.Collections.singletonList; +import static java.util.Collections.singletonMap; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -58,6 +64,21 @@ public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin public List getPreConfiguredTokenFilters() { return singletonList(PreConfiguredTokenFilter.singleton("mock_other_lowercase", true, MockLowerCaseFilter::new)); } + + @Override + public Map> getTokenizers() { + return singletonMap("keyword", (indexSettings, environment, name, settings) -> { + class Factory implements 
TokenizerFactory { + + @Override + public Tokenizer create() { + return new MockTokenizer(MockTokenizer.KEYWORD, false); + } + } + return new Factory(); + }); + } + }; @Override diff --git a/server/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java b/server/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java index 2bc98885f9096..a31dcc81f722e 100644 --- a/server/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java +++ b/server/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharFilter; +import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -31,6 +32,7 @@ import org.apache.lucene.store.SimpleFSDirectory; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.io.Streams; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.env.Environment; @@ -49,6 +51,7 @@ import org.elasticsearch.index.analysis.StopTokenFilterFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.test.ESTestCase; @@ -60,6 +63,8 @@ import java.io.IOException; import java.io.InputStream; import java.io.Reader; +import java.io.StringReader; +import java.io.UncheckedIOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -222,7 +227,7 @@ private Path generateWordList(String[] words) throws Exception 
{ public void testUnderscoreInAnalyzerName() throws IOException { Settings settings = Settings.builder() - .put("index.analysis.analyzer._invalid_name.tokenizer", "keyword") + .put("index.analysis.analyzer._invalid_name.tokenizer", "standard") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, "1") .build(); @@ -256,6 +261,13 @@ public List getPreConfiguredCharFilters() { (tokenStream, esVersion) -> new AppendCharFilter(tokenStream, esVersion.toString())) ); } + + @Override + public Map> getTokenizers() { + // Need mock keyword tokenizer here, because alpha / beta versions are broken up by the dash. + return singletonMap("keyword", (indexSettings, environment, name, settings) -> + () -> new MockTokenizer(MockTokenizer.KEYWORD, false)); + } })).getAnalysisRegistry(); Version version = VersionUtils.randomVersion(random()); @@ -305,11 +317,11 @@ public List getPreConfiguredTokenFilters() { Version version = VersionUtils.randomVersion(random()); IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder() - .put("index.analysis.analyzer.no_version.tokenizer", "keyword") + .put("index.analysis.analyzer.no_version.tokenizer", "standard") .put("index.analysis.analyzer.no_version.filter", "no_version") - .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword") + .put("index.analysis.analyzer.lucene_version.tokenizer", "standard") .put("index.analysis.analyzer.lucene_version.filter", "lucene_version") - .put("index.analysis.analyzer.elasticsearch_version.tokenizer", "keyword") + .put("index.analysis.analyzer.elasticsearch_version.tokenizer", "standard") .put("index.analysis.analyzer.elasticsearch_version.filter", "elasticsearch_version") .put(IndexMetaData.SETTING_VERSION_CREATED, version) .build()); @@ -425,12 +437,17 @@ public Map getHunspellDictionaries() { // Simple char filter that appends text to the term public static class AppendCharFilter extends CharFilter { - private final 
char[] appendMe; - private int offsetInAppendMe = -1; + + static Reader append(Reader input, String appendMe) { + try { + return new StringReader(Streams.copyToString(input) + appendMe); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } public AppendCharFilter(Reader input, String appendMe) { - super(input); - this.appendMe = appendMe.toCharArray(); + super(append(input, appendMe)); } @Override @@ -440,24 +457,7 @@ protected int correct(int currentOff) { @Override public int read(char[] cbuf, int off, int len) throws IOException { - if (offsetInAppendMe < 0) { - int read = input.read(cbuf, off, len); - if (read == len) { - return read; - } - off += read; - len -= read; - int allowedLen = Math.min(len, appendMe.length); - System.arraycopy(appendMe, 0, cbuf, off, allowedLen); - offsetInAppendMe = allowedLen; - return read + allowedLen; - } - if (offsetInAppendMe >= appendMe.length) { - return -1; - } - int allowedLen = Math.max(len, appendMe.length - offsetInAppendMe); - System.arraycopy(appendMe, offsetInAppendMe, cbuf, off, allowedLen); - return allowedLen; + return input.read(cbuf, off, len); } } diff --git a/server/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java b/server/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java index 802761780a713..9d0c512f785d2 100644 --- a/server/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java +++ b/server/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java @@ -22,11 +22,18 @@ import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequestBuilder; import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.analysis.CharFilterFactory; +import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.plugins.AnalysisPlugin; +import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESIntegTestCase; +import 
org.elasticsearch.test.MockKeywordPlugin; import org.hamcrest.core.IsNull; import java.io.IOException; import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -38,6 +45,12 @@ import static org.hamcrest.Matchers.startsWith; public class AnalyzeActionIT extends ESIntegTestCase { + + @Override + protected Collection> nodePlugins() { + return Collections.singleton(MockKeywordPlugin.class); + } + public void testSimpleAnalyzerTests() throws Exception { assertAcked(prepareCreate("test").addAlias(new Alias("alias"))); ensureGreen(); diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 9011b0b8dd69c..d1f91d60e2506 100644 --- a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -50,14 +50,15 @@ import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.test.InternalSettingsPlugin; +import org.elasticsearch.test.MockKeywordPlugin; import org.hamcrest.Matcher; import org.hamcrest.Matchers; import org.joda.time.DateTime; import org.joda.time.chrono.ISOChronology; import java.io.IOException; +import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import java.util.Locale; import java.util.Map; @@ -105,7 +106,7 @@ public class HighlighterSearchIT extends ESIntegTestCase { @Override protected Collection> nodePlugins() { - return Collections.singletonList(InternalSettingsPlugin.class); + return Arrays.asList(InternalSettingsPlugin.class, MockKeywordPlugin.class); } public void testHighlightingWithStoredKeyword() throws IOException { diff --git 
a/server/src/test/java/org/elasticsearch/search/query/MultiMatchQueryIT.java b/server/src/test/java/org/elasticsearch/search/query/MultiMatchQueryIT.java index c8d57b968568f..926839f2af22c 100644 --- a/server/src/test/java/org/elasticsearch/search/query/MultiMatchQueryIT.java +++ b/server/src/test/java/org/elasticsearch/search/query/MultiMatchQueryIT.java @@ -32,15 +32,19 @@ import org.elasticsearch.index.query.Operator; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.index.search.MatchQuery; +import org.elasticsearch.plugins.Plugin; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHits; import org.elasticsearch.search.sort.SortBuilders; import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.test.MockKeywordPlugin; import org.junit.Before; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Set; import java.util.concurrent.ExecutionException; @@ -72,6 +76,11 @@ public class MultiMatchQueryIT extends ESIntegTestCase { + @Override + protected Collection> nodePlugins() { + return Collections.singleton(MockKeywordPlugin.class); + } + @Before public void init() throws Exception { CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder() diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index f26c44e05f506..c5b89adfd738e 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -24,7 +24,6 @@ import org.apache.lucene.analysis.util.TokenizerFactory; import org.elasticsearch.common.collect.MapBuilder; import 
org.elasticsearch.index.analysis.HunspellTokenFilterFactory; -import org.elasticsearch.index.analysis.KeywordTokenizerFactory; import org.elasticsearch.index.analysis.MultiTermAwareComponent; import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; @@ -79,7 +78,7 @@ private static String toCamelCase(String s) { // exposed in ES .put("classic", MovedToAnalysisCommon.class) .put("edgengram", MovedToAnalysisCommon.class) - .put("keyword", KeywordTokenizerFactory.class) + .put("keyword", MovedToAnalysisCommon.class) .put("letter", MovedToAnalysisCommon.class) .put("lowercase", MovedToAnalysisCommon.class) .put("ngram", MovedToAnalysisCommon.class) diff --git a/test/framework/src/main/java/org/elasticsearch/test/MockKeywordPlugin.java b/test/framework/src/main/java/org/elasticsearch/test/MockKeywordPlugin.java new file mode 100644 index 0000000000000..fb9da1dad40fd --- /dev/null +++ b/test/framework/src/main/java/org/elasticsearch/test/MockKeywordPlugin.java @@ -0,0 +1,54 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.test; + +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.elasticsearch.index.analysis.TokenizerFactory; +import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.plugins.AnalysisPlugin; +import org.elasticsearch.plugins.Plugin; + +import java.util.Map; + +import static java.util.Collections.singletonMap; + +/** + * Some tests rely on the keyword tokenizer, but this tokenizer isn't part of lucene-core and therefore not available + * in some modules. What this test plugin does is use the mock tokenizer and advertise that as the keyword tokenizer. + * + * Most tests that need this test plugin use normalizers. When normalizers are constructed they try to resolve the + * keyword tokenizer, but if the keyword tokenizer isn't available then constructing normalizers will fail. + */ +public class MockKeywordPlugin extends Plugin implements AnalysisPlugin { + + @Override + public Map> getTokenizers() { + return singletonMap("keyword", (indexSettings, environment, name, settings) -> { + class Factory implements TokenizerFactory { + + @Override + public Tokenizer create() { + return new MockTokenizer(MockTokenizer.KEYWORD, false); + } + } + return new Factory(); + }); + } +}