diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc index dd5cb2e702cff..ee891fdd09aa7 100644 --- a/docs/reference/analysis/tokenfilters.asciidoc +++ b/docs/reference/analysis/tokenfilters.asciidoc @@ -35,6 +35,8 @@ include::tokenfilters/word-delimiter-tokenfilter.asciidoc[] include::tokenfilters/word-delimiter-graph-tokenfilter.asciidoc[] +include::tokenfilters/multiplexer-tokenfilter.asciidoc[] + include::tokenfilters/stemmer-tokenfilter.asciidoc[] include::tokenfilters/stemmer-override-tokenfilter.asciidoc[] diff --git a/docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc new file mode 100644 index 0000000000000..51937084e3984 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc @@ -0,0 +1,116 @@ +[[analysis-multiplexer-tokenfilter]] +=== Multiplexer Token Filter + +A token filter of type `multiplexer` will emit multiple tokens at the same position, +each version of the token having been run through a different filter. Identical +output tokens at the same position will be removed. + +WARNING: If the incoming token stream has duplicate tokens, then these will also be +removed by the multiplexer + +[float] +=== Options +[horizontal] +filters:: a list of token filters to apply to incoming tokens. These can be any + token filters defined elsewhere in the index mappings. Filters can be chained + using a comma-delimited string, so for example `"lowercase, porter_stem"` would + apply the `lowercase` filter and then the `porter_stem` filter to a single token. + +WARNING: Shingle or multi-word synonym token filters will not function normally + when they are declared in the filters array because they read ahead internally + which is unsupported by the multiplexer + +preserve_original:: if `true` (the default) then emit the original token in + addition to the filtered tokens + + +[float] +=== Settings example + +You can set it up like: + +[source,js] +-------------------------------------------------- +PUT /multiplexer_example +{ + "settings" : { + "analysis" : { + "analyzer" : { + "my_analyzer" : { + "tokenizer" : "standard", + "filter" : [ "my_multiplexer" ] + } + }, + "filter" : { + "my_multiplexer" : { + "type" : "multiplexer", + "filters" : [ "lowercase", "lowercase, porter_stem" ] + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +And test it like: + +[source,js] +-------------------------------------------------- +POST /multiplexer_example/_analyze +{ + "analyzer" : "my_analyzer", + "text" : "Going HOME" +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +And it'd respond: + +[source,js] +-------------------------------------------------- +{ + "tokens": [ + { + "token": "Going", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "going", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "go", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "HOME", + "start_offset": 6, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "home", <1> + "start_offset": 6, + "end_offset": 10, + "type": "", + "position": 1 + } + ] +} +-------------------------------------------------- +// TESTRESPONSE + +<1> The stemmer has also emitted a token `home` at position 1, but because it is a +duplicate of this token it has been removed from the token stream \ No newline at end of file diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index cdd8101a73c70..ca2f74b5efee0 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -226,6 +226,7 @@ public Map> getTokenFilters() { filters.put("limit", LimitTokenCountFilterFactory::new); filters.put("lowercase", LowerCaseTokenFilterFactory::new); filters.put("min_hash", MinHashTokenFilterFactory::new); + filters.put("multiplexer", MultiplexerTokenFilterFactory::new); filters.put("ngram", NGramTokenFilterFactory::new); filters.put("nGram", NGramTokenFilterFactory::new); filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java new file mode 100644 index 0000000000000..1cf5303a77209 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java @@ -0,0 +1,195 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; +import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.ReferringFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.function.Function; + +public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory { + + private List filters; + private List filterNames; + private final boolean preserveOriginal; + + private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() { + @Override + public String name() { + return "identity"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return tokenStream; + } + }; + + public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException { + super(indexSettings, name, settings); + this.filterNames = settings.getAsList("filters"); + this.preserveOriginal = settings.getAsBoolean("preserve_original", true); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + List> functions = new ArrayList<>(); + for (TokenFilterFactory tff : filters) { + functions.add(tff::create); + } + return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions)); + } + + @Override + public void setReferences(Map factories) { + filters = new ArrayList<>(); + if (preserveOriginal) { + filters.add(IDENTITY_FACTORY); + } + for (String filter : filterNames) { + String[] parts = Strings.tokenizeToStringArray(filter, ","); + if (parts.length == 1) { + filters.add(resolveFilterFactory(factories, parts[0])); + } else { + List chain = new ArrayList<>(); + for (String subfilter : parts) { + chain.add(resolveFilterFactory(factories, subfilter)); + } + filters.add(chainFilters(filter, chain)); + } + } + } + + private TokenFilterFactory chainFilters(String name, List filters) { + return new TokenFilterFactory() { + @Override + public String name() { + return name; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + for (TokenFilterFactory tff : filters) { + tokenStream = tff.create(tokenStream); + } + return tokenStream; + } + }; + } + + private TokenFilterFactory resolveFilterFactory(Map factories, String name) { + if (factories.containsKey(name) == false) { + throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]"); + } else { + return factories.get(name); + } + } + + private final class MultiplexTokenFilter extends TokenFilter { + + private final TokenStream source; + private final int filterCount; + + private int selector; + + /** + * Creates a MultiplexTokenFilter on the given input with a set of filters + */ + MultiplexTokenFilter(TokenStream input, List> filters) { + super(input); + TokenStream source = new MultiplexerFilter(input); + for (int i = 0; i < filters.size(); i++) { + final int slot = i; + source = new ConditionalTokenFilter(source, filters.get(i)) { + @Override + protected boolean shouldFilter() { + return slot == selector; + } + }; + } + this.source = source; + this.filterCount = filters.size(); + this.selector = filterCount - 1; + } + + @Override + public boolean incrementToken() throws IOException { + return source.incrementToken(); + } + + @Override + public void end() throws IOException { + source.end(); + } + + @Override + public void reset() throws IOException { + source.reset(); + } + + private final class MultiplexerFilter extends TokenFilter { + + State state; + PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + + private MultiplexerFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (selector >= filterCount - 1) { + selector = 0; + if (input.incrementToken() == false) { + return false; + } + state = captureState(); + return true; + } + restoreState(state); + posIncAtt.setPositionIncrement(0); + selector++; + return true; + } + + @Override + public void reset() throws IOException { + super.reset(); + selector = filterCount - 1; + this.state = null; + } + } + + } +} diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterTests.java new file mode 100644 index 0000000000000..c39fa05c26f72 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterTests.java @@ -0,0 +1,106 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.TestEnvironment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.test.ESTokenStreamTestCase; +import org.elasticsearch.test.IndexSettingsModule; + +import java.io.IOException; +import java.util.Collections; + +public class MultiplexerTokenFilterTests extends ESTokenStreamTestCase { + + public void testMultiplexingFilter() throws IOException { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + Settings indexSettings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("index.analysis.filter.t.type", "truncate") + .put("index.analysis.filter.t.length", "2") + .put("index.analysis.filter.multiplexFilter.type", "multiplexer") + .putList("index.analysis.filter.multiplexFilter.filters", "lowercase, t", "uppercase") + .put("index.analysis.analyzer.myAnalyzer.type", "custom") + .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.myAnalyzer.filter", "multiplexFilter") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); + + IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings), + Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings); + + try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) { + assertNotNull(analyzer); + assertAnalyzesTo(analyzer, "ONe tHree", new String[]{ + "ONe", "on", "ONE", "tHree", "th", "THREE" + }, new int[]{ + 1, 0, 0, 1, 0, 0 + }); + // Duplicates are removed + assertAnalyzesTo(analyzer, "ONe THREE", new String[]{ + "ONe", "on", "ONE", "THREE", "th" + }, new int[]{ + 1, 0, 0, 1, 0, 0 + }); + } + } + + public void testMultiplexingNoOriginal() throws IOException { + + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + Settings indexSettings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("index.analysis.filter.t.type", "truncate") + .put("index.analysis.filter.t.length", "2") + .put("index.analysis.filter.multiplexFilter.type", "multiplexer") + .put("index.analysis.filter.multiplexFilter.preserve_original", "false") + .putList("index.analysis.filter.multiplexFilter.filters", "lowercase, t", "uppercase") + .put("index.analysis.analyzer.myAnalyzer.type", "custom") + .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.myAnalyzer.filter", "multiplexFilter") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); + + IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings), + Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings); + + try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) { + assertNotNull(analyzer); + assertAnalyzesTo(analyzer, "ONe tHree", new String[]{ + "on", "ONE", "th", "THREE" + }, new int[]{ + 1, 0, 1, 0, + }); + } + + } + +} diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index 61b5cb9171244..c61a7cf070680 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -166,7 +166,18 @@ public Map buildTokenFilterFactories(IndexSettings i */ tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings))); tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings))); - return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters); + + Map mappings + = buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters); + + // ReferringTokenFilters require references to other tokenfilters, so we pass these in + // after all factories have been registered + for (TokenFilterFactory tff : mappings.values()) { + if (tff instanceof ReferringFilterFactory) { + ((ReferringFilterFactory)tff).setReferences(mappings); + } + } + return mappings; } public Map buildTokenizerFactories(IndexSettings indexSettings) throws IOException { diff --git a/server/src/main/java/org/elasticsearch/index/analysis/ReferringFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/ReferringFilterFactory.java new file mode 100644 index 0000000000000..9eb9bc2dbd653 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/analysis/ReferringFilterFactory.java @@ -0,0 +1,37 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import java.util.Map; + +/** + * Marks a {@link TokenFilterFactory} that refers to other filter factories. + * + * The analysis registry will call {@link #setReferences(Map)} with a map of all + * available TokenFilterFactories after all factories have been registered + */ +public interface ReferringFilterFactory { + + /** + * Called with a map of all registered filter factories + */ + void setReferences(Map factories); + +} diff --git a/server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java index 36da9761b978d..26a5b87866c21 100644 --- a/server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java +++ b/server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java @@ -20,7 +20,6 @@ package org.elasticsearch.index.analysis; import com.carrotsearch.randomizedtesting.generators.RandomPicks; - import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.en.EnglishAnalyzer;